commit 9285551de4
@@ -0,0 +1,8 @@
[
    {
        "type": "shift",
        "params": {"min_shift_ms": -5,
                   "max_shift_ms": 5},
        "prob": 1.0
    }
]
@@ -0,0 +1,39 @@
[
    {
        "type": "noise",
        "params": {"min_snr_dB": 40,
                   "max_snr_dB": 50,
                   "noise_manifest_path": "datasets/manifest.noise"},
        "prob": 0.6
    },
    {
        "type": "impulse",
        "params": {"impulse_manifest_path": "datasets/manifest.impulse"},
        "prob": 0.5
    },
    {
        "type": "speed",
        "params": {"min_speed_rate": 0.95,
                   "max_speed_rate": 1.05},
        "prob": 0.5
    },
    {
        "type": "shift",
        "params": {"min_shift_ms": -5,
                   "max_shift_ms": 5},
        "prob": 1.0
    },
    {
        "type": "volume",
        "params": {"min_gain_dBFS": -10,
                   "max_gain_dBFS": 10},
        "prob": 0.0
    },
    {
        "type": "bayesian_normal",
        "params": {"target_db": -20,
                   "prior_db": -20,
                   "prior_samples": 100},
        "prob": 0.0
    }
]
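The two configs above are JSON lists consumed one augmentor at a time: "type" selects an augmentor class, "params" feeds its constructor, and "prob" is an independent per-utterance application probability. As a minimal sketch of that dispatch (the registry and function names here are illustrative stand-ins for the repository's augmentation pipeline, not its actual code):

import json
import random

# Illustrative registry; the real augmentor classes live under
# data_utils/augmentor/ (e.g. the NoisePerturbAugmentor below).
AUGMENTOR_REGISTRY = {}


def apply_augmentations(audio_segment, config_json_str, rng=None):
    """Apply each configured augmentor in place, each with its own
    independent application probability."""
    rng = rng or random.Random()
    for conf in json.loads(config_json_str):
        if rng.uniform(0.0, 1.0) < conf["prob"]:
            augmentor_cls = AUGMENTOR_REGISTRY[conf["type"]]
            augmentor_cls(rng, **conf["params"]).transform_audio(audio_segment)

With the first config, for example, every utterance is time-shifted by a random amount in [-5, 5] ms, since its "prob" is 1.0.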
@@ -0,0 +1,35 @@
"""Contains the impulse response augmentation model."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from data_utils.augmentor.base import AugmentorBase
from data_utils import utils
from data_utils.audio import AudioSegment


class ImpulseResponseAugmentor(AugmentorBase):
    """Augmentation model for adding impulse response effect.

    :param rng: Random generator object.
    :type rng: random.Random
    :param impulse_manifest_path: Manifest path for impulse audio data.
    :type impulse_manifest_path: basestring
    """

    def __init__(self, rng, impulse_manifest_path):
        self._rng = rng
        self._impulse_manifest = utils.read_manifest(
            manifest_path=impulse_manifest_path)

    def transform_audio(self, audio_segment):
        """Add impulse response effect.

        Note that this is an in-place transformation.

        :param audio_segment: Audio segment to add effects to.
        :type audio_segment: AudioSegment|SpeechSegment
        """
        impulse_json = self._rng.sample(self._impulse_manifest, 1)[0]
        impulse_segment = AudioSegment.from_file(impulse_json['audio_filepath'])
        audio_segment.convolve(impulse_segment, allow_resample=True)
@@ -0,0 +1,50 @@
"""Contains the noise perturb augmentation model."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from data_utils.augmentor.base import AugmentorBase
from data_utils import utils
from data_utils.audio import AudioSegment


class NoisePerturbAugmentor(AugmentorBase):
    """Augmentation model for adding background noise.

    :param rng: Random generator object.
    :type rng: random.Random
    :param min_snr_dB: Minimal signal noise ratio, in decibels.
    :type min_snr_dB: float
    :param max_snr_dB: Maximal signal noise ratio, in decibels.
    :type max_snr_dB: float
    :param noise_manifest_path: Manifest path for noise audio data.
    :type noise_manifest_path: basestring
    """

    def __init__(self, rng, min_snr_dB, max_snr_dB, noise_manifest_path):
        self._min_snr_dB = min_snr_dB
        self._max_snr_dB = max_snr_dB
        self._rng = rng
        self._noise_manifest = utils.read_manifest(
            manifest_path=noise_manifest_path)

    def transform_audio(self, audio_segment):
        """Add background noise audio.

        Note that this is an in-place transformation.

        :param audio_segment: Audio segment to add effects to.
        :type audio_segment: AudioSegment|SpeechSegment
        """
        noise_json = self._rng.sample(self._noise_manifest, 1)[0]
        if noise_json['duration'] < audio_segment.duration:
            raise RuntimeError("The duration of sampled noise audio is smaller "
                               "than the audio segment to add effects to.")
        diff_duration = noise_json['duration'] - audio_segment.duration
        start = self._rng.uniform(0, diff_duration)
        end = start + audio_segment.duration
        noise_segment = AudioSegment.slice_from_file(
            noise_json['audio_filepath'], start=start, end=end)
        snr_dB = self._rng.uniform(self._min_snr_dB, self._max_snr_dB)
        audio_segment.add_noise(
            noise_segment, snr_dB, allow_downsampling=True, rng=self._rng)
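AudioSegment.add_noise itself is not part of this diff; as a sketch of the arithmetic it has to perform, the sampled noise is gained so the signal-to-noise ratio hits snr_dB before mixing. Here mix_at_snr is a hypothetical stand-in that ignores the resampling and alignment the real method handles:

import numpy as np


def mix_at_snr(signal, noise, snr_dB):
    """Scale `noise` so the signal-to-noise ratio equals snr_dB, then mix.
    Both inputs are float sample arrays of equal length (a sketch, not
    the repository's AudioSegment.add_noise)."""
    rms_db = lambda x: 10.0 * np.log10(np.mean(x ** 2))
    # gain that brings the noise RMS down to (signal RMS - target SNR)
    gain_db = rms_db(signal) - rms_db(noise) - snr_dB
    return signal + noise * 10.0 ** (gain_db / 20.0)

So the 40-50 dB SNR range in the config above mixes in only very faint background noise.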
@@ -0,0 +1,128 @@
"""Prepare CHiME3 background data.

Download, unpack and create manifest files.
Manifest file is a json-format file with each line containing the
meta data (i.e. audio filepath, transcript and audio duration)
of each audio file in the data set.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import distutils.util
import os
import wget
import zipfile
import tarfile
import argparse
import soundfile
import json
from paddle.v2.dataset.common import md5file

DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')

URL = "https://d4s.myairbridge.com/packagev2/AG0Y3DNBE5IWRRTV/?dlid=W19XG7T0NNHB027139H0EQ"
MD5 = "c3ff512618d7a67d4f85566ea1bc39ec"

parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
    "--target_dir",
    default=DATA_HOME + "/chime3_background",
    type=str,
    help="Directory to save the dataset. (default: %(default)s)")
parser.add_argument(
    "--manifest_filepath",
    default="manifest.chime3.background",
    type=str,
    help="Filepath for output manifests. (default: %(default)s)")
args = parser.parse_args()


def download(url, md5sum, target_dir, filename=None):
    """Download file from url to target_dir, and check md5sum."""
    if filename is None:
        filename = url.split("/")[-1]
    if not os.path.exists(target_dir): os.makedirs(target_dir)
    filepath = os.path.join(target_dir, filename)
    if not (os.path.exists(filepath) and md5file(filepath) == md5sum):
        print("Downloading %s ..." % url)
        wget.download(url, target_dir)
        print("\nMD5 Checksum %s ..." % filepath)
        if not md5file(filepath) == md5sum:
            raise RuntimeError("MD5 checksum failed.")
    else:
        print("File exists, skip downloading. (%s)" % filepath)
    return filepath


def unpack(filepath, target_dir):
    """Unpack the file to the target_dir."""
    print("Unpacking %s ..." % filepath)
    if filepath.endswith('.zip'):
        zip = zipfile.ZipFile(filepath, 'r')
        zip.extractall(target_dir)
        zip.close()
    elif filepath.endswith('.tar') or filepath.endswith('.tar.gz'):
        # tar archives must be opened with tarfile, not zipfile
        tar = tarfile.open(filepath)
        tar.extractall(target_dir)
        tar.close()
    else:
        raise ValueError("File format is not supported for unpacking.")


def create_manifest(data_dir, manifest_path):
    """Create a manifest json file summarizing the data set, with each line
    containing the meta data (i.e. audio filepath, transcription text, audio
    duration) of each audio file within the data set.
    """
    print("Creating manifest %s ..." % manifest_path)
    json_lines = []
    for subfolder, _, filelist in sorted(os.walk(data_dir)):
        for filename in filelist:
            if filename.endswith('.wav'):
                # subfolder yielded by os.walk already includes data_dir
                filepath = os.path.join(subfolder, filename)
                audio_data, samplerate = soundfile.read(filepath)
                duration = float(len(audio_data)) / samplerate
                json_lines.append(
                    json.dumps({
                        'audio_filepath': filepath,
                        'duration': duration,
                        'text': ''
                    }))
    with open(manifest_path, 'w') as out_file:
        for line in json_lines:
            out_file.write(line + '\n')


def prepare_chime3(url, md5sum, target_dir, manifest_path):
    """Download, unpack and create summary manifest file."""
    if not os.path.exists(os.path.join(target_dir, "CHiME3")):
        # download
        filepath = download(url, md5sum, target_dir,
                            "myairbridge-AG0Y3DNBE5IWRRTV.zip")
        # unpack
        unpack(filepath, target_dir)
        unpack(
            os.path.join(target_dir, 'CHiME3_background_bus.zip'), target_dir)
        unpack(
            os.path.join(target_dir, 'CHiME3_background_caf.zip'), target_dir)
        unpack(
            os.path.join(target_dir, 'CHiME3_background_ped.zip'), target_dir)
        unpack(
            os.path.join(target_dir, 'CHiME3_background_str.zip'), target_dir)
    else:
        print("Skip downloading and unpacking. Data already exists in %s." %
              target_dir)
    # create manifest json file
    create_manifest(target_dir, manifest_path)


def main():
    prepare_chime3(
        url=URL,
        md5sum=MD5,
        target_dir=args.target_dir,
        manifest_path=args.manifest_filepath)


if __name__ == '__main__':
    main()
@@ -0,0 +1,10 @@
cd noise
python chime3_background.py
if [ $? -ne 0 ]; then
    echo "Prepare CHiME3 background noise failed. Terminated."
    exit 1
fi
cd -

cat noise/manifest.* > manifest.noise
echo "All done."
@@ -0,0 +1,94 @@
"""Client-end for the ASR demo."""
from pynput import keyboard
import struct
import socket
import sys
import argparse
import pyaudio

parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
    "--host_ip",
    default="localhost",
    type=str,
    help="Server IP address. (default: %(default)s)")
parser.add_argument(
    "--host_port",
    default=8086,
    type=int,
    help="Server Port. (default: %(default)s)")
args = parser.parse_args()

is_recording = False
enable_trigger_record = True


def on_press(key):
    """On-press keyboard callback function."""
    global is_recording, enable_trigger_record
    if key == keyboard.Key.space:
        if (not is_recording) and enable_trigger_record:
            sys.stdout.write("Start Recording ... ")
            sys.stdout.flush()
            is_recording = True


def on_release(key):
    """On-release keyboard callback function."""
    global is_recording, enable_trigger_record
    if key == keyboard.Key.esc:
        return False
    elif key == keyboard.Key.space:
        if is_recording:
            is_recording = False


data_list = []


def callback(in_data, frame_count, time_info, status):
    """Audio recorder's stream callback function."""
    global data_list, is_recording, enable_trigger_record
    if is_recording:
        data_list.append(in_data)
        enable_trigger_record = False
    elif len(data_list) > 0:
        # Connect to server and send data
        sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        sock.connect((args.host_ip, args.host_port))
        sent = ''.join(data_list)
        sock.sendall(struct.pack('>i', len(sent)) + sent)
        print('Speech[length=%d] Sent.' % len(sent))
        # Receive data from the server and shut down
        received = sock.recv(1024)
        print("Recognition Results: {}".format(received))
        sock.close()
        data_list = []
        enable_trigger_record = True
    return (in_data, pyaudio.paContinue)


def main():
    # prepare audio recorder
    p = pyaudio.PyAudio()
    stream = p.open(
        format=pyaudio.paInt32,
        channels=1,
        rate=16000,
        input=True,
        stream_callback=callback)
    stream.start_stream()

    # prepare keyboard listener
    with keyboard.Listener(
            on_press=on_press, on_release=on_release) as listener:
        listener.join()

    # close up
    stream.stop_stream()
    stream.close()
    p.terminate()


if __name__ == "__main__":
    main()
@@ -0,0 +1,245 @@
"""Server-end for the ASR demo."""
import os
import time
import random
import argparse
import distutils.util
from time import gmtime, strftime
import SocketServer
import struct
import wave
import paddle.v2 as paddle
from utils import print_arguments
from data_utils.data import DataGenerator
from model import DeepSpeech2Model
from data_utils.utils import read_manifest

parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
    "--host_ip",
    default="localhost",
    type=str,
    help="Server IP address. (default: %(default)s)")
parser.add_argument(
    "--host_port",
    default=8086,
    type=int,
    help="Server Port. (default: %(default)s)")
parser.add_argument(
    "--speech_save_dir",
    default="demo_cache",
    type=str,
    help="Directory for saving demo speech. (default: %(default)s)")
parser.add_argument(
    "--vocab_filepath",
    default='datasets/vocab/eng_vocab.txt',
    type=str,
    help="Vocabulary filepath. (default: %(default)s)")
parser.add_argument(
    "--mean_std_filepath",
    default='mean_std.npz',
    type=str,
    help="Manifest path for normalizer. (default: %(default)s)")
parser.add_argument(
    "--warmup_manifest_path",
    default='datasets/manifest.test',
    type=str,
    help="Manifest path for warmup test. (default: %(default)s)")
parser.add_argument(
    "--specgram_type",
    default='linear',
    type=str,
    help="Feature type of audio data: 'linear' (power spectrum)"
    " or 'mfcc'. (default: %(default)s)")
parser.add_argument(
    "--num_conv_layers",
    default=2,
    type=int,
    help="Convolution layer number. (default: %(default)s)")
parser.add_argument(
    "--num_rnn_layers",
    default=3,
    type=int,
    help="RNN layer number. (default: %(default)s)")
parser.add_argument(
    "--rnn_layer_size",
    default=512,
    type=int,
    help="RNN layer cell number. (default: %(default)s)")
parser.add_argument(
    "--use_gpu",
    default=True,
    type=distutils.util.strtobool,
    help="Use gpu or not. (default: %(default)s)")
parser.add_argument(
    "--model_filepath",
    default='checkpoints/params.latest.tar.gz',
    type=str,
    help="Model filepath. (default: %(default)s)")
parser.add_argument(
    "--decode_method",
    default='beam_search',
    type=str,
    help="Method for ctc decoding: best_path or beam_search. "
    "(default: %(default)s)")
parser.add_argument(
    "--beam_size",
    default=100,
    type=int,
    help="Width for beam search decoding. (default: %(default)d)")
parser.add_argument(
    "--language_model_path",
    default="lm/data/common_crawl_00.prune01111.trie.klm",
    type=str,
    help="Path for language model. (default: %(default)s)")
parser.add_argument(
    "--alpha",
    default=0.36,
    type=float,
    help="Parameter associated with language model. (default: %(default)f)")
parser.add_argument(
    "--beta",
    default=0.25,
    type=float,
    help="Parameter associated with word count. (default: %(default)f)")
parser.add_argument(
    "--cutoff_prob",
    default=0.99,
    type=float,
    help="The cutoff probability of pruning "
    "in beam search. (default: %(default)f)")
args = parser.parse_args()


class AsrTCPServer(SocketServer.TCPServer):
    """The ASR TCP Server."""

    def __init__(self,
                 server_address,
                 RequestHandlerClass,
                 speech_save_dir,
                 audio_process_handler,
                 bind_and_activate=True):
        self.speech_save_dir = speech_save_dir
        self.audio_process_handler = audio_process_handler
        SocketServer.TCPServer.__init__(
            self, server_address, RequestHandlerClass, bind_and_activate=True)


class AsrRequestHandler(SocketServer.BaseRequestHandler):
    """The ASR request handler."""

    def handle(self):
        # receive data through TCP socket
        chunk = self.request.recv(1024)
        target_len = struct.unpack('>i', chunk[:4])[0]
        data = chunk[4:]
        while len(data) < target_len:
            chunk = self.request.recv(1024)
            data += chunk
        # write to file
        filename = self._write_to_file(data)

        print("Received utterance[length=%d] from %s, saved to %s." %
              (len(data), self.client_address[0], filename))
        start_time = time.time()
        transcript = self.server.audio_process_handler(filename)
        finish_time = time.time()
        print("Response Time: %f, Transcript: %s" %
              (finish_time - start_time, transcript))
        self.request.sendall(transcript)

    def _write_to_file(self, data):
        # prepare save dir and filename
        if not os.path.exists(self.server.speech_save_dir):
            os.mkdir(self.server.speech_save_dir)
        timestamp = strftime("%Y%m%d%H%M%S", gmtime())
        out_filename = os.path.join(
            self.server.speech_save_dir,
            timestamp + "_" + self.client_address[0] + ".wav")
        # write to wav file
        file = wave.open(out_filename, 'wb')
        file.setnchannels(1)
        file.setsampwidth(4)
        file.setframerate(16000)
        file.writeframes(data)
        file.close()
        return out_filename


def warm_up_test(audio_process_handler,
                 manifest_path,
                 num_test_cases,
                 random_seed=0):
    """Warming-up test."""
    manifest = read_manifest(manifest_path)
    rng = random.Random(random_seed)
    samples = rng.sample(manifest, num_test_cases)
    for idx, sample in enumerate(samples):
        print("Warm-up Test Case %d: %s" % (idx, sample['audio_filepath']))
        start_time = time.time()
        transcript = audio_process_handler(sample['audio_filepath'])
        finish_time = time.time()
        print("Response Time: %f, Transcript: %s" %
              (finish_time - start_time, transcript))


def start_server():
    """Start the ASR server."""
    # prepare data generator
    data_generator = DataGenerator(
        vocab_filepath=args.vocab_filepath,
        mean_std_filepath=args.mean_std_filepath,
        augmentation_config='{}',
        specgram_type=args.specgram_type,
        num_threads=1)
    # prepare ASR model
    ds2_model = DeepSpeech2Model(
        vocab_size=data_generator.vocab_size,
        num_conv_layers=args.num_conv_layers,
        num_rnn_layers=args.num_rnn_layers,
        rnn_layer_size=args.rnn_layer_size,
        pretrained_model_path=args.model_filepath)

    # prepare ASR inference handler
    def file_to_transcript(filename):
        feature = data_generator.process_utterance(filename, "")
        result_transcript = ds2_model.infer_batch(
            infer_data=[feature],
            decode_method=args.decode_method,
            beam_alpha=args.alpha,
            beam_beta=args.beta,
            beam_size=args.beam_size,
            cutoff_prob=args.cutoff_prob,
            vocab_list=data_generator.vocab_list,
            language_model_path=args.language_model_path,
            num_processes=1)
        return result_transcript[0]

    # warming up with utterances sampled from Librispeech
    print('-----------------------------------------------------------')
    print('Warming up ...')
    warm_up_test(
        audio_process_handler=file_to_transcript,
        manifest_path=args.warmup_manifest_path,
        num_test_cases=3)
    print('-----------------------------------------------------------')

    # start the server
    server = AsrTCPServer(
        server_address=(args.host_ip, args.host_port),
        RequestHandlerClass=AsrRequestHandler,
        speech_save_dir=args.speech_save_dir,
        audio_process_handler=file_to_transcript)
    print("ASR Server Started.")
    server.serve_forever()


def main():
    print_arguments(args)
    paddle.init(use_gpu=args.use_gpu, trainer_count=1)
    start_server()


if __name__ == "__main__":
    main()
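The wire protocol shared by the demo client and server above is deliberately minimal: a 4-byte big-endian length prefix, then the raw PCM payload; the reply is the transcript as a plain string. A compliant sender, independent of the PyAudio recording logic (a sketch, with Python 2 byte-string semantics as in the rest of this commit):

import socket
import struct


def send_utterance(pcm_bytes, host="localhost", port=8086):
    """Frame raw PCM with a big-endian int32 length header, send it to
    the ASR server above, and return the transcript it replies with."""
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    sock.connect((host, port))
    sock.sendall(struct.pack('>i', len(pcm_bytes)) + pcm_bytes)
    transcript = sock.recv(1024)
    sock.close()
    return transcript

Note that the server's handle() loops on recv() until target_len bytes arrive, so long utterances survive TCP fragmentation.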
@@ -0,0 +1,177 @@
"""Contains DeepSpeech2 layers."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import paddle.v2 as paddle


def conv_bn_layer(input, filter_size, num_channels_in, num_channels_out, stride,
                  padding, act):
    """Convolution layer with batch normalization.

    :param input: Input layer.
    :type input: LayerOutput
    :param filter_size: The x dimension of a filter kernel. Or input a tuple for
                        two image dimension.
    :type filter_size: int|tuple|list
    :param num_channels_in: Number of input channels.
    :type num_channels_in: int
    :param num_channels_out: Number of output channels.
    :type num_channels_out: int
    :param stride: The x dimension of the stride. Or input a tuple for two
                   image dimension.
    :type stride: int|tuple|list
    :param padding: The x dimension of the padding. Or input a tuple for two
                    image dimension.
    :type padding: int|tuple|list
    :param act: Activation type.
    :type act: BaseActivation
    :return: Batch norm layer after convolution layer.
    :rtype: LayerOutput
    """
    conv_layer = paddle.layer.img_conv(
        input=input,
        filter_size=filter_size,
        num_channels=num_channels_in,
        num_filters=num_channels_out,
        stride=stride,
        padding=padding,
        act=paddle.activation.Linear(),
        bias_attr=False)
    return paddle.layer.batch_norm(input=conv_layer, act=act)


def bidirectional_simple_rnn_bn_layer(name, input, size, act):
    """Bidirectional simple rnn layer with sequence-wise batch normalization.
    The batch normalization is only performed on input-state weights.

    :param name: Name of the layer.
    :type name: string
    :param input: Input layer.
    :type input: LayerOutput
    :param size: Number of RNN cells.
    :type size: int
    :param act: Activation type.
    :type act: BaseActivation
    :return: Bidirectional simple rnn layer.
    :rtype: LayerOutput
    """
    # input-hidden weights shared across bi-directional rnn.
    input_proj = paddle.layer.fc(
        input=input, size=size, act=paddle.activation.Linear(), bias_attr=False)
    # batch norm is only performed on input-state projection
    input_proj_bn = paddle.layer.batch_norm(
        input=input_proj, act=paddle.activation.Linear())
    # forward and backward in time
    forward_simple_rnn = paddle.layer.recurrent(
        input=input_proj_bn, act=act, reverse=False)
    backward_simple_rnn = paddle.layer.recurrent(
        input=input_proj_bn, act=act, reverse=True)
    return paddle.layer.concat(input=[forward_simple_rnn, backward_simple_rnn])


def conv_group(input, num_stacks):
    """Convolution group with stacked convolution layers.

    :param input: Input layer.
    :type input: LayerOutput
    :param num_stacks: Number of stacked convolution layers.
    :type num_stacks: int
    :return: Output layer of the convolution group.
    :rtype: LayerOutput
    """
    conv = conv_bn_layer(
        input=input,
        filter_size=(11, 41),
        num_channels_in=1,
        num_channels_out=32,
        stride=(3, 2),
        padding=(5, 20),
        act=paddle.activation.BRelu())
    for i in xrange(num_stacks - 1):
        conv = conv_bn_layer(
            input=conv,
            filter_size=(11, 21),
            num_channels_in=32,
            num_channels_out=32,
            stride=(1, 2),
            padding=(5, 10),
            act=paddle.activation.BRelu())
    output_num_channels = 32
    output_height = 160 // pow(2, num_stacks) + 1
    return conv, output_num_channels, output_height


def rnn_group(input, size, num_stacks):
    """RNN group with stacked bidirectional simple RNN layers.

    :param input: Input layer.
    :type input: LayerOutput
    :param size: Number of RNN cells in each layer.
    :type size: int
    :param num_stacks: Number of stacked rnn layers.
    :type num_stacks: int
    :return: Output layer of the RNN group.
    :rtype: LayerOutput
    """
    output = input
    for i in xrange(num_stacks):
        output = bidirectional_simple_rnn_bn_layer(
            name=str(i), input=output, size=size, act=paddle.activation.BRelu())
    return output


def deep_speech2(audio_data,
                 text_data,
                 dict_size,
                 num_conv_layers=2,
                 num_rnn_layers=3,
                 rnn_size=256):
    """
    The whole DeepSpeech2 model structure (a simplified version).

    :param audio_data: Audio spectrogram data layer.
    :type audio_data: LayerOutput
    :param text_data: Transcription text data layer.
    :type text_data: LayerOutput
    :param dict_size: Dictionary size for tokenized transcription.
    :type dict_size: int
    :param num_conv_layers: Number of stacking convolution layers.
    :type num_conv_layers: int
    :param num_rnn_layers: Number of stacking RNN layers.
    :type num_rnn_layers: int
    :param rnn_size: RNN layer size (number of RNN cells).
    :type rnn_size: int
    :return: A tuple of an output unnormalized log probability layer (
             before softmax) and a ctc cost layer.
    :rtype: tuple of LayerOutput
    """
    # convolution group
    conv_group_output, conv_group_num_channels, conv_group_height = conv_group(
        input=audio_data, num_stacks=num_conv_layers)
    # convert data from convolution feature map to sequence of vectors
    conv2seq = paddle.layer.block_expand(
        input=conv_group_output,
        num_channels=conv_group_num_channels,
        stride_x=1,
        stride_y=1,
        block_x=1,
        block_y=conv_group_height)
    # rnn group
    rnn_group_output = rnn_group(
        input=conv2seq, size=rnn_size, num_stacks=num_rnn_layers)
    fc = paddle.layer.fc(
        input=rnn_group_output,
        size=dict_size + 1,
        act=paddle.activation.Linear(),
        bias_attr=True)
    # probability distribution with softmax
    log_probs = paddle.layer.mixed(
        input=paddle.layer.identity_projection(input=fc),
        act=paddle.activation.Softmax())
    # ctc cost
    ctc_loss = paddle.layer.warp_ctc(
        input=fc,
        label=text_data,
        size=dict_size + 1,
        blank=dict_size,
        norm_by_times=True)
    return log_probs, ctc_loss
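A sanity check on the hard-coded output_height = 160 // pow(2, num_stacks) + 1 in conv_group above: assuming a 161-bin input spectrogram (the input size is not stated in this file), the standard convolution output-size arithmetic applied to the frequency dimension reproduces exactly that closed form:

def conv_out(size, filter_size, stride, padding):
    # standard convolution output-size formula
    return (size + 2 * padding - filter_size) // stride + 1

height = 161                              # assumed input frequency bins
height = conv_out(height, 41, 2, 20)      # first layer: (11, 41), stride (3, 2)
for _ in xrange(2 - 1):                   # remaining stacks for num_stacks = 2
    height = conv_out(height, 21, 2, 10)  # stacked layers: (11, 21), stride (1, 2)
assert height == 160 // pow(2, 2) + 1     # 41, matching the formula above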
@@ -1,5 +1,5 @@
-wget==3.2
 scipy==0.13.1
 resampy==0.1.5
-https://github.com/kpu/kenlm/archive/master.zip
+SoundFile==0.9.0.post1
 python_speech_features
+https://github.com/luotao1/kenlm/archive/master.zip
@@ -0,0 +1,23 @@
"""Test Setup."""
import unittest
import numpy as np
import os


class TestSetup(unittest.TestCase):
    def test_soundfile(self):
        import soundfile as sf
        # floating point data is typically limited to the interval [-1.0, 1.0],
        # but smaller/larger values are supported as well
        data = np.array([[1.75, -1.75], [1.0, -1.0], [0.5, -0.5],
                         [0.25, -0.25]])
        file = 'test.wav'
        sf.write(file, data, 44100, format='WAV', subtype='FLOAT')
        read, fs = sf.read(file)
        self.assertTrue(np.all(read == data))
        self.assertEqual(fs, 44100)
        os.remove(file)


if __name__ == '__main__':
    unittest.main()