commit
9285551de4
@ -0,0 +1,8 @@
|
||||
[
|
||||
{
|
||||
"type": "shift",
|
||||
"params": {"min_shift_ms": -5,
|
||||
"max_shift_ms": 5},
|
||||
"prob": 1.0
|
||||
}
|
||||
]
|
@ -0,0 +1,39 @@
|
||||
[
|
||||
{
|
||||
"type": "noise",
|
||||
"params": {"min_snr_dB": 40,
|
||||
"max_snr_dB": 50,
|
||||
"noise_manifest_path": "datasets/manifest.noise"},
|
||||
"prob": 0.6
|
||||
},
|
||||
{
|
||||
"type": "impulse",
|
||||
"params": {"impulse_manifest_path": "datasets/manifest.impulse"},
|
||||
"prob": 0.5
|
||||
},
|
||||
{
|
||||
"type": "speed",
|
||||
"params": {"min_speed_rate": 0.95,
|
||||
"max_speed_rate": 1.05},
|
||||
"prob": 0.5
|
||||
},
|
||||
{
|
||||
"type": "shift",
|
||||
"params": {"min_shift_ms": -5,
|
||||
"max_shift_ms": 5},
|
||||
"prob": 1.0
|
||||
},
|
||||
{
|
||||
"type": "volume",
|
||||
"params": {"min_gain_dBFS": -10,
|
||||
"max_gain_dBFS": 10},
|
||||
"prob": 0.0
|
||||
},
|
||||
{
|
||||
"type": "bayesian_normal",
|
||||
"params": {"target_db": -20,
|
||||
"prior_db": -20,
|
||||
"prior_samples": 100},
|
||||
"prob": 0.0
|
||||
}
|
||||
]
|
@ -0,0 +1,35 @@
|
||||
"""Contains the impulse response augmentation model."""
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
from data_utils.augmentor.base import AugmentorBase
|
||||
from data_utils import utils
|
||||
from data_utils.audio import AudioSegment
|
||||
|
||||
|
||||
class ImpulseResponseAugmentor(AugmentorBase):
    """Augmentor that convolves audio with a randomly drawn impulse response.

    The impulse manifest is read once, when the augmentor is constructed.

    :param rng: Random generator object.
    :type rng: random.Random
    :param impulse_manifest_path: Manifest path for impulse audio data.
    :type impulse_manifest_path: basestring
    """

    def __init__(self, rng, impulse_manifest_path):
        self._rng = rng
        manifest = utils.read_manifest(manifest_path=impulse_manifest_path)
        self._impulse_manifest = manifest

    def transform_audio(self, audio_segment):
        """Apply an impulse response effect to the given segment.

        One manifest entry is drawn with the augmentor's random generator,
        the audio file it points to is loaded, and the segment is convolved
        with it.

        Note that this is an in-place transformation.

        :param audio_segment: Audio segment to add effects to.
        :type audio_segment: AudioSegment|SpeechSegment
        """
        # Drawing a one-element sample keeps the random stream identical to
        # the historical behaviour.
        (chosen_entry,) = self._rng.sample(self._impulse_manifest, 1)
        impulse = AudioSegment.from_file(chosen_entry['audio_filepath'])
        audio_segment.convolve(impulse, allow_resample=True)
|
@ -0,0 +1,50 @@
|
||||
"""Contains the noise perturb augmentation model."""
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
from data_utils.augmentor.base import AugmentorBase
|
||||
from data_utils import utils
|
||||
from data_utils.audio import AudioSegment
|
||||
|
||||
|
||||
class NoisePerturbAugmentor(AugmentorBase):
    """Augmentor that mixes a random slice of background noise into audio.

    The noise manifest is read once, when the augmentor is constructed.

    :param rng: Random generator object.
    :type rng: random.Random
    :param min_snr_dB: Minimal signal noise ratio, in decibels.
    :type min_snr_dB: float
    :param max_snr_dB: Maximal signal noise ratio, in decibels.
    :type max_snr_dB: float
    :param noise_manifest_path: Manifest path for noise audio data.
    :type noise_manifest_path: basestring
    """

    def __init__(self, rng, min_snr_dB, max_snr_dB, noise_manifest_path):
        self._min_snr_dB = min_snr_dB
        self._max_snr_dB = max_snr_dB
        self._rng = rng
        manifest = utils.read_manifest(manifest_path=noise_manifest_path)
        self._noise_manifest = manifest

    def transform_audio(self, audio_segment):
        """Add background noise audio to the given segment.

        A noise entry is drawn from the manifest, a window of the same
        duration as the segment is cut from it at a random offset, and the
        window is added at a signal-to-noise ratio drawn uniformly from
        [min_snr_dB, max_snr_dB].

        Note that this is an in-place transformation.

        :param audio_segment: Audio segment to add effects to.
        :type audio_segment: AudioSegment|SpeechSegment
        :raises RuntimeError: If the sampled noise audio is shorter than the
                              audio segment.
        """
        (noise_entry,) = self._rng.sample(self._noise_manifest, 1)
        noise_duration = noise_entry['duration']
        if noise_duration < audio_segment.duration:
            raise RuntimeError("The duration of sampled noise audio is smaller "
                               "than the audio segment to add effects to.")
        # Pick where, inside the longer noise recording, the window starts.
        slack = noise_duration - audio_segment.duration
        window_start = self._rng.uniform(0, slack)
        window_end = window_start + audio_segment.duration
        noise_window = AudioSegment.slice_from_file(
            noise_entry['audio_filepath'], start=window_start, end=window_end)
        # The SNR is drawn only after the slice is loaded, so the order of
        # random draws stays: entry, offset, SNR.
        snr_dB = self._rng.uniform(self._min_snr_dB, self._max_snr_dB)
        audio_segment.add_noise(
            noise_window, snr_dB, allow_downsampling=True, rng=self._rng)
|
@ -0,0 +1,128 @@
|
||||
"""Prepare CHiME3 background data.
|
||||
|
||||
Download, unpack and create manifest files.
|
||||
Manifest file is a json-format file with each line containing the
|
||||
meta data (i.e. audio filepath, transcript and audio duration)
|
||||
of each audio file in the data set.
|
||||
"""
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import distutils.util
|
||||
import os
|
||||
import wget
|
||||
import zipfile
|
||||
import argparse
|
||||
import soundfile
|
||||
import json
|
||||
from paddle.v2.dataset.common import md5file
|
||||
|
||||
# Root directory under which the dataset is stored by default.
DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')

# Download location of the background archive and its expected MD5 digest.
URL = "https://d4s.myairbridge.com/packagev2/AG0Y3DNBE5IWRRTV/?dlid=W19XG7T0NNHB027139H0EQ"
MD5 = "c3ff512618d7a67d4f85566ea1bc39ec"

# Command-line options; they are parsed at import time into `args`.
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
    "--target_dir",
    type=str,
    default=DATA_HOME + "/chime3_background",
    help="Directory to save the dataset. (default: %(default)s)")
parser.add_argument(
    "--manifest_filepath",
    type=str,
    default="manifest.chime3.background",
    help="Filepath for output manifests. (default: %(default)s)")
args = parser.parse_args()
|
||||
|
||||
|
||||
def download(url, md5sum, target_dir, filename=None):
    """Download file from url to target_dir, and check md5sum.

    :param url: URL to download from.
    :type url: basestring
    :param md5sum: Expected MD5 digest of the downloaded file.
    :type md5sum: basestring
    :param target_dir: Directory to store the file in; created if missing.
    :type target_dir: basestring
    :param filename: Name to store the file under inside target_dir.
                     Defaults to the last "/"-separated component of url.
    :type filename: basestring|None
    :return: Path of the verified file.
    :rtype: basestring
    :raises RuntimeError: If the downloaded file fails the MD5 check.
    """
    if filename is None:
        filename = url.split("/")[-1]
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)
    filepath = os.path.join(target_dir, filename)

    if os.path.exists(filepath) and md5file(filepath) == md5sum:
        print("File exists, skip downloading. (%s)" % filepath)
        return filepath

    # A file that is present but fails the checksum is stale. Remove it so
    # the fresh download really lands at `filepath` (wget avoids overwriting
    # existing files by saving under a modified name).
    if os.path.isfile(filepath):
        os.remove(filepath)
    print("Downloading %s ..." % url)
    # Pass the full path, not just the directory, so the saved name always
    # matches the `filepath` that is verified and returned below, whatever
    # name wget would have derived from the URL or response headers.
    wget.download(url, filepath)
    print("\nMD5 Chesksum %s ..." % filepath)
    if md5file(filepath) != md5sum:
        raise RuntimeError("MD5 checksum failed.")
    return filepath
|
||||
|
||||
|
||||
def unpack(filepath, target_dir):
    """Unpack the file to the target_dir.

    Zip archives (``.zip``) and tar archives (``.tar``, ``.tar.gz``,
    ``.tgz``) are supported; the format is chosen from the file extension.

    :param filepath: Path of the archive to unpack.
    :type filepath: basestring
    :param target_dir: Directory to extract the archive members into.
    :type target_dir: basestring
    :raises ValueError: If the file extension is not a supported format.
    """
    # Imported locally because the module header only imports zipfile.
    import tarfile

    print("Unpacking %s ..." % filepath)
    # NOTE(review): extractall() trusts member paths; only use this on
    # archives from a trusted source.
    if filepath.endswith('.zip'):
        with zipfile.ZipFile(filepath, 'r') as archive:
            archive.extractall(target_dir)
    elif filepath.endswith(('.tar', '.tar.gz', '.tgz')):
        # tar archives must be opened with tarfile; zipfile has no open().
        with tarfile.open(filepath) as archive:
            archive.extractall(target_dir)
    else:
        raise ValueError("File format is not supported for unpacking.")
|
||||
|
||||
|
||||
def create_manifest(data_dir, manifest_path):
    """Create a manifest json file summarizing the data set, with each line
    containing the meta data (i.e. audio filepath, transcription text, audio
    duration) of each audio file within the data set.

    Every ``.wav`` file found under data_dir (recursively) gets one JSON line
    with the keys ``audio_filepath``, ``duration`` (number of frames divided
    by the sample rate) and an empty ``text``.

    :param data_dir: Directory that is searched recursively for .wav files.
    :type data_dir: basestring
    :param manifest_path: Path of the manifest file to write.
    :type manifest_path: basestring
    """
    print("Creating manifest %s ..." % manifest_path)
    json_lines = []
    for subfolder, _, filelist in sorted(os.walk(data_dir)):
        for filename in filelist:
            if not filename.endswith('.wav'):
                continue
            # os.walk() already yields `subfolder` prefixed with `data_dir`;
            # joining `data_dir` again would double a relative directory.
            filepath = os.path.join(subfolder, filename)
            audio_data, samplerate = soundfile.read(filepath)
            duration = float(len(audio_data)) / samplerate
            json_lines.append(
                json.dumps({
                    'audio_filepath': filepath,
                    'duration': duration,
                    'text': ''
                }))
    with open(manifest_path, 'w') as out_file:
        for line in json_lines:
            out_file.write(line + '\n')
|
||||
|
||||
|
||||
def prepare_chime3(url, md5sum, target_dir, manifest_path):
    """Download, unpack and create summary manifest file.

    Downloading and unpacking are skipped when a "CHiME3" entry already
    exists inside target_dir; the manifest is (re)created in either case.

    :param url: URL of the outer archive.
    :param md5sum: Expected MD5 digest of the outer archive.
    :param target_dir: Directory to download and unpack into.
    :param manifest_path: Path of the manifest file to write.
    """
    if os.path.exists(os.path.join(target_dir, "CHiME3")):
        print("Skip downloading and unpacking. Data already exists in %s." %
              target_dir)
    else:
        # The outer archive is stored under a fixed name ...
        outer_archive = download(url, md5sum, target_dir,
                                 "myairbridge-AG0Y3DNBE5IWRRTV.zip")
        unpack(outer_archive, target_dir)
        # ... and the per-environment archives are then unpacked in turn.
        inner_archives = ('CHiME3_background_bus.zip',
                          'CHiME3_background_caf.zip',
                          'CHiME3_background_ped.zip',
                          'CHiME3_background_str.zip')
        for archive_name in inner_archives:
            unpack(os.path.join(target_dir, archive_name), target_dir)
    create_manifest(target_dir, manifest_path)
|
||||
|
||||
|
||||
def main():
    """Prepare the CHiME3 background data using the parsed arguments."""
    prepare_chime3(URL, MD5, args.target_dir, args.manifest_filepath)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
@ -0,0 +1,10 @@
|
||||
# Prepare the CHiME3 background noise data, then merge all noise manifests
# into a single manifest.noise file.

# Stop right away if the noise directory is missing; otherwise the commands
# below would run from the wrong directory.
cd noise || exit 1
if ! python chime3_background.py; then
    echo "Prepare CHiME3 background noise failed. Terminated."
    exit 1
fi
cd - || exit 1

cat noise/manifest.* > manifest.noise
echo "All done."
|
@ -0,0 +1,94 @@
|
||||
"""Client-end for the ASR demo."""
|
||||
from pynput import keyboard
|
||||
import struct
|
||||
import socket
|
||||
import sys
|
||||
import argparse
|
||||
import pyaudio
|
||||
|
||||
# Command-line options; they are parsed at import time into `args`.
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
    "--host_ip",
    type=str,
    default="localhost",
    help="Server IP address. (default: %(default)s)")
parser.add_argument(
    "--host_port",
    type=int,
    default=8086,
    help="Server Port. (default: %(default)s)")
args = parser.parse_args()

# State shared between the keyboard callbacks and the audio callback:
# whether audio is currently being captured, and whether a new recording
# may be triggered.
is_recording = False
enable_trigger_record = True
|
||||
|
||||
|
||||
def on_press(key):
    """On-press keyboard callback function.

    Pressing the space bar starts a recording, provided no recording is
    running and triggering is currently enabled. Other keys are ignored.

    :param key: The key reported by the keyboard listener.
    """
    global is_recording, enable_trigger_record
    space_pressed = key == keyboard.Key.space
    may_start = (not is_recording) and enable_trigger_record
    if space_pressed and may_start:
        sys.stdout.write("Start Recording ... ")
        sys.stdout.flush()
        is_recording = True
|
||||
|
||||
|
||||
def on_release(key):
    """On-release keyboard callback function.

    Releasing Esc returns False; releasing the space bar ends a running
    recording. Other keys are ignored.

    :param key: The key reported by the keyboard listener.
    :return: False when Esc was released, otherwise None.
    """
    global is_recording, enable_trigger_record
    if key == keyboard.Key.esc:
        return False
    if key == keyboard.Key.space and is_recording:
        is_recording = False
|
||||
|
||||
|
||||
# Audio chunks captured during the current recording.
data_list = []


def callback(in_data, frame_count, time_info, status):
    """Audio recorder's stream callback function.

    While a recording is running, the incoming chunk is buffered and further
    triggering is disabled. Once the recording has stopped and chunks are
    buffered, they are sent to the server as a 4-byte big-endian length
    prefix followed by the raw audio bytes; the reply is printed, the buffer
    is cleared and triggering is enabled again.

    :param in_data: Audio chunk handed over by the audio stream.
    :param frame_count: Unused.
    :param time_info: Unused.
    :param status: Unused.
    :return: Tuple of the unchanged in_data and pyaudio.paContinue.
    """
    global data_list, is_recording, enable_trigger_record
    if is_recording:
        data_list.append(in_data)
        enable_trigger_record = False
    elif data_list:
        # Connect to server and send data
        sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        try:
            sock.connect((args.host_ip, args.host_port))
            # b'' is the plain str type on Python 2 and also joins bytes
            # chunks correctly where str and bytes differ.
            sent = b''.join(data_list)
            sock.sendall(struct.pack('>i', len(sent)) + sent)
            print('Speech[length=%d] Sent.' % len(sent))
            # Receive data from the server and shut down
            received = sock.recv(1024)
            # Call form of print, consistent with the line above; with a
            # single argument it prints the same text as the statement form.
            print("Recognition Results: {}".format(received))
        finally:
            # Release the socket even if connecting or sending fails.
            sock.close()
        data_list = []
        enable_trigger_record = True
    return (in_data, pyaudio.paContinue)
|
||||
|
||||
|
||||
def main():
    """Run the demo client: record from the microphone until the keyboard
    listener finishes, then release the audio resources.
    """
    # prepare audio recorder
    audio = pyaudio.PyAudio()
    recorder = audio.open(
        format=pyaudio.paInt32,
        channels=1,
        rate=16000,
        input=True,
        stream_callback=callback)
    recorder.start_stream()

    # prepare keyboard listener; join() blocks until the listener stops
    key_listener = keyboard.Listener(
        on_press=on_press, on_release=on_release)
    with key_listener as active_listener:
        active_listener.join()

    # close up
    recorder.stop_stream()
    recorder.close()
    audio.terminate()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
@ -0,0 +1,245 @@
|
||||
"""Server-end for the ASR demo."""
|
||||
import os
|
||||
import time
|
||||
import random
|
||||
import argparse
|
||||
import distutils.util
|
||||
from time import gmtime, strftime
|
||||
import SocketServer
|
||||
import struct
|
||||
import wave
|
||||
import paddle.v2 as paddle
|
||||
from utils import print_arguments
|
||||
from data_utils.data import DataGenerator
|
||||
from model import DeepSpeech2Model
|
||||
from data_utils.utils import read_manifest
|
||||
|
||||
parser = argparse.ArgumentParser(description=__doc__)

# --- Network endpoint and storage of received speech ---
parser.add_argument(
    "--host_ip",
    type=str,
    default="localhost",
    help="Server IP address. (default: %(default)s)")
parser.add_argument(
    "--host_port",
    type=int,
    default=8086,
    help="Server Port. (default: %(default)s)")
parser.add_argument(
    "--speech_save_dir",
    type=str,
    default="demo_cache",
    help="Directory for saving demo speech. (default: %(default)s)")

# --- Data and feature configuration ---
parser.add_argument(
    "--vocab_filepath",
    type=str,
    default='datasets/vocab/eng_vocab.txt',
    help="Vocabulary filepath. (default: %(default)s)")
parser.add_argument(
    "--mean_std_filepath",
    type=str,
    default='mean_std.npz',
    help="Manifest path for normalizer. (default: %(default)s)")
parser.add_argument(
    "--warmup_manifest_path",
    type=str,
    default='datasets/manifest.test',
    help="Manifest path for warmup test. (default: %(default)s)")
parser.add_argument(
    "--specgram_type",
    type=str,
    default='linear',
    help="Feature type of audio data: 'linear' (power spectrum)"
    " or 'mfcc'. (default: %(default)s)")

# --- Network structure and model parameters ---
parser.add_argument(
    "--num_conv_layers",
    type=int,
    default=2,
    help="Convolution layer number. (default: %(default)s)")
parser.add_argument(
    "--num_rnn_layers",
    type=int,
    default=3,
    help="RNN layer number. (default: %(default)s)")
parser.add_argument(
    "--rnn_layer_size",
    type=int,
    default=512,
    help="RNN layer cell number. (default: %(default)s)")
parser.add_argument(
    "--use_gpu",
    type=distutils.util.strtobool,
    default=True,
    help="Use gpu or not. (default: %(default)s)")
parser.add_argument(
    "--model_filepath",
    type=str,
    default='checkpoints/params.latest.tar.gz',
    help="Model filepath. (default: %(default)s)")

# --- Decoding ---
parser.add_argument(
    "--decode_method",
    type=str,
    default='beam_search',
    help="Method for ctc decoding: best_path or beam_search. "
    "(default: %(default)s)")
parser.add_argument(
    "--beam_size",
    type=int,
    default=100,
    help="Width for beam search decoding. (default: %(default)d)")
parser.add_argument(
    "--language_model_path",
    type=str,
    default="lm/data/common_crawl_00.prune01111.trie.klm",
    help="Path for language model. (default: %(default)s)")
parser.add_argument(
    "--alpha",
    type=float,
    default=0.36,
    help="Parameter associated with language model. (default: %(default)f)")
parser.add_argument(
    "--beta",
    type=float,
    default=0.25,
    help="Parameter associated with word count. (default: %(default)f)")
|
||||
parser.add_argument(
    "--cutoff_prob",
    default=0.99,
    type=float,
    # Adjacent string literals are joined verbatim, so the first one must
    # end with a space to avoid printing "pruningin beam search".
    help="The cutoff probability of pruning "
    "in beam search. (default: %(default)f)")
|
||||
args = parser.parse_args()
|
||||
|
||||
|
||||
class AsrTCPServer(SocketServer.TCPServer):
    """The ASR TCP Server.

    A TCPServer that additionally carries the directory for saving received
    speech and the callable that processes a saved audio file, so request
    handlers can reach both through ``self.server``.

    :param server_address: Address to serve on, passed to TCPServer.
    :param RequestHandlerClass: Request handler class, passed to TCPServer.
    :param speech_save_dir: Directory for saving received speech.
    :type speech_save_dir: basestring
    :param audio_process_handler: Callable invoked with the path of a saved
                                  audio file.
    :type audio_process_handler: callable
    :param bind_and_activate: Passed to TCPServer; whether the constructor
                              binds and activates the server socket.
    :type bind_and_activate: bool
    """

    def __init__(self,
                 server_address,
                 RequestHandlerClass,
                 speech_save_dir,
                 audio_process_handler,
                 bind_and_activate=True):
        self.speech_save_dir = speech_save_dir
        self.audio_process_handler = audio_process_handler
        # Forward the caller's choice rather than a hard-coded True.
        SocketServer.TCPServer.__init__(
            self,
            server_address,
            RequestHandlerClass,
            bind_and_activate=bind_and_activate)
|
||||
|
||||
|
||||
class AsrRequestHandler(SocketServer.BaseRequestHandler):
    """The ASR request handler.

    A request is a 4-byte big-endian signed length followed by that many
    bytes of audio data. The audio is saved as a wav file, processed by the
    server's ``audio_process_handler``, and the result is sent back.
    """

    def handle(self):
        """Receive one utterance, save it, and reply with the transcript.

        If the client closes the connection before the complete message has
        arrived, a note is printed and the request is dropped.
        """
        peer = self.client_address[0]
        # receive data through TCP socket
        header = self._receive_exactly(4)
        if header is None:
            print("Connection from %s closed before the length header "
                  "arrived." % peer)
            return
        target_len = struct.unpack('>i', header)[0]
        data = self._receive_exactly(target_len)
        if data is None:
            print("Connection from %s closed before %d bytes of speech "
                  "arrived." % (peer, target_len))
            return
        # write to file
        filename = self._write_to_file(data)

        print("Received utterance[length=%d] from %s, saved to %s." %
              (len(data), peer, filename))
        start_time = time.time()
        transcript = self.server.audio_process_handler(filename)
        finish_time = time.time()
        print("Response Time: %f, Transcript: %s" %
              (finish_time - start_time, transcript))
        self.request.sendall(transcript)

    def _receive_exactly(self, num_bytes):
        """Read exactly num_bytes from the connection.

        :return: The bytes read, or None if the peer closed the connection
                 before num_bytes arrived.
        """
        chunks = []
        remaining = num_bytes
        while remaining > 0:
            chunk = self.request.recv(min(remaining, 4096))
            if not chunk:
                # An empty read means the peer closed the connection;
                # looping on would never terminate.
                return None
            chunks.append(chunk)
            remaining -= len(chunk)
        return b''.join(chunks)

    def _write_to_file(self, data):
        """Save raw audio bytes as a mono, 4-byte-sample, 16 kHz wav file.

        The file is named after the current UTC timestamp and the client
        address and placed in the server's speech_save_dir.

        :return: Path of the written wav file.
        """
        # prepare save dir and filename
        save_dir = self.server.speech_save_dir
        if not os.path.exists(save_dir):
            # makedirs also creates missing parent directories.
            os.makedirs(save_dir)
        timestamp = strftime("%Y%m%d%H%M%S", gmtime())
        out_filename = os.path.join(
            save_dir, timestamp + "_" + self.client_address[0] + ".wav")
        # write to wav file
        wav_file = wave.open(out_filename, 'wb')
        try:
            wav_file.setnchannels(1)
            wav_file.setsampwidth(4)
            wav_file.setframerate(16000)
            wav_file.writeframes(data)
        finally:
            wav_file.close()
        return out_filename
|
||||
|
||||
|
||||
def warm_up_test(audio_process_handler,
                 manifest_path,
                 num_test_cases,
                 random_seed=0):
    """Warming-up test.

    Samples utterances from a manifest, runs each through the handler, and
    prints the response time and the transcript.

    :param audio_process_handler: Callable invoked with an audio filepath.
    :type audio_process_handler: callable
    :param manifest_path: Manifest to sample the test utterances from.
    :type manifest_path: basestring
    :param num_test_cases: Number of utterances to sample.
    :type num_test_cases: int
    :param random_seed: Seed of the random generator used for sampling.
    :type random_seed: int
    """
    manifest = read_manifest(manifest_path)
    rng = random.Random(random_seed)
    samples = rng.sample(manifest, num_test_cases)
    for idx, sample in enumerate(samples):
        # Format with % like the other messages; passing the values as
        # extra print arguments leaves the placeholders unfilled.
        print("Warm-up Test Case %d: %s" % (idx, sample['audio_filepath']))
        start_time = time.time()
        transcript = audio_process_handler(sample['audio_filepath'])
        finish_time = time.time()
        print("Response Time: %f, Transcript: %s" %
              (finish_time - start_time, transcript))
|
||||
|
||||
|
||||
def start_server():
    """Start the ASR server.

    Builds the data generator and the DeepSpeech2 model from the parsed
    arguments, runs a short warm-up on sampled utterances, and then serves
    requests forever.
    """
    # prepare data generator
    feature_source = DataGenerator(
        vocab_filepath=args.vocab_filepath,
        mean_std_filepath=args.mean_std_filepath,
        augmentation_config='{}',
        specgram_type=args.specgram_type,
        num_threads=1)
    # prepare ASR model
    asr_model = DeepSpeech2Model(
        vocab_size=feature_source.vocab_size,
        num_conv_layers=args.num_conv_layers,
        num_rnn_layers=args.num_rnn_layers,
        rnn_layer_size=args.rnn_layer_size,
        pretrained_model_path=args.model_filepath)

    # prepare ASR inference handler
    def file_to_transcript(filename):
        """Return the transcript inferred for a single audio file."""
        utterance = feature_source.process_utterance(filename, "")
        transcripts = asr_model.infer_batch(
            infer_data=[utterance],
            decode_method=args.decode_method,
            beam_alpha=args.alpha,
            beam_beta=args.beta,
            beam_size=args.beam_size,
            cutoff_prob=args.cutoff_prob,
            vocab_list=feature_source.vocab_list,
            language_model_path=args.language_model_path,
            num_processes=1)
        return transcripts[0]

    # warming up with utterances sampled from the warm-up manifest
    separator = '-----------------------------------------------------------'
    print(separator)
    print('Warming up ...')
    warm_up_test(
        audio_process_handler=file_to_transcript,
        manifest_path=args.warmup_manifest_path,
        num_test_cases=3)
    print(separator)

    # start the server
    asr_server = AsrTCPServer(
        server_address=(args.host_ip, args.host_port),
        RequestHandlerClass=AsrRequestHandler,
        speech_save_dir=args.speech_save_dir,
        audio_process_handler=file_to_transcript)
    print("ASR Server Started.")
    asr_server.serve_forever()
|
||||
|
||||
|
||||
def main():
    """Print the parsed arguments, initialize paddle and start the server."""
    print_arguments(args)
    paddle.init(trainer_count=1, use_gpu=args.use_gpu)
    start_server()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
@ -0,0 +1,177 @@
|
||||
"""Contains DeepSpeech2 layers."""
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import paddle.v2 as paddle
|
||||
|
||||
|
||||
def conv_bn_layer(input, filter_size, num_channels_in, num_channels_out, stride,
                  padding, act):
    """Convolution layer with batch normalization.

    The convolution itself is linear and has no bias; the given activation
    is applied by the batch normalization layer that follows it.

    :param input: Input layer.
    :type input: LayerOutput
    :param filter_size: The x dimension of a filter kernel. Or input a tuple for
                        two image dimension.
    :type filter_size: int|tuple|list
    :param num_channels_in: Number of input channels.
    :type num_channels_in: int
    :param num_channels_out: Number of output channels.
    :type num_channels_out: int
    :param stride: Stride passed on to the convolution layer.
    :type stride: int|tuple|list
    :param padding: The x dimension of the padding. Or input a tuple for two
                    image dimension.
    :type padding: int|tuple|list
    :param act: Activation type.
    :type act: BaseActivation
    :return: Batch norm layer after convolution layer.
    :rtype: LayerOutput
    """
    return paddle.layer.batch_norm(
        input=paddle.layer.img_conv(
            input=input,
            filter_size=filter_size,
            num_channels=num_channels_in,
            num_filters=num_channels_out,
            stride=stride,
            padding=padding,
            act=paddle.activation.Linear(),
            bias_attr=False),
        act=act)
|
||||
|
||||
|
||||
def bidirectional_simple_rnn_bn_layer(name, input, size, act):
    """Bidirectional simple rnn layer with sequence-wise batch normalization.
    The batch normalization is only performed on input-state weights.

    :param name: Name of the layer (not referenced by the body).
    :type name: string
    :param input: Input layer.
    :type input: LayerOutput
    :param size: Number of RNN cells.
    :type size: int
    :param act: Activation type.
    :type act: BaseActivation
    :return: Bidirectional simple rnn layer.
    :rtype: LayerOutput
    """
    # One bias-free linear projection feeds both directions, so the
    # input-hidden weights are shared across them.
    projection = paddle.layer.fc(
        input=input, size=size, act=paddle.activation.Linear(), bias_attr=False)
    # Batch norm is applied to the input-state projection only.
    normalized = paddle.layer.batch_norm(
        input=projection, act=paddle.activation.Linear())
    # Forward in time first, then backward in time.
    directions = [
        paddle.layer.recurrent(input=normalized, act=act, reverse=backward)
        for backward in (False, True)
    ]
    return paddle.layer.concat(input=directions)
|
||||
|
||||
|
||||
def conv_group(input, num_stacks):
    """Convolution group with stacked convolution layers.

    The first layer always maps 1 input channel to 32 channels; it is
    followed by ``num_stacks - 1`` further 32-to-32 channel layers. All
    layers use batch normalization with BRelu activation.

    :param input: Input layer.
    :type input: LayerOutput
    :param num_stacks: Number of stacked convolution layers.
    :type num_stacks: int
    :return: Tuple of the output layer of the convolution group, its number
             of channels, and its height.
    :rtype: tuple
    """
    first_layer = dict(
        filter_size=(11, 41), num_channels_in=1, stride=(3, 2),
        padding=(5, 20))
    later_layer = dict(
        filter_size=(11, 21), num_channels_in=32, stride=(1, 2),
        padding=(5, 10))
    layer = input
    for settings in [first_layer] + [later_layer] * (num_stacks - 1):
        layer = conv_bn_layer(
            input=layer,
            num_channels_out=32,
            act=paddle.activation.BRelu(),
            **settings)
    output_num_channels = 32
    # Height bookkeeping mirrors the strides used above; 160 is the height
    # this formula starts from.
    output_height = 160 // pow(2, num_stacks) + 1
    return layer, output_num_channels, output_height
|
||||
|
||||
|
||||
def rnn_group(input, size, num_stacks):
    """RNN group with stacked bidirectional simple RNN layers.

    Each layer takes the previous layer's output as input and uses BRelu
    activation; layers are named by their index in the stack.

    :param input: Input layer.
    :type input: LayerOutput
    :param size: Number of RNN cells in each layer.
    :type size: int
    :param num_stacks: Number of stacked rnn layers.
    :type num_stacks: int
    :return: Output layer of the RNN group.
    :rtype: LayerOutput
    """
    layer = input
    for layer_name in (str(index) for index in xrange(num_stacks)):
        layer = bidirectional_simple_rnn_bn_layer(
            name=layer_name,
            input=layer,
            size=size,
            act=paddle.activation.BRelu())
    return layer
|
||||
|
||||
|
||||
def deep_speech2(audio_data,
                 text_data,
                 dict_size,
                 num_conv_layers=2,
                 num_rnn_layers=3,
                 rnn_size=256):
    """
    The whole DeepSpeech2 model structure (a simplified version).

    A convolution group is followed by a group of bidirectional RNN layers
    and a fully connected layer of size ``dict_size + 1``; the extra unit is
    the CTC blank, which uses index ``dict_size``.

    :param audio_data: Audio spectrogram data layer.
    :type audio_data: LayerOutput
    :param text_data: Transcription text data layer.
    :type text_data: LayerOutput
    :param dict_size: Dictionary size for tokenized transcription.
    :type dict_size: int
    :param num_conv_layers: Number of stacking convolution layers.
    :type num_conv_layers: int
    :param num_rnn_layers: Number of stacking RNN layers.
    :type num_rnn_layers: int
    :param rnn_size: RNN layer size (number of RNN cells).
    :type rnn_size: int
    :return: A tuple of a layer applying softmax to the output of the final
             fully connected layer, and a ctc cost layer computed on that
             fully connected layer's output.
    :rtype: tuple of LayerOutput
    """
    # convolution group
    conv_out, conv_channels, conv_height = conv_group(
        input=audio_data, num_stacks=num_conv_layers)
    # convert data from convolution feature map to sequence of vectors
    sequence = paddle.layer.block_expand(
        input=conv_out,
        num_channels=conv_channels,
        stride_x=1,
        stride_y=1,
        block_x=1,
        block_y=conv_height)
    # rnn group
    rnn_out = rnn_group(
        input=sequence, size=rnn_size, num_stacks=num_rnn_layers)
    num_classes = dict_size + 1
    scores = paddle.layer.fc(
        input=rnn_out,
        size=num_classes,
        act=paddle.activation.Linear(),
        bias_attr=True)
    # probability distribution with softmax
    probabilities = paddle.layer.mixed(
        input=paddle.layer.identity_projection(input=scores),
        act=paddle.activation.Softmax())
    # ctc cost
    ctc_cost = paddle.layer.warp_ctc(
        input=scores,
        label=text_data,
        size=num_classes,
        blank=dict_size,
        norm_by_times=True)
    return probabilities, ctc_cost
|
@ -1,5 +1,5 @@
|
||||
wget==3.2
|
||||
scipy==0.13.1
|
||||
resampy==0.1.5
|
||||
https://github.com/kpu/kenlm/archive/master.zip
|
||||
SoundFile==0.9.0.post1
|
||||
python_speech_features
|
||||
https://github.com/luotao1/kenlm/archive/master.zip
|
||||
|
@ -0,0 +1,23 @@
|
||||
"""Test Setup."""
|
||||
import unittest
|
||||
import numpy as np
|
||||
import os
|
||||
|
||||
|
||||
class TestSetup(unittest.TestCase):
    """Checks that the environment the project relies on is usable."""

    def test_soundfile(self):
        """Write a small float WAV with soundfile and read it back."""
        # Local imports: only this test needs them.
        import shutil
        import tempfile

        import soundfile as sf

        # floating point data is typically limited to the interval [-1.0, 1.0],
        # but smaller/larger values are supported as well
        data = np.array([[1.75, -1.75], [1.0, -1.0], [0.5, -0.5],
                         [0.25, -0.25]])
        # Work in a private temporary directory so the test neither touches
        # the current directory nor leaves a file behind when it fails.
        work_dir = tempfile.mkdtemp()
        try:
            wav_path = os.path.join(work_dir, 'test.wav')
            sf.write(wav_path, data, 44100, format='WAV', subtype='FLOAT')
            read, fs = sf.read(wav_path)
            self.assertTrue(np.all(read == data))
            self.assertEqual(fs, 44100)
        finally:
            shutil.rmtree(work_dir)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
|
Loading…
Reference in new issue