1. Refine data_utils/data.py, reuse process_utterance function. 2. Modified README. 3. Implement uploading data in cloud/upload_data.py 4. Merge branch 'develop' of https://github.com/PaddlePaddle/models into ds2_pcloud

commit c00db21e69
@@ -1,61 +0,0 @@
"""
This tool is used for preparing data for DeepSpeech2 training on PaddleCloud.

Steps:
1. Read the original manifest and get the local paths of the sound files.
2. Tar all local sound files into one tar file.
3. Modify the original manifest to remove the local path information.

Finally, we will get a tar file and a manifest with sound file name, duration
and text.
"""
import json
import os
import tarfile
import sys
import argparse

sys.path.append('../')
from data_utils.utils import read_manifest

parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
    "--manifest_path",
    default="../datasets/manifest.train",
    type=str,
    help="Manifest of target data. (default: %(default)s)")
parser.add_argument(
    "--out_tar_path",
    default="./tmp/cloud.train.tar",
    type=str,
    help="Output tar file path. (default: %(default)s)")
parser.add_argument(
    "--out_manifest_path",
    default="./tmp/cloud.train.manifest",
    type=str,
    help="Manifest of output data. (default: %(default)s)")
args = parser.parse_args()


def gen_pcloud_data(manifest_path, out_tar_path, out_manifest_path):
    """
    1. According to the manifest, tar the sound files into out_tar_path.
    2. Generate a new manifest for the output tar file.
    """
    out_tar = tarfile.open(out_tar_path, 'w')
    manifest = read_manifest(manifest_path)
    results = []
    for json_data in manifest:
        sound_file = json_data['audio_filepath']
        filename = os.path.basename(sound_file)
        out_tar.add(sound_file, arcname=filename)
        json_data['audio_filepath'] = filename
        results.append("%s\n" % json.dumps(json_data))
    with open(out_manifest_path, 'w') as out_manifest:
        out_manifest.writelines(results)
    out_tar.close()


if __name__ == '__main__':
    gen_pcloud_data(args.manifest_path, args.out_tar_path,
                    args.out_manifest_path)
@@ -0,0 +1,147 @@
"""
This tool is used for preparing data for DeepSpeech2 training on PaddleCloud.

Steps:
1. Read the original manifest and get the local paths of the sound files.
2. Tar all local sound files into one tar file.
3. Modify the original manifest to remove the local path information.

Finally, we will get a tar file and a manifest with sound file name, duration
and text.
"""
import json
import os
import tarfile
import sys
import argparse
import shutil

sys.path.append('../')
from data_utils.utils import read_manifest
from subprocess import call

TRAIN_TAR = "cloud.train.tar"
TRAIN_MANIFEST = "cloud.train.manifest"
TEST_TAR = "cloud.test.tar"
TEST_MANIFEST = "cloud.test.manifest"
VOCAB_FILE = "vocab.txt"
MEAN_STD_FILE = "mean_std.npz"

parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
    "--train_manifest_path",
    default="../datasets/manifest.train",
    type=str,
    help="Manifest file of train data. (default: %(default)s)")
parser.add_argument(
    "--test_manifest_path",
    default="../datasets/manifest.test",
    type=str,
    help="Manifest file of test data. (default: %(default)s)")
parser.add_argument(
    "--vocab_file",
    default="../datasets/vocab/eng_vocab.txt",
    type=str,
    help="Vocab file to be uploaded to paddlecloud. (default: %(default)s)")
parser.add_argument(
    "--mean_std_file",
    default="../mean_std.npz",
    type=str,
    help="mean_std file to be uploaded to paddlecloud. (default: %(default)s)")
parser.add_argument(
    "--cloud_data_path",
    required=True,
    type=str,
    help="Destination path on paddlecloud.")
parser.add_argument(
    "--local_tmp_path",
    default="./tmp/",
    type=str,
    help="Local directory for storing temporary data. (default: %(default)s)")
args = parser.parse_args()


def pack_data(manifest_path, out_tar_path, out_manifest_path):
    """
    1. According to the manifest, tar the sound files into out_tar_path.
    2. Generate a new manifest for the output tar file.
    """
    out_tar = tarfile.open(out_tar_path, 'w')
    manifest = read_manifest(manifest_path)
    results = []
    for json_data in manifest:
        sound_file = json_data['audio_filepath']
        filename = os.path.basename(sound_file)
        out_tar.add(sound_file, arcname=filename)
        json_data['audio_filepath'] = filename
        results.append("%s\n" % json.dumps(json_data))
    with open(out_manifest_path, 'w') as out_manifest:
        out_manifest.writelines(results)
    out_tar.close()


if __name__ == '__main__':
    cloud_train_manifest = "%s/%s" % (args.cloud_data_path, TRAIN_MANIFEST)
    cloud_train_tar = "%s/%s" % (args.cloud_data_path, TRAIN_TAR)
    cloud_test_manifest = "%s/%s" % (args.cloud_data_path, TEST_MANIFEST)
    cloud_test_tar = "%s/%s" % (args.cloud_data_path, TEST_TAR)
    cloud_vocab_file = "%s/%s" % (args.cloud_data_path, VOCAB_FILE)
    cloud_mean_file = "%s/%s" % (args.cloud_data_path, MEAN_STD_FILE)

    local_train_manifest = "%s/%s" % (args.local_tmp_path, TRAIN_MANIFEST)
    local_train_tar = "%s/%s" % (args.local_tmp_path, TRAIN_TAR)
    local_test_manifest = "%s/%s" % (args.local_tmp_path, TEST_MANIFEST)
    local_test_tar = "%s/%s" % (args.local_tmp_path, TEST_TAR)

    if os.path.exists(args.local_tmp_path):
        shutil.rmtree(args.local_tmp_path)
    os.makedirs(args.local_tmp_path)

    ret = 1
    # train data
    if args.train_manifest_path != "":
        ret = call(['paddlecloud', 'ls', cloud_train_manifest])
        if ret != 0:
            print "%s doesn't exist" % cloud_train_manifest
            pack_data(args.train_manifest_path, local_train_tar,
                      local_train_manifest)
            call([
                'paddlecloud', 'cp', local_train_manifest, cloud_train_manifest
            ])
            call(['paddlecloud', 'cp', local_train_tar, cloud_train_tar])

    # test data
    if args.test_manifest_path != "":
        try:
            ret = call(['paddlecloud', 'ls', cloud_test_manifest])
        except Exception:
            ret = 1
        if ret != 0:
            pack_data(args.test_manifest_path, local_test_tar,
                      local_test_manifest)
            call(
                ['paddlecloud', 'cp', local_test_manifest, cloud_test_manifest])
            call(['paddlecloud', 'cp', local_test_tar, cloud_test_tar])

    # vocab file
    if args.vocab_file != "":
        try:
            ret = call(['paddlecloud', 'ls', cloud_vocab_file])
        except Exception:
            ret = 1
        if ret != 0:
            call(['paddlecloud', 'cp', args.vocab_file, cloud_vocab_file])

    # mean_std file
    if args.mean_std_file != "":
        try:
            ret = call(['paddlecloud', 'ls', cloud_mean_file])
        except Exception:
            ret = 1
        if ret != 0:
            call(['paddlecloud', 'cp', args.mean_std_file, cloud_mean_file])

    shutil.rmtree(args.local_tmp_path)
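As a sanity check, the effect of `pack_data` can be verified locally before uploading: every `audio_filepath` in the cloud manifest should now be a bare tar member name rather than a local path. A minimal sketch (not part of this diff; the paths are the tool's defaults):

```python
# Minimal sketch: verify that every manifest entry names a member of the
# packed tar file, i.e. local path information has been stripped.
import json
import tarfile

def verify_packed(tar_path, manifest_path):
    with tarfile.open(tar_path) as tar:
        members = set(tar.getnames())
    with open(manifest_path) as fin:
        for line in fin:
            entry = json.loads(line)
            assert entry['audio_filepath'] in members

verify_packed('./tmp/cloud.train.tar', './tmp/cloud.train.manifest')
```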
@@ -0,0 +1,8 @@
[
    {
        "type": "shift",
        "params": {"min_shift_ms": -5,
                   "max_shift_ms": 5},
        "prob": 1.0
    }
]
@@ -0,0 +1,39 @@
[
    {
        "type": "noise",
        "params": {"min_snr_dB": 40,
                   "max_snr_dB": 50,
                   "noise_manifest_path": "datasets/manifest.noise"},
        "prob": 0.6
    },
    {
        "type": "impulse",
        "params": {"impulse_manifest_path": "datasets/manifest.impulse"},
        "prob": 0.5
    },
    {
        "type": "speed",
        "params": {"min_speed_rate": 0.95,
                   "max_speed_rate": 1.05},
        "prob": 0.5
    },
    {
        "type": "shift",
        "params": {"min_shift_ms": -5,
                   "max_shift_ms": 5},
        "prob": 1.0
    },
    {
        "type": "volume",
        "params": {"min_gain_dBFS": -10,
                   "max_gain_dBFS": 10},
        "prob": 0.0
    },
    {
        "type": "bayesian_normal",
        "params": {"target_db": -20,
                   "prior_db": -20,
                   "prior_samples": 100},
        "prob": 0.0
    }
]
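Each entry in these configs names an augmentor type, the keyword arguments for its constructor, and the probability that it is applied to a given utterance. The pipeline class that consumes the config is not part of this diff, so the wiring below is hypothetical; it only illustrates the per-entry dispatch implied by the JSON shape:

```python
# Hypothetical sketch of consuming a config of this shape: each augmentor
# fires independently with its configured probability per utterance.
import json
import random

def apply_augmentations(config_str, audio_segment, augmentor_factory, seed=0):
    rng = random.Random(seed)
    for entry in json.loads(config_str):
        if rng.uniform(0., 1.) < entry["prob"]:
            augmentor = augmentor_factory(entry["type"], rng, entry["params"])
            augmentor.transform_audio(audio_segment)  # in-place
```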
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,35 @@
"""Contains the impulse response augmentation model."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from data_utils.augmentor.base import AugmentorBase
from data_utils import utils
from data_utils.audio import AudioSegment


class ImpulseResponseAugmentor(AugmentorBase):
    """Augmentation model for adding impulse response effect.

    :param rng: Random generator object.
    :type rng: random.Random
    :param impulse_manifest_path: Manifest path for impulse audio data.
    :type impulse_manifest_path: basestring
    """

    def __init__(self, rng, impulse_manifest_path):
        self._rng = rng
        self._impulse_manifest = utils.read_manifest(
            manifest_path=impulse_manifest_path)

    def transform_audio(self, audio_segment):
        """Add impulse response effect.

        Note that this is an in-place transformation.

        :param audio_segment: Audio segment to add effects to.
        :type audio_segment: AudioSegment|SpeechSegment
        """
        impulse_json = self._rng.sample(self._impulse_manifest, 1)[0]
        impulse_segment = AudioSegment.from_file(impulse_json['audio_filepath'])
        audio_segment.convolve(impulse_segment, allow_resample=True)
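`AudioSegment.convolve` itself is not shown in this diff. Conceptually, applying an impulse response is a convolution of the dry signal with the recorded room response; a numpy sketch of that operation, assuming both arguments are sample arrays at the same rate:

```python
# Conceptual sketch of impulse-response reverb: convolve the dry signal with
# the impulse response and trim back to the original length.
import numpy as np

def convolve_ir(samples, impulse_response):
    wet = np.convolve(samples, impulse_response)
    return wet[:len(samples)]
```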
Binary file not shown.
@@ -0,0 +1,50 @@
"""Contains the noise perturb augmentation model."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from data_utils.augmentor.base import AugmentorBase
from data_utils import utils
from data_utils.audio import AudioSegment


class NoisePerturbAugmentor(AugmentorBase):
    """Augmentation model for adding background noise.

    :param rng: Random generator object.
    :type rng: random.Random
    :param min_snr_dB: Minimal signal noise ratio, in decibels.
    :type min_snr_dB: float
    :param max_snr_dB: Maximal signal noise ratio, in decibels.
    :type max_snr_dB: float
    :param noise_manifest_path: Manifest path for noise audio data.
    :type noise_manifest_path: basestring
    """

    def __init__(self, rng, min_snr_dB, max_snr_dB, noise_manifest_path):
        self._min_snr_dB = min_snr_dB
        self._max_snr_dB = max_snr_dB
        self._rng = rng
        self._noise_manifest = utils.read_manifest(
            manifest_path=noise_manifest_path)

    def transform_audio(self, audio_segment):
        """Add background noise audio.

        Note that this is an in-place transformation.

        :param audio_segment: Audio segment to add effects to.
        :type audio_segment: AudioSegment|SpeechSegment
        """
        noise_json = self._rng.sample(self._noise_manifest, 1)[0]
        if noise_json['duration'] < audio_segment.duration:
            raise RuntimeError("The duration of sampled noise audio is smaller "
                               "than the audio segment to add effects to.")
        diff_duration = noise_json['duration'] - audio_segment.duration
        start = self._rng.uniform(0, diff_duration)
        end = start + audio_segment.duration
        noise_segment = AudioSegment.slice_from_file(
            noise_json['audio_filepath'], start=start, end=end)
        snr_dB = self._rng.uniform(self._min_snr_dB, self._max_snr_dB)
        audio_segment.add_noise(
            noise_segment, snr_dB, allow_downsampling=True, rng=self._rng)
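`AudioSegment.add_noise` is likewise not shown here; the underlying arithmetic scales the noise so the mix hits the sampled SNR. A sketch of just the gain computation, assuming RMS-based signal power:

```python
# Sketch of the SNR arithmetic behind noise mixing: solve
# snr_dB = 20 * log10(signal_rms / (gain * noise_rms)) for gain.
import numpy as np

def noise_gain(signal, noise, snr_dB):
    signal_rms = np.sqrt(np.mean(signal ** 2))
    noise_rms = np.sqrt(np.mean(noise ** 2))
    return signal_rms / (noise_rms * 10.0 ** (snr_dB / 20.0))
```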
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,128 @@
"""Prepare CHiME3 background data.

Download, unpack and create manifest files.
Manifest file is a json-format file with each line containing the
meta data (i.e. audio filepath, transcript and audio duration)
of each audio file in the data set.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import tarfile
import wget
import zipfile
import argparse
import soundfile
import json
from paddle.v2.dataset.common import md5file

DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')

URL = "https://d4s.myairbridge.com/packagev2/AG0Y3DNBE5IWRRTV/?dlid=W19XG7T0NNHB027139H0EQ"
MD5 = "c3ff512618d7a67d4f85566ea1bc39ec"

parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
    "--target_dir",
    default=DATA_HOME + "/chime3_background",
    type=str,
    help="Directory to save the dataset. (default: %(default)s)")
parser.add_argument(
    "--manifest_filepath",
    default="manifest.chime3.background",
    type=str,
    help="Filepath for output manifests. (default: %(default)s)")
args = parser.parse_args()


def download(url, md5sum, target_dir, filename=None):
    """Download file from url to target_dir, and check md5sum."""
    if filename is None:
        filename = url.split("/")[-1]
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)
    filepath = os.path.join(target_dir, filename)
    if not (os.path.exists(filepath) and md5file(filepath) == md5sum):
        print("Downloading %s ..." % url)
        wget.download(url, target_dir)
        print("\nMD5 Checksum %s ..." % filepath)
        if not md5file(filepath) == md5sum:
            raise RuntimeError("MD5 checksum failed.")
    else:
        print("File exists, skip downloading. (%s)" % filepath)
    return filepath


def unpack(filepath, target_dir):
    """Unpack the file to the target_dir."""
    print("Unpacking %s ..." % filepath)
    if filepath.endswith('.zip'):
        zip_file = zipfile.ZipFile(filepath, 'r')
        zip_file.extractall(target_dir)
        zip_file.close()
    elif filepath.endswith('.tar') or filepath.endswith('.tar.gz'):
        tar = tarfile.open(filepath)
        tar.extractall(target_dir)
        tar.close()
    else:
        raise ValueError("File format is not supported for unpacking.")


def create_manifest(data_dir, manifest_path):
    """Create a manifest json file summarizing the data set, with each line
    containing the meta data (i.e. audio filepath, transcription text, audio
    duration) of each audio file within the data set.
    """
    print("Creating manifest %s ..." % manifest_path)
    json_lines = []
    for subfolder, _, filelist in sorted(os.walk(data_dir)):
        for filename in filelist:
            if filename.endswith('.wav'):
                filepath = os.path.join(subfolder, filename)
                audio_data, samplerate = soundfile.read(filepath)
                duration = float(len(audio_data)) / samplerate
                json_lines.append(
                    json.dumps({
                        'audio_filepath': filepath,
                        'duration': duration,
                        'text': ''
                    }))
    with open(manifest_path, 'w') as out_file:
        for line in json_lines:
            out_file.write(line + '\n')


def prepare_chime3(url, md5sum, target_dir, manifest_path):
    """Download, unpack and create summary manifest file."""
    if not os.path.exists(os.path.join(target_dir, "CHiME3")):
        # download
        filepath = download(url, md5sum, target_dir,
                            "myairbridge-AG0Y3DNBE5IWRRTV.zip")
        # unpack
        unpack(filepath, target_dir)
        unpack(
            os.path.join(target_dir, 'CHiME3_background_bus.zip'), target_dir)
        unpack(
            os.path.join(target_dir, 'CHiME3_background_caf.zip'), target_dir)
        unpack(
            os.path.join(target_dir, 'CHiME3_background_ped.zip'), target_dir)
        unpack(
            os.path.join(target_dir, 'CHiME3_background_str.zip'), target_dir)
    else:
        print("Skip downloading and unpacking. Data already exists in %s." %
              target_dir)
    # create manifest json file
    create_manifest(target_dir, manifest_path)


def main():
    prepare_chime3(
        url=URL,
        md5sum=MD5,
        target_dir=args.target_dir,
        manifest_path=args.manifest_filepath)


if __name__ == '__main__':
    main()
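The manifest this script writes is one JSON object per line, with an empty transcript for background noise. For example, reading it back:

```python
# Each manifest line is a standalone JSON object; 'text' is empty for noise.
import json

with open('manifest.chime3.background') as fin:
    for line in fin:
        meta = json.loads(line)
        print(meta['audio_filepath'], meta['duration'])
```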
@@ -0,0 +1,10 @@
cd noise
python chime3_background.py
if [ $? -ne 0 ]; then
    echo "Prepare CHiME3 background noise failed. Terminated."
    exit 1
fi
cd -

cat noise/manifest.* > manifest.noise
echo "All done."
@@ -0,0 +1,94 @@
"""Client-end for the ASR demo."""
from pynput import keyboard
import struct
import socket
import sys
import argparse
import pyaudio

parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
    "--host_ip",
    default="localhost",
    type=str,
    help="Server IP address. (default: %(default)s)")
parser.add_argument(
    "--host_port",
    default=8086,
    type=int,
    help="Server Port. (default: %(default)s)")
args = parser.parse_args()

is_recording = False
enable_trigger_record = True


def on_press(key):
    """On-press keyboard callback function."""
    global is_recording, enable_trigger_record
    if key == keyboard.Key.space:
        if (not is_recording) and enable_trigger_record:
            sys.stdout.write("Start Recording ... ")
            sys.stdout.flush()
            is_recording = True


def on_release(key):
    """On-release keyboard callback function."""
    global is_recording, enable_trigger_record
    if key == keyboard.Key.esc:
        return False
    elif key == keyboard.Key.space:
        if is_recording:
            is_recording = False


data_list = []


def callback(in_data, frame_count, time_info, status):
    """Audio recorder's stream callback function."""
    global data_list, is_recording, enable_trigger_record
    if is_recording:
        data_list.append(in_data)
        enable_trigger_record = False
    elif len(data_list) > 0:
        # Connect to server and send data
        sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        sock.connect((args.host_ip, args.host_port))
        sent = ''.join(data_list)
        sock.sendall(struct.pack('>i', len(sent)) + sent)
        print('Speech[length=%d] Sent.' % len(sent))
        # Receive data from the server and shut down
        received = sock.recv(1024)
        print("Recognition Results: {}".format(received))
        sock.close()
        data_list = []
        enable_trigger_record = True
    return (in_data, pyaudio.paContinue)


def main():
    # prepare audio recorder
    p = pyaudio.PyAudio()
    stream = p.open(
        format=pyaudio.paInt32,
        channels=1,
        rate=16000,
        input=True,
        stream_callback=callback)
    stream.start_stream()

    # prepare keyboard listener
    with keyboard.Listener(
            on_press=on_press, on_release=on_release) as listener:
        listener.join()

    # close up
    stream.stop_stream()
    stream.close()
    p.terminate()


if __name__ == "__main__":
    main()
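The wire format between client and server is simply a 4-byte big-endian length prefix followed by the raw sample bytes (mono, 16 kHz, 32-bit in this demo), with the transcript sent back on the same connection. A sketch of a file-based client that exercises the server without a microphone (the helper name is hypothetical, not part of this diff):

```python
# Hypothetical file-based client for the demo protocol: 4-byte big-endian
# length prefix, then raw PCM bytes; the transcript comes back as the reply.
import socket
import struct

def send_pcm(host, port, pcm_bytes):
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    sock.connect((host, port))
    sock.sendall(struct.pack('>i', len(pcm_bytes)) + pcm_bytes)
    transcript = sock.recv(1024)
    sock.close()
    return transcript
```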
@@ -0,0 +1,245 @@
"""Server-end for the ASR demo."""
import os
import time
import random
import argparse
import distutils.util
from time import gmtime, strftime
import SocketServer
import struct
import wave
import paddle.v2 as paddle
from utils import print_arguments
from data_utils.data import DataGenerator
from model import DeepSpeech2Model
from data_utils.utils import read_manifest

parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
    "--host_ip",
    default="localhost",
    type=str,
    help="Server IP address. (default: %(default)s)")
parser.add_argument(
    "--host_port",
    default=8086,
    type=int,
    help="Server Port. (default: %(default)s)")
parser.add_argument(
    "--speech_save_dir",
    default="demo_cache",
    type=str,
    help="Directory for saving demo speech. (default: %(default)s)")
parser.add_argument(
    "--vocab_filepath",
    default='datasets/vocab/eng_vocab.txt',
    type=str,
    help="Vocabulary filepath. (default: %(default)s)")
parser.add_argument(
    "--mean_std_filepath",
    default='mean_std.npz',
    type=str,
    help="Manifest path for normalizer. (default: %(default)s)")
parser.add_argument(
    "--warmup_manifest_path",
    default='datasets/manifest.test',
    type=str,
    help="Manifest path for warmup test. (default: %(default)s)")
parser.add_argument(
    "--specgram_type",
    default='linear',
    type=str,
    help="Feature type of audio data: 'linear' (power spectrum)"
    " or 'mfcc'. (default: %(default)s)")
parser.add_argument(
    "--num_conv_layers",
    default=2,
    type=int,
    help="Convolution layer number. (default: %(default)s)")
parser.add_argument(
    "--num_rnn_layers",
    default=3,
    type=int,
    help="RNN layer number. (default: %(default)s)")
parser.add_argument(
    "--rnn_layer_size",
    default=512,
    type=int,
    help="RNN layer cell number. (default: %(default)s)")
parser.add_argument(
    "--use_gpu",
    default=True,
    type=distutils.util.strtobool,
    help="Use gpu or not. (default: %(default)s)")
parser.add_argument(
    "--model_filepath",
    default='checkpoints/params.latest.tar.gz',
    type=str,
    help="Model filepath. (default: %(default)s)")
parser.add_argument(
    "--decode_method",
    default='beam_search',
    type=str,
    help="Method for ctc decoding: best_path or beam_search. "
    "(default: %(default)s)")
parser.add_argument(
    "--beam_size",
    default=100,
    type=int,
    help="Width for beam search decoding. (default: %(default)d)")
parser.add_argument(
    "--language_model_path",
    default="lm/data/common_crawl_00.prune01111.trie.klm",
    type=str,
    help="Path for language model. (default: %(default)s)")
parser.add_argument(
    "--alpha",
    default=0.36,
    type=float,
    help="Parameter associated with language model. (default: %(default)f)")
parser.add_argument(
    "--beta",
    default=0.25,
    type=float,
    help="Parameter associated with word count. (default: %(default)f)")
parser.add_argument(
    "--cutoff_prob",
    default=0.99,
    type=float,
    help="The cutoff probability of pruning "
    "in beam search. (default: %(default)f)")
args = parser.parse_args()


class AsrTCPServer(SocketServer.TCPServer):
    """The ASR TCP Server."""

    def __init__(self,
                 server_address,
                 RequestHandlerClass,
                 speech_save_dir,
                 audio_process_handler,
                 bind_and_activate=True):
        self.speech_save_dir = speech_save_dir
        self.audio_process_handler = audio_process_handler
        SocketServer.TCPServer.__init__(
            self, server_address, RequestHandlerClass, bind_and_activate)


class AsrRequestHandler(SocketServer.BaseRequestHandler):
    """The ASR request handler."""

    def handle(self):
        # receive data through TCP socket
        chunk = self.request.recv(1024)
        target_len = struct.unpack('>i', chunk[:4])[0]
        data = chunk[4:]
        while len(data) < target_len:
            chunk = self.request.recv(1024)
            data += chunk
        # write to file
        filename = self._write_to_file(data)

        print("Received utterance[length=%d] from %s, saved to %s." %
              (len(data), self.client_address[0], filename))
        start_time = time.time()
        transcript = self.server.audio_process_handler(filename)
        finish_time = time.time()
        print("Response Time: %f, Transcript: %s" %
              (finish_time - start_time, transcript))
        self.request.sendall(transcript)

    def _write_to_file(self, data):
        # prepare save dir and filename
        if not os.path.exists(self.server.speech_save_dir):
            os.mkdir(self.server.speech_save_dir)
        timestamp = strftime("%Y%m%d%H%M%S", gmtime())
        out_filename = os.path.join(
            self.server.speech_save_dir,
            timestamp + "_" + self.client_address[0] + ".wav")
        # write to wav file
        file = wave.open(out_filename, 'wb')
        file.setnchannels(1)
        file.setsampwidth(4)
        file.setframerate(16000)
        file.writeframes(data)
        file.close()
        return out_filename


def warm_up_test(audio_process_handler,
                 manifest_path,
                 num_test_cases,
                 random_seed=0):
    """Warming-up test."""
    manifest = read_manifest(manifest_path)
    rng = random.Random(random_seed)
    samples = rng.sample(manifest, num_test_cases)
    for idx, sample in enumerate(samples):
        print("Warm-up Test Case %d: %s" % (idx, sample['audio_filepath']))
        start_time = time.time()
        transcript = audio_process_handler(sample['audio_filepath'])
        finish_time = time.time()
        print("Response Time: %f, Transcript: %s" %
              (finish_time - start_time, transcript))


def start_server():
    """Start the ASR server."""
    # prepare data generator
    data_generator = DataGenerator(
        vocab_filepath=args.vocab_filepath,
        mean_std_filepath=args.mean_std_filepath,
        augmentation_config='{}',
        specgram_type=args.specgram_type,
        num_threads=1)
    # prepare ASR model
    ds2_model = DeepSpeech2Model(
        vocab_size=data_generator.vocab_size,
        num_conv_layers=args.num_conv_layers,
        num_rnn_layers=args.num_rnn_layers,
        rnn_layer_size=args.rnn_layer_size,
        pretrained_model_path=args.model_filepath)

    # prepare ASR inference handler
    def file_to_transcript(filename):
        feature = data_generator.process_utterance(filename, "")
        result_transcript = ds2_model.infer_batch(
            infer_data=[feature],
            decode_method=args.decode_method,
            beam_alpha=args.alpha,
            beam_beta=args.beta,
            beam_size=args.beam_size,
            cutoff_prob=args.cutoff_prob,
            vocab_list=data_generator.vocab_list,
            language_model_path=args.language_model_path,
            num_processes=1)
        return result_transcript[0]

    # warm up with utterances sampled from LibriSpeech
    print('-----------------------------------------------------------')
    print('Warming up ...')
    warm_up_test(
        audio_process_handler=file_to_transcript,
        manifest_path=args.warmup_manifest_path,
        num_test_cases=3)
    print('-----------------------------------------------------------')

    # start the server
    server = AsrTCPServer(
        server_address=(args.host_ip, args.host_port),
        RequestHandlerClass=AsrRequestHandler,
        speech_save_dir=args.speech_save_dir,
        audio_process_handler=file_to_transcript)
    print("ASR Server Started.")
    server.serve_forever()


def main():
    print_arguments(args)
    paddle.init(use_gpu=args.use_gpu, trainer_count=1)
    start_server()


if __name__ == "__main__":
    main()
@@ -0,0 +1,177 @@
"""Contains DeepSpeech2 layers."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import paddle.v2 as paddle


def conv_bn_layer(input, filter_size, num_channels_in, num_channels_out, stride,
                  padding, act):
    """Convolution layer with batch normalization.

    :param input: Input layer.
    :type input: LayerOutput
    :param filter_size: The x dimension of a filter kernel. Or input a tuple for
                        two image dimension.
    :type filter_size: int|tuple|list
    :param num_channels_in: Number of input channels.
    :type num_channels_in: int
    :param num_channels_out: Number of output channels.
    :type num_channels_out: int
    :param stride: The x dimension of the stride. Or input a tuple for two
                   image dimension.
    :type stride: int|tuple|list
    :param padding: The x dimension of the padding. Or input a tuple for two
                    image dimension.
    :type padding: int|tuple|list
    :param act: Activation type.
    :type act: BaseActivation
    :return: Batch norm layer after convolution layer.
    :rtype: LayerOutput
    """
    conv_layer = paddle.layer.img_conv(
        input=input,
        filter_size=filter_size,
        num_channels=num_channels_in,
        num_filters=num_channels_out,
        stride=stride,
        padding=padding,
        act=paddle.activation.Linear(),
        bias_attr=False)
    return paddle.layer.batch_norm(input=conv_layer, act=act)


def bidirectional_simple_rnn_bn_layer(name, input, size, act):
    """Bidirectional simple rnn layer with sequence-wise batch normalization.
    The batch normalization is only performed on input-state weights.

    :param name: Name of the layer.
    :type name: string
    :param input: Input layer.
    :type input: LayerOutput
    :param size: Number of RNN cells.
    :type size: int
    :param act: Activation type.
    :type act: BaseActivation
    :return: Bidirectional simple rnn layer.
    :rtype: LayerOutput
    """
    # input-hidden weights shared across bi-directional rnn.
    input_proj = paddle.layer.fc(
        input=input, size=size, act=paddle.activation.Linear(), bias_attr=False)
    # batch norm is only performed on input-state projection
    input_proj_bn = paddle.layer.batch_norm(
        input=input_proj, act=paddle.activation.Linear())
    # forward and backward in time
    forward_simple_rnn = paddle.layer.recurrent(
        input=input_proj_bn, act=act, reverse=False)
    backward_simple_rnn = paddle.layer.recurrent(
        input=input_proj_bn, act=act, reverse=True)
    return paddle.layer.concat(input=[forward_simple_rnn, backward_simple_rnn])


def conv_group(input, num_stacks):
    """Convolution group with stacked convolution layers.

    :param input: Input layer.
    :type input: LayerOutput
    :param num_stacks: Number of stacked convolution layers.
    :type num_stacks: int
    :return: Output layer of the convolution group.
    :rtype: LayerOutput
    """
    conv = conv_bn_layer(
        input=input,
        filter_size=(11, 41),
        num_channels_in=1,
        num_channels_out=32,
        stride=(3, 2),
        padding=(5, 20),
        act=paddle.activation.BRelu())
    for i in xrange(num_stacks - 1):
        conv = conv_bn_layer(
            input=conv,
            filter_size=(11, 21),
            num_channels_in=32,
            num_channels_out=32,
            stride=(1, 2),
            padding=(5, 10),
            act=paddle.activation.BRelu())
    output_num_channels = 32
    output_height = 160 // pow(2, num_stacks) + 1
    return conv, output_num_channels, output_height


def rnn_group(input, size, num_stacks):
    """RNN group with stacked bidirectional simple RNN layers.

    :param input: Input layer.
    :type input: LayerOutput
    :param size: Number of RNN cells in each layer.
    :type size: int
    :param num_stacks: Number of stacked rnn layers.
    :type num_stacks: int
    :return: Output layer of the RNN group.
    :rtype: LayerOutput
    """
    output = input
    for i in xrange(num_stacks):
        output = bidirectional_simple_rnn_bn_layer(
            name=str(i), input=output, size=size, act=paddle.activation.BRelu())
    return output


def deep_speech2(audio_data,
                 text_data,
                 dict_size,
                 num_conv_layers=2,
                 num_rnn_layers=3,
                 rnn_size=256):
    """
    The whole DeepSpeech2 model structure (a simplified version).

    :param audio_data: Audio spectrogram data layer.
    :type audio_data: LayerOutput
    :param text_data: Transcription text data layer.
    :type text_data: LayerOutput
    :param dict_size: Dictionary size for tokenized transcription.
    :type dict_size: int
    :param num_conv_layers: Number of stacking convolution layers.
    :type num_conv_layers: int
    :param num_rnn_layers: Number of stacking RNN layers.
    :type num_rnn_layers: int
    :param rnn_size: RNN layer size (number of RNN cells).
    :type rnn_size: int
    :return: A tuple of an output unnormalized log probability layer (
             before softmax) and a ctc cost layer.
    :rtype: tuple of LayerOutput
    """
    # convolution group
    conv_group_output, conv_group_num_channels, conv_group_height = conv_group(
        input=audio_data, num_stacks=num_conv_layers)
    # convert data from convolution feature map to sequence of vectors
    conv2seq = paddle.layer.block_expand(
        input=conv_group_output,
        num_channels=conv_group_num_channels,
        stride_x=1,
        stride_y=1,
        block_x=1,
        block_y=conv_group_height)
    # rnn group
    rnn_group_output = rnn_group(
        input=conv2seq, size=rnn_size, num_stacks=num_rnn_layers)
    fc = paddle.layer.fc(
        input=rnn_group_output,
        size=dict_size + 1,
        act=paddle.activation.Linear(),
        bias_attr=True)
    # probability distribution with softmax
    log_probs = paddle.layer.mixed(
        input=paddle.layer.identity_projection(input=fc),
        act=paddle.activation.Softmax())
    # ctc cost
    ctc_loss = paddle.layer.warp_ctc(
        input=fc,
        label=text_data,
        size=dict_size + 1,
        blank=dict_size,
        norm_by_times=True)
    return log_probs, ctc_loss
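A quick numeric check of `conv_group`'s output height arithmetic: the first convolution halves the frequency dimension (stride 2 with padding) and each extra stack halves it again, which is what `160 // pow(2, num_stacks) + 1` encodes, assuming a 161-bin spectrogram input (the hard-coded 160 suggests 160 + 1 bins):

```python
# Sanity check of the conv_group output height formula: each stacked conv
# halves the frequency dimension (stride 2), starting from a 161-bin input.
for num_stacks in (1, 2, 3):
    print(num_stacks, 160 // pow(2, num_stacks) + 1)  # -> 81, 41, 21
```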
@@ -1,35 +0,0 @@
DATA_PATH=$1
MODEL_PATH=$2
# set by user
TRAIN_MANI=${DATA_PATH}/cloud.train.manifest
# set by user
DEV_MANI=${DATA_PATH}/cloud.test.manifest
# set by user
TRAIN_TAR=${DATA_PATH}/cloud.train.tar
# set by user
DEV_TAR=${DATA_PATH}/cloud.test.tar
# set by user
VOCAB_PATH=${DATA_PATH}/eng_vocab.txt
# set by user
MEAN_STD_FILE=${DATA_PATH}/mean_std.npz

# split train data for each pcloud node
python ./cloud/split_data.py \
--in_manifest_path=$TRAIN_MANI \
--data_tar_path=$TRAIN_TAR \
--out_manifest_path='./local.train.manifest'

# split dev data for each pcloud node
python ./cloud/split_data.py \
--in_manifest_path=$DEV_MANI \
--data_tar_path=$DEV_TAR \
--out_manifest_path='./local.test.manifest'

python train.py \
--use_gpu=1 \
--trainer_count=4 \
--batch_size=256 \
--mean_std_filepath=$MEAN_STD_FILE \
--train_manifest_path='./local.train.manifest' \
--dev_manifest_path='./local.test.manifest' \
--vocab_filepath=$VOCAB_PATH \
@@ -1,5 +1,5 @@
wget==3.2
scipy==0.13.1
resampy==0.1.5
https://github.com/kpu/kenlm/archive/master.zip
SoundFile==0.9.0.post1
python_speech_features
https://github.com/luotao1/kenlm/archive/master.zip
@@ -0,0 +1,23 @@
"""Test Setup."""
import unittest
import numpy as np
import os


class TestSetup(unittest.TestCase):
    def test_soundfile(self):
        import soundfile as sf
        # floating point data is typically limited to the interval [-1.0, 1.0],
        # but smaller/larger values are supported as well
        data = np.array([[1.75, -1.75], [1.0, -1.0], [0.5, -0.5],
                         [0.25, -0.25]])
        file = 'test.wav'
        sf.write(file, data, 44100, format='WAV', subtype='FLOAT')
        read, fs = sf.read(file)
        self.assertTrue(np.all(read == data))
        self.assertEqual(fs, 44100)
        os.remove(file)


if __name__ == '__main__':
    unittest.main()
@@ -0,0 +1,19 @@
"""Set up paths for DS2"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os.path
import sys


def add_path(path):
    if path not in sys.path:
        sys.path.insert(0, path)


this_dir = os.path.dirname(__file__)

# Add project path to PYTHONPATH
proj_path = os.path.join(this_dir, '..')
add_path(proj_path)
@@ -0,0 +1,59 @@
"""Build vocabulary from manifest files.

Each item in vocabulary file is a character.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import argparse
import codecs
import json
from collections import Counter
import os.path
import _init_paths
from data_utils import utils

parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
    "--manifest_paths",
    type=str,
    help="Manifest paths for building vocabulary. "
    "You can provide multiple manifest files.",
    nargs='+',
    required=True)
parser.add_argument(
    "--count_threshold",
    default=0,
    type=int,
    help="Characters whose counts are below the threshold will be truncated. "
    "(default: %(default)i)")
parser.add_argument(
    "--vocab_path",
    default='datasets/vocab/zh_vocab.txt',
    type=str,
    help="File path to write the vocabulary. (default: %(default)s)")
args = parser.parse_args()


def count_manifest(counter, manifest_path):
    manifest_jsons = utils.read_manifest(manifest_path)
    for line_json in manifest_jsons:
        for char in line_json['text']:
            counter.update(char)


def main():
    counter = Counter()
    for manifest_path in args.manifest_paths:
        count_manifest(counter, manifest_path)

    count_sorted = sorted(counter.items(), key=lambda x: x[1], reverse=True)
    with codecs.open(args.vocab_path, 'w', 'utf-8') as fout:
        for char, count in count_sorted:
            if count < args.count_threshold:
                break
            fout.write(char + '\n')


if __name__ == '__main__':
    main()
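`Counter.update` with a one-character string increments that character's count, so `count_manifest` is a per-character tally; sorting descending and cutting at the threshold yields the vocabulary. A toy run of the same logic:

```python
# Toy illustration of the counting and sorting used in build_vocab.
from collections import Counter

counter = Counter()
for char in u"the cat sat":
    counter.update(char)
print(sorted(counter.items(), key=lambda x: x[1], reverse=True)[:3])
# e.g. [(u't', 3), (u'a', 2), (u' ', 2)]
```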