From c7676286ab99d3b30d1c79e44ee30e20d65bd302 Mon Sep 17 00:00:00 2001
From: Luo Tao
Date: Wed, 26 Jul 2017 16:25:11 +0800
Subject: [PATCH 01/30] install libsndfile from /usr to thirdparty

---
 .gitignore | 1 +
 README.md | 3 ---
 requirements.txt | 3 ++-
 setup.sh | 13 +++++--------
 4 files changed, 8 insertions(+), 12 deletions(-)
 create mode 100644 .gitignore

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 000000000..0e0f559f1
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+thirdparty
diff --git a/README.md b/README.md
index 3010c0e53..22d0c5386 100644
--- a/README.md
+++ b/README.md
@@ -8,9 +8,6 @@ Please replace `$PADDLE_INSTALL_DIR` with your own paddle installation directory
 sh setup.sh
 export LD_LIBRARY_PATH=$PADDLE_INSTALL_DIR/Paddle/third_party/install/warpctc/lib:$LD_LIBRARY_PATH
 ```
-
-For some machines, we also need to install libsndfile1. Details to be added.
-
 ## Usage
 
 ### Preparing Data
diff --git a/requirements.txt b/requirements.txt
index 721fa2811..3f73ea8b8 100755
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,6 @@
 wget==3.2
 scipy==0.13.1
 resampy==0.1.5
-https://github.com/kpu/kenlm/archive/master.zip
+SoundFile==0.9.0.post1
 python_speech_features
+https://github.com/kpu/kenlm/archive/master.zip
diff --git a/setup.sh b/setup.sh
index 8cba91ecd..854f879e9 100644
--- a/setup.sh
+++ b/setup.sh
@@ -9,7 +9,9 @@ if [ $? != 0 ]; then
     exit 1
 fi
 
-# install package Soundfile
+# install package libsndfile
+DIR="$( cd "$(dirname "$0")" ; pwd -P )"
+mkdir thirdparty
 curl -O "http://www.mega-nerd.com/libsndfile/files/libsndfile-1.0.28.tar.gz"
 if [ $? != 0 ]; then
     echo "Download libsndfile-1.0.28.tar.gz failed !!!"
@@ -17,15 +19,10 @@ if [ $? != 0 ]; then
 fi
 tar -zxvf libsndfile-1.0.28.tar.gz
 cd libsndfile-1.0.28
-./configure && make && make install
-cd -
+./configure --prefix=$DIR/thirdparty/libsndfile && make && make install
+cd ..
 rm -rf libsndfile-1.0.28
 rm libsndfile-1.0.28.tar.gz
 
-pip install SoundFile==0.9.0.post1
-if [ $? != 0 ]; then
-    echo "Install SoundFile failed !!!"
-    exit 1
-fi
 # prepare ./checkpoints
 mkdir checkpoints

From 7d7984b8fb67854c58d1106dc8038a864b56d7a1 Mon Sep 17 00:00:00 2001
From: Luo Tao
Date: Thu, 27 Jul 2017 15:40:08 +0800
Subject: [PATCH 02/30] add test for libsndfile installation

---
 tests/test_setup.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)
 create mode 100644 tests/test_setup.py

diff --git a/tests/test_setup.py b/tests/test_setup.py
new file mode 100644
index 000000000..bd6fabb0a
--- /dev/null
+++ b/tests/test_setup.py
@@ -0,0 +1,12 @@
+"""Test Setup."""
+import unittest
+
+
+class TestSetup(unittest.TestCase):
+    # test the installation of libsndfile library
+    def test_soundfile(self):
+        import soundfile
+
+
+if __name__ == '__main__':
+    unittest.main()

From b72b70e54c2b5a0e80a48926e437a51dc44a9256 Mon Sep 17 00:00:00 2001
From: Luo Tao
Date: Fri, 28 Jul 2017 16:19:44 +0800
Subject: [PATCH 03/30] add soundfile read/write unitest

---
 .gitignore | 1 -
 setup.sh | 24 +++++++++++++-----------
 tests/test_setup.py | 15 +++++++++++++--
 3 files changed, 26 insertions(+), 14 deletions(-)
 delete mode 100644 .gitignore

diff --git a/.gitignore b/.gitignore
deleted file mode 100644
index 0e0f559f1..000000000
--- a/.gitignore
+++ /dev/null
@@ -1 +0,0 @@
-thirdparty
diff --git a/setup.sh b/setup.sh
index 854f879e9..4d451a6f1 100644
--- a/setup.sh
+++ b/setup.sh
@@ -10,19 +10,21 @@ if [ $?
!= 0 ]; then fi # install package libsndfile -DIR="$( cd "$(dirname "$0")" ; pwd -P )" -mkdir thirdparty -curl -O "http://www.mega-nerd.com/libsndfile/files/libsndfile-1.0.28.tar.gz" +python -c "import soundfile" if [ $? != 0 ]; then - echo "Download libsndfile-1.0.28.tar.gz failed !!!" - exit 1 + echo "Install package libsndfile into default system path." + curl -O "http://www.mega-nerd.com/libsndfile/files/libsndfile-1.0.28.tar.gz" + if [ $? != 0 ]; then + echo "Download libsndfile-1.0.28.tar.gz failed !!!" + exit 1 + fi + tar -zxvf libsndfile-1.0.28.tar.gz + cd libsndfile-1.0.28 + ./configure && make && make install + cd .. + rm -rf libsndfile-1.0.28 + rm libsndfile-1.0.28.tar.gz fi -tar -zxvf libsndfile-1.0.28.tar.gz -cd libsndfile-1.0.28 -./configure --prefix=$DIR/thirdparty/libsndfile && make && make install -cd .. -rm -rf libsndfile-1.0.28 -rm libsndfile-1.0.28.tar.gz # prepare ./checkpoints mkdir checkpoints diff --git a/tests/test_setup.py b/tests/test_setup.py index bd6fabb0a..71a46afb7 100644 --- a/tests/test_setup.py +++ b/tests/test_setup.py @@ -1,11 +1,22 @@ """Test Setup.""" import unittest +import numpy as np +import os class TestSetup(unittest.TestCase): - # test the installation of libsndfile library def test_soundfile(self): - import soundfile + import soundfile as sf + # floating point data is typically limited to the interval [-1.0, 1.0], + # but smaller/larger values are supported as well + data = np.array([[1.75, -1.75], [1.0, -1.0], [0.5, -0.5], + [0.25, -0.25]]) + file = 'test.wav' + sf.write(file, data, 44100, format='WAV', subtype='FLOAT') + read, fs = sf.read(file) + assert np.all(read == data) + assert fs == 44100 + os.remove(file) if __name__ == '__main__': From de212572ed8e9d0167e643a52f5556ff6b97dba1 Mon Sep 17 00:00:00 2001 From: Luo Tao Date: Fri, 28 Jul 2017 18:10:18 +0800 Subject: [PATCH 04/30] update unittest with comments --- tests/test_setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_setup.py b/tests/test_setup.py index 71a46afb7..18b9c1a0c 100644 --- a/tests/test_setup.py +++ b/tests/test_setup.py @@ -14,8 +14,8 @@ class TestSetup(unittest.TestCase): file = 'test.wav' sf.write(file, data, 44100, format='WAV', subtype='FLOAT') read, fs = sf.read(file) - assert np.all(read == data) - assert fs == 44100 + self.assertTrue(np.all(read == data)) + self.assertEqual(fs, 44100) os.remove(file) From 92eacf548bf5ca278a2ad741dd9c901ca6d23a8f Mon Sep 17 00:00:00 2001 From: Xinghai Sun Date: Mon, 31 Jul 2017 21:57:07 +0800 Subject: [PATCH 05/30] Update default config params and result display for evaluator.py and infer.py for DS2. --- evaluate.py | 26 ++++++++++++++++++-------- infer.py | 9 +++++++-- 2 files changed, 25 insertions(+), 10 deletions(-) diff --git a/evaluate.py b/evaluate.py index 19eabf4e5..1d758687b 100644 --- a/evaluate.py +++ b/evaluate.py @@ -4,6 +4,7 @@ from __future__ import division from __future__ import print_function import distutils.util +import sys import argparse import gzip import paddle.v2 as paddle @@ -12,13 +13,19 @@ from model import deep_speech2 from decoder import * from lm.lm_scorer import LmScorer from error_rate import wer +import utils parser = argparse.ArgumentParser(description=__doc__) parser.add_argument( "--batch_size", - default=100, + default=128, type=int, help="Minibatch size for evaluation. (default: %(default)s)") +parser.add_argument( + "--trainer_count", + default=8, + type=int, + help="Trainer number. 
(default: %(default)s)") parser.add_argument( "--num_conv_layers", default=2, @@ -58,8 +65,8 @@ parser.add_argument( "--decode_method", default='beam_search', type=str, - help="Method for ctc decoding, best_path or beam_search. (default: %(default)s)" -) + help="Method for ctc decoding, best_path or beam_search. " + "(default: %(default)s)") parser.add_argument( "--language_model_path", default="lm/data/common_crawl_00.prune01111.trie.klm", @@ -67,12 +74,12 @@ parser.add_argument( help="Path for language model. (default: %(default)s)") parser.add_argument( "--alpha", - default=0.26, + default=0.36, type=float, help="Parameter associated with language model. (default: %(default)f)") parser.add_argument( "--beta", - default=0.1, + default=0.25, type=float, help="Parameter associated with word count. (default: %(default)f)") parser.add_argument( @@ -191,7 +198,7 @@ def evaluate(): blank_id=len(data_generator.vocab_list), num_processes=args.num_processes_beam_search, ext_scoring_func=ext_scorer, - cutoff_prob=args.cutoff_prob, ) + cutoff_prob=args.cutoff_prob) for i, beam_search_result in enumerate(beam_search_results): wer_sum += wer(target_transcription[i], beam_search_result[0][1]) @@ -199,12 +206,15 @@ def evaluate(): else: raise ValueError("Decoding method [%s] is not supported." % decode_method) + print("WER (%d/?) = %f" % (wer_counter, wer_sum / wer_counter)) - print("Final WER = %f" % (wer_sum / wer_counter)) + print("Final WER (%d/%d) = %f" % (wer_counter, wer_counter, + wer_sum / wer_counter)) def main(): - paddle.init(use_gpu=args.use_gpu, trainer_count=1) + utils.print_arguments(args) + paddle.init(use_gpu=args.use_gpu, trainer_count=args.trainer_count) evaluate() diff --git a/infer.py b/infer.py index 817526302..ad3fdc4d7 100644 --- a/infer.py +++ b/infer.py @@ -57,6 +57,11 @@ parser.add_argument( type=str, help="Feature type of audio data: 'linear' (power spectrum)" " or 'mfcc'. (default: %(default)s)") +parser.add_argument( + "--trainer_count", + default=8, + type=int, + help="Trainer number. (default: %(default)s)") parser.add_argument( "--mean_std_filepath", default='mean_std.npz', @@ -208,7 +213,7 @@ def infer(): wer_cur = wer(target_transcription[i], beam_search_result[0][1]) wer_sum += wer_cur wer_counter += 1 - print("cur wer = %f , average wer = %f" % + print("Current WER = %f , Average WER = %f" % (wer_cur, wer_sum / wer_counter)) else: raise ValueError("Decoding method [%s] is not supported." % @@ -217,7 +222,7 @@ def infer(): def main(): utils.print_arguments(args) - paddle.init(use_gpu=args.use_gpu, trainer_count=1) + paddle.init(use_gpu=args.use_gpu, trainer_count=args.trainer_count) infer() From e2f954f5e258957ef0cd8145da9c36d94a543a28 Mon Sep 17 00:00:00 2001 From: Luo Tao Date: Tue, 1 Aug 2017 15:27:41 +0800 Subject: [PATCH 06/30] make kenlm install more robust --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 3f73ea8b8..eb6022599 100755 --- a/requirements.txt +++ b/requirements.txt @@ -3,4 +3,4 @@ scipy==0.13.1 resampy==0.1.5 SoundFile==0.9.0.post1 python_speech_features -https://github.com/kpu/kenlm/archive/master.zip +https://github.com/luotao1/kenlm/archive/master.zip From 8122dd9c2999ac451e5a02e22f67d1ba09bfb51c Mon Sep 17 00:00:00 2001 From: Xinghai Sun Date: Tue, 1 Aug 2017 16:21:46 +0800 Subject: [PATCH 07/30] Simplify train.py, evaluate.py, infer.py and tune.py by adding DeepSpeech2Model class. 
--- evaluate.py | 107 ++++++--------------- infer.py | 106 +++++---------------- layer.py | 155 ++++++++++++++++++++++++++++++ model.py | 265 +++++++++++++++++++++++++++------------------------- train.py | 121 +++++++----------------- tune.py | 102 ++++++++------------ 6 files changed, 415 insertions(+), 441 deletions(-) create mode 100644 layer.py diff --git a/evaluate.py b/evaluate.py index 1d758687b..fb7211fc2 100644 --- a/evaluate.py +++ b/evaluate.py @@ -4,14 +4,11 @@ from __future__ import division from __future__ import print_function import distutils.util -import sys import argparse -import gzip +import multiprocessing import paddle.v2 as paddle from data_utils.data import DataGenerator -from model import deep_speech2 -from decoder import * -from lm.lm_scorer import LmScorer +from model import DeepSpeech2Model from error_rate import wer import utils @@ -119,37 +116,12 @@ args = parser.parse_args() def evaluate(): """Evaluate on whole test data for DeepSpeech2.""" - # initialize data generator data_generator = DataGenerator( vocab_filepath=args.vocab_filepath, mean_std_filepath=args.mean_std_filepath, augmentation_config='{}', specgram_type=args.specgram_type, num_threads=args.num_threads_data) - - # create network config - # paddle.data_type.dense_array is used for variable batch input. - # The size 161 * 161 is only an placeholder value and the real shape - # of input batch data will be induced during training. - audio_data = paddle.layer.data( - name="audio_spectrogram", type=paddle.data_type.dense_array(161 * 161)) - text_data = paddle.layer.data( - name="transcript_text", - type=paddle.data_type.integer_value_sequence(data_generator.vocab_size)) - output_probs = deep_speech2( - audio_data=audio_data, - text_data=text_data, - dict_size=data_generator.vocab_size, - num_conv_layers=args.num_conv_layers, - num_rnn_layers=args.num_rnn_layers, - rnn_size=args.rnn_layer_size, - is_inference=True) - - # load parameters - parameters = paddle.parameters.Parameters.from_tar( - gzip.open(args.model_filepath)) - - # prepare infer data batch_reader = data_generator.batch_reader_creator( manifest_path=args.decode_manifest_path, batch_size=args.batch_size, @@ -157,59 +129,34 @@ def evaluate(): sortagrad=False, shuffle_method=None) - # define inferer - inferer = paddle.inference.Inference( - output_layer=output_probs, parameters=parameters) - - # initialize external scorer for beam search decoding - if args.decode_method == 'beam_search': - ext_scorer = LmScorer(args.alpha, args.beta, args.language_model_path) + ds2_model = DeepSpeech2Model( + vocab_size=data_generator.vocab_size, + num_conv_layers=args.num_conv_layers, + num_rnn_layers=args.num_rnn_layers, + rnn_layer_size=args.rnn_layer_size, + pretrained_model_path=args.model_filepath) - wer_counter, wer_sum = 0, 0.0 + wer_sum, num_ins = 0.0, 0 for infer_data in batch_reader(): - # run inference - infer_results = inferer.infer(input=infer_data) - num_steps = len(infer_results) // len(infer_data) - probs_split = [ - infer_results[i * num_steps:(i + 1) * num_steps] - for i in xrange(0, len(infer_data)) + result_transcripts = ds2_model.infer_batch( + infer_data=infer_data, + decode_method=args.decode_method, + beam_alpha=args.alpha, + beam_beta=args.beta, + beam_size=args.beam_size, + cutoff_prob=args.cutoff_prob, + vocab_list=data_generator.vocab_list, + language_model_path=args.language_model_path, + num_processes=args.num_processes_beam_search) + target_transcripts = [ + ''.join([data_generator.vocab_list[token] for token in transcript]) + 
for _, transcript in infer_data ] - # target transcription - target_transcription = [ - ''.join([ - data_generator.vocab_list[index] for index in infer_data[i][1] - ]) for i, probs in enumerate(probs_split) - ] - # decode and print - # best path decode - if args.decode_method == "best_path": - for i, probs in enumerate(probs_split): - output_transcription = ctc_best_path_decoder( - probs_seq=probs, vocabulary=data_generator.vocab_list) - wer_sum += wer(target_transcription[i], output_transcription) - wer_counter += 1 - # beam search decode - elif args.decode_method == "beam_search": - # beam search using multiple processes - beam_search_results = ctc_beam_search_decoder_batch( - probs_split=probs_split, - vocabulary=data_generator.vocab_list, - beam_size=args.beam_size, - blank_id=len(data_generator.vocab_list), - num_processes=args.num_processes_beam_search, - ext_scoring_func=ext_scorer, - cutoff_prob=args.cutoff_prob) - for i, beam_search_result in enumerate(beam_search_results): - wer_sum += wer(target_transcription[i], - beam_search_result[0][1]) - wer_counter += 1 - else: - raise ValueError("Decoding method [%s] is not supported." % - decode_method) - print("WER (%d/?) = %f" % (wer_counter, wer_sum / wer_counter)) - - print("Final WER (%d/%d) = %f" % (wer_counter, wer_counter, - wer_sum / wer_counter)) + for target, result in zip(target_transcripts, result_transcripts): + wer_sum += wer(target, result) + num_ins += 1 + print("WER (%d/?) = %f" % (num_ins, wer_sum / num_ins)) + print("Final WER (%d/%d) = %f" % (num_ins, num_ins, wer_sum / num_ins)) def main(): diff --git a/infer.py b/infer.py index ad3fdc4d7..ec65cc748 100644 --- a/infer.py +++ b/infer.py @@ -4,14 +4,11 @@ from __future__ import division from __future__ import print_function import argparse -import gzip import distutils.util import multiprocessing import paddle.v2 as paddle from data_utils.data import DataGenerator -from model import deep_speech2 -from decoder import * -from lm.lm_scorer import LmScorer +from model import DeepSpeech2Model from error_rate import wer import utils @@ -124,37 +121,12 @@ args = parser.parse_args() def infer(): """Inference for DeepSpeech2.""" - # initialize data generator data_generator = DataGenerator( vocab_filepath=args.vocab_filepath, mean_std_filepath=args.mean_std_filepath, augmentation_config='{}', specgram_type=args.specgram_type, num_threads=args.num_threads_data) - - # create network config - # paddle.data_type.dense_array is used for variable batch input. - # The size 161 * 161 is only an placeholder value and the real shape - # of input batch data will be induced during training. 
- audio_data = paddle.layer.data( - name="audio_spectrogram", type=paddle.data_type.dense_array(161 * 161)) - text_data = paddle.layer.data( - name="transcript_text", - type=paddle.data_type.integer_value_sequence(data_generator.vocab_size)) - output_probs = deep_speech2( - audio_data=audio_data, - text_data=text_data, - dict_size=data_generator.vocab_size, - num_conv_layers=args.num_conv_layers, - num_rnn_layers=args.num_rnn_layers, - rnn_size=args.rnn_layer_size, - is_inference=True) - - # load parameters - parameters = paddle.parameters.Parameters.from_tar( - gzip.open(args.model_filepath)) - - # prepare infer data batch_reader = data_generator.batch_reader_creator( manifest_path=args.decode_manifest_path, batch_size=args.num_samples, @@ -163,61 +135,31 @@ def infer(): shuffle_method=None) infer_data = batch_reader().next() - # run inference - infer_results = paddle.infer( - output_layer=output_probs, parameters=parameters, input=infer_data) - num_steps = len(infer_results) // len(infer_data) - probs_split = [ - infer_results[i * num_steps:(i + 1) * num_steps] - for i in xrange(len(infer_data)) - ] + ds2_model = DeepSpeech2Model( + vocab_size=data_generator.vocab_size, + num_conv_layers=args.num_conv_layers, + num_rnn_layers=args.num_rnn_layers, + rnn_layer_size=args.rnn_layer_size, + pretrained_model_path=args.model_filepath) + result_transcripts = ds2_model.infer_batch( + infer_data=infer_data, + decode_method=args.decode_method, + beam_alpha=args.alpha, + beam_beta=args.beta, + beam_size=args.beam_size, + cutoff_prob=args.cutoff_prob, + vocab_list=data_generator.vocab_list, + language_model_path=args.language_model_path, + num_processes=args.num_processes_beam_search) - # targe transcription - target_transcription = [ - ''.join( - [data_generator.vocab_list[index] for index in infer_data[i][1]]) - for i, probs in enumerate(probs_split) + target_transcripts = [ + ''.join([data_generator.vocab_list[token] for token in transcript]) + for _, transcript in infer_data ] - - ## decode and print - # best path decode - wer_sum, wer_counter = 0, 0 - if args.decode_method == "best_path": - for i, probs in enumerate(probs_split): - best_path_transcription = ctc_best_path_decoder( - probs_seq=probs, vocabulary=data_generator.vocab_list) - print("\nTarget Transcription: %s\nOutput Transcription: %s" % - (target_transcription[i], best_path_transcription)) - wer_cur = wer(target_transcription[i], best_path_transcription) - wer_sum += wer_cur - wer_counter += 1 - print("cur wer = %f, average wer = %f" % - (wer_cur, wer_sum / wer_counter)) - # beam search decode - elif args.decode_method == "beam_search": - ext_scorer = LmScorer(args.alpha, args.beta, args.language_model_path) - beam_search_batch_results = ctc_beam_search_decoder_batch( - probs_split=probs_split, - vocabulary=data_generator.vocab_list, - beam_size=args.beam_size, - blank_id=len(data_generator.vocab_list), - num_processes=args.num_processes_beam_search, - cutoff_prob=args.cutoff_prob, - ext_scoring_func=ext_scorer, ) - for i, beam_search_result in enumerate(beam_search_batch_results): - print("\nTarget Transcription:\t%s" % target_transcription[i]) - for index in xrange(args.num_results_per_sample): - result = beam_search_result[index] - #output: index, log prob, beam result - print("Beam %d: %f \t%s" % (index, result[0], result[1])) - wer_cur = wer(target_transcription[i], beam_search_result[0][1]) - wer_sum += wer_cur - wer_counter += 1 - print("Current WER = %f , Average WER = %f" % - (wer_cur, wer_sum / wer_counter)) - else: - 
raise ValueError("Decoding method [%s] is not supported." % - decode_method) + for target, result in zip(target_transcripts, result_transcripts): + print("\nTarget Transcription: %s\nOutput Transcription: %s" % + (target, result)) + print("Current wer = %f" % wer(target, result)) def main(): diff --git a/layer.py b/layer.py new file mode 100644 index 000000000..7b0273389 --- /dev/null +++ b/layer.py @@ -0,0 +1,155 @@ +"""Contains DeepSpeech2 layers.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle.v2 as paddle + +DISABLE_CUDNN_BATCH_NORM = True + + +def conv_bn_layer(input, filter_size, num_channels_in, num_channels_out, stride, + padding, act): + """ + Convolution layer with batch normalization. + """ + conv_layer = paddle.layer.img_conv( + input=input, + filter_size=filter_size, + num_channels=num_channels_in, + num_filters=num_channels_out, + stride=stride, + padding=padding, + act=paddle.activation.Linear(), + bias_attr=False) + if DISABLE_CUDNN_BATCH_NORM: + # temopary patch, need to be removed. + return paddle.layer.batch_norm( + input=conv_layer, act=act, batch_norm_type="batch_norm") + else: + return paddle.layer.batch_norm(input=conv_layer, act=act) + + +def bidirectional_simple_rnn_bn_layer(name, input, size, act): + """ + Bidirectonal simple rnn layer with sequence-wise batch normalization. + The batch normalization is only performed on input-state weights. + """ + # input-hidden weights shared across bi-direcitonal rnn. + input_proj = paddle.layer.fc( + input=input, size=size, act=paddle.activation.Linear(), bias_attr=False) + # batch norm is only performed on input-state projection + if DISABLE_CUDNN_BATCH_NORM: + # temopary patch, need to be removed. + input_proj_bn = paddle.layer.batch_norm( + input=input_proj, + act=paddle.activation.Linear(), + batch_norm_type="batch_norm") + else: + input_proj_bn = paddle.layer.batch_norm( + input=input_proj, act=paddle.activation.Linear()) + # forward and backward in time + forward_simple_rnn = paddle.layer.recurrent( + input=input_proj_bn, act=act, reverse=False) + backward_simple_rnn = paddle.layer.recurrent( + input=input_proj_bn, act=act, reverse=True) + return paddle.layer.concat(input=[forward_simple_rnn, backward_simple_rnn]) + + +def conv_group(input, num_stacks): + """ + Convolution group with several stacking convolution layers. + """ + conv = conv_bn_layer( + input=input, + filter_size=(11, 41), + num_channels_in=1, + num_channels_out=32, + stride=(3, 2), + padding=(5, 20), + act=paddle.activation.BRelu()) + for i in xrange(num_stacks - 1): + conv = conv_bn_layer( + input=conv, + filter_size=(11, 21), + num_channels_in=32, + num_channels_out=32, + stride=(1, 2), + padding=(5, 10), + act=paddle.activation.BRelu()) + output_num_channels = 32 + output_height = 160 // pow(2, num_stacks) + 1 + return conv, output_num_channels, output_height + + +def rnn_group(input, size, num_stacks): + """ + RNN group with several stacking RNN layers. + """ + output = input + for i in xrange(num_stacks): + output = bidirectional_simple_rnn_bn_layer( + name=str(i), input=output, size=size, act=paddle.activation.BRelu()) + return output + + +def deep_speech2(audio_data, + text_data, + dict_size, + num_conv_layers=2, + num_rnn_layers=3, + rnn_size=256): + """ + The whole DeepSpeech2 model structure (a simplified version). + + :param audio_data: Audio spectrogram data layer. + :type audio_data: LayerOutput + :param text_data: Transcription text data layer. 
+ :type text_data: LayerOutput + :param dict_size: Dictionary size for tokenized transcription. + :type dict_size: int + :param num_conv_layers: Number of stacking convolution layers. + :type num_conv_layers: int + :param num_rnn_layers: Number of stacking RNN layers. + :type num_rnn_layers: int + :param rnn_size: RNN layer size (number of RNN cells). + :type rnn_size: int + :param is_inference: False in the training mode, and True in the + inferene mode. + :type is_inference: bool + :return: If is_inference set False, return a ctc cost layer; + if is_inference set True, return a sequence layer of output + probability distribution. + :rtype: tuple of LayerOutput + """ + # convolution group + conv_group_output, conv_group_num_channels, conv_group_height = conv_group( + input=audio_data, num_stacks=num_conv_layers) + # convert data form convolution feature map to sequence of vectors + conv2seq = paddle.layer.block_expand( + input=conv_group_output, + num_channels=conv_group_num_channels, + stride_x=1, + stride_y=1, + block_x=1, + block_y=conv_group_height) + # rnn group + rnn_group_output = rnn_group( + input=conv2seq, size=rnn_size, num_stacks=num_rnn_layers) + fc = paddle.layer.fc( + input=rnn_group_output, + size=dict_size + 1, + act=paddle.activation.Linear(), + bias_attr=True) + # probability distribution with softmax + log_probs = paddle.layer.mixed( + input=paddle.layer.identity_projection(input=fc), + act=paddle.activation.Softmax()) + # ctc cost + ctc_loss = paddle.layer.warp_ctc( + input=fc, + label=text_data, + size=dict_size + 1, + blank=dict_size, + norm_by_times=True) + return log_probs, ctc_loss diff --git a/model.py b/model.py index cb0b4ecbb..d1efabb75 100644 --- a/model.py +++ b/model.py @@ -3,141 +3,150 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import sys +import os +import time +import gzip +from decoder import * +from lm.lm_scorer import LmScorer import paddle.v2 as paddle +from layer import * -def conv_bn_layer(input, filter_size, num_channels_in, num_channels_out, stride, - padding, act): - """ - Convolution layer with batch normalization. - """ - conv_layer = paddle.layer.img_conv( - input=input, - filter_size=filter_size, - num_channels=num_channels_in, - num_filters=num_channels_out, - stride=stride, - padding=padding, - act=paddle.activation.Linear(), - bias_attr=False) - return paddle.layer.batch_norm(input=conv_layer, act=act) +class DeepSpeech2Model(object): + def __init__(self, vocab_size, num_conv_layers, num_rnn_layers, + rnn_layer_size, pretrained_model_path): + self._create_network(vocab_size, num_conv_layers, num_rnn_layers, + rnn_layer_size) + self._create_parameters(pretrained_model_path) + self._inferer = None + self._ext_scorer = None + def train(self, + train_batch_reader, + dev_batch_reader, + feeding_dict, + learning_rate, + gradient_clipping, + num_passes, + num_iterations_print=100, + output_model_dir='checkpoints'): + # prepare optimizer and trainer + optimizer = paddle.optimizer.Adam( + learning_rate=learning_rate, + gradient_clipping_threshold=gradient_clipping) + trainer = paddle.trainer.SGD( + cost=self._loss, + parameters=self._parameters, + update_equation=optimizer) -def bidirectional_simple_rnn_bn_layer(name, input, size, act): - """ - Bidirectonal simple rnn layer with sequence-wise batch normalization. - The batch normalization is only performed on input-state weights. - """ - # input-hidden weights shared across bi-direcitonal rnn. 
- input_proj = paddle.layer.fc( - input=input, size=size, act=paddle.activation.Linear(), bias_attr=False) - # batch norm is only performed on input-state projection - input_proj_bn = paddle.layer.batch_norm( - input=input_proj, act=paddle.activation.Linear()) - # forward and backward in time - forward_simple_rnn = paddle.layer.recurrent( - input=input_proj_bn, act=act, reverse=False) - backward_simple_rnn = paddle.layer.recurrent( - input=input_proj_bn, act=act, reverse=True) - return paddle.layer.concat(input=[forward_simple_rnn, backward_simple_rnn]) + # create event handler + def event_handler(event): + global start_time, cost_sum, cost_counter + if isinstance(event, paddle.event.EndIteration): + cost_sum += event.cost + cost_counter += 1 + if (event.batch_id + 1) % num_iterations_print == 0: + output_model_path = os.path.join(output_model_dir, + "params.latest.tar.gz") + with gzip.open(output_model_path, 'w') as f: + self._parameters.to_tar(f) + print("\nPass: %d, Batch: %d, TrainCost: %f" % + (event.pass_id, event.batch_id + 1, + cost_sum / cost_counter)) + cost_sum, cost_counter = 0.0, 0 + else: + sys.stdout.write('.') + sys.stdout.flush() + if isinstance(event, paddle.event.BeginPass): + start_time = time.time() + cost_sum, cost_counter = 0.0, 0 + if isinstance(event, paddle.event.EndPass): + result = trainer.test( + reader=dev_batch_reader, feeding=feeding_dict) + output_model_path = os.path.join( + output_model_dir, "params.pass-%d.tar.gz" % event.pass_id) + with gzip.open(output_model_path, 'w') as f: + self._parameters.to_tar(f) + print("\n------- Time: %d sec, Pass: %d, ValidationCost: %s" % + (time.time() - start_time, event.pass_id, result.cost)) + # run train + trainer.train( + reader=train_batch_reader, + event_handler=event_handler, + num_passes=num_passes, + feeding=feeding_dict) -def conv_group(input, num_stacks): - """ - Convolution group with several stacking convolution layers. 
- """ - conv = conv_bn_layer( - input=input, - filter_size=(11, 41), - num_channels_in=1, - num_channels_out=32, - stride=(3, 2), - padding=(5, 20), - act=paddle.activation.BRelu()) - for i in xrange(num_stacks - 1): - conv = conv_bn_layer( - input=conv, - filter_size=(11, 21), - num_channels_in=32, - num_channels_out=32, - stride=(1, 2), - padding=(5, 10), - act=paddle.activation.BRelu()) - output_num_channels = 32 - output_height = 160 // pow(2, num_stacks) + 1 - return conv, output_num_channels, output_height + def infer_batch(self, infer_data, decode_method, beam_alpha, beam_beta, + beam_size, cutoff_prob, vocab_list, language_model_path, + num_processes): + # define inferer + if self._inferer == None: + self._inferer = paddle.inference.Inference( + output_layer=self._log_probs, parameters=self._parameters) + # run inference + infer_results = self._inferer.infer(input=infer_data) + num_steps = len(infer_results) // len(infer_data) + probs_split = [ + infer_results[i * num_steps:(i + 1) * num_steps] + for i in xrange(0, len(infer_data)) + ] + # run decoder + results = [] + if decode_method == "best_path": + # best path decode + for i, probs in enumerate(probs_split): + output_transcription = ctc_best_path_decoder( + probs_seq=probs, vocabulary=data_generator.vocab_list) + results.append(output_transcription) + elif decode_method == "beam_search": + # initialize external scorer + if self._ext_scorer == None: + self._ext_scorer = LmScorer(beam_alpha, beam_beta, + language_model_path) + self._loaded_lm_path = language_model_path + else: + self._ext_scorer.reset_params(beam_alpha, beam_beta) + assert self._loaded_lm_path == language_model_path + # beam search decode + beam_search_results = ctc_beam_search_decoder_batch( + probs_split=probs_split, + vocabulary=vocab_list, + beam_size=beam_size, + blank_id=len(vocab_list), + num_processes=num_processes, + ext_scoring_func=self._ext_scorer, + cutoff_prob=cutoff_prob) + results = [result[0][1] for result in beam_search_results] + else: + raise ValueError("Decoding method [%s] is not supported." % + decode_method) + return results -def rnn_group(input, size, num_stacks): - """ - RNN group with several stacking RNN layers. - """ - output = input - for i in xrange(num_stacks): - output = bidirectional_simple_rnn_bn_layer( - name=str(i), input=output, size=size, act=paddle.activation.BRelu()) - return output + def _create_parameters(self, model_path=None): + if model_path is None: + self._parameters = paddle.parameters.create(self._loss) + else: + self._parameters = paddle.parameters.Parameters.from_tar( + gzip.open(model_path)) - -def deep_speech2(audio_data, - text_data, - dict_size, - num_conv_layers=2, - num_rnn_layers=3, - rnn_size=256, - is_inference=False): - """ - The whole DeepSpeech2 model structure (a simplified version). - - :param audio_data: Audio spectrogram data layer. - :type audio_data: LayerOutput - :param text_data: Transcription text data layer. - :type text_data: LayerOutput - :param dict_size: Dictionary size for tokenized transcription. - :type dict_size: int - :param num_conv_layers: Number of stacking convolution layers. - :type num_conv_layers: int - :param num_rnn_layers: Number of stacking RNN layers. - :type num_rnn_layers: int - :param rnn_size: RNN layer size (number of RNN cells). - :type rnn_size: int - :param is_inference: False in the training mode, and True in the - inferene mode. 
- :type is_inference: bool - :return: If is_inference set False, return a ctc cost layer; - if is_inference set True, return a sequence layer of output - probability distribution. - :rtype: tuple of LayerOutput - """ - # convolution group - conv_group_output, conv_group_num_channels, conv_group_height = conv_group( - input=audio_data, num_stacks=num_conv_layers) - # convert data form convolution feature map to sequence of vectors - conv2seq = paddle.layer.block_expand( - input=conv_group_output, - num_channels=conv_group_num_channels, - stride_x=1, - stride_y=1, - block_x=1, - block_y=conv_group_height) - # rnn group - rnn_group_output = rnn_group( - input=conv2seq, size=rnn_size, num_stacks=num_rnn_layers) - fc = paddle.layer.fc( - input=rnn_group_output, - size=dict_size + 1, - act=paddle.activation.Linear(), - bias_attr=True) - if is_inference: - # probability distribution with softmax - return paddle.layer.mixed( - input=paddle.layer.identity_projection(input=fc), - act=paddle.activation.Softmax()) - else: - # ctc cost - return paddle.layer.warp_ctc( - input=fc, - label=text_data, - size=dict_size + 1, - blank=dict_size, - norm_by_times=True) + def _create_network(self, vocab_size, num_conv_layers, num_rnn_layers, + rnn_layer_size): + # paddle.data_type.dense_array is used for variable batch input. + # The size 161 * 161 is only an placeholder value and the real shape + # of input batch data will be induced during training. + audio_data = paddle.layer.data( + name="audio_spectrogram", + type=paddle.data_type.dense_array(161 * 161)) + text_data = paddle.layer.data( + name="transcript_text", + type=paddle.data_type.integer_value_sequence(vocab_size)) + self._log_probs, self._loss = deep_speech2( + audio_data=audio_data, + text_data=text_data, + dict_size=vocab_size, + num_conv_layers=num_conv_layers, + num_rnn_layers=num_rnn_layers, + rnn_size=rnn_layer_size) diff --git a/train.py b/train.py index 6481074c6..45f7a6d9d 100644 --- a/train.py +++ b/train.py @@ -3,15 +3,11 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -import sys -import os import argparse -import gzip -import time import distutils.util import multiprocessing import paddle.v2 as paddle -from model import deep_speech2 +from model import DeepSpeech2Model from data_utils.data import DataGenerator import utils @@ -23,6 +19,12 @@ parser.add_argument( default=200, type=int, help="Training pass number. (default: %(default)s)") +parser.add_argument( + "--num_iterations_print", + default=100, + type=int, + help="Number of iterations for every train cost printing. " + "(default: %(default)s)") parser.add_argument( "--num_conv_layers", default=2, @@ -127,100 +129,47 @@ args = parser.parse_args() def train(): """DeepSpeech2 training.""" - - # initialize data generator - def data_generator(): - return DataGenerator( - vocab_filepath=args.vocab_filepath, - mean_std_filepath=args.mean_std_filepath, - augmentation_config=args.augmentation_config, - max_duration=args.max_duration, - min_duration=args.min_duration, - specgram_type=args.specgram_type, - num_threads=args.num_threads_data) - - train_generator = data_generator() - test_generator = data_generator() - - # create network config - # paddle.data_type.dense_array is used for variable batch input. - # The size 161 * 161 is only an placeholder value and the real shape - # of input batch data will be induced during training. 
- audio_data = paddle.layer.data( - name="audio_spectrogram", type=paddle.data_type.dense_array(161 * 161)) - text_data = paddle.layer.data( - name="transcript_text", - type=paddle.data_type.integer_value_sequence( - train_generator.vocab_size)) - cost = deep_speech2( - audio_data=audio_data, - text_data=text_data, - dict_size=train_generator.vocab_size, - num_conv_layers=args.num_conv_layers, - num_rnn_layers=args.num_rnn_layers, - rnn_size=args.rnn_layer_size, - is_inference=False) - - # create/load parameters and optimizer - if args.init_model_path is None: - parameters = paddle.parameters.create(cost) - else: - if not os.path.isfile(args.init_model_path): - raise IOError("Invalid model!") - parameters = paddle.parameters.Parameters.from_tar( - gzip.open(args.init_model_path)) - optimizer = paddle.optimizer.Adam( - learning_rate=args.adam_learning_rate, gradient_clipping_threshold=400) - trainer = paddle.trainer.SGD( - cost=cost, parameters=parameters, update_equation=optimizer) - - # prepare data reader + train_generator = DataGenerator( + vocab_filepath=args.vocab_filepath, + mean_std_filepath=args.mean_std_filepath, + augmentation_config=args.augmentation_config, + max_duration=args.max_duration, + min_duration=args.min_duration, + specgram_type=args.specgram_type, + num_threads=args.num_threads_data) + dev_generator = DataGenerator( + vocab_filepath=args.vocab_filepath, + mean_std_filepath=args.mean_std_filepath, + augmentation_config="{}", + specgram_type=args.specgram_type, + num_threads=args.num_threads_data) train_batch_reader = train_generator.batch_reader_creator( manifest_path=args.train_manifest_path, batch_size=args.batch_size, min_batch_size=args.trainer_count, sortagrad=args.use_sortagrad if args.init_model_path is None else False, shuffle_method=args.shuffle_method) - test_batch_reader = test_generator.batch_reader_creator( + dev_batch_reader = dev_generator.batch_reader_creator( manifest_path=args.dev_manifest_path, batch_size=args.batch_size, min_batch_size=1, # must be 1, but will have errors. 
sortagrad=False, shuffle_method=None) - # create event handler - def event_handler(event): - global start_time, cost_sum, cost_counter - if isinstance(event, paddle.event.EndIteration): - cost_sum += event.cost - cost_counter += 1 - if (event.batch_id + 1) % 100 == 0: - print("\nPass: %d, Batch: %d, TrainCost: %f" % ( - event.pass_id, event.batch_id + 1, cost_sum / cost_counter)) - cost_sum, cost_counter = 0.0, 0 - with gzip.open("checkpoints/params.latest.tar.gz", 'w') as f: - parameters.to_tar(f) - else: - sys.stdout.write('.') - sys.stdout.flush() - if isinstance(event, paddle.event.BeginPass): - start_time = time.time() - cost_sum, cost_counter = 0.0, 0 - if isinstance(event, paddle.event.EndPass): - result = trainer.test( - reader=test_batch_reader, feeding=test_generator.feeding) - print("\n------- Time: %d sec, Pass: %d, ValidationCost: %s" % - (time.time() - start_time, event.pass_id, result.cost)) - with gzip.open("checkpoints/params.pass-%d.tar.gz" % event.pass_id, - 'w') as f: - parameters.to_tar(f) - - # run train - trainer.train( - reader=train_batch_reader, - event_handler=event_handler, + ds2_model = DeepSpeech2Model( + vocab_size=train_generator.vocab_size, + num_conv_layers=args.num_conv_layers, + num_rnn_layers=args.num_rnn_layers, + rnn_layer_size=args.rnn_layer_size, + pretrained_model_path=args.init_model_path) + ds2_model.train( + train_batch_reader=train_batch_reader, + dev_batch_reader=dev_batch_reader, + feeding_dict=train_generator.feeding, + learning_rate=args.adam_learning_rate, + gradient_clipping=400, num_passes=args.num_passes, - feeding=train_generator.feeding) + num_iterations_print=args.num_iterations_print) def main(): diff --git a/tune.py b/tune.py index 2fcca4862..f414622e3 100644 --- a/tune.py +++ b/tune.py @@ -3,14 +3,13 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import numpy as np import distutils.util import argparse -import gzip +import multiprocessing import paddle.v2 as paddle from data_utils.data import DataGenerator -from model import deep_speech2 -from decoder import * -from lm.lm_scorer import LmScorer +from model import DeepSpeech2Model from error_rate import wer import utils @@ -40,6 +39,11 @@ parser.add_argument( default=True, type=distutils.util.strtobool, help="Use gpu or not. (default: %(default)s)") +parser.add_argument( + "--trainer_count", + default=8, + type=int, + help="Trainer number. (default: %(default)s)") parser.add_argument( "--num_threads_data", default=multiprocessing.cpu_count(), @@ -62,10 +66,10 @@ parser.add_argument( type=str, help="Manifest path for normalizer. (default: %(default)s)") parser.add_argument( - "--decode_manifest_path", + "--tune_manifest_path", default='datasets/manifest.test', type=str, - help="Manifest path for decoding. (default: %(default)s)") + help="Manifest path for tuning. 
(default: %(default)s)") parser.add_argument( "--model_filepath", default='checkpoints/params.latest.tar.gz', @@ -127,96 +131,64 @@ args = parser.parse_args() def tune(): """Tune parameters alpha and beta on one minibatch.""" - if not args.num_alphas >= 0: raise ValueError("num_alphas must be non-negative!") - if not args.num_betas >= 0: raise ValueError("num_betas must be non-negative!") - # initialize data generator data_generator = DataGenerator( vocab_filepath=args.vocab_filepath, mean_std_filepath=args.mean_std_filepath, augmentation_config='{}', specgram_type=args.specgram_type, num_threads=args.num_threads_data) - - # create network config - # paddle.data_type.dense_array is used for variable batch input. - # The size 161 * 161 is only an placeholder value and the real shape - # of input batch data will be induced during training. - audio_data = paddle.layer.data( - name="audio_spectrogram", type=paddle.data_type.dense_array(161 * 161)) - text_data = paddle.layer.data( - name="transcript_text", - type=paddle.data_type.integer_value_sequence(data_generator.vocab_size)) - output_probs = deep_speech2( - audio_data=audio_data, - text_data=text_data, - dict_size=data_generator.vocab_size, - num_conv_layers=args.num_conv_layers, - num_rnn_layers=args.num_rnn_layers, - rnn_size=args.rnn_layer_size, - is_inference=True) - - # load parameters - parameters = paddle.parameters.Parameters.from_tar( - gzip.open(args.model_filepath)) - - # prepare infer data batch_reader = data_generator.batch_reader_creator( - manifest_path=args.decode_manifest_path, + manifest_path=args.tune_manifest_path, batch_size=args.num_samples, sortagrad=False, shuffle_method=None) - # get one batch data for tuning - infer_data = batch_reader().next() - - # run inference - infer_results = paddle.infer( - output_layer=output_probs, parameters=parameters, input=infer_data) - num_steps = len(infer_results) // len(infer_data) - probs_split = [ - infer_results[i * num_steps:(i + 1) * num_steps] - for i in xrange(0, len(infer_data)) + tune_data = batch_reader().next() + target_transcripts = [ + ''.join([data_generator.vocab_list[token] for token in transcript]) + for _, transcript in tune_data ] + ds2_model = DeepSpeech2Model( + vocab_size=data_generator.vocab_size, + num_conv_layers=args.num_conv_layers, + num_rnn_layers=args.num_rnn_layers, + rnn_layer_size=args.rnn_layer_size, + pretrained_model_path=args.model_filepath) + # create grid for search cand_alphas = np.linspace(args.alpha_from, args.alpha_to, args.num_alphas) cand_betas = np.linspace(args.beta_from, args.beta_to, args.num_betas) params_grid = [(alpha, beta) for alpha in cand_alphas for beta in cand_betas] - ext_scorer = LmScorer(args.alpha_from, args.beta_from, - args.language_model_path) ## tune parameters in loop for alpha, beta in params_grid: - wer_sum, wer_counter = 0, 0 - # reset scorer - ext_scorer.reset_params(alpha, beta) - # beam search using multiple processes - beam_search_results = ctc_beam_search_decoder_batch( - probs_split=probs_split, - vocabulary=data_generator.vocab_list, + result_transcripts = ds2_model.infer_batch( + infer_data=tune_data, + decode_method='beam_search', + beam_alpha=alpha, + beam_beta=beta, beam_size=args.beam_size, cutoff_prob=args.cutoff_prob, - blank_id=len(data_generator.vocab_list), - num_processes=args.num_processes_beam_search, - ext_scoring_func=ext_scorer, ) - for i, beam_search_result in enumerate(beam_search_results): - target_transcription = ''.join([ - data_generator.vocab_list[index] for index in 
infer_data[i][1] - ]) - wer_sum += wer(target_transcription, beam_search_result[0][1]) - wer_counter += 1 - + vocab_list=data_generator.vocab_list, + language_model_path=args.language_model_path, + num_processes=args.num_processes_beam_search) + wer_sum, num_ins = 0.0, 0 + for target, result in zip(target_transcripts, result_transcripts): + wer_sum += wer(target, result) + num_ins += 1 print("alpha = %f\tbeta = %f\tWER = %f" % - (alpha, beta, wer_sum / wer_counter)) + (alpha, beta, wer_sum / num_ins)) def main(): - paddle.init(use_gpu=args.use_gpu, trainer_count=1) + utils.print_arguments(args) + paddle.init(use_gpu=args.use_gpu, trainer_count=args.trainer_count) tune() From 5e20dfd4fb67eb10d64baede976c06314c5e8d37 Mon Sep 17 00:00:00 2001 From: Luo Tao Date: Tue, 1 Aug 2017 17:30:30 +0800 Subject: [PATCH 08/30] change the wget method in run.sh of deep_speech2 --- datasets/librispeech/librispeech.py | 4 ++-- requirements.txt | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/datasets/librispeech/librispeech.py b/datasets/librispeech/librispeech.py index 87e52ae4a..7e941f0ea 100644 --- a/datasets/librispeech/librispeech.py +++ b/datasets/librispeech/librispeech.py @@ -11,7 +11,7 @@ from __future__ import print_function import distutils.util import os -import wget +import sys import tarfile import argparse import soundfile @@ -66,7 +66,7 @@ def download(url, md5sum, target_dir): filepath = os.path.join(target_dir, url.split("/")[-1]) if not (os.path.exists(filepath) and md5file(filepath) == md5sum): print("Downloading %s ..." % url) - wget.download(url, target_dir) + os.system("wget -c " + url + " -P " + target_dir) print("\nMD5 Chesksum %s ..." % filepath) if not md5file(filepath) == md5sum: raise RuntimeError("MD5 checksum failed.") diff --git a/requirements.txt b/requirements.txt index eb6022599..131f75ff4 100755 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,3 @@ -wget==3.2 scipy==0.13.1 resampy==0.1.5 SoundFile==0.9.0.post1 From a48469b9b6debe5f7f4b0160c8dd402812228175 Mon Sep 17 00:00:00 2001 From: Yibing Liu Date: Wed, 2 Aug 2017 15:24:29 +0800 Subject: [PATCH 09/30] add the requirement for cuDNN version in README --- README.md | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 22d0c5386..62b051714 100644 --- a/README.md +++ b/README.md @@ -2,12 +2,20 @@ ## Installation -Please replace `$PADDLE_INSTALL_DIR` with your own paddle installation directory. +### Prerequisites + + - **Python = 2.7** only supported; + - **cuDNN >= 6.0** is required to utilize NVIDIA GPU platform in the installation of PaddlePaddle, and the **CUDA toolkit** with proper version suitable for cuDNN. The cuDNN library below 6.0 is found to yield a fatal error in batch normalization when handling utterances with long duration in inference. + +### Setup ``` sh setup.sh export LD_LIBRARY_PATH=$PADDLE_INSTALL_DIR/Paddle/third_party/install/warpctc/lib:$LD_LIBRARY_PATH ``` + +Please replace `$PADDLE_INSTALL_DIR` with your own paddle installation directory. + ## Usage ### Preparing Data From 526e18b11964b00ced661e0119244d7bf8e0229a Mon Sep 17 00:00:00 2001 From: Xinghai Sun Date: Wed, 2 Aug 2017 23:50:46 +0800 Subject: [PATCH 10/30] Add function docs for layer.py and model.py and update other details. 
--- decoder.py | 2 +- infer.py | 2 +- layer.py | 84 ++++++++++++++++++++++++++++++++++-------------------- model.py | 74 +++++++++++++++++++++++++++++++++++++++++++++-- setup.sh | 3 -- train.py | 8 +++++- tune.py | 4 +-- 7 files changed, 136 insertions(+), 41 deletions(-) diff --git a/decoder.py b/decoder.py index a1fadc2c8..8f2e0508d 100644 --- a/decoder.py +++ b/decoder.py @@ -205,9 +205,9 @@ def ctc_beam_search_decoder_batch(probs_split, :type num_processes: int :param cutoff_prob: Cutoff probability in pruning, default 1.0, no pruning. + :type cutoff_prob: float :param num_processes: Number of parallel processes. :type num_processes: int - :type cutoff_prob: float :param ext_scoring_func: External scoring function for partially decoded sentence, e.g. word count or language model. diff --git a/infer.py b/infer.py index ec65cc748..bc77dab70 100644 --- a/infer.py +++ b/infer.py @@ -40,7 +40,7 @@ parser.add_argument( help="Use gpu or not. (default: %(default)s)") parser.add_argument( "--num_threads_data", - default=multiprocessing.cpu_count(), + default=1, type=int, help="Number of cpu threads for preprocessing data. (default: %(default)s)") parser.add_argument( diff --git a/layer.py b/layer.py index 7b0273389..3b492645d 100644 --- a/layer.py +++ b/layer.py @@ -5,13 +5,27 @@ from __future__ import print_function import paddle.v2 as paddle -DISABLE_CUDNN_BATCH_NORM = True - def conv_bn_layer(input, filter_size, num_channels_in, num_channels_out, stride, padding, act): - """ - Convolution layer with batch normalization. + """Convolution layer with batch normalization. + + :param input: Input layer. + :type input: LayerOutput + :param filter_size: The x dimension of a filter kernel. Or input a tuple for + two image dimension. + :type filter_size: int|tuple|list + :param num_channels_in: Number of input channels. + :type num_channels_in: int + :type num_channels_out: Number of output channels. + :type num_channels_in: out + :param padding: The x dimension of the padding. Or input a tuple for two + image dimension. + :type padding: int|tuple|list + :param act: Activation type. + :type act: BaseActivation + :return: Batch norm layer after convolution layer. + :rtype: LayerOutput """ conv_layer = paddle.layer.img_conv( input=input, @@ -22,32 +36,30 @@ def conv_bn_layer(input, filter_size, num_channels_in, num_channels_out, stride, padding=padding, act=paddle.activation.Linear(), bias_attr=False) - if DISABLE_CUDNN_BATCH_NORM: - # temopary patch, need to be removed. - return paddle.layer.batch_norm( - input=conv_layer, act=act, batch_norm_type="batch_norm") - else: - return paddle.layer.batch_norm(input=conv_layer, act=act) + return paddle.layer.batch_norm(input=conv_layer, act=act) def bidirectional_simple_rnn_bn_layer(name, input, size, act): - """ - Bidirectonal simple rnn layer with sequence-wise batch normalization. + """Bidirectonal simple rnn layer with sequence-wise batch normalization. The batch normalization is only performed on input-state weights. + + :param name: Name of the layer. + :type name: string + :param input: Input layer. + :type input: LayerOutput + :param size: Number of RNN cells. + :type size: int + :param act: Activation type. + :type act: BaseActivation + :return: Bidirectional simple rnn layer. + :rtype: LayerOutput """ # input-hidden weights shared across bi-direcitonal rnn. 
input_proj = paddle.layer.fc( input=input, size=size, act=paddle.activation.Linear(), bias_attr=False) # batch norm is only performed on input-state projection - if DISABLE_CUDNN_BATCH_NORM: - # temopary patch, need to be removed. - input_proj_bn = paddle.layer.batch_norm( - input=input_proj, - act=paddle.activation.Linear(), - batch_norm_type="batch_norm") - else: - input_proj_bn = paddle.layer.batch_norm( - input=input_proj, act=paddle.activation.Linear()) + input_proj_bn = paddle.layer.batch_norm( + input=input_proj, act=paddle.activation.Linear()) # forward and backward in time forward_simple_rnn = paddle.layer.recurrent( input=input_proj_bn, act=act, reverse=False) @@ -57,8 +69,14 @@ def bidirectional_simple_rnn_bn_layer(name, input, size, act): def conv_group(input, num_stacks): - """ - Convolution group with several stacking convolution layers. + """Convolution group with stacked convolution layers. + + :param input: Input layer. + :type input: LayerOutput + :param num_stacks: Number of stacked convolution layers. + :type num_stacks: int + :return: Output layer of the convolution group. + :rtype: LayerOutput """ conv = conv_bn_layer( input=input, @@ -83,8 +101,16 @@ def conv_group(input, num_stacks): def rnn_group(input, size, num_stacks): - """ - RNN group with several stacking RNN layers. + """RNN group with stacked bidirectional simple RNN layers. + + :param input: Input layer. + :type input: LayerOutput + :param size: Number of RNN cells in each layer. + :type size: int + :param num_stacks: Number of stacked rnn layers. + :type num_stacks: int + :return: Output layer of the RNN group. + :rtype: LayerOutput """ output = input for i in xrange(num_stacks): @@ -114,12 +140,8 @@ def deep_speech2(audio_data, :type num_rnn_layers: int :param rnn_size: RNN layer size (number of RNN cells). :type rnn_size: int - :param is_inference: False in the training mode, and True in the - inferene mode. - :type is_inference: bool - :return: If is_inference set False, return a ctc cost layer; - if is_inference set True, return a sequence layer of output - probability distribution. + :return: A tuple of an output unnormalized log probability layer ( + before softmax) and a ctc cost layer. :rtype: tuple of LayerOutput """ # convolution group diff --git a/model.py b/model.py index d1efabb75..f5333f170 100644 --- a/model.py +++ b/model.py @@ -14,6 +14,21 @@ from layer import * class DeepSpeech2Model(object): + """DeepSpeech2Model class. + + :param vocab_size: Decoding vocabulary size. + :type vocab_size: int + :param num_conv_layers: Number of stacking convolution layers. + :type num_conv_layers: int + :param num_rnn_layers: Number of stacking RNN layers. + :type num_rnn_layers: int + :param rnn_layer_size: RNN layer size (number of RNN cells). + :type rnn_layer_size: int + :param pretrained_model_path: Pretrained model path. If None, will train + from stratch. + :type pretrained_model_path: basestring|None + """ + def __init__(self, vocab_size, num_conv_layers, num_rnn_layers, rnn_layer_size, pretrained_model_path): self._create_network(vocab_size, num_conv_layers, num_rnn_layers, @@ -29,8 +44,33 @@ class DeepSpeech2Model(object): learning_rate, gradient_clipping, num_passes, - num_iterations_print=100, - output_model_dir='checkpoints'): + output_model_dir, + num_iterations_print=100): + """Train the model. + + :param train_batch_reader: Train data reader. + :type train_batch_reader: callable + :param dev_batch_reader: Validation data reader. 
+ :type dev_batch_reader: callable + :param feeding_dict: Feeding is a map of field name and tuple index + of the data that reader returns. + :type feeding_dict: dict|list + :param learning_rate: Learning rate for ADAM optimizer. + :type learning_rate: float + :param gradient_clipping: Gradient clipping threshold. + :type gradient_clipping: float + :param num_passes: Number of training epochs. + :type num_passes: int + :param num_iterations_print: Number of training iterations for printing + a training loss. + :type rnn_iteratons_print: int + :param output_model_dir: Directory for saving the model (every pass). + :type output_model_dir: basestring + """ + # prepare model output directory + if not os.path.exists(output_model_dir): + os.mkdir(output_model_dir) + # prepare optimizer and trainer optimizer = paddle.optimizer.Adam( learning_rate=learning_rate, @@ -81,6 +121,34 @@ class DeepSpeech2Model(object): def infer_batch(self, infer_data, decode_method, beam_alpha, beam_beta, beam_size, cutoff_prob, vocab_list, language_model_path, num_processes): + """Model inference. Infer the transcription for a batch of speech + utterances. + + :param infer_data: List of utterances to infer, with each utterance a + tuple of audio features and transcription text (empty + string). + :type infer_data: list + :param decode_method: Decoding method name, 'best_path' or + 'beam search'. + :param decode_method: string + :param beam_alpha: Parameter associated with language model. + :type beam_alpha: float + :param beam_beta: Parameter associated with word count. + :type beam_beta: float + :param beam_size: Width for Beam search. + :type beam_size: int + :param cutoff_prob: Cutoff probability in pruning, + default 1.0, no pruning. + :type cutoff_prob: float + :param vocab_list: List of tokens in the vocabulary, for decoding. + :type vocab_list: list + :param language_model_path: Filepath for language model. + :type language_model_path: basestring|None + :param num_processes: Number of processes (CPU) for decoder. + :type num_processes: int + :return: List of transcription texts. + :rtype: List of basestring + """ # define inferer if self._inferer == None: self._inferer = paddle.inference.Inference( @@ -126,6 +194,7 @@ class DeepSpeech2Model(object): return results def _create_parameters(self, model_path=None): + """Load or create model parameters.""" if model_path is None: self._parameters = paddle.parameters.create(self._loss) else: @@ -134,6 +203,7 @@ class DeepSpeech2Model(object): def _create_network(self, vocab_size, num_conv_layers, num_rnn_layers, rnn_layer_size): + """Create data layers and model network.""" # paddle.data_type.dense_array is used for variable batch input. # The size 161 * 161 is only an placeholder value and the real shape # of input batch data will be induced during training. diff --git a/setup.sh b/setup.sh index 4d451a6f1..7f4272550 100644 --- a/setup.sh +++ b/setup.sh @@ -26,7 +26,4 @@ if [ $? != 0 ]; then rm libsndfile-1.0.28.tar.gz fi -# prepare ./checkpoints -mkdir checkpoints - echo "Install all dependencies successfully." diff --git a/train.py b/train.py index 45f7a6d9d..080f57d2d 100644 --- a/train.py +++ b/train.py @@ -116,6 +116,11 @@ parser.add_argument( help="If set None, the training will start from scratch. " "Otherwise, the training will resume from " "the existing model of this path. (default: %(default)s)") +parser.add_argument( + "--output_model_dir", + default="./checkpoints", + type=str, + help="Directory for saving models. 
(default: %(default)s)") parser.add_argument( "--augmentation_config", default='[{"type": "shift", ' @@ -169,7 +174,8 @@ def train(): learning_rate=args.adam_learning_rate, gradient_clipping=400, num_passes=args.num_passes, - num_iterations_print=args.num_iterations_print) + num_iterations_print=args.num_iterations_print, + output_model_dir=args.output_model_dir) def main(): diff --git a/tune.py b/tune.py index f414622e3..a17be30fa 100644 --- a/tune.py +++ b/tune.py @@ -46,7 +46,7 @@ parser.add_argument( help="Trainer number. (default: %(default)s)") parser.add_argument( "--num_threads_data", - default=multiprocessing.cpu_count(), + default=1, type=int, help="Number of cpu threads for preprocessing data. (default: %(default)s)") parser.add_argument( @@ -67,7 +67,7 @@ parser.add_argument( help="Manifest path for normalizer. (default: %(default)s)") parser.add_argument( "--tune_manifest_path", - default='datasets/manifest.test', + default='datasets/manifest.dev', type=str, help="Manifest path for tuning. (default: %(default)s)") parser.add_argument( From 0ebf36b98fb8484b44bb512e76f6b94a1799a1c2 Mon Sep 17 00:00:00 2001 From: Xinghai Sun Date: Wed, 2 Aug 2017 21:45:08 +0800 Subject: [PATCH 11/30] Add a realtime ASR demo for users to test their own voice with mic. --- data_utils/audio.py | 2 + data_utils/data.py | 29 +++--- demo_client.py | 75 ++++++++++++++++ demo_server.py | 208 ++++++++++++++++++++++++++++++++++++++++++++ infer.py | 13 +-- model.py | 10 +++ requirements.txt | 2 + 7 files changed, 320 insertions(+), 19 deletions(-) create mode 100644 demo_client.py create mode 100644 demo_server.py mode change 100755 => 100644 requirements.txt diff --git a/data_utils/audio.py b/data_utils/audio.py index 3891f5b92..29fdd0bd8 100644 --- a/data_utils/audio.py +++ b/data_utils/audio.py @@ -64,6 +64,8 @@ class AudioSegment(object): :rtype: AudioSegment """ samples, sample_rate = soundfile.read(file, dtype='float32') + print(samples) + print(sample_rate) return cls(samples, sample_rate) @classmethod diff --git a/data_utils/data.py b/data_utils/data.py index d01ca8cc7..fe064b806 100644 --- a/data_utils/data.py +++ b/data_utils/data.py @@ -83,6 +83,23 @@ class DataGenerator(object): self._rng = random.Random(random_seed) self._epoch = 0 + def process_utterance(self, filename, transcript): + """Load, augment, featurize and normalize for speech data. + + :param filename: Audio filepath + :type filename: basestring + :param transcript: Transcription text. + :type transcript: basestring + :return: Tuple of audio feature tensor and list of token ids for + transcription. + :rtype: tuple of (2darray, list) + """ + speech_segment = SpeechSegment.from_file(filename, transcript) + self._augmentation_pipeline.transform_audio(speech_segment) + specgram, text_ids = self._speech_featurizer.featurize(speech_segment) + specgram = self._normalizer.apply(specgram) + return specgram, text_ids + def batch_reader_creator(self, manifest_path, batch_size, @@ -198,14 +215,6 @@ class DataGenerator(object): """ return self._speech_featurizer.vocab_list - def _process_utterance(self, filename, transcript): - """Load, augment, featurize and normalize for speech data.""" - speech_segment = SpeechSegment.from_file(filename, transcript) - self._augmentation_pipeline.transform_audio(speech_segment) - specgram, text_ids = self._speech_featurizer.featurize(speech_segment) - specgram = self._normalizer.apply(specgram) - return specgram, text_ids - def _instance_reader_creator(self, manifest): """ Instance reader creator. 
Create a callable function to produce @@ -220,8 +229,8 @@ class DataGenerator(object): yield instance def mapper(instance): - return self._process_utterance(instance["audio_filepath"], - instance["text"]) + return self.process_utterance(instance["audio_filepath"], + instance["text"]) return paddle.reader.xmap_readers( mapper, reader, self._num_threads, 1024, order=True) diff --git a/demo_client.py b/demo_client.py new file mode 100644 index 000000000..97649fd48 --- /dev/null +++ b/demo_client.py @@ -0,0 +1,75 @@ +from pynput import keyboard +import struct +import socket +import sys +import pyaudio + +HOST, PORT = "10.104.18.14", 8086 + +is_recording = False +enable_trigger_record = True + + +def on_press(key): + global is_recording, enable_trigger_record + if key == keyboard.Key.space: + if (not is_recording) and enable_trigger_record: + sys.stdout.write("Start Recording ... ") + sys.stdout.flush() + is_recording = True + + +def on_release(key): + global is_recording, enable_trigger_record + if key == keyboard.Key.esc: + return False + elif key == keyboard.Key.space: + if is_recording == True: + is_recording = False + + +data_list = [] + + +def callback(in_data, frame_count, time_info, status): + global data_list, is_recording, enable_trigger_record + if is_recording: + data_list.append(in_data) + enable_trigger_record = False + elif len(data_list) > 0: + # Connect to server and send data + sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + sock.connect((HOST, PORT)) + sent = ''.join(data_list) + sock.sendall(struct.pack('>i', len(sent)) + sent) + print('Speech[length=%d] Sent.' % len(sent)) + # Receive data from the server and shut down + received = sock.recv(1024) + print "Recognition Results: {}".format(received) + sock.close() + data_list = [] + enable_trigger_record = True + return (in_data, pyaudio.paContinue) + + +def main(): + p = pyaudio.PyAudio() + stream = p.open( + format=pyaudio.paInt32, + channels=1, + rate=16000, + input=True, + stream_callback=callback) + stream.start_stream() + + with keyboard.Listener( + on_press=on_press, on_release=on_release) as listener: + listener.join() + + stream.stop_stream() + stream.close() + p.terminate() + + +if __name__ == "__main__": + main() diff --git a/demo_server.py b/demo_server.py new file mode 100644 index 000000000..4a3feb138 --- /dev/null +++ b/demo_server.py @@ -0,0 +1,208 @@ +import os +import time +import argparse +import distutils.util +from time import gmtime, strftime +import SocketServer +import struct +import wave +import pyaudio +import paddle.v2 as paddle +from data_utils.data import DataGenerator +from model import DeepSpeech2Model +import utils + +parser = argparse.ArgumentParser(description=__doc__) +parser.add_argument( + "--host_ip", + default="10.104.18.14", + type=str, + help="Server IP address. (default: %(default)s)") +parser.add_argument( + "--host_port", + default=8086, + type=int, + help="Server Port. (default: %(default)s)") +parser.add_argument( + "--speech_save_dir", + default="demo_cache", + type=str, + help="Directory for saving demo speech. (default: %(default)s)") +parser.add_argument( + "--vocab_filepath", + default='datasets/vocab/eng_vocab.txt', + type=str, + help="Vocabulary filepath. (default: %(default)s)") +parser.add_argument( + "--mean_std_filepath", + default='mean_std.npz', + type=str, + help="Manifest path for normalizer. 
(default: %(default)s)") +parser.add_argument( + "--specgram_type", + default='linear', + type=str, + help="Feature type of audio data: 'linear' (power spectrum)" + " or 'mfcc'. (default: %(default)s)") +parser.add_argument( + "--num_conv_layers", + default=2, + type=int, + help="Convolution layer number. (default: %(default)s)") +parser.add_argument( + "--num_rnn_layers", + default=3, + type=int, + help="RNN layer number. (default: %(default)s)") +parser.add_argument( + "--rnn_layer_size", + default=512, + type=int, + help="RNN layer cell number. (default: %(default)s)") +parser.add_argument( + "--use_gpu", + default=True, + type=distutils.util.strtobool, + help="Use gpu or not. (default: %(default)s)") +parser.add_argument( + "--model_filepath", + default='checkpoints/params.latest.tar.gz', + type=str, + help="Model filepath. (default: %(default)s)") +parser.add_argument( + "--decode_method", + default='beam_search', + type=str, + help="Method for ctc decoding: best_path or beam_search. " + "(default: %(default)s)") +parser.add_argument( + "--beam_size", + default=500, + type=int, + help="Width for beam search decoding. (default: %(default)d)") +parser.add_argument( + "--language_model_path", + default="lm/data/common_crawl_00.prune01111.trie.klm", + type=str, + help="Path for language model. (default: %(default)s)") +parser.add_argument( + "--alpha", + default=0.36, + type=float, + help="Parameter associated with language model. (default: %(default)f)") +parser.add_argument( + "--beta", + default=0.25, + type=float, + help="Parameter associated with word count. (default: %(default)f)") +parser.add_argument( + "--cutoff_prob", + default=0.99, + type=float, + help="The cutoff probability of pruning" + "in beam search. (default: %(default)f)") +args = parser.parse_args() + + +class AsrTCPServer(SocketServer.TCPServer): + def __init__(self, + server_address, + RequestHandlerClass, + speech_save_dir, + audio_process_handler, + bind_and_activate=True): + self.speech_save_dir = speech_save_dir + self.audio_process_handler = audio_process_handler + SocketServer.TCPServer.__init__( + self, server_address, RequestHandlerClass, bind_and_activate=True) + + +class AsrRequestHandler(SocketServer.BaseRequestHandler): + """The ASR request handler. + """ + + def handle(self): + # receive data through TCP socket + chunk = self.request.recv(1024) + target_len = struct.unpack('>i', chunk[:4])[0] + data = chunk[4:] + while len(data) < target_len: + chunk = self.request.recv(1024) + data += chunk + # write to file + filename = self._write_to_file(data) + + print("Received utterance[length=%d] from %s, saved to %s." 
% + (len(data), self.client_address[0], filename)) + #filename = "/home/work/.cache/paddle/dataset/speech/Libri/train-other-500/LibriSpeech/train-other-500/811/130143/811-130143-0025.flac" + start_time = time.time() + transcript = self.server.audio_process_handler(filename) + finish_time = time.time() + print("Response Time: %f, Transcript: %s" % + (finish_time - start_time, transcript)) + self.request.sendall(transcript) + + def _write_to_file(self, data): + # prepare save dir and filename + if not os.path.exists(self.server.speech_save_dir): + os.mkdir(self.server.speech_save_dir) + timestamp = strftime("%Y%m%d%H%M%S", gmtime()) + out_filename = os.path.join( + self.server.speech_save_dir, + timestamp + "_" + self.client_address[0] + "_" + ".wav") + # write to wav file + file = wave.open(out_filename, 'wb') + file.setnchannels(1) + file.setsampwidth(4) + file.setframerate(16000) + file.writeframes(data) + file.close() + return out_filename + + +def start_server(): + data_generator = DataGenerator( + vocab_filepath=args.vocab_filepath, + mean_std_filepath=args.mean_std_filepath, + augmentation_config='{}', + specgram_type=args.specgram_type, + num_threads=1) + ds2_model = DeepSpeech2Model( + vocab_size=data_generator.vocab_size, + num_conv_layers=args.num_conv_layers, + num_rnn_layers=args.num_rnn_layers, + rnn_layer_size=args.rnn_layer_size, + pretrained_model_path=args.model_filepath) + + def file_to_transcript(filename): + feature = data_generator.process_utterance(filename, "") + result_transcript = ds2_model.infer_batch( + infer_data=[feature], + decode_method=args.decode_method, + beam_alpha=args.alpha, + beam_beta=args.beta, + beam_size=args.beam_size, + cutoff_prob=args.cutoff_prob, + vocab_list=data_generator.vocab_list, + language_model_path=args.language_model_path, + num_processes=1) + return result_transcript[0] + + server = AsrTCPServer( + server_address=(args.host_ip, args.host_port), + RequestHandlerClass=AsrRequestHandler, + speech_save_dir=args.speech_save_dir, + audio_process_handler=file_to_transcript) + + print("ASR Server Started.") + server.serve_forever() + + +def main(): + utils.print_arguments(args) + paddle.init(use_gpu=args.use_gpu, trainer_count=1) + start_server() + + +if __name__ == "__main__": + main() diff --git a/infer.py b/infer.py index bc77dab70..8fd27dce4 100644 --- a/infer.py +++ b/infer.py @@ -83,18 +83,13 @@ parser.add_argument( "--decode_method", default='beam_search', type=str, - help="Method for ctc decoding: best_path or beam_search. (default: %(default)s)" -) + help="Method for ctc decoding: best_path or beam_search. " + "(default: %(default)s)") parser.add_argument( "--beam_size", default=500, type=int, help="Width for beam search decoding. (default: %(default)d)") -parser.add_argument( - "--num_results_per_sample", - default=1, - type=int, - help="Number of output per sample in beam search. (default: %(default)d)") parser.add_argument( "--language_model_path", default="lm/data/common_crawl_00.prune01111.trie.klm", @@ -102,12 +97,12 @@ parser.add_argument( help="Path for language model. (default: %(default)s)") parser.add_argument( "--alpha", - default=0.26, + default=0.36, type=float, help="Parameter associated with language model. (default: %(default)f)") parser.add_argument( "--beta", - default=0.1, + default=0.25, type=float, help="Parameter associated with word count. 
(default: %(default)f)") parser.add_argument( diff --git a/model.py b/model.py index f5333f170..c8766deb1 100644 --- a/model.py +++ b/model.py @@ -35,6 +35,7 @@ class DeepSpeech2Model(object): rnn_layer_size) self._create_parameters(pretrained_model_path) self._inferer = None + self._loss_inferer = None self._ext_scorer = None def train(self, @@ -118,6 +119,14 @@ class DeepSpeech2Model(object): num_passes=num_passes, feeding=feeding_dict) + def infer_loss_batch(self, infer_data): + # define inferer + if self._loss_inferer == None: + self._loss_inferer = paddle.inference.Inference( + output_layer=self._loss, parameters=self._parameters) + # run inference + return self._loss_inferer.infer(input=infer_data) + def infer_batch(self, infer_data, decode_method, beam_alpha, beam_beta, beam_size, cutoff_prob, vocab_list, language_model_path, num_processes): @@ -187,6 +196,7 @@ class DeepSpeech2Model(object): num_processes=num_processes, ext_scoring_func=self._ext_scorer, cutoff_prob=cutoff_prob) + results = [result[0][1] for result in beam_search_results] else: raise ValueError("Decoding method [%s] is not supported." % diff --git a/requirements.txt b/requirements.txt old mode 100755 new mode 100644 index 131f75ff4..9297f659c --- a/requirements.txt +++ b/requirements.txt @@ -3,3 +3,5 @@ resampy==0.1.5 SoundFile==0.9.0.post1 python_speech_features https://github.com/luotao1/kenlm/archive/master.zip +pyaudio +pynput From cb9370f30862217c3666baabe94441fba72493a7 Mon Sep 17 00:00:00 2001 From: Xinghai Sun Date: Thu, 3 Aug 2017 12:00:24 +0800 Subject: [PATCH 12/30] Add warming-up to demo_server.py for DS2 and clean codes. --- data_utils/audio.py | 2 -- demo_server.py | 40 +++++++++++++++++++++++++++++++++++----- 2 files changed, 35 insertions(+), 7 deletions(-) diff --git a/data_utils/audio.py b/data_utils/audio.py index 29fdd0bd8..3891f5b92 100644 --- a/data_utils/audio.py +++ b/data_utils/audio.py @@ -64,8 +64,6 @@ class AudioSegment(object): :rtype: AudioSegment """ samples, sample_rate = soundfile.read(file, dtype='float32') - print(samples) - print(sample_rate) return cls(samples, sample_rate) @classmethod diff --git a/demo_server.py b/demo_server.py index 4a3feb138..85f694834 100644 --- a/demo_server.py +++ b/demo_server.py @@ -1,5 +1,6 @@ import os import time +import random import argparse import distutils.util from time import gmtime, strftime @@ -8,9 +9,10 @@ import struct import wave import pyaudio import paddle.v2 as paddle +from utils import print_arguments from data_utils.data import DataGenerator from model import DeepSpeech2Model -import utils +from data_utils.utils import read_manifest parser = argparse.ArgumentParser(description=__doc__) parser.add_argument( @@ -38,6 +40,11 @@ parser.add_argument( default='mean_std.npz', type=str, help="Manifest path for normalizer. (default: %(default)s)") +parser.add_argument( + "--warmup_manifest_path", + default='datasets/manifest.test', + type=str, + help="Manifest path for warmup test. (default: %(default)s)") parser.add_argument( "--specgram_type", default='linear', @@ -77,7 +84,7 @@ parser.add_argument( "(default: %(default)s)") parser.add_argument( "--beam_size", - default=500, + default=100, type=int, help="Width for beam search decoding. (default: %(default)d)") parser.add_argument( @@ -134,7 +141,6 @@ class AsrRequestHandler(SocketServer.BaseRequestHandler): print("Received utterance[length=%d] from %s, saved to %s." 
% (len(data), self.client_address[0], filename)) - #filename = "/home/work/.cache/paddle/dataset/speech/Libri/train-other-500/LibriSpeech/train-other-500/811/130143/811-130143-0025.flac" start_time = time.time() transcript = self.server.audio_process_handler(filename) finish_time = time.time() @@ -149,7 +155,7 @@ class AsrRequestHandler(SocketServer.BaseRequestHandler): timestamp = strftime("%Y%m%d%H%M%S", gmtime()) out_filename = os.path.join( self.server.speech_save_dir, - timestamp + "_" + self.client_address[0] + "_" + ".wav") + timestamp + "_" + self.client_address[0] + ".wav") # write to wav file file = wave.open(out_filename, 'wb') file.setnchannels(1) @@ -160,6 +166,22 @@ class AsrRequestHandler(SocketServer.BaseRequestHandler): return out_filename +def warm_up_test(audio_process_handler, + manifest_path, + num_test_cases, + random_seed=0): + manifest = read_manifest(manifest_path) + rng = random.Random(random_seed) + samples = rng.sample(manifest, num_test_cases) + for idx, sample in enumerate(samples): + print("Warm-up Test Case %d: %s", idx, sample['audio_filepath']) + start_time = time.time() + transcript = audio_process_handler(sample['audio_filepath']) + finish_time = time.time() + print("Response Time: %f, Transcript: %s" % + (finish_time - start_time, transcript)) + + def start_server(): data_generator = DataGenerator( vocab_filepath=args.vocab_filepath, @@ -188,6 +210,14 @@ def start_server(): num_processes=1) return result_transcript[0] + print('-----------------------------------------------------------') + print('Warming up ...') + warm_up_test( + audio_process_handler=file_to_transcript, + manifest_path=args.warmup_manifest_path, + num_test_cases=3) + print('-----------------------------------------------------------') + server = AsrTCPServer( server_address=(args.host_ip, args.host_port), RequestHandlerClass=AsrRequestHandler, @@ -199,7 +229,7 @@ def start_server(): def main(): - utils.print_arguments(args) + print_arguments(args) paddle.init(use_gpu=args.use_gpu, trainer_count=1) start_server() From a4c2dd7de2c6da129f34c8ae61db1655e812761a Mon Sep 17 00:00:00 2001 From: Xinghai Sun Date: Thu, 3 Aug 2017 12:08:11 +0800 Subject: [PATCH 13/30] Add function docs and comments to demo_server.py and demo_client.py. 
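The docstrings added in this patch describe a callback contract worth spelling out: pyaudio invokes the stream callback as `callback(in_data, frame_count, time_info, status)` for every captured buffer and expects a `(data, flag)` tuple in return, while pynput stops its keyboard listener once a callback returns `False`. The standalone sketch below illustrates just that contract; it is not part of the diff, performs no ASR, and assumes `pyaudio` and `pynput` are installed with a working microphone and keyboard.

```
import pyaudio
from pynput import keyboard


def stream_callback(in_data, frame_count, time_info, status):
    # pyaudio calls this for each captured buffer; paContinue keeps the
    # stream recording, paComplete would end it.
    return (in_data, pyaudio.paContinue)


def on_release(key):
    # Returning False from a pynput callback terminates the listener.
    if key == keyboard.Key.esc:
        return False


p = pyaudio.PyAudio()
stream = p.open(
    format=pyaudio.paInt32,
    channels=1,
    rate=16000,
    input=True,
    stream_callback=stream_callback)
stream.start_stream()
with keyboard.Listener(on_release=on_release) as listener:
    listener.join()  # blocks until Esc is pressed
stream.stop_stream()
stream.close()
p.terminate()
```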
--- demo_client.py | 6 ++++++ demo_server.py | 13 ++++++++++--- 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/demo_client.py b/demo_client.py index 97649fd48..a789d8164 100644 --- a/demo_client.py +++ b/demo_client.py @@ -11,6 +11,7 @@ enable_trigger_record = True def on_press(key): + """On-press keyboard callback function.""" global is_recording, enable_trigger_record if key == keyboard.Key.space: if (not is_recording) and enable_trigger_record: @@ -20,6 +21,7 @@ def on_press(key): def on_release(key): + """On-release keyboard callback function.""" global is_recording, enable_trigger_record if key == keyboard.Key.esc: return False @@ -32,6 +34,7 @@ data_list = [] def callback(in_data, frame_count, time_info, status): + """Audio recorder's stream callback function.""" global data_list, is_recording, enable_trigger_record if is_recording: data_list.append(in_data) @@ -53,6 +56,7 @@ def callback(in_data, frame_count, time_info, status): def main(): + # prepare audio recorder p = pyaudio.PyAudio() stream = p.open( format=pyaudio.paInt32, @@ -62,10 +66,12 @@ def main(): stream_callback=callback) stream.start_stream() + # prepare keyboard listener with keyboard.Listener( on_press=on_press, on_release=on_release) as listener: listener.join() + # close up stream.stop_stream() stream.close() p.terminate() diff --git a/demo_server.py b/demo_server.py index 85f694834..d6c0de40a 100644 --- a/demo_server.py +++ b/demo_server.py @@ -112,6 +112,8 @@ args = parser.parse_args() class AsrTCPServer(SocketServer.TCPServer): + """The ASR TCP Server.""" + def __init__(self, server_address, RequestHandlerClass, @@ -125,8 +127,7 @@ class AsrTCPServer(SocketServer.TCPServer): class AsrRequestHandler(SocketServer.BaseRequestHandler): - """The ASR request handler. 
- """ + """The ASR request handler.""" def handle(self): # receive data through TCP socket @@ -170,6 +171,7 @@ def warm_up_test(audio_process_handler, manifest_path, num_test_cases, random_seed=0): + """Warming-up test.""" manifest = read_manifest(manifest_path) rng = random.Random(random_seed) samples = rng.sample(manifest, num_test_cases) @@ -183,12 +185,15 @@ def warm_up_test(audio_process_handler, def start_server(): + """Start the ASR server""" + # prepare data generator data_generator = DataGenerator( vocab_filepath=args.vocab_filepath, mean_std_filepath=args.mean_std_filepath, augmentation_config='{}', specgram_type=args.specgram_type, num_threads=1) + # prepare ASR model ds2_model = DeepSpeech2Model( vocab_size=data_generator.vocab_size, num_conv_layers=args.num_conv_layers, @@ -196,6 +201,7 @@ def start_server(): rnn_layer_size=args.rnn_layer_size, pretrained_model_path=args.model_filepath) + # prepare ASR inference handler def file_to_transcript(filename): feature = data_generator.process_utterance(filename, "") result_transcript = ds2_model.infer_batch( @@ -210,6 +216,7 @@ def start_server(): num_processes=1) return result_transcript[0] + # warming up with utterrances sampled from Librispeech print('-----------------------------------------------------------') print('Warming up ...') warm_up_test( @@ -218,12 +225,12 @@ def start_server(): num_test_cases=3) print('-----------------------------------------------------------') + # start the server server = AsrTCPServer( server_address=(args.host_ip, args.host_port), RequestHandlerClass=AsrRequestHandler, speech_save_dir=args.speech_save_dir, audio_process_handler=file_to_transcript) - print("ASR Server Started.") server.serve_forever() From b57d244363d997c394aebbfc0f1ab49310fd1ae4 Mon Sep 17 00:00:00 2001 From: Xinghai Sun Date: Thu, 3 Aug 2017 12:26:33 +0800 Subject: [PATCH 14/30] Add ASR demo usage to README.md for DS2. --- README.md | 16 ++++++++++++++++ demo_client.py | 17 +++++++++++++++-- demo_server.py | 3 ++- 3 files changed, 33 insertions(+), 3 deletions(-) mode change 100644 => 100755 README.md diff --git a/README.md b/README.md old mode 100644 new mode 100755 index 62b051714..27dc64887 --- a/README.md +++ b/README.md @@ -143,3 +143,19 @@ python tune.py --help ``` Then reset parameters with the tuning result before inference or evaluating. + +### Playing with the ASR Demo + +A real-time ASR demo (`demo_server.py` and `demo_client.py`) are prepared for users to try out the ASR model with their own voice. After a model and language model is prepared, we can first start the demo server: + +``` +CUDA_VISIBLE_DEVICES=0 python demo_server.py +``` +And then in another console, start the client: + +``` +python demo_client.py +``` +On the client console, press and hold "white-space" key and start talking, then release the "white-space" key when you finish your speech. The decoding results (infered transcription) will be displayed. + +If you would like to start server and client in two machines. Please use `--host_ip` and `--host_port` to indicate the actual IP address and port, for both `demo_server.py` and `demo_client.py`. 
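As an aside to the usage notes above: the exchange between `demo_client.py` and `demo_server.py` is a plain TCP protocol. The client sends a 4-byte big-endian length header followed by the raw 16 kHz, mono, 32-bit samples, and the server replies with the transcription text (see `AsrRequestHandler.handle`). Below is a minimal file-based client sketch, assuming a server running locally on the default port and a hypothetical `utterance.wav` recorded in the same format; it is illustrative only and not part of this patch.

```
from __future__ import print_function
import socket
import struct
import wave

WAV_PATH = 'utterance.wav'       # hypothetical file: 16 kHz, mono, 32-bit PCM
HOST, PORT = 'localhost', 8086   # demo_server.py defaults

# read the raw sample bytes from the wav file
wav = wave.open(WAV_PATH, 'rb')
samples = wav.readframes(wav.getnframes())
wav.close()

# send a 4-byte big-endian length header, then the sample bytes
sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
sock.connect((HOST, PORT))
sock.sendall(struct.pack('>i', len(samples)) + samples)
print('Transcript:', sock.recv(1024))
sock.close()
```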
diff --git a/demo_client.py b/demo_client.py index a789d8164..ddf4dd1bf 100644 --- a/demo_client.py +++ b/demo_client.py @@ -1,10 +1,23 @@ +"""Client-end for the ASR demo.""" from pynput import keyboard import struct import socket import sys +import argparse import pyaudio -HOST, PORT = "10.104.18.14", 8086 +parser = argparse.ArgumentParser(description=__doc__) +parser.add_argument( + "--host_ip", + default="localhost", + type=str, + help="Server IP address. (default: %(default)s)") +parser.add_argument( + "--host_port", + default=8086, + type=int, + help="Server Port. (default: %(default)s)") +args = parser.parse_args() is_recording = False enable_trigger_record = True @@ -42,7 +55,7 @@ def callback(in_data, frame_count, time_info, status): elif len(data_list) > 0: # Connect to server and send data sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) - sock.connect((HOST, PORT)) + sock.connect((args.host_ip, args.host_port)) sent = ''.join(data_list) sock.sendall(struct.pack('>i', len(sent)) + sent) print('Speech[length=%d] Sent.' % len(sent)) diff --git a/demo_server.py b/demo_server.py index d6c0de40a..8a55e7265 100644 --- a/demo_server.py +++ b/demo_server.py @@ -1,3 +1,4 @@ +"""Server-end for the ASR demo.""" import os import time import random @@ -17,7 +18,7 @@ from data_utils.utils import read_manifest parser = argparse.ArgumentParser(description=__doc__) parser.add_argument( "--host_ip", - default="10.104.18.14", + default="localhost", type=str, help="Server IP address. (default: %(default)s)") parser.add_argument( From c0b3281e58a8b0f8bc1ab5772dd1483ff5caf391 Mon Sep 17 00:00:00 2001 From: Xinghai Sun Date: Thu, 3 Aug 2017 19:00:17 +0800 Subject: [PATCH 15/30] Remove pynput and pyaudio packages from requriements.txt and add installation tips to README.md. --- README.md | 21 +++++++++++++++++++-- demo_server.py | 1 - requirements.txt | 2 -- 3 files changed, 19 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 27dc64887..39dba0cd1 100755 --- a/README.md +++ b/README.md @@ -7,7 +7,7 @@ - **Python = 2.7** only supported; - **cuDNN >= 6.0** is required to utilize NVIDIA GPU platform in the installation of PaddlePaddle, and the **CUDA toolkit** with proper version suitable for cuDNN. The cuDNN library below 6.0 is found to yield a fatal error in batch normalization when handling utterances with long duration in inference. -### Setup +### Setup for Training & Evaluation ``` sh setup.sh @@ -16,6 +16,19 @@ export LD_LIBRARY_PATH=$PADDLE_INSTALL_DIR/Paddle/third_party/install/warpctc/li Please replace `$PADDLE_INSTALL_DIR` with your own paddle installation directory. +### Setup for Demo + +Please do the following extra installation before run `demo_client.py` to try the realtime ASR demo. However there is no need to install them for the computer running the demo's server-end (`demo_server.py`). For details of running the ASR demo, please refer to the [section](#playing-with-the-asr-demo). + +For example, on MAC OS X: + +``` +brew install portaudio +pip install pyaudio +pip install pynput +``` + + ## Usage ### Preparing Data @@ -158,4 +171,8 @@ python demo_client.py ``` On the client console, press and hold "white-space" key and start talking, then release the "white-space" key when you finish your speech. The decoding results (infered transcription) will be displayed. -If you would like to start server and client in two machines. Please use `--host_ip` and `--host_port` to indicate the actual IP address and port, for both `demo_server.py` and `demo_client.py`. 
+If you would like to start the server and the client in two machines. Please use `--host_ip` and `--host_port` to indicate the actual IP address and port, for both `demo_server.py` and `demo_client.py`. + +Notice that `demo_client.py` should be started in your local computer with microphone hardware, while `demo_server.py` can be started in any remote server as well as the same local computer. IP address and port should be properly set for server-client communication. + +For running `demo_client.py`, please first finish the [extra installation steps](#setup-for-demo). diff --git a/demo_server.py b/demo_server.py index 8a55e7265..c7e7e94a4 100644 --- a/demo_server.py +++ b/demo_server.py @@ -8,7 +8,6 @@ from time import gmtime, strftime import SocketServer import struct import wave -import pyaudio import paddle.v2 as paddle from utils import print_arguments from data_utils.data import DataGenerator diff --git a/requirements.txt b/requirements.txt index 9297f659c..131f75ff4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,5 +3,3 @@ resampy==0.1.5 SoundFile==0.9.0.post1 python_speech_features https://github.com/luotao1/kenlm/archive/master.zip -pyaudio -pynput From 94db28e088319bf44bb8f4f11b232e6ace9c0300 Mon Sep 17 00:00:00 2001 From: Xinghai Sun Date: Thu, 3 Aug 2017 21:06:27 +0800 Subject: [PATCH 16/30] Add function doc for infer_batch_loss() function in model.py for DS2. --- model.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/model.py b/model.py index c8766deb1..c2e440b3a 100644 --- a/model.py +++ b/model.py @@ -120,6 +120,16 @@ class DeepSpeech2Model(object): feeding=feeding_dict) def infer_loss_batch(self, infer_data): + """Model inference. Infer the ctc loss for a batch of speech + utterances. + + :param infer_data: List of utterances to infer, with each utterance a + tuple of audio features and transcription text (empty + string). + :type infer_data: list + :return: List of ctc loss. + :rtype: List of float + """ # define inferer if self._loss_inferer == None: self._loss_inferer = paddle.inference.Inference( From f4375ef125cf496d87bb92e9991da12039488077 Mon Sep 17 00:00:00 2001 From: Xinghai Sun Date: Mon, 7 Aug 2017 19:41:59 +0800 Subject: [PATCH 17/30] Update README.md with code reviews for DS2. --- README.md | 36 ++++++++++++++---------------------- model.py | 6 +++--- 2 files changed, 17 insertions(+), 25 deletions(-) diff --git a/README.md b/README.md index 39dba0cd1..96fbb7d09 100755 --- a/README.md +++ b/README.md @@ -7,7 +7,7 @@ - **Python = 2.7** only supported; - **cuDNN >= 6.0** is required to utilize NVIDIA GPU platform in the installation of PaddlePaddle, and the **CUDA toolkit** with proper version suitable for cuDNN. The cuDNN library below 6.0 is found to yield a fatal error in batch normalization when handling utterances with long duration in inference. -### Setup for Training & Evaluation +### Setup ``` sh setup.sh @@ -16,19 +16,6 @@ export LD_LIBRARY_PATH=$PADDLE_INSTALL_DIR/Paddle/third_party/install/warpctc/li Please replace `$PADDLE_INSTALL_DIR` with your own paddle installation directory. -### Setup for Demo - -Please do the following extra installation before run `demo_client.py` to try the realtime ASR demo. However there is no need to install them for the computer running the demo's server-end (`demo_server.py`). For details of running the ASR demo, please refer to the [section](#playing-with-the-asr-demo). 
- -For example, on MAC OS X: - -``` -brew install portaudio -pip install pyaudio -pip install pynput -``` - - ## Usage ### Preparing Data @@ -159,20 +146,25 @@ Then reset parameters with the tuning result before inference or evaluating. ### Playing with the ASR Demo -A real-time ASR demo (`demo_server.py` and `demo_client.py`) are prepared for users to try out the ASR model with their own voice. After a model and language model is prepared, we can first start the demo server: +A real-time ASR demo is built for users to try out the ASR model with their own voice. Please do the following installation on the machine you'd like to run the demo's client (no need for the machine running the demo's server). + +For example, on MAC OS X: + +``` +brew install portaudio +pip install pyaudio +pip install pynput +``` +After a model and language model is prepared, we can first start the demo's server: ``` CUDA_VISIBLE_DEVICES=0 python demo_server.py ``` -And then in another console, start the client: +And then in another console, start the demo's client: ``` python demo_client.py ``` -On the client console, press and hold "white-space" key and start talking, then release the "white-space" key when you finish your speech. The decoding results (infered transcription) will be displayed. - -If you would like to start the server and the client in two machines. Please use `--host_ip` and `--host_port` to indicate the actual IP address and port, for both `demo_server.py` and `demo_client.py`. - -Notice that `demo_client.py` should be started in your local computer with microphone hardware, while `demo_server.py` can be started in any remote server as well as the same local computer. IP address and port should be properly set for server-client communication. +On the client console, press and hold the "white-space" key on the keyboard to start talking, until you finish your speech and then release the "white-space" key. The decoding results (infered transcription) will be displayed. -For running `demo_client.py`, please first finish the [extra installation steps](#setup-for-demo). +It could be possible to start the server and the client in two seperate machines, e.g. `demo_client.py` is usually started in a machine with a microphone hardware, while `demo_server.py` is usually started in a remote server with powerful GPUs. Please first make sure that these two machines have network access to each other, and then use `--host_ip` and `--host_port` to indicate the server machine's actual IP address (instead of the `localhost` as default) and TCP port, in both `demo_server.py` and `demo_client.py`. diff --git a/model.py b/model.py index c2e440b3a..2eb7c3594 100644 --- a/model.py +++ b/model.py @@ -143,9 +143,9 @@ class DeepSpeech2Model(object): """Model inference. Infer the transcription for a batch of speech utterances. - :param infer_data: List of utterances to infer, with each utterance a - tuple of audio features and transcription text (empty - string). + :param infer_data: List of utterances to infer, with each utterance + consisting of a tuple of audio features and + transcription text (empty string). :type infer_data: list :param decode_method: Decoding method name, 'best_path' or 'beam search'. From 6df0f9bc4441c6fa34684923130c1115567b6b7f Mon Sep 17 00:00:00 2001 From: Xinghai Sun Date: Tue, 8 Aug 2017 12:15:18 +0800 Subject: [PATCH 18/30] Reset default multi-thread/process number to half of cpu count() for speedup. 
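The diffs below only change argparse defaults: `--num_threads_data` and `--num_processes_beam_search` now default to half of the logical CPUs instead of all of them. A one-line sketch of the new default follows; the rationale of leaving headroom for the trainer and decoder processes is an assumption, as the commit only states it is for speedup.

```
import multiprocessing

# New shared default for --num_threads_data and --num_processes_beam_search.
default_workers = multiprocessing.cpu_count() // 2
print("default worker count: %d" % default_workers)
```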
--- data_utils/data.py | 2 +- evaluate.py | 4 ++-- infer.py | 2 +- train.py | 2 +- tune.py | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/data_utils/data.py b/data_utils/data.py index fe064b806..34f32019c 100644 --- a/data_utils/data.py +++ b/data_utils/data.py @@ -65,7 +65,7 @@ class DataGenerator(object): max_freq=None, specgram_type='linear', use_dB_normalization=True, - num_threads=multiprocessing.cpu_count(), + num_threads=multiprocessing.cpu_count() // 2, random_seed=0): self._max_duration = max_duration self._min_duration = min_duration diff --git a/evaluate.py b/evaluate.py index fb7211fc2..592b7b527 100644 --- a/evaluate.py +++ b/evaluate.py @@ -45,12 +45,12 @@ parser.add_argument( help="Use gpu or not. (default: %(default)s)") parser.add_argument( "--num_threads_data", - default=multiprocessing.cpu_count(), + default=multiprocessing.cpu_count() // 2, type=int, help="Number of cpu threads for preprocessing data. (default: %(default)s)") parser.add_argument( "--num_processes_beam_search", - default=multiprocessing.cpu_count(), + default=multiprocessing.cpu_count() // 2, type=int, help="Number of cpu processes for beam search. (default: %(default)s)") parser.add_argument( diff --git a/infer.py b/infer.py index 8fd27dce4..df5953e59 100644 --- a/infer.py +++ b/infer.py @@ -45,7 +45,7 @@ parser.add_argument( help="Number of cpu threads for preprocessing data. (default: %(default)s)") parser.add_argument( "--num_processes_beam_search", - default=multiprocessing.cpu_count(), + default=multiprocessing.cpu_count() // 2, type=int, help="Number of cpu processes for beam search. (default: %(default)s)") parser.add_argument( diff --git a/train.py b/train.py index 080f57d2d..aff619379 100644 --- a/train.py +++ b/train.py @@ -86,7 +86,7 @@ parser.add_argument( help="Trainer number. (default: %(default)s)") parser.add_argument( "--num_threads_data", - default=multiprocessing.cpu_count(), + default=multiprocessing.cpu_count() // 2, type=int, help="Number of cpu threads for preprocessing data. (default: %(default)s)") parser.add_argument( diff --git a/tune.py b/tune.py index a17be30fa..328d67a11 100644 --- a/tune.py +++ b/tune.py @@ -51,7 +51,7 @@ parser.add_argument( help="Number of cpu threads for preprocessing data. (default: %(default)s)") parser.add_argument( "--num_processes_beam_search", - default=multiprocessing.cpu_count(), + default=multiprocessing.cpu_count() // 2, type=int, help="Number of cpu processes for beam search. (default: %(default)s)") parser.add_argument( From 961f6a29630ab64696828a8746e0bdd968ab83e8 Mon Sep 17 00:00:00 2001 From: Xinghai Sun Date: Tue, 8 Aug 2017 15:46:03 +0800 Subject: [PATCH 19/30] Accelerate mfcc computation for DS2. 
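The reworked MFCC assembly below transposes the base 13-coefficient matrix once and then stacks it with its first- and second-order deltas as whole `(13, num_frames)` blocks, replacing the per-frame Python loop. A shape-only sketch of the resulting layout, using illustrative window parameters and dummy audio (assumes `numpy` and `python_speech_features` are installed):

```
import numpy as np
from python_speech_features import mfcc, delta

samples = np.random.randn(16000).astype('float32')  # 1 second of dummy audio
feat = np.transpose(
    mfcc(signal=samples, samplerate=16000, winlen=0.02, winstep=0.01))
d_feat = delta(feat, 2)
dd_feat = delta(d_feat, 2)
combined = np.concatenate((feat, d_feat, dd_feat))
print(feat.shape, combined.shape)  # (13, num_frames) and (39, num_frames)
```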
--- data_utils/featurizer/audio_featurizer.py | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/data_utils/featurizer/audio_featurizer.py b/data_utils/featurizer/audio_featurizer.py index 271e535b6..00f0e8a35 100644 --- a/data_utils/featurizer/audio_featurizer.py +++ b/data_utils/featurizer/audio_featurizer.py @@ -166,21 +166,18 @@ class AudioFeaturizer(object): "window size.") # compute 13 cepstral coefficients, and the first one is replaced # by log(frame energy) - mfcc_feat = mfcc( - signal=samples, - samplerate=sample_rate, - winlen=0.001 * window_ms, - winstep=0.001 * stride_ms, - highfreq=max_freq) + mfcc_feat = np.transpose( + mfcc( + signal=samples, + samplerate=sample_rate, + winlen=0.001 * window_ms, + winstep=0.001 * stride_ms, + highfreq=max_freq)) # Deltas d_mfcc_feat = delta(mfcc_feat, 2) # Deltas-Deltas dd_mfcc_feat = delta(d_mfcc_feat, 2) # concat above three features - concat_mfcc_feat = [ - np.concatenate((mfcc_feat[i], d_mfcc_feat[i], dd_mfcc_feat[i])) - for i in xrange(len(mfcc_feat)) - ] - # transpose to be consistent with the linear specgram situation - concat_mfcc_feat = np.transpose(concat_mfcc_feat) + concat_mfcc_feat = np.concatenate( + (mfcc_feat, d_mfcc_feat, dd_mfcc_feat)) return concat_mfcc_feat From ad82c8771231d5cd78a7b14e8e1f83b034072542 Mon Sep 17 00:00:00 2001 From: Xinghai Sun Date: Thu, 29 Jun 2017 12:27:54 +0800 Subject: [PATCH 20/30] Add NoisePerturbAugmentor and CHiME3 data preparation. --- data_utils/augmentor/augmentation.py | 3 + data_utils/augmentor/noise_perturb.py | 47 +++++++ .../online_bayesian_normalization.py | 0 data_utils/augmentor/resample.py | 0 datasets/noise/chime3_background.py | 128 ++++++++++++++++++ datasets/run_all.sh | 9 ++ 6 files changed, 187 insertions(+) create mode 100644 data_utils/augmentor/noise_perturb.py mode change 100755 => 100644 data_utils/augmentor/online_bayesian_normalization.py mode change 100755 => 100644 data_utils/augmentor/resample.py create mode 100644 datasets/noise/chime3_background.py diff --git a/data_utils/augmentor/augmentation.py b/data_utils/augmentor/augmentation.py index 9dced4731..8a50e4400 100644 --- a/data_utils/augmentor/augmentation.py +++ b/data_utils/augmentor/augmentation.py @@ -8,6 +8,7 @@ import random from data_utils.augmentor.volume_perturb import VolumePerturbAugmentor from data_utils.augmentor.shift_perturb import ShiftPerturbAugmentor from data_utils.augmentor.speed_perturb import SpeedPerturbAugmentor +from data_utils.augmentor.noise_perturb import NoisePerturbAugmentor from data_utils.augmentor.resample import ResampleAugmentor from data_utils.augmentor.online_bayesian_normalization import \ OnlineBayesianNormalizationAugmentor @@ -89,5 +90,7 @@ class AugmentationPipeline(object): return ResampleAugmentor(self._rng, **params) elif augmentor_type == "bayesian_normal": return OnlineBayesianNormalizationAugmentor(self._rng, **params) + elif augmentor_type == "noise": + return NoisePerturbAugmentor(self._rng, **params) else: raise ValueError("Unknown augmentor type [%s]." 
% augmentor_type) diff --git a/data_utils/augmentor/noise_perturb.py b/data_utils/augmentor/noise_perturb.py new file mode 100644 index 000000000..c97ab8432 --- /dev/null +++ b/data_utils/augmentor/noise_perturb.py @@ -0,0 +1,47 @@ +"""Contains the noise perturb augmentation model.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from data_utils.augmentor.base import AugmentorBase +from data_utils import utils +from data_utils.speech import SpeechSegment + + +class NoisePerturbAugmentor(AugmentorBase): + """Augmentation model for adding background noise. + + :param rng: Random generator object. + :type rng: random.Random + :param min_snr_dB: Minimal signal noise ratio, in decibels. + :type min_snr_dB: float + :param max_snr_dB: Maximal signal noise ratio, in decibels. + :type max_snr_dB: float + """ + + def __init__(self, rng, min_snr_dB, max_snr_dB, noise_manifest): + self._min_snr_dB = min_snr_dB + self._max_snr_dB = max_snr_dB + self._rng = rng + self._manifest = utils.read_manifest(manifest_path=noise_manifest) + + def transform_audio(self, audio_segment): + """Add background noise audio. + + Note that this is an in-place transformation. + + :param audio_segment: Audio segment to add effects to. + :type audio_segment: AudioSegmenet|SpeechSegment + """ + noise_json = self._rng.sample(self._manifest, 1)[0] + if noise_json['duration'] < audio_segment.duration: + raise RuntimeError("The duration of sampled noise audio is smaller " + "than the audio segment to add effects to.") + diff_duration = noise_json['duration'] - audio_segment.duration + start = self._rng.uniform(0, diff_duration) + end = start + audio_segment.duration + noise_segment = SpeechSegment.slice_from_file( + noise_json['audio_filepath'], transcript="", start=start, end=end) + snr_dB = self._rng.uniform(self._min_snr_dB, self._max_snr_dB) + audio_segment.add_noise( + noise_segment, snr_dB, allow_downsampling=True, rng=self._rng) diff --git a/data_utils/augmentor/online_bayesian_normalization.py b/data_utils/augmentor/online_bayesian_normalization.py old mode 100755 new mode 100644 diff --git a/data_utils/augmentor/resample.py b/data_utils/augmentor/resample.py old mode 100755 new mode 100644 diff --git a/datasets/noise/chime3_background.py b/datasets/noise/chime3_background.py new file mode 100644 index 000000000..f79ca7335 --- /dev/null +++ b/datasets/noise/chime3_background.py @@ -0,0 +1,128 @@ +"""Prepare CHiME3 background data. + +Download, unpack and create manifest files. +Manifest file is a json-format file with each line containing the +meta data (i.e. audio filepath, transcript and audio duration) +of each audio file in the data set. +""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import distutils.util +import os +import wget +import zipfile +import argparse +import soundfile +import json +from paddle.v2.dataset.common import md5file + +DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech') + +URL = "https://d4s.myairbridge.com/packagev2/AG0Y3DNBE5IWRRTV/?dlid=W19XG7T0NNHB027139H0EQ" +MD5 = "c3ff512618d7a67d4f85566ea1bc39ec" + +parser = argparse.ArgumentParser(description=__doc__) +parser.add_argument( + "--target_dir", + default=DATA_HOME + "/chime3_background", + type=str, + help="Directory to save the dataset. 
(default: %(default)s)") +parser.add_argument( + "--manifest_filepath", + default="manifest.chime3.background", + type=str, + help="Filepath for output manifests. (default: %(default)s)") +args = parser.parse_args() + + +def download(url, md5sum, target_dir, filename=None): + """Download file from url to target_dir, and check md5sum.""" + if filename == None: + filename = url.split("/")[-1] + if not os.path.exists(target_dir): os.makedirs(target_dir) + filepath = os.path.join(target_dir, filename) + if not (os.path.exists(filepath) and md5file(filepath) == md5sum): + print("Downloading %s ..." % url) + wget.download(url, target_dir) + print("\nMD5 Chesksum %s ..." % filepath) + if not md5file(filepath) == md5sum: + raise RuntimeError("MD5 checksum failed.") + else: + print("File exists, skip downloading. (%s)" % filepath) + return filepath + + +def unpack(filepath, target_dir): + """Unpack the file to the target_dir.""" + print("Unpacking %s ..." % filepath) + if filepath.endswith('.zip'): + zip = zipfile.ZipFile(filepath, 'r') + zip.extractall(target_dir) + zip.close() + elif filepath.endswith('.tar') or filepath.endswith('.tar.gz'): + tar = zipfile.open(filepath) + tar.extractall(target_dir) + tar.close() + else: + raise ValueError("File format is not supported for unpacking.") + + +def create_manifest(data_dir, manifest_path): + """Create a manifest json file summarizing the data set, with each line + containing the meta data (i.e. audio filepath, transcription text, audio + duration) of each audio file within the data set. + """ + print("Creating manifest %s ..." % manifest_path) + json_lines = [] + for subfolder, _, filelist in sorted(os.walk(data_dir)): + for filename in filelist: + if filename.endswith('.wav'): + filepath = os.path.join(data_dir, subfolder, filename) + audio_data, samplerate = soundfile.read(filepath) + duration = float(len(audio_data)) / samplerate + json_lines.append( + json.dumps({ + 'audio_filepath': filepath, + 'duration': duration, + 'text': '' + })) + with open(manifest_path, 'w') as out_file: + for line in json_lines: + out_file.write(line + '\n') + + +def prepare_chime3(url, md5sum, target_dir, manifest_path): + """Download, unpack and create summmary manifest file.""" + if not os.path.exists(os.path.join(target_dir, "CHiME3")): + # download + filepath = download(url, md5sum, target_dir, + "myairbridge-AG0Y3DNBE5IWRRTV.zip") + # unpack + unpack(filepath, target_dir) + unpack( + os.path.join(target_dir, 'CHiME3_background_bus.zip'), target_dir) + unpack( + os.path.join(target_dir, 'CHiME3_background_caf.zip'), target_dir) + unpack( + os.path.join(target_dir, 'CHiME3_background_ped.zip'), target_dir) + unpack( + os.path.join(target_dir, 'CHiME3_background_str.zip'), target_dir) + else: + print("Skip downloading and unpacking. Data already exists in %s." % + target_dir) + # create manifest json file + create_manifest(target_dir, manifest_path) + + +def main(): + prepare_chime3( + url=URL, + md5sum=MD5, + target_dir=args.target_dir, + manifest_path=args.manifest_filepath) + + +if __name__ == '__main__': + main() diff --git a/datasets/run_all.sh b/datasets/run_all.sh index ef2b721fb..61747a50b 100644 --- a/datasets/run_all.sh +++ b/datasets/run_all.sh @@ -6,8 +6,17 @@ if [ $? -ne 0 ]; then fi cd - +cd noise +python chime3_background.py +if [ $? -ne 0 ]; then + echo "Prepare CHiME3 background noise failed. Terminated." 
+ exit 1 +fi +cd - + cat librispeech/manifest.train* | shuf > manifest.train cat librispeech/manifest.dev-clean > manifest.dev cat librispeech/manifest.test-clean > manifest.test +cat noise/manifest.* > manifest.noise echo "All done." From 99e819e8eae355889c5e983abfbe50bb74e0748a Mon Sep 17 00:00:00 2001 From: Xinghai Sun Date: Tue, 4 Jul 2017 18:51:11 +0800 Subject: [PATCH 21/30] Add ImpulseResponseAugmentor and augmentation.config file. --- augmentation.config | 34 ++++++++++++++++ data_utils/audio.py | 8 ++-- data_utils/augmentor/augmentation.py | 52 ++++++++++++++++++------ data_utils/augmentor/impulse_response.py | 34 ++++++++++++++++ data_utils/augmentor/noise_perturb.py | 8 ++-- data_utils/data.py | 2 +- data_utils/speech.py | 2 +- train.py | 4 +- 8 files changed, 120 insertions(+), 24 deletions(-) create mode 100644 augmentation.config create mode 100644 data_utils/augmentor/impulse_response.py diff --git a/augmentation.config b/augmentation.config new file mode 100644 index 000000000..9ddedd407 --- /dev/null +++ b/augmentation.config @@ -0,0 +1,34 @@ +[ + { + "type": "noise", + "params": {"min_snr_dB": 50, + "max_snr_dB": 50, + "noise_manifest": "datasets/manifest.noise"}, + "prob": 0.0 + }, + { + "type": "speed", + "params": {"min_speed_rate": 0.9, + "max_speed_rate": 1.1}, + "prob": 0.0 + }, + { + "type": "shift", + "params": {"min_shift_ms": -5, + "max_shift_ms": 5}, + "prob": 1.0 + }, + { + "type": "volume", + "params": {"min_gain_dBFS": -10, + "max_gain_dBFS": 10}, + "prob": 0.0 + }, + { + "type": "bayesian_normal", + "params": {"target_db": -20, + "prior_db": -20, + "prior_samples": 100}, + "prob": 0.0 + } +] diff --git a/data_utils/audio.py b/data_utils/audio.py index 3891f5b92..30e25221c 100644 --- a/data_utils/audio.py +++ b/data_utils/audio.py @@ -204,7 +204,7 @@ class AudioSegment(object): :raise ValueError: If the sample rates of the two segments are not equal, or if the lengths of segments don't match. """ - if type(self) != type(other): + if isinstance(other, type(self)): raise TypeError("Cannot add segments of different types: %s " "and %s." % (type(self), type(other))) if self._sample_rate != other._sample_rate: @@ -231,7 +231,7 @@ class AudioSegment(object): Note that this is an in-place transformation. :param gain: Gain in decibels to apply to samples. - :type gain: float + :type gain: float|1darray """ self._samples *= 10.**(gain / 20.) @@ -457,9 +457,9 @@ class AudioSegment(object): audio segments when resample is not allowed. """ if allow_resample and self.sample_rate != impulse_segment.sample_rate: - impulse_segment = impulse_segment.resample(self.sample_rate) + impulse_segment.resample(self.sample_rate) if self.sample_rate != impulse_segment.sample_rate: - raise ValueError("Impulse segment's sample rate (%d Hz) is not" + raise ValueError("Impulse segment's sample rate (%d Hz) is not " "equal to base signal sample rate (%d Hz)." 
% (impulse_segment.sample_rate, self.sample_rate)) samples = signal.fftconvolve(self.samples, impulse_segment.samples, diff --git a/data_utils/augmentor/augmentation.py b/data_utils/augmentor/augmentation.py index 8a50e4400..c9e360313 100644 --- a/data_utils/augmentor/augmentation.py +++ b/data_utils/augmentor/augmentation.py @@ -9,6 +9,7 @@ from data_utils.augmentor.volume_perturb import VolumePerturbAugmentor from data_utils.augmentor.shift_perturb import ShiftPerturbAugmentor from data_utils.augmentor.speed_perturb import SpeedPerturbAugmentor from data_utils.augmentor.noise_perturb import NoisePerturbAugmentor +from data_utils.augmentor.impulse_response import ImpulseResponseAugmentor from data_utils.augmentor.resample import ResampleAugmentor from data_utils.augmentor.online_bayesian_normalization import \ OnlineBayesianNormalizationAugmentor @@ -24,21 +25,46 @@ class AugmentationPipeline(object): string, e.g. .. code-block:: - - '[{"type": "volume", - "params": {"min_gain_dBFS": -15, - "max_gain_dBFS": 15}, - "prob": 0.5}, - {"type": "speed", - "params": {"min_speed_rate": 0.8, - "max_speed_rate": 1.2}, - "prob": 0.5} - ]' + [ { + "type": "noise", + "params": {"min_snr_dB": 10, + "max_snr_dB": 20, + "noise_manifest": "datasets/manifest.noise"}, + "prob": 0.0 + }, + { + "type": "speed", + "params": {"min_speed_rate": 0.9, + "max_speed_rate": 1.1}, + "prob": 1.0 + }, + { + "type": "shift", + "params": {"min_shift_ms": -5, + "max_shift_ms": 5}, + "prob": 1.0 + }, + { + "type": "volume", + "params": {"min_gain_dBFS": -10, + "max_gain_dBFS": 10}, + "prob": 0.0 + }, + { + "type": "bayesian_normal", + "params": {"target_db": -20, + "prior_db": -20, + "prior_samples": 100}, + "prob": 0.0 + } + ] + This augmentation configuration inserts two augmentation models into the pipeline, with one is VolumePerturbAugmentor and the other SpeedPerturbAugmentor. "prob" indicates the probability of the current - augmentor to take effect. + augmentor to take effect. If "prob" is zero, the augmentor does not take + effect. :param augmentation_config: Augmentation configuration in json string. :type augmentation_config: str @@ -61,7 +87,7 @@ class AugmentationPipeline(object): :type audio_segment: AudioSegmenet|SpeechSegment """ for augmentor, rate in zip(self._augmentors, self._rates): - if self._rng.uniform(0., 1.) <= rate: + if self._rng.uniform(0., 1.) < rate: augmentor.transform_audio(audio_segment) def _parse_pipeline_from(self, config_json): @@ -92,5 +118,7 @@ class AugmentationPipeline(object): return OnlineBayesianNormalizationAugmentor(self._rng, **params) elif augmentor_type == "noise": return NoisePerturbAugmentor(self._rng, **params) + elif augmentor_type == "impulse": + return ImpulseResponseAugmentor(self._rng, **params) else: raise ValueError("Unknown augmentor type [%s]." % augmentor_type) diff --git a/data_utils/augmentor/impulse_response.py b/data_utils/augmentor/impulse_response.py new file mode 100644 index 000000000..d868c3a1c --- /dev/null +++ b/data_utils/augmentor/impulse_response.py @@ -0,0 +1,34 @@ +"""Contains the impulse response augmentation model.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from data_utils.augmentor.base import AugmentorBase +from data_utils import utils +from data_utils.audio import AudioSegment + + +class ImpulseResponseAugmentor(AugmentorBase): + """Augmentation model for adding impulse response effect. + + :param rng: Random generator object. 
+ :type rng: random.Random + :param impulse_manifest: Manifest path for impulse audio data. + :type impulse_manifest: basestring + """ + + def __init__(self, rng, impulse_manifest): + self._rng = rng + self._manifest = utils.read_manifest(manifest_path=impulse_manifest) + + def transform_audio(self, audio_segment): + """Add impulse response effect. + + Note that this is an in-place transformation. + + :param audio_segment: Audio segment to add effects to. + :type audio_segment: AudioSegmenet|SpeechSegment + """ + noise_json = self._rng.sample(self._manifest, 1)[0] + noise_segment = AudioSegment.from_file(noise_json['audio_filepath']) + audio_segment.convolve(noise_segment, allow_resample=True) diff --git a/data_utils/augmentor/noise_perturb.py b/data_utils/augmentor/noise_perturb.py index c97ab8432..b4fa18e18 100644 --- a/data_utils/augmentor/noise_perturb.py +++ b/data_utils/augmentor/noise_perturb.py @@ -5,7 +5,7 @@ from __future__ import print_function from data_utils.augmentor.base import AugmentorBase from data_utils import utils -from data_utils.speech import SpeechSegment +from data_utils.audio import AudioSegment class NoisePerturbAugmentor(AugmentorBase): @@ -17,6 +17,8 @@ class NoisePerturbAugmentor(AugmentorBase): :type min_snr_dB: float :param max_snr_dB: Maximal signal noise ratio, in decibels. :type max_snr_dB: float + :param noise_manifest: Manifest path for noise audio data. + :type noise_manifest: basestring """ def __init__(self, rng, min_snr_dB, max_snr_dB, noise_manifest): @@ -40,8 +42,8 @@ class NoisePerturbAugmentor(AugmentorBase): diff_duration = noise_json['duration'] - audio_segment.duration start = self._rng.uniform(0, diff_duration) end = start + audio_segment.duration - noise_segment = SpeechSegment.slice_from_file( - noise_json['audio_filepath'], transcript="", start=start, end=end) + noise_segment = AudioSegment.slice_from_file( + noise_json['audio_filepath'], start=start, end=end) snr_dB = self._rng.uniform(self._min_snr_dB, self._max_snr_dB) audio_segment.add_noise( noise_segment, snr_dB, allow_downsampling=True, rng=self._rng) diff --git a/data_utils/data.py b/data_utils/data.py index 34f32019c..159bf69d5 100644 --- a/data_utils/data.py +++ b/data_utils/data.py @@ -169,7 +169,7 @@ class DataGenerator(object): manifest, batch_size, clipped=True) elif shuffle_method == "instance_shuffle": self._rng.shuffle(manifest) - elif not shuffle_method: + elif shuffle_method == None: pass else: raise ValueError("Unknown shuffle method %s." % diff --git a/data_utils/speech.py b/data_utils/speech.py index 568e4443b..17d68f315 100644 --- a/data_utils/speech.py +++ b/data_utils/speech.py @@ -115,7 +115,7 @@ class SpeechSegment(AudioSegment): speech file. :rtype: SpeechSegment """ - audio = Audiosegment.slice_from_file(filepath, start, end) + audio = AudioSegment.slice_from_file(filepath, start, end) return cls(audio.samples, audio.sample_rate, transcript) @classmethod diff --git a/train.py b/train.py index aff619379..34c406015 100644 --- a/train.py +++ b/train.py @@ -123,9 +123,7 @@ parser.add_argument( help="Directory for saving models. (default: %(default)s)") parser.add_argument( "--augmentation_config", - default='[{"type": "shift", ' - '"params": {"min_shift_ms": -5, "max_shift_ms": 5},' - '"prob": 1.0}]', + default=open('augmentation.config', 'r').read(), type=str, help="Augmentation configuration in json-format. 
" "(default: %(default)s)") From 012261912d112d07da58dee40655975cf5cd3dd1 Mon Sep 17 00:00:00 2001 From: Xinghai Sun Date: Wed, 5 Jul 2017 14:18:12 +0800 Subject: [PATCH 22/30] Update default config in augmentation.config for DS2. --- augmentation.config | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/augmentation.config b/augmentation.config index 9ddedd407..263af5864 100644 --- a/augmentation.config +++ b/augmentation.config @@ -1,15 +1,15 @@ [ { "type": "noise", - "params": {"min_snr_dB": 50, + "params": {"min_snr_dB": 40, "max_snr_dB": 50, "noise_manifest": "datasets/manifest.noise"}, "prob": 0.0 }, { "type": "speed", - "params": {"min_speed_rate": 0.9, - "max_speed_rate": 1.1}, + "params": {"min_speed_rate": 0.95, + "max_speed_rate": 1.05}, "prob": 0.0 }, { @@ -24,11 +24,4 @@ "max_gain_dBFS": 10}, "prob": 0.0 }, - { - "type": "bayesian_normal", - "params": {"target_db": -20, - "prior_db": -20, - "prior_samples": 100}, - "prob": 0.0 - } ] From 1b787411d332bc0eec265c354562576b33112e91 Mon Sep 17 00:00:00 2001 From: Xinghai Sun Date: Tue, 8 Aug 2017 21:53:26 +0800 Subject: [PATCH 23/30] Update noise and impulse augmentor according to code review. --- conf/augmentation.config | 8 ++++++++ .../augmentation.config.example | 18 +++++++++++++++--- data_utils/augmentor/augmentation.py | 2 +- data_utils/augmentor/impulse_response.py | 15 ++++++++------- data_utils/augmentor/noise_perturb.py | 11 ++++++----- datasets/run_all.sh | 9 --------- datasets/run_noise.sh | 10 ++++++++++ train.py | 2 +- 8 files changed, 49 insertions(+), 26 deletions(-) create mode 100644 conf/augmentation.config rename augmentation.config => conf/augmentation.config.example (56%) create mode 100644 datasets/run_noise.sh diff --git a/conf/augmentation.config b/conf/augmentation.config new file mode 100644 index 000000000..6c24da549 --- /dev/null +++ b/conf/augmentation.config @@ -0,0 +1,8 @@ +[ + { + "type": "shift", + "params": {"min_shift_ms": -5, + "max_shift_ms": 5}, + "prob": 1.0 + } +] diff --git a/augmentation.config b/conf/augmentation.config.example similarity index 56% rename from augmentation.config rename to conf/augmentation.config.example index 263af5864..21ed6ee10 100644 --- a/augmentation.config +++ b/conf/augmentation.config.example @@ -3,14 +3,19 @@ "type": "noise", "params": {"min_snr_dB": 40, "max_snr_dB": 50, - "noise_manifest": "datasets/manifest.noise"}, - "prob": 0.0 + "noise_manifest_path": "datasets/manifest.noise"}, + "prob": 0.6 + }, + { + "type": "impulse", + "params": {"impulse_manifest_path": "datasets/manifest.impulse"}, + "prob": 0.5 }, { "type": "speed", "params": {"min_speed_rate": 0.95, "max_speed_rate": 1.05}, - "prob": 0.0 + "prob": 0.5 }, { "type": "shift", @@ -24,4 +29,11 @@ "max_gain_dBFS": 10}, "prob": 0.0 }, + { + "type": "bayesian_normal", + "params": {"target_db": -20, + "prior_db": -20, + "prior_samples": 100}, + "prob": 0.0 + } ] diff --git a/data_utils/augmentor/augmentation.py b/data_utils/augmentor/augmentation.py index c9e360313..5c30b627e 100644 --- a/data_utils/augmentor/augmentation.py +++ b/data_utils/augmentor/augmentation.py @@ -30,7 +30,7 @@ class AugmentationPipeline(object): "type": "noise", "params": {"min_snr_dB": 10, "max_snr_dB": 20, - "noise_manifest": "datasets/manifest.noise"}, + "noise_manifest_path": "datasets/manifest.noise"}, "prob": 0.0 }, { diff --git a/data_utils/augmentor/impulse_response.py b/data_utils/augmentor/impulse_response.py index d868c3a1c..c3de0fdbb 100644 --- a/data_utils/augmentor/impulse_response.py 
+++ b/data_utils/augmentor/impulse_response.py @@ -13,13 +13,14 @@ class ImpulseResponseAugmentor(AugmentorBase): :param rng: Random generator object. :type rng: random.Random - :param impulse_manifest: Manifest path for impulse audio data. - :type impulse_manifest: basestring + :param impulse_manifest_path: Manifest path for impulse audio data. + :type impulse_manifest_path: basestring """ - def __init__(self, rng, impulse_manifest): + def __init__(self, rng, impulse_manifest_path): self._rng = rng - self._manifest = utils.read_manifest(manifest_path=impulse_manifest) + self._impulse_manifest = utils.read_manifest( + manifest_path=impulse_manifest_path) def transform_audio(self, audio_segment): """Add impulse response effect. @@ -29,6 +30,6 @@ class ImpulseResponseAugmentor(AugmentorBase): :param audio_segment: Audio segment to add effects to. :type audio_segment: AudioSegmenet|SpeechSegment """ - noise_json = self._rng.sample(self._manifest, 1)[0] - noise_segment = AudioSegment.from_file(noise_json['audio_filepath']) - audio_segment.convolve(noise_segment, allow_resample=True) + impulse_json = self._rng.sample(self._impulse_manifest, 1)[0] + impulse_segment = AudioSegment.from_file(impulse_json['audio_filepath']) + audio_segment.convolve(impulse_segment, allow_resample=True) diff --git a/data_utils/augmentor/noise_perturb.py b/data_utils/augmentor/noise_perturb.py index b4fa18e18..281174af4 100644 --- a/data_utils/augmentor/noise_perturb.py +++ b/data_utils/augmentor/noise_perturb.py @@ -17,15 +17,16 @@ class NoisePerturbAugmentor(AugmentorBase): :type min_snr_dB: float :param max_snr_dB: Maximal signal noise ratio, in decibels. :type max_snr_dB: float - :param noise_manifest: Manifest path for noise audio data. - :type noise_manifest: basestring + :param noise_manifest_path: Manifest path for noise audio data. + :type noise_manifest_path: basestring """ - def __init__(self, rng, min_snr_dB, max_snr_dB, noise_manifest): + def __init__(self, rng, min_snr_dB, max_snr_dB, noise_manifest_path): self._min_snr_dB = min_snr_dB self._max_snr_dB = max_snr_dB self._rng = rng - self._manifest = utils.read_manifest(manifest_path=noise_manifest) + self._noise_manifest = utils.read_manifest( + manifest_path=noise_manifest_path) def transform_audio(self, audio_segment): """Add background noise audio. @@ -35,7 +36,7 @@ class NoisePerturbAugmentor(AugmentorBase): :param audio_segment: Audio segment to add effects to. :type audio_segment: AudioSegmenet|SpeechSegment """ - noise_json = self._rng.sample(self._manifest, 1)[0] + noise_json = self._rng.sample(self._noise_manifest, 1)[0] if noise_json['duration'] < audio_segment.duration: raise RuntimeError("The duration of sampled noise audio is smaller " "than the audio segment to add effects to.") diff --git a/datasets/run_all.sh b/datasets/run_all.sh index 61747a50b..ef2b721fb 100644 --- a/datasets/run_all.sh +++ b/datasets/run_all.sh @@ -6,17 +6,8 @@ if [ $? -ne 0 ]; then fi cd - -cd noise -python chime3_background.py -if [ $? -ne 0 ]; then - echo "Prepare CHiME3 background noise failed. Terminated." - exit 1 -fi -cd - - cat librispeech/manifest.train* | shuf > manifest.train cat librispeech/manifest.dev-clean > manifest.dev cat librispeech/manifest.test-clean > manifest.test -cat noise/manifest.* > manifest.noise echo "All done." diff --git a/datasets/run_noise.sh b/datasets/run_noise.sh new file mode 100644 index 000000000..7b27abde4 --- /dev/null +++ b/datasets/run_noise.sh @@ -0,0 +1,10 @@ +cd noise +python chime3_background.py +if [ $? 
-ne 0 ]; then + echo "Prepare CHiME3 background noise failed. Terminated." + exit 1 +fi +cd - + +cat noise/manifest.* > manifest.noise +echo "All done." diff --git a/train.py b/train.py index 34c406015..0d4e2508d 100644 --- a/train.py +++ b/train.py @@ -123,7 +123,7 @@ parser.add_argument( help="Directory for saving models. (default: %(default)s)") parser.add_argument( "--augmentation_config", - default=open('augmentation.config', 'r').read(), + default=open('conf/augmentation.config', 'r').read(), type=str, help="Augmentation configuration in json-format. " "(default: %(default)s)") From 7e39debcb03c1f5d9faa3548ec53647c379207af Mon Sep 17 00:00:00 2001 From: Xinghai Sun Date: Tue, 8 Aug 2017 22:17:20 +0800 Subject: [PATCH 24/30] Convert README.md's file mode to 644. --- README.md | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100755 => 100644 README.md diff --git a/README.md b/README.md old mode 100755 new mode 100644 From 14d2fb795c4b8cd145d9820016ea03d3293a58ea Mon Sep 17 00:00:00 2001 From: yangyaming Date: Wed, 9 Aug 2017 11:09:26 +0800 Subject: [PATCH 25/30] Unify encoding to 'utf-8' and optimize error rate calculation. --- data_utils/data.py | 8 +-- data_utils/featurizer/text_featurizer.py | 3 +- data_utils/utils.py | 7 ++- datasets/librispeech/librispeech.py | 3 +- error_rate.py | 77 ++++++++++++++---------- tests/test_error_rate.py | 18 +++++- 6 files changed, 71 insertions(+), 45 deletions(-) diff --git a/data_utils/data.py b/data_utils/data.py index 159bf69d5..14b02f993 100644 --- a/data_utils/data.py +++ b/data_utils/data.py @@ -91,7 +91,7 @@ class DataGenerator(object): :param transcript: Transcription text. :type transcript: basestring :return: Tuple of audio feature tensor and list of token ids for - transcription. + transcription. :rtype: tuple of (2darray, list) """ speech_segment = SpeechSegment.from_file(filename, transcript) @@ -111,7 +111,7 @@ class DataGenerator(object): """ Batch data reader creator for audio data. Return a callable generator function to produce batches of data. - + Audio features within one batch will be padded with zeros to have the same shape, or a user-defined shape. @@ -191,9 +191,9 @@ class DataGenerator(object): @property def feeding(self): """Returns data reader's feeding dict. - + :return: Data feeding dict. - :rtype: dict + :rtype: dict """ return {"audio_spectrogram": 0, "transcript_text": 1} diff --git a/data_utils/featurizer/text_featurizer.py b/data_utils/featurizer/text_featurizer.py index 4f9a49b59..89202163c 100644 --- a/data_utils/featurizer/text_featurizer.py +++ b/data_utils/featurizer/text_featurizer.py @@ -4,6 +4,7 @@ from __future__ import division from __future__ import print_function import os +import codecs class TextFeaturizer(object): @@ -59,7 +60,7 @@ class TextFeaturizer(object): def _load_vocabulary_from_file(self, vocab_filepath): """Load vocabulary from file.""" vocab_lines = [] - with open(vocab_filepath, 'r') as file: + with codecs.open(vocab_filepath, 'r', 'utf-8') as file: vocab_lines.extend(file.readlines()) vocab_list = [line[:-1] for line in vocab_lines] vocab_dict = dict( diff --git a/data_utils/utils.py b/data_utils/utils.py index 3f1165718..f970ff55a 100644 --- a/data_utils/utils.py +++ b/data_utils/utils.py @@ -4,15 +4,16 @@ from __future__ import division from __future__ import print_function import json +import codecs def read_manifest(manifest_path, max_duration=float('inf'), min_duration=0.0): """Load and parse manifest file. 
- + Instances with durations outside [min_duration, max_duration] will be filtered out. - :param manifest_path: Manifest file to load and parse. + :param manifest_path: Manifest file to load and parse. :type manifest_path: basestring :param max_duration: Maximal duration in seconds for instance filter. :type max_duration: float @@ -23,7 +24,7 @@ def read_manifest(manifest_path, max_duration=float('inf'), min_duration=0.0): :raises IOError: If failed to parse the manifest. """ manifest = [] - for json_line in open(manifest_path): + for json_line in codecs.open(manifest_path, 'r', 'utf-8'): try: json_data = json.loads(json_line) except Exception as e: diff --git a/datasets/librispeech/librispeech.py b/datasets/librispeech/librispeech.py index 7e941f0ea..422b1ed82 100644 --- a/datasets/librispeech/librispeech.py +++ b/datasets/librispeech/librispeech.py @@ -17,6 +17,7 @@ import argparse import soundfile import json from paddle.v2.dataset.common import md5file +import codecs DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech') @@ -112,7 +113,7 @@ def create_manifest(data_dir, manifest_path): 'duration': duration, 'text': text })) - with open(manifest_path, 'w') as out_file: + with codecs.open(manifest_path, 'w', 'utf-8') as out_file: for line in json_lines: out_file.write(line + '\n') diff --git a/error_rate.py b/error_rate.py index 0cf17921c..22e5c19b7 100644 --- a/error_rate.py +++ b/error_rate.py @@ -10,47 +10,52 @@ import numpy as np def _levenshtein_distance(ref, hyp): - """Levenshtein distance is a string metric for measuring the difference between - two sequences. Informally, the levenshtein disctance is defined as the minimum - number of single-character edits (substitutions, insertions or deletions) - required to change one word into the other. We can naturally extend the edits to - word level when calculate levenshtein disctance for two sentences. + """Levenshtein distance is a string metric for measuring the difference + between two sequences. Informally, the levenshtein disctance is defined as + the minimum number of single-character edits (substitutions, insertions or + deletions) required to change one word into the other. We can naturally + extend the edits to word level when calculate levenshtein disctance for + two sentences. 
""" - ref_len = len(ref) - hyp_len = len(hyp) + m = len(ref) + n = len(hyp) # special case if ref == hyp: return 0 - if ref_len == 0: - return hyp_len - if hyp_len == 0: - return ref_len + if m == 0: + return n + if n == 0: + return m - distance = np.zeros((ref_len + 1, hyp_len + 1), dtype=np.int32) + if m < n: + ref, hyp = hyp, ref + m, n = n, m + + # use O(min(m, n)) space + distance = np.zeros((2, n + 1), dtype=np.int32) # initialize distance matrix - for j in xrange(hyp_len + 1): + for j in xrange(n + 1): distance[0][j] = j - for i in xrange(ref_len + 1): - distance[i][0] = i # calculate levenshtein distance - for i in xrange(1, ref_len + 1): - for j in xrange(1, hyp_len + 1): + for i in xrange(1, m + 1): + distance[i % 2][0] = i + for j in xrange(1, n + 1): if ref[i - 1] == hyp[j - 1]: - distance[i][j] = distance[i - 1][j - 1] + distance[i % 2][j] = distance[(i - 1) % 2][j - 1] else: - s_num = distance[i - 1][j - 1] + 1 - i_num = distance[i][j - 1] + 1 - d_num = distance[i - 1][j] + 1 - distance[i][j] = min(s_num, i_num, d_num) + s_num = distance[(i - 1) % 2][j - 1] + 1 + i_num = distance[i % 2][j - 1] + 1 + d_num = distance[(i - 1) % 2][j] + 1 + distance[i % 2][j] = min(s_num, i_num, d_num) - return distance[ref_len][hyp_len] + return distance[m % 2][n] def wer(reference, hypothesis, ignore_case=False, delimiter=' '): - """Calculate word error rate (WER). WER compares reference text and + """Calculate word error rate (WER). WER compares reference text and hypothesis text in word-level. WER is defined as: .. math:: @@ -65,8 +70,8 @@ def wer(reference, hypothesis, ignore_case=False, delimiter=' '): Iw is the number of words inserted, Nw is the number of words in the reference - We can use levenshtein distance to calculate WER. Please draw an attention that - empty items will be removed when splitting sentences by delimiter. + We can use levenshtein distance to calculate WER. Please draw an attention + that empty items will be removed when splitting sentences by delimiter. :param reference: The reference sentence. :type reference: basestring @@ -95,7 +100,7 @@ def wer(reference, hypothesis, ignore_case=False, delimiter=' '): return wer -def cer(reference, hypothesis, ignore_case=False): +def cer(reference, hypothesis, ignore_case=False, remove_space=False): """Calculate charactor error rate (CER). CER compares reference text and hypothesis text in char-level. CER is defined as: @@ -111,10 +116,10 @@ def cer(reference, hypothesis, ignore_case=False): Ic is the number of characters inserted Nc is the number of characters in the reference - We can use levenshtein distance to calculate CER. Chinese input should be - encoded to unicode. Please draw an attention that the leading and tailing - white space characters will be truncated and multiple consecutive white - space characters in a sentence will be replaced by one white space character. + We can use levenshtein distance to calculate CER. Chinese input should be + encoded to unicode. Please draw an attention that the leading and tailing + space characters will be truncated and multiple consecutive space + characters in a sentence will be replaced by one space character. :param reference: The reference sentence. :type reference: basestring @@ -122,6 +127,8 @@ def cer(reference, hypothesis, ignore_case=False): :type hypothesis: basestring :param ignore_case: Whether case-sensitive or not. :type ignore_case: bool + :param remove_space: Whether remove internal space characters + :type remove_space: bool :return: Character error rate. 
:rtype: float :raises ValueError: If the reference length is zero. @@ -130,8 +137,12 @@ def cer(reference, hypothesis, ignore_case=False): reference = reference.lower() hypothesis = hypothesis.lower() - reference = ' '.join(filter(None, reference.split(' '))) - hypothesis = ' '.join(filter(None, hypothesis.split(' '))) + join_char = ' ' + if remove_space == True: + join_char = '' + + reference = join_char.join(filter(None, reference.split(' '))) + hypothesis = join_char.join(filter(None, hypothesis.split(' '))) if len(reference) == 0: raise ValueError("Length of reference should be greater than 0.") diff --git a/tests/test_error_rate.py b/tests/test_error_rate.py index be7313f35..370dd0da2 100644 --- a/tests/test_error_rate.py +++ b/tests/test_error_rate.py @@ -33,22 +33,34 @@ class TestParse(unittest.TestCase): self.assertTrue(abs(char_error_rate - 0.25) < 1e-6) def test_cer_2(self): + ref = 'werewolf' + hyp = 'weae wolf' + char_error_rate = error_rate.cer(ref, hyp, remove_space=True) + self.assertTrue(abs(char_error_rate - 0.125) < 1e-6) + + def test_cer_3(self): ref = 'werewolf' char_error_rate = error_rate.cer(ref, ref) self.assertEqual(char_error_rate, 0.0) - def test_cer_3(self): + def test_cer_4(self): ref = u'我是中国人' hyp = u'我是 美洲人' char_error_rate = error_rate.cer(ref, hyp) self.assertTrue(abs(char_error_rate - 0.6) < 1e-6) - def test_cer_4(self): + def test_cer_5(self): + ref = u'我 是 中 国 人' + hyp = u'我 是 美 洲 人' + char_error_rate = error_rate.cer(ref, hyp, remove_space=True) + self.assertTrue(abs(char_error_rate - 0.4) < 1e-6) + + def test_cer_6(self): ref = u'我是中国人' char_error_rate = error_rate.cer(ref, ref) self.assertFalse(char_error_rate, 0.0) - def test_cer_5(self): + def test_cer_7(self): ref = '' hyp = 'Hypothesis' with self.assertRaises(ValueError): From 04970705d6cef9538cba93c77d558790ede3c765 Mon Sep 17 00:00:00 2001 From: yangyaming Date: Wed, 9 Aug 2017 14:14:04 +0800 Subject: [PATCH 26/30] Add more test cases and make DP more clear. 
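
The Levenshtein DP in error_rate.py keeps only two rows of the distance
matrix, so memory stays O(min(m, n)) instead of O(m * n). A minimal
standalone sketch of the same rolling-row recurrence (illustrative only,
simplified from the module, not part of the diff below):

```
# Two-row Levenshtein DP, mirroring the prev_row_idx / cur_row_idx
# bookkeeping used in error_rate.py.
import numpy as np

def levenshtein(ref, hyp):
    m, n = len(ref), len(hyp)
    if ref == hyp:
        return 0
    if m == 0 or n == 0:
        return m or n
    if m < n:  # keep the shorter sequence on the columns
        ref, hyp, m, n = hyp, ref, n, m
    distance = np.zeros((2, n + 1), dtype=np.int32)
    distance[0] = np.arange(n + 1)  # cost of building hyp prefixes from ""
    for i in range(1, m + 1):
        prev, cur = (i - 1) % 2, i % 2
        distance[cur][0] = i
        for j in range(1, n + 1):
            if ref[i - 1] == hyp[j - 1]:
                distance[cur][j] = distance[prev][j - 1]
            else:
                distance[cur][j] = 1 + min(distance[prev][j - 1],  # substitution
                                           distance[cur][j - 1],   # insertion
                                           distance[prev][j])      # deletion
    return int(distance[m % 2][n])

print(levenshtein(list('kitten'), list('sitting')))  # 3
```

The added WER cases follow directly from this distance: e.g. in test_wer_1
the word-level edit distance is 10 against a 13-word reference, giving
10 / 13 ~= 0.769230769231.
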
--- datasets/librispeech/librispeech.py | 2 +- error_rate.py | 14 ++++---- tests/test_error_rate.py | 56 +++++++++++++++++++++++++---- 3 files changed, 59 insertions(+), 13 deletions(-) diff --git a/datasets/librispeech/librispeech.py b/datasets/librispeech/librispeech.py index 422b1ed82..d963a7d53 100644 --- a/datasets/librispeech/librispeech.py +++ b/datasets/librispeech/librispeech.py @@ -16,8 +16,8 @@ import tarfile import argparse import soundfile import json -from paddle.v2.dataset.common import md5file import codecs +from paddle.v2.dataset.common import md5file DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech') diff --git a/error_rate.py b/error_rate.py index 22e5c19b7..ea829f470 100644 --- a/error_rate.py +++ b/error_rate.py @@ -41,15 +41,17 @@ def _levenshtein_distance(ref, hyp): # calculate levenshtein distance for i in xrange(1, m + 1): - distance[i % 2][0] = i + prev_row_idx = (i - 1) % 2 + cur_row_idx = i % 2 + distance[cur_row_idx][0] = i for j in xrange(1, n + 1): if ref[i - 1] == hyp[j - 1]: - distance[i % 2][j] = distance[(i - 1) % 2][j - 1] + distance[cur_row_idx][j] = distance[prev_row_idx][j - 1] else: - s_num = distance[(i - 1) % 2][j - 1] + 1 - i_num = distance[i % 2][j - 1] + 1 - d_num = distance[(i - 1) % 2][j] + 1 - distance[i % 2][j] = min(s_num, i_num, d_num) + s_num = distance[prev_row_idx][j - 1] + 1 + i_num = distance[cur_row_idx][j - 1] + 1 + d_num = distance[prev_row_idx][j] + 1 + distance[cur_row_idx][j] = min(s_num, i_num, d_num) return distance[m % 2][n] diff --git a/tests/test_error_rate.py b/tests/test_error_rate.py index 370dd0da2..99e137a9a 100644 --- a/tests/test_error_rate.py +++ b/tests/test_error_rate.py @@ -11,16 +11,54 @@ import error_rate class TestParse(unittest.TestCase): def test_wer_1(self): ref = 'i UM the PHONE IS i LEFT THE portable PHONE UPSTAIRS last night' - hyp = 'i GOT IT TO the FULLEST i LOVE TO portable FROM OF STORES last night' + hyp = 'i GOT IT TO the FULLEST i LOVE TO portable FROM OF STORES last '\ + 'night' word_error_rate = error_rate.wer(ref, hyp) self.assertTrue(abs(word_error_rate - 0.769230769231) < 1e-6) def test_wer_2(self): + ref = 'as any in england i would say said gamewell proudly that is '\ + 'in his day' + hyp = 'as any in england i would say said came well proudly that is '\ + 'in his day' + word_error_rate = error_rate.wer(ref, hyp) + self.assertTrue(abs(word_error_rate - 0.1333333) < 1e-6) + + def test_wer_3(self): + ref = 'the lieutenant governor lilburn w boggs afterward governor '\ + 'was a pronounced mormon hater and throughout the period of '\ + 'the troubles he manifested sympathy with the persecutors' + hyp = 'the lieutenant governor little bit how bags afterward '\ + 'governor was a pronounced warman hater and throughout the '\ + 'period of th troubles he manifests sympathy with the '\ + 'persecutors' + word_error_rate = error_rate.wer(ref, hyp) + self.assertTrue(abs(word_error_rate - 0.2692307692) < 1e-6) + + def test_wer_4(self): + ref = 'the wood flamed up splendidly under the large brewing copper '\ + 'and it sighed so deeply' + hyp = 'the wood flame do splendidly under the large brewing copper '\ + 'and its side so deeply' + word_error_rate = error_rate.wer(ref, hyp) + self.assertTrue(abs(word_error_rate - 0.2666666667) < 1e-6) + + def test_wer_5(self): + ref = 'all the morning they trudged up the mountain path and at noon '\ + 'unc and ojo sat on a fallen tree trunk and ate the last of '\ + 'the bread which the old munchkin had placed in his pocket' + hyp = 'all the morning they 
trudged up the mountain path and at noon '\ + 'unc in ojo sat on a fallen tree trunk and ate the last of '\ + 'the bread which the old munchkin had placed in his pocket' + word_error_rate = error_rate.wer(ref, hyp) + self.assertTrue(abs(word_error_rate - 0.027027027) < 1e-6) + + def test_wer_6(self): ref = 'i UM the PHONE IS i LEFT THE portable PHONE UPSTAIRS last night' word_error_rate = error_rate.wer(ref, ref) self.assertEqual(word_error_rate, 0.0) - def test_wer_3(self): + def test_wer_7(self): ref = ' ' hyp = 'Hypothesis sentence' with self.assertRaises(ValueError): @@ -39,28 +77,34 @@ class TestParse(unittest.TestCase): self.assertTrue(abs(char_error_rate - 0.125) < 1e-6) def test_cer_3(self): + ref = 'were wolf' + hyp = 'were wolf' + char_error_rate = error_rate.cer(ref, hyp) + self.assertTrue(abs(char_error_rate - 0.0) < 1e-6) + + def test_cer_4(self): ref = 'werewolf' char_error_rate = error_rate.cer(ref, ref) self.assertEqual(char_error_rate, 0.0) - def test_cer_4(self): + def test_cer_5(self): ref = u'我是中国人' hyp = u'我是 美洲人' char_error_rate = error_rate.cer(ref, hyp) self.assertTrue(abs(char_error_rate - 0.6) < 1e-6) - def test_cer_5(self): + def test_cer_6(self): ref = u'我 是 中 国 人' hyp = u'我 是 美 洲 人' char_error_rate = error_rate.cer(ref, hyp, remove_space=True) self.assertTrue(abs(char_error_rate - 0.4) < 1e-6) - def test_cer_6(self): + def test_cer_7(self): ref = u'我是中国人' char_error_rate = error_rate.cer(ref, ref) self.assertFalse(char_error_rate, 0.0) - def test_cer_7(self): + def test_cer_8(self): ref = '' hyp = 'Hypothesis' with self.assertRaises(ValueError): From 1325cd9b8ed0d2d12042cdd0aaad9a7087ded162 Mon Sep 17 00:00:00 2001 From: yangyaming Date: Wed, 9 Aug 2017 16:21:44 +0800 Subject: [PATCH 27/30] Create 'tools' to hold tool scripts and add vocabulary dictionary building script. --- README.md | 6 +- tools/_init_paths.py | 16 +++++ tools/build_vocab.py | 63 +++++++++++++++++++ .../compute_mean_std.py | 1 + 4 files changed, 83 insertions(+), 3 deletions(-) create mode 100644 tools/_init_paths.py create mode 100644 tools/build_vocab.py rename compute_mean_std.py => tools/compute_mean_std.py (99%) diff --git a/README.md b/README.md index 96fbb7d09..9d39903b5 100644 --- a/README.md +++ b/README.md @@ -40,13 +40,13 @@ python datasets/librispeech/librispeech.py --help ### Preparing for Training ``` -python compute_mean_std.py +python tools/compute_mean_std.py ``` It will compute mean and stdandard deviation for audio features, and save them to a file with a default name `./mean_std.npz`. This file will be used in both training and inferencing. The default feature of audio data is power spectrum, and the mfcc feature is also supported. To train and infer based on mfcc feature, please generate this file by ``` -python compute_mean_std.py --specgram_type mfcc +python tools/compute_mean_std.py --specgram_type mfcc ``` and specify ```--specgram_type mfcc``` when running train.py, infer.py, evaluator.py or tune.py. 
@@ -54,7 +54,7 @@ and specify ```--specgram_type mfcc``` when running train.py, infer.py, evaluato More help for arguments: ``` -python compute_mean_std.py --help +python tools/compute_mean_std.py --help ``` ### Training diff --git a/tools/_init_paths.py b/tools/_init_paths.py new file mode 100644 index 000000000..3bb2fd197 --- /dev/null +++ b/tools/_init_paths.py @@ -0,0 +1,16 @@ +"""Set up paths for DS2""" + +import os.path +import sys + + +def add_path(path): + if path not in sys.path: + sys.path.insert(0, path) + + +this_dir = os.path.dirname(__file__) + +# Add project path to PYTHONPATH +proj_path = os.path.join(this_dir, '..') +add_path(proj_path) diff --git a/tools/build_vocab.py b/tools/build_vocab.py new file mode 100644 index 000000000..59be40318 --- /dev/null +++ b/tools/build_vocab.py @@ -0,0 +1,63 @@ +"""Build vocabulary dictionary from manifest files. + +Each item in vocabulary file is a character. +""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import argparse +import codecs +import json +from collections import Counter +import os.path + +parser = argparse.ArgumentParser( + description='Build vocabulary dictionary from transcription texts.') +parser.add_argument( + "--manifest_paths", + type=str, + help="Manifest paths for building vocabulary dictionary." + "You can provide multiple manifest files.", + nargs='+', + required=True) +parser.add_argument( + "--count_threshold", + default=0, + type=int, + help="Characters whose count below the threshold will be truncated. " + "(default: %(default)s)") +parser.add_argument( + "--vocab_path", + default='datasets/vocab/zh_vocab.txt', + type=str, + help="Filepath to write vocabularies. (default: %(default)s)") +args = parser.parse_args() + + +def count_manifest(counter, manifest_path): + for json_line in codecs.open(manifest_path, 'r', 'utf-8'): + try: + json_data = json.loads(json_line) + except Exception as e: + raise Exception('Error parsing manifest: %s, %s' % \ + (manifest_path, e)) + text = json_data['text'] + for char in text: + counter.update(char) + + +def main(): + counter = Counter() + for manifest_path in args.manifest_paths: + count_manifest(counter, manifest_path) + + count_sorted = sorted(counter.items(), key=lambda x: x[1], reverse=True) + with codecs.open(args.vocab_path, 'w', 'utf-8') as fout: + for item_pair in count_sorted: + if item_pair[1] < args.count_threshold: break + fout.write(item_pair[0] + '\n') + + +if __name__ == '__main__': + main() diff --git a/compute_mean_std.py b/tools/compute_mean_std.py similarity index 99% rename from compute_mean_std.py rename to tools/compute_mean_std.py index 0cc84e730..da49eb4c0 100644 --- a/compute_mean_std.py +++ b/tools/compute_mean_std.py @@ -4,6 +4,7 @@ from __future__ import division from __future__ import print_function import argparse +import _init_paths from data_utils.normalizer import FeatureNormalizer from data_utils.augmentor.augmentation import AugmentationPipeline from data_utils.featurizer.audio_featurizer import AudioFeaturizer From 5ef300f3f0538dc9a70e57e2b23ab63fb2cf4110 Mon Sep 17 00:00:00 2001 From: yangyaming Date: Wed, 9 Aug 2017 19:11:00 +0800 Subject: [PATCH 28/30] Make type of error rate optional. 
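
evaluate.py and infer.py now take an --error_rate_type flag
(choices: 'wer', 'cer'; default: 'wer') to pick the metric that is
reported. A possible invocation, assuming the other flags keep their
defaults (data and model paths may need adjusting for a real run):

```
# report character error rate instead of word error rate
python evaluate.py --error_rate_type cer

# same switch for the per-utterance output during inference
python infer.py --error_rate_type cer
```
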
--- evaluate.py | 26 ++++++++++++++++++++++---- infer.py | 19 ++++++++++++++++++- model.py | 2 +- 3 files changed, 41 insertions(+), 6 deletions(-) diff --git a/evaluate.py b/evaluate.py index 592b7b527..7406e0bdd 100644 --- a/evaluate.py +++ b/evaluate.py @@ -10,6 +10,7 @@ import paddle.v2 as paddle from data_utils.data import DataGenerator from model import DeepSpeech2Model from error_rate import wer +from error_rate import cer import utils parser = argparse.ArgumentParser(description=__doc__) @@ -111,6 +112,14 @@ parser.add_argument( default='datasets/vocab/eng_vocab.txt', type=str, help="Vocabulary filepath. (default: %(default)s)") +parser.add_argument( + "--error_rate_type", + default='wer', + choices=['wer', 'cer'], + type=str, + help="There are total two error rate types including wer and cer. wer " + "represents for word error rate while cer for character error rate. " + "(default: %(default)s)") args = parser.parse_args() @@ -136,7 +145,14 @@ def evaluate(): rnn_layer_size=args.rnn_layer_size, pretrained_model_path=args.model_filepath) - wer_sum, num_ins = 0.0, 0 + if args.error_rate_type == 'wer': + error_rate_func = wer + error_rate_info = 'WER' + else: + error_rate_func = cer + error_rate_info = 'CER' + + error_sum, num_ins = 0.0, 0 for infer_data in batch_reader(): result_transcripts = ds2_model.infer_batch( infer_data=infer_data, @@ -153,10 +169,12 @@ def evaluate(): for _, transcript in infer_data ] for target, result in zip(target_transcripts, result_transcripts): - wer_sum += wer(target, result) + error_sum += error_rate_func(target, result) num_ins += 1 - print("WER (%d/?) = %f" % (num_ins, wer_sum / num_ins)) - print("Final WER (%d/%d) = %f" % (num_ins, num_ins, wer_sum / num_ins)) + print("%s (%d/?) = %f" % \ + (error_rate_info, num_ins, error_sum / num_ins)) + print("Final %s (%d/%d) = %f" % \ + (error_rate_info, num_ins, num_ins, error_sum / num_ins)) def main(): diff --git a/infer.py b/infer.py index df5953e59..3aba847e7 100644 --- a/infer.py +++ b/infer.py @@ -10,6 +10,7 @@ import paddle.v2 as paddle from data_utils.data import DataGenerator from model import DeepSpeech2Model from error_rate import wer +from error_rate import cer import utils parser = argparse.ArgumentParser(description=__doc__) @@ -111,6 +112,14 @@ parser.add_argument( type=float, help="The cutoff probability of pruning" "in beam search. (default: %(default)f)") +parser.add_argument( + "--error_rate_type", + default='wer', + choices=['wer', 'cer'], + type=str, + help="There are total two error rate types including wer and cer. wer " + "represents for word error rate while cer for character error rate. 
" + "(default: %(default)s)") args = parser.parse_args() @@ -147,6 +156,13 @@ def infer(): language_model_path=args.language_model_path, num_processes=args.num_processes_beam_search) + if args.error_rate_type == 'wer': + error_rate_func = wer + error_rate_info = 'wer' + else: + error_rate_func = cer + error_rate_info = 'cer' + target_transcripts = [ ''.join([data_generator.vocab_list[token] for token in transcript]) for _, transcript in infer_data @@ -154,7 +170,8 @@ def infer(): for target, result in zip(target_transcripts, result_transcripts): print("\nTarget Transcription: %s\nOutput Transcription: %s" % (target, result)) - print("Current wer = %f" % wer(target, result)) + print("Current %s = %f" % \ + (error_rate_info, error_rate_func(target, result))) def main(): diff --git a/model.py b/model.py index 2eb7c3594..e2f2903b6 100644 --- a/model.py +++ b/model.py @@ -185,7 +185,7 @@ class DeepSpeech2Model(object): # best path decode for i, probs in enumerate(probs_split): output_transcription = ctc_best_path_decoder( - probs_seq=probs, vocabulary=data_generator.vocab_list) + probs_seq=probs, vocabulary=vocab_list) results.append(output_transcription) elif decode_method == "beam_search": # initialize external scorer From 4b3f768df7d165467fbdc44e6d91fae4a1715dea Mon Sep 17 00:00:00 2001 From: yangyaming Date: Wed, 9 Aug 2017 20:03:53 +0800 Subject: [PATCH 29/30] Simplify description and codes. --- evaluate.py | 23 ++++++++--------------- infer.py | 19 ++++++------------- 2 files changed, 14 insertions(+), 28 deletions(-) diff --git a/evaluate.py b/evaluate.py index 7406e0bdd..82dcec3c2 100644 --- a/evaluate.py +++ b/evaluate.py @@ -9,8 +9,7 @@ import multiprocessing import paddle.v2 as paddle from data_utils.data import DataGenerator from model import DeepSpeech2Model -from error_rate import wer -from error_rate import cer +from error_rate import wer, cer import utils parser = argparse.ArgumentParser(description=__doc__) @@ -117,8 +116,8 @@ parser.add_argument( default='wer', choices=['wer', 'cer'], type=str, - help="There are total two error rate types including wer and cer. wer " - "represents for word error rate while cer for character error rate. " + help="Error rate type for evaluation. 'wer' for word error rate and 'cer' " + "for character error rate. " "(default: %(default)s)") args = parser.parse_args() @@ -145,13 +144,7 @@ def evaluate(): rnn_layer_size=args.rnn_layer_size, pretrained_model_path=args.model_filepath) - if args.error_rate_type == 'wer': - error_rate_func = wer - error_rate_info = 'WER' - else: - error_rate_func = cer - error_rate_info = 'CER' - + error_rate_func = cer if args.error_rate_type == 'cer' else wer error_sum, num_ins = 0.0, 0 for infer_data in batch_reader(): result_transcripts = ds2_model.infer_batch( @@ -171,10 +164,10 @@ def evaluate(): for target, result in zip(target_transcripts, result_transcripts): error_sum += error_rate_func(target, result) num_ins += 1 - print("%s (%d/?) = %f" % \ - (error_rate_info, num_ins, error_sum / num_ins)) - print("Final %s (%d/%d) = %f" % \ - (error_rate_info, num_ins, num_ins, error_sum / num_ins)) + print("Error rate [%s] (%d/?) 
= %f" % + (args.error_rate_type, num_ins, error_sum / num_ins)) + print("Final error rate [%s] (%d/%d) = %f" % + (args.error_rate_type, num_ins, num_ins, error_sum / num_ins)) def main(): diff --git a/infer.py b/infer.py index 3aba847e7..43643cde7 100644 --- a/infer.py +++ b/infer.py @@ -9,8 +9,7 @@ import multiprocessing import paddle.v2 as paddle from data_utils.data import DataGenerator from model import DeepSpeech2Model -from error_rate import wer -from error_rate import cer +from error_rate import wer, cer import utils parser = argparse.ArgumentParser(description=__doc__) @@ -117,8 +116,8 @@ parser.add_argument( default='wer', choices=['wer', 'cer'], type=str, - help="There are total two error rate types including wer and cer. wer " - "represents for word error rate while cer for character error rate. " + help="Error rate type for evaluation. 'wer' for word error rate and 'cer' " + "for character error rate. " "(default: %(default)s)") args = parser.parse_args() @@ -156,13 +155,7 @@ def infer(): language_model_path=args.language_model_path, num_processes=args.num_processes_beam_search) - if args.error_rate_type == 'wer': - error_rate_func = wer - error_rate_info = 'wer' - else: - error_rate_func = cer - error_rate_info = 'cer' - + error_rate_func = cer if args.error_rate_type == 'cer' else wer target_transcripts = [ ''.join([data_generator.vocab_list[token] for token in transcript]) for _, transcript in infer_data @@ -170,8 +163,8 @@ def infer(): for target, result in zip(target_transcripts, result_transcripts): print("\nTarget Transcription: %s\nOutput Transcription: %s" % (target, result)) - print("Current %s = %f" % \ - (error_rate_info, error_rate_func(target, result))) + print("Current error rate [%s] = %f" % + (args.error_rate_type, error_rate_func(target, result))) def main(): From c2e6378a64b1526076e4fb99aa6f9228d25891c8 Mon Sep 17 00:00:00 2001 From: yangyaming Date: Wed, 9 Aug 2017 23:03:30 +0800 Subject: [PATCH 30/30] Simplify codes and comments. --- tools/_init_paths.py | 3 +++ tools/build_vocab.py | 32 ++++++++++++++------------------ 2 files changed, 17 insertions(+), 18 deletions(-) diff --git a/tools/_init_paths.py b/tools/_init_paths.py index 3bb2fd197..ddabb535b 100644 --- a/tools/_init_paths.py +++ b/tools/_init_paths.py @@ -1,4 +1,7 @@ """Set up paths for DS2""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function import os.path import sys diff --git a/tools/build_vocab.py b/tools/build_vocab.py index 59be40318..618f24985 100644 --- a/tools/build_vocab.py +++ b/tools/build_vocab.py @@ -1,4 +1,4 @@ -"""Build vocabulary dictionary from manifest files. +"""Build vocabulary from manifest files. Each item in vocabulary file is a character. """ @@ -11,13 +11,14 @@ import codecs import json from collections import Counter import os.path +import _init_paths +from data_utils import utils -parser = argparse.ArgumentParser( - description='Build vocabulary dictionary from transcription texts.') +parser = argparse.ArgumentParser(description=__doc__) parser.add_argument( "--manifest_paths", type=str, - help="Manifest paths for building vocabulary dictionary." + help="Manifest paths for building vocabulary." "You can provide multiple manifest files.", nargs='+', required=True) @@ -25,25 +26,20 @@ parser.add_argument( "--count_threshold", default=0, type=int, - help="Characters whose count below the threshold will be truncated. 
" - "(default: %(default)s)") + help="Characters whose counts are below the threshold will be truncated. " + "(default: %(default)i)") parser.add_argument( "--vocab_path", default='datasets/vocab/zh_vocab.txt', type=str, - help="Filepath to write vocabularies. (default: %(default)s)") + help="File path to write the vocabulary. (default: %(default)s)") args = parser.parse_args() def count_manifest(counter, manifest_path): - for json_line in codecs.open(manifest_path, 'r', 'utf-8'): - try: - json_data = json.loads(json_line) - except Exception as e: - raise Exception('Error parsing manifest: %s, %s' % \ - (manifest_path, e)) - text = json_data['text'] - for char in text: + manifest_jsons = utils.read_manifest(manifest_path) + for line_json in manifest_jsons: + for char in line_json['text']: counter.update(char) @@ -54,9 +50,9 @@ def main(): count_sorted = sorted(counter.items(), key=lambda x: x[1], reverse=True) with codecs.open(args.vocab_path, 'w', 'utf-8') as fout: - for item_pair in count_sorted: - if item_pair[1] < args.count_threshold: break - fout.write(item_pair[0] + '\n') + for char, count in count_sorted: + if count < args.count_threshold: break + fout.write(char + '\n') if __name__ == '__main__':