From 4b26bf620cc32c908964d7ec68b7ec6bec491206 Mon Sep 17 00:00:00 2001 From: wanghaoshuang Date: Mon, 14 Aug 2017 20:42:09 +0800 Subject: [PATCH 01/18] Rename self.local_data to self._local_data in class DataGenerator. --- data_utils/data.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/data_utils/data.py b/data_utils/data.py index 98180b4b..33fcadc7 100644 --- a/data_utils/data.py +++ b/data_utils/data.py @@ -85,9 +85,9 @@ class DataGenerator(object): self._rng = random.Random(random_seed) self._epoch = 0 # for caching tar files info - self.local_data = local() - self.local_data.tar2info = {} - self.local_data.tar2object = {} + self._local_data = local() + self._local_data.tar2info = {} + self._local_data.tar2object = {} def process_utterance(self, filename, transcript): """Load, augment, featurize and normalize for speech data. @@ -240,16 +240,16 @@ class DataGenerator(object): """ if file.startswith('tar:'): tarpath, filename = file.split(':', 1)[1].split('#', 1) - if 'tar2info' not in self.local_data.__dict__: - self.local_data.tar2info = {} - if 'tar2object' not in self.local_data.__dict__: - self.local_data.tar2object = {} - if tarpath not in self.local_data.tar2info: + if 'tar2info' not in self._local_data.__dict__: + self._local_data.tar2info = {} + if 'tar2object' not in self._local_data.__dict__: + self._local_data.tar2object = {} + if tarpath not in self._local_data.tar2info: object, infoes = self._parse_tar(tarpath) - self.local_data.tar2info[tarpath] = infoes - self.local_data.tar2object[tarpath] = object - return self.local_data.tar2object[tarpath].extractfile( - self.local_data.tar2info[tarpath][filename]) + self._local_data.tar2info[tarpath] = infoes + self._local_data.tar2object[tarpath] = object + return self._local_data.tar2object[tarpath].extractfile( + self._local_data.tar2info[tarpath][filename]) else: return open(file, 'r') From 5a632758450911eff0b0421aa111be5141a4a71a Mon Sep 17 00:00:00 2001 From: Xinghai Sun Date: Mon, 21 Aug 2017 21:54:28 +0800 Subject: [PATCH 02/18] Add GRU support. --- demo_server.py | 6 +++++ evaluate.py | 6 +++++ infer.py | 6 +++++ layer.py | 64 ++++++++++++++++++++++++++++++++++++++++++++------ model.py | 9 +++---- train.py | 8 ++++++- tune.py | 6 +++++ 7 files changed, 93 insertions(+), 12 deletions(-) diff --git a/demo_server.py b/demo_server.py index c7e7e94a..60d97239 100644 --- a/demo_server.py +++ b/demo_server.py @@ -66,6 +66,11 @@ parser.add_argument( default=512, type=int, help="RNN layer cell number. (default: %(default)s)") +parser.add_argument( + "--use_gru", + default=True, + type=bool, + help="Use GRU or simple RNN. (default: %(default)s)") parser.add_argument( "--use_gpu", default=True, @@ -199,6 +204,7 @@ def start_server(): num_conv_layers=args.num_conv_layers, num_rnn_layers=args.num_rnn_layers, rnn_layer_size=args.rnn_layer_size, + use_gru=args.use_gru, pretrained_model_path=args.model_filepath) # prepare ASR inference handler diff --git a/evaluate.py b/evaluate.py index 82dcec3c..2f87abbd 100644 --- a/evaluate.py +++ b/evaluate.py @@ -38,6 +38,11 @@ parser.add_argument( default=512, type=int, help="RNN layer cell number. (default: %(default)s)") +parser.add_argument( + "--use_gru", + default=True, + type=bool, + help="Use GRU or simple RNN. (default: %(default)s)") parser.add_argument( "--use_gpu", default=True, @@ -142,6 +147,7 @@ def evaluate(): num_conv_layers=args.num_conv_layers, num_rnn_layers=args.num_rnn_layers, rnn_layer_size=args.rnn_layer_size, + use_gru=args.use_gru, pretrained_model_path=args.model_filepath) error_rate_func = cer if args.error_rate_type == 'cer' else wer diff --git a/infer.py b/infer.py index 43643cde..91b08932 100644 --- a/infer.py +++ b/infer.py @@ -33,6 +33,11 @@ parser.add_argument( default=512, type=int, help="RNN layer cell number. (default: %(default)s)") +parser.add_argument( + "--use_gru", + default=True, + type=bool, + help="Use GRU or simple RNN. (default: %(default)s)") parser.add_argument( "--use_gpu", default=True, @@ -143,6 +148,7 @@ def infer(): num_conv_layers=args.num_conv_layers, num_rnn_layers=args.num_rnn_layers, rnn_layer_size=args.rnn_layer_size, + use_gru=args.use_gru, pretrained_model_path=args.model_filepath) result_transcripts = ds2_model.infer_batch( infer_data=infer_data, diff --git a/layer.py b/layer.py index 3b492645..1b1a5810 100644 --- a/layer.py +++ b/layer.py @@ -57,7 +57,7 @@ def bidirectional_simple_rnn_bn_layer(name, input, size, act): # input-hidden weights shared across bi-direcitonal rnn. input_proj = paddle.layer.fc( input=input, size=size, act=paddle.activation.Linear(), bias_attr=False) - # batch norm is only performed on input-state projection + # batch norm is only performed on input-state projection input_proj_bn = paddle.layer.batch_norm( input=input_proj, act=paddle.activation.Linear()) # forward and backward in time @@ -68,6 +68,38 @@ def bidirectional_simple_rnn_bn_layer(name, input, size, act): return paddle.layer.concat(input=[forward_simple_rnn, backward_simple_rnn]) +def bidirectional_gru_bn_layer(name, input, size, act): + """Bidirectonal gru layer with sequence-wise batch normalization. + The batch normalization is only performed on input-state weights. + + :param name: Name of the layer. + :type name: string + :param input: Input layer. + :type input: LayerOutput + :param size: Number of RNN cells. + :type size: int + :param act: Activation type. + :type act: BaseActivation + :return: Bidirectional simple rnn layer. + :rtype: LayerOutput + """ + # input-hidden weights shared across bi-direcitonal rnn. + input_proj = paddle.layer.fc( + input=input, + size=size * 3, + act=paddle.activation.Linear(), + bias_attr=False) + # batch norm is only performed on input-state projection + input_proj_bn = paddle.layer.batch_norm( + input=input_proj, act=paddle.activation.Linear()) + # forward and backward in time + forward_gru = paddle.layer.grumemory( + input=input_proj_bn, act=act, reverse=False) + backward_gru = paddle.layer.grumemory( + input=input_proj_bn, act=act, reverse=True) + return paddle.layer.concat(input=[forward_gru, backward_gru]) + + def conv_group(input, num_stacks): """Convolution group with stacked convolution layers. @@ -83,7 +115,7 @@ def conv_group(input, num_stacks): filter_size=(11, 41), num_channels_in=1, num_channels_out=32, - stride=(3, 2), + stride=(2, 2), padding=(5, 20), act=paddle.activation.BRelu()) for i in xrange(num_stacks - 1): @@ -100,7 +132,7 @@ def conv_group(input, num_stacks): return conv, output_num_channels, output_height -def rnn_group(input, size, num_stacks): +def rnn_group(input, size, num_stacks, use_gru): """RNN group with stacked bidirectional simple RNN layers. :param input: Input layer. @@ -109,13 +141,25 @@ def rnn_group(input, size, num_stacks): :type size: int :param num_stacks: Number of stacked rnn layers. :type num_stacks: int + :param use_gru: Use gru if set True. Use simple rnn if set False. + :type use_gru: bool :return: Output layer of the RNN group. :rtype: LayerOutput """ output = input for i in xrange(num_stacks): - output = bidirectional_simple_rnn_bn_layer( - name=str(i), input=output, size=size, act=paddle.activation.BRelu()) + if use_gru: + output = bidirectional_gru_bn_layer( + name=str(i), + input=output, + size=size, + act=paddle.activation.BRelu()) + else: + output = bidirectional_simple_rnn_bn_layer( + name=str(i), + input=output, + size=size, + act=paddle.activation.BRelu()) return output @@ -124,7 +168,8 @@ def deep_speech2(audio_data, dict_size, num_conv_layers=2, num_rnn_layers=3, - rnn_size=256): + rnn_size=256, + use_gru=True): """ The whole DeepSpeech2 model structure (a simplified version). @@ -140,6 +185,8 @@ def deep_speech2(audio_data, :type num_rnn_layers: int :param rnn_size: RNN layer size (number of RNN cells). :type rnn_size: int + :param use_gru: Use gru if set True. Use simple rnn if set False. + :type use_gru: bool :return: A tuple of an output unnormalized log probability layer ( before softmax) and a ctc cost layer. :rtype: tuple of LayerOutput @@ -157,7 +204,10 @@ def deep_speech2(audio_data, block_y=conv_group_height) # rnn group rnn_group_output = rnn_group( - input=conv2seq, size=rnn_size, num_stacks=num_rnn_layers) + input=conv2seq, + size=rnn_size, + num_stacks=num_rnn_layers, + use_gru=use_gru) fc = paddle.layer.fc( input=rnn_group_output, size=dict_size + 1, diff --git a/model.py b/model.py index 99412e59..eec971c0 100644 --- a/model.py +++ b/model.py @@ -30,9 +30,9 @@ class DeepSpeech2Model(object): """ def __init__(self, vocab_size, num_conv_layers, num_rnn_layers, - rnn_layer_size, pretrained_model_path): + rnn_layer_size, use_gru, pretrained_model_path): self._create_network(vocab_size, num_conv_layers, num_rnn_layers, - rnn_layer_size) + rnn_layer_size, use_gru) self._create_parameters(pretrained_model_path) self._inferer = None self._loss_inferer = None @@ -226,7 +226,7 @@ class DeepSpeech2Model(object): gzip.open(model_path)) def _create_network(self, vocab_size, num_conv_layers, num_rnn_layers, - rnn_layer_size): + rnn_layer_size, use_gru): """Create data layers and model network.""" # paddle.data_type.dense_array is used for variable batch input. # The size 161 * 161 is only an placeholder value and the real shape @@ -243,4 +243,5 @@ class DeepSpeech2Model(object): dict_size=vocab_size, num_conv_layers=num_conv_layers, num_rnn_layers=num_rnn_layers, - rnn_size=rnn_layer_size) + rnn_size=rnn_layer_size, + use_gru=use_gru) diff --git a/train.py b/train.py index 262d8bf0..8e95d7bc 100644 --- a/train.py +++ b/train.py @@ -37,9 +37,14 @@ parser.add_argument( help="RNN layer number. (default: %(default)s)") parser.add_argument( "--rnn_layer_size", - default=512, + default=1280, type=int, help="RNN layer cell number. (default: %(default)s)") +parser.add_argument( + "--use_gru", + default=True, + type=bool, + help="Use GRU or simple RNN. (default: %(default)s)") parser.add_argument( "--adam_learning_rate", default=5e-4, @@ -170,6 +175,7 @@ def train(): num_conv_layers=args.num_conv_layers, num_rnn_layers=args.num_rnn_layers, rnn_layer_size=args.rnn_layer_size, + use_gru=args.use_gru, pretrained_model_path=args.init_model_path) ds2_model.train( train_batch_reader=train_batch_reader, diff --git a/tune.py b/tune.py index 328d67a1..8a9b5b61 100644 --- a/tune.py +++ b/tune.py @@ -34,6 +34,11 @@ parser.add_argument( default=512, type=int, help="RNN layer cell number. (default: %(default)s)") +parser.add_argument( + "--use_gru", + default=True, + type=bool, + help="Use GRU or simple RNN. (default: %(default)s)") parser.add_argument( "--use_gpu", default=True, @@ -158,6 +163,7 @@ def tune(): num_conv_layers=args.num_conv_layers, num_rnn_layers=args.num_rnn_layers, rnn_layer_size=args.rnn_layer_size, + use_gru=args.use_gru, pretrained_model_path=args.model_filepath) # create grid for search From d7a2c0e9908e6cc2ceba41aaed43931464091373 Mon Sep 17 00:00:00 2001 From: Xinghai Sun Date: Mon, 21 Aug 2017 22:00:01 +0800 Subject: [PATCH 03/18] Replace activator BRelu with Relu. --- layer.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/layer.py b/layer.py index 1b1a5810..c4055aaa 100644 --- a/layer.py +++ b/layer.py @@ -117,7 +117,7 @@ def conv_group(input, num_stacks): num_channels_out=32, stride=(2, 2), padding=(5, 20), - act=paddle.activation.BRelu()) + act=paddle.activation.Relu()) for i in xrange(num_stacks - 1): conv = conv_bn_layer( input=conv, @@ -126,7 +126,7 @@ def conv_group(input, num_stacks): num_channels_out=32, stride=(1, 2), padding=(5, 10), - act=paddle.activation.BRelu()) + act=paddle.activation.Relu()) output_num_channels = 32 output_height = 160 // pow(2, num_stacks) + 1 return conv, output_num_channels, output_height @@ -153,13 +153,13 @@ def rnn_group(input, size, num_stacks, use_gru): name=str(i), input=output, size=size, - act=paddle.activation.BRelu()) + act=paddle.activation.Relu()) else: output = bidirectional_simple_rnn_bn_layer( name=str(i), input=output, size=size, - act=paddle.activation.BRelu()) + act=paddle.activation.Relu()) return output From 1d163ad15f7bd37799c7015024cbebb110680b95 Mon Sep 17 00:00:00 2001 From: Xinghai Sun Date: Thu, 31 Aug 2017 12:22:27 +0800 Subject: [PATCH 04/18] Fixed a serious mistake of bidirectional simple rnn for DS2. --- cloud/pcloud_submit.sh | 4 ++-- layer.py | 16 ++++++++++------ 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/cloud/pcloud_submit.sh b/cloud/pcloud_submit.sh index a7fb42cb..3c9a1c26 100644 --- a/cloud/pcloud_submit.sh +++ b/cloud/pcloud_submit.sh @@ -1,6 +1,6 @@ TRAIN_MANIFEST="cloud/cloud.manifest.train" DEV_MANIFEST="cloud/cloud.manifest.dev" -CLOUD_MODEL_DIR="/pfs/dlnel/home/USERNAME/deepspeech2/model" +CLOUD_MODEL_DIR="./checkpoints" BATCH_SIZE=256 NUM_GPU=8 NUM_NODE=1 @@ -11,7 +11,7 @@ DS2_PATH=${PWD%/*} cp -f pcloud_train.sh ${DS2_PATH} paddlecloud submit \ --image bootstrapper:5000/wanghaoshuang/pcloud_ds2:latest \ +-image bootstrapper:5000/paddlepaddle/pcloud_ds2:latest \ -jobname ${JOB_NAME} \ -cpu ${NUM_GPU} \ -gpu ${NUM_GPU} \ diff --git a/layer.py b/layer.py index 3b492645..ef25c0a1 100644 --- a/layer.py +++ b/layer.py @@ -55,16 +55,20 @@ def bidirectional_simple_rnn_bn_layer(name, input, size, act): :rtype: LayerOutput """ # input-hidden weights shared across bi-direcitonal rnn. - input_proj = paddle.layer.fc( + input_proj_forward = paddle.layer.fc( input=input, size=size, act=paddle.activation.Linear(), bias_attr=False) - # batch norm is only performed on input-state projection - input_proj_bn = paddle.layer.batch_norm( - input=input_proj, act=paddle.activation.Linear()) + input_proj_backward = paddle.layer.fc( + input=input, size=size, act=paddle.activation.Linear(), bias_attr=False) + # batch norm is only performed on input-state projection + input_proj_bn_forward = paddle.layer.batch_norm( + input=input_proj_forward, act=paddle.activation.Linear()) + input_proj_bn_backward = paddle.layer.batch_norm( + input=input_proj_backward, act=paddle.activation.Linear()) # forward and backward in time forward_simple_rnn = paddle.layer.recurrent( - input=input_proj_bn, act=act, reverse=False) + input=input_proj_bn_forward, act=act, reverse=False) backward_simple_rnn = paddle.layer.recurrent( - input=input_proj_bn, act=act, reverse=True) + input=input_proj_bn_backward, act=act, reverse=True) return paddle.layer.concat(input=[forward_simple_rnn, backward_simple_rnn]) From aed0cc991f45bffa56f5947b84ab14784bc11f87 Mon Sep 17 00:00:00 2001 From: Xinghai Sun Date: Sun, 3 Sep 2017 17:24:04 +0800 Subject: [PATCH 05/18] Fixed a bug of mixing forward and backward projection in bi-directional GRUs. --- layer.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/layer.py b/layer.py index c4055aaa..8fec0eea 100644 --- a/layer.py +++ b/layer.py @@ -84,19 +84,26 @@ def bidirectional_gru_bn_layer(name, input, size, act): :rtype: LayerOutput """ # input-hidden weights shared across bi-direcitonal rnn. - input_proj = paddle.layer.fc( + input_proj_forward = paddle.layer.fc( + input=input, + size=size * 3, + act=paddle.activation.Linear(), + bias_attr=False) + input_proj_backward = paddle.layer.fc( input=input, size=size * 3, act=paddle.activation.Linear(), bias_attr=False) # batch norm is only performed on input-state projection - input_proj_bn = paddle.layer.batch_norm( - input=input_proj, act=paddle.activation.Linear()) + input_proj_bn_forward = paddle.layer.batch_norm( + input=input_proj_forward, act=paddle.activation.Linear()) + input_proj_bn_backward = paddle.layer.batch_norm( + input=input_proj_backward, act=paddle.activation.Linear()) # forward and backward in time forward_gru = paddle.layer.grumemory( - input=input_proj_bn, act=act, reverse=False) + input=input_proj_bn_forward, act=act, reverse=False) backward_gru = paddle.layer.grumemory( - input=input_proj_bn, act=act, reverse=True) + input=input_proj_bn_backward, act=act, reverse=True) return paddle.layer.concat(input=[forward_gru, backward_gru]) From 8f89a9bdd49a77b49c756700440a19bcd57a6667 Mon Sep 17 00:00:00 2001 From: Xinghai Sun Date: Mon, 4 Sep 2017 13:06:54 +0800 Subject: [PATCH 06/18] Print log to pfs for DS cloud training and set use_gru to False by default. --- cloud/pcloud_train.sh | 4 ++-- train.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/cloud/pcloud_train.sh b/cloud/pcloud_train.sh index e42da1d6..75949574 100644 --- a/cloud/pcloud_train.sh +++ b/cloud/pcloud_train.sh @@ -13,7 +13,7 @@ python ./cloud/split_data.py \ --in_manifest_path=${DEV_MANIFEST} \ --out_manifest_path='/local.manifest.dev' -python train.py \ +python -u train.py \ --batch_size=$BATCH_SIZE \ --use_gpu=1 \ --trainer_count=${NUM_GPU} \ @@ -21,4 +21,4 @@ python train.py \ --is_local=${IS_LOCAL} \ --train_manifest_path='/local.manifest.train' \ --dev_manifest_path='/local.manifest.dev' \ ---output_model_dir=${MODEL_PATH} \ +--output_model_dir=${MODEL_PATH} 2>&1 | tee ./log/train.log diff --git a/train.py b/train.py index 8e95d7bc..1d0b92ff 100644 --- a/train.py +++ b/train.py @@ -37,12 +37,12 @@ parser.add_argument( help="RNN layer number. (default: %(default)s)") parser.add_argument( "--rnn_layer_size", - default=1280, + default=1024, type=int, help="RNN layer cell number. (default: %(default)s)") parser.add_argument( "--use_gru", - default=True, + default=False, type=bool, help="Use GRU or simple RNN. (default: %(default)s)") parser.add_argument( From 177af059532946964ada888e526dcc33d74c275c Mon Sep 17 00:00:00 2001 From: Xinghai Sun Date: Mon, 4 Sep 2017 15:01:40 +0800 Subject: [PATCH 07/18] Fix a bug in use_gru argument parsing. --- demo_server.py | 4 ++-- evaluate.py | 4 ++-- infer.py | 4 ++-- train.py | 2 +- tune.py | 4 ++-- 5 files changed, 9 insertions(+), 9 deletions(-) diff --git a/demo_server.py b/demo_server.py index 60d97239..e4093ab2 100644 --- a/demo_server.py +++ b/demo_server.py @@ -68,8 +68,8 @@ parser.add_argument( help="RNN layer cell number. (default: %(default)s)") parser.add_argument( "--use_gru", - default=True, - type=bool, + default=False, + type=distutils.util.strtobool, help="Use GRU or simple RNN. (default: %(default)s)") parser.add_argument( "--use_gpu", diff --git a/evaluate.py b/evaluate.py index 2f87abbd..8ab5b944 100644 --- a/evaluate.py +++ b/evaluate.py @@ -40,8 +40,8 @@ parser.add_argument( help="RNN layer cell number. (default: %(default)s)") parser.add_argument( "--use_gru", - default=True, - type=bool, + default=False, + type=distutils.util.strtobool, help="Use GRU or simple RNN. (default: %(default)s)") parser.add_argument( "--use_gpu", diff --git a/infer.py b/infer.py index 91b08932..6b77f3d7 100644 --- a/infer.py +++ b/infer.py @@ -35,8 +35,8 @@ parser.add_argument( help="RNN layer cell number. (default: %(default)s)") parser.add_argument( "--use_gru", - default=True, - type=bool, + default=False, + type=distutils.util.strtobool, help="Use GRU or simple RNN. (default: %(default)s)") parser.add_argument( "--use_gpu", diff --git a/train.py b/train.py index 1d0b92ff..42870bf5 100644 --- a/train.py +++ b/train.py @@ -43,7 +43,7 @@ parser.add_argument( parser.add_argument( "--use_gru", default=False, - type=bool, + type=distutils.util.strtobool, help="Use GRU or simple RNN. (default: %(default)s)") parser.add_argument( "--adam_learning_rate", diff --git a/tune.py b/tune.py index 8a9b5b61..ffab8860 100644 --- a/tune.py +++ b/tune.py @@ -36,8 +36,8 @@ parser.add_argument( help="RNN layer cell number. (default: %(default)s)") parser.add_argument( "--use_gru", - default=True, - type=bool, + default=False, + type=distutils.util.strtobool, help="Use GRU or simple RNN. (default: %(default)s)") parser.add_argument( "--use_gpu", From 2aa4af1c29ac22208fb33371a53677fabbd6d9d0 Mon Sep 17 00:00:00 2001 From: Xinghai Sun Date: Mon, 4 Sep 2017 17:56:25 +0800 Subject: [PATCH 08/18] Revert back to support input-hidden weights sharing between bi-directional RNNs. 1. Add options to enable and disable RNN weights sharing. 2. Set rnn_layer_size to 2048 by default. 3. Revert back the striding steps of 1st conv layer from 2 to 3. 4. Revert back to BRelu. Above follows DS2 papers. --- demo_server.py | 12 +++++-- evaluate.py | 12 +++++-- infer.py | 12 +++++-- layer.py | 89 +++++++++++++++++++++++++++++++++++--------------- model.py | 14 +++++--- train.py | 12 +++++-- tune.py | 12 +++++-- utils.py | 6 ++-- 8 files changed, 126 insertions(+), 43 deletions(-) diff --git a/demo_server.py b/demo_server.py index e4093ab2..b000e35e 100644 --- a/demo_server.py +++ b/demo_server.py @@ -63,9 +63,16 @@ parser.add_argument( help="RNN layer number. (default: %(default)s)") parser.add_argument( "--rnn_layer_size", - default=512, + default=2048, type=int, help="RNN layer cell number. (default: %(default)s)") +parser.add_argument( + "--share_rnn_weights", + default=True, + type=distutils.util.strtobool, + help="Whether to share input-hidden weights between forword and backward " + "directional simple RNNs. Only available when use_gru=False. " + "(default: %(default)s)") parser.add_argument( "--use_gru", default=False, @@ -205,7 +212,8 @@ def start_server(): num_rnn_layers=args.num_rnn_layers, rnn_layer_size=args.rnn_layer_size, use_gru=args.use_gru, - pretrained_model_path=args.model_filepath) + pretrained_model_path=args.model_filepath, + share_rnn_weights=args.share_rnn_weights) # prepare ASR inference handler def file_to_transcript(filename): diff --git a/evaluate.py b/evaluate.py index 8ab5b944..8dd169b6 100644 --- a/evaluate.py +++ b/evaluate.py @@ -35,9 +35,16 @@ parser.add_argument( help="RNN layer number. (default: %(default)s)") parser.add_argument( "--rnn_layer_size", - default=512, + default=2048, type=int, help="RNN layer cell number. (default: %(default)s)") +parser.add_argument( + "--share_rnn_weights", + default=True, + type=distutils.util.strtobool, + help="Whether to share input-hidden weights between forword and backward " + "directional simple RNNs. Only available when use_gru=False. " + "(default: %(default)s)") parser.add_argument( "--use_gru", default=False, @@ -148,7 +155,8 @@ def evaluate(): num_rnn_layers=args.num_rnn_layers, rnn_layer_size=args.rnn_layer_size, use_gru=args.use_gru, - pretrained_model_path=args.model_filepath) + pretrained_model_path=args.model_filepath, + share_rnn_weights=args.share_rnn_weights) error_rate_func = cer if args.error_rate_type == 'cer' else wer error_sum, num_ins = 0.0, 0 diff --git a/infer.py b/infer.py index 6b77f3d7..0c52ffc8 100644 --- a/infer.py +++ b/infer.py @@ -30,9 +30,16 @@ parser.add_argument( help="RNN layer number. (default: %(default)s)") parser.add_argument( "--rnn_layer_size", - default=512, + default=2048, type=int, help="RNN layer cell number. (default: %(default)s)") +parser.add_argument( + "--share_rnn_weights", + default=True, + type=distutils.util.strtobool, + help="Whether to share input-hidden weights between forword and backward " + "directional simple RNNs. Only available when use_gru=False. " + "(default: %(default)s)") parser.add_argument( "--use_gru", default=False, @@ -149,7 +156,8 @@ def infer(): num_rnn_layers=args.num_rnn_layers, rnn_layer_size=args.rnn_layer_size, use_gru=args.use_gru, - pretrained_model_path=args.model_filepath) + pretrained_model_path=args.model_filepath, + share_rnn_weights=args.share_rnn_weights) result_transcripts = ds2_model.infer_batch( infer_data=infer_data, decode_method=args.decode_method, diff --git a/layer.py b/layer.py index a91f694b..b7ac3c23 100644 --- a/layer.py +++ b/layer.py @@ -39,7 +39,7 @@ def conv_bn_layer(input, filter_size, num_channels_in, num_channels_out, stride, return paddle.layer.batch_norm(input=conv_layer, act=act) -def bidirectional_simple_rnn_bn_layer(name, input, size, act): +def bidirectional_simple_rnn_bn_layer(name, input, size, act, share_weights): """Bidirectonal simple rnn layer with sequence-wise batch normalization. The batch normalization is only performed on input-state weights. @@ -51,24 +51,50 @@ def bidirectional_simple_rnn_bn_layer(name, input, size, act): :type size: int :param act: Activation type. :type act: BaseActivation + :param share_weights: Whether to share input-hidden weights between + forward and backward directional RNNs. + :type share_weights: bool :return: Bidirectional simple rnn layer. :rtype: LayerOutput """ - # input-hidden weights shared across bi-direcitonal rnn. - input_proj_forward = paddle.layer.fc( - input=input, size=size, act=paddle.activation.Linear(), bias_attr=False) - input_proj_backward = paddle.layer.fc( - input=input, size=size, act=paddle.activation.Linear(), bias_attr=False) - # batch norm is only performed on input-state projection - input_proj_bn_forward = paddle.layer.batch_norm( - input=input_proj_forward, act=paddle.activation.Linear()) - input_proj_bn_backward = paddle.layer.batch_norm( - input=input_proj_backward, act=paddle.activation.Linear()) - # forward and backward in time - forward_simple_rnn = paddle.layer.recurrent( - input=input_proj_bn_forward, act=act, reverse=False) - backward_simple_rnn = paddle.layer.recurrent( - input=input_proj_bn_backward, act=act, reverse=True) + if share_weights: + # input-hidden weights shared between bi-direcitonal rnn. + input_proj = paddle.layer.fc( + input=input, + size=size, + act=paddle.activation.Linear(), + bias_attr=False) + # batch norm is only performed on input-state projection + input_proj_bn = paddle.layer.batch_norm( + input=input_proj, act=paddle.activation.Linear()) + # forward and backward in time + forward_simple_rnn = paddle.layer.recurrent( + input=input_proj_bn, act=act, reverse=False) + backward_simple_rnn = paddle.layer.recurrent( + input=input_proj_bn, act=act, reverse=True) + + else: + input_proj_forward = paddle.layer.fc( + input=input, + size=size, + act=paddle.activation.Linear(), + bias_attr=False) + input_proj_backward = paddle.layer.fc( + input=input, + size=size, + act=paddle.activation.Linear(), + bias_attr=False) + # batch norm is only performed on input-state projection + input_proj_bn_forward = paddle.layer.batch_norm( + input=input_proj_forward, act=paddle.activation.Linear()) + input_proj_bn_backward = paddle.layer.batch_norm( + input=input_proj_backward, act=paddle.activation.Linear()) + # forward and backward in time + forward_simple_rnn = paddle.layer.recurrent( + input=input_proj_bn_forward, act=act, reverse=False) + backward_simple_rnn = paddle.layer.recurrent( + input=input_proj_bn_backward, act=act, reverse=True) + return paddle.layer.concat(input=[forward_simple_rnn, backward_simple_rnn]) @@ -87,7 +113,6 @@ def bidirectional_gru_bn_layer(name, input, size, act): :return: Bidirectional simple rnn layer. :rtype: LayerOutput """ - # input-hidden weights shared across bi-direcitonal rnn. input_proj_forward = paddle.layer.fc( input=input, size=size * 3, @@ -98,7 +123,7 @@ def bidirectional_gru_bn_layer(name, input, size, act): size=size * 3, act=paddle.activation.Linear(), bias_attr=False) - # batch norm is only performed on input-state projection + # batch norm is only performed on input-related projections input_proj_bn_forward = paddle.layer.batch_norm( input=input_proj_forward, act=paddle.activation.Linear()) input_proj_bn_backward = paddle.layer.batch_norm( @@ -126,9 +151,9 @@ def conv_group(input, num_stacks): filter_size=(11, 41), num_channels_in=1, num_channels_out=32, - stride=(2, 2), + stride=(3, 2), padding=(5, 20), - act=paddle.activation.Relu()) + act=paddle.activation.BRelu()) for i in xrange(num_stacks - 1): conv = conv_bn_layer( input=conv, @@ -137,13 +162,13 @@ def conv_group(input, num_stacks): num_channels_out=32, stride=(1, 2), padding=(5, 10), - act=paddle.activation.Relu()) + act=paddle.activation.BRelu()) output_num_channels = 32 output_height = 160 // pow(2, num_stacks) + 1 return conv, output_num_channels, output_height -def rnn_group(input, size, num_stacks, use_gru): +def rnn_group(input, size, num_stacks, use_gru, share_rnn_weights): """RNN group with stacked bidirectional simple RNN layers. :param input: Input layer. @@ -154,6 +179,10 @@ def rnn_group(input, size, num_stacks, use_gru): :type num_stacks: int :param use_gru: Use gru if set True. Use simple rnn if set False. :type use_gru: bool + :param share_rnn_weights: Whether to share input-hidden weights between + forward and backward directional RNNs. + It is only available when use_gru=False. + :type share_weights: bool :return: Output layer of the RNN group. :rtype: LayerOutput """ @@ -165,12 +194,14 @@ def rnn_group(input, size, num_stacks, use_gru): input=output, size=size, act=paddle.activation.Relu()) + # BRelu does not support hppl, need to add later. Use Relu instead. else: output = bidirectional_simple_rnn_bn_layer( name=str(i), input=output, size=size, - act=paddle.activation.Relu()) + act=paddle.activation.BRelu(), + share_weights=share_rnn_weights) return output @@ -180,9 +211,10 @@ def deep_speech2(audio_data, num_conv_layers=2, num_rnn_layers=3, rnn_size=256, - use_gru=True): + use_gru=False, + share_rnn_weights=True): """ - The whole DeepSpeech2 model structure (a simplified version). + The whole DeepSpeech2 model structure. :param audio_data: Audio spectrogram data layer. :type audio_data: LayerOutput @@ -198,6 +230,10 @@ def deep_speech2(audio_data, :type rnn_size: int :param use_gru: Use gru if set True. Use simple rnn if set False. :type use_gru: bool + :param share_rnn_weights: Whether to share input-hidden weights between + forward and backward direction RNNs. + It is only available when use_gru=False. + :type share_weights: bool :return: A tuple of an output unnormalized log probability layer ( before softmax) and a ctc cost layer. :rtype: tuple of LayerOutput @@ -218,7 +254,8 @@ def deep_speech2(audio_data, input=conv2seq, size=rnn_size, num_stacks=num_rnn_layers, - use_gru=use_gru) + use_gru=use_gru, + share_rnn_weights=share_rnn_weights) fc = paddle.layer.fc( input=rnn_group_output, size=dict_size + 1, diff --git a/model.py b/model.py index eec971c0..0234ed2d 100644 --- a/model.py +++ b/model.py @@ -27,12 +27,17 @@ class DeepSpeech2Model(object): :param pretrained_model_path: Pretrained model path. If None, will train from stratch. :type pretrained_model_path: basestring|None + :param share_rnn_weights: Whether to share input-hidden weights between + forward and backward directional RNNs.Notice that + for GRU, weight sharing is not supported. + :type share_rnn_weights: bool """ def __init__(self, vocab_size, num_conv_layers, num_rnn_layers, - rnn_layer_size, use_gru, pretrained_model_path): + rnn_layer_size, use_gru, pretrained_model_path, + share_rnn_weights): self._create_network(vocab_size, num_conv_layers, num_rnn_layers, - rnn_layer_size, use_gru) + rnn_layer_size, use_gru, share_rnn_weights) self._create_parameters(pretrained_model_path) self._inferer = None self._loss_inferer = None @@ -226,7 +231,7 @@ class DeepSpeech2Model(object): gzip.open(model_path)) def _create_network(self, vocab_size, num_conv_layers, num_rnn_layers, - rnn_layer_size, use_gru): + rnn_layer_size, use_gru, share_rnn_weights): """Create data layers and model network.""" # paddle.data_type.dense_array is used for variable batch input. # The size 161 * 161 is only an placeholder value and the real shape @@ -244,4 +249,5 @@ class DeepSpeech2Model(object): num_conv_layers=num_conv_layers, num_rnn_layers=num_rnn_layers, rnn_size=rnn_layer_size, - use_gru=use_gru) + use_gru=use_gru, + share_rnn_weights=share_rnn_weights) diff --git a/train.py b/train.py index 42870bf5..d055341f 100644 --- a/train.py +++ b/train.py @@ -37,9 +37,16 @@ parser.add_argument( help="RNN layer number. (default: %(default)s)") parser.add_argument( "--rnn_layer_size", - default=1024, + default=2048, type=int, help="RNN layer cell number. (default: %(default)s)") +parser.add_argument( + "--share_rnn_weights", + default=True, + type=distutils.util.strtobool, + help="Whether to share input-hidden weights between forword and backward " + "directional simple RNNs. Only available when use_gru=False. " + "(default: %(default)s)") parser.add_argument( "--use_gru", default=False, @@ -176,7 +183,8 @@ def train(): num_rnn_layers=args.num_rnn_layers, rnn_layer_size=args.rnn_layer_size, use_gru=args.use_gru, - pretrained_model_path=args.init_model_path) + pretrained_model_path=args.init_model_path, + share_rnn_weights=args.share_rnn_weights) ds2_model.train( train_batch_reader=train_batch_reader, dev_batch_reader=dev_batch_reader, diff --git a/tune.py b/tune.py index ffab8860..d8001339 100644 --- a/tune.py +++ b/tune.py @@ -31,9 +31,16 @@ parser.add_argument( help="RNN layer number. (default: %(default)s)") parser.add_argument( "--rnn_layer_size", - default=512, + default=2048, type=int, help="RNN layer cell number. (default: %(default)s)") +parser.add_argument( + "--share_rnn_weights", + default=True, + type=distutils.util.strtobool, + help="Whether to share input-hidden weights between forword and backward " + "directional simple RNNs. Only available when use_gru=False. " + "(default: %(default)s)") parser.add_argument( "--use_gru", default=False, @@ -164,7 +171,8 @@ def tune(): num_rnn_layers=args.num_rnn_layers, rnn_layer_size=args.rnn_layer_size, use_gru=args.use_gru, - pretrained_model_path=args.model_filepath) + pretrained_model_path=args.model_filepath, + share_rnn_weights=args.share_rnn_weights) # create grid for search cand_alphas = np.linspace(args.alpha_from, args.alpha_to, args.num_alphas) diff --git a/utils.py b/utils.py index 9ca363c8..1d51e204 100644 --- a/utils.py +++ b/utils.py @@ -10,12 +10,12 @@ def print_arguments(args): Usage: .. code-block:: python - + parser = argparse.ArgumentParser() parser.add_argument("name", default="Jonh", type=str, help="User name.") - args = parser.parse_args() + args = parser.parse_args() print_arguments(args) - + :param args: Input argparse.Namespace for printing. :type args: argparse.Namespace """ From 805846ce67bd82c183c9ab5e6fb3872c31b241d6 Mon Sep 17 00:00:00 2001 From: Xinghai Sun Date: Mon, 4 Sep 2017 20:13:08 +0800 Subject: [PATCH 09/18] Reduce the config parsing codes for DS2 and make it looks cleaner. --- decoder.py | 15 ++-- demo_server.py | 187 ++++++++++++++++----------------------- evaluate.py | 206 ++++++++++++++++--------------------------- infer.py | 205 ++++++++++++++++-------------------------- model.py | 19 ++-- train.py | 235 ++++++++++++++++++------------------------------- tune.py | 214 +++++++++++++++++--------------------------- utils.py | 25 ------ 8 files changed, 415 insertions(+), 691 deletions(-) delete mode 100644 utils.py diff --git a/decoder.py b/decoder.py index 8f2e0508..61ead25c 100644 --- a/decoder.py +++ b/decoder.py @@ -9,8 +9,9 @@ from math import log import multiprocessing -def ctc_best_path_decoder(probs_seq, vocabulary): - """Best path decoder, also called argmax decoder or greedy decoder. +def ctc_greedy_decoder(probs_seq, vocabulary): + """CTC greedy (best path) decoder. + Path consisting of the most probable tokens are further post-processed to remove consecutive repetitions and all blanks. @@ -45,10 +46,12 @@ def ctc_beam_search_decoder(probs_seq, cutoff_prob=1.0, ext_scoring_func=None, nproc=False): - """Beam search decoder for CTC-trained network. It utilizes beam search - to approximately select top best decoding labels and returning results - in the descending order. The implementation is based on Prefix - Beam Search (https://arxiv.org/abs/1408.2873), and the unclear part is + """CTC Beam search decoder. + + It utilizes beam search to approximately select top best decoding + labels and returning results in the descending order. + The implementation is based on Prefix Beam Search + (https://arxiv.org/abs/1408.2873), and the unclear part is redesigned. Two important modifications: 1) in the iterative computation of probabilities, the assignment operation is changed to accumulation for one prefix may comes from different paths; 2) the if condition "if l^+ not diff --git a/demo_server.py b/demo_server.py index b000e35e..d2afa49b 100644 --- a/demo_server.py +++ b/demo_server.py @@ -9,118 +9,74 @@ import SocketServer import struct import wave import paddle.v2 as paddle -from utils import print_arguments from data_utils.data import DataGenerator from model import DeepSpeech2Model from data_utils.utils import read_manifest parser = argparse.ArgumentParser(description=__doc__) -parser.add_argument( - "--host_ip", - default="localhost", - type=str, - help="Server IP address. (default: %(default)s)") -parser.add_argument( - "--host_port", - default=8086, - type=int, - help="Server Port. (default: %(default)s)") -parser.add_argument( - "--speech_save_dir", - default="demo_cache", - type=str, - help="Directory for saving demo speech. (default: %(default)s)") -parser.add_argument( - "--vocab_filepath", - default='datasets/vocab/eng_vocab.txt', - type=str, - help="Vocabulary filepath. (default: %(default)s)") -parser.add_argument( - "--mean_std_filepath", - default='mean_std.npz', - type=str, - help="Manifest path for normalizer. (default: %(default)s)") -parser.add_argument( - "--warmup_manifest_path", - default='datasets/manifest.test', - type=str, - help="Manifest path for warmup test. (default: %(default)s)") -parser.add_argument( - "--specgram_type", - default='linear', - type=str, - help="Feature type of audio data: 'linear' (power spectrum)" - " or 'mfcc'. (default: %(default)s)") -parser.add_argument( - "--num_conv_layers", - default=2, - type=int, - help="Convolution layer number. (default: %(default)s)") -parser.add_argument( - "--num_rnn_layers", - default=3, - type=int, - help="RNN layer number. (default: %(default)s)") -parser.add_argument( - "--rnn_layer_size", - default=2048, - type=int, - help="RNN layer cell number. (default: %(default)s)") -parser.add_argument( - "--share_rnn_weights", - default=True, - type=distutils.util.strtobool, - help="Whether to share input-hidden weights between forword and backward " - "directional simple RNNs. Only available when use_gru=False. " - "(default: %(default)s)") -parser.add_argument( - "--use_gru", - default=False, - type=distutils.util.strtobool, - help="Use GRU or simple RNN. (default: %(default)s)") -parser.add_argument( - "--use_gpu", - default=True, - type=distutils.util.strtobool, - help="Use gpu or not. (default: %(default)s)") -parser.add_argument( - "--model_filepath", - default='checkpoints/params.latest.tar.gz', - type=str, - help="Model filepath. (default: %(default)s)") -parser.add_argument( - "--decode_method", - default='beam_search', - type=str, - help="Method for ctc decoding: best_path or beam_search. " - "(default: %(default)s)") -parser.add_argument( - "--beam_size", - default=100, - type=int, - help="Width for beam search decoding. (default: %(default)d)") -parser.add_argument( - "--language_model_path", - default="lm/data/common_crawl_00.prune01111.trie.klm", - type=str, - help="Path for language model. (default: %(default)s)") -parser.add_argument( - "--alpha", - default=0.36, - type=float, - help="Parameter associated with language model. (default: %(default)f)") -parser.add_argument( - "--beta", - default=0.25, - type=float, - help="Parameter associated with word count. (default: %(default)f)") -parser.add_argument( - "--cutoff_prob", - default=0.99, - type=float, - help="The cutoff probability of pruning" - "in beam search. (default: %(default)f)") + + +def add_arg(argname, type, default, help, **kwargs): + type = distutils.util.strtobool if type == bool else type + parser.add_argument( + "--" + argname, + default=default, + type=type, + help=help + ' Default: %(default)s.', + **kwargs) + + +# yapf: disable +# configurations of overall +add_arg('host_port', int, 8086, "Server's IP port.") +add_arg('host_ip', str, + 'localhost', + "Server's IP address.") +add_arg('speech_save_dir', str, + 'demo_cache', + "Directory to save demo audios.") +add_arg('use_gpu', bool, True, "Use GPU or not.") +# configurations of decoder +add_arg('beam_size', int, 500, "Beam search width.") +add_arg('alpha', float, 0.36, "Coef of LM for beam search.") +add_arg('beta', float, 0.25, "Coef of WC for beam search.") +add_arg('cutoff_prob', float, 0.99, "Cutoff probability for pruning.") +add_arg('lang_model_path', str, + 'lm/data/common_crawl_00.prune01111.trie.klm', + "Filepath for language model.") +add_arg('decoder_method', str, + 'ctc_beam_search', + "Decoder method. Options: ctc_beam_search, ctc_greedy", + choices = ['ctc_beam_search', 'ctc_greedy']) +# configurations of data preprocess +add_arg('specgram_type', str, + 'linear', + "Audio feature type. Options: linear, mfcc.", + choices=['linear', 'mfcc']) +# configurations of model structure +add_arg('num_conv_layers', int, 2, "# of convolution layers.") +add_arg('num_rnn_layers', int, 3, "# of recurrent layers.") +add_arg('rnn_layer_size', int, 2048, "# of recurrent cells per layer.") +add_arg('use_gru', bool, False, "Use GRUs instead of Simple RNNs.") +add_arg('share_rnn_weights',bool, True, "Share input-hidden weights across " + "bi-directional RNNs. Not for GRU.") +# configurations of data io +add_arg('warmup_manifest', str, + 'datasets/manifest.test', + "Filepath of manifest to warm up.") +add_arg('mean_std_path', str, + 'mean_std.npz', + "Filepath of normalizer's mean & std.") +add_arg('vocab_path', str, + 'datasets/vocab/eng_vocab.txt', + "Filepath of vocabulary.") +# configurations of model io +add_arg('model_path', str, + './checkpoints/params.latest.tar.gz', + "If None, the training starts from scratch, " + "otherwise, it resumes from the pre-trained model.") args = parser.parse_args() +# yapf: disable class AsrTCPServer(SocketServer.TCPServer): @@ -200,8 +156,8 @@ def start_server(): """Start the ASR server""" # prepare data generator data_generator = DataGenerator( - vocab_filepath=args.vocab_filepath, - mean_std_filepath=args.mean_std_filepath, + vocab_filepath=args.vocab_path, + mean_std_filepath=args.mean_std_path, augmentation_config='{}', specgram_type=args.specgram_type, num_threads=1) @@ -212,7 +168,7 @@ def start_server(): num_rnn_layers=args.num_rnn_layers, rnn_layer_size=args.rnn_layer_size, use_gru=args.use_gru, - pretrained_model_path=args.model_filepath, + pretrained_model_path=args.model_path, share_rnn_weights=args.share_rnn_weights) # prepare ASR inference handler @@ -220,13 +176,13 @@ def start_server(): feature = data_generator.process_utterance(filename, "") result_transcript = ds2_model.infer_batch( infer_data=[feature], - decode_method=args.decode_method, + decoder_method=args.decoder_method, beam_alpha=args.alpha, beam_beta=args.beta, beam_size=args.beam_size, cutoff_prob=args.cutoff_prob, vocab_list=data_generator.vocab_list, - language_model_path=args.language_model_path, + language_model_path=args.lang_model_path, num_processes=1) return result_transcript[0] @@ -235,7 +191,7 @@ def start_server(): print('Warming up ...') warm_up_test( audio_process_handler=file_to_transcript, - manifest_path=args.warmup_manifest_path, + manifest_path=args.warmup_manifest, num_test_cases=3) print('-----------------------------------------------------------') @@ -249,6 +205,13 @@ def start_server(): server.serve_forever() +def print_arguments(args): + print("----------- Configuration Arguments -----------") + for arg, value in sorted(vars(args).iteritems()): + print("%s: %s" % (arg, value)) + print("------------------------------------------------") + + def main(): print_arguments(args) paddle.init(use_gpu=args.use_gpu, trainer_count=1) diff --git a/evaluate.py b/evaluate.py index 8dd169b6..1adf4255 100644 --- a/evaluate.py +++ b/evaluate.py @@ -10,140 +10,83 @@ import paddle.v2 as paddle from data_utils.data import DataGenerator from model import DeepSpeech2Model from error_rate import wer, cer -import utils +NUM_CPU = multiprocessing.cpu_count() // 2 parser = argparse.ArgumentParser(description=__doc__) -parser.add_argument( - "--batch_size", - default=128, - type=int, - help="Minibatch size for evaluation. (default: %(default)s)") -parser.add_argument( - "--trainer_count", - default=8, - type=int, - help="Trainer number. (default: %(default)s)") -parser.add_argument( - "--num_conv_layers", - default=2, - type=int, - help="Convolution layer number. (default: %(default)s)") -parser.add_argument( - "--num_rnn_layers", - default=3, - type=int, - help="RNN layer number. (default: %(default)s)") -parser.add_argument( - "--rnn_layer_size", - default=2048, - type=int, - help="RNN layer cell number. (default: %(default)s)") -parser.add_argument( - "--share_rnn_weights", - default=True, - type=distutils.util.strtobool, - help="Whether to share input-hidden weights between forword and backward " - "directional simple RNNs. Only available when use_gru=False. " - "(default: %(default)s)") -parser.add_argument( - "--use_gru", - default=False, - type=distutils.util.strtobool, - help="Use GRU or simple RNN. (default: %(default)s)") -parser.add_argument( - "--use_gpu", - default=True, - type=distutils.util.strtobool, - help="Use gpu or not. (default: %(default)s)") -parser.add_argument( - "--num_threads_data", - default=multiprocessing.cpu_count() // 2, - type=int, - help="Number of cpu threads for preprocessing data. (default: %(default)s)") -parser.add_argument( - "--num_processes_beam_search", - default=multiprocessing.cpu_count() // 2, - type=int, - help="Number of cpu processes for beam search. (default: %(default)s)") -parser.add_argument( - "--mean_std_filepath", - default='mean_std.npz', - type=str, - help="Manifest path for normalizer. (default: %(default)s)") -parser.add_argument( - "--decode_method", - default='beam_search', - type=str, - help="Method for ctc decoding, best_path or beam_search. " - "(default: %(default)s)") -parser.add_argument( - "--language_model_path", - default="lm/data/common_crawl_00.prune01111.trie.klm", - type=str, - help="Path for language model. (default: %(default)s)") -parser.add_argument( - "--alpha", - default=0.36, - type=float, - help="Parameter associated with language model. (default: %(default)f)") -parser.add_argument( - "--beta", - default=0.25, - type=float, - help="Parameter associated with word count. (default: %(default)f)") -parser.add_argument( - "--cutoff_prob", - default=0.99, - type=float, - help="The cutoff probability of pruning" - "in beam search. (default: %(default)f)") -parser.add_argument( - "--beam_size", - default=500, - type=int, - help="Width for beam search decoding. (default: %(default)d)") -parser.add_argument( - "--specgram_type", - default='linear', - type=str, - help="Feature type of audio data: 'linear' (power spectrum)" - " or 'mfcc'. (default: %(default)s)") -parser.add_argument( - "--decode_manifest_path", - default='datasets/manifest.test', - type=str, - help="Manifest path for decoding. (default: %(default)s)") -parser.add_argument( - "--model_filepath", - default='checkpoints/params.latest.tar.gz', - type=str, - help="Model filepath. (default: %(default)s)") -parser.add_argument( - "--vocab_filepath", - default='datasets/vocab/eng_vocab.txt', - type=str, - help="Vocabulary filepath. (default: %(default)s)") -parser.add_argument( - "--error_rate_type", - default='wer', - choices=['wer', 'cer'], - type=str, - help="Error rate type for evaluation. 'wer' for word error rate and 'cer' " - "for character error rate. " - "(default: %(default)s)") + + +def add_arg(argname, type, default, help, **kwargs): + type = distutils.util.strtobool if type == bool else type + parser.add_argument( + "--" + argname, + default=default, + type=type, + help=help + ' Default: %(default)s.', + **kwargs) + + +# yapf: disable +# configurations of overall +add_arg('batch_size', int, 128, "Minibatch size.") +add_arg('trainer_count', int, 8, "# of Trainers (CPUs or GPUs).") +add_arg('use_gpu', bool, True, "Use GPU or not.") +add_arg('error_rate_type', str, 'wer', "Error rate type for evaluation.", + choices=['wer', 'cer']) +# configurations of decoder +add_arg('beam_size', int, 500, "Beam search width.") +add_arg('alpha', float, 0.36, "Coef of LM for beam search.") +add_arg('beta', float, 0.25, "Coef of WC for beam search.") +add_arg('cutoff_prob', float, 0.99, "Cutoff probability for pruning.") +add_arg('parallels_bsearch',int, NUM_CPU,"# of CPUs for beam search.") +add_arg('lang_model_path', str, + 'lm/data/common_crawl_00.prune01111.trie.klm', + "Filepath for language model.") +add_arg('decoder_method', str, + 'ctc_beam_search', + "Decoder method. Options: ctc_beam_search, ctc_greedy", + choices = ['ctc_beam_search', 'ctc_greedy']) +# configurations of data preprocess +add_arg('parallels_data', int, NUM_CPU,"# of CPUs for data preprocessing.") +add_arg('specgram_type', str, + 'linear', + "Audio feature type. Options: linear, mfcc.", + choices=['linear', 'mfcc']) +# configurations of model structure +add_arg('num_conv_layers', int, 2, "# of convolution layers.") +add_arg('num_rnn_layers', int, 3, "# of recurrent layers.") +add_arg('rnn_layer_size', int, 2048, "# of recurrent cells per layer.") +add_arg('use_gru', bool, False, "Use GRUs instead of Simple RNNs.") +add_arg('share_rnn_weights',bool, True, "Share input-hidden weights across " + "bi-directional RNNs. Not for GRU.") +# configurations of data io +add_arg('test_manifest', str, + 'datasets/manifest.test', + "Filepath of manifest to evaluate.") +add_arg('mean_std_path', str, + 'mean_std.npz', + "Filepath of normalizer's mean & std.") +add_arg('vocab_path', str, + 'datasets/vocab/eng_vocab.txt', + "Filepath of vocabulary.") +# configurations of model io +add_arg('model_path', str, + './checkpoints/params.latest.tar.gz', + "If None, the training starts from scratch, " + "otherwise, it resumes from the pre-trained model.") args = parser.parse_args() +# yapf: disable def evaluate(): """Evaluate on whole test data for DeepSpeech2.""" data_generator = DataGenerator( - vocab_filepath=args.vocab_filepath, - mean_std_filepath=args.mean_std_filepath, + vocab_filepath=args.vocab_path, + mean_std_filepath=args.mean_std_path, augmentation_config='{}', specgram_type=args.specgram_type, - num_threads=args.num_threads_data) + num_threads=args.parallels_data) batch_reader = data_generator.batch_reader_creator( - manifest_path=args.decode_manifest_path, + manifest_path=args.test_manifest, batch_size=args.batch_size, min_batch_size=1, sortagrad=False, @@ -155,7 +98,7 @@ def evaluate(): num_rnn_layers=args.num_rnn_layers, rnn_layer_size=args.rnn_layer_size, use_gru=args.use_gru, - pretrained_model_path=args.model_filepath, + pretrained_model_path=args.model_path, share_rnn_weights=args.share_rnn_weights) error_rate_func = cer if args.error_rate_type == 'cer' else wer @@ -163,14 +106,14 @@ def evaluate(): for infer_data in batch_reader(): result_transcripts = ds2_model.infer_batch( infer_data=infer_data, - decode_method=args.decode_method, + decoder_method=args.decoder_method, beam_alpha=args.alpha, beam_beta=args.beta, beam_size=args.beam_size, cutoff_prob=args.cutoff_prob, vocab_list=data_generator.vocab_list, - language_model_path=args.language_model_path, - num_processes=args.num_processes_beam_search) + language_model_path=args.lang_model_path, + num_processes=args.parallels_bsearch) target_transcripts = [ ''.join([data_generator.vocab_list[token] for token in transcript]) for _, transcript in infer_data @@ -184,8 +127,15 @@ def evaluate(): (args.error_rate_type, num_ins, num_ins, error_sum / num_ins)) +def print_arguments(args): + print("----------- Configuration Arguments -----------") + for arg, value in sorted(vars(args).iteritems()): + print("%s: %s" % (arg, value)) + print("------------------------------------------------") + + def main(): - utils.print_arguments(args) + print_arguments(args) paddle.init(use_gpu=args.use_gpu, trainer_count=args.trainer_count) evaluate() diff --git a/infer.py b/infer.py index 0c52ffc8..cf02808c 100644 --- a/infer.py +++ b/infer.py @@ -10,140 +10,82 @@ import paddle.v2 as paddle from data_utils.data import DataGenerator from model import DeepSpeech2Model from error_rate import wer, cer -import utils +NUM_CPU = multiprocessing.cpu_count() // 2 parser = argparse.ArgumentParser(description=__doc__) -parser.add_argument( - "--num_samples", - default=10, - type=int, - help="Number of samples for inference. (default: %(default)s)") -parser.add_argument( - "--num_conv_layers", - default=2, - type=int, - help="Convolution layer number. (default: %(default)s)") -parser.add_argument( - "--num_rnn_layers", - default=3, - type=int, - help="RNN layer number. (default: %(default)s)") -parser.add_argument( - "--rnn_layer_size", - default=2048, - type=int, - help="RNN layer cell number. (default: %(default)s)") -parser.add_argument( - "--share_rnn_weights", - default=True, - type=distutils.util.strtobool, - help="Whether to share input-hidden weights between forword and backward " - "directional simple RNNs. Only available when use_gru=False. " - "(default: %(default)s)") -parser.add_argument( - "--use_gru", - default=False, - type=distutils.util.strtobool, - help="Use GRU or simple RNN. (default: %(default)s)") -parser.add_argument( - "--use_gpu", - default=True, - type=distutils.util.strtobool, - help="Use gpu or not. (default: %(default)s)") -parser.add_argument( - "--num_threads_data", - default=1, - type=int, - help="Number of cpu threads for preprocessing data. (default: %(default)s)") -parser.add_argument( - "--num_processes_beam_search", - default=multiprocessing.cpu_count() // 2, - type=int, - help="Number of cpu processes for beam search. (default: %(default)s)") -parser.add_argument( - "--specgram_type", - default='linear', - type=str, - help="Feature type of audio data: 'linear' (power spectrum)" - " or 'mfcc'. (default: %(default)s)") -parser.add_argument( - "--trainer_count", - default=8, - type=int, - help="Trainer number. (default: %(default)s)") -parser.add_argument( - "--mean_std_filepath", - default='mean_std.npz', - type=str, - help="Manifest path for normalizer. (default: %(default)s)") -parser.add_argument( - "--decode_manifest_path", - default='datasets/manifest.test', - type=str, - help="Manifest path for decoding. (default: %(default)s)") -parser.add_argument( - "--model_filepath", - default='checkpoints/params.latest.tar.gz', - type=str, - help="Model filepath. (default: %(default)s)") -parser.add_argument( - "--vocab_filepath", - default='datasets/vocab/eng_vocab.txt', - type=str, - help="Vocabulary filepath. (default: %(default)s)") -parser.add_argument( - "--decode_method", - default='beam_search', - type=str, - help="Method for ctc decoding: best_path or beam_search. " - "(default: %(default)s)") -parser.add_argument( - "--beam_size", - default=500, - type=int, - help="Width for beam search decoding. (default: %(default)d)") -parser.add_argument( - "--language_model_path", - default="lm/data/common_crawl_00.prune01111.trie.klm", - type=str, - help="Path for language model. (default: %(default)s)") -parser.add_argument( - "--alpha", - default=0.36, - type=float, - help="Parameter associated with language model. (default: %(default)f)") -parser.add_argument( - "--beta", - default=0.25, - type=float, - help="Parameter associated with word count. (default: %(default)f)") -parser.add_argument( - "--cutoff_prob", - default=0.99, - type=float, - help="The cutoff probability of pruning" - "in beam search. (default: %(default)f)") -parser.add_argument( - "--error_rate_type", - default='wer', - choices=['wer', 'cer'], - type=str, - help="Error rate type for evaluation. 'wer' for word error rate and 'cer' " - "for character error rate. " - "(default: %(default)s)") + + +def add_arg(argname, type, default, help, **kwargs): + type = distutils.util.strtobool if type == bool else type + parser.add_argument( + "--" + argname, + default=default, + type=type, + help=help + ' Default: %(default)s.', + **kwargs) + + +# yapf: disable +# configurations of overall +add_arg('num_samples', int, 10, "# of samples to infer.") +add_arg('trainer_count', int, 8, "# of Trainers (CPUs or GPUs).") +add_arg('use_gpu', bool, True, "Use GPU or not.") +add_arg('error_rate_type', str, 'wer', "Error rate type for evaluation.", + choices=['wer', 'cer']) +# configurations of decoder +add_arg('beam_size', int, 500, "Beam search width.") +add_arg('alpha', float, 0.36, "Coef of LM for beam search.") +add_arg('beta', float, 0.25, "Coef of WC for beam search.") +add_arg('cutoff_prob', float, 0.99, "Cutoff probability for pruning.") +add_arg('parallels_bsearch',int, NUM_CPU,"# of CPUs for beam search.") +add_arg('lang_model_path', str, + 'lm/data/common_crawl_00.prune01111.trie.klm', + "Filepath for language model.") +add_arg('decoder_method', str, + 'ctc_beam_search', + "Decoder method. Options: ctc_beam_search, ctc_greedy", + choices = ['ctc_beam_search', 'ctc_greedy']) +# configurations of data preprocess +add_arg('specgram_type', str, + 'linear', + "Audio feature type. Options: linear, mfcc.", + choices=['linear', 'mfcc']) +# configurations of model structure +add_arg('num_conv_layers', int, 2, "# of convolution layers.") +add_arg('num_rnn_layers', int, 3, "# of recurrent layers.") +add_arg('rnn_layer_size', int, 2048, "# of recurrent cells per layer.") +add_arg('use_gru', bool, False, "Use GRUs instead of Simple RNNs.") +add_arg('share_rnn_weights',bool, True, "Share input-hidden weights across " + "bi-directional RNNs. Not for GRU.") +# configurations of data io +add_arg('infer_manifest', str, + 'datasets/manifest.dev', + "Filepath of manifest to infer.") +add_arg('mean_std_path', str, + 'mean_std.npz', + "Filepath of normalizer's mean & std.") +add_arg('vocab_path', str, + 'datasets/vocab/eng_vocab.txt', + "Filepath of vocabulary.") +# configurations of model io +add_arg('model_path', str, + './checkpoints/params.latest.tar.gz', + "If None, the training starts from scratch, " + "otherwise, it resumes from the pre-trained model.") args = parser.parse_args() +# yapf: disable def infer(): """Inference for DeepSpeech2.""" data_generator = DataGenerator( - vocab_filepath=args.vocab_filepath, - mean_std_filepath=args.mean_std_filepath, + vocab_filepath=args.vocab_path, + mean_std_filepath=args.mean_std_path, augmentation_config='{}', specgram_type=args.specgram_type, - num_threads=args.num_threads_data) + num_threads=1) batch_reader = data_generator.batch_reader_creator( - manifest_path=args.decode_manifest_path, + manifest_path=args.infer_manifest, batch_size=args.num_samples, min_batch_size=1, sortagrad=False, @@ -156,18 +98,18 @@ def infer(): num_rnn_layers=args.num_rnn_layers, rnn_layer_size=args.rnn_layer_size, use_gru=args.use_gru, - pretrained_model_path=args.model_filepath, + pretrained_model_path=args.model_path, share_rnn_weights=args.share_rnn_weights) result_transcripts = ds2_model.infer_batch( infer_data=infer_data, - decode_method=args.decode_method, + decoder_method=args.decoder_method, beam_alpha=args.alpha, beam_beta=args.beta, beam_size=args.beam_size, cutoff_prob=args.cutoff_prob, vocab_list=data_generator.vocab_list, - language_model_path=args.language_model_path, - num_processes=args.num_processes_beam_search) + language_model_path=args.lang_model_path, + num_processes=args.parallels_bsearch) error_rate_func = cer if args.error_rate_type == 'cer' else wer target_transcripts = [ @@ -181,8 +123,15 @@ def infer(): (args.error_rate_type, error_rate_func(target, result))) +def print_arguments(args): + print("----------- Configuration Arguments -----------") + for arg, value in sorted(vars(args).iteritems()): + print("%s: %s" % (arg, value)) + print("------------------------------------------------") + + def main(): - utils.print_arguments(args) + print_arguments(args) paddle.init(use_gpu=args.use_gpu, trainer_count=args.trainer_count) infer() diff --git a/model.py b/model.py index 0234ed2d..894605bf 100644 --- a/model.py +++ b/model.py @@ -146,7 +146,7 @@ class DeepSpeech2Model(object): # run inference return self._loss_inferer.infer(input=infer_data) - def infer_batch(self, infer_data, decode_method, beam_alpha, beam_beta, + def infer_batch(self, infer_data, decoder_method, beam_alpha, beam_beta, beam_size, cutoff_prob, vocab_list, language_model_path, num_processes): """Model inference. Infer the transcription for a batch of speech @@ -156,9 +156,9 @@ class DeepSpeech2Model(object): consisting of a tuple of audio features and transcription text (empty string). :type infer_data: list - :param decode_method: Decoding method name, 'best_path' or - 'beam search'. - :param decode_method: string + :param decoder_method: Decoding method name, 'ctc_greedy' or + 'ctc_beam_search'. + :param decoder_method: string :param beam_alpha: Parameter associated with language model. :type beam_alpha: float :param beam_beta: Parameter associated with word count. @@ -190,13 +190,13 @@ class DeepSpeech2Model(object): ] # run decoder results = [] - if decode_method == "best_path": + if decoder_method == "ctc_greedy": # best path decode for i, probs in enumerate(probs_split): - output_transcription = ctc_best_path_decoder( + output_transcription = ctc_greedy_decoder( probs_seq=probs, vocabulary=vocab_list) results.append(output_transcription) - elif decode_method == "beam_search": + elif decoder_method == "ctc_beam_search": # initialize external scorer if self._ext_scorer == None: self._ext_scorer = LmScorer(beam_alpha, beam_beta, @@ -205,7 +205,6 @@ class DeepSpeech2Model(object): else: self._ext_scorer.reset_params(beam_alpha, beam_beta) assert self._loaded_lm_path == language_model_path - # beam search decode beam_search_results = ctc_beam_search_decoder_batch( probs_split=probs_split, @@ -218,8 +217,8 @@ class DeepSpeech2Model(object): results = [result[0][1] for result in beam_search_results] else: - raise ValueError("Decoding method [%s] is not supported." % - decode_method) + raise ValueError("Decoder method [%s] is not supported." % + decoder_method) return results def _create_parameters(self, model_path=None): diff --git a/train.py b/train.py index d055341f..d21e6a3b 100644 --- a/train.py +++ b/train.py @@ -9,169 +9,103 @@ import multiprocessing import paddle.v2 as paddle from model import DeepSpeech2Model from data_utils.data import DataGenerator -import utils +NUM_CPU = multiprocessing.cpu_count() // 2 parser = argparse.ArgumentParser(description=__doc__) -parser.add_argument( - "--batch_size", default=256, type=int, help="Minibatch size.") -parser.add_argument( - "--num_passes", - default=200, - type=int, - help="Training pass number. (default: %(default)s)") -parser.add_argument( - "--num_iterations_print", - default=100, - type=int, - help="Number of iterations for every train cost printing. " - "(default: %(default)s)") -parser.add_argument( - "--num_conv_layers", - default=2, - type=int, - help="Convolution layer number. (default: %(default)s)") -parser.add_argument( - "--num_rnn_layers", - default=3, - type=int, - help="RNN layer number. (default: %(default)s)") -parser.add_argument( - "--rnn_layer_size", - default=2048, - type=int, - help="RNN layer cell number. (default: %(default)s)") -parser.add_argument( - "--share_rnn_weights", - default=True, - type=distutils.util.strtobool, - help="Whether to share input-hidden weights between forword and backward " - "directional simple RNNs. Only available when use_gru=False. " - "(default: %(default)s)") -parser.add_argument( - "--use_gru", - default=False, - type=distutils.util.strtobool, - help="Use GRU or simple RNN. (default: %(default)s)") -parser.add_argument( - "--adam_learning_rate", - default=5e-4, - type=float, - help="Learning rate for ADAM Optimizer. (default: %(default)s)") -parser.add_argument( - "--use_gpu", - default=True, - type=distutils.util.strtobool, - help="Use gpu or not. (default: %(default)s)") -parser.add_argument( - "--use_sortagrad", - default=True, - type=distutils.util.strtobool, - help="Use sortagrad or not. (default: %(default)s)") -parser.add_argument( - "--specgram_type", - default='linear', - type=str, - help="Feature type of audio data: 'linear' (power spectrum)" - " or 'mfcc'. (default: %(default)s)") -parser.add_argument( - "--max_duration", - default=27.0, - type=float, - help="Audios with duration larger than this will be discarded. " - "(default: %(default)s)") -parser.add_argument( - "--min_duration", - default=0.0, - type=float, - help="Audios with duration smaller than this will be discarded. " - "(default: %(default)s)") -parser.add_argument( - "--shuffle_method", - default='batch_shuffle_clipped', - type=str, - help="Shuffle method: 'instance_shuffle', 'batch_shuffle', " - "'batch_shuffle_batch'. (default: %(default)s)") -parser.add_argument( - "--trainer_count", - default=8, - type=int, - help="Trainer number. (default: %(default)s)") -parser.add_argument( - "--num_threads_data", - default=multiprocessing.cpu_count() // 2, - type=int, - help="Number of cpu threads for preprocessing data. (default: %(default)s)") -parser.add_argument( - "--mean_std_filepath", - default='mean_std.npz', - type=str, - help="Manifest path for normalizer. (default: %(default)s)") -parser.add_argument( - "--train_manifest_path", - default='datasets/manifest.train', - type=str, - help="Manifest path for training. (default: %(default)s)") -parser.add_argument( - "--dev_manifest_path", - default='datasets/manifest.dev', - type=str, - help="Manifest path for validation. (default: %(default)s)") -parser.add_argument( - "--vocab_filepath", - default='datasets/vocab/eng_vocab.txt', - type=str, - help="Vocabulary filepath. (default: %(default)s)") -parser.add_argument( - "--init_model_path", - default=None, - type=str, - help="If set None, the training will start from scratch. " - "Otherwise, the training will resume from " - "the existing model of this path. (default: %(default)s)") -parser.add_argument( - "--output_model_dir", - default="./checkpoints", - type=str, - help="Directory for saving models. (default: %(default)s)") -parser.add_argument( - "--augmentation_config", - default=open('conf/augmentation.config', 'r').read(), - type=str, - help="Augmentation configuration in json-format. " - "(default: %(default)s)") -parser.add_argument( - "--is_local", - default=True, - type=distutils.util.strtobool, - help="Set to false if running with pserver in paddlecloud. " - "(default: %(default)s)") + + +def add_arg(argname, type, default, help, **kwargs): + type = distutils.util.strtobool if type == bool else type + parser.add_argument( + "--" + argname, + default=default, + type=type, + help=help + ' Default: %(default)s.', + **kwargs) + + +# yapf: disable +# configurations of optimization +add_arg('batch_size', int, 256, "Minibatch size.") +add_arg('learning_rate', float, 5e-4, "Learning rate.") +add_arg('use_sortagrad', bool, True, "Use SortaGrad or not.") +add_arg('trainer_count', int, 8, "# of Trainers (CPUs or GPUs).") +add_arg('use_gpu', bool, True, "Use GPU or not.") +add_arg('num_passes', int, 200, "# of training epochs.") +add_arg('is_local', bool, True, "Use pserver or not.") +add_arg('num_iter_print', int, 100, "Every # iterations for printing " + "train cost.") +# configurations of data preprocess +add_arg('max_duration', float, 27.0, "Longest audio duration allowed.") +add_arg('min_duration', float, 0.0, "Shortest audio duration allowed.") +add_arg('parallels_data', int, NUM_CPU,"# of CPUs for data preprocessing.") +add_arg('specgram_type', str, + 'linear', + "Audio feature type. Options: linear, mfcc.", + choices=['linear', 'mfcc']) +add_arg('augment_conf_path',str, + 'conf/augmentation.config', + "Filepath of augmentation configuration file (json-format).") +add_arg('shuffle_method', str, + 'batch_shuffle_clipped', + "Shuffle method.", + choices=['instance_shuffle', 'batch_shuffle', 'batch_shuffle_clipped']) +# configurations of model structure +add_arg('num_conv_layers', int, 2, "# of convolution layers.") +add_arg('num_rnn_layers', int, 3, "# of recurrent layers.") +add_arg('rnn_layer_size', int, 2048, "# of recurrent cells per layer.") +add_arg('use_gru', bool, False, "Use GRUs instead of Simple RNNs.") +add_arg('share_rnn_weights',bool, True, "Share input-hidden weights across " + "bi-directional RNNs. Not for GRU.") +# configurations of data io +add_arg('train_manifest', str, + 'datasets/manifest.train', + "Filepath of train manifest.") +add_arg('dev_manifest', str, + 'datasets/manifest.dev', + "Filepath of validation manifest.") +add_arg('mean_std_path', str, + 'mean_std.npz', + "Filepath of normalizer's mean & std.") +add_arg('vocab_path', str, + 'datasets/vocab/eng_vocab.txt', + "Filepath of vocabulary.") +# configurations of model io +add_arg('init_model_path', str, + None, + "If None, the training starts from scratch, " + "otherwise, it resumes from the pre-trained model.") +add_arg('output_model_dir', str, + "./checkpoints", + "Directory for saving checkpoints.") args = parser.parse_args() +# yapf: disable def train(): """DeepSpeech2 training.""" train_generator = DataGenerator( - vocab_filepath=args.vocab_filepath, - mean_std_filepath=args.mean_std_filepath, - augmentation_config=args.augmentation_config, + vocab_filepath=args.vocab_path, + mean_std_filepath=args.mean_std_path, + augmentation_config=open(args.augment_conf_path, 'r').read(), max_duration=args.max_duration, min_duration=args.min_duration, specgram_type=args.specgram_type, - num_threads=args.num_threads_data) + num_threads=args.parallels_data) dev_generator = DataGenerator( - vocab_filepath=args.vocab_filepath, - mean_std_filepath=args.mean_std_filepath, + vocab_filepath=args.vocab_path, + mean_std_filepath=args.mean_std_path, augmentation_config="{}", specgram_type=args.specgram_type, - num_threads=args.num_threads_data) + num_threads=args.parallels_data) train_batch_reader = train_generator.batch_reader_creator( - manifest_path=args.train_manifest_path, + manifest_path=args.train_manifest, batch_size=args.batch_size, min_batch_size=args.trainer_count, sortagrad=args.use_sortagrad if args.init_model_path is None else False, shuffle_method=args.shuffle_method) dev_batch_reader = dev_generator.batch_reader_creator( - manifest_path=args.dev_manifest_path, + manifest_path=args.dev_manifest, batch_size=args.batch_size, min_batch_size=1, # must be 1, but will have errors. sortagrad=False, @@ -184,21 +118,28 @@ def train(): rnn_layer_size=args.rnn_layer_size, use_gru=args.use_gru, pretrained_model_path=args.init_model_path, - share_rnn_weights=args.share_rnn_weights) + share_rnn_weights=args.share_weights) ds2_model.train( train_batch_reader=train_batch_reader, dev_batch_reader=dev_batch_reader, feeding_dict=train_generator.feeding, - learning_rate=args.adam_learning_rate, + learning_rate=args.learning_rate, gradient_clipping=400, num_passes=args.num_passes, - num_iterations_print=args.num_iterations_print, + num_iterations_print=args.num_iter_print, output_model_dir=args.output_model_dir, is_local=args.is_local) +def print_arguments(args): + print("----------- Configuration Arguments -----------") + for arg, value in sorted(vars(args).iteritems()): + print("%s: %s" % (arg, value)) + print("------------------------------------------------") + + def main(): - utils.print_arguments(args) + print_arguments(args) paddle.init(use_gpu=args.use_gpu, trainer_count=args.trainer_count) train() diff --git a/tune.py b/tune.py index d8001339..eac7ccd3 100644 --- a/tune.py +++ b/tune.py @@ -1,4 +1,4 @@ -"""Parameters tuning for DeepSpeech2 model.""" +"""Beam search parameters tuning for DeepSpeech2 model.""" from __future__ import absolute_import from __future__ import division from __future__ import print_function @@ -11,134 +11,71 @@ import paddle.v2 as paddle from data_utils.data import DataGenerator from model import DeepSpeech2Model from error_rate import wer -import utils +NUM_CPU = multiprocessing.cpu_count() // 2 parser = argparse.ArgumentParser(description=__doc__) -parser.add_argument( - "--num_samples", - default=100, - type=int, - help="Number of samples for parameters tuning. (default: %(default)s)") -parser.add_argument( - "--num_conv_layers", - default=2, - type=int, - help="Convolution layer number. (default: %(default)s)") -parser.add_argument( - "--num_rnn_layers", - default=3, - type=int, - help="RNN layer number. (default: %(default)s)") -parser.add_argument( - "--rnn_layer_size", - default=2048, - type=int, - help="RNN layer cell number. (default: %(default)s)") -parser.add_argument( - "--share_rnn_weights", - default=True, - type=distutils.util.strtobool, - help="Whether to share input-hidden weights between forword and backward " - "directional simple RNNs. Only available when use_gru=False. " - "(default: %(default)s)") -parser.add_argument( - "--use_gru", - default=False, - type=distutils.util.strtobool, - help="Use GRU or simple RNN. (default: %(default)s)") -parser.add_argument( - "--use_gpu", - default=True, - type=distutils.util.strtobool, - help="Use gpu or not. (default: %(default)s)") -parser.add_argument( - "--trainer_count", - default=8, - type=int, - help="Trainer number. (default: %(default)s)") -parser.add_argument( - "--num_threads_data", - default=1, - type=int, - help="Number of cpu threads for preprocessing data. (default: %(default)s)") -parser.add_argument( - "--num_processes_beam_search", - default=multiprocessing.cpu_count() // 2, - type=int, - help="Number of cpu processes for beam search. (default: %(default)s)") -parser.add_argument( - "--specgram_type", - default='linear', - type=str, - help="Feature type of audio data: 'linear' (power spectrum)" - " or 'mfcc'. (default: %(default)s)") -parser.add_argument( - "--mean_std_filepath", - default='mean_std.npz', - type=str, - help="Manifest path for normalizer. (default: %(default)s)") -parser.add_argument( - "--tune_manifest_path", - default='datasets/manifest.dev', - type=str, - help="Manifest path for tuning. (default: %(default)s)") -parser.add_argument( - "--model_filepath", - default='checkpoints/params.latest.tar.gz', - type=str, - help="Model filepath. (default: %(default)s)") -parser.add_argument( - "--vocab_filepath", - default='datasets/vocab/eng_vocab.txt', - type=str, - help="Vocabulary filepath. (default: %(default)s)") -parser.add_argument( - "--beam_size", - default=500, - type=int, - help="Width for beam search decoding. (default: %(default)d)") -parser.add_argument( - "--language_model_path", - default="lm/data/common_crawl_00.prune01111.trie.klm", - type=str, - help="Path for language model. (default: %(default)s)") -parser.add_argument( - "--alpha_from", - default=0.1, - type=float, - help="Where alpha starts from. (default: %(default)f)") -parser.add_argument( - "--num_alphas", - default=14, - type=int, - help="Number of candidate alphas. (default: %(default)d)") -parser.add_argument( - "--alpha_to", - default=0.36, - type=float, - help="Where alpha ends with. (default: %(default)f)") -parser.add_argument( - "--beta_from", - default=0.05, - type=float, - help="Where beta starts from. (default: %(default)f)") -parser.add_argument( - "--num_betas", - default=20, - type=float, - help="Number of candidate betas. (default: %(default)d)") -parser.add_argument( - "--beta_to", - default=1.0, - type=float, - help="Where beta ends with. (default: %(default)f)") -parser.add_argument( - "--cutoff_prob", - default=0.99, - type=float, - help="The cutoff probability of pruning" - "in beam search. (default: %(default)f)") + + +def add_arg(argname, type, default, help, **kwargs): + type = distutils.util.strtobool if type == bool else type + parser.add_argument( + "--" + argname, + default=default, + type=type, + help=help + ' Default: %(default)s.', + **kwargs) + + +# yapf: disable +# configurations of overall +add_arg('num_samples', int, 100, "# of samples to infer.") +add_arg('trainer_count', int, 8, "# of Trainers (CPUs or GPUs).") +add_arg('use_gpu', bool, True, "Use GPU or not.") +add_arg('error_rate_type', str, 'wer', "Error rate type for evaluation.", + choices=['wer', 'cer']) +# configurations of tuning parameters +add_arg('alpha_from', float, 0.1, "Where alpha starts tuning from.") +add_arg('alpha_to', float, 0.36, "Where alpha ends tuning with.") +add_arg('num_alphas', int, 14, "# of alpha candidates for tuning.") +add_arg('beta_from', float, 0.05, "Where beta starts tuning from.") +add_arg('beta_to', float, 0.36, "Where beta ends tuning with.") +add_arg('num_betas', int, 20, "# of beta candidates for tuning.") +# configurations of decoder +add_arg('beam_size', int, 500, "Beam search width.") +add_arg('cutoff_prob', float, 0.99, "Cutoff probability for pruning.") +add_arg('parallels_bsearch',int, NUM_CPU,"# of CPUs for beam search.") +add_arg('lang_model_path', str, + 'lm/data/common_crawl_00.prune01111.trie.klm', + "Filepath for language model.") +# configurations of data preprocess +add_arg('specgram_type', str, + 'linear', + "Audio feature type. Options: linear, mfcc.", + choices=['linear', 'mfcc']) +# configurations of model structure +add_arg('num_conv_layers', int, 2, "# of convolution layers.") +add_arg('num_rnn_layers', int, 3, "# of recurrent layers.") +add_arg('rnn_layer_size', int, 2048, "# of recurrent cells per layer.") +add_arg('use_gru', bool, False, "Use GRUs instead of Simple RNNs.") +add_arg('share_rnn_weights',bool, True, "Share input-hidden weights across " + "bi-directional RNNs. Not for GRU.") +# configurations of data io +add_arg('tune_manifest', str, + 'datasets/manifest.test', + "Filepath of manifest to tune.") +add_arg('mean_std_path', str, + 'mean_std.npz', + "Filepath of normalizer's mean & std.") +add_arg('vocab_path', str, + 'datasets/vocab/eng_vocab.txt', + "Filepath of vocabulary.") +# configurations of model io +add_arg('model_path', str, + './checkpoints/params.latest.tar.gz', + "If None, the training starts from scratch, " + "otherwise, it resumes from the pre-trained model.") args = parser.parse_args() +# yapf: disable def tune(): @@ -149,13 +86,13 @@ def tune(): raise ValueError("num_betas must be non-negative!") data_generator = DataGenerator( - vocab_filepath=args.vocab_filepath, - mean_std_filepath=args.mean_std_filepath, + vocab_filepath=args.vocab_path, + mean_std_filepath=args.mean_std_path, augmentation_config='{}', specgram_type=args.specgram_type, - num_threads=args.num_threads_data) + num_threads=1) batch_reader = data_generator.batch_reader_creator( - manifest_path=args.tune_manifest_path, + manifest_path=args.tune_manifest, batch_size=args.num_samples, sortagrad=False, shuffle_method=None) @@ -171,7 +108,7 @@ def tune(): num_rnn_layers=args.num_rnn_layers, rnn_layer_size=args.rnn_layer_size, use_gru=args.use_gru, - pretrained_model_path=args.model_filepath, + pretrained_model_path=args.model_path, share_rnn_weights=args.share_rnn_weights) # create grid for search @@ -184,14 +121,14 @@ def tune(): for alpha, beta in params_grid: result_transcripts = ds2_model.infer_batch( infer_data=tune_data, - decode_method='beam_search', + decoder_method='ctc_beam_search', beam_alpha=alpha, beam_beta=beta, beam_size=args.beam_size, cutoff_prob=args.cutoff_prob, vocab_list=data_generator.vocab_list, - language_model_path=args.language_model_path, - num_processes=args.num_processes_beam_search) + language_model_path=args.lang_model_path, + num_processes=args.parallels_bsearch) wer_sum, num_ins = 0.0, 0 for target, result in zip(target_transcripts, result_transcripts): wer_sum += wer(target, result) @@ -200,8 +137,15 @@ def tune(): (alpha, beta, wer_sum / num_ins)) +def print_arguments(args): + print("----------- Configuration Arguments -----------") + for arg, value in sorted(vars(args).iteritems()): + print("%s: %s" % (arg, value)) + print("------------------------------------------------") + + def main(): - utils.print_arguments(args) + print_arguments(args) paddle.init(use_gpu=args.use_gpu, trainer_count=args.trainer_count) tune() diff --git a/utils.py b/utils.py deleted file mode 100644 index 1d51e204..00000000 --- a/utils.py +++ /dev/null @@ -1,25 +0,0 @@ -"""Contains common utility functions.""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - - -def print_arguments(args): - """Print argparse's arguments. - - Usage: - - .. code-block:: python - - parser = argparse.ArgumentParser() - parser.add_argument("name", default="Jonh", type=str, help="User name.") - args = parser.parse_args() - print_arguments(args) - - :param args: Input argparse.Namespace for printing. - :type args: argparse.Namespace - """ - print("----- Configuration Arguments -----") - for arg, value in vars(args).iteritems(): - print("%s: %s" % (arg, value)) - print("------------------------------------") From dfd7652308972a2de02cdcdfb5d71e8ebf98c5df Mon Sep 17 00:00:00 2001 From: Xinghai Sun Date: Tue, 5 Sep 2017 00:38:30 +0800 Subject: [PATCH 10/18] Rename ctc_best_path_decoder to ctc_greedy_decoder in unitest. --- tests/test_decoders.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/test_decoders.py b/tests/test_decoders.py index 99d8a828..fa43879b 100644 --- a/tests/test_decoders.py +++ b/tests/test_decoders.py @@ -49,16 +49,16 @@ class TestDecoders(unittest.TestCase): 0.15882358, 0.1235788, 0.23376776, 0.20510435, 0.00279306, 0.05294827, 0.22298418 ]] - self.best_path_result = ["ac'bdc", "b'da"] + self.greedy_result = ["ac'bdc", "b'da"] self.beam_search_result = ['acdc', "b'a"] - def test_best_path_decoder_1(self): - bst_result = ctc_best_path_decoder(self.probs_seq1, self.vocab_list) - self.assertEqual(bst_result, self.best_path_result[0]) + def test_greedy_decoder_1(self): + bst_result = ctc_greedy_decoder(self.probs_seq1, self.vocab_list) + self.assertEqual(bst_result, self.greedy_result[0]) - def test_best_path_decoder_2(self): - bst_result = ctc_best_path_decoder(self.probs_seq2, self.vocab_list) - self.assertEqual(bst_result, self.best_path_result[1]) + def test_greedy_decoder_2(self): + bst_result = ctc_greedy_decoder(self.probs_seq2, self.vocab_list) + self.assertEqual(bst_result, self.greedy_result[1]) def test_beam_search_decoder_1(self): beam_result = ctc_beam_search_decoder( From 792129166ab9c1a5380d6a20eebd33ac7b7b9766 Mon Sep 17 00:00:00 2001 From: Xinghai Sun Date: Tue, 5 Sep 2017 12:23:41 +0800 Subject: [PATCH 11/18] Sort the config lines to make it look better. --- demo_server.py | 50 ++++++++++++++++++++++---------------------------- evaluate.py | 42 +++++++++++++++++++----------------------- infer.py | 41 +++++++++++++++++++---------------------- train.py | 43 +++++++++++++++++++------------------------ tune.py | 45 ++++++++++++++++++++------------------------- 5 files changed, 99 insertions(+), 122 deletions(-) diff --git a/demo_server.py b/demo_server.py index d2afa49b..5eed3d2e 100644 --- a/demo_server.py +++ b/demo_server.py @@ -27,41 +27,25 @@ def add_arg(argname, type, default, help, **kwargs): # yapf: disable -# configurations of overall add_arg('host_port', int, 8086, "Server's IP port.") -add_arg('host_ip', str, - 'localhost', - "Server's IP address.") -add_arg('speech_save_dir', str, - 'demo_cache', - "Directory to save demo audios.") -add_arg('use_gpu', bool, True, "Use GPU or not.") -# configurations of decoder add_arg('beam_size', int, 500, "Beam search width.") -add_arg('alpha', float, 0.36, "Coef of LM for beam search.") -add_arg('beta', float, 0.25, "Coef of WC for beam search.") -add_arg('cutoff_prob', float, 0.99, "Cutoff probability for pruning.") -add_arg('lang_model_path', str, - 'lm/data/common_crawl_00.prune01111.trie.klm', - "Filepath for language model.") -add_arg('decoder_method', str, - 'ctc_beam_search', - "Decoder method. Options: ctc_beam_search, ctc_greedy", - choices = ['ctc_beam_search', 'ctc_greedy']) -# configurations of data preprocess -add_arg('specgram_type', str, - 'linear', - "Audio feature type. Options: linear, mfcc.", - choices=['linear', 'mfcc']) -# configurations of model structure add_arg('num_conv_layers', int, 2, "# of convolution layers.") add_arg('num_rnn_layers', int, 3, "# of recurrent layers.") add_arg('rnn_layer_size', int, 2048, "# of recurrent cells per layer.") +add_arg('alpha', float, 0.36, "Coef of LM for beam search.") +add_arg('beta', float, 0.25, "Coef of WC for beam search.") +add_arg('cutoff_prob', float, 0.99, "Cutoff probability for pruning.") add_arg('use_gru', bool, False, "Use GRUs instead of Simple RNNs.") +add_arg('use_gpu', bool, True, "Use GPU or not.") add_arg('share_rnn_weights',bool, True, "Share input-hidden weights across " "bi-directional RNNs. Not for GRU.") -# configurations of data io -add_arg('warmup_manifest', str, +add_arg('host_ip', str, + 'localhost', + "Server's IP address.") +add_arg('speech_save_dir', str, + 'demo_cache', + "Directory to save demo audios.") +add_arg('warmup_manifest', str, 'datasets/manifest.test', "Filepath of manifest to warm up.") add_arg('mean_std_path', str, @@ -70,11 +54,21 @@ add_arg('mean_std_path', str, add_arg('vocab_path', str, 'datasets/vocab/eng_vocab.txt', "Filepath of vocabulary.") -# configurations of model io add_arg('model_path', str, './checkpoints/params.latest.tar.gz', "If None, the training starts from scratch, " "otherwise, it resumes from the pre-trained model.") +add_arg('lang_model_path', str, + 'lm/data/common_crawl_00.prune01111.trie.klm', + "Filepath for language model.") +add_arg('decoder_method', str, + 'ctc_beam_search', + "Decoder method. Options: ctc_beam_search, ctc_greedy", + choices = ['ctc_beam_search', 'ctc_greedy']) +add_arg('specgram_type', str, + 'linear', + "Audio feature type. Options: linear, mfcc.", + choices=['linear', 'mfcc']) args = parser.parse_args() # yapf: disable diff --git a/evaluate.py b/evaluate.py index 1adf4255..2c412778 100644 --- a/evaluate.py +++ b/evaluate.py @@ -26,39 +26,21 @@ def add_arg(argname, type, default, help, **kwargs): # yapf: disable -# configurations of overall add_arg('batch_size', int, 128, "Minibatch size.") add_arg('trainer_count', int, 8, "# of Trainers (CPUs or GPUs).") -add_arg('use_gpu', bool, True, "Use GPU or not.") -add_arg('error_rate_type', str, 'wer', "Error rate type for evaluation.", - choices=['wer', 'cer']) -# configurations of decoder add_arg('beam_size', int, 500, "Beam search width.") -add_arg('alpha', float, 0.36, "Coef of LM for beam search.") -add_arg('beta', float, 0.25, "Coef of WC for beam search.") -add_arg('cutoff_prob', float, 0.99, "Cutoff probability for pruning.") add_arg('parallels_bsearch',int, NUM_CPU,"# of CPUs for beam search.") -add_arg('lang_model_path', str, - 'lm/data/common_crawl_00.prune01111.trie.klm', - "Filepath for language model.") -add_arg('decoder_method', str, - 'ctc_beam_search', - "Decoder method. Options: ctc_beam_search, ctc_greedy", - choices = ['ctc_beam_search', 'ctc_greedy']) -# configurations of data preprocess add_arg('parallels_data', int, NUM_CPU,"# of CPUs for data preprocessing.") -add_arg('specgram_type', str, - 'linear', - "Audio feature type. Options: linear, mfcc.", - choices=['linear', 'mfcc']) -# configurations of model structure add_arg('num_conv_layers', int, 2, "# of convolution layers.") add_arg('num_rnn_layers', int, 3, "# of recurrent layers.") add_arg('rnn_layer_size', int, 2048, "# of recurrent cells per layer.") +add_arg('alpha', float, 0.36, "Coef of LM for beam search.") +add_arg('beta', float, 0.25, "Coef of WC for beam search.") +add_arg('cutoff_prob', float, 0.99, "Cutoff probability for pruning.") add_arg('use_gru', bool, False, "Use GRUs instead of Simple RNNs.") +add_arg('use_gpu', bool, True, "Use GPU or not.") add_arg('share_rnn_weights',bool, True, "Share input-hidden weights across " "bi-directional RNNs. Not for GRU.") -# configurations of data io add_arg('test_manifest', str, 'datasets/manifest.test', "Filepath of manifest to evaluate.") @@ -68,11 +50,25 @@ add_arg('mean_std_path', str, add_arg('vocab_path', str, 'datasets/vocab/eng_vocab.txt', "Filepath of vocabulary.") -# configurations of model io add_arg('model_path', str, './checkpoints/params.latest.tar.gz', "If None, the training starts from scratch, " "otherwise, it resumes from the pre-trained model.") +add_arg('lang_model_path', str, + 'lm/data/common_crawl_00.prune01111.trie.klm', + "Filepath for language model.") +add_arg('decoder_method', str, + 'ctc_beam_search', + "Decoder method. Options: ctc_beam_search, ctc_greedy", + choices = ['ctc_beam_search', 'ctc_greedy']) +add_arg('error_rate_type', str, + 'wer', + "Error rate type for evaluation.", + choices=['wer', 'cer']) +add_arg('specgram_type', str, + 'linear', + "Audio feature type. Options: linear, mfcc.", + choices=['linear', 'mfcc']) args = parser.parse_args() # yapf: disable diff --git a/infer.py b/infer.py index cf02808c..313f80c0 100644 --- a/infer.py +++ b/infer.py @@ -29,35 +29,18 @@ def add_arg(argname, type, default, help, **kwargs): # configurations of overall add_arg('num_samples', int, 10, "# of samples to infer.") add_arg('trainer_count', int, 8, "# of Trainers (CPUs or GPUs).") -add_arg('use_gpu', bool, True, "Use GPU or not.") -add_arg('error_rate_type', str, 'wer', "Error rate type for evaluation.", - choices=['wer', 'cer']) -# configurations of decoder add_arg('beam_size', int, 500, "Beam search width.") -add_arg('alpha', float, 0.36, "Coef of LM for beam search.") -add_arg('beta', float, 0.25, "Coef of WC for beam search.") -add_arg('cutoff_prob', float, 0.99, "Cutoff probability for pruning.") add_arg('parallels_bsearch',int, NUM_CPU,"# of CPUs for beam search.") -add_arg('lang_model_path', str, - 'lm/data/common_crawl_00.prune01111.trie.klm', - "Filepath for language model.") -add_arg('decoder_method', str, - 'ctc_beam_search', - "Decoder method. Options: ctc_beam_search, ctc_greedy", - choices = ['ctc_beam_search', 'ctc_greedy']) -# configurations of data preprocess -add_arg('specgram_type', str, - 'linear', - "Audio feature type. Options: linear, mfcc.", - choices=['linear', 'mfcc']) -# configurations of model structure add_arg('num_conv_layers', int, 2, "# of convolution layers.") add_arg('num_rnn_layers', int, 3, "# of recurrent layers.") add_arg('rnn_layer_size', int, 2048, "# of recurrent cells per layer.") +add_arg('alpha', float, 0.36, "Coef of LM for beam search.") +add_arg('beta', float, 0.25, "Coef of WC for beam search.") +add_arg('cutoff_prob', float, 0.99, "Cutoff probability for pruning.") add_arg('use_gru', bool, False, "Use GRUs instead of Simple RNNs.") +add_arg('use_gpu', bool, True, "Use GPU or not.") add_arg('share_rnn_weights',bool, True, "Share input-hidden weights across " "bi-directional RNNs. Not for GRU.") -# configurations of data io add_arg('infer_manifest', str, 'datasets/manifest.dev', "Filepath of manifest to infer.") @@ -67,11 +50,25 @@ add_arg('mean_std_path', str, add_arg('vocab_path', str, 'datasets/vocab/eng_vocab.txt', "Filepath of vocabulary.") -# configurations of model io +add_arg('lang_model_path', str, + 'lm/data/common_crawl_00.prune01111.trie.klm', + "Filepath for language model.") add_arg('model_path', str, './checkpoints/params.latest.tar.gz', "If None, the training starts from scratch, " "otherwise, it resumes from the pre-trained model.") +add_arg('decoder_method', str, + 'ctc_beam_search', + "Decoder method. Options: ctc_beam_search, ctc_greedy", + choices = ['ctc_beam_search', 'ctc_greedy']) +add_arg('error_rate_type', str, + 'wer', + "Error rate type for evaluation.", + choices=['wer', 'cer']) +add_arg('specgram_type', str, + 'linear', + "Audio feature type. Options: linear, mfcc.", + choices=['linear', 'mfcc']) args = parser.parse_args() # yapf: disable diff --git a/train.py b/train.py index d21e6a3b..3d658d27 100644 --- a/train.py +++ b/train.py @@ -25,39 +25,24 @@ def add_arg(argname, type, default, help, **kwargs): # yapf: disable -# configurations of optimization add_arg('batch_size', int, 256, "Minibatch size.") -add_arg('learning_rate', float, 5e-4, "Learning rate.") -add_arg('use_sortagrad', bool, True, "Use SortaGrad or not.") add_arg('trainer_count', int, 8, "# of Trainers (CPUs or GPUs).") -add_arg('use_gpu', bool, True, "Use GPU or not.") add_arg('num_passes', int, 200, "# of training epochs.") -add_arg('is_local', bool, True, "Use pserver or not.") -add_arg('num_iter_print', int, 100, "Every # iterations for printing " - "train cost.") -# configurations of data preprocess -add_arg('max_duration', float, 27.0, "Longest audio duration allowed.") -add_arg('min_duration', float, 0.0, "Shortest audio duration allowed.") add_arg('parallels_data', int, NUM_CPU,"# of CPUs for data preprocessing.") -add_arg('specgram_type', str, - 'linear', - "Audio feature type. Options: linear, mfcc.", - choices=['linear', 'mfcc']) -add_arg('augment_conf_path',str, - 'conf/augmentation.config', - "Filepath of augmentation configuration file (json-format).") -add_arg('shuffle_method', str, - 'batch_shuffle_clipped', - "Shuffle method.", - choices=['instance_shuffle', 'batch_shuffle', 'batch_shuffle_clipped']) -# configurations of model structure add_arg('num_conv_layers', int, 2, "# of convolution layers.") add_arg('num_rnn_layers', int, 3, "# of recurrent layers.") add_arg('rnn_layer_size', int, 2048, "# of recurrent cells per layer.") +add_arg('num_iter_print', int, 100, "Every # iterations for printing " + "train cost.") +add_arg('learning_rate', float, 5e-4, "Learning rate.") +add_arg('max_duration', float, 27.0, "Longest audio duration allowed.") +add_arg('min_duration', float, 0.0, "Shortest audio duration allowed.") +add_arg('use_sortagrad', bool, True, "Use SortaGrad or not.") +add_arg('use_gpu', bool, True, "Use GPU or not.") +add_arg('is_local', bool, True, "Use pserver or not.") add_arg('use_gru', bool, False, "Use GRUs instead of Simple RNNs.") add_arg('share_rnn_weights',bool, True, "Share input-hidden weights across " "bi-directional RNNs. Not for GRU.") -# configurations of data io add_arg('train_manifest', str, 'datasets/manifest.train', "Filepath of train manifest.") @@ -70,7 +55,6 @@ add_arg('mean_std_path', str, add_arg('vocab_path', str, 'datasets/vocab/eng_vocab.txt', "Filepath of vocabulary.") -# configurations of model io add_arg('init_model_path', str, None, "If None, the training starts from scratch, " @@ -78,6 +62,17 @@ add_arg('init_model_path', str, add_arg('output_model_dir', str, "./checkpoints", "Directory for saving checkpoints.") +add_arg('augment_conf_path',str, + 'conf/augmentation.config', + "Filepath of augmentation configuration file (json-format).") +add_arg('specgram_type', str, + 'linear', + "Audio feature type. Options: linear, mfcc.", + choices=['linear', 'mfcc']) +add_arg('shuffle_method', str, + 'batch_shuffle_clipped', + "Shuffle method.", + choices=['instance_shuffle', 'batch_shuffle', 'batch_shuffle_clipped']) args = parser.parse_args() # yapf: disable diff --git a/tune.py b/tune.py index eac7ccd3..2fbe0b98 100644 --- a/tune.py +++ b/tune.py @@ -27,40 +27,25 @@ def add_arg(argname, type, default, help, **kwargs): # yapf: disable -# configurations of overall add_arg('num_samples', int, 100, "# of samples to infer.") add_arg('trainer_count', int, 8, "# of Trainers (CPUs or GPUs).") -add_arg('use_gpu', bool, True, "Use GPU or not.") -add_arg('error_rate_type', str, 'wer', "Error rate type for evaluation.", - choices=['wer', 'cer']) -# configurations of tuning parameters -add_arg('alpha_from', float, 0.1, "Where alpha starts tuning from.") -add_arg('alpha_to', float, 0.36, "Where alpha ends tuning with.") -add_arg('num_alphas', int, 14, "# of alpha candidates for tuning.") -add_arg('beta_from', float, 0.05, "Where beta starts tuning from.") -add_arg('beta_to', float, 0.36, "Where beta ends tuning with.") -add_arg('num_betas', int, 20, "# of beta candidates for tuning.") -# configurations of decoder add_arg('beam_size', int, 500, "Beam search width.") -add_arg('cutoff_prob', float, 0.99, "Cutoff probability for pruning.") add_arg('parallels_bsearch',int, NUM_CPU,"# of CPUs for beam search.") -add_arg('lang_model_path', str, - 'lm/data/common_crawl_00.prune01111.trie.klm', - "Filepath for language model.") -# configurations of data preprocess -add_arg('specgram_type', str, - 'linear', - "Audio feature type. Options: linear, mfcc.", - choices=['linear', 'mfcc']) -# configurations of model structure add_arg('num_conv_layers', int, 2, "# of convolution layers.") add_arg('num_rnn_layers', int, 3, "# of recurrent layers.") add_arg('rnn_layer_size', int, 2048, "# of recurrent cells per layer.") +add_arg('num_alphas', int, 14, "# of alpha candidates for tuning.") +add_arg('num_betas', int, 20, "# of beta candidates for tuning.") +add_arg('alpha_from', float, 0.1, "Where alpha starts tuning from.") +add_arg('alpha_to', float, 0.36, "Where alpha ends tuning with.") +add_arg('beta_from', float, 0.05, "Where beta starts tuning from.") +add_arg('beta_to', float, 0.36, "Where beta ends tuning with.") +add_arg('cutoff_prob', float, 0.99, "Cutoff probability for pruning.") add_arg('use_gru', bool, False, "Use GRUs instead of Simple RNNs.") +add_arg('use_gpu', bool, True, "Use GPU or not.") add_arg('share_rnn_weights',bool, True, "Share input-hidden weights across " "bi-directional RNNs. Not for GRU.") -# configurations of data io -add_arg('tune_manifest', str, +add_arg('tune_manifest', str, 'datasets/manifest.test', "Filepath of manifest to tune.") add_arg('mean_std_path', str, @@ -69,11 +54,21 @@ add_arg('mean_std_path', str, add_arg('vocab_path', str, 'datasets/vocab/eng_vocab.txt', "Filepath of vocabulary.") -# configurations of model io +add_arg('lang_model_path', str, + 'lm/data/common_crawl_00.prune01111.trie.klm', + "Filepath for language model.") add_arg('model_path', str, './checkpoints/params.latest.tar.gz', "If None, the training starts from scratch, " "otherwise, it resumes from the pre-trained model.") +add_arg('error_rate_type', str, + 'wer', + "Error rate type for evaluation.", + choices=['wer', 'cer']) +add_arg('specgram_type', str, + 'linear', + "Audio feature type. Options: linear, mfcc.", + choices=['linear', 'mfcc']) args = parser.parse_args() # yapf: disable From 8b64ef29c8810387bf6adadf2e9a0087bf3d4812 Mon Sep 17 00:00:00 2001 From: Xinghai Sun Date: Tue, 5 Sep 2017 12:48:11 +0800 Subject: [PATCH 12/18] Re-style the config codes for tools in DS2. --- demo_server.py | 3 +- evaluate.py | 9 ++--- infer.py | 8 ++--- tools/build_vocab.py | 50 ++++++++++++++++++---------- tools/compute_mean_std.py | 70 ++++++++++++++++++++------------------- train.py | 7 ++-- tune.py | 8 ++--- 7 files changed, 79 insertions(+), 76 deletions(-) diff --git a/demo_server.py b/demo_server.py index 5eed3d2e..81b56f94 100644 --- a/demo_server.py +++ b/demo_server.py @@ -13,8 +13,6 @@ from data_utils.data import DataGenerator from model import DeepSpeech2Model from data_utils.utils import read_manifest -parser = argparse.ArgumentParser(description=__doc__) - def add_arg(argname, type, default, help, **kwargs): type = distutils.util.strtobool if type == bool else type @@ -27,6 +25,7 @@ def add_arg(argname, type, default, help, **kwargs): # yapf: disable +parser = argparse.ArgumentParser(description=__doc__) add_arg('host_port', int, 8086, "Server's IP port.") add_arg('beam_size', int, 500, "Beam search width.") add_arg('num_conv_layers', int, 2, "# of convolution layers.") diff --git a/evaluate.py b/evaluate.py index 2c412778..38204c57 100644 --- a/evaluate.py +++ b/evaluate.py @@ -5,15 +5,11 @@ from __future__ import print_function import distutils.util import argparse -import multiprocessing import paddle.v2 as paddle from data_utils.data import DataGenerator from model import DeepSpeech2Model from error_rate import wer, cer -NUM_CPU = multiprocessing.cpu_count() // 2 -parser = argparse.ArgumentParser(description=__doc__) - def add_arg(argname, type, default, help, **kwargs): type = distutils.util.strtobool if type == bool else type @@ -26,11 +22,12 @@ def add_arg(argname, type, default, help, **kwargs): # yapf: disable +parser = argparse.ArgumentParser(description=__doc__) add_arg('batch_size', int, 128, "Minibatch size.") add_arg('trainer_count', int, 8, "# of Trainers (CPUs or GPUs).") add_arg('beam_size', int, 500, "Beam search width.") -add_arg('parallels_bsearch',int, NUM_CPU,"# of CPUs for beam search.") -add_arg('parallels_data', int, NUM_CPU,"# of CPUs for data preprocessing.") +add_arg('parallels_bsearch',int, 12, "# of CPUs for beam search.") +add_arg('parallels_data', int, 12, "# of CPUs for data preprocessing.") add_arg('num_conv_layers', int, 2, "# of convolution layers.") add_arg('num_rnn_layers', int, 3, "# of recurrent layers.") add_arg('rnn_layer_size', int, 2048, "# of recurrent cells per layer.") diff --git a/infer.py b/infer.py index 313f80c0..e08cb1ca 100644 --- a/infer.py +++ b/infer.py @@ -5,15 +5,11 @@ from __future__ import print_function import argparse import distutils.util -import multiprocessing import paddle.v2 as paddle from data_utils.data import DataGenerator from model import DeepSpeech2Model from error_rate import wer, cer -NUM_CPU = multiprocessing.cpu_count() // 2 -parser = argparse.ArgumentParser(description=__doc__) - def add_arg(argname, type, default, help, **kwargs): type = distutils.util.strtobool if type == bool else type @@ -26,11 +22,11 @@ def add_arg(argname, type, default, help, **kwargs): # yapf: disable -# configurations of overall +parser = argparse.ArgumentParser(description=__doc__) add_arg('num_samples', int, 10, "# of samples to infer.") add_arg('trainer_count', int, 8, "# of Trainers (CPUs or GPUs).") add_arg('beam_size', int, 500, "Beam search width.") -add_arg('parallels_bsearch',int, NUM_CPU,"# of CPUs for beam search.") +add_arg('parallels_bsearch',int, 12, "# of CPUs for beam search.") add_arg('num_conv_layers', int, 2, "# of convolution layers.") add_arg('num_rnn_layers', int, 3, "# of recurrent layers.") add_arg('rnn_layer_size', int, 2048, "# of recurrent cells per layer.") diff --git a/tools/build_vocab.py b/tools/build_vocab.py index 618f2498..f6cf6b9f 100644 --- a/tools/build_vocab.py +++ b/tools/build_vocab.py @@ -14,26 +14,31 @@ import os.path import _init_paths from data_utils import utils + +def add_arg(argname, type, default, help, **kwargs): + type = distutils.util.strtobool if type == bool else type + parser.add_argument( + "--" + argname, + default=default, + type=type, + help=help + ' Default: %(default)s.', + **kwargs) + + +# yapf: disable parser = argparse.ArgumentParser(description=__doc__) -parser.add_argument( - "--manifest_paths", - type=str, - help="Manifest paths for building vocabulary." - "You can provide multiple manifest files.", - nargs='+', - required=True) -parser.add_argument( - "--count_threshold", - default=0, - type=int, - help="Characters whose counts are below the threshold will be truncated. " - "(default: %(default)i)") -parser.add_argument( - "--vocab_path", - default='datasets/vocab/zh_vocab.txt', - type=str, - help="File path to write the vocabulary. (default: %(default)s)") +add_arg('count_threshold', int, 0, "Truncation threshold for char counts.") +add_arg('vocab_path', str, + 'datasets/vocab/zh_vocab.txt', + "Filepath to write the vocabulary.") +add_arg('manifest_paths', str, + None, + "Filepaths of manifests for building vocabulary. " + "You can provide multiple manifest files.", + nargs='+', + required=True) args = parser.parse_args() +# yapf: disable def count_manifest(counter, manifest_path): @@ -43,7 +48,16 @@ def count_manifest(counter, manifest_path): counter.update(char) +def print_arguments(args): + print("----------- Configuration Arguments -----------") + for arg, value in sorted(vars(args).iteritems()): + print("%s: %s" % (arg, value)) + print("------------------------------------------------") + + def main(): + print_arguments(args) + counter = Counter() for manifest_path in args.manifest_paths: count_manifest(counter, manifest_path) diff --git a/tools/compute_mean_std.py b/tools/compute_mean_std.py index da49eb4c..913a4334 100644 --- a/tools/compute_mean_std.py +++ b/tools/compute_mean_std.py @@ -9,43 +9,45 @@ from data_utils.normalizer import FeatureNormalizer from data_utils.augmentor.augmentation import AugmentationPipeline from data_utils.featurizer.audio_featurizer import AudioFeaturizer -parser = argparse.ArgumentParser( - description='Computing mean and stddev for feature normalizer.') -parser.add_argument( - "--specgram_type", - default='linear', - type=str, - help="Feature type of audio data: 'linear' (power spectrum)" - " or 'mfcc'. (default: %(default)s)") -parser.add_argument( - "--manifest_path", - default='datasets/manifest.train', - type=str, - help="Manifest path for computing normalizer's mean and stddev." - "(default: %(default)s)") -parser.add_argument( - "--num_samples", - default=2000, - type=int, - help="Number of samples for computing mean and stddev. " - "(default: %(default)s)") -parser.add_argument( - "--augmentation_config", - default='{}', - type=str, - help="Augmentation configuration in json-format. " - "(default: %(default)s)") -parser.add_argument( - "--output_file", - default='mean_std.npz', - type=str, - help="Filepath to write mean and std to (.npz)." - "(default: %(default)s)") + +def add_arg(argname, type, default, help, **kwargs): + type = distutils.util.strtobool if type == bool else type + parser.add_argument( + "--" + argname, + default=default, + type=type, + help=help + ' Default: %(default)s.', + **kwargs) + + +# yapf: disable +parser = argparse.ArgumentParser(description=__doc__) +add_arg('num_samples', int, 2000, "# of samples to for statistics.") +add_arg('specgram_type', str, + 'linear', + "Audio feature type. Options: linear, mfcc.", + choices=['linear', 'mfcc']) +add_arg('manifest_path', str, + 'datasets/manifest.train', + "Filepath of manifest to compute normalizer's mean and stddev.") +add_arg('output_path', str, + 'mean_std.npz', + "Filepath of write mean and stddev to (.npz).") args = parser.parse_args() +# yapf: disable + + +def print_arguments(args): + print("----------- Configuration Arguments -----------") + for arg, value in sorted(vars(args).iteritems()): + print("%s: %s" % (arg, value)) + print("------------------------------------------------") def main(): - augmentation_pipeline = AugmentationPipeline(args.augmentation_config) + print_arguments(args) + + augmentation_pipeline = AugmentationPipeline('{}') audio_featurizer = AudioFeaturizer(specgram_type=args.specgram_type) def augment_and_featurize(audio_segment): @@ -57,7 +59,7 @@ def main(): manifest_path=args.manifest_path, featurize_func=augment_and_featurize, num_samples=args.num_samples) - normalizer.write_to_file(args.output_file) + normalizer.write_to_file(args.output_path) if __name__ == '__main__': diff --git a/train.py b/train.py index 3d658d27..bd00d21d 100644 --- a/train.py +++ b/train.py @@ -5,14 +5,10 @@ from __future__ import print_function import argparse import distutils.util -import multiprocessing import paddle.v2 as paddle from model import DeepSpeech2Model from data_utils.data import DataGenerator -NUM_CPU = multiprocessing.cpu_count() // 2 -parser = argparse.ArgumentParser(description=__doc__) - def add_arg(argname, type, default, help, **kwargs): type = distutils.util.strtobool if type == bool else type @@ -25,10 +21,11 @@ def add_arg(argname, type, default, help, **kwargs): # yapf: disable +parser = argparse.ArgumentParser(description=__doc__) add_arg('batch_size', int, 256, "Minibatch size.") add_arg('trainer_count', int, 8, "# of Trainers (CPUs or GPUs).") add_arg('num_passes', int, 200, "# of training epochs.") -add_arg('parallels_data', int, NUM_CPU,"# of CPUs for data preprocessing.") +add_arg('parallels_data', int, 12, "# of CPUs for data preprocessing.") add_arg('num_conv_layers', int, 2, "# of convolution layers.") add_arg('num_rnn_layers', int, 3, "# of recurrent layers.") add_arg('rnn_layer_size', int, 2048, "# of recurrent cells per layer.") diff --git a/tune.py b/tune.py index 2fbe0b98..e066596c 100644 --- a/tune.py +++ b/tune.py @@ -6,15 +6,11 @@ from __future__ import print_function import numpy as np import distutils.util import argparse -import multiprocessing import paddle.v2 as paddle from data_utils.data import DataGenerator from model import DeepSpeech2Model from error_rate import wer -NUM_CPU = multiprocessing.cpu_count() // 2 -parser = argparse.ArgumentParser(description=__doc__) - def add_arg(argname, type, default, help, **kwargs): type = distutils.util.strtobool if type == bool else type @@ -27,10 +23,11 @@ def add_arg(argname, type, default, help, **kwargs): # yapf: disable +parser = argparse.ArgumentParser(description=__doc__) add_arg('num_samples', int, 100, "# of samples to infer.") add_arg('trainer_count', int, 8, "# of Trainers (CPUs or GPUs).") add_arg('beam_size', int, 500, "Beam search width.") -add_arg('parallels_bsearch',int, NUM_CPU,"# of CPUs for beam search.") +add_arg('parallels_bsearch',int, 12, "# of CPUs for beam search.") add_arg('num_conv_layers', int, 2, "# of convolution layers.") add_arg('num_rnn_layers', int, 3, "# of recurrent layers.") add_arg('rnn_layer_size', int, 2048, "# of recurrent cells per layer.") @@ -73,6 +70,7 @@ args = parser.parse_args() # yapf: disable + def tune(): """Tune parameters alpha and beta on one minibatch.""" if not args.num_alphas >= 0: From 9571b6fc0e186a14d10c4b464b8e65883d2ced4b Mon Sep 17 00:00:00 2001 From: Xinghai Sun Date: Tue, 5 Sep 2017 14:23:27 +0800 Subject: [PATCH 13/18] Add back utils.py. --- demo_server.py | 26 +++++----------------- evaluate.py | 26 +++++----------------- infer.py | 26 +++++----------------- tools/build_vocab.py | 25 +++++---------------- tools/compute_mean_std.py | 25 +++++---------------- train.py | 26 +++++----------------- tune.py | 27 +++++----------------- utils.py | 47 +++++++++++++++++++++++++++++++++++++++ 8 files changed, 82 insertions(+), 146 deletions(-) create mode 100644 utils.py diff --git a/demo_server.py b/demo_server.py index 81b56f94..6b73971a 100644 --- a/demo_server.py +++ b/demo_server.py @@ -3,7 +3,7 @@ import os import time import random import argparse -import distutils.util +import functools from time import gmtime, strftime import SocketServer import struct @@ -12,20 +12,11 @@ import paddle.v2 as paddle from data_utils.data import DataGenerator from model import DeepSpeech2Model from data_utils.utils import read_manifest +from utils import add_arguments, print_arguments - -def add_arg(argname, type, default, help, **kwargs): - type = distutils.util.strtobool if type == bool else type - parser.add_argument( - "--" + argname, - default=default, - type=type, - help=help + ' Default: %(default)s.', - **kwargs) - - -# yapf: disable parser = argparse.ArgumentParser(description=__doc__) +add_arg = functools.partial(add_arguments, argparser=parser) +# yapf: disable add_arg('host_port', int, 8086, "Server's IP port.") add_arg('beam_size', int, 500, "Beam search width.") add_arg('num_conv_layers', int, 2, "# of convolution layers.") @@ -68,8 +59,8 @@ add_arg('specgram_type', str, 'linear', "Audio feature type. Options: linear, mfcc.", choices=['linear', 'mfcc']) -args = parser.parse_args() # yapf: disable +args = parser.parse_args() class AsrTCPServer(SocketServer.TCPServer): @@ -198,13 +189,6 @@ def start_server(): server.serve_forever() -def print_arguments(args): - print("----------- Configuration Arguments -----------") - for arg, value in sorted(vars(args).iteritems()): - print("%s: %s" % (arg, value)) - print("------------------------------------------------") - - def main(): print_arguments(args) paddle.init(use_gpu=args.use_gpu, trainer_count=1) diff --git a/evaluate.py b/evaluate.py index 38204c57..35888f82 100644 --- a/evaluate.py +++ b/evaluate.py @@ -3,26 +3,17 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -import distutils.util import argparse +import functools import paddle.v2 as paddle from data_utils.data import DataGenerator from model import DeepSpeech2Model from error_rate import wer, cer +from utils import add_arguments, print_arguments - -def add_arg(argname, type, default, help, **kwargs): - type = distutils.util.strtobool if type == bool else type - parser.add_argument( - "--" + argname, - default=default, - type=type, - help=help + ' Default: %(default)s.', - **kwargs) - - -# yapf: disable parser = argparse.ArgumentParser(description=__doc__) +add_arg = functools.partial(add_arguments, argparser=parser) +# yapf: disable add_arg('batch_size', int, 128, "Minibatch size.") add_arg('trainer_count', int, 8, "# of Trainers (CPUs or GPUs).") add_arg('beam_size', int, 500, "Beam search width.") @@ -66,8 +57,8 @@ add_arg('specgram_type', str, 'linear', "Audio feature type. Options: linear, mfcc.", choices=['linear', 'mfcc']) -args = parser.parse_args() # yapf: disable +args = parser.parse_args() def evaluate(): @@ -120,13 +111,6 @@ def evaluate(): (args.error_rate_type, num_ins, num_ins, error_sum / num_ins)) -def print_arguments(args): - print("----------- Configuration Arguments -----------") - for arg, value in sorted(vars(args).iteritems()): - print("%s: %s" % (arg, value)) - print("------------------------------------------------") - - def main(): print_arguments(args) paddle.init(use_gpu=args.use_gpu, trainer_count=args.trainer_count) diff --git a/infer.py b/infer.py index e08cb1ca..9d4bff84 100644 --- a/infer.py +++ b/infer.py @@ -4,25 +4,16 @@ from __future__ import division from __future__ import print_function import argparse -import distutils.util +import functools import paddle.v2 as paddle from data_utils.data import DataGenerator from model import DeepSpeech2Model from error_rate import wer, cer +from utils import add_arguments, print_arguments - -def add_arg(argname, type, default, help, **kwargs): - type = distutils.util.strtobool if type == bool else type - parser.add_argument( - "--" + argname, - default=default, - type=type, - help=help + ' Default: %(default)s.', - **kwargs) - - -# yapf: disable parser = argparse.ArgumentParser(description=__doc__) +add_arg = functools.partial(add_arguments, argparser=parser) +# yapf: disable add_arg('num_samples', int, 10, "# of samples to infer.") add_arg('trainer_count', int, 8, "# of Trainers (CPUs or GPUs).") add_arg('beam_size', int, 500, "Beam search width.") @@ -65,8 +56,8 @@ add_arg('specgram_type', str, 'linear', "Audio feature type. Options: linear, mfcc.", choices=['linear', 'mfcc']) -args = parser.parse_args() # yapf: disable +args = parser.parse_args() def infer(): @@ -116,13 +107,6 @@ def infer(): (args.error_rate_type, error_rate_func(target, result))) -def print_arguments(args): - print("----------- Configuration Arguments -----------") - for arg, value in sorted(vars(args).iteritems()): - print("%s: %s" % (arg, value)) - print("------------------------------------------------") - - def main(): print_arguments(args) paddle.init(use_gpu=args.use_gpu, trainer_count=args.trainer_count) diff --git a/tools/build_vocab.py b/tools/build_vocab.py index f6cf6b9f..ac600302 100644 --- a/tools/build_vocab.py +++ b/tools/build_vocab.py @@ -7,26 +7,18 @@ from __future__ import division from __future__ import print_function import argparse +import functools import codecs import json from collections import Counter import os.path import _init_paths from data_utils import utils +from utils import add_arguments, print_arguments - -def add_arg(argname, type, default, help, **kwargs): - type = distutils.util.strtobool if type == bool else type - parser.add_argument( - "--" + argname, - default=default, - type=type, - help=help + ' Default: %(default)s.', - **kwargs) - - -# yapf: disable parser = argparse.ArgumentParser(description=__doc__) +add_arg = functools.partial(add_arguments, argparser=parser) +# yapf: disable add_arg('count_threshold', int, 0, "Truncation threshold for char counts.") add_arg('vocab_path', str, 'datasets/vocab/zh_vocab.txt', @@ -37,8 +29,8 @@ add_arg('manifest_paths', str, "You can provide multiple manifest files.", nargs='+', required=True) -args = parser.parse_args() # yapf: disable +args = parser.parse_args() def count_manifest(counter, manifest_path): @@ -48,13 +40,6 @@ def count_manifest(counter, manifest_path): counter.update(char) -def print_arguments(args): - print("----------- Configuration Arguments -----------") - for arg, value in sorted(vars(args).iteritems()): - print("%s: %s" % (arg, value)) - print("------------------------------------------------") - - def main(): print_arguments(args) diff --git a/tools/compute_mean_std.py b/tools/compute_mean_std.py index 913a4334..9f7bf06c 100644 --- a/tools/compute_mean_std.py +++ b/tools/compute_mean_std.py @@ -4,24 +4,16 @@ from __future__ import division from __future__ import print_function import argparse +import functools import _init_paths from data_utils.normalizer import FeatureNormalizer from data_utils.augmentor.augmentation import AugmentationPipeline from data_utils.featurizer.audio_featurizer import AudioFeaturizer +from utils import add_arguments, print_arguments - -def add_arg(argname, type, default, help, **kwargs): - type = distutils.util.strtobool if type == bool else type - parser.add_argument( - "--" + argname, - default=default, - type=type, - help=help + ' Default: %(default)s.', - **kwargs) - - -# yapf: disable parser = argparse.ArgumentParser(description=__doc__) +add_arg = functools.partial(add_arguments, argparser=parser) +# yapf: disable add_arg('num_samples', int, 2000, "# of samples to for statistics.") add_arg('specgram_type', str, 'linear', @@ -33,15 +25,8 @@ add_arg('manifest_path', str, add_arg('output_path', str, 'mean_std.npz', "Filepath of write mean and stddev to (.npz).") -args = parser.parse_args() # yapf: disable - - -def print_arguments(args): - print("----------- Configuration Arguments -----------") - for arg, value in sorted(vars(args).iteritems()): - print("%s: %s" % (arg, value)) - print("------------------------------------------------") +args = parser.parse_args() def main(): diff --git a/train.py b/train.py index bd00d21d..966e1d9b 100644 --- a/train.py +++ b/train.py @@ -4,24 +4,15 @@ from __future__ import division from __future__ import print_function import argparse -import distutils.util +import functools import paddle.v2 as paddle from model import DeepSpeech2Model from data_utils.data import DataGenerator +from utils import add_arguments, print_arguments - -def add_arg(argname, type, default, help, **kwargs): - type = distutils.util.strtobool if type == bool else type - parser.add_argument( - "--" + argname, - default=default, - type=type, - help=help + ' Default: %(default)s.', - **kwargs) - - -# yapf: disable parser = argparse.ArgumentParser(description=__doc__) +add_arg = functools.partial(add_arguments, argparser=parser) +# yapf: disable add_arg('batch_size', int, 256, "Minibatch size.") add_arg('trainer_count', int, 8, "# of Trainers (CPUs or GPUs).") add_arg('num_passes', int, 200, "# of training epochs.") @@ -70,8 +61,8 @@ add_arg('shuffle_method', str, 'batch_shuffle_clipped', "Shuffle method.", choices=['instance_shuffle', 'batch_shuffle', 'batch_shuffle_clipped']) -args = parser.parse_args() # yapf: disable +args = parser.parse_args() def train(): @@ -123,13 +114,6 @@ def train(): is_local=args.is_local) -def print_arguments(args): - print("----------- Configuration Arguments -----------") - for arg, value in sorted(vars(args).iteritems()): - print("%s: %s" % (arg, value)) - print("------------------------------------------------") - - def main(): print_arguments(args) paddle.init(use_gpu=args.use_gpu, trainer_count=args.trainer_count) diff --git a/tune.py b/tune.py index e066596c..62e8f288 100644 --- a/tune.py +++ b/tune.py @@ -4,26 +4,17 @@ from __future__ import division from __future__ import print_function import numpy as np -import distutils.util import argparse +import functools import paddle.v2 as paddle from data_utils.data import DataGenerator from model import DeepSpeech2Model from error_rate import wer +from utils import add_arguments, print_arguments - -def add_arg(argname, type, default, help, **kwargs): - type = distutils.util.strtobool if type == bool else type - parser.add_argument( - "--" + argname, - default=default, - type=type, - help=help + ' Default: %(default)s.', - **kwargs) - - -# yapf: disable parser = argparse.ArgumentParser(description=__doc__) +add_arg = functools.partial(add_arguments, argparser=parser) +# yapf: disable add_arg('num_samples', int, 100, "# of samples to infer.") add_arg('trainer_count', int, 8, "# of Trainers (CPUs or GPUs).") add_arg('beam_size', int, 500, "Beam search width.") @@ -66,9 +57,8 @@ add_arg('specgram_type', str, 'linear', "Audio feature type. Options: linear, mfcc.", choices=['linear', 'mfcc']) -args = parser.parse_args() # yapf: disable - +args = parser.parse_args() def tune(): @@ -130,13 +120,6 @@ def tune(): (alpha, beta, wer_sum / num_ins)) -def print_arguments(args): - print("----------- Configuration Arguments -----------") - for arg, value in sorted(vars(args).iteritems()): - print("%s: %s" % (arg, value)) - print("------------------------------------------------") - - def main(): print_arguments(args) paddle.init(use_gpu=args.use_gpu, trainer_count=args.trainer_count) diff --git a/utils.py b/utils.py new file mode 100644 index 00000000..2e489ade --- /dev/null +++ b/utils.py @@ -0,0 +1,47 @@ +"""Contains common utility functions.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import distutils.util + + +def print_arguments(args): + """Print argparse's arguments. + + Usage: + + .. code-block:: python + + parser = argparse.ArgumentParser() + parser.add_argument("name", default="Jonh", type=str, help="User name.") + args = parser.parse_args() + print_arguments(args) + + :param args: Input argparse.Namespace for printing. + :type args: argparse.Namespace + """ + print("----------- Configuration Arguments -----------") + for arg, value in sorted(vars(args).iteritems()): + print("%s: %s" % (arg, value)) + print("------------------------------------------------") + + +def add_arguments(argname, type, default, help, argparser, **kwargs): + """Add argparse's argument. + + Usage: + + .. code-block:: python + + parser = argparse.ArgumentParser() + add_argument("name", str, "Jonh", "User name.", parser) + args = parser.parse_args() + """ + type = distutils.util.strtobool if type == bool else type + argparser.add_argument( + "--" + argname, + default=default, + type=type, + help=help + ' Default: %(default)s.', + **kwargs) From e8f7a8fde1cbe78e3695c49804d9bbe8e305826d Mon Sep 17 00:00:00 2001 From: Xinghai Sun Date: Tue, 5 Sep 2017 16:30:42 +0800 Subject: [PATCH 14/18] Update argument naming following Yibing's reviews. --- demo_server.py | 8 ++++---- evaluate.py | 16 ++++++++-------- infer.py | 12 ++++++------ model.py | 16 ++++++++-------- train.py | 8 ++++---- tune.py | 8 ++++---- 6 files changed, 34 insertions(+), 34 deletions(-) diff --git a/demo_server.py b/demo_server.py index 6b73971a..7cbee1fd 100644 --- a/demo_server.py +++ b/demo_server.py @@ -25,7 +25,7 @@ add_arg('rnn_layer_size', int, 2048, "# of recurrent cells per layer.") add_arg('alpha', float, 0.36, "Coef of LM for beam search.") add_arg('beta', float, 0.25, "Coef of WC for beam search.") add_arg('cutoff_prob', float, 0.99, "Cutoff probability for pruning.") -add_arg('use_gru', bool, False, "Use GRUs instead of Simple RNNs.") +add_arg('use_gru', bool, False, "Use GRUs instead of simple RNNs.") add_arg('use_gpu', bool, True, "Use GPU or not.") add_arg('share_rnn_weights',bool, True, "Share input-hidden weights across " "bi-directional RNNs. Not for GRU.") @@ -51,9 +51,9 @@ add_arg('model_path', str, add_arg('lang_model_path', str, 'lm/data/common_crawl_00.prune01111.trie.klm', "Filepath for language model.") -add_arg('decoder_method', str, +add_arg('decoding_method', str, 'ctc_beam_search', - "Decoder method. Options: ctc_beam_search, ctc_greedy", + "Decoding method. Options: ctc_beam_search, ctc_greedy", choices = ['ctc_beam_search', 'ctc_greedy']) add_arg('specgram_type', str, 'linear', @@ -160,7 +160,7 @@ def start_server(): feature = data_generator.process_utterance(filename, "") result_transcript = ds2_model.infer_batch( infer_data=[feature], - decoder_method=args.decoder_method, + decoding_method=args.decoding_method, beam_alpha=args.alpha, beam_beta=args.beta, beam_size=args.beam_size, diff --git a/evaluate.py b/evaluate.py index 35888f82..1cc307da 100644 --- a/evaluate.py +++ b/evaluate.py @@ -17,15 +17,15 @@ add_arg = functools.partial(add_arguments, argparser=parser) add_arg('batch_size', int, 128, "Minibatch size.") add_arg('trainer_count', int, 8, "# of Trainers (CPUs or GPUs).") add_arg('beam_size', int, 500, "Beam search width.") -add_arg('parallels_bsearch',int, 12, "# of CPUs for beam search.") -add_arg('parallels_data', int, 12, "# of CPUs for data preprocessing.") +add_arg('num_proc_bsearch', int, 12, "# of CPUs for beam search.") +add_arg('num_proc_data', int, 12, "# of CPUs for data preprocessing.") add_arg('num_conv_layers', int, 2, "# of convolution layers.") add_arg('num_rnn_layers', int, 3, "# of recurrent layers.") add_arg('rnn_layer_size', int, 2048, "# of recurrent cells per layer.") add_arg('alpha', float, 0.36, "Coef of LM for beam search.") add_arg('beta', float, 0.25, "Coef of WC for beam search.") add_arg('cutoff_prob', float, 0.99, "Cutoff probability for pruning.") -add_arg('use_gru', bool, False, "Use GRUs instead of Simple RNNs.") +add_arg('use_gru', bool, False, "Use GRUs instead of simple RNNs.") add_arg('use_gpu', bool, True, "Use GPU or not.") add_arg('share_rnn_weights',bool, True, "Share input-hidden weights across " "bi-directional RNNs. Not for GRU.") @@ -45,9 +45,9 @@ add_arg('model_path', str, add_arg('lang_model_path', str, 'lm/data/common_crawl_00.prune01111.trie.klm', "Filepath for language model.") -add_arg('decoder_method', str, +add_arg('decoding_method', str, 'ctc_beam_search', - "Decoder method. Options: ctc_beam_search, ctc_greedy", + "Decoding method. Options: ctc_beam_search, ctc_greedy", choices = ['ctc_beam_search', 'ctc_greedy']) add_arg('error_rate_type', str, 'wer', @@ -68,7 +68,7 @@ def evaluate(): mean_std_filepath=args.mean_std_path, augmentation_config='{}', specgram_type=args.specgram_type, - num_threads=args.parallels_data) + num_threads=args.num_proc_data) batch_reader = data_generator.batch_reader_creator( manifest_path=args.test_manifest, batch_size=args.batch_size, @@ -90,14 +90,14 @@ def evaluate(): for infer_data in batch_reader(): result_transcripts = ds2_model.infer_batch( infer_data=infer_data, - decoder_method=args.decoder_method, + decoding_method=args.decoding_method, beam_alpha=args.alpha, beam_beta=args.beta, beam_size=args.beam_size, cutoff_prob=args.cutoff_prob, vocab_list=data_generator.vocab_list, language_model_path=args.lang_model_path, - num_processes=args.parallels_bsearch) + num_processes=args.num_proc_bsearch) target_transcripts = [ ''.join([data_generator.vocab_list[token] for token in transcript]) for _, transcript in infer_data diff --git a/infer.py b/infer.py index 9d4bff84..3fd835b4 100644 --- a/infer.py +++ b/infer.py @@ -17,14 +17,14 @@ add_arg = functools.partial(add_arguments, argparser=parser) add_arg('num_samples', int, 10, "# of samples to infer.") add_arg('trainer_count', int, 8, "# of Trainers (CPUs or GPUs).") add_arg('beam_size', int, 500, "Beam search width.") -add_arg('parallels_bsearch',int, 12, "# of CPUs for beam search.") +add_arg('num_proc_bsearch', int, 12, "# of CPUs for beam search.") add_arg('num_conv_layers', int, 2, "# of convolution layers.") add_arg('num_rnn_layers', int, 3, "# of recurrent layers.") add_arg('rnn_layer_size', int, 2048, "# of recurrent cells per layer.") add_arg('alpha', float, 0.36, "Coef of LM for beam search.") add_arg('beta', float, 0.25, "Coef of WC for beam search.") add_arg('cutoff_prob', float, 0.99, "Cutoff probability for pruning.") -add_arg('use_gru', bool, False, "Use GRUs instead of Simple RNNs.") +add_arg('use_gru', bool, False, "Use GRUs instead of simple RNNs.") add_arg('use_gpu', bool, True, "Use GPU or not.") add_arg('share_rnn_weights',bool, True, "Share input-hidden weights across " "bi-directional RNNs. Not for GRU.") @@ -44,9 +44,9 @@ add_arg('model_path', str, './checkpoints/params.latest.tar.gz', "If None, the training starts from scratch, " "otherwise, it resumes from the pre-trained model.") -add_arg('decoder_method', str, +add_arg('decoding_method', str, 'ctc_beam_search', - "Decoder method. Options: ctc_beam_search, ctc_greedy", + "Decoding method. Options: ctc_beam_search, ctc_greedy", choices = ['ctc_beam_search', 'ctc_greedy']) add_arg('error_rate_type', str, 'wer', @@ -86,14 +86,14 @@ def infer(): share_rnn_weights=args.share_rnn_weights) result_transcripts = ds2_model.infer_batch( infer_data=infer_data, - decoder_method=args.decoder_method, + decoding_method=args.decoding_method, beam_alpha=args.alpha, beam_beta=args.beta, beam_size=args.beam_size, cutoff_prob=args.cutoff_prob, vocab_list=data_generator.vocab_list, language_model_path=args.lang_model_path, - num_processes=args.parallels_bsearch) + num_processes=args.num_proc_bsearch) error_rate_func = cer if args.error_rate_type == 'cer' else wer target_transcripts = [ diff --git a/model.py b/model.py index 894605bf..06f69290 100644 --- a/model.py +++ b/model.py @@ -146,7 +146,7 @@ class DeepSpeech2Model(object): # run inference return self._loss_inferer.infer(input=infer_data) - def infer_batch(self, infer_data, decoder_method, beam_alpha, beam_beta, + def infer_batch(self, infer_data, decoding_method, beam_alpha, beam_beta, beam_size, cutoff_prob, vocab_list, language_model_path, num_processes): """Model inference. Infer the transcription for a batch of speech @@ -156,9 +156,9 @@ class DeepSpeech2Model(object): consisting of a tuple of audio features and transcription text (empty string). :type infer_data: list - :param decoder_method: Decoding method name, 'ctc_greedy' or - 'ctc_beam_search'. - :param decoder_method: string + :param decoding_method: Decoding method name, 'ctc_greedy' or + 'ctc_beam_search'. + :param decoding_method: string :param beam_alpha: Parameter associated with language model. :type beam_alpha: float :param beam_beta: Parameter associated with word count. @@ -190,13 +190,13 @@ class DeepSpeech2Model(object): ] # run decoder results = [] - if decoder_method == "ctc_greedy": + if decoding_method == "ctc_greedy": # best path decode for i, probs in enumerate(probs_split): output_transcription = ctc_greedy_decoder( probs_seq=probs, vocabulary=vocab_list) results.append(output_transcription) - elif decoder_method == "ctc_beam_search": + elif decoding_method == "ctc_beam_search": # initialize external scorer if self._ext_scorer == None: self._ext_scorer = LmScorer(beam_alpha, beam_beta, @@ -217,8 +217,8 @@ class DeepSpeech2Model(object): results = [result[0][1] for result in beam_search_results] else: - raise ValueError("Decoder method [%s] is not supported." % - decoder_method) + raise ValueError("Decoding method [%s] is not supported." % + decoding_method) return results def _create_parameters(self, model_path=None): diff --git a/train.py b/train.py index 966e1d9b..7cef7539 100644 --- a/train.py +++ b/train.py @@ -16,7 +16,7 @@ add_arg = functools.partial(add_arguments, argparser=parser) add_arg('batch_size', int, 256, "Minibatch size.") add_arg('trainer_count', int, 8, "# of Trainers (CPUs or GPUs).") add_arg('num_passes', int, 200, "# of training epochs.") -add_arg('parallels_data', int, 12, "# of CPUs for data preprocessing.") +add_arg('num_proc_data', int, 12, "# of CPUs for data preprocessing.") add_arg('num_conv_layers', int, 2, "# of convolution layers.") add_arg('num_rnn_layers', int, 3, "# of recurrent layers.") add_arg('rnn_layer_size', int, 2048, "# of recurrent cells per layer.") @@ -28,7 +28,7 @@ add_arg('min_duration', float, 0.0, "Shortest audio duration allowed.") add_arg('use_sortagrad', bool, True, "Use SortaGrad or not.") add_arg('use_gpu', bool, True, "Use GPU or not.") add_arg('is_local', bool, True, "Use pserver or not.") -add_arg('use_gru', bool, False, "Use GRUs instead of Simple RNNs.") +add_arg('use_gru', bool, False, "Use GRUs instead of simple RNNs.") add_arg('share_rnn_weights',bool, True, "Share input-hidden weights across " "bi-directional RNNs. Not for GRU.") add_arg('train_manifest', str, @@ -74,13 +74,13 @@ def train(): max_duration=args.max_duration, min_duration=args.min_duration, specgram_type=args.specgram_type, - num_threads=args.parallels_data) + num_threads=args.num_proc_data) dev_generator = DataGenerator( vocab_filepath=args.vocab_path, mean_std_filepath=args.mean_std_path, augmentation_config="{}", specgram_type=args.specgram_type, - num_threads=args.parallels_data) + num_threads=args.num_proc_data) train_batch_reader = train_generator.batch_reader_creator( manifest_path=args.train_manifest, batch_size=args.batch_size, diff --git a/tune.py b/tune.py index 62e8f288..eab00cfd 100644 --- a/tune.py +++ b/tune.py @@ -18,7 +18,7 @@ add_arg = functools.partial(add_arguments, argparser=parser) add_arg('num_samples', int, 100, "# of samples to infer.") add_arg('trainer_count', int, 8, "# of Trainers (CPUs or GPUs).") add_arg('beam_size', int, 500, "Beam search width.") -add_arg('parallels_bsearch',int, 12, "# of CPUs for beam search.") +add_arg('num_proc_bsearch', int, 12, "# of CPUs for beam search.") add_arg('num_conv_layers', int, 2, "# of convolution layers.") add_arg('num_rnn_layers', int, 3, "# of recurrent layers.") add_arg('rnn_layer_size', int, 2048, "# of recurrent cells per layer.") @@ -29,7 +29,7 @@ add_arg('alpha_to', float, 0.36, "Where alpha ends tuning with.") add_arg('beta_from', float, 0.05, "Where beta starts tuning from.") add_arg('beta_to', float, 0.36, "Where beta ends tuning with.") add_arg('cutoff_prob', float, 0.99, "Cutoff probability for pruning.") -add_arg('use_gru', bool, False, "Use GRUs instead of Simple RNNs.") +add_arg('use_gru', bool, False, "Use GRUs instead of simple RNNs.") add_arg('use_gpu', bool, True, "Use GPU or not.") add_arg('share_rnn_weights',bool, True, "Share input-hidden weights across " "bi-directional RNNs. Not for GRU.") @@ -104,14 +104,14 @@ def tune(): for alpha, beta in params_grid: result_transcripts = ds2_model.infer_batch( infer_data=tune_data, - decoder_method='ctc_beam_search', + decoding_method='ctc_beam_search', beam_alpha=alpha, beam_beta=beta, beam_size=args.beam_size, cutoff_prob=args.cutoff_prob, vocab_list=data_generator.vocab_list, language_model_path=args.lang_model_path, - num_processes=args.parallels_bsearch) + num_processes=args.num_proc_bsearch) wer_sum, num_ins = 0.0, 0 for target, result in zip(target_transcripts, result_transcripts): wer_sum += wer(target, result) From 0bbb9c3ee21e48a215ab226d6963077b3ab4a336 Mon Sep 17 00:00:00 2001 From: Xinghai Sun Date: Tue, 5 Sep 2017 23:50:41 +0800 Subject: [PATCH 15/18] Re-organize folder structure and hierarchy for DS2. --- README.md | 0 cloud/README.md | 0 cloud/pcloud_submit.sh | 8 +++-- cloud/pcloud_train.sh | 32 +++++++++++++++---- cloud/pcloud_upload_data.sh | 9 ++++-- .../vocab => data/librispeech}/eng_vocab.txt | 0 {datasets => data}/librispeech/librispeech.py | 0 {datasets => data}/noise/chime3_background.py | 0 data_utils/augmentor/impulse_response.py | 9 +++--- data_utils/augmentor/noise_perturb.py | 9 +++--- data_utils/data.py | 4 +-- data_utils/featurizer/audio_featurizer.py | 2 +- data_utils/normalizer.py | 4 +-- data_utils/{utils.py => utility.py} | 0 datasets/run_all.sh | 13 -------- datasets/run_noise.sh | 10 ------ deploy/_init_paths.py | 19 +++++++++++ demo_client.py => deploy/demo_client.py | 0 demo_server.py => deploy/demo_server.py | 11 ++++--- evaluate.py | 12 +++---- examples/librispeech/generate.sh | 28 ++++++++++++++++ examples/librispeech/prepare_data.sh | 32 +++++++++++++++++++ examples/librispeech/run_test.sh | 28 ++++++++++++++++ examples/librispeech/run_train.sh | 30 +++++++++++++++++ examples/librispeech/run_tune.sh | 30 +++++++++++++++++ infer.py | 12 +++---- models/__init__.py | 0 model.py => models/model.py | 8 ++--- layer.py => models/network.py | 21 ++++++------ tools/build_vocab.py | 4 +-- tools/compute_mean_std.py | 2 +- tune.py => tools/tune.py | 15 +++++---- train.py | 16 +++++----- utils/__init__.py | 0 decoder.py => utils/decoder.py | 0 error_rate.py => utils/error_rate.py | 0 utils.py => utils/utility.py | 0 37 files changed, 269 insertions(+), 99 deletions(-) mode change 100755 => 100644 README.md mode change 100755 => 100644 cloud/README.md rename {datasets/vocab => data/librispeech}/eng_vocab.txt (100%) rename {datasets => data}/librispeech/librispeech.py (100%) rename {datasets => data}/noise/chime3_background.py (100%) rename data_utils/{utils.py => utility.py} (100%) delete mode 100644 datasets/run_all.sh delete mode 100644 datasets/run_noise.sh create mode 100644 deploy/_init_paths.py rename demo_client.py => deploy/demo_client.py (100%) rename demo_server.py => deploy/demo_server.py (96%) create mode 100644 examples/librispeech/generate.sh create mode 100644 examples/librispeech/prepare_data.sh create mode 100644 examples/librispeech/run_test.sh create mode 100644 examples/librispeech/run_train.sh create mode 100644 examples/librispeech/run_tune.sh create mode 100644 models/__init__.py rename model.py => models/model.py (98%) rename layer.py => models/network.py (95%) rename tune.py => tools/tune.py (93%) create mode 100644 utils/__init__.py rename decoder.py => utils/decoder.py (100%) rename error_rate.py => utils/error_rate.py (100%) rename utils.py => utils/utility.py (100%) diff --git a/README.md b/README.md old mode 100755 new mode 100644 diff --git a/cloud/README.md b/cloud/README.md old mode 100755 new mode 100644 diff --git a/cloud/pcloud_submit.sh b/cloud/pcloud_submit.sh index 3c9a1c26..378a7c6e 100644 --- a/cloud/pcloud_submit.sh +++ b/cloud/pcloud_submit.sh @@ -1,7 +1,9 @@ -TRAIN_MANIFEST="cloud/cloud.manifest.train" -DEV_MANIFEST="cloud/cloud.manifest.dev" +#! /usr/bin/bash + +TRAIN_MANIFEST="cloud/cloud_manifests/cloud.manifest.train" +DEV_MANIFEST="cloud/cloud_manifests/cloud.manifest.dev" CLOUD_MODEL_DIR="./checkpoints" -BATCH_SIZE=256 +BATCH_SIZE=512 NUM_GPU=8 NUM_NODE=1 IS_LOCAL="True" diff --git a/cloud/pcloud_train.sh b/cloud/pcloud_train.sh index 75949574..d04132f9 100644 --- a/cloud/pcloud_train.sh +++ b/cloud/pcloud_train.sh @@ -1,3 +1,5 @@ +#! /usr/bin/bash + TRAIN_MANIFEST=$1 DEV_MANIFEST=$2 MODEL_PATH=$3 @@ -14,11 +16,29 @@ python ./cloud/split_data.py \ --out_manifest_path='/local.manifest.dev' python -u train.py \ ---batch_size=$BATCH_SIZE \ ---use_gpu=1 \ +--batch_size=${BATCH_SIZE} \ --trainer_count=${NUM_GPU} \ ---num_threads_data=${NUM_GPU} \ +--num_passes=200 \ +--num_proc_data=${NUM_GPU} \ +--num_conv_layers=2 \ +--num_rnn_layers=3 \ +--rnn_layer_size=2048 \ +--num_iter_print=100 \ +--learning_rate=5e-4 \ +--max_duration=27.0 \ +--min_duration=0.0 \ +--use_sortagrad=True \ +--use_gru=False \ +--use_gpu=True \ --is_local=${IS_LOCAL} \ ---train_manifest_path='/local.manifest.train' \ ---dev_manifest_path='/local.manifest.dev' \ ---output_model_dir=${MODEL_PATH} 2>&1 | tee ./log/train.log +--share_rnn_weights=True \ +--train_manifest='/local.manifest.train' \ +--dev_manifest='/local.manifest.dev' \ +--mean_std_path='data/librispeech/mean_std.npz' \ +--vocab_path='data/librispeech/eng_vocab.txt' \ +--output_model_dir='./checkpoints' \ +--output_model_dir=${MODEL_PATH} \ +--augment_conf_path='conf/augmentation.config' \ +--specgram_type='linear' \ +--shuffle_method='batch_shuffle_clipped' \ +2>&1 | tee ./log/train.log diff --git a/cloud/pcloud_upload_data.sh b/cloud/pcloud_upload_data.sh index 97a0ab18..4ef235ef 100644 --- a/cloud/pcloud_upload_data.sh +++ b/cloud/pcloud_upload_data.sh @@ -1,5 +1,9 @@ -IN_MANIFESTS="../datasets/manifest.train ../datasets/manifest.dev ../datasets/manifest.test" -OUT_MANIFESTS="./cloud.manifest.train ./cloud.manifest.dev ./cloud.manifest.test" +#! /usr/bin/bash + +mkdir cloud_manifests + +IN_MANIFESTS="../data/librispeech/manifest.train ../data/librispeech/manifest.dev-clean ../data/librispeech/manifest.test-clean" +OUT_MANIFESTS="cloud_manifests/cloud.manifest.train cloud_manifests/cloud.manifest.dev cloud_manifests/cloud.manifest.test" CLOUD_DATA_DIR="/pfs/dlnel/home/USERNAME/deepspeech2/data/librispeech" NUM_SHARDS=50 @@ -14,4 +18,5 @@ then echo "Upload Data Failed!" exit 1 fi + echo "All Done." diff --git a/datasets/vocab/eng_vocab.txt b/data/librispeech/eng_vocab.txt similarity index 100% rename from datasets/vocab/eng_vocab.txt rename to data/librispeech/eng_vocab.txt diff --git a/datasets/librispeech/librispeech.py b/data/librispeech/librispeech.py similarity index 100% rename from datasets/librispeech/librispeech.py rename to data/librispeech/librispeech.py diff --git a/datasets/noise/chime3_background.py b/data/noise/chime3_background.py similarity index 100% rename from datasets/noise/chime3_background.py rename to data/noise/chime3_background.py diff --git a/data_utils/augmentor/impulse_response.py b/data_utils/augmentor/impulse_response.py index c3de0fdb..536b4d6a 100644 --- a/data_utils/augmentor/impulse_response.py +++ b/data_utils/augmentor/impulse_response.py @@ -4,23 +4,22 @@ from __future__ import division from __future__ import print_function from data_utils.augmentor.base import AugmentorBase -from data_utils import utils +from data_utils.utility import read_manifest from data_utils.audio import AudioSegment class ImpulseResponseAugmentor(AugmentorBase): """Augmentation model for adding impulse response effect. - + :param rng: Random generator object. :type rng: random.Random :param impulse_manifest_path: Manifest path for impulse audio data. - :type impulse_manifest_path: basestring + :type impulse_manifest_path: basestring """ def __init__(self, rng, impulse_manifest_path): self._rng = rng - self._impulse_manifest = utils.read_manifest( - manifest_path=impulse_manifest_path) + self._impulse_manifest = read_manifest(impulse_manifest_path) def transform_audio(self, audio_segment): """Add impulse response effect. diff --git a/data_utils/augmentor/noise_perturb.py b/data_utils/augmentor/noise_perturb.py index 281174af..96e0ff4d 100644 --- a/data_utils/augmentor/noise_perturb.py +++ b/data_utils/augmentor/noise_perturb.py @@ -4,13 +4,13 @@ from __future__ import division from __future__ import print_function from data_utils.augmentor.base import AugmentorBase -from data_utils import utils +from data_utils.utility import read_manifest from data_utils.audio import AudioSegment class NoisePerturbAugmentor(AugmentorBase): """Augmentation model for adding background noise. - + :param rng: Random generator object. :type rng: random.Random :param min_snr_dB: Minimal signal noise ratio, in decibels. @@ -18,15 +18,14 @@ class NoisePerturbAugmentor(AugmentorBase): :param max_snr_dB: Maximal signal noise ratio, in decibels. :type max_snr_dB: float :param noise_manifest_path: Manifest path for noise audio data. - :type noise_manifest_path: basestring + :type noise_manifest_path: basestring """ def __init__(self, rng, min_snr_dB, max_snr_dB, noise_manifest_path): self._min_snr_dB = min_snr_dB self._max_snr_dB = max_snr_dB self._rng = rng - self._noise_manifest = utils.read_manifest( - manifest_path=noise_manifest_path) + self._noise_manifest = read_manifest(manifest_path=noise_manifest_path) def transform_audio(self, audio_segment): """Add background noise audio. diff --git a/data_utils/data.py b/data_utils/data.py index 33fcadc7..8bff6826 100644 --- a/data_utils/data.py +++ b/data_utils/data.py @@ -11,7 +11,7 @@ import multiprocessing import numpy as np import paddle.v2 as paddle from threading import local -from data_utils import utils +from data_utils.utility import read_manifest from data_utils.augmentor.augmentation import AugmentationPipeline from data_utils.featurizer.speech_featurizer import SpeechFeaturizer from data_utils.speech import SpeechSegment @@ -159,7 +159,7 @@ class DataGenerator(object): def batch_reader(): # read manifest - manifest = utils.read_manifest( + manifest = read_manifest( manifest_path=manifest_path, max_duration=self._max_duration, min_duration=self._min_duration) diff --git a/data_utils/featurizer/audio_featurizer.py b/data_utils/featurizer/audio_featurizer.py index 39f45301..12f8784a 100644 --- a/data_utils/featurizer/audio_featurizer.py +++ b/data_utils/featurizer/audio_featurizer.py @@ -4,7 +4,7 @@ from __future__ import division from __future__ import print_function import numpy as np -from data_utils import utils +from data_utils.utility import read_manifest from data_utils.audio import AudioSegment from python_speech_features import mfcc from python_speech_features import delta diff --git a/data_utils/normalizer.py b/data_utils/normalizer.py index 1f4aae9a..7c2e05c9 100644 --- a/data_utils/normalizer.py +++ b/data_utils/normalizer.py @@ -5,7 +5,7 @@ from __future__ import print_function import numpy as np import random -import data_utils.utils as utils +from data_utils.utility import read_manifest from data_utils.audio import AudioSegment @@ -75,7 +75,7 @@ class FeatureNormalizer(object): def _compute_mean_std(self, manifest_path, featurize_func, num_samples): """Compute mean and std from randomly sampled instances.""" - manifest = utils.read_manifest(manifest_path) + manifest = read_manifest(manifest_path) sampled_manifest = self._rng.sample(manifest, num_samples) features = [] for instance in sampled_manifest: diff --git a/data_utils/utils.py b/data_utils/utility.py similarity index 100% rename from data_utils/utils.py rename to data_utils/utility.py diff --git a/datasets/run_all.sh b/datasets/run_all.sh deleted file mode 100644 index ef2b721f..00000000 --- a/datasets/run_all.sh +++ /dev/null @@ -1,13 +0,0 @@ -cd librispeech -python librispeech.py -if [ $? -ne 0 ]; then - echo "Prepare LibriSpeech failed. Terminated." - exit 1 -fi -cd - - -cat librispeech/manifest.train* | shuf > manifest.train -cat librispeech/manifest.dev-clean > manifest.dev -cat librispeech/manifest.test-clean > manifest.test - -echo "All done." diff --git a/datasets/run_noise.sh b/datasets/run_noise.sh deleted file mode 100644 index 7b27abde..00000000 --- a/datasets/run_noise.sh +++ /dev/null @@ -1,10 +0,0 @@ -cd noise -python chime3_background.py -if [ $? -ne 0 ]; then - echo "Prepare CHiME3 background noise failed. Terminated." - exit 1 -fi -cd - - -cat noise/manifest.* > manifest.noise -echo "All done." diff --git a/deploy/_init_paths.py b/deploy/_init_paths.py new file mode 100644 index 00000000..ddabb535 --- /dev/null +++ b/deploy/_init_paths.py @@ -0,0 +1,19 @@ +"""Set up paths for DS2""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os.path +import sys + + +def add_path(path): + if path not in sys.path: + sys.path.insert(0, path) + + +this_dir = os.path.dirname(__file__) + +# Add project path to PYTHONPATH +proj_path = os.path.join(this_dir, '..') +add_path(proj_path) diff --git a/demo_client.py b/deploy/demo_client.py similarity index 100% rename from demo_client.py rename to deploy/demo_client.py diff --git a/demo_server.py b/deploy/demo_server.py similarity index 96% rename from demo_server.py rename to deploy/demo_server.py index 7cbee1fd..658b1419 100644 --- a/demo_server.py +++ b/deploy/demo_server.py @@ -9,10 +9,11 @@ import SocketServer import struct import wave import paddle.v2 as paddle +import _init_paths from data_utils.data import DataGenerator -from model import DeepSpeech2Model +from models.model import DeepSpeech2Model from data_utils.utils import read_manifest -from utils import add_arguments, print_arguments +from utils.utility import add_arguments, print_arguments parser = argparse.ArgumentParser(description=__doc__) add_arg = functools.partial(add_arguments, argparser=parser) @@ -36,13 +37,13 @@ add_arg('speech_save_dir', str, 'demo_cache', "Directory to save demo audios.") add_arg('warmup_manifest', str, - 'datasets/manifest.test', + 'data/librispeech/manifest.test-clean', "Filepath of manifest to warm up.") add_arg('mean_std_path', str, - 'mean_std.npz', + 'data/librispeech/mean_std.npz', "Filepath of normalizer's mean & std.") add_arg('vocab_path', str, - 'datasets/vocab/eng_vocab.txt', + 'data/librispeech/eng_vocab.txt', "Filepath of vocabulary.") add_arg('model_path', str, './checkpoints/params.latest.tar.gz', diff --git a/evaluate.py b/evaluate.py index 1cc307da..747e40df 100644 --- a/evaluate.py +++ b/evaluate.py @@ -7,9 +7,9 @@ import argparse import functools import paddle.v2 as paddle from data_utils.data import DataGenerator -from model import DeepSpeech2Model -from error_rate import wer, cer -from utils import add_arguments, print_arguments +from models.model import DeepSpeech2Model +from utils.error_rate import wer, cer +from utils.utility import add_arguments, print_arguments parser = argparse.ArgumentParser(description=__doc__) add_arg = functools.partial(add_arguments, argparser=parser) @@ -30,13 +30,13 @@ add_arg('use_gpu', bool, True, "Use GPU or not.") add_arg('share_rnn_weights',bool, True, "Share input-hidden weights across " "bi-directional RNNs. Not for GRU.") add_arg('test_manifest', str, - 'datasets/manifest.test', + 'data/librispeech/manifest.test-clean', "Filepath of manifest to evaluate.") add_arg('mean_std_path', str, - 'mean_std.npz', + 'data/librispeech/mean_std.npz', "Filepath of normalizer's mean & std.") add_arg('vocab_path', str, - 'datasets/vocab/eng_vocab.txt', + 'data/librispeech/eng_vocab.txt', "Filepath of vocabulary.") add_arg('model_path', str, './checkpoints/params.latest.tar.gz', diff --git a/examples/librispeech/generate.sh b/examples/librispeech/generate.sh new file mode 100644 index 00000000..a34b7bc1 --- /dev/null +++ b/examples/librispeech/generate.sh @@ -0,0 +1,28 @@ +#! /usr/bin/bash + +pushd ../.. + +CUDA_VISIBLE_DEVICES=0 \ +python -u infer.py \ +--num_samples=10 \ +--trainer_count=1 \ +--beam_size=500 \ +--num_proc_bsearch=12 \ +--num_proc_data=12 \ +--num_conv_layers=2 \ +--num_rnn_layers=3 \ +--rnn_layer_size=2048 \ +--alpha=0.36 \ +--beta=0.25 \ +--cutoff_prob=0.99 \ +--use_gru=False \ +--use_gpu=True \ +--share_rnn_weights=True \ +--infer_manifest='data/librispeech/manifest.dev-clean' \ +--mean_std_path='data/librispeech/mean_std.npz' \ +--vocab_path='data/librispeech/eng_vocab.txt' \ +--model_path='checkpoints/params.latest.tar.gz' \ +--lang_model_path='lm/data/common_crawl_00.prune01111.trie.klm' \ +--decoding_method='ctc_beam_search' \ +--error_rate_type='wer' \ +--specgram_type='linear' diff --git a/examples/librispeech/prepare_data.sh b/examples/librispeech/prepare_data.sh new file mode 100644 index 00000000..162a38c4 --- /dev/null +++ b/examples/librispeech/prepare_data.sh @@ -0,0 +1,32 @@ +#! /usr/bin/bash + +pushd ../.. + +# download data, generate manifests +python data/librispeech/librispeech.py \ +--manifest_prefix='data/librispeech/manifest' \ +--full_download='True' \ +--target_dir='~/.cache/paddle/dataset/speech/Libri' + +if [ $? -ne 0 ]; then + echo "Prepare LibriSpeech failed. Terminated." + exit 1 +fi + +#cat data/librispeech/manifest.train* | shuf > data/librispeech/manifest.train + + +# compute mean and stddev for normalizer +python tools/compute_mean_std.py \ +--manifest_path='data/librispeech/manifest.train' \ +--num_samples=2000 \ +--specgram_type='linear' \ +--output_path='data/librispeech/mean_std.npz' + +if [ $? -ne 0 ]; then + echo "Compute mean and stddev failed. Terminated." + exit 1 +fi + + +echo "LibriSpeech Data preparation done." diff --git a/examples/librispeech/run_test.sh b/examples/librispeech/run_test.sh new file mode 100644 index 00000000..5a14cb68 --- /dev/null +++ b/examples/librispeech/run_test.sh @@ -0,0 +1,28 @@ +#! /usr/bin/bash + +pushd ../.. + +CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \ +python -u evaluate.py \ +--batch_size=128 \ +--trainer_count=8 \ +--beam_size=500 \ +--num_proc_bsearch=12 \ +--num_proc_data=12 \ +--num_conv_layers=2 \ +--num_rnn_layers=3 \ +--rnn_layer_size=2048 \ +--alpha=0.36 \ +--beta=0.25 \ +--cutoff_prob=0.99 \ +--use_gru=False \ +--use_gpu=True \ +--share_rnn_weights=True \ +--test_manifest='data/librispeech/manifest.test-clean' \ +--mean_std_path='data/librispeech/mean_std.npz' \ +--vocab_path='data/librispeech/eng_vocab.txt' \ +--model_path='checkpoints/params.latest.tar.gz' \ +--lang_model_path='lm/data/common_crawl_00.prune01111.trie.klm' \ +--decoding_method='ctc_beam_search' \ +--error_rate_type='wer' \ +--specgram_type='linear' diff --git a/examples/librispeech/run_train.sh b/examples/librispeech/run_train.sh new file mode 100644 index 00000000..832838a8 --- /dev/null +++ b/examples/librispeech/run_train.sh @@ -0,0 +1,30 @@ +#! /usr/bin/bash + +pushd ../.. + +CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \ +python -u train.py \ +--batch_size=256 \ +--trainer_count=8 \ +--num_passes=200 \ +--num_proc_data=12 \ +--num_conv_layers=2 \ +--num_rnn_layers=3 \ +--rnn_layer_size=2048 \ +--num_iter_print=100 \ +--learning_rate=5e-4 \ +--max_duration=27.0 \ +--min_duration=0.0 \ +--use_sortagrad=True \ +--use_gru=False \ +--use_gpu=True \ +--is_local=True \ +--share_rnn_weights=True \ +--train_manifest='data/librispeech/manifest.train' \ +--dev_manifest='data/librispeech/manifest.dev' \ +--mean_std_path='data/librispeech/mean_std.npz' \ +--vocab_path='data/librispeech/eng_vocab.txt' \ +--output_model_dir='./checkpoints' \ +--augment_conf_path='conf/augmentation.config' \ +--specgram_type='linear' \ +--shuffle_method='batch_shuffle_clipped' diff --git a/examples/librispeech/run_tune.sh b/examples/librispeech/run_tune.sh new file mode 100644 index 00000000..9d992e88 --- /dev/null +++ b/examples/librispeech/run_tune.sh @@ -0,0 +1,30 @@ +#! /usr/bin/bash + +pushd ../.. + +CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \ +python -u tools/tune.py \ +--num_samples=100 \ +--trainer_count=8 \ +--beam_size=500 \ +--num_proc_bsearch=12 \ +--num_conv_layers=2 \ +--num_rnn_layers=3 \ +--rnn_layer_size=2048 \ +--num_alphas=14 \ +--num_betas=20 \ +--alpha_from=0.1 \ +--alpha_to=0.36 \ +--beta_from=0.05 \ +--beta_to=1.0 \ +--cutoff_prob=0.99 \ +--use_gru=False \ +--use_gpu=True \ +--share_rnn_weights=True \ +--tune_manifest='data/librispeech/manifest.dev-clean' \ +--mean_std_path='data/librispeech/mean_std.npz' \ +--vocab_path='data/librispeech/eng_vocab.txt' \ +--model_path='checkpoints/params.latest.tar.gz' \ +--lang_model_path='lm/data/common_crawl_00.prune01111.trie.klm' \ +--error_rate_type='wer' \ +--specgram_type='linear' diff --git a/infer.py b/infer.py index 3fd835b4..1ce969ae 100644 --- a/infer.py +++ b/infer.py @@ -7,9 +7,9 @@ import argparse import functools import paddle.v2 as paddle from data_utils.data import DataGenerator -from model import DeepSpeech2Model -from error_rate import wer, cer -from utils import add_arguments, print_arguments +from models.model import DeepSpeech2Model +from utils.error_rate import wer, cer +from utils.utility import add_arguments, print_arguments parser = argparse.ArgumentParser(description=__doc__) add_arg = functools.partial(add_arguments, argparser=parser) @@ -29,13 +29,13 @@ add_arg('use_gpu', bool, True, "Use GPU or not.") add_arg('share_rnn_weights',bool, True, "Share input-hidden weights across " "bi-directional RNNs. Not for GRU.") add_arg('infer_manifest', str, - 'datasets/manifest.dev', + 'data/librispeech/manifest.dev-clean', "Filepath of manifest to infer.") add_arg('mean_std_path', str, - 'mean_std.npz', + 'data/librispeech/mean_std.npz', "Filepath of normalizer's mean & std.") add_arg('vocab_path', str, - 'datasets/vocab/eng_vocab.txt', + 'data/librispeech/eng_vocab.txt', "Filepath of vocabulary.") add_arg('lang_model_path', str, 'lm/data/common_crawl_00.prune01111.trie.klm', diff --git a/models/__init__.py b/models/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/model.py b/models/model.py similarity index 98% rename from model.py rename to models/model.py index 06f69290..3e6fc328 100644 --- a/model.py +++ b/models/model.py @@ -7,10 +7,10 @@ import sys import os import time import gzip -from decoder import * -from lm.lm_scorer import LmScorer import paddle.v2 as paddle -from layer import * +from utils.decoder import ctc_greedy_decoder, ctc_beam_search_decoder +from lm.lm_scorer import LmScorer +from models.network import deep_speech_v2_network class DeepSpeech2Model(object): @@ -241,7 +241,7 @@ class DeepSpeech2Model(object): text_data = paddle.layer.data( name="transcript_text", type=paddle.data_type.integer_value_sequence(vocab_size)) - self._log_probs, self._loss = deep_speech2( + self._log_probs, self._loss = deep_speech_v2_network( audio_data=audio_data, text_data=text_data, dict_size=vocab_size, diff --git a/layer.py b/models/network.py similarity index 95% rename from layer.py rename to models/network.py index b7ac3c23..13ba5d2c 100644 --- a/layer.py +++ b/models/network.py @@ -1,4 +1,4 @@ -"""Contains DeepSpeech2 layers.""" +"""Contains DeepSpeech2 layers and networks.""" from __future__ import absolute_import from __future__ import division from __future__ import print_function @@ -205,16 +205,15 @@ def rnn_group(input, size, num_stacks, use_gru, share_rnn_weights): return output -def deep_speech2(audio_data, - text_data, - dict_size, - num_conv_layers=2, - num_rnn_layers=3, - rnn_size=256, - use_gru=False, - share_rnn_weights=True): - """ - The whole DeepSpeech2 model structure. +def deep_speech_v2_network(audio_data, + text_data, + dict_size, + num_conv_layers=2, + num_rnn_layers=3, + rnn_size=256, + use_gru=False, + share_rnn_weights=True): + """The DeepSpeech2 network structure. :param audio_data: Audio spectrogram data layer. :type audio_data: LayerOutput diff --git a/tools/build_vocab.py b/tools/build_vocab.py index ac600302..6fbb9bdf 100644 --- a/tools/build_vocab.py +++ b/tools/build_vocab.py @@ -13,8 +13,8 @@ import json from collections import Counter import os.path import _init_paths -from data_utils import utils -from utils import add_arguments, print_arguments +from data_utils.utility import read_manifest +from utils.utility import add_arguments, print_arguments parser = argparse.ArgumentParser(description=__doc__) add_arg = functools.partial(add_arguments, argparser=parser) diff --git a/tools/compute_mean_std.py b/tools/compute_mean_std.py index 9f7bf06c..5bb6be39 100644 --- a/tools/compute_mean_std.py +++ b/tools/compute_mean_std.py @@ -9,7 +9,7 @@ import _init_paths from data_utils.normalizer import FeatureNormalizer from data_utils.augmentor.augmentation import AugmentationPipeline from data_utils.featurizer.audio_featurizer import AudioFeaturizer -from utils import add_arguments, print_arguments +from utils.utility import add_arguments, print_arguments parser = argparse.ArgumentParser(description=__doc__) add_arg = functools.partial(add_arguments, argparser=parser) diff --git a/tune.py b/tools/tune.py similarity index 93% rename from tune.py rename to tools/tune.py index eab00cfd..7a237910 100644 --- a/tune.py +++ b/tools/tune.py @@ -7,10 +7,11 @@ import numpy as np import argparse import functools import paddle.v2 as paddle +import _init_paths from data_utils.data import DataGenerator -from model import DeepSpeech2Model -from error_rate import wer -from utils import add_arguments, print_arguments +from models.model import DeepSpeech2Model +from utils.error_rate import wer +from utils.utility import add_arguments, print_arguments parser = argparse.ArgumentParser(description=__doc__) add_arg = functools.partial(add_arguments, argparser=parser) @@ -27,20 +28,20 @@ add_arg('num_betas', int, 20, "# of beta candidates for tuning.") add_arg('alpha_from', float, 0.1, "Where alpha starts tuning from.") add_arg('alpha_to', float, 0.36, "Where alpha ends tuning with.") add_arg('beta_from', float, 0.05, "Where beta starts tuning from.") -add_arg('beta_to', float, 0.36, "Where beta ends tuning with.") +add_arg('beta_to', float, 1.0, "Where beta ends tuning with.") add_arg('cutoff_prob', float, 0.99, "Cutoff probability for pruning.") add_arg('use_gru', bool, False, "Use GRUs instead of simple RNNs.") add_arg('use_gpu', bool, True, "Use GPU or not.") add_arg('share_rnn_weights',bool, True, "Share input-hidden weights across " "bi-directional RNNs. Not for GRU.") add_arg('tune_manifest', str, - 'datasets/manifest.test', + 'data/librispeech/manifest.dev', "Filepath of manifest to tune.") add_arg('mean_std_path', str, - 'mean_std.npz', + 'data/librispeech/mean_std.npz', "Filepath of normalizer's mean & std.") add_arg('vocab_path', str, - 'datasets/vocab/eng_vocab.txt', + 'data/librispeech/eng_vocab.txt', "Filepath of vocabulary.") add_arg('lang_model_path', str, 'lm/data/common_crawl_00.prune01111.trie.klm', diff --git a/train.py b/train.py index 7cef7539..4a7a0eda 100644 --- a/train.py +++ b/train.py @@ -6,9 +6,9 @@ from __future__ import print_function import argparse import functools import paddle.v2 as paddle -from model import DeepSpeech2Model +from models.model import DeepSpeech2Model from data_utils.data import DataGenerator -from utils import add_arguments, print_arguments +from utils.utility import add_arguments, print_arguments parser = argparse.ArgumentParser(description=__doc__) add_arg = functools.partial(add_arguments, argparser=parser) @@ -27,21 +27,21 @@ add_arg('max_duration', float, 27.0, "Longest audio duration allowed.") add_arg('min_duration', float, 0.0, "Shortest audio duration allowed.") add_arg('use_sortagrad', bool, True, "Use SortaGrad or not.") add_arg('use_gpu', bool, True, "Use GPU or not.") -add_arg('is_local', bool, True, "Use pserver or not.") add_arg('use_gru', bool, False, "Use GRUs instead of simple RNNs.") +add_arg('is_local', bool, True, "Use pserver or not.") add_arg('share_rnn_weights',bool, True, "Share input-hidden weights across " "bi-directional RNNs. Not for GRU.") add_arg('train_manifest', str, - 'datasets/manifest.train', + 'data/librispeech/manifest.train', "Filepath of train manifest.") add_arg('dev_manifest', str, - 'datasets/manifest.dev', + 'data/librispeech/manifest.dev-clean', "Filepath of validation manifest.") add_arg('mean_std_path', str, - 'mean_std.npz', + 'data/librispeech/mean_std.npz', "Filepath of normalizer's mean & std.") add_arg('vocab_path', str, - 'datasets/vocab/eng_vocab.txt', + 'data/librispeech/eng_vocab.txt', "Filepath of vocabulary.") add_arg('init_model_path', str, None, @@ -101,7 +101,7 @@ def train(): rnn_layer_size=args.rnn_layer_size, use_gru=args.use_gru, pretrained_model_path=args.init_model_path, - share_rnn_weights=args.share_weights) + share_rnn_weights=args.share_rnn_weights) ds2_model.train( train_batch_reader=train_batch_reader, dev_batch_reader=dev_batch_reader, diff --git a/utils/__init__.py b/utils/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/decoder.py b/utils/decoder.py similarity index 100% rename from decoder.py rename to utils/decoder.py diff --git a/error_rate.py b/utils/error_rate.py similarity index 100% rename from error_rate.py rename to utils/error_rate.py diff --git a/utils.py b/utils/utility.py similarity index 100% rename from utils.py rename to utils/utility.py From 5623b09868abe7fc81fb356b9e9f5453772ac7ef Mon Sep 17 00:00:00 2001 From: Xinghai Sun Date: Wed, 6 Sep 2017 14:33:25 +0800 Subject: [PATCH 16/18] Move decoder.py to models and re-arrange unitests. --- README.md | 2 ++ {utils => models}/decoder.py | 0 models/model.py | 2 +- {tests => models/tests}/test_decoders.py | 0 evaluate.py => test.py | 0 {tests => utils/tests}/test_error_rate.py | 0 6 files changed, 3 insertions(+), 1 deletion(-) rename {utils => models}/decoder.py (100%) rename {tests => models/tests}/test_decoders.py (100%) rename evaluate.py => test.py (100%) rename {tests => utils/tests}/test_error_rate.py (100%) diff --git a/README.md b/README.md index 4e8befa5..db07d8c2 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,7 @@ # DeepSpeech2 on PaddlePaddle +>TODO: to be updated, since the directory hierarchy was changed. + ## Installation ``` diff --git a/utils/decoder.py b/models/decoder.py similarity index 100% rename from utils/decoder.py rename to models/decoder.py diff --git a/models/model.py b/models/model.py index 3e6fc328..93c4c41b 100644 --- a/models/model.py +++ b/models/model.py @@ -8,8 +8,8 @@ import os import time import gzip import paddle.v2 as paddle -from utils.decoder import ctc_greedy_decoder, ctc_beam_search_decoder from lm.lm_scorer import LmScorer +from models.decoder import ctc_greedy_decoder, ctc_beam_search_decoder from models.network import deep_speech_v2_network diff --git a/tests/test_decoders.py b/models/tests/test_decoders.py similarity index 100% rename from tests/test_decoders.py rename to models/tests/test_decoders.py diff --git a/evaluate.py b/test.py similarity index 100% rename from evaluate.py rename to test.py diff --git a/tests/test_error_rate.py b/utils/tests/test_error_rate.py similarity index 100% rename from tests/test_error_rate.py rename to utils/tests/test_error_rate.py From b2eb008a71a15166ae33746ea4c0d6029e3ab392 Mon Sep 17 00:00:00 2001 From: Xinghai Sun Date: Wed, 6 Sep 2017 14:40:11 +0800 Subject: [PATCH 17/18] Remove test_setup.py. --- tests/test_setup.py | 23 ----------------------- 1 file changed, 23 deletions(-) delete mode 100644 tests/test_setup.py diff --git a/tests/test_setup.py b/tests/test_setup.py deleted file mode 100644 index 18b9c1a0..00000000 --- a/tests/test_setup.py +++ /dev/null @@ -1,23 +0,0 @@ -"""Test Setup.""" -import unittest -import numpy as np -import os - - -class TestSetup(unittest.TestCase): - def test_soundfile(self): - import soundfile as sf - # floating point data is typically limited to the interval [-1.0, 1.0], - # but smaller/larger values are supported as well - data = np.array([[1.75, -1.75], [1.0, -1.0], [0.5, -0.5], - [0.25, -0.25]]) - file = 'test.wav' - sf.write(file, data, 44100, format='WAV', subtype='FLOAT') - read, fs = sf.read(file) - self.assertTrue(np.all(read == data)) - self.assertEqual(fs, 44100) - os.remove(file) - - -if __name__ == '__main__': - unittest.main() From d776ce9bd71d1878bd51c2a795bd4373dd0119fb Mon Sep 17 00:00:00 2001 From: Xinghai Sun Date: Wed, 6 Sep 2017 16:02:22 +0800 Subject: [PATCH 18/18] Fix import errors in unitests. --- models/tests/test_decoders.py | 14 ++++++++------ utils/tests/test_error_rate.py | 2 +- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/models/tests/test_decoders.py b/models/tests/test_decoders.py index fa43879b..acce46af 100644 --- a/models/tests/test_decoders.py +++ b/models/tests/test_decoders.py @@ -4,7 +4,7 @@ from __future__ import division from __future__ import print_function import unittest -from decoder import * +from models import decoder class TestDecoders(unittest.TestCase): @@ -53,15 +53,17 @@ class TestDecoders(unittest.TestCase): self.beam_search_result = ['acdc', "b'a"] def test_greedy_decoder_1(self): - bst_result = ctc_greedy_decoder(self.probs_seq1, self.vocab_list) + bst_result = decoder.ctc_greedy_decoder(self.probs_seq1, + self.vocab_list) self.assertEqual(bst_result, self.greedy_result[0]) def test_greedy_decoder_2(self): - bst_result = ctc_greedy_decoder(self.probs_seq2, self.vocab_list) + bst_result = decoder.ctc_greedy_decoder(self.probs_seq2, + self.vocab_list) self.assertEqual(bst_result, self.greedy_result[1]) def test_beam_search_decoder_1(self): - beam_result = ctc_beam_search_decoder( + beam_result = decoder.ctc_beam_search_decoder( probs_seq=self.probs_seq1, beam_size=self.beam_size, vocabulary=self.vocab_list, @@ -69,7 +71,7 @@ class TestDecoders(unittest.TestCase): self.assertEqual(beam_result[0][1], self.beam_search_result[0]) def test_beam_search_decoder_2(self): - beam_result = ctc_beam_search_decoder( + beam_result = decoder.ctc_beam_search_decoder( probs_seq=self.probs_seq2, beam_size=self.beam_size, vocabulary=self.vocab_list, @@ -77,7 +79,7 @@ class TestDecoders(unittest.TestCase): self.assertEqual(beam_result[0][1], self.beam_search_result[1]) def test_beam_search_decoder_batch(self): - beam_results = ctc_beam_search_decoder_batch( + beam_results = decoder.ctc_beam_search_decoder_batch( probs_split=[self.probs_seq1, self.probs_seq2], beam_size=self.beam_size, vocabulary=self.vocab_list, diff --git a/utils/tests/test_error_rate.py b/utils/tests/test_error_rate.py index 99e137a9..d6bc7442 100644 --- a/utils/tests/test_error_rate.py +++ b/utils/tests/test_error_rate.py @@ -5,7 +5,7 @@ from __future__ import division from __future__ import print_function import unittest -import error_rate +from utils import error_rate class TestParse(unittest.TestCase):