From 92eacf548bf5ca278a2ad741dd9c901ca6d23a8f Mon Sep 17 00:00:00 2001 From: Xinghai Sun Date: Mon, 31 Jul 2017 21:57:07 +0800 Subject: [PATCH 1/3] Update default config params and result display for evaluator.py and infer.py for DS2. --- evaluate.py | 26 ++++++++++++++++++-------- infer.py | 9 +++++++-- 2 files changed, 25 insertions(+), 10 deletions(-) diff --git a/evaluate.py b/evaluate.py index 19eabf4e..1d758687 100644 --- a/evaluate.py +++ b/evaluate.py @@ -4,6 +4,7 @@ from __future__ import division from __future__ import print_function import distutils.util +import sys import argparse import gzip import paddle.v2 as paddle @@ -12,13 +13,19 @@ from model import deep_speech2 from decoder import * from lm.lm_scorer import LmScorer from error_rate import wer +import utils parser = argparse.ArgumentParser(description=__doc__) parser.add_argument( "--batch_size", - default=100, + default=128, type=int, help="Minibatch size for evaluation. (default: %(default)s)") +parser.add_argument( + "--trainer_count", + default=8, + type=int, + help="Trainer number. (default: %(default)s)") parser.add_argument( "--num_conv_layers", default=2, @@ -58,8 +65,8 @@ parser.add_argument( "--decode_method", default='beam_search', type=str, - help="Method for ctc decoding, best_path or beam_search. (default: %(default)s)" -) + help="Method for ctc decoding, best_path or beam_search. " + "(default: %(default)s)") parser.add_argument( "--language_model_path", default="lm/data/common_crawl_00.prune01111.trie.klm", @@ -67,12 +74,12 @@ parser.add_argument( help="Path for language model. (default: %(default)s)") parser.add_argument( "--alpha", - default=0.26, + default=0.36, type=float, help="Parameter associated with language model. (default: %(default)f)") parser.add_argument( "--beta", - default=0.1, + default=0.25, type=float, help="Parameter associated with word count. 
(default: %(default)f)") parser.add_argument( @@ -191,7 +198,7 @@ def evaluate(): blank_id=len(data_generator.vocab_list), num_processes=args.num_processes_beam_search, ext_scoring_func=ext_scorer, - cutoff_prob=args.cutoff_prob, ) + cutoff_prob=args.cutoff_prob) for i, beam_search_result in enumerate(beam_search_results): wer_sum += wer(target_transcription[i], beam_search_result[0][1]) @@ -199,12 +206,15 @@ def evaluate(): else: raise ValueError("Decoding method [%s] is not supported." % decode_method) + print("WER (%d/?) = %f" % (wer_counter, wer_sum / wer_counter)) - print("Final WER = %f" % (wer_sum / wer_counter)) + print("Final WER (%d/%d) = %f" % (wer_counter, wer_counter, + wer_sum / wer_counter)) def main(): - paddle.init(use_gpu=args.use_gpu, trainer_count=1) + utils.print_arguments(args) + paddle.init(use_gpu=args.use_gpu, trainer_count=args.trainer_count) evaluate() diff --git a/infer.py b/infer.py index 81752630..ad3fdc4d 100644 --- a/infer.py +++ b/infer.py @@ -57,6 +57,11 @@ parser.add_argument( type=str, help="Feature type of audio data: 'linear' (power spectrum)" " or 'mfcc'. (default: %(default)s)") +parser.add_argument( + "--trainer_count", + default=8, + type=int, + help="Trainer number. (default: %(default)s)") parser.add_argument( "--mean_std_filepath", default='mean_std.npz', @@ -208,7 +213,7 @@ def infer(): wer_cur = wer(target_transcription[i], beam_search_result[0][1]) wer_sum += wer_cur wer_counter += 1 - print("cur wer = %f , average wer = %f" % + print("Current WER = %f , Average WER = %f" % (wer_cur, wer_sum / wer_counter)) else: raise ValueError("Decoding method [%s] is not supported." 
% @@ -217,7 +222,7 @@ def infer(): def main(): utils.print_arguments(args) - paddle.init(use_gpu=args.use_gpu, trainer_count=1) + paddle.init(use_gpu=args.use_gpu, trainer_count=args.trainer_count) infer() From 8122dd9c2999ac451e5a02e22f67d1ba09bfb51c Mon Sep 17 00:00:00 2001 From: Xinghai Sun Date: Tue, 1 Aug 2017 16:21:46 +0800 Subject: [PATCH 2/3] Simplify train.py, evaluate.py, infer.py and tune.py by adding DeepSpeech2Model class. --- evaluate.py | 107 ++++++--------------- infer.py | 106 +++++---------------- layer.py | 155 ++++++++++++++++++++++++++++++ model.py | 265 +++++++++++++++++++++++++++------------------------- train.py | 121 +++++++----------------- tune.py | 102 ++++++++------------ 6 files changed, 415 insertions(+), 441 deletions(-) create mode 100644 layer.py diff --git a/evaluate.py b/evaluate.py index 1d758687..fb7211fc 100644 --- a/evaluate.py +++ b/evaluate.py @@ -4,14 +4,11 @@ from __future__ import division from __future__ import print_function import distutils.util -import sys import argparse -import gzip +import multiprocessing import paddle.v2 as paddle from data_utils.data import DataGenerator -from model import deep_speech2 -from decoder import * -from lm.lm_scorer import LmScorer +from model import DeepSpeech2Model from error_rate import wer import utils @@ -119,37 +116,12 @@ args = parser.parse_args() def evaluate(): """Evaluate on whole test data for DeepSpeech2.""" - # initialize data generator data_generator = DataGenerator( vocab_filepath=args.vocab_filepath, mean_std_filepath=args.mean_std_filepath, augmentation_config='{}', specgram_type=args.specgram_type, num_threads=args.num_threads_data) - - # create network config - # paddle.data_type.dense_array is used for variable batch input. - # The size 161 * 161 is only an placeholder value and the real shape - # of input batch data will be induced during training. 
- audio_data = paddle.layer.data( - name="audio_spectrogram", type=paddle.data_type.dense_array(161 * 161)) - text_data = paddle.layer.data( - name="transcript_text", - type=paddle.data_type.integer_value_sequence(data_generator.vocab_size)) - output_probs = deep_speech2( - audio_data=audio_data, - text_data=text_data, - dict_size=data_generator.vocab_size, - num_conv_layers=args.num_conv_layers, - num_rnn_layers=args.num_rnn_layers, - rnn_size=args.rnn_layer_size, - is_inference=True) - - # load parameters - parameters = paddle.parameters.Parameters.from_tar( - gzip.open(args.model_filepath)) - - # prepare infer data batch_reader = data_generator.batch_reader_creator( manifest_path=args.decode_manifest_path, batch_size=args.batch_size, @@ -157,59 +129,34 @@ def evaluate(): sortagrad=False, shuffle_method=None) - # define inferer - inferer = paddle.inference.Inference( - output_layer=output_probs, parameters=parameters) - - # initialize external scorer for beam search decoding - if args.decode_method == 'beam_search': - ext_scorer = LmScorer(args.alpha, args.beta, args.language_model_path) + ds2_model = DeepSpeech2Model( + vocab_size=data_generator.vocab_size, + num_conv_layers=args.num_conv_layers, + num_rnn_layers=args.num_rnn_layers, + rnn_layer_size=args.rnn_layer_size, + pretrained_model_path=args.model_filepath) - wer_counter, wer_sum = 0, 0.0 + wer_sum, num_ins = 0.0, 0 for infer_data in batch_reader(): - # run inference - infer_results = inferer.infer(input=infer_data) - num_steps = len(infer_results) // len(infer_data) - probs_split = [ - infer_results[i * num_steps:(i + 1) * num_steps] - for i in xrange(0, len(infer_data)) + result_transcripts = ds2_model.infer_batch( + infer_data=infer_data, + decode_method=args.decode_method, + beam_alpha=args.alpha, + beam_beta=args.beta, + beam_size=args.beam_size, + cutoff_prob=args.cutoff_prob, + vocab_list=data_generator.vocab_list, + language_model_path=args.language_model_path, + 
num_processes=args.num_processes_beam_search) + target_transcripts = [ + ''.join([data_generator.vocab_list[token] for token in transcript]) + for _, transcript in infer_data ] - # target transcription - target_transcription = [ - ''.join([ - data_generator.vocab_list[index] for index in infer_data[i][1] - ]) for i, probs in enumerate(probs_split) - ] - # decode and print - # best path decode - if args.decode_method == "best_path": - for i, probs in enumerate(probs_split): - output_transcription = ctc_best_path_decoder( - probs_seq=probs, vocabulary=data_generator.vocab_list) - wer_sum += wer(target_transcription[i], output_transcription) - wer_counter += 1 - # beam search decode - elif args.decode_method == "beam_search": - # beam search using multiple processes - beam_search_results = ctc_beam_search_decoder_batch( - probs_split=probs_split, - vocabulary=data_generator.vocab_list, - beam_size=args.beam_size, - blank_id=len(data_generator.vocab_list), - num_processes=args.num_processes_beam_search, - ext_scoring_func=ext_scorer, - cutoff_prob=args.cutoff_prob) - for i, beam_search_result in enumerate(beam_search_results): - wer_sum += wer(target_transcription[i], - beam_search_result[0][1]) - wer_counter += 1 - else: - raise ValueError("Decoding method [%s] is not supported." % - decode_method) - print("WER (%d/?) = %f" % (wer_counter, wer_sum / wer_counter)) - - print("Final WER (%d/%d) = %f" % (wer_counter, wer_counter, - wer_sum / wer_counter)) + for target, result in zip(target_transcripts, result_transcripts): + wer_sum += wer(target, result) + num_ins += 1 + print("WER (%d/?) 
= %f" % (num_ins, wer_sum / num_ins)) + print("Final WER (%d/%d) = %f" % (num_ins, num_ins, wer_sum / num_ins)) def main(): diff --git a/infer.py b/infer.py index ad3fdc4d..ec65cc74 100644 --- a/infer.py +++ b/infer.py @@ -4,14 +4,11 @@ from __future__ import division from __future__ import print_function import argparse -import gzip import distutils.util import multiprocessing import paddle.v2 as paddle from data_utils.data import DataGenerator -from model import deep_speech2 -from decoder import * -from lm.lm_scorer import LmScorer +from model import DeepSpeech2Model from error_rate import wer import utils @@ -124,37 +121,12 @@ args = parser.parse_args() def infer(): """Inference for DeepSpeech2.""" - # initialize data generator data_generator = DataGenerator( vocab_filepath=args.vocab_filepath, mean_std_filepath=args.mean_std_filepath, augmentation_config='{}', specgram_type=args.specgram_type, num_threads=args.num_threads_data) - - # create network config - # paddle.data_type.dense_array is used for variable batch input. - # The size 161 * 161 is only an placeholder value and the real shape - # of input batch data will be induced during training. 
- audio_data = paddle.layer.data( - name="audio_spectrogram", type=paddle.data_type.dense_array(161 * 161)) - text_data = paddle.layer.data( - name="transcript_text", - type=paddle.data_type.integer_value_sequence(data_generator.vocab_size)) - output_probs = deep_speech2( - audio_data=audio_data, - text_data=text_data, - dict_size=data_generator.vocab_size, - num_conv_layers=args.num_conv_layers, - num_rnn_layers=args.num_rnn_layers, - rnn_size=args.rnn_layer_size, - is_inference=True) - - # load parameters - parameters = paddle.parameters.Parameters.from_tar( - gzip.open(args.model_filepath)) - - # prepare infer data batch_reader = data_generator.batch_reader_creator( manifest_path=args.decode_manifest_path, batch_size=args.num_samples, @@ -163,61 +135,31 @@ def infer(): shuffle_method=None) infer_data = batch_reader().next() - # run inference - infer_results = paddle.infer( - output_layer=output_probs, parameters=parameters, input=infer_data) - num_steps = len(infer_results) // len(infer_data) - probs_split = [ - infer_results[i * num_steps:(i + 1) * num_steps] - for i in xrange(len(infer_data)) - ] + ds2_model = DeepSpeech2Model( + vocab_size=data_generator.vocab_size, + num_conv_layers=args.num_conv_layers, + num_rnn_layers=args.num_rnn_layers, + rnn_layer_size=args.rnn_layer_size, + pretrained_model_path=args.model_filepath) + result_transcripts = ds2_model.infer_batch( + infer_data=infer_data, + decode_method=args.decode_method, + beam_alpha=args.alpha, + beam_beta=args.beta, + beam_size=args.beam_size, + cutoff_prob=args.cutoff_prob, + vocab_list=data_generator.vocab_list, + language_model_path=args.language_model_path, + num_processes=args.num_processes_beam_search) - # targe transcription - target_transcription = [ - ''.join( - [data_generator.vocab_list[index] for index in infer_data[i][1]]) - for i, probs in enumerate(probs_split) + target_transcripts = [ + ''.join([data_generator.vocab_list[token] for token in transcript]) + for _, transcript in 
infer_data ] - - ## decode and print - # best path decode - wer_sum, wer_counter = 0, 0 - if args.decode_method == "best_path": - for i, probs in enumerate(probs_split): - best_path_transcription = ctc_best_path_decoder( - probs_seq=probs, vocabulary=data_generator.vocab_list) - print("\nTarget Transcription: %s\nOutput Transcription: %s" % - (target_transcription[i], best_path_transcription)) - wer_cur = wer(target_transcription[i], best_path_transcription) - wer_sum += wer_cur - wer_counter += 1 - print("cur wer = %f, average wer = %f" % - (wer_cur, wer_sum / wer_counter)) - # beam search decode - elif args.decode_method == "beam_search": - ext_scorer = LmScorer(args.alpha, args.beta, args.language_model_path) - beam_search_batch_results = ctc_beam_search_decoder_batch( - probs_split=probs_split, - vocabulary=data_generator.vocab_list, - beam_size=args.beam_size, - blank_id=len(data_generator.vocab_list), - num_processes=args.num_processes_beam_search, - cutoff_prob=args.cutoff_prob, - ext_scoring_func=ext_scorer, ) - for i, beam_search_result in enumerate(beam_search_batch_results): - print("\nTarget Transcription:\t%s" % target_transcription[i]) - for index in xrange(args.num_results_per_sample): - result = beam_search_result[index] - #output: index, log prob, beam result - print("Beam %d: %f \t%s" % (index, result[0], result[1])) - wer_cur = wer(target_transcription[i], beam_search_result[0][1]) - wer_sum += wer_cur - wer_counter += 1 - print("Current WER = %f , Average WER = %f" % - (wer_cur, wer_sum / wer_counter)) - else: - raise ValueError("Decoding method [%s] is not supported." 
% - decode_method) + for target, result in zip(target_transcripts, result_transcripts): + print("\nTarget Transcription: %s\nOutput Transcription: %s" % + (target, result)) + print("Current wer = %f" % wer(target, result)) def main(): diff --git a/layer.py b/layer.py new file mode 100644 index 00000000..7b027338 --- /dev/null +++ b/layer.py @@ -0,0 +1,155 @@ +"""Contains DeepSpeech2 layers.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle.v2 as paddle + +DISABLE_CUDNN_BATCH_NORM = True + + +def conv_bn_layer(input, filter_size, num_channels_in, num_channels_out, stride, + padding, act): + """ + Convolution layer with batch normalization. + """ + conv_layer = paddle.layer.img_conv( + input=input, + filter_size=filter_size, + num_channels=num_channels_in, + num_filters=num_channels_out, + stride=stride, + padding=padding, + act=paddle.activation.Linear(), + bias_attr=False) + if DISABLE_CUDNN_BATCH_NORM: + # temporary patch, needs to be removed. + return paddle.layer.batch_norm( + input=conv_layer, act=act, batch_norm_type="batch_norm") + else: + return paddle.layer.batch_norm(input=conv_layer, act=act) + + +def bidirectional_simple_rnn_bn_layer(name, input, size, act): + """ + Bidirectional simple rnn layer with sequence-wise batch normalization. + The batch normalization is only performed on input-state weights. + """ + # input-hidden weights shared across bi-directional rnn. + input_proj = paddle.layer.fc( + input=input, size=size, act=paddle.activation.Linear(), bias_attr=False) + # batch norm is only performed on input-state projection + if DISABLE_CUDNN_BATCH_NORM: + # temporary patch, needs to be removed.
+ input_proj_bn = paddle.layer.batch_norm( + input=input_proj, + act=paddle.activation.Linear(), + batch_norm_type="batch_norm") + else: + input_proj_bn = paddle.layer.batch_norm( + input=input_proj, act=paddle.activation.Linear()) + # forward and backward in time + forward_simple_rnn = paddle.layer.recurrent( + input=input_proj_bn, act=act, reverse=False) + backward_simple_rnn = paddle.layer.recurrent( + input=input_proj_bn, act=act, reverse=True) + return paddle.layer.concat(input=[forward_simple_rnn, backward_simple_rnn]) + + +def conv_group(input, num_stacks): + """ + Convolution group with several stacking convolution layers. + """ + conv = conv_bn_layer( + input=input, + filter_size=(11, 41), + num_channels_in=1, + num_channels_out=32, + stride=(3, 2), + padding=(5, 20), + act=paddle.activation.BRelu()) + for i in xrange(num_stacks - 1): + conv = conv_bn_layer( + input=conv, + filter_size=(11, 21), + num_channels_in=32, + num_channels_out=32, + stride=(1, 2), + padding=(5, 10), + act=paddle.activation.BRelu()) + output_num_channels = 32 + output_height = 160 // pow(2, num_stacks) + 1 + return conv, output_num_channels, output_height + + +def rnn_group(input, size, num_stacks): + """ + RNN group with several stacking RNN layers. + """ + output = input + for i in xrange(num_stacks): + output = bidirectional_simple_rnn_bn_layer( + name=str(i), input=output, size=size, act=paddle.activation.BRelu()) + return output + + +def deep_speech2(audio_data, + text_data, + dict_size, + num_conv_layers=2, + num_rnn_layers=3, + rnn_size=256): + """ + The whole DeepSpeech2 model structure (a simplified version). + + :param audio_data: Audio spectrogram data layer. + :type audio_data: LayerOutput + :param text_data: Transcription text data layer. + :type text_data: LayerOutput + :param dict_size: Dictionary size for tokenized transcription. + :type dict_size: int + :param num_conv_layers: Number of stacking convolution layers. 
+ :type num_conv_layers: int + :param num_rnn_layers: Number of stacking RNN layers. + :type num_rnn_layers: int + :param rnn_size: RNN layer size (number of RNN cells). + :type rnn_size: int + :param is_inference: False in the training mode, and True in the + inference mode. + :type is_inference: bool + :return: If is_inference set False, return a ctc cost layer; + if is_inference set True, return a sequence layer of output + probability distribution. + :rtype: tuple of LayerOutput + """ + # convolution group + conv_group_output, conv_group_num_channels, conv_group_height = conv_group( + input=audio_data, num_stacks=num_conv_layers) + # convert data from convolution feature map to sequence of vectors + conv2seq = paddle.layer.block_expand( + input=conv_group_output, + num_channels=conv_group_num_channels, + stride_x=1, + stride_y=1, + block_x=1, + block_y=conv_group_height) + # rnn group + rnn_group_output = rnn_group( + input=conv2seq, size=rnn_size, num_stacks=num_rnn_layers) + fc = paddle.layer.fc( + input=rnn_group_output, + size=dict_size + 1, + act=paddle.activation.Linear(), + bias_attr=True) + # probability distribution with softmax + log_probs = paddle.layer.mixed( + input=paddle.layer.identity_projection(input=fc), + act=paddle.activation.Softmax()) + # ctc cost + ctc_loss = paddle.layer.warp_ctc( + input=fc, + label=text_data, + size=dict_size + 1, + blank=dict_size, + norm_by_times=True) + return log_probs, ctc_loss diff --git a/model.py b/model.py index cb0b4ecb..d1efabb7 100644 --- a/model.py +++ b/model.py @@ -3,141 +3,150 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import sys +import os +import time +import gzip +from decoder import * +from lm.lm_scorer import LmScorer import paddle.v2 as paddle +from layer import * -def conv_bn_layer(input, filter_size, num_channels_in, num_channels_out, stride, - padding, act): - """ - Convolution layer with batch normalization.
- """ - conv_layer = paddle.layer.img_conv( - input=input, - filter_size=filter_size, - num_channels=num_channels_in, - num_filters=num_channels_out, - stride=stride, - padding=padding, - act=paddle.activation.Linear(), - bias_attr=False) - return paddle.layer.batch_norm(input=conv_layer, act=act) +class DeepSpeech2Model(object): + def __init__(self, vocab_size, num_conv_layers, num_rnn_layers, + rnn_layer_size, pretrained_model_path): + self._create_network(vocab_size, num_conv_layers, num_rnn_layers, + rnn_layer_size) + self._create_parameters(pretrained_model_path) + self._inferer = None + self._ext_scorer = None + def train(self, + train_batch_reader, + dev_batch_reader, + feeding_dict, + learning_rate, + gradient_clipping, + num_passes, + num_iterations_print=100, + output_model_dir='checkpoints'): + # prepare optimizer and trainer + optimizer = paddle.optimizer.Adam( + learning_rate=learning_rate, + gradient_clipping_threshold=gradient_clipping) + trainer = paddle.trainer.SGD( + cost=self._loss, + parameters=self._parameters, + update_equation=optimizer) -def bidirectional_simple_rnn_bn_layer(name, input, size, act): - """ - Bidirectonal simple rnn layer with sequence-wise batch normalization. - The batch normalization is only performed on input-state weights. - """ - # input-hidden weights shared across bi-direcitonal rnn. 
- input_proj = paddle.layer.fc( - input=input, size=size, act=paddle.activation.Linear(), bias_attr=False) - # batch norm is only performed on input-state projection - input_proj_bn = paddle.layer.batch_norm( - input=input_proj, act=paddle.activation.Linear()) - # forward and backward in time - forward_simple_rnn = paddle.layer.recurrent( - input=input_proj_bn, act=act, reverse=False) - backward_simple_rnn = paddle.layer.recurrent( - input=input_proj_bn, act=act, reverse=True) - return paddle.layer.concat(input=[forward_simple_rnn, backward_simple_rnn]) + # create event handler + def event_handler(event): + global start_time, cost_sum, cost_counter + if isinstance(event, paddle.event.EndIteration): + cost_sum += event.cost + cost_counter += 1 + if (event.batch_id + 1) % num_iterations_print == 0: + output_model_path = os.path.join(output_model_dir, + "params.latest.tar.gz") + with gzip.open(output_model_path, 'w') as f: + self._parameters.to_tar(f) + print("\nPass: %d, Batch: %d, TrainCost: %f" % + (event.pass_id, event.batch_id + 1, + cost_sum / cost_counter)) + cost_sum, cost_counter = 0.0, 0 + else: + sys.stdout.write('.') + sys.stdout.flush() + if isinstance(event, paddle.event.BeginPass): + start_time = time.time() + cost_sum, cost_counter = 0.0, 0 + if isinstance(event, paddle.event.EndPass): + result = trainer.test( + reader=dev_batch_reader, feeding=feeding_dict) + output_model_path = os.path.join( + output_model_dir, "params.pass-%d.tar.gz" % event.pass_id) + with gzip.open(output_model_path, 'w') as f: + self._parameters.to_tar(f) + print("\n------- Time: %d sec, Pass: %d, ValidationCost: %s" % + (time.time() - start_time, event.pass_id, result.cost)) + # run train + trainer.train( + reader=train_batch_reader, + event_handler=event_handler, + num_passes=num_passes, + feeding=feeding_dict) -def conv_group(input, num_stacks): - """ - Convolution group with several stacking convolution layers. 
- """ - conv = conv_bn_layer( - input=input, - filter_size=(11, 41), - num_channels_in=1, - num_channels_out=32, - stride=(3, 2), - padding=(5, 20), - act=paddle.activation.BRelu()) - for i in xrange(num_stacks - 1): - conv = conv_bn_layer( - input=conv, - filter_size=(11, 21), - num_channels_in=32, - num_channels_out=32, - stride=(1, 2), - padding=(5, 10), - act=paddle.activation.BRelu()) - output_num_channels = 32 - output_height = 160 // pow(2, num_stacks) + 1 - return conv, output_num_channels, output_height + def infer_batch(self, infer_data, decode_method, beam_alpha, beam_beta, + beam_size, cutoff_prob, vocab_list, language_model_path, + num_processes): + # define inferer + if self._inferer == None: + self._inferer = paddle.inference.Inference( + output_layer=self._log_probs, parameters=self._parameters) + # run inference + infer_results = self._inferer.infer(input=infer_data) + num_steps = len(infer_results) // len(infer_data) + probs_split = [ + infer_results[i * num_steps:(i + 1) * num_steps] + for i in xrange(0, len(infer_data)) + ] + # run decoder + results = [] + if decode_method == "best_path": + # best path decode + for i, probs in enumerate(probs_split): + output_transcription = ctc_best_path_decoder( + probs_seq=probs, vocabulary=data_generator.vocab_list) + results.append(output_transcription) + elif decode_method == "beam_search": + # initialize external scorer + if self._ext_scorer == None: + self._ext_scorer = LmScorer(beam_alpha, beam_beta, + language_model_path) + self._loaded_lm_path = language_model_path + else: + self._ext_scorer.reset_params(beam_alpha, beam_beta) + assert self._loaded_lm_path == language_model_path + # beam search decode + beam_search_results = ctc_beam_search_decoder_batch( + probs_split=probs_split, + vocabulary=vocab_list, + beam_size=beam_size, + blank_id=len(vocab_list), + num_processes=num_processes, + ext_scoring_func=self._ext_scorer, + cutoff_prob=cutoff_prob) + results = [result[0][1] for result in 
beam_search_results] + else: + raise ValueError("Decoding method [%s] is not supported." % + decode_method) + return results -def rnn_group(input, size, num_stacks): - """ - RNN group with several stacking RNN layers. - """ - output = input - for i in xrange(num_stacks): - output = bidirectional_simple_rnn_bn_layer( - name=str(i), input=output, size=size, act=paddle.activation.BRelu()) - return output + def _create_parameters(self, model_path=None): + if model_path is None: + self._parameters = paddle.parameters.create(self._loss) + else: + self._parameters = paddle.parameters.Parameters.from_tar( + gzip.open(model_path)) - -def deep_speech2(audio_data, - text_data, - dict_size, - num_conv_layers=2, - num_rnn_layers=3, - rnn_size=256, - is_inference=False): - """ - The whole DeepSpeech2 model structure (a simplified version). - - :param audio_data: Audio spectrogram data layer. - :type audio_data: LayerOutput - :param text_data: Transcription text data layer. - :type text_data: LayerOutput - :param dict_size: Dictionary size for tokenized transcription. - :type dict_size: int - :param num_conv_layers: Number of stacking convolution layers. - :type num_conv_layers: int - :param num_rnn_layers: Number of stacking RNN layers. - :type num_rnn_layers: int - :param rnn_size: RNN layer size (number of RNN cells). - :type rnn_size: int - :param is_inference: False in the training mode, and True in the - inferene mode. - :type is_inference: bool - :return: If is_inference set False, return a ctc cost layer; - if is_inference set True, return a sequence layer of output - probability distribution. 
- :rtype: tuple of LayerOutput - """ - # convolution group - conv_group_output, conv_group_num_channels, conv_group_height = conv_group( - input=audio_data, num_stacks=num_conv_layers) - # convert data form convolution feature map to sequence of vectors - conv2seq = paddle.layer.block_expand( - input=conv_group_output, - num_channels=conv_group_num_channels, - stride_x=1, - stride_y=1, - block_x=1, - block_y=conv_group_height) - # rnn group - rnn_group_output = rnn_group( - input=conv2seq, size=rnn_size, num_stacks=num_rnn_layers) - fc = paddle.layer.fc( - input=rnn_group_output, - size=dict_size + 1, - act=paddle.activation.Linear(), - bias_attr=True) - if is_inference: - # probability distribution with softmax - return paddle.layer.mixed( - input=paddle.layer.identity_projection(input=fc), - act=paddle.activation.Softmax()) - else: - # ctc cost - return paddle.layer.warp_ctc( - input=fc, - label=text_data, - size=dict_size + 1, - blank=dict_size, - norm_by_times=True) + def _create_network(self, vocab_size, num_conv_layers, num_rnn_layers, + rnn_layer_size): + # paddle.data_type.dense_array is used for variable batch input. + # The size 161 * 161 is only an placeholder value and the real shape + # of input batch data will be induced during training. 
+ audio_data = paddle.layer.data( + name="audio_spectrogram", + type=paddle.data_type.dense_array(161 * 161)) + text_data = paddle.layer.data( + name="transcript_text", + type=paddle.data_type.integer_value_sequence(vocab_size)) + self._log_probs, self._loss = deep_speech2( + audio_data=audio_data, + text_data=text_data, + dict_size=vocab_size, + num_conv_layers=num_conv_layers, + num_rnn_layers=num_rnn_layers, + rnn_size=rnn_layer_size) diff --git a/train.py b/train.py index 6481074c..45f7a6d9 100644 --- a/train.py +++ b/train.py @@ -3,15 +3,11 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -import sys -import os import argparse -import gzip -import time import distutils.util import multiprocessing import paddle.v2 as paddle -from model import deep_speech2 +from model import DeepSpeech2Model from data_utils.data import DataGenerator import utils @@ -23,6 +19,12 @@ parser.add_argument( default=200, type=int, help="Training pass number. (default: %(default)s)") +parser.add_argument( + "--num_iterations_print", + default=100, + type=int, + help="Number of iterations for every train cost printing. " + "(default: %(default)s)") parser.add_argument( "--num_conv_layers", default=2, @@ -127,100 +129,47 @@ args = parser.parse_args() def train(): """DeepSpeech2 training.""" - - # initialize data generator - def data_generator(): - return DataGenerator( - vocab_filepath=args.vocab_filepath, - mean_std_filepath=args.mean_std_filepath, - augmentation_config=args.augmentation_config, - max_duration=args.max_duration, - min_duration=args.min_duration, - specgram_type=args.specgram_type, - num_threads=args.num_threads_data) - - train_generator = data_generator() - test_generator = data_generator() - - # create network config - # paddle.data_type.dense_array is used for variable batch input. - # The size 161 * 161 is only an placeholder value and the real shape - # of input batch data will be induced during training. 
- audio_data = paddle.layer.data( - name="audio_spectrogram", type=paddle.data_type.dense_array(161 * 161)) - text_data = paddle.layer.data( - name="transcript_text", - type=paddle.data_type.integer_value_sequence( - train_generator.vocab_size)) - cost = deep_speech2( - audio_data=audio_data, - text_data=text_data, - dict_size=train_generator.vocab_size, - num_conv_layers=args.num_conv_layers, - num_rnn_layers=args.num_rnn_layers, - rnn_size=args.rnn_layer_size, - is_inference=False) - - # create/load parameters and optimizer - if args.init_model_path is None: - parameters = paddle.parameters.create(cost) - else: - if not os.path.isfile(args.init_model_path): - raise IOError("Invalid model!") - parameters = paddle.parameters.Parameters.from_tar( - gzip.open(args.init_model_path)) - optimizer = paddle.optimizer.Adam( - learning_rate=args.adam_learning_rate, gradient_clipping_threshold=400) - trainer = paddle.trainer.SGD( - cost=cost, parameters=parameters, update_equation=optimizer) - - # prepare data reader + train_generator = DataGenerator( + vocab_filepath=args.vocab_filepath, + mean_std_filepath=args.mean_std_filepath, + augmentation_config=args.augmentation_config, + max_duration=args.max_duration, + min_duration=args.min_duration, + specgram_type=args.specgram_type, + num_threads=args.num_threads_data) + dev_generator = DataGenerator( + vocab_filepath=args.vocab_filepath, + mean_std_filepath=args.mean_std_filepath, + augmentation_config="{}", + specgram_type=args.specgram_type, + num_threads=args.num_threads_data) train_batch_reader = train_generator.batch_reader_creator( manifest_path=args.train_manifest_path, batch_size=args.batch_size, min_batch_size=args.trainer_count, sortagrad=args.use_sortagrad if args.init_model_path is None else False, shuffle_method=args.shuffle_method) - test_batch_reader = test_generator.batch_reader_creator( + dev_batch_reader = dev_generator.batch_reader_creator( manifest_path=args.dev_manifest_path, batch_size=args.batch_size, 
min_batch_size=1, # must be 1, but will have errors. sortagrad=False, shuffle_method=None) - # create event handler - def event_handler(event): - global start_time, cost_sum, cost_counter - if isinstance(event, paddle.event.EndIteration): - cost_sum += event.cost - cost_counter += 1 - if (event.batch_id + 1) % 100 == 0: - print("\nPass: %d, Batch: %d, TrainCost: %f" % ( - event.pass_id, event.batch_id + 1, cost_sum / cost_counter)) - cost_sum, cost_counter = 0.0, 0 - with gzip.open("checkpoints/params.latest.tar.gz", 'w') as f: - parameters.to_tar(f) - else: - sys.stdout.write('.') - sys.stdout.flush() - if isinstance(event, paddle.event.BeginPass): - start_time = time.time() - cost_sum, cost_counter = 0.0, 0 - if isinstance(event, paddle.event.EndPass): - result = trainer.test( - reader=test_batch_reader, feeding=test_generator.feeding) - print("\n------- Time: %d sec, Pass: %d, ValidationCost: %s" % - (time.time() - start_time, event.pass_id, result.cost)) - with gzip.open("checkpoints/params.pass-%d.tar.gz" % event.pass_id, - 'w') as f: - parameters.to_tar(f) - - # run train - trainer.train( - reader=train_batch_reader, - event_handler=event_handler, + ds2_model = DeepSpeech2Model( + vocab_size=train_generator.vocab_size, + num_conv_layers=args.num_conv_layers, + num_rnn_layers=args.num_rnn_layers, + rnn_layer_size=args.rnn_layer_size, + pretrained_model_path=args.init_model_path) + ds2_model.train( + train_batch_reader=train_batch_reader, + dev_batch_reader=dev_batch_reader, + feeding_dict=train_generator.feeding, + learning_rate=args.adam_learning_rate, + gradient_clipping=400, num_passes=args.num_passes, - feeding=train_generator.feeding) + num_iterations_print=args.num_iterations_print) def main(): diff --git a/tune.py b/tune.py index 2fcca486..f414622e 100644 --- a/tune.py +++ b/tune.py @@ -3,14 +3,13 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import numpy as np import distutils.util 
import argparse -import gzip +import multiprocessing import paddle.v2 as paddle from data_utils.data import DataGenerator -from model import deep_speech2 -from decoder import * -from lm.lm_scorer import LmScorer +from model import DeepSpeech2Model from error_rate import wer import utils @@ -40,6 +39,11 @@ parser.add_argument( default=True, type=distutils.util.strtobool, help="Use gpu or not. (default: %(default)s)") +parser.add_argument( + "--trainer_count", + default=8, + type=int, + help="Trainer number. (default: %(default)s)") parser.add_argument( "--num_threads_data", default=multiprocessing.cpu_count(), @@ -62,10 +66,10 @@ parser.add_argument( type=str, help="Manifest path for normalizer. (default: %(default)s)") parser.add_argument( - "--decode_manifest_path", + "--tune_manifest_path", default='datasets/manifest.test', type=str, - help="Manifest path for decoding. (default: %(default)s)") + help="Manifest path for tuning. (default: %(default)s)") parser.add_argument( "--model_filepath", default='checkpoints/params.latest.tar.gz', @@ -127,96 +131,64 @@ args = parser.parse_args() def tune(): """Tune parameters alpha and beta on one minibatch.""" - if not args.num_alphas >= 0: raise ValueError("num_alphas must be non-negative!") - if not args.num_betas >= 0: raise ValueError("num_betas must be non-negative!") - # initialize data generator data_generator = DataGenerator( vocab_filepath=args.vocab_filepath, mean_std_filepath=args.mean_std_filepath, augmentation_config='{}', specgram_type=args.specgram_type, num_threads=args.num_threads_data) - - # create network config - # paddle.data_type.dense_array is used for variable batch input. - # The size 161 * 161 is only an placeholder value and the real shape - # of input batch data will be induced during training. 
- audio_data = paddle.layer.data( - name="audio_spectrogram", type=paddle.data_type.dense_array(161 * 161)) - text_data = paddle.layer.data( - name="transcript_text", - type=paddle.data_type.integer_value_sequence(data_generator.vocab_size)) - output_probs = deep_speech2( - audio_data=audio_data, - text_data=text_data, - dict_size=data_generator.vocab_size, - num_conv_layers=args.num_conv_layers, - num_rnn_layers=args.num_rnn_layers, - rnn_size=args.rnn_layer_size, - is_inference=True) - - # load parameters - parameters = paddle.parameters.Parameters.from_tar( - gzip.open(args.model_filepath)) - - # prepare infer data batch_reader = data_generator.batch_reader_creator( - manifest_path=args.decode_manifest_path, + manifest_path=args.tune_manifest_path, batch_size=args.num_samples, sortagrad=False, shuffle_method=None) - # get one batch data for tuning - infer_data = batch_reader().next() - - # run inference - infer_results = paddle.infer( - output_layer=output_probs, parameters=parameters, input=infer_data) - num_steps = len(infer_results) // len(infer_data) - probs_split = [ - infer_results[i * num_steps:(i + 1) * num_steps] - for i in xrange(0, len(infer_data)) + tune_data = batch_reader().next() + target_transcripts = [ + ''.join([data_generator.vocab_list[token] for token in transcript]) + for _, transcript in tune_data ] + ds2_model = DeepSpeech2Model( + vocab_size=data_generator.vocab_size, + num_conv_layers=args.num_conv_layers, + num_rnn_layers=args.num_rnn_layers, + rnn_layer_size=args.rnn_layer_size, + pretrained_model_path=args.model_filepath) + # create grid for search cand_alphas = np.linspace(args.alpha_from, args.alpha_to, args.num_alphas) cand_betas = np.linspace(args.beta_from, args.beta_to, args.num_betas) params_grid = [(alpha, beta) for alpha in cand_alphas for beta in cand_betas] - ext_scorer = LmScorer(args.alpha_from, args.beta_from, - args.language_model_path) ## tune parameters in loop for alpha, beta in params_grid: - wer_sum, wer_counter = 
0, 0 - # reset scorer - ext_scorer.reset_params(alpha, beta) - # beam search using multiple processes - beam_search_results = ctc_beam_search_decoder_batch( - probs_split=probs_split, - vocabulary=data_generator.vocab_list, + result_transcripts = ds2_model.infer_batch( + infer_data=tune_data, + decode_method='beam_search', + beam_alpha=alpha, + beam_beta=beta, beam_size=args.beam_size, cutoff_prob=args.cutoff_prob, - blank_id=len(data_generator.vocab_list), - num_processes=args.num_processes_beam_search, - ext_scoring_func=ext_scorer, ) - for i, beam_search_result in enumerate(beam_search_results): - target_transcription = ''.join([ - data_generator.vocab_list[index] for index in infer_data[i][1] - ]) - wer_sum += wer(target_transcription, beam_search_result[0][1]) - wer_counter += 1 - + vocab_list=data_generator.vocab_list, + language_model_path=args.language_model_path, + num_processes=args.num_processes_beam_search) + wer_sum, num_ins = 0.0, 0 + for target, result in zip(target_transcripts, result_transcripts): + wer_sum += wer(target, result) + num_ins += 1 print("alpha = %f\tbeta = %f\tWER = %f" % - (alpha, beta, wer_sum / wer_counter)) + (alpha, beta, wer_sum / num_ins)) def main(): - paddle.init(use_gpu=args.use_gpu, trainer_count=1) + utils.print_arguments(args) + paddle.init(use_gpu=args.use_gpu, trainer_count=args.trainer_count) tune() From 526e18b11964b00ced661e0119244d7bf8e0229a Mon Sep 17 00:00:00 2001 From: Xinghai Sun Date: Wed, 2 Aug 2017 23:50:46 +0800 Subject: [PATCH 3/3] Add function docs for layer.py and model.py and update other details. 
--- decoder.py | 2 +- infer.py | 2 +- layer.py | 84 ++++++++++++++++++++++++++++++++++-------------------- model.py | 74 +++++++++++++++++++++++++++++++++++++++++++++-- setup.sh | 3 -- train.py | 8 +++++- tune.py | 4 +-- 7 files changed, 136 insertions(+), 41 deletions(-) diff --git a/decoder.py b/decoder.py index a1fadc2c..8f2e0508 100644 --- a/decoder.py +++ b/decoder.py @@ -205,9 +205,9 @@ def ctc_beam_search_decoder_batch(probs_split, :type num_processes: int :param cutoff_prob: Cutoff probability in pruning, default 1.0, no pruning. + :type cutoff_prob: float :param num_processes: Number of parallel processes. :type num_processes: int - :type cutoff_prob: float :param ext_scoring_func: External scoring function for partially decoded sentence, e.g. word count or language model. diff --git a/infer.py b/infer.py index ec65cc74..bc77dab7 100644 --- a/infer.py +++ b/infer.py @@ -40,7 +40,7 @@ parser.add_argument( help="Use gpu or not. (default: %(default)s)") parser.add_argument( "--num_threads_data", - default=multiprocessing.cpu_count(), + default=1, type=int, help="Number of cpu threads for preprocessing data. (default: %(default)s)") parser.add_argument( diff --git a/layer.py b/layer.py index 7b027338..3b492645 100644 --- a/layer.py +++ b/layer.py @@ -5,13 +5,27 @@ from __future__ import print_function import paddle.v2 as paddle -DISABLE_CUDNN_BATCH_NORM = True - def conv_bn_layer(input, filter_size, num_channels_in, num_channels_out, stride, padding, act): - """ - Convolution layer with batch normalization. + """Convolution layer with batch normalization. + + :param input: Input layer. + :type input: LayerOutput + :param filter_size: The x dimension of a filter kernel. Or input a tuple for + two image dimension. + :type filter_size: int|tuple|list + :param num_channels_in: Number of input channels. + :type num_channels_in: int + :param num_channels_out: Number of output channels. + :type num_channels_out: int + :param padding: The x dimension of the padding.
Or input a tuple for two + image dimension. + :type padding: int|tuple|list + :param act: Activation type. + :type act: BaseActivation + :return: Batch norm layer after convolution layer. + :rtype: LayerOutput """ conv_layer = paddle.layer.img_conv( input=input, @@ -22,32 +36,30 @@ def conv_bn_layer(input, filter_size, num_channels_in, num_channels_out, stride, padding=padding, act=paddle.activation.Linear(), bias_attr=False) - if DISABLE_CUDNN_BATCH_NORM: - # temopary patch, need to be removed. - return paddle.layer.batch_norm( - input=conv_layer, act=act, batch_norm_type="batch_norm") - else: - return paddle.layer.batch_norm(input=conv_layer, act=act) + return paddle.layer.batch_norm(input=conv_layer, act=act) def bidirectional_simple_rnn_bn_layer(name, input, size, act): - """ - Bidirectonal simple rnn layer with sequence-wise batch normalization. + """Bidirectional simple rnn layer with sequence-wise batch normalization. The batch normalization is only performed on input-state weights. + + :param name: Name of the layer. + :type name: string + :param input: Input layer. + :type input: LayerOutput + :param size: Number of RNN cells. + :type size: int + :param act: Activation type. + :type act: BaseActivation + :return: Bidirectional simple rnn layer. + :rtype: LayerOutput """ # input-hidden weights shared across bi-direcitonal rnn. input_proj = paddle.layer.fc( input=input, size=size, act=paddle.activation.Linear(), bias_attr=False) # batch norm is only performed on input-state projection - if DISABLE_CUDNN_BATCH_NORM: - # temopary patch, need to be removed.
- input_proj_bn = paddle.layer.batch_norm( - input=input_proj, - act=paddle.activation.Linear(), - batch_norm_type="batch_norm") - else: - input_proj_bn = paddle.layer.batch_norm( - input=input_proj, act=paddle.activation.Linear()) + input_proj_bn = paddle.layer.batch_norm( + input=input_proj, act=paddle.activation.Linear()) # forward and backward in time forward_simple_rnn = paddle.layer.recurrent( input=input_proj_bn, act=act, reverse=False) @@ -57,8 +69,14 @@ def bidirectional_simple_rnn_bn_layer(name, input, size, act): def conv_group(input, num_stacks): - """ - Convolution group with several stacking convolution layers. + """Convolution group with stacked convolution layers. + + :param input: Input layer. + :type input: LayerOutput + :param num_stacks: Number of stacked convolution layers. + :type num_stacks: int + :return: Output layer of the convolution group. + :rtype: LayerOutput """ conv = conv_bn_layer( input=input, @@ -83,8 +101,16 @@ def conv_group(input, num_stacks): def rnn_group(input, size, num_stacks): - """ - RNN group with several stacking RNN layers. + """RNN group with stacked bidirectional simple RNN layers. + + :param input: Input layer. + :type input: LayerOutput + :param size: Number of RNN cells in each layer. + :type size: int + :param num_stacks: Number of stacked rnn layers. + :type num_stacks: int + :return: Output layer of the RNN group. + :rtype: LayerOutput """ output = input for i in xrange(num_stacks): @@ -114,12 +140,8 @@ def deep_speech2(audio_data, :type num_rnn_layers: int :param rnn_size: RNN layer size (number of RNN cells). :type rnn_size: int - :param is_inference: False in the training mode, and True in the - inferene mode. - :type is_inference: bool - :return: If is_inference set False, return a ctc cost layer; - if is_inference set True, return a sequence layer of output - probability distribution. + :return: A tuple of an output unnormalized log probability layer ( + before softmax) and a ctc cost layer. 
:rtype: tuple of LayerOutput """ # convolution group diff --git a/model.py b/model.py index d1efabb7..f5333f17 100644 --- a/model.py +++ b/model.py @@ -14,6 +14,21 @@ from layer import * class DeepSpeech2Model(object): + """DeepSpeech2Model class. + + :param vocab_size: Decoding vocabulary size. + :type vocab_size: int + :param num_conv_layers: Number of stacking convolution layers. + :type num_conv_layers: int + :param num_rnn_layers: Number of stacking RNN layers. + :type num_rnn_layers: int + :param rnn_layer_size: RNN layer size (number of RNN cells). + :type rnn_layer_size: int + :param pretrained_model_path: Pretrained model path. If None, will train + from scratch. + :type pretrained_model_path: basestring|None + """ + def __init__(self, vocab_size, num_conv_layers, num_rnn_layers, rnn_layer_size, pretrained_model_path): self._create_network(vocab_size, num_conv_layers, num_rnn_layers, @@ -29,8 +44,33 @@ class DeepSpeech2Model(object): learning_rate, gradient_clipping, num_passes, - num_iterations_print=100, - output_model_dir='checkpoints'): + output_model_dir, + num_iterations_print=100): + """Train the model. + + :param train_batch_reader: Train data reader. + :type train_batch_reader: callable + :param dev_batch_reader: Validation data reader. + :type dev_batch_reader: callable + :param feeding_dict: Feeding is a map of field name and tuple index + of the data that reader returns. + :type feeding_dict: dict|list + :param learning_rate: Learning rate for ADAM optimizer. + :type learning_rate: float + :param gradient_clipping: Gradient clipping threshold. + :type gradient_clipping: float + :param num_passes: Number of training epochs. + :type num_passes: int + :param num_iterations_print: Number of training iterations for printing + a training loss. + :type num_iterations_print: int + :param output_model_dir: Directory for saving the model (every pass).
+ :type output_model_dir: basestring + """ + # prepare model output directory + if not os.path.exists(output_model_dir): + os.mkdir(output_model_dir) + # prepare optimizer and trainer optimizer = paddle.optimizer.Adam( learning_rate=learning_rate, @@ -81,6 +121,34 @@ class DeepSpeech2Model(object): def infer_batch(self, infer_data, decode_method, beam_alpha, beam_beta, beam_size, cutoff_prob, vocab_list, language_model_path, num_processes): + """Model inference. Infer the transcription for a batch of speech + utterances. + + :param infer_data: List of utterances to infer, with each utterance a + tuple of audio features and transcription text (empty + string). + :type infer_data: list + :param decode_method: Decoding method name, 'best_path' or + 'beam_search'. + :type decode_method: string + :param beam_alpha: Parameter associated with language model. + :type beam_alpha: float + :param beam_beta: Parameter associated with word count. + :type beam_beta: float + :param beam_size: Width for Beam search. + :type beam_size: int + :param cutoff_prob: Cutoff probability in pruning, + default 1.0, no pruning. + :type cutoff_prob: float + :param vocab_list: List of tokens in the vocabulary, for decoding. + :type vocab_list: list + :param language_model_path: Filepath for language model. + :type language_model_path: basestring|None + :param num_processes: Number of processes (CPU) for decoder. + :type num_processes: int + :return: List of transcription texts.
+ :rtype: List of basestring + """ # define inferer if self._inferer == None: self._inferer = paddle.inference.Inference( @@ -126,6 +194,7 @@ class DeepSpeech2Model(object): return results def _create_parameters(self, model_path=None): + """Load or create model parameters.""" if model_path is None: self._parameters = paddle.parameters.create(self._loss) else: @@ -134,6 +203,7 @@ class DeepSpeech2Model(object): def _create_network(self, vocab_size, num_conv_layers, num_rnn_layers, rnn_layer_size): + """Create data layers and model network.""" # paddle.data_type.dense_array is used for variable batch input. # The size 161 * 161 is only an placeholder value and the real shape # of input batch data will be induced during training. diff --git a/setup.sh b/setup.sh index 4d451a6f..7f427255 100644 --- a/setup.sh +++ b/setup.sh @@ -26,7 +26,4 @@ if [ $? != 0 ]; then rm libsndfile-1.0.28.tar.gz fi -# prepare ./checkpoints -mkdir checkpoints - echo "Install all dependencies successfully." diff --git a/train.py b/train.py index 45f7a6d9..080f57d2 100644 --- a/train.py +++ b/train.py @@ -116,6 +116,11 @@ parser.add_argument( help="If set None, the training will start from scratch. " "Otherwise, the training will resume from " "the existing model of this path. (default: %(default)s)") +parser.add_argument( + "--output_model_dir", + default="./checkpoints", + type=str, + help="Directory for saving models. (default: %(default)s)") parser.add_argument( "--augmentation_config", default='[{"type": "shift", ' @@ -169,7 +174,8 @@ def train(): learning_rate=args.adam_learning_rate, gradient_clipping=400, num_passes=args.num_passes, - num_iterations_print=args.num_iterations_print) + num_iterations_print=args.num_iterations_print, + output_model_dir=args.output_model_dir) def main(): diff --git a/tune.py b/tune.py index f414622e..a17be30f 100644 --- a/tune.py +++ b/tune.py @@ -46,7 +46,7 @@ parser.add_argument( help="Trainer number. 
(default: %(default)s)") parser.add_argument( "--num_threads_data", - default=multiprocessing.cpu_count(), + default=1, type=int, help="Number of cpu threads for preprocessing data. (default: %(default)s)") parser.add_argument( @@ -67,7 +67,7 @@ parser.add_argument( help="Manifest path for normalizer. (default: %(default)s)") parser.add_argument( "--tune_manifest_path", - default='datasets/manifest.test', + default='datasets/manifest.dev', type=str, help="Manifest path for tuning. (default: %(default)s)") parser.add_argument(