diff --git a/decoder.py b/decoder.py
index a1fadc2c..8f2e0508 100644
--- a/decoder.py
+++ b/decoder.py
@@ -205,9 +205,9 @@ def ctc_beam_search_decoder_batch(probs_split,
     :type num_processes: int
     :param cutoff_prob: Cutoff probability in pruning, default 1.0, no pruning.
+    :type cutoff_prob: float
     :param num_processes: Number of parallel processes.
     :type num_processes: int
-    :type cutoff_prob: float
     :param ext_scoring_func: External scoring function for
                              partially decoded sentence, e.g. word count
                              or language model.
diff --git a/infer.py b/infer.py
index ec65cc74..bc77dab7 100644
--- a/infer.py
+++ b/infer.py
@@ -40,7 +40,7 @@ parser.add_argument(
     help="Use gpu or not. (default: %(default)s)")
 parser.add_argument(
     "--num_threads_data",
-    default=multiprocessing.cpu_count(),
+    default=1,
     type=int,
     help="Number of cpu threads for preprocessing data. (default: %(default)s)")
 parser.add_argument(
diff --git a/layer.py b/layer.py
index 7b027338..3b492645 100644
--- a/layer.py
+++ b/layer.py
@@ -5,13 +5,27 @@ from __future__ import print_function
 
 import paddle.v2 as paddle
 
-DISABLE_CUDNN_BATCH_NORM = True
-
 
 def conv_bn_layer(input, filter_size, num_channels_in, num_channels_out,
                   stride, padding, act):
-    """
-    Convolution layer with batch normalization.
+    """Convolution layer with batch normalization.
+
+    :param input: Input layer.
+    :type input: LayerOutput
+    :param filter_size: The x dimension of a filter kernel. Or input a tuple for
+                        two image dimensions.
+    :type filter_size: int|tuple|list
+    :param num_channels_in: Number of input channels.
+    :type num_channels_in: int
+    :param num_channels_out: Number of output channels.
+    :type num_channels_out: int
+    :param padding: The x dimension of the padding. Or input a tuple for two
+                    image dimensions.
+    :type padding: int|tuple|list
+    :param act: Activation type.
+    :type act: BaseActivation
+    :return: Batch norm layer after convolution layer.
+    :rtype: LayerOutput
     """
     conv_layer = paddle.layer.img_conv(
         input=input,
@@ -22,32 +36,30 @@ def conv_bn_layer(input, filter_size, num_channels_in, num_channels_out,
         stride=stride,
         padding=padding,
         act=paddle.activation.Linear(),
         bias_attr=False)
-    if DISABLE_CUDNN_BATCH_NORM:
-        # temopary patch, need to be removed.
-        return paddle.layer.batch_norm(
-            input=conv_layer, act=act, batch_norm_type="batch_norm")
-    else:
-        return paddle.layer.batch_norm(input=conv_layer, act=act)
+    return paddle.layer.batch_norm(input=conv_layer, act=act)
 
 
 def bidirectional_simple_rnn_bn_layer(name, input, size, act):
-    """
-    Bidirectonal simple rnn layer with sequence-wise batch normalization.
+    """Bidirectional simple rnn layer with sequence-wise batch normalization.
     The batch normalization is only performed on input-state weights.
+
+    :param name: Name of the layer.
+    :type name: string
+    :param input: Input layer.
+    :type input: LayerOutput
+    :param size: Number of RNN cells.
+    :type size: int
+    :param act: Activation type.
+    :type act: BaseActivation
+    :return: Bidirectional simple rnn layer.
+    :rtype: LayerOutput
     """
     # input-hidden weights shared across bi-direcitonal rnn.
     input_proj = paddle.layer.fc(
         input=input, size=size, act=paddle.activation.Linear(), bias_attr=False)
     # batch norm is only performed on input-state projection
-    if DISABLE_CUDNN_BATCH_NORM:
-        # temopary patch, need to be removed.
-        input_proj_bn = paddle.layer.batch_norm(
-            input=input_proj,
-            act=paddle.activation.Linear(),
-            batch_norm_type="batch_norm")
-    else:
-        input_proj_bn = paddle.layer.batch_norm(
-            input=input_proj, act=paddle.activation.Linear())
+    input_proj_bn = paddle.layer.batch_norm(
+        input=input_proj, act=paddle.activation.Linear())
     # forward and backward in time
     forward_simple_rnn = paddle.layer.recurrent(
         input=input_proj_bn, act=act, reverse=False)
@@ -57,8 +69,14 @@ def bidirectional_simple_rnn_bn_layer(name, input, size, act):
 
 
 def conv_group(input, num_stacks):
-    """
-    Convolution group with several stacking convolution layers.
+    """Convolution group with stacked convolution layers.
+
+    :param input: Input layer.
+    :type input: LayerOutput
+    :param num_stacks: Number of stacked convolution layers.
+    :type num_stacks: int
+    :return: Output layer of the convolution group.
+    :rtype: LayerOutput
     """
     conv = conv_bn_layer(
         input=input,
@@ -83,8 +101,16 @@ def conv_group(input, num_stacks):
 
 
 def rnn_group(input, size, num_stacks):
-    """
-    RNN group with several stacking RNN layers.
+    """RNN group with stacked bidirectional simple RNN layers.
+
+    :param input: Input layer.
+    :type input: LayerOutput
+    :param size: Number of RNN cells in each layer.
+    :type size: int
+    :param num_stacks: Number of stacked RNN layers.
+    :type num_stacks: int
+    :return: Output layer of the RNN group.
+    :rtype: LayerOutput
     """
     output = input
     for i in xrange(num_stacks):
@@ -114,12 +140,8 @@ def deep_speech2(audio_data,
     :type num_rnn_layers: int
     :param rnn_size: RNN layer size (number of RNN cells).
     :type rnn_size: int
-    :param is_inference: False in the training mode, and True in the
-                         inferene mode.
-    :type is_inference: bool
-    :return: If is_inference set False, return a ctc cost layer;
-             if is_inference set True, return a sequence layer of output
-             probability distribution.
+    :return: A tuple of an output unnormalized log probability layer
+             (before softmax) and a CTC cost layer.
     :rtype: tuple of LayerOutput
     """
     # convolution group
diff --git a/model.py b/model.py
index d1efabb7..f5333f17 100644
--- a/model.py
+++ b/model.py
@@ -14,6 +14,21 @@ from layer import *
 
 
 class DeepSpeech2Model(object):
+    """DeepSpeech2Model class.
+
+    :param vocab_size: Decoding vocabulary size.
+    :type vocab_size: int
+    :param num_conv_layers: Number of stacked convolution layers.
+    :type num_conv_layers: int
+    :param num_rnn_layers: Number of stacked RNN layers.
+    :type num_rnn_layers: int
+    :param rnn_layer_size: RNN layer size (number of RNN cells).
+    :type rnn_layer_size: int
+    :param pretrained_model_path: Pretrained model path. If None, will train
+                                  from scratch.
+    :type pretrained_model_path: basestring|None
+    """
+
     def __init__(self, vocab_size, num_conv_layers, num_rnn_layers,
                  rnn_layer_size, pretrained_model_path):
         self._create_network(vocab_size, num_conv_layers, num_rnn_layers,
@@ -29,8 +44,33 @@ class DeepSpeech2Model(object):
                  learning_rate,
                  gradient_clipping,
                  num_passes,
-                 num_iterations_print=100,
-                 output_model_dir='checkpoints'):
+                 output_model_dir,
+                 num_iterations_print=100):
+        """Train the model.
+
+        :param train_batch_reader: Train data reader.
+        :type train_batch_reader: callable
+        :param dev_batch_reader: Validation data reader.
+        :type dev_batch_reader: callable
+        :param feeding_dict: Feeding is a map of field name and tuple index
+                             of the data that the reader returns.
+        :type feeding_dict: dict|list
+        :param learning_rate: Learning rate for ADAM optimizer.
+        :type learning_rate: float
+        :param gradient_clipping: Gradient clipping threshold.
+        :type gradient_clipping: float
+        :param num_passes: Number of training epochs.
+        :type num_passes: int
+        :param num_iterations_print: Number of training iterations for printing
+                                     a training loss.
+        :type num_iterations_print: int
+        :param output_model_dir: Directory for saving the model (every pass).
+        :type output_model_dir: basestring
+        """
+        # prepare model output directory
+        if not os.path.exists(output_model_dir):
+            os.mkdir(output_model_dir)
+        # prepare optimizer and trainer
         optimizer = paddle.optimizer.Adam(
             learning_rate=learning_rate,
@@ -81,6 +121,34 @@ class DeepSpeech2Model(object):
     def infer_batch(self, infer_data, decode_method, beam_alpha, beam_beta,
                     beam_size, cutoff_prob, vocab_list, language_model_path,
                     num_processes):
+        """Model inference. Infer the transcription for a batch of speech
+        utterances.
+
+        :param infer_data: List of utterances to infer, with each utterance a
+                           tuple of audio features and transcription text (empty
+                           string).
+        :type infer_data: list
+        :param decode_method: Decoding method name, 'best_path' or
+                              'beam_search'.
+        :type decode_method: string
+        :param beam_alpha: Parameter associated with language model.
+        :type beam_alpha: float
+        :param beam_beta: Parameter associated with word count.
+        :type beam_beta: float
+        :param beam_size: Width for beam search.
+        :type beam_size: int
+        :param cutoff_prob: Cutoff probability in pruning,
+                            default 1.0, no pruning.
+        :type cutoff_prob: float
+        :param vocab_list: List of tokens in the vocabulary, for decoding.
+        :type vocab_list: list
+        :param language_model_path: Filepath for language model.
+        :type language_model_path: basestring|None
+        :param num_processes: Number of processes (CPU) for decoder.
+        :type num_processes: int
+        :return: List of transcription texts.
+        :rtype: List of basestring
+        """
         # define inferer
         if self._inferer == None:
             self._inferer = paddle.inference.Inference(
@@ -126,6 +194,7 @@ class DeepSpeech2Model(object):
         return results
 
     def _create_parameters(self, model_path=None):
+        """Load or create model parameters."""
         if model_path is None:
             self._parameters = paddle.parameters.create(self._loss)
         else:
@@ -134,6 +203,7 @@ class DeepSpeech2Model(object):
 
     def _create_network(self, vocab_size, num_conv_layers, num_rnn_layers,
                         rnn_layer_size):
+        """Create data layers and model network."""
         # paddle.data_type.dense_array is used for variable batch input.
         # The size 161 * 161 is only an placeholder value and the real shape
         # of input batch data will be induced during training.
diff --git a/setup.sh b/setup.sh
index 4d451a6f..7f427255 100644
--- a/setup.sh
+++ b/setup.sh
@@ -26,7 +26,4 @@ if [ $? != 0 ]; then
     rm libsndfile-1.0.28.tar.gz
 fi
 
-# prepare ./checkpoints
-mkdir checkpoints
-
 echo "Install all dependencies successfully."
diff --git a/train.py b/train.py
index 45f7a6d9..080f57d2 100644
--- a/train.py
+++ b/train.py
@@ -116,6 +116,11 @@ parser.add_argument(
     help="If set None, the training will start from scratch. "
     "Otherwise, the training will resume from "
     "the existing model of this path. (default: %(default)s)")
+parser.add_argument(
+    "--output_model_dir",
+    default="./checkpoints",
+    type=str,
+    help="Directory for saving models. (default: %(default)s)")
 parser.add_argument(
     "--augmentation_config",
     default='[{"type": "shift", '
@@ -169,7 +174,8 @@ def train():
         learning_rate=args.adam_learning_rate,
         gradient_clipping=400,
         num_passes=args.num_passes,
-        num_iterations_print=args.num_iterations_print)
+        num_iterations_print=args.num_iterations_print,
+        output_model_dir=args.output_model_dir)
 
 
 def main():
diff --git a/tune.py b/tune.py
index f414622e..a17be30f 100644
--- a/tune.py
+++ b/tune.py
@@ -46,7 +46,7 @@ parser.add_argument(
     help="Trainer number. (default: %(default)s)")
 parser.add_argument(
     "--num_threads_data",
-    default=multiprocessing.cpu_count(),
+    default=1,
     type=int,
     help="Number of cpu threads for preprocessing data. (default: %(default)s)")
 parser.add_argument(
@@ -67,7 +67,7 @@ parser.add_argument(
     help="Manifest path for normalizer. (default: %(default)s)")
 parser.add_argument(
     "--tune_manifest_path",
-    default='datasets/manifest.test',
+    default='datasets/manifest.dev',
     type=str,
     help="Manifest path for tuning. (default: %(default)s)")
 parser.add_argument(
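
Note for reviewers: after this patch, DeepSpeech2Model.train() no longer defaults its output directory; callers must pass output_model_dir, and the method now creates that directory itself (replacing the mkdir that setup.sh used to do). Below is a minimal usage sketch, not part of the patch: the constructor and train() parameter names come from the new model.py docstrings, the argparse names other than --output_model_dir (e.g. args.init_model_path) are assumed from context rather than shown in this diff, and the reader/feeding variables are placeholders for what train.py builds from its data generator.

    # build the model; constructor parameters are documented in model.py
    ds2_model = DeepSpeech2Model(
        vocab_size=vocab_size,
        num_conv_layers=args.num_conv_layers,
        num_rnn_layers=args.num_rnn_layers,
        rnn_layer_size=args.rnn_layer_size,
        pretrained_model_path=args.init_model_path)
    # train; output_model_dir is now a required argument, and the
    # directory is created automatically if it does not exist
    ds2_model.train(
        train_batch_reader=train_batch_reader,
        dev_batch_reader=dev_batch_reader,
        feeding_dict=feeding_dict,
        learning_rate=args.adam_learning_rate,
        gradient_clipping=400,
        num_passes=args.num_passes,
        num_iterations_print=args.num_iterations_print,
        output_model_dir=args.output_model_dir)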