Add function docs.

8 years ago · 0babc5c4d7
parent 70a343a499
commit 0babc5c4d7
5 changed files with 153 additions and 19 deletions
--- a/audio_data_utils.py
+++ b/audio_data_utils.py
@ -1,3 +1,6 @@
 """
   Audio data preprocessing tools and reader creators.
 """
 import paddle.v2 as paddle
 import logging
 import json
@ -86,17 +89,23 @@ def vocabulary_from_file(vocabulary_path):
 def get_vocabulary_size():
    """
    Get vocabulary size.
    """
    vocab_dict, _ = vocabulary_from_file(ENGLISH_CHAR_VOCAB_FILEPATH)
    return len(vocab_dict)
 def get_vocabulary():
    """
    Get vocabulary.
    """
    return vocabulary_from_file(ENGLISH_CHAR_VOCAB_FILEPATH)
 def parse_transcript(text, vocabulary):
    """
-      Convert the transcript text string to list of token index integers..
+    Convert the transcript text string to list of token index integers.
    """
    return [vocabulary[w] for w in text]
@ -106,6 +115,28 @@ def reader_creator(manifest_path,
                   shuffle=False,
                   max_duration=10.0,
                   min_duration=0.0):
    """
    Audio data reader creator.
    Instance: a tuple of a numpy ndarray of audio spectrogram and a list of
    tokenized transcription text.
    :param manifest_path: Filepath for Manifest of audio clip files.
    :type manifest_path: basestring
    :param sort_by_duration: Sort the audio clips by duration if set True.
                             For SortaGrad.
    :type sort_by_duration: bool
    :param shuffle: Shuffle the audio clips if set True.
    :type shuffle: bool
    :param max_duration: Audio clips with duration (in seconds) greater than
                         this will be discarded.
    :type max_duration: float
    :param min_duration: Audio clips with duration (in seconds) smaller than
                         this will be discarded.
    :type min_duration: float
    :return: Data reader function.
    :rtype: callable
    """
    if sort_by_duration and shuffle:
        sort_by_duration = False
        logger.warn("When shuffle set to true, "
@ -138,6 +169,27 @@ def reader_creator(manifest_path,
 def padding_batch_reader(batch_reader, padding=[-1, -1], flatten=True):
    """
    Padding for batches. Return a batch reader.
    Each instance in a batch will be padded to the same target shape.
    The target shape is the largest shape among all the batch instances and
    the 'padding' argument. Therefore, if padding is set to [-1, -1], instances
    will be padded to the same shape only within each batch, and the shape may
    differ across batches; if padding is set to
    [VERY_LARGE_NUM, VERY_LARGE_NUM], instances in all batches will be padded to
    have the same shape of [VERY_LARGE_NUM, VERY_LARGE_NUM].
    :param batch_reader: Input batch reader.
    :type batch_reader: callable
    :param padding: Padding pattern. See the description above for details.
    :type padding: list
    :param flatten: Flatten the tensor to be one dimension.
    :type flatten: bool
    :return: Batch reader function.
    :rtype: callable
    """
    def padding_batch(batch):
        new_batch = []
        # get target shape within batch
--- a/infer.py
+++ b/infer.py
@ -1,14 +1,21 @@
 """
   Inference for a simplified version of Baidu DeepSpeech2 model.
 """
 import paddle.v2 as paddle
-import audio_data_utils
+from itertools import groupby
 import argparse
 from model import deep_speech2
 import gzip
-from itertools import groupby
+import audio_data_utils
 from model import deep_speech2
 parser = argparse.ArgumentParser(
-    description='Simpled version of DeepSpeech2 inference.')
+    description='Simplified version of DeepSpeech2 inference.')
 parser.add_argument(
-    "--num_samples", default=10, type=int, help="Number of inference samples.")
+    "--num_samples",
    default=10,
    type=int,
    help="Number of samples for inference.")
 parser.add_argument(
    "--num_conv_layers", default=2, type=int, help="Convolution layer number.")
 parser.add_argument(
@ -21,13 +28,21 @@ args = parser.parse_args()
 def remove_duplicate_and_blank(id_list, blank_id):
    """
    Postprocessing for max-ctc-decoder.
    - remove consecutive duplicate tokens.
    - remove blanks.
    """
    # remove consecutive duplicate tokens
    id_list = [x[0] for x in groupby(id_list)]
-    # remove blank
+    # remove blanks
    return [id for id in id_list if id != blank_id]
 def max_infer():
    """
    Max-ctc-decoding for DeepSpeech2.
    """
    # create network config
    _, vocab_list = audio_data_utils.get_vocabulary()
    dict_size = len(vocab_list)
@ -64,7 +79,7 @@ def max_infer():
        padding=[-1, 1000])
    infer_data = test_batch_reader().next()
-    # run inference
+    # run max-ctc-decoding
    max_id_results = paddle.infer(
        output_layer=max_id,
        parameters=parameters,
--- a/librispeech.py
+++ b/librispeech.py
@ -1,3 +1,11 @@
 """
   Download, unpack and create manifest for the LibriSpeech dataset.
   Manifest is a json file with each line containing one audio clip filepath,
   its transcription text string, and its duration. It serves as a unified
   interface to organize different data sets.
 """
 import paddle.v2 as paddle
 import os
 import wget
@ -88,9 +96,10 @@ def main():
        url=URL_DEV,
        target_dir=os.path.join(args.target_dir),
        manifest_path=args.manifest + ".dev")
-    #prepare_dataset(url=URL_TRAIN,
+    prepare_dataset(
-#target_dir=os.path.join(args.target_dir),
+        url=URL_TRAIN,
-#manifest_path=args.manifest + ".train")
+        target_dir=os.path.join(args.target_dir),
        manifest_path=args.manifest + ".train")
 if __name__ == '__main__':
--- a/model.py
+++ b/model.py
@ -1,8 +1,17 @@
 """
   A simplified version of Baidu DeepSpeech2 model.
 """
 import paddle.v2 as paddle
 #TODO: add bidirectional rnn.
 def conv_bn_layer(input, filter_size, num_channels_in, num_channels_out, stride,
                  padding, act):
    """
    Convolution layer with batch normalization.
    """
    conv_layer = paddle.layer.img_conv(
        input=input,
        filter_size=filter_size,
@ -16,6 +25,15 @@ def conv_bn_layer(input, filter_size, num_channels_in, num_channels_out, stride,
 def bidirectonal_simple_rnn_bn_layer(name, input, size, act):
    """
    Bidirectional simple RNN layer with batch normalization.
    The batch normalization is only performed on the input-state projection
    (sequence-wise normalization).
    Question: are the mean and variance statistics computed over the whole
    sequence or just on each individual time step?
    """
    def __simple_rnn_step__(input):
        last_state = paddle.layer.memory(name=name + "_state", size=size)
        input_fc = paddle.layer.fc(
@ -23,6 +41,7 @@ def bidirectonal_simple_rnn_bn_layer(name, input, size, act):
            size=size,
            act=paddle.activation.Linear(),
            bias_attr=False)
        # batch norm is only performed on input-state projection 
        input_fc_bn = paddle.layer.batch_norm(
            input=input_fc, act=paddle.activation.Linear())
        state_fc = paddle.layer.fc(
@ -47,6 +66,9 @@ def bidirectonal_simple_rnn_bn_layer(name, input, size, act):
 def conv_group(input, num_stacks):
    """
    Convolution group with several stacking convolution layers.
    """
    conv = conv_bn_layer(
        input=input,
        filter_size=(11, 41),
@ -68,6 +90,9 @@ def conv_group(input, num_stacks):
 def rnn_group(input, size, num_stacks):
    """
    RNN group with several stacking RNN layers.
    """
    output = input
    for i in xrange(num_stacks):
        output = bidirectonal_simple_rnn_bn_layer(
@ -81,7 +106,27 @@ def deep_speech2(audio_data,
                 num_conv_layers=2,
                 num_rnn_layers=3,
                 rnn_size=256):
    """
    The whole DeepSpeech2 model structure (a simplified version).
    :param audio_data: Audio spectrogram data layer.
    :type audio_data: LayerOutput
    :param text_data: Transcription text data layer.
    :type text_data: LayerOutput
    :param dict_size: Dictionary size for tokenized transcription.
    :type dict_size: int
    :param num_conv_layers: Number of stacking convolution layers.
    :type num_conv_layers: int
    :param num_rnn_layers: Number of stacking RNN layers.
    :type num_rnn_layers: int
    :param rnn_size: RNN layer size (number of RNN cells).
    :type rnn_size: int
    :return: Tuple of the cost layer and the max_id decoder layer.
    :rtype: tuple of LayerOutput
    """
    # convolution group
    conv_group_output = conv_group(input=audio_data, num_stacks=num_conv_layers)
    # convert data from convolution feature map to sequence of vectors
    conv2seq = paddle.layer.block_expand(
        input=conv_group_output,
        num_channels=32,
@ -89,18 +134,22 @@ def deep_speech2(audio_data,
        stride_y=1,
        block_x=1,
        block_y=21)
    # rnn group
    rnn_group_output = rnn_group(
        input=conv2seq, size=rnn_size, num_stacks=num_rnn_layers)
    # output token distribution
    fc = paddle.layer.fc(
        input=rnn_group_output,
        size=dict_size + 1,
        act=paddle.activation.Linear(),
        bias_attr=True)
    # ctc cost
    cost = paddle.layer.warp_ctc(
        input=fc,
        label=text_data,
        size=dict_size + 1,
        blank=dict_size,
        norm_by_times=True)
    # max decoder
    max_id = paddle.layer.max_id(input=fc)
    return cost, max_id
--- a/train.py
+++ b/train.py
@ -1,20 +1,27 @@
 """
   Trainer for a simplified version of Baidu DeepSpeech2 model.
 """
 import paddle.v2 as paddle
 import audio_data_utils
 import argparse
 from model import deep_speech2
 import gzip
 import sys
 from model import deep_speech2
 import audio_data_utils
 #TODO: add WER metric
 parser = argparse.ArgumentParser(
-    description='Simpled version of DeepSpeech2 trainer.')
+    description='Simplified version of DeepSpeech2 trainer.')
 parser.add_argument(
    "--batch_size", default=512, type=int, help="Minibatch size.")
 parser.add_argument("--trainer", default=1, type=int, help="Trainer number.")
 parser.add_argument(
    "--num_passes", default=20, type=int, help="Training pass number.")
 parser.add_argument(
-    "--num_conv_layers", default=2, type=int, help="Convolution layer number.")
+    "--num_conv_layers", default=3, type=int, help="Convolution layer number.")
 parser.add_argument(
-    "--num_rnn_layers", default=3, type=int, help="RNN layer number.")
+    "--num_rnn_layers", default=5, type=int, help="RNN layer number.")
 parser.add_argument(
    "--rnn_layer_size", default=256, type=int, help="RNN layer cell number.")
 parser.add_argument(
@ -25,6 +32,9 @@ args = parser.parse_args()
 def train():
    """
    DeepSpeech2 training.
    """
    # create network config
    dict_size = audio_data_utils.get_vocabulary_size()
    audio_data = paddle.layer.data(
@ -89,8 +99,7 @@ def train():
                sys.stdout.flush()
        if isinstance(event, paddle.event.EndPass):
            result = trainer.test(reader=test_batch_reader, feeding=feeding)
-            print "Pass: %d, TestCost: %f, %s" % (event.pass_id, event.cost,
+            print "Pass: %d, TestMetric: %s" % (event.pass_id, result.metrics)
                                                  result.metrics)
            with gzip.open("params.tar.gz", 'w') as f:
                parameters.to_tar(f)