Add function docs.

8 years ago · 0babc5c4d7
parent 70a343a499
commit 0babc5c4d7
5 changed files with 153 additions and 19 deletions
--- a/audio_data_utils.py
+++ b/audio_data_utils.py
@ -1,3 +1,6 @@
+"""
+   Audio data preprocessing tools and reader creators.
+"""
 import paddle.v2 as paddle
 import logging
 import json
@ -86,18 +89,24 @@ def vocabulary_from_file(vocabulary_path):


 def get_vocabulary_size():
+    """
+    Get vocabulary size.
+    """
    vocab_dict, _ = vocabulary_from_file(ENGLISH_CHAR_VOCAB_FILEPATH)
    return len(vocab_dict)


 def get_vocabulary():
+    """
+    Get vocabulary.
+    """
    return vocabulary_from_file(ENGLISH_CHAR_VOCAB_FILEPATH)


 def parse_transcript(text, vocabulary):
    """
-      Convert the transcript text string to list of token index integers..
-      """
+    Convert the transcript text string to list of token index integers.
+    """
    return [vocabulary[w] for w in text]


@ -106,6 +115,28 @@ def reader_creator(manifest_path,
                   shuffle=False,
                   max_duration=10.0,
                   min_duration=0.0):
+    """
+    Audio data reader creator.
+
+    Instance: a tuple of a numpy ndarray of audio spectrogram and a list of
+    tokenized transcription text.
+
+    :param manifest_path: Filepath for Manifest of audio clip files.
+    :type manifest_path: basestring
+    :param sort_by_duration: Sort the audio clips by duration if set True.
+                             For SortaGrad.
+    :type sort_by_duration: bool
+    :param shuffle: Shuffle the audio clips if set True.
+    :type shuffle: bool
+    :param max_duration: Audio clips with duration (in seconds) greater than
+                         this will be discarded.
+    :type max_duration: float
+    :param min_duration: Audio clips with duration (in seconds) smaller than
+                         this will be discarded.
+    :type min_duration: float
+    :return: Data reader function.
+    :rtype: callable
+    """
    if sort_by_duration and shuffle:
        sort_by_duration = False
        logger.warn("When shuffle set to true, "
@ -138,6 +169,27 @@ def reader_creator(manifest_path,


 def padding_batch_reader(batch_reader, padding=[-1, -1], flatten=True):
+    """
+    Padding for batches. Return a batch reader.
+
+    Each instance in a batch will be padded to the same target shape.
+    The target shape is the largest shape among all the batch instances and
+    the 'padding' argument. Therefore, if padding is set to [-1, -1],
+    instances will be padded to the same shape only within each batch, and
+    the shape can differ across batches; if padding is set to
+    [VERY_LARGE_NUM, VERY_LARGE_NUM], instances in all batches will be padded to
+    have the same shape of [VERY_LARGE_NUM, VERY_LARGE_NUM].
+
+    :param batch_reader: Input batch reader.
+    :type batch_reader: callable
+    :param padding: Padding pattern. See the description above for details.
+    :type padding: list
+    :param flatten: Flatten the tensor to be one dimension.
+    :type flatten: bool
+    :return: Batch reader function.
+    :rtype: callable
+    """
+
    def padding_batch(batch):
        new_batch = []
        # get target shape within batch
--- a/infer.py
+++ b/infer.py
@ -1,14 +1,21 @@
+"""
+   Inference for a simplified version of Baidu DeepSpeech2 model.
+"""
+
 import paddle.v2 as paddle
-import audio_data_utils
+from itertools import groupby
 import argparse
-from model import deep_speech2
 import gzip
-from itertools import groupby
+import audio_data_utils
+from model import deep_speech2

 parser = argparse.ArgumentParser(
-    description='Simpled version of DeepSpeech2 inference.')
+    description='Simplified version of DeepSpeech2 inference.')
 parser.add_argument(
-    "--num_samples", default=10, type=int, help="Number of inference samples.")
+    "--num_samples",
+    default=10,
+    type=int,
+    help="Number of samples for inference.")
 parser.add_argument(
    "--num_conv_layers", default=2, type=int, help="Convolution layer number.")
 parser.add_argument(
@ -21,13 +28,21 @@ args = parser.parse_args()


 def remove_duplicate_and_blank(id_list, blank_id):
+    """
+    Postprocessing for max-ctc-decoder.
+    - remove consecutive duplicate tokens.
+    - remove blanks.
+    """
    # remove consecutive duplicate tokens
    id_list = [x[0] for x in groupby(id_list)]
-    # remove blank
+    # remove blanks
    return [id for id in id_list if id != blank_id]


 def max_infer():
+    """
+    Max-ctc-decoding for DeepSpeech2.
+    """
    # create network config
    _, vocab_list = audio_data_utils.get_vocabulary()
    dict_size = len(vocab_list)
@ -64,7 +79,7 @@ def max_infer():
        padding=[-1, 1000])
    infer_data = test_batch_reader().next()

-    # run inference
+    # run max-ctc-decoding
    max_id_results = paddle.infer(
        output_layer=max_id,
        parameters=parameters,
--- a/librispeech.py
+++ b/librispeech.py
@ -1,3 +1,11 @@
+"""
+   Download, unpack and create manifest for LibriSpeech dataset.
+
+   Manifest is a json file with each line containing one audio clip filepath,
+   its transcription text string, and its duration. It serves as a unified
+   interface to organize different data sets.
+"""
+
 import paddle.v2 as paddle
 import os
 import wget
@ -88,9 +96,10 @@ def main():
        url=URL_DEV,
        target_dir=os.path.join(args.target_dir),
        manifest_path=args.manifest + ".dev")
-    #prepare_dataset(url=URL_TRAIN,
-#target_dir=os.path.join(args.target_dir),
-#manifest_path=args.manifest + ".train")
+    prepare_dataset(
+        url=URL_TRAIN,
+        target_dir=os.path.join(args.target_dir),
+        manifest_path=args.manifest + ".train")


 if __name__ == '__main__':
--- a/model.py
+++ b/model.py
@ -1,8 +1,17 @@
+"""
+   A simplified version of Baidu DeepSpeech2 model.
+"""
+
 import paddle.v2 as paddle

+#TODO: add bidirectional rnn.
+

 def conv_bn_layer(input, filter_size, num_channels_in, num_channels_out, stride,
                  padding, act):
+    """
+    Convolution layer with batch normalization.
+    """
    conv_layer = paddle.layer.img_conv(
        input=input,
        filter_size=filter_size,
@ -16,6 +25,15 @@ def conv_bn_layer(input, filter_size, num_channels_in, num_channels_out, stride,


 def bidirectonal_simple_rnn_bn_layer(name, input, size, act):
+    """
+    Bidirectional simple rnn layer with batch normalization.
+    The batch normalization is only performed on input-state projection
+    (sequence-wise normalization).
+
+    Question: are the mean and variance statistics computed over the whole
+    sequence or just on each individual time step?
+    """
+
    def __simple_rnn_step__(input):
        last_state = paddle.layer.memory(name=name + "_state", size=size)
        input_fc = paddle.layer.fc(
@ -23,6 +41,7 @@ def bidirectonal_simple_rnn_bn_layer(name, input, size, act):
            size=size,
            act=paddle.activation.Linear(),
            bias_attr=False)
+        # batch norm is only performed on input-state projection 
        input_fc_bn = paddle.layer.batch_norm(
            input=input_fc, act=paddle.activation.Linear())
        state_fc = paddle.layer.fc(
@ -47,6 +66,9 @@ def bidirectonal_simple_rnn_bn_layer(name, input, size, act):


 def conv_group(input, num_stacks):
+    """
+    Convolution group with several stacking convolution layers.
+    """
    conv = conv_bn_layer(
        input=input,
        filter_size=(11, 41),
@ -68,6 +90,9 @@ def conv_group(input, num_stacks):


 def rnn_group(input, size, num_stacks):
+    """
+    RNN group with several stacking RNN layers.
+    """
    output = input
    for i in xrange(num_stacks):
        output = bidirectonal_simple_rnn_bn_layer(
@ -81,7 +106,27 @@ def deep_speech2(audio_data,
                 num_conv_layers=2,
                 num_rnn_layers=3,
                 rnn_size=256):
+    """
+    The whole DeepSpeech2 model structure (a simplified version).
+
+    :param audio_data: Audio spectrogram data layer.
+    :type audio_data: LayerOutput
+    :param text_data: Transcription text data layer.
+    :type text_data: LayerOutput
+    :param dict_size: Dictionary size for tokenized transcription.
+    :type dict_size: int
+    :param num_conv_layers: Number of stacking convolution layers.
+    :type num_conv_layers: int
+    :param num_rnn_layers: Number of stacking RNN layers.
+    :type num_rnn_layers: int
+    :param rnn_size: RNN layer size (number of RNN cells).
+    :type rnn_size: int
+    :return: Tuple of the cost layer and the max_id decoder layer.
+    :rtype: tuple of LayerOutput
+    """
+    # convolution group
    conv_group_output = conv_group(input=audio_data, num_stacks=num_conv_layers)
+    # convert data from convolution feature map to sequence of vectors
    conv2seq = paddle.layer.block_expand(
        input=conv_group_output,
        num_channels=32,
@ -89,18 +134,22 @@ def deep_speech2(audio_data,
        stride_y=1,
        block_x=1,
        block_y=21)
+    # rnn group
    rnn_group_output = rnn_group(
        input=conv2seq, size=rnn_size, num_stacks=num_rnn_layers)
+    # output token distribution
    fc = paddle.layer.fc(
        input=rnn_group_output,
        size=dict_size + 1,
        act=paddle.activation.Linear(),
        bias_attr=True)
+    # ctc cost
    cost = paddle.layer.warp_ctc(
        input=fc,
        label=text_data,
        size=dict_size + 1,
        blank=dict_size,
        norm_by_times=True)
+    # max decoder
    max_id = paddle.layer.max_id(input=fc)
    return cost, max_id
--- a/train.py
+++ b/train.py
@ -1,20 +1,27 @@
+"""
+   Trainer for a simplified version of Baidu DeepSpeech2 model.
+"""
+
 import paddle.v2 as paddle
-import audio_data_utils
 import argparse
-from model import deep_speech2
 import gzip
+import sys
+from model import deep_speech2
+import audio_data_utils
+
+#TODO: add WER metric

 parser = argparse.ArgumentParser(
-    description='Simpled version of DeepSpeech2 trainer.')
+    description='Simplified version of DeepSpeech2 trainer.')
 parser.add_argument(
    "--batch_size", default=512, type=int, help="Minibatch size.")
 parser.add_argument("--trainer", default=1, type=int, help="Trainer number.")
 parser.add_argument(
    "--num_passes", default=20, type=int, help="Training pass number.")
 parser.add_argument(
-    "--num_conv_layers", default=2, type=int, help="Convolution layer number.")
+    "--num_conv_layers", default=3, type=int, help="Convolution layer number.")
 parser.add_argument(
-    "--num_rnn_layers", default=3, type=int, help="RNN layer number.")
+    "--num_rnn_layers", default=5, type=int, help="RNN layer number.")
 parser.add_argument(
    "--rnn_layer_size", default=256, type=int, help="RNN layer cell number.")
 parser.add_argument(
@ -25,6 +32,9 @@ args = parser.parse_args()


 def train():
+    """
+    DeepSpeech2 training.
+    """
    # create network config
    dict_size = audio_data_utils.get_vocabulary_size()
    audio_data = paddle.layer.data(
@ -89,8 +99,7 @@ def train():
                sys.stdout.flush()
        if isinstance(event, paddle.event.EndPass):
            result = trainer.test(reader=test_batch_reader, feeding=feeding)
-            print "Pass: %d, TestCost: %f, %s" % (event.pass_id, event.cost,
-                                                  result.metrics)
+            print "Pass: %d, TestMetric: %s" % (event.pass_id, result.metrics)
            with gzip.open("params.tar.gz", 'w') as f:
                parameters.to_tar(f)