Add function docs.

pull/2/head
Xinghai Sun 8 years ago
parent 70a343a499
commit 0babc5c4d7

@@ -1,3 +1,6 @@
+"""
+Audio data preprocessing tools and reader creators.
+"""
 import paddle.v2 as paddle
 import logging
 import json
@@ -86,18 +89,24 @@ def vocabulary_from_file(vocabulary_path):


 def get_vocabulary_size():
+    """
+    Get vocabulary size.
+    """
     vocab_dict, _ = vocabulary_from_file(ENGLISH_CHAR_VOCAB_FILEPATH)
     return len(vocab_dict)


 def get_vocabulary():
+    """
+    Get vocabulary.
+    """
     return vocabulary_from_file(ENGLISH_CHAR_VOCAB_FILEPATH)


 def parse_transcript(text, vocabulary):
     """
-    Convert the transcript text string to list of token index integers..
+    Convert the transcript text string to a list of token index integers.
     """
     return [vocabulary[w] for w in text]
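
For context, parse_transcript simply maps each character of the transcript through the vocabulary dict. A minimal sketch of how the two pieces fit together, assuming the vocabulary file holds one character token per line (the file format is not shown in this diff):

def vocabulary_from_file_sketch(vocabulary_path):
    # assumed format: one character token per line
    with open(vocabulary_path) as f:
        tokens = [line.rstrip('\n') for line in f]
    vocab_dict = dict((token, idx) for idx, token in enumerate(tokens))
    return vocab_dict, tokens

# e.g. with a file containing "a", "b", "c" on separate lines:
# vocab_dict == {'a': 0, 'b': 1, 'c': 2}
# parse_transcript("cab", vocab_dict) == [2, 0, 1]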
@@ -106,6 +115,28 @@ def reader_creator(manifest_path,
                    shuffle=False,
                    max_duration=10.0,
                    min_duration=0.0):
+    """
+    Audio data reader creator.
+
+    Instance: a tuple of a numpy ndarray of audio spectrogram and a list of
+    tokenized transcription text.
+
+    :param manifest_path: Filepath of the manifest for the audio clip files.
+    :type manifest_path: basestring
+    :param sort_by_duration: Sort the audio clips by duration if set True
+                             (for SortaGrad).
+    :type sort_by_duration: bool
+    :param shuffle: Shuffle the audio clips if set True.
+    :type shuffle: bool
+    :param max_duration: Audio clips with duration (in seconds) greater than
+                         this will be discarded.
+    :type max_duration: float
+    :param min_duration: Audio clips with duration (in seconds) smaller than
+                         this will be discarded.
+    :type min_duration: float
+    :return: Data reader function.
+    :rtype: callable
+    """
     if sort_by_duration and shuffle:
         sort_by_duration = False
         logger.warn("When shuffle set to true, "
@@ -138,6 +169,27 @@ def reader_creator(manifest_path,


 def padding_batch_reader(batch_reader, padding=[-1, -1], flatten=True):
+    """
+    Padding for batches. Return a batch reader.
+
+    Each instance in a batch will be padded to the same target shape. The
+    target shape is the largest shape among all the batch instances and the
+    'padding' argument. Therefore, if padding is set to [-1, -1], instances
+    will be padded to the same shape only within each batch, and the shape
+    may differ across batches; if padding is set to
+    [VERY_LARGE_NUM, VERY_LARGE_NUM], instances in all batches will be padded
+    to the same shape [VERY_LARGE_NUM, VERY_LARGE_NUM].
+
+    :param batch_reader: Input batch reader.
+    :type batch_reader: callable
+    :param padding: Padding pattern. See the description above for details.
+    :type padding: list
+    :param flatten: Flatten each padded instance to one dimension if set True.
+    :type flatten: bool
+    :return: Batch reader function.
+    :rtype: callable
+    """
     def padding_batch(batch):
         new_batch = []
         # get target shape within batch

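The padding semantics described in the padding_batch_reader docstring can be illustrated with a short numpy sketch (an illustration of the documented behavior, not the repository's actual code; zero-padding and the (spectrogram, text) instance layout are assumptions):

import numpy as np

def padding_batch_sketch(batch, padding=[-1, -1], flatten=True):
    # target shape: elementwise max over all instance shapes and 'padding'
    target_h = max([spec.shape[0] for spec, _ in batch] + [padding[0]])
    target_w = max([spec.shape[1] for spec, _ in batch] + [padding[1]])
    new_batch = []
    for spec, text in batch:
        padded = np.zeros((target_h, target_w), dtype=spec.dtype)
        padded[:spec.shape[0], :spec.shape[1]] = spec
        if flatten:
            padded = padded.flatten()
        new_batch.append((padded, text))
    return new_batch

# With padding=[-1, -1] the target shape is simply the per-batch maximum;
# with a very large padding, every batch is padded to the same fixed shape.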
@@ -1,14 +1,21 @@
+"""
+Inference for a simplified version of the Baidu DeepSpeech2 model.
+"""
 import paddle.v2 as paddle
-import audio_data_utils
+from itertools import groupby
 import argparse
-from model import deep_speech2
 import gzip
-from itertools import groupby
+import audio_data_utils
+from model import deep_speech2

 parser = argparse.ArgumentParser(
-    description='Simpled version of DeepSpeech2 inference.')
+    description='Simplified version of DeepSpeech2 inference.')
 parser.add_argument(
-    "--num_samples", default=10, type=int, help="Number of inference samples.")
+    "--num_samples",
+    default=10,
+    type=int,
+    help="Number of samples for inference.")
 parser.add_argument(
     "--num_conv_layers", default=2, type=int, help="Convolution layer number.")
 parser.add_argument(
@@ -21,13 +28,21 @@ args = parser.parse_args()


 def remove_duplicate_and_blank(id_list, blank_id):
+    """
+    Postprocessing for the max-ctc-decoder:
+    - remove consecutive duplicate tokens;
+    - remove blanks.
+    """
     # remove consecutive duplicate tokens
     id_list = [x[0] for x in groupby(id_list)]
-    # remove blank
+    # remove blanks
     return [id for id in id_list if id != blank_id]


 def max_infer():
+    """
+    Max-ctc-decoding for DeepSpeech2.
+    """
     # create network config
     _, vocab_list = audio_data_utils.get_vocabulary()
     dict_size = len(vocab_list)
@@ -64,7 +79,7 @@ def max_infer():
         padding=[-1, 1000])
     infer_data = test_batch_reader().next()

-    # run inference
+    # run max-ctc-decoding
     max_id_results = paddle.infer(
         output_layer=max_id,
         parameters=parameters,

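The max-ctc-decoding postprocessing added above can be checked with a toy example (token ids are made up; the blank is the last index, matching blank=dict_size in the network config):

from itertools import groupby

def remove_duplicate_and_blank(id_list, blank_id):
    id_list = [x[0] for x in groupby(id_list)]        # collapse repeats
    return [id for id in id_list if id != blank_id]   # drop blanks

# per-timestep argmax ids:      [1, 1, 5, 1, 2, 2, 5, 5, 3]
# after collapsing repeats:     [1, 5, 1, 2, 5, 3]
# after removing blanks (id 5): [1, 1, 2, 3]
print(remove_duplicate_and_blank([1, 1, 5, 1, 2, 2, 5, 5, 3], blank_id=5))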
@@ -1,3 +1,11 @@
+"""
+Download, unpack and create a manifest for the LibriSpeech dataset.
+
+A manifest is a JSON file with each line containing one audio clip's
+filepath, its transcription text string, and its duration. It serves as a
+unified interface to organize different data sets.
+"""
 import paddle.v2 as paddle
 import os
 import wget
@@ -88,9 +96,10 @@ def main():
         url=URL_DEV,
         target_dir=os.path.join(args.target_dir),
         manifest_path=args.manifest + ".dev")
-    #prepare_dataset(url=URL_TRAIN,
-    #target_dir=os.path.join(args.target_dir),
-    #manifest_path=args.manifest + ".train")
+    prepare_dataset(
+        url=URL_TRAIN,
+        target_dir=os.path.join(args.target_dir),
+        manifest_path=args.manifest + ".train")


 if __name__ == '__main__':

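Based on the manifest description in the new module docstring, one line of the manifest might look like the sketch below (the exact field names are assumptions; the diff only states that each line carries a filepath, a transcription, and a duration):

import json

# one JSON object per line of the manifest file
entry = {
    "audio_filepath": "path/to/clip.flac",  # hypothetical path
    "text": "some transcription text",
    "duration": 3.27                        # seconds
}
line = json.dumps(entry)

# reading a manifest back, line by line:
# entries = [json.loads(l) for l in open(manifest_path)]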
@@ -1,8 +1,17 @@
+"""
+A simplified version of the Baidu DeepSpeech2 model.
+"""
 import paddle.v2 as paddle

-#TODO: add bidirectional rnn.


 def conv_bn_layer(input, filter_size, num_channels_in, num_channels_out, stride,
                   padding, act):
+    """
+    Convolution layer with batch normalization.
+    """
     conv_layer = paddle.layer.img_conv(
         input=input,
         filter_size=filter_size,
@@ -16,6 +25,15 @@ def conv_bn_layer(input, filter_size, num_channels_in, num_channels_out, stride,


 def bidirectonal_simple_rnn_bn_layer(name, input, size, act):
+    """
+    Bidirectional simple RNN layer with batch normalization.
+    The batch normalization is only performed on the input-state projection
+    (sequence-wise normalization).
+
+    Question: are the mean and variance statistics computed over the whole
+    sequence, or at each individual time step?
+    """
     def __simple_rnn_step__(input):
         last_state = paddle.layer.memory(name=name + "_state", size=size)
         input_fc = paddle.layer.fc(
@@ -23,6 +41,7 @@ def bidirectonal_simple_rnn_bn_layer(name, input, size, act):
             size=size,
             act=paddle.activation.Linear(),
             bias_attr=False)
+        # batch norm is only performed on the input-state projection
         input_fc_bn = paddle.layer.batch_norm(
             input=input_fc, act=paddle.activation.Linear())
         state_fc = paddle.layer.fc(
@@ -47,6 +66,9 @@ def bidirectonal_simple_rnn_bn_layer(name, input, size, act):


 def conv_group(input, num_stacks):
+    """
+    Convolution group with several stacked convolution layers.
+    """
     conv = conv_bn_layer(
         input=input,
         filter_size=(11, 41),
@@ -68,6 +90,9 @@ def conv_group(input, num_stacks):


 def rnn_group(input, size, num_stacks):
+    """
+    RNN group with several stacked RNN layers.
+    """
     output = input
     for i in xrange(num_stacks):
         output = bidirectonal_simple_rnn_bn_layer(
@@ -81,7 +106,27 @@ def deep_speech2(audio_data,
                  num_conv_layers=2,
                  num_rnn_layers=3,
                  rnn_size=256):
+    """
+    The whole DeepSpeech2 model structure (a simplified version).
+
+    :param audio_data: Audio spectrogram data layer.
+    :type audio_data: LayerOutput
+    :param text_data: Transcription text data layer.
+    :type text_data: LayerOutput
+    :param dict_size: Dictionary size of the tokenized transcription.
+    :type dict_size: int
+    :param num_conv_layers: Number of stacked convolution layers.
+    :type num_conv_layers: int
+    :param num_rnn_layers: Number of stacked RNN layers.
+    :type num_rnn_layers: int
+    :param rnn_size: RNN layer size (number of RNN cells).
+    :type rnn_size: int
+    :return: Tuple of the cost layer and the max_id decoder layer.
+    :rtype: tuple of LayerOutput
+    """
+    # convolution group
     conv_group_output = conv_group(input=audio_data, num_stacks=num_conv_layers)
+    # convert data from convolution feature map to sequence of vectors
     conv2seq = paddle.layer.block_expand(
         input=conv_group_output,
         num_channels=32,
@@ -89,18 +134,22 @@ def deep_speech2(audio_data,
         stride_y=1,
         block_x=1,
         block_y=21)
+    # rnn group
     rnn_group_output = rnn_group(
         input=conv2seq, size=rnn_size, num_stacks=num_rnn_layers)
+    # output token distribution
     fc = paddle.layer.fc(
         input=rnn_group_output,
         size=dict_size + 1,
         act=paddle.activation.Linear(),
         bias_attr=True)
+    # ctc cost
     cost = paddle.layer.warp_ctc(
         input=fc,
         label=text_data,
         size=dict_size + 1,
         blank=dict_size,
         norm_by_times=True)
+    # max decoder
     max_id = paddle.layer.max_id(input=fc)
     return cost, max_id

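The question raised in the bidirectonal_simple_rnn_bn_layer docstring (whole-sequence vs. per-time-step statistics) can be made concrete with a small numpy sketch; this only contrasts the two options and does not claim which one paddle.layer.batch_norm implements:

import numpy as np

# input-state projections for a batch: (batch, time, size)
x = np.random.rand(4, 50, 256)

# option A: sequence-wise normalization -- one mean/variance per feature,
# pooled over both the batch and time axes
mean_seq = x.mean(axis=(0, 1))   # shape: (256,)
var_seq = x.var(axis=(0, 1))

# option B: per-time-step normalization -- separate statistics at each step
mean_step = x.mean(axis=0)       # shape: (50, 256)
var_step = x.var(axis=0)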
@@ -1,20 +1,27 @@
+"""
+Trainer for a simplified version of the Baidu DeepSpeech2 model.
+"""
 import paddle.v2 as paddle
-import audio_data_utils
 import argparse
-from model import deep_speech2
 import gzip
+import sys
+from model import deep_speech2
+import audio_data_utils
+
+#TODO: add WER metric

 parser = argparse.ArgumentParser(
-    description='Simpled version of DeepSpeech2 trainer.')
+    description='Simplified version of DeepSpeech2 trainer.')
 parser.add_argument(
     "--batch_size", default=512, type=int, help="Minibatch size.")
 parser.add_argument("--trainer", default=1, type=int, help="Trainer number.")
 parser.add_argument(
     "--num_passes", default=20, type=int, help="Training pass number.")
 parser.add_argument(
-    "--num_conv_layers", default=2, type=int, help="Convolution layer number.")
+    "--num_conv_layers", default=3, type=int, help="Convolution layer number.")
 parser.add_argument(
-    "--num_rnn_layers", default=3, type=int, help="RNN layer number.")
+    "--num_rnn_layers", default=5, type=int, help="RNN layer number.")
 parser.add_argument(
     "--rnn_layer_size", default=256, type=int, help="RNN layer cell number.")
 parser.add_argument(
@@ -25,6 +32,9 @@ args = parser.parse_args()


 def train():
+    """
+    DeepSpeech2 training.
+    """
     # create network config
     dict_size = audio_data_utils.get_vocabulary_size()
     audio_data = paddle.layer.data(
@@ -89,8 +99,7 @@ def train():
             sys.stdout.flush()
         if isinstance(event, paddle.event.EndPass):
             result = trainer.test(reader=test_batch_reader, feeding=feeding)
-            print "Pass: %d, TestCost: %f, %s" % (event.pass_id, event.cost,
-                                                  result.metrics)
+            print "Pass: %d, TestMetric: %s" % (event.pass_id, result.metrics)
             with gzip.open("params.tar.gz", 'w') as f:
                 parameters.to_tar(f)

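For the new "#TODO: add WER metric" in the trainer, a standard word-level edit distance would do; a minimal sketch, not part of this commit:

def wer(reference, hypothesis):
    """Word error rate: word-level edit distance / reference length."""
    ref, hyp = reference.split(), hypothesis.split()
    # Levenshtein distance over words via dynamic programming
    dist = [[0] * (len(hyp) + 1) for _ in range(len(ref) + 1)]
    for i in range(len(ref) + 1):
        dist[i][0] = i
    for j in range(len(hyp) + 1):
        dist[0][j] = j
    for i in range(1, len(ref) + 1):
        for j in range(1, len(hyp) + 1):
            cost = 0 if ref[i - 1] == hyp[j - 1] else 1
            dist[i][j] = min(dist[i - 1][j] + 1,          # deletion
                             dist[i][j - 1] + 1,          # insertion
                             dist[i - 1][j - 1] + cost)   # substitution
    return float(dist[len(ref)][len(hyp)]) / len(ref)

# wer("the cat sat", "the cat sit") == 1.0 / 3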