diff --git a/README.md b/README.md
index fcadf568..1f7e0384 100644
--- a/README.md
+++ b/README.md
@@ -5,3 +5,5 @@ sh requirements.sh
 python librispeech.py
 python train.py
 ```
+
+Please add the warp-ctc library path (usually $PADDLE_INSTALL_DIR/Paddle/third_party/install/warpctc/lib) to LD_LIBRARY_PATH.
diff --git a/audio_data_utils.py b/audio_data_utils.py
index 2f7bfcf7..6dedfbf9 100644
--- a/audio_data_utils.py
+++ b/audio_data_utils.py
@@ -90,6 +90,10 @@ def get_vocabulary_size():
     return len(vocab_dict)
 
 
+def get_vocabulary():
+    return vocabulary_from_file(ENGLISH_CHAR_VOCAB_FILEPATH)
+
+
 def parse_transcript(text, vocabulary):
     """
     Convert the transcript text string to list of token index integers..
diff --git a/infer.py b/infer.py
new file mode 100644
index 00000000..7b16c838
--- /dev/null
+++ b/infer.py
@@ -0,0 +1,94 @@
+import paddle.v2 as paddle
+import audio_data_utils
+import argparse
+from model import deep_speech2
+import gzip
+from itertools import groupby
+
+parser = argparse.ArgumentParser(
+    description='Simplified version of DeepSpeech2 inference.')
+parser.add_argument(
+    "--num_samples", default=10, type=int, help="Number of inference samples.")
+parser.add_argument(
+    "--num_conv_layers", default=2, type=int, help="Convolution layer number.")
+parser.add_argument(
+    "--num_rnn_layers", default=3, type=int, help="RNN layer number.")
+parser.add_argument(
+    "--rnn_layer_size",
+    default=256,
+    type=int,
+    help="RNN layer cell number, must match the trained model.")
+parser.add_argument(
+    "--use_gpu", default=True, type=bool, help="Use GPU or not.")
+args = parser.parse_args()
+
+
+def remove_duplicate_and_blank(id_list, blank_id):
+    # remove consecutive duplicate tokens
+    id_list = [x[0] for x in groupby(id_list)]
+    # remove blank
+    return [id for id in id_list if id != blank_id]
+
+
+def max_infer():
+    # create network config
+    _, vocab_list = audio_data_utils.get_vocabulary()
+    dict_size = len(vocab_list)
+    audio_data = paddle.layer.data(
+        name="audio_spectrogram",
+        height=161,
+        width=1000,
+        type=paddle.data_type.dense_vector(161000))
+    text_data = paddle.layer.data(
+        name="transcript_text",
+        type=paddle.data_type.integer_value_sequence(dict_size))
+    _, max_id = deep_speech2(
+        audio_data=audio_data,
+        text_data=text_data,
+        dict_size=dict_size,
+        num_conv_layers=args.num_conv_layers,
+        num_rnn_layers=args.num_rnn_layers,
+        rnn_size=args.rnn_layer_size)
+
+    # load parameters
+    parameters = paddle.parameters.Parameters.from_tar(
+        gzip.open("params.tar.gz"))
+
+    # prepare infer data
+    feeding = {
+        "audio_spectrogram": 0,
+        "transcript_text": 1,
+    }
+    test_batch_reader = audio_data_utils.padding_batch_reader(
+        paddle.batch(
+            audio_data_utils.reader_creator(
+                manifest_path="./libri.manifest.test", sort_by_duration=False),
+            batch_size=args.num_samples),
+        padding=[-1, 1000])
+    infer_data = test_batch_reader().next()
+
+    # run inference
+    max_id_results = paddle.infer(
+        output_layer=max_id,
+        parameters=parameters,
+        input=infer_data,
+        field=['id'])
+
+    # postprocess: all instances are padded to the same length, so slice the
+    # flat id array into equal-length per-instance chunks
+    instance_length = len(max_id_results) / args.num_samples
+    instance_list = [
+        max_id_results[i:i + instance_length]
+        for i in xrange(0, len(max_id_results), instance_length)
+    ]
+    for i, instance in enumerate(instance_list):
+        id_list = remove_duplicate_and_blank(instance, dict_size)
+        output_transcript = ''.join([vocab_list[id] for id in id_list])
+        target_transcript = ''.join([vocab_list[id] for id in infer_data[i][1]])
+        print("Target Transcript: %s \nOutput Transcript: %s \n" %
+              (target_transcript, output_transcript))
+
+
+def main():
+    paddle.init(use_gpu=args.use_gpu, trainer_count=1)
+    max_infer()
+
+
+if __name__ == '__main__':
+    main()
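As a note on what `max_infer` produces: the network emits one argmax token id per time step, and the postprocessing above is best-path (greedy) CTC decoding, which collapses consecutive repeats and then drops blanks. A minimal standalone sketch; the toy vocabulary and frame ids are made up for illustration, and the helper mirrors `remove_duplicate_and_blank` from infer.py:

```python
from itertools import groupby

def remove_duplicate_and_blank(id_list, blank_id):
    # collapse runs of repeated ids, then drop the CTC blank
    id_list = [x[0] for x in groupby(id_list)]
    return [id for id in id_list if id != blank_id]

vocab_list = ['a', 'b', 'c']   # toy vocabulary
blank_id = len(vocab_list)     # blank is the extra last class, id 3

frame_ids = [0, 0, 3, 1, 1, 3, 3, 1, 2]  # per-frame argmax ids
decoded = remove_duplicate_and_blank(frame_ids, blank_id)
print(''.join(vocab_list[i] for i in decoded))  # prints "abbc"
```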
diff --git a/librispeech.py b/librispeech.py
index fc7b9822..0d82e19f 100644
--- a/librispeech.py
+++ b/librispeech.py
@@ -23,7 +23,7 @@ parser.add_argument(
     "--manifest",
     default="./libri.manifest",
     type=str,
-    help="Filepath prefix of output manifests.")
+    help="Filepath prefix for output manifests.")
 args = parser.parse_args()
diff --git a/model.py b/model.py
new file mode 100644
index 00000000..67bee5f7
--- /dev/null
+++ b/model.py
@@ -0,0 +1,106 @@
+import paddle.v2 as paddle
+
+
+def conv_bn_layer(input, filter_size, num_channels_in, num_channels_out, stride,
+                  padding, act):
+    # convolution without bias or activation, followed by batch norm + act
+    conv_layer = paddle.layer.img_conv(
+        input=input,
+        filter_size=filter_size,
+        num_channels=num_channels_in,
+        num_filters=num_channels_out,
+        stride=stride,
+        padding=padding,
+        act=paddle.activation.Linear(),
+        bias_attr=False)
+    return paddle.layer.batch_norm(input=conv_layer, act=act)
+
+
+def bidirectional_simple_rnn_bn_layer(name, input, size, act):
+    def __simple_rnn_step__(input):
+        last_state = paddle.layer.memory(name=name + "_state", size=size)
+        input_fc = paddle.layer.fc(
+            input=input,
+            size=size,
+            act=paddle.activation.Linear(),
+            bias_attr=False)
+        # batch norm is applied to the input-to-hidden projection only
+        input_fc_bn = paddle.layer.batch_norm(
+            input=input_fc, act=paddle.activation.Linear())
+        state_fc = paddle.layer.fc(
+            input=last_state,
+            size=size,
+            act=paddle.activation.Linear(),
+            bias_attr=False)
+        return paddle.layer.addto(
+            name=name + "_state", input=[input_fc_bn, state_fc], act=act)
+
+    forward = paddle.layer.recurrent_group(
+        step=__simple_rnn_step__, input=input)
+    return forward
+    # TODO: the `reverse` argument is not exposed in the V2 recurrent_group
+    # API yet, so only the forward direction is built. Once it is exposed:
+    # backward = paddle.layer.recurrent_group(
+    #     step=__simple_rnn_step__, input=input, reverse=True)
+    # return paddle.layer.concat(input=[forward, backward])
+
+
+def conv_group(input, num_stacks):
+    conv = conv_bn_layer(
+        input=input,
+        filter_size=(11, 41),
+        num_channels_in=1,
+        num_channels_out=32,
+        stride=(3, 2),
+        padding=(5, 20),
+        act=paddle.activation.BRelu())
+    for i in xrange(num_stacks - 1):
+        conv = conv_bn_layer(
+            input=conv,
+            filter_size=(11, 21),
+            num_channels_in=32,
+            num_channels_out=32,
+            stride=(1, 2),
+            padding=(5, 10),
+            act=paddle.activation.BRelu())
+    return conv
+
+
+def rnn_group(input, size, num_stacks):
+    output = input
+    for i in xrange(num_stacks):
+        output = bidirectional_simple_rnn_bn_layer(
+            name=str(i), input=output, size=size, act=paddle.activation.BRelu())
+    return output
+
+
+def deep_speech2(audio_data,
+                 text_data,
+                 dict_size,
+                 num_conv_layers=2,
+                 num_rnn_layers=3,
+                 rnn_size=256):
+    conv_group_output = conv_group(input=audio_data, num_stacks=num_conv_layers)
+    # convert the 2-D conv feature map into a time sequence for the RNNs
+    conv2seq = paddle.layer.block_expand(
+        input=conv_group_output,
+        num_channels=32,
+        stride_x=1,
+        stride_y=1,
+        block_x=1,
+        block_y=21)
+    rnn_group_output = rnn_group(
+        input=conv2seq, size=rnn_size, num_stacks=num_rnn_layers)
+    fc = paddle.layer.fc(
+        input=rnn_group_output,
+        size=dict_size + 1,
+        act=paddle.activation.Linear(),
+        bias_attr=True)
+    # CTC cost with one extra class for the blank, which takes the last index
+    cost = paddle.layer.warp_ctc(
+        input=fc,
+        label=text_data,
+        size=dict_size + 1,
+        blank=dict_size,
+        norm_by_times=True)
+    max_id = paddle.layer.max_id(input=fc)
+    return cost, max_id
diff --git a/requirements.sh b/requirements.sh
index 7a089169..bb1f261d 100644
--- a/requirements.sh
+++ b/requirements.sh
@@ -1,5 +1,5 @@
 pip install wget
 pip install soundfile
 
-# For Linux only
+# For Ubuntu only
 apt-get install libsndfile1
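For intuition, each `__simple_rnn_step__` in model.py above computes roughly `h_t = act(BatchNorm(W_in x_t) + W_rec h_(t-1))`: batch norm sits on the input projection only, and the recurrent projection is added afterwards. Below is a plain numpy sketch of the same recurrence, with batch norm omitted and `tanh` standing in for the model's `BRelu`; all shapes and weights are illustrative:

```python
import numpy as np

def simple_rnn(inputs, w_in, w_rec, act=np.tanh):
    """inputs: (time, input_dim) -> outputs: (time, size); batch norm omitted."""
    h = np.zeros(w_rec.shape[0])
    outputs = []
    for x_t in inputs:
        # input projection + recurrent projection, then activation,
        # mirroring input_fc(_bn) + state_fc -> addto -> act in model.py
        h = act(w_in.dot(x_t) + w_rec.dot(h))
        outputs.append(h)
    return np.stack(outputs)

# illustrative shapes: 5 time steps, 8 input features, hidden size 4
states = simple_rnn(np.random.randn(5, 8),
                    np.random.randn(4, 8), np.random.randn(4, 4))
```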
diff --git a/train.py b/train.py
index 083a718d..64be4033 100644
--- a/train.py
+++ b/train.py
@@ -1,6 +1,8 @@
 import paddle.v2 as paddle
 import audio_data_utils
 import argparse
+from model import deep_speech2
+import gzip
 
 parser = argparse.ArgumentParser(
     description='Simpled version of DeepSpeech2 trainer.')
@@ -9,114 +11,19 @@ parser.add_argument(
 parser.add_argument("--trainer", default=1, type=int, help="Trainer number.")
 parser.add_argument(
     "--num_passes", default=20, type=int, help="Training pass number.")
+parser.add_argument(
+    "--num_conv_layers", default=2, type=int, help="Convolution layer number.")
+parser.add_argument(
+    "--num_rnn_layers", default=3, type=int, help="RNN layer number.")
+parser.add_argument(
+    "--rnn_layer_size", default=256, type=int, help="RNN layer cell number.")
+parser.add_argument(
+    "--use_gpu", default=True, type=bool, help="Use GPU or not.")
+parser.add_argument(
+    "--trainer_count", default=8, type=int, help="Trainer count (threads or GPUs).")
 args = parser.parse_args()
 
 
-def conv_bn_layer(input, filter_size, num_channels_in, num_channels_out, stride,
-                  padding, act):
-    conv_layer = paddle.layer.img_conv(
-        input=input,
-        filter_size=filter_size,
-        num_channels=num_channels_in,
-        num_filters=num_channels_out,
-        stride=stride,
-        padding=padding,
-        act=paddle.activation.Linear(),
-        bias_attr=False)
-    return paddle.layer.batch_norm(input=conv_layer, act=act)
-
-
-def bidirectonal_simple_rnn_bn_layer(name, input, size, act):
-    def __simple_rnn_step__(input):
-        last_state = paddle.layer.memory(name=name + "_state", size=size)
-        input_fc = paddle.layer.fc(
-            input=input,
-            size=size,
-            act=paddle.activation.Linear(),
-            bias_attr=False)
-        input_fc_bn = paddle.layer.batch_norm(
-            input=input_fc, act=paddle.activation.Linear())
-        state_fc = paddle.layer.fc(
-            input=last_state,
-            size=size,
-            act=paddle.activation.Linear(),
-            bias_attr=False)
-        return paddle.layer.addto(
-            name=name + "_state", input=[input_fc_bn, state_fc], act=act)
-
-    forward = paddle.layer.recurrent_group(
-        step=__simple_rnn_step__, input=input)
-    return forward
-    # argument reverse is not exposed in V2 recurrent_group
-    #backward = paddle.layer.recurrent_group(
-
-
-#step=__simple_rnn_step__,
-#input=input,
-#reverse=True)
-#return paddle.layer.concat(input=[forward, backward])
-
-
-def conv_group(input):
-    conv1 = conv_bn_layer(
-        input=input,
-        filter_size=(11, 41),
-        num_channels_in=1,
-        num_channels_out=32,
-        stride=(3, 2),
-        padding=(5, 20),
-        act=paddle.activation.BRelu())
-    conv2 = conv_bn_layer(
-        input=conv1,
-        filter_size=(11, 21),
-        num_channels_in=32,
-        num_channels_out=32,
-        stride=(1, 2),
-        padding=(5, 10),
-        act=paddle.activation.BRelu())
-    conv3 = conv_bn_layer(
-        input=conv2,
-        filter_size=(11, 21),
-        num_channels_in=32,
-        num_channels_out=32,
-        stride=(1, 2),
-        padding=(5, 10),
-        act=paddle.activation.BRelu())
-    return conv3
-
-
-def rnn_group(input, size, num_stacks):
-    output = input
-    for i in xrange(num_stacks):
-        output = bidirectonal_simple_rnn_bn_layer(
-            name=str(i), input=output, size=size, act=paddle.activation.BRelu())
-    return output
-
-
-def deep_speech2(audio_data, text_data, dict_size):
-    conv_group_output = conv_group(input=audio_data)
-    conv2seq = paddle.layer.block_expand(
-        input=conv_group_output,
-        num_channels=32,
-        stride_x=1,
-        stride_y=1,
-        block_x=1,
-        block_y=21)
-    rnn_group_output = rnn_group(input=conv2seq, size=256, num_stacks=5)
-    fc = paddle.layer.fc(
-        input=rnn_group_output,
-        size=dict_size + 1,
-        act=paddle.activation.Linear(),
-        bias_attr=True)
-    cost = paddle.layer.warp_ctc(
-        input=fc,
-        label=text_data,
-        size=dict_size + 1,
-        blank=dict_size,
-        norm_by_times=True)
-    return cost
-
-
 def train():
     # create network config
     dict_size = audio_data_utils.get_vocabulary_size()
@@ -128,7 +35,13 @@ def train():
     text_data = paddle.layer.data(
         name="transcript_text",
         type=paddle.data_type.integer_value_sequence(dict_size))
-    cost = deep_speech2(audio_data, text_data, dict_size)
+    cost, _ = deep_speech2(
+        audio_data=audio_data,
+        text_data=text_data,
+        dict_size=dict_size,
+        num_conv_layers=args.num_conv_layers,
+        num_rnn_layers=args.num_rnn_layers,
+        rnn_size=args.rnn_layer_size)
 
     # create parameters and optimizer
     parameters = paddle.parameters.create(cost)
@@ -138,21 +51,30 @@ def train():
         regularization=paddle.optimizer.L2Regularization(rate=8e-4))
     trainer = paddle.trainer.SGD(
         cost=cost, parameters=parameters, update_equation=optimizer)
-    return
 
     # create data readers
     feeding = {
         "audio_spectrogram": 0,
         "transcript_text": 1,
     }
-    train_batch_reader = audio_data_utils.padding_batch_reader(
+    train_batch_reader_with_sortagrad = audio_data_utils.padding_batch_reader(
         paddle.batch(
-            audio_data_utils.reader_creator("./libri.manifest.dev"),
+            audio_data_utils.reader_creator(
+                manifest_path="./libri.manifest.dev", sort_by_duration=True),
+            batch_size=args.batch_size // args.trainer),
+        padding=[-1, 1000])
+    train_batch_reader_without_sortagrad = audio_data_utils.padding_batch_reader(
+        paddle.batch(
+            audio_data_utils.reader_creator(
+                manifest_path="./libri.manifest.dev",
+                sort_by_duration=False,
+                shuffle=True),
             batch_size=args.batch_size // args.trainer),
         padding=[-1, 1000])
     test_batch_reader = audio_data_utils.padding_batch_reader(
         paddle.batch(
-            audio_data_utils.reader_creator("./libri.manifest.test"),
+            audio_data_utils.reader_creator(
+                manifest_path="./libri.manifest.test", sort_by_duration=False),
             batch_size=args.batch_size // args.trainer),
         padding=[-1, 1000])
@@ -174,13 +96,19 @@ def train():
 
     # run train
     trainer.train(
-        reader=train_batch_reader,
+        reader=train_batch_reader_with_sortagrad,
+        event_handler=event_handler,
+        num_passes=1,
+        feeding=feeding)
+    trainer.train(
+        reader=train_batch_reader_without_sortagrad,
        event_handler=event_handler,
-        num_passes=10,
+        num_passes=args.num_passes - 1,
         feeding=feeding)
 
 
 def main():
+    paddle.init(use_gpu=args.use_gpu, trainer_count=args.trainer_count)
     train()
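The two consecutive `trainer.train` calls implement the SortaGrad curriculum from the Deep Speech 2 paper: the first pass feeds utterances sorted by duration (shortest, easiest clips first), and the remaining `num_passes - 1` passes feed shuffled data. A minimal pure-Python sketch of that batching schedule; the `duration` field name on each sample is an assumption for illustration:

```python
import random

def sortagrad_batches(samples, batch_size, epoch):
    # epoch 0: short utterances first, as a warm-up curriculum;
    # later epochs: plain shuffling, matching the two readers above
    if epoch == 0:
        ordered = sorted(samples, key=lambda s: s['duration'])
    else:
        ordered = random.sample(samples, len(samples))
    for i in range(0, len(ordered), batch_size):
        yield ordered[i:i + batch_size]
```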