Merge pull request #183 from xinghai-sun/refine_decoder2

Simplify train.py, evaluate.py, infer.py and tune.py by adding DeepSpeech2Model class for DS2.
pull/2/head
Xinghai Sun 8 years ago committed by GitHub
commit a3807d9cb5

@ -205,9 +205,9 @@ def ctc_beam_search_decoder_batch(probs_split,
:type num_processes: int :type num_processes: int
:param cutoff_prob: Cutoff probability in pruning, :param cutoff_prob: Cutoff probability in pruning,
default 1.0, no pruning. default 1.0, no pruning.
:type cutoff_prob: float
:param num_processes: Number of parallel processes. :param num_processes: Number of parallel processes.
:type num_processes: int :type num_processes: int
:type cutoff_prob: float
:param ext_scoring_func: External scoring function for :param ext_scoring_func: External scoring function for
partially decoded sentence, e.g. word count partially decoded sentence, e.g. word count
or language model. or language model.

@ -5,20 +5,24 @@ from __future__ import print_function
import distutils.util import distutils.util
import argparse import argparse
import gzip import multiprocessing
import paddle.v2 as paddle import paddle.v2 as paddle
from data_utils.data import DataGenerator from data_utils.data import DataGenerator
from model import deep_speech2 from model import DeepSpeech2Model
from decoder import *
from lm.lm_scorer import LmScorer
from error_rate import wer from error_rate import wer
import utils
parser = argparse.ArgumentParser(description=__doc__) parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument( parser.add_argument(
"--batch_size", "--batch_size",
default=100, default=128,
type=int, type=int,
help="Minibatch size for evaluation. (default: %(default)s)") help="Minibatch size for evaluation. (default: %(default)s)")
parser.add_argument(
"--trainer_count",
default=8,
type=int,
help="Trainer number. (default: %(default)s)")
parser.add_argument( parser.add_argument(
"--num_conv_layers", "--num_conv_layers",
default=2, default=2,
@ -58,8 +62,8 @@ parser.add_argument(
"--decode_method", "--decode_method",
default='beam_search', default='beam_search',
type=str, type=str,
help="Method for ctc decoding, best_path or beam_search. (default: %(default)s)" help="Method for ctc decoding, best_path or beam_search. "
) "(default: %(default)s)")
parser.add_argument( parser.add_argument(
"--language_model_path", "--language_model_path",
default="lm/data/common_crawl_00.prune01111.trie.klm", default="lm/data/common_crawl_00.prune01111.trie.klm",
@ -67,12 +71,12 @@ parser.add_argument(
help="Path for language model. (default: %(default)s)") help="Path for language model. (default: %(default)s)")
parser.add_argument( parser.add_argument(
"--alpha", "--alpha",
default=0.26, default=0.36,
type=float, type=float,
help="Parameter associated with language model. (default: %(default)f)") help="Parameter associated with language model. (default: %(default)f)")
parser.add_argument( parser.add_argument(
"--beta", "--beta",
default=0.1, default=0.25,
type=float, type=float,
help="Parameter associated with word count. (default: %(default)f)") help="Parameter associated with word count. (default: %(default)f)")
parser.add_argument( parser.add_argument(
@ -112,37 +116,12 @@ args = parser.parse_args()
def evaluate(): def evaluate():
"""Evaluate on whole test data for DeepSpeech2.""" """Evaluate on whole test data for DeepSpeech2."""
# initialize data generator
data_generator = DataGenerator( data_generator = DataGenerator(
vocab_filepath=args.vocab_filepath, vocab_filepath=args.vocab_filepath,
mean_std_filepath=args.mean_std_filepath, mean_std_filepath=args.mean_std_filepath,
augmentation_config='{}', augmentation_config='{}',
specgram_type=args.specgram_type, specgram_type=args.specgram_type,
num_threads=args.num_threads_data) num_threads=args.num_threads_data)
# create network config
# paddle.data_type.dense_array is used for variable batch input.
# The size 161 * 161 is only an placeholder value and the real shape
# of input batch data will be induced during training.
audio_data = paddle.layer.data(
name="audio_spectrogram", type=paddle.data_type.dense_array(161 * 161))
text_data = paddle.layer.data(
name="transcript_text",
type=paddle.data_type.integer_value_sequence(data_generator.vocab_size))
output_probs = deep_speech2(
audio_data=audio_data,
text_data=text_data,
dict_size=data_generator.vocab_size,
num_conv_layers=args.num_conv_layers,
num_rnn_layers=args.num_rnn_layers,
rnn_size=args.rnn_layer_size,
is_inference=True)
# load parameters
parameters = paddle.parameters.Parameters.from_tar(
gzip.open(args.model_filepath))
# prepare infer data
batch_reader = data_generator.batch_reader_creator( batch_reader = data_generator.batch_reader_creator(
manifest_path=args.decode_manifest_path, manifest_path=args.decode_manifest_path,
batch_size=args.batch_size, batch_size=args.batch_size,
@ -150,61 +129,39 @@ def evaluate():
sortagrad=False, sortagrad=False,
shuffle_method=None) shuffle_method=None)
# define inferer ds2_model = DeepSpeech2Model(
inferer = paddle.inference.Inference( vocab_size=data_generator.vocab_size,
output_layer=output_probs, parameters=parameters) num_conv_layers=args.num_conv_layers,
num_rnn_layers=args.num_rnn_layers,
# initialize external scorer for beam search decoding rnn_layer_size=args.rnn_layer_size,
if args.decode_method == 'beam_search': pretrained_model_path=args.model_filepath)
ext_scorer = LmScorer(args.alpha, args.beta, args.language_model_path)
wer_counter, wer_sum = 0, 0.0 wer_sum, num_ins = 0.0, 0
for infer_data in batch_reader(): for infer_data in batch_reader():
# run inference result_transcripts = ds2_model.infer_batch(
infer_results = inferer.infer(input=infer_data) infer_data=infer_data,
num_steps = len(infer_results) // len(infer_data) decode_method=args.decode_method,
probs_split = [ beam_alpha=args.alpha,
infer_results[i * num_steps:(i + 1) * num_steps] beam_beta=args.beta,
for i in xrange(0, len(infer_data))
]
# target transcription
target_transcription = [
''.join([
data_generator.vocab_list[index] for index in infer_data[i][1]
]) for i, probs in enumerate(probs_split)
]
# decode and print
# best path decode
if args.decode_method == "best_path":
for i, probs in enumerate(probs_split):
output_transcription = ctc_best_path_decoder(
probs_seq=probs, vocabulary=data_generator.vocab_list)
wer_sum += wer(target_transcription[i], output_transcription)
wer_counter += 1
# beam search decode
elif args.decode_method == "beam_search":
# beam search using multiple processes
beam_search_results = ctc_beam_search_decoder_batch(
probs_split=probs_split,
vocabulary=data_generator.vocab_list,
beam_size=args.beam_size, beam_size=args.beam_size,
blank_id=len(data_generator.vocab_list), cutoff_prob=args.cutoff_prob,
num_processes=args.num_processes_beam_search, vocab_list=data_generator.vocab_list,
ext_scoring_func=ext_scorer, language_model_path=args.language_model_path,
cutoff_prob=args.cutoff_prob, ) num_processes=args.num_processes_beam_search)
for i, beam_search_result in enumerate(beam_search_results): target_transcripts = [
wer_sum += wer(target_transcription[i], ''.join([data_generator.vocab_list[token] for token in transcript])
beam_search_result[0][1]) for _, transcript in infer_data
wer_counter += 1 ]
else: for target, result in zip(target_transcripts, result_transcripts):
raise ValueError("Decoding method [%s] is not supported." % wer_sum += wer(target, result)
decode_method) num_ins += 1
print("WER (%d/?) = %f" % (num_ins, wer_sum / num_ins))
print("Final WER = %f" % (wer_sum / wer_counter)) print("Final WER (%d/%d) = %f" % (num_ins, num_ins, wer_sum / num_ins))
def main(): def main():
paddle.init(use_gpu=args.use_gpu, trainer_count=1) utils.print_arguments(args)
paddle.init(use_gpu=args.use_gpu, trainer_count=args.trainer_count)
evaluate() evaluate()

@ -4,14 +4,11 @@ from __future__ import division
from __future__ import print_function from __future__ import print_function
import argparse import argparse
import gzip
import distutils.util import distutils.util
import multiprocessing import multiprocessing
import paddle.v2 as paddle import paddle.v2 as paddle
from data_utils.data import DataGenerator from data_utils.data import DataGenerator
from model import deep_speech2 from model import DeepSpeech2Model
from decoder import *
from lm.lm_scorer import LmScorer
from error_rate import wer from error_rate import wer
import utils import utils
@ -43,7 +40,7 @@ parser.add_argument(
help="Use gpu or not. (default: %(default)s)") help="Use gpu or not. (default: %(default)s)")
parser.add_argument( parser.add_argument(
"--num_threads_data", "--num_threads_data",
default=multiprocessing.cpu_count(), default=1,
type=int, type=int,
help="Number of cpu threads for preprocessing data. (default: %(default)s)") help="Number of cpu threads for preprocessing data. (default: %(default)s)")
parser.add_argument( parser.add_argument(
@ -57,6 +54,11 @@ parser.add_argument(
type=str, type=str,
help="Feature type of audio data: 'linear' (power spectrum)" help="Feature type of audio data: 'linear' (power spectrum)"
" or 'mfcc'. (default: %(default)s)") " or 'mfcc'. (default: %(default)s)")
parser.add_argument(
"--trainer_count",
default=8,
type=int,
help="Trainer number. (default: %(default)s)")
parser.add_argument( parser.add_argument(
"--mean_std_filepath", "--mean_std_filepath",
default='mean_std.npz', default='mean_std.npz',
@ -119,37 +121,12 @@ args = parser.parse_args()
def infer(): def infer():
"""Inference for DeepSpeech2.""" """Inference for DeepSpeech2."""
# initialize data generator
data_generator = DataGenerator( data_generator = DataGenerator(
vocab_filepath=args.vocab_filepath, vocab_filepath=args.vocab_filepath,
mean_std_filepath=args.mean_std_filepath, mean_std_filepath=args.mean_std_filepath,
augmentation_config='{}', augmentation_config='{}',
specgram_type=args.specgram_type, specgram_type=args.specgram_type,
num_threads=args.num_threads_data) num_threads=args.num_threads_data)
# create network config
# paddle.data_type.dense_array is used for variable batch input.
# The size 161 * 161 is only an placeholder value and the real shape
# of input batch data will be induced during training.
audio_data = paddle.layer.data(
name="audio_spectrogram", type=paddle.data_type.dense_array(161 * 161))
text_data = paddle.layer.data(
name="transcript_text",
type=paddle.data_type.integer_value_sequence(data_generator.vocab_size))
output_probs = deep_speech2(
audio_data=audio_data,
text_data=text_data,
dict_size=data_generator.vocab_size,
num_conv_layers=args.num_conv_layers,
num_rnn_layers=args.num_rnn_layers,
rnn_size=args.rnn_layer_size,
is_inference=True)
# load parameters
parameters = paddle.parameters.Parameters.from_tar(
gzip.open(args.model_filepath))
# prepare infer data
batch_reader = data_generator.batch_reader_creator( batch_reader = data_generator.batch_reader_creator(
manifest_path=args.decode_manifest_path, manifest_path=args.decode_manifest_path,
batch_size=args.num_samples, batch_size=args.num_samples,
@ -158,66 +135,36 @@ def infer():
shuffle_method=None) shuffle_method=None)
infer_data = batch_reader().next() infer_data = batch_reader().next()
# run inference ds2_model = DeepSpeech2Model(
infer_results = paddle.infer( vocab_size=data_generator.vocab_size,
output_layer=output_probs, parameters=parameters, input=infer_data) num_conv_layers=args.num_conv_layers,
num_steps = len(infer_results) // len(infer_data) num_rnn_layers=args.num_rnn_layers,
probs_split = [ rnn_layer_size=args.rnn_layer_size,
infer_results[i * num_steps:(i + 1) * num_steps] pretrained_model_path=args.model_filepath)
for i in xrange(len(infer_data)) result_transcripts = ds2_model.infer_batch(
] infer_data=infer_data,
decode_method=args.decode_method,
beam_alpha=args.alpha,
beam_beta=args.beta,
beam_size=args.beam_size,
cutoff_prob=args.cutoff_prob,
vocab_list=data_generator.vocab_list,
language_model_path=args.language_model_path,
num_processes=args.num_processes_beam_search)
# targe transcription target_transcripts = [
target_transcription = [ ''.join([data_generator.vocab_list[token] for token in transcript])
''.join( for _, transcript in infer_data
[data_generator.vocab_list[index] for index in infer_data[i][1]])
for i, probs in enumerate(probs_split)
] ]
for target, result in zip(target_transcripts, result_transcripts):
## decode and print
# best path decode
wer_sum, wer_counter = 0, 0
if args.decode_method == "best_path":
for i, probs in enumerate(probs_split):
best_path_transcription = ctc_best_path_decoder(
probs_seq=probs, vocabulary=data_generator.vocab_list)
print("\nTarget Transcription: %s\nOutput Transcription: %s" % print("\nTarget Transcription: %s\nOutput Transcription: %s" %
(target_transcription[i], best_path_transcription)) (target, result))
wer_cur = wer(target_transcription[i], best_path_transcription) print("Current wer = %f" % wer(target, result))
wer_sum += wer_cur
wer_counter += 1
print("cur wer = %f, average wer = %f" %
(wer_cur, wer_sum / wer_counter))
# beam search decode
elif args.decode_method == "beam_search":
ext_scorer = LmScorer(args.alpha, args.beta, args.language_model_path)
beam_search_batch_results = ctc_beam_search_decoder_batch(
probs_split=probs_split,
vocabulary=data_generator.vocab_list,
beam_size=args.beam_size,
blank_id=len(data_generator.vocab_list),
num_processes=args.num_processes_beam_search,
cutoff_prob=args.cutoff_prob,
ext_scoring_func=ext_scorer, )
for i, beam_search_result in enumerate(beam_search_batch_results):
print("\nTarget Transcription:\t%s" % target_transcription[i])
for index in xrange(args.num_results_per_sample):
result = beam_search_result[index]
#output: index, log prob, beam result
print("Beam %d: %f \t%s" % (index, result[0], result[1]))
wer_cur = wer(target_transcription[i], beam_search_result[0][1])
wer_sum += wer_cur
wer_counter += 1
print("cur wer = %f , average wer = %f" %
(wer_cur, wer_sum / wer_counter))
else:
raise ValueError("Decoding method [%s] is not supported." %
decode_method)
def main(): def main():
utils.print_arguments(args) utils.print_arguments(args)
paddle.init(use_gpu=args.use_gpu, trainer_count=1) paddle.init(use_gpu=args.use_gpu, trainer_count=args.trainer_count)
infer() infer()

@ -0,0 +1,177 @@
"""Contains DeepSpeech2 layers."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle.v2 as paddle
def conv_bn_layer(input, filter_size, num_channels_in, num_channels_out, stride,
padding, act):
"""Convolution layer with batch normalization.
:param input: Input layer.
:type input: LayerOutput
:param filter_size: The x dimension of a filter kernel. Or input a tuple for
two image dimension.
:type filter_size: int|tuple|list
:param num_channels_in: Number of input channels.
:type num_channels_in: int
:type num_channels_out: Number of output channels.
:type num_channels_in: out
:param padding: The x dimension of the padding. Or input a tuple for two
image dimension.
:type padding: int|tuple|list
:param act: Activation type.
:type act: BaseActivation
:return: Batch norm layer after convolution layer.
:rtype: LayerOutput
"""
conv_layer = paddle.layer.img_conv(
input=input,
filter_size=filter_size,
num_channels=num_channels_in,
num_filters=num_channels_out,
stride=stride,
padding=padding,
act=paddle.activation.Linear(),
bias_attr=False)
return paddle.layer.batch_norm(input=conv_layer, act=act)
def bidirectional_simple_rnn_bn_layer(name, input, size, act):
"""Bidirectonal simple rnn layer with sequence-wise batch normalization.
The batch normalization is only performed on input-state weights.
:param name: Name of the layer.
:type name: string
:param input: Input layer.
:type input: LayerOutput
:param size: Number of RNN cells.
:type size: int
:param act: Activation type.
:type act: BaseActivation
:return: Bidirectional simple rnn layer.
:rtype: LayerOutput
"""
# input-hidden weights shared across bi-direcitonal rnn.
input_proj = paddle.layer.fc(
input=input, size=size, act=paddle.activation.Linear(), bias_attr=False)
# batch norm is only performed on input-state projection
input_proj_bn = paddle.layer.batch_norm(
input=input_proj, act=paddle.activation.Linear())
# forward and backward in time
forward_simple_rnn = paddle.layer.recurrent(
input=input_proj_bn, act=act, reverse=False)
backward_simple_rnn = paddle.layer.recurrent(
input=input_proj_bn, act=act, reverse=True)
return paddle.layer.concat(input=[forward_simple_rnn, backward_simple_rnn])
def conv_group(input, num_stacks):
"""Convolution group with stacked convolution layers.
:param input: Input layer.
:type input: LayerOutput
:param num_stacks: Number of stacked convolution layers.
:type num_stacks: int
:return: Output layer of the convolution group.
:rtype: LayerOutput
"""
conv = conv_bn_layer(
input=input,
filter_size=(11, 41),
num_channels_in=1,
num_channels_out=32,
stride=(3, 2),
padding=(5, 20),
act=paddle.activation.BRelu())
for i in xrange(num_stacks - 1):
conv = conv_bn_layer(
input=conv,
filter_size=(11, 21),
num_channels_in=32,
num_channels_out=32,
stride=(1, 2),
padding=(5, 10),
act=paddle.activation.BRelu())
output_num_channels = 32
output_height = 160 // pow(2, num_stacks) + 1
return conv, output_num_channels, output_height
def rnn_group(input, size, num_stacks):
"""RNN group with stacked bidirectional simple RNN layers.
:param input: Input layer.
:type input: LayerOutput
:param size: Number of RNN cells in each layer.
:type size: int
:param num_stacks: Number of stacked rnn layers.
:type num_stacks: int
:return: Output layer of the RNN group.
:rtype: LayerOutput
"""
output = input
for i in xrange(num_stacks):
output = bidirectional_simple_rnn_bn_layer(
name=str(i), input=output, size=size, act=paddle.activation.BRelu())
return output
def deep_speech2(audio_data,
text_data,
dict_size,
num_conv_layers=2,
num_rnn_layers=3,
rnn_size=256):
"""
The whole DeepSpeech2 model structure (a simplified version).
:param audio_data: Audio spectrogram data layer.
:type audio_data: LayerOutput
:param text_data: Transcription text data layer.
:type text_data: LayerOutput
:param dict_size: Dictionary size for tokenized transcription.
:type dict_size: int
:param num_conv_layers: Number of stacking convolution layers.
:type num_conv_layers: int
:param num_rnn_layers: Number of stacking RNN layers.
:type num_rnn_layers: int
:param rnn_size: RNN layer size (number of RNN cells).
:type rnn_size: int
:return: A tuple of an output unnormalized log probability layer (
before softmax) and a ctc cost layer.
:rtype: tuple of LayerOutput
"""
# convolution group
conv_group_output, conv_group_num_channels, conv_group_height = conv_group(
input=audio_data, num_stacks=num_conv_layers)
# convert data form convolution feature map to sequence of vectors
conv2seq = paddle.layer.block_expand(
input=conv_group_output,
num_channels=conv_group_num_channels,
stride_x=1,
stride_y=1,
block_x=1,
block_y=conv_group_height)
# rnn group
rnn_group_output = rnn_group(
input=conv2seq, size=rnn_size, num_stacks=num_rnn_layers)
fc = paddle.layer.fc(
input=rnn_group_output,
size=dict_size + 1,
act=paddle.activation.Linear(),
bias_attr=True)
# probability distribution with softmax
log_probs = paddle.layer.mixed(
input=paddle.layer.identity_projection(input=fc),
act=paddle.activation.Softmax())
# ctc cost
ctc_loss = paddle.layer.warp_ctc(
input=fc,
label=text_data,
size=dict_size + 1,
blank=dict_size,
norm_by_times=True)
return log_probs, ctc_loss

@ -3,141 +3,220 @@ from __future__ import absolute_import
from __future__ import division from __future__ import division
from __future__ import print_function from __future__ import print_function
import sys
import os
import time
import gzip
from decoder import *
from lm.lm_scorer import LmScorer
import paddle.v2 as paddle import paddle.v2 as paddle
from layer import *
def conv_bn_layer(input, filter_size, num_channels_in, num_channels_out, stride, class DeepSpeech2Model(object):
padding, act): """DeepSpeech2Model class.
"""
Convolution layer with batch normalization.
"""
conv_layer = paddle.layer.img_conv(
input=input,
filter_size=filter_size,
num_channels=num_channels_in,
num_filters=num_channels_out,
stride=stride,
padding=padding,
act=paddle.activation.Linear(),
bias_attr=False)
return paddle.layer.batch_norm(input=conv_layer, act=act)
def bidirectional_simple_rnn_bn_layer(name, input, size, act): :param vocab_size: Decoding vocabulary size.
""" :type vocab_size: int
Bidirectonal simple rnn layer with sequence-wise batch normalization. :param num_conv_layers: Number of stacking convolution layers.
The batch normalization is only performed on input-state weights. :type num_conv_layers: int
:param num_rnn_layers: Number of stacking RNN layers.
:type num_rnn_layers: int
:param rnn_layer_size: RNN layer size (number of RNN cells).
:type rnn_layer_size: int
:param pretrained_model_path: Pretrained model path. If None, will train
from stratch.
:type pretrained_model_path: basestring|None
""" """
# input-hidden weights shared across bi-direcitonal rnn.
input_proj = paddle.layer.fc(
input=input, size=size, act=paddle.activation.Linear(), bias_attr=False)
# batch norm is only performed on input-state projection
input_proj_bn = paddle.layer.batch_norm(
input=input_proj, act=paddle.activation.Linear())
# forward and backward in time
forward_simple_rnn = paddle.layer.recurrent(
input=input_proj_bn, act=act, reverse=False)
backward_simple_rnn = paddle.layer.recurrent(
input=input_proj_bn, act=act, reverse=True)
return paddle.layer.concat(input=[forward_simple_rnn, backward_simple_rnn])
def __init__(self, vocab_size, num_conv_layers, num_rnn_layers,
rnn_layer_size, pretrained_model_path):
self._create_network(vocab_size, num_conv_layers, num_rnn_layers,
rnn_layer_size)
self._create_parameters(pretrained_model_path)
self._inferer = None
self._ext_scorer = None
def conv_group(input, num_stacks): def train(self,
""" train_batch_reader,
Convolution group with several stacking convolution layers. dev_batch_reader,
feeding_dict,
learning_rate,
gradient_clipping,
num_passes,
output_model_dir,
num_iterations_print=100):
"""Train the model.
:param train_batch_reader: Train data reader.
:type train_batch_reader: callable
:param dev_batch_reader: Validation data reader.
:type dev_batch_reader: callable
:param feeding_dict: Feeding is a map of field name and tuple index
of the data that reader returns.
:type feeding_dict: dict|list
:param learning_rate: Learning rate for ADAM optimizer.
:type learning_rate: float
:param gradient_clipping: Gradient clipping threshold.
:type gradient_clipping: float
:param num_passes: Number of training epochs.
:type num_passes: int
:param num_iterations_print: Number of training iterations for printing
a training loss.
:type rnn_iteratons_print: int
:param output_model_dir: Directory for saving the model (every pass).
:type output_model_dir: basestring
""" """
conv = conv_bn_layer( # prepare model output directory
input=input, if not os.path.exists(output_model_dir):
filter_size=(11, 41), os.mkdir(output_model_dir)
num_channels_in=1,
num_channels_out=32,
stride=(3, 2),
padding=(5, 20),
act=paddle.activation.BRelu())
for i in xrange(num_stacks - 1):
conv = conv_bn_layer(
input=conv,
filter_size=(11, 21),
num_channels_in=32,
num_channels_out=32,
stride=(1, 2),
padding=(5, 10),
act=paddle.activation.BRelu())
output_num_channels = 32
output_height = 160 // pow(2, num_stacks) + 1
return conv, output_num_channels, output_height
# prepare optimizer and trainer
optimizer = paddle.optimizer.Adam(
learning_rate=learning_rate,
gradient_clipping_threshold=gradient_clipping)
trainer = paddle.trainer.SGD(
cost=self._loss,
parameters=self._parameters,
update_equation=optimizer)
def rnn_group(input, size, num_stacks): # create event handler
""" def event_handler(event):
RNN group with several stacking RNN layers. global start_time, cost_sum, cost_counter
""" if isinstance(event, paddle.event.EndIteration):
output = input cost_sum += event.cost
for i in xrange(num_stacks): cost_counter += 1
output = bidirectional_simple_rnn_bn_layer( if (event.batch_id + 1) % num_iterations_print == 0:
name=str(i), input=output, size=size, act=paddle.activation.BRelu()) output_model_path = os.path.join(output_model_dir,
return output "params.latest.tar.gz")
with gzip.open(output_model_path, 'w') as f:
self._parameters.to_tar(f)
print("\nPass: %d, Batch: %d, TrainCost: %f" %
(event.pass_id, event.batch_id + 1,
cost_sum / cost_counter))
cost_sum, cost_counter = 0.0, 0
else:
sys.stdout.write('.')
sys.stdout.flush()
if isinstance(event, paddle.event.BeginPass):
start_time = time.time()
cost_sum, cost_counter = 0.0, 0
if isinstance(event, paddle.event.EndPass):
result = trainer.test(
reader=dev_batch_reader, feeding=feeding_dict)
output_model_path = os.path.join(
output_model_dir, "params.pass-%d.tar.gz" % event.pass_id)
with gzip.open(output_model_path, 'w') as f:
self._parameters.to_tar(f)
print("\n------- Time: %d sec, Pass: %d, ValidationCost: %s" %
(time.time() - start_time, event.pass_id, result.cost))
# run train
trainer.train(
reader=train_batch_reader,
event_handler=event_handler,
num_passes=num_passes,
feeding=feeding_dict)
def deep_speech2(audio_data, def infer_batch(self, infer_data, decode_method, beam_alpha, beam_beta,
text_data, beam_size, cutoff_prob, vocab_list, language_model_path,
dict_size, num_processes):
num_conv_layers=2, """Model inference. Infer the transcription for a batch of speech
num_rnn_layers=3, utterances.
rnn_size=256,
is_inference=False):
"""
The whole DeepSpeech2 model structure (a simplified version).
:param audio_data: Audio spectrogram data layer. :param infer_data: List of utterances to infer, with each utterance a
:type audio_data: LayerOutput tuple of audio features and transcription text (empty
:param text_data: Transcription text data layer. string).
:type text_data: LayerOutput :type infer_data: list
:param dict_size: Dictionary size for tokenized transcription. :param decode_method: Decoding method name, 'best_path' or
:type dict_size: int 'beam search'.
:param num_conv_layers: Number of stacking convolution layers. :param decode_method: string
:type num_conv_layers: int :param beam_alpha: Parameter associated with language model.
:param num_rnn_layers: Number of stacking RNN layers. :type beam_alpha: float
:type num_rnn_layers: int :param beam_beta: Parameter associated with word count.
:param rnn_size: RNN layer size (number of RNN cells). :type beam_beta: float
:type rnn_size: int :param beam_size: Width for Beam search.
:param is_inference: False in the training mode, and True in the :type beam_size: int
inferene mode. :param cutoff_prob: Cutoff probability in pruning,
:type is_inference: bool default 1.0, no pruning.
:return: If is_inference set False, return a ctc cost layer; :type cutoff_prob: float
if is_inference set True, return a sequence layer of output :param vocab_list: List of tokens in the vocabulary, for decoding.
probability distribution. :type vocab_list: list
:rtype: tuple of LayerOutput :param language_model_path: Filepath for language model.
:type language_model_path: basestring|None
:param num_processes: Number of processes (CPU) for decoder.
:type num_processes: int
:return: List of transcription texts.
:rtype: List of basestring
""" """
# convolution group # define inferer
conv_group_output, conv_group_num_channels, conv_group_height = conv_group( if self._inferer == None:
input=audio_data, num_stacks=num_conv_layers) self._inferer = paddle.inference.Inference(
# convert data form convolution feature map to sequence of vectors output_layer=self._log_probs, parameters=self._parameters)
conv2seq = paddle.layer.block_expand( # run inference
input=conv_group_output, infer_results = self._inferer.infer(input=infer_data)
num_channels=conv_group_num_channels, num_steps = len(infer_results) // len(infer_data)
stride_x=1, probs_split = [
stride_y=1, infer_results[i * num_steps:(i + 1) * num_steps]
block_x=1, for i in xrange(0, len(infer_data))
block_y=conv_group_height) ]
# rnn group # run decoder
rnn_group_output = rnn_group( results = []
input=conv2seq, size=rnn_size, num_stacks=num_rnn_layers) if decode_method == "best_path":
fc = paddle.layer.fc( # best path decode
input=rnn_group_output, for i, probs in enumerate(probs_split):
size=dict_size + 1, output_transcription = ctc_best_path_decoder(
act=paddle.activation.Linear(), probs_seq=probs, vocabulary=data_generator.vocab_list)
bias_attr=True) results.append(output_transcription)
if is_inference: elif decode_method == "beam_search":
# probability distribution with softmax # initialize external scorer
return paddle.layer.mixed( if self._ext_scorer == None:
input=paddle.layer.identity_projection(input=fc), self._ext_scorer = LmScorer(beam_alpha, beam_beta,
act=paddle.activation.Softmax()) language_model_path)
self._loaded_lm_path = language_model_path
else: else:
# ctc cost self._ext_scorer.reset_params(beam_alpha, beam_beta)
return paddle.layer.warp_ctc( assert self._loaded_lm_path == language_model_path
input=fc,
label=text_data, # beam search decode
size=dict_size + 1, beam_search_results = ctc_beam_search_decoder_batch(
blank=dict_size, probs_split=probs_split,
norm_by_times=True) vocabulary=vocab_list,
beam_size=beam_size,
blank_id=len(vocab_list),
num_processes=num_processes,
ext_scoring_func=self._ext_scorer,
cutoff_prob=cutoff_prob)
results = [result[0][1] for result in beam_search_results]
else:
raise ValueError("Decoding method [%s] is not supported." %
decode_method)
return results
def _create_parameters(self, model_path=None):
"""Load or create model parameters."""
if model_path is None:
self._parameters = paddle.parameters.create(self._loss)
else:
self._parameters = paddle.parameters.Parameters.from_tar(
gzip.open(model_path))
def _create_network(self, vocab_size, num_conv_layers, num_rnn_layers,
rnn_layer_size):
"""Create data layers and model network."""
# paddle.data_type.dense_array is used for variable batch input.
# The size 161 * 161 is only an placeholder value and the real shape
# of input batch data will be induced during training.
audio_data = paddle.layer.data(
name="audio_spectrogram",
type=paddle.data_type.dense_array(161 * 161))
text_data = paddle.layer.data(
name="transcript_text",
type=paddle.data_type.integer_value_sequence(vocab_size))
self._log_probs, self._loss = deep_speech2(
audio_data=audio_data,
text_data=text_data,
dict_size=vocab_size,
num_conv_layers=num_conv_layers,
num_rnn_layers=num_rnn_layers,
rnn_size=rnn_layer_size)

@ -26,7 +26,4 @@ if [ $? != 0 ]; then
rm libsndfile-1.0.28.tar.gz rm libsndfile-1.0.28.tar.gz
fi fi
# prepare ./checkpoints
mkdir checkpoints
echo "Install all dependencies successfully." echo "Install all dependencies successfully."

@ -3,15 +3,11 @@ from __future__ import absolute_import
from __future__ import division from __future__ import division
from __future__ import print_function from __future__ import print_function
import sys
import os
import argparse import argparse
import gzip
import time
import distutils.util import distutils.util
import multiprocessing import multiprocessing
import paddle.v2 as paddle import paddle.v2 as paddle
from model import deep_speech2 from model import DeepSpeech2Model
from data_utils.data import DataGenerator from data_utils.data import DataGenerator
import utils import utils
@ -23,6 +19,12 @@ parser.add_argument(
default=200, default=200,
type=int, type=int,
help="Training pass number. (default: %(default)s)") help="Training pass number. (default: %(default)s)")
parser.add_argument(
"--num_iterations_print",
default=100,
type=int,
help="Number of iterations for every train cost printing. "
"(default: %(default)s)")
parser.add_argument( parser.add_argument(
"--num_conv_layers", "--num_conv_layers",
default=2, default=2,
@ -114,6 +116,11 @@ parser.add_argument(
help="If set None, the training will start from scratch. " help="If set None, the training will start from scratch. "
"Otherwise, the training will resume from " "Otherwise, the training will resume from "
"the existing model of this path. (default: %(default)s)") "the existing model of this path. (default: %(default)s)")
parser.add_argument(
"--output_model_dir",
default="./checkpoints",
type=str,
help="Directory for saving models. (default: %(default)s)")
parser.add_argument( parser.add_argument(
"--augmentation_config", "--augmentation_config",
default='[{"type": "shift", ' default='[{"type": "shift", '
@ -127,10 +134,7 @@ args = parser.parse_args()
def train(): def train():
"""DeepSpeech2 training.""" """DeepSpeech2 training."""
train_generator = DataGenerator(
# initialize data generator
def data_generator():
return DataGenerator(
vocab_filepath=args.vocab_filepath, vocab_filepath=args.vocab_filepath,
mean_std_filepath=args.mean_std_filepath, mean_std_filepath=args.mean_std_filepath,
augmentation_config=args.augmentation_config, augmentation_config=args.augmentation_config,
@ -138,89 +142,40 @@ def train():
min_duration=args.min_duration, min_duration=args.min_duration,
specgram_type=args.specgram_type, specgram_type=args.specgram_type,
num_threads=args.num_threads_data) num_threads=args.num_threads_data)
dev_generator = DataGenerator(
train_generator = data_generator() vocab_filepath=args.vocab_filepath,
test_generator = data_generator() mean_std_filepath=args.mean_std_filepath,
augmentation_config="{}",
# create network config specgram_type=args.specgram_type,
# paddle.data_type.dense_array is used for variable batch input. num_threads=args.num_threads_data)
# The size 161 * 161 is only an placeholder value and the real shape
# of input batch data will be induced during training.
audio_data = paddle.layer.data(
name="audio_spectrogram", type=paddle.data_type.dense_array(161 * 161))
text_data = paddle.layer.data(
name="transcript_text",
type=paddle.data_type.integer_value_sequence(
train_generator.vocab_size))
cost = deep_speech2(
audio_data=audio_data,
text_data=text_data,
dict_size=train_generator.vocab_size,
num_conv_layers=args.num_conv_layers,
num_rnn_layers=args.num_rnn_layers,
rnn_size=args.rnn_layer_size,
is_inference=False)
# create/load parameters and optimizer
if args.init_model_path is None:
parameters = paddle.parameters.create(cost)
else:
if not os.path.isfile(args.init_model_path):
raise IOError("Invalid model!")
parameters = paddle.parameters.Parameters.from_tar(
gzip.open(args.init_model_path))
optimizer = paddle.optimizer.Adam(
learning_rate=args.adam_learning_rate, gradient_clipping_threshold=400)
trainer = paddle.trainer.SGD(
cost=cost, parameters=parameters, update_equation=optimizer)
# prepare data reader
train_batch_reader = train_generator.batch_reader_creator( train_batch_reader = train_generator.batch_reader_creator(
manifest_path=args.train_manifest_path, manifest_path=args.train_manifest_path,
batch_size=args.batch_size, batch_size=args.batch_size,
min_batch_size=args.trainer_count, min_batch_size=args.trainer_count,
sortagrad=args.use_sortagrad if args.init_model_path is None else False, sortagrad=args.use_sortagrad if args.init_model_path is None else False,
shuffle_method=args.shuffle_method) shuffle_method=args.shuffle_method)
test_batch_reader = test_generator.batch_reader_creator( dev_batch_reader = dev_generator.batch_reader_creator(
manifest_path=args.dev_manifest_path, manifest_path=args.dev_manifest_path,
batch_size=args.batch_size, batch_size=args.batch_size,
min_batch_size=1, # must be 1, but will have errors. min_batch_size=1, # must be 1, but will have errors.
sortagrad=False, sortagrad=False,
shuffle_method=None) shuffle_method=None)
# create event handler ds2_model = DeepSpeech2Model(
def event_handler(event): vocab_size=train_generator.vocab_size,
global start_time, cost_sum, cost_counter num_conv_layers=args.num_conv_layers,
if isinstance(event, paddle.event.EndIteration): num_rnn_layers=args.num_rnn_layers,
cost_sum += event.cost rnn_layer_size=args.rnn_layer_size,
cost_counter += 1 pretrained_model_path=args.init_model_path)
if (event.batch_id + 1) % 100 == 0: ds2_model.train(
print("\nPass: %d, Batch: %d, TrainCost: %f" % ( train_batch_reader=train_batch_reader,
event.pass_id, event.batch_id + 1, cost_sum / cost_counter)) dev_batch_reader=dev_batch_reader,
cost_sum, cost_counter = 0.0, 0 feeding_dict=train_generator.feeding,
with gzip.open("checkpoints/params.latest.tar.gz", 'w') as f: learning_rate=args.adam_learning_rate,
parameters.to_tar(f) gradient_clipping=400,
else:
sys.stdout.write('.')
sys.stdout.flush()
if isinstance(event, paddle.event.BeginPass):
start_time = time.time()
cost_sum, cost_counter = 0.0, 0
if isinstance(event, paddle.event.EndPass):
result = trainer.test(
reader=test_batch_reader, feeding=test_generator.feeding)
print("\n------- Time: %d sec, Pass: %d, ValidationCost: %s" %
(time.time() - start_time, event.pass_id, result.cost))
with gzip.open("checkpoints/params.pass-%d.tar.gz" % event.pass_id,
'w') as f:
parameters.to_tar(f)
# run train
trainer.train(
reader=train_batch_reader,
event_handler=event_handler,
num_passes=args.num_passes, num_passes=args.num_passes,
feeding=train_generator.feeding) num_iterations_print=args.num_iterations_print,
output_model_dir=args.output_model_dir)
def main(): def main():

@ -3,14 +3,13 @@ from __future__ import absolute_import
from __future__ import division from __future__ import division
from __future__ import print_function from __future__ import print_function
import numpy as np
import distutils.util import distutils.util
import argparse import argparse
import gzip import multiprocessing
import paddle.v2 as paddle import paddle.v2 as paddle
from data_utils.data import DataGenerator from data_utils.data import DataGenerator
from model import deep_speech2 from model import DeepSpeech2Model
from decoder import *
from lm.lm_scorer import LmScorer
from error_rate import wer from error_rate import wer
import utils import utils
@ -40,9 +39,14 @@ parser.add_argument(
default=True, default=True,
type=distutils.util.strtobool, type=distutils.util.strtobool,
help="Use gpu or not. (default: %(default)s)") help="Use gpu or not. (default: %(default)s)")
parser.add_argument(
"--trainer_count",
default=8,
type=int,
help="Trainer number. (default: %(default)s)")
parser.add_argument( parser.add_argument(
"--num_threads_data", "--num_threads_data",
default=multiprocessing.cpu_count(), default=1,
type=int, type=int,
help="Number of cpu threads for preprocessing data. (default: %(default)s)") help="Number of cpu threads for preprocessing data. (default: %(default)s)")
parser.add_argument( parser.add_argument(
@ -62,10 +66,10 @@ parser.add_argument(
type=str, type=str,
help="Manifest path for normalizer. (default: %(default)s)") help="Manifest path for normalizer. (default: %(default)s)")
parser.add_argument( parser.add_argument(
"--decode_manifest_path", "--tune_manifest_path",
default='datasets/manifest.test', default='datasets/manifest.dev',
type=str, type=str,
help="Manifest path for decoding. (default: %(default)s)") help="Manifest path for tuning. (default: %(default)s)")
parser.add_argument( parser.add_argument(
"--model_filepath", "--model_filepath",
default='checkpoints/params.latest.tar.gz', default='checkpoints/params.latest.tar.gz',
@ -127,96 +131,64 @@ args = parser.parse_args()
def tune(): def tune():
"""Tune parameters alpha and beta on one minibatch.""" """Tune parameters alpha and beta on one minibatch."""
if not args.num_alphas >= 0: if not args.num_alphas >= 0:
raise ValueError("num_alphas must be non-negative!") raise ValueError("num_alphas must be non-negative!")
if not args.num_betas >= 0: if not args.num_betas >= 0:
raise ValueError("num_betas must be non-negative!") raise ValueError("num_betas must be non-negative!")
# initialize data generator
data_generator = DataGenerator( data_generator = DataGenerator(
vocab_filepath=args.vocab_filepath, vocab_filepath=args.vocab_filepath,
mean_std_filepath=args.mean_std_filepath, mean_std_filepath=args.mean_std_filepath,
augmentation_config='{}', augmentation_config='{}',
specgram_type=args.specgram_type, specgram_type=args.specgram_type,
num_threads=args.num_threads_data) num_threads=args.num_threads_data)
# create network config
# paddle.data_type.dense_array is used for variable batch input.
# The size 161 * 161 is only an placeholder value and the real shape
# of input batch data will be induced during training.
audio_data = paddle.layer.data(
name="audio_spectrogram", type=paddle.data_type.dense_array(161 * 161))
text_data = paddle.layer.data(
name="transcript_text",
type=paddle.data_type.integer_value_sequence(data_generator.vocab_size))
output_probs = deep_speech2(
audio_data=audio_data,
text_data=text_data,
dict_size=data_generator.vocab_size,
num_conv_layers=args.num_conv_layers,
num_rnn_layers=args.num_rnn_layers,
rnn_size=args.rnn_layer_size,
is_inference=True)
# load parameters
parameters = paddle.parameters.Parameters.from_tar(
gzip.open(args.model_filepath))
# prepare infer data
batch_reader = data_generator.batch_reader_creator( batch_reader = data_generator.batch_reader_creator(
manifest_path=args.decode_manifest_path, manifest_path=args.tune_manifest_path,
batch_size=args.num_samples, batch_size=args.num_samples,
sortagrad=False, sortagrad=False,
shuffle_method=None) shuffle_method=None)
# get one batch data for tuning tune_data = batch_reader().next()
infer_data = batch_reader().next() target_transcripts = [
''.join([data_generator.vocab_list[token] for token in transcript])
# run inference for _, transcript in tune_data
infer_results = paddle.infer(
output_layer=output_probs, parameters=parameters, input=infer_data)
num_steps = len(infer_results) // len(infer_data)
probs_split = [
infer_results[i * num_steps:(i + 1) * num_steps]
for i in xrange(0, len(infer_data))
] ]
ds2_model = DeepSpeech2Model(
vocab_size=data_generator.vocab_size,
num_conv_layers=args.num_conv_layers,
num_rnn_layers=args.num_rnn_layers,
rnn_layer_size=args.rnn_layer_size,
pretrained_model_path=args.model_filepath)
# create grid for search # create grid for search
cand_alphas = np.linspace(args.alpha_from, args.alpha_to, args.num_alphas) cand_alphas = np.linspace(args.alpha_from, args.alpha_to, args.num_alphas)
cand_betas = np.linspace(args.beta_from, args.beta_to, args.num_betas) cand_betas = np.linspace(args.beta_from, args.beta_to, args.num_betas)
params_grid = [(alpha, beta) for alpha in cand_alphas params_grid = [(alpha, beta) for alpha in cand_alphas
for beta in cand_betas] for beta in cand_betas]
ext_scorer = LmScorer(args.alpha_from, args.beta_from,
args.language_model_path)
## tune parameters in loop ## tune parameters in loop
for alpha, beta in params_grid: for alpha, beta in params_grid:
wer_sum, wer_counter = 0, 0 result_transcripts = ds2_model.infer_batch(
# reset scorer infer_data=tune_data,
ext_scorer.reset_params(alpha, beta) decode_method='beam_search',
# beam search using multiple processes beam_alpha=alpha,
beam_search_results = ctc_beam_search_decoder_batch( beam_beta=beta,
probs_split=probs_split,
vocabulary=data_generator.vocab_list,
beam_size=args.beam_size, beam_size=args.beam_size,
cutoff_prob=args.cutoff_prob, cutoff_prob=args.cutoff_prob,
blank_id=len(data_generator.vocab_list), vocab_list=data_generator.vocab_list,
num_processes=args.num_processes_beam_search, language_model_path=args.language_model_path,
ext_scoring_func=ext_scorer, ) num_processes=args.num_processes_beam_search)
for i, beam_search_result in enumerate(beam_search_results): wer_sum, num_ins = 0.0, 0
target_transcription = ''.join([ for target, result in zip(target_transcripts, result_transcripts):
data_generator.vocab_list[index] for index in infer_data[i][1] wer_sum += wer(target, result)
]) num_ins += 1
wer_sum += wer(target_transcription, beam_search_result[0][1])
wer_counter += 1
print("alpha = %f\tbeta = %f\tWER = %f" % print("alpha = %f\tbeta = %f\tWER = %f" %
(alpha, beta, wer_sum / wer_counter)) (alpha, beta, wer_sum / num_ins))
def main(): def main():
paddle.init(use_gpu=args.use_gpu, trainer_count=1) utils.print_arguments(args)
paddle.init(use_gpu=args.use_gpu, trainer_count=args.trainer_count)
tune() tune()

Loading…
Cancel
Save