Add GRU support.

pull/2/head
Xinghai Sun 7 years ago
parent 638fae13f4
commit 5a63275845

@ -66,6 +66,11 @@ parser.add_argument(
default=512, default=512,
type=int, type=int,
help="RNN layer cell number. (default: %(default)s)") help="RNN layer cell number. (default: %(default)s)")
parser.add_argument(
"--use_gru",
default=True,
type=bool,
help="Use GRU or simple RNN. (default: %(default)s)")
parser.add_argument( parser.add_argument(
"--use_gpu", "--use_gpu",
default=True, default=True,
@ -199,6 +204,7 @@ def start_server():
num_conv_layers=args.num_conv_layers, num_conv_layers=args.num_conv_layers,
num_rnn_layers=args.num_rnn_layers, num_rnn_layers=args.num_rnn_layers,
rnn_layer_size=args.rnn_layer_size, rnn_layer_size=args.rnn_layer_size,
use_gru=args.use_gru,
pretrained_model_path=args.model_filepath) pretrained_model_path=args.model_filepath)
# prepare ASR inference handler # prepare ASR inference handler

@ -38,6 +38,11 @@ parser.add_argument(
default=512, default=512,
type=int, type=int,
help="RNN layer cell number. (default: %(default)s)") help="RNN layer cell number. (default: %(default)s)")
parser.add_argument(
"--use_gru",
default=True,
type=bool,
help="Use GRU or simple RNN. (default: %(default)s)")
parser.add_argument( parser.add_argument(
"--use_gpu", "--use_gpu",
default=True, default=True,
@ -142,6 +147,7 @@ def evaluate():
num_conv_layers=args.num_conv_layers, num_conv_layers=args.num_conv_layers,
num_rnn_layers=args.num_rnn_layers, num_rnn_layers=args.num_rnn_layers,
rnn_layer_size=args.rnn_layer_size, rnn_layer_size=args.rnn_layer_size,
use_gru=args.use_gru,
pretrained_model_path=args.model_filepath) pretrained_model_path=args.model_filepath)
error_rate_func = cer if args.error_rate_type == 'cer' else wer error_rate_func = cer if args.error_rate_type == 'cer' else wer

@ -33,6 +33,11 @@ parser.add_argument(
default=512, default=512,
type=int, type=int,
help="RNN layer cell number. (default: %(default)s)") help="RNN layer cell number. (default: %(default)s)")
parser.add_argument(
"--use_gru",
default=True,
type=bool,
help="Use GRU or simple RNN. (default: %(default)s)")
parser.add_argument( parser.add_argument(
"--use_gpu", "--use_gpu",
default=True, default=True,
@ -143,6 +148,7 @@ def infer():
num_conv_layers=args.num_conv_layers, num_conv_layers=args.num_conv_layers,
num_rnn_layers=args.num_rnn_layers, num_rnn_layers=args.num_rnn_layers,
rnn_layer_size=args.rnn_layer_size, rnn_layer_size=args.rnn_layer_size,
use_gru=args.use_gru,
pretrained_model_path=args.model_filepath) pretrained_model_path=args.model_filepath)
result_transcripts = ds2_model.infer_batch( result_transcripts = ds2_model.infer_batch(
infer_data=infer_data, infer_data=infer_data,

@ -68,6 +68,38 @@ def bidirectional_simple_rnn_bn_layer(name, input, size, act):
return paddle.layer.concat(input=[forward_simple_rnn, backward_simple_rnn]) return paddle.layer.concat(input=[forward_simple_rnn, backward_simple_rnn])
def bidirectional_gru_bn_layer(name, input, size, act):
    """Bidirectional GRU layer with sequence-wise batch normalization.

    The batch normalization is only performed on the input-to-hidden
    projection; the recurrent (hidden-to-hidden) weights are not
    normalized.

    :param name: Name of the layer.
    :type name: string
    :param input: Input layer.
    :type input: LayerOutput
    :param size: Number of GRU cells.
    :type size: int
    :param act: Activation type.
    :type act: BaseActivation
    :return: Bidirectional GRU layer.
    :rtype: LayerOutput
    """
    # NOTE(review): `name` is accepted for API symmetry with the simple-RNN
    # variant but is not used in this body — confirm whether it should be
    # forwarded to the sub-layers.
    # Input-to-hidden weights are shared across both directions. The
    # projection is 3 * size because a GRU needs inputs for the update,
    # reset and candidate gates.
    input_proj = paddle.layer.fc(
        input=input,
        size=size * 3,
        act=paddle.activation.Linear(),
        bias_attr=False)
    # Batch norm is applied only to the input-state projection.
    input_proj_bn = paddle.layer.batch_norm(
        input=input_proj, act=paddle.activation.Linear())
    # Forward and backward passes over time, concatenated feature-wise.
    forward_gru = paddle.layer.grumemory(
        input=input_proj_bn, act=act, reverse=False)
    backward_gru = paddle.layer.grumemory(
        input=input_proj_bn, act=act, reverse=True)
    return paddle.layer.concat(input=[forward_gru, backward_gru])
def conv_group(input, num_stacks): def conv_group(input, num_stacks):
"""Convolution group with stacked convolution layers. """Convolution group with stacked convolution layers.
@ -83,7 +115,7 @@ def conv_group(input, num_stacks):
filter_size=(11, 41), filter_size=(11, 41),
num_channels_in=1, num_channels_in=1,
num_channels_out=32, num_channels_out=32,
stride=(3, 2), stride=(2, 2),
padding=(5, 20), padding=(5, 20),
act=paddle.activation.BRelu()) act=paddle.activation.BRelu())
for i in xrange(num_stacks - 1): for i in xrange(num_stacks - 1):
@ -100,7 +132,7 @@ def conv_group(input, num_stacks):
return conv, output_num_channels, output_height return conv, output_num_channels, output_height
def rnn_group(input, size, num_stacks, use_gru):
    """RNN group with stacked bidirectional simple RNN or GRU layers.

    :param input: Input layer.
    :type input: LayerOutput
    :param size: Number of RNN cells in each layer.
    :type size: int
    :param num_stacks: Number of stacked rnn layers.
    :type num_stacks: int
    :param use_gru: Use gru if set True. Use simple rnn if set False.
    :type use_gru: bool
    :return: Output layer of the RNN group.
    :rtype: LayerOutput
    """
    # Both builders share the same signature, so choose one up front
    # instead of branching inside the loop.
    make_layer = (bidirectional_gru_bn_layer
                  if use_gru else bidirectional_simple_rnn_bn_layer)
    output = input
    for i in xrange(num_stacks):
        output = make_layer(
            name=str(i),
            input=output,
            size=size,
            act=paddle.activation.BRelu())
    return output
@ -124,7 +168,8 @@ def deep_speech2(audio_data,
dict_size, dict_size,
num_conv_layers=2, num_conv_layers=2,
num_rnn_layers=3, num_rnn_layers=3,
rnn_size=256): rnn_size=256,
use_gru=True):
""" """
The whole DeepSpeech2 model structure (a simplified version). The whole DeepSpeech2 model structure (a simplified version).
@ -140,6 +185,8 @@ def deep_speech2(audio_data,
:type num_rnn_layers: int :type num_rnn_layers: int
:param rnn_size: RNN layer size (number of RNN cells). :param rnn_size: RNN layer size (number of RNN cells).
:type rnn_size: int :type rnn_size: int
:param use_gru: Use gru if set True. Use simple rnn if set False.
:type use_gru: bool
:return: A tuple of an output unnormalized log probability layer ( :return: A tuple of an output unnormalized log probability layer (
before softmax) and a ctc cost layer. before softmax) and a ctc cost layer.
:rtype: tuple of LayerOutput :rtype: tuple of LayerOutput
@ -157,7 +204,10 @@ def deep_speech2(audio_data,
block_y=conv_group_height) block_y=conv_group_height)
# rnn group # rnn group
rnn_group_output = rnn_group( rnn_group_output = rnn_group(
input=conv2seq, size=rnn_size, num_stacks=num_rnn_layers) input=conv2seq,
size=rnn_size,
num_stacks=num_rnn_layers,
use_gru=use_gru)
fc = paddle.layer.fc( fc = paddle.layer.fc(
input=rnn_group_output, input=rnn_group_output,
size=dict_size + 1, size=dict_size + 1,

@ -30,9 +30,9 @@ class DeepSpeech2Model(object):
""" """
def __init__(self, vocab_size, num_conv_layers, num_rnn_layers, def __init__(self, vocab_size, num_conv_layers, num_rnn_layers,
rnn_layer_size, pretrained_model_path): rnn_layer_size, use_gru, pretrained_model_path):
self._create_network(vocab_size, num_conv_layers, num_rnn_layers, self._create_network(vocab_size, num_conv_layers, num_rnn_layers,
rnn_layer_size) rnn_layer_size, use_gru)
self._create_parameters(pretrained_model_path) self._create_parameters(pretrained_model_path)
self._inferer = None self._inferer = None
self._loss_inferer = None self._loss_inferer = None
@ -226,7 +226,7 @@ class DeepSpeech2Model(object):
gzip.open(model_path)) gzip.open(model_path))
def _create_network(self, vocab_size, num_conv_layers, num_rnn_layers, def _create_network(self, vocab_size, num_conv_layers, num_rnn_layers,
rnn_layer_size): rnn_layer_size, use_gru):
"""Create data layers and model network.""" """Create data layers and model network."""
# paddle.data_type.dense_array is used for variable batch input. # paddle.data_type.dense_array is used for variable batch input.
# The size 161 * 161 is only an placeholder value and the real shape # The size 161 * 161 is only an placeholder value and the real shape
@ -243,4 +243,5 @@ class DeepSpeech2Model(object):
dict_size=vocab_size, dict_size=vocab_size,
num_conv_layers=num_conv_layers, num_conv_layers=num_conv_layers,
num_rnn_layers=num_rnn_layers, num_rnn_layers=num_rnn_layers,
rnn_size=rnn_layer_size) rnn_size=rnn_layer_size,
use_gru=use_gru)

@ -37,9 +37,14 @@ parser.add_argument(
help="RNN layer number. (default: %(default)s)") help="RNN layer number. (default: %(default)s)")
parser.add_argument( parser.add_argument(
"--rnn_layer_size", "--rnn_layer_size",
default=512, default=1280,
type=int, type=int,
help="RNN layer cell number. (default: %(default)s)") help="RNN layer cell number. (default: %(default)s)")
parser.add_argument(
"--use_gru",
default=True,
type=bool,
help="Use GRU or simple RNN. (default: %(default)s)")
parser.add_argument( parser.add_argument(
"--adam_learning_rate", "--adam_learning_rate",
default=5e-4, default=5e-4,
@ -170,6 +175,7 @@ def train():
num_conv_layers=args.num_conv_layers, num_conv_layers=args.num_conv_layers,
num_rnn_layers=args.num_rnn_layers, num_rnn_layers=args.num_rnn_layers,
rnn_layer_size=args.rnn_layer_size, rnn_layer_size=args.rnn_layer_size,
use_gru=args.use_gru,
pretrained_model_path=args.init_model_path) pretrained_model_path=args.init_model_path)
ds2_model.train( ds2_model.train(
train_batch_reader=train_batch_reader, train_batch_reader=train_batch_reader,

@ -34,6 +34,11 @@ parser.add_argument(
default=512, default=512,
type=int, type=int,
help="RNN layer cell number. (default: %(default)s)") help="RNN layer cell number. (default: %(default)s)")
parser.add_argument(
"--use_gru",
default=True,
type=bool,
help="Use GRU or simple RNN. (default: %(default)s)")
parser.add_argument( parser.add_argument(
"--use_gpu", "--use_gpu",
default=True, default=True,
@ -158,6 +163,7 @@ def tune():
num_conv_layers=args.num_conv_layers, num_conv_layers=args.num_conv_layers,
num_rnn_layers=args.num_rnn_layers, num_rnn_layers=args.num_rnn_layers,
rnn_layer_size=args.rnn_layer_size, rnn_layer_size=args.rnn_layer_size,
use_gru=args.use_gru,
pretrained_model_path=args.model_filepath) pretrained_model_path=args.model_filepath)
# create grid for search # create grid for search

Loading…
Cancel
Save