Revert to supporting input-hidden weight sharing between bi-directional RNNs.

1. Add an option to enable or disable RNN weight sharing.
2. Set rnn_layer_size to 2048 by default.
3. Revert the stride of the 1st conv layer from 2 back to 3.
4. Revert to BRelu.

The above follows the DS2 paper.
pull/2/head
Xinghai Sun 7 years ago
parent 177af05953
commit 2aa4af1c29

@@ -63,9 +63,16 @@ parser.add_argument(
     help="RNN layer number. (default: %(default)s)")
 parser.add_argument(
     "--rnn_layer_size",
-    default=512,
+    default=2048,
     type=int,
     help="RNN layer cell number. (default: %(default)s)")
+parser.add_argument(
+    "--share_rnn_weights",
+    default=True,
+    type=distutils.util.strtobool,
+    help="Whether to share input-hidden weights between forword and backward "
+    "directional simple RNNs. Only available when use_gru=False. "
+    "(default: %(default)s)")
 parser.add_argument(
     "--use_gru",
     default=False,
@@ -205,7 +212,8 @@ def start_server():
         num_rnn_layers=args.num_rnn_layers,
         rnn_layer_size=args.rnn_layer_size,
         use_gru=args.use_gru,
-        pretrained_model_path=args.model_filepath)
+        pretrained_model_path=args.model_filepath,
+        share_rnn_weights=args.share_rnn_weights)

     # prepare ASR inference handler
     def file_to_transcript(filename):
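Note on the new flag: `distutils.util.strtobool` is what lets `--share_rnn_weights` be toggled from the command line with the usual truthy/falsy strings. A minimal sketch of its parsing behaviour (the example values are illustrative):

    from distutils.util import strtobool

    # strtobool maps "y", "yes", "t", "true", "on", "1" to 1 and
    # "n", "no", "f", "false", "off", "0" to 0; anything else raises ValueError.
    for value in ("true", "1", "false", "0"):
        print(value, "->", bool(strtobool(value)))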

@@ -35,9 +35,16 @@ parser.add_argument(
     help="RNN layer number. (default: %(default)s)")
 parser.add_argument(
     "--rnn_layer_size",
-    default=512,
+    default=2048,
    type=int,
     help="RNN layer cell number. (default: %(default)s)")
+parser.add_argument(
+    "--share_rnn_weights",
+    default=True,
+    type=distutils.util.strtobool,
+    help="Whether to share input-hidden weights between forword and backward "
+    "directional simple RNNs. Only available when use_gru=False. "
+    "(default: %(default)s)")
 parser.add_argument(
     "--use_gru",
     default=False,
@@ -148,7 +155,8 @@ def evaluate():
         num_rnn_layers=args.num_rnn_layers,
         rnn_layer_size=args.rnn_layer_size,
         use_gru=args.use_gru,
-        pretrained_model_path=args.model_filepath)
+        pretrained_model_path=args.model_filepath,
+        share_rnn_weights=args.share_rnn_weights)

     error_rate_func = cer if args.error_rate_type == 'cer' else wer
     error_sum, num_ins = 0.0, 0

@@ -30,9 +30,16 @@ parser.add_argument(
     help="RNN layer number. (default: %(default)s)")
 parser.add_argument(
     "--rnn_layer_size",
-    default=512,
+    default=2048,
     type=int,
     help="RNN layer cell number. (default: %(default)s)")
+parser.add_argument(
+    "--share_rnn_weights",
+    default=True,
+    type=distutils.util.strtobool,
+    help="Whether to share input-hidden weights between forword and backward "
+    "directional simple RNNs. Only available when use_gru=False. "
+    "(default: %(default)s)")
 parser.add_argument(
     "--use_gru",
     default=False,
@@ -149,7 +156,8 @@ def infer():
         num_rnn_layers=args.num_rnn_layers,
         rnn_layer_size=args.rnn_layer_size,
         use_gru=args.use_gru,
-        pretrained_model_path=args.model_filepath)
+        pretrained_model_path=args.model_filepath,
+        share_rnn_weights=args.share_rnn_weights)
     result_transcripts = ds2_model.infer_batch(
         infer_data=infer_data,
         decode_method=args.decode_method,

@@ -39,7 +39,7 @@ def conv_bn_layer(input, filter_size, num_channels_in, num_channels_out, stride,
     return paddle.layer.batch_norm(input=conv_layer, act=act)


-def bidirectional_simple_rnn_bn_layer(name, input, size, act):
+def bidirectional_simple_rnn_bn_layer(name, input, size, act, share_weights):
     """Bidirectonal simple rnn layer with sequence-wise batch normalization.

     The batch normalization is only performed on input-state weights.
@@ -51,24 +51,50 @@ def bidirectional_simple_rnn_bn_layer(name, input, size, act):
     :type size: int
     :param act: Activation type.
     :type act: BaseActivation
+    :param share_weights: Whether to share input-hidden weights between
+                          forward and backward directional RNNs.
+    :type share_weights: bool
     :return: Bidirectional simple rnn layer.
     :rtype: LayerOutput
     """
-    # input-hidden weights shared across bi-direcitonal rnn.
-    input_proj_forward = paddle.layer.fc(
-        input=input, size=size, act=paddle.activation.Linear(), bias_attr=False)
-    input_proj_backward = paddle.layer.fc(
-        input=input, size=size, act=paddle.activation.Linear(), bias_attr=False)
-    # batch norm is only performed on input-state projection
-    input_proj_bn_forward = paddle.layer.batch_norm(
-        input=input_proj_forward, act=paddle.activation.Linear())
-    input_proj_bn_backward = paddle.layer.batch_norm(
-        input=input_proj_backward, act=paddle.activation.Linear())
-    # forward and backward in time
-    forward_simple_rnn = paddle.layer.recurrent(
-        input=input_proj_bn_forward, act=act, reverse=False)
-    backward_simple_rnn = paddle.layer.recurrent(
-        input=input_proj_bn_backward, act=act, reverse=True)
+    if share_weights:
+        # input-hidden weights shared between bi-direcitonal rnn.
+        input_proj = paddle.layer.fc(
+            input=input,
+            size=size,
+            act=paddle.activation.Linear(),
+            bias_attr=False)
+        # batch norm is only performed on input-state projection
+        input_proj_bn = paddle.layer.batch_norm(
+            input=input_proj, act=paddle.activation.Linear())
+        # forward and backward in time
+        forward_simple_rnn = paddle.layer.recurrent(
+            input=input_proj_bn, act=act, reverse=False)
+        backward_simple_rnn = paddle.layer.recurrent(
+            input=input_proj_bn, act=act, reverse=True)
+    else:
+        input_proj_forward = paddle.layer.fc(
+            input=input,
+            size=size,
+            act=paddle.activation.Linear(),
+            bias_attr=False)
+        input_proj_backward = paddle.layer.fc(
+            input=input,
+            size=size,
+            act=paddle.activation.Linear(),
+            bias_attr=False)
+        # batch norm is only performed on input-state projection
+        input_proj_bn_forward = paddle.layer.batch_norm(
+            input=input_proj_forward, act=paddle.activation.Linear())
+        input_proj_bn_backward = paddle.layer.batch_norm(
+            input=input_proj_backward, act=paddle.activation.Linear())
+        # forward and backward in time
+        forward_simple_rnn = paddle.layer.recurrent(
+            input=input_proj_bn_forward, act=act, reverse=False)
+        backward_simple_rnn = paddle.layer.recurrent(
+            input=input_proj_bn_backward, act=act, reverse=True)
     return paddle.layer.concat(input=[forward_simple_rnn, backward_simple_rnn])
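For intuition, the two branches above differ only in how the input-to-hidden projection is parameterised: with share_weights=True a single projection matrix feeds both time directions, otherwise each direction gets its own. A minimal NumPy sketch of that idea (names and sizes are illustrative, not the PaddlePaddle layer itself):

    import numpy as np

    feature_dim, hidden_size, time_steps = 8, 4, 5
    x = np.random.randn(time_steps, feature_dim)   # input sequence

    share_weights = True
    if share_weights:
        w = np.random.randn(feature_dim, hidden_size)
        proj_forward = x @ w       # same input-hidden weights for both directions
        proj_backward = x @ w
    else:
        w_fw = np.random.randn(feature_dim, hidden_size)
        w_bw = np.random.randn(feature_dim, hidden_size)
        proj_forward = x @ w_fw    # separate weights per direction
        proj_backward = x @ w_bw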
@@ -87,7 +113,6 @@ def bidirectional_gru_bn_layer(name, input, size, act):
     :return: Bidirectional simple rnn layer.
     :rtype: LayerOutput
     """
-    # input-hidden weights shared across bi-direcitonal rnn.
     input_proj_forward = paddle.layer.fc(
         input=input,
         size=size * 3,
@@ -98,7 +123,7 @@ def bidirectional_gru_bn_layer(name, input, size, act):
         size=size * 3,
         act=paddle.activation.Linear(),
         bias_attr=False)
-    # batch norm is only performed on input-state projection
+    # batch norm is only performed on input-related projections
     input_proj_bn_forward = paddle.layer.batch_norm(
         input=input_proj_forward, act=paddle.activation.Linear())
     input_proj_bn_backward = paddle.layer.batch_norm(
@@ -126,9 +151,9 @@ def conv_group(input, num_stacks):
         filter_size=(11, 41),
         num_channels_in=1,
         num_channels_out=32,
-        stride=(2, 2),
+        stride=(3, 2),
         padding=(5, 20),
-        act=paddle.activation.Relu())
+        act=paddle.activation.BRelu())
     for i in xrange(num_stacks - 1):
         conv = conv_bn_layer(
             input=conv,
@@ -137,13 +162,13 @@ def conv_group(input, num_stacks):
             num_channels_out=32,
             stride=(1, 2),
             padding=(5, 10),
-            act=paddle.activation.Relu())
+            act=paddle.activation.BRelu())
     output_num_channels = 32
     output_height = 160 // pow(2, num_stacks) + 1
     return conv, output_num_channels, output_height


-def rnn_group(input, size, num_stacks, use_gru):
+def rnn_group(input, size, num_stacks, use_gru, share_rnn_weights):
     """RNN group with stacked bidirectional simple RNN layers.

     :param input: Input layer.
@@ -154,6 +179,10 @@ def rnn_group(input, size, num_stacks, use_gru):
     :type num_stacks: int
     :param use_gru: Use gru if set True. Use simple rnn if set False.
     :type use_gru: bool
+    :param share_rnn_weights: Whether to share input-hidden weights between
+                              forward and backward directional RNNs.
+                              It is only available when use_gru=False.
+    :type share_weights: bool
     :return: Output layer of the RNN group.
     :rtype: LayerOutput
     """
@@ -165,12 +194,14 @@ def rnn_group(input, size, num_stacks, use_gru):
                 input=output,
                 size=size,
                 act=paddle.activation.Relu())
+            # BRelu does not support hppl, need to add later. Use Relu instead.
         else:
             output = bidirectional_simple_rnn_bn_layer(
                 name=str(i),
                 input=output,
                 size=size,
-                act=paddle.activation.Relu())
+                act=paddle.activation.BRelu(),
+                share_weights=share_rnn_weights)
     return output
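On the activation change: BRelu is a bounded (clipped) ReLU, and per the in-code comment it is not yet usable for the GRU path, so that branch stays on Relu. A NumPy sketch of a bounded ReLU; the cap of 24 below is an assumption for illustration, not necessarily BRelu's actual default threshold:

    import numpy as np

    def bounded_relu(x, threshold=24.0):
        # clipped ReLU: min(max(x, 0), threshold)
        return np.minimum(np.maximum(x, 0.0), threshold)

    print(bounded_relu(np.array([-3.0, 5.0, 100.0])))   # [ 0.  5. 24.]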
@@ -180,9 +211,10 @@ def deep_speech2(audio_data,
                  num_conv_layers=2,
                  num_rnn_layers=3,
                  rnn_size=256,
-                 use_gru=True):
+                 use_gru=False,
+                 share_rnn_weights=True):
     """
-    The whole DeepSpeech2 model structure (a simplified version).
+    The whole DeepSpeech2 model structure.

     :param audio_data: Audio spectrogram data layer.
     :type audio_data: LayerOutput
@@ -198,6 +230,10 @@ def deep_speech2(audio_data,
     :type rnn_size: int
     :param use_gru: Use gru if set True. Use simple rnn if set False.
     :type use_gru: bool
+    :param share_rnn_weights: Whether to share input-hidden weights between
+                              forward and backward direction RNNs.
+                              It is only available when use_gru=False.
+    :type share_weights: bool
     :return: A tuple of an output unnormalized log probability layer (
              before softmax) and a ctc cost layer.
     :rtype: tuple of LayerOutput
@@ -218,7 +254,8 @@ def deep_speech2(audio_data,
         input=conv2seq,
         size=rnn_size,
         num_stacks=num_rnn_layers,
-        use_gru=use_gru)
+        use_gru=use_gru,
+        share_rnn_weights=share_rnn_weights)
     fc = paddle.layer.fc(
         input=rnn_group_output,
         size=dict_size + 1,
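One practical effect of restoring weight sharing alongside the larger rnn_size default: sharing the input-hidden projection between the two directions of each simple RNN layer roughly halves that layer's input-projection weights. A back-of-the-envelope sketch (the 2048-wide layer input is an assumed, illustrative figure):

    rnn_size = 2048                       # new default rnn_layer_size
    input_dim = 2048                      # assumed width of the layer's input

    separate = 2 * input_dim * rnn_size   # one projection matrix per direction
    shared = input_dim * rnn_size         # single matrix reused by both directions
    print(separate, shared)               # 8388608 vs 4194304 input-hidden weights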

@@ -27,12 +27,17 @@ class DeepSpeech2Model(object):
     :param pretrained_model_path: Pretrained model path. If None, will train
                                   from stratch.
     :type pretrained_model_path: basestring|None
+    :param share_rnn_weights: Whether to share input-hidden weights between
+                              forward and backward directional RNNs.Notice that
+                              for GRU, weight sharing is not supported.
+    :type share_rnn_weights: bool
     """

     def __init__(self, vocab_size, num_conv_layers, num_rnn_layers,
-                 rnn_layer_size, use_gru, pretrained_model_path):
+                 rnn_layer_size, use_gru, pretrained_model_path,
+                 share_rnn_weights):
         self._create_network(vocab_size, num_conv_layers, num_rnn_layers,
-                             rnn_layer_size, use_gru)
+                             rnn_layer_size, use_gru, share_rnn_weights)
         self._create_parameters(pretrained_model_path)
         self._inferer = None
         self._loss_inferer = None
@@ -226,7 +231,7 @@ class DeepSpeech2Model(object):
             gzip.open(model_path))

     def _create_network(self, vocab_size, num_conv_layers, num_rnn_layers,
-                        rnn_layer_size, use_gru):
+                        rnn_layer_size, use_gru, share_rnn_weights):
         """Create data layers and model network."""
         # paddle.data_type.dense_array is used for variable batch input.
         # The size 161 * 161 is only an placeholder value and the real shape
@@ -244,4 +249,5 @@ class DeepSpeech2Model(object):
             num_conv_layers=num_conv_layers,
             num_rnn_layers=num_rnn_layers,
             rnn_size=rnn_layer_size,
-            use_gru=use_gru)
+            use_gru=use_gru,
+            share_rnn_weights=share_rnn_weights)
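With this constructor change, callers must now pass share_rnn_weights (in the scripts it comes from the new CLI flag). A hedged instantiation sketch using only the keyword arguments visible in this diff; the import path and values are placeholders, and the legacy PaddlePaddle v2 runtime is assumed:

    # Assumes DeepSpeech2Model lives in model.py of this repo; values are placeholders.
    from model import DeepSpeech2Model

    ds2_model = DeepSpeech2Model(
        vocab_size=28,                    # placeholder vocabulary size
        num_conv_layers=2,
        num_rnn_layers=3,
        rnn_layer_size=2048,              # new default
        use_gru=False,                    # sharing only applies to simple RNNs
        pretrained_model_path=None,       # train from scratch
        share_rnn_weights=True)           # restored DS2-style weight sharing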

@@ -37,9 +37,16 @@ parser.add_argument(
     help="RNN layer number. (default: %(default)s)")
 parser.add_argument(
     "--rnn_layer_size",
-    default=1024,
+    default=2048,
     type=int,
     help="RNN layer cell number. (default: %(default)s)")
+parser.add_argument(
+    "--share_rnn_weights",
+    default=True,
+    type=distutils.util.strtobool,
+    help="Whether to share input-hidden weights between forword and backward "
+    "directional simple RNNs. Only available when use_gru=False. "
+    "(default: %(default)s)")
 parser.add_argument(
     "--use_gru",
     default=False,
@@ -176,7 +183,8 @@ def train():
         num_rnn_layers=args.num_rnn_layers,
         rnn_layer_size=args.rnn_layer_size,
         use_gru=args.use_gru,
-        pretrained_model_path=args.init_model_path)
+        pretrained_model_path=args.init_model_path,
+        share_rnn_weights=args.share_rnn_weights)
     ds2_model.train(
         train_batch_reader=train_batch_reader,
         dev_batch_reader=dev_batch_reader,

@@ -31,9 +31,16 @@ parser.add_argument(
     help="RNN layer number. (default: %(default)s)")
 parser.add_argument(
     "--rnn_layer_size",
-    default=512,
+    default=2048,
     type=int,
     help="RNN layer cell number. (default: %(default)s)")
+parser.add_argument(
+    "--share_rnn_weights",
+    default=True,
+    type=distutils.util.strtobool,
+    help="Whether to share input-hidden weights between forword and backward "
+    "directional simple RNNs. Only available when use_gru=False. "
+    "(default: %(default)s)")
 parser.add_argument(
     "--use_gru",
     default=False,
@@ -164,7 +171,8 @@ def tune():
         num_rnn_layers=args.num_rnn_layers,
         rnn_layer_size=args.rnn_layer_size,
         use_gru=args.use_gru,
-        pretrained_model_path=args.model_filepath)
+        pretrained_model_path=args.model_filepath,
+        share_rnn_weights=args.share_rnn_weights)

     # create grid for search
     cand_alphas = np.linspace(args.alpha_from, args.alpha_to, args.num_alphas)
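The tuning grid itself is unchanged; for reference, a small sketch of how np.linspace builds the candidate language-model weights. The bounds and count below are illustrative only; the real ones come from --alpha_from/--alpha_to/--num_alphas:

    import numpy as np

    alpha_from, alpha_to, num_alphas = 0.1, 0.45, 8   # illustrative values
    cand_alphas = np.linspace(alpha_from, alpha_to, num_alphas)
    print(cand_alphas)   # 8 evenly spaced alphas to try during decoder tuning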

@@ -10,12 +10,12 @@ def print_arguments(args):
     Usage:

     .. code-block:: python

         parser = argparse.ArgumentParser()
         parser.add_argument("name", default="Jonh", type=str, help="User name.")
         args = parser.parse_args()
         print_arguments(args)

     :param args: Input argparse.Namespace for printing.
     :type args: argparse.Namespace
     """
