|
|
@ -68,6 +68,38 @@ def bidirectional_simple_rnn_bn_layer(name, input, size, act):
|
|
|
|
return paddle.layer.concat(input=[forward_simple_rnn, backward_simple_rnn])
|
|
|
|
return paddle.layer.concat(input=[forward_simple_rnn, backward_simple_rnn])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def bidirectional_gru_bn_layer(name, input, size, act):
    """Bidirectional gru layer with sequence-wise batch normalization.
    The batch normalization is only performed on input-state weights.

    :param name: Name of the layer.
    :type name: string
    :param input: Input layer.
    :type input: LayerOutput
    :param size: Number of RNN cells.
    :type size: int
    :param act: Activation type.
    :type act: BaseActivation
    :return: Bidirectional gru layer.
    :rtype: LayerOutput
    """
    # Input-hidden weights shared across the bi-directional rnn.
    # Projection width is size * 3: one slice per GRU gate
    # (update, reset, candidate).
    input_proj = paddle.layer.fc(
        input=input,
        size=size * 3,
        act=paddle.activation.Linear(),
        bias_attr=False)
    # Batch norm is only performed on the input-state projection,
    # not on the recurrent (hidden-hidden) weights.
    input_proj_bn = paddle.layer.batch_norm(
        input=input_proj, act=paddle.activation.Linear())
    # Forward and backward passes in time over the normalized projection.
    forward_gru = paddle.layer.grumemory(
        input=input_proj_bn, act=act, reverse=False)
    backward_gru = paddle.layer.grumemory(
        input=input_proj_bn, act=act, reverse=True)
    return paddle.layer.concat(input=[forward_gru, backward_gru])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def conv_group(input, num_stacks):
|
|
|
|
def conv_group(input, num_stacks):
|
|
|
|
"""Convolution group with stacked convolution layers.
|
|
|
|
"""Convolution group with stacked convolution layers.
|
|
|
|
|
|
|
|
|
|
|
@ -83,7 +115,7 @@ def conv_group(input, num_stacks):
|
|
|
|
filter_size=(11, 41),
|
|
|
|
filter_size=(11, 41),
|
|
|
|
num_channels_in=1,
|
|
|
|
num_channels_in=1,
|
|
|
|
num_channels_out=32,
|
|
|
|
num_channels_out=32,
|
|
|
|
stride=(3, 2),
|
|
|
|
stride=(2, 2),
|
|
|
|
padding=(5, 20),
|
|
|
|
padding=(5, 20),
|
|
|
|
act=paddle.activation.BRelu())
|
|
|
|
act=paddle.activation.BRelu())
|
|
|
|
for i in xrange(num_stacks - 1):
|
|
|
|
for i in xrange(num_stacks - 1):
|
|
|
@ -100,7 +132,7 @@ def conv_group(input, num_stacks):
|
|
|
|
return conv, output_num_channels, output_height
|
|
|
|
return conv, output_num_channels, output_height
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def rnn_group(input, size, num_stacks, use_gru):
    """RNN group with stacked bidirectional simple RNN or GRU layers.

    :param input: Input layer.
    :type input: LayerOutput
    :param size: Number of RNN cells in each layer.
    :type size: int
    :param num_stacks: Number of stacked rnn layers.
    :type num_stacks: int
    :param use_gru: Use gru if set True. Use simple rnn if set False.
    :type use_gru: bool
    :return: Output layer of the RNN group.
    :rtype: LayerOutput
    """
    output = input
    # Chain num_stacks bidirectional layers, each feeding the next.
    for i in xrange(num_stacks):
        if use_gru:
            output = bidirectional_gru_bn_layer(
                name=str(i),
                input=output,
                size=size,
                act=paddle.activation.BRelu())
        else:
            output = bidirectional_simple_rnn_bn_layer(
                name=str(i),
                input=output,
                size=size,
                act=paddle.activation.BRelu())
    return output
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@ -124,7 +168,8 @@ def deep_speech2(audio_data,
|
|
|
|
dict_size,
|
|
|
|
dict_size,
|
|
|
|
num_conv_layers=2,
|
|
|
|
num_conv_layers=2,
|
|
|
|
num_rnn_layers=3,
|
|
|
|
num_rnn_layers=3,
|
|
|
|
rnn_size=256):
|
|
|
|
rnn_size=256,
|
|
|
|
|
|
|
|
use_gru=True):
|
|
|
|
"""
|
|
|
|
"""
|
|
|
|
The whole DeepSpeech2 model structure (a simplified version).
|
|
|
|
The whole DeepSpeech2 model structure (a simplified version).
|
|
|
|
|
|
|
|
|
|
|
@ -140,6 +185,8 @@ def deep_speech2(audio_data,
|
|
|
|
:type num_rnn_layers: int
|
|
|
|
:type num_rnn_layers: int
|
|
|
|
:param rnn_size: RNN layer size (number of RNN cells).
|
|
|
|
:param rnn_size: RNN layer size (number of RNN cells).
|
|
|
|
:type rnn_size: int
|
|
|
|
:type rnn_size: int
|
|
|
|
|
|
|
|
:param use_gru: Use gru if set True. Use simple rnn if set False.
|
|
|
|
|
|
|
|
:type use_gru: bool
|
|
|
|
:return: A tuple of an output unnormalized log probability layer (
|
|
|
|
:return: A tuple of an output unnormalized log probability layer (
|
|
|
|
before softmax) and a ctc cost layer.
|
|
|
|
before softmax) and a ctc cost layer.
|
|
|
|
:rtype: tuple of LayerOutput
|
|
|
|
:rtype: tuple of LayerOutput
|
|
|
@ -157,7 +204,10 @@ def deep_speech2(audio_data,
|
|
|
|
block_y=conv_group_height)
|
|
|
|
block_y=conv_group_height)
|
|
|
|
# rnn group
|
|
|
|
# rnn group
|
|
|
|
rnn_group_output = rnn_group(
|
|
|
|
rnn_group_output = rnn_group(
|
|
|
|
input=conv2seq, size=rnn_size, num_stacks=num_rnn_layers)
|
|
|
|
input=conv2seq,
|
|
|
|
|
|
|
|
size=rnn_size,
|
|
|
|
|
|
|
|
num_stacks=num_rnn_layers,
|
|
|
|
|
|
|
|
use_gru=use_gru)
|
|
|
|
fc = paddle.layer.fc(
|
|
|
|
fc = paddle.layer.fc(
|
|
|
|
input=rnn_group_output,
|
|
|
|
input=rnn_group_output,
|
|
|
|
size=dict_size + 1,
|
|
|
|
size=dict_size + 1,
|
|
|
|