add the subsampling as conv

pull/735/head
huangyuxin 3 years ago
parent 8716386464
commit e4ef8ed31e

@@ -1,4 +1,17 @@
-from .deepspeech2 import DeepSpeech2Model
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 from .deepspeech2 import DeepSpeech2InferModel
+from .deepspeech2 import DeepSpeech2Model
 
 __all__ = ['DeepSpeech2Model', 'DeepSpeech2InferModel']

@@ -19,8 +19,8 @@ from paddle import nn
 from yacs.config import CfgNode
 
 from deepspeech.models.ds2.conv import ConvStack
-from deepspeech.modules.ctc import CTCDecoder
 from deepspeech.models.ds2.rnn import RNNStack
+from deepspeech.modules.ctc import CTCDecoder
 from deepspeech.utils import layer_tools
 from deepspeech.utils.checkpoint import Checkpoint
 from deepspeech.utils.log import Log

@@ -1,7 +1,17 @@
-from .deepspeech2 import DeepSpeech2ModelOnline
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 from .deepspeech2 import DeepSpeech2InferModelOnline
+from .deepspeech2 import DeepSpeech2ModelOnline
 
 __all__ = ['DeepSpeech2ModelOnline', 'DeepSpeech2InferModelOnline']

@@ -11,162 +11,27 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import paddle
 from paddle import nn
-from paddle.nn import functional as F
-
-from deepspeech.modules.activation import brelu
-from deepspeech.modules.mask import make_non_pad_mask
-from deepspeech.utils.log import Log
-
-logger = Log(__name__).getlog()
-
-__all__ = ['ConvStack', "conv_output_size"]
-
-
-def conv_output_size(I, F, P, S):
-    # https://stanford.edu/~shervine/teaching/cs-230/cheatsheet-convolutional-neural-networks#hyperparameters
-    # Output size after Conv:
-    #   By noting I the length of the input volume size,
-    #   F the length of the filter,
-    #   P the amount of zero padding,
-    #   S the stride,
-    #   then the output size O of the feature map along that dimension is given by:
-    #       O = (I - F + Pstart + Pend) // S + 1
-    #   When Pstart == Pend == P, we can replace Pstart + Pend by 2P.
-    #   When Pstart == Pend == 0
-    #       O = (I - F - S) // S
-    # https://iq.opengenus.org/output-size-of-convolution/
-    # Output height = (Input height + padding height top + padding height bottom - kernel height) / (stride height) + 1
-    # Output width = (Output width + padding width right + padding width left - kernel width) / (stride width) + 1
-    return (I - F + 2 * P - S) // S
-
-
-# receptive field calculator
-# https://fomoro.com/research/article/receptive-field-calculator
-# https://stanford.edu/~shervine/teaching/cs-230/cheatsheet-convolutional-neural-networks#hyperparameters
-# https://distill.pub/2019/computing-receptive-fields/
-# Rl-1 = Sl * Rl + (Kl - Sl)
-
-
-class ConvBn(nn.Layer):
-    """Convolution layer with batch normalization.
-
-    :param kernel_size: The x dimension of a filter kernel. Or input a tuple for
-                        two image dimension.
-    :type kernel_size: int|tuple|list
-    :param num_channels_in: Number of input channels.
-    :type num_channels_in: int
-    :param num_channels_out: Number of output channels.
-    :type num_channels_out: int
-    :param stride: The x dimension of the stride. Or input a tuple for two
-                   image dimension.
-    :type stride: int|tuple|list
-    :param padding: The x dimension of the padding. Or input a tuple for two
-                    image dimension.
-    :type padding: int|tuple|list
-    :param act: Activation type, relu|brelu
-    :type act: string
-
-    :return: Batch norm layer after convolution layer.
-    :rtype: Variable
-    """
-
-    def __init__(self, num_channels_in, num_channels_out, kernel_size, stride,
-                 padding, act):
-        super().__init__()
-        assert len(kernel_size) == 2
-        assert len(stride) == 2
-        assert len(padding) == 2
-        self.kernel_size = kernel_size
-        self.stride = stride
-        self.padding = padding
-
-        self.conv = nn.Conv2D(
-            num_channels_in,
-            num_channels_out,
-            kernel_size=kernel_size,
-            stride=stride,
-            padding=padding,
-            weight_attr=None,
-            bias_attr=False,
-            data_format='NCHW')
-
-        self.bn = nn.BatchNorm2D(
-            num_channels_out,
-            weight_attr=None,
-            bias_attr=None,
-            data_format='NCHW')
-        self.act = F.relu if act == 'relu' else brelu
-
-    def forward(self, x, x_len):
-        """
-        x(Tensor): audio, shape [B, C, D, T]
-        """
+from deepspeech.modules.embedding import PositionalEncoding
+from deepspeech.modules.subsampling import Conv2dSubsampling4
+
+
+class Conv2dSubsampling4Online(Conv2dSubsampling4):
+    def __init__(self,
+                 idim: int,
+                 odim: int,
+                 dropout_rate: float,
+                 pos_enc_class: nn.Layer=PositionalEncoding):
+        super().__init__(idim, odim, dropout_rate, pos_enc_class)
+        self.output_dim = ((idim - 1) // 2 - 1) // 2 * odim
+
+    def forward(self, x: paddle.Tensor,
+                x_len: paddle.Tensor) -> [paddle.Tensor, paddle.Tensor]:
+        x = x.unsqueeze(1)  # (b, c=1, t, f)
         x = self.conv(x)
-        x = self.bn(x)
-        x = self.act(x)
-
-        x_len = (x_len - self.kernel_size[1] + 2 * self.padding[1]
-                 ) // self.stride[1] + 1
-
-        # reset padding part to 0
-        masks = make_non_pad_mask(x_len)  #[B, T]
-        masks = masks.unsqueeze(1).unsqueeze(1)  # [B, 1, 1, T]
-        # TODO(Hui Zhang): not support bool multiply
-        # masks = masks.type_as(x)
-        masks = masks.astype(x.dtype)
-        x = x.multiply(masks)
-
-        return x, x_len
-
-
-class ConvStack(nn.Layer):
-    """Convolution group with stacked convolution layers.
-
-    :param feat_size: audio feature dim.
-    :type feat_size: int
-    :param num_stacks: Number of stacked convolution layers.
-    :type num_stacks: int
-    """
-
-    def __init__(self, feat_size, num_stacks):
-        super().__init__()
-        self.feat_size = feat_size  # D
-        self.num_stacks = num_stacks
-
-        self.conv_in = ConvBn(
-            num_channels_in=1,
-            num_channels_out=32,
-            kernel_size=(41, 11),  #[D, T]
-            stride=(2, 3),
-            padding=(20, 5),
-            act='brelu')
-
-        out_channel = 32
-        convs = [
-            ConvBn(
-                num_channels_in=32,
-                num_channels_out=out_channel,
-                kernel_size=(21, 11),
-                stride=(2, 1),
-                padding=(10, 5),
-                act='brelu') for i in range(num_stacks - 1)
-        ]
-        self.conv_stack = nn.LayerList(convs)
-
-        # conv output feat_dim
-        output_height = (feat_size - 1) // 2 + 1
-        for i in range(self.num_stacks - 1):
-            output_height = (output_height - 1) // 2 + 1
-        self.output_height = out_channel * output_height
-
-    def forward(self, x, x_len):
-        """
-        x: shape [B, C, D, T]
-        x_len : shape [B]
-        """
-        x, x_len = self.conv_in(x, x_len)
-        for i, conv in enumerate(self.conv_stack):
-            x, x_len = conv(x, x_len)
+        b, c, t, f = paddle.shape(x)
+        x = x.transpose([0, 2, 1, 3]).reshape([b, t, c * f])
+        x_len = ((x_len - 1) // 2 - 1) // 2
        return x, x_len
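
Note: the new output_dim and the length update ((n - 1) // 2 - 1) // 2 correspond to two stacked stride-2 convolutions with kernel size 3 and no length padding — the usual layout of Conv2dSubsampling4 in ESPnet/WeNet-style code. That layout is an assumption here, since the parent class is not shown in this diff; under it, (n - 3) // 2 + 1 == (n - 1) // 2 for any n >= 3. A quick self-contained check (helper names are ours, not from this PR):

def conv_out(n, kernel=3, stride=2):
    # output length of one unpadded convolution along an axis
    return (n - kernel) // stride + 1

idim, odim = 161, 32                       # e.g. 161 feature bins, 32 channels
freq = conv_out(conv_out(idim))            # apply the two stride-2 convolutions
assert freq == ((idim - 1) // 2 - 1) // 2  # the formula used for output_dim
print(freq * odim)                         # 39 * 32 = 1248 flattened features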

@@ -11,27 +11,19 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""Deepspeech2 ASR Model"""
+"""Deepspeech2 ASR Online Model"""
 from typing import Optional
 
 import paddle
 import paddle.nn.functional as F
 from paddle import nn
-from paddle.fluid.layers import fc
-from paddle.nn import GRU
-from paddle.nn import LayerList
-from paddle.nn import LayerNorm
-from paddle.nn import Linear
-from paddle.nn import LSTM
 from yacs.config import CfgNode
 
-from deepspeech.models.ds2_online.conv import ConvStack
-from deepspeech.models.ds2_online.rnn import RNNStack
+from deepspeech.models.ds2_online.conv import Conv2dSubsampling4Online
 from deepspeech.modules.ctc import CTCDecoder
 from deepspeech.utils import layer_tools
 from deepspeech.utils.checkpoint import Checkpoint
 from deepspeech.utils.log import Log
 
 logger = Log(__name__).getlog()
 
 __all__ = ['DeepSpeech2ModelOnline', 'DeepSpeech2InferModeOnline']
@@ -55,46 +47,48 @@ class CRNNEncoder(nn.Layer):
         self.num_rnn_layers = num_rnn_layers
         self.num_fc_layers = num_fc_layers
         self.fc_layers_size_list = fc_layers_size_list
-        self.conv = ConvStack(feat_size, num_conv_layers)
+        self.conv = Conv2dSubsampling4Online(feat_size, 32, dropout_rate=0.0)
 
-        i_size = self.conv.output_height  # H after conv stack
+        i_size = self.conv.output_dim
 
-        self.rnn = LayerList()
-        self.layernorm_list = LayerList()
-        self.fc_layers_list = LayerList()
+        self.rnn = nn.LayerList()
+        self.layernorm_list = nn.LayerList()
+        self.fc_layers_list = nn.LayerList()
         rnn_direction = 'forward'
         layernorm_size = rnn_size
         if use_gru == True:
             self.rnn.append(
-                GRU(input_size=i_size,
+                nn.GRU(
+                    input_size=i_size,
                     hidden_size=rnn_size,
                     num_layers=1,
                     direction=rnn_direction))
-            self.layernorm_list.append(LayerNorm(layernorm_size))
+            self.layernorm_list.append(nn.LayerNorm(layernorm_size))
             for i in range(1, num_rnn_layers):
                 self.rnn.append(
-                    GRU(input_size=layernorm_size,
+                    nn.GRU(
+                        input_size=layernorm_size,
                         hidden_size=rnn_size,
                         num_layers=1,
                         direction=rnn_direction))
-                self.layernorm_list.append(LayerNorm(layernorm_size))
+                self.layernorm_list.append(nn.LayerNorm(layernorm_size))
         else:
             self.rnn.append(
-                LSTM(
+                nn.LSTM(
                     input_size=i_size,
                     hidden_size=rnn_size,
                     num_layers=1,
                     direction=rnn_direction))
-            self.layernorm_list.append(LayerNorm(layernorm_size))
+            self.layernorm_list.append(nn.LayerNorm(layernorm_size))
             for i in range(1, num_rnn_layers):
                 self.rnn.append(
-                    LSTM(
+                    nn.LSTM(
                         input_size=layernorm_size,
                         hidden_size=rnn_size,
                         num_layers=1,
                         direction=rnn_direction))
-                self.layernorm_list.append(LayerNorm(layernorm_size))
+                self.layernorm_list.append(nn.LayerNorm(layernorm_size))
 
         fc_input_size = layernorm_size
         for i in range(self.num_fc_layers):
             self.fc_layers_list.append(
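
Note: each recurrent layer built above is a single-layer, unidirectional (forward-only) GRU or LSTM followed by a LayerNorm over the hidden size, which is what keeps the encoder streamable. Also note the new front-end hardcodes 32 output channels and dropout_rate=0.0, so num_conv_layers is stored but no longer affects the built graph. A minimal sketch of the repeated layer pattern (the RNNNormLayer name is ours, not from this PR):

import paddle
from paddle import nn

class RNNNormLayer(nn.Layer):
    # one unit of the pattern above: unidirectional RNN + LayerNorm
    def __init__(self, input_size, hidden_size, use_gru=True):
        super().__init__()
        rnn_cls = nn.GRU if use_gru else nn.LSTM
        self.rnn = rnn_cls(input_size=input_size, hidden_size=hidden_size,
                           num_layers=1, direction='forward')
        self.norm = nn.LayerNorm(hidden_size)

    def forward(self, x, x_lens):
        # sequence_length stops the recurrence past each utterance's length
        x, final_state = self.rnn(x, None, x_lens)  # x: [B, T, H]
        return self.norm(x), final_state
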
@@ -117,20 +111,16 @@ class CRNNEncoder(nn.Layer):
             x (Tensor): encoder outputs, [B, T, D]
             x_lens (Tensor): encoder length, [B]
         """
-        # [B, T, D] -> [B, D, T]
-        audio = audio.transpose([0, 2, 1])
-        # [B, D, T] -> [B, C=1, D, T]
-        x = audio.unsqueeze(1)
-
+        # [B, T, D]
+        x = audio
         x_lens = audio_len
         # convolution group
         x, x_lens = self.conv(x, x_lens)
-
         # convert data from convolution feature map to sequence of vectors
         #B, C, D, T = paddle.shape(x)  # not work under jit
-        x = x.transpose([0, 3, 1, 2])  #[B, T, C, D]
+        #x = x.transpose([0, 3, 1, 2])  #[B, T, C, D]
         #x = x.reshape([B, T, C * D])  #[B, T, C*D]  # not work under jit
-        x = x.reshape([0, 0, -1])  #[B, T, C*D]
+        #x = x.reshape([0, 0, -1])  #[B, T, C*D]
 
         # remove padding part
         x, output_state = self.rnn[0](x, None, x_lens)
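
Note: the old forward flattened the conv feature map in the encoder itself, and per the inline comments x.reshape([0, 0, -1]) did not work under jit export; the transpose/reshape now lives in Conv2dSubsampling4Online.forward, using paddle.shape for dynamic dimensions. A stand-alone sketch of that shape flow, with a throwaway conv stack standing in for the real subsampling module (layer sizes assumed):

import paddle
from paddle import nn

conv = nn.Sequential(              # stand-in for the two subsampling convs
    nn.Conv2D(1, 32, 3, 2), nn.ReLU(),
    nn.Conv2D(32, 32, 3, 2), nn.ReLU())
x = paddle.randn([2, 100, 161])    # [B, T, D] audio features
x = x.unsqueeze(1)                 # [B, 1, T, D]
x = conv(x)                        # [B, 32, 24, 39]
b, c, t, f = paddle.shape(x)       # dynamic shape, safe under jit
x = x.transpose([0, 2, 1, 3]).reshape([b, t, c * f])
print(x.shape)                     # [2, 24, 1248], ready for the RNN stack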

@@ -36,11 +36,10 @@ collator:
 
 model:
   num_conv_layers: 2
-  num_rnn_layers: 4
+  num_rnn_layers: 3
   rnn_layer_size: 1024
   use_gru: True
   share_rnn_weights: False
-  apply_online: False
 
 training:
   n_epoch: 50

@@ -40,7 +40,6 @@ model:
   rnn_layer_size: 2048
   use_gru: False
   share_rnn_weights: True
-  apply_online: False
 
 training:
   n_epoch: 50

@@ -41,7 +41,6 @@ model:
   rnn_layer_size: 2048
   use_gru: False
   share_rnn_weights: True
-  apply_online: True
 
 training:
   n_epoch: 10
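
Note: all three configs drop the apply_online switch; after this PR the offline and streaming models are separate classes, so the choice is made by importing one or the other. A hypothetical selection helper (only the two import paths below appear in this PR; the boolean switch is invented for illustration):

from deepspeech.models.ds2 import DeepSpeech2Model
from deepspeech.models.ds2_online import DeepSpeech2ModelOnline

def pick_model_class(online: bool):
    # replaces the removed per-config apply_online flag
    return DeepSpeech2ModelOnline if online else DeepSpeech2Model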

@@ -16,8 +16,8 @@ import unittest
 import numpy as np
 import paddle
 
-#from deepspeech.models.deepspeech2 import DeepSpeech2Model
-from deepspeech.models.ds2_online import DeepSpeech2ModelOnline as DeepSpeech2Model
+from deepspeech.models.ds2 import DeepSpeech2Model
+
 
 class TestDeepSpeech2Model(unittest.TestCase):
     def setUp(self):
