From 2cacbaf48ee80d1c00256cab732b203d4dc00b28 Mon Sep 17 00:00:00 2001 From: huangyuxin Date: Wed, 28 Jul 2021 02:14:36 +0000 Subject: [PATCH 01/24] =?UTF-8?q?=E4=BF=AE=E6=94=B9=E4=BA=86deepspeech2.py?= =?UTF-8?q?=E9=83=A8=E5=88=86LSTM=E5=92=8CGRU=E7=9A=84=E4=BB=A3=E7=A0=81?= =?UTF-8?q?=EF=BC=8C=E5=A2=9E=E5=8A=A0=E4=BA=86LayerNorm?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- deepspeech/exps/deepspeech2/model.py | 6 +- deepspeech/models/ds2/deepspeech2.py | 65 +++++++++++++++---- examples/aishell/s0/conf/deepspeech2.yaml | 3 +- examples/librispeech/s0/conf/deepspeech2.yaml | 1 + examples/tiny/s0/conf/deepspeech2.yaml | 1 + 5 files changed, 62 insertions(+), 14 deletions(-) diff --git a/deepspeech/exps/deepspeech2/model.py b/deepspeech/exps/deepspeech2/model.py index 2f84b686c..544d57d1b 100644 --- a/deepspeech/exps/deepspeech2/model.py +++ b/deepspeech/exps/deepspeech2/model.py @@ -127,7 +127,8 @@ class DeepSpeech2Trainer(Trainer): num_rnn_layers=config.model.num_rnn_layers, rnn_size=config.model.rnn_layer_size, use_gru=config.model.use_gru, - share_rnn_weights=config.model.share_rnn_weights) + share_rnn_weights=config.model.share_rnn_weights, + apply_online=config.model.apply_online) if self.parallel: model = paddle.DataParallel(model) @@ -374,7 +375,8 @@ class DeepSpeech2Tester(DeepSpeech2Trainer): num_rnn_layers=config.model.num_rnn_layers, rnn_size=config.model.rnn_layer_size, use_gru=config.model.use_gru, - share_rnn_weights=config.model.share_rnn_weights) + share_rnn_weights=config.model.share_rnn_weights, + apply_online=config.model.apply_online) self.model = model logger.info("Setup model!") diff --git a/deepspeech/models/ds2/deepspeech2.py b/deepspeech/models/ds2/deepspeech2.py index 0bd5fb95d..7f173ce29 100644 --- a/deepspeech/models/ds2/deepspeech2.py +++ b/deepspeech/models/ds2/deepspeech2.py @@ -25,6 +25,11 @@ from deepspeech.utils import layer_tools from deepspeech.utils.checkpoint import Checkpoint from deepspeech.utils.log import Log +from paddle.nn import LSTM, GRU +from paddle.nn import LayerNorm +from paddle.nn import LayerList + + logger = Log(__name__).getlog() __all__ = ['DeepSpeech2Model', 'DeepSpeech2InferMode'] @@ -38,25 +43,50 @@ class CRNNEncoder(nn.Layer): num_rnn_layers=3, rnn_size=1024, use_gru=False, - share_rnn_weights=True): + share_rnn_weights=True, + apply_online=True): super().__init__() self.rnn_size = rnn_size self.feat_size = feat_size # 161 for linear self.dict_size = dict_size - + self.num_rnn_layers = num_rnn_layers + self.apply_online = apply_online self.conv = ConvStack(feat_size, num_conv_layers) i_size = self.conv.output_height # H after conv stack + + + self.rnn = LayerList() + self.layernorm_list = LayerList() + + if (apply_online == True): + rnn_direction = 'forward' + else: + rnn_direction = 'bidirect' + + if use_gru == True: + self.rnn.append(GRU(input_size=i_size, hidden_size=rnn_size, num_layers=1, direction = rnn_direction)) + self.layernorm_list.append(LayerNorm(rnn_size)) + for i in range(1, num_rnn_layers): + self.rnn.append(GRU(input_size=rnn_size, hidden_size=rnn_size, num_layers=1, direction = rnn_direction)) + self.layernorm_list.append(LayerNorm(rnn_size)) + else: + self.rnn.append(LSTM(input_size=i_size, hidden_size=rnn_size, num_layers=1, direction = rnn_direction)) + self.layernorm_list.append(LayerNorm(rnn_size)) + for i in range(1, num_rnn_layers): + self.rnn.append(LSTM(input_size=rnn_size, hidden_size=rnn_size, num_layers=1, direction = rnn_direction)) + 
self.layernorm_list.append(LayerNorm(rnn_size)) + """ self.rnn = RNNStack( i_size=i_size, h_size=rnn_size, num_stacks=num_rnn_layers, use_gru=use_gru, share_rnn_weights=share_rnn_weights) - + """ @property def output_size(self): - return self.rnn_size * 2 + return self.rnn_size def forward(self, audio, audio_len): """Compute Encoder outputs @@ -86,7 +116,15 @@ class CRNNEncoder(nn.Layer): x = x.reshape([0, 0, -1]) #[B, T, C*D] # remove padding part - x, x_lens = self.rnn(x, x_lens) #[B, T, D] + print ("x.shape:", x.shape) + x, output_state = self.rnn[0](x, None, x_lens) + x = self.layernorm_list[0](x) + for i in range(1, self.num_rnn_layers): + x, output_state = self.rnn[i](x, output_state, x_lens) #[B, T, D] + x = self.layernorm_list[i](x) + """ + x, x_lens = self.rnn(x, x_lens) + """ return x, x_lens @@ -141,7 +179,8 @@ class DeepSpeech2Model(nn.Layer): num_rnn_layers=3, rnn_size=1024, use_gru=False, - share_rnn_weights=True): + share_rnn_weights=True, + apply_online = True): super().__init__() self.encoder = CRNNEncoder( feat_size=feat_size, @@ -150,8 +189,9 @@ class DeepSpeech2Model(nn.Layer): num_rnn_layers=num_rnn_layers, rnn_size=rnn_size, use_gru=use_gru, - share_rnn_weights=share_rnn_weights) - assert (self.encoder.output_size == rnn_size * 2) + share_rnn_weights=share_rnn_weights, + apply_online=apply_online) + assert (self.encoder.output_size == rnn_size) self.decoder = CTCDecoder( odim=dict_size, # is in vocab @@ -221,7 +261,8 @@ class DeepSpeech2Model(nn.Layer): num_rnn_layers=config.model.num_rnn_layers, rnn_size=config.model.rnn_layer_size, use_gru=config.model.use_gru, - share_rnn_weights=config.model.share_rnn_weights) + share_rnn_weights=config.model.share_rnn_weights, + apply_online=config.model.apply_online) infos = Checkpoint().load_parameters( model, checkpoint_path=checkpoint_path) logger.info(f"checkpoint info: {infos}") @@ -237,7 +278,8 @@ class DeepSpeech2InferModel(DeepSpeech2Model): num_rnn_layers=3, rnn_size=1024, use_gru=False, - share_rnn_weights=True): + share_rnn_weights=True, + apply_online = True): super().__init__( feat_size=feat_size, dict_size=dict_size, @@ -245,7 +287,8 @@ class DeepSpeech2InferModel(DeepSpeech2Model): num_rnn_layers=num_rnn_layers, rnn_size=rnn_size, use_gru=use_gru, - share_rnn_weights=share_rnn_weights) + share_rnn_weights=share_rnn_weights, + apply_online=apply_online) def forward(self, audio, audio_len): """export model function diff --git a/examples/aishell/s0/conf/deepspeech2.yaml b/examples/aishell/s0/conf/deepspeech2.yaml index 1c97fc607..7d0d1f895 100644 --- a/examples/aishell/s0/conf/deepspeech2.yaml +++ b/examples/aishell/s0/conf/deepspeech2.yaml @@ -36,10 +36,11 @@ collator: model: num_conv_layers: 2 - num_rnn_layers: 3 + num_rnn_layers: 4 rnn_layer_size: 1024 use_gru: True share_rnn_weights: False + apply_online: False training: n_epoch: 50 diff --git a/examples/librispeech/s0/conf/deepspeech2.yaml b/examples/librispeech/s0/conf/deepspeech2.yaml index acee94c3e..be1918d01 100644 --- a/examples/librispeech/s0/conf/deepspeech2.yaml +++ b/examples/librispeech/s0/conf/deepspeech2.yaml @@ -40,6 +40,7 @@ model: rnn_layer_size: 2048 use_gru: False share_rnn_weights: True + apply_online: False training: n_epoch: 50 diff --git a/examples/tiny/s0/conf/deepspeech2.yaml b/examples/tiny/s0/conf/deepspeech2.yaml index ea433f341..8c719e5cd 100644 --- a/examples/tiny/s0/conf/deepspeech2.yaml +++ b/examples/tiny/s0/conf/deepspeech2.yaml @@ -41,6 +41,7 @@ model: rnn_layer_size: 2048 use_gru: False share_rnn_weights: True + apply_online: 
True training: n_epoch: 10 From 269eecb3be5511a10d97643cd3b4c0fd37554af7 Mon Sep 17 00:00:00 2001 From: huangyuxin Date: Wed, 28 Jul 2021 02:45:59 +0000 Subject: [PATCH 02/24] =?UTF-8?q?=E6=96=B0=E5=BB=BAds2=5Fonline=E6=96=87?= =?UTF-8?q?=E4=BB=B6=E5=A4=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- deepspeech/exps/deepspeech2/model.py | 6 +- deepspeech/models/ds2_online/__init__.py | 7 + deepspeech/models/ds2_online/conv.py | 172 +++++++++++ deepspeech/models/ds2_online/deepspeech2.py | 312 +++++++++++++++++++ deepspeech/models/ds2_online/rnn.py | 314 ++++++++++++++++++++ 5 files changed, 807 insertions(+), 4 deletions(-) create mode 100644 deepspeech/models/ds2_online/__init__.py create mode 100644 deepspeech/models/ds2_online/conv.py create mode 100644 deepspeech/models/ds2_online/deepspeech2.py create mode 100644 deepspeech/models/ds2_online/rnn.py diff --git a/deepspeech/exps/deepspeech2/model.py b/deepspeech/exps/deepspeech2/model.py index 544d57d1b..2f84b686c 100644 --- a/deepspeech/exps/deepspeech2/model.py +++ b/deepspeech/exps/deepspeech2/model.py @@ -127,8 +127,7 @@ class DeepSpeech2Trainer(Trainer): num_rnn_layers=config.model.num_rnn_layers, rnn_size=config.model.rnn_layer_size, use_gru=config.model.use_gru, - share_rnn_weights=config.model.share_rnn_weights, - apply_online=config.model.apply_online) + share_rnn_weights=config.model.share_rnn_weights) if self.parallel: model = paddle.DataParallel(model) @@ -375,8 +374,7 @@ class DeepSpeech2Tester(DeepSpeech2Trainer): num_rnn_layers=config.model.num_rnn_layers, rnn_size=config.model.rnn_layer_size, use_gru=config.model.use_gru, - share_rnn_weights=config.model.share_rnn_weights, - apply_online=config.model.apply_online) + share_rnn_weights=config.model.share_rnn_weights) self.model = model logger.info("Setup model!") diff --git a/deepspeech/models/ds2_online/__init__.py b/deepspeech/models/ds2_online/__init__.py new file mode 100644 index 000000000..299f901cb --- /dev/null +++ b/deepspeech/models/ds2_online/__init__.py @@ -0,0 +1,7 @@ +from .deepspeech2 import DeepSpeech2Model +from .deepspeech2 import DeepSpeech2InferModel + +__all__ = ['DeepSpeech2Model', 'DeepSpeech2InferModel'] + + + diff --git a/deepspeech/models/ds2_online/conv.py b/deepspeech/models/ds2_online/conv.py new file mode 100644 index 000000000..8bf48b2c8 --- /dev/null +++ b/deepspeech/models/ds2_online/conv.py @@ -0,0 +1,172 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
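(Aside on the new ds2_online package introduced above: its __init__.py re-exports the model classes, so callers can construct the online model from the package root. A minimal usage sketch follows; the feature and vocabulary sizes are assumed example values, and the remaining arguments are the defaults of the class defined later in this patch.)

# Sketch only: build the online DS2 model from the new package root.
from deepspeech.models.ds2_online import DeepSpeech2Model

model = DeepSpeech2Model(
    feat_size=161,        # assumed: linear-spectrogram feature dim
    dict_size=29,         # assumed: vocabulary size
    num_conv_layers=2,
    num_rnn_layers=3,
    rnn_size=1024,
    use_gru=False)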
+from paddle import nn +from paddle.nn import functional as F + +from deepspeech.modules.activation import brelu +from deepspeech.modules.mask import make_non_pad_mask +from deepspeech.utils.log import Log + +logger = Log(__name__).getlog() + +__all__ = ['ConvStack', "conv_output_size"] + + +def conv_output_size(I, F, P, S): + # https://stanford.edu/~shervine/teaching/cs-230/cheatsheet-convolutional-neural-networks#hyperparameters + # Output size after Conv: + # By noting I the length of the input volume size, + # F the length of the filter, + # P the amount of zero padding, + # S the stride, + # then the output size O of the feature map along that dimension is given by: + # O = (I - F + Pstart + Pend) // S + 1 + # When Pstart == Pend == P, we can replace Pstart + Pend by 2P. + # When Pstart == Pend == 0 + # O = (I - F - S) // S + # https://iq.opengenus.org/output-size-of-convolution/ + # Output height = (Input height + padding height top + padding height bottom - kernel height) / (stride height) + 1 + # Output width = (Output width + padding width right + padding width left - kernel width) / (stride width) + 1 + return (I - F + 2 * P - S) // S + + +# receptive field calculator +# https://fomoro.com/research/article/receptive-field-calculator +# https://stanford.edu/~shervine/teaching/cs-230/cheatsheet-convolutional-neural-networks#hyperparameters +# https://distill.pub/2019/computing-receptive-fields/ +# Rl-1 = Sl * Rl + (Kl - Sl) + + +class ConvBn(nn.Layer): + """Convolution layer with batch normalization. + + :param kernel_size: The x dimension of a filter kernel. Or input a tuple for + two image dimension. + :type kernel_size: int|tuple|list + :param num_channels_in: Number of input channels. + :type num_channels_in: int + :param num_channels_out: Number of output channels. + :type num_channels_out: int + :param stride: The x dimension of the stride. Or input a tuple for two + image dimension. + :type stride: int|tuple|list + :param padding: The x dimension of the padding. Or input a tuple for two + image dimension. + :type padding: int|tuple|list + :param act: Activation type, relu|brelu + :type act: string + :return: Batch norm layer after convolution layer. + :rtype: Variable + + """ + + def __init__(self, num_channels_in, num_channels_out, kernel_size, stride, + padding, act): + + super().__init__() + assert len(kernel_size) == 2 + assert len(stride) == 2 + assert len(padding) == 2 + self.kernel_size = kernel_size + self.stride = stride + self.padding = padding + + self.conv = nn.Conv2D( + num_channels_in, + num_channels_out, + kernel_size=kernel_size, + stride=stride, + padding=padding, + weight_attr=None, + bias_attr=False, + data_format='NCHW') + + self.bn = nn.BatchNorm2D( + num_channels_out, + weight_attr=None, + bias_attr=None, + data_format='NCHW') + self.act = F.relu if act == 'relu' else brelu + + def forward(self, x, x_len): + """ + x(Tensor): audio, shape [B, C, D, T] + """ + x = self.conv(x) + x = self.bn(x) + x = self.act(x) + + x_len = (x_len - self.kernel_size[1] + 2 * self.padding[1] + ) // self.stride[1] + 1 + + # reset padding part to 0 + masks = make_non_pad_mask(x_len) #[B, T] + masks = masks.unsqueeze(1).unsqueeze(1) # [B, 1, 1, T] + # TODO(Hui Zhang): not support bool multiply + # masks = masks.type_as(x) + masks = masks.astype(x.dtype) + x = x.multiply(masks) + + return x, x_len + + +class ConvStack(nn.Layer): + """Convolution group with stacked convolution layers. + + :param feat_size: audio feature dim. 
+ :type feat_size: int + :param num_stacks: Number of stacked convolution layers. + :type num_stacks: int + """ + + def __init__(self, feat_size, num_stacks): + super().__init__() + self.feat_size = feat_size # D + self.num_stacks = num_stacks + + self.conv_in = ConvBn( + num_channels_in=1, + num_channels_out=32, + kernel_size=(41, 11), #[D, T] + stride=(2, 3), + padding=(20, 5), + act='brelu') + + out_channel = 32 + convs = [ + ConvBn( + num_channels_in=32, + num_channels_out=out_channel, + kernel_size=(21, 11), + stride=(2, 1), + padding=(10, 5), + act='brelu') for i in range(num_stacks - 1) + ] + self.conv_stack = nn.LayerList(convs) + + # conv output feat_dim + output_height = (feat_size - 1) // 2 + 1 + for i in range(self.num_stacks - 1): + output_height = (output_height - 1) // 2 + 1 + self.output_height = out_channel * output_height + + def forward(self, x, x_len): + """ + x: shape [B, C, D, T] + x_len : shape [B] + """ + x, x_len = self.conv_in(x, x_len) + for i, conv in enumerate(self.conv_stack): + x, x_len = conv(x, x_len) + return x, x_len diff --git a/deepspeech/models/ds2_online/deepspeech2.py b/deepspeech/models/ds2_online/deepspeech2.py new file mode 100644 index 000000000..c5e6a92bc --- /dev/null +++ b/deepspeech/models/ds2_online/deepspeech2.py @@ -0,0 +1,312 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
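(A quick numeric check of the ConvStack geometry documented in conv.py above, before the model file below. The 161-dim feature and the 200-frame input are assumed example values; the kernel, stride, and padding values are the ones hard-coded in ConvStack.)

# Sketch only: frequency-axis height and time-axis length after the two ConvBn layers.
feat_size = 161                       # assumed linear-spectrogram dim
h = (feat_size - 1) // 2 + 1          # conv 1: kernel 41, stride 2, padding 20 -> 81
h = (h - 1) // 2 + 1                  # conv 2: kernel 21, stride 2, padding 10 -> 41
output_height = 32 * h                # 32 output channels -> 1312 (self.output_height for feat_size=161)

def conv_out_len(t_len, kernel, stride, padding):
    # the same time-axis formula ConvBn.forward applies to x_len
    return (t_len - kernel + 2 * padding) // stride + 1

t = conv_out_len(200, kernel=11, stride=3, padding=5)  # conv 1: 200 frames -> 67
t = conv_out_len(t, kernel=11, stride=1, padding=5)    # conv 2: stride 1 -> still 67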
+"""Deepspeech2 ASR Model""" +from typing import Optional + +import paddle +from paddle import nn +from yacs.config import CfgNode + +from deepspeech.models.ds2.conv import ConvStack +from deepspeech.modules.ctc import CTCDecoder +from deepspeech.models.ds2.rnn import RNNStack +from deepspeech.utils import layer_tools +from deepspeech.utils.checkpoint import Checkpoint +from deepspeech.utils.log import Log + +from paddle.nn import LSTM, GRU +from paddle.nn import LayerNorm +from paddle.nn import LayerList + + +logger = Log(__name__).getlog() + +__all__ = ['DeepSpeech2Model', 'DeepSpeech2InferMode'] + + +class CRNNEncoder(nn.Layer): + def __init__(self, + feat_size, + dict_size, + num_conv_layers=2, + num_rnn_layers=3, + rnn_size=1024, + use_gru=False, + share_rnn_weights=True, + apply_online=True): + super().__init__() + self.rnn_size = rnn_size + self.feat_size = feat_size # 161 for linear + self.dict_size = dict_size + self.num_rnn_layers = num_rnn_layers + self.apply_online = apply_online + self.conv = ConvStack(feat_size, num_conv_layers) + + i_size = self.conv.output_height # H after conv stack + + + self.rnn = LayerList() + self.layernorm_list = LayerList() + + if (apply_online == True): + rnn_direction = 'forward' + layernorm_size = rnn_size + else: + rnn_direction = 'bidirect' + layernorm_size = 2 * rnn_size + + if use_gru == True: + self.rnn.append(GRU(input_size=i_size, hidden_size=rnn_size, num_layers=1, direction = rnn_direction)) + self.layernorm_list.append(LayerNorm(layernorm_size)) + for i in range(1, num_rnn_layers): + self.rnn.append(GRU(input_size=layernorm_size, hidden_size=rnn_size, num_layers=1, direction = rnn_direction)) + self.layernorm_list.append(LayerNorm(layernorm_size)) + else: + self.rnn.append(LSTM(input_size=i_size, hidden_size=rnn_size, num_layers=1, direction = rnn_direction)) + self.layernorm_list.append(LayerNorm(layernorm_size)) + for i in range(1, num_rnn_layers): + self.rnn.append(LSTM(input_size=layernorm_size, hidden_size=rnn_size, num_layers=1, direction = rnn_direction)) + self.layernorm_list.append(LayerNorm(layernorm_size)) + """ + self.rnn = RNNStack( + i_size=i_size, + h_size=rnn_size, + num_stacks=num_rnn_layers, + use_gru=use_gru, + share_rnn_weights=share_rnn_weights) + """ + @property + def output_size(self): + if (self.apply_online == True): + return self.rnn_size + else: + return 2 * self.rnn_size + + def forward(self, audio, audio_len): + """Compute Encoder outputs + + Args: + audio (Tensor): [B, Tmax, D] + text (Tensor): [B, Umax] + audio_len (Tensor): [B] + text_len (Tensor): [B] + Returns: + x (Tensor): encoder outputs, [B, T, D] + x_lens (Tensor): encoder length, [B] + """ + # [B, T, D] -> [B, D, T] + audio = audio.transpose([0, 2, 1]) + # [B, D, T] -> [B, C=1, D, T] + x = audio.unsqueeze(1) + x_lens = audio_len + + # convolution group + x, x_lens = self.conv(x, x_lens) + + # convert data from convolution feature map to sequence of vectors + #B, C, D, T = paddle.shape(x) # not work under jit + x = x.transpose([0, 3, 1, 2]) #[B, T, C, D] + #x = x.reshape([B, T, C * D]) #[B, T, C*D] # not work under jit + x = x.reshape([0, 0, -1]) #[B, T, C*D] + + # remove padding part + x, output_state = self.rnn[0](x, None, x_lens) + x = self.layernorm_list[0](x) + for i in range(1, self.num_rnn_layers): + x, output_state = self.rnn[i](x, output_state, x_lens) #[B, T, D] + x = self.layernorm_list[i](x) + """ + x, x_lens = self.rnn(x, x_lens) + """ + return x, x_lens + + +class DeepSpeech2Model(nn.Layer): + """The DeepSpeech2 network structure. 
+ + :param audio_data: Audio spectrogram data layer. + :type audio_data: Variable + :param text_data: Transcription text data layer. + :type text_data: Variable + :param audio_len: Valid sequence length data layer. + :type audio_len: Variable + :param masks: Masks data layer to reset padding. + :type masks: Variable + :param dict_size: Dictionary size for tokenized transcription. + :type dict_size: int + :param num_conv_layers: Number of stacking convolution layers. + :type num_conv_layers: int + :param num_rnn_layers: Number of stacking RNN layers. + :type num_rnn_layers: int + :param rnn_size: RNN layer size (dimension of RNN cells). + :type rnn_size: int + :param use_gru: Use gru if set True. Use simple rnn if set False. + :type use_gru: bool + :param share_rnn_weights: Whether to share input-hidden weights between + forward and backward direction RNNs. + It is only available when use_gru=False. + :type share_weights: bool + :return: A tuple of an output unnormalized log probability layer ( + before softmax) and a ctc cost layer. + :rtype: tuple of LayerOutput + """ + + @classmethod + def params(cls, config: Optional[CfgNode]=None) -> CfgNode: + default = CfgNode( + dict( + num_conv_layers=2, #Number of stacking convolution layers. + num_rnn_layers=3, #Number of stacking RNN layers. + rnn_layer_size=1024, #RNN layer size (number of RNN cells). + use_gru=True, #Use gru if set True. Use simple rnn if set False. + share_rnn_weights=True #Whether to share input-hidden weights between forward and backward directional RNNs.Notice that for GRU, weight sharing is not supported. + )) + if config is not None: + config.merge_from_other_cfg(default) + return default + + def __init__(self, + feat_size, + dict_size, + num_conv_layers=2, + num_rnn_layers=3, + rnn_size=1024, + use_gru=False, + share_rnn_weights=True, + apply_online = True): + super().__init__() + self.encoder = CRNNEncoder( + feat_size=feat_size, + dict_size=dict_size, + num_conv_layers=num_conv_layers, + num_rnn_layers=num_rnn_layers, + rnn_size=rnn_size, + use_gru=use_gru, + share_rnn_weights=share_rnn_weights, + apply_online=apply_online) + if (apply_online == True): + assert (self.encoder.output_size == rnn_size) + else: + assert (self.encoder.output_size == 2 * rnn_size) + + self.decoder = CTCDecoder( + odim=dict_size, # is in vocab + enc_n_units=self.encoder.output_size, + blank_id=0, # first token is + dropout_rate=0.0, + reduction=True, # sum + batch_average=True) # sum / batch_size + + def forward(self, audio, audio_len, text, text_len): + """Compute Model loss + + Args: + audio (Tenosr): [B, T, D] + audio_len (Tensor): [B] + text (Tensor): [B, U] + text_len (Tensor): [B] + + Returns: + loss (Tenosr): [1] + """ + eouts, eouts_len = self.encoder(audio, audio_len) + loss = self.decoder(eouts, eouts_len, text, text_len) + return loss + + @paddle.no_grad() + def decode(self, audio, audio_len, vocab_list, decoding_method, + lang_model_path, beam_alpha, beam_beta, beam_size, cutoff_prob, + cutoff_top_n, num_processes): + # init once + # decoders only accept string encoded in utf-8 + self.decoder.init_decode( + beam_alpha=beam_alpha, + beam_beta=beam_beta, + lang_model_path=lang_model_path, + vocab_list=vocab_list, + decoding_method=decoding_method) + + eouts, eouts_len = self.encoder(audio, audio_len) + probs = self.decoder.softmax(eouts) + return self.decoder.decode_probs( + probs.numpy(), eouts_len, vocab_list, decoding_method, + lang_model_path, beam_alpha, beam_beta, beam_size, cutoff_prob, + cutoff_top_n, num_processes) + + 
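(The per-layer pattern that CRNNEncoder above relies on, unidirectional paddle.nn GRU/LSTM layers each followed by a LayerNorm, can be isolated in a few lines. The sketch below is illustrative only: toy sizes, the GRU case, and a fresh initial state per layer rather than the state threading used in the encoder.)

import paddle
from paddle.nn import GRU, LayerNorm, LayerList

class TinyOnlineStack(paddle.nn.Layer):
    # Sketch: forward-only GRU layers with LayerNorm after each, as in CRNNEncoder.
    def __init__(self, i_size=64, rnn_size=128, num_layers=3):
        super().__init__()
        self.rnn = LayerList()
        self.norm = LayerList()
        for i in range(num_layers):
            in_size = i_size if i == 0 else rnn_size
            self.rnn.append(GRU(input_size=in_size, hidden_size=rnn_size,
                                num_layers=1, direction='forward'))
            self.norm.append(LayerNorm(rnn_size))

    def forward(self, x, x_lens):
        for rnn, norm in zip(self.rnn, self.norm):
            x, _ = rnn(x, None, x_lens)  # sequence_length=x_lens handles padding
            x = norm(x)                  # [B, T, rnn_size]
        return x, x_lens

x = paddle.randn([2, 20, 64])        # [B, T, D]
x_lens = paddle.to_tensor([20, 15])  # valid frames per utterance
y, _ = TinyOnlineStack()(x, x_lens)  # y.shape == [2, 20, 128]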
@classmethod + def from_pretrained(cls, dataloader, config, checkpoint_path): + """Build a DeepSpeech2Model model from a pretrained model. + Parameters + ---------- + dataloader: paddle.io.DataLoader + + config: yacs.config.CfgNode + model configs + + checkpoint_path: Path or str + the path of pretrained model checkpoint, without extension name + + Returns + ------- + DeepSpeech2Model + The model built from pretrained result. + """ + model = cls(feat_size=dataloader.collate_fn.feature_size, + dict_size=dataloader.collate_fn.vocab_size, + num_conv_layers=config.model.num_conv_layers, + num_rnn_layers=config.model.num_rnn_layers, + rnn_size=config.model.rnn_layer_size, + use_gru=config.model.use_gru, + share_rnn_weights=config.model.share_rnn_weights, + apply_online=config.model.apply_online) + infos = Checkpoint().load_parameters( + model, checkpoint_path=checkpoint_path) + logger.info(f"checkpoint info: {infos}") + layer_tools.summary(model) + return model + + +class DeepSpeech2InferModel(DeepSpeech2Model): + def __init__(self, + feat_size, + dict_size, + num_conv_layers=2, + num_rnn_layers=3, + rnn_size=1024, + use_gru=False, + share_rnn_weights=True, + apply_online = True): + super().__init__( + feat_size=feat_size, + dict_size=dict_size, + num_conv_layers=num_conv_layers, + num_rnn_layers=num_rnn_layers, + rnn_size=rnn_size, + use_gru=use_gru, + share_rnn_weights=share_rnn_weights, + apply_online=apply_online) + + def forward(self, audio, audio_len): + """export model function + + Args: + audio (Tensor): [B, T, D] + audio_len (Tensor): [B] + + Returns: + probs: probs after softmax + """ + eouts, eouts_len = self.encoder(audio, audio_len) + probs = self.decoder.softmax(eouts) + return probs diff --git a/deepspeech/models/ds2_online/rnn.py b/deepspeech/models/ds2_online/rnn.py new file mode 100644 index 000000000..01b55c4a2 --- /dev/null +++ b/deepspeech/models/ds2_online/rnn.py @@ -0,0 +1,314 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import math + +import paddle +from paddle import nn +from paddle.nn import functional as F +from paddle.nn import initializer as I + +from deepspeech.modules.activation import brelu +from deepspeech.modules.mask import make_non_pad_mask +from deepspeech.utils.log import Log + +logger = Log(__name__).getlog() + +__all__ = ['RNNStack'] + + +class RNNCell(nn.RNNCellBase): + r""" + Elman RNN (SimpleRNN) cell. Given the inputs and previous states, it + computes the outputs and updates states. + The formula used is as follows: + .. math:: + h_{t} & = act(x_{t} + b_{ih} + W_{hh}h_{t-1} + b_{hh}) + y_{t} & = h_{t} + + where :math:`act` is for :attr:`activation`. 
+ """ + + def __init__(self, + hidden_size: int, + activation="tanh", + weight_ih_attr=None, + weight_hh_attr=None, + bias_ih_attr=None, + bias_hh_attr=None, + name=None): + super().__init__() + std = 1.0 / math.sqrt(hidden_size) + self.weight_hh = self.create_parameter( + (hidden_size, hidden_size), + weight_hh_attr, + default_initializer=I.Uniform(-std, std)) + self.bias_ih = None + self.bias_hh = self.create_parameter( + (hidden_size, ), + bias_hh_attr, + is_bias=True, + default_initializer=I.Uniform(-std, std)) + + self.hidden_size = hidden_size + if activation not in ["tanh", "relu", "brelu"]: + raise ValueError( + "activation for SimpleRNNCell should be tanh or relu, " + "but get {}".format(activation)) + self.activation = activation + self._activation_fn = paddle.tanh \ + if activation == "tanh" \ + else F.relu + if activation == 'brelu': + self._activation_fn = brelu + + def forward(self, inputs, states=None): + if states is None: + states = self.get_initial_states(inputs, self.state_shape) + pre_h = states + i2h = inputs + if self.bias_ih is not None: + i2h += self.bias_ih + h2h = paddle.matmul(pre_h, self.weight_hh, transpose_y=True) + if self.bias_hh is not None: + h2h += self.bias_hh + h = self._activation_fn(i2h + h2h) + return h, h + + @property + def state_shape(self): + return (self.hidden_size, ) + + +class GRUCell(nn.RNNCellBase): + r""" + Gated Recurrent Unit (GRU) RNN cell. Given the inputs and previous states, + it computes the outputs and updates states. + The formula for GRU used is as follows: + .. math:: + r_{t} & = \sigma(W_{ir}x_{t} + b_{ir} + W_{hr}h_{t-1} + b_{hr}) + z_{t} & = \sigma(W_{iz}x_{t} + b_{iz} + W_{hz}h_{t-1} + b_{hz}) + \widetilde{h}_{t} & = \tanh(W_{ic}x_{t} + b_{ic} + r_{t} * (W_{hc}h_{t-1} + b_{hc})) + h_{t} & = z_{t} * h_{t-1} + (1 - z_{t}) * \widetilde{h}_{t} + y_{t} & = h_{t} + + where :math:`\sigma` is the sigmoid fucntion, and * is the elemetwise + multiplication operator. 
+ """ + + def __init__(self, + input_size: int, + hidden_size: int, + weight_ih_attr=None, + weight_hh_attr=None, + bias_ih_attr=None, + bias_hh_attr=None, + name=None): + super().__init__() + std = 1.0 / math.sqrt(hidden_size) + self.weight_hh = self.create_parameter( + (3 * hidden_size, hidden_size), + weight_hh_attr, + default_initializer=I.Uniform(-std, std)) + self.bias_ih = None + self.bias_hh = self.create_parameter( + (3 * hidden_size, ), + bias_hh_attr, + is_bias=True, + default_initializer=I.Uniform(-std, std)) + + self.hidden_size = hidden_size + self.input_size = input_size + self._gate_activation = F.sigmoid + self._activation = paddle.tanh + + def forward(self, inputs, states=None): + if states is None: + states = self.get_initial_states(inputs, self.state_shape) + + pre_hidden = states + x_gates = inputs + if self.bias_ih is not None: + x_gates = x_gates + self.bias_ih + h_gates = paddle.matmul(pre_hidden, self.weight_hh, transpose_y=True) + if self.bias_hh is not None: + h_gates = h_gates + self.bias_hh + + x_r, x_z, x_c = paddle.split(x_gates, num_or_sections=3, axis=1) + h_r, h_z, h_c = paddle.split(h_gates, num_or_sections=3, axis=1) + + r = self._gate_activation(x_r + h_r) + z = self._gate_activation(x_z + h_z) + c = self._activation(x_c + r * h_c) # apply reset gate after mm + h = (pre_hidden - c) * z + c + # https://www.paddlepaddle.org.cn/documentation/docs/zh/api/paddle/fluid/layers/dynamic_gru_cn.html#dynamic-gru + + return h, h + + @property + def state_shape(self): + r""" + The `state_shape` of GRUCell is a shape `[hidden_size]` (-1 for batch + size would be automatically inserted into shape). The shape corresponds + to the shape of :math:`h_{t-1}`. + """ + return (self.hidden_size, ) + + +class BiRNNWithBN(nn.Layer): + """Bidirectonal simple rnn layer with sequence-wise batch normalization. + The batch normalization is only performed on input-state weights. + + :param size: Dimension of RNN cells. + :type size: int + :param share_weights: Whether to share input-hidden weights between + forward and backward directional RNNs. + :type share_weights: bool + :return: Bidirectional simple rnn layer. + :rtype: Variable + """ + + def __init__(self, i_size: int, h_size: int, share_weights: bool): + super().__init__() + self.share_weights = share_weights + if self.share_weights: + #input-hidden weights shared between bi-directional rnn. 
+ self.fw_fc = nn.Linear(i_size, h_size, bias_attr=False) + # batch norm is only performed on input-state projection + self.fw_bn = nn.BatchNorm1D( + h_size, bias_attr=None, data_format='NLC') + self.bw_fc = self.fw_fc + self.bw_bn = self.fw_bn + else: + self.fw_fc = nn.Linear(i_size, h_size, bias_attr=False) + self.fw_bn = nn.BatchNorm1D( + h_size, bias_attr=None, data_format='NLC') + self.bw_fc = nn.Linear(i_size, h_size, bias_attr=False) + self.bw_bn = nn.BatchNorm1D( + h_size, bias_attr=None, data_format='NLC') + + self.fw_cell = RNNCell(hidden_size=h_size, activation='brelu') + self.bw_cell = RNNCell(hidden_size=h_size, activation='brelu') + self.fw_rnn = nn.RNN( + self.fw_cell, is_reverse=False, time_major=False) #[B, T, D] + self.bw_rnn = nn.RNN( + self.fw_cell, is_reverse=True, time_major=False) #[B, T, D] + + def forward(self, x: paddle.Tensor, x_len: paddle.Tensor): + # x, shape [B, T, D] + fw_x = self.fw_bn(self.fw_fc(x)) + bw_x = self.bw_bn(self.bw_fc(x)) + fw_x, _ = self.fw_rnn(inputs=fw_x, sequence_length=x_len) + bw_x, _ = self.bw_rnn(inputs=bw_x, sequence_length=x_len) + x = paddle.concat([fw_x, bw_x], axis=-1) + return x, x_len + + +class BiGRUWithBN(nn.Layer): + """Bidirectonal gru layer with sequence-wise batch normalization. + The batch normalization is only performed on input-state weights. + + :param name: Name of the layer. + :type name: string + :param input: Input layer. + :type input: Variable + :param size: Dimension of GRU cells. + :type size: int + :param act: Activation type. + :type act: string + :return: Bidirectional GRU layer. + :rtype: Variable + """ + + def __init__(self, i_size: int, h_size: int): + super().__init__() + hidden_size = h_size * 3 + + self.fw_fc = nn.Linear(i_size, hidden_size, bias_attr=False) + self.fw_bn = nn.BatchNorm1D( + hidden_size, bias_attr=None, data_format='NLC') + self.bw_fc = nn.Linear(i_size, hidden_size, bias_attr=False) + self.bw_bn = nn.BatchNorm1D( + hidden_size, bias_attr=None, data_format='NLC') + + self.fw_cell = GRUCell(input_size=hidden_size, hidden_size=h_size) + self.bw_cell = GRUCell(input_size=hidden_size, hidden_size=h_size) + self.fw_rnn = nn.RNN( + self.fw_cell, is_reverse=False, time_major=False) #[B, T, D] + self.bw_rnn = nn.RNN( + self.fw_cell, is_reverse=True, time_major=False) #[B, T, D] + + def forward(self, x, x_len): + # x, shape [B, T, D] + fw_x = self.fw_bn(self.fw_fc(x)) + bw_x = self.bw_bn(self.bw_fc(x)) + fw_x, _ = self.fw_rnn(inputs=fw_x, sequence_length=x_len) + bw_x, _ = self.bw_rnn(inputs=bw_x, sequence_length=x_len) + x = paddle.concat([fw_x, bw_x], axis=-1) + return x, x_len + + +class RNNStack(nn.Layer): + """RNN group with stacked bidirectional simple RNN or GRU layers. + + :param input: Input layer. + :type input: Variable + :param size: Dimension of RNN cells in each layer. + :type size: int + :param num_stacks: Number of stacked rnn layers. + :type num_stacks: int + :param use_gru: Use gru if set True. Use simple rnn if set False. + :type use_gru: bool + :param share_rnn_weights: Whether to share input-hidden weights between + forward and backward directional RNNs. + It is only available when use_gru=False. + :type share_weights: bool + :return: Output layer of the RNN group. 
+ :rtype: Variable + """ + + def __init__(self, + i_size: int, + h_size: int, + num_stacks: int, + use_gru: bool, + share_rnn_weights: bool): + super().__init__() + rnn_stacks = [] + for i in range(num_stacks): + if use_gru: + #default:GRU using tanh + rnn_stacks.append(BiGRUWithBN(i_size=i_size, h_size=h_size)) + else: + rnn_stacks.append( + BiRNNWithBN( + i_size=i_size, + h_size=h_size, + share_weights=share_rnn_weights)) + i_size = h_size * 2 + + self.rnn_stacks = nn.ModuleList(rnn_stacks) + + def forward(self, x: paddle.Tensor, x_len: paddle.Tensor): + """ + x: shape [B, T, D] + x_len: shpae [B] + """ + for i, rnn in enumerate(self.rnn_stacks): + x, x_len = rnn(x, x_len) + masks = make_non_pad_mask(x_len) #[B, T] + masks = masks.unsqueeze(-1) # [B, T, 1] + # TODO(Hui Zhang): not support bool multiply + masks = masks.astype(x.dtype) + x = x.multiply(masks) + return x, x_len From 4f392e28b1e84f0e1b1218792dbf8a6f7b456287 Mon Sep 17 00:00:00 2001 From: huangyuxin Date: Wed, 28 Jul 2021 05:45:25 +0000 Subject: [PATCH 03/24] complete the encoder of ds_online --- deepspeech/exps/deepspeech2/model.py | 66 ++-- deepspeech/models/ds2/__init__.py | 7 - deepspeech/models/ds2/conv.py | 172 ----------- deepspeech/models/ds2/rnn.py | 314 -------------------- deepspeech/models/ds2_online/__init__.py | 6 +- deepspeech/models/ds2_online/deepspeech2.py | 106 +++---- env.sh | 2 +- examples/aishell/s0/run.sh | 2 +- tests/deepspeech2_model_test.py | 4 +- 9 files changed, 107 insertions(+), 572 deletions(-) delete mode 100644 deepspeech/models/ds2/__init__.py delete mode 100644 deepspeech/models/ds2/conv.py delete mode 100644 deepspeech/models/ds2/rnn.py diff --git a/deepspeech/exps/deepspeech2/model.py b/deepspeech/exps/deepspeech2/model.py index 2f84b686c..5c6c5fa18 100644 --- a/deepspeech/exps/deepspeech2/model.py +++ b/deepspeech/exps/deepspeech2/model.py @@ -29,6 +29,8 @@ from deepspeech.io.sampler import SortagradBatchSampler from deepspeech.io.sampler import SortagradDistributedBatchSampler from deepspeech.models.ds2 import DeepSpeech2InferModel from deepspeech.models.ds2 import DeepSpeech2Model +from deepspeech.models.ds2_online import DeepSpeech2InferModelOnline +from deepspeech.models.ds2_online import DeepSpeech2ModelOnline from deepspeech.training.gradclip import ClipGradByGlobalNormWithLog from deepspeech.training.trainer import Trainer from deepspeech.utils import error_rate @@ -120,15 +122,25 @@ class DeepSpeech2Trainer(Trainer): def setup_model(self): config = self.config - model = DeepSpeech2Model( - feat_size=self.train_loader.collate_fn.feature_size, - dict_size=self.train_loader.collate_fn.vocab_size, - num_conv_layers=config.model.num_conv_layers, - num_rnn_layers=config.model.num_rnn_layers, - rnn_size=config.model.rnn_layer_size, - use_gru=config.model.use_gru, - share_rnn_weights=config.model.share_rnn_weights) - + if (config.model.apply_online == False): + model = DeepSpeech2Model( + feat_size=self.train_loader.collate_fn.feature_size, + dict_size=self.train_loader.collate_fn.vocab_size, + num_conv_layers=config.model.num_conv_layers, + num_rnn_layers=config.model.num_rnn_layers, + rnn_size=config.model.rnn_layer_size, + use_gru=config.model.use_gru, + share_rnn_weights=config.model.share_rnn_weights) + else: + model = DeepSpeech2ModelOnline( + feat_size=self.train_loader.collate_fn.feature_size, + dict_size=self.train_loader.collate_fn.vocab_size, + num_conv_layers=config.model.num_conv_layers, + num_rnn_layers=config.model.num_rnn_layers, + rnn_size=config.model.rnn_layer_size, + 
use_gru=config.model.use_gru, + share_rnn_weights=config.model.share_rnn_weights) + if self.parallel: model = paddle.DataParallel(model) @@ -329,8 +341,13 @@ class DeepSpeech2Tester(DeepSpeech2Trainer): exit(-1) def export(self): - infer_model = DeepSpeech2InferModel.from_pretrained( - self.test_loader, self.config, self.args.checkpoint_path) + if self.config.model.apply_online == False: + infer_model = DeepSpeech2InferModel.from_pretrained( + self.test_loader, self.config, self.args.checkpoint_path) + else: + infer_model = DeepSpeech2InferModelOnline.from_pretrained( + self.test_loader, self.config, self.args.checkpoint_path) + infer_model.eval() feat_dim = self.test_loader.collate_fn.feature_size static_model = paddle.jit.to_static( @@ -367,14 +384,25 @@ class DeepSpeech2Tester(DeepSpeech2Trainer): def setup_model(self): config = self.config - model = DeepSpeech2Model( - feat_size=self.test_loader.collate_fn.feature_size, - dict_size=self.test_loader.collate_fn.vocab_size, - num_conv_layers=config.model.num_conv_layers, - num_rnn_layers=config.model.num_rnn_layers, - rnn_size=config.model.rnn_layer_size, - use_gru=config.model.use_gru, - share_rnn_weights=config.model.share_rnn_weights) + if config.model.apply_online == False: + model = DeepSpeech2Model( + feat_size=self.test_loader.collate_fn.feature_size, + dict_size=self.test_loader.collate_fn.vocab_size, + num_conv_layers=config.model.num_conv_layers, + num_rnn_layers=config.model.num_rnn_layers, + rnn_size=config.model.rnn_layer_size, + use_gru=config.model.use_gru, + share_rnn_weights=config.model.share_rnn_weights) + else: + model = DeepSpeech2ModelOnline( + feat_size=self.test_loader.collate_fn.feature_size, + dict_size=self.test_loader.collate_fn.vocab_size, + num_conv_layers=config.model.num_conv_layers, + num_rnn_layers=config.model.num_rnn_layers, + rnn_size=config.model.rnn_layer_size, + use_gru=config.model.use_gru, + share_rnn_weights=config.model.share_rnn_weights) + self.model = model logger.info("Setup model!") diff --git a/deepspeech/models/ds2/__init__.py b/deepspeech/models/ds2/__init__.py deleted file mode 100644 index 299f901cb..000000000 --- a/deepspeech/models/ds2/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -from .deepspeech2 import DeepSpeech2Model -from .deepspeech2 import DeepSpeech2InferModel - -__all__ = ['DeepSpeech2Model', 'DeepSpeech2InferModel'] - - - diff --git a/deepspeech/models/ds2/conv.py b/deepspeech/models/ds2/conv.py deleted file mode 100644 index 8bf48b2c8..000000000 --- a/deepspeech/models/ds2/conv.py +++ /dev/null @@ -1,172 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
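(One note on the export() hunk above: the arguments of its paddle.jit.to_static call are not visible in this excerpt. For orientation only, a typical invocation for a forward that takes (audio, audio_len) might look like the sketch below; infer_model and feat_dim come from the hunk, while the shapes, dtypes, and save path are assumptions rather than the repository's actual code.)

import paddle
from paddle.static import InputSpec

# Sketch only: trace the inference model with dynamic batch and time dimensions.
static_model = paddle.jit.to_static(
    infer_model,
    input_spec=[
        InputSpec(shape=[None, None, feat_dim], dtype='float32'),  # audio [B, T, D]
        InputSpec(shape=[None], dtype='int64'),                    # audio_len [B]
    ])
paddle.jit.save(static_model, 'exp/deepspeech2_online')  # assumed output path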
-from paddle import nn -from paddle.nn import functional as F - -from deepspeech.modules.activation import brelu -from deepspeech.modules.mask import make_non_pad_mask -from deepspeech.utils.log import Log - -logger = Log(__name__).getlog() - -__all__ = ['ConvStack', "conv_output_size"] - - -def conv_output_size(I, F, P, S): - # https://stanford.edu/~shervine/teaching/cs-230/cheatsheet-convolutional-neural-networks#hyperparameters - # Output size after Conv: - # By noting I the length of the input volume size, - # F the length of the filter, - # P the amount of zero padding, - # S the stride, - # then the output size O of the feature map along that dimension is given by: - # O = (I - F + Pstart + Pend) // S + 1 - # When Pstart == Pend == P, we can replace Pstart + Pend by 2P. - # When Pstart == Pend == 0 - # O = (I - F - S) // S - # https://iq.opengenus.org/output-size-of-convolution/ - # Output height = (Input height + padding height top + padding height bottom - kernel height) / (stride height) + 1 - # Output width = (Output width + padding width right + padding width left - kernel width) / (stride width) + 1 - return (I - F + 2 * P - S) // S - - -# receptive field calculator -# https://fomoro.com/research/article/receptive-field-calculator -# https://stanford.edu/~shervine/teaching/cs-230/cheatsheet-convolutional-neural-networks#hyperparameters -# https://distill.pub/2019/computing-receptive-fields/ -# Rl-1 = Sl * Rl + (Kl - Sl) - - -class ConvBn(nn.Layer): - """Convolution layer with batch normalization. - - :param kernel_size: The x dimension of a filter kernel. Or input a tuple for - two image dimension. - :type kernel_size: int|tuple|list - :param num_channels_in: Number of input channels. - :type num_channels_in: int - :param num_channels_out: Number of output channels. - :type num_channels_out: int - :param stride: The x dimension of the stride. Or input a tuple for two - image dimension. - :type stride: int|tuple|list - :param padding: The x dimension of the padding. Or input a tuple for two - image dimension. - :type padding: int|tuple|list - :param act: Activation type, relu|brelu - :type act: string - :return: Batch norm layer after convolution layer. - :rtype: Variable - - """ - - def __init__(self, num_channels_in, num_channels_out, kernel_size, stride, - padding, act): - - super().__init__() - assert len(kernel_size) == 2 - assert len(stride) == 2 - assert len(padding) == 2 - self.kernel_size = kernel_size - self.stride = stride - self.padding = padding - - self.conv = nn.Conv2D( - num_channels_in, - num_channels_out, - kernel_size=kernel_size, - stride=stride, - padding=padding, - weight_attr=None, - bias_attr=False, - data_format='NCHW') - - self.bn = nn.BatchNorm2D( - num_channels_out, - weight_attr=None, - bias_attr=None, - data_format='NCHW') - self.act = F.relu if act == 'relu' else brelu - - def forward(self, x, x_len): - """ - x(Tensor): audio, shape [B, C, D, T] - """ - x = self.conv(x) - x = self.bn(x) - x = self.act(x) - - x_len = (x_len - self.kernel_size[1] + 2 * self.padding[1] - ) // self.stride[1] + 1 - - # reset padding part to 0 - masks = make_non_pad_mask(x_len) #[B, T] - masks = masks.unsqueeze(1).unsqueeze(1) # [B, 1, 1, T] - # TODO(Hui Zhang): not support bool multiply - # masks = masks.type_as(x) - masks = masks.astype(x.dtype) - x = x.multiply(masks) - - return x, x_len - - -class ConvStack(nn.Layer): - """Convolution group with stacked convolution layers. - - :param feat_size: audio feature dim. 
- :type feat_size: int - :param num_stacks: Number of stacked convolution layers. - :type num_stacks: int - """ - - def __init__(self, feat_size, num_stacks): - super().__init__() - self.feat_size = feat_size # D - self.num_stacks = num_stacks - - self.conv_in = ConvBn( - num_channels_in=1, - num_channels_out=32, - kernel_size=(41, 11), #[D, T] - stride=(2, 3), - padding=(20, 5), - act='brelu') - - out_channel = 32 - convs = [ - ConvBn( - num_channels_in=32, - num_channels_out=out_channel, - kernel_size=(21, 11), - stride=(2, 1), - padding=(10, 5), - act='brelu') for i in range(num_stacks - 1) - ] - self.conv_stack = nn.LayerList(convs) - - # conv output feat_dim - output_height = (feat_size - 1) // 2 + 1 - for i in range(self.num_stacks - 1): - output_height = (output_height - 1) // 2 + 1 - self.output_height = out_channel * output_height - - def forward(self, x, x_len): - """ - x: shape [B, C, D, T] - x_len : shape [B] - """ - x, x_len = self.conv_in(x, x_len) - for i, conv in enumerate(self.conv_stack): - x, x_len = conv(x, x_len) - return x, x_len diff --git a/deepspeech/models/ds2/rnn.py b/deepspeech/models/ds2/rnn.py deleted file mode 100644 index 01b55c4a2..000000000 --- a/deepspeech/models/ds2/rnn.py +++ /dev/null @@ -1,314 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import math - -import paddle -from paddle import nn -from paddle.nn import functional as F -from paddle.nn import initializer as I - -from deepspeech.modules.activation import brelu -from deepspeech.modules.mask import make_non_pad_mask -from deepspeech.utils.log import Log - -logger = Log(__name__).getlog() - -__all__ = ['RNNStack'] - - -class RNNCell(nn.RNNCellBase): - r""" - Elman RNN (SimpleRNN) cell. Given the inputs and previous states, it - computes the outputs and updates states. - The formula used is as follows: - .. math:: - h_{t} & = act(x_{t} + b_{ih} + W_{hh}h_{t-1} + b_{hh}) - y_{t} & = h_{t} - - where :math:`act` is for :attr:`activation`. 
- """ - - def __init__(self, - hidden_size: int, - activation="tanh", - weight_ih_attr=None, - weight_hh_attr=None, - bias_ih_attr=None, - bias_hh_attr=None, - name=None): - super().__init__() - std = 1.0 / math.sqrt(hidden_size) - self.weight_hh = self.create_parameter( - (hidden_size, hidden_size), - weight_hh_attr, - default_initializer=I.Uniform(-std, std)) - self.bias_ih = None - self.bias_hh = self.create_parameter( - (hidden_size, ), - bias_hh_attr, - is_bias=True, - default_initializer=I.Uniform(-std, std)) - - self.hidden_size = hidden_size - if activation not in ["tanh", "relu", "brelu"]: - raise ValueError( - "activation for SimpleRNNCell should be tanh or relu, " - "but get {}".format(activation)) - self.activation = activation - self._activation_fn = paddle.tanh \ - if activation == "tanh" \ - else F.relu - if activation == 'brelu': - self._activation_fn = brelu - - def forward(self, inputs, states=None): - if states is None: - states = self.get_initial_states(inputs, self.state_shape) - pre_h = states - i2h = inputs - if self.bias_ih is not None: - i2h += self.bias_ih - h2h = paddle.matmul(pre_h, self.weight_hh, transpose_y=True) - if self.bias_hh is not None: - h2h += self.bias_hh - h = self._activation_fn(i2h + h2h) - return h, h - - @property - def state_shape(self): - return (self.hidden_size, ) - - -class GRUCell(nn.RNNCellBase): - r""" - Gated Recurrent Unit (GRU) RNN cell. Given the inputs and previous states, - it computes the outputs and updates states. - The formula for GRU used is as follows: - .. math:: - r_{t} & = \sigma(W_{ir}x_{t} + b_{ir} + W_{hr}h_{t-1} + b_{hr}) - z_{t} & = \sigma(W_{iz}x_{t} + b_{iz} + W_{hz}h_{t-1} + b_{hz}) - \widetilde{h}_{t} & = \tanh(W_{ic}x_{t} + b_{ic} + r_{t} * (W_{hc}h_{t-1} + b_{hc})) - h_{t} & = z_{t} * h_{t-1} + (1 - z_{t}) * \widetilde{h}_{t} - y_{t} & = h_{t} - - where :math:`\sigma` is the sigmoid fucntion, and * is the elemetwise - multiplication operator. 
- """ - - def __init__(self, - input_size: int, - hidden_size: int, - weight_ih_attr=None, - weight_hh_attr=None, - bias_ih_attr=None, - bias_hh_attr=None, - name=None): - super().__init__() - std = 1.0 / math.sqrt(hidden_size) - self.weight_hh = self.create_parameter( - (3 * hidden_size, hidden_size), - weight_hh_attr, - default_initializer=I.Uniform(-std, std)) - self.bias_ih = None - self.bias_hh = self.create_parameter( - (3 * hidden_size, ), - bias_hh_attr, - is_bias=True, - default_initializer=I.Uniform(-std, std)) - - self.hidden_size = hidden_size - self.input_size = input_size - self._gate_activation = F.sigmoid - self._activation = paddle.tanh - - def forward(self, inputs, states=None): - if states is None: - states = self.get_initial_states(inputs, self.state_shape) - - pre_hidden = states - x_gates = inputs - if self.bias_ih is not None: - x_gates = x_gates + self.bias_ih - h_gates = paddle.matmul(pre_hidden, self.weight_hh, transpose_y=True) - if self.bias_hh is not None: - h_gates = h_gates + self.bias_hh - - x_r, x_z, x_c = paddle.split(x_gates, num_or_sections=3, axis=1) - h_r, h_z, h_c = paddle.split(h_gates, num_or_sections=3, axis=1) - - r = self._gate_activation(x_r + h_r) - z = self._gate_activation(x_z + h_z) - c = self._activation(x_c + r * h_c) # apply reset gate after mm - h = (pre_hidden - c) * z + c - # https://www.paddlepaddle.org.cn/documentation/docs/zh/api/paddle/fluid/layers/dynamic_gru_cn.html#dynamic-gru - - return h, h - - @property - def state_shape(self): - r""" - The `state_shape` of GRUCell is a shape `[hidden_size]` (-1 for batch - size would be automatically inserted into shape). The shape corresponds - to the shape of :math:`h_{t-1}`. - """ - return (self.hidden_size, ) - - -class BiRNNWithBN(nn.Layer): - """Bidirectonal simple rnn layer with sequence-wise batch normalization. - The batch normalization is only performed on input-state weights. - - :param size: Dimension of RNN cells. - :type size: int - :param share_weights: Whether to share input-hidden weights between - forward and backward directional RNNs. - :type share_weights: bool - :return: Bidirectional simple rnn layer. - :rtype: Variable - """ - - def __init__(self, i_size: int, h_size: int, share_weights: bool): - super().__init__() - self.share_weights = share_weights - if self.share_weights: - #input-hidden weights shared between bi-directional rnn. 
- self.fw_fc = nn.Linear(i_size, h_size, bias_attr=False) - # batch norm is only performed on input-state projection - self.fw_bn = nn.BatchNorm1D( - h_size, bias_attr=None, data_format='NLC') - self.bw_fc = self.fw_fc - self.bw_bn = self.fw_bn - else: - self.fw_fc = nn.Linear(i_size, h_size, bias_attr=False) - self.fw_bn = nn.BatchNorm1D( - h_size, bias_attr=None, data_format='NLC') - self.bw_fc = nn.Linear(i_size, h_size, bias_attr=False) - self.bw_bn = nn.BatchNorm1D( - h_size, bias_attr=None, data_format='NLC') - - self.fw_cell = RNNCell(hidden_size=h_size, activation='brelu') - self.bw_cell = RNNCell(hidden_size=h_size, activation='brelu') - self.fw_rnn = nn.RNN( - self.fw_cell, is_reverse=False, time_major=False) #[B, T, D] - self.bw_rnn = nn.RNN( - self.fw_cell, is_reverse=True, time_major=False) #[B, T, D] - - def forward(self, x: paddle.Tensor, x_len: paddle.Tensor): - # x, shape [B, T, D] - fw_x = self.fw_bn(self.fw_fc(x)) - bw_x = self.bw_bn(self.bw_fc(x)) - fw_x, _ = self.fw_rnn(inputs=fw_x, sequence_length=x_len) - bw_x, _ = self.bw_rnn(inputs=bw_x, sequence_length=x_len) - x = paddle.concat([fw_x, bw_x], axis=-1) - return x, x_len - - -class BiGRUWithBN(nn.Layer): - """Bidirectonal gru layer with sequence-wise batch normalization. - The batch normalization is only performed on input-state weights. - - :param name: Name of the layer. - :type name: string - :param input: Input layer. - :type input: Variable - :param size: Dimension of GRU cells. - :type size: int - :param act: Activation type. - :type act: string - :return: Bidirectional GRU layer. - :rtype: Variable - """ - - def __init__(self, i_size: int, h_size: int): - super().__init__() - hidden_size = h_size * 3 - - self.fw_fc = nn.Linear(i_size, hidden_size, bias_attr=False) - self.fw_bn = nn.BatchNorm1D( - hidden_size, bias_attr=None, data_format='NLC') - self.bw_fc = nn.Linear(i_size, hidden_size, bias_attr=False) - self.bw_bn = nn.BatchNorm1D( - hidden_size, bias_attr=None, data_format='NLC') - - self.fw_cell = GRUCell(input_size=hidden_size, hidden_size=h_size) - self.bw_cell = GRUCell(input_size=hidden_size, hidden_size=h_size) - self.fw_rnn = nn.RNN( - self.fw_cell, is_reverse=False, time_major=False) #[B, T, D] - self.bw_rnn = nn.RNN( - self.fw_cell, is_reverse=True, time_major=False) #[B, T, D] - - def forward(self, x, x_len): - # x, shape [B, T, D] - fw_x = self.fw_bn(self.fw_fc(x)) - bw_x = self.bw_bn(self.bw_fc(x)) - fw_x, _ = self.fw_rnn(inputs=fw_x, sequence_length=x_len) - bw_x, _ = self.bw_rnn(inputs=bw_x, sequence_length=x_len) - x = paddle.concat([fw_x, bw_x], axis=-1) - return x, x_len - - -class RNNStack(nn.Layer): - """RNN group with stacked bidirectional simple RNN or GRU layers. - - :param input: Input layer. - :type input: Variable - :param size: Dimension of RNN cells in each layer. - :type size: int - :param num_stacks: Number of stacked rnn layers. - :type num_stacks: int - :param use_gru: Use gru if set True. Use simple rnn if set False. - :type use_gru: bool - :param share_rnn_weights: Whether to share input-hidden weights between - forward and backward directional RNNs. - It is only available when use_gru=False. - :type share_weights: bool - :return: Output layer of the RNN group. 
- :rtype: Variable - """ - - def __init__(self, - i_size: int, - h_size: int, - num_stacks: int, - use_gru: bool, - share_rnn_weights: bool): - super().__init__() - rnn_stacks = [] - for i in range(num_stacks): - if use_gru: - #default:GRU using tanh - rnn_stacks.append(BiGRUWithBN(i_size=i_size, h_size=h_size)) - else: - rnn_stacks.append( - BiRNNWithBN( - i_size=i_size, - h_size=h_size, - share_weights=share_rnn_weights)) - i_size = h_size * 2 - - self.rnn_stacks = nn.ModuleList(rnn_stacks) - - def forward(self, x: paddle.Tensor, x_len: paddle.Tensor): - """ - x: shape [B, T, D] - x_len: shpae [B] - """ - for i, rnn in enumerate(self.rnn_stacks): - x, x_len = rnn(x, x_len) - masks = make_non_pad_mask(x_len) #[B, T] - masks = masks.unsqueeze(-1) # [B, T, 1] - # TODO(Hui Zhang): not support bool multiply - masks = masks.astype(x.dtype) - x = x.multiply(masks) - return x, x_len diff --git a/deepspeech/models/ds2_online/__init__.py b/deepspeech/models/ds2_online/__init__.py index 299f901cb..88076667c 100644 --- a/deepspeech/models/ds2_online/__init__.py +++ b/deepspeech/models/ds2_online/__init__.py @@ -1,7 +1,7 @@ -from .deepspeech2 import DeepSpeech2Model -from .deepspeech2 import DeepSpeech2InferModel +from .deepspeech2 import DeepSpeech2ModelOnline +from .deepspeech2 import DeepSpeech2InferModelOnline -__all__ = ['DeepSpeech2Model', 'DeepSpeech2InferModel'] +__all__ = ['DeepSpeech2ModelOnline', 'DeepSpeech2InferModelOnline'] diff --git a/deepspeech/models/ds2_online/deepspeech2.py b/deepspeech/models/ds2_online/deepspeech2.py index c5e6a92bc..4ac6384e3 100644 --- a/deepspeech/models/ds2_online/deepspeech2.py +++ b/deepspeech/models/ds2_online/deepspeech2.py @@ -16,23 +16,27 @@ from typing import Optional import paddle from paddle import nn +import paddle.nn.functional as F + from yacs.config import CfgNode -from deepspeech.models.ds2.conv import ConvStack +from deepspeech.models.ds2_online.conv import ConvStack from deepspeech.modules.ctc import CTCDecoder -from deepspeech.models.ds2.rnn import RNNStack +from deepspeech.models.ds2_online.rnn import RNNStack from deepspeech.utils import layer_tools from deepspeech.utils.checkpoint import Checkpoint from deepspeech.utils.log import Log -from paddle.nn import LSTM, GRU +from paddle.nn import LSTM, GRU, Linear from paddle.nn import LayerNorm from paddle.nn import LayerList +from paddle.fluid.layers import fc + logger = Log(__name__).getlog() -__all__ = ['DeepSpeech2Model', 'DeepSpeech2InferMode'] +__all__ = ['DeepSpeech2ModelOnline', 'DeepSpeech2InferModeOnline'] class CRNNEncoder(nn.Layer): @@ -40,31 +44,28 @@ class CRNNEncoder(nn.Layer): feat_size, dict_size, num_conv_layers=2, - num_rnn_layers=3, + num_rnn_layers=4, rnn_size=1024, + num_fc_layers=2, + fc_layers_size_list=[512, 256], use_gru=False, - share_rnn_weights=True, - apply_online=True): + share_rnn_weights=True): super().__init__() self.rnn_size = rnn_size self.feat_size = feat_size # 161 for linear self.dict_size = dict_size self.num_rnn_layers = num_rnn_layers - self.apply_online = apply_online + self.num_fc_layers = num_fc_layers + self.fc_layers_size_list = fc_layers_size_list self.conv = ConvStack(feat_size, num_conv_layers) i_size = self.conv.output_height # H after conv stack - - + self.rnn = LayerList() self.layernorm_list = LayerList() - - if (apply_online == True): - rnn_direction = 'forward' - layernorm_size = rnn_size - else: - rnn_direction = 'bidirect' - layernorm_size = 2 * rnn_size + self.fc_layers_list = LayerList() + rnn_direction = 'forward' + layernorm_size = 
rnn_size if use_gru == True: self.rnn.append(GRU(input_size=i_size, hidden_size=rnn_size, num_layers=1, direction = rnn_direction)) @@ -78,20 +79,14 @@ class CRNNEncoder(nn.Layer): for i in range(1, num_rnn_layers): self.rnn.append(LSTM(input_size=layernorm_size, hidden_size=rnn_size, num_layers=1, direction = rnn_direction)) self.layernorm_list.append(LayerNorm(layernorm_size)) - """ - self.rnn = RNNStack( - i_size=i_size, - h_size=rnn_size, - num_stacks=num_rnn_layers, - use_gru=use_gru, - share_rnn_weights=share_rnn_weights) - """ + fc_input_size = layernorm_size + for i in range(self.num_fc_layers): + self.fc_layers_list.append(nn.Linear(fc_input_size, fc_layers_size_list[i])) + fc_input_size = fc_layers_size_list[i] + @property def output_size(self): - if (self.apply_online == True): - return self.rnn_size - else: - return 2 * self.rnn_size + return self.fc_layers_size_list[-1] def forward(self, audio, audio_len): """Compute Encoder outputs @@ -126,14 +121,15 @@ class CRNNEncoder(nn.Layer): for i in range(1, self.num_rnn_layers): x, output_state = self.rnn[i](x, output_state, x_lens) #[B, T, D] x = self.layernorm_list[i](x) - """ - x, x_lens = self.rnn(x, x_lens) - """ + + for i in range(self.num_fc_layers): + x = self.fc_layers_list[i](x) + x = F.relu(x) return x, x_lens -class DeepSpeech2Model(nn.Layer): - """The DeepSpeech2 network structure. +class DeepSpeech2ModelOnline(nn.Layer): + """The DeepSpeech2 network structure for online. :param audio_data: Audio spectrogram data layer. :type audio_data: Variable @@ -159,7 +155,7 @@ class DeepSpeech2Model(nn.Layer): :type share_weights: bool :return: A tuple of an output unnormalized log probability layer ( before softmax) and a ctc cost layer. - :rtype: tuple of LayerOutput + :rtype: tuple of LayerOutput """ @classmethod @@ -167,8 +163,10 @@ class DeepSpeech2Model(nn.Layer): default = CfgNode( dict( num_conv_layers=2, #Number of stacking convolution layers. - num_rnn_layers=3, #Number of stacking RNN layers. + num_rnn_layers=4, #Number of stacking RNN layers. rnn_layer_size=1024, #RNN layer size (number of RNN cells). + num_fc_layers=2, + fc_layers_size_list = [512,256], use_gru=True, #Use gru if set True. Use simple rnn if set False. share_rnn_weights=True #Whether to share input-hidden weights between forward and backward directional RNNs.Notice that for GRU, weight sharing is not supported. 
)) @@ -182,23 +180,22 @@ class DeepSpeech2Model(nn.Layer): num_conv_layers=2, num_rnn_layers=3, rnn_size=1024, + num_fc_layers=2, + fc_layers_size_list=[512, 256], use_gru=False, - share_rnn_weights=True, - apply_online = True): + share_rnn_weights=True): super().__init__() self.encoder = CRNNEncoder( feat_size=feat_size, dict_size=dict_size, num_conv_layers=num_conv_layers, num_rnn_layers=num_rnn_layers, + num_fc_layers=num_fc_layers, + fc_layers_size_list=fc_layers_size_list, rnn_size=rnn_size, use_gru=use_gru, - share_rnn_weights=share_rnn_weights, - apply_online=apply_online) - if (apply_online == True): - assert (self.encoder.output_size == rnn_size) - else: - assert (self.encoder.output_size == 2 * rnn_size) + share_rnn_weights=share_rnn_weights) + assert (self.encoder.output_size == fc_layers_size_list[-1]) self.decoder = CTCDecoder( odim=dict_size, # is in vocab @@ -253,10 +250,10 @@ class DeepSpeech2Model(nn.Layer): config: yacs.config.CfgNode model configs - + checkpoint_path: Path or str the path of pretrained model checkpoint, without extension name - + Returns ------- DeepSpeech2Model @@ -267,9 +264,10 @@ class DeepSpeech2Model(nn.Layer): num_conv_layers=config.model.num_conv_layers, num_rnn_layers=config.model.num_rnn_layers, rnn_size=config.model.rnn_layer_size, + num_fc_layers=config.model.num_fc_layers, + fc_layers_size_list=config.model.fc_layers_size_list, use_gru=config.model.use_gru, - share_rnn_weights=config.model.share_rnn_weights, - apply_online=config.model.apply_online) + share_rnn_weights=config.model.share_rnn_weights) infos = Checkpoint().load_parameters( model, checkpoint_path=checkpoint_path) logger.info(f"checkpoint info: {infos}") @@ -277,25 +275,27 @@ class DeepSpeech2Model(nn.Layer): return model -class DeepSpeech2InferModel(DeepSpeech2Model): +class DeepSpeech2InferModelOnline(DeepSpeech2ModelOnline): def __init__(self, feat_size, dict_size, num_conv_layers=2, num_rnn_layers=3, rnn_size=1024, + num_fc_layers=2, + fc_layers_size_list=[512, 256], use_gru=False, - share_rnn_weights=True, - apply_online = True): + share_rnn_weights=True): super().__init__( feat_size=feat_size, dict_size=dict_size, num_conv_layers=num_conv_layers, num_rnn_layers=num_rnn_layers, rnn_size=rnn_size, + num_fc_layers=num_fc_layers, + fc_layers_size_list=fc_layers_size_list, use_gru=use_gru, - share_rnn_weights=share_rnn_weights, - apply_online=apply_online) + share_rnn_weights=share_rnn_weights) def forward(self, audio, audio_len): """export model function diff --git a/env.sh b/env.sh index c5acd0112..9d22259df 100644 --- a/env.sh +++ b/env.sh @@ -4,7 +4,7 @@ export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} export LC_ALL=C # Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C -export PYTHONIOENCODING=UTF-8 +export PYTHONIOENCODING=UTF-8 export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib/ diff --git a/examples/aishell/s0/run.sh b/examples/aishell/s0/run.sh index c9708dcc9..33b02f7ce 100755 --- a/examples/aishell/s0/run.sh +++ b/examples/aishell/s0/run.sh @@ -2,7 +2,7 @@ set -e source path.sh -gpus=0,1,2,3 +gpus=2,3,4,5 stage=0 stop_stage=100 conf_path=conf/deepspeech2.yaml diff --git a/tests/deepspeech2_model_test.py b/tests/deepspeech2_model_test.py index 1776736f5..bb40802d7 100644 --- a/tests/deepspeech2_model_test.py +++ b/tests/deepspeech2_model_test.py @@ -16,8 +16,8 @@ import unittest import numpy as np import paddle -from deepspeech.models.deepspeech2 import DeepSpeech2Model - +#from 
deepspeech.models.deepspeech2 import DeepSpeech2Model +from deepspeech.models.ds2_online import DeepSpeech2ModelOnline as DeepSpeech2Model class TestDeepSpeech2Model(unittest.TestCase): def setUp(self): From 7b201ba45730bb9bcbc9e9412c661b7eeb02a141 Mon Sep 17 00:00:00 2001 From: huangyuxin Date: Wed, 28 Jul 2021 07:50:57 +0000 Subject: [PATCH 04/24] =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E4=BA=86online?= =?UTF-8?q?=E7=9A=84=E6=A8=A1=E5=9E=8B=EF=BC=8C=E9=80=9A=E8=BF=87=E4=BA=86?= =?UTF-8?q?=E6=B5=8B=E8=AF=95=EF=BC=8C=E8=BF=98=E9=9C=80=E8=A6=81=E6=90=AD?= =?UTF-8?q?=E5=BB=BA=E9=85=8D=E5=A5=97=E7=9A=84=E5=AE=9E=E9=AA=8C=E6=B5=81?= =?UTF-8?q?=E7=A8=8B=E4=BB=A3=E7=A0=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- deepspeech/models/ds2/conv.py | 172 +++++++++++++++ deepspeech/models/ds2/deepspeech2.py | 305 -------------------------- deepspeech/models/ds2/rnn.py | 314 +++++++++++++++++++++++++++ 3 files changed, 486 insertions(+), 305 deletions(-) create mode 100644 deepspeech/models/ds2/conv.py delete mode 100644 deepspeech/models/ds2/deepspeech2.py create mode 100644 deepspeech/models/ds2/rnn.py diff --git a/deepspeech/models/ds2/conv.py b/deepspeech/models/ds2/conv.py new file mode 100644 index 000000000..8bf48b2c8 --- /dev/null +++ b/deepspeech/models/ds2/conv.py @@ -0,0 +1,172 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from paddle import nn +from paddle.nn import functional as F + +from deepspeech.modules.activation import brelu +from deepspeech.modules.mask import make_non_pad_mask +from deepspeech.utils.log import Log + +logger = Log(__name__).getlog() + +__all__ = ['ConvStack', "conv_output_size"] + + +def conv_output_size(I, F, P, S): + # https://stanford.edu/~shervine/teaching/cs-230/cheatsheet-convolutional-neural-networks#hyperparameters + # Output size after Conv: + # By noting I the length of the input volume size, + # F the length of the filter, + # P the amount of zero padding, + # S the stride, + # then the output size O of the feature map along that dimension is given by: + # O = (I - F + Pstart + Pend) // S + 1 + # When Pstart == Pend == P, we can replace Pstart + Pend by 2P. + # When Pstart == Pend == 0 + # O = (I - F - S) // S + # https://iq.opengenus.org/output-size-of-convolution/ + # Output height = (Input height + padding height top + padding height bottom - kernel height) / (stride height) + 1 + # Output width = (Output width + padding width right + padding width left - kernel width) / (stride width) + 1 + return (I - F + 2 * P - S) // S + + +# receptive field calculator +# https://fomoro.com/research/article/receptive-field-calculator +# https://stanford.edu/~shervine/teaching/cs-230/cheatsheet-convolutional-neural-networks#hyperparameters +# https://distill.pub/2019/computing-receptive-fields/ +# Rl-1 = Sl * Rl + (Kl - Sl) + + +class ConvBn(nn.Layer): + """Convolution layer with batch normalization. + + :param kernel_size: The x dimension of a filter kernel. 
Or input a tuple for + two image dimension. + :type kernel_size: int|tuple|list + :param num_channels_in: Number of input channels. + :type num_channels_in: int + :param num_channels_out: Number of output channels. + :type num_channels_out: int + :param stride: The x dimension of the stride. Or input a tuple for two + image dimension. + :type stride: int|tuple|list + :param padding: The x dimension of the padding. Or input a tuple for two + image dimension. + :type padding: int|tuple|list + :param act: Activation type, relu|brelu + :type act: string + :return: Batch norm layer after convolution layer. + :rtype: Variable + + """ + + def __init__(self, num_channels_in, num_channels_out, kernel_size, stride, + padding, act): + + super().__init__() + assert len(kernel_size) == 2 + assert len(stride) == 2 + assert len(padding) == 2 + self.kernel_size = kernel_size + self.stride = stride + self.padding = padding + + self.conv = nn.Conv2D( + num_channels_in, + num_channels_out, + kernel_size=kernel_size, + stride=stride, + padding=padding, + weight_attr=None, + bias_attr=False, + data_format='NCHW') + + self.bn = nn.BatchNorm2D( + num_channels_out, + weight_attr=None, + bias_attr=None, + data_format='NCHW') + self.act = F.relu if act == 'relu' else brelu + + def forward(self, x, x_len): + """ + x(Tensor): audio, shape [B, C, D, T] + """ + x = self.conv(x) + x = self.bn(x) + x = self.act(x) + + x_len = (x_len - self.kernel_size[1] + 2 * self.padding[1] + ) // self.stride[1] + 1 + + # reset padding part to 0 + masks = make_non_pad_mask(x_len) #[B, T] + masks = masks.unsqueeze(1).unsqueeze(1) # [B, 1, 1, T] + # TODO(Hui Zhang): not support bool multiply + # masks = masks.type_as(x) + masks = masks.astype(x.dtype) + x = x.multiply(masks) + + return x, x_len + + +class ConvStack(nn.Layer): + """Convolution group with stacked convolution layers. + + :param feat_size: audio feature dim. + :type feat_size: int + :param num_stacks: Number of stacked convolution layers. + :type num_stacks: int + """ + + def __init__(self, feat_size, num_stacks): + super().__init__() + self.feat_size = feat_size # D + self.num_stacks = num_stacks + + self.conv_in = ConvBn( + num_channels_in=1, + num_channels_out=32, + kernel_size=(41, 11), #[D, T] + stride=(2, 3), + padding=(20, 5), + act='brelu') + + out_channel = 32 + convs = [ + ConvBn( + num_channels_in=32, + num_channels_out=out_channel, + kernel_size=(21, 11), + stride=(2, 1), + padding=(10, 5), + act='brelu') for i in range(num_stacks - 1) + ] + self.conv_stack = nn.LayerList(convs) + + # conv output feat_dim + output_height = (feat_size - 1) // 2 + 1 + for i in range(self.num_stacks - 1): + output_height = (output_height - 1) // 2 + 1 + self.output_height = out_channel * output_height + + def forward(self, x, x_len): + """ + x: shape [B, C, D, T] + x_len : shape [B] + """ + x, x_len = self.conv_in(x, x_len) + for i, conv in enumerate(self.conv_stack): + x, x_len = conv(x, x_len) + return x, x_len diff --git a/deepspeech/models/ds2/deepspeech2.py b/deepspeech/models/ds2/deepspeech2.py deleted file mode 100644 index 7f173ce29..000000000 --- a/deepspeech/models/ds2/deepspeech2.py +++ /dev/null @@ -1,305 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
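# Aside (illustrative, not part of this diff): quick arithmetic for the ConvStack
# above with the default 161-dim linear features and num_stacks=2. The frequency
# axis is roughly halved per layer; only the first layer (time stride 3) shortens
# the sequence, the remaining layers keep its length.
feat_size, num_stacks, out_channel = 161, 2, 32
output_height = (feat_size - 1) // 2 + 1            # 81 after conv_in
for _ in range(num_stacks - 1):
    output_height = (output_height - 1) // 2 + 1    # 41 after the second conv
print(out_channel * output_height)                  # 1312 features per frame fed to the RNNs
T = 300                                             # made-up utterance length in frames
print((T - 11 + 2 * 5) // 3 + 1)                    # 100 frames left after conv_in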
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Deepspeech2 ASR Model""" -from typing import Optional - -import paddle -from paddle import nn -from yacs.config import CfgNode - -from deepspeech.models.ds2.conv import ConvStack -from deepspeech.modules.ctc import CTCDecoder -from deepspeech.models.ds2.rnn import RNNStack -from deepspeech.utils import layer_tools -from deepspeech.utils.checkpoint import Checkpoint -from deepspeech.utils.log import Log - -from paddle.nn import LSTM, GRU -from paddle.nn import LayerNorm -from paddle.nn import LayerList - - -logger = Log(__name__).getlog() - -__all__ = ['DeepSpeech2Model', 'DeepSpeech2InferMode'] - - -class CRNNEncoder(nn.Layer): - def __init__(self, - feat_size, - dict_size, - num_conv_layers=2, - num_rnn_layers=3, - rnn_size=1024, - use_gru=False, - share_rnn_weights=True, - apply_online=True): - super().__init__() - self.rnn_size = rnn_size - self.feat_size = feat_size # 161 for linear - self.dict_size = dict_size - self.num_rnn_layers = num_rnn_layers - self.apply_online = apply_online - self.conv = ConvStack(feat_size, num_conv_layers) - - i_size = self.conv.output_height # H after conv stack - - - self.rnn = LayerList() - self.layernorm_list = LayerList() - - if (apply_online == True): - rnn_direction = 'forward' - else: - rnn_direction = 'bidirect' - - if use_gru == True: - self.rnn.append(GRU(input_size=i_size, hidden_size=rnn_size, num_layers=1, direction = rnn_direction)) - self.layernorm_list.append(LayerNorm(rnn_size)) - for i in range(1, num_rnn_layers): - self.rnn.append(GRU(input_size=rnn_size, hidden_size=rnn_size, num_layers=1, direction = rnn_direction)) - self.layernorm_list.append(LayerNorm(rnn_size)) - else: - self.rnn.append(LSTM(input_size=i_size, hidden_size=rnn_size, num_layers=1, direction = rnn_direction)) - self.layernorm_list.append(LayerNorm(rnn_size)) - for i in range(1, num_rnn_layers): - self.rnn.append(LSTM(input_size=rnn_size, hidden_size=rnn_size, num_layers=1, direction = rnn_direction)) - self.layernorm_list.append(LayerNorm(rnn_size)) - """ - self.rnn = RNNStack( - i_size=i_size, - h_size=rnn_size, - num_stacks=num_rnn_layers, - use_gru=use_gru, - share_rnn_weights=share_rnn_weights) - """ - @property - def output_size(self): - return self.rnn_size - - def forward(self, audio, audio_len): - """Compute Encoder outputs - - Args: - audio (Tensor): [B, Tmax, D] - text (Tensor): [B, Umax] - audio_len (Tensor): [B] - text_len (Tensor): [B] - Returns: - x (Tensor): encoder outputs, [B, T, D] - x_lens (Tensor): encoder length, [B] - """ - # [B, T, D] -> [B, D, T] - audio = audio.transpose([0, 2, 1]) - # [B, D, T] -> [B, C=1, D, T] - x = audio.unsqueeze(1) - x_lens = audio_len - - # convolution group - x, x_lens = self.conv(x, x_lens) - - # convert data from convolution feature map to sequence of vectors - #B, C, D, T = paddle.shape(x) # not work under jit - x = x.transpose([0, 3, 1, 2]) #[B, T, C, D] - #x = x.reshape([B, T, C * D]) #[B, T, C*D] # not work under jit - x = x.reshape([0, 0, -1]) #[B, T, C*D] - - # remove padding part - print ("x.shape:", x.shape) - x, output_state = self.rnn[0](x, None, x_lens) - x = self.layernorm_list[0](x) - for i 
in range(1, self.num_rnn_layers): - x, output_state = self.rnn[i](x, output_state, x_lens) #[B, T, D] - x = self.layernorm_list[i](x) - """ - x, x_lens = self.rnn(x, x_lens) - """ - return x, x_lens - - -class DeepSpeech2Model(nn.Layer): - """The DeepSpeech2 network structure. - - :param audio_data: Audio spectrogram data layer. - :type audio_data: Variable - :param text_data: Transcription text data layer. - :type text_data: Variable - :param audio_len: Valid sequence length data layer. - :type audio_len: Variable - :param masks: Masks data layer to reset padding. - :type masks: Variable - :param dict_size: Dictionary size for tokenized transcription. - :type dict_size: int - :param num_conv_layers: Number of stacking convolution layers. - :type num_conv_layers: int - :param num_rnn_layers: Number of stacking RNN layers. - :type num_rnn_layers: int - :param rnn_size: RNN layer size (dimension of RNN cells). - :type rnn_size: int - :param use_gru: Use gru if set True. Use simple rnn if set False. - :type use_gru: bool - :param share_rnn_weights: Whether to share input-hidden weights between - forward and backward direction RNNs. - It is only available when use_gru=False. - :type share_weights: bool - :return: A tuple of an output unnormalized log probability layer ( - before softmax) and a ctc cost layer. - :rtype: tuple of LayerOutput - """ - - @classmethod - def params(cls, config: Optional[CfgNode]=None) -> CfgNode: - default = CfgNode( - dict( - num_conv_layers=2, #Number of stacking convolution layers. - num_rnn_layers=3, #Number of stacking RNN layers. - rnn_layer_size=1024, #RNN layer size (number of RNN cells). - use_gru=True, #Use gru if set True. Use simple rnn if set False. - share_rnn_weights=True #Whether to share input-hidden weights between forward and backward directional RNNs.Notice that for GRU, weight sharing is not supported. 
- )) - if config is not None: - config.merge_from_other_cfg(default) - return default - - def __init__(self, - feat_size, - dict_size, - num_conv_layers=2, - num_rnn_layers=3, - rnn_size=1024, - use_gru=False, - share_rnn_weights=True, - apply_online = True): - super().__init__() - self.encoder = CRNNEncoder( - feat_size=feat_size, - dict_size=dict_size, - num_conv_layers=num_conv_layers, - num_rnn_layers=num_rnn_layers, - rnn_size=rnn_size, - use_gru=use_gru, - share_rnn_weights=share_rnn_weights, - apply_online=apply_online) - assert (self.encoder.output_size == rnn_size) - - self.decoder = CTCDecoder( - odim=dict_size, # is in vocab - enc_n_units=self.encoder.output_size, - blank_id=0, # first token is - dropout_rate=0.0, - reduction=True, # sum - batch_average=True) # sum / batch_size - - def forward(self, audio, audio_len, text, text_len): - """Compute Model loss - - Args: - audio (Tenosr): [B, T, D] - audio_len (Tensor): [B] - text (Tensor): [B, U] - text_len (Tensor): [B] - - Returns: - loss (Tenosr): [1] - """ - eouts, eouts_len = self.encoder(audio, audio_len) - loss = self.decoder(eouts, eouts_len, text, text_len) - return loss - - @paddle.no_grad() - def decode(self, audio, audio_len, vocab_list, decoding_method, - lang_model_path, beam_alpha, beam_beta, beam_size, cutoff_prob, - cutoff_top_n, num_processes): - # init once - # decoders only accept string encoded in utf-8 - self.decoder.init_decode( - beam_alpha=beam_alpha, - beam_beta=beam_beta, - lang_model_path=lang_model_path, - vocab_list=vocab_list, - decoding_method=decoding_method) - - eouts, eouts_len = self.encoder(audio, audio_len) - probs = self.decoder.softmax(eouts) - return self.decoder.decode_probs( - probs.numpy(), eouts_len, vocab_list, decoding_method, - lang_model_path, beam_alpha, beam_beta, beam_size, cutoff_prob, - cutoff_top_n, num_processes) - - @classmethod - def from_pretrained(cls, dataloader, config, checkpoint_path): - """Build a DeepSpeech2Model model from a pretrained model. - Parameters - ---------- - dataloader: paddle.io.DataLoader - - config: yacs.config.CfgNode - model configs - - checkpoint_path: Path or str - the path of pretrained model checkpoint, without extension name - - Returns - ------- - DeepSpeech2Model - The model built from pretrained result. 
- """ - model = cls(feat_size=dataloader.collate_fn.feature_size, - dict_size=dataloader.collate_fn.vocab_size, - num_conv_layers=config.model.num_conv_layers, - num_rnn_layers=config.model.num_rnn_layers, - rnn_size=config.model.rnn_layer_size, - use_gru=config.model.use_gru, - share_rnn_weights=config.model.share_rnn_weights, - apply_online=config.model.apply_online) - infos = Checkpoint().load_parameters( - model, checkpoint_path=checkpoint_path) - logger.info(f"checkpoint info: {infos}") - layer_tools.summary(model) - return model - - -class DeepSpeech2InferModel(DeepSpeech2Model): - def __init__(self, - feat_size, - dict_size, - num_conv_layers=2, - num_rnn_layers=3, - rnn_size=1024, - use_gru=False, - share_rnn_weights=True, - apply_online = True): - super().__init__( - feat_size=feat_size, - dict_size=dict_size, - num_conv_layers=num_conv_layers, - num_rnn_layers=num_rnn_layers, - rnn_size=rnn_size, - use_gru=use_gru, - share_rnn_weights=share_rnn_weights, - apply_online=apply_online) - - def forward(self, audio, audio_len): - """export model function - - Args: - audio (Tensor): [B, T, D] - audio_len (Tensor): [B] - - Returns: - probs: probs after softmax - """ - eouts, eouts_len = self.encoder(audio, audio_len) - probs = self.decoder.softmax(eouts) - return probs diff --git a/deepspeech/models/ds2/rnn.py b/deepspeech/models/ds2/rnn.py new file mode 100644 index 000000000..01b55c4a2 --- /dev/null +++ b/deepspeech/models/ds2/rnn.py @@ -0,0 +1,314 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import math + +import paddle +from paddle import nn +from paddle.nn import functional as F +from paddle.nn import initializer as I + +from deepspeech.modules.activation import brelu +from deepspeech.modules.mask import make_non_pad_mask +from deepspeech.utils.log import Log + +logger = Log(__name__).getlog() + +__all__ = ['RNNStack'] + + +class RNNCell(nn.RNNCellBase): + r""" + Elman RNN (SimpleRNN) cell. Given the inputs and previous states, it + computes the outputs and updates states. + The formula used is as follows: + .. math:: + h_{t} & = act(x_{t} + b_{ih} + W_{hh}h_{t-1} + b_{hh}) + y_{t} & = h_{t} + + where :math:`act` is for :attr:`activation`. 
+ """ + + def __init__(self, + hidden_size: int, + activation="tanh", + weight_ih_attr=None, + weight_hh_attr=None, + bias_ih_attr=None, + bias_hh_attr=None, + name=None): + super().__init__() + std = 1.0 / math.sqrt(hidden_size) + self.weight_hh = self.create_parameter( + (hidden_size, hidden_size), + weight_hh_attr, + default_initializer=I.Uniform(-std, std)) + self.bias_ih = None + self.bias_hh = self.create_parameter( + (hidden_size, ), + bias_hh_attr, + is_bias=True, + default_initializer=I.Uniform(-std, std)) + + self.hidden_size = hidden_size + if activation not in ["tanh", "relu", "brelu"]: + raise ValueError( + "activation for SimpleRNNCell should be tanh or relu, " + "but get {}".format(activation)) + self.activation = activation + self._activation_fn = paddle.tanh \ + if activation == "tanh" \ + else F.relu + if activation == 'brelu': + self._activation_fn = brelu + + def forward(self, inputs, states=None): + if states is None: + states = self.get_initial_states(inputs, self.state_shape) + pre_h = states + i2h = inputs + if self.bias_ih is not None: + i2h += self.bias_ih + h2h = paddle.matmul(pre_h, self.weight_hh, transpose_y=True) + if self.bias_hh is not None: + h2h += self.bias_hh + h = self._activation_fn(i2h + h2h) + return h, h + + @property + def state_shape(self): + return (self.hidden_size, ) + + +class GRUCell(nn.RNNCellBase): + r""" + Gated Recurrent Unit (GRU) RNN cell. Given the inputs and previous states, + it computes the outputs and updates states. + The formula for GRU used is as follows: + .. math:: + r_{t} & = \sigma(W_{ir}x_{t} + b_{ir} + W_{hr}h_{t-1} + b_{hr}) + z_{t} & = \sigma(W_{iz}x_{t} + b_{iz} + W_{hz}h_{t-1} + b_{hz}) + \widetilde{h}_{t} & = \tanh(W_{ic}x_{t} + b_{ic} + r_{t} * (W_{hc}h_{t-1} + b_{hc})) + h_{t} & = z_{t} * h_{t-1} + (1 - z_{t}) * \widetilde{h}_{t} + y_{t} & = h_{t} + + where :math:`\sigma` is the sigmoid fucntion, and * is the elemetwise + multiplication operator. 
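# Aside (illustrative, not part of this diff): the forward pass further below
# computes the update as h = (pre_hidden - c) * z + c, which is just a
# rearrangement of the formula above, h_t = z_t * h_{t-1} + (1 - z_t) * c_t.
# A tiny numeric check:
import paddle

pre_h = paddle.to_tensor([0.2, -0.5])
c = paddle.to_tensor([0.7, 0.1])
z = paddle.to_tensor([0.3, 0.9])
print(paddle.allclose((pre_h - c) * z + c, z * pre_h + (1.0 - z) * c))   # True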
+ """ + + def __init__(self, + input_size: int, + hidden_size: int, + weight_ih_attr=None, + weight_hh_attr=None, + bias_ih_attr=None, + bias_hh_attr=None, + name=None): + super().__init__() + std = 1.0 / math.sqrt(hidden_size) + self.weight_hh = self.create_parameter( + (3 * hidden_size, hidden_size), + weight_hh_attr, + default_initializer=I.Uniform(-std, std)) + self.bias_ih = None + self.bias_hh = self.create_parameter( + (3 * hidden_size, ), + bias_hh_attr, + is_bias=True, + default_initializer=I.Uniform(-std, std)) + + self.hidden_size = hidden_size + self.input_size = input_size + self._gate_activation = F.sigmoid + self._activation = paddle.tanh + + def forward(self, inputs, states=None): + if states is None: + states = self.get_initial_states(inputs, self.state_shape) + + pre_hidden = states + x_gates = inputs + if self.bias_ih is not None: + x_gates = x_gates + self.bias_ih + h_gates = paddle.matmul(pre_hidden, self.weight_hh, transpose_y=True) + if self.bias_hh is not None: + h_gates = h_gates + self.bias_hh + + x_r, x_z, x_c = paddle.split(x_gates, num_or_sections=3, axis=1) + h_r, h_z, h_c = paddle.split(h_gates, num_or_sections=3, axis=1) + + r = self._gate_activation(x_r + h_r) + z = self._gate_activation(x_z + h_z) + c = self._activation(x_c + r * h_c) # apply reset gate after mm + h = (pre_hidden - c) * z + c + # https://www.paddlepaddle.org.cn/documentation/docs/zh/api/paddle/fluid/layers/dynamic_gru_cn.html#dynamic-gru + + return h, h + + @property + def state_shape(self): + r""" + The `state_shape` of GRUCell is a shape `[hidden_size]` (-1 for batch + size would be automatically inserted into shape). The shape corresponds + to the shape of :math:`h_{t-1}`. + """ + return (self.hidden_size, ) + + +class BiRNNWithBN(nn.Layer): + """Bidirectonal simple rnn layer with sequence-wise batch normalization. + The batch normalization is only performed on input-state weights. + + :param size: Dimension of RNN cells. + :type size: int + :param share_weights: Whether to share input-hidden weights between + forward and backward directional RNNs. + :type share_weights: bool + :return: Bidirectional simple rnn layer. + :rtype: Variable + """ + + def __init__(self, i_size: int, h_size: int, share_weights: bool): + super().__init__() + self.share_weights = share_weights + if self.share_weights: + #input-hidden weights shared between bi-directional rnn. 
+ self.fw_fc = nn.Linear(i_size, h_size, bias_attr=False) + # batch norm is only performed on input-state projection + self.fw_bn = nn.BatchNorm1D( + h_size, bias_attr=None, data_format='NLC') + self.bw_fc = self.fw_fc + self.bw_bn = self.fw_bn + else: + self.fw_fc = nn.Linear(i_size, h_size, bias_attr=False) + self.fw_bn = nn.BatchNorm1D( + h_size, bias_attr=None, data_format='NLC') + self.bw_fc = nn.Linear(i_size, h_size, bias_attr=False) + self.bw_bn = nn.BatchNorm1D( + h_size, bias_attr=None, data_format='NLC') + + self.fw_cell = RNNCell(hidden_size=h_size, activation='brelu') + self.bw_cell = RNNCell(hidden_size=h_size, activation='brelu') + self.fw_rnn = nn.RNN( + self.fw_cell, is_reverse=False, time_major=False) #[B, T, D] + self.bw_rnn = nn.RNN( + self.fw_cell, is_reverse=True, time_major=False) #[B, T, D] + + def forward(self, x: paddle.Tensor, x_len: paddle.Tensor): + # x, shape [B, T, D] + fw_x = self.fw_bn(self.fw_fc(x)) + bw_x = self.bw_bn(self.bw_fc(x)) + fw_x, _ = self.fw_rnn(inputs=fw_x, sequence_length=x_len) + bw_x, _ = self.bw_rnn(inputs=bw_x, sequence_length=x_len) + x = paddle.concat([fw_x, bw_x], axis=-1) + return x, x_len + + +class BiGRUWithBN(nn.Layer): + """Bidirectonal gru layer with sequence-wise batch normalization. + The batch normalization is only performed on input-state weights. + + :param name: Name of the layer. + :type name: string + :param input: Input layer. + :type input: Variable + :param size: Dimension of GRU cells. + :type size: int + :param act: Activation type. + :type act: string + :return: Bidirectional GRU layer. + :rtype: Variable + """ + + def __init__(self, i_size: int, h_size: int): + super().__init__() + hidden_size = h_size * 3 + + self.fw_fc = nn.Linear(i_size, hidden_size, bias_attr=False) + self.fw_bn = nn.BatchNorm1D( + hidden_size, bias_attr=None, data_format='NLC') + self.bw_fc = nn.Linear(i_size, hidden_size, bias_attr=False) + self.bw_bn = nn.BatchNorm1D( + hidden_size, bias_attr=None, data_format='NLC') + + self.fw_cell = GRUCell(input_size=hidden_size, hidden_size=h_size) + self.bw_cell = GRUCell(input_size=hidden_size, hidden_size=h_size) + self.fw_rnn = nn.RNN( + self.fw_cell, is_reverse=False, time_major=False) #[B, T, D] + self.bw_rnn = nn.RNN( + self.fw_cell, is_reverse=True, time_major=False) #[B, T, D] + + def forward(self, x, x_len): + # x, shape [B, T, D] + fw_x = self.fw_bn(self.fw_fc(x)) + bw_x = self.bw_bn(self.bw_fc(x)) + fw_x, _ = self.fw_rnn(inputs=fw_x, sequence_length=x_len) + bw_x, _ = self.bw_rnn(inputs=bw_x, sequence_length=x_len) + x = paddle.concat([fw_x, bw_x], axis=-1) + return x, x_len + + +class RNNStack(nn.Layer): + """RNN group with stacked bidirectional simple RNN or GRU layers. + + :param input: Input layer. + :type input: Variable + :param size: Dimension of RNN cells in each layer. + :type size: int + :param num_stacks: Number of stacked rnn layers. + :type num_stacks: int + :param use_gru: Use gru if set True. Use simple rnn if set False. + :type use_gru: bool + :param share_rnn_weights: Whether to share input-hidden weights between + forward and backward directional RNNs. + It is only available when use_gru=False. + :type share_weights: bool + :return: Output layer of the RNN group. 
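# Aside (illustrative, not part of this diff): the stack's forward pass below
# re-zeroes padded frames after every layer. An inline equivalent of that length
# masking (make_non_pad_mask is the repo helper; this is only a sketch):
import paddle

x = paddle.randn([2, 5, 8])                          # [B, T, D]
x_len = paddle.to_tensor([5, 3], dtype='int64')      # valid frames per utterance
t = paddle.arange(x.shape[1]).unsqueeze(0)           # [1, T]
masks = (t < x_len.unsqueeze(1)).astype(x.dtype)     # [B, T], 1.0 on real frames
x = x * masks.unsqueeze(-1)                          # padded tail becomes zero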
+ :rtype: Variable + """ + + def __init__(self, + i_size: int, + h_size: int, + num_stacks: int, + use_gru: bool, + share_rnn_weights: bool): + super().__init__() + rnn_stacks = [] + for i in range(num_stacks): + if use_gru: + #default:GRU using tanh + rnn_stacks.append(BiGRUWithBN(i_size=i_size, h_size=h_size)) + else: + rnn_stacks.append( + BiRNNWithBN( + i_size=i_size, + h_size=h_size, + share_weights=share_rnn_weights)) + i_size = h_size * 2 + + self.rnn_stacks = nn.ModuleList(rnn_stacks) + + def forward(self, x: paddle.Tensor, x_len: paddle.Tensor): + """ + x: shape [B, T, D] + x_len: shpae [B] + """ + for i, rnn in enumerate(self.rnn_stacks): + x, x_len = rnn(x, x_len) + masks = make_non_pad_mask(x_len) #[B, T] + masks = masks.unsqueeze(-1) # [B, T, 1] + # TODO(Hui Zhang): not support bool multiply + masks = masks.astype(x.dtype) + x = x.multiply(masks) + return x, x_len From 66c59cdeae1ce9952766046fb67ce4272ece0d1f Mon Sep 17 00:00:00 2001 From: huangyuxin Date: Wed, 28 Jul 2021 07:52:59 +0000 Subject: [PATCH 05/24] adding pre-commit --- deepspeech/models/ds2_online/deepspeech2.py | 51 ++++++++++++++------- 1 file changed, 34 insertions(+), 17 deletions(-) diff --git a/deepspeech/models/ds2_online/deepspeech2.py b/deepspeech/models/ds2_online/deepspeech2.py index 4ac6384e3..3c77209fa 100644 --- a/deepspeech/models/ds2_online/deepspeech2.py +++ b/deepspeech/models/ds2_online/deepspeech2.py @@ -15,25 +15,23 @@ from typing import Optional import paddle -from paddle import nn import paddle.nn.functional as F - +from paddle import nn +from paddle.fluid.layers import fc +from paddle.nn import GRU +from paddle.nn import LayerList +from paddle.nn import LayerNorm +from paddle.nn import Linear +from paddle.nn import LSTM from yacs.config import CfgNode from deepspeech.models.ds2_online.conv import ConvStack -from deepspeech.modules.ctc import CTCDecoder from deepspeech.models.ds2_online.rnn import RNNStack +from deepspeech.modules.ctc import CTCDecoder from deepspeech.utils import layer_tools from deepspeech.utils.checkpoint import Checkpoint from deepspeech.utils.log import Log -from paddle.nn import LSTM, GRU, Linear -from paddle.nn import LayerNorm -from paddle.nn import LayerList - -from paddle.fluid.layers import fc - - logger = Log(__name__).getlog() __all__ = ['DeepSpeech2ModelOnline', 'DeepSpeech2InferModeOnline'] @@ -68,20 +66,39 @@ class CRNNEncoder(nn.Layer): layernorm_size = rnn_size if use_gru == True: - self.rnn.append(GRU(input_size=i_size, hidden_size=rnn_size, num_layers=1, direction = rnn_direction)) + self.rnn.append( + GRU(input_size=i_size, + hidden_size=rnn_size, + num_layers=1, + direction=rnn_direction)) self.layernorm_list.append(LayerNorm(layernorm_size)) for i in range(1, num_rnn_layers): - self.rnn.append(GRU(input_size=layernorm_size, hidden_size=rnn_size, num_layers=1, direction = rnn_direction)) + self.rnn.append( + GRU(input_size=layernorm_size, + hidden_size=rnn_size, + num_layers=1, + direction=rnn_direction)) self.layernorm_list.append(LayerNorm(layernorm_size)) else: - self.rnn.append(LSTM(input_size=i_size, hidden_size=rnn_size, num_layers=1, direction = rnn_direction)) + self.rnn.append( + LSTM( + input_size=i_size, + hidden_size=rnn_size, + num_layers=1, + direction=rnn_direction)) self.layernorm_list.append(LayerNorm(layernorm_size)) for i in range(1, num_rnn_layers): - self.rnn.append(LSTM(input_size=layernorm_size, hidden_size=rnn_size, num_layers=1, direction = rnn_direction)) + self.rnn.append( + LSTM( + input_size=layernorm_size, + 
hidden_size=rnn_size, + num_layers=1, + direction=rnn_direction)) self.layernorm_list.append(LayerNorm(layernorm_size)) fc_input_size = layernorm_size for i in range(self.num_fc_layers): - self.fc_layers_list.append(nn.Linear(fc_input_size, fc_layers_size_list[i])) + self.fc_layers_list.append( + nn.Linear(fc_input_size, fc_layers_size_list[i])) fc_input_size = fc_layers_size_list[i] @property @@ -119,7 +136,7 @@ class CRNNEncoder(nn.Layer): x, output_state = self.rnn[0](x, None, x_lens) x = self.layernorm_list[0](x) for i in range(1, self.num_rnn_layers): - x, output_state = self.rnn[i](x, output_state, x_lens) #[B, T, D] + x, output_state = self.rnn[i](x, output_state, x_lens) #[B, T, D] x = self.layernorm_list[i](x) for i in range(self.num_fc_layers): @@ -166,7 +183,7 @@ class DeepSpeech2ModelOnline(nn.Layer): num_rnn_layers=4, #Number of stacking RNN layers. rnn_layer_size=1024, #RNN layer size (number of RNN cells). num_fc_layers=2, - fc_layers_size_list = [512,256], + fc_layers_size_list=[512, 256], use_gru=True, #Use gru if set True. Use simple rnn if set False. share_rnn_weights=True #Whether to share input-hidden weights between forward and backward directional RNNs.Notice that for GRU, weight sharing is not supported. )) From 6079a2495dd89a8e081ff5fb020fc08f0ad11b11 Mon Sep 17 00:00:00 2001 From: huangyuxin Date: Wed, 28 Jul 2021 08:32:04 +0000 Subject: [PATCH 06/24] =?UTF-8?q?=E6=8A=8Ads2=E4=B8=AD=E7=9A=84deepspeech2?= =?UTF-8?q?.py=E6=81=A2=E5=A4=8D=E4=BA=86?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- deepspeech/models/ds2/deepspeech2.py | 262 +++++++++++++++++++++++++++ 1 file changed, 262 insertions(+) create mode 100644 deepspeech/models/ds2/deepspeech2.py diff --git a/deepspeech/models/ds2/deepspeech2.py b/deepspeech/models/ds2/deepspeech2.py new file mode 100644 index 000000000..4026c89a7 --- /dev/null +++ b/deepspeech/models/ds2/deepspeech2.py @@ -0,0 +1,262 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
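# Aside (illustrative, not part of this diff): a condensed, self-contained sketch
# of the streaming encoder head assembled above -- forward-only RNN layers, each
# followed by LayerNorm, then a Linear/ReLU stack whose last width becomes the
# encoder output size. Layer sizes follow the defaults quoted in the diff; the
# class name and everything else here are assumptions for illustration.
import paddle
import paddle.nn.functional as F
from paddle import nn

class TinyOnlineHead(nn.Layer):
    def __init__(self, i_size=1312, rnn_size=1024, num_rnn_layers=4,
                 fc_sizes=(512, 256)):
        super().__init__()
        self.rnn, self.norm, self.fc = nn.LayerList(), nn.LayerList(), nn.LayerList()
        in_size = i_size
        for _ in range(num_rnn_layers):
            self.rnn.append(nn.GRU(in_size, rnn_size, direction='forward'))
            self.norm.append(nn.LayerNorm(rnn_size))
            in_size = rnn_size
        for out_size in fc_sizes:
            self.fc.append(nn.Linear(in_size, out_size))
            in_size = out_size

    def forward(self, x, x_len):
        for rnn, norm in zip(self.rnn, self.norm):
            x, _ = rnn(x, None, x_len)    # unidirectional: usable frame by frame
            x = norm(x)
        for fc in self.fc:
            x = F.relu(fc(x))
        return x, x_len                   # last dim == fc_sizes[-1]

# e.g. TinyOnlineHead()(paddle.randn([2, 20, 1312]), paddle.to_tensor([20, 12]))
# returns features of shape [2, 20, 256].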
+"""Deepspeech2 ASR Model""" +from typing import Optional + +import paddle +from paddle import nn +from yacs.config import CfgNode + +from deepspeech.models.ds2.conv import ConvStack +from deepspeech.modules.ctc import CTCDecoder +from deepspeech.models.ds2.rnn import RNNStack +from deepspeech.utils import layer_tools +from deepspeech.utils.checkpoint import Checkpoint +from deepspeech.utils.log import Log + +logger = Log(__name__).getlog() + +__all__ = ['DeepSpeech2Model', 'DeepSpeech2InferModel'] + + +class CRNNEncoder(nn.Layer): + def __init__(self, + feat_size, + dict_size, + num_conv_layers=2, + num_rnn_layers=3, + rnn_size=1024, + use_gru=False, + share_rnn_weights=True): + super().__init__() + self.rnn_size = rnn_size + self.feat_size = feat_size # 161 for linear + self.dict_size = dict_size + + self.conv = ConvStack(feat_size, num_conv_layers) + + i_size = self.conv.output_height # H after conv stack + self.rnn = RNNStack( + i_size=i_size, + h_size=rnn_size, + num_stacks=num_rnn_layers, + use_gru=use_gru, + share_rnn_weights=share_rnn_weights) + + @property + def output_size(self): + return self.rnn_size * 2 + + def forward(self, audio, audio_len): + """Compute Encoder outputs + + Args: + audio (Tensor): [B, Tmax, D] + text (Tensor): [B, Umax] + audio_len (Tensor): [B] + text_len (Tensor): [B] + Returns: + x (Tensor): encoder outputs, [B, T, D] + x_lens (Tensor): encoder length, [B] + """ + # [B, T, D] -> [B, D, T] + audio = audio.transpose([0, 2, 1]) + # [B, D, T] -> [B, C=1, D, T] + x = audio.unsqueeze(1) + x_lens = audio_len + + # convolution group + x, x_lens = self.conv(x, x_lens) + + # convert data from convolution feature map to sequence of vectors + #B, C, D, T = paddle.shape(x) # not work under jit + x = x.transpose([0, 3, 1, 2]) #[B, T, C, D] + #x = x.reshape([B, T, C * D]) #[B, T, C*D] # not work under jit + x = x.reshape([0, 0, -1]) #[B, T, C*D] + + # remove padding part + x, x_lens = self.rnn(x, x_lens) #[B, T, D] + return x, x_lens + + +class DeepSpeech2Model(nn.Layer): + """The DeepSpeech2 network structure. + + :param audio_data: Audio spectrogram data layer. + :type audio_data: Variable + :param text_data: Transcription text data layer. + :type text_data: Variable + :param audio_len: Valid sequence length data layer. + :type audio_len: Variable + :param masks: Masks data layer to reset padding. + :type masks: Variable + :param dict_size: Dictionary size for tokenized transcription. + :type dict_size: int + :param num_conv_layers: Number of stacking convolution layers. + :type num_conv_layers: int + :param num_rnn_layers: Number of stacking RNN layers. + :type num_rnn_layers: int + :param rnn_size: RNN layer size (dimension of RNN cells). + :type rnn_size: int + :param use_gru: Use gru if set True. Use simple rnn if set False. + :type use_gru: bool + :param share_rnn_weights: Whether to share input-hidden weights between + forward and backward direction RNNs. + It is only available when use_gru=False. + :type share_weights: bool + :return: A tuple of an output unnormalized log probability layer ( + before softmax) and a ctc cost layer. + :rtype: tuple of LayerOutput + """ + + @classmethod + def params(cls, config: Optional[CfgNode]=None) -> CfgNode: + default = CfgNode( + dict( + num_conv_layers=2, #Number of stacking convolution layers. + num_rnn_layers=3, #Number of stacking RNN layers. + rnn_layer_size=1024, #RNN layer size (number of RNN cells). + use_gru=True, #Use gru if set True. Use simple rnn if set False. 
+ share_rnn_weights=True #Whether to share input-hidden weights between forward and backward directional RNNs.Notice that for GRU, weight sharing is not supported. + )) + if config is not None: + config.merge_from_other_cfg(default) + return default + + def __init__(self, + feat_size, + dict_size, + num_conv_layers=2, + num_rnn_layers=3, + rnn_size=1024, + use_gru=False, + share_rnn_weights=True): + super().__init__() + self.encoder = CRNNEncoder( + feat_size=feat_size, + dict_size=dict_size, + num_conv_layers=num_conv_layers, + num_rnn_layers=num_rnn_layers, + rnn_size=rnn_size, + use_gru=use_gru, + share_rnn_weights=share_rnn_weights) + assert (self.encoder.output_size == rnn_size * 2) + + self.decoder = CTCDecoder( + odim=dict_size, # is in vocab + enc_n_units=self.encoder.output_size, + blank_id=0, # first token is + dropout_rate=0.0, + reduction=True, # sum + batch_average=True) # sum / batch_size + + def forward(self, audio, audio_len, text, text_len): + """Compute Model loss + + Args: + audio (Tenosr): [B, T, D] + audio_len (Tensor): [B] + text (Tensor): [B, U] + text_len (Tensor): [B] + + Returns: + loss (Tenosr): [1] + """ + eouts, eouts_len = self.encoder(audio, audio_len) + loss = self.decoder(eouts, eouts_len, text, text_len) + return loss + + @paddle.no_grad() + def decode(self, audio, audio_len, vocab_list, decoding_method, + lang_model_path, beam_alpha, beam_beta, beam_size, cutoff_prob, + cutoff_top_n, num_processes): + # init once + # decoders only accept string encoded in utf-8 + self.decoder.init_decode( + beam_alpha=beam_alpha, + beam_beta=beam_beta, + lang_model_path=lang_model_path, + vocab_list=vocab_list, + decoding_method=decoding_method) + + eouts, eouts_len = self.encoder(audio, audio_len) + probs = self.decoder.softmax(eouts) + return self.decoder.decode_probs( + probs.numpy(), eouts_len, vocab_list, decoding_method, + lang_model_path, beam_alpha, beam_beta, beam_size, cutoff_prob, + cutoff_top_n, num_processes) + + @classmethod + def from_pretrained(cls, dataloader, config, checkpoint_path): + """Build a DeepSpeech2Model model from a pretrained model. + Parameters + ---------- + dataloader: paddle.io.DataLoader + + config: yacs.config.CfgNode + model configs + + checkpoint_path: Path or str + the path of pretrained model checkpoint, without extension name + + Returns + ------- + DeepSpeech2Model + The model built from pretrained result. 
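# Aside (illustrative, not part of this diff; assumes the deepspeech package is
# importable): a minimal smoke test of the restored offline model. forward()
# consumes padded spectrogram features plus token ids and returns the CTC loss;
# every tensor size below is made up.
import paddle
from deepspeech.models.ds2 import DeepSpeech2Model

model = DeepSpeech2Model(feat_size=161, dict_size=100)
audio = paddle.randn([2, 120, 161])                            # [B, T, D]
audio_len = paddle.to_tensor([120, 90], dtype='int64')
text = paddle.randint(1, 100, shape=[2, 10], dtype='int32')    # token ids
text_len = paddle.to_tensor([10, 6], dtype='int64')
loss = model(audio, audio_len, text, text_len)                 # scalar CTC loss
print(float(loss))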
+ """ + model = cls(feat_size=dataloader.collate_fn.feature_size, + dict_size=dataloader.collate_fn.vocab_size, + num_conv_layers=config.model.num_conv_layers, + num_rnn_layers=config.model.num_rnn_layers, + rnn_size=config.model.rnn_layer_size, + use_gru=config.model.use_gru, + share_rnn_weights=config.model.share_rnn_weights) + infos = Checkpoint().load_parameters( + model, checkpoint_path=checkpoint_path) + logger.info(f"checkpoint info: {infos}") + layer_tools.summary(model) + return model + + +class DeepSpeech2InferModel(DeepSpeech2Model): + def __init__(self, + feat_size, + dict_size, + num_conv_layers=2, + num_rnn_layers=3, + rnn_size=1024, + use_gru=False, + share_rnn_weights=True): + super().__init__( + feat_size=feat_size, + dict_size=dict_size, + num_conv_layers=num_conv_layers, + num_rnn_layers=num_rnn_layers, + rnn_size=rnn_size, + use_gru=use_gru, + share_rnn_weights=share_rnn_weights) + + def forward(self, audio, audio_len): + """export model function + + Args: + audio (Tensor): [B, T, D] + audio_len (Tensor): [B] + + Returns: + probs: probs after softmax + """ + eouts, eouts_len = self.encoder(audio, audio_len) + probs = self.decoder.softmax(eouts) + return probs From 5dd9e2f8ec81255537b3c6c66ca829c85ecd813a Mon Sep 17 00:00:00 2001 From: huangyuxin Date: Wed, 28 Jul 2021 08:39:01 +0000 Subject: [PATCH 07/24] =?UTF-8?q?=E5=85=88=E4=B8=8D=E6=9A=B4=E9=9C=B2?= =?UTF-8?q?=E5=87=BAonline?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- deepspeech/exps/deepspeech2/model.py | 62 ++++++++++------------------ 1 file changed, 21 insertions(+), 41 deletions(-) diff --git a/deepspeech/exps/deepspeech2/model.py b/deepspeech/exps/deepspeech2/model.py index 5c6c5fa18..91b7a1bfe 100644 --- a/deepspeech/exps/deepspeech2/model.py +++ b/deepspeech/exps/deepspeech2/model.py @@ -29,8 +29,8 @@ from deepspeech.io.sampler import SortagradBatchSampler from deepspeech.io.sampler import SortagradDistributedBatchSampler from deepspeech.models.ds2 import DeepSpeech2InferModel from deepspeech.models.ds2 import DeepSpeech2Model -from deepspeech.models.ds2_online import DeepSpeech2InferModelOnline -from deepspeech.models.ds2_online import DeepSpeech2ModelOnline +#from deepspeech.models.ds2_online import DeepSpeech2InferModelOnline +#from deepspeech.models.ds2_online import DeepSpeech2ModelOnline from deepspeech.training.gradclip import ClipGradByGlobalNormWithLog from deepspeech.training.trainer import Trainer from deepspeech.utils import error_rate @@ -122,25 +122,15 @@ class DeepSpeech2Trainer(Trainer): def setup_model(self): config = self.config - if (config.model.apply_online == False): - model = DeepSpeech2Model( - feat_size=self.train_loader.collate_fn.feature_size, - dict_size=self.train_loader.collate_fn.vocab_size, - num_conv_layers=config.model.num_conv_layers, - num_rnn_layers=config.model.num_rnn_layers, - rnn_size=config.model.rnn_layer_size, - use_gru=config.model.use_gru, - share_rnn_weights=config.model.share_rnn_weights) - else: - model = DeepSpeech2ModelOnline( - feat_size=self.train_loader.collate_fn.feature_size, - dict_size=self.train_loader.collate_fn.vocab_size, - num_conv_layers=config.model.num_conv_layers, - num_rnn_layers=config.model.num_rnn_layers, - rnn_size=config.model.rnn_layer_size, - use_gru=config.model.use_gru, - share_rnn_weights=config.model.share_rnn_weights) - + model = DeepSpeech2Model( + feat_size=self.train_loader.collate_fn.feature_size, + dict_size=self.train_loader.collate_fn.vocab_size, + 
num_conv_layers=config.model.num_conv_layers, + num_rnn_layers=config.model.num_rnn_layers, + rnn_size=config.model.rnn_layer_size, + use_gru=config.model.use_gru, + share_rnn_weights=config.model.share_rnn_weights) + if self.parallel: model = paddle.DataParallel(model) @@ -347,7 +337,7 @@ class DeepSpeech2Tester(DeepSpeech2Trainer): else: infer_model = DeepSpeech2InferModelOnline.from_pretrained( self.test_loader, self.config, self.args.checkpoint_path) - + infer_model.eval() feat_dim = self.test_loader.collate_fn.feature_size static_model = paddle.jit.to_static( @@ -384,25 +374,15 @@ class DeepSpeech2Tester(DeepSpeech2Trainer): def setup_model(self): config = self.config - if config.model.apply_online == False: - model = DeepSpeech2Model( - feat_size=self.test_loader.collate_fn.feature_size, - dict_size=self.test_loader.collate_fn.vocab_size, - num_conv_layers=config.model.num_conv_layers, - num_rnn_layers=config.model.num_rnn_layers, - rnn_size=config.model.rnn_layer_size, - use_gru=config.model.use_gru, - share_rnn_weights=config.model.share_rnn_weights) - else: - model = DeepSpeech2ModelOnline( - feat_size=self.test_loader.collate_fn.feature_size, - dict_size=self.test_loader.collate_fn.vocab_size, - num_conv_layers=config.model.num_conv_layers, - num_rnn_layers=config.model.num_rnn_layers, - rnn_size=config.model.rnn_layer_size, - use_gru=config.model.use_gru, - share_rnn_weights=config.model.share_rnn_weights) - + model = DeepSpeech2Model( + feat_size=self.test_loader.collate_fn.feature_size, + dict_size=self.test_loader.collate_fn.vocab_size, + num_conv_layers=config.model.num_conv_layers, + num_rnn_layers=config.model.num_rnn_layers, + rnn_size=config.model.rnn_layer_size, + use_gru=config.model.use_gru, + share_rnn_weights=config.model.share_rnn_weights) + self.model = model logger.info("Setup model!") From 2c8d28111a52b0cabac127b98e18366ce002d442 Mon Sep 17 00:00:00 2001 From: huangyuxin Date: Wed, 28 Jul 2021 08:49:49 +0000 Subject: [PATCH 08/24] fix some small mistakes --- deepspeech/exps/deepspeech2/model.py | 8 ++------ deepspeech/models/ds2/__init__.py | 4 ++++ examples/aishell/s0/run.sh | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) create mode 100644 deepspeech/models/ds2/__init__.py diff --git a/deepspeech/exps/deepspeech2/model.py b/deepspeech/exps/deepspeech2/model.py index 91b7a1bfe..fab94ced8 100644 --- a/deepspeech/exps/deepspeech2/model.py +++ b/deepspeech/exps/deepspeech2/model.py @@ -331,12 +331,8 @@ class DeepSpeech2Tester(DeepSpeech2Trainer): exit(-1) def export(self): - if self.config.model.apply_online == False: - infer_model = DeepSpeech2InferModel.from_pretrained( - self.test_loader, self.config, self.args.checkpoint_path) - else: - infer_model = DeepSpeech2InferModelOnline.from_pretrained( - self.test_loader, self.config, self.args.checkpoint_path) + infer_model = DeepSpeech2InferModel.from_pretrained( + self.test_loader, self.config, self.args.checkpoint_path) infer_model.eval() feat_dim = self.test_loader.collate_fn.feature_size diff --git a/deepspeech/models/ds2/__init__.py b/deepspeech/models/ds2/__init__.py new file mode 100644 index 000000000..de78ebe91 --- /dev/null +++ b/deepspeech/models/ds2/__init__.py @@ -0,0 +1,4 @@ +from .deepspeech2 import DeepSpeech2Model +from .deepspeech2 import DeepSpeech2InferModel + +__all__ = ['DeepSpeech2Model', 'DeepSpeech2InferModel'] diff --git a/examples/aishell/s0/run.sh b/examples/aishell/s0/run.sh index 33b02f7ce..c9708dcc9 100755 --- a/examples/aishell/s0/run.sh +++ b/examples/aishell/s0/run.sh @@ 
-2,7 +2,7 @@ set -e source path.sh -gpus=2,3,4,5 +gpus=0,1,2,3 stage=0 stop_stage=100 conf_path=conf/deepspeech2.yaml From 87163864644767b5f0773dfa954746fa11d332fe Mon Sep 17 00:00:00 2001 From: Jackwaterveg <87408988+Jackwaterveg@users.noreply.github.com> Date: Wed, 28 Jul 2021 16:52:32 +0800 Subject: [PATCH 09/24] Update model.py --- deepspeech/exps/deepspeech2/model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deepspeech/exps/deepspeech2/model.py b/deepspeech/exps/deepspeech2/model.py index fab94ced8..7d0a26d78 100644 --- a/deepspeech/exps/deepspeech2/model.py +++ b/deepspeech/exps/deepspeech2/model.py @@ -332,7 +332,7 @@ class DeepSpeech2Tester(DeepSpeech2Trainer): def export(self): infer_model = DeepSpeech2InferModel.from_pretrained( - self.test_loader, self.config, self.args.checkpoint_path) + self.test_loader, self.config, self.args.checkpoint_path) infer_model.eval() feat_dim = self.test_loader.collate_fn.feature_size From 6baf9f0620a797512e9e3ae0f27a6cd4217fc9c8 Mon Sep 17 00:00:00 2001 From: huangyuxin Date: Thu, 29 Jul 2021 05:03:51 +0000 Subject: [PATCH 10/24] =?UTF-8?q?=E8=B7=91=E9=80=9A=E4=BA=86deeppseech=5Fo?= =?UTF-8?q?nline=E7=9A=84=E6=B5=81=E7=A8=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- deepspeech/exps/deepspeech2/model.py | 68 +++++++++++++++++------- deepspeech/models/ds2/deepspeech2.py | 2 +- deepspeech/models/ds2_online/__init__.py | 18 +++++-- tests/deepspeech2_model_test.py | 6 ++- 4 files changed, 67 insertions(+), 27 deletions(-) diff --git a/deepspeech/exps/deepspeech2/model.py b/deepspeech/exps/deepspeech2/model.py index fab94ced8..9e870b13e 100644 --- a/deepspeech/exps/deepspeech2/model.py +++ b/deepspeech/exps/deepspeech2/model.py @@ -29,8 +29,8 @@ from deepspeech.io.sampler import SortagradBatchSampler from deepspeech.io.sampler import SortagradDistributedBatchSampler from deepspeech.models.ds2 import DeepSpeech2InferModel from deepspeech.models.ds2 import DeepSpeech2Model -#from deepspeech.models.ds2_online import DeepSpeech2InferModelOnline -#from deepspeech.models.ds2_online import DeepSpeech2ModelOnline +from deepspeech.models.ds2_online import DeepSpeech2InferModelOnline +from deepspeech.models.ds2_online import DeepSpeech2ModelOnline from deepspeech.training.gradclip import ClipGradByGlobalNormWithLog from deepspeech.training.trainer import Trainer from deepspeech.utils import error_rate @@ -122,14 +122,26 @@ class DeepSpeech2Trainer(Trainer): def setup_model(self): config = self.config - model = DeepSpeech2Model( - feat_size=self.train_loader.collate_fn.feature_size, - dict_size=self.train_loader.collate_fn.vocab_size, - num_conv_layers=config.model.num_conv_layers, - num_rnn_layers=config.model.num_rnn_layers, - rnn_size=config.model.rnn_layer_size, - use_gru=config.model.use_gru, - share_rnn_weights=config.model.share_rnn_weights) + if config.model.apply_online == True: + model = DeepSpeech2Model( + feat_size=self.train_loader.collate_fn.feature_size, + dict_size=self.train_loader.collate_fn.vocab_size, + num_conv_layers=config.model.num_conv_layers, + num_rnn_layers=config.model.num_rnn_layers, + rnn_size=config.model.rnn_layer_size, + use_gru=config.model.use_gru, + share_rnn_weights=config.model.share_rnn_weights) + else: + model = DeepSpeech2ModelOnline( + feat_size=self.train_loader.collate_fn.feature_size, + dict_size=self.train_loader.collate_fn.vocab_size, + num_conv_layers=config.model.num_conv_layers, + num_rnn_layers=config.model.num_rnn_layers, + 
num_fc_layers=config.model.num_fc_layers, + fc_layers_size_list=config.model.fc_layers_size_list, + rnn_size=config.model.rnn_layer_size, + use_gru=config.model.use_gru, + share_rnn_weights=config.model.share_rnn_weights) if self.parallel: model = paddle.DataParallel(model) @@ -331,8 +343,12 @@ class DeepSpeech2Tester(DeepSpeech2Trainer): exit(-1) def export(self): - infer_model = DeepSpeech2InferModel.from_pretrained( - self.test_loader, self.config, self.args.checkpoint_path) + if self.config.model.apply_online == True: + infer_model = DeepSpeech2InferModelOnline.from_pretrained( + self.test_loader, self.config, self.args.checkpoint_path) + else: + infer_model = DeepSpeech2InferModel.from_pretrained( + self.test_loader, self.config, self.args.checkpoint_path) infer_model.eval() feat_dim = self.test_loader.collate_fn.feature_size @@ -370,14 +386,26 @@ class DeepSpeech2Tester(DeepSpeech2Trainer): def setup_model(self): config = self.config - model = DeepSpeech2Model( - feat_size=self.test_loader.collate_fn.feature_size, - dict_size=self.test_loader.collate_fn.vocab_size, - num_conv_layers=config.model.num_conv_layers, - num_rnn_layers=config.model.num_rnn_layers, - rnn_size=config.model.rnn_layer_size, - use_gru=config.model.use_gru, - share_rnn_weights=config.model.share_rnn_weights) + if config.model.apply_online == True: + model = DeepSpeech2Model( + feat_size=self.test_loader.collate_fn.feature_size, + dict_size=self.test_loader.collate_fn.vocab_size, + num_conv_layers=config.model.num_conv_layers, + num_rnn_layers=config.model.num_rnn_layers, + rnn_size=config.model.rnn_layer_size, + use_gru=config.model.use_gru, + share_rnn_weights=config.model.share_rnn_weights) + else: + model = DeepSpeech2ModelOnline( + feat_size=self.train_loader.collate_fn.feature_size, + dict_size=self.train_loader.collate_fn.vocab_size, + num_conv_layers=config.model.num_conv_layers, + num_rnn_layers=config.model.num_rnn_layers, + num_fc_layers=config.model.num_fc_layers, + fc_layers_size_list=config.model.fc_layers_size_list, + rnn_size=config.model.rnn_layer_size, + use_gru=config.model.use_gru, + share_rnn_weights=config.model.share_rnn_weights) self.model = model logger.info("Setup model!") diff --git a/deepspeech/models/ds2/deepspeech2.py b/deepspeech/models/ds2/deepspeech2.py index 4026c89a7..8d737e800 100644 --- a/deepspeech/models/ds2/deepspeech2.py +++ b/deepspeech/models/ds2/deepspeech2.py @@ -19,8 +19,8 @@ from paddle import nn from yacs.config import CfgNode from deepspeech.models.ds2.conv import ConvStack -from deepspeech.modules.ctc import CTCDecoder from deepspeech.models.ds2.rnn import RNNStack +from deepspeech.modules.ctc import CTCDecoder from deepspeech.utils import layer_tools from deepspeech.utils.checkpoint import Checkpoint from deepspeech.utils.log import Log diff --git a/deepspeech/models/ds2_online/__init__.py b/deepspeech/models/ds2_online/__init__.py index 88076667c..255000eeb 100644 --- a/deepspeech/models/ds2_online/__init__.py +++ b/deepspeech/models/ds2_online/__init__.py @@ -1,7 +1,17 @@ -from .deepspeech2 import DeepSpeech2ModelOnline +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
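# Aside (illustrative, not part of this diff): the export() path above hands the
# eval-mode infer model to paddle.jit.to_static. A sketch of that step with
# assumed InputSpec shapes for 161-dim linear features and a placeholder output
# prefix; the real feature size comes from the loader's collate_fn.
import paddle
from paddle.static import InputSpec
from deepspeech.models.ds2 import DeepSpeech2InferModel

infer_model = DeepSpeech2InferModel(feat_size=161, dict_size=100)
infer_model.eval()
static_model = paddle.jit.to_static(
    infer_model,
    input_spec=[
        InputSpec(shape=[None, None, 161], dtype='float32'),   # audio [B, T, D]
        InputSpec(shape=[None], dtype='int64'),                # audio_len [B]
    ])
paddle.jit.save(static_model, 'exported/deepspeech2_infer')    # placeholder path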
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. from .deepspeech2 import DeepSpeech2InferModelOnline +from .deepspeech2 import DeepSpeech2ModelOnline __all__ = ['DeepSpeech2ModelOnline', 'DeepSpeech2InferModelOnline'] - - - diff --git a/tests/deepspeech2_model_test.py b/tests/deepspeech2_model_test.py index bb40802d7..1938f7147 100644 --- a/tests/deepspeech2_model_test.py +++ b/tests/deepspeech2_model_test.py @@ -16,8 +16,10 @@ import unittest import numpy as np import paddle -#from deepspeech.models.deepspeech2 import DeepSpeech2Model -from deepspeech.models.ds2_online import DeepSpeech2ModelOnline as DeepSpeech2Model +from deepspeech.models.deepspeech2 import DeepSpeech2Model + +from deepspeech.models.ds2_online import DeepSpeech2ModelOnline + class TestDeepSpeech2Model(unittest.TestCase): def setUp(self): From e4ef8ed31eaadf7f4fcac974ef76af9cd5961ccc Mon Sep 17 00:00:00 2001 From: huangyuxin Date: Thu, 29 Jul 2021 13:04:50 +0000 Subject: [PATCH 11/24] add the subsampling as conv --- deepspeech/models/ds2/__init__.py | 15 +- deepspeech/models/ds2/deepspeech2.py | 2 +- deepspeech/models/ds2_online/__init__.py | 18 +- deepspeech/models/ds2_online/conv.py | 169 ++---------------- deepspeech/models/ds2_online/deepspeech2.py | 52 +++--- examples/aishell/s0/conf/deepspeech2.yaml | 3 +- examples/librispeech/s0/conf/deepspeech2.yaml | 1 - examples/tiny/s0/conf/deepspeech2.yaml | 1 - tests/deepspeech2_model_test.py | 4 +- 9 files changed, 70 insertions(+), 195 deletions(-) diff --git a/deepspeech/models/ds2/__init__.py b/deepspeech/models/ds2/__init__.py index de78ebe91..39bea5bf9 100644 --- a/deepspeech/models/ds2/__init__.py +++ b/deepspeech/models/ds2/__init__.py @@ -1,4 +1,17 @@ -from .deepspeech2 import DeepSpeech2Model +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
from .deepspeech2 import DeepSpeech2InferModel +from .deepspeech2 import DeepSpeech2Model __all__ = ['DeepSpeech2Model', 'DeepSpeech2InferModel'] diff --git a/deepspeech/models/ds2/deepspeech2.py b/deepspeech/models/ds2/deepspeech2.py index 4026c89a7..8d737e800 100644 --- a/deepspeech/models/ds2/deepspeech2.py +++ b/deepspeech/models/ds2/deepspeech2.py @@ -19,8 +19,8 @@ from paddle import nn from yacs.config import CfgNode from deepspeech.models.ds2.conv import ConvStack -from deepspeech.modules.ctc import CTCDecoder from deepspeech.models.ds2.rnn import RNNStack +from deepspeech.modules.ctc import CTCDecoder from deepspeech.utils import layer_tools from deepspeech.utils.checkpoint import Checkpoint from deepspeech.utils.log import Log diff --git a/deepspeech/models/ds2_online/__init__.py b/deepspeech/models/ds2_online/__init__.py index 88076667c..255000eeb 100644 --- a/deepspeech/models/ds2_online/__init__.py +++ b/deepspeech/models/ds2_online/__init__.py @@ -1,7 +1,17 @@ -from .deepspeech2 import DeepSpeech2ModelOnline +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. from .deepspeech2 import DeepSpeech2InferModelOnline +from .deepspeech2 import DeepSpeech2ModelOnline __all__ = ['DeepSpeech2ModelOnline', 'DeepSpeech2InferModelOnline'] - - - diff --git a/deepspeech/models/ds2_online/conv.py b/deepspeech/models/ds2_online/conv.py index 8bf48b2c8..13c3d3308 100644 --- a/deepspeech/models/ds2_online/conv.py +++ b/deepspeech/models/ds2_online/conv.py @@ -11,162 +11,27 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import paddle from paddle import nn -from paddle.nn import functional as F -from deepspeech.modules.activation import brelu -from deepspeech.modules.mask import make_non_pad_mask -from deepspeech.utils.log import Log +from deepspeech.modules.embedding import PositionalEncoding +from deepspeech.modules.subsampling import Conv2dSubsampling4 -logger = Log(__name__).getlog() -__all__ = ['ConvStack', "conv_output_size"] +class Conv2dSubsampling4Online(Conv2dSubsampling4): + def __init__(self, + idim: int, + odim: int, + dropout_rate: float, + pos_enc_class: nn.Layer=PositionalEncoding): + super().__init__(idim, odim, dropout_rate, pos_enc_class) + self.output_dim = ((idim - 1) // 2 - 1) // 2 * odim - -def conv_output_size(I, F, P, S): - # https://stanford.edu/~shervine/teaching/cs-230/cheatsheet-convolutional-neural-networks#hyperparameters - # Output size after Conv: - # By noting I the length of the input volume size, - # F the length of the filter, - # P the amount of zero padding, - # S the stride, - # then the output size O of the feature map along that dimension is given by: - # O = (I - F + Pstart + Pend) // S + 1 - # When Pstart == Pend == P, we can replace Pstart + Pend by 2P. 
- # When Pstart == Pend == 0 - # O = (I - F - S) // S - # https://iq.opengenus.org/output-size-of-convolution/ - # Output height = (Input height + padding height top + padding height bottom - kernel height) / (stride height) + 1 - # Output width = (Output width + padding width right + padding width left - kernel width) / (stride width) + 1 - return (I - F + 2 * P - S) // S - - -# receptive field calculator -# https://fomoro.com/research/article/receptive-field-calculator -# https://stanford.edu/~shervine/teaching/cs-230/cheatsheet-convolutional-neural-networks#hyperparameters -# https://distill.pub/2019/computing-receptive-fields/ -# Rl-1 = Sl * Rl + (Kl - Sl) - - -class ConvBn(nn.Layer): - """Convolution layer with batch normalization. - - :param kernel_size: The x dimension of a filter kernel. Or input a tuple for - two image dimension. - :type kernel_size: int|tuple|list - :param num_channels_in: Number of input channels. - :type num_channels_in: int - :param num_channels_out: Number of output channels. - :type num_channels_out: int - :param stride: The x dimension of the stride. Or input a tuple for two - image dimension. - :type stride: int|tuple|list - :param padding: The x dimension of the padding. Or input a tuple for two - image dimension. - :type padding: int|tuple|list - :param act: Activation type, relu|brelu - :type act: string - :return: Batch norm layer after convolution layer. - :rtype: Variable - - """ - - def __init__(self, num_channels_in, num_channels_out, kernel_size, stride, - padding, act): - - super().__init__() - assert len(kernel_size) == 2 - assert len(stride) == 2 - assert len(padding) == 2 - self.kernel_size = kernel_size - self.stride = stride - self.padding = padding - - self.conv = nn.Conv2D( - num_channels_in, - num_channels_out, - kernel_size=kernel_size, - stride=stride, - padding=padding, - weight_attr=None, - bias_attr=False, - data_format='NCHW') - - self.bn = nn.BatchNorm2D( - num_channels_out, - weight_attr=None, - bias_attr=None, - data_format='NCHW') - self.act = F.relu if act == 'relu' else brelu - - def forward(self, x, x_len): - """ - x(Tensor): audio, shape [B, C, D, T] - """ + def forward(self, x: paddle.Tensor, + x_len: paddle.Tensor) -> [paddle.Tensor, paddle.Tensor]: + x = x.unsqueeze(1) # (b, c=1, t, f) x = self.conv(x) - x = self.bn(x) - x = self.act(x) - - x_len = (x_len - self.kernel_size[1] + 2 * self.padding[1] - ) // self.stride[1] + 1 - - # reset padding part to 0 - masks = make_non_pad_mask(x_len) #[B, T] - masks = masks.unsqueeze(1).unsqueeze(1) # [B, 1, 1, T] - # TODO(Hui Zhang): not support bool multiply - # masks = masks.type_as(x) - masks = masks.astype(x.dtype) - x = x.multiply(masks) - - return x, x_len - - -class ConvStack(nn.Layer): - """Convolution group with stacked convolution layers. - - :param feat_size: audio feature dim. - :type feat_size: int - :param num_stacks: Number of stacked convolution layers. 
- :type num_stacks: int - """ - - def __init__(self, feat_size, num_stacks): - super().__init__() - self.feat_size = feat_size # D - self.num_stacks = num_stacks - - self.conv_in = ConvBn( - num_channels_in=1, - num_channels_out=32, - kernel_size=(41, 11), #[D, T] - stride=(2, 3), - padding=(20, 5), - act='brelu') - - out_channel = 32 - convs = [ - ConvBn( - num_channels_in=32, - num_channels_out=out_channel, - kernel_size=(21, 11), - stride=(2, 1), - padding=(10, 5), - act='brelu') for i in range(num_stacks - 1) - ] - self.conv_stack = nn.LayerList(convs) - - # conv output feat_dim - output_height = (feat_size - 1) // 2 + 1 - for i in range(self.num_stacks - 1): - output_height = (output_height - 1) // 2 + 1 - self.output_height = out_channel * output_height - - def forward(self, x, x_len): - """ - x: shape [B, C, D, T] - x_len : shape [B] - """ - x, x_len = self.conv_in(x, x_len) - for i, conv in enumerate(self.conv_stack): - x, x_len = conv(x, x_len) + b, c, t, f = paddle.shape(x) + x = x.transpose([0, 2, 1, 3]).reshape([b, t, c * f]) + x_len = ((x_len - 1) // 2 - 1) // 2 return x, x_len diff --git a/deepspeech/models/ds2_online/deepspeech2.py b/deepspeech/models/ds2_online/deepspeech2.py index 3c77209fa..4fa6da0df 100644 --- a/deepspeech/models/ds2_online/deepspeech2.py +++ b/deepspeech/models/ds2_online/deepspeech2.py @@ -11,27 +11,19 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""Deepspeech2 ASR Model""" +"""Deepspeech2 ASR Online Model""" from typing import Optional import paddle import paddle.nn.functional as F from paddle import nn -from paddle.fluid.layers import fc -from paddle.nn import GRU -from paddle.nn import LayerList -from paddle.nn import LayerNorm -from paddle.nn import Linear -from paddle.nn import LSTM from yacs.config import CfgNode -from deepspeech.models.ds2_online.conv import ConvStack -from deepspeech.models.ds2_online.rnn import RNNStack +from deepspeech.models.ds2_online.conv import Conv2dSubsampling4Online from deepspeech.modules.ctc import CTCDecoder from deepspeech.utils import layer_tools from deepspeech.utils.checkpoint import Checkpoint from deepspeech.utils.log import Log - logger = Log(__name__).getlog() __all__ = ['DeepSpeech2ModelOnline', 'DeepSpeech2InferModeOnline'] @@ -55,46 +47,48 @@ class CRNNEncoder(nn.Layer): self.num_rnn_layers = num_rnn_layers self.num_fc_layers = num_fc_layers self.fc_layers_size_list = fc_layers_size_list - self.conv = ConvStack(feat_size, num_conv_layers) + self.conv = Conv2dSubsampling4Online(feat_size, 32, dropout_rate=0.0) - i_size = self.conv.output_height # H after conv stack + i_size = self.conv.output_dim - self.rnn = LayerList() - self.layernorm_list = LayerList() - self.fc_layers_list = LayerList() + self.rnn = nn.LayerList() + self.layernorm_list = nn.LayerList() + self.fc_layers_list = nn.LayerList() rnn_direction = 'forward' layernorm_size = rnn_size if use_gru == True: self.rnn.append( - GRU(input_size=i_size, + nn.GRU( + input_size=i_size, hidden_size=rnn_size, num_layers=1, direction=rnn_direction)) - self.layernorm_list.append(LayerNorm(layernorm_size)) + self.layernorm_list.append(nn.LayerNorm(layernorm_size)) for i in range(1, num_rnn_layers): self.rnn.append( - GRU(input_size=layernorm_size, + nn.GRU( + input_size=layernorm_size, hidden_size=rnn_size, num_layers=1, direction=rnn_direction)) - self.layernorm_list.append(LayerNorm(layernorm_size)) + 
self.layernorm_list.append(nn.LayerNorm(layernorm_size)) else: self.rnn.append( - LSTM( + nn.LSTM( input_size=i_size, hidden_size=rnn_size, num_layers=1, direction=rnn_direction)) - self.layernorm_list.append(LayerNorm(layernorm_size)) + self.layernorm_list.append(nn.LayerNorm(layernorm_size)) for i in range(1, num_rnn_layers): self.rnn.append( - LSTM( + nn.LSTM( input_size=layernorm_size, hidden_size=rnn_size, num_layers=1, direction=rnn_direction)) - self.layernorm_list.append(LayerNorm(layernorm_size)) + self.layernorm_list.append(nn.LayerNorm(layernorm_size)) fc_input_size = layernorm_size for i in range(self.num_fc_layers): self.fc_layers_list.append( @@ -117,20 +111,16 @@ class CRNNEncoder(nn.Layer): x (Tensor): encoder outputs, [B, T, D] x_lens (Tensor): encoder length, [B] """ - # [B, T, D] -> [B, D, T] - audio = audio.transpose([0, 2, 1]) - # [B, D, T] -> [B, C=1, D, T] - x = audio.unsqueeze(1) + # [B, T, D] + x = audio x_lens = audio_len - # convolution group x, x_lens = self.conv(x, x_lens) - # convert data from convolution feature map to sequence of vectors #B, C, D, T = paddle.shape(x) # not work under jit - x = x.transpose([0, 3, 1, 2]) #[B, T, C, D] + #x = x.transpose([0, 3, 1, 2]) #[B, T, C, D] #x = x.reshape([B, T, C * D]) #[B, T, C*D] # not work under jit - x = x.reshape([0, 0, -1]) #[B, T, C*D] + #x = x.reshape([0, 0, -1]) #[B, T, C*D] # remove padding part x, output_state = self.rnn[0](x, None, x_lens) diff --git a/examples/aishell/s0/conf/deepspeech2.yaml b/examples/aishell/s0/conf/deepspeech2.yaml index 7d0d1f895..1c97fc607 100644 --- a/examples/aishell/s0/conf/deepspeech2.yaml +++ b/examples/aishell/s0/conf/deepspeech2.yaml @@ -36,11 +36,10 @@ collator: model: num_conv_layers: 2 - num_rnn_layers: 4 + num_rnn_layers: 3 rnn_layer_size: 1024 use_gru: True share_rnn_weights: False - apply_online: False training: n_epoch: 50 diff --git a/examples/librispeech/s0/conf/deepspeech2.yaml b/examples/librispeech/s0/conf/deepspeech2.yaml index be1918d01..acee94c3e 100644 --- a/examples/librispeech/s0/conf/deepspeech2.yaml +++ b/examples/librispeech/s0/conf/deepspeech2.yaml @@ -40,7 +40,6 @@ model: rnn_layer_size: 2048 use_gru: False share_rnn_weights: True - apply_online: False training: n_epoch: 50 diff --git a/examples/tiny/s0/conf/deepspeech2.yaml b/examples/tiny/s0/conf/deepspeech2.yaml index 8c719e5cd..ea433f341 100644 --- a/examples/tiny/s0/conf/deepspeech2.yaml +++ b/examples/tiny/s0/conf/deepspeech2.yaml @@ -41,7 +41,6 @@ model: rnn_layer_size: 2048 use_gru: False share_rnn_weights: True - apply_online: True training: n_epoch: 10 diff --git a/tests/deepspeech2_model_test.py b/tests/deepspeech2_model_test.py index bb40802d7..00df8195b 100644 --- a/tests/deepspeech2_model_test.py +++ b/tests/deepspeech2_model_test.py @@ -16,8 +16,8 @@ import unittest import numpy as np import paddle -#from deepspeech.models.deepspeech2 import DeepSpeech2Model -from deepspeech.models.ds2_online import DeepSpeech2ModelOnline as DeepSpeech2Model +from deepspeech.models.ds2 import DeepSpeech2Model + class TestDeepSpeech2Model(unittest.TestCase): def setUp(self): From 745df04f2839713c64c1f368e50605882cfc6e4e Mon Sep 17 00:00:00 2001 From: huangyuxin Date: Fri, 30 Jul 2021 05:53:13 +0000 Subject: [PATCH 12/24] complete the pipline of tiny --- deepspeech/exps/deepspeech2/bin/export.py | 4 ++- deepspeech/exps/deepspeech2/bin/test.py | 3 +- deepspeech/exps/deepspeech2/bin/train.py | 3 +- deepspeech/exps/deepspeech2/config.py | 24 +++++++++---- deepspeech/exps/deepspeech2/model.py | 24 ++++++++----- 
examples/tiny/s0/local/export.sh | 7 ++-- examples/tiny/s0/local/test.sh | 6 ++-- examples/tiny/s0/local/train.sh | 6 ++-- examples/tiny/s0/run.sh | 7 ++-- examples/tiny/s0/run_online.sh | 41 +++++++++++++++++++++++ 10 files changed, 96 insertions(+), 29 deletions(-) create mode 100755 examples/tiny/s0/run_online.sh diff --git a/deepspeech/exps/deepspeech2/bin/export.py b/deepspeech/exps/deepspeech2/bin/export.py index a1607d583..9ae045c48 100644 --- a/deepspeech/exps/deepspeech2/bin/export.py +++ b/deepspeech/exps/deepspeech2/bin/export.py @@ -30,11 +30,13 @@ def main(config, args): if __name__ == "__main__": parser = default_argument_parser() + parser.add_argument("--model_type") args = parser.parse_args() + print_arguments(args) # https://yaml.org/type/float.html - config = get_cfg_defaults() + config = get_cfg_defaults(args.model_type) if args.config: config.merge_from_file(args.config) if args.opts: diff --git a/deepspeech/exps/deepspeech2/bin/test.py b/deepspeech/exps/deepspeech2/bin/test.py index f4edf08a8..49bca73d2 100644 --- a/deepspeech/exps/deepspeech2/bin/test.py +++ b/deepspeech/exps/deepspeech2/bin/test.py @@ -30,11 +30,12 @@ def main(config, args): if __name__ == "__main__": parser = default_argument_parser() + parser.add_argument("--model_type") args = parser.parse_args() print_arguments(args, globals()) # https://yaml.org/type/float.html - config = get_cfg_defaults() + config = get_cfg_defaults(args.model_type) if args.config: config.merge_from_file(args.config) if args.opts: diff --git a/deepspeech/exps/deepspeech2/bin/train.py b/deepspeech/exps/deepspeech2/bin/train.py index 5e5c1e2a4..253806af1 100644 --- a/deepspeech/exps/deepspeech2/bin/train.py +++ b/deepspeech/exps/deepspeech2/bin/train.py @@ -35,11 +35,12 @@ def main(config, args): if __name__ == "__main__": parser = default_argument_parser() + parser.add_argument("--model_type") args = parser.parse_args() print_arguments(args, globals()) # https://yaml.org/type/float.html - config = get_cfg_defaults() + config = get_cfg_defaults(args.model_type) if args.config: config.merge_from_file(args.config) if args.opts: diff --git a/deepspeech/exps/deepspeech2/config.py b/deepspeech/exps/deepspeech2/config.py index a851e1f72..4b3f724ff 100644 --- a/deepspeech/exps/deepspeech2/config.py +++ b/deepspeech/exps/deepspeech2/config.py @@ -18,21 +18,31 @@ from deepspeech.exps.deepspeech2.model import DeepSpeech2Trainer from deepspeech.io.collator import SpeechCollator from deepspeech.io.dataset import ManifestDataset from deepspeech.models.ds2 import DeepSpeech2Model +from deepspeech.models.ds2_online import DeepSpeech2ModelOnline -_C = CfgNode() -_C.data = ManifestDataset.params() +def get_cfg_defaults(model_type): + _C = CfgNode() + if (model_type == 'offline'): + _C.data = ManifestDataset.params() -_C.collator = SpeechCollator.params() + _C.collator = SpeechCollator.params() -_C.model = DeepSpeech2Model.params() + _C.model = DeepSpeech2Model.params() -_C.training = DeepSpeech2Trainer.params() + _C.training = DeepSpeech2Trainer.params() -_C.decoding = DeepSpeech2Tester.params() + _C.decoding = DeepSpeech2Tester.params() + else: + _C.data = ManifestDataset.params() + _C.collator = SpeechCollator.params() -def get_cfg_defaults(): + _C.model = DeepSpeech2ModelOnline.params() + + _C.training = DeepSpeech2Trainer.params() + + _C.decoding = DeepSpeech2Tester.params() """Get a yacs CfgNode object with default values for my_project.""" # Return a clone so that the defaults will not be altered # This is for the "local variable" use 
pattern diff --git a/deepspeech/exps/deepspeech2/model.py b/deepspeech/exps/deepspeech2/model.py index 9e870b13e..c654dc011 100644 --- a/deepspeech/exps/deepspeech2/model.py +++ b/deepspeech/exps/deepspeech2/model.py @@ -122,7 +122,7 @@ class DeepSpeech2Trainer(Trainer): def setup_model(self): config = self.config - if config.model.apply_online == True: + if self.args.model_type == 'offline': model = DeepSpeech2Model( feat_size=self.train_loader.collate_fn.feature_size, dict_size=self.train_loader.collate_fn.vocab_size, @@ -131,7 +131,7 @@ class DeepSpeech2Trainer(Trainer): rnn_size=config.model.rnn_layer_size, use_gru=config.model.use_gru, share_rnn_weights=config.model.share_rnn_weights) - else: + elif self.args.model_type == 'online': model = DeepSpeech2ModelOnline( feat_size=self.train_loader.collate_fn.feature_size, dict_size=self.train_loader.collate_fn.vocab_size, @@ -142,6 +142,8 @@ class DeepSpeech2Trainer(Trainer): rnn_size=config.model.rnn_layer_size, use_gru=config.model.use_gru, share_rnn_weights=config.model.share_rnn_weights) + else: + raise Exception("wrong model type") if self.parallel: model = paddle.DataParallel(model) @@ -343,12 +345,14 @@ class DeepSpeech2Tester(DeepSpeech2Trainer): exit(-1) def export(self): - if self.config.model.apply_online == True: + if self.args.model_type == 'offline': + infer_model = DeepSpeech2InferModel.from_pretrained( + self.test_loader, self.config, self.args.checkpoint_path) + elif self.args.model_type == 'online': infer_model = DeepSpeech2InferModelOnline.from_pretrained( self.test_loader, self.config, self.args.checkpoint_path) else: - infer_model = DeepSpeech2InferModel.from_pretrained( - self.test_loader, self.config, self.args.checkpoint_path) + raise Exception("wrong model tyep") infer_model.eval() feat_dim = self.test_loader.collate_fn.feature_size @@ -386,7 +390,7 @@ class DeepSpeech2Tester(DeepSpeech2Trainer): def setup_model(self): config = self.config - if config.model.apply_online == True: + if self.args.model_type == 'offline': model = DeepSpeech2Model( feat_size=self.test_loader.collate_fn.feature_size, dict_size=self.test_loader.collate_fn.vocab_size, @@ -395,10 +399,10 @@ class DeepSpeech2Tester(DeepSpeech2Trainer): rnn_size=config.model.rnn_layer_size, use_gru=config.model.use_gru, share_rnn_weights=config.model.share_rnn_weights) - else: + elif self.args.model_type == 'online': model = DeepSpeech2ModelOnline( - feat_size=self.train_loader.collate_fn.feature_size, - dict_size=self.train_loader.collate_fn.vocab_size, + feat_size=self.test_loader.collate_fn.feature_size, + dict_size=self.test_loader.collate_fn.vocab_size, num_conv_layers=config.model.num_conv_layers, num_rnn_layers=config.model.num_rnn_layers, num_fc_layers=config.model.num_fc_layers, @@ -406,6 +410,8 @@ class DeepSpeech2Tester(DeepSpeech2Trainer): rnn_size=config.model.rnn_layer_size, use_gru=config.model.use_gru, share_rnn_weights=config.model.share_rnn_weights) + else: + raise Exception("Wrong model type") self.model = model logger.info("Setup model!") diff --git a/examples/tiny/s0/local/export.sh b/examples/tiny/s0/local/export.sh index f99a15bad..6955239c7 100755 --- a/examples/tiny/s0/local/export.sh +++ b/examples/tiny/s0/local/export.sh @@ -1,6 +1,6 @@ #!/bin/bash -if [ $# != 3 ];then +if [ $# != 4 ];then echo "usage: $0 config_path ckpt_prefix jit_model_path" exit -1 fi @@ -11,6 +11,7 @@ echo "using $ngpu gpus..." 
config_path=$1 ckpt_path_prefix=$2 jit_model_export_path=$3 +model_type=$4 device=gpu if [ ${ngpu} == 0 ];then @@ -22,8 +23,8 @@ python3 -u ${BIN_DIR}/export.py \ --nproc ${ngpu} \ --config ${config_path} \ --checkpoint_path ${ckpt_path_prefix} \ ---export_path ${jit_model_export_path} - +--export_path ${jit_model_export_path} \ +--model_type ${model_type} if [ $? -ne 0 ]; then echo "Failed in export!" diff --git a/examples/tiny/s0/local/test.sh b/examples/tiny/s0/local/test.sh index 16a5e9ef0..2f74491a1 100755 --- a/examples/tiny/s0/local/test.sh +++ b/examples/tiny/s0/local/test.sh @@ -1,6 +1,6 @@ #!/bin/bash -if [ $# != 2 ];then +if [ $# != 3 ];then echo "usage: ${0} config_path ckpt_path_prefix" exit -1 fi @@ -14,6 +14,7 @@ if [ ${ngpu} == 0 ];then fi config_path=$1 ckpt_prefix=$2 +model_type=$3 # download language model bash local/download_lm_en.sh @@ -26,7 +27,8 @@ python3 -u ${BIN_DIR}/test.py \ --nproc 1 \ --config ${config_path} \ --result_file ${ckpt_prefix}.rsl \ ---checkpoint_path ${ckpt_prefix} +--checkpoint_path ${ckpt_prefix} \ +--model_type ${model_type} if [ $? -ne 0 ]; then echo "Failed in evaluation!" diff --git a/examples/tiny/s0/local/train.sh b/examples/tiny/s0/local/train.sh index f6bd2c983..1d49dcd1d 100755 --- a/examples/tiny/s0/local/train.sh +++ b/examples/tiny/s0/local/train.sh @@ -1,6 +1,6 @@ #!/bin/bash -if [ $# != 2 ];then +if [ $# != 3 ];then echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name" exit -1 fi @@ -10,6 +10,7 @@ echo "using $ngpu gpus..." config_path=$1 ckpt_name=$2 +model_type=$3 device=gpu if [ ${ngpu} == 0 ];then @@ -22,7 +23,8 @@ python3 -u ${BIN_DIR}/train.py \ --device ${device} \ --nproc ${ngpu} \ --config ${config_path} \ ---output exp/${ckpt_name} +--output exp/${ckpt_name} \ +--model_type ${model_type} if [ $? -ne 0 ]; then echo "Failed in training!" 
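For reference, the new model_type argument threaded through local/train.sh above can also be handed straight to the training entry point. A minimal sketch, not part of the patch, assuming BIN_DIR is exported by path.sh as in the existing recipes and that the checkpoint name is deepspeech2; all flags shown here appear in the diffs above:

python3 -u ${BIN_DIR}/train.py \
--device gpu \
--nproc 1 \
--config conf/deepspeech2.yaml \
--output exp/deepspeech2 \
--model_type online

The run.sh and run_online.sh changes below wire exactly this call order (config, checkpoint name, model_type) through the staged pipeline.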
diff --git a/examples/tiny/s0/run.sh b/examples/tiny/s0/run.sh index d7e153e8d..a4506e4c5 100755 --- a/examples/tiny/s0/run.sh +++ b/examples/tiny/s0/run.sh @@ -7,6 +7,7 @@ stage=0 stop_stage=100 conf_path=conf/deepspeech2.yaml avg_num=1 +model_type=online source ${MAIN_ROOT}/utils/parse_options.sh || exit 1; @@ -21,7 +22,7 @@ fi if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then # train model, all `ckpt` under `exp` dir - CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt} + CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt} ${model_type} fi if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then @@ -31,10 +32,10 @@ fi if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then # test ckpt avg_n - CUDA_VISIBLE_DEVICES=${gpus} ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 + CUDA_VISIBLE_DEVICES=${gpus} ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${model_type} || exit -1 fi if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then # export ckpt avg_n - CUDA_VISIBLE_DEVICES=${gpus} ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit + CUDA_VISIBLE_DEVICES=${gpus} ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit ${model_type} fi diff --git a/examples/tiny/s0/run_online.sh b/examples/tiny/s0/run_online.sh new file mode 100755 index 000000000..4c3602045 --- /dev/null +++ b/examples/tiny/s0/run_online.sh @@ -0,0 +1,41 @@ +#!/bin/bash +set -e +source path.sh + +gpus=7 +stage=1 +stop_stage=100 +conf_path=conf/deepspeech2.yaml +avg_num=1 +model_type=online + +source ${MAIN_ROOT}/utils/parse_options.sh || exit 1; + +avg_ckpt=avg_${avg_num} +ckpt=$(basename ${conf_path} | awk -F'.' 
'{print $1}') ###ckpt = deepspeech2 +echo "checkpoint name ${ckpt}" + +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + # prepare data + bash ./local/data.sh || exit -1 +fi + +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + # train model, all `ckpt` under `exp` dir + CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt} ${model_type} +fi + +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + # avg n best model + avg.sh exp/${ckpt}/checkpoints ${avg_num} +fi + +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + # test ckpt avg_n + CUDA_VISIBLE_DEVICES=${gpus} ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${model_type} || exit -1 +fi + +if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then + # export ckpt avg_n + CUDA_VISIBLE_DEVICES=${gpus} ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit ${model_type} +fi From 2537221b61ead68f47fcf3497b5e55b1caba85b3 Mon Sep 17 00:00:00 2001 From: huangyuxin Date: Fri, 30 Jul 2021 09:34:28 +0000 Subject: [PATCH 13/24] Complete the modification according to the comments , but still need to be unit tested --- deepspeech/exps/deepspeech2/model.py | 11 +- deepspeech/models/ds2_online/conv.py | 8 +- deepspeech/models/ds2_online/deepspeech2.py | 104 +++++-- deepspeech/models/ds2_online/rnn.py | 314 -------------------- deepspeech/modules/subsampling.py | 6 +- 5 files changed, 83 insertions(+), 360 deletions(-) delete mode 100644 deepspeech/models/ds2_online/rnn.py diff --git a/deepspeech/exps/deepspeech2/model.py b/deepspeech/exps/deepspeech2/model.py index 7d0a26d78..8b47892a5 100644 --- a/deepspeech/exps/deepspeech2/model.py +++ b/deepspeech/exps/deepspeech2/model.py @@ -29,8 +29,6 @@ from deepspeech.io.sampler import SortagradBatchSampler from deepspeech.io.sampler import SortagradDistributedBatchSampler from deepspeech.models.ds2 import DeepSpeech2InferModel from deepspeech.models.ds2 import DeepSpeech2Model -#from deepspeech.models.ds2_online import DeepSpeech2InferModelOnline -#from deepspeech.models.ds2_online import DeepSpeech2ModelOnline from deepspeech.training.gradclip import ClipGradByGlobalNormWithLog from deepspeech.training.trainer import Trainer from deepspeech.utils import error_rate @@ -38,6 +36,8 @@ from deepspeech.utils import layer_tools from deepspeech.utils import mp_tools from deepspeech.utils.log import Autolog from deepspeech.utils.log import Log +#from deepspeech.models.ds2_online import DeepSpeech2InferModelOnline +#from deepspeech.models.ds2_online import DeepSpeech2ModelOnline logger = Log(__name__).getlog() @@ -128,9 +128,7 @@ class DeepSpeech2Trainer(Trainer): num_conv_layers=config.model.num_conv_layers, num_rnn_layers=config.model.num_rnn_layers, rnn_size=config.model.rnn_layer_size, - use_gru=config.model.use_gru, - share_rnn_weights=config.model.share_rnn_weights) - + use_gru=config.model.use_gru) if self.parallel: model = paddle.DataParallel(model) @@ -376,8 +374,7 @@ class DeepSpeech2Tester(DeepSpeech2Trainer): num_conv_layers=config.model.num_conv_layers, num_rnn_layers=config.model.num_rnn_layers, rnn_size=config.model.rnn_layer_size, - use_gru=config.model.use_gru, - share_rnn_weights=config.model.share_rnn_weights) + use_gru=config.model.use_gru) self.model = model logger.info("Setup model!") diff --git a/deepspeech/models/ds2_online/conv.py b/deepspeech/models/ds2_online/conv.py index 13c3d3308..13c35ef2b 100644 --- a/deepspeech/models/ds2_online/conv.py +++ b/deepspeech/models/ds2_online/conv.py @@ 
-19,12 +19,8 @@ from deepspeech.modules.subsampling import Conv2dSubsampling4 class Conv2dSubsampling4Online(Conv2dSubsampling4): - def __init__(self, - idim: int, - odim: int, - dropout_rate: float, - pos_enc_class: nn.Layer=PositionalEncoding): - super().__init__(idim, odim, dropout_rate, pos_enc_class) + def __init__(self, idim: int, odim: int, dropout_rate: float): + super().__init__(idim, odim, dropout_rate, None) self.output_dim = ((idim - 1) // 2 - 1) // 2 * odim def forward(self, x: paddle.Tensor, diff --git a/deepspeech/models/ds2_online/deepspeech2.py b/deepspeech/models/ds2_online/deepspeech2.py index 4fa6da0df..e9e81d5d9 100644 --- a/deepspeech/models/ds2_online/deepspeech2.py +++ b/deepspeech/models/ds2_online/deepspeech2.py @@ -36,16 +36,17 @@ class CRNNEncoder(nn.Layer): num_conv_layers=2, num_rnn_layers=4, rnn_size=1024, + rnn_direction='forward', num_fc_layers=2, fc_layers_size_list=[512, 256], - use_gru=False, - share_rnn_weights=True): + use_gru=False): super().__init__() self.rnn_size = rnn_size self.feat_size = feat_size # 161 for linear self.dict_size = dict_size self.num_rnn_layers = num_rnn_layers self.num_fc_layers = num_fc_layers + self.rnn_direction = rnn_direction self.fc_layers_size_list = fc_layers_size_list self.conv = Conv2dSubsampling4Online(feat_size, 32, dropout_rate=0.0) @@ -54,7 +55,6 @@ class CRNNEncoder(nn.Layer): self.rnn = nn.LayerList() self.layernorm_list = nn.LayerList() self.fc_layers_list = nn.LayerList() - rnn_direction = 'forward' layernorm_size = rnn_size if use_gru == True: @@ -99,21 +99,18 @@ class CRNNEncoder(nn.Layer): def output_size(self): return self.fc_layers_size_list[-1] - def forward(self, audio, audio_len): + def forward(self, x, x_lens): """Compute Encoder outputs Args: - audio (Tensor): [B, Tmax, D] - text (Tensor): [B, Umax] - audio_len (Tensor): [B] - text_len (Tensor): [B] + x (Tensor): [B, T_input, D] + x_lens (Tensor): [B] Returns: - x (Tensor): encoder outputs, [B, T, D] + x (Tensor): encoder outputs, [B, T_output, D] x_lens (Tensor): encoder length, [B] + rnn_final_state_list: list of final_states for RNN layers, [num_directions, batch_size, hidden_size] * num_rnn_layers """ # [B, T, D] - x = audio - x_lens = audio_len # convolution group x, x_lens = self.conv(x, x_lens) # convert data from convolution feature map to sequence of vectors @@ -123,16 +120,47 @@ class CRNNEncoder(nn.Layer): #x = x.reshape([0, 0, -1]) #[B, T, C*D] # remove padding part - x, output_state = self.rnn[0](x, None, x_lens) + init_state = None + rnn_final_state_list = [] + x, final_state = self.rnn[0](x, init_state, x_lens) + rnn_final_state_list.append(final_state) x = self.layernorm_list[0](x) for i in range(1, self.num_rnn_layers): - x, output_state = self.rnn[i](x, output_state, x_lens) #[B, T, D] + x, final_state = self.rnn[i](x, init_state, x_lens) #[B, T, D] + rnn_final_state_list.append(final_state) x = self.layernorm_list[i](x) for i in range(self.num_fc_layers): x = self.fc_layers_list[i](x) x = F.relu(x) - return x, x_lens + return x, x_lens, rnn_final_state_list + + def forward(self, x, x_lens, init_state_list): + """Compute Encoder outputs + + Args: + x (Tensor): [B, feature_chunk_size, D] + x_lens (Tensor): [B] + init_state_list (list of Tensors): [ num_directions, batch_size, hidden_size] * num_rnn_layers + Returns: + x (Tensor): encoder outputs, [B, chunk_size, D] + x_lens (Tensor): encoder length, [B] + rnn_final_state_list: list of final_states for RNN layers, [num_directions, batch_size, hidden_size] * num_rnn_layers + """ + 
rnn_final_state_list = [] + x, final_state = self.rnn[0](x, init_state_list[0], x_lens) + rnn_final_state_list.append(final_state) + x = self.layernorm_list[0](x) + for i in range(1, self.num_rnn_layers): + x, final_state = self.rnn[i](x, init_state_list[i], + x_lens) #[B, T, D] + rnn_final_state_list.append(final_state) + x = self.layernorm_list[i](x) + + for i in range(self.num_fc_layers): + x = self.fc_layers_list[i](x) + x = F.relu(x) + return x, x_lens, rnn_final_state_list class DeepSpeech2ModelOnline(nn.Layer): @@ -156,9 +184,6 @@ class DeepSpeech2ModelOnline(nn.Layer): :type rnn_size: int :param use_gru: Use gru if set True. Use simple rnn if set False. :type use_gru: bool - :param share_rnn_weights: Whether to share input-hidden weights between - forward and backward direction RNNs. - It is only available when use_gru=False. :type share_weights: bool :return: A tuple of an output unnormalized log probability layer ( before softmax) and a ctc cost layer. @@ -175,7 +200,6 @@ class DeepSpeech2ModelOnline(nn.Layer): num_fc_layers=2, fc_layers_size_list=[512, 256], use_gru=True, #Use gru if set True. Use simple rnn if set False. - share_rnn_weights=True #Whether to share input-hidden weights between forward and backward directional RNNs.Notice that for GRU, weight sharing is not supported. )) if config is not None: config.merge_from_other_cfg(default) @@ -187,21 +211,21 @@ class DeepSpeech2ModelOnline(nn.Layer): num_conv_layers=2, num_rnn_layers=3, rnn_size=1024, + rnn_direction='forward', num_fc_layers=2, fc_layers_size_list=[512, 256], - use_gru=False, - share_rnn_weights=True): + use_gru=False): super().__init__() self.encoder = CRNNEncoder( feat_size=feat_size, dict_size=dict_size, num_conv_layers=num_conv_layers, num_rnn_layers=num_rnn_layers, + rnn_direction=rnn_direction, num_fc_layers=num_fc_layers, fc_layers_size_list=fc_layers_size_list, rnn_size=rnn_size, - use_gru=use_gru, - share_rnn_weights=share_rnn_weights) + use_gru=use_gru) assert (self.encoder.output_size == fc_layers_size_list[-1]) self.decoder = CTCDecoder( @@ -224,7 +248,7 @@ class DeepSpeech2ModelOnline(nn.Layer): Returns: loss (Tenosr): [1] """ - eouts, eouts_len = self.encoder(audio, audio_len) + eouts, eouts_len, rnn_final_state_list = self.encoder(audio, audio_len) loss = self.decoder(eouts, eouts_len, text, text_len) return loss @@ -271,10 +295,10 @@ class DeepSpeech2ModelOnline(nn.Layer): num_conv_layers=config.model.num_conv_layers, num_rnn_layers=config.model.num_rnn_layers, rnn_size=config.model.rnn_layer_size, + rnn_direction=config.model.rnn_direction, num_fc_layers=config.model.num_fc_layers, fc_layers_size_list=config.model.fc_layers_size_list, - use_gru=config.model.use_gru, - share_rnn_weights=config.model.share_rnn_weights) + use_gru=config.model.use_gru) infos = Checkpoint().load_parameters( model, checkpoint_path=checkpoint_path) logger.info(f"checkpoint info: {infos}") @@ -289,20 +313,20 @@ class DeepSpeech2InferModelOnline(DeepSpeech2ModelOnline): num_conv_layers=2, num_rnn_layers=3, rnn_size=1024, + rnn_direction='forward', num_fc_layers=2, fc_layers_size_list=[512, 256], - use_gru=False, - share_rnn_weights=True): + use_gru=False): super().__init__( feat_size=feat_size, dict_size=dict_size, num_conv_layers=num_conv_layers, num_rnn_layers=num_rnn_layers, rnn_size=rnn_size, + rnn_direction=rnn_direction, num_fc_layers=num_fc_layers, fc_layers_size_list=fc_layers_size_list, - use_gru=use_gru, - share_rnn_weights=share_rnn_weights) + use_gru=use_gru) def forward(self, audio, audio_len): 
"""export model function @@ -314,6 +338,26 @@ class DeepSpeech2InferModelOnline(DeepSpeech2ModelOnline): Returns: probs: probs after softmax """ - eouts, eouts_len = self.encoder(audio, audio_len) + eouts, eouts_len, rnn_final_state_list = self.encoder(audio, audio_len) probs = self.decoder.softmax(eouts) return probs + + def forward(self, eouts_chunk_prefix, eouts_chunk_lens_prefix, audio_chunk, + audio_chunk_len, init_state_list): + """export model function + + Args: + audio_chunk (Tensor): [B, T, D] + audio_chunk_len (Tensor): [B] + + Returns: + probs: probs after softmax + """ + eouts_chunk, eouts_chunk_lens, rnn_final_state_list = self.encoder( + audio_chunk, audio_chunk_len, init_state_list) + eouts_chunk_new_prefix = paddle.concat( + [eouts_chunk_prefix, eouts_chunk], axis=1) + eouts_chunk_lens_new_prefix = paddle.add(eouts_chunk_lens_prefix, + eouts_chunk_lens) + probs_chunk = self.decoder.softmax(eouts_chunk_new_prefix) + return probs_chunk, eouts_chunk_new_prefix, eouts_chunk_lens_new_prefix, rnn_final_state_list diff --git a/deepspeech/models/ds2_online/rnn.py b/deepspeech/models/ds2_online/rnn.py deleted file mode 100644 index 01b55c4a2..000000000 --- a/deepspeech/models/ds2_online/rnn.py +++ /dev/null @@ -1,314 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import math - -import paddle -from paddle import nn -from paddle.nn import functional as F -from paddle.nn import initializer as I - -from deepspeech.modules.activation import brelu -from deepspeech.modules.mask import make_non_pad_mask -from deepspeech.utils.log import Log - -logger = Log(__name__).getlog() - -__all__ = ['RNNStack'] - - -class RNNCell(nn.RNNCellBase): - r""" - Elman RNN (SimpleRNN) cell. Given the inputs and previous states, it - computes the outputs and updates states. - The formula used is as follows: - .. math:: - h_{t} & = act(x_{t} + b_{ih} + W_{hh}h_{t-1} + b_{hh}) - y_{t} & = h_{t} - - where :math:`act` is for :attr:`activation`. 
- """ - - def __init__(self, - hidden_size: int, - activation="tanh", - weight_ih_attr=None, - weight_hh_attr=None, - bias_ih_attr=None, - bias_hh_attr=None, - name=None): - super().__init__() - std = 1.0 / math.sqrt(hidden_size) - self.weight_hh = self.create_parameter( - (hidden_size, hidden_size), - weight_hh_attr, - default_initializer=I.Uniform(-std, std)) - self.bias_ih = None - self.bias_hh = self.create_parameter( - (hidden_size, ), - bias_hh_attr, - is_bias=True, - default_initializer=I.Uniform(-std, std)) - - self.hidden_size = hidden_size - if activation not in ["tanh", "relu", "brelu"]: - raise ValueError( - "activation for SimpleRNNCell should be tanh or relu, " - "but get {}".format(activation)) - self.activation = activation - self._activation_fn = paddle.tanh \ - if activation == "tanh" \ - else F.relu - if activation == 'brelu': - self._activation_fn = brelu - - def forward(self, inputs, states=None): - if states is None: - states = self.get_initial_states(inputs, self.state_shape) - pre_h = states - i2h = inputs - if self.bias_ih is not None: - i2h += self.bias_ih - h2h = paddle.matmul(pre_h, self.weight_hh, transpose_y=True) - if self.bias_hh is not None: - h2h += self.bias_hh - h = self._activation_fn(i2h + h2h) - return h, h - - @property - def state_shape(self): - return (self.hidden_size, ) - - -class GRUCell(nn.RNNCellBase): - r""" - Gated Recurrent Unit (GRU) RNN cell. Given the inputs and previous states, - it computes the outputs and updates states. - The formula for GRU used is as follows: - .. math:: - r_{t} & = \sigma(W_{ir}x_{t} + b_{ir} + W_{hr}h_{t-1} + b_{hr}) - z_{t} & = \sigma(W_{iz}x_{t} + b_{iz} + W_{hz}h_{t-1} + b_{hz}) - \widetilde{h}_{t} & = \tanh(W_{ic}x_{t} + b_{ic} + r_{t} * (W_{hc}h_{t-1} + b_{hc})) - h_{t} & = z_{t} * h_{t-1} + (1 - z_{t}) * \widetilde{h}_{t} - y_{t} & = h_{t} - - where :math:`\sigma` is the sigmoid fucntion, and * is the elemetwise - multiplication operator. 
- """ - - def __init__(self, - input_size: int, - hidden_size: int, - weight_ih_attr=None, - weight_hh_attr=None, - bias_ih_attr=None, - bias_hh_attr=None, - name=None): - super().__init__() - std = 1.0 / math.sqrt(hidden_size) - self.weight_hh = self.create_parameter( - (3 * hidden_size, hidden_size), - weight_hh_attr, - default_initializer=I.Uniform(-std, std)) - self.bias_ih = None - self.bias_hh = self.create_parameter( - (3 * hidden_size, ), - bias_hh_attr, - is_bias=True, - default_initializer=I.Uniform(-std, std)) - - self.hidden_size = hidden_size - self.input_size = input_size - self._gate_activation = F.sigmoid - self._activation = paddle.tanh - - def forward(self, inputs, states=None): - if states is None: - states = self.get_initial_states(inputs, self.state_shape) - - pre_hidden = states - x_gates = inputs - if self.bias_ih is not None: - x_gates = x_gates + self.bias_ih - h_gates = paddle.matmul(pre_hidden, self.weight_hh, transpose_y=True) - if self.bias_hh is not None: - h_gates = h_gates + self.bias_hh - - x_r, x_z, x_c = paddle.split(x_gates, num_or_sections=3, axis=1) - h_r, h_z, h_c = paddle.split(h_gates, num_or_sections=3, axis=1) - - r = self._gate_activation(x_r + h_r) - z = self._gate_activation(x_z + h_z) - c = self._activation(x_c + r * h_c) # apply reset gate after mm - h = (pre_hidden - c) * z + c - # https://www.paddlepaddle.org.cn/documentation/docs/zh/api/paddle/fluid/layers/dynamic_gru_cn.html#dynamic-gru - - return h, h - - @property - def state_shape(self): - r""" - The `state_shape` of GRUCell is a shape `[hidden_size]` (-1 for batch - size would be automatically inserted into shape). The shape corresponds - to the shape of :math:`h_{t-1}`. - """ - return (self.hidden_size, ) - - -class BiRNNWithBN(nn.Layer): - """Bidirectonal simple rnn layer with sequence-wise batch normalization. - The batch normalization is only performed on input-state weights. - - :param size: Dimension of RNN cells. - :type size: int - :param share_weights: Whether to share input-hidden weights between - forward and backward directional RNNs. - :type share_weights: bool - :return: Bidirectional simple rnn layer. - :rtype: Variable - """ - - def __init__(self, i_size: int, h_size: int, share_weights: bool): - super().__init__() - self.share_weights = share_weights - if self.share_weights: - #input-hidden weights shared between bi-directional rnn. 
- self.fw_fc = nn.Linear(i_size, h_size, bias_attr=False) - # batch norm is only performed on input-state projection - self.fw_bn = nn.BatchNorm1D( - h_size, bias_attr=None, data_format='NLC') - self.bw_fc = self.fw_fc - self.bw_bn = self.fw_bn - else: - self.fw_fc = nn.Linear(i_size, h_size, bias_attr=False) - self.fw_bn = nn.BatchNorm1D( - h_size, bias_attr=None, data_format='NLC') - self.bw_fc = nn.Linear(i_size, h_size, bias_attr=False) - self.bw_bn = nn.BatchNorm1D( - h_size, bias_attr=None, data_format='NLC') - - self.fw_cell = RNNCell(hidden_size=h_size, activation='brelu') - self.bw_cell = RNNCell(hidden_size=h_size, activation='brelu') - self.fw_rnn = nn.RNN( - self.fw_cell, is_reverse=False, time_major=False) #[B, T, D] - self.bw_rnn = nn.RNN( - self.fw_cell, is_reverse=True, time_major=False) #[B, T, D] - - def forward(self, x: paddle.Tensor, x_len: paddle.Tensor): - # x, shape [B, T, D] - fw_x = self.fw_bn(self.fw_fc(x)) - bw_x = self.bw_bn(self.bw_fc(x)) - fw_x, _ = self.fw_rnn(inputs=fw_x, sequence_length=x_len) - bw_x, _ = self.bw_rnn(inputs=bw_x, sequence_length=x_len) - x = paddle.concat([fw_x, bw_x], axis=-1) - return x, x_len - - -class BiGRUWithBN(nn.Layer): - """Bidirectonal gru layer with sequence-wise batch normalization. - The batch normalization is only performed on input-state weights. - - :param name: Name of the layer. - :type name: string - :param input: Input layer. - :type input: Variable - :param size: Dimension of GRU cells. - :type size: int - :param act: Activation type. - :type act: string - :return: Bidirectional GRU layer. - :rtype: Variable - """ - - def __init__(self, i_size: int, h_size: int): - super().__init__() - hidden_size = h_size * 3 - - self.fw_fc = nn.Linear(i_size, hidden_size, bias_attr=False) - self.fw_bn = nn.BatchNorm1D( - hidden_size, bias_attr=None, data_format='NLC') - self.bw_fc = nn.Linear(i_size, hidden_size, bias_attr=False) - self.bw_bn = nn.BatchNorm1D( - hidden_size, bias_attr=None, data_format='NLC') - - self.fw_cell = GRUCell(input_size=hidden_size, hidden_size=h_size) - self.bw_cell = GRUCell(input_size=hidden_size, hidden_size=h_size) - self.fw_rnn = nn.RNN( - self.fw_cell, is_reverse=False, time_major=False) #[B, T, D] - self.bw_rnn = nn.RNN( - self.fw_cell, is_reverse=True, time_major=False) #[B, T, D] - - def forward(self, x, x_len): - # x, shape [B, T, D] - fw_x = self.fw_bn(self.fw_fc(x)) - bw_x = self.bw_bn(self.bw_fc(x)) - fw_x, _ = self.fw_rnn(inputs=fw_x, sequence_length=x_len) - bw_x, _ = self.bw_rnn(inputs=bw_x, sequence_length=x_len) - x = paddle.concat([fw_x, bw_x], axis=-1) - return x, x_len - - -class RNNStack(nn.Layer): - """RNN group with stacked bidirectional simple RNN or GRU layers. - - :param input: Input layer. - :type input: Variable - :param size: Dimension of RNN cells in each layer. - :type size: int - :param num_stacks: Number of stacked rnn layers. - :type num_stacks: int - :param use_gru: Use gru if set True. Use simple rnn if set False. - :type use_gru: bool - :param share_rnn_weights: Whether to share input-hidden weights between - forward and backward directional RNNs. - It is only available when use_gru=False. - :type share_weights: bool - :return: Output layer of the RNN group. 
- :rtype: Variable - """ - - def __init__(self, - i_size: int, - h_size: int, - num_stacks: int, - use_gru: bool, - share_rnn_weights: bool): - super().__init__() - rnn_stacks = [] - for i in range(num_stacks): - if use_gru: - #default:GRU using tanh - rnn_stacks.append(BiGRUWithBN(i_size=i_size, h_size=h_size)) - else: - rnn_stacks.append( - BiRNNWithBN( - i_size=i_size, - h_size=h_size, - share_weights=share_rnn_weights)) - i_size = h_size * 2 - - self.rnn_stacks = nn.ModuleList(rnn_stacks) - - def forward(self, x: paddle.Tensor, x_len: paddle.Tensor): - """ - x: shape [B, T, D] - x_len: shpae [B] - """ - for i, rnn in enumerate(self.rnn_stacks): - x, x_len = rnn(x, x_len) - masks = make_non_pad_mask(x_len) #[B, T] - masks = masks.unsqueeze(-1) # [B, T, 1] - # TODO(Hui Zhang): not support bool multiply - masks = masks.astype(x.dtype) - x = x.multiply(masks) - return x, x_len diff --git a/deepspeech/modules/subsampling.py b/deepspeech/modules/subsampling.py index 5aa2fd8ea..40fa7b00a 100644 --- a/deepspeech/modules/subsampling.py +++ b/deepspeech/modules/subsampling.py @@ -92,7 +92,7 @@ class Conv2dSubsampling4(BaseSubsampling): dropout_rate: float, pos_enc_class: nn.Layer=PositionalEncoding): """Construct an Conv2dSubsampling4 object. - + Args: idim (int): Input dimension. odim (int): Output dimension. @@ -143,7 +143,7 @@ class Conv2dSubsampling6(BaseSubsampling): dropout_rate: float, pos_enc_class: nn.Layer=PositionalEncoding): """Construct an Conv2dSubsampling6 object. - + Args: idim (int): Input dimension. odim (int): Output dimension. @@ -196,7 +196,7 @@ class Conv2dSubsampling8(BaseSubsampling): dropout_rate: float, pos_enc_class: nn.Layer=PositionalEncoding): """Construct an Conv2dSubsampling8 object. - + Args: idim (int): Input dimension. odim (int): Output dimension. 
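Before the next patch builds chunk-by-chunk decoding on top of it, it helps to pin down the length bookkeeping that Conv2dSubsampling4Online introduces above: the two stride-2 convolutions halve both the time axis and the feature axis, so output_dim = ((idim - 1) // 2 - 1) // 2 * odim and the valid frame count shrinks the same way. A minimal sketch with the values the online encoder actually passes in (feat_size=161, odim=32); the 64-frame input length is only an illustrative choice:

idim, odim, t_in = 161, 32, 64                  # feature dim, conv output channels, example frame count
feat_out = (((idim - 1) // 2 - 1) // 2) * odim  # 39 * 32 = 1248, matches self.output_dim
t_out = ((t_in - 1) // 2 - 1) // 2              # 15 frames remain after the 4x time subsampling
print(feat_out, t_out)                          # -> 1248 15

The chunking logic added in the following patch relies on exactly this 4x subsampling rate plus the conv receptive field when it slices the padded input into fixed-size chunks.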
From d398270f95b584f7729d50e3218be45e39ca7db2 Mon Sep 17 00:00:00 2001 From: huangyuxin Date: Mon, 2 Aug 2021 06:46:30 +0000 Subject: [PATCH 14/24] =?UTF-8?q?=C3=A6=C2=98=C3=A5=E5=A2=9E=E5=8A=A0?= =?UTF-8?q?=E4=BA=86chunk=5Fby=5Fchunk,=E5=88=9D=E6=AD=A5=E6=B5=8B?= =?UTF-8?q?=E8=AF=95=C3=A5=C2=8F=E9=80=9A=E8=BF=87=C3=A3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- deepspeech/models/ds2_online/conv.py | 2 + deepspeech/models/ds2_online/deepspeech2.py | 117 ++++++++++++++--- deepspeech/utils/log.py | 10 +- tests/deepspeech2_online_model_test.py | 134 ++++++++++++++++++++ 4 files changed, 244 insertions(+), 19 deletions(-) create mode 100644 tests/deepspeech2_online_model_test.py diff --git a/deepspeech/models/ds2_online/conv.py b/deepspeech/models/ds2_online/conv.py index 13c35ef2b..83d98e410 100644 --- a/deepspeech/models/ds2_online/conv.py +++ b/deepspeech/models/ds2_online/conv.py @@ -22,6 +22,8 @@ class Conv2dSubsampling4Online(Conv2dSubsampling4): def __init__(self, idim: int, odim: int, dropout_rate: float): super().__init__(idim, odim, dropout_rate, None) self.output_dim = ((idim - 1) // 2 - 1) // 2 * odim + self.receptive_field_length = 2 * ( + 3 - 1) + 3 # stride_1 * (kernel_size_2 - 1) + kerel_size_1 def forward(self, x: paddle.Tensor, x_len: paddle.Tensor) -> [paddle.Tensor, paddle.Tensor]: diff --git a/deepspeech/models/ds2_online/deepspeech2.py b/deepspeech/models/ds2_online/deepspeech2.py index e9e81d5d9..0b3c632be 100644 --- a/deepspeech/models/ds2_online/deepspeech2.py +++ b/deepspeech/models/ds2_online/deepspeech2.py @@ -108,7 +108,7 @@ class CRNNEncoder(nn.Layer): Returns: x (Tensor): encoder outputs, [B, T_output, D] x_lens (Tensor): encoder length, [B] - rnn_final_state_list: list of final_states for RNN layers, [num_directions, batch_size, hidden_size] * num_rnn_layers + final_state_list: list of final_states for RNN layers, [num_directions, batch_size, hidden_size] * num_rnn_layers """ # [B, T, D] # convolution group @@ -121,21 +121,21 @@ class CRNNEncoder(nn.Layer): # remove padding part init_state = None - rnn_final_state_list = [] + final_state_list = [] x, final_state = self.rnn[0](x, init_state, x_lens) - rnn_final_state_list.append(final_state) + final_state_list.append(final_state) x = self.layernorm_list[0](x) for i in range(1, self.num_rnn_layers): x, final_state = self.rnn[i](x, init_state, x_lens) #[B, T, D] - rnn_final_state_list.append(final_state) + final_state_list.append(final_state) x = self.layernorm_list[i](x) for i in range(self.num_fc_layers): x = self.fc_layers_list[i](x) x = F.relu(x) - return x, x_lens, rnn_final_state_list + return x, x_lens, final_state_list - def forward(self, x, x_lens, init_state_list): + def forward_chunk(self, x, x_lens, init_state_list): """Compute Encoder outputs Args: @@ -145,22 +145,59 @@ class CRNNEncoder(nn.Layer): Returns: x (Tensor): encoder outputs, [B, chunk_size, D] x_lens (Tensor): encoder length, [B] - rnn_final_state_list: list of final_states for RNN layers, [num_directions, batch_size, hidden_size] * num_rnn_layers + chunk_final_state_list: list of final_states for RNN layers, [num_directions, batch_size, hidden_size] * num_rnn_layers """ - rnn_final_state_list = [] + x, x_lens = self.conv(x, x_lens) + chunk_final_state_list = [] x, final_state = self.rnn[0](x, init_state_list[0], x_lens) - rnn_final_state_list.append(final_state) + chunk_final_state_list.append(final_state) x = self.layernorm_list[0](x) for i in range(1, self.num_rnn_layers): x, final_state 
= self.rnn[i](x, init_state_list[i], x_lens) #[B, T, D] - rnn_final_state_list.append(final_state) + chunk_final_state_list.append(final_state) x = self.layernorm_list[i](x) for i in range(self.num_fc_layers): x = self.fc_layers_list[i](x) x = F.relu(x) - return x, x_lens, rnn_final_state_list + return x, x_lens, chunk_final_state_list + + def forward_chunk_by_chunk(self, x, x_lens, decoder_chunk_size=8): + subsampling_rate = self.conv.subsampling_rate + receptive_field_length = self.conv.receptive_field_length + chunk_size = (decoder_chunk_size - 1 + ) * subsampling_rate + receptive_field_length + chunk_stride = subsampling_rate * decoder_chunk_size + max_len = x.shape[1] + assert (chunk_size <= max_len) + + eouts_chunk_list = [] + eouts_chunk_lens_list = [] + + padding_len = chunk_stride - (max_len - chunk_size) % chunk_stride + padding = paddle.zeros((x.shape[0], padding_len, x.shape[2])) + x_padded = paddle.concat([x, padding], axis=1) + num_chunk = (max_len + padding_len - chunk_size) / chunk_stride + 1 + num_chunk = int(num_chunk) + chunk_init_state_list = [None] * self.num_rnn_layers + for i in range(0, num_chunk): + start = i * chunk_stride + end = start + chunk_size + x_chunk = x_padded[:, start:end, :] + x_len_left = x_lens - i * chunk_stride + x_chunk_len_tmp = paddle.ones_like(x_lens) * chunk_size + x_chunk_lens = paddle.where(x_len_left < x_chunk_len_tmp, + x_len_left, x_chunk_len_tmp) + + eouts_chunk, eouts_chunk_lens, chunk_final_state_list = self.forward_chunk( + x_chunk, x_chunk_lens, chunk_init_state_list) + + chunk_init_state_list = chunk_final_state_list + eouts_chunk_list.append(eouts_chunk) + eouts_chunk_lens_list.append(eouts_chunk_lens) + + return eouts_chunk_list, eouts_chunk_lens_list, chunk_final_state_list class DeepSpeech2ModelOnline(nn.Layer): @@ -248,7 +285,7 @@ class DeepSpeech2ModelOnline(nn.Layer): Returns: loss (Tenosr): [1] """ - eouts, eouts_len, rnn_final_state_list = self.encoder(audio, audio_len) + eouts, eouts_len, final_state_list = self.encoder(audio, audio_len) loss = self.decoder(eouts, eouts_len, text, text_len) return loss @@ -265,13 +302,54 @@ class DeepSpeech2ModelOnline(nn.Layer): vocab_list=vocab_list, decoding_method=decoding_method) - eouts, eouts_len = self.encoder(audio, audio_len) + eouts, eouts_len, final_state_list = self.encoder(audio, audio_len) + probs = self.decoder.softmax(eouts) + return self.decoder.decode_probs( + probs.numpy(), eouts_len, vocab_list, decoding_method, + lang_model_path, beam_alpha, beam_beta, beam_size, cutoff_prob, + cutoff_top_n, num_processes) + + @paddle.no_grad() + def decode_chunk_by_chunk(self, audio, audio_len, vocab_list, + decoding_method, lang_model_path, beam_alpha, + beam_beta, beam_size, cutoff_prob, cutoff_top_n, + num_processes): + # init once + # decoders only accept string encoded in utf-8 + self.decoder.init_decode( + beam_alpha=beam_alpha, + beam_beta=beam_beta, + lang_model_path=lang_model_path, + vocab_list=vocab_list, + decoding_method=decoding_method) + + eouts_chunk_list, eouts_chunk_len_list, final_state_list = self.encoder.forward_chunk_by_chunk( + audio, audio_len) + eouts = paddle.concat(eouts_chunk_list, axis=1) + eouts_len = paddle.add_n(eouts_chunk_len_list) + probs = self.decoder.softmax(eouts) return self.decoder.decode_probs( probs.numpy(), eouts_len, vocab_list, decoding_method, lang_model_path, beam_alpha, beam_beta, beam_size, cutoff_prob, cutoff_top_n, num_processes) + @paddle.no_grad() + def decode_prob(self, audio, audio_len): + eouts, eouts_len, final_state_list = 
self.encoder(audio, audio_len) + probs = self.decoder.softmax(eouts) + return probs, eouts, eouts_len, final_state_list + + @paddle.no_grad() + def decode_prob_chunk_by_chunk(self, audio, audio_len): + + eouts_chunk_list, eouts_chunk_len_list, final_state_list = self.encoder.forward_chunk_by_chunk( + audio, audio_len) + eouts = paddle.concat(eouts_chunk_list, axis=1) + eouts_len = paddle.add_n(eouts_chunk_len_list) + probs = self.decoder.softmax(eouts) + return probs, eouts, eouts_len, final_state_list + @classmethod def from_pretrained(cls, dataloader, config, checkpoint_path): """Build a DeepSpeech2Model model from a pretrained model. @@ -338,7 +416,14 @@ class DeepSpeech2InferModelOnline(DeepSpeech2ModelOnline): Returns: probs: probs after softmax """ - eouts, eouts_len, rnn_final_state_list = self.encoder(audio, audio_len) + eouts, eouts_len, final_state_list = self.encoder(audio, audio_len) + probs = self.decoder.softmax(eouts) + return probs + + def forward_chunk_by_chunk(self, audio, audio_len): + eouts_chunk_list, eouts_chunk_lens_list, final_state_list = self.encoder.forward_chunk_by_chunk( + audio_chunk, audio_chunk_len) + eouts = paddle.concat(eouts_chunk_list, axis=1) probs = self.decoder.softmax(eouts) return probs @@ -353,11 +438,11 @@ class DeepSpeech2InferModelOnline(DeepSpeech2ModelOnline): Returns: probs: probs after softmax """ - eouts_chunk, eouts_chunk_lens, rnn_final_state_list = self.encoder( + eouts_chunk, eouts_chunk_lens, final_state_list = self.encoder( audio_chunk, audio_chunk_len, init_state_list) eouts_chunk_new_prefix = paddle.concat( [eouts_chunk_prefix, eouts_chunk], axis=1) eouts_chunk_lens_new_prefix = paddle.add(eouts_chunk_lens_prefix, eouts_chunk_lens) probs_chunk = self.decoder.softmax(eouts_chunk_new_prefix) - return probs_chunk, eouts_chunk_new_prefix, eouts_chunk_lens_new_prefix, rnn_final_state_list + return probs_chunk, eouts_chunk_new_prefix, eouts_chunk_lens_new_prefix, final_state_list diff --git a/deepspeech/utils/log.py b/deepspeech/utils/log.py index e99dacece..065a4c84d 100644 --- a/deepspeech/utils/log.py +++ b/deepspeech/utils/log.py @@ -157,9 +157,13 @@ class Autolog: model_precision="fp32"): import auto_log pid = os.getpid() - gpu_id = int(os.environ['CUDA_VISIBLE_DEVICES'].split(',')[0]) - infer_config = inference.Config() - infer_config.enable_use_gpu(100, gpu_id) + if (os.environ['CUDA_VISIBLE_DEVICES'] != ''): + gpu_id = int(os.environ['CUDA_VISIBLE_DEVICES'].split(',')[0]) + infer_config = inference.Config() + infer_config.enable_use_gpu(100, gpu_id) + else: + gpu_id = None + infer_config = inference.Config() autolog = auto_log.AutoLogger( model_name=model_name, model_precision=model_precision, diff --git a/tests/deepspeech2_online_model_test.py b/tests/deepspeech2_online_model_test.py new file mode 100644 index 000000000..80547544d --- /dev/null +++ b/tests/deepspeech2_online_model_test.py @@ -0,0 +1,134 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import unittest + +import numpy as np +import paddle + +from deepspeech.models.ds2_online import DeepSpeech2ModelOnline + + +class TestDeepSpeech2ModelOnline(unittest.TestCase): + def setUp(self): + paddle.set_device('cpu') + + self.batch_size = 2 + self.feat_dim = 161 + max_len = 64 + + # (B, T, D) + audio = np.random.randn(self.batch_size, max_len, self.feat_dim) + audio_len = np.random.randint(max_len, size=self.batch_size) + audio_len[-1] = max_len + # (B, U) + text = np.array([[1, 2], [1, 2]]) + text_len = np.array([2] * self.batch_size) + + self.audio = paddle.to_tensor(audio, dtype='float32') + self.audio_len = paddle.to_tensor(audio_len, dtype='int64') + self.text = paddle.to_tensor(text, dtype='int32') + self.text_len = paddle.to_tensor(text_len, dtype='int64') + + def test_ds2_1(self): + model = DeepSpeech2ModelOnline( + feat_size=self.feat_dim, + dict_size=10, + num_conv_layers=2, + num_rnn_layers=3, + rnn_size=1024, + num_fc_layers=2, + fc_layers_size_list=[512, 256], + use_gru=False) + loss = model(self.audio, self.audio_len, self.text, self.text_len) + self.assertEqual(loss.numel(), 1) + + def test_ds2_2(self): + model = DeepSpeech2ModelOnline( + feat_size=self.feat_dim, + dict_size=10, + num_conv_layers=2, + num_rnn_layers=3, + rnn_size=1024, + num_fc_layers=2, + fc_layers_size_list=[512, 256], + use_gru=True) + loss = model(self.audio, self.audio_len, self.text, self.text_len) + self.assertEqual(loss.numel(), 1) + + def test_ds2_3(self): + model = DeepSpeech2ModelOnline( + feat_size=self.feat_dim, + dict_size=10, + num_conv_layers=2, + num_rnn_layers=3, + rnn_size=1024, + num_fc_layers=2, + fc_layers_size_list=[512, 256], + use_gru=False) + loss = model(self.audio, self.audio_len, self.text, self.text_len) + self.assertEqual(loss.numel(), 1) + + def test_ds2_4(self): + model = DeepSpeech2ModelOnline( + feat_size=self.feat_dim, + dict_size=10, + num_conv_layers=2, + num_rnn_layers=3, + rnn_size=1024, + num_fc_layers=2, + fc_layers_size_list=[512, 256], + use_gru=True) + loss = model(self.audio, self.audio_len, self.text, self.text_len) + self.assertEqual(loss.numel(), 1) + + def test_ds2_5(self): + model = DeepSpeech2ModelOnline( + feat_size=self.feat_dim, + dict_size=10, + num_conv_layers=2, + num_rnn_layers=3, + rnn_size=1024, + num_fc_layers=2, + fc_layers_size_list=[512, 256], + use_gru=False) + loss = model(self.audio, self.audio_len, self.text, self.text_len) + self.assertEqual(loss.numel(), 1) + + def test_ds2_6(self): + model = DeepSpeech2ModelOnline( + feat_size=self.feat_dim, + dict_size=10, + num_conv_layers=2, + num_rnn_layers=3, + rnn_size=1024, + num_fc_layers=2, + fc_layers_size_list=[512, 256], + use_gru=False) + loss = model(self.audio, self.audio_len, self.text, self.text_len) + model.eval() + + probs, eouts, eouts_len, final_state_list = model.decode_prob( + self.audio, self.audio_len) + probs_chk, eouts_chk, eouts_len_chk, final_state_list_chk = model.decode_prob_chunk_by_chunk( + self.audio, self.audio_len) + for i in range(len(final_state_list)): + for j in range(2): + self.assertEqual( + np.sum( + np.abs(final_state_list[i][j].numpy() - + final_state_list_chk[i][j].numpy())), 0) + + +if __name__ == '__main__': + unittest.main() From 4b5cbe9a12640697af3dd9b3f74138a29faeec8a Mon Sep 17 00:00:00 2001 From: huangyuxin Date: Tue, 3 Aug 2021 07:36:46 +0000 Subject: [PATCH 15/24] ds2_online alignment, include prob_chunk_forward, prob_chunk_by_chunk_forward --- deepspeech/io/sampler.py | 2 +- deepspeech/models/ds2_online/deepspeech2.py | 135 
++++++++++++-------- tests/deepspeech2_online_model_test.py | 113 ++++++++++++++-- 3 files changed, 186 insertions(+), 64 deletions(-) diff --git a/deepspeech/io/sampler.py b/deepspeech/io/sampler.py index 763a3781e..3b2ef757d 100644 --- a/deepspeech/io/sampler.py +++ b/deepspeech/io/sampler.py @@ -51,7 +51,7 @@ def _batch_shuffle(indices, batch_size, epoch, clipped=False): """ rng = np.random.RandomState(epoch) shift_len = rng.randint(0, batch_size - 1) - batch_indices = list(zip(* [iter(indices[shift_len:])] * batch_size)) + batch_indices = list(zip(*[iter(indices[shift_len:])] * batch_size)) rng.shuffle(batch_indices) batch_indices = [item for batch in batch_indices for item in batch] assert clipped is False diff --git a/deepspeech/models/ds2_online/deepspeech2.py b/deepspeech/models/ds2_online/deepspeech2.py index 0b3c632be..97842e8e6 100644 --- a/deepspeech/models/ds2_online/deepspeech2.py +++ b/deepspeech/models/ds2_online/deepspeech2.py @@ -56,40 +56,28 @@ class CRNNEncoder(nn.Layer): self.layernorm_list = nn.LayerList() self.fc_layers_list = nn.LayerList() layernorm_size = rnn_size - - if use_gru == True: - self.rnn.append( - nn.GRU( - input_size=i_size, - hidden_size=rnn_size, - num_layers=1, - direction=rnn_direction)) - self.layernorm_list.append(nn.LayerNorm(layernorm_size)) - for i in range(1, num_rnn_layers): + for i in range(0, num_rnn_layers): + if i == 0: + rnn_input_size = i_size + else: + rnn_input_size = rnn_size + if (use_gru == True): self.rnn.append( nn.GRU( - input_size=layernorm_size, + input_size=rnn_input_size, hidden_size=rnn_size, num_layers=1, direction=rnn_direction)) - self.layernorm_list.append(nn.LayerNorm(layernorm_size)) - else: - self.rnn.append( - nn.LSTM( - input_size=i_size, - hidden_size=rnn_size, - num_layers=1, - direction=rnn_direction)) - self.layernorm_list.append(nn.LayerNorm(layernorm_size)) - for i in range(1, num_rnn_layers): + else: self.rnn.append( nn.LSTM( - input_size=layernorm_size, + input_size=rnn_input_size, hidden_size=rnn_size, num_layers=1, direction=rnn_direction)) - self.layernorm_list.append(nn.LayerNorm(layernorm_size)) - fc_input_size = layernorm_size + self.layernorm_list.append(nn.LayerNorm(layernorm_size)) + + fc_input_size = rnn_size for i in range(self.num_fc_layers): self.fc_layers_list.append( nn.Linear(fc_input_size, fc_layers_size_list[i])) @@ -122,10 +110,7 @@ class CRNNEncoder(nn.Layer): # remove padding part init_state = None final_state_list = [] - x, final_state = self.rnn[0](x, init_state, x_lens) - final_state_list.append(final_state) - x = self.layernorm_list[0](x) - for i in range(1, self.num_rnn_layers): + for i in range(0, self.num_rnn_layers): x, final_state = self.rnn[i](x, init_state, x_lens) #[B, T, D] final_state_list.append(final_state) x = self.layernorm_list[i](x) @@ -149,10 +134,7 @@ class CRNNEncoder(nn.Layer): """ x, x_lens = self.conv(x, x_lens) chunk_final_state_list = [] - x, final_state = self.rnn[0](x, init_state_list[0], x_lens) - chunk_final_state_list.append(final_state) - x = self.layernorm_list[0](x) - for i in range(1, self.num_rnn_layers): + for i in range(0, self.num_rnn_layers): x, final_state = self.rnn[i](x, init_state_list[i], x_lens) #[B, T, D] chunk_final_state_list.append(final_state) @@ -177,27 +159,32 @@ class CRNNEncoder(nn.Layer): padding_len = chunk_stride - (max_len - chunk_size) % chunk_stride padding = paddle.zeros((x.shape[0], padding_len, x.shape[2])) - x_padded = paddle.concat([x, padding], axis=1) + padded_x = paddle.concat([x, padding], axis=1) num_chunk = 
(max_len + padding_len - chunk_size) / chunk_stride + 1 num_chunk = int(num_chunk) - chunk_init_state_list = [None] * self.num_rnn_layers + chunk_state_list = [None] * self.num_rnn_layers for i in range(0, num_chunk): start = i * chunk_stride end = start + chunk_size - x_chunk = x_padded[:, start:end, :] - x_len_left = x_lens - i * chunk_stride + # end = min(start + chunk_size, max_len) + # if (end - start < receptive_field_length): + # break + x_chunk = padded_x[:, start:end, :] + + x_len_left = paddle.where(x_lens - i * chunk_stride < 0, + paddle.zeros_like(x_lens), + x_lens - i * chunk_stride) x_chunk_len_tmp = paddle.ones_like(x_lens) * chunk_size x_chunk_lens = paddle.where(x_len_left < x_chunk_len_tmp, x_len_left, x_chunk_len_tmp) - eouts_chunk, eouts_chunk_lens, chunk_final_state_list = self.forward_chunk( - x_chunk, x_chunk_lens, chunk_init_state_list) + eouts_chunk, eouts_chunk_lens, chunk_state_list = self.forward_chunk( + x_chunk, x_chunk_lens, chunk_state_list) - chunk_init_state_list = chunk_final_state_list eouts_chunk_list.append(eouts_chunk) eouts_chunk_lens_list.append(eouts_chunk_lens) - return eouts_chunk_list, eouts_chunk_lens_list, chunk_final_state_list + return eouts_chunk_list, eouts_chunk_lens_list, chunk_state_list class DeepSpeech2ModelOnline(nn.Layer): @@ -309,6 +296,35 @@ class DeepSpeech2ModelOnline(nn.Layer): lang_model_path, beam_alpha, beam_beta, beam_size, cutoff_prob, cutoff_top_n, num_processes) + @paddle.no_grad() + def decode_by_chunk(self, eouts_prefix, eouts_len_prefix, chunk_state_list, + audio_chunk, audio_len_chunk, vocab_list, + decoding_method, lang_model_path, beam_alpha, beam_beta, + beam_size, cutoff_prob, cutoff_top_n, num_processes): + # init once + # decoders only accept string encoded in utf-8 + self.decoder.init_decode( + beam_alpha=beam_alpha, + beam_beta=beam_beta, + lang_model_path=lang_model_path, + vocab_list=vocab_list, + decoding_method=decoding_method) + + eouts_chunk, eouts_chunk_len, final_state_list = self.encoder.forward_chunk( + audio_chunk, audio_len_chunk, chunk_state_list) + if eouts_prefix is not None: + eouts = paddle.concat([eouts_prefix, eouts_chunk], axis=1) + eouts_len = paddle.add_n([eouts_len_prefix, eouts_chunk_len]) + else: + eouts = eouts_chunk + eouts_len = eouts_chunk_len + + probs = self.decoder.softmax(eouts) + return self.decoder.decode_probs( + probs.numpy(), eouts_len, vocab_list, decoding_method, + lang_model_path, beam_alpha, beam_beta, beam_size, cutoff_prob, + cutoff_top_n, num_processes), eouts, eouts_len, final_state_list + @paddle.no_grad() def decode_chunk_by_chunk(self, audio, audio_len, vocab_list, decoding_method, lang_model_path, beam_alpha, @@ -334,6 +350,13 @@ class DeepSpeech2ModelOnline(nn.Layer): lang_model_path, beam_alpha, beam_beta, beam_size, cutoff_prob, cutoff_top_n, num_processes) + """ + decocd_prob, + decode_prob_chunk_by_chunk + decode_prob_by_chunk + is only used for test + """ + @paddle.no_grad() def decode_prob(self, audio, audio_len): eouts, eouts_len, final_state_list = self.encoder(audio, audio_len) @@ -341,15 +364,28 @@ class DeepSpeech2ModelOnline(nn.Layer): return probs, eouts, eouts_len, final_state_list @paddle.no_grad() - def decode_prob_chunk_by_chunk(self, audio, audio_len): - + def decode_prob_chunk_by_chunk(self, audio, audio_len, decoder_chunk_size): eouts_chunk_list, eouts_chunk_len_list, final_state_list = self.encoder.forward_chunk_by_chunk( - audio, audio_len) + audio, audio_len, decoder_chunk_size) eouts = paddle.concat(eouts_chunk_list, axis=1) eouts_len 
= paddle.add_n(eouts_chunk_len_list) probs = self.decoder.softmax(eouts) return probs, eouts, eouts_len, final_state_list + @paddle.no_grad() + def decode_prob_by_chunk(self, audio, audio_len, eouts_prefix, + eouts_lens_prefix, chunk_state_list): + eouts_chunk, eouts_chunk_lens, final_state_list = self.encoder.forward_chunk( + audio, audio_len, chunk_state_list) + if eouts_prefix is not None: + eouts = paddle.concat([eouts_prefix, eouts_chunk], axis=1) + eouts_lens = paddle.add_n([eouts_lens_prefix, eouts_chunk_lens]) + else: + eouts = eouts_chunk + eouts_lens = eouts_chunk_lens + probs = self.decoder.softmax(eouts) + return probs, eouts, eouts_lens, final_state_list + @classmethod def from_pretrained(cls, dataloader, config, checkpoint_path): """Build a DeepSpeech2Model model from a pretrained model. @@ -420,15 +456,14 @@ class DeepSpeech2InferModelOnline(DeepSpeech2ModelOnline): probs = self.decoder.softmax(eouts) return probs - def forward_chunk_by_chunk(self, audio, audio_len): - eouts_chunk_list, eouts_chunk_lens_list, final_state_list = self.encoder.forward_chunk_by_chunk( - audio_chunk, audio_chunk_len) - eouts = paddle.concat(eouts_chunk_list, axis=1) + def forward_chunk(self, audio_chunk, audio_chunk_lens): + eouts_chunkt, eouts_chunk_lens, final_state_list = self.encoder.forward_chunk( + audio_chunk, audio_chunk_lens) probs = self.decoder.softmax(eouts) return probs def forward(self, eouts_chunk_prefix, eouts_chunk_lens_prefix, audio_chunk, - audio_chunk_len, init_state_list): + audio_chunk_lens, chunk_state_list): """export model function Args: @@ -438,8 +473,8 @@ class DeepSpeech2InferModelOnline(DeepSpeech2ModelOnline): Returns: probs: probs after softmax """ - eouts_chunk, eouts_chunk_lens, final_state_list = self.encoder( - audio_chunk, audio_chunk_len, init_state_list) + eouts_chunk, eouts_chunk_lens, final_state_list = self.encoder.forward_chunk( + audio_chunk, audio_chunk_lens, chunk_state_list) eouts_chunk_new_prefix = paddle.concat( [eouts_chunk_prefix, eouts_chunk], axis=1) eouts_chunk_lens_new_prefix = paddle.add(eouts_chunk_lens_prefix, diff --git a/tests/deepspeech2_online_model_test.py b/tests/deepspeech2_online_model_test.py index 80547544d..7f06b9d2c 100644 --- a/tests/deepspeech2_online_model_test.py +++ b/tests/deepspeech2_online_model_test.py @@ -25,7 +25,7 @@ class TestDeepSpeech2ModelOnline(unittest.TestCase): self.batch_size = 2 self.feat_dim = 161 - max_len = 64 + max_len = 210 # (B, T, D) audio = np.random.randn(self.batch_size, max_len, self.feat_dim) @@ -105,29 +105,116 @@ class TestDeepSpeech2ModelOnline(unittest.TestCase): loss = model(self.audio, self.audio_len, self.text, self.text_len) self.assertEqual(loss.numel(), 1) + def split_into_chunk(self, x, x_lens, decoder_chunk_size, subsampling_rate, + receptive_field_length): + chunk_size = (decoder_chunk_size - 1 + ) * subsampling_rate + receptive_field_length + chunk_stride = subsampling_rate * decoder_chunk_size + max_len = x.shape[1] + assert (chunk_size <= max_len) + x_chunk_list = [] + x_chunk_lens_list = [] + padding_len = chunk_stride - (max_len - chunk_size) % chunk_stride + padding = paddle.zeros((x.shape[0], padding_len, x.shape[2])) + padded_x = paddle.concat([x, padding], axis=1) + num_chunk = (max_len + padding_len - chunk_size) / chunk_stride + 1 + num_chunk = int(num_chunk) + for i in range(0, num_chunk): + start = i * chunk_stride + end = start + chunk_size + x_chunk = padded_x[:, start:end, :] + x_len_left = paddle.where(x_lens - i * chunk_stride < 0, + paddle.zeros_like(x_lens), + 
x_lens - i * chunk_stride) + x_chunk_len_tmp = paddle.ones_like(x_lens) * chunk_size + x_chunk_lens = paddle.where(x_len_left < x_chunk_len_tmp, + x_len_left, x_chunk_len_tmp) + x_chunk_list.append(x_chunk) + x_chunk_lens_list.append(x_chunk_lens) + + return x_chunk_list, x_chunk_lens_list + def test_ds2_6(self): model = DeepSpeech2ModelOnline( feat_size=self.feat_dim, dict_size=10, num_conv_layers=2, - num_rnn_layers=3, + num_rnn_layers=1, rnn_size=1024, num_fc_layers=2, fc_layers_size_list=[512, 256], - use_gru=False) - loss = model(self.audio, self.audio_len, self.text, self.text_len) + use_gru=True) model.eval() - - probs, eouts, eouts_len, final_state_list = model.decode_prob( + paddle.device.set_device("cpu") + de_ch_size = 9 + + audio_chunk_list, audio_chunk_lens_list = self.split_into_chunk( + self.audio, self.audio_len, de_ch_size, + model.encoder.conv.subsampling_rate, + model.encoder.conv.receptive_field_length) + eouts_prefix = None + eouts_lens_prefix = None + chunk_state_list = [None] * model.encoder.num_rnn_layers + for i, audio_chunk in enumerate(audio_chunk_list): + audio_chunk_lens = audio_chunk_lens_list[i] + probs_pre_chunks, eouts_prefix, eouts_lens_prefix, chunk_state_list = model.decode_prob_by_chunk( + audio_chunk, audio_chunk_lens, eouts_prefix, eouts_lens_prefix, + chunk_state_list) + # print (i, probs_pre_chunks.shape) + + probs, eouts, eouts_lens, final_state_list = model.decode_prob( self.audio, self.audio_len) - probs_chk, eouts_chk, eouts_len_chk, final_state_list_chk = model.decode_prob_chunk_by_chunk( + + decode_max_len = probs.shape[1] + probs_pre_chunks = probs_pre_chunks[:, :decode_max_len, :] + self.assertEqual(paddle.allclose(probs, probs_pre_chunks), True) + + def test_ds2_7(self): + model = DeepSpeech2ModelOnline( + feat_size=self.feat_dim, + dict_size=10, + num_conv_layers=2, + num_rnn_layers=1, + rnn_size=1024, + num_fc_layers=2, + fc_layers_size_list=[512, 256], + use_gru=True) + model.eval() + paddle.device.set_device("cpu") + de_ch_size = 9 + + probs, eouts, eouts_lens, final_state_list = model.decode_prob( self.audio, self.audio_len) - for i in range(len(final_state_list)): - for j in range(2): - self.assertEqual( - np.sum( - np.abs(final_state_list[i][j].numpy() - - final_state_list_chk[i][j].numpy())), 0) + probs_by_chk, eouts_by_chk, eouts_lens_by_chk, final_state_list_by_chk = model.decode_prob_chunk_by_chunk( + self.audio, self.audio_len, de_ch_size) + decode_max_len = probs.shape[1] + probs_by_chk = probs_by_chk[:, :decode_max_len, :] + eouts_by_chk = eouts_by_chk[:, :decode_max_len, :] + self.assertEqual( + paddle.sum( + paddle.abs(paddle.subtract(eouts_lens, eouts_lens_by_chk))), 0) + self.assertEqual( + paddle.sum(paddle.abs(paddle.subtract(eouts, eouts_by_chk))), 0) + self.assertEqual( + paddle.sum( + paddle.abs(paddle.subtract(probs, probs_by_chk))).numpy(), 0) + self.assertEqual(paddle.allclose(eouts_by_chk, eouts), True) + self.assertEqual(paddle.allclose(probs_by_chk, probs), True) + """ + print ("conv_x", conv_x) + print ("conv_x_by_chk", conv_x_by_chk) + print ("final_state_list", final_state_list) + #print ("final_state_list_by_chk", final_state_list_by_chk) + print (paddle.sum(paddle.abs(paddle.subtract(eouts[:,:de_ch_size,:], eouts_by_chk[:,:de_ch_size,:])))) + print (paddle.allclose(eouts[:,:de_ch_size,:], eouts_by_chk[:,:de_ch_size,:])) + print (paddle.sum(paddle.abs(paddle.subtract(eouts[:,de_ch_size:de_ch_size*2,:], eouts_by_chk[:,de_ch_size:de_ch_size*2,:])))) + print (paddle.allclose(eouts[:,de_ch_size:de_ch_size*2,:], 
eouts_by_chk[:,de_ch_size:de_ch_size*2,:])) + print (paddle.sum(paddle.abs(paddle.subtract(eouts[:,de_ch_size*2:de_ch_size*3,:], eouts_by_chk[:,de_ch_size*2:de_ch_size*3,:])))) + print (paddle.allclose(eouts[:,de_ch_size*2:de_ch_size*3,:], eouts_by_chk[:,de_ch_size*2:de_ch_size*3,:])) + print (paddle.sum(paddle.abs(paddle.subtract(eouts, eouts_by_chk)))) + print (paddle.sum(paddle.abs(paddle.subtract(eouts, eouts_by_chk)))) + print (paddle.allclose(eouts[:,:,:], eouts_by_chk[:,:,:])) + """ if __name__ == '__main__': From 2f64ae6495032a73d7d72b152a2f17e6ba27f73c Mon Sep 17 00:00:00 2001 From: huangyuxin Date: Wed, 4 Aug 2021 04:49:35 +0000 Subject: [PATCH 16/24] not change decoder --- deepspeech/models/ds2_online/deepspeech2.py | 7 +- tests/deepspeech2_online_model_test.py | 97 ++++++++++----------- 2 files changed, 52 insertions(+), 52 deletions(-) diff --git a/deepspeech/models/ds2_online/deepspeech2.py b/deepspeech/models/ds2_online/deepspeech2.py index 97842e8e6..8a3d7210f 100644 --- a/deepspeech/models/ds2_online/deepspeech2.py +++ b/deepspeech/models/ds2_online/deepspeech2.py @@ -295,7 +295,7 @@ class DeepSpeech2ModelOnline(nn.Layer): probs.numpy(), eouts_len, vocab_list, decoding_method, lang_model_path, beam_alpha, beam_beta, beam_size, cutoff_prob, cutoff_top_n, num_processes) - + """ @paddle.no_grad() def decode_by_chunk(self, eouts_prefix, eouts_len_prefix, chunk_state_list, audio_chunk, audio_len_chunk, vocab_list, @@ -349,14 +349,14 @@ class DeepSpeech2ModelOnline(nn.Layer): probs.numpy(), eouts_len, vocab_list, decoding_method, lang_model_path, beam_alpha, beam_beta, beam_size, cutoff_prob, cutoff_top_n, num_processes) - + """ """ decocd_prob, decode_prob_chunk_by_chunk decode_prob_by_chunk is only used for test """ - + """ @paddle.no_grad() def decode_prob(self, audio, audio_len): eouts, eouts_len, final_state_list = self.encoder(audio, audio_len) @@ -385,6 +385,7 @@ class DeepSpeech2ModelOnline(nn.Layer): eouts_lens = eouts_chunk_lens probs = self.decoder.softmax(eouts) return probs, eouts, eouts_lens, final_state_list + """ @classmethod def from_pretrained(cls, dataloader, config, checkpoint_path): diff --git a/tests/deepspeech2_online_model_test.py b/tests/deepspeech2_online_model_test.py index 7f06b9d2c..307d64955 100644 --- a/tests/deepspeech2_online_model_test.py +++ b/tests/deepspeech2_online_model_test.py @@ -105,6 +105,51 @@ class TestDeepSpeech2ModelOnline(unittest.TestCase): loss = model(self.audio, self.audio_len, self.text, self.text_len) self.assertEqual(loss.numel(), 1) + def test_ds2_6(self): + model = DeepSpeech2ModelOnline( + feat_size=self.feat_dim, + dict_size=10, + num_conv_layers=2, + num_rnn_layers=1, + rnn_size=1024, + num_fc_layers=2, + fc_layers_size_list=[512, 256], + use_gru=True) + model.eval() + paddle.device.set_device("cpu") + de_ch_size = 9 + + eouts, eouts_lens, final_state_list = model.encoder( + self.audio, self.audio_len) + eouts_by_chk_list, eouts_lens_by_chk_list, final_state_list_by_chk = model.encoder.forward_chunk_by_chunk( + self.audio, self.audio_len, de_ch_size) + eouts_by_chk = paddle.concat(eouts_by_chk_list, axis = 1) + eouts_lens_by_chk = paddle.add_n(eouts_lens_by_chk_list) + decode_max_len = eouts.shape[1] + print ("dml", decode_max_len) + eouts_by_chk = eouts_by_chk[:, :decode_max_len, :] + self.assertEqual( + paddle.sum( + paddle.abs(paddle.subtract(eouts_lens, eouts_lens_by_chk))), 0) + self.assertEqual( + paddle.sum(paddle.abs(paddle.subtract(eouts, eouts_by_chk))), 0) + self.assertEqual(paddle.allclose(eouts_by_chk, 
eouts), True) + """ + print ("conv_x", conv_x) + print ("conv_x_by_chk", conv_x_by_chk) + print ("final_state_list", final_state_list) + #print ("final_state_list_by_chk", final_state_list_by_chk) + print (paddle.sum(paddle.abs(paddle.subtract(eouts[:,:de_ch_size,:], eouts_by_chk[:,:de_ch_size,:])))) + print (paddle.allclose(eouts[:,:de_ch_size,:], eouts_by_chk[:,:de_ch_size,:])) + print (paddle.sum(paddle.abs(paddle.subtract(eouts[:,de_ch_size:de_ch_size*2,:], eouts_by_chk[:,de_ch_size:de_ch_size*2,:])))) + print (paddle.allclose(eouts[:,de_ch_size:de_ch_size*2,:], eouts_by_chk[:,de_ch_size:de_ch_size*2,:])) + print (paddle.sum(paddle.abs(paddle.subtract(eouts[:,de_ch_size*2:de_ch_size*3,:], eouts_by_chk[:,de_ch_size*2:de_ch_size*3,:])))) + print (paddle.allclose(eouts[:,de_ch_size*2:de_ch_size*3,:], eouts_by_chk[:,de_ch_size*2:de_ch_size*3,:])) + print (paddle.sum(paddle.abs(paddle.subtract(eouts, eouts_by_chk)))) + print (paddle.sum(paddle.abs(paddle.subtract(eouts, eouts_by_chk)))) + print (paddle.allclose(eouts[:,:,:], eouts_by_chk[:,:,:])) + """ + """ def split_into_chunk(self, x, x_lens, decoder_chunk_size, subsampling_rate, receptive_field_length): chunk_size = (decoder_chunk_size - 1 @@ -134,7 +179,7 @@ class TestDeepSpeech2ModelOnline(unittest.TestCase): return x_chunk_list, x_chunk_lens_list - def test_ds2_6(self): + def test_ds2_7(self): model = DeepSpeech2ModelOnline( feat_size=self.feat_dim, dict_size=10, @@ -157,7 +202,7 @@ class TestDeepSpeech2ModelOnline(unittest.TestCase): chunk_state_list = [None] * model.encoder.num_rnn_layers for i, audio_chunk in enumerate(audio_chunk_list): audio_chunk_lens = audio_chunk_lens_list[i] - probs_pre_chunks, eouts_prefix, eouts_lens_prefix, chunk_state_list = model.decode_prob_by_chunk( + eouts_prefix, eouts_lens_prefix, chunk_state_list = model.decode_prob_by_chunk( audio_chunk, audio_chunk_lens, eouts_prefix, eouts_lens_prefix, chunk_state_list) # print (i, probs_pre_chunks.shape) @@ -168,53 +213,7 @@ class TestDeepSpeech2ModelOnline(unittest.TestCase): decode_max_len = probs.shape[1] probs_pre_chunks = probs_pre_chunks[:, :decode_max_len, :] self.assertEqual(paddle.allclose(probs, probs_pre_chunks), True) - - def test_ds2_7(self): - model = DeepSpeech2ModelOnline( - feat_size=self.feat_dim, - dict_size=10, - num_conv_layers=2, - num_rnn_layers=1, - rnn_size=1024, - num_fc_layers=2, - fc_layers_size_list=[512, 256], - use_gru=True) - model.eval() - paddle.device.set_device("cpu") - de_ch_size = 9 - - probs, eouts, eouts_lens, final_state_list = model.decode_prob( - self.audio, self.audio_len) - probs_by_chk, eouts_by_chk, eouts_lens_by_chk, final_state_list_by_chk = model.decode_prob_chunk_by_chunk( - self.audio, self.audio_len, de_ch_size) - decode_max_len = probs.shape[1] - probs_by_chk = probs_by_chk[:, :decode_max_len, :] - eouts_by_chk = eouts_by_chk[:, :decode_max_len, :] - self.assertEqual( - paddle.sum( - paddle.abs(paddle.subtract(eouts_lens, eouts_lens_by_chk))), 0) - self.assertEqual( - paddle.sum(paddle.abs(paddle.subtract(eouts, eouts_by_chk))), 0) - self.assertEqual( - paddle.sum( - paddle.abs(paddle.subtract(probs, probs_by_chk))).numpy(), 0) - self.assertEqual(paddle.allclose(eouts_by_chk, eouts), True) - self.assertEqual(paddle.allclose(probs_by_chk, probs), True) - """ - print ("conv_x", conv_x) - print ("conv_x_by_chk", conv_x_by_chk) - print ("final_state_list", final_state_list) - #print ("final_state_list_by_chk", final_state_list_by_chk) - print (paddle.sum(paddle.abs(paddle.subtract(eouts[:,:de_ch_size,:], 
eouts_by_chk[:,:de_ch_size,:])))) - print (paddle.allclose(eouts[:,:de_ch_size,:], eouts_by_chk[:,:de_ch_size,:])) - print (paddle.sum(paddle.abs(paddle.subtract(eouts[:,de_ch_size:de_ch_size*2,:], eouts_by_chk[:,de_ch_size:de_ch_size*2,:])))) - print (paddle.allclose(eouts[:,de_ch_size:de_ch_size*2,:], eouts_by_chk[:,de_ch_size:de_ch_size*2,:])) - print (paddle.sum(paddle.abs(paddle.subtract(eouts[:,de_ch_size*2:de_ch_size*3,:], eouts_by_chk[:,de_ch_size*2:de_ch_size*3,:])))) - print (paddle.allclose(eouts[:,de_ch_size*2:de_ch_size*3,:], eouts_by_chk[:,de_ch_size*2:de_ch_size*3,:])) - print (paddle.sum(paddle.abs(paddle.subtract(eouts, eouts_by_chk)))) - print (paddle.sum(paddle.abs(paddle.subtract(eouts, eouts_by_chk)))) - print (paddle.allclose(eouts[:,:,:], eouts_by_chk[:,:,:])) - """ + """ if __name__ == '__main__': From 3fb9f6885a1af22b8b66a0322ee45a09d6c14823 Mon Sep 17 00:00:00 2001 From: huangyuxin Date: Wed, 4 Aug 2021 07:30:43 +0000 Subject: [PATCH 17/24] complete model export for ds2_online --- deepspeech/exps/deepspeech2/bin/export.py | 3 +- deepspeech/exps/deepspeech2/bin/test.py | 2 + deepspeech/exps/deepspeech2/bin/train.py | 2 + deepspeech/exps/deepspeech2/config.py | 2 +- deepspeech/exps/deepspeech2/model.py | 45 ++++-- deepspeech/models/ds2_online/conv.py | 4 +- deepspeech/models/ds2_online/deepspeech2.py | 149 +++----------------- examples/tiny/s0/run.sh | 2 +- examples/tiny/s0/run_online.sh | 6 +- tests/deepspeech2_model_test.py | 1 - tests/deepspeech2_online_model_test.py | 9 +- 11 files changed, 70 insertions(+), 155 deletions(-) diff --git a/deepspeech/exps/deepspeech2/bin/export.py b/deepspeech/exps/deepspeech2/bin/export.py index 9ae045c48..8ae987947 100644 --- a/deepspeech/exps/deepspeech2/bin/export.py +++ b/deepspeech/exps/deepspeech2/bin/export.py @@ -32,7 +32,8 @@ if __name__ == "__main__": parser = default_argument_parser() parser.add_argument("--model_type") args = parser.parse_args() - + if args.model_type is None: + args.model_type = 'offline' print_arguments(args) # https://yaml.org/type/float.html diff --git a/deepspeech/exps/deepspeech2/bin/test.py b/deepspeech/exps/deepspeech2/bin/test.py index 49bca73d2..78a99b892 100644 --- a/deepspeech/exps/deepspeech2/bin/test.py +++ b/deepspeech/exps/deepspeech2/bin/test.py @@ -33,6 +33,8 @@ if __name__ == "__main__": parser.add_argument("--model_type") args = parser.parse_args() print_arguments(args, globals()) + if args.model_type is None: + args.model_type = 'offline' # https://yaml.org/type/float.html config = get_cfg_defaults(args.model_type) diff --git a/deepspeech/exps/deepspeech2/bin/train.py b/deepspeech/exps/deepspeech2/bin/train.py index 253806af1..dcfa62f45 100644 --- a/deepspeech/exps/deepspeech2/bin/train.py +++ b/deepspeech/exps/deepspeech2/bin/train.py @@ -37,6 +37,8 @@ if __name__ == "__main__": parser = default_argument_parser() parser.add_argument("--model_type") args = parser.parse_args() + if args.model_type is None: + args.model_type = 'offline' print_arguments(args, globals()) # https://yaml.org/type/float.html diff --git a/deepspeech/exps/deepspeech2/config.py b/deepspeech/exps/deepspeech2/config.py index 4b3f724ff..66516b35d 100644 --- a/deepspeech/exps/deepspeech2/config.py +++ b/deepspeech/exps/deepspeech2/config.py @@ -21,7 +21,7 @@ from deepspeech.models.ds2 import DeepSpeech2Model from deepspeech.models.ds2_online import DeepSpeech2ModelOnline -def get_cfg_defaults(model_type): +def get_cfg_defaults(model_type='offline'): _C = CfgNode() if (model_type == 'offline'): _C.data = 
ManifestDataset.params() diff --git a/deepspeech/exps/deepspeech2/model.py b/deepspeech/exps/deepspeech2/model.py index bac305f55..1fd47bd1e 100644 --- a/deepspeech/exps/deepspeech2/model.py +++ b/deepspeech/exps/deepspeech2/model.py @@ -134,6 +134,7 @@ class DeepSpeech2Trainer(Trainer): use_gru=config.model.use_gru, share_rnn_weights=config.model.share_rnn_weights) elif self.args.model_type == 'online': + print("fc_layers_size_list", config.model.fc_layers_size_list) model = DeepSpeech2ModelOnline( feat_size=self.train_loader.collate_fn.feature_size, dict_size=self.train_loader.collate_fn.vocab_size, @@ -352,19 +353,43 @@ class DeepSpeech2Tester(DeepSpeech2Trainer): infer_model = DeepSpeech2InferModelOnline.from_pretrained( self.test_loader, self.config, self.args.checkpoint_path) else: - raise Exception("wrong model tyep") + raise Exception("wrong model type") infer_model.eval() feat_dim = self.test_loader.collate_fn.feature_size - static_model = paddle.jit.to_static( - infer_model, - input_spec=[ - paddle.static.InputSpec( - shape=[None, None, feat_dim], - dtype='float32'), # audio, [B,T,D] - paddle.static.InputSpec(shape=[None], - dtype='int64'), # audio_length, [B] - ]) + if self.args.model_type == 'offline': + static_model = paddle.jit.to_static( + infer_model, + input_spec=[ + paddle.static.InputSpec( + shape=[None, None, feat_dim], + dtype='float32'), # audio, [B,T,D] + paddle.static.InputSpec(shape=[None], + dtype='int64'), # audio_length, [B] + ]) + elif self.args.model_type == 'online': + static_model = paddle.jit.to_static( + infer_model, + input_spec=[ + paddle.static.InputSpec( + shape=[None, None, + feat_dim], #[B, chunk_size, feat_dim] + dtype='float32'), # audio, [B,T,D] + paddle.static.InputSpec(shape=[None], + dtype='int64'), # audio_length, [B] + [ + ( + paddle.static.InputSpec( + shape=[None, None, None], dtype='float32' + ), #num_rnn_layers * num_dirctions, rnn_size + paddle.static.InputSpec( + shape=[None, None, None], dtype='float32' + ) #num_rnn_layers * num_dirctions, rnn_size + ) for i in range(self.config.model.num_rnn_layers) + ] + ]) + else: + raise Exception("wrong model type") logger.info(f"Export code: {static_model.forward.code}") paddle.jit.save(static_model, self.args.export_path) diff --git a/deepspeech/models/ds2_online/conv.py b/deepspeech/models/ds2_online/conv.py index 83d98e410..1af69e28c 100644 --- a/deepspeech/models/ds2_online/conv.py +++ b/deepspeech/models/ds2_online/conv.py @@ -29,7 +29,7 @@ class Conv2dSubsampling4Online(Conv2dSubsampling4): x_len: paddle.Tensor) -> [paddle.Tensor, paddle.Tensor]: x = x.unsqueeze(1) # (b, c=1, t, f) x = self.conv(x) - b, c, t, f = paddle.shape(x) - x = x.transpose([0, 2, 1, 3]).reshape([b, t, c * f]) + #b, c, t, f = paddle.shape(x) #not work under jit + x = x.transpose([0, 2, 1, 3]).reshape([0, 0, -1]) x_len = ((x_len - 1) // 2 - 1) // 2 return x, x_len diff --git a/deepspeech/models/ds2_online/deepspeech2.py b/deepspeech/models/ds2_online/deepspeech2.py index 8a3d7210f..d97e95740 100644 --- a/deepspeech/models/ds2_online/deepspeech2.py +++ b/deepspeech/models/ds2_online/deepspeech2.py @@ -61,7 +61,7 @@ class CRNNEncoder(nn.Layer): rnn_input_size = i_size else: rnn_input_size = rnn_size - if (use_gru == True): + if use_gru == True: self.rnn.append( nn.GRU( input_size=rnn_input_size, @@ -146,6 +146,17 @@ class CRNNEncoder(nn.Layer): return x, x_lens, chunk_final_state_list def forward_chunk_by_chunk(self, x, x_lens, decoder_chunk_size=8): + """Compute Encoder outputs + + Args: + x (Tensor): [B, T, D] + x_lens 
(Tensor): [B] + decoder_chunk_size: The chunk size of decoder + Returns: + eouts_chunk_list (List of Tensor): The list of encoder outputs in chunk_size, [B, chunk_size, D] * num_chunks + eouts_chunk_lens_list (List of Tensor): The list of encoder length in chunk_size, [B] * num_chunks + final_chunk_state_list: list of final_states for RNN layers, [num_directions, batch_size, hidden_size] * num_rnn_layers + """ subsampling_rate = self.conv.subsampling_rate receptive_field_length = self.conv.receptive_field_length chunk_size = (decoder_chunk_size - 1 @@ -183,8 +194,8 @@ class CRNNEncoder(nn.Layer): eouts_chunk_list.append(eouts_chunk) eouts_chunk_lens_list.append(eouts_chunk_lens) - - return eouts_chunk_list, eouts_chunk_lens_list, chunk_state_list + final_chunk_state_list = chunk_state_list + return eouts_chunk_list, eouts_chunk_lens_list, final_chunk_state_list class DeepSpeech2ModelOnline(nn.Layer): @@ -208,7 +219,6 @@ class DeepSpeech2ModelOnline(nn.Layer): :type rnn_size: int :param use_gru: Use gru if set True. Use simple rnn if set False. :type use_gru: bool - :type share_weights: bool :return: A tuple of an output unnormalized log probability layer ( before softmax) and a ctc cost layer. :rtype: tuple of LayerOutput @@ -295,97 +305,6 @@ class DeepSpeech2ModelOnline(nn.Layer): probs.numpy(), eouts_len, vocab_list, decoding_method, lang_model_path, beam_alpha, beam_beta, beam_size, cutoff_prob, cutoff_top_n, num_processes) - """ - @paddle.no_grad() - def decode_by_chunk(self, eouts_prefix, eouts_len_prefix, chunk_state_list, - audio_chunk, audio_len_chunk, vocab_list, - decoding_method, lang_model_path, beam_alpha, beam_beta, - beam_size, cutoff_prob, cutoff_top_n, num_processes): - # init once - # decoders only accept string encoded in utf-8 - self.decoder.init_decode( - beam_alpha=beam_alpha, - beam_beta=beam_beta, - lang_model_path=lang_model_path, - vocab_list=vocab_list, - decoding_method=decoding_method) - - eouts_chunk, eouts_chunk_len, final_state_list = self.encoder.forward_chunk( - audio_chunk, audio_len_chunk, chunk_state_list) - if eouts_prefix is not None: - eouts = paddle.concat([eouts_prefix, eouts_chunk], axis=1) - eouts_len = paddle.add_n([eouts_len_prefix, eouts_chunk_len]) - else: - eouts = eouts_chunk - eouts_len = eouts_chunk_len - - probs = self.decoder.softmax(eouts) - return self.decoder.decode_probs( - probs.numpy(), eouts_len, vocab_list, decoding_method, - lang_model_path, beam_alpha, beam_beta, beam_size, cutoff_prob, - cutoff_top_n, num_processes), eouts, eouts_len, final_state_list - - @paddle.no_grad() - def decode_chunk_by_chunk(self, audio, audio_len, vocab_list, - decoding_method, lang_model_path, beam_alpha, - beam_beta, beam_size, cutoff_prob, cutoff_top_n, - num_processes): - # init once - # decoders only accept string encoded in utf-8 - self.decoder.init_decode( - beam_alpha=beam_alpha, - beam_beta=beam_beta, - lang_model_path=lang_model_path, - vocab_list=vocab_list, - decoding_method=decoding_method) - - eouts_chunk_list, eouts_chunk_len_list, final_state_list = self.encoder.forward_chunk_by_chunk( - audio, audio_len) - eouts = paddle.concat(eouts_chunk_list, axis=1) - eouts_len = paddle.add_n(eouts_chunk_len_list) - - probs = self.decoder.softmax(eouts) - return self.decoder.decode_probs( - probs.numpy(), eouts_len, vocab_list, decoding_method, - lang_model_path, beam_alpha, beam_beta, beam_size, cutoff_prob, - cutoff_top_n, num_processes) - """ - """ - decocd_prob, - decode_prob_chunk_by_chunk - decode_prob_by_chunk - is only used for test - 
""" - """ - @paddle.no_grad() - def decode_prob(self, audio, audio_len): - eouts, eouts_len, final_state_list = self.encoder(audio, audio_len) - probs = self.decoder.softmax(eouts) - return probs, eouts, eouts_len, final_state_list - - @paddle.no_grad() - def decode_prob_chunk_by_chunk(self, audio, audio_len, decoder_chunk_size): - eouts_chunk_list, eouts_chunk_len_list, final_state_list = self.encoder.forward_chunk_by_chunk( - audio, audio_len, decoder_chunk_size) - eouts = paddle.concat(eouts_chunk_list, axis=1) - eouts_len = paddle.add_n(eouts_chunk_len_list) - probs = self.decoder.softmax(eouts) - return probs, eouts, eouts_len, final_state_list - - @paddle.no_grad() - def decode_prob_by_chunk(self, audio, audio_len, eouts_prefix, - eouts_lens_prefix, chunk_state_list): - eouts_chunk, eouts_chunk_lens, final_state_list = self.encoder.forward_chunk( - audio, audio_len, chunk_state_list) - if eouts_prefix is not None: - eouts = paddle.concat([eouts_prefix, eouts_chunk], axis=1) - eouts_lens = paddle.add_n([eouts_lens_prefix, eouts_chunk_lens]) - else: - eouts = eouts_chunk - eouts_lens = eouts_chunk_lens - probs = self.decoder.softmax(eouts) - return probs, eouts, eouts_lens, final_state_list - """ @classmethod def from_pretrained(cls, dataloader, config, checkpoint_path): @@ -443,42 +362,8 @@ class DeepSpeech2InferModelOnline(DeepSpeech2ModelOnline): fc_layers_size_list=fc_layers_size_list, use_gru=use_gru) - def forward(self, audio, audio_len): - """export model function - - Args: - audio (Tensor): [B, T, D] - audio_len (Tensor): [B] - - Returns: - probs: probs after softmax - """ - eouts, eouts_len, final_state_list = self.encoder(audio, audio_len) - probs = self.decoder.softmax(eouts) - return probs - - def forward_chunk(self, audio_chunk, audio_chunk_lens): - eouts_chunkt, eouts_chunk_lens, final_state_list = self.encoder.forward_chunk( - audio_chunk, audio_chunk_lens) - probs = self.decoder.softmax(eouts) - return probs - - def forward(self, eouts_chunk_prefix, eouts_chunk_lens_prefix, audio_chunk, - audio_chunk_lens, chunk_state_list): - """export model function - - Args: - audio_chunk (Tensor): [B, T, D] - audio_chunk_len (Tensor): [B] - - Returns: - probs: probs after softmax - """ + def forward(self, audio_chunk, audio_chunk_lens, chunk_state_list): eouts_chunk, eouts_chunk_lens, final_state_list = self.encoder.forward_chunk( audio_chunk, audio_chunk_lens, chunk_state_list) - eouts_chunk_new_prefix = paddle.concat( - [eouts_chunk_prefix, eouts_chunk], axis=1) - eouts_chunk_lens_new_prefix = paddle.add(eouts_chunk_lens_prefix, - eouts_chunk_lens) - probs_chunk = self.decoder.softmax(eouts_chunk_new_prefix) - return probs_chunk, eouts_chunk_new_prefix, eouts_chunk_lens_new_prefix, final_state_list + probs_chunk = self.decoder.softmax(eouts_chunk) + return probs_chunk, final_state_list diff --git a/examples/tiny/s0/run.sh b/examples/tiny/s0/run.sh index a4506e4c5..408b28fd0 100755 --- a/examples/tiny/s0/run.sh +++ b/examples/tiny/s0/run.sh @@ -7,7 +7,7 @@ stage=0 stop_stage=100 conf_path=conf/deepspeech2.yaml avg_num=1 -model_type=online +model_type=offline source ${MAIN_ROOT}/utils/parse_options.sh || exit 1; diff --git a/examples/tiny/s0/run_online.sh b/examples/tiny/s0/run_online.sh index 4c3602045..3f5ecbb66 100755 --- a/examples/tiny/s0/run_online.sh +++ b/examples/tiny/s0/run_online.sh @@ -4,10 +4,10 @@ source path.sh gpus=7 stage=1 -stop_stage=100 -conf_path=conf/deepspeech2.yaml +stop_stage=1 +conf_path=conf/deepspeech2_online.yaml avg_num=1 -model_type=online 
+model_type=online #online | offline source ${MAIN_ROOT}/utils/parse_options.sh || exit 1; diff --git a/tests/deepspeech2_model_test.py b/tests/deepspeech2_model_test.py index c6decdb6b..00df8195b 100644 --- a/tests/deepspeech2_model_test.py +++ b/tests/deepspeech2_model_test.py @@ -19,7 +19,6 @@ import paddle from deepspeech.models.ds2 import DeepSpeech2Model - class TestDeepSpeech2Model(unittest.TestCase): def setUp(self): paddle.set_device('cpu') diff --git a/tests/deepspeech2_online_model_test.py b/tests/deepspeech2_online_model_test.py index 307d64955..ce235cd6d 100644 --- a/tests/deepspeech2_online_model_test.py +++ b/tests/deepspeech2_online_model_test.py @@ -119,14 +119,14 @@ class TestDeepSpeech2ModelOnline(unittest.TestCase): paddle.device.set_device("cpu") de_ch_size = 9 - eouts, eouts_lens, final_state_list = model.encoder( - self.audio, self.audio_len) + eouts, eouts_lens, final_state_list = model.encoder(self.audio, + self.audio_len) eouts_by_chk_list, eouts_lens_by_chk_list, final_state_list_by_chk = model.encoder.forward_chunk_by_chunk( self.audio, self.audio_len, de_ch_size) - eouts_by_chk = paddle.concat(eouts_by_chk_list, axis = 1) + eouts_by_chk = paddle.concat(eouts_by_chk_list, axis=1) eouts_lens_by_chk = paddle.add_n(eouts_lens_by_chk_list) decode_max_len = eouts.shape[1] - print ("dml", decode_max_len) + print("dml", decode_max_len) eouts_by_chk = eouts_by_chk[:, :decode_max_len, :] self.assertEqual( paddle.sum( @@ -149,6 +149,7 @@ class TestDeepSpeech2ModelOnline(unittest.TestCase): print (paddle.sum(paddle.abs(paddle.subtract(eouts, eouts_by_chk)))) print (paddle.allclose(eouts[:,:,:], eouts_by_chk[:,:,:])) """ + """ def split_into_chunk(self, x, x_lens, decoder_chunk_size, subsampling_rate, receptive_field_length): From 8f062cad6ba5f818cf85c0322ebcf325082c8dbb Mon Sep 17 00:00:00 2001 From: huangyuxin Date: Wed, 4 Aug 2021 11:36:54 +0000 Subject: [PATCH 18/24] fixed the small problems --- deepspeech/exps/deepspeech2/bin/export.py | 1 + deepspeech/exps/deepspeech2/bin/test.py | 1 + deepspeech/exps/deepspeech2/bin/train.py | 1 + deepspeech/exps/deepspeech2/config.py | 20 ++-------- deepspeech/exps/deepspeech2/model.py | 1 - deepspeech/models/ds2_online/deepspeech2.py | 3 +- examples/tiny/s0/local/export.sh | 2 +- examples/tiny/s0/local/test.sh | 2 +- examples/tiny/s0/local/train.sh | 2 +- examples/tiny/s0/run_online.sh | 41 --------------------- 10 files changed, 12 insertions(+), 62 deletions(-) delete mode 100755 examples/tiny/s0/run_online.sh diff --git a/deepspeech/exps/deepspeech2/bin/export.py b/deepspeech/exps/deepspeech2/bin/export.py index 8ae987947..f8764fde3 100644 --- a/deepspeech/exps/deepspeech2/bin/export.py +++ b/deepspeech/exps/deepspeech2/bin/export.py @@ -34,6 +34,7 @@ if __name__ == "__main__": args = parser.parse_args() if args.model_type is None: args.model_type = 'offline' + print("model_type:{}".format(args.model_type)) print_arguments(args) # https://yaml.org/type/float.html diff --git a/deepspeech/exps/deepspeech2/bin/test.py b/deepspeech/exps/deepspeech2/bin/test.py index 78a99b892..376e18e38 100644 --- a/deepspeech/exps/deepspeech2/bin/test.py +++ b/deepspeech/exps/deepspeech2/bin/test.py @@ -35,6 +35,7 @@ if __name__ == "__main__": print_arguments(args, globals()) if args.model_type is None: args.model_type = 'offline' + print("model_type:{}".format(args.model_type)) # https://yaml.org/type/float.html config = get_cfg_defaults(args.model_type) diff --git a/deepspeech/exps/deepspeech2/bin/train.py 
b/deepspeech/exps/deepspeech2/bin/train.py index dcfa62f45..69ff043a0 100644 --- a/deepspeech/exps/deepspeech2/bin/train.py +++ b/deepspeech/exps/deepspeech2/bin/train.py @@ -39,6 +39,7 @@ if __name__ == "__main__": args = parser.parse_args() if args.model_type is None: args.model_type = 'offline' + print("model_type:{}".format(args.model_type)) print_arguments(args, globals()) # https://yaml.org/type/float.html diff --git a/deepspeech/exps/deepspeech2/config.py b/deepspeech/exps/deepspeech2/config.py index 66516b35d..53358014c 100644 --- a/deepspeech/exps/deepspeech2/config.py +++ b/deepspeech/exps/deepspeech2/config.py @@ -23,26 +23,14 @@ from deepspeech.models.ds2_online import DeepSpeech2ModelOnline def get_cfg_defaults(model_type='offline'): _C = CfgNode() + _C.data = ManifestDataset.params() + _C.collator = SpeechCollator.params() + _C.training = DeepSpeech2Trainer.params() + _C.decoding = DeepSpeech2Tester.params() if (model_type == 'offline'): - _C.data = ManifestDataset.params() - - _C.collator = SpeechCollator.params() - _C.model = DeepSpeech2Model.params() - - _C.training = DeepSpeech2Trainer.params() - - _C.decoding = DeepSpeech2Tester.params() else: - _C.data = ManifestDataset.params() - - _C.collator = SpeechCollator.params() - _C.model = DeepSpeech2ModelOnline.params() - - _C.training = DeepSpeech2Trainer.params() - - _C.decoding = DeepSpeech2Tester.params() """Get a yacs CfgNode object with default values for my_project.""" # Return a clone so that the defaults will not be altered # This is for the "local variable" use pattern diff --git a/deepspeech/exps/deepspeech2/model.py b/deepspeech/exps/deepspeech2/model.py index 1fd47bd1e..4acfad86b 100644 --- a/deepspeech/exps/deepspeech2/model.py +++ b/deepspeech/exps/deepspeech2/model.py @@ -134,7 +134,6 @@ class DeepSpeech2Trainer(Trainer): use_gru=config.model.use_gru, share_rnn_weights=config.model.share_rnn_weights) elif self.args.model_type == 'online': - print("fc_layers_size_list", config.model.fc_layers_size_list) model = DeepSpeech2ModelOnline( feat_size=self.train_loader.collate_fn.feature_size, dict_size=self.train_loader.collate_fn.vocab_size, diff --git a/deepspeech/models/ds2_online/deepspeech2.py b/deepspeech/models/ds2_online/deepspeech2.py index d97e95740..bed9c41d3 100644 --- a/deepspeech/models/ds2_online/deepspeech2.py +++ b/deepspeech/models/ds2_online/deepspeech2.py @@ -174,6 +174,7 @@ class CRNNEncoder(nn.Layer): num_chunk = (max_len + padding_len - chunk_size) / chunk_stride + 1 num_chunk = int(num_chunk) chunk_state_list = [None] * self.num_rnn_layers + final_chunk_state_list = None for i in range(0, num_chunk): start = i * chunk_stride end = start + chunk_size @@ -366,4 +367,4 @@ class DeepSpeech2InferModelOnline(DeepSpeech2ModelOnline): eouts_chunk, eouts_chunk_lens, final_state_list = self.encoder.forward_chunk( audio_chunk, audio_chunk_lens, chunk_state_list) probs_chunk = self.decoder.softmax(eouts_chunk) - return probs_chunk, final_state_list + return probs_chunk, eouts_chunk_lens, final_state_list diff --git a/examples/tiny/s0/local/export.sh b/examples/tiny/s0/local/export.sh index 6955239c7..2e09e5f5e 100755 --- a/examples/tiny/s0/local/export.sh +++ b/examples/tiny/s0/local/export.sh @@ -1,7 +1,7 @@ #!/bin/bash if [ $# != 4 ];then - echo "usage: $0 config_path ckpt_prefix jit_model_path" + echo "usage: $0 config_path ckpt_prefix jit_model_path model_type" exit -1 fi diff --git a/examples/tiny/s0/local/test.sh b/examples/tiny/s0/local/test.sh index 2f74491a1..b5b68c599 100755 --- 
a/examples/tiny/s0/local/test.sh +++ b/examples/tiny/s0/local/test.sh @@ -1,7 +1,7 @@ #!/bin/bash if [ $# != 3 ];then - echo "usage: ${0} config_path ckpt_path_prefix" + echo "usage: ${0} config_path ckpt_path_prefix model_type" exit -1 fi diff --git a/examples/tiny/s0/local/train.sh b/examples/tiny/s0/local/train.sh index 1d49dcd1d..c6a631800 100755 --- a/examples/tiny/s0/local/train.sh +++ b/examples/tiny/s0/local/train.sh @@ -1,7 +1,7 @@ #!/bin/bash if [ $# != 3 ];then - echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name" + echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name model_type" exit -1 fi diff --git a/examples/tiny/s0/run_online.sh b/examples/tiny/s0/run_online.sh deleted file mode 100755 index 3f5ecbb66..000000000 --- a/examples/tiny/s0/run_online.sh +++ /dev/null @@ -1,41 +0,0 @@ -#!/bin/bash -set -e -source path.sh - -gpus=7 -stage=1 -stop_stage=1 -conf_path=conf/deepspeech2_online.yaml -avg_num=1 -model_type=online #online | offline - -source ${MAIN_ROOT}/utils/parse_options.sh || exit 1; - -avg_ckpt=avg_${avg_num} -ckpt=$(basename ${conf_path} | awk -F'.' '{print $1}') ###ckpt = deepspeech2 -echo "checkpoint name ${ckpt}" - -if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then - # prepare data - bash ./local/data.sh || exit -1 -fi - -if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then - # train model, all `ckpt` under `exp` dir - CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt} ${model_type} -fi - -if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then - # avg n best model - avg.sh exp/${ckpt}/checkpoints ${avg_num} -fi - -if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then - # test ckpt avg_n - CUDA_VISIBLE_DEVICES=${gpus} ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${model_type} || exit -1 -fi - -if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then - # export ckpt avg_n - CUDA_VISIBLE_DEVICES=${gpus} ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit ${model_type} -fi From 722c55e4c5fe61af75dc581070273ee1ff292f1d Mon Sep 17 00:00:00 2001 From: huangyuxin Date: Wed, 4 Aug 2021 12:57:31 +0000 Subject: [PATCH 19/24] reconstruct the rnn state, from list to tensor --- deepspeech/exps/deepspeech2/model.py | 14 +-- deepspeech/models/ds2_online/deepspeech2.py | 106 +++++++++++++++----- tests/deepspeech2_online_model_test.py | 10 +- 3 files changed, 92 insertions(+), 38 deletions(-) diff --git a/deepspeech/exps/deepspeech2/model.py b/deepspeech/exps/deepspeech2/model.py index 4acfad86b..51ef1de47 100644 --- a/deepspeech/exps/deepspeech2/model.py +++ b/deepspeech/exps/deepspeech2/model.py @@ -376,16 +376,10 @@ class DeepSpeech2Tester(DeepSpeech2Trainer): dtype='float32'), # audio, [B,T,D] paddle.static.InputSpec(shape=[None], dtype='int64'), # audio_length, [B] - [ - ( - paddle.static.InputSpec( - shape=[None, None, None], dtype='float32' - ), #num_rnn_layers * num_dirctions, rnn_size - paddle.static.InputSpec( - shape=[None, None, None], dtype='float32' - ) #num_rnn_layers * num_dirctions, rnn_size - ) for i in range(self.config.model.num_rnn_layers) - ] + paddle.static.InputSpec( + shape=[None, None, None], dtype='float32'), + paddle.static.InputSpec( + shape=[None, None, None], dtype='float32') ]) else: raise Exception("wrong model type") diff --git a/deepspeech/models/ds2_online/deepspeech2.py b/deepspeech/models/ds2_online/deepspeech2.py index bed9c41d3..b42ac8ec1 100644 --- a/deepspeech/models/ds2_online/deepspeech2.py +++ 
b/deepspeech/models/ds2_online/deepspeech2.py @@ -48,6 +48,7 @@ class CRNNEncoder(nn.Layer): self.num_fc_layers = num_fc_layers self.rnn_direction = rnn_direction self.fc_layers_size_list = fc_layers_size_list + self.use_gru = use_gru self.conv = Conv2dSubsampling4Online(feat_size, 32, dropout_rate=0.0) i_size = self.conv.output_dim @@ -96,7 +97,8 @@ class CRNNEncoder(nn.Layer): Returns: x (Tensor): encoder outputs, [B, T_output, D] x_lens (Tensor): encoder length, [B] - final_state_list: list of final_states for RNN layers, [num_directions, batch_size, hidden_size] * num_rnn_layers + final_state_h_box(Tensor): final_states h for RNN layers, num_rnn_layers * num_directions, batch_size, hidden_size + final_state_c_box(Tensor): final_states c for RNN layers, num_rnn_layers * num_directions, batch_size, hidden_size """ # [B, T, D] # convolution group @@ -118,32 +120,79 @@ class CRNNEncoder(nn.Layer): for i in range(self.num_fc_layers): x = self.fc_layers_list[i](x) x = F.relu(x) - return x, x_lens, final_state_list - def forward_chunk(self, x, x_lens, init_state_list): + if self.use_gru == True: + final_state_h_box = paddle.concat(final_state_list, axis=0) + final_state_c_box = paddle.zeros_like(final_state_h_box) + else: + final_state_h_list = [ + final_state_list[i][0] for i in range(self.num_rnn_layers) + ] + final_state_c_list = [ + final_state_list[i][1] for i in range(self.num_rnn_layers) + ] + final_state_h_box = paddle.concat(final_state_h_list, axis=0) + final_state_c_box = paddle.concat(final_state_c_list, axis=0) + + return x, x_lens, final_state_h_box, final_state_c_box + + def forward_chunk(self, x, x_lens, init_state_h_box, init_state_c_box): """Compute Encoder outputs Args: - x (Tensor): [B, feature_chunk_size, D] + x (Tensor): [B, feature_size, D] x_lens (Tensor): [B] - init_state_list (list of Tensors): [ num_directions, batch_size, hidden_size] * num_rnn_layers + init_state_h_box(Tensor): init_states h for RNN layers, num_rnn_layers * num_directions, batch_size, hidden_size + init_state_c_box(Tensor): init_states c for RNN layers, num_rnn_layers * num_directions, batch_size, hidden_size Returns: - x (Tensor): encoder outputs, [B, chunk_size, D] + x (Tensor): encoder outputs, [B, size, D] x_lens (Tensor): encoder length, [B] - chunk_final_state_list: list of final_states for RNN layers, [num_directions, batch_size, hidden_size] * num_rnn_layers + final_state_h_box(Tensor): final_states h for RNN layers, num_rnn_layers * num_directions, batch_size, hidden_size + final_state_c_box(Tensor): final_states c for RNN layers, num_rnn_layers * num_directions, batch_size, hidden_size """ + if init_state_h_box is not None: + init_state_list = None + init_state_h_list = paddle.split( + init_state_h_box, self.num_rnn_layers, axis=0) + init_state_c_list = paddle.split( + init_state_c_box, self.num_rnn_layers, axis=0) + if self.use_gru == True: + init_state_list = init_state_h_list + else: + init_state_list = [(init_state_h_list[i], init_state_c_list[i]) + for i in range(self.num_rnn_layers)] + else: + init_state_list = [None] * self.num_rnn_layers + x, x_lens = self.conv(x, x_lens) - chunk_final_state_list = [] + final_chunk_state_list = [] for i in range(0, self.num_rnn_layers): x, final_state = self.rnn[i](x, init_state_list[i], x_lens) #[B, T, D] - chunk_final_state_list.append(final_state) + final_chunk_state_list.append(final_state) x = self.layernorm_list[i](x) for i in range(self.num_fc_layers): x = self.fc_layers_list[i](x) x = F.relu(x) - return x, x_lens, chunk_final_state_list + 
+ if self.use_gru == True: + final_chunk_state_h_box = paddle.concat( + final_chunk_state_list, axis=0) + final_chunk_state_c_box = paddle.zeros_like(final_chunk_state_h_box) + else: + final_chunk_state_h_list = [ + final_chunk_state_list[i][0] for i in range(self.num_rnn_layers) + ] + final_chunk_state_c_list = [ + final_chunk_state_list[i][1] for i in range(self.num_rnn_layers) + ] + final_chunk_state_h_box = paddle.concat( + final_chunk_state_h_list, axis=0) + final_chunk_state_c_box = paddle.concat( + final_chunk_state_c_list, axis=0) + + return x, x_lens, final_chunk_state_h_box, final_chunk_state_c_box def forward_chunk_by_chunk(self, x, x_lens, decoder_chunk_size=8): """Compute Encoder outputs @@ -153,9 +202,10 @@ class CRNNEncoder(nn.Layer): x_lens (Tensor): [B] decoder_chunk_size: The chunk size of decoder Returns: - eouts_chunk_list (List of Tensor): The list of encoder outputs in chunk_size, [B, chunk_size, D] * num_chunks - eouts_chunk_lens_list (List of Tensor): The list of encoder length in chunk_size, [B] * num_chunks - final_chunk_state_list: list of final_states for RNN layers, [num_directions, batch_size, hidden_size] * num_rnn_layers + eouts_list (List of Tensor): The list of encoder outputs in chunk_size, [B, chunk_size, D] * num_chunks + eouts_lens_list (List of Tensor): The list of encoder length in chunk_size, [B] * num_chunks + final_state_h_box(Tensor): final_states h for RNN layers, num_rnn_layers * num_directions, batch_size, hidden_size + final_state_c_box(Tensor): final_states c for RNN layers, num_rnn_layers * num_directions, batch_size, hidden_size """ subsampling_rate = self.conv.subsampling_rate receptive_field_length = self.conv.receptive_field_length @@ -173,8 +223,10 @@ class CRNNEncoder(nn.Layer): padded_x = paddle.concat([x, padding], axis=1) num_chunk = (max_len + padding_len - chunk_size) / chunk_stride + 1 num_chunk = int(num_chunk) - chunk_state_list = [None] * self.num_rnn_layers - final_chunk_state_list = None + chunk_state_h_box = None + chunk_state_c_box = None + final_state_h_box = None + final_state_c_box = None for i in range(0, num_chunk): start = i * chunk_stride end = start + chunk_size @@ -190,13 +242,14 @@ class CRNNEncoder(nn.Layer): x_chunk_lens = paddle.where(x_len_left < x_chunk_len_tmp, x_len_left, x_chunk_len_tmp) - eouts_chunk, eouts_chunk_lens, chunk_state_list = self.forward_chunk( - x_chunk, x_chunk_lens, chunk_state_list) + eouts_chunk, eouts_chunk_lens, chunk_state_h_box, chunk_state_c_box = self.forward_chunk( + x_chunk, x_chunk_lens, chunk_state_h_box, chunk_state_c_box) eouts_chunk_list.append(eouts_chunk) eouts_chunk_lens_list.append(eouts_chunk_lens) - final_chunk_state_list = chunk_state_list - return eouts_chunk_list, eouts_chunk_lens_list, final_chunk_state_list + final_state_h_box = chunk_state_h_box + final_state_c_box = chunk_state_c_box + return eouts_chunk_list, eouts_chunk_lens_list, final_state_h_box, final_state_c_box class DeepSpeech2ModelOnline(nn.Layer): @@ -283,7 +336,8 @@ class DeepSpeech2ModelOnline(nn.Layer): Returns: loss (Tenosr): [1] """ - eouts, eouts_len, final_state_list = self.encoder(audio, audio_len) + eouts, eouts_len, final_state_h_box, final_state_c_box = self.encoder( + audio, audio_len) loss = self.decoder(eouts, eouts_len, text, text_len) return loss @@ -300,7 +354,8 @@ class DeepSpeech2ModelOnline(nn.Layer): vocab_list=vocab_list, decoding_method=decoding_method) - eouts, eouts_len, final_state_list = self.encoder(audio, audio_len) + eouts, eouts_len, final_state_h_box, 
final_state_c_box = self.encoder( + audio, audio_len) probs = self.decoder.softmax(eouts) return self.decoder.decode_probs( probs.numpy(), eouts_len, vocab_list, decoding_method, @@ -363,8 +418,9 @@ class DeepSpeech2InferModelOnline(DeepSpeech2ModelOnline): fc_layers_size_list=fc_layers_size_list, use_gru=use_gru) - def forward(self, audio_chunk, audio_chunk_lens, chunk_state_list): - eouts_chunk, eouts_chunk_lens, final_state_list = self.encoder.forward_chunk( - audio_chunk, audio_chunk_lens, chunk_state_list) + def forward(self, audio_chunk, audio_chunk_lens, chunk_state_h_box, + chunk_state_c_box): + eouts_chunk, eouts_chunk_lens, final_state_h_box, final_state_c_box = self.encoder.forward_chunk( + audio_chunk, audio_chunk_lens, chunk_state_h_box, chunk_state_c_box) probs_chunk = self.decoder.softmax(eouts_chunk) - return probs_chunk, eouts_chunk_lens, final_state_list + return probs_chunk, eouts_chunk_lens, final_state_h_box, final_state_c_box diff --git a/tests/deepspeech2_online_model_test.py b/tests/deepspeech2_online_model_test.py index ce235cd6d..fd1dfc4b5 100644 --- a/tests/deepspeech2_online_model_test.py +++ b/tests/deepspeech2_online_model_test.py @@ -119,9 +119,9 @@ class TestDeepSpeech2ModelOnline(unittest.TestCase): paddle.device.set_device("cpu") de_ch_size = 9 - eouts, eouts_lens, final_state_list = model.encoder(self.audio, - self.audio_len) - eouts_by_chk_list, eouts_lens_by_chk_list, final_state_list_by_chk = model.encoder.forward_chunk_by_chunk( + eouts, eouts_lens, final_state_h_box, final_state_c_box = model.encoder( + self.audio, self.audio_len) + eouts_by_chk_list, eouts_lens_by_chk_list, final_state_h_box_chk, final_state_c_box_chk = model.encoder.forward_chunk_by_chunk( self.audio, self.audio_len, de_ch_size) eouts_by_chk = paddle.concat(eouts_by_chk_list, axis=1) eouts_lens_by_chk = paddle.add_n(eouts_lens_by_chk_list) @@ -134,6 +134,10 @@ class TestDeepSpeech2ModelOnline(unittest.TestCase): self.assertEqual( paddle.sum(paddle.abs(paddle.subtract(eouts, eouts_by_chk))), 0) self.assertEqual(paddle.allclose(eouts_by_chk, eouts), True) + self.assertEqual( + paddle.allclose(final_state_h_box, final_state_h_box_chk), True) + self.assertEqual( + paddle.allclose(final_state_c_box, final_state_c_box_chk), True) """ print ("conv_x", conv_x) print ("conv_x_by_chk", conv_x_by_chk) From 61d854045191c8ef853e31c2a18f3587f362bc08 Mon Sep 17 00:00:00 2001 From: huangyuxin Date: Thu, 5 Aug 2021 04:36:56 +0000 Subject: [PATCH 20/24] reconstruct the export function and the run.sh in aishell and librispeech --- deepspeech/exps/deepspeech2/model.py | 16 ++-------------- deepspeech/models/ds2_online/deepspeech2.py | 17 +++++++++++++++++ examples/aishell/s0/local/export.sh | 9 +++++---- examples/aishell/s0/local/test.sh | 8 +++++--- examples/aishell/s0/local/train.sh | 8 +++++--- examples/aishell/s0/run.sh | 7 ++++--- examples/librispeech/s0/local/export.sh | 9 +++++---- examples/librispeech/s0/local/test.sh | 8 +++++--- examples/librispeech/s0/local/train.sh | 8 +++++--- examples/librispeech/s0/run.sh | 7 ++++--- 10 files changed, 57 insertions(+), 40 deletions(-) diff --git a/deepspeech/exps/deepspeech2/model.py b/deepspeech/exps/deepspeech2/model.py index 51ef1de47..de5ff5f44 100644 --- a/deepspeech/exps/deepspeech2/model.py +++ b/deepspeech/exps/deepspeech2/model.py @@ -367,20 +367,8 @@ class DeepSpeech2Tester(DeepSpeech2Trainer): dtype='int64'), # audio_length, [B] ]) elif self.args.model_type == 'online': - static_model = paddle.jit.to_static( - infer_model, - input_spec=[ - 
paddle.static.InputSpec( - shape=[None, None, - feat_dim], #[B, chunk_size, feat_dim] - dtype='float32'), # audio, [B,T,D] - paddle.static.InputSpec(shape=[None], - dtype='int64'), # audio_length, [B] - paddle.static.InputSpec( - shape=[None, None, None], dtype='float32'), - paddle.static.InputSpec( - shape=[None, None, None], dtype='float32') - ]) + static_model = DeepSpeech2InferModelOnline.export(infer_model, + feat_dim) else: raise Exception("wrong model type") logger.info(f"Export code: {static_model.forward.code}") diff --git a/deepspeech/models/ds2_online/deepspeech2.py b/deepspeech/models/ds2_online/deepspeech2.py index b42ac8ec1..ad8a0506f 100644 --- a/deepspeech/models/ds2_online/deepspeech2.py +++ b/deepspeech/models/ds2_online/deepspeech2.py @@ -424,3 +424,20 @@ class DeepSpeech2InferModelOnline(DeepSpeech2ModelOnline): audio_chunk, audio_chunk_lens, chunk_state_h_box, chunk_state_c_box) probs_chunk = self.decoder.softmax(eouts_chunk) return probs_chunk, eouts_chunk_lens, final_state_h_box, final_state_c_box + + @classmethod + def export(self, infer_model, feat_dim): + static_model = paddle.jit.to_static( + infer_model, + input_spec=[ + paddle.static.InputSpec( + shape=[None, None, feat_dim], #[B, chunk_size, feat_dim] + dtype='float32'), # audio, [B,T,D] + paddle.static.InputSpec(shape=[None], + dtype='int64'), # audio_length, [B] + paddle.static.InputSpec( + shape=[None, None, None], dtype='float32'), + paddle.static.InputSpec( + shape=[None, None, None], dtype='float32') + ]) + return static_model diff --git a/examples/aishell/s0/local/export.sh b/examples/aishell/s0/local/export.sh index f99a15bad..2e09e5f5e 100755 --- a/examples/aishell/s0/local/export.sh +++ b/examples/aishell/s0/local/export.sh @@ -1,7 +1,7 @@ #!/bin/bash -if [ $# != 3 ];then - echo "usage: $0 config_path ckpt_prefix jit_model_path" +if [ $# != 4 ];then + echo "usage: $0 config_path ckpt_prefix jit_model_path model_type" exit -1 fi @@ -11,6 +11,7 @@ echo "using $ngpu gpus..." config_path=$1 ckpt_path_prefix=$2 jit_model_export_path=$3 +model_type=$4 device=gpu if [ ${ngpu} == 0 ];then @@ -22,8 +23,8 @@ python3 -u ${BIN_DIR}/export.py \ --nproc ${ngpu} \ --config ${config_path} \ --checkpoint_path ${ckpt_path_prefix} \ ---export_path ${jit_model_export_path} - +--export_path ${jit_model_export_path} \ +--model_type ${model_type} if [ $? -ne 0 ]; then echo "Failed in export!" diff --git a/examples/aishell/s0/local/test.sh b/examples/aishell/s0/local/test.sh index fd9cb5661..9fd0bc8d5 100755 --- a/examples/aishell/s0/local/test.sh +++ b/examples/aishell/s0/local/test.sh @@ -1,7 +1,7 @@ #!/bin/bash -if [ $# != 2 ];then - echo "usage: ${0} config_path ckpt_path_prefix" +if [ $# != 3 ];then + echo "usage: ${0} config_path ckpt_path_prefix model_type" exit -1 fi @@ -14,6 +14,7 @@ if [ ${ngpu} == 0 ];then fi config_path=$1 ckpt_prefix=$2 +model_type=$3 # download language model bash local/download_lm_ch.sh @@ -26,7 +27,8 @@ python3 -u ${BIN_DIR}/test.py \ --nproc 1 \ --config ${config_path} \ --result_file ${ckpt_prefix}.rsl \ ---checkpoint_path ${ckpt_prefix} +--checkpoint_path ${ckpt_prefix} \ +--model_type ${model_type} if [ $? -ne 0 ]; then echo "Failed in evaluation!" 
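With model_type threaded through, the aishell helper scripts above take an extra trailing positional argument. A minimal usage sketch for the offline recipe follows; the conf path and checkpoint prefix mirror the defaults in run.sh and are shown only for illustration:

# decode the averaged checkpoint, then export it to a jit model (offline recipe)
CUDA_VISIBLE_DEVICES=0 ./local/test.sh conf/deepspeech2.yaml exp/deepspeech2/checkpoints/avg_1 offline
CUDA_VISIBLE_DEVICES=0 ./local/export.sh conf/deepspeech2.yaml exp/deepspeech2/checkpoints/avg_1 exp/deepspeech2/checkpoints/avg_1.jit offline
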
diff --git a/examples/aishell/s0/local/train.sh b/examples/aishell/s0/local/train.sh index f6bd2c983..c6a631800 100755 --- a/examples/aishell/s0/local/train.sh +++ b/examples/aishell/s0/local/train.sh @@ -1,7 +1,7 @@ #!/bin/bash -if [ $# != 2 ];then - echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name" +if [ $# != 3 ];then + echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name model_type" exit -1 fi @@ -10,6 +10,7 @@ echo "using $ngpu gpus..." config_path=$1 ckpt_name=$2 +model_type=$3 device=gpu if [ ${ngpu} == 0 ];then @@ -22,7 +23,8 @@ python3 -u ${BIN_DIR}/train.py \ --device ${device} \ --nproc ${ngpu} \ --config ${config_path} \ ---output exp/${ckpt_name} +--output exp/${ckpt_name} \ +--model_type ${model_type} if [ $? -ne 0 ]; then echo "Failed in training!" diff --git a/examples/aishell/s0/run.sh b/examples/aishell/s0/run.sh index c9708dcc9..7cd63999c 100755 --- a/examples/aishell/s0/run.sh +++ b/examples/aishell/s0/run.sh @@ -7,6 +7,7 @@ stage=0 stop_stage=100 conf_path=conf/deepspeech2.yaml avg_num=1 +model_type=offline source ${MAIN_ROOT}/utils/parse_options.sh || exit 1; @@ -21,7 +22,7 @@ fi if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then # train model, all `ckpt` under `exp` dir - CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt} + CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt} ${model_type} fi if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then @@ -31,10 +32,10 @@ fi if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then # test ckpt avg_n - CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 + CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${model_type}|| exit -1 fi if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then # export ckpt avg_n - CUDA_VISIBLE_DEVICES=0 ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit + CUDA_VISIBLE_DEVICES=0 ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit ${model_type} fi diff --git a/examples/librispeech/s0/local/export.sh b/examples/librispeech/s0/local/export.sh index f99a15bad..2e09e5f5e 100755 --- a/examples/librispeech/s0/local/export.sh +++ b/examples/librispeech/s0/local/export.sh @@ -1,7 +1,7 @@ #!/bin/bash -if [ $# != 3 ];then - echo "usage: $0 config_path ckpt_prefix jit_model_path" +if [ $# != 4 ];then + echo "usage: $0 config_path ckpt_prefix jit_model_path model_type" exit -1 fi @@ -11,6 +11,7 @@ echo "using $ngpu gpus..." config_path=$1 ckpt_path_prefix=$2 jit_model_export_path=$3 +model_type=$4 device=gpu if [ ${ngpu} == 0 ];then @@ -22,8 +23,8 @@ python3 -u ${BIN_DIR}/export.py \ --nproc ${ngpu} \ --config ${config_path} \ --checkpoint_path ${ckpt_path_prefix} \ ---export_path ${jit_model_export_path} - +--export_path ${jit_model_export_path} \ +--model_type ${model_type} if [ $? -ne 0 ]; then echo "Failed in export!" 
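Because the aishell run.sh above sources ${MAIN_ROOT}/utils/parse_options.sh before reading these variables, model_type should be switchable from the command line like conf_path and avg_num. A hedged sketch, assuming the usual option-to-variable mapping of parse_options.sh and the online config added later in this series:

# run the full aishell pipeline with the streaming (online) model
bash run.sh --conf_path conf/deepspeech2_online.yaml --model_type online
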
diff --git a/examples/librispeech/s0/local/test.sh b/examples/librispeech/s0/local/test.sh index 16a5e9ef0..b5b68c599 100755 --- a/examples/librispeech/s0/local/test.sh +++ b/examples/librispeech/s0/local/test.sh @@ -1,7 +1,7 @@ #!/bin/bash -if [ $# != 2 ];then - echo "usage: ${0} config_path ckpt_path_prefix" +if [ $# != 3 ];then + echo "usage: ${0} config_path ckpt_path_prefix model_type" exit -1 fi @@ -14,6 +14,7 @@ if [ ${ngpu} == 0 ];then fi config_path=$1 ckpt_prefix=$2 +model_type=$3 # download language model bash local/download_lm_en.sh @@ -26,7 +27,8 @@ python3 -u ${BIN_DIR}/test.py \ --nproc 1 \ --config ${config_path} \ --result_file ${ckpt_prefix}.rsl \ ---checkpoint_path ${ckpt_prefix} +--checkpoint_path ${ckpt_prefix} \ +--model_type ${model_type} if [ $? -ne 0 ]; then echo "Failed in evaluation!" diff --git a/examples/librispeech/s0/local/train.sh b/examples/librispeech/s0/local/train.sh index f3eb98daf..039b9cea4 100755 --- a/examples/librispeech/s0/local/train.sh +++ b/examples/librispeech/s0/local/train.sh @@ -1,7 +1,7 @@ #!/bin/bash -if [ $# != 2 ];then - echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name" +if [ $# != 3 ];then + echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name model_type" exit -1 fi @@ -10,6 +10,7 @@ echo "using $ngpu gpus..." config_path=$1 ckpt_name=$2 +model_type=$3 device=gpu if [ ${ngpu} == 0 ];then @@ -23,7 +24,8 @@ python3 -u ${BIN_DIR}/train.py \ --device ${device} \ --nproc ${ngpu} \ --config ${config_path} \ ---output exp/${ckpt_name} +--output exp/${ckpt_name} \ +--model_type ${model_type} if [ $? -ne 0 ]; then echo "Failed in training!" diff --git a/examples/librispeech/s0/run.sh b/examples/librispeech/s0/run.sh index 6553e073d..c7902a56a 100755 --- a/examples/librispeech/s0/run.sh +++ b/examples/librispeech/s0/run.sh @@ -6,6 +6,7 @@ stage=0 stop_stage=100 conf_path=conf/deepspeech2.yaml avg_num=30 +model_type=offline source ${MAIN_ROOT}/utils/parse_options.sh || exit 1; avg_ckpt=avg_${avg_num} @@ -19,7 +20,7 @@ fi if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then # train model, all `ckpt` under `exp` dir - CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 ./local/train.sh ${conf_path} ${ckpt} + CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 ./local/train.sh ${conf_path} ${ckpt} ${model_type} fi if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then @@ -29,10 +30,10 @@ fi if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then # test ckpt avg_n - CUDA_VISIBLE_DEVICES=7 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 + CUDA_VISIBLE_DEVICES=7 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${model_type} || exit -1 fi if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then # export ckpt avg_n - CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit + CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit ${model_type} fi From 319228653e9a360bccaf5cdbc1e3a413bc27a3d9 Mon Sep 17 00:00:00 2001 From: huangyuxin Date: Thu, 5 Aug 2021 05:27:45 +0000 Subject: [PATCH 21/24] fix some small mistakes --- deepspeech/exps/deepspeech2/model.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/deepspeech/exps/deepspeech2/model.py b/deepspeech/exps/deepspeech2/model.py index de5ff5f44..03974e1bb 100644 --- a/deepspeech/exps/deepspeech2/model.py +++ b/deepspeech/exps/deepspeech2/model.py @@ -139,9 +139,10 @@ class DeepSpeech2Trainer(Trainer): 
dict_size=self.train_loader.collate_fn.vocab_size, num_conv_layers=config.model.num_conv_layers, num_rnn_layers=config.model.num_rnn_layers, + rnn_size=config.model.rnn_layer_size, + rnn_direction=config.model.rnn_direction, num_fc_layers=config.model.num_fc_layers, fc_layers_size_list=config.model.fc_layers_size_list, - rnn_size=config.model.rnn_layer_size, use_gru=config.model.use_gru) else: raise Exception("wrong model type") @@ -411,9 +412,10 @@ class DeepSpeech2Tester(DeepSpeech2Trainer): dict_size=self.test_loader.collate_fn.vocab_size, num_conv_layers=config.model.num_conv_layers, num_rnn_layers=config.model.num_rnn_layers, + rnn_size=config.model.rnn_layer_size, + rnn_direction=config.model.rnn_direction, num_fc_layers=config.model.num_fc_layers, fc_layers_size_list=config.model.fc_layers_size_list, - rnn_size=config.model.rnn_layer_size, use_gru=config.model.use_gru) else: raise Exception("Wrong model type") From 85d502147530a16ea48a56865aea81eb6acdeb37 Mon Sep 17 00:00:00 2001 From: huangyuxin Date: Sun, 8 Aug 2021 11:04:45 +0000 Subject: [PATCH 22/24] reconstruct the exp/model.py and the model.export() --- deepspeech/exps/deepspeech2/config.py | 2 +- deepspeech/exps/deepspeech2/model.py | 27 +++++-- deepspeech/models/ds2_online/deepspeech2.py | 87 +++++---------------- 3 files changed, 43 insertions(+), 73 deletions(-) diff --git a/deepspeech/exps/deepspeech2/config.py b/deepspeech/exps/deepspeech2/config.py index 53358014c..38b7d0e4d 100644 --- a/deepspeech/exps/deepspeech2/config.py +++ b/deepspeech/exps/deepspeech2/config.py @@ -27,7 +27,7 @@ def get_cfg_defaults(model_type='offline'): _C.collator = SpeechCollator.params() _C.training = DeepSpeech2Trainer.params() _C.decoding = DeepSpeech2Tester.params() - if (model_type == 'offline'): + if model_type == 'offline': _C.model = DeepSpeech2Model.params() else: _C.model = DeepSpeech2ModelOnline.params() diff --git a/deepspeech/exps/deepspeech2/model.py b/deepspeech/exps/deepspeech2/model.py index 03974e1bb..03fe8c6f5 100644 --- a/deepspeech/exps/deepspeech2/model.py +++ b/deepspeech/exps/deepspeech2/model.py @@ -124,10 +124,23 @@ class DeepSpeech2Trainer(Trainer): def setup_model(self): config = self.config + if hasattr(self, "train_loader"): + config.defrost() + config.model.feat_size = self.train_loader.collate_fn.feature_size + config.model.dict_size = self.train_loader.collate_fn.vocab_size + config.freeze() + elif hasattr(self, "test_loader"): + config.defrost() + config.model.feat_size = self.test_loader.collate_fn.feature_size + config.model.dict_size = self.test_loader.collate_fn.vocab_size + config.freeze() + else: + raise Exception("Please setup the dataloader first") + if self.args.model_type == 'offline': model = DeepSpeech2Model( - feat_size=self.train_loader.collate_fn.feature_size, - dict_size=self.train_loader.collate_fn.vocab_size, + feat_size=config.model.feat_size, + dict_size=config.model.dict_size, num_conv_layers=config.model.num_conv_layers, num_rnn_layers=config.model.num_rnn_layers, rnn_size=config.model.rnn_layer_size, @@ -135,8 +148,8 @@ class DeepSpeech2Trainer(Trainer): share_rnn_weights=config.model.share_rnn_weights) elif self.args.model_type == 'online': model = DeepSpeech2ModelOnline( - feat_size=self.train_loader.collate_fn.feature_size, - dict_size=self.train_loader.collate_fn.vocab_size, + feat_size=config.model.feat_size, + dict_size=config.model.dict_size, num_conv_layers=config.model.num_conv_layers, num_rnn_layers=config.model.num_rnn_layers, rnn_size=config.model.rnn_layer_size, @@ -209,6 
+222,7 @@ class DeepSpeech2Trainer(Trainer): batch_sampler=batch_sampler, collate_fn=collate_fn_train, num_workers=config.collator.num_workers) + print("feature_size", self.train_loader.collate_fn.feature_size) self.valid_loader = DataLoader( dev_dataset, batch_size=config.collator.batch_size, @@ -368,8 +382,7 @@ class DeepSpeech2Tester(DeepSpeech2Trainer): dtype='int64'), # audio_length, [B] ]) elif self.args.model_type == 'online': - static_model = DeepSpeech2InferModelOnline.export(infer_model, - feat_dim) + static_model = infer_model.export() else: raise Exception("wrong model type") logger.info(f"Export code: {static_model.forward.code}") @@ -395,6 +408,7 @@ class DeepSpeech2Tester(DeepSpeech2Trainer): self.iteration = 0 self.epoch = 0 + ''' def setup_model(self): config = self.config if self.args.model_type == 'offline': @@ -422,6 +436,7 @@ class DeepSpeech2Tester(DeepSpeech2Trainer): self.model = model logger.info("Setup model!") + ''' def setup_dataloader(self): config = self.config.clone() diff --git a/deepspeech/models/ds2_online/deepspeech2.py b/deepspeech/models/ds2_online/deepspeech2.py index ad8a0506f..3c82f3250 100644 --- a/deepspeech/models/ds2_online/deepspeech2.py +++ b/deepspeech/models/ds2_online/deepspeech2.py @@ -88,55 +88,7 @@ class CRNNEncoder(nn.Layer): def output_size(self): return self.fc_layers_size_list[-1] - def forward(self, x, x_lens): - """Compute Encoder outputs - - Args: - x (Tensor): [B, T_input, D] - x_lens (Tensor): [B] - Returns: - x (Tensor): encoder outputs, [B, T_output, D] - x_lens (Tensor): encoder length, [B] - final_state_h_box(Tensor): final_states h for RNN layers, num_rnn_layers * num_directions, batch_size, hidden_size - final_state_c_box(Tensor): final_states c for RNN layers, num_rnn_layers * num_directions, batch_size, hidden_size - """ - # [B, T, D] - # convolution group - x, x_lens = self.conv(x, x_lens) - # convert data from convolution feature map to sequence of vectors - #B, C, D, T = paddle.shape(x) # not work under jit - #x = x.transpose([0, 3, 1, 2]) #[B, T, C, D] - #x = x.reshape([B, T, C * D]) #[B, T, C*D] # not work under jit - #x = x.reshape([0, 0, -1]) #[B, T, C*D] - - # remove padding part - init_state = None - final_state_list = [] - for i in range(0, self.num_rnn_layers): - x, final_state = self.rnn[i](x, init_state, x_lens) #[B, T, D] - final_state_list.append(final_state) - x = self.layernorm_list[i](x) - - for i in range(self.num_fc_layers): - x = self.fc_layers_list[i](x) - x = F.relu(x) - - if self.use_gru == True: - final_state_h_box = paddle.concat(final_state_list, axis=0) - final_state_c_box = paddle.zeros_like(final_state_h_box) - else: - final_state_h_list = [ - final_state_list[i][0] for i in range(self.num_rnn_layers) - ] - final_state_c_list = [ - final_state_list[i][1] for i in range(self.num_rnn_layers) - ] - final_state_h_box = paddle.concat(final_state_h_list, axis=0) - final_state_c_box = paddle.concat(final_state_c_list, axis=0) - - return x, x_lens, final_state_h_box, final_state_c_box - - def forward_chunk(self, x, x_lens, init_state_h_box, init_state_c_box): + def forward(self, x, x_lens, init_state_h_box=None, init_state_c_box=None): """Compute Encoder outputs Args: @@ -152,13 +104,16 @@ class CRNNEncoder(nn.Layer): """ if init_state_h_box is not None: init_state_list = None - init_state_h_list = paddle.split( - init_state_h_box, self.num_rnn_layers, axis=0) - init_state_c_list = paddle.split( - init_state_c_box, self.num_rnn_layers, axis=0) + if self.use_gru == True: + init_state_h_list = 
paddle.split( + init_state_h_box, self.num_rnn_layers, axis=0) init_state_list = init_state_h_list else: + init_state_h_list = paddle.split( + init_state_h_box, self.num_rnn_layers, axis=0) + init_state_c_list = paddle.split( + init_state_c_box, self.num_rnn_layers, axis=0) init_state_list = [(init_state_h_list[i], init_state_c_list[i]) for i in range(self.num_rnn_layers)] else: @@ -179,7 +134,7 @@ class CRNNEncoder(nn.Layer): if self.use_gru == True: final_chunk_state_h_box = paddle.concat( final_chunk_state_list, axis=0) - final_chunk_state_c_box = paddle.zeros_like(final_chunk_state_h_box) + final_chunk_state_c_box = init_state_c_box #paddle.zeros_like(final_chunk_state_h_box) else: final_chunk_state_h_list = [ final_chunk_state_list[i][0] for i in range(self.num_rnn_layers) @@ -242,13 +197,13 @@ class CRNNEncoder(nn.Layer): x_chunk_lens = paddle.where(x_len_left < x_chunk_len_tmp, x_len_left, x_chunk_len_tmp) - eouts_chunk, eouts_chunk_lens, chunk_state_h_box, chunk_state_c_box = self.forward_chunk( + eouts_chunk, eouts_chunk_lens, chunk_state_h_box, chunk_state_c_box = self.forward( x_chunk, x_chunk_lens, chunk_state_h_box, chunk_state_c_box) eouts_chunk_list.append(eouts_chunk) eouts_chunk_lens_list.append(eouts_chunk_lens) - final_state_h_box = chunk_state_h_box - final_state_c_box = chunk_state_c_box + final_state_h_box = chunk_state_h_box + final_state_c_box = chunk_state_c_box return eouts_chunk_list, eouts_chunk_lens_list, final_state_h_box, final_state_c_box @@ -297,7 +252,7 @@ class DeepSpeech2ModelOnline(nn.Layer): feat_size, dict_size, num_conv_layers=2, - num_rnn_layers=3, + num_rnn_layers=4, rnn_size=1024, rnn_direction='forward', num_fc_layers=2, @@ -337,7 +292,7 @@ class DeepSpeech2ModelOnline(nn.Layer): loss (Tenosr): [1] """ eouts, eouts_len, final_state_h_box, final_state_c_box = self.encoder( - audio, audio_len) + audio, audio_len, None, None) loss = self.decoder(eouts, eouts_len, text, text_len) return loss @@ -355,7 +310,7 @@ class DeepSpeech2ModelOnline(nn.Layer): decoding_method=decoding_method) eouts, eouts_len, final_state_h_box, final_state_c_box = self.encoder( - audio, audio_len) + audio, audio_len, None, None) probs = self.decoder.softmax(eouts) return self.decoder.decode_probs( probs.numpy(), eouts_len, vocab_list, decoding_method, @@ -401,7 +356,7 @@ class DeepSpeech2InferModelOnline(DeepSpeech2ModelOnline): feat_size, dict_size, num_conv_layers=2, - num_rnn_layers=3, + num_rnn_layers=4, rnn_size=1024, rnn_direction='forward', num_fc_layers=2, @@ -420,18 +375,18 @@ class DeepSpeech2InferModelOnline(DeepSpeech2ModelOnline): def forward(self, audio_chunk, audio_chunk_lens, chunk_state_h_box, chunk_state_c_box): - eouts_chunk, eouts_chunk_lens, final_state_h_box, final_state_c_box = self.encoder.forward_chunk( + eouts_chunk, eouts_chunk_lens, final_state_h_box, final_state_c_box = self.encoder( audio_chunk, audio_chunk_lens, chunk_state_h_box, chunk_state_c_box) probs_chunk = self.decoder.softmax(eouts_chunk) return probs_chunk, eouts_chunk_lens, final_state_h_box, final_state_c_box - @classmethod - def export(self, infer_model, feat_dim): + def export(self): static_model = paddle.jit.to_static( - infer_model, + self, input_spec=[ paddle.static.InputSpec( - shape=[None, None, feat_dim], #[B, chunk_size, feat_dim] + shape=[None, None, self.encoder.feat_size + ], #[B, chunk_size, feat_dim] dtype='float32'), # audio, [B,T,D] paddle.static.InputSpec(shape=[None], dtype='int64'), # audio_length, [B] From 7a3d1641227cfaa06d40d17da1910bf3cee98306 Mon Sep 17 
00:00:00 2001 From: huangyuxin Date: Sun, 8 Aug 2021 11:57:17 +0000 Subject: [PATCH 23/24] fix the bidirect rnn, add deepspeech2.yaml for aishell, tiny, librispeech --- deepspeech/models/ds2_online/deepspeech2.py | 15 ++-- .../aishell/s0/conf/deepspeech2_online.yaml | 67 ++++++++++++++++++ .../s0/conf/deepspeech2_online.yaml | 67 ++++++++++++++++++ examples/tiny/s0/conf/deepspeech2_online.yaml | 69 +++++++++++++++++++ 4 files changed, 213 insertions(+), 5 deletions(-) create mode 100644 examples/aishell/s0/conf/deepspeech2_online.yaml create mode 100644 examples/librispeech/s0/conf/deepspeech2_online.yaml create mode 100644 examples/tiny/s0/conf/deepspeech2_online.yaml diff --git a/deepspeech/models/ds2_online/deepspeech2.py b/deepspeech/models/ds2_online/deepspeech2.py index 3c82f3250..75a6f044f 100644 --- a/deepspeech/models/ds2_online/deepspeech2.py +++ b/deepspeech/models/ds2_online/deepspeech2.py @@ -56,12 +56,17 @@ class CRNNEncoder(nn.Layer): self.rnn = nn.LayerList() self.layernorm_list = nn.LayerList() self.fc_layers_list = nn.LayerList() - layernorm_size = rnn_size + if rnn_direction == 'bidirect' or rnn_direction == 'bidirectional': + layernorm_size = 2 * rnn_size + elif rnn_direction == 'forward': + layernorm_size = rnn_size + else: + raise Exception("Wrong rnn direction") for i in range(0, num_rnn_layers): if i == 0: rnn_input_size = i_size else: - rnn_input_size = rnn_size + rnn_input_size = layernorm_size if use_gru == True: self.rnn.append( nn.GRU( @@ -78,7 +83,7 @@ class CRNNEncoder(nn.Layer): direction=rnn_direction)) self.layernorm_list.append(nn.LayerNorm(layernorm_size)) - fc_input_size = rnn_size + fc_input_size = layernorm_size for i in range(self.num_fc_layers): self.fc_layers_list.append( nn.Linear(fc_input_size, fc_layers_size_list[i])) @@ -385,8 +390,8 @@ class DeepSpeech2InferModelOnline(DeepSpeech2ModelOnline): self, input_spec=[ paddle.static.InputSpec( - shape=[None, None, self.encoder.feat_size - ], #[B, chunk_size, feat_dim] + shape=[None, None, + self.encoder.feat_size], #[B, chunk_size, feat_dim] dtype='float32'), # audio, [B,T,D] paddle.static.InputSpec(shape=[None], dtype='int64'), # audio_length, [B] diff --git a/examples/aishell/s0/conf/deepspeech2_online.yaml b/examples/aishell/s0/conf/deepspeech2_online.yaml new file mode 100644 index 000000000..60df8d175 --- /dev/null +++ b/examples/aishell/s0/conf/deepspeech2_online.yaml @@ -0,0 +1,67 @@ +# https://yaml.org/type/float.html +data: + train_manifest: data/manifest.train + dev_manifest: data/manifest.dev + test_manifest: data/manifest.test + min_input_len: 0.0 + max_input_len: 27.0 # second + min_output_len: 0.0 + max_output_len: .inf + min_output_input_ratio: 0.00 + max_output_input_ratio: .inf + +collator: + batch_size: 32 # one gpu + mean_std_filepath: data/mean_std.json + unit_type: char + vocab_filepath: data/vocab.txt + augmentation_config: conf/augmentation.json + random_seed: 0 + spm_model_prefix: + specgram_type: linear #linear, mfcc, fbank + feat_dim: + delta_delta: False + stride_ms: 10.0 + window_ms: 20.0 + n_fft: None + max_freq: None + target_sample_rate: 16000 + use_dB_normalization: True + target_dB: -20 + dither: 1.0 + keep_transcription_text: False + sortagrad: True + shuffle_method: batch_shuffle + num_workers: 0 + +model: + num_conv_layers: 2 + num_rnn_layers: 4 + rnn_layer_size: 1024 + rnn_direction: bidirect + num_fc_layers: 2 + fc_layers_size_list: 512, 256 + use_gru: True + +training: + n_epoch: 50 + lr: 2e-3 + lr_decay: 0.83 + weight_decay: 1e-06 + global_grad_clip: 3.0 + 
log_interval: 100 + checkpoint: + kbest_n: 50 + latest_n: 5 + +decoding: + batch_size: 64 + error_rate_type: cer + decoding_method: ctc_beam_search + lang_model_path: data/lm/zh_giga.no_cna_cmn.prune01244.klm + alpha: 1.9 + beta: 5.0 + beam_size: 300 + cutoff_prob: 0.99 + cutoff_top_n: 40 + num_proc_bsearch: 10 diff --git a/examples/librispeech/s0/conf/deepspeech2_online.yaml b/examples/librispeech/s0/conf/deepspeech2_online.yaml new file mode 100644 index 000000000..2e4aed40a --- /dev/null +++ b/examples/librispeech/s0/conf/deepspeech2_online.yaml @@ -0,0 +1,67 @@ +# https://yaml.org/type/float.html +data: + train_manifest: data/manifest.train + dev_manifest: data/manifest.dev-clean + test_manifest: data/manifest.test-clean + min_input_len: 0.0 + max_input_len: 27.0 # second + min_output_len: 0.0 + max_output_len: .inf + min_output_input_ratio: 0.00 + max_output_input_ratio: .inf + +collator: + batch_size: 20 + mean_std_filepath: data/mean_std.json + unit_type: char + vocab_filepath: data/vocab.txt + augmentation_config: conf/augmentation.json + random_seed: 0 + spm_model_prefix: + specgram_type: linear + target_sample_rate: 16000 + max_freq: None + n_fft: None + stride_ms: 10.0 + window_ms: 20.0 + delta_delta: False + dither: 1.0 + use_dB_normalization: True + target_dB: -20 + random_seed: 0 + keep_transcription_text: False + sortagrad: True + shuffle_method: batch_shuffle + num_workers: 0 + +model: + num_conv_layers: 2 + num_rnn_layers: 3 + rnn_layer_size: 2048 + rnn_direction: forward + num_fc_layers: 2 + fc_layers_size_list: 512, 256 + use_gru: False + +training: + n_epoch: 50 + lr: 1e-3 + lr_decay: 0.83 + weight_decay: 1e-06 + global_grad_clip: 5.0 + log_interval: 100 + checkpoint: + kbest_n: 50 + latest_n: 5 + +decoding: + batch_size: 128 + error_rate_type: wer + decoding_method: ctc_beam_search + lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm + alpha: 1.9 + beta: 0.3 + beam_size: 500 + cutoff_prob: 1.0 + cutoff_top_n: 40 + num_proc_bsearch: 8 diff --git a/examples/tiny/s0/conf/deepspeech2_online.yaml b/examples/tiny/s0/conf/deepspeech2_online.yaml new file mode 100644 index 000000000..333c2b9a9 --- /dev/null +++ b/examples/tiny/s0/conf/deepspeech2_online.yaml @@ -0,0 +1,69 @@ +# https://yaml.org/type/float.html +data: + train_manifest: data/manifest.tiny + dev_manifest: data/manifest.tiny + test_manifest: data/manifest.tiny + min_input_len: 0.0 + max_input_len: 27.0 + min_output_len: 0.0 + max_output_len: 400.0 + min_output_input_ratio: 0.05 + max_output_input_ratio: 10.0 + + +collator: + mean_std_filepath: data/mean_std.json + unit_type: char + vocab_filepath: data/vocab.txt + augmentation_config: conf/augmentation.json + random_seed: 0 + spm_model_prefix: + specgram_type: linear + feat_dim: + delta_delta: False + stride_ms: 10.0 + window_ms: 20.0 + n_fft: None + max_freq: None + target_sample_rate: 16000 + use_dB_normalization: True + target_dB: -20 + dither: 1.0 + keep_transcription_text: False + sortagrad: True + shuffle_method: batch_shuffle + num_workers: 0 + batch_size: 4 + +model: + num_conv_layers: 2 + num_rnn_layers: 4 + rnn_layer_size: 2048 + rnn_direction: forward + num_fc_layers: 2 + fc_layers_size_list: 512, 256 + use_gru: True + +training: + n_epoch: 10 + lr: 1e-5 + lr_decay: 1.0 + weight_decay: 1e-06 + global_grad_clip: 5.0 + log_interval: 1 + checkpoint: + kbest_n: 3 + latest_n: 2 + + +decoding: + batch_size: 128 + error_rate_type: wer + decoding_method: ctc_beam_search + lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm + alpha: 2.5 + 
beta: 0.3 + beam_size: 500 + cutoff_prob: 1.0 + cutoff_top_n: 40 + num_proc_bsearch: 8 From 718ae52e3ff3204ab02a3f45852ec47897873742 Mon Sep 17 00:00:00 2001 From: huangyuxin Date: Tue, 10 Aug 2021 09:28:23 +0000 Subject: [PATCH 24/24] add from_config function to ds2_oneline and ds2 --- deepspeech/exps/deepspeech2/model.py | 135 ++++-------------- deepspeech/models/ds2/deepspeech2.py | 33 +++++ deepspeech/models/ds2_online/deepspeech2.py | 44 ++++-- .../aishell/s0/conf/deepspeech2_online.yaml | 12 +- tests/deepspeech2_online_model_test.py | 119 ++++++--------- 5 files changed, 141 insertions(+), 202 deletions(-) diff --git a/deepspeech/exps/deepspeech2/model.py b/deepspeech/exps/deepspeech2/model.py index 03fe8c6f5..dfd812419 100644 --- a/deepspeech/exps/deepspeech2/model.py +++ b/deepspeech/exps/deepspeech2/model.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""Contains DeepSpeech2 model.""" +"""Contains DeepSpeech2 and DeepSpeech2Online model.""" import time from collections import defaultdict from pathlib import Path @@ -38,8 +38,6 @@ from deepspeech.utils import layer_tools from deepspeech.utils import mp_tools from deepspeech.utils.log import Autolog from deepspeech.utils.log import Log -#from deepspeech.models.ds2_online import DeepSpeech2InferModelOnline -#from deepspeech.models.ds2_online import DeepSpeech2ModelOnline logger = Log(__name__).getlog() @@ -123,40 +121,20 @@ class DeepSpeech2Trainer(Trainer): return total_loss, num_seen_utts def setup_model(self): - config = self.config - if hasattr(self, "train_loader"): - config.defrost() - config.model.feat_size = self.train_loader.collate_fn.feature_size - config.model.dict_size = self.train_loader.collate_fn.vocab_size - config.freeze() - elif hasattr(self, "test_loader"): - config.defrost() - config.model.feat_size = self.test_loader.collate_fn.feature_size - config.model.dict_size = self.test_loader.collate_fn.vocab_size - config.freeze() - else: - raise Exception("Please setup the dataloader first") + config = self.config.clone() + config.defrost() + assert (self.train_loader.collate_fn.feature_size == + self.test_loader.collate_fn.feature_size) + assert (self.train_loader.collate_fn.vocab_size == + self.test_loader.collate_fn.vocab_size) + config.model.feat_size = self.train_loader.collate_fn.feature_size + config.model.dict_size = self.train_loader.collate_fn.vocab_size + config.freeze() if self.args.model_type == 'offline': - model = DeepSpeech2Model( - feat_size=config.model.feat_size, - dict_size=config.model.dict_size, - num_conv_layers=config.model.num_conv_layers, - num_rnn_layers=config.model.num_rnn_layers, - rnn_size=config.model.rnn_layer_size, - use_gru=config.model.use_gru, - share_rnn_weights=config.model.share_rnn_weights) + model = DeepSpeech2Model.from_config(config.model) elif self.args.model_type == 'online': - model = DeepSpeech2ModelOnline( - feat_size=config.model.feat_size, - dict_size=config.model.dict_size, - num_conv_layers=config.model.num_conv_layers, - num_rnn_layers=config.model.num_rnn_layers, - rnn_size=config.model.rnn_layer_size, - rnn_direction=config.model.rnn_direction, - num_fc_layers=config.model.num_fc_layers, - fc_layers_size_list=config.model.fc_layers_size_list, - use_gru=config.model.use_gru) + model = DeepSpeech2ModelOnline.from_config(config.model) else: raise Exception("wrong model type") if self.parallel: @@ -194,6 +172,9 @@ 
class DeepSpeech2Trainer(Trainer): config.data.manifest = config.data.dev_manifest dev_dataset = ManifestDataset.from_config(config) + config.data.manifest = config.data.test_manifest + test_dataset = ManifestDataset.from_config(config) + if self.parallel: batch_sampler = SortagradDistributedBatchSampler( train_dataset, @@ -217,19 +198,29 @@ class DeepSpeech2Trainer(Trainer): config.collator.augmentation_config = "" collate_fn_dev = SpeechCollator.from_config(config) + + config.collator.keep_transcription_text = True + config.collator.augmentation_config = "" + collate_fn_test = SpeechCollator.from_config(config) + self.train_loader = DataLoader( train_dataset, batch_sampler=batch_sampler, collate_fn=collate_fn_train, num_workers=config.collator.num_workers) - print("feature_size", self.train_loader.collate_fn.feature_size) self.valid_loader = DataLoader( dev_dataset, batch_size=config.collator.batch_size, shuffle=False, drop_last=False, collate_fn=collate_fn_dev) - logger.info("Setup train/valid Dataloader!") + self.test_loader = DataLoader( + test_dataset, + batch_size=config.decoding.batch_size, + shuffle=False, + drop_last=False, + collate_fn=collate_fn_test) + logger.info("Setup train/valid/test Dataloader!") class DeepSpeech2Tester(DeepSpeech2Trainer): @@ -371,20 +362,7 @@ class DeepSpeech2Tester(DeepSpeech2Trainer): infer_model.eval() feat_dim = self.test_loader.collate_fn.feature_size - if self.args.model_type == 'offline': - static_model = paddle.jit.to_static( - infer_model, - input_spec=[ - paddle.static.InputSpec( - shape=[None, None, feat_dim], - dtype='float32'), # audio, [B,T,D] - paddle.static.InputSpec(shape=[None], - dtype='int64'), # audio_length, [B] - ]) - elif self.args.model_type == 'online': - static_model = infer_model.export() - else: - raise Exception("wrong model type") + static_model = infer_model.export() logger.info(f"Export code: {static_model.forward.code}") paddle.jit.save(static_model, self.args.export_path) @@ -408,63 +386,6 @@ class DeepSpeech2Tester(DeepSpeech2Trainer): self.iteration = 0 self.epoch = 0 - ''' - def setup_model(self): - config = self.config - if self.args.model_type == 'offline': - model = DeepSpeech2Model( - feat_size=self.test_loader.collate_fn.feature_size, - dict_size=self.test_loader.collate_fn.vocab_size, - num_conv_layers=config.model.num_conv_layers, - num_rnn_layers=config.model.num_rnn_layers, - rnn_size=config.model.rnn_layer_size, - use_gru=config.model.use_gru, - share_rnn_weights=config.model.share_rnn_weights) - elif self.args.model_type == 'online': - model = DeepSpeech2ModelOnline( - feat_size=self.test_loader.collate_fn.feature_size, - dict_size=self.test_loader.collate_fn.vocab_size, - num_conv_layers=config.model.num_conv_layers, - num_rnn_layers=config.model.num_rnn_layers, - rnn_size=config.model.rnn_layer_size, - rnn_direction=config.model.rnn_direction, - num_fc_layers=config.model.num_fc_layers, - fc_layers_size_list=config.model.fc_layers_size_list, - use_gru=config.model.use_gru) - else: - raise Exception("Wrong model type") - - self.model = model - logger.info("Setup model!") - ''' - - def setup_dataloader(self): - config = self.config.clone() - config.defrost() - # return raw text - - config.data.manifest = config.data.test_manifest - # filter test examples, will cause less examples, but no mismatch with training - # and can use large batch size , save training time, so filter test egs now. 
- # config.data.min_input_len = 0.0 # second - # config.data.max_input_len = float('inf') # second - # config.data.min_output_len = 0.0 # tokens - # config.data.max_output_len = float('inf') # tokens - # config.data.min_output_input_ratio = 0.00 - # config.data.max_output_input_ratio = float('inf') - test_dataset = ManifestDataset.from_config(config) - - config.collator.keep_transcription_text = True - config.collator.augmentation_config = "" - # return text ord id - self.test_loader = DataLoader( - test_dataset, - batch_size=config.decoding.batch_size, - shuffle=False, - drop_last=False, - collate_fn=SpeechCollator.from_config(config)) - logger.info("Setup test Dataloader!") - def setup_output_dir(self): """Create a directory used for output. """ diff --git a/deepspeech/models/ds2/deepspeech2.py b/deepspeech/models/ds2/deepspeech2.py index 8d737e800..1ffd797b4 100644 --- a/deepspeech/models/ds2/deepspeech2.py +++ b/deepspeech/models/ds2/deepspeech2.py @@ -228,6 +228,27 @@ class DeepSpeech2Model(nn.Layer): layer_tools.summary(model) return model + @classmethod + def from_config(cls, config): + """Build a DeepSpeec2Model from config + Parameters + + config: yacs.config.CfgNode + config.model + Returns + ------- + DeepSpeech2Model + The model built from config. + """ + model = cls(feat_size=config.feat_size, + dict_size=config.dict_size, + num_conv_layers=config.num_conv_layers, + num_rnn_layers=config.num_rnn_layers, + rnn_size=config.rnn_layer_size, + use_gru=config.use_gru, + share_rnn_weights=config.share_rnn_weights) + return model + class DeepSpeech2InferModel(DeepSpeech2Model): def __init__(self, @@ -260,3 +281,15 @@ class DeepSpeech2InferModel(DeepSpeech2Model): eouts, eouts_len = self.encoder(audio, audio_len) probs = self.decoder.softmax(eouts) return probs + + def export(self): + static_model = paddle.jit.to_static( + self, + input_spec=[ + paddle.static.InputSpec( + shape=[None, None, self.encoder.feat_size], + dtype='float32'), # audio, [B,T,D] + paddle.static.InputSpec(shape=[None], + dtype='int64'), # audio_length, [B] + ]) + return static_model diff --git a/deepspeech/models/ds2_online/deepspeech2.py b/deepspeech/models/ds2_online/deepspeech2.py index 75a6f044f..3083e4b2a 100644 --- a/deepspeech/models/ds2_online/deepspeech2.py +++ b/deepspeech/models/ds2_online/deepspeech2.py @@ -51,8 +51,9 @@ class CRNNEncoder(nn.Layer): self.use_gru = use_gru self.conv = Conv2dSubsampling4Online(feat_size, 32, dropout_rate=0.0) - i_size = self.conv.output_dim + self.output_dim = self.conv.output_dim + i_size = self.conv.output_dim self.rnn = nn.LayerList() self.layernorm_list = nn.LayerList() self.fc_layers_list = nn.LayerList() @@ -82,16 +83,18 @@ class CRNNEncoder(nn.Layer): num_layers=1, direction=rnn_direction)) self.layernorm_list.append(nn.LayerNorm(layernorm_size)) + self.output_dim = layernorm_size fc_input_size = layernorm_size for i in range(self.num_fc_layers): self.fc_layers_list.append( nn.Linear(fc_input_size, fc_layers_size_list[i])) fc_input_size = fc_layers_size_list[i] + self.output_dim = fc_layers_size_list[i] @property def output_size(self): - return self.fc_layers_size_list[-1] + return self.output_dim def forward(self, x, x_lens, init_state_h_box=None, init_state_c_box=None): """Compute Encoder outputs @@ -190,9 +193,6 @@ class CRNNEncoder(nn.Layer): for i in range(0, num_chunk): start = i * chunk_stride end = start + chunk_size - # end = min(start + chunk_size, max_len) - # if (end - start < receptive_field_length): - # break x_chunk = padded_x[:, start:end, :] 
x_len_left = paddle.where(x_lens - i * chunk_stride < 0, @@ -221,8 +221,6 @@ class DeepSpeech2ModelOnline(nn.Layer): :type text_data: Variable :param audio_len: Valid sequence length data layer. :type audio_len: Variable - :param masks: Masks data layer to reset padding. - :type masks: Variable :param dict_size: Dictionary size for tokenized transcription. :type dict_size: int :param num_conv_layers: Number of stacking convolution layers. @@ -231,6 +229,10 @@ class DeepSpeech2ModelOnline(nn.Layer): :type num_rnn_layers: int :param rnn_size: RNN layer size (dimension of RNN cells). :type rnn_size: int + :param num_fc_layers: Number of stacking FC layers. + :type num_fc_layers: int + :param fc_layers_size_list: The list of FC layer sizes. + :type fc_layers_size_list: [int,] :param use_gru: Use gru if set True. Use simple rnn if set False. :type use_gru: bool :return: A tuple of an output unnormalized log probability layer ( @@ -274,7 +276,6 @@ class DeepSpeech2ModelOnline(nn.Layer): fc_layers_size_list=fc_layers_size_list, rnn_size=rnn_size, use_gru=use_gru) - assert (self.encoder.output_size == fc_layers_size_list[-1]) self.decoder = CTCDecoder( odim=dict_size, # is in vocab @@ -337,7 +338,7 @@ class DeepSpeech2ModelOnline(nn.Layer): Returns ------- - DeepSpeech2Model + DeepSpeech2ModelOnline The model built from pretrained result. """ model = cls(feat_size=dataloader.collate_fn.feature_size, @@ -355,6 +356,29 @@ class DeepSpeech2ModelOnline(nn.Layer): layer_tools.summary(model) return model + @classmethod + def from_config(cls, config): + """Build a DeepSpeec2ModelOnline from config + Parameters + + config: yacs.config.CfgNode + config.model + Returns + ------- + DeepSpeech2ModelOnline + The model built from config. + """ + model = cls(feat_size=config.feat_size, + dict_size=config.dict_size, + num_conv_layers=config.num_conv_layers, + num_rnn_layers=config.num_rnn_layers, + rnn_size=config.rnn_layer_size, + rnn_direction=config.rnn_direction, + num_fc_layers=config.num_fc_layers, + fc_layers_size_list=config.fc_layers_size_list, + use_gru=config.use_gru) + return model + class DeepSpeech2InferModelOnline(DeepSpeech2ModelOnline): def __init__(self, @@ -392,7 +416,7 @@ class DeepSpeech2InferModelOnline(DeepSpeech2ModelOnline): paddle.static.InputSpec( shape=[None, None, self.encoder.feat_size], #[B, chunk_size, feat_dim] - dtype='float32'), # audio, [B,T,D] + dtype='float32'), paddle.static.InputSpec(shape=[None], dtype='int64'), # audio_length, [B] paddle.static.InputSpec( diff --git a/examples/aishell/s0/conf/deepspeech2_online.yaml b/examples/aishell/s0/conf/deepspeech2_online.yaml index 60df8d175..33030a523 100644 --- a/examples/aishell/s0/conf/deepspeech2_online.yaml +++ b/examples/aishell/s0/conf/deepspeech2_online.yaml @@ -36,17 +36,17 @@ collator: model: num_conv_layers: 2 - num_rnn_layers: 4 + num_rnn_layers: 3 rnn_layer_size: 1024 - rnn_direction: bidirect - num_fc_layers: 2 - fc_layers_size_list: 512, 256 + rnn_direction: forward # [forward, bidirect] + num_fc_layers: 1 + fc_layers_size_list: 512, use_gru: True training: n_epoch: 50 lr: 2e-3 - lr_decay: 0.83 + lr_decay: 0.83 # 0.83 weight_decay: 1e-06 global_grad_clip: 3.0 log_interval: 100 @@ -55,7 +55,7 @@ training: latest_n: 5 decoding: - batch_size: 64 + batch_size: 32 error_rate_type: cer decoding_method: ctc_beam_search lang_model_path: data/lm/zh_giga.no_cna_cmn.prune01244.klm diff --git a/tests/deepspeech2_online_model_test.py b/tests/deepspeech2_online_model_test.py index fd1dfc4b5..87f048870 100644 --- 
a/tests/deepspeech2_online_model_test.py +++ b/tests/deepspeech2_online_model_test.py @@ -106,18 +106,34 @@ class TestDeepSpeech2ModelOnline(unittest.TestCase): self.assertEqual(loss.numel(), 1) def test_ds2_6(self): + model = DeepSpeech2ModelOnline( + feat_size=self.feat_dim, + dict_size=10, + num_conv_layers=2, + num_rnn_layers=3, + rnn_size=1024, + rnn_direction='bidirect', + num_fc_layers=2, + fc_layers_size_list=[512, 256], + use_gru=False) + loss = model(self.audio, self.audio_len, self.text, self.text_len) + self.assertEqual(loss.numel(), 1) + + def test_ds2_7(self): + use_gru = False model = DeepSpeech2ModelOnline( feat_size=self.feat_dim, dict_size=10, num_conv_layers=2, num_rnn_layers=1, rnn_size=1024, + rnn_direction='forward', num_fc_layers=2, fc_layers_size_list=[512, 256], - use_gru=True) + use_gru=use_gru) model.eval() paddle.device.set_device("cpu") - de_ch_size = 9 + de_ch_size = 8 eouts, eouts_lens, final_state_h_box, final_state_c_box = model.encoder( self.audio, self.audio_len) @@ -126,99 +142,44 @@ class TestDeepSpeech2ModelOnline(unittest.TestCase): eouts_by_chk = paddle.concat(eouts_by_chk_list, axis=1) eouts_lens_by_chk = paddle.add_n(eouts_lens_by_chk_list) decode_max_len = eouts.shape[1] - print("dml", decode_max_len) eouts_by_chk = eouts_by_chk[:, :decode_max_len, :] - self.assertEqual( - paddle.sum( - paddle.abs(paddle.subtract(eouts_lens, eouts_lens_by_chk))), 0) - self.assertEqual( - paddle.sum(paddle.abs(paddle.subtract(eouts, eouts_by_chk))), 0) self.assertEqual(paddle.allclose(eouts_by_chk, eouts), True) self.assertEqual( paddle.allclose(final_state_h_box, final_state_h_box_chk), True) - self.assertEqual( - paddle.allclose(final_state_c_box, final_state_c_box_chk), True) - """ - print ("conv_x", conv_x) - print ("conv_x_by_chk", conv_x_by_chk) - print ("final_state_list", final_state_list) - #print ("final_state_list_by_chk", final_state_list_by_chk) - print (paddle.sum(paddle.abs(paddle.subtract(eouts[:,:de_ch_size,:], eouts_by_chk[:,:de_ch_size,:])))) - print (paddle.allclose(eouts[:,:de_ch_size,:], eouts_by_chk[:,:de_ch_size,:])) - print (paddle.sum(paddle.abs(paddle.subtract(eouts[:,de_ch_size:de_ch_size*2,:], eouts_by_chk[:,de_ch_size:de_ch_size*2,:])))) - print (paddle.allclose(eouts[:,de_ch_size:de_ch_size*2,:], eouts_by_chk[:,de_ch_size:de_ch_size*2,:])) - print (paddle.sum(paddle.abs(paddle.subtract(eouts[:,de_ch_size*2:de_ch_size*3,:], eouts_by_chk[:,de_ch_size*2:de_ch_size*3,:])))) - print (paddle.allclose(eouts[:,de_ch_size*2:de_ch_size*3,:], eouts_by_chk[:,de_ch_size*2:de_ch_size*3,:])) - print (paddle.sum(paddle.abs(paddle.subtract(eouts, eouts_by_chk)))) - print (paddle.sum(paddle.abs(paddle.subtract(eouts, eouts_by_chk)))) - print (paddle.allclose(eouts[:,:,:], eouts_by_chk[:,:,:])) - """ - - """ - def split_into_chunk(self, x, x_lens, decoder_chunk_size, subsampling_rate, - receptive_field_length): - chunk_size = (decoder_chunk_size - 1 - ) * subsampling_rate + receptive_field_length - chunk_stride = subsampling_rate * decoder_chunk_size - max_len = x.shape[1] - assert (chunk_size <= max_len) - x_chunk_list = [] - x_chunk_lens_list = [] - padding_len = chunk_stride - (max_len - chunk_size) % chunk_stride - padding = paddle.zeros((x.shape[0], padding_len, x.shape[2])) - padded_x = paddle.concat([x, padding], axis=1) - num_chunk = (max_len + padding_len - chunk_size) / chunk_stride + 1 - num_chunk = int(num_chunk) - for i in range(0, num_chunk): - start = i * chunk_stride - end = start + chunk_size - x_chunk = padded_x[:, start:end, :] - 
x_len_left = paddle.where(x_lens - i * chunk_stride < 0, - paddle.zeros_like(x_lens), - x_lens - i * chunk_stride) - x_chunk_len_tmp = paddle.ones_like(x_lens) * chunk_size - x_chunk_lens = paddle.where(x_len_left < x_chunk_len_tmp, - x_len_left, x_chunk_len_tmp) - x_chunk_list.append(x_chunk) - x_chunk_lens_list.append(x_chunk_lens) - - return x_chunk_list, x_chunk_lens_list + if use_gru == False: + self.assertEqual( + paddle.allclose(final_state_c_box, final_state_c_box_chk), True) - def test_ds2_7(self): + def test_ds2_8(self): + use_gru = True model = DeepSpeech2ModelOnline( feat_size=self.feat_dim, dict_size=10, num_conv_layers=2, num_rnn_layers=1, rnn_size=1024, + rnn_direction='forward', num_fc_layers=2, fc_layers_size_list=[512, 256], - use_gru=True) + use_gru=use_gru) model.eval() paddle.device.set_device("cpu") - de_ch_size = 9 - - audio_chunk_list, audio_chunk_lens_list = self.split_into_chunk( - self.audio, self.audio_len, de_ch_size, - model.encoder.conv.subsampling_rate, - model.encoder.conv.receptive_field_length) - eouts_prefix = None - eouts_lens_prefix = None - chunk_state_list = [None] * model.encoder.num_rnn_layers - for i, audio_chunk in enumerate(audio_chunk_list): - audio_chunk_lens = audio_chunk_lens_list[i] - eouts_prefix, eouts_lens_prefix, chunk_state_list = model.decode_prob_by_chunk( - audio_chunk, audio_chunk_lens, eouts_prefix, eouts_lens_prefix, - chunk_state_list) - # print (i, probs_pre_chunks.shape) - - probs, eouts, eouts_lens, final_state_list = model.decode_prob( - self.audio, self.audio_len) + de_ch_size = 8 - decode_max_len = probs.shape[1] - probs_pre_chunks = probs_pre_chunks[:, :decode_max_len, :] - self.assertEqual(paddle.allclose(probs, probs_pre_chunks), True) - """ + eouts, eouts_lens, final_state_h_box, final_state_c_box = model.encoder( + self.audio, self.audio_len) + eouts_by_chk_list, eouts_lens_by_chk_list, final_state_h_box_chk, final_state_c_box_chk = model.encoder.forward_chunk_by_chunk( + self.audio, self.audio_len, de_ch_size) + eouts_by_chk = paddle.concat(eouts_by_chk_list, axis=1) + eouts_lens_by_chk = paddle.add_n(eouts_lens_by_chk_list) + decode_max_len = eouts.shape[1] + eouts_by_chk = eouts_by_chk[:, :decode_max_len, :] + self.assertEqual(paddle.allclose(eouts_by_chk, eouts), True) + self.assertEqual( + paddle.allclose(final_state_h_box, final_state_h_box_chk), True) + if use_gru == False: + self.assertEqual( + paddle.allclose(final_state_c_box, final_state_c_box_chk), True) if __name__ == '__main__':
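     unittest.main()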