From 2cacbaf48ee80d1c00256cab732b203d4dc00b28 Mon Sep 17 00:00:00 2001
From: huangyuxin
Date: Wed, 28 Jul 2021 02:14:36 +0000
Subject: [PATCH] Modify the LSTM/GRU code in deepspeech2.py and add LayerNorm
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 deepspeech/exps/deepspeech2/model.py          |  6 +-
 deepspeech/models/ds2/deepspeech2.py          | 65 +++++++++++++++----
 examples/aishell/s0/conf/deepspeech2.yaml     |  3 +-
 examples/librispeech/s0/conf/deepspeech2.yaml |  1 +
 examples/tiny/s0/conf/deepspeech2.yaml        |  1 +
 5 files changed, 62 insertions(+), 14 deletions(-)

diff --git a/deepspeech/exps/deepspeech2/model.py b/deepspeech/exps/deepspeech2/model.py
index 2f84b686..544d57d1 100644
--- a/deepspeech/exps/deepspeech2/model.py
+++ b/deepspeech/exps/deepspeech2/model.py
@@ -127,7 +127,8 @@ class DeepSpeech2Trainer(Trainer):
             num_rnn_layers=config.model.num_rnn_layers,
             rnn_size=config.model.rnn_layer_size,
             use_gru=config.model.use_gru,
-            share_rnn_weights=config.model.share_rnn_weights)
+            share_rnn_weights=config.model.share_rnn_weights,
+            apply_online=config.model.apply_online)
 
         if self.parallel:
             model = paddle.DataParallel(model)
@@ -374,7 +375,8 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
             num_rnn_layers=config.model.num_rnn_layers,
             rnn_size=config.model.rnn_layer_size,
             use_gru=config.model.use_gru,
-            share_rnn_weights=config.model.share_rnn_weights)
+            share_rnn_weights=config.model.share_rnn_weights,
+            apply_online=config.model.apply_online)
         self.model = model
         logger.info("Setup model!")
 
diff --git a/deepspeech/models/ds2/deepspeech2.py b/deepspeech/models/ds2/deepspeech2.py
index 0bd5fb95..7f173ce2 100644
--- a/deepspeech/models/ds2/deepspeech2.py
+++ b/deepspeech/models/ds2/deepspeech2.py
@@ -25,6 +25,11 @@ from deepspeech.utils import layer_tools
 from deepspeech.utils.checkpoint import Checkpoint
 from deepspeech.utils.log import Log
 
+from paddle.nn import GRU
+from paddle.nn import LSTM
+from paddle.nn import LayerList
+from paddle.nn import LayerNorm
+
 logger = Log(__name__).getlog()
 
 __all__ = ['DeepSpeech2Model', 'DeepSpeech2InferMode']
@@ -38,26 +43,43 @@ class CRNNEncoder(nn.Layer):
                  num_rnn_layers=3,
                  rnn_size=1024,
                  use_gru=False,
-                 share_rnn_weights=True):
+                 share_rnn_weights=True,
+                 apply_online=True):
         super().__init__()
         self.rnn_size = rnn_size
         self.feat_size = feat_size  # 161 for linear
         self.dict_size = dict_size
-
+        self.num_rnn_layers = num_rnn_layers
+        self.apply_online = apply_online
         self.conv = ConvStack(feat_size, num_conv_layers)
         i_size = self.conv.output_height  # H after conv stack
 
-        self.rnn = RNNStack(
-            i_size=i_size,
-            h_size=rnn_size,
-            num_stacks=num_rnn_layers,
-            use_gru=use_gru,
-            share_rnn_weights=share_rnn_weights)
-
+        # Online (streaming) decoding can only look at past context, so it
+        # needs a unidirectional RNN; offline decoding can use a
+        # bidirectional one, which doubles each layer's output size.
+        rnn_direction = 'forward' if apply_online else 'bidirect'
+        self.num_directions = 2 if rnn_direction == 'bidirect' else 1
+        rnn_cls = GRU if use_gru else LSTM
+
+        # Stack single-layer RNNs by hand so that a LayerNorm can be
+        # applied to the output of every layer.
+        self.rnn = LayerList()
+        self.layernorm_list = LayerList()
+        layer_output_size = rnn_size * self.num_directions
+        for i in range(num_rnn_layers):
+            layer_input_size = i_size if i == 0 else layer_output_size
+            self.rnn.append(
+                rnn_cls(input_size=layer_input_size,
+                        hidden_size=rnn_size,
+                        num_layers=1,
+                        direction=rnn_direction))
+            self.layernorm_list.append(LayerNorm(layer_output_size))
 
     @property
     def output_size(self):
-        return self.rnn_size * 2
+        return self.rnn_size * self.num_directions
 
     def forward(self, audio, audio_len):
         """Compute Encoder outputs
@@ -86,6 +108,11 @@ class CRNNEncoder(nn.Layer):
         x = x.reshape([0, 0, -1])  #[B, T, C*D]
 
         # remove padding part
-        x, x_lens = self.rnn(x, x_lens)  #[B, T, D]
+        # Apply the RNN layers one by one, normalizing each layer's
+        # output. Every layer starts from a zero initial state; the final
+        # state of layer i - 1 is not a valid initial state for layer i.
+        for i in range(self.num_rnn_layers):
+            x, _ = self.rnn[i](x, None, x_lens)  #[B, T, D]
+            x = self.layernorm_list[i](x)
 
         return x, x_lens
 
@@ -141,7 +168,8 @@ class DeepSpeech2Model(nn.Layer):
                  num_rnn_layers=3,
                  rnn_size=1024,
                  use_gru=False,
-                 share_rnn_weights=True):
+                 share_rnn_weights=True,
+                 apply_online=True):
         super().__init__()
         self.encoder = CRNNEncoder(
             feat_size=feat_size,
@@ -150,8 +178,10 @@ class DeepSpeech2Model(nn.Layer):
             num_rnn_layers=num_rnn_layers,
             rnn_size=rnn_size,
             use_gru=use_gru,
-            share_rnn_weights=share_rnn_weights)
-        assert (self.encoder.output_size == rnn_size * 2)
+            share_rnn_weights=share_rnn_weights,
+            apply_online=apply_online)
+        assert (self.encoder.output_size ==
+                rnn_size * self.encoder.num_directions)
 
         self.decoder = CTCDecoder(
             odim=dict_size,  # is in vocab
@@ -221,7 +251,8 @@ class DeepSpeech2Model(nn.Layer):
             num_rnn_layers=config.model.num_rnn_layers,
             rnn_size=config.model.rnn_layer_size,
             use_gru=config.model.use_gru,
-            share_rnn_weights=config.model.share_rnn_weights)
+            share_rnn_weights=config.model.share_rnn_weights,
+            apply_online=config.model.apply_online)
         infos = Checkpoint().load_parameters(
             model, checkpoint_path=checkpoint_path)
         logger.info(f"checkpoint info: {infos}")
@@ -237,7 +268,8 @@ class DeepSpeech2InferModel(DeepSpeech2Model):
                  num_rnn_layers=3,
                  rnn_size=1024,
                  use_gru=False,
-                 share_rnn_weights=True):
+                 share_rnn_weights=True,
+                 apply_online=True):
         super().__init__(
             feat_size=feat_size,
             dict_size=dict_size,
@@ -245,7 +277,8 @@ class DeepSpeech2InferModel(DeepSpeech2Model):
             num_rnn_layers=num_rnn_layers,
             rnn_size=rnn_size,
             use_gru=use_gru,
-            share_rnn_weights=share_rnn_weights)
+            share_rnn_weights=share_rnn_weights,
+            apply_online=apply_online)
 
     def forward(self, audio, audio_len):
         """export model function
diff --git a/examples/aishell/s0/conf/deepspeech2.yaml b/examples/aishell/s0/conf/deepspeech2.yaml
index 1c97fc60..7d0d1f89 100644
--- a/examples/aishell/s0/conf/deepspeech2.yaml
+++ b/examples/aishell/s0/conf/deepspeech2.yaml
@@ -36,10 +36,11 @@ collator:
 
 model:
   num_conv_layers: 2
-  num_rnn_layers: 3
+  num_rnn_layers: 4
   rnn_layer_size: 1024
   use_gru: True
   share_rnn_weights: False
+  apply_online: False
 
 training:
   n_epoch: 50
diff --git a/examples/librispeech/s0/conf/deepspeech2.yaml b/examples/librispeech/s0/conf/deepspeech2.yaml
index acee94c3..be1918d0 100644
--- a/examples/librispeech/s0/conf/deepspeech2.yaml
+++ b/examples/librispeech/s0/conf/deepspeech2.yaml
@@ -40,6 +40,7 @@ model:
   rnn_layer_size: 2048
   use_gru: False
   share_rnn_weights: True
+  apply_online: False
 
 training:
   n_epoch: 50
diff --git a/examples/tiny/s0/conf/deepspeech2.yaml b/examples/tiny/s0/conf/deepspeech2.yaml
index ea433f34..8c719e5c 100644
--- a/examples/tiny/s0/conf/deepspeech2.yaml
+++ b/examples/tiny/s0/conf/deepspeech2.yaml
@@ -41,6 +41,7 @@ model:
   rnn_layer_size: 2048
   use_gru: False
   share_rnn_weights: True
+  apply_online: True
 
 training:
   n_epoch: 10
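
Note: as a sanity check, the per-layer LayerNorm pattern this patch adds to
CRNNEncoder can be exercised in isolation. Below is a minimal, self-contained
sketch; the toy batch/time/feature sizes and the standalone variable names
are assumptions for illustration, not values taken from the patch.

    import paddle
    from paddle.nn import GRU, LayerList, LayerNorm

    # Toy sizes -- illustrative assumptions, not values from the patch.
    batch, time, i_size, rnn_size, num_rnn_layers = 4, 50, 32, 64, 3
    apply_online = False  # offline -> bidirectional RNN

    rnn_direction = 'forward' if apply_online else 'bidirect'
    num_directions = 2 if rnn_direction == 'bidirect' else 1
    layer_output_size = rnn_size * num_directions

    # One single-layer RNN plus one LayerNorm per stacked layer.
    rnn = LayerList()
    layernorm_list = LayerList()
    for i in range(num_rnn_layers):
        layer_input_size = i_size if i == 0 else layer_output_size
        rnn.append(GRU(input_size=layer_input_size, hidden_size=rnn_size,
                       num_layers=1, direction=rnn_direction))
        layernorm_list.append(LayerNorm(layer_output_size))

    x = paddle.randn([batch, time, i_size])
    x_lens = paddle.full([batch], time, dtype='int64')
    for i in range(num_rnn_layers):
        x, _ = rnn[i](x, None, x_lens)  # zero initial state per layer
        x = layernorm_list[i](x)
    print(x.shape)  # [4, 50, 128], i.e. rnn_size * num_directions

Running the loop with apply_online = True instead gives a 'forward'
(unidirectional) stack whose output width is rnn_size, matching the
encoder's output_size property above.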