diff --git a/deepspeech/exps/deepspeech2/config.py b/deepspeech/exps/deepspeech2/config.py
index 53358014c..38b7d0e4d 100644
--- a/deepspeech/exps/deepspeech2/config.py
+++ b/deepspeech/exps/deepspeech2/config.py
@@ -27,7 +27,7 @@ def get_cfg_defaults(model_type='offline'):
     _C.collator = SpeechCollator.params()
     _C.training = DeepSpeech2Trainer.params()
     _C.decoding = DeepSpeech2Tester.params()
-    if (model_type == 'offline'):
+    if model_type == 'offline':
         _C.model = DeepSpeech2Model.params()
     else:
         _C.model = DeepSpeech2ModelOnline.params()
diff --git a/deepspeech/exps/deepspeech2/model.py b/deepspeech/exps/deepspeech2/model.py
index 03974e1bb..03fe8c6f5 100644
--- a/deepspeech/exps/deepspeech2/model.py
+++ b/deepspeech/exps/deepspeech2/model.py
@@ -124,10 +124,23 @@ class DeepSpeech2Trainer(Trainer):
 
     def setup_model(self):
         config = self.config
+        if hasattr(self, "train_loader"):
+            config.defrost()
+            config.model.feat_size = self.train_loader.collate_fn.feature_size
+            config.model.dict_size = self.train_loader.collate_fn.vocab_size
+            config.freeze()
+        elif hasattr(self, "test_loader"):
+            config.defrost()
+            config.model.feat_size = self.test_loader.collate_fn.feature_size
+            config.model.dict_size = self.test_loader.collate_fn.vocab_size
+            config.freeze()
+        else:
+            raise Exception("Please setup the dataloader first")
+
         if self.args.model_type == 'offline':
             model = DeepSpeech2Model(
-                feat_size=self.train_loader.collate_fn.feature_size,
-                dict_size=self.train_loader.collate_fn.vocab_size,
+                feat_size=config.model.feat_size,
+                dict_size=config.model.dict_size,
                 num_conv_layers=config.model.num_conv_layers,
                 num_rnn_layers=config.model.num_rnn_layers,
                 rnn_size=config.model.rnn_layer_size,
@@ -135,8 +148,8 @@ class DeepSpeech2Trainer(Trainer):
                 share_rnn_weights=config.model.share_rnn_weights)
         elif self.args.model_type == 'online':
             model = DeepSpeech2ModelOnline(
-                feat_size=self.train_loader.collate_fn.feature_size,
-                dict_size=self.train_loader.collate_fn.vocab_size,
+                feat_size=config.model.feat_size,
+                dict_size=config.model.dict_size,
                 num_conv_layers=config.model.num_conv_layers,
                 num_rnn_layers=config.model.num_rnn_layers,
                 rnn_size=config.model.rnn_layer_size,
@@ -209,6 +222,7 @@ class DeepSpeech2Trainer(Trainer):
             batch_sampler=batch_sampler,
             collate_fn=collate_fn_train,
             num_workers=config.collator.num_workers)
+        print("feature_size", self.train_loader.collate_fn.feature_size)
         self.valid_loader = DataLoader(
             dev_dataset,
             batch_size=config.collator.batch_size,
@@ -368,8 +382,7 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
                                             dtype='int64'),  # audio_length, [B]
                 ])
         elif self.args.model_type == 'online':
-            static_model = DeepSpeech2InferModelOnline.export(infer_model,
-                                                              feat_dim)
+            static_model = infer_model.export()
         else:
             raise Exception("wrong model type")
         logger.info(f"Export code: {static_model.forward.code}")
@@ -395,6 +408,7 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
         self.iteration = 0
         self.epoch = 0
 
+    '''
     def setup_model(self):
         config = self.config
         if self.args.model_type == 'offline':
@@ -422,6 +436,7 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
 
         self.model = model
         logger.info("Setup model!")
+    '''
 
     def setup_dataloader(self):
         config = self.config.clone()
diff --git a/deepspeech/models/ds2_online/deepspeech2.py b/deepspeech/models/ds2_online/deepspeech2.py
index ad8a0506f..3c82f3250 100644
--- a/deepspeech/models/ds2_online/deepspeech2.py
+++ b/deepspeech/models/ds2_online/deepspeech2.py
@@ -88,55 +88,7 @@ class CRNNEncoder(nn.Layer):
     def output_size(self):
         return self.fc_layers_size_list[-1]
 
-    def forward(self, x, x_lens):
-        """Compute Encoder outputs
-
-        Args:
-            x (Tensor): [B, T_input, D]
-            x_lens (Tensor): [B]
-        Returns:
-            x (Tensor): encoder outputs, [B, T_output, D]
-            x_lens (Tensor): encoder length, [B]
-            final_state_h_box(Tensor): final_states h for RNN layers, num_rnn_layers * num_directions, batch_size, hidden_size
-            final_state_c_box(Tensor): final_states c for RNN layers, num_rnn_layers * num_directions, batch_size, hidden_size
-        """
-        # [B, T, D]
-        # convolution group
-        x, x_lens = self.conv(x, x_lens)
-        # convert data from convolution feature map to sequence of vectors
-        #B, C, D, T = paddle.shape(x)  # not work under jit
-        #x = x.transpose([0, 3, 1, 2])  #[B, T, C, D]
-        #x = x.reshape([B, T, C * D])  #[B, T, C*D]  # not work under jit
-        #x = x.reshape([0, 0, -1])  #[B, T, C*D]
-
-        # remove padding part
-        init_state = None
-        final_state_list = []
-        for i in range(0, self.num_rnn_layers):
-            x, final_state = self.rnn[i](x, init_state, x_lens)  #[B, T, D]
-            final_state_list.append(final_state)
-            x = self.layernorm_list[i](x)
-
-        for i in range(self.num_fc_layers):
-            x = self.fc_layers_list[i](x)
-            x = F.relu(x)
-
-        if self.use_gru == True:
-            final_state_h_box = paddle.concat(final_state_list, axis=0)
-            final_state_c_box = paddle.zeros_like(final_state_h_box)
-        else:
-            final_state_h_list = [
-                final_state_list[i][0] for i in range(self.num_rnn_layers)
-            ]
-            final_state_c_list = [
-                final_state_list[i][1] for i in range(self.num_rnn_layers)
-            ]
-            final_state_h_box = paddle.concat(final_state_h_list, axis=0)
-            final_state_c_box = paddle.concat(final_state_c_list, axis=0)
-
-        return x, x_lens, final_state_h_box, final_state_c_box
-
-    def forward_chunk(self, x, x_lens, init_state_h_box, init_state_c_box):
+    def forward(self, x, x_lens, init_state_h_box=None, init_state_c_box=None):
         """Compute Encoder outputs
 
         Args:
@@ -152,13 +104,16 @@ class CRNNEncoder(nn.Layer):
         """
         if init_state_h_box is not None:
             init_state_list = None
-            init_state_h_list = paddle.split(
-                init_state_h_box, self.num_rnn_layers, axis=0)
-            init_state_c_list = paddle.split(
-                init_state_c_box, self.num_rnn_layers, axis=0)
+
             if self.use_gru == True:
+                init_state_h_list = paddle.split(
+                    init_state_h_box, self.num_rnn_layers, axis=0)
                 init_state_list = init_state_h_list
             else:
+                init_state_h_list = paddle.split(
+                    init_state_h_box, self.num_rnn_layers, axis=0)
+                init_state_c_list = paddle.split(
+                    init_state_c_box, self.num_rnn_layers, axis=0)
                 init_state_list = [(init_state_h_list[i], init_state_c_list[i])
                                    for i in range(self.num_rnn_layers)]
         else:
@@ -179,7 +134,7 @@ class CRNNEncoder(nn.Layer):
         if self.use_gru == True:
             final_chunk_state_h_box = paddle.concat(
                 final_chunk_state_list, axis=0)
-            final_chunk_state_c_box = paddle.zeros_like(final_chunk_state_h_box)
+            final_chunk_state_c_box = init_state_c_box  #paddle.zeros_like(final_chunk_state_h_box)
         else:
             final_chunk_state_h_list = [
                 final_chunk_state_list[i][0] for i in range(self.num_rnn_layers)
@@ -242,13 +197,13 @@ class CRNNEncoder(nn.Layer):
             x_chunk_lens = paddle.where(x_len_left < x_chunk_len_tmp,
                                         x_len_left, x_chunk_len_tmp)
 
-            eouts_chunk, eouts_chunk_lens, chunk_state_h_box, chunk_state_c_box = self.forward_chunk(
+            eouts_chunk, eouts_chunk_lens, chunk_state_h_box, chunk_state_c_box = self.forward(
                 x_chunk, x_chunk_lens, chunk_state_h_box, chunk_state_c_box)
 
             eouts_chunk_list.append(eouts_chunk)
             eouts_chunk_lens_list.append(eouts_chunk_lens)
-        final_state_h_box = chunk_state_h_box
-        final_state_c_box = chunk_state_c_box
+        final_state_h_box = chunk_state_h_box
+        final_state_c_box = chunk_state_c_box
         return eouts_chunk_list, eouts_chunk_lens_list, final_state_h_box, final_state_c_box
 
 
@@ -297,7 +252,7 @@ class DeepSpeech2ModelOnline(nn.Layer):
                  feat_size,
                  dict_size,
                  num_conv_layers=2,
-                 num_rnn_layers=3,
+                 num_rnn_layers=4,
                  rnn_size=1024,
                  rnn_direction='forward',
                  num_fc_layers=2,
@@ -337,7 +292,7 @@ class DeepSpeech2ModelOnline(nn.Layer):
             loss (Tenosr): [1]
         """
         eouts, eouts_len, final_state_h_box, final_state_c_box = self.encoder(
-            audio, audio_len)
+            audio, audio_len, None, None)
         loss = self.decoder(eouts, eouts_len, text, text_len)
         return loss
 
@@ -355,7 +310,7 @@ class DeepSpeech2ModelOnline(nn.Layer):
             decoding_method=decoding_method)
 
         eouts, eouts_len, final_state_h_box, final_state_c_box = self.encoder(
-            audio, audio_len)
+            audio, audio_len, None, None)
         probs = self.decoder.softmax(eouts)
         return self.decoder.decode_probs(
             probs.numpy(), eouts_len, vocab_list, decoding_method,
@@ -401,7 +356,7 @@ class DeepSpeech2InferModelOnline(DeepSpeech2ModelOnline):
                  feat_size,
                  dict_size,
                  num_conv_layers=2,
-                 num_rnn_layers=3,
+                 num_rnn_layers=4,
                  rnn_size=1024,
                  rnn_direction='forward',
                  num_fc_layers=2,
@@ -420,18 +375,18 @@ class DeepSpeech2InferModelOnline(DeepSpeech2ModelOnline):
 
     def forward(self, audio_chunk, audio_chunk_lens, chunk_state_h_box,
                 chunk_state_c_box):
-        eouts_chunk, eouts_chunk_lens, final_state_h_box, final_state_c_box = self.encoder.forward_chunk(
+        eouts_chunk, eouts_chunk_lens, final_state_h_box, final_state_c_box = self.encoder(
            audio_chunk, audio_chunk_lens, chunk_state_h_box, chunk_state_c_box)
         probs_chunk = self.decoder.softmax(eouts_chunk)
         return probs_chunk, eouts_chunk_lens, final_state_h_box, final_state_c_box
 
-    @classmethod
-    def export(self, infer_model, feat_dim):
+    def export(self):
         static_model = paddle.jit.to_static(
-            infer_model,
+            self,
            input_spec=[
                paddle.static.InputSpec(
-                    shape=[None, None, feat_dim],  #[B, chunk_size, feat_dim]
+                    shape=[None, None, self.encoder.feat_size
+                           ],  #[B, chunk_size, feat_dim]
                    dtype='float32'),  # audio, [B,T,D]
                paddle.static.InputSpec(shape=[None],
                                        dtype='int64'),  # audio_length, [B]
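
Note (reviewer sketch, not part of the diff): with this change, DeepSpeech2InferModelOnline.export() becomes an instance method that reads the input feature dimension from self.encoder.feat_size instead of taking a feat_dim argument, and CRNNEncoder.forward covers both the full-utterance and the chunk case via optional initial states. A minimal usage sketch, assuming the package's public imports and purely illustrative values (feat_size=161, dict_size=29, output path):

    import paddle
    from deepspeech.models.ds2_online import DeepSpeech2InferModelOnline

    # Build the online inference model with the default layer settings;
    # feat_size/dict_size below are placeholder values for illustration.
    model = DeepSpeech2InferModelOnline(feat_size=161, dict_size=29)
    model.eval()

    # export() no longer needs an external feat_dim: the InputSpec shape is
    # derived from model.encoder.feat_size inside export().
    static_model = model.export()
    paddle.jit.save(static_model, "exports/deepspeech2_online")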