diff --git a/deepspeech/models/ds2_online/conv.py b/deepspeech/models/ds2_online/conv.py
index 13c35ef2b..83d98e410 100644
--- a/deepspeech/models/ds2_online/conv.py
+++ b/deepspeech/models/ds2_online/conv.py
@@ -22,6 +22,8 @@ class Conv2dSubsampling4Online(Conv2dSubsampling4):
     def __init__(self, idim: int, odim: int, dropout_rate: float):
         super().__init__(idim, odim, dropout_rate, None)
         self.output_dim = ((idim - 1) // 2 - 1) // 2 * odim
+        self.receptive_field_length = 2 * (
+            3 - 1) + 3  # stride_1 * (kernel_size_2 - 1) + kernel_size_1
 
     def forward(self, x: paddle.Tensor,
                 x_len: paddle.Tensor) -> [paddle.Tensor, paddle.Tensor]:
diff --git a/deepspeech/models/ds2_online/deepspeech2.py b/deepspeech/models/ds2_online/deepspeech2.py
index e9e81d5d9..0b3c632be 100644
--- a/deepspeech/models/ds2_online/deepspeech2.py
+++ b/deepspeech/models/ds2_online/deepspeech2.py
@@ -108,7 +108,7 @@ class CRNNEncoder(nn.Layer):
         Returns:
             x (Tensor): encoder outputs, [B, T_output, D]
             x_lens (Tensor): encoder length, [B]
-            rnn_final_state_list: list of final_states for RNN layers, [num_directions, batch_size, hidden_size] * num_rnn_layers
+            final_state_list: list of final_states for RNN layers, [num_directions, batch_size, hidden_size] * num_rnn_layers
         """
         # [B, T, D]
         # convolution group
@@ -121,21 +121,21 @@ class CRNNEncoder(nn.Layer):
 
         # remove padding part
         init_state = None
-        rnn_final_state_list = []
+        final_state_list = []
         x, final_state = self.rnn[0](x, init_state, x_lens)
-        rnn_final_state_list.append(final_state)
+        final_state_list.append(final_state)
         x = self.layernorm_list[0](x)
         for i in range(1, self.num_rnn_layers):
             x, final_state = self.rnn[i](x, init_state, x_lens)  #[B, T, D]
-            rnn_final_state_list.append(final_state)
+            final_state_list.append(final_state)
             x = self.layernorm_list[i](x)
 
         for i in range(self.num_fc_layers):
             x = self.fc_layers_list[i](x)
             x = F.relu(x)
-        return x, x_lens, rnn_final_state_list
+        return x, x_lens, final_state_list
 
-    def forward(self, x, x_lens, init_state_list):
+    def forward_chunk(self, x, x_lens, init_state_list):
         """Compute Encoder outputs
 
         Args:
@@ -145,22 +145,59 @@ class CRNNEncoder(nn.Layer):
         Returns:
             x (Tensor): encoder outputs, [B, chunk_size, D]
             x_lens (Tensor): encoder length, [B]
-            rnn_final_state_list: list of final_states for RNN layers, [num_directions, batch_size, hidden_size] * num_rnn_layers
+            chunk_final_state_list: list of final_states for RNN layers, [num_directions, batch_size, hidden_size] * num_rnn_layers
         """
-        rnn_final_state_list = []
+        x, x_lens = self.conv(x, x_lens)
+        chunk_final_state_list = []
         x, final_state = self.rnn[0](x, init_state_list[0], x_lens)
-        rnn_final_state_list.append(final_state)
+        chunk_final_state_list.append(final_state)
         x = self.layernorm_list[0](x)
         for i in range(1, self.num_rnn_layers):
             x, final_state = self.rnn[i](x, init_state_list[i], x_lens)  #[B, T, D]
-            rnn_final_state_list.append(final_state)
+            chunk_final_state_list.append(final_state)
             x = self.layernorm_list[i](x)
 
         for i in range(self.num_fc_layers):
             x = self.fc_layers_list[i](x)
             x = F.relu(x)
-        return x, x_lens, rnn_final_state_list
+        return x, x_lens, chunk_final_state_list
+
+    def forward_chunk_by_chunk(self, x, x_lens, decoder_chunk_size=8):
+        """Compute encoder outputs chunk by chunk, carrying the RNN state
+        across chunks so the result matches a whole-utterance forward pass.
+        """
+        subsampling_rate = self.conv.subsampling_rate
+        receptive_field_length = self.conv.receptive_field_length
+        chunk_size = (decoder_chunk_size - 1
+                      ) * subsampling_rate + receptive_field_length
+        chunk_stride = subsampling_rate * decoder_chunk_size
+        max_len = x.shape[1]
+        assert chunk_size <= max_len
+
+        eouts_chunk_list = []
+        eouts_chunk_lens_list = []
+
+        # pad only when the last chunk is incomplete
+        padding_len = (chunk_stride -
+                       (max_len - chunk_size) % chunk_stride) % chunk_stride
+        if padding_len > 0:
+            padding = paddle.zeros((x.shape[0], padding_len, x.shape[2]))
+            x_padded = paddle.concat([x, padding], axis=1)
+        else:
+            x_padded = x
+        num_chunk = (max_len + padding_len - chunk_size) // chunk_stride + 1
+        chunk_init_state_list = [None] * self.num_rnn_layers
+        for i in range(0, num_chunk):
+            start = i * chunk_stride
+            end = start + chunk_size
+            x_chunk = x_padded[:, start:end, :]
+            x_len_left = x_lens - i * chunk_stride
+            x_chunk_len_tmp = paddle.ones_like(x_lens) * chunk_size
+            x_chunk_lens = paddle.where(x_len_left < x_chunk_len_tmp,
+                                        x_len_left, x_chunk_len_tmp)
+
+            eouts_chunk, eouts_chunk_lens, chunk_final_state_list = self.forward_chunk(
+                x_chunk, x_chunk_lens, chunk_init_state_list)
+
+            chunk_init_state_list = chunk_final_state_list
+            eouts_chunk_list.append(eouts_chunk)
+            eouts_chunk_lens_list.append(eouts_chunk_lens)
+
+        return eouts_chunk_list, eouts_chunk_lens_list, chunk_final_state_list
 
 
 class DeepSpeech2ModelOnline(nn.Layer):
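For context, the chunk windowing above can be sanity-checked in isolation. The following standalone sketch (not part of the patch; the constants assume the two stride-2, kernel-3 convolutions of `Conv2dSubsampling4Online`) enumerates the input windows that `forward_chunk_by_chunk` processes:

```python
# Standalone sketch of the chunk windowing in forward_chunk_by_chunk.
# Assumes subsampling_rate = 4 and receptive_field_length = 7, matching
# two conv layers with stride 2 and kernel size 3.
SUBSAMPLING_RATE = 4
RECEPTIVE_FIELD = 2 * (3 - 1) + 3  # stride_1 * (kernel_size_2 - 1) + kernel_size_1


def chunk_windows(max_len, decoder_chunk_size=8):
    """Yield (start, end) input windows; each yields decoder_chunk_size encoder frames."""
    chunk_size = (decoder_chunk_size - 1) * SUBSAMPLING_RATE + RECEPTIVE_FIELD
    chunk_stride = SUBSAMPLING_RATE * decoder_chunk_size
    assert chunk_size <= max_len
    padding_len = (chunk_stride - (max_len - chunk_size) % chunk_stride) % chunk_stride
    num_chunk = (max_len + padding_len - chunk_size) // chunk_stride + 1
    for i in range(num_chunk):
        start = i * chunk_stride
        yield start, start + chunk_size


print(list(chunk_windows(100)))  # [(0, 35), (32, 67), (64, 99), (96, 131)]
```

Consecutive windows overlap by `chunk_size - chunk_stride = 3` input frames, exactly the left context the subsampling convolutions need so that chunked and whole-utterance encoder outputs line up.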
@@ -248,7 +285,7 @@ class DeepSpeech2ModelOnline(nn.Layer):
         Returns:
             loss (Tensor): [1]
         """
-        eouts, eouts_len, rnn_final_state_list = self.encoder(audio, audio_len)
+        eouts, eouts_len, final_state_list = self.encoder(audio, audio_len)
         loss = self.decoder(eouts, eouts_len, text, text_len)
         return loss
 
@@ -265,13 +302,54 @@ class DeepSpeech2ModelOnline(nn.Layer):
             vocab_list=vocab_list,
             decoding_method=decoding_method)
 
-        eouts, eouts_len = self.encoder(audio, audio_len)
+        eouts, eouts_len, final_state_list = self.encoder(audio, audio_len)
+        probs = self.decoder.softmax(eouts)
+        return self.decoder.decode_probs(
+            probs.numpy(), eouts_len, vocab_list, decoding_method,
+            lang_model_path, beam_alpha, beam_beta, beam_size, cutoff_prob,
+            cutoff_top_n, num_processes)
+
+    @paddle.no_grad()
+    def decode_chunk_by_chunk(self, audio, audio_len, vocab_list,
+                              decoding_method, lang_model_path, beam_alpha,
+                              beam_beta, beam_size, cutoff_prob, cutoff_top_n,
+                              num_processes):
+        # init once
+        # decoders only accept string encoded in utf-8
+        self.decoder.init_decode(
+            beam_alpha=beam_alpha,
+            beam_beta=beam_beta,
+            lang_model_path=lang_model_path,
+            vocab_list=vocab_list,
+            decoding_method=decoding_method)
+
+        eouts_chunk_list, eouts_chunk_len_list, final_state_list = self.encoder.forward_chunk_by_chunk(
+            audio, audio_len)
+        eouts = paddle.concat(eouts_chunk_list, axis=1)
+        eouts_len = paddle.add_n(eouts_chunk_len_list)
+
         probs = self.decoder.softmax(eouts)
         return self.decoder.decode_probs(
             probs.numpy(), eouts_len, vocab_list, decoding_method,
             lang_model_path, beam_alpha, beam_beta, beam_size, cutoff_prob,
             cutoff_top_n, num_processes)
 
+    @paddle.no_grad()
+    def decode_prob(self, audio, audio_len):
+        eouts, eouts_len, final_state_list = self.encoder(audio, audio_len)
+        probs = self.decoder.softmax(eouts)
+        return probs, eouts, eouts_len, final_state_list
+
+    @paddle.no_grad()
+    def decode_prob_chunk_by_chunk(self, audio, audio_len):
+        eouts_chunk_list, eouts_chunk_len_list, final_state_list = self.encoder.forward_chunk_by_chunk(
+            audio, audio_len)
+        eouts = paddle.concat(eouts_chunk_list, axis=1)
+        eouts_len = paddle.add_n(eouts_chunk_len_list)
+        probs = self.decoder.softmax(eouts)
+        return probs, eouts, eouts_len, final_state_list
+
     @classmethod
     def from_pretrained(cls, dataloader, config, checkpoint_path):
         """Build a DeepSpeech2Model model from a pretrained model.
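As a usage sketch of the two new probability paths (illustrative driver code, not part of the patch; the constructor arguments mirror the new unit test below), the whole-utterance and chunk-by-chunk passes are expected to end in identical RNN states:

```python
import numpy as np
import paddle

from deepspeech.models.ds2_online import DeepSpeech2ModelOnline

# Build a small model and random input, mirroring the unit test below.
paddle.set_device('cpu')
model = DeepSpeech2ModelOnline(
    feat_size=161, dict_size=10, num_conv_layers=2, num_rnn_layers=3,
    rnn_size=1024, num_fc_layers=2, fc_layers_size_list=[512, 256],
    use_gru=False)
model.eval()

audio = paddle.randn([2, 64, 161], dtype='float32')
audio_len = paddle.to_tensor([64, 64], dtype='int64')

# The whole-utterance and chunk-by-chunk passes should end in identical
# RNN states; the chunked encoder outputs may carry padded tail frames.
probs, eouts, eouts_len, states = model.decode_prob(audio, audio_len)
probs_chk, eouts_chk, eouts_len_chk, states_chk = \
    model.decode_prob_chunk_by_chunk(audio, audio_len)

for state, state_chk in zip(states, states_chk):
    for j in range(2):  # assumes LSTM-style (h, c) state pairs
        np.testing.assert_array_equal(state[j].numpy(), state_chk[j].numpy())
```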
@@ -338,7 +416,14 @@ class DeepSpeech2InferModelOnline(DeepSpeech2ModelOnline):
         Returns:
             probs: probs after softmax
         """
-        eouts, eouts_len, rnn_final_state_list = self.encoder(audio, audio_len)
+        eouts, eouts_len, final_state_list = self.encoder(audio, audio_len)
+        probs = self.decoder.softmax(eouts)
+        return probs
+
+    def forward_chunk_by_chunk(self, audio, audio_len):
+        eouts_chunk_list, eouts_chunk_lens_list, final_state_list = self.encoder.forward_chunk_by_chunk(
+            audio, audio_len)
+        eouts = paddle.concat(eouts_chunk_list, axis=1)
         probs = self.decoder.softmax(eouts)
         return probs
 
@@ -353,11 +438,11 @@ class DeepSpeech2InferModelOnline(DeepSpeech2ModelOnline):
         Returns:
             probs: probs after softmax
         """
-        eouts_chunk, eouts_chunk_lens, rnn_final_state_list = self.encoder(
+        eouts_chunk, eouts_chunk_lens, final_state_list = self.encoder.forward_chunk(
             audio_chunk, audio_chunk_len, init_state_list)
         eouts_chunk_new_prefix = paddle.concat(
             [eouts_chunk_prefix, eouts_chunk], axis=1)
         eouts_chunk_lens_new_prefix = paddle.add(eouts_chunk_lens_prefix,
                                                  eouts_chunk_lens)
         probs_chunk = self.decoder.softmax(eouts_chunk_new_prefix)
-        return probs_chunk, eouts_chunk_new_prefix, eouts_chunk_lens_new_prefix, rnn_final_state_list
+        return probs_chunk, eouts_chunk_new_prefix, eouts_chunk_lens_new_prefix, final_state_list
diff --git a/deepspeech/utils/log.py b/deepspeech/utils/log.py
index e99dacece..065a4c84d 100644
--- a/deepspeech/utils/log.py
+++ b/deepspeech/utils/log.py
@@ -157,9 +157,13 @@ class Autolog:
                  model_precision="fp32"):
         import auto_log
         pid = os.getpid()
-        gpu_id = int(os.environ['CUDA_VISIBLE_DEVICES'].split(',')[0])
-        infer_config = inference.Config()
-        infer_config.enable_use_gpu(100, gpu_id)
+        if os.environ.get('CUDA_VISIBLE_DEVICES', '') != '':
+            gpu_id = int(os.environ['CUDA_VISIBLE_DEVICES'].split(',')[0])
+            infer_config = inference.Config()
+            infer_config.enable_use_gpu(100, gpu_id)
+        else:
+            gpu_id = None
+            infer_config = inference.Config()
         autolog = auto_log.AutoLogger(
             model_name=model_name,
             model_precision=model_precision,
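One detail worth noting in the `log.py` change: indexing `os.environ['CUDA_VISIBLE_DEVICES']` raises `KeyError` whenever the variable is unset, so the branch above reads it with a default instead. A minimal standalone illustration of that lookup:

```python
import os

# Safe GPU-id lookup: unset or empty CUDA_VISIBLE_DEVICES means CPU-only.
visible = os.environ.get('CUDA_VISIBLE_DEVICES', '')
gpu_id = int(visible.split(',')[0]) if visible else None
print(gpu_id)  # 0 for CUDA_VISIBLE_DEVICES="0,1"; None when unset or empty
```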
diff --git a/tests/deepspeech2_online_model_test.py b/tests/deepspeech2_online_model_test.py
new file mode 100644
index 000000000..80547544d
--- /dev/null
+++ b/tests/deepspeech2_online_model_test.py
@@ -0,0 +1,134 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import unittest
+
+import numpy as np
+import paddle
+
+from deepspeech.models.ds2_online import DeepSpeech2ModelOnline
+
+
+class TestDeepSpeech2ModelOnline(unittest.TestCase):
+    def setUp(self):
+        paddle.set_device('cpu')
+
+        self.batch_size = 2
+        self.feat_dim = 161
+        max_len = 64
+
+        # (B, T, D)
+        audio = np.random.randn(self.batch_size, max_len, self.feat_dim)
+        audio_len = np.random.randint(max_len, size=self.batch_size)
+        audio_len[-1] = max_len
+        # (B, U)
+        text = np.array([[1, 2], [1, 2]])
+        text_len = np.array([2] * self.batch_size)
+
+        self.audio = paddle.to_tensor(audio, dtype='float32')
+        self.audio_len = paddle.to_tensor(audio_len, dtype='int64')
+        self.text = paddle.to_tensor(text, dtype='int32')
+        self.text_len = paddle.to_tensor(text_len, dtype='int64')
+
+    def test_ds2_1(self):
+        model = DeepSpeech2ModelOnline(
+            feat_size=self.feat_dim,
+            dict_size=10,
+            num_conv_layers=2,
+            num_rnn_layers=3,
+            rnn_size=1024,
+            num_fc_layers=2,
+            fc_layers_size_list=[512, 256],
+            use_gru=False)
+        loss = model(self.audio, self.audio_len, self.text, self.text_len)
+        self.assertEqual(loss.numel(), 1)
+
+    def test_ds2_2(self):
+        model = DeepSpeech2ModelOnline(
+            feat_size=self.feat_dim,
+            dict_size=10,
+            num_conv_layers=2,
+            num_rnn_layers=3,
+            rnn_size=1024,
+            num_fc_layers=2,
+            fc_layers_size_list=[512, 256],
+            use_gru=True)
+        loss = model(self.audio, self.audio_len, self.text, self.text_len)
+        self.assertEqual(loss.numel(), 1)
+
+    def test_ds2_3(self):
+        model = DeepSpeech2ModelOnline(
+            feat_size=self.feat_dim,
+            dict_size=10,
+            num_conv_layers=2,
+            num_rnn_layers=3,
+            rnn_size=1024,
+            num_fc_layers=2,
+            fc_layers_size_list=[512, 256],
+            use_gru=False)
+        loss = model(self.audio, self.audio_len, self.text, self.text_len)
+        self.assertEqual(loss.numel(), 1)
+
+    def test_ds2_4(self):
+        model = DeepSpeech2ModelOnline(
+            feat_size=self.feat_dim,
+            dict_size=10,
+            num_conv_layers=2,
+            num_rnn_layers=3,
+            rnn_size=1024,
+            num_fc_layers=2,
+            fc_layers_size_list=[512, 256],
+            use_gru=True)
+        loss = model(self.audio, self.audio_len, self.text, self.text_len)
+        self.assertEqual(loss.numel(), 1)
+
+    def test_ds2_5(self):
+        model = DeepSpeech2ModelOnline(
+            feat_size=self.feat_dim,
+            dict_size=10,
+            num_conv_layers=2,
+            num_rnn_layers=3,
+            rnn_size=1024,
+            num_fc_layers=2,
+            fc_layers_size_list=[512, 256],
+            use_gru=False)
+        loss = model(self.audio, self.audio_len, self.text, self.text_len)
+        self.assertEqual(loss.numel(), 1)
+
+    def test_ds2_6(self):
+        model = DeepSpeech2ModelOnline(
+            feat_size=self.feat_dim,
+            dict_size=10,
+            num_conv_layers=2,
+            num_rnn_layers=3,
+            rnn_size=1024,
+            num_fc_layers=2,
+            fc_layers_size_list=[512, 256],
+            use_gru=False)
+        loss = model(self.audio, self.audio_len, self.text, self.text_len)
+        model.eval()
+
+        probs, eouts, eouts_len, final_state_list = model.decode_prob(
+            self.audio, self.audio_len)
+        probs_chk, eouts_chk, eouts_len_chk, final_state_list_chk = model.decode_prob_chunk_by_chunk(
+            self.audio, self.audio_len)
+        for i in range(len(final_state_list)):
+            for j in range(2):
+                self.assertEqual(
+                    np.sum(
+                        np.abs(final_state_list[i][j].numpy() -
+                               final_state_list_chk[i][j].numpy())), 0)
+
+
+if __name__ == '__main__':
+    unittest.main()
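Beyond the exact final-state check in `test_ds2_6`, the encoder outputs themselves can be compared on the valid (unpadded) frames. A sketch of such an assertion (illustrative only, not part of the patch; it assumes the chunked outputs are concatenated in time order, as `decode_prob_chunk_by_chunk` does):

```python
import numpy as np


def assert_eouts_match(eouts, eouts_chk, eouts_len, rtol=1e-5, atol=1e-6):
    """Compare whole-utterance and chunked encoder outputs per sample.

    Illustrative helper: the chunked path may append padded frames after
    the last valid frame, so each sample is truncated to its true length.
    """
    for b, valid in enumerate(eouts_len.numpy().tolist()):
        full = eouts.numpy()[b, :valid]
        chunked = eouts_chk.numpy()[b, :valid]
        np.testing.assert_allclose(full, chunked, rtol=rtol, atol=atol)
```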