增加了chunk_by_chunk，初步测试已通过。(Added chunk_by_chunk support; preliminary tests passed.)

pull/735/head
huangyuxin 4 years ago
parent 2537221b61
commit d398270f95

@ -22,6 +22,8 @@ class Conv2dSubsampling4Online(Conv2dSubsampling4):
def __init__(self, idim: int, odim: int, dropout_rate: float):
    """Online (streaming) variant of Conv2dSubsampling4.

    Args:
        idim (int): input feature dimension
        odim (int): output channel count of the conv layers
        dropout_rate (float): dropout rate (passed to the base class)
    """
    super().__init__(idim, odim, dropout_rate, None)
    # Flattened output dim after two stride-2 convs over the feature axis.
    self.output_dim = ((idim - 1) // 2 - 1) // 2 * odim
    # Input frames consumed to produce the first output frame:
    # stride_1 * (kernel_size_2 - 1) + kernel_size_1
    self.receptive_field_length = 2 * (
        3 - 1) + 3  # stride_1 * (kernel_size_2 - 1) + kernel_size_1
def forward(self, x: paddle.Tensor,
x_len: paddle.Tensor) -> [paddle.Tensor, paddle.Tensor]:

@ -108,7 +108,7 @@ class CRNNEncoder(nn.Layer):
Returns:
x (Tensor): encoder outputs, [B, T_output, D]
x_lens (Tensor): encoder length, [B]
rnn_final_state_list: list of final_states for RNN layers, [num_directions, batch_size, hidden_size] * num_rnn_layers
final_state_list: list of final_states for RNN layers, [num_directions, batch_size, hidden_size] * num_rnn_layers
"""
# [B, T, D]
# convolution group
@ -121,21 +121,21 @@ class CRNNEncoder(nn.Layer):
# remove padding part
init_state = None
rnn_final_state_list = []
final_state_list = []
x, final_state = self.rnn[0](x, init_state, x_lens)
rnn_final_state_list.append(final_state)
final_state_list.append(final_state)
x = self.layernorm_list[0](x)
for i in range(1, self.num_rnn_layers):
x, final_state = self.rnn[i](x, init_state, x_lens) #[B, T, D]
rnn_final_state_list.append(final_state)
final_state_list.append(final_state)
x = self.layernorm_list[i](x)
for i in range(self.num_fc_layers):
x = self.fc_layers_list[i](x)
x = F.relu(x)
return x, x_lens, rnn_final_state_list
return x, x_lens, final_state_list
def forward(self, x, x_lens, init_state_list):
def forward_chunk(self, x, x_lens, init_state_list):
    """Compute encoder outputs for a single chunk, resuming the RNN
    from the states produced by the previous chunk.

    Args:
        x (Tensor): chunk of input features, [B, T_chunk, D]
        x_lens (Tensor): valid input lengths, [B]
        init_state_list: initial states for the RNN layers, one entry per
            layer ([None] * num_rnn_layers for the first chunk)
    Returns:
        x (Tensor): encoder outputs, [B, chunk_size, D]
        x_lens (Tensor): encoder output lengths, [B]
        chunk_final_state_list: list of final states for the RNN layers,
            [num_directions, batch_size, hidden_size] * num_rnn_layers
    """
    # NOTE(review): this span of the diff interleaved old/new lines; this is
    # the reconstructed post-commit method (renamed from `forward`, with
    # `rnn_final_state_list` renamed to `chunk_final_state_list`).
    chunk_final_state_list = []
    # Convolution front-end (subsampling) runs on every chunk.
    x, x_lens = self.conv(x, x_lens)
    x, final_state = self.rnn[0](x, init_state_list[0], x_lens)
    chunk_final_state_list.append(final_state)
    x = self.layernorm_list[0](x)
    for i in range(1, self.num_rnn_layers):
        x, final_state = self.rnn[i](x, init_state_list[i],
                                     x_lens)  # [B, T, D]
        chunk_final_state_list.append(final_state)
        x = self.layernorm_list[i](x)
    for i in range(self.num_fc_layers):
        x = self.fc_layers_list[i](x)
        x = F.relu(x)
    return x, x_lens, chunk_final_state_list
def forward_chunk_by_chunk(self, x, x_lens, decoder_chunk_size=8):
    """Compute encoder outputs chunk by chunk, carrying the RNN state
    across chunks (simulated streaming inference).

    Args:
        x (Tensor): input features, [B, T, D]
        x_lens (Tensor): valid input lengths, [B]
        decoder_chunk_size (int): encoder output frames produced per chunk
    Returns:
        eouts_chunk_list: list of encoder outputs, one tensor per chunk
        eouts_chunk_lens_list: list of output lengths, one tensor per chunk
        chunk_final_state_list: RNN states after the last chunk
    """
    subsampling_rate = self.conv.subsampling_rate
    receptive_field_length = self.conv.receptive_field_length
    # Input frames needed so the conv front-end emits decoder_chunk_size
    # output frames; consecutive chunks overlap by the receptive field.
    chunk_size = (decoder_chunk_size - 1
                  ) * subsampling_rate + receptive_field_length
    chunk_stride = subsampling_rate * decoder_chunk_size
    max_len = x.shape[1]
    assert (chunk_size <= max_len)

    # BUG FIX: pad only the remainder. The original always padded a full
    # extra stride when (max_len - chunk_size) was an exact multiple of
    # chunk_stride, producing a trailing all-padding chunk whose valid
    # length (x_lens - i * chunk_stride) went negative.
    padding_len = (chunk_stride -
                   (max_len - chunk_size) % chunk_stride) % chunk_stride
    if padding_len > 0:
        padding = paddle.zeros(
            (x.shape[0], padding_len, x.shape[2]), dtype=x.dtype)
        x_padded = paddle.concat([x, padding], axis=1)
    else:
        x_padded = x
    # BUG FIX: integer arithmetic instead of float division + int().
    num_chunk = (max_len + padding_len - chunk_size) // chunk_stride + 1

    eouts_chunk_list = []
    eouts_chunk_lens_list = []
    chunk_init_state_list = [None] * self.num_rnn_layers
    for i in range(num_chunk):
        start = i * chunk_stride
        x_chunk = x_padded[:, start:start + chunk_size, :]
        # Valid frames remaining for this chunk, clipped to chunk_size.
        x_len_left = x_lens - i * chunk_stride
        chunk_cap = paddle.ones_like(x_lens) * chunk_size
        x_chunk_lens = paddle.where(x_len_left < chunk_cap, x_len_left,
                                    chunk_cap)
        eouts_chunk, eouts_chunk_lens, chunk_final_state_list = \
            self.forward_chunk(x_chunk, x_chunk_lens,
                               chunk_init_state_list)
        # Thread the final states into the next chunk.
        chunk_init_state_list = chunk_final_state_list
        eouts_chunk_list.append(eouts_chunk)
        eouts_chunk_lens_list.append(eouts_chunk_lens)
    return eouts_chunk_list, eouts_chunk_lens_list, chunk_final_state_list
class DeepSpeech2ModelOnline(nn.Layer):
@ -248,7 +285,7 @@ class DeepSpeech2ModelOnline(nn.Layer):
Returns:
loss (Tenosr): [1]
"""
eouts, eouts_len, rnn_final_state_list = self.encoder(audio, audio_len)
eouts, eouts_len, final_state_list = self.encoder(audio, audio_len)
loss = self.decoder(eouts, eouts_len, text, text_len)
return loss
@ -265,13 +302,54 @@ class DeepSpeech2ModelOnline(nn.Layer):
vocab_list=vocab_list,
decoding_method=decoding_method)
eouts, eouts_len = self.encoder(audio, audio_len)
eouts, eouts_len, final_state_list = self.encoder(audio, audio_len)
probs = self.decoder.softmax(eouts)
return self.decoder.decode_probs(
probs.numpy(), eouts_len, vocab_list, decoding_method,
lang_model_path, beam_alpha, beam_beta, beam_size, cutoff_prob,
cutoff_top_n, num_processes)
@paddle.no_grad()
def decode_chunk_by_chunk(self, audio, audio_len, vocab_list,
                          decoding_method, lang_model_path, beam_alpha,
                          beam_beta, beam_size, cutoff_prob, cutoff_top_n,
                          num_processes):
    """Decode an utterance by running the encoder chunk by chunk, then
    decoding over the stitched-together outputs.

    Returns whatever self.decoder.decode_probs returns (decoded results).
    """
    # One-time decoder initialization; the external decoders only accept
    # strings encoded in utf-8.
    self.decoder.init_decode(
        beam_alpha=beam_alpha,
        beam_beta=beam_beta,
        lang_model_path=lang_model_path,
        vocab_list=vocab_list,
        decoding_method=decoding_method)
    chunk_outs, chunk_out_lens, _final_states = \
        self.encoder.forward_chunk_by_chunk(audio, audio_len)
    # Re-assemble the full encoder output along the time axis; the
    # per-chunk lengths are summed element-wise to recover totals.
    eouts = paddle.concat(chunk_outs, axis=1)
    eouts_len = paddle.add_n(chunk_out_lens)
    probs = self.decoder.softmax(eouts)
    return self.decoder.decode_probs(
        probs.numpy(), eouts_len, vocab_list, decoding_method,
        lang_model_path, beam_alpha, beam_beta, beam_size, cutoff_prob,
        cutoff_top_n, num_processes)
@paddle.no_grad()
def decode_prob(self, audio, audio_len):
    """Run the full (non-streaming) encoder and return softmax posteriors
    along with the raw encoder outputs.

    Returns:
        probs: decoder softmax over encoder outputs
        eouts: encoder outputs
        eouts_len: encoder output lengths
        final_state_list: final RNN states, one entry per layer
    """
    eouts, eouts_len, final_state_list = self.encoder(audio, audio_len)
    return (self.decoder.softmax(eouts), eouts, eouts_len,
            final_state_list)
@paddle.no_grad()
def decode_prob_chunk_by_chunk(self, audio, audio_len):
    """Streaming counterpart of decode_prob: encode chunk by chunk,
    stitch the chunks back together, then apply the decoder softmax.

    Returns:
        probs: decoder softmax over the concatenated encoder outputs
        eouts: concatenated encoder outputs
        eouts_len: total encoder output lengths (element-wise sum of
            the per-chunk lengths)
        final_state_list: RNN states after the last chunk
    """
    chunk_outs, chunk_out_lens, final_state_list = \
        self.encoder.forward_chunk_by_chunk(audio, audio_len)
    eouts = paddle.concat(chunk_outs, axis=1)
    eouts_len = paddle.add_n(chunk_out_lens)
    return (self.decoder.softmax(eouts), eouts, eouts_len,
            final_state_list)
@classmethod
def from_pretrained(cls, dataloader, config, checkpoint_path):
"""Build a DeepSpeech2Model model from a pretrained model.
@ -338,7 +416,14 @@ class DeepSpeech2InferModelOnline(DeepSpeech2ModelOnline):
Returns:
probs: probs after softmax
"""
eouts, eouts_len, rnn_final_state_list = self.encoder(audio, audio_len)
eouts, eouts_len, final_state_list = self.encoder(audio, audio_len)
probs = self.decoder.softmax(eouts)
return probs
def forward_chunk_by_chunk(self, audio, audio_len):
    """Export-friendly streaming forward: encode chunk by chunk and
    return softmax probabilities over the concatenated encoder outputs.

    Args:
        audio (Tensor): input features, [B, T, D]
        audio_len (Tensor): valid input lengths, [B]
    Returns:
        probs: probs after softmax
    """
    # BUG FIX: the original body referenced undefined names
    # `audio_chunk` / `audio_chunk_len` (NameError at runtime);
    # the method's parameters are `audio` / `audio_len`.
    eouts_chunk_list, eouts_chunk_lens_list, final_state_list = \
        self.encoder.forward_chunk_by_chunk(audio, audio_len)
    eouts = paddle.concat(eouts_chunk_list, axis=1)
    probs = self.decoder.softmax(eouts)
    return probs
@ -353,11 +438,11 @@ class DeepSpeech2InferModelOnline(DeepSpeech2ModelOnline):
Returns:
probs: probs after softmax
"""
eouts_chunk, eouts_chunk_lens, rnn_final_state_list = self.encoder(
eouts_chunk, eouts_chunk_lens, final_state_list = self.encoder(
audio_chunk, audio_chunk_len, init_state_list)
eouts_chunk_new_prefix = paddle.concat(
[eouts_chunk_prefix, eouts_chunk], axis=1)
eouts_chunk_lens_new_prefix = paddle.add(eouts_chunk_lens_prefix,
eouts_chunk_lens)
probs_chunk = self.decoder.softmax(eouts_chunk_new_prefix)
return probs_chunk, eouts_chunk_new_prefix, eouts_chunk_lens_new_prefix, rnn_final_state_list
return probs_chunk, eouts_chunk_new_prefix, eouts_chunk_lens_new_prefix, final_state_list

@ -157,9 +157,13 @@ class Autolog:
model_precision="fp32"):
import auto_log
pid = os.getpid()
gpu_id = int(os.environ['CUDA_VISIBLE_DEVICES'].split(',')[0])
infer_config = inference.Config()
infer_config.enable_use_gpu(100, gpu_id)
# Pick the first visible GPU when CUDA_VISIBLE_DEVICES is set and
# non-empty; otherwise fall back to a default (CPU) inference config.
# BUG FIX: use .get() — indexing os.environ raises KeyError when the
# variable is not defined at all.
visible_devices = os.environ.get('CUDA_VISIBLE_DEVICES', '')
infer_config = inference.Config()
if visible_devices != '':
    gpu_id = int(visible_devices.split(',')[0])
    infer_config.enable_use_gpu(100, gpu_id)
else:
    gpu_id = None
autolog = auto_log.AutoLogger(
model_name=model_name,
model_precision=model_precision,

@ -0,0 +1,134 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
import paddle
from deepspeech.models.ds2_online import DeepSpeech2ModelOnline
class TestDeepSpeech2ModelOnline(unittest.TestCase):
    """Tests for DeepSpeech2ModelOnline: loss computation with LSTM/GRU
    cells, and equivalence of whole-utterance vs chunk-by-chunk decoding.

    Refactored: tests 1/3/5 and 2/4 were verbatim duplicates; model
    construction and the loss check are now shared helpers while the
    original test method names are preserved for test discovery.
    """

    def setUp(self):
        paddle.set_device('cpu')
        self.batch_size = 2
        self.feat_dim = 161
        max_len = 64
        # (B, T, D) random features; last sample uses the full length.
        audio = np.random.randn(self.batch_size, max_len, self.feat_dim)
        audio_len = np.random.randint(max_len, size=self.batch_size)
        audio_len[-1] = max_len
        # (B, U) dummy targets
        text = np.array([[1, 2], [1, 2]])
        text_len = np.array([2] * self.batch_size)

        self.audio = paddle.to_tensor(audio, dtype='float32')
        self.audio_len = paddle.to_tensor(audio_len, dtype='int64')
        self.text = paddle.to_tensor(text, dtype='int32')
        self.text_len = paddle.to_tensor(text_len, dtype='int64')

    def _make_model(self, use_gru):
        # Shared model construction; only the RNN cell type varies.
        return DeepSpeech2ModelOnline(
            feat_size=self.feat_dim,
            dict_size=10,
            num_conv_layers=2,
            num_rnn_layers=3,
            rnn_size=1024,
            num_fc_layers=2,
            fc_layers_size_list=[512, 256],
            use_gru=use_gru)

    def _check_loss(self, use_gru):
        # The CTC loss must reduce to a single scalar.
        model = self._make_model(use_gru)
        loss = model(self.audio, self.audio_len, self.text, self.text_len)
        self.assertEqual(loss.numel(), 1)

    def test_ds2_1(self):
        self._check_loss(use_gru=False)

    def test_ds2_2(self):
        self._check_loss(use_gru=True)

    def test_ds2_3(self):
        self._check_loss(use_gru=False)

    def test_ds2_4(self):
        self._check_loss(use_gru=True)

    def test_ds2_5(self):
        self._check_loss(use_gru=False)

    def test_ds2_6(self):
        # Chunk-by-chunk decoding must leave the RNN in exactly the same
        # final states as whole-utterance decoding.
        model = self._make_model(use_gru=False)
        loss = model(self.audio, self.audio_len, self.text, self.text_len)
        model.eval()

        probs, eouts, eouts_len, final_state_list = model.decode_prob(
            self.audio, self.audio_len)
        probs_chk, eouts_chk, eouts_len_chk, final_state_list_chk = \
            model.decode_prob_chunk_by_chunk(self.audio, self.audio_len)
        for state, state_chk in zip(final_state_list,
                                    final_state_list_chk):
            # Each layer's state has two components (presumably h and c
            # for the LSTM case — confirm against the encoder).
            for j in range(2):
                self.assertEqual(
                    np.sum(
                        np.abs(state[j].numpy() - state_chk[j].numpy())),
                    0)
# Run the test suite when this file is executed directly.
if __name__ == '__main__':
    unittest.main()
Loading…
Cancel
Save