parent
5659bd2386
commit
907c93392f
@ -1,93 +0,0 @@
|
|||||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
|
||||||
#
|
|
||||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
# you may not use this file except in compliance with the License.
|
|
||||||
# You may obtain a copy of the License at
|
|
||||||
#
|
|
||||||
# http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
#
|
|
||||||
# Unless required by applicable law or agreed to in writing, software
|
|
||||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
# See the License for the specific language governing permissions and
|
|
||||||
# limitations under the License.
|
|
||||||
|
|
||||||
import json
|
|
||||||
import math
|
|
||||||
import logging
|
|
||||||
import numpy as np
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
__all__ = ['load_cmvn']
|
|
||||||
|
|
||||||
|
|
||||||
def _load_json_cmvn(json_cmvn_file):
    """ Read accumulated cmvn statistics from a json file and normalise them

    Args:
        json_cmvn_file: path of a json file holding the keys 'mean_stat',
            'var_stat' and 'frame_num'
    Returns:
        a numpy array built from [means, scales], where each mean is
        mean_stat / frame_num and each scale is 1 / sqrt(variance), the
        variance being floored at 1.0e-20 before the square root
    """
    with open(json_cmvn_file) as stats_fp:
        stats = json.load(stats_fp)

    mean_acc = stats['mean_stat']
    sq_acc = stats['var_stat']
    frame_count = stats['frame_num']

    # Both lists are overwritten in place, one feature dimension at a time.
    for dim, total in enumerate(mean_acc):
        mu = total / frame_count
        mean_acc[dim] = mu
        # E[x^2] - E[x]^2, floored so the reciprocal square root stays finite.
        floored = max(sq_acc[dim] / frame_count - mu * mu, 1.0e-20)
        sq_acc[dim] = 1.0 / math.sqrt(floored)

    return np.array([mean_acc, sq_acc])
|
|
||||||
|
|
||||||
|
|
||||||
def _load_kaldi_cmvn(kaldi_cmvn_file):
    """ Load the kaldi format cmvn stats file and calculate cmvn

    Args:
        kaldi_cmvn_file: kaldi text style global cmvn file, which
            is generated by:
            compute-cmvn-stats --binary=false scp:feats.scp global_cmvn
    Returns:
        a numpy array of [means, scales], where each scale is
        1 / sqrt(variance) with the variance floored at 1.0e-20
    Raises:
        SystemExit: (exit code 1) when the file starts with the kaldi
            binary marker, since only the text format is supported
        AssertionError: when the token layout is not '[ ... 0 ]'
    """
    # `sys` is not among the imports visible at the top of this file, so it is
    # imported here; without it the binary-file branch died with NameError.
    import sys

    # Sniff the marker in binary mode: decoding an actual binary matrix as
    # text can fail before the two header characters are ever compared.
    with open(kaldi_cmvn_file, 'rb') as fid:
        # kaldi binary file start with '\0B'
        is_binary = fid.read(2) == b'\0B'
    if is_binary:
        logger.error('kaldi cmvn binary file is not supported, please '
                     'recompute it by: compute-cmvn-stats --binary=false '
                     ' scp:feats.scp global_cmvn')
        sys.exit(1)

    with open(kaldi_cmvn_file, 'r') as fid:
        arr = fid.read().split()

    # Expected token layout: '[' sums..., count, squared sums..., '0', ']'
    assert (arr[0] == '[')
    assert (arr[-2] == '0')
    assert (arr[-1] == ']')
    feat_dim = int((len(arr) - 2 - 2) / 2)

    means = [float(tok) for tok in arr[1:feat_dim + 1]]
    count = float(arr[feat_dim + 1])
    variance = [float(tok) for tok in arr[feat_dim + 2:2 * feat_dim + 2]]

    for i in range(len(means)):
        means[i] /= count
        # E[x^2] - E[x]^2, floored so the reciprocal square root stays finite.
        variance[i] = variance[i] / count - means[i] * means[i]
        if variance[i] < 1.0e-20:
            variance[i] = 1.0e-20
        variance[i] = 1.0 / math.sqrt(variance[i])
    cmvn = np.array([means, variance])
    return cmvn
|
|
||||||
|
|
||||||
|
|
||||||
def load_cmvn(cmvn_file, is_json):
    """ Load cmvn statistics from a json or a kaldi text stats file

    Args:
        cmvn_file: path of the cmvn stats file
        is_json: when truthy the file is parsed by _load_json_cmvn,
            otherwise by _load_kaldi_cmvn
    Returns:
        a tuple of the first and the second row of the loaded cmvn array
    """
    loader = _load_json_cmvn if is_json else _load_kaldi_cmvn
    stats = loader(cmvn_file)
    return stats[0], stats[1]
|
|
@ -1 +1,2 @@
|
|||||||
* s0 for deepspeech2
|
* s0 for deepspeech2
|
||||||
|
* s1 for U2
|
||||||
|
@ -0,0 +1,105 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
import paddle
|
||||||
|
import numpy as np
|
||||||
|
import unittest
|
||||||
|
from deepspeech.models.deepspeech2 import DeepSpeech2Model
|
||||||
|
|
||||||
|
|
||||||
|
class TestDeepSpeech2Model(unittest.TestCase):
    """Forward-pass smoke tests for DeepSpeech2Model on random CPU input.

    Each test builds the model with one (use_gru, share_rnn_weights)
    combination and asserts that the value returned by the forward call
    holds exactly one element.
    """

    def setUp(self):
        paddle.set_device('cpu')

        self.batch_size = 2
        self.feat_dim = 161
        num_frames = 64

        # Random features laid out as (B, T, D).
        feats = np.random.randn(self.batch_size, num_frames, self.feat_dim)
        feat_lens = np.random.randint(
            num_frames, size=self.batch_size, dtype='int32')
        # The last utterance always spans the full padded length.
        feat_lens[-1] = num_frames
        # Token ids laid out as (B, U).
        tokens = np.array([[1, 2], [1, 2]], dtype='int32')
        token_lens = np.array([2] * self.batch_size, dtype='int32')

        self.audio = paddle.to_tensor(feats, dtype='float32')
        self.audio_len = paddle.to_tensor(feat_lens, dtype='int64')
        self.text = paddle.to_tensor(tokens, dtype='int32')
        self.text_len = paddle.to_tensor(token_lens, dtype='int64')

    def _assert_single_element_loss(self, use_gru, share_rnn_weights):
        """Build one model variant, run it once and check the result size."""
        model = DeepSpeech2Model(
            feat_size=self.feat_dim,
            dict_size=10,
            num_conv_layers=2,
            num_rnn_layers=3,
            rnn_size=1024,
            use_gru=use_gru,
            share_rnn_weights=share_rnn_weights, )
        loss = model(self.audio, self.audio_len, self.text, self.text_len)
        self.assertEqual(loss.numel(), 1)

    def test_ds2_1(self):
        self._assert_single_element_loss(
            use_gru=False, share_rnn_weights=False)

    def test_ds2_2(self):
        self._assert_single_element_loss(
            use_gru=True, share_rnn_weights=False)

    def test_ds2_3(self):
        self._assert_single_element_loss(
            use_gru=False, share_rnn_weights=True)

    def test_ds2_4(self):
        self._assert_single_element_loss(use_gru=True, share_rnn_weights=True)

    def test_ds2_5(self):
        # Same configuration as test_ds2_1; kept so the suite is unchanged.
        self._assert_single_element_loss(
            use_gru=False, share_rnn_weights=False)


if __name__ == '__main__':
    unittest.main()
|
@ -0,0 +1,39 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
import paddle
|
||||||
|
import numpy as np
|
||||||
|
import unittest
|
||||||
|
from deepspeech.modules.mask import sequence_mask
|
||||||
|
from deepspeech.modules.mask import make_non_pad_mask
|
||||||
|
|
||||||
|
|
||||||
|
class TestU2Model(unittest.TestCase):
    """Checks the mask helpers against a hand-written 3 x 5 mask.

    NOTE(review): despite the class name, only sequence_mask and
    make_non_pad_mask are exercised here -- confirm the name is intended.
    """

    def setUp(self):
        paddle.set_device('cpu')
        self.lengths = paddle.to_tensor([5, 3, 2])
        # Row i carries lengths[i] leading ones followed by zeros.
        expected_rows = [
            [1, 1, 1, 1, 1],
            [1, 1, 1, 0, 0],
            [1, 1, 0, 0, 0],
        ]
        self.masks = np.array(expected_rows)

    def _assert_mask_matches(self, mask_fn):
        """Apply mask_fn to the stored lengths and compare with self.masks."""
        produced = mask_fn(self.lengths).numpy().tolist()
        self.assertSequenceEqual(produced, self.masks.tolist())

    def test_sequence_mask(self):
        self._assert_mask_matches(sequence_mask)

    def test_make_non_pad_mask(self):
        self._assert_mask_matches(make_non_pad_mask)


if __name__ == '__main__':
    unittest.main()
|
@ -1,99 +0,0 @@
|
|||||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
|
||||||
#
|
|
||||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
# you may not use this file except in compliance with the License.
|
|
||||||
# You may obtain a copy of the License at
|
|
||||||
#
|
|
||||||
# http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
#
|
|
||||||
# Unless required by applicable law or agreed to in writing, software
|
|
||||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
# See the License for the specific language governing permissions and
|
|
||||||
# limitations under the License.
|
|
||||||
|
|
||||||
import paddle
|
|
||||||
import numpy as np
|
|
||||||
|
|
||||||
from deepspeech.models.deepspeech2 import DeepSpeech2Model
|
|
||||||
|
|
||||||
if __name__ == '__main__':
    # Manual smoke run: feed one random batch through several
    # DeepSpeech2Model configurations and print the shape of `probs`.
    separator = "-----------------"

    batch_size = 2
    feat_dim = 161
    max_len = 100

    raw_audio = np.random.randn(batch_size, feat_dim, max_len)
    raw_audio_len = np.random.randint(100, size=batch_size, dtype='int32')
    # The last utterance always spans the full padded length.
    raw_audio_len[-1] = 100
    raw_text = np.array([[1, 2], [1, 2]], dtype='int32')
    raw_text_len = np.array([2] * batch_size, dtype='int32')

    audio = paddle.to_tensor(raw_audio, dtype='float32')
    audio_len = paddle.to_tensor(raw_audio_len, dtype='int64')
    text = paddle.to_tensor(raw_text, dtype='int32')
    text_len = paddle.to_tensor(raw_text_len, dtype='int64')

    for tensor in (audio, audio_len, text, text_len):
        print(tensor.shape)
    print(separator)

    # (use_gru, share_rnn_weights) for each run, in the original order; the
    # last entry repeats the first one.
    rnn_variants = (
        (False, False),
        (True, False),
        (False, True),
        (True, True),
        (False, False),
    )
    for use_gru, share_rnn_weights in rnn_variants:
        model = DeepSpeech2Model(
            feat_size=feat_dim,
            dict_size=10,
            num_conv_layers=2,
            num_rnn_layers=3,
            rnn_size=1024,
            use_gru=use_gru,
            share_rnn_weights=share_rnn_weights, )
        logits, probs, logits_len = model(audio, audio_len, text, text_len)
        print('probs.shape', probs.shape)
        print(separator)
|
|
@ -0,0 +1,60 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
import paddle
|
||||||
|
import numpy as np
|
||||||
|
import unittest
|
||||||
|
from deepspeech.models.u2 import U2TransformerModel
|
||||||
|
from deepspeech.models.u2 import U2ConformerModel
|
||||||
|
|
||||||
|
|
||||||
|
class TestU2Model(unittest.TestCase):
    """Smoke test that runs one random batch through a speech model.

    NOTE(review): this file imports U2TransformerModel and U2ConformerModel
    but the only test still builds DeepSpeech2Model -- confirm which model
    this suite is meant to cover.
    """

    def setUp(self):
        batch_size = 2
        # Kept on the instance because the test method needs it to build the
        # model; as a setUp local it was unreachable there (NameError).
        self.feat_dim = 161
        max_len = 100
        audio = np.random.randn(batch_size, self.feat_dim, max_len)
        audio_len = np.random.randint(100, size=batch_size, dtype='int32')
        # The last utterance always spans the full padded length.
        audio_len[-1] = 100
        text = np.array([[1, 2], [1, 2]], dtype='int32')
        text_len = np.array([2] * batch_size, dtype='int32')

        self.audio = paddle.to_tensor(audio, dtype='float32')
        self.audio_len = paddle.to_tensor(audio_len, dtype='int64')
        self.text = paddle.to_tensor(text, dtype='int32')
        self.text_len = paddle.to_tensor(text_len, dtype='int64')

        print(audio.shape)
        print(audio_len.shape)
        print(text.shape)
        print(text_len.shape)
        print("-----------------")

    def test_ds2_1(self):
        # DeepSpeech2Model is not among this file's imports, so it is pulled
        # in here; the bare name raised NameError before.
        from deepspeech.models.deepspeech2 import DeepSpeech2Model

        model = DeepSpeech2Model(
            feat_size=self.feat_dim,
            dict_size=10,
            num_conv_layers=2,
            num_rnn_layers=3,
            rnn_size=1024,
            use_gru=False,
            share_rnn_weights=False, )
        # NOTE(review): assumes the forward call returns three values --
        # confirm against the current DeepSpeech2Model.forward.
        logits, probs, logits_len = model(self.audio, self.audio_len,
                                          self.text, self.text_len)
        print('probs.shape', probs.shape)
        print("-----------------")


if __name__ == '__main__':
    unittest.main()
|
Loading…
Reference in new issue