parent
6cc80c0aff
commit
4dc75c40c9
@ -0,0 +1,8 @@
|
|||||||
|
[
|
||||||
|
{
|
||||||
|
"type": "shift",
|
||||||
|
"params": {"min_shift_ms": -5,
|
||||||
|
"max_shift_ms": 5},
|
||||||
|
"prob": 1.0
|
||||||
|
}
|
||||||
|
]
|
@ -0,0 +1,39 @@
|
|||||||
|
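# YAML float syntax reference: https://yaml.org/type/float.html (YAML 1.1 floats need a decimal point, so values such as 1e-5 may load as strings unless the loader converts them).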
|
||||||
|
data:
|
||||||
|
train_manifest: data/manifest.tiny
|
||||||
|
dev_manifest: data/manifest.tiny
|
||||||
|
test_manifest: data/manifest.tiny
|
||||||
|
mean_std_filepath: data/mean_std.npz
|
||||||
|
vocab_filepath: data/vocab.txt
|
||||||
|
augmentation_config: conf/augmentation.config
|
||||||
|
batch_size: 4
|
||||||
|
max_duration: 27.0
|
||||||
|
min_duration: 0.0
|
||||||
|
specgram_type: linear
|
||||||
|
target_sample_rate: 16000
|
||||||
|
max_freq: None
|
||||||
|
n_fft: None
|
||||||
|
stride_ms: 10.0
|
||||||
|
window_ms: 20.0
|
||||||
|
use_dB_normalization: True
|
||||||
|
target_dB: -20
|
||||||
|
random_seed: 0
|
||||||
|
keep_transcription_text: False
|
||||||
|
sortagrad: True
|
||||||
|
shuffle_method: batch_shuffle
|
||||||
|
num_workers: 0
|
||||||
|
model:
|
||||||
|
num_conv_layers: 2
|
||||||
|
num_rnn_layers: 3
|
||||||
|
rnn_layer_size: 2048
|
||||||
|
use_gru: False
|
||||||
|
share_rnn_weights: True
|
||||||
|
training:
|
||||||
|
n_epoch: 20
|
||||||
|
lr: 1e-5
|
||||||
|
weight_decay: 1e-06
|
||||||
|
global_grad_clip: 400.0
|
||||||
|
max_iteration: 500000
|
||||||
|
plot_interval: 1000
|
||||||
|
save_interval: 1000
|
||||||
|
valid_interval: 1000
|
@ -0,0 +1,104 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
from network2 import DeepSpeech2
|
||||||
|
import paddle
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Smoke test: build DeepSpeech2 with several RNN configurations and run a
    # single forward call on a small random batch, printing the output shape.

    batch_size = 2
    feat_dim = 161
    max_len = 100

    audio = np.random.randn(batch_size, feat_dim, max_len)
    # Draw lengths from [1, max_len] so no utterance is empty, and pin the
    # last one to max_len so one sample spans the whole last axis of `audio`.
    audio_len = np.random.randint(
        1, max_len + 1, size=batch_size, dtype='int32')
    audio_len[-1] = max_len
    # Same two-token transcript for every sample; scales with batch_size.
    text = np.tile(np.array([1, 2], dtype='int32'), (batch_size, 1))
    text_len = np.full(batch_size, text.shape[1], dtype='int32')

    place = paddle.CUDAPinnedPlace()
    audio = paddle.to_tensor(
        audio, dtype='float32', place=place, stop_gradient=True)
    audio_len = paddle.to_tensor(
        audio_len, dtype='int64', place=place, stop_gradient=True)
    text = paddle.to_tensor(
        text, dtype='int32', place=place, stop_gradient=True)
    text_len = paddle.to_tensor(
        text_len, dtype='int64', place=place, stop_gradient=True)

    print(audio.shape)
    print(audio_len.shape)
    print(text.shape)
    print(text_len.shape)
    print("-----------------")

    # (use_gru, share_rnn_weights) for each run, in the original order.
    # NOTE(review): the last entry repeats the first configuration; it is
    # kept so the script still performs five runs - confirm whether a
    # different variant was intended.
    rnn_variants = [
        (False, False),
        (True, False),
        (False, True),
        (True, True),
        (False, False),
    ]

    for use_gru, share_rnn_weights in rnn_variants:
        model = DeepSpeech2(
            feat_size=feat_dim,
            dict_size=10,
            num_conv_layers=2,
            num_rnn_layers=3,
            rnn_size=1024,
            use_gru=use_gru,
            share_rnn_weights=share_rnn_weights)
        probs = model(audio, text, audio_len, text_len)
        print('probs.shape', probs.shape)
        print("-----------------")
|
Loading…
Reference in new issue