add u2 config

pull/578/head
Hui Zhang 5 years ago
parent 9626e99ce4
commit 2fa6bbbed5

@@ -1,21 +0,0 @@
#! /usr/bin/env bash
. ${MAIN_ROOT}/utils/utility.sh
DIR=data/pretrain
mkdir -p ${DIR}
URL='https://deepspeech.bj.bcebos.com/eng_models/librispeech_model_fluid.tar.gz'
MD5=fafb11fe57c3ecd107147056453f5348
TARGET=${DIR}/librispeech_model_fluid.tar.gz
echo "Download LibriSpeech model ..."
download $URL $MD5 $TARGET
if [ $? -ne 0 ]; then
echo "Fail to download LibriSpeech model!"
exit 1
fi
tar -zxvf $TARGET -C ${DIR}
exit 0
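
The removed script follows a common download-verify-extract pattern via the `download` helper defined in utils/utility.sh (not shown in this diff). A minimal Python sketch of the same pattern, with an illustrative helper of the same name:

# Hedged Python sketch of the download-verify-extract pattern the removed
# script used; the real `download` is a shell helper in utils/utility.sh,
# so this function is illustrative.
import hashlib
import os
import tarfile
import urllib.request

def download(url: str, md5: str, target: str) -> bool:
    """Fetch url to target and verify the file's MD5 checksum."""
    os.makedirs(os.path.dirname(target), exist_ok=True)
    urllib.request.urlretrieve(url, target)
    with open(target, 'rb') as f:
        return hashlib.md5(f.read()).hexdigest() == md5

URL = 'https://deepspeech.bj.bcebos.com/eng_models/librispeech_model_fluid.tar.gz'
TARGET = 'data/pretrain/librispeech_model_fluid.tar.gz'
if download(URL, 'fafb11fe57c3ecd107147056453f5348', TARGET):
    with tarfile.open(TARGET) as tar:
        tar.extractall('data/pretrain')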

@@ -0,0 +1,90 @@
# network architecture
# encoder related
encoder: conformer
encoder_conf:
output_size: 256 # dimension of attention
attention_heads: 4
linear_units: 2048 # the number of units of position-wise feed forward
num_blocks: 12 # the number of encoder blocks
dropout_rate: 0.1
positional_dropout_rate: 0.1
attention_dropout_rate: 0.0
input_layer: conv2d # encoder input type; you can choose conv2d, conv2d6 or conv2d8
normalize_before: true
cnn_module_kernel: 15
use_cnn_module: True
activation_type: 'swish'
pos_enc_layer_type: 'rel_pos'
selfattention_layer_type: 'rel_selfattn'
causal: true
use_dynamic_chunk: true
cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes the model converge faster
use_dynamic_left_chunk: false
# decoder related
decoder: transformer
decoder_conf:
attention_heads: 4
linear_units: 2048
num_blocks: 6
dropout_rate: 0.1
positional_dropout_rate: 0.1
self_attention_dropout_rate: 0.0
src_attention_dropout_rate: 0.0
# hybrid CTC/attention
model_conf:
ctc_weight: 0.3
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false
# use raw_wav or kaldi feature
raw_wav: true
# feature extraction
collate_conf:
# waveform level config
wav_distortion_conf:
wav_dither: 1.0
wav_distortion_rate: 0.0
distortion_methods: []
speed_perturb: true
feature_extraction_conf:
feature_type: 'fbank'
mel_bins: 80
frame_shift: 10
frame_length: 25
using_pitch: false
# spec level config
# spec_swap: false
feature_dither: 0.0 # add dither in [-feature_dither, feature_dither] to the fbank features
spec_aug: true
spec_aug_conf:
warp_for_time: False
num_t_mask: 2
num_f_mask: 2
max_t: 50
max_f: 10
max_w: 80
# dataset related
dataset_conf:
max_length: 40960
min_length: 0
batch_type: 'static' # static or dynamic
# batch_size should be set according to your GPU memory; here we used a 2080 Ti GPU with 11 GB of memory
batch_size: 16
sort: true
grad_clip: 5
accum_grad: 1
max_epoch: 180
log_interval: 100
optim: adam
optim_conf:
lr: 0.001
scheduler: warmuplr # pytorch v1.1.0+ required
scheduler_conf:
warmup_steps: 25000
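
The `model_conf` block configures hybrid CTC/attention training: the total loss is a convex combination of the CTC and attention losses weighted by `ctc_weight`. A hedged sketch of that convention (U2-style; the repo's own loss code is not part of this diff):

# Hedged sketch of the hybrid CTC/attention objective that model_conf
# configures; illustrative, not the repo's own code.
def hybrid_loss(loss_ctc, loss_att, ctc_weight=0.3):
    # total loss = w * L_ctc + (1 - w) * L_attention, with w = ctc_weight
    return ctc_weight * loss_ctc + (1.0 - ctc_weight) * loss_att

print(hybrid_loss(2.0, 1.0))  # 0.3 * 2.0 + 0.7 * 1.0 = 1.3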

@@ -0,0 +1,83 @@
# network architecture
# encoder related
encoder: transformer
encoder_conf:
output_size: 256 # dimension of attention
attention_heads: 4
linear_units: 2048 # the number of units of position-wise feed forward
num_blocks: 12 # the number of encoder blocks
dropout_rate: 0.1
positional_dropout_rate: 0.1
attention_dropout_rate: 0.0
input_layer: conv2d # encoder input type
normalize_before: true
use_dynamic_chunk: true
use_dynamic_left_chunk: false
# decoder related
decoder: transformer
decoder_conf:
attention_heads: 4
linear_units: 2048
num_blocks: 6
dropout_rate: 0.1
positional_dropout_rate: 0.1
self_attention_dropout_rate: 0.0
src_attention_dropout_rate: 0.0
# hybrid CTC/attention
model_conf:
ctc_weight: 0.3
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false
# use raw_wav or kaldi feature
raw_wav: true
# feature extraction
collate_conf:
# waveform level config
wav_distortion_conf:
wav_dither: 0.0
wav_distortion_rate: 0.0
distortion_methods: []
speed_perturb: false
feature_extraction_conf:
feature_type: 'fbank'
mel_bins: 80
frame_shift: 10
frame_length: 25
using_pitch: false
# spec level config
# spec_swap: false
feature_dither: 0.0 # add dither in [-feature_dither, feature_dither] to the fbank features
spec_aug: true
spec_aug_conf:
warp_for_time: False
num_t_mask: 2
num_f_mask: 2
max_t: 50
max_f: 10
max_w: 80
# dataset related
dataset_conf:
max_length: 40960
min_length: 0
batch_type: 'static' # static or dynamic
# batch_size should be set according to your GPU memory; here we used a 2080 Ti GPU with 11 GB of memory
batch_size: 16
sort: true
grad_clip: 5
accum_grad: 1
max_epoch: 180
log_interval: 100
optim: adam
optim_conf:
lr: 0.002
scheduler: warmuplr # pytorch v1.1.0+ required
scheduler_conf:
warmup_steps: 25000
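
Both streaming configs set `use_dynamic_chunk: true`: during training, the self-attention mask restricts each frame to frames up to the end of its own chunk (with `causal: true`, no right context), and the chunk size is sampled per batch so one model can decode at many latencies. A hedged numpy sketch of the underlying chunk mask (illustrative, not the repo's implementation):

# Hedged sketch of the chunk-based attention mask implied by use_dynamic_chunk
# (U2/WeNet style); in training, chunk_size would be sampled per batch.
import numpy as np

def subsequent_chunk_mask(size: int, chunk_size: int) -> np.ndarray:
    """mask[i, j] is True when frame i may attend to frame j: each frame
    sees everything up to the end of its own chunk, and nothing beyond."""
    mask = np.zeros((size, size), dtype=bool)
    for i in range(size):
        mask[i, :min((i // chunk_size + 1) * chunk_size, size)] = True
    return mask

print(subsequent_chunk_mask(6, 2).astype(int))
# [[1 1 0 0 0 0]
#  [1 1 0 0 0 0]
#  [1 1 1 1 0 0]
#  [1 1 1 1 0 0]
#  [1 1 1 1 1 1]
#  [1 1 1 1 1 1]]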

@@ -0,0 +1,86 @@
# network architecture
# encoder related
encoder: conformer
encoder_conf:
output_size: 256 # dimension of attention
attention_heads: 4
linear_units: 2048 # the number of units of position-wise feed forward
num_blocks: 12 # the number of encoder blocks
dropout_rate: 0.1
positional_dropout_rate: 0.1
attention_dropout_rate: 0.0
input_layer: conv2d # encoder input type; you can choose conv2d, conv2d6 or conv2d8
normalize_before: true
cnn_module_kernel: 15
use_cnn_module: True
activation_type: 'swish'
pos_enc_layer_type: 'rel_pos'
selfattention_layer_type: 'rel_selfattn'
# decoder related
decoder: transformer
decoder_conf:
attention_heads: 4
linear_units: 2048
num_blocks: 6
dropout_rate: 0.1
positional_dropout_rate: 0.1
self_attention_dropout_rate: 0.0
src_attention_dropout_rate: 0.0
# hybrid CTC/attention
model_conf:
ctc_weight: 0.3
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false
# use raw_wav or kaldi feature
raw_wav: true
# feature extraction
collate_conf:
# waveform level config
wav_distortion_conf:
wav_dither: 0.1
wav_distortion_rate: 0.0
distortion_methods: []
speed_perturb: true
feature_extraction_conf:
feature_type: 'fbank'
mel_bins: 80
frame_shift: 10
frame_length: 25
using_pitch: false
# spec level config
# spec_swap: false
feature_dither: 0.0 # add dither in [-feature_dither, feature_dither] to the fbank features
spec_aug: true
spec_aug_conf:
warp_for_time: False
num_t_mask: 2
num_f_mask: 2
max_t: 50
max_f: 10
max_w: 80
# dataset related
dataset_conf:
max_length: 40960
min_length: 0
batch_type: 'static' # static or dynamic
# batch_size should be set according to your GPU memory; here we used a 2080 Ti GPU with 11 GB of memory
batch_size: 16
sort: true
grad_clip: 5
accum_grad: 4
max_epoch: 240
log_interval: 100
optim: adam
optim_conf:
lr: 0.002
scheduler: warmuplr # pytorch v1.1.0+ required
scheduler_conf:
warmup_steps: 25000
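
This non-streaming variant updates once every `accum_grad: 4` micro-batches, so the effective batch per update is 16 × 4 = 64 per GPU, and `grad_clip: 5` bounds the global gradient norm. A hedged Paddle sketch of that loop, with a toy model and data standing in for the repo's trainer:

# Hedged sketch of gradient accumulation and clipping as configured by
# accum_grad: 4 and grad_clip: 5; not the repo's actual training loop.
import paddle

accum_grad = 4
model = paddle.nn.Linear(8, 1)
optimizer = paddle.optimizer.Adam(
    learning_rate=0.002,
    parameters=model.parameters(),
    grad_clip=paddle.nn.ClipGradByGlobalNorm(5.0))  # grad_clip: 5

for step in range(8):                    # stands in for the data loader
    x = paddle.randn([16, 8])            # batch_size: 16 from the config
    loss = model(x).mean() / accum_grad  # scale so 4 micro-batches equal 1 big batch
    loss.backward()                      # gradients accumulate until cleared
    if (step + 1) % accum_grad == 0:     # one optimizer step per 4 micro-batches
        optimizer.step()
        optimizer.clear_grad()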

@@ -0,0 +1,80 @@
# network architecture
# encoder related
encoder: transformer
encoder_conf:
output_size: 256 # dimension of attention
attention_heads: 4
linear_units: 2048 # the number of units of position-wise feed forward
num_blocks: 12 # the number of encoder blocks
dropout_rate: 0.1
positional_dropout_rate: 0.1
attention_dropout_rate: 0.0
input_layer: conv2d # encoder input type
normalize_before: true
# decoder related
decoder: transformer
decoder_conf:
attention_heads: 4
linear_units: 2048
num_blocks: 6
dropout_rate: 0.1
positional_dropout_rate: 0.1
self_attention_dropout_rate: 0.0
src_attention_dropout_rate: 0.0
# hybrid CTC/attention
model_conf:
ctc_weight: 0.3
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false
# use raw_wav or kaldi feature
raw_wav: true
# feature extraction
collate_conf:
# waveform level config
wav_distortion_conf:
wav_dither: 0.1
wav_distortion_rate: 0.0
distortion_methods: []
speed_perturb: true
feature_extraction_conf:
feature_type: 'fbank'
mel_bins: 80
frame_shift: 10
frame_length: 25
using_pitch: false
# spec level config
feature_dither: 0.0 # add dither in [-feature_dither, feature_dither] to the fbank features
spec_aug: true
spec_aug_conf:
warp_for_time: False
num_t_mask: 2
num_f_mask: 2
max_t: 50
max_f: 10
max_w: 80
# dataset related
dataset_conf:
max_length: 40960
min_length: 0
batch_type: 'static' # static or dynamic
# batch_size should be set according to your GPU memory; here we used a 2080 Ti GPU with 11 GB of memory
batch_size: 26
sort: true
grad_clip: 5
accum_grad: 1
max_epoch: 240
log_interval: 100
optim: adam
optim_conf:
lr: 0.002
scheduler: warmuplr # pytorch v1.1.0+ required
scheduler_conf:
warmup_steps: 25000
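
`scheduler: warmuplr` in recipes of this lineage is conventionally the Noam-style schedule: the learning rate ramps up linearly for `warmup_steps` steps, peaks at the base rate, then decays with the inverse square root of the step. A hedged sketch under that assumption (the scheduler class itself is not in this diff):

# Hedged sketch of the standard WarmupLR rule assumed by scheduler: warmuplr:
#   lr(step) = base_lr * warmup_steps**0.5 * min(step**-0.5, step * warmup_steps**-1.5)
def warmup_lr(step: int, base_lr: float = 0.002, warmup_steps: int = 25000) -> float:
    step = max(step, 1)  # avoid 0 ** -0.5 at the very first step
    return base_lr * warmup_steps ** 0.5 * min(step ** -0.5, step * warmup_steps ** -1.5)

for s in (1, 12500, 25000, 100000):
    print(s, f'{warmup_lr(s):.2e}')
# ramps linearly to base_lr (0.002) at step 25000, then decays as step ** -0.5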

@@ -0,0 +1 @@
../../s0/local/data.sh

@@ -0,0 +1 @@
../../s0/local/download_lm_en.sh

@@ -0,0 +1,14 @@
export MAIN_ROOT=${PWD}/../../../
export PATH=${MAIN_ROOT}:${PWD}/tools:${PATH}
export LC_ALL=C
# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib/
MODEL=u2
export BIN_DIR=${MAIN_ROOT}/deepspeech/exps/${MODEL}/bin

@@ -0,0 +1,16 @@
#!/bin/bash
set -e
source path.sh
# prepare data
bash ./local/data.sh
# train model
bash ./local/train.sh
# test model
bash ./local/test.sh
# infer model
bash ./local/infer.sh

@@ -13,8 +13,11 @@
# limitations under the License.
import paddle
import unittest
import numpy as np
from yacs.config import CfgNode as CN
from deepspeech.models.u2 import U2TransformerModel
from deepspeech.models.u2 import U2ConformerModel
@@ -41,9 +44,82 @@ class TestU2Model(unittest.TestCase):
self.text_len = paddle.to_tensor(text_len, dtype='int64')
def test_transformer(self):
conf_str = """
# network architecture
# encoder related
encoder: transformer
encoder_conf:
output_size: 256 # dimension of attention
attention_heads: 4
linear_units: 2048 # the number of units of position-wise feed forward
num_blocks: 12 # the number of encoder blocks
dropout_rate: 0.1
positional_dropout_rate: 0.1
attention_dropout_rate: 0.0
input_layer: conv2d # encoder input type
normalize_before: true
# decoder related
decoder: transformer
decoder_conf:
attention_heads: 4
linear_units: 2048
num_blocks: 6
dropout_rate: 0.1
positional_dropout_rate: 0.1
self_attention_dropout_rate: 0.0
src_attention_dropout_rate: 0.0
# hybrid CTC/attention
model_conf:
ctc_weight: 0.3
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false
"""
cfg = CN().load_cfg(conf_str)
print(cfg)
model = U2TransformerModel()
def test_conformer(self):
conf_str = """
# network architecture
# encoder related
encoder: conformer
encoder_conf:
output_size: 256 # dimension of attention
attention_heads: 4
linear_units: 2048 # the number of units of position-wise feed forward
num_blocks: 12 # the number of encoder blocks
dropout_rate: 0.1
positional_dropout_rate: 0.1
attention_dropout_rate: 0.0
input_layer: conv2d # encoder input type; you can choose conv2d, conv2d6 or conv2d8
normalize_before: true
cnn_module_kernel: 15
use_cnn_module: True
activation_type: 'swish'
pos_enc_layer_type: 'rel_pos'
selfattention_layer_type: 'rel_selfattn'
# decoder related
decoder: transformer
decoder_conf:
attention_heads: 4
linear_units: 2048
num_blocks: 6
dropout_rate: 0.1
positional_dropout_rate: 0.1
self_attention_dropout_rate: 0.0
src_attention_dropout_rate: 0.0
# hybrid CTC/attention
model_conf:
ctc_weight: 0.3
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false
"""
cfg = CN().load_cfg(conf_str)
print(cfg)
model = U2ConformerModel()
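
The same `load_cfg` call the tests use can also read the YAML files added above from disk. A brief sketch (the path below is an assumption, since file names are not shown in these hunks):

# Hedged sketch: load_cfg accepts a file object as well as an inline string;
# 'conf/conformer.yaml' is an assumed path, not one confirmed by this diff.
from yacs.config import CfgNode as CN

with open('conf/conformer.yaml') as f:
    cfg = CN().load_cfg(f)

print(cfg.encoder)                   # conformer
print(cfg.encoder_conf.num_blocks)   # 12
print(cfg.model_conf.ctc_weight)     # 0.3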
