parent
9626e99ce4
commit
2fa6bbbed5
@ -1,21 +0,0 @@
|
||||
#!/usr/bin/env bash
# Download the pretrained LibriSpeech model archive and unpack it into
# data/pretrain.
#
# Requires:
#   MAIN_ROOT - directory containing utils/utility.sh. That file is sourced
#               below and presumably defines the `download` function used
#               here (verify against utils/utility.sh).
# Exit status: 0 on success, non-zero if the setup, the download or the
#              extraction fails.

# Fail early with a clear message instead of trying to source
# "/utils/utility.sh" when MAIN_ROOT is unset or empty.
: "${MAIN_ROOT:?MAIN_ROOT must be set}"

. "${MAIN_ROOT}/utils/utility.sh" || exit 1

DIR=data/pretrain
mkdir -p "${DIR}" || exit 1

URL='https://deepspeech.bj.bcebos.com/eng_models/librispeech_model_fluid.tar.gz'
MD5=fafb11fe57c3ecd107147056453f5348
TARGET="${DIR}/librispeech_model_fluid.tar.gz"

echo "Download LibriSpeech model ..."
# Test the command directly rather than inspecting $? afterwards.
if ! download "${URL}" "${MD5}" "${TARGET}"; then
  echo "Fail to download LibriSpeech model!"
  exit 1
fi

# A failed extraction must not be reported as success.
if ! tar -zxvf "${TARGET}" -C "${DIR}"; then
  echo "Fail to extract LibriSpeech model!" >&2
  exit 1
fi

exit 0
|
@ -0,0 +1,90 @@
|
||||
# network architecture
|
||||
# encoder related
|
||||
encoder: conformer
|
||||
encoder_conf:
|
||||
output_size: 256 # dimension of attention
|
||||
attention_heads: 4
|
||||
linear_units: 2048 # the number of units of position-wise feed forward
|
||||
num_blocks: 12 # the number of encoder blocks
|
||||
dropout_rate: 0.1
|
||||
positional_dropout_rate: 0.1
|
||||
attention_dropout_rate: 0.0
|
||||
input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 or conv2d8
|
||||
normalize_before: true
|
||||
cnn_module_kernel: 15
|
||||
use_cnn_module: True
|
||||
activation_type: 'swish'
|
||||
pos_enc_layer_type: 'rel_pos'
|
||||
selfattention_layer_type: 'rel_selfattn'
|
||||
causal: true
|
||||
use_dynamic_chunk: true
|
||||
cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster
|
||||
use_dynamic_left_chunk: false
|
||||
|
||||
# decoder related
|
||||
decoder: transformer
|
||||
decoder_conf:
|
||||
attention_heads: 4
|
||||
linear_units: 2048
|
||||
num_blocks: 6
|
||||
dropout_rate: 0.1
|
||||
positional_dropout_rate: 0.1
|
||||
self_attention_dropout_rate: 0.0
|
||||
src_attention_dropout_rate: 0.0
|
||||
|
||||
# hybrid CTC/attention
|
||||
model_conf:
|
||||
ctc_weight: 0.3
|
||||
lsm_weight: 0.1 # label smoothing option
|
||||
length_normalized_loss: false
|
||||
|
||||
|
||||
# use raw_wav or kaldi feature
|
||||
raw_wav: true
|
||||
|
||||
# feature extraction
|
||||
collate_conf:
|
||||
# waveform level config
|
||||
wav_distortion_conf:
|
||||
wav_dither: 1.0
|
||||
wav_distortion_rate: 0.0
|
||||
distortion_methods: []
|
||||
speed_perturb: true
|
||||
feature_extraction_conf:
|
||||
feature_type: 'fbank'
|
||||
mel_bins: 80
|
||||
frame_shift: 10
|
||||
frame_length: 25
|
||||
using_pitch: false
|
||||
# spec level config
|
||||
# spec_swap: false
|
||||
feature_dither: 0.0 # add dither [-feature_dither,feature_dither] on fbank feature
|
||||
spec_aug: true
|
||||
spec_aug_conf:
|
||||
warp_for_time: False
|
||||
num_t_mask: 2
|
||||
num_f_mask: 2
|
||||
max_t: 50
|
||||
max_f: 10
|
||||
max_w: 80
|
||||
|
||||
# dataset related
|
||||
dataset_conf:
|
||||
max_length: 40960
|
||||
min_length: 0
|
||||
batch_type: 'static' # static or dynamic
|
||||
# set batch_size according to your GPU memory size; the value below was used on a 2080 Ti GPU with 11GB of memory
|
||||
batch_size: 16
|
||||
sort: true
|
||||
|
||||
grad_clip: 5
|
||||
accum_grad: 1
|
||||
max_epoch: 180
|
||||
log_interval: 100
|
||||
|
||||
optim: adam
|
||||
optim_conf:
|
||||
lr: 0.001
|
||||
scheduler: warmuplr # pytorch v1.1.0+ required
|
||||
scheduler_conf:
|
||||
warmup_steps: 25000
|
@ -0,0 +1,83 @@
|
||||
# network architecture
|
||||
# encoder related
|
||||
encoder: transformer
|
||||
encoder_conf:
|
||||
output_size: 256 # dimension of attention
|
||||
attention_heads: 4
|
||||
linear_units: 2048 # the number of units of position-wise feed forward
|
||||
num_blocks: 12 # the number of encoder blocks
|
||||
dropout_rate: 0.1
|
||||
positional_dropout_rate: 0.1
|
||||
attention_dropout_rate: 0.0
|
||||
input_layer: conv2d # encoder input type
|
||||
normalize_before: true
|
||||
use_dynamic_chunk: true
|
||||
use_dynamic_left_chunk: false
|
||||
|
||||
# decoder related
|
||||
decoder: transformer
|
||||
decoder_conf:
|
||||
attention_heads: 4
|
||||
linear_units: 2048
|
||||
num_blocks: 6
|
||||
dropout_rate: 0.1
|
||||
positional_dropout_rate: 0.1
|
||||
self_attention_dropout_rate: 0.0
|
||||
src_attention_dropout_rate: 0.0
|
||||
|
||||
# hybrid CTC/attention
|
||||
model_conf:
|
||||
ctc_weight: 0.3
|
||||
lsm_weight: 0.1 # label smoothing option
|
||||
length_normalized_loss: false
|
||||
|
||||
# use raw_wav or kaldi feature
|
||||
raw_wav: true
|
||||
|
||||
# feature extraction
|
||||
collate_conf:
|
||||
# waveform level config
|
||||
wav_distortion_conf:
|
||||
wav_dither: 0.0
|
||||
wav_distortion_rate: 0.0
|
||||
distortion_methods: []
|
||||
speed_perturb: false
|
||||
feature_extraction_conf:
|
||||
feature_type: 'fbank'
|
||||
mel_bins: 80
|
||||
frame_shift: 10
|
||||
frame_length: 25
|
||||
using_pitch: false
|
||||
# spec level config
|
||||
# spec_swap: false
|
||||
feature_dither: 0.0 # add dither [-feature_dither,feature_dither] on fbank feature
|
||||
spec_aug: true
|
||||
spec_aug_conf:
|
||||
warp_for_time: False
|
||||
num_t_mask: 2
|
||||
num_f_mask: 2
|
||||
max_t: 50
|
||||
max_f: 10
|
||||
max_w: 80
|
||||
|
||||
|
||||
# dataset related
|
||||
dataset_conf:
|
||||
max_length: 40960
|
||||
min_length: 0
|
||||
batch_type: 'static' # static or dynamic
|
||||
# set batch_size according to your GPU memory size; the value below was used on a 2080 Ti GPU with 11GB of memory
|
||||
batch_size: 16
|
||||
sort: true
|
||||
|
||||
grad_clip: 5
|
||||
accum_grad: 1
|
||||
max_epoch: 180
|
||||
log_interval: 100
|
||||
|
||||
optim: adam
|
||||
optim_conf:
|
||||
lr: 0.002
|
||||
scheduler: warmuplr # pytorch v1.1.0+ required
|
||||
scheduler_conf:
|
||||
warmup_steps: 25000
|
@ -0,0 +1,86 @@
|
||||
# network architecture
|
||||
# encoder related
|
||||
encoder: conformer
|
||||
encoder_conf:
|
||||
output_size: 256 # dimension of attention
|
||||
attention_heads: 4
|
||||
linear_units: 2048 # the number of units of position-wise feed forward
|
||||
num_blocks: 12 # the number of encoder blocks
|
||||
dropout_rate: 0.1
|
||||
positional_dropout_rate: 0.1
|
||||
attention_dropout_rate: 0.0
|
||||
input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 or conv2d8
|
||||
normalize_before: true
|
||||
cnn_module_kernel: 15
|
||||
use_cnn_module: True
|
||||
activation_type: 'swish'
|
||||
pos_enc_layer_type: 'rel_pos'
|
||||
selfattention_layer_type: 'rel_selfattn'
|
||||
|
||||
# decoder related
|
||||
decoder: transformer
|
||||
decoder_conf:
|
||||
attention_heads: 4
|
||||
linear_units: 2048
|
||||
num_blocks: 6
|
||||
dropout_rate: 0.1
|
||||
positional_dropout_rate: 0.1
|
||||
self_attention_dropout_rate: 0.0
|
||||
src_attention_dropout_rate: 0.0
|
||||
|
||||
# hybrid CTC/attention
|
||||
model_conf:
|
||||
ctc_weight: 0.3
|
||||
lsm_weight: 0.1 # label smoothing option
|
||||
length_normalized_loss: false
|
||||
|
||||
# use raw_wav or kaldi feature
|
||||
raw_wav: true
|
||||
|
||||
# feature extraction
|
||||
collate_conf:
|
||||
# waveform level config
|
||||
wav_distortion_conf:
|
||||
wav_dither: 0.1
|
||||
wav_distortion_rate: 0.0
|
||||
distortion_methods: []
|
||||
speed_perturb: true
|
||||
feature_extraction_conf:
|
||||
feature_type: 'fbank'
|
||||
mel_bins: 80
|
||||
frame_shift: 10
|
||||
frame_length: 25
|
||||
using_pitch: false
|
||||
# spec level config
|
||||
# spec_swap: false
|
||||
feature_dither: 0.0 # add dither [-feature_dither,feature_dither] on fbank feature
|
||||
spec_aug: true
|
||||
spec_aug_conf:
|
||||
warp_for_time: False
|
||||
num_t_mask: 2
|
||||
num_f_mask: 2
|
||||
max_t: 50
|
||||
max_f: 10
|
||||
max_w: 80
|
||||
|
||||
|
||||
# dataset related
|
||||
dataset_conf:
|
||||
max_length: 40960
|
||||
min_length: 0
|
||||
batch_type: 'static' # static or dynamic
|
||||
# set batch_size according to your GPU memory size; the value below was used on a 2080 Ti GPU with 11GB of memory
|
||||
batch_size: 16
|
||||
sort: true
|
||||
|
||||
grad_clip: 5
|
||||
accum_grad: 4
|
||||
max_epoch: 240
|
||||
log_interval: 100
|
||||
|
||||
optim: adam
|
||||
optim_conf:
|
||||
lr: 0.002
|
||||
scheduler: warmuplr # pytorch v1.1.0+ required
|
||||
scheduler_conf:
|
||||
warmup_steps: 25000
|
@ -0,0 +1,80 @@
|
||||
# network architecture
|
||||
# encoder related
|
||||
encoder: transformer
|
||||
encoder_conf:
|
||||
output_size: 256 # dimension of attention
|
||||
attention_heads: 4
|
||||
linear_units: 2048 # the number of units of position-wise feed forward
|
||||
num_blocks: 12 # the number of encoder blocks
|
||||
dropout_rate: 0.1
|
||||
positional_dropout_rate: 0.1
|
||||
attention_dropout_rate: 0.0
|
||||
input_layer: conv2d # encoder input type
|
||||
normalize_before: true
|
||||
|
||||
# decoder related
|
||||
decoder: transformer
|
||||
decoder_conf:
|
||||
attention_heads: 4
|
||||
linear_units: 2048
|
||||
num_blocks: 6
|
||||
dropout_rate: 0.1
|
||||
positional_dropout_rate: 0.1
|
||||
self_attention_dropout_rate: 0.0
|
||||
src_attention_dropout_rate: 0.0
|
||||
|
||||
# hybrid CTC/attention
|
||||
model_conf:
|
||||
ctc_weight: 0.3
|
||||
lsm_weight: 0.1 # label smoothing option
|
||||
length_normalized_loss: false
|
||||
|
||||
# use raw_wav or kaldi feature
|
||||
raw_wav: true
|
||||
|
||||
# feature extraction
|
||||
collate_conf:
|
||||
# waveform level config
|
||||
wav_distortion_conf:
|
||||
wav_dither: 0.1
|
||||
wav_distortion_rate: 0.0
|
||||
distortion_methods: []
|
||||
speed_perturb: true
|
||||
feature_extraction_conf:
|
||||
feature_type: 'fbank'
|
||||
mel_bins: 80
|
||||
frame_shift: 10
|
||||
frame_length: 25
|
||||
using_pitch: false
|
||||
# spec level config
|
||||
feature_dither: 0.0 # add dither [-feature_dither,feature_dither] on fbank feature
|
||||
spec_aug: true
|
||||
spec_aug_conf:
|
||||
warp_for_time: False
|
||||
num_t_mask: 2
|
||||
num_f_mask: 2
|
||||
max_t: 50
|
||||
max_f: 10
|
||||
max_w: 80
|
||||
|
||||
|
||||
# dataset related
|
||||
dataset_conf:
|
||||
max_length: 40960
|
||||
min_length: 0
|
||||
batch_type: 'static' # static or dynamic
|
||||
# set batch_size according to your GPU memory size; the value below was used on a 2080 Ti GPU with 11GB of memory
|
||||
batch_size: 26
|
||||
sort: true
|
||||
|
||||
grad_clip: 5
|
||||
accum_grad: 1
|
||||
max_epoch: 240
|
||||
log_interval: 100
|
||||
|
||||
optim: adam
|
||||
optim_conf:
|
||||
lr: 0.002
|
||||
scheduler: warmuplr # pytorch v1.1.0+ required
|
||||
scheduler_conf:
|
||||
warmup_steps: 25000
|
@ -0,0 +1 @@
|
||||
../../s0/local/data.sh
|
@ -0,0 +1 @@
|
||||
../../s0/local/download_lm_en.sh
|
@ -0,0 +1,14 @@
|
||||
# Environment setup for this example. Intended to be sourced rather than
# executed, so it must not call `exit` or change shell options.

# Three levels above the current working directory (presumably the
# repository root - confirm against the directory layout).
export MAIN_ROOT="${PWD}/../../../"

export PATH="${MAIN_ROOT}:${PWD}/tools:${PATH}"
export LC_ALL=C

# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
# Append the previous value only when it is non-empty, so that an unset
# PYTHONPATH does not leave a trailing ':' (an empty path entry).
export PYTHONPATH="${MAIN_ROOT}${PYTHONPATH:+:${PYTHONPATH}}"

# Same guard here: an empty entry in LD_LIBRARY_PATH makes the dynamic
# loader search the current working directory.
export LD_LIBRARY_PATH="${LD_LIBRARY_PATH:+${LD_LIBRARY_PATH}:}/usr/local/lib/"

# MODEL is only used to build BIN_DIR and is deliberately not exported.
MODEL=u2
export BIN_DIR="${MAIN_ROOT}/deepspeech/exps/${MODEL}/bin"
|
@ -0,0 +1,16 @@
|
||||
#!/bin/bash
# Run the whole recipe in order: data preparation, training, testing and
# inference. With `set -e` the script stops at the first stage that fails.
set -e

source path.sh

# Each stage is implemented by ./local/<stage>.sh:
#   data  - prepare data
#   train - train model
#   test  - test model
#   infer - infer model
for stage in data train test infer; do
  bash "./local/${stage}.sh"
done
|
Loading…
Reference in new issue