parent 9626e99ce4
commit 2fa6bbbed5
@@ -1,21 +0,0 @@
#! /usr/bin/env bash

. ${MAIN_ROOT}/utils/utility.sh

DIR=data/pretrain
mkdir -p ${DIR}

URL='https://deepspeech.bj.bcebos.com/eng_models/librispeech_model_fluid.tar.gz'
MD5=fafb11fe57c3ecd107147056453f5348
TARGET=${DIR}/librispeech_model_fluid.tar.gz


echo "Download LibriSpeech model ..."
download $URL $MD5 $TARGET
if [ $? -ne 0 ]; then
    echo "Failed to download LibriSpeech model!"
    exit 1
fi
tar -zxvf $TARGET -C ${DIR}

exit 0
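
The deleted script relies on a `download` helper sourced from `${MAIN_ROOT}/utils/utility.sh`, which this diff does not show. As a rough sketch of the contract it is called under (fetch a URL to a target path, verify an MD5 checksum, return nonzero on failure) — illustrative Python stand-in only, since the real helper is a shell function:

# Sketch of the download-and-verify contract; illustrative only.
# The real `download` helper lives in utils/utility.sh and may differ.
import hashlib
import sys
import urllib.request

def download(url: str, md5: str, target: str) -> int:
    try:
        urllib.request.urlretrieve(url, target)  # fetch the archive
    except OSError:
        return 1
    digest = hashlib.md5()
    with open(target, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            digest.update(chunk)
    # A checksum mismatch indicates a truncated or corrupted download.
    return 0 if digest.hexdigest() == md5 else 1

if __name__ == "__main__":
    sys.exit(download(sys.argv[1], sys.argv[2], sys.argv[3]))
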
@@ -0,0 +1,90 @@
# network architecture
# encoder related
encoder: conformer
encoder_conf:
    output_size: 256    # dimension of attention
    attention_heads: 4
    linear_units: 2048  # the number of units of position-wise feed forward
    num_blocks: 12      # the number of encoder blocks
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    attention_dropout_rate: 0.0
    input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 or conv2d8
    normalize_before: true
    cnn_module_kernel: 15
    use_cnn_module: True
    activation_type: 'swish'
    pos_enc_layer_type: 'rel_pos'
    selfattention_layer_type: 'rel_selfattn'
    causal: true
    use_dynamic_chunk: true
    cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes the model converge faster
    use_dynamic_left_chunk: false

# decoder related
decoder: transformer
decoder_conf:
    attention_heads: 4
    linear_units: 2048
    num_blocks: 6
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    self_attention_dropout_rate: 0.0
    src_attention_dropout_rate: 0.0

# hybrid CTC/attention
model_conf:
    ctc_weight: 0.3
    lsm_weight: 0.1     # label smoothing option
    length_normalized_loss: false


# use raw_wav or kaldi feature
raw_wav: true

# feature extraction
collate_conf:
    # waveform level config
    wav_distortion_conf:
        wav_dither: 1.0
        wav_distortion_rate: 0.0
        distortion_methods: []
    speed_perturb: true
    feature_extraction_conf:
        feature_type: 'fbank'
        mel_bins: 80
        frame_shift: 10
        frame_length: 25
        using_pitch: false
    # spec level config
    # spec_swap: false
    feature_dither: 0.0 # add dither in [-feature_dither, feature_dither] to the fbank feature
    spec_aug: true
    spec_aug_conf:
        warp_for_time: False
        num_t_mask: 2
        num_f_mask: 2
        max_t: 50
        max_f: 10
        max_w: 80

# dataset related
dataset_conf:
    max_length: 40960
    min_length: 0
    batch_type: 'static' # static or dynamic
    # set batch_size according to your GPU memory; here we used a 2080 Ti GPU with 11 GB
    batch_size: 16
    sort: true

grad_clip: 5
accum_grad: 1
max_epoch: 180
log_interval: 100

optim: adam
optim_conf:
    lr: 0.001
scheduler: warmuplr     # pytorch v1.1.0+ required
scheduler_conf:
    warmup_steps: 25000
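
The `warmuplr` scheduler at the end of this config raises the learning rate linearly for `warmup_steps` steps and then decays it with the inverse square root of the step count, peaking at `lr` exactly at step `warmup_steps`. A minimal sketch, assuming the standard transformer warmup formula (the training code's actual scheduler may differ in detail):

# Sketch of a warmup learning-rate schedule (standard transformer form).
def warmup_lr(base_lr: float, step: int, warmup_steps: int = 25000) -> float:
    step = max(step, 1)  # guard against division by zero at step 0
    return base_lr * warmup_steps ** 0.5 * min(step ** -0.5,
                                               step * warmup_steps ** -1.5)

# Peaks at base_lr (here lr: 0.001) when step == warmup_steps, then decays as step**-0.5.
assert abs(warmup_lr(0.001, 25000) - 0.001) < 1e-12
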
@@ -0,0 +1,83 @@
# network architecture
# encoder related
encoder: transformer
encoder_conf:
    output_size: 256    # dimension of attention
    attention_heads: 4
    linear_units: 2048  # the number of units of position-wise feed forward
    num_blocks: 12      # the number of encoder blocks
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    attention_dropout_rate: 0.0
    input_layer: conv2d # encoder architecture type
    normalize_before: true
    use_dynamic_chunk: true
    use_dynamic_left_chunk: false

# decoder related
decoder: transformer
decoder_conf:
    attention_heads: 4
    linear_units: 2048
    num_blocks: 6
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    self_attention_dropout_rate: 0.0
    src_attention_dropout_rate: 0.0

# hybrid CTC/attention
model_conf:
    ctc_weight: 0.3
    lsm_weight: 0.1     # label smoothing option
    length_normalized_loss: false

# use raw_wav or kaldi feature
raw_wav: true

# feature extraction
collate_conf:
    # waveform level config
    wav_distortion_conf:
        wav_dither: 0.0
        wav_distortion_rate: 0.0
        distortion_methods: []
    speed_perturb: false
    feature_extraction_conf:
        feature_type: 'fbank'
        mel_bins: 80
        frame_shift: 10
        frame_length: 25
        using_pitch: false
    # spec level config
    # spec_swap: false
    feature_dither: 0.0 # add dither in [-feature_dither, feature_dither] to the fbank feature
    spec_aug: true
    spec_aug_conf:
        warp_for_time: False
        num_t_mask: 2
        num_f_mask: 2
        max_t: 50
        max_f: 10
        max_w: 80

# dataset related
dataset_conf:
    max_length: 40960
    min_length: 0
    batch_type: 'static' # static or dynamic
    # set batch_size according to your GPU memory; here we used a 2080 Ti GPU with 11 GB
    batch_size: 16
    sort: true

grad_clip: 5
accum_grad: 1
max_epoch: 180
log_interval: 100

optim: adam
optim_conf:
    lr: 0.002
scheduler: warmuplr     # pytorch v1.1.0+ required
scheduler_conf:
    warmup_steps: 25000
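
`model_conf` here configures the hybrid CTC/attention objective: `ctc_weight: 0.3` interpolates a CTC loss on the encoder output with the attention decoder's label-smoothed cross entropy (`lsm_weight: 0.1`). A minimal sketch of the interpolation, with illustrative names rather than the project's actual code:

# Sketch of the hybrid CTC/attention interpolation; loss_ctc and loss_att
# stand for per-batch losses computed by the two branches of the model.
def hybrid_loss(loss_ctc: float, loss_att: float,
                ctc_weight: float = 0.3) -> float:
    # ctc_weight = 1.0 would train pure CTC; 0.0 would train pure attention.
    return ctc_weight * loss_ctc + (1.0 - ctc_weight) * loss_att
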
@@ -0,0 +1,86 @@
# network architecture
# encoder related
encoder: conformer
encoder_conf:
    output_size: 256    # dimension of attention
    attention_heads: 4
    linear_units: 2048  # the number of units of position-wise feed forward
    num_blocks: 12      # the number of encoder blocks
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    attention_dropout_rate: 0.0
    input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 or conv2d8
    normalize_before: true
    cnn_module_kernel: 15
    use_cnn_module: True
    activation_type: 'swish'
    pos_enc_layer_type: 'rel_pos'
    selfattention_layer_type: 'rel_selfattn'

# decoder related
decoder: transformer
decoder_conf:
    attention_heads: 4
    linear_units: 2048
    num_blocks: 6
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    self_attention_dropout_rate: 0.0
    src_attention_dropout_rate: 0.0

# hybrid CTC/attention
model_conf:
    ctc_weight: 0.3
    lsm_weight: 0.1     # label smoothing option
    length_normalized_loss: false

# use raw_wav or kaldi feature
raw_wav: true

# feature extraction
collate_conf:
    # waveform level config
    wav_distortion_conf:
        wav_dither: 0.1
        wav_distortion_rate: 0.0
        distortion_methods: []
    speed_perturb: true
    feature_extraction_conf:
        feature_type: 'fbank'
        mel_bins: 80
        frame_shift: 10
        frame_length: 25
        using_pitch: false
    # spec level config
    # spec_swap: false
    feature_dither: 0.0 # add dither in [-feature_dither, feature_dither] to the fbank feature
    spec_aug: true
    spec_aug_conf:
        warp_for_time: False
        num_t_mask: 2
        num_f_mask: 2
        max_t: 50
        max_f: 10
        max_w: 80

# dataset related
dataset_conf:
    max_length: 40960
    min_length: 0
    batch_type: 'static' # static or dynamic
    # set batch_size according to your GPU memory; here we used a 2080 Ti GPU with 11 GB
    batch_size: 16
    sort: true

grad_clip: 5
accum_grad: 4
max_epoch: 240
log_interval: 100

optim: adam
optim_conf:
    lr: 0.002
scheduler: warmuplr     # pytorch v1.1.0+ required
scheduler_conf:
    warmup_steps: 25000
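
`spec_aug_conf` applies SpecAugment-style masking to the fbank features: up to `num_t_mask` random time masks of at most `max_t` frames and `num_f_mask` frequency masks of at most `max_f` mel bins (`max_w` bounds time warping when `warp_for_time` is enabled). A minimal sketch of the two masking steps on a NumPy array, with illustrative names (the collate code's real implementation may differ):

import numpy as np

# Sketch of SpecAugment time/frequency masking; feat is (frames, mel_bins),
# modified in place by zeroing random spans.
def spec_aug(feat: np.ndarray, num_t_mask: int = 2, num_f_mask: int = 2,
             max_t: int = 50, max_f: int = 10) -> np.ndarray:
    rng = np.random.default_rng()
    frames, bins = feat.shape
    for _ in range(num_t_mask):            # zero random spans of time frames
        start = int(rng.integers(0, frames))
        feat[start:start + int(rng.integers(1, max_t + 1)), :] = 0.0
    for _ in range(num_f_mask):            # zero random bands of mel bins
        start = int(rng.integers(0, bins))
        feat[:, start:start + int(rng.integers(1, max_f + 1))] = 0.0
    return feat
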
@@ -0,0 +1,80 @@
# network architecture
# encoder related
encoder: transformer
encoder_conf:
    output_size: 256    # dimension of attention
    attention_heads: 4
    linear_units: 2048  # the number of units of position-wise feed forward
    num_blocks: 12      # the number of encoder blocks
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    attention_dropout_rate: 0.0
    input_layer: conv2d # encoder architecture type
    normalize_before: true

# decoder related
decoder: transformer
decoder_conf:
    attention_heads: 4
    linear_units: 2048
    num_blocks: 6
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    self_attention_dropout_rate: 0.0
    src_attention_dropout_rate: 0.0

# hybrid CTC/attention
model_conf:
    ctc_weight: 0.3
    lsm_weight: 0.1     # label smoothing option
    length_normalized_loss: false

# use raw_wav or kaldi feature
raw_wav: true

# feature extraction
collate_conf:
    # waveform level config
    wav_distortion_conf:
        wav_dither: 0.1
        wav_distortion_rate: 0.0
        distortion_methods: []
    speed_perturb: true
    feature_extraction_conf:
        feature_type: 'fbank'
        mel_bins: 80
        frame_shift: 10
        frame_length: 25
        using_pitch: false
    # spec level config
    feature_dither: 0.0 # add dither in [-feature_dither, feature_dither] to the fbank feature
    spec_aug: true
    spec_aug_conf:
        warp_for_time: False
        num_t_mask: 2
        num_f_mask: 2
        max_t: 50
        max_f: 10
        max_w: 80

# dataset related
dataset_conf:
    max_length: 40960
    min_length: 0
    batch_type: 'static' # static or dynamic
    # set batch_size according to your GPU memory; here we used a 2080 Ti GPU with 11 GB
    batch_size: 26
    sort: true

grad_clip: 5
accum_grad: 1
max_epoch: 240
log_interval: 100

optim: adam
optim_conf:
    lr: 0.002
scheduler: warmuplr     # pytorch v1.1.0+ required
scheduler_conf:
    warmup_steps: 25000
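
Across these four configs the effective batch size is `batch_size` times `accum_grad` (times the number of GPUs): the conformer config above pairs `batch_size: 16` with `accum_grad: 4`, while this one uses `batch_size: 26` with `accum_grad: 1`. A quick way to inspect those values, assuming a PyYAML loader and a hypothetical file name:

import yaml

# Hypothetical path; substitute one of the config files added in this commit.
with open("conf/transformer.yaml") as f:
    conf = yaml.safe_load(f)

per_step = conf["dataset_conf"]["batch_size"]
effective = per_step * conf["accum_grad"]
print(f"per-step batch: {per_step}, effective batch: {effective}")
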
@@ -0,0 +1 @@
../../s0/local/data.sh
@@ -0,0 +1 @@
../../s0/local/download_lm_en.sh
@@ -0,0 +1,14 @@
export MAIN_ROOT=${PWD}/../../../

export PATH=${MAIN_ROOT}:${PWD}/tools:${PATH}
export LC_ALL=C

# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}

export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib/


MODEL=u2
export BIN_DIR=${MAIN_ROOT}/deepspeech/exps/${MODEL}/bin
@@ -0,0 +1,16 @@
#!/bin/bash
set -e

source path.sh

# prepare data
bash ./local/data.sh

# train model
bash ./local/train.sh

# test model
bash ./local/test.sh

# infer model
bash ./local/infer.sh