diff --git a/examples/tiny/s0/local/download_model.sh b/examples/tiny/s0/local/download_model.sh
deleted file mode 100644
index f13bde0f2..000000000
--- a/examples/tiny/s0/local/download_model.sh
+++ /dev/null
@@ -1,21 +0,0 @@
-#! /usr/bin/env bash
-
-. ${MAIN_ROOT}/utils/utility.sh
-
-DIR=data/pretrain
-mkdir -p ${DIR}
-
-URL='https://deepspeech.bj.bcebos.com/eng_models/librispeech_model_fluid.tar.gz'
-MD5=fafb11fe57c3ecd107147056453f5348
-TARGET=${DIR}/librispeech_model_fluid.tar.gz
-
-
-echo "Download LibriSpeech model ..."
-download $URL $MD5 $TARGET
-if [ $? -ne 0 ]; then
-    echo "Fail to download LibriSpeech model!"
-    exit 1
-fi
-tar -zxvf $TARGET -C ${DIR}
-
-exit 0
diff --git a/examples/tiny/s1/conf/chunk_confermer.yaml b/examples/tiny/s1/conf/chunk_confermer.yaml
new file mode 100644
index 000000000..60d0205bf
--- /dev/null
+++ b/examples/tiny/s1/conf/chunk_confermer.yaml
@@ -0,0 +1,90 @@
+# network architecture
+# encoder related
+encoder: conformer
+encoder_conf:
+    output_size: 256    # dimension of attention
+    attention_heads: 4
+    linear_units: 2048  # the number of units of position-wise feed forward
+    num_blocks: 12      # the number of encoder blocks
+    dropout_rate: 0.1
+    positional_dropout_rate: 0.1
+    attention_dropout_rate: 0.0
+    input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8
+    normalize_before: true
+    cnn_module_kernel: 15
+    use_cnn_module: True
+    activation_type: 'swish'
+    pos_enc_layer_type: 'rel_pos'
+    selfattention_layer_type: 'rel_selfattn'
+    causal: true
+    use_dynamic_chunk: true
+    cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster
+    use_dynamic_left_chunk: false
+
+# decoder related
+decoder: transformer
+decoder_conf:
+    attention_heads: 4
+    linear_units: 2048
+    num_blocks: 6
+    dropout_rate: 0.1
+    positional_dropout_rate: 0.1
+    self_attention_dropout_rate: 0.0
+    src_attention_dropout_rate: 0.0
+
+# hybrid CTC/attention
+model_conf:
+    ctc_weight: 0.3
+    lsm_weight: 0.1     # label smoothing option
+    length_normalized_loss: false
+
+
+# use raw_wav or kaldi feature
+raw_wav: true
+
+# feature extraction
+collate_conf:
+    # waveform level config
+    wav_distortion_conf:
+        wav_dither: 1.0
+        wav_distortion_rate: 0.0
+        distortion_methods: []
+    speed_perturb: true
+    feature_extraction_conf:
+        feature_type: 'fbank'
+        mel_bins: 80
+        frame_shift: 10
+        frame_length: 25
+        using_pitch: false
+    # spec level config
+    # spec_swap: false
+    feature_dither: 0.0 # add dither [-feature_dither,feature_dither] on fbank feature
+    spec_aug: true
+    spec_aug_conf:
+        warp_for_time: False
+        num_t_mask: 2
+        num_f_mask: 2
+        max_t: 50
+        max_f: 10
+        max_w: 80
+
+# dataset related
+dataset_conf:
+    max_length: 40960
+    min_length: 0
+    batch_type: 'static' # static or dynamic
+    # the size of batch_size should be set according to your gpu memory size, here we used 2080ti gpu whose memory size is 11GB
+    batch_size: 16
+    sort: true
+
+grad_clip: 5
+accum_grad: 1
+max_epoch: 180
+log_interval: 100
+
+optim: adam
+optim_conf:
+    lr: 0.001
+scheduler: warmuplr     # pytorch v1.1.0+ required
+scheduler_conf:
+    warmup_steps: 25000
\ No newline at end of file
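The config above is plain YAML, so it can be inspected outside of the training scripts with the same `CfgNode.load_cfg` pattern that the updated `tests/u2_model_test.py` later in this diff uses. A minimal sketch, assuming it is run from the repository root; the path and the printed keys are illustrative only and not part of this change:

```python
# Illustrative sketch, not part of this change: load one of the new YAML
# configs with yacs, mirroring the pattern in the updated u2 model test.
from yacs.config import CfgNode as CN

config_path = 'examples/tiny/s1/conf/chunk_confermer.yaml'  # path from this diff

with open(config_path) as f:
    cfg = CN().load_cfg(f.read())   # parse the YAML into a CfgNode
cfg.freeze()                        # make the config read-only

# Nested keys become attribute accesses, e.g. the streaming-related flags:
print(cfg.encoder)                         # conformer
print(cfg.encoder_conf.use_dynamic_chunk)  # True -> dynamic chunk training
print(cfg.model_conf.ctc_weight)           # 0.3
```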
diff --git a/examples/tiny/s1/conf/chunk_transformer.yaml b/examples/tiny/s1/conf/chunk_transformer.yaml
new file mode 100644
index 000000000..8de073811
--- /dev/null
+++ b/examples/tiny/s1/conf/chunk_transformer.yaml
@@ -0,0 +1,83 @@
+# network architecture
+# encoder related
+encoder: transformer
+encoder_conf:
+    output_size: 256    # dimension of attention
+    attention_heads: 4
+    linear_units: 2048  # the number of units of position-wise feed forward
+    num_blocks: 12      # the number of encoder blocks
+    dropout_rate: 0.1
+    positional_dropout_rate: 0.1
+    attention_dropout_rate: 0.0
+    input_layer: conv2d # encoder architecture type
+    normalize_before: true
+    use_dynamic_chunk: true
+    use_dynamic_left_chunk: false
+
+# decoder related
+decoder: transformer
+decoder_conf:
+    attention_heads: 4
+    linear_units: 2048
+    num_blocks: 6
+    dropout_rate: 0.1
+    positional_dropout_rate: 0.1
+    self_attention_dropout_rate: 0.0
+    src_attention_dropout_rate: 0.0
+
+# hybrid CTC/attention
+model_conf:
+    ctc_weight: 0.3
+    lsm_weight: 0.1     # label smoothing option
+    length_normalized_loss: false
+
+# use raw_wav or kaldi feature
+raw_wav: true
+
+# feature extraction
+collate_conf:
+    # waveform level config
+    wav_distortion_conf:
+        wav_dither: 0.0
+        wav_distortion_rate: 0.0
+        distortion_methods: []
+    speed_perturb: false
+    feature_extraction_conf:
+        feature_type: 'fbank'
+        mel_bins: 80
+        frame_shift: 10
+        frame_length: 25
+        using_pitch: false
+    # spec level config
+    # spec_swap: false
+    feature_dither: 0.0 # add dither [-feature_dither,feature_dither] on fbank feature
+    spec_aug: true
+    spec_aug_conf:
+        warp_for_time: False
+        num_t_mask: 2
+        num_f_mask: 2
+        max_t: 50
+        max_f: 10
+        max_w: 80
+
+
+# dataset related
+dataset_conf:
+    max_length: 40960
+    min_length: 0
+    batch_type: 'static' # static or dynamic
+    # the size of batch_size should be set according to your gpu memory size, here we used 2080ti gpu whose memory size is 11GB
+    batch_size: 16
+    sort: true
+
+grad_clip: 5
+accum_grad: 1
+max_epoch: 180
+log_interval: 100
+
+optim: adam
+optim_conf:
+    lr: 0.002
+scheduler: warmuplr     # pytorch v1.1.0+ required
+scheduler_conf:
+    warmup_steps: 25000
\ No newline at end of file
diff --git a/examples/tiny/s1/conf/conformer.yaml b/examples/tiny/s1/conf/conformer.yaml
new file mode 100644
index 000000000..aac997938
--- /dev/null
+++ b/examples/tiny/s1/conf/conformer.yaml
@@ -0,0 +1,86 @@
+# network architecture
+# encoder related
+encoder: conformer
+encoder_conf:
+    output_size: 256    # dimension of attention
+    attention_heads: 4
+    linear_units: 2048  # the number of units of position-wise feed forward
+    num_blocks: 12      # the number of encoder blocks
+    dropout_rate: 0.1
+    positional_dropout_rate: 0.1
+    attention_dropout_rate: 0.0
+    input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8
+    normalize_before: true
+    cnn_module_kernel: 15
+    use_cnn_module: True
+    activation_type: 'swish'
+    pos_enc_layer_type: 'rel_pos'
+    selfattention_layer_type: 'rel_selfattn'
+
+# decoder related
+decoder: transformer
+decoder_conf:
+    attention_heads: 4
+    linear_units: 2048
+    num_blocks: 6
+    dropout_rate: 0.1
+    positional_dropout_rate: 0.1
+    self_attention_dropout_rate: 0.0
+    src_attention_dropout_rate: 0.0
+
+# hybrid CTC/attention
+model_conf:
+    ctc_weight: 0.3
+    lsm_weight: 0.1     # label smoothing option
+    length_normalized_loss: false
+
+# use raw_wav or kaldi feature
+raw_wav: true
+
+# feature extraction
+collate_conf:
+    # waveform level config
+    wav_distortion_conf:
+        wav_dither: 0.1
+        wav_distortion_rate: 0.0
+        distortion_methods: []
+    speed_perturb: true
+    feature_extraction_conf:
+        feature_type: 'fbank'
+        mel_bins: 80
+        frame_shift: 10
+        frame_length: 25
+        using_pitch: false
+    # spec level config
+    # spec_swap: false
+    feature_dither: 0.0 # add dither [-feature_dither,feature_dither] on fbank feature
+    spec_aug: true
+    spec_aug_conf:
+        warp_for_time: False
+        num_t_mask: 2
+        num_f_mask: 2
+        max_t: 50
+        max_f: 10
+        max_w: 80
+
+
+# dataset related
+dataset_conf:
+    max_length: 40960
+    min_length: 0
+    batch_type: 'static' # static or dynamic
+    # the size of batch_size should be set according to your gpu memory size, here we used 2080ti gpu whose memory size is 11GB
+    batch_size: 16
+    sort: true
+
+grad_clip: 5
+accum_grad: 4
+max_epoch: 240
+log_interval: 100
+
+optim: adam
+optim_conf:
+    lr: 0.002
+scheduler: warmuplr     # pytorch v1.1.0+ required
+scheduler_conf:
+    warmup_steps: 25000
\ No newline at end of file
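The configs above all enable `spec_aug` with the same `spec_aug_conf` values. The sketch below only illustrates what those parameters conventionally describe (random time and frequency masks applied to the fbank features), assuming standard SpecAugment-style semantics; it is not the augmentation code shipped with this change:

```python
# Illustrative sketch of SpecAugment-style masking implied by spec_aug_conf.
# Assumed semantics: num_*_mask masks, each with a random width up to
# max_t frames / max_f mel bins. Not the repository's implementation.
import numpy as np

def spec_augment(feat, num_t_mask=2, num_f_mask=2, max_t=50, max_f=10):
    """feat: (num_frames, mel_bins) fbank features; returns a masked copy."""
    feat = feat.copy()
    num_frames, num_bins = feat.shape
    for _ in range(num_t_mask):              # mask random spans of frames
        start = np.random.randint(0, num_frames)
        width = np.random.randint(1, max_t + 1)
        feat[start:start + width, :] = 0.0
    for _ in range(num_f_mask):              # mask random bands of mel bins
        start = np.random.randint(0, num_bins)
        width = np.random.randint(1, max_f + 1)
        feat[:, start:start + width] = 0.0
    return feat

fbank = np.random.randn(300, 80).astype('float32')  # ~3 s of 80-dim fbank
augmented = spec_augment(fbank)                      # values from spec_aug_conf
```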
diff --git a/examples/tiny/s1/conf/transformer.yaml b/examples/tiny/s1/conf/transformer.yaml
new file mode 100644
index 000000000..219389da0
--- /dev/null
+++ b/examples/tiny/s1/conf/transformer.yaml
@@ -0,0 +1,80 @@
+# network architecture
+# encoder related
+encoder: transformer
+encoder_conf:
+    output_size: 256    # dimension of attention
+    attention_heads: 4
+    linear_units: 2048  # the number of units of position-wise feed forward
+    num_blocks: 12      # the number of encoder blocks
+    dropout_rate: 0.1
+    positional_dropout_rate: 0.1
+    attention_dropout_rate: 0.0
+    input_layer: conv2d # encoder architecture type
+    normalize_before: true
+
+# decoder related
+decoder: transformer
+decoder_conf:
+    attention_heads: 4
+    linear_units: 2048
+    num_blocks: 6
+    dropout_rate: 0.1
+    positional_dropout_rate: 0.1
+    self_attention_dropout_rate: 0.0
+    src_attention_dropout_rate: 0.0
+
+# hybrid CTC/attention
+model_conf:
+    ctc_weight: 0.3
+    lsm_weight: 0.1     # label smoothing option
+    length_normalized_loss: false
+
+# use raw_wav or kaldi feature
+raw_wav: true
+
+# feature extraction
+collate_conf:
+    # waveform level config
+    wav_distortion_conf:
+        wav_dither: 0.1
+        wav_distortion_rate: 0.0
+        distortion_methods: []
+    speed_perturb: true
+    feature_extraction_conf:
+        feature_type: 'fbank'
+        mel_bins: 80
+        frame_shift: 10
+        frame_length: 25
+        using_pitch: false
+    # spec level config
+    feature_dither: 0.0 # add dither [-feature_dither,feature_dither] on fbank feature
+    spec_aug: true
+    spec_aug_conf:
+        warp_for_time: False
+        num_t_mask: 2
+        num_f_mask: 2
+        max_t: 50
+        max_f: 10
+        max_w: 80
+
+
+# dataset related
+dataset_conf:
+    max_length: 40960
+    min_length: 0
+    batch_type: 'static' # static or dynamic
+    # the size of batch_size should be set according to your gpu memory size, here we used 2080ti gpu whose memory size is 11GB
+    batch_size: 26
+    sort: true
+
+grad_clip: 5
+accum_grad: 1
+max_epoch: 240
+log_interval: 100
+
+optim: adam
+optim_conf:
+    lr: 0.002
+scheduler: warmuplr     # pytorch v1.1.0+ required
+scheduler_conf:
+    warmup_steps: 25000
\ No newline at end of file
diff --git a/examples/tiny/s1/local/data.sh b/examples/tiny/s1/local/data.sh
new file mode 120000
index 000000000..7c4cf4564
--- /dev/null
+++ b/examples/tiny/s1/local/data.sh
@@ -0,0 +1 @@
+../../s0/local/data.sh
\ No newline at end of file
diff --git a/examples/tiny/s1/local/download_lm_en.sh b/examples/tiny/s1/local/download_lm_en.sh
new file mode 120000
index 000000000..831f3c31c
--- /dev/null
+++ b/examples/tiny/s1/local/download_lm_en.sh
@@ -0,0 +1 @@
+../../s0/local/download_lm_en.sh
\ No newline at end of file
diff --git a/examples/tiny/s1/path.sh b/examples/tiny/s1/path.sh
new file mode 100644
index 000000000..14e4fc3ec
--- /dev/null
+++ b/examples/tiny/s1/path.sh
@@ -0,0 +1,14 @@
+export MAIN_ROOT=${PWD}/../../../
+
+export PATH=${MAIN_ROOT}:${PWD}/tools:${PATH}
+export LC_ALL=C
+
+# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
+export PYTHONIOENCODING=UTF-8
+export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
+
+export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib/
+
+
+MODEL=u2
+export BIN_DIR=${MAIN_ROOT}/deepspeech/exps/${MODEL}/bin
diff --git a/examples/tiny/s1/run.sh b/examples/tiny/s1/run.sh
new file mode 100644
index 000000000..2b5ed5308
--- /dev/null
+++ b/examples/tiny/s1/run.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+set -e
+
+source path.sh
+
+# prepare data
+bash ./local/data.sh
+
+# train model
+bash ./local/train.sh
+
+# test model
+bash ./local/test.sh
+
+# infer model
+bash ./local/infer.sh
diff --git a/tests/u2_model_test.py b/tests/u2_model_test.py
index a86210750..c8c547dd5 100644
--- a/tests/u2_model_test.py
+++ b/tests/u2_model_test.py
@@ -13,8 +13,11 @@
 # limitations under the License.
 import paddle
-import numpy as np
+
 import unittest
+import numpy as np
+from yacs.config import CfgNode as CN
+
 from deepspeech.models.u2 import U2TransformerModel
 from deepspeech.models.u2 import U2ConformerModel
 
@@ -41,9 +44,82 @@ class TestU2Model(unittest.TestCase):
         self.text_len = paddle.to_tensor(text_len, dtype='int64')
 
     def test_transformer(self):
+        conf_str = """
+        # network architecture
+        # encoder related
+        encoder: transformer
+        encoder_conf:
+            output_size: 256    # dimension of attention
+            attention_heads: 4
+            linear_units: 2048  # the number of units of position-wise feed forward
+            num_blocks: 12      # the number of encoder blocks
+            dropout_rate: 0.1
+            positional_dropout_rate: 0.1
+            attention_dropout_rate: 0.0
+            input_layer: conv2d # encoder architecture type
+            normalize_before: true
+
+        # decoder related
+        decoder: transformer
+        decoder_conf:
+            attention_heads: 4
+            linear_units: 2048
+            num_blocks: 6
+            dropout_rate: 0.1
+            positional_dropout_rate: 0.1
+            self_attention_dropout_rate: 0.0
+            src_attention_dropout_rate: 0.0
+
+        # hybrid CTC/attention
+        model_conf:
+            ctc_weight: 0.3
+            lsm_weight: 0.1     # label smoothing option
+            length_normalized_loss: false
+        """
+        cfg = CN().load_cfg(conf_str)
+        print(cfg)
         model = U2TransformerModel()
 
     def test_conformer(self):
+        conf_str = """
+        # network architecture
+        # encoder related
+        encoder: conformer
+        encoder_conf:
+            output_size: 256    # dimension of attention
+            attention_heads: 4
+            linear_units: 2048  # the number of units of position-wise feed forward
+            num_blocks: 12      # the number of encoder blocks
+            dropout_rate: 0.1
+            positional_dropout_rate: 0.1
+            attention_dropout_rate: 0.0
+            input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8
+            normalize_before: true
+            cnn_module_kernel: 15
+            use_cnn_module: True
+            activation_type: 'swish'
+            pos_enc_layer_type: 'rel_pos'
+            selfattention_layer_type: 'rel_selfattn'
+
+        # decoder related
+        decoder: transformer
+        decoder_conf:
+            attention_heads: 4
+            linear_units: 2048
+            num_blocks: 6
+            dropout_rate: 0.1
+            positional_dropout_rate: 0.1
+            self_attention_dropout_rate: 0.0
+            src_attention_dropout_rate: 0.0
+
+        # hybrid CTC/attention
+        model_conf:
+            ctc_weight: 0.3
+            lsm_weight: 0.1     # label smoothing option
+            length_normalized_loss: false
+        """
+        cfg = CN().load_cfg(conf_str)
+        print(cfg)
         model = U2ConformerModel()
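Every config in this change selects `scheduler: warmuplr` with `warmup_steps: 25000`, but the scheduler implementation itself is not part of the diff. A minimal sketch of the inverse-square-root warmup schedule that such configs usually refer to (assumed semantics, not confirmed by this change; `base_lr` mirrors the `lr` value in most of the configs above):

```python
# Sketch of a typical "warmuplr" schedule; assumed semantics, since the
# scheduler implementation is not shown in this diff.
def warmup_lr(step, base_lr=0.002, warmup_steps=25000):
    """Linear warmup to base_lr at warmup_steps, then ~step**-0.5 decay."""
    step = max(step, 1)
    scale = warmup_steps ** 0.5 * min(step ** -0.5, step * warmup_steps ** -1.5)
    return base_lr * scale

for s in (1, 1000, 25000, 100000):
    print(s, warmup_lr(s))   # peaks at base_lr when s == warmup_steps
```

Under this assumed schedule the learning rate grows linearly to `lr` over the first 25000 updates and then decays proportionally to the inverse square root of the step count.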