add u2 config

pull/578/head
Hui Zhang 5 years ago
parent 9626e99ce4
commit 2fa6bbbed5

@@ -1,21 +0,0 @@
#! /usr/bin/env bash
. ${MAIN_ROOT}/utils/utility.sh
DIR=data/pretrain
mkdir -p ${DIR}
URL='https://deepspeech.bj.bcebos.com/eng_models/librispeech_model_fluid.tar.gz'
MD5=fafb11fe57c3ecd107147056453f5348
TARGET=${DIR}/librispeech_model_fluid.tar.gz
echo "Download LibriSpeech model ..."
download $URL $MD5 $TARGET
if [ $? -ne 0 ]; then
echo "Fail to download LibriSpeech model!"
exit 1
fi
tar -zxvf $TARGET -C ${DIR}
exit 0
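
The removed script follows a common download-verify-extract pattern via the `download` helper defined in utils/utility.sh (not shown in this diff). A minimal Python sketch of the same pattern, with an illustrative helper of the same name:

# Hedged Python sketch of the download-verify-extract pattern the removed
# script used; the real `download` is a shell helper in utils/utility.sh,
# so this function is illustrative.
import hashlib
import os
import tarfile
import urllib.request

def download(url: str, md5: str, target: str) -> bool:
    """Fetch url to target and verify the file's MD5 checksum."""
    os.makedirs(os.path.dirname(target), exist_ok=True)
    urllib.request.urlretrieve(url, target)
    with open(target, 'rb') as f:
        return hashlib.md5(f.read()).hexdigest() == md5

URL = 'https://deepspeech.bj.bcebos.com/eng_models/librispeech_model_fluid.tar.gz'
TARGET = 'data/pretrain/librispeech_model_fluid.tar.gz'
if download(URL, 'fafb11fe57c3ecd107147056453f5348', TARGET):
    with tarfile.open(TARGET) as tar:
        tar.extractall('data/pretrain')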

@@ -0,0 +1,90 @@
# network architecture
# encoder related
encoder: conformer
encoder_conf:
output_size: 256 # dimension of attention
attention_heads: 4
linear_units: 2048 # the number of units of position-wise feed forward
num_blocks: 12 # the number of encoder blocks
dropout_rate: 0.1
positional_dropout_rate: 0.1
attention_dropout_rate: 0.0
input_layer: conv2d # encoder input type; you can choose conv2d, conv2d6 or conv2d8
normalize_before: true
cnn_module_kernel: 15
use_cnn_module: True
activation_type: 'swish'
pos_enc_layer_type: 'rel_pos'
selfattention_layer_type: 'rel_selfattn'
causal: true
use_dynamic_chunk: true
cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes the model converge faster
use_dynamic_left_chunk: false
# decoder related
decoder: transformer
decoder_conf:
attention_heads: 4
linear_units: 2048
num_blocks: 6
dropout_rate: 0.1
positional_dropout_rate: 0.1
self_attention_dropout_rate: 0.0
src_attention_dropout_rate: 0.0
# hybrid CTC/attention
model_conf:
ctc_weight: 0.3
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false
# use raw_wav or kaldi feature
raw_wav: true
# feature extraction
collate_conf:
# waveform level config
wav_distortion_conf:
wav_dither: 1.0
wav_distortion_rate: 0.0
distortion_methods: []
speed_perturb: true
feature_extraction_conf:
feature_type: 'fbank'
mel_bins: 80
frame_shift: 10
frame_length: 25
using_pitch: false
# spec level config
# spec_swap: false
feature_dither: 0.0 # add dither in [-feature_dither, feature_dither] to the fbank features
spec_aug: true
spec_aug_conf:
warp_for_time: False
num_t_mask: 2
num_f_mask: 2
max_t: 50
max_f: 10
max_w: 80
# dataset related
dataset_conf:
max_length: 40960
min_length: 0
batch_type: 'static' # static or dynamic
# batch_size should be set according to your GPU memory; here we used a 2080 Ti GPU with 11 GB of memory
batch_size: 16
sort: true
grad_clip: 5
accum_grad: 1
max_epoch: 180
log_interval: 100
optim: adam
optim_conf:
lr: 0.001
scheduler: warmuplr # pytorch v1.1.0+ required
scheduler_conf:
warmup_steps: 25000
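
The `model_conf` block configures hybrid CTC/attention training: the total loss is a convex combination of the CTC and attention losses weighted by `ctc_weight`. A hedged sketch of that convention (U2-style; the repo's own loss code is not part of this diff):

# Hedged sketch of the hybrid CTC/attention objective that model_conf
# configures; illustrative, not the repo's own code.
def hybrid_loss(loss_ctc, loss_att, ctc_weight=0.3):
    # total loss = w * L_ctc + (1 - w) * L_attention, with w = ctc_weight
    return ctc_weight * loss_ctc + (1.0 - ctc_weight) * loss_att

print(hybrid_loss(2.0, 1.0))  # 0.3 * 2.0 + 0.7 * 1.0 = 1.3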

@@ -0,0 +1,83 @@
# network architecture
# encoder related
encoder: transformer
encoder_conf:
output_size: 256 # dimension of attention
attention_heads: 4
linear_units: 2048 # the number of units of position-wise feed forward
num_blocks: 12 # the number of encoder blocks
dropout_rate: 0.1
positional_dropout_rate: 0.1
attention_dropout_rate: 0.0
input_layer: conv2d # encoder input type
normalize_before: true
use_dynamic_chunk: true
use_dynamic_left_chunk: false
# decoder related
decoder: transformer
decoder_conf:
attention_heads: 4
linear_units: 2048
num_blocks: 6
dropout_rate: 0.1
positional_dropout_rate: 0.1
self_attention_dropout_rate: 0.0
src_attention_dropout_rate: 0.0
# hybrid CTC/attention
model_conf:
ctc_weight: 0.3
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false
# use raw_wav or kaldi feature
raw_wav: true
# feature extraction
collate_conf:
# waveform level config
wav_distortion_conf:
wav_dither: 0.0
wav_distortion_rate: 0.0
distortion_methods: []
speed_perturb: false
feature_extraction_conf:
feature_type: 'fbank'
mel_bins: 80
frame_shift: 10
frame_length: 25
using_pitch: false
# spec level config
# spec_swap: false
feature_dither: 0.0 # add dither in [-feature_dither, feature_dither] to the fbank features
spec_aug: true
spec_aug_conf:
warp_for_time: False
num_t_mask: 2
num_f_mask: 2
max_t: 50
max_f: 10
max_w: 80
# dataset related
dataset_conf:
max_length: 40960
min_length: 0
batch_type: 'static' # static or dynamic
# batch_size should be set according to your GPU memory; here we used a 2080 Ti GPU with 11 GB of memory
batch_size: 16
sort: true
grad_clip: 5
accum_grad: 1
max_epoch: 180
log_interval: 100
optim: adam
optim_conf:
lr: 0.002
scheduler: warmuplr # pytorch v1.1.0+ required
scheduler_conf:
warmup_steps: 25000
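
Both streaming configs set `use_dynamic_chunk: true`: during training, the self-attention mask restricts each frame to frames up to the end of its own chunk (with `causal: true`, no right context), and the chunk size is sampled per batch so one model can decode at many latencies. A hedged numpy sketch of the underlying chunk mask (illustrative, not the repo's implementation):

# Hedged sketch of the chunk-based attention mask implied by use_dynamic_chunk
# (U2/WeNet style); in training, chunk_size would be sampled per batch.
import numpy as np

def subsequent_chunk_mask(size: int, chunk_size: int) -> np.ndarray:
    """mask[i, j] is True when frame i may attend to frame j: each frame
    sees everything up to the end of its own chunk, and nothing beyond."""
    mask = np.zeros((size, size), dtype=bool)
    for i in range(size):
        mask[i, :min((i // chunk_size + 1) * chunk_size, size)] = True
    return mask

print(subsequent_chunk_mask(6, 2).astype(int))
# [[1 1 0 0 0 0]
#  [1 1 0 0 0 0]
#  [1 1 1 1 0 0]
#  [1 1 1 1 0 0]
#  [1 1 1 1 1 1]
#  [1 1 1 1 1 1]]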

@@ -0,0 +1,86 @@
# network architecture
# encoder related
encoder: conformer
encoder_conf:
output_size: 256 # dimension of attention
attention_heads: 4
linear_units: 2048 # the number of units of position-wise feed forward
num_blocks: 12 # the number of encoder blocks
dropout_rate: 0.1
positional_dropout_rate: 0.1
attention_dropout_rate: 0.0
input_layer: conv2d # encoder input type; you can choose conv2d, conv2d6 or conv2d8
normalize_before: true
cnn_module_kernel: 15
use_cnn_module: True
activation_type: 'swish'
pos_enc_layer_type: 'rel_pos'
selfattention_layer_type: 'rel_selfattn'
# decoder related
decoder: transformer
decoder_conf:
attention_heads: 4
linear_units: 2048
num_blocks: 6
dropout_rate: 0.1
positional_dropout_rate: 0.1
self_attention_dropout_rate: 0.0
src_attention_dropout_rate: 0.0
# hybrid CTC/attention
model_conf:
ctc_weight: 0.3
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false
# use raw_wav or kaldi feature
raw_wav: true
# feature extraction
collate_conf:
# waveform level config
wav_distortion_conf:
wav_dither: 0.1
wav_distortion_rate: 0.0
distortion_methods: []
speed_perturb: true
feature_extraction_conf:
feature_type: 'fbank'
mel_bins: 80
frame_shift: 10
frame_length: 25
using_pitch: false
# spec level config
# spec_swap: false
feature_dither: 0.0 # add dither in [-feature_dither, feature_dither] to the fbank features
spec_aug: true
spec_aug_conf:
warp_for_time: False
num_t_mask: 2
num_f_mask: 2
max_t: 50
max_f: 10
max_w: 80
# dataset related
dataset_conf:
max_length: 40960
min_length: 0
batch_type: 'static' # static or dynamic
# batch_size should be set according to your GPU memory; here we used a 2080 Ti GPU with 11 GB of memory
batch_size: 16
sort: true
grad_clip: 5
accum_grad: 4
max_epoch: 240
log_interval: 100
optim: adam
optim_conf:
lr: 0.002
scheduler: warmuplr # pytorch v1.1.0+ required
scheduler_conf:
warmup_steps: 25000
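
This non-streaming variant updates once every `accum_grad: 4` micro-batches, so the effective batch per update is 16 × 4 = 64 per GPU, and `grad_clip: 5` bounds the global gradient norm. A hedged Paddle sketch of that loop, with a toy model and data standing in for the repo's trainer:

# Hedged sketch of gradient accumulation and clipping as configured by
# accum_grad: 4 and grad_clip: 5; not the repo's actual training loop.
import paddle

accum_grad = 4
model = paddle.nn.Linear(8, 1)
optimizer = paddle.optimizer.Adam(
    learning_rate=0.002,
    parameters=model.parameters(),
    grad_clip=paddle.nn.ClipGradByGlobalNorm(5.0))  # grad_clip: 5

for step in range(8):                    # stands in for the data loader
    x = paddle.randn([16, 8])            # batch_size: 16 from the config
    loss = model(x).mean() / accum_grad  # scale so 4 micro-batches equal 1 big batch
    loss.backward()                      # gradients accumulate until cleared
    if (step + 1) % accum_grad == 0:     # one optimizer step per 4 micro-batches
        optimizer.step()
        optimizer.clear_grad()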

@@ -0,0 +1,80 @@
# network architecture
# encoder related
encoder: transformer
encoder_conf:
output_size: 256 # dimension of attention
attention_heads: 4
linear_units: 2048 # the number of units of position-wise feed forward
num_blocks: 12 # the number of encoder blocks
dropout_rate: 0.1
positional_dropout_rate: 0.1
attention_dropout_rate: 0.0
input_layer: conv2d # encoder input type
normalize_before: true
# decoder related
decoder: transformer
decoder_conf:
attention_heads: 4
linear_units: 2048
num_blocks: 6
dropout_rate: 0.1
positional_dropout_rate: 0.1
self_attention_dropout_rate: 0.0
src_attention_dropout_rate: 0.0
# hybrid CTC/attention
model_conf:
ctc_weight: 0.3
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false
# use raw_wav or kaldi feature
raw_wav: true
# feature extraction
collate_conf:
# waveform level config
wav_distortion_conf:
wav_dither: 0.1
wav_distortion_rate: 0.0
distortion_methods: []
speed_perturb: true
feature_extraction_conf:
feature_type: 'fbank'
mel_bins: 80
frame_shift: 10
frame_length: 25
using_pitch: false
# spec level config
feature_dither: 0.0 # add dither in [-feature_dither, feature_dither] to the fbank features
spec_aug: true
spec_aug_conf:
warp_for_time: False
num_t_mask: 2
num_f_mask: 2
max_t: 50
max_f: 10
max_w: 80
# dataset related
dataset_conf:
max_length: 40960
min_length: 0
batch_type: 'static' # static or dynamic
# batch_size should be set according to your GPU memory; here we used a 2080 Ti GPU with 11 GB of memory
batch_size: 26
sort: true
grad_clip: 5
accum_grad: 1
max_epoch: 240
log_interval: 100
optim: adam
optim_conf:
lr: 0.002
scheduler: warmuplr # pytorch v1.1.0+ required
scheduler_conf:
warmup_steps: 25000
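
`scheduler: warmuplr` in recipes of this lineage is conventionally the Noam-style schedule: the learning rate ramps up linearly for `warmup_steps` steps, peaks at the base rate, then decays with the inverse square root of the step. A hedged sketch under that assumption (the scheduler class itself is not in this diff):

# Hedged sketch of the standard WarmupLR rule assumed by scheduler: warmuplr:
#   lr(step) = base_lr * warmup_steps**0.5 * min(step**-0.5, step * warmup_steps**-1.5)
def warmup_lr(step: int, base_lr: float = 0.002, warmup_steps: int = 25000) -> float:
    step = max(step, 1)  # avoid 0 ** -0.5 at the very first step
    return base_lr * warmup_steps ** 0.5 * min(step ** -0.5, step * warmup_steps ** -1.5)

for s in (1, 12500, 25000, 100000):
    print(s, f'{warmup_lr(s):.2e}')
# ramps linearly to base_lr (0.002) at step 25000, then decays as step ** -0.5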

@@ -0,0 +1 @@
../../s0/local/data.sh

@@ -0,0 +1 @@
../../s0/local/download_lm_en.sh

@@ -0,0 +1,14 @@
export MAIN_ROOT=${PWD}/../../../
export PATH=${MAIN_ROOT}:${PWD}/tools:${PATH}
export LC_ALL=C
# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib/
MODEL=u2
export BIN_DIR=${MAIN_ROOT}/deepspeech/exps/${MODEL}/bin

@@ -0,0 +1,16 @@
#!/bin/bash
set -e
source path.sh
# prepare data
bash ./local/data.sh
# train model
bash ./local/train.sh
# test model
bash ./local/test.sh
# infer model
bash ./local/infer.sh

@@ -13,8 +13,11 @@
# limitations under the License.
import paddle
import unittest
import numpy as np
from yacs.config import CfgNode as CN
from deepspeech.models.u2 import U2TransformerModel
from deepspeech.models.u2 import U2ConformerModel
@@ -41,9 +44,82 @@ class TestU2Model(unittest.TestCase):
self.text_len = paddle.to_tensor(text_len, dtype='int64')
def test_transformer(self):
conf_str = """
# network architecture
# encoder related
encoder: transformer
encoder_conf:
output_size: 256 # dimension of attention
attention_heads: 4
linear_units: 2048 # the number of units of position-wise feed forward
num_blocks: 12 # the number of encoder blocks
dropout_rate: 0.1
positional_dropout_rate: 0.1
attention_dropout_rate: 0.0
input_layer: conv2d # encoder input type
normalize_before: true
# decoder related
decoder: transformer
decoder_conf:
attention_heads: 4
linear_units: 2048
num_blocks: 6
dropout_rate: 0.1
positional_dropout_rate: 0.1
self_attention_dropout_rate: 0.0
src_attention_dropout_rate: 0.0
# hybrid CTC/attention
model_conf:
ctc_weight: 0.3
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false
"""
cfg = CN().load_cfg(conf_str)
print(cfg)
model = U2TransformerModel()
def test_conformer(self):
conf_str = """
# network architecture
# encoder related
encoder: conformer
encoder_conf:
output_size: 256 # dimension of attention
attention_heads: 4
linear_units: 2048 # the number of units of position-wise feed forward
num_blocks: 12 # the number of encoder blocks
dropout_rate: 0.1
positional_dropout_rate: 0.1
attention_dropout_rate: 0.0
input_layer: conv2d # encoder input type; you can choose conv2d, conv2d6 or conv2d8
normalize_before: true
cnn_module_kernel: 15
use_cnn_module: True
activation_type: 'swish'
pos_enc_layer_type: 'rel_pos'
selfattention_layer_type: 'rel_selfattn'
# decoder related
decoder: transformer
decoder_conf:
attention_heads: 4
linear_units: 2048
num_blocks: 6
dropout_rate: 0.1
positional_dropout_rate: 0.1
self_attention_dropout_rate: 0.0
src_attention_dropout_rate: 0.0
# hybrid CTC/attention
model_conf:
ctc_weight: 0.3
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false
"""
cfg = CN().load_cfg(conf_str)
print(cfg)
model = U2ConformerModel()
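
The same `load_cfg` call the tests use can also read the YAML files added above from disk. A brief sketch (the path below is an assumption, since file names are not shown in these hunks):

# Hedged sketch: load_cfg accepts a file object as well as an inline string;
# 'conf/conformer.yaml' is an assumed path, not one confirmed by this diff.
from yacs.config import CfgNode as CN

with open('conf/conformer.yaml') as f:
    cfg = CN().load_cfg(f)

print(cfg.encoder)                   # conformer
print(cfg.encoder_conf.num_blocks)   # 12
print(cfg.model_conf.ctc_weight)     # 0.3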
