Merge pull request #739 from PaddlePaddle/k8

zh 8k egs
4 years ago · de520cee96
parent 8432b9b256 7278cf9bbb
commit de520cee96
15 changed files with 630 additions and 4 deletions
--- a/examples/callcenter/s1/.gitignore
+++ b/examples/callcenter/s1/.gitignore
@ -0,0 +1,3 @@
 data
 exp
 *.profile
--- a/examples/callcenter/s1/README.md
+++ b/examples/callcenter/s1/README.md
@ -0,0 +1,20 @@
 # MandarinK8
 ## Conformer
 | Model | Params | Config | Augmentation| Test set | Decode method | Loss | CER |  
 | --- | --- | --- | --- | --- | --- | --- | --- |  
 | conformer | 45.73 M | conf/conformer.yaml | spec_aug + shift | test | attention | 2.1794936656951904 | 0.102304 |  
 | conformer | 45.73 M | conf/conformer.yaml | spec_aug + shift | test | ctc_greedy_search | 2.1794936656951904 | 0.084295 |  
 | conformer | 45.73 M | conf/conformer.yaml | spec_aug + shift | test | ctc_prefix_beam_search | 2.1794936656951904 | 0.084340 |  
 | conformer | 45.73 M | conf/conformer.yaml | spec_aug + shift | test | attention_rescoring | 2.1794936656951904 | 0.081675 |  
 ## Chunk Conformer
 | Model | Params | Config | Augmentation| Test set | Decode method | Chunk Size & Left Chunks | Loss | CER |  
 | --- | --- | --- | --- | --- | --- | --- | --- | --- |  
 | conformer | 45.73 M | conf/chunk_conformer.yaml | spec_aug + shift | test | attention | 16, -1 | 2.23287845  | 0.087982 |  
 | conformer | 45.73 M | conf/chunk_conformer.yaml | spec_aug + shift | test | ctc_greedy_search | 16, -1 | 2.23287845  | 0.086962 |  
 | conformer | 45.73 M | conf/chunk_conformer.yaml | spec_aug + shift | test | ctc_prefix_beam_search | 16, -1 | 2.23287845 | 0.086741 |  
 | conformer | 45.73 M | conf/chunk_conformer.yaml | spec_aug + shift | test | attention_rescoring | 16, -1 | 2.23287845 | 0.083495 | 
--- a/examples/callcenter/s1/conf/augmentation.json
+++ b/examples/callcenter/s1/conf/augmentation.json
@ -0,0 +1,34 @@
 [
  {
    "type": "speed",
    "params": {
      "min_speed_rate": 0.9,
      "max_speed_rate": 1.1,
      "num_rates": 3
    },
    "prob": 0.0
  },
  {
    "type": "shift",
    "params": {
      "min_shift_ms": -5,
      "max_shift_ms": 5
    },
    "prob": 1.0
  },
  {
    "type": "specaug",
    "params": {
      "F": 10,
      "T": 50,
      "n_freq_masks": 2,
      "n_time_masks": 2,
      "p": 1.0,
      "W": 80,
      "adaptive_number_ratio": 0,
      "adaptive_size_ratio": 0,
      "max_n_time_masks": 20
    },
    "prob": 1.0
  }
 ]
--- a/examples/callcenter/s1/conf/chunk_conformer.yaml
+++ b/examples/callcenter/s1/conf/chunk_conformer.yaml
@ -0,0 +1,120 @@
 # https://yaml.org/type/float.html
 data:
  train_manifest: data/manifest.train
  dev_manifest: data/manifest.dev
  test_manifest: data/manifest.test
  min_input_len: 0.5
  max_input_len: 20.0 # second
  min_output_len: 0.0
  max_output_len: 400.0
  min_output_input_ratio: 0.05
  max_output_input_ratio: 10.0
 collator:
  vocab_filepath: data/vocab.txt 
  unit_type: 'char'
  spm_model_prefix: ''
  augmentation_config: conf/augmentation.json
  batch_size: 32
  raw_wav: True  # use raw_wav or kaldi feature
  specgram_type: fbank #linear, mfcc, fbank
  feat_dim: 80
  delta_delta: False
  dither: 1.0
  target_sample_rate: 8000
  max_freq: None
  n_fft: None
  stride_ms: 10.0
  window_ms: 25.0
  use_dB_normalization: True 
  target_dB: -20
  random_seed: 0
  keep_transcription_text: False
  sortagrad: True 
  shuffle_method: batch_shuffle
  num_workers: 2
 # network architecture
 model:
    cmvn_file: "data/mean_std.json"
    cmvn_file_type: "json"
    # encoder related
    encoder: conformer
    encoder_conf:
        output_size: 256    # dimension of attention
        attention_heads: 4
        linear_units: 2048  # the number of units of position-wise feed forward
        num_blocks: 12      # the number of encoder blocks
        dropout_rate: 0.1
        positional_dropout_rate: 0.1
        attention_dropout_rate: 0.0
        input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8
        normalize_before: True
        use_cnn_module: True
        cnn_module_kernel: 15
        activation_type: 'swish'
        pos_enc_layer_type: 'rel_pos'
        selfattention_layer_type: 'rel_selfattn'
        causal: true
        use_dynamic_chunk: true
        cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster
        use_dynamic_left_chunk: false
    # decoder related
    decoder: transformer
    decoder_conf:
        attention_heads: 4
        linear_units: 2048
        num_blocks: 6
        dropout_rate: 0.1
        positional_dropout_rate: 0.1
        self_attention_dropout_rate: 0.0
        src_attention_dropout_rate: 0.0
    # hybrid CTC/attention
    model_conf:
        ctc_weight: 0.3
        lsm_weight: 0.1     # label smoothing option
        length_normalized_loss: false
 training:
  n_epoch: 240
  accum_grad: 4
  global_grad_clip: 5.0
  optim: adam
  optim_conf:
    lr: 0.001
    weight_decay: 1e-6
  scheduler: warmuplr     # pytorch v1.1.0+ required
  scheduler_conf:
    warmup_steps: 25000
    lr_decay: 1.0
  log_interval: 100
  checkpoint:
    kbest_n: 50
    latest_n: 5
 decoding:
  batch_size: 128
  error_rate_type: cer 
  decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
  lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm
  alpha: 2.5
  beta: 0.3
  beam_size: 10
  cutoff_prob: 1.0
  cutoff_top_n: 0
  num_proc_bsearch: 8
  ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
  decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
      # <0: for decoding, use full chunk.
      # >0: for decoding, use fixed chunk size as set.
      # 0: used for training, it's prohibited here. 
  num_decoding_left_chunks: -1  # number of left chunks for decoding. Defaults to -1.
  simulate_streaming: true  # simulate streaming inference. Defaults to False.
--- a/examples/callcenter/s1/conf/conformer.yaml
+++ b/examples/callcenter/s1/conf/conformer.yaml
@ -0,0 +1,117 @@
 # https://yaml.org/type/float.html
 data:
  train_manifest: data/manifest.train
  dev_manifest: data/manifest.dev
  test_manifest: data/manifest.test
  min_input_len: 0.5
  max_input_len: 20.0 # second
  min_output_len: 0.0
  max_output_len: 400.0
  min_output_input_ratio: 0.0
  max_output_input_ratio: .inf 
 collator:
  vocab_filepath: data/vocab.txt 
  unit_type: 'char'
  spm_model_prefix: ''
  augmentation_config: conf/augmentation.json
  batch_size: 32
  raw_wav: True  # use raw_wav or kaldi feature
  specgram_type: fbank #linear, mfcc, fbank
  feat_dim: 80
  delta_delta: False
  dither: 1.0
  target_sample_rate: 8000
  max_freq: None
  n_fft: None
  stride_ms: 10.0
  window_ms: 25.0
  use_dB_normalization: True 
  target_dB: -20
  random_seed: 0
  keep_transcription_text: False
  sortagrad: True 
  shuffle_method: batch_shuffle
  num_workers: 2
 # network architecture
 model:
    cmvn_file: "data/mean_std.json"
    cmvn_file_type: "json"
    # encoder related
    encoder: conformer
    encoder_conf:
        output_size: 256    # dimension of attention
        attention_heads: 4
        linear_units: 2048  # the number of units of position-wise feed forward
        num_blocks: 12      # the number of encoder blocks
        dropout_rate: 0.1
        positional_dropout_rate: 0.1
        attention_dropout_rate: 0.0
        input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8
        normalize_before: True
        use_cnn_module: True
        cnn_module_kernel: 15
        activation_type: 'swish'
        pos_enc_layer_type: 'rel_pos'
        selfattention_layer_type: 'rel_selfattn'
    # decoder related
    decoder: transformer
    decoder_conf:
        attention_heads: 4
        linear_units: 2048
        num_blocks: 6
        dropout_rate: 0.1
        positional_dropout_rate: 0.1
        self_attention_dropout_rate: 0.0
        src_attention_dropout_rate: 0.0
    # hybrid CTC/attention
    model_conf:
        ctc_weight: 0.3
        lsm_weight: 0.1     # label smoothing option
        length_normalized_loss: false
 training:
  n_epoch: 100 # 50 will be lowest 
  accum_grad: 4
  global_grad_clip: 5.0
  optim: adam
  optim_conf:
    lr: 0.002
    weight_decay: 1e-6
  scheduler: warmuplr     # pytorch v1.1.0+ required
  scheduler_conf:
    warmup_steps: 25000
    lr_decay: 1.0
  log_interval: 100
  checkpoint:
    kbest_n: 50
    latest_n: 5
 decoding:
  batch_size: 128
  error_rate_type: cer 
  decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
  lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm
  alpha: 2.5
  beta: 0.3
  beam_size: 10
  cutoff_prob: 1.0
  cutoff_top_n: 0
  num_proc_bsearch: 8
  ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
  decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
      # <0: for decoding, use full chunk.
      # >0: for decoding, use fixed chunk size as set.
      # 0: used for training, it's prohibited here. 
  num_decoding_left_chunks: -1  # number of left chunks for decoding. Defaults to -1.
  simulate_streaming: False  # simulate streaming inference. Defaults to False.
--- a/examples/callcenter/s1/local/align.sh
+++ b/examples/callcenter/s1/local/align.sh
@ -0,0 +1,43 @@
 #! /usr/bin/env bash
 if [ $# != 2 ];then
    echo "usage: ${0} config_path ckpt_path_prefix"
    exit -1
 fi
 ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
 echo "using $ngpu gpus..."
 device=gpu
 if [ ${ngpu} == 0 ];then
    device=cpu
 fi
 config_path=$1
 ckpt_prefix=$2
 ckpt_name=$(basename ${ckpt_prefxi})
 mkdir -p exp
 batch_size=1
 output_dir=${ckpt_prefix}
 mkdir -p ${output_dir}
 # align dump in `result_file`
 # .tier, .TextGrid dump in `dir of result_file`
 python3 -u ${BIN_DIR}/alignment.py \
 --device ${device} \
 --nproc 1 \
 --config ${config_path} \
 --result_file ${output_dir}/${type}.align \
 --checkpoint_path ${ckpt_prefix} \
 --opts decoding.batch_size ${batch_size}
 if [ $? -ne 0 ]; then
    echo "Failed in ctc alignment!"
    exit 1
 fi
 exit 0
--- a/examples/callcenter/s1/local/data.sh
+++ b/examples/callcenter/s1/local/data.sh
@ -0,0 +1,77 @@
 #! /usr/bin/env bash
 stage=-1
 stop_stage=100
 source ${MAIN_ROOT}/utils/parse_options.sh
 mkdir -p data
 if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
    for dataset in train dev test; do
        mv data/manifest.${dataset} data/manifest.${dataset}.raw
    done
 fi
 if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    # download data, generate manifests
    # build vocabulary
    python3 ${MAIN_ROOT}/utils/build_vocab.py \
    --unit_type="char" \
    --count_threshold=0 \
    --vocab_path="data/vocab.txt" \
    --manifest_paths "data/manifest.train.raw"
    if [ $? -ne 0 ]; then
        echo "Build vocabulary failed. Terminated."
        exit 1
    fi
 fi
 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    # compute mean and stddev for normalizer
    num_workers=$(nproc)
    python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
    --manifest_path="data/manifest.train.raw" \
    --specgram_type="fbank" \
    --feat_dim=80 \
    --delta_delta=false \
    --stride_ms=10.0 \
    --window_ms=25.0 \
    --sample_rate=8000 \
    --use_dB_normalization=False \
    --num_samples=-1 \
    --num_workers=${num_workers} \
    --output_path="data/mean_std.json"
    if [ $? -ne 0 ]; then
        echo "Compute mean and stddev failed. Terminated."
        exit 1
    fi
 fi
 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    # format manifest with tokenids, vocab size
    for dataset in train dev test; do
    {
        python3 ${MAIN_ROOT}/utils/format_data.py \
            --feat_type "raw" \
            --cmvn_path "data/mean_std.json" \
            --unit_type "char" \
            --vocab_path="data/vocab.txt" \
            --manifest_path="data/manifest.${dataset}.raw" \
            --output_path="data/manifest.${dataset}"
        if [ $? -ne 0 ]; then
            echo "Formt mnaifest failed. Terminated."
            exit 1
        fi
    } &
    done
    wait
 fi
 echo "data preparation done."
 exit 0
--- a/examples/callcenter/s1/local/download_lm_ch.sh
+++ b/examples/callcenter/s1/local/download_lm_ch.sh
@ -0,0 +1,21 @@
 #!/bin/bash
 . ${MAIN_ROOT}/utils/utility.sh
 DIR=data/lm
 mkdir -p ${DIR}
 URL='https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm'
 MD5="29e02312deb2e59b3c8686c7966d4fe3"
 TARGET=${DIR}/zh_giga.no_cna_cmn.prune01244.klm
 echo "Download language model ..."
 download $URL $MD5 $TARGET
 if [ $? -ne 0 ]; then
    echo "Fail to download the language model!"
    exit 1
 fi
 exit 0
--- a/examples/callcenter/s1/local/export.sh
+++ b/examples/callcenter/s1/local/export.sh
@ -0,0 +1,34 @@
 #! /usr/bin/env bash
 if [ $# != 3 ];then
    echo "usage: $0 config_path ckpt_prefix jit_model_path"
    exit -1
 fi
 ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
 echo "using $ngpu gpus..."
 config_path=$1
 ckpt_path_prefix=$2
 jit_model_export_path=$3
 device=gpu
 if [ ${ngpu} == 0 ];then
    device=cpu
 fi
 python3 -u ${BIN_DIR}/export.py \
 --device ${device} \
 --nproc ${ngpu} \
 --config ${config_path} \
 --checkpoint_path ${ckpt_path_prefix} \
 --export_path ${jit_model_export_path}
 if [ $? -ne 0 ]; then
    echo "Failed in export!"
    exit 1
 fi
 exit 0
--- a/examples/callcenter/s1/local/test.sh
+++ b/examples/callcenter/s1/local/test.sh
@ -0,0 +1,67 @@
 #! /usr/bin/env bash
 if [ $# != 2 ];then
    echo "usage: ${0} config_path ckpt_path_prefix"
    exit -1
 fi
 ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
 echo "using $ngpu gpus..."
 device=gpu
 if [ ${ngpu} == 0 ];then
    device=cpu
 fi
 config_path=$1
 ckpt_prefix=$2
 ckpt_name=$(basename ${ckpt_prefxi})
 mkdir -p exp
 # download language model
 #bash local/download_lm_ch.sh
 #if [ $? -ne 0 ]; then
 #    exit 1
 #fi
 for type in attention ctc_greedy_search; do
    echo "decoding ${type}"
    batch_size=1
    output_dir=${ckpt_prefix}
    mkdir -p ${output_dir}
    python3 -u ${BIN_DIR}/test.py \
    --device ${device} \
    --nproc 1 \
    --config ${config_path} \
    --result_file ${output_dir}/${type}.rsl \
    --checkpoint_path ${ckpt_prefix} \
    --opts decoding.decoding_method ${type} decoding.batch_size ${batch_size}
    if [ $? -ne 0 ]; then
        echo "Failed in evaluation!"
        exit 1
    fi
 done
 for type in ctc_prefix_beam_search attention_rescoring; do
    echo "decoding ${type}"
    batch_size=1
    output_dir=${ckpt_prefix}
    mkdir -p ${output_dir}
    python3 -u ${BIN_DIR}/test.py \
    --device ${device} \
    --nproc 1 \
    --config ${config_path} \
    --result_file ${output_dir}/${type}.rsl \
    --checkpoint_path ${ckpt_prefix} \
    --opts decoding.decoding_method ${type} decoding.batch_size ${batch_size}
    if [ $? -ne 0 ]; then
        echo "Failed in evaluation!"
        exit 1
    fi
 done
 exit 0
--- a/examples/callcenter/s1/local/train.sh
+++ b/examples/callcenter/s1/local/train.sh
@ -0,0 +1,33 @@
 #! /usr/bin/env bash
 if [ $# != 2 ];then
    echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name"
    exit -1
 fi
 ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
 echo "using $ngpu gpus..."
 config_path=$1
 ckpt_name=$2
 device=gpu
 if [ ${ngpu} == 0 ];then
    device=cpu
 fi
 echo "using ${device}..."
 mkdir -p exp
 python3 -u ${BIN_DIR}/train.py \
 --device ${device} \
 --nproc ${ngpu} \
 --config ${config_path} \
 --output exp/${ckpt_name}
 if [ $? -ne 0 ]; then
    echo "Failed in training!"
    exit 1
 fi
 exit 0
--- a/examples/callcenter/s1/path.sh
+++ b/examples/callcenter/s1/path.sh
@ -0,0 +1,14 @@
 export MAIN_ROOT=${PWD}/../../../
 export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
 export LC_ALL=C
 # Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
 export PYTHONIOENCODING=UTF-8 
 export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
 export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib/
 MODEL=u2
 export BIN_DIR=${MAIN_ROOT}/deepspeech/exps/${MODEL}/bin
--- a/examples/callcenter/s1/run.sh
+++ b/examples/callcenter/s1/run.sh
@ -0,0 +1,44 @@
 #!/bin/bash
 set -e
 source path.sh
 stage=0
 stop_stage=100
 conf_path=conf/conformer.yaml
 avg_num=20
 source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
 avg_ckpt=avg_${avg_num}
 ckpt=$(basename ${conf_path} | awk -F'.' '{print $1}')
 echo "checkpoint name ${ckpt}"
 if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    # prepare data
    bash ./local/data.sh || exit -1
 fi
 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    # train model, all `ckpt` under `exp` dir
    CUDA_VISIBLE_DEVICES=0,1,2,3 ./local/train.sh ${conf_path}  ${ckpt}
 fi
 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    # avg n best model
    avg.sh exp/${ckpt}/checkpoints ${avg_num}
 fi
 if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    # test ckpt avg_n
    CUDA_VISIBLE_DEVICES=4 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
 fi
 if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
    # ctc alignment of test data
    CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
 fi
 if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
    # export ckpt avg_n
    CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
 fi
--- a/examples/librispeech/s1/README.md
+++ b/examples/librispeech/s1/README.md
@ -17,9 +17,8 @@
 | --- | --- | --- | --- | --- | --- | --- | --- | --- |  
 | conformer | 47.63 M | conf/chunk_conformer.yaml | spec_aug + shift | test-clean | attention | 16, -1 | 7.01250648 | 0.069548 |  
 | conformer | 47.63 M | conf/chunk_conformer.yaml | spec_aug + shift | test-clean | ctc_greedy_search | 16, -1 | 7.01250648 | 0.094753 |  
-| conformer | 47.63 M | conf/chunk_conformer.yaml | spec_aug + shift | test-clean | ctc_prefix_beam_search | 16, -1 | 7.01250648 |  |  
+| conformer | 47.63 M | conf/chunk_conformer.yaml | spec_aug + shift | test-clean | ctc_prefix_beam_search | 16, -1 | 7.01250648 | - |  
-| conformer | 47.63 M | conf/chunk_conformer.yaml | spec_aug + shift | test-clean | attention_rescoring | 16, -1 | 7.01250648 |  |  
+| conformer | 47.63 M | conf/chunk_conformer.yaml | spec_aug + shift | test-clean | attention_rescoring | 16, -1 | 7.01250648 | - |  
 ## Transformer
--- a/examples/librispeech/s1/conf/chunk_conformer.yaml
+++ b/examples/librispeech/s1/conf/chunk_conformer.yaml
@ -110,7 +110,7 @@ decoding:
  cutoff_top_n: 0
  num_proc_bsearch: 8
  ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
-  decoding_chunk_size: 16 # decoding chunk size. Defaults to -1.
+  decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
      # <0: for decoding, use full chunk.
      # >0: for decoding, use fixed chunk size as set.
      # 0: used for training, it's prohibited here.