spm demo; librispeech scripts and confs

4 years ago · f3cecf8809
parent 716bf6f1dd
commit f3cecf8809
25 changed files with 810 additions and 4 deletions
--- a/examples/aishell/.gitignore
+++ b/examples/aishell/.gitignore
@ -1,5 +1,5 @@
 data
 ckpt*
 demo_cache
-*.log
-log
+*log
+*profile
--- a/examples/aishell/s1/.gitignore
+++ b/examples/aishell/s1/.gitignore
@ -0,0 +1,3 @@
+data
+exp
+log
--- a/examples/librispeech/.gitignore
+++ b/examples/librispeech/.gitignore
@ -1,2 +1,4 @@
 data
+exp
+log
 ckpt*
--- a/examples/librispeech/s1/.gitignore
+++ b/examples/librispeech/s1/.gitignore
@ -0,0 +1,3 @@
+data
+exp
+log
--- a/examples/librispeech/s1/conf/augmentation.json
+++ b/examples/librispeech/s1/conf/augmentation.json
@ -0,0 +1,34 @@
+[
+  {
+    "type": "shift",
+    "params": {
+      "min_shift_ms": -5,
+      "max_shift_ms": 5
+    },
+    "prob": 1.0
+  },
+  {
+    "type": "speed",
+    "params": {
+      "min_speed_rate": 0.9,
+      "max_speed_rate": 1.1,
+      "num_rates": 3
+    },
+    "prob": 0.0
+  },
+  {
+    "type": "specaug",
+    "params": {
+      "F": 10,
+      "T": 50,
+      "n_freq_masks": 2,
+      "n_time_masks": 2,
+      "p": 1.0,
+      "W": 80,
+      "adaptive_number_ratio": 0,
+      "adaptive_size_ratio": 0,
+      "max_n_time_masks": 20
+    },
+    "prob": 1.0
+  }
+]
--- a/examples/librispeech/s1/conf/chunk_confermer.yaml
+++ b/examples/librispeech/s1/conf/chunk_confermer.yaml
@ -0,0 +1,115 @@
+# https://yaml.org/type/float.html
+data:
+  train_manifest: data/manifest.tiny
+  dev_manifest: data/manifest.tiny
+  test_manifest: data/manifest.tiny
+  vocab_filepath: data/vocab.txt 
+  unit_type: 'spm'
+  spm_model_prefix: 'data/bpe_unigram_200'
+  mean_std_filepath: ""
+  augmentation_config: conf/augmentation.json
+  batch_size: 4
+  min_input_len: 0.5
+  max_input_len: 20.0
+  min_output_len: 0.0
+  max_output_len: 400.0
+  min_output_input_ratio: 0.05
+  max_output_input_ratio: 10.0
+  raw_wav: True  # use raw_wav or kaldi feature
+  specgram_type: fbank #linear, mfcc, fbank
+  feat_dim: 80
+  delta_delta: False
+  dither: 1.0
+  target_sample_rate: 16000
+  # NOTE(review): YAML has no `None` literal, so these two values load as the
+  # string "None" rather than null. This presumably relies on the config
+  # loader converting the string — confirm, or write `null` instead.
+  max_freq: None
+  n_fft: None
+  stride_ms: 10.0
+  window_ms: 25.0
+  use_dB_normalization: True
+  target_dB: -20
+  random_seed: 0
+  keep_transcription_text: False
+  sortagrad: True 
+  shuffle_method: batch_shuffle
+  num_workers: 2
+
+
+# network architecture
+model:
+    cmvn_file: "data/mean_std.json"
+    cmvn_file_type: "json"
+    # encoder related
+    encoder: conformer
+    # Hyper-parameters for the encoder named by `encoder` above.
+    # NOTE(review): booleans in this mapping are spelled both `True` and
+    # `false`; YAML 1.1 loaders read both spellings as booleans.
+    encoder_conf:
+        output_size: 256    # dimension of attention
+        attention_heads: 4
+        linear_units: 2048  # the number of units of position-wise feed forward
+        num_blocks: 12      # the number of encoder blocks
+        dropout_rate: 0.1
+        positional_dropout_rate: 0.1
+        attention_dropout_rate: 0.0
+        input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 or conv2d8
+        normalize_before: True
+        use_cnn_module: True
+        cnn_module_kernel: 15
+        activation_type: 'swish'
+        pos_enc_layer_type: 'rel_pos'
+        selfattention_layer_type: 'rel_selfattn'
+        causal: True
+        use_dynamic_chunk: True
+        cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster
+        use_dynamic_left_chunk: false
+
+    # decoder related
+    decoder: transformer
+    decoder_conf:
+        attention_heads: 4
+        linear_units: 2048
+        num_blocks: 6
+        dropout_rate: 0.1
+        positional_dropout_rate: 0.1
+        self_attention_dropout_rate: 0.0
+        src_attention_dropout_rate: 0.0
+
+    # hybrid CTC/attention
+    model_conf:
+        ctc_weight: 0.3
+        lsm_weight: 0.1     # label smoothing option
+        length_normalized_loss: false
+
+
+training:
+  n_epoch: 20
+  accum_grad: 1
+  global_grad_clip: 5.0
+  optim: adam
+  optim_conf:
+    lr: 0.001
+    weight_decay: 1e-06
+  scheduler: warmuplr     # pytorch v1.1.0+ required
+  scheduler_conf:
+    warmup_steps: 25000
+    lr_decay: 1.0
+  log_interval: 1
+
+
+decoding:
+  batch_size: 64
+  error_rate_type: wer
+  decoding_method: attention  # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
+  lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm
+  alpha: 2.5
+  beta: 0.3
+  beam_size: 10
+  cutoff_prob: 1.0
+  cutoff_top_n: 0
+  num_proc_bsearch: 8
+  ctc_weight: 0.0 # ctc weight for attention rescoring decode mode.
+  decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
+      # <0: for decoding, use full chunk.
+      # >0: for decoding, use fixed chunk size as set.
+      # 0: used for training, it's prohibited here. 
+  num_decoding_left_chunks: -1  # number of left chunks for decoding. Defaults to -1.
+  simulate_streaming: False  # simulate streaming inference. Defaults to False.
+
+
--- a/examples/librispeech/s1/conf/chunk_transformer.yaml
+++ b/examples/librispeech/s1/conf/chunk_transformer.yaml
@ -0,0 +1,108 @@
+# https://yaml.org/type/float.html
+data:
+  train_manifest: data/manifest.tiny
+  dev_manifest: data/manifest.tiny
+  test_manifest: data/manifest.tiny
+  vocab_filepath: data/vocab.txt 
+  unit_type: 'spm'
+  spm_model_prefix: 'data/bpe_unigram_200'
+  mean_std_filepath: ""
+  augmentation_config: conf/augmentation.json
+  batch_size: 4
+  min_input_len: 0.5  # second
+  max_input_len: 20.0 # second
+  min_output_len: 0.0 # tokens
+  max_output_len: 400.0 # tokens
+  min_output_input_ratio: 0.05
+  max_output_input_ratio: 10.0
+  raw_wav: True  # use raw_wav or kaldi feature
+  specgram_type: fbank #linear, mfcc, fbank
+  feat_dim: 80
+  delta_delta: False
+  dither: 1.0
+  target_sample_rate: 16000
+  # NOTE(review): YAML has no `None` literal, so these two values load as the
+  # string "None" rather than null. This presumably relies on the config
+  # loader converting the string — confirm, or write `null` instead.
+  max_freq: None
+  n_fft: None
+  stride_ms: 10.0
+  window_ms: 25.0
+  use_dB_normalization: True
+  target_dB: -20
+  random_seed: 0
+  keep_transcription_text: False
+  sortagrad: True 
+  shuffle_method: batch_shuffle
+  num_workers: 2
+
+
+# network architecture
+model:
+    cmvn_file: "data/mean_std.json"
+    cmvn_file_type: "json"
+    # encoder related
+    encoder: transformer
+    # Hyper-parameters for the encoder named by `encoder` above.
+    encoder_conf:
+        output_size: 256    # dimension of attention
+        attention_heads: 4
+        linear_units: 2048  # the number of units of position-wise feed forward
+        num_blocks: 12      # the number of encoder blocks
+        dropout_rate: 0.1
+        positional_dropout_rate: 0.1
+        attention_dropout_rate: 0.0
+        input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 or conv2d8
+        normalize_before: true
+        use_dynamic_chunk: true
+        use_dynamic_left_chunk: false
+
+    # decoder related
+    decoder: transformer
+    decoder_conf:
+        attention_heads: 4
+        linear_units: 2048
+        num_blocks: 6
+        dropout_rate: 0.1
+        positional_dropout_rate: 0.1
+        self_attention_dropout_rate: 0.0
+        src_attention_dropout_rate: 0.0
+
+    # hybrid CTC/attention
+    model_conf:
+        ctc_weight: 0.3
+        lsm_weight: 0.1     # label smoothing option
+        length_normalized_loss: false
+
+
+training:
+  n_epoch: 20
+  accum_grad: 1
+  global_grad_clip: 5.0
+  optim: adam
+  optim_conf:
+    lr: 0.002
+    weight_decay: 1e-06
+  scheduler: warmuplr     # pytorch v1.1.0+ required
+  scheduler_conf:
+    warmup_steps: 25000
+    lr_decay: 1.0
+  log_interval: 1
+
+
+decoding:
+  batch_size: 64
+  error_rate_type: wer
+  decoding_method: attention  # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
+  lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm
+  alpha: 2.5
+  beta: 0.3
+  beam_size: 10
+  cutoff_prob: 1.0
+  cutoff_top_n: 0
+  num_proc_bsearch: 8
+  ctc_weight: 0.0 # ctc weight for attention rescoring decode mode.
+  decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
+      # <0: for decoding, use full chunk.
+      # >0: for decoding, use fixed chunk size as set.
+      # 0: used for training, it's prohibited here. 
+  num_decoding_left_chunks: -1  # number of left chunks for decoding. Defaults to -1.
+  simulate_streaming: False  # simulate streaming inference. Defaults to False.
+
+
--- a/examples/librispeech/s1/conf/conformer.yaml
+++ b/examples/librispeech/s1/conf/conformer.yaml
@ -0,0 +1,111 @@
+# https://yaml.org/type/float.html
+data:
+  train_manifest: data/manifest.tiny
+  dev_manifest: data/manifest.tiny
+  test_manifest: data/manifest.tiny
+  vocab_filepath: data/vocab.txt 
+  unit_type: 'spm'
+  spm_model_prefix: 'data/bpe_unigram_200'
+  mean_std_filepath: ""
+  augmentation_config: conf/augmentation.json
+  batch_size: 4
+  min_input_len: 0.5
+  max_input_len: 20.0
+  min_output_len: 0.0
+  max_output_len: 400.0
+  min_output_input_ratio: 0.05
+  max_output_input_ratio: 10.0
+  raw_wav: True  # use raw_wav or kaldi feature
+  specgram_type: fbank #linear, mfcc, fbank
+  feat_dim: 80
+  delta_delta: False
+  dither: 1.0
+  target_sample_rate: 16000
+  # NOTE(review): YAML has no `None` literal, so these two values load as the
+  # string "None" rather than null. This presumably relies on the config
+  # loader converting the string — confirm, or write `null` instead.
+  max_freq: None
+  n_fft: None
+  stride_ms: 10.0
+  window_ms: 25.0
+  use_dB_normalization: True
+  target_dB: -20
+  random_seed: 0
+  keep_transcription_text: False
+  sortagrad: True 
+  shuffle_method: batch_shuffle
+  num_workers: 2
+
+
+# network architecture
+model:
+    cmvn_file: "data/mean_std.json"
+    cmvn_file_type: "json"
+    # encoder related
+    encoder: conformer
+    # Hyper-parameters for the encoder named by `encoder` above.
+    encoder_conf:
+        output_size: 256    # dimension of attention
+        attention_heads: 4
+        linear_units: 2048  # the number of units of position-wise feed forward
+        num_blocks: 12      # the number of encoder blocks
+        dropout_rate: 0.1
+        positional_dropout_rate: 0.1
+        attention_dropout_rate: 0.0
+        input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 or conv2d8
+        normalize_before: true
+        use_cnn_module: True
+        cnn_module_kernel: 15
+        activation_type: 'swish'
+        pos_enc_layer_type: 'rel_pos'
+        selfattention_layer_type: 'rel_selfattn'
+
+    # decoder related
+    decoder: transformer
+    decoder_conf:
+        attention_heads: 4
+        linear_units: 2048
+        num_blocks: 6
+        dropout_rate: 0.1
+        positional_dropout_rate: 0.1
+        self_attention_dropout_rate: 0.0
+        src_attention_dropout_rate: 0.0
+
+    # hybrid CTC/attention
+    model_conf:
+        ctc_weight: 0.3
+        lsm_weight: 0.1     # label smoothing option
+        length_normalized_loss: false
+
+
+training:
+  n_epoch: 20
+  accum_grad: 4
+  global_grad_clip: 5.0
+  optim: adam
+  optim_conf:
+    lr: 0.002
+    weight_decay: 1e-06
+  scheduler: warmuplr     # pytorch v1.1.0+ required
+  scheduler_conf:
+    warmup_steps: 25000
+    lr_decay: 1.0
+  log_interval: 1
+
+
+decoding:
+  batch_size: 64
+  error_rate_type: wer
+  decoding_method: attention  # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
+  lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm
+  alpha: 2.5
+  beta: 0.3
+  beam_size: 10
+  cutoff_prob: 1.0
+  cutoff_top_n: 0
+  num_proc_bsearch: 8
+  ctc_weight: 0.0 # ctc weight for attention rescoring decode mode.
+  decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
+      # <0: for decoding, use full chunk.
+      # >0: for decoding, use fixed chunk size as set.
+      # 0: used for training, it's prohibited here. 
+  num_decoding_left_chunks: -1  # number of left chunks for decoding. Defaults to -1.
+  simulate_streaming: False  # simulate streaming inference. Defaults to False.
+
+
--- a/examples/librispeech/s1/conf/transformer.yaml
+++ b/examples/librispeech/s1/conf/transformer.yaml
@ -0,0 +1,106 @@
+# https://yaml.org/type/float.html
+data:
+  train_manifest: data/manifest.tiny
+  dev_manifest: data/manifest.tiny
+  test_manifest: data/manifest.tiny
+  vocab_filepath: data/vocab.txt 
+  unit_type: 'spm'
+  spm_model_prefix: 'data/bpe_unigram_200'
+  mean_std_filepath: ""
+  augmentation_config: conf/augmentation.json
+  batch_size: 4
+  min_input_len: 0.5  # second
+  max_input_len: 20.0 # second
+  min_output_len: 0.0 # tokens
+  max_output_len: 400.0 # tokens
+  min_output_input_ratio: 0.05
+  max_output_input_ratio: 10.0
+  raw_wav: True  # use raw_wav or kaldi feature
+  specgram_type: fbank #linear, mfcc, fbank
+  feat_dim: 80
+  delta_delta: False
+  dither: 1.0
+  target_sample_rate: 16000
+  # NOTE(review): YAML has no `None` literal, so these two values load as the
+  # string "None" rather than null. This presumably relies on the config
+  # loader converting the string — confirm, or write `null` instead.
+  max_freq: None
+  n_fft: None
+  stride_ms: 10.0
+  window_ms: 25.0
+  use_dB_normalization: True
+  target_dB: -20
+  random_seed: 0
+  keep_transcription_text: False
+  sortagrad: True 
+  shuffle_method: batch_shuffle
+  num_workers: 2
+
+
+# network architecture
+model:
+    cmvn_file: "data/mean_std.json"
+    cmvn_file_type: "json"
+    # encoder related
+    encoder: transformer
+    # Hyper-parameters for the encoder named by `encoder` above.
+    encoder_conf:
+        output_size: 256    # dimension of attention
+        attention_heads: 4
+        linear_units: 2048  # the number of units of position-wise feed forward
+        num_blocks: 12      # the number of encoder blocks
+        dropout_rate: 0.1
+        positional_dropout_rate: 0.1
+        attention_dropout_rate: 0.0
+        input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 or conv2d8
+        normalize_before: true
+
+    # decoder related
+    decoder: transformer
+    decoder_conf:
+        attention_heads: 4
+        linear_units: 2048
+        num_blocks: 6
+        dropout_rate: 0.1
+        positional_dropout_rate: 0.1
+        self_attention_dropout_rate: 0.0
+        src_attention_dropout_rate: 0.0
+
+    # hybrid CTC/attention
+    model_conf:
+        ctc_weight: 0.3
+        lsm_weight: 0.1     # label smoothing option
+        length_normalized_loss: false
+
+
+training:
+  n_epoch: 20
+  accum_grad: 1
+  global_grad_clip: 5.0
+  optim: adam
+  optim_conf:
+    lr: 0.002
+    weight_decay: 1e-06
+  scheduler: warmuplr     # pytorch v1.1.0+ required
+  scheduler_conf:
+    warmup_steps: 25000
+    lr_decay: 1.0
+  log_interval: 1
+
+
+decoding:
+  batch_size: 64
+  error_rate_type: wer
+  decoding_method: attention  # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
+  lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm
+  alpha: 2.5
+  beta: 0.3
+  beam_size: 10
+  cutoff_prob: 1.0
+  cutoff_top_n: 0
+  num_proc_bsearch: 8
+  ctc_weight: 0.0 # ctc weight for attention rescoring decode mode.
+  decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
+      # <0: for decoding, use full chunk.
+      # >0: for decoding, use fixed chunk size as set.
+      # 0: used for training, it's prohibited here. 
+  num_decoding_left_chunks: -1  # number of left chunks for decoding. Defaults to -1.
+  simulate_streaming: False  # simulate streaming inference. Defaults to False.
+
+
--- a/examples/librispeech/s1/local/avg.sh
+++ b/examples/librispeech/s1/local/avg.sh
@ -0,0 +1,23 @@
+#! /usr/bin/env bash
+
+# Run ${MAIN_ROOT}/utils/avg_model.py over a checkpoint directory.
+#   $1  ckpt_dir  directory forwarded as --ckpt_dir
+#   $2  avg_num   count forwarded as --num
+# The path forwarded as --dst_model is <ckpt_dir>/avg_<avg_num>.pdparams.
+# Exits 1 when the Python script fails, 0 otherwise.
+
+if [[ $# -ne 2 ]]; then
+    echo "usage: ${0} ckpt_dir avg_num"
+    exit -1
+fi
+
+src_dir=${1}
+n_avg=${2}
+
+if ! python3 -u ${MAIN_ROOT}/utils/avg_model.py \
+    --dst_model ${src_dir}/avg_${n_avg}.pdparams \
+    --ckpt_dir ${src_dir} \
+    --num ${n_avg} \
+    --val_best
+then
+    echo "Failed in avg ckpt!"
+    exit 1
+fi
+
+exit 0
--- a/examples/librispeech/s1/local/data.sh
+++ b/examples/librispeech/s1/local/data.sh
@ -0,0 +1,90 @@
+#! /usr/bin/env bash
+
+# Data preparation for this example. Steps are guarded by stage numbers; a
+# step runs when stage <= N <= stop_stage.
+# NOTE(review): the variables below are presumably overridable from the
+# command line through parse_options.sh — confirm against that script.
+stage=-1
+stop_stage=100
+
+# bpemode (unigram or bpe)
+nbpe=200
+bpemode=unigram
+# prefix handed to the vocabulary and manifest tools as --spm_model_prefix
+bpeprefix="data/bpe_${bpemode}_${nbpe}"
+
+source ${MAIN_ROOT}/utils/parse_options.sh
+
+
+# local output directory and the shared dataset directory
+mkdir -p data
+TARGET_DIR=${MAIN_ROOT}/examples/dataset
+mkdir -p ${TARGET_DIR}
+
+# Stage -1: run the LibriSpeech preparation script, then keep the first 64
+# lines of the dev-clean manifest as the raw "tiny" manifest.
+if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
+    # download data, generate manifests
+    python3 ${TARGET_DIR}/librispeech/librispeech.py \
+    --manifest_prefix="data/manifest" \
+    --target_dir="${TARGET_DIR}/librispeech" \
+    --full_download="False"
+    
+    if [ $? -ne 0 ]; then
+        echo "Prepare LibriSpeech failed. Terminated."
+        exit 1
+    fi
+    
+    head -n 64 data/manifest.dev-clean  > data/manifest.tiny.raw
+fi
+
+# Stage 0: call build_vocab.py on the raw tiny manifest with unit type "spm";
+# the vocabulary path given is data/vocab.txt and the model prefix is
+# ${bpeprefix}.
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    # build vocabulary
+    python3 ${MAIN_ROOT}/utils/build_vocab.py \
+    --unit_type "spm" \
+    --spm_vocab_size=${nbpe} \
+    --spm_mode ${bpemode} \
+    --spm_model_prefix ${bpeprefix} \
+    --vocab_path="data/vocab.txt" \
+    --manifest_paths="data/manifest.tiny.raw"
+    
+    if [ $? -ne 0 ]; then
+        echo "Build vocabulary failed. Terminated."
+        exit 1
+    fi
+fi
+
+
+# Stage 1: call compute_mean_std.py on the raw tiny manifest; the output path
+# given is data/mean_std.json. --num_samples=64 matches the 64 lines kept in
+# the tiny manifest.
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    # compute mean and stddev for normalizer
+    python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
+    --manifest_path="data/manifest.tiny.raw" \
+    --num_samples=64 \
+    --specgram_type="fbank" \
+    --feat_dim=80 \
+    --delta_delta=false \
+    --sample_rate=16000 \
+    --stride_ms=10.0 \
+    --window_ms=25.0 \
+    --num_workers=2 \
+    --output_path="data/mean_std.json"
+    
+    if [ $? -ne 0 ]; then
+        echo "Compute mean and stddev failed. Terminated."
+        exit 1
+    fi
+fi
+
+
+# Stage 2: call format_data.py to turn the raw tiny manifest into
+# data/manifest.tiny, passing the CMVN file, the vocabulary and the spm model
+# prefix produced by the earlier stages.
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+    # format manifest with tokenids, vocab size
+    python3 ${MAIN_ROOT}/utils/format_data.py \
+    --feat_type "raw" \
+    --cmvn_path "data/mean_std.json" \
+    --unit_type "spm" \
+    --spm_model_prefix ${bpeprefix} \
+    --vocab_path="data/vocab.txt" \
+    --manifest_path="data/manifest.tiny.raw" \
+    --output_path="data/manifest.tiny"
+    
+    
+    if [ $? -ne 0 ]; then
+        # message previously read "Formt mnaifest failed."
+        echo "Format manifest failed. Terminated."
+        exit 1
+    fi
+fi
+
+echo "LibriSpeech Data preparation done."
+exit 0
--- a/examples/librispeech/s1/local/download_lm_en.sh
+++ b/examples/librispeech/s1/local/download_lm_en.sh
@ -0,0 +1,20 @@
+#! /usr/bin/env bash
+
+# Fetch the English language model into data/lm.
+# Exits 1 when the download helper reports failure, 0 otherwise.
+
+# NOTE(review): `download` used below is presumably defined in utility.sh —
+# confirm; it is not defined in this script.
+. ${MAIN_ROOT}/utils/utility.sh
+
+DIR=data/lm
+mkdir -p ${DIR}
+
+URL=https://deepspeech.bj.bcebos.com/en_lm/common_crawl_00.prune01111.trie.klm
+MD5="099a601759d467cd0a8523ff939819c5"
+TARGET=${DIR}/common_crawl_00.prune01111.trie.klm
+
+echo "Download language model ..."
+# arguments: source URL, expected MD5, destination path
+download $URL $MD5 $TARGET
+if [ $? -ne 0 ]; then
+    echo "Fail to download the language model!"
+    exit 1
+fi
+
+
+exit 0
--- a/examples/librispeech/s1/local/export.sh
+++ b/examples/librispeech/s1/local/export.sh
@ -0,0 +1,24 @@
+#! /usr/bin/env bash
+
+# Run ${BIN_DIR}/export.py for one checkpoint.
+#   $1  config_path     forwarded as --config
+#   $2  ckpt_prefix     forwarded as --checkpoint_path
+#   $3  jit_model_path  forwarded as --export_path
+# Exits 1 when the Python script fails, 0 otherwise.
+
+if [[ $# -ne 3 ]]; then
+    echo "usage: $0 config_path ckpt_prefix jit_model_path"
+    exit -1
+fi
+
+conf=$1
+ckpt=$2
+jit_out=$3
+
+if ! python3 -u ${BIN_DIR}/export.py \
+    --config ${conf} \
+    --checkpoint_path ${ckpt} \
+    --export_path ${jit_out}
+then
+    echo "Failed in export!"
+    exit 1
+fi
+
+exit 0
--- a/examples/librispeech/s1/local/test.sh
+++ b/examples/librispeech/s1/local/test.sh
@ -0,0 +1,37 @@
+#! /usr/bin/env bash
+
+# Evaluate a checkpoint with ${BIN_DIR}/test.py.
+#   $1  config_path       forwarded as --config
+#   $2  ckpt_path_prefix  forwarded as --checkpoint_path; "<prefix>.rsl" is
+#                         forwarded as --result_file
+# The device is chosen from CUDA_VISIBLE_DEVICES (see below).
+# Exits 1 when the Python script fails, 0 otherwise.
+
+if [ $# != 2 ];then
+    echo "usage: ${0} config_path ckpt_path_prefix"
+    exit -1
+fi
+
+# Number of comma-separated entries in CUDA_VISIBLE_DEVICES; awk prints 0
+# when the variable is empty or unset.
+ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
+echo "using $ngpu gpus..."
+
+# Use the GPU unless none is visible. The earlier test `[ ngpu != 0 ]`
+# compared the literal word "ngpu" with 0, which is always true, so the
+# script always selected the CPU.
+device=gpu
+if [ "${ngpu}" -eq 0 ];then
+    device=cpu
+fi
+config_path=$1
+ckpt_prefix=$2
+
+# download language model
+#bash local/download_lm_en.sh
+#if [ $? -ne 0 ]; then
+#    exit 1
+#fi
+
+python3 -u ${BIN_DIR}/test.py \
+--device ${device} \
+--nproc 1 \
+--config ${config_path} \
+--result_file ${ckpt_prefix}.rsl \
+--checkpoint_path ${ckpt_prefix}
+
+if [ $? -ne 0 ]; then
+    echo "Failed in evaluation!"
+    exit 1
+fi
+
+
+exit 0
--- a/examples/librispeech/s1/local/train.sh
+++ b/examples/librispeech/s1/local/train.sh
@ -0,0 +1,31 @@
+#! /usr/bin/env bash
+
+# Train a model with ${BIN_DIR}/train.py.
+#   $1  config_path  forwarded as --config
+#   $2  ckpt_name    output is placed under exp/<ckpt_name> (--output)
+# The device and --nproc are derived from CUDA_VISIBLE_DEVICES (see below).
+# Exits 1 when the Python script fails, 0 otherwise.
+
+if [ $# != 2 ];then
+    echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name"
+    exit -1
+fi
+
+# Number of comma-separated entries in CUDA_VISIBLE_DEVICES; awk prints 0
+# when the variable is empty or unset.
+ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
+echo "using $ngpu gpus..."
+
+config_path=$1
+ckpt_name=$2
+
+# Use the GPU unless none is visible. The earlier test `[ ngpu != 0 ]`
+# compared the literal word "ngpu" with 0, which is always true, so the
+# script always selected the CPU.
+device=gpu
+if [ "${ngpu}" -eq 0 ];then
+    device=cpu
+fi
+
+mkdir -p exp
+
+python3 -u ${BIN_DIR}/train.py \
+--device ${device} \
+--nproc ${ngpu} \
+--config ${config_path} \
+--output exp/${ckpt_name}
+
+if [ $? -ne 0 ]; then
+    echo "Failed in training!"
+    exit 1
+fi
+
+exit 0
--- a/examples/librispeech/s1/path.sh
+++ b/examples/librispeech/s1/path.sh
@ -0,0 +1,14 @@
+# Environment for this example; run.sh loads it with `source path.sh`.
+# MAIN_ROOT is three directories above the current working directory, so the
+# script has to be sourced from the example directory itself.
+export MAIN_ROOT=${PWD}/../../../
+
+# MAIN_ROOT and the local tools directory are searched before the existing PATH
+export PATH=${MAIN_ROOT}:${PWD}/tools:${PATH}
+export LC_ALL=C
+
+# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
+export PYTHONIOENCODING=UTF-8 
+export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
+
+export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib/
+
+
+# MODEL is not exported; it only selects the directory used for BIN_DIR
+MODEL=u2
+export BIN_DIR=${MAIN_ROOT}/deepspeech/exps/${MODEL}/bin
--- a/examples/librispeech/s1/run.sh
+++ b/examples/librispeech/s1/run.sh
@ -0,0 +1,20 @@
+#!/bin/bash
+# Runs the whole example in order: data preparation, training, evaluation,
+# checkpoint averaging and export. `set -e` stops at the first failing step.
+set -e
+
+source path.sh
+source ${MAIN_ROOT}/utils/parse_options.sh
+
+# prepare data
+bash ./local/data.sh
+
+# train model, all `ckpt` under `exp` dir
+# ("test" is the checkpoint name, so train.sh is given exp/test as output)
+CUDA_VISIBLE_DEVICES=0 ./local/train.sh conf/conformer.yaml test
+
+# test ckpt 1
+CUDA_VISIBLE_DEVICES=0 ./local/test.sh conf/conformer.yaml exp/test/checkpoints/1
+
+# avg 1 best model
+./local/avg.sh exp/test/checkpoints 1
+
+# export ckpt 1
+./local/export.sh conf/conformer.yaml exp/test/checkpoints/1 exp/test/checkpoints/1.jit.model
--- a/examples/spm/.gitignore
+++ b/examples/spm/.gitignore
@ -0,0 +1 @@
+data
--- a/examples/spm/README.md
+++ b/examples/spm/README.md
@ -0,0 +1,5 @@
+# SPM demo
+
+```
+bash run.sh
+```
--- a/examples/spm/path.sh
+++ b/examples/spm/path.sh
@ -0,0 +1,10 @@
+# Environment for the SPM demo; run.sh loads it with `source path.sh`.
+# MAIN_ROOT is two directories above the current working directory, so the
+# script has to be sourced from the demo directory itself.
+export MAIN_ROOT=${PWD}/../../
+
+# MAIN_ROOT and the local tools directory are searched before the existing PATH
+export PATH=${MAIN_ROOT}:${PWD}/tools:${PATH}
+export LC_ALL=C
+
+# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
+export PYTHONIOENCODING=UTF-8
+export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
+
+export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib/
--- a/examples/spm/run.sh
+++ b/examples/spm/run.sh
@ -0,0 +1,40 @@
+#!/usr/bin/env bash
+
+# SPM demo: builds a unit dictionary from the `text` file, then encodes the
+# text into pieces and decodes it back.
+
+set -e
+
+source path.sh
+
+
+# the dictionary step below runs when stage <= 2 <= stop_stage
+stage=0
+stop_stage=100
+# bpemode (unigram or bpe)
+nbpe=100
+bpemode=unigram
+
+
+source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
+
+train_set=train
+# dictionary file and model prefix, both under data/lang_char
+dict=data/lang_char/${train_set}_${bpemode}${nbpe}_units.txt
+bpemodel=data/lang_char/${train_set}_${bpemode}${nbpe}
+
+echo "dictionary: ${dict}"
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+    ### Task dependent. You have to check non-linguistic symbols used in the corpus.
+    echo "stage 2: Dictionary and Json Data Preparation"
+    mkdir -p data/lang_char/
+
+    echo "<blank> 0" > ${dict} # 0 will be used for "blank" in CTC
+    echo "<unk> 1" >> ${dict} # <unk> must be 1
+
+    # The BPE-related code and scripts below are borrowed from ESPnet.
+    # Drop the first space-separated field (the utterance id) of every line.
+    cut -f 2- -d" " text > data/lang_char/input.txt
+    ${MAIN_ROOT}/utils/spm_train --input=data/lang_char/input.txt --vocab_size=${nbpe} --model_type=${bpemode} --model_prefix=${bpemodel} --input_sentence_size=100000000
+    # One piece per line, sorted and de-duplicated, numbered from 2 (NR+1)
+    # because ids 0 and 1 are already taken above.
+    ${MAIN_ROOT}/utils/spm_encode --model=${bpemodel}.model --output_format=piece < data/lang_char/input.txt | tr ' ' '\n' | sort | uniq | awk '{print $0 " " NR+1}' >> ${dict}
+    # The current line count of the dictionary becomes the id of <sos/eos>.
+    num_token=$(cat $dict | wc -l)
+    echo "<sos/eos> $num_token" >> $dict # <eos>
+    wc -l ${dict}
+fi
+
+# These two commands are outside the stage guard, so they always run:
+# encode input.txt into pieces, then decode the pieces back, replacing the
+# "▁" marker with a space.
+${MAIN_ROOT}/utils/spm_encode --model=${bpemodel}.model --output_format=piece < data/lang_char/input.txt > data/lang_char/input.bpe
+${MAIN_ROOT}/utils/spm_decode --model=${bpemodel}.model --input_format=piece < data/lang_char/input.bpe | sed -e "s/▁/ /g" > data/lang_char/input.decode
--- a/examples/spm/text
+++ b/examples/spm/text
@ -0,0 +1,10 @@
+test-1 mister quilter is the apostle of the middle classes and we are glad to welcome his gospel
+test-2 nor is mister quilter's manner less interesting than his matter
+test-3 he tells us that at this festive season of the year with christmas and roast beef looming before us similes drawn from eating and its results occur most readily to the mind
+test-4 he has grave doubts whether sir frederick leighton's work is really greek after all and can discover in it but little of rocky ithaca
+test-5 linnell's pictures are a sort of up guards and at em paintings and mason's exquisite idylls are as national as a jingo poem mister birket foster's landscapes smile at one much in the same way that mister carker used to flash his teeth and mister john collier gives his sitter a cheerful slap on the back before he says like a shampooer in a turkish bath next man
+test-6 it is obviously unnecessary for us to point out how luminous these criticisms are how delicate in expression
+test-7 on the general principles of art mister quilter writes with equal lucidity
+test-8 painting he tells us is of a different quality to mathematics and finish in art is adding more fact
+test-9 as for etchings they are of two kinds british and foreign
+test-10 he laments most bitterly the divorce that has been made between decorative art and what we usually call pictures makes the customary appeal to the last judgment and reminds us that in the great days of art michael angelo was the furnishing upholsterer
--- a/examples/tiny/s0/local/data.sh
+++ b/examples/tiny/s0/local/data.sh
@ -10,7 +10,6 @@ bpeprefix="data/bpe_${bpemode}_${nbpe}"

 source ${MAIN_ROOT}/utils/parse_options.sh

-
 mkdir -p data
 TARGET_DIR=${MAIN_ROOT}/examples/dataset
 mkdir -p ${TARGET_DIR}
--- a/examples/tiny/s1/run.sh
+++ b/examples/tiny/s1/run.sh
@ -2,7 +2,7 @@
 set -e

 source path.sh
-source ${MAIN_ROOT}/utils/parse_options.sh
+source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;

 # prepare data
 bash ./local/data.sh
--- a/examples/tiny/s1/train.profile
+++ b/examples/tiny/s1/train.profile