Merge pull request #1012 from zh794390558/datapipe

[asr] independent dataloader
4 years ago · 6750770e54
parent 8fd976426b 2f4f744071
commit 6750770e54
228 changed files with 2125 additions and 305 deletions
--- a/examples/aishell/README.md
+++ b/examples/aishell/README.md
@ -1,7 +1,9 @@
 # ASR
-* s0 for deepspeech2
+* asr0 - deepspeech2 Streaming/Non-Streaming
-* s1 for u2/transformer/conformer
+* asr1 - transformer/conformer Streaming/Non-Streaming
 * asr2 - transformer/conformer Streaming/Non-Streaming with Kaldi feature
 ## Data
--- a/examples/aishell/asr0/.gitignore
+++ b/examples/aishell/asr0/.gitignore
--- a/examples/aishell/asr0/README.md
+++ b/examples/aishell/asr0/README.md
--- a/examples/aishell/asr0/conf/augmentation.json
+++ b/examples/aishell/asr0/conf/augmentation.json
--- a/examples/aishell/asr0/conf/deepspeech2.yaml
+++ b/examples/aishell/asr0/conf/deepspeech2.yaml
--- a/examples/aishell/asr0/conf/deepspeech2_online.yaml
+++ b/examples/aishell/asr0/conf/deepspeech2_online.yaml
--- a/examples/aishell/asr0/local/data.sh
+++ b/examples/aishell/asr0/local/data.sh
@ -32,8 +32,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    --manifest_path="data/manifest.train.raw" \
    --spectrum_type="linear" \
    --delta_delta=false \
-    --stride_ms=10.0 \
+    --stride_ms=10 \
-    --window_ms=20.0 \
+    --window_ms=20 \
    --sample_rate=16000 \
    --use_dB_normalization=True \
    --num_samples=2000 \
@ -66,7 +66,6 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    for dataset in train dev test; do
    {
        python3 ${MAIN_ROOT}/utils/format_data.py \
                --feat_type "raw" \
                --cmvn_path "data/mean_std.json" \
                --unit_type "char" \
                --vocab_path="data/vocab.txt" \
--- a/examples/aishell/asr0/local/download_lm_ch.sh
+++ b/examples/aishell/asr0/local/download_lm_ch.sh
--- a/examples/aishell/asr0/local/export.sh
+++ b/examples/aishell/asr0/local/export.sh
--- a/examples/aishell/asr0/local/test.sh
+++ b/examples/aishell/asr0/local/test.sh
--- a/examples/aishell/asr0/local/test_export.sh
+++ b/examples/aishell/asr0/local/test_export.sh
--- a/examples/aishell/asr0/local/test_hub.sh
+++ b/examples/aishell/asr0/local/test_hub.sh
--- a/examples/aishell/asr0/local/train.sh
+++ b/examples/aishell/asr0/local/train.sh
--- a/examples/aishell/asr0/path.sh
+++ b/examples/aishell/asr0/path.sh
--- a/examples/aishell/asr0/run.sh
+++ b/examples/aishell/asr0/run.sh
--- a/examples/aishell/asr1/.gitignore
+++ b/examples/aishell/asr1/.gitignore
--- a/examples/aishell/asr1/README.md
+++ b/examples/aishell/asr1/README.md
@ -19,3 +19,13 @@ Need set `decoding.decoding_chunk_size=16` when decoding.
 | conformer | 47.06M | conf/chunk_conformer.yaml | spec_aug + shift | test | ctc_greedy_search | 16, -1 | - | 0.070806 |  
 | conformer | 47.06M | conf/chunk_conformer.yaml | spec_aug + shift | test | ctc_prefix_beam_search | 16, -1 | - | 0.070739 |  
 | conformer | 47.06M | conf/chunk_conformer.yaml | spec_aug + shift | test | attention_rescoring | 16, -1 |  - | 0.059400 |  
 ## Transformer 
 | Model | Params | Config | Augmentation| Test set | Decode method | Loss | WER |  
 | --- | --- | --- | --- | --- | --- | --- | --- |  
 | transformer | 31.95M  | conf/transformer.yaml | spec_aug | test | attention | 3.858648955821991 | 0.057293 |  
 | transformer | 31.95M  | conf/transformer.yaml | spec_aug | test | ctc_greedy_search | 3.858648955821991 | 0.061837 |  
 | transformer | 31.95M  | conf/transformer.yaml | spec_aug | test | ctc_prefix_beam_search | 3.858648955821991 | 0.061685 |  
 | transformer | 31.95M  | conf/transformer.yaml | spec_aug | test | attention_rescoring | 3.858648955821991 | 0.053844 |  
--- a/examples/aishell/asr1/conf/augmentation.json
+++ b/examples/aishell/asr1/conf/augmentation.json
--- a/examples/aishell/asr1/conf/chunk_conformer.yaml
+++ b/examples/aishell/asr1/conf/chunk_conformer.yaml
@ -15,7 +15,7 @@ collator:
  vocab_filepath: data/vocab.txt 
  unit_type: 'char'
  spm_model_prefix: ''
-  augmentation_config: conf/augmentation.json
+  augmentation_config: conf/preprocess.yaml
  batch_size: 32
  raw_wav: True  # use raw_wav or kaldi feature
  spectrum_type: fbank #linear, mfcc, fbank
@ -38,7 +38,7 @@ collator:
 # network architecture
 model:
-    cmvn_file: "data/mean_std.json"
+    cmvn_file: 
    cmvn_file_type: "json"
    # encoder related
    encoder: conformer
--- a/examples/aishell/asr1/conf/conformer.yaml
+++ b/examples/aishell/asr1/conf/conformer.yaml
@ -15,7 +15,7 @@ collator:
  vocab_filepath: data/vocab.txt 
  unit_type: 'char'
  spm_model_prefix: ''
-  augmentation_config: conf/augmentation.json
+  augmentation_config: conf/preprocess.yaml
  batch_size: 64
  raw_wav: True  # use raw_wav or kaldi feature
  spectrum_type: fbank #linear, mfcc, fbank
@ -37,7 +37,7 @@ collator:
 # network architecture
 model:
-    cmvn_file: "data/mean_std.json"
+    cmvn_file: 
    cmvn_file_type: "json"
    # encoder related
    encoder: conformer
--- a/examples/aishell/asr1/conf/preprocess.yaml
+++ b/examples/aishell/asr1/conf/preprocess.yaml
@ -0,0 +1,29 @@
 process:
  # extract kaldi fbank from PCM
  - type: fbank_kaldi
    fs: 16000
    n_mels: 80
    n_shift: 160
    win_length: 400
    dither: true
  - type: cmvn_json
    cmvn_path: data/mean_std.json
  # these three processes are a.k.a. SpecAugment
  - type: time_warp
    max_time_warp: 5
    inplace: true
    mode: PIL
  - type: freq_mask
    F: 30
    n_mask: 2
    inplace: true
    replace_with_zero: false
  - type: time_mask
    T: 40
    n_mask: 2
    inplace: true
    replace_with_zero: false
--- a/examples/aishell/asr1/conf/transformer.yaml
+++ b/examples/aishell/asr1/conf/transformer.yaml
@ -0,0 +1,112 @@
 # https://yaml.org/type/float.html
 data:
  train_manifest: data/manifest.train
  dev_manifest: data/manifest.dev
  test_manifest: data/manifest.test
  min_input_len: 0.5
  max_input_len: 20.0 # second
  min_output_len: 0.0
  max_output_len: 400.0
  min_output_input_ratio: 0.05
  max_output_input_ratio: 10.0
 collator:
  vocab_filepath: data/vocab.txt 
  unit_type: 'char'
  spm_model_prefix: ''
  augmentation_config: conf/preprocess.yaml
  batch_size: 64
  raw_wav: True  # use raw_wav or kaldi feature
  spectrum_type: fbank #linear, mfcc, fbank
  feat_dim: 80
  delta_delta: False
  dither: 1.0
  target_sample_rate: 16000
  max_freq: None
  n_fft: None
  stride_ms: 10.0
  window_ms: 25.0
  use_dB_normalization: True 
  target_dB: -20
  random_seed: 0
  keep_transcription_text: False
  sortagrad: True 
  shuffle_method: batch_shuffle
  num_workers: 2
 # network architecture
 model:
    cmvn_file: 
    cmvn_file_type: "json"
    # encoder related
    encoder: transformer
    encoder_conf:
        output_size: 256    # dimension of attention
        attention_heads: 4
        linear_units: 2048  # the number of units of position-wise feed forward
        num_blocks: 12      # the number of encoder blocks
        dropout_rate: 0.1
        positional_dropout_rate: 0.1
        attention_dropout_rate: 0.0
        input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 or conv2d8
        normalize_before: true
    # decoder related
    decoder: transformer
    decoder_conf:
        attention_heads: 4
        linear_units: 2048
        num_blocks: 6
        dropout_rate: 0.1
        positional_dropout_rate: 0.1
        self_attention_dropout_rate: 0.0
        src_attention_dropout_rate: 0.0
    # hybrid CTC/attention
    model_conf:
        ctc_weight: 0.3
        ctc_dropoutrate: 0.0
        ctc_grad_norm_type: null 
        lsm_weight: 0.1     # label smoothing option
        length_normalized_loss: false
 training:
  n_epoch: 120 
  accum_grad: 2
  global_grad_clip: 5.0
  optim: adam
  optim_conf:
    lr: 0.002
    weight_decay: 1e-6
  scheduler: warmuplr     # pytorch v1.1.0+ required
  scheduler_conf:
    warmup_steps: 25000
    lr_decay: 1.0
  log_interval: 100
  checkpoint:
    kbest_n: 50
    latest_n: 5
 decoding:
  batch_size: 128
  error_rate_type: cer 
  decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
  lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm
  alpha: 2.5
  beta: 0.3
  beam_size: 10
  cutoff_prob: 1.0
  cutoff_top_n: 0
  num_proc_bsearch: 8
  ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
  decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
      # <0: for decoding, use full chunk.
      # >0: for decoding, use fixed chunk size as set.
      # 0: used for training, it's prohibited here. 
  num_decoding_left_chunks: -1  # number of left chunks for decoding. Defaults to -1.
  simulate_streaming: False  # simulate streaming inference. Defaults to False.
--- a/examples/aishell/asr1/local/aishell_train_lms.sh
+++ b/examples/aishell/asr1/local/aishell_train_lms.sh
--- a/examples/aishell/asr1/local/align.sh
+++ b/examples/aishell/asr1/local/align.sh
--- a/examples/aishell/asr1/local/data.sh
+++ b/examples/aishell/asr1/local/data.sh
@ -33,8 +33,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    --spectrum_type="fbank" \
    --feat_dim=80 \
    --delta_delta=false \
-    --stride_ms=10.0 \
+    --stride_ms=10 \
-    --window_ms=25.0 \
+    --window_ms=25 \
    --sample_rate=16000 \
    --use_dB_normalization=False \
    --num_samples=-1 \
@ -67,7 +67,6 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    for dataset in train dev test; do
    {
        python3 ${MAIN_ROOT}/utils/format_data.py \
            --feat_type "raw" \
            --cmvn_path "data/mean_std.json" \
            --unit_type "char" \
            --vocab_path="data/vocab.txt" \
--- a/examples/aishell/asr1/local/export.sh
+++ b/examples/aishell/asr1/local/export.sh
--- a/examples/aishell/asr1/local/test.sh
+++ b/examples/aishell/asr1/local/test.sh
--- a/examples/aishell/asr1/local/test_hub.sh
+++ b/examples/aishell/asr1/local/test_hub.sh
@ -23,8 +23,6 @@ fi
 #    exit 1
 #fi
 for type in  attention_rescoring; do
    echo "decoding ${type}"
    batch_size=1
--- a/examples/aishell/asr1/local/tlg.sh
+++ b/examples/aishell/asr1/local/tlg.sh
--- a/examples/aishell/asr1/local/train.sh
+++ b/examples/aishell/asr1/local/train.sh
--- a/examples/aishell/asr1/path.sh
+++ b/examples/aishell/asr1/path.sh
--- a/examples/aishell/asr1/run.sh
+++ b/examples/aishell/asr1/run.sh
--- a/examples/aishell/asr1/utils
+++ b/examples/aishell/asr1/utils
--- a/examples/callcenter/asr1/.gitignore
+++ b/examples/callcenter/asr1/.gitignore
--- a/examples/callcenter/asr1/README.md
+++ b/examples/callcenter/asr1/README.md
--- a/examples/callcenter/asr1/conf/augmentation.json
+++ b/examples/callcenter/asr1/conf/augmentation.json
--- a/examples/callcenter/asr1/conf/chunk_conformer.yaml
+++ b/examples/callcenter/asr1/conf/chunk_conformer.yaml
@ -15,7 +15,7 @@ collator:
  vocab_filepath: data/vocab.txt 
  unit_type: 'char'
  spm_model_prefix: ''
-  augmentation_config: conf/augmentation.json
+  augmentation_config: conf/preprocess.yaml
  batch_size: 32
  raw_wav: True  # use raw_wav or kaldi feature
  spectrum_type: fbank #linear, mfcc, fbank
@ -38,7 +38,7 @@ collator:
 # network architecture
 model:
-    cmvn_file: "data/mean_std.json"
+    cmvn_file: 
    cmvn_file_type: "json"
    # encoder related
    encoder: conformer
--- a/examples/callcenter/asr1/conf/conformer.yaml
+++ b/examples/callcenter/asr1/conf/conformer.yaml
@ -15,7 +15,7 @@ collator:
  vocab_filepath: data/vocab.txt 
  unit_type: 'char'
  spm_model_prefix: ''
-  augmentation_config: conf/augmentation.json
+  augmentation_config: conf/preprocess.yaml
  batch_size: 32
  raw_wav: True  # use raw_wav or kaldi feature
  spectrum_type: fbank #linear, mfcc, fbank
@ -37,7 +37,7 @@ collator:
 # network architecture
 model:
-    cmvn_file: "data/mean_std.json"
+    cmvn_file: 
    cmvn_file_type: "json"
    # encoder related
    encoder: conformer
--- a/examples/callcenter/asr1/conf/preprocess.yaml
+++ b/examples/callcenter/asr1/conf/preprocess.yaml
@ -0,0 +1,29 @@
 process:
  # extract kaldi fbank from PCM
  - type: fbank_kaldi
    fs: 16000
    n_mels: 80
    n_shift: 160
    win_length: 400
    dither: true
  - type: cmvn_json
    cmvn_path: data/mean_std.json
  # these three processes are a.k.a. SpecAugment
  - type: time_warp
    max_time_warp: 5
    inplace: true
    mode: PIL
  - type: freq_mask
    F: 30
    n_mask: 2
    inplace: true
    replace_with_zero: false
  - type: time_mask
    T: 40
    n_mask: 2
    inplace: true
    replace_with_zero: false
--- a/examples/callcenter/asr1/local/align.sh
+++ b/examples/callcenter/asr1/local/align.sh
--- a/examples/callcenter/asr1/local/data.sh
+++ b/examples/callcenter/asr1/local/data.sh
@ -21,8 +21,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    --spectrum_type="fbank" \
    --feat_dim=80 \
    --delta_delta=false \
-    --stride_ms=10.0 \
+    --stride_ms=10 \
-    --window_ms=25.0 \
+    --window_ms=25 \
    --sample_rate=8000 \
    --use_dB_normalization=False \
    --num_samples=-1 \
@ -55,7 +55,6 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    for dataset in train dev test; do
    {
        python3 ${MAIN_ROOT}/utils/format_data.py \
            --feat_type "raw" \
            --cmvn_path "data/mean_std.json" \
            --unit_type "char" \
            --vocab_path="data/vocab.txt" \
--- a/examples/callcenter/asr1/local/download_lm_ch.sh
+++ b/examples/callcenter/asr1/local/download_lm_ch.sh
--- a/examples/callcenter/asr1/local/export.sh
+++ b/examples/callcenter/asr1/local/export.sh
--- a/examples/callcenter/asr1/local/test.sh
+++ b/examples/callcenter/asr1/local/test.sh
--- a/examples/callcenter/asr1/local/train.sh
+++ b/examples/callcenter/asr1/local/train.sh
--- a/examples/callcenter/asr1/path.sh
+++ b/examples/callcenter/asr1/path.sh
--- a/examples/callcenter/asr1/run.sh
+++ b/examples/callcenter/asr1/run.sh
--- a/examples/dataset/aidatatang_200zh/aidatatang_200zh.py
+++ b/examples/dataset/aidatatang_200zh/aidatatang_200zh.py
@ -22,6 +22,7 @@ import argparse
 import codecs
 import json
 import os
 from pathlib import Path
 import soundfile
@ -79,6 +80,7 @@ def create_manifest(data_dir, manifest_path_prefix):
                audio_path = os.path.abspath(os.path.join(subfolder, fname))
                audio_id = os.path.basename(fname)[:-4]
                utt2spk = Path(audio_path).parent.name
                audio_data, samplerate = soundfile.read(audio_path)
                duration = float(len(audio_data) / samplerate)
@ -87,6 +89,7 @@ def create_manifest(data_dir, manifest_path_prefix):
                    json.dumps(
                        {
                            'utt': audio_id,
                            'utt2spk': str(utt2spk),
                            'feat': audio_path,
                            'feat_shape': (duration, ),  # second
                            'text': text,
--- a/examples/dataset/aishell/aishell.py
+++ b/examples/dataset/aishell/aishell.py
@ -22,6 +22,7 @@ import argparse
 import codecs
 import json
 import os
 from pathlib import Path
 import soundfile
@ -81,6 +82,8 @@ def create_manifest(data_dir, manifest_path_prefix):
                # if no transcription for audio then skipped
                if audio_id not in transcript_dict:
                    continue
                utt2spk = Path(audio_path).parent.name
                audio_data, samplerate = soundfile.read(audio_path)
                duration = float(len(audio_data) / samplerate)
                text = transcript_dict[audio_id]
@ -88,6 +91,7 @@ def create_manifest(data_dir, manifest_path_prefix):
                    json.dumps(
                        {
                            'utt': audio_id,
                            'utt2spk': str(utt2spk),
                            'feat': audio_path,
                            'feat_shape': (duration, ),  # second
                            'text': text
--- a/examples/dataset/librispeech/librispeech.py
+++ b/examples/dataset/librispeech/librispeech.py
@ -78,7 +78,7 @@ def create_manifest(data_dir, manifest_path):
    print("Creating manifest %s ..." % manifest_path)
    json_lines = []
    total_sec = 0.0
-    total_text = 0.0
+    total_char = 0.0
    total_num = 0
    for subfolder, _, filelist in sorted(os.walk(data_dir)):
@ -89,25 +89,28 @@ def create_manifest(data_dir, manifest_path):
            text_filepath = os.path.join(subfolder, text_filelist[0])
            for line in io.open(text_filepath, encoding="utf8"):
                segments = line.strip().split()
                nchars = len(segments[1:])
                text = ' '.join(segments[1:]).lower()
                audio_filepath = os.path.abspath(
                    os.path.join(subfolder, segments[0] + '.flac'))
                audio_data, samplerate = soundfile.read(audio_filepath)
                duration = float(len(audio_data)) / samplerate
                utt = os.path.splitext(os.path.basename(audio_filepath))[0]
                utt2spk = '-'.join(utt.split('-')[:2])
                json_lines.append(
                    json.dumps({
-                        'utt':
+                        'utt': utt,
-                        os.path.splitext(os.path.basename(audio_filepath))[0],
+                        'utt2spk': utt2spk,
-                        'feat':
+                        'feat': audio_filepath,
-                        audio_filepath,
+                        'feat_shape': (duration, ),  # second
-                        'feat_shape': (duration, ),  #second
+                        'text': text,
                        'text':
                        text
                    }))
                total_sec += duration
-                total_text += len(text)
+                total_char += nchars
                total_num += 1
    with codecs.open(manifest_path, 'w', 'utf-8') as out_file:
@ -122,8 +125,8 @@ def create_manifest(data_dir, manifest_path):
        print(f"{subset}:", file=f)
        print(f"{total_num} utts", file=f)
        print(f"{total_sec / (60*60)} h", file=f)
-        print(f"{total_text} text", file=f)
+        print(f"{total_char} char", file=f)
-        print(f"{total_text / total_sec} text/sec", file=f)
+        print(f"{total_char / total_sec} char/sec", file=f)
        print(f"{total_sec / total_num} sec/utt", file=f)
--- a/examples/dataset/mini_librispeech/mini_librispeech.py
+++ b/examples/dataset/mini_librispeech/mini_librispeech.py
@ -74,15 +74,16 @@ def create_manifest(data_dir, manifest_path):
                audio_filepath = os.path.join(subfolder, segments[0] + '.flac')
                audio_data, samplerate = soundfile.read(audio_filepath)
                duration = float(len(audio_data)) / samplerate
                utt = os.path.splitext(os.path.basename(audio_filepath))[0]
                utt2spk = '-'.join(utt.split('-')[:2])
                json_lines.append(
                    json.dumps({
-                        'utt':
+                        'utt': utt,
-                        os.path.splitext(os.path.basename(audio_filepath))[0],
+                        'utt2spk': utt2spk,
-                        'feat':
+                        'feat': audio_filepath,
                        audio_filepath,
                        'feat_shape': (duration, ),  #second
-                        'text':
+                        'text': text,
                        text
                    }))
                total_sec += duration
--- a/examples/dataset/ted_en_zh/ted_en_zh.py
+++ b/examples/dataset/ted_en_zh/ted_en_zh.py
@ -72,14 +72,17 @@ def create_manifest(data_dir, manifest_path_prefix):
                    continue
                audio_data, samplerate = soundfile.read(audio_path)
                duration = float(len(audio_data) / samplerate)
                translation_str = " ".join(translation.split())
                trancription_str = " ".join(trancription.split())
                json_lines.append(
                    json.dumps(
                        {
                            'utt': utt,
                            'feat': audio_path,
                            'feat_shape': (duration, ),  # second
-                            'text': " ".join(translation.split()),
+                            'text': [translation_str, trancription_str],  
                            'text1': " ".join(trancription.split())
                        },
                        ensure_ascii=False))
--- a/examples/dataset/thchs30/thchs30.py
+++ b/examples/dataset/thchs30/thchs30.py
@ -113,6 +113,8 @@ def create_manifest(data_dir, manifest_path_prefix):
                assert os.path.exists(audio_path) and os.path.exists(text_path)
                audio_id = os.path.basename(audio_path)[:-4]
                spk = audio_id.split('_')[0]
                word_text, syllable_text, phone_text = read_trn(text_path)
                audio_data, samplerate = soundfile.read(audio_path)
                duration = float(len(audio_data) / samplerate)
@ -122,6 +124,7 @@ def create_manifest(data_dir, manifest_path_prefix):
                    json.dumps(
                        {
                            'utt': audio_id,
                            'utt2spk', spk,
                            'feat': audio_path,
                            'feat_shape': (duration, ),  # second
                            'text': word_text,  # charactor
--- a/examples/dataset/timit/timit.py
+++ b/examples/dataset/timit/timit.py
@ -180,12 +180,12 @@ def create_manifest(data_dir, manifest_path_prefix):
                json.dumps(
                    {
                        'utt': utt_id,
                        'utt2spk': spk,
                        'utt2gender': gender,
                        'feat': str(audio_path),
                        'feat_shape': (duration, ),  # second
                        'text': word_text,  # word
                        'phone': phone_text,
                        'spk': spk,
                        'gender': gender,
                    },
                    ensure_ascii=False))
--- a/examples/dataset/timit/timit_kaldi_standard_split.py
+++ b/examples/dataset/timit/timit_kaldi_standard_split.py
@ -24,6 +24,7 @@ import json
 import os
 import soundfile
 from pathlib import Path
 parser = argparse.ArgumentParser(description=__doc__)
 parser.add_argument(
@ -67,10 +68,17 @@ def create_manifest(data_dir, manifest_path_prefix):
            audio_data, samplerate = soundfile.read(audio_path)
            duration = float(len(audio_data) / samplerate)
            text = phn_dict[audio_id]
            gender_spk = str(Path(audio_path).parent.stem)
            spk = gender_spk[1:]
            gender = gender_spk[0]
            utt_id = '_'.join([spk, gender, audio_id])
            json_lines.append(
                json.dumps(
                    {
                        'utt': audio_id,
                        'utt2spk': spk,
                        'utt2gender': gender,
                        'feat': audio_path,
                        'feat_shape': (duration, ),  # second
                        'text': text
--- a/examples/dataset/voxforge/voxforge.py
+++ b/examples/dataset/voxforge/voxforge.py
@ -175,9 +175,12 @@ def generate_manifest(data_dir, manifest_path):
            audio_data, samplerate = soundfile.read(u)
            duration = float(len(audio_data)) / samplerate
            utt = os.path.splitext(os.path.basename(u))[0]
            json_lines.append(
                json.dumps({
-                    'utt': os.path.splitext(os.path.basename(u))[0],
+                    'utt': utt,
                    'utt2spk': speaker,
                    'feat': u,
                    'feat_shape': (duration, ),  #second
                    'text': trans.lower()
--- a/examples/librispeech/README.md
+++ b/examples/librispeech/README.md
@ -1,8 +1,9 @@
 # ASR
-* s0 is for deepspeech2 offline
+* asr0 - deepspeech2 Streaming/Non-Streaming
-* s1 is for transformer/conformer/U2
+* asr1 - transformer/conformer Streaming/Non-Streaming
-* s2 is for transformer/conformer/U2 w/ kaldi feat, need install Kaldi
+* asr2 - transformer/conformer Streaming/Non-Streaming with Kaldi feature
 ## Data
 | Data Subset | Duration in Seconds |
--- a/examples/librispeech/asr0/README.md
+++ b/examples/librispeech/asr0/README.md
--- a/examples/librispeech/asr0/conf/augmentation.json
+++ b/examples/librispeech/asr0/conf/augmentation.json
--- a/examples/librispeech/asr0/conf/deepspeech2.yaml
+++ b/examples/librispeech/asr0/conf/deepspeech2.yaml
--- a/examples/librispeech/asr0/conf/deepspeech2_online.yaml
+++ b/examples/librispeech/asr0/conf/deepspeech2_online.yaml
--- a/examples/librispeech/asr0/local/data.sh
+++ b/examples/librispeech/asr0/local/data.sh
@ -50,8 +50,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    --spectrum_type="linear" \
    --delta_delta=false \
    --sample_rate=16000 \
-    --stride_ms=10.0 \
+    --stride_ms=10 \
-    --window_ms=20.0 \
+    --window_ms=20 \
    --use_dB_normalization=True \
    --num_workers=${num_workers} \
    --output_path="data/mean_std.json"
@ -81,7 +81,6 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    for set in train dev test dev-clean dev-other test-clean test-other; do
    {
        python3 ${MAIN_ROOT}/utils/format_data.py \
        --feat_type "raw" \
        --cmvn_path "data/mean_std.json" \
        --unit_type ${unit_type} \
        --vocab_path="data/vocab.txt" \
--- a/examples/librispeech/asr0/local/download_lm_en.sh
+++ b/examples/librispeech/asr0/local/download_lm_en.sh
--- a/examples/librispeech/asr0/local/export.sh
+++ b/examples/librispeech/asr0/local/export.sh
--- a/examples/librispeech/asr0/local/test.sh
+++ b/examples/librispeech/asr0/local/test.sh
--- a/examples/librispeech/asr0/local/test_hub.sh
+++ b/examples/librispeech/asr0/local/test_hub.sh
--- a/examples/librispeech/asr0/local/train.sh
+++ b/examples/librispeech/asr0/local/train.sh
--- a/examples/librispeech/asr0/path.sh
+++ b/examples/librispeech/asr0/path.sh
--- a/examples/librispeech/asr0/run.sh
+++ b/examples/librispeech/asr0/run.sh
--- a/examples/librispeech/asr1/.gitignore
+++ b/examples/librispeech/asr1/.gitignore
--- a/examples/librispeech/asr1/README.md
+++ b/examples/librispeech/asr1/README.md
@ -21,7 +21,7 @@
 ## Transformer
 | Model | Params | Config | Augmentation| Test set | Decode method | Loss | WER |  
 | --- | --- | --- | --- | --- | --- | --- | --- |
-| transformer | 32.52 M | conf/transformer.yaml | spec_aug + shift | test-clean | attention | 7.404532432556152 | 0.056204 |  
+| transformer | 32.52 M | conf/transformer.yaml | spec_aug  | test-clean | attention | 6.805267604192098 | 0.049795 |  
-| transformer | 32.52 M | conf/transformer.yaml | spec_aug + shift | test-clean | ctc_greedy_search | 7.404532432556152 | 0.058658 |  
+| transformer | 32.52 M | conf/transformer.yaml | spec_aug  | test-clean | ctc_greedy_search | 6.805267604192098 | 0.054892 |  
-| transformer | 32.52 M | conf/transformer.yaml | spec_aug + shift | test-clean | ctc_prefix_beam_search | 7.404532432556152 | 0.058278 |  
+| transformer | 32.52 M | conf/transformer.yaml | spec_aug  | test-clean | ctc_prefix_beam_search | 6.805267604192098 | 0.054531 |  
-| transformer | 32.52 M | conf/transformer.yaml | spec_aug + shift | test-clean | attention_rescoring | 7.404532432556152 | 0.045591 |  
+| transformer | 32.52 M | conf/transformer.yaml | spec_aug  | test-clean | attention_rescoring | 6.805267604192098 | 0.042244 |  
--- a/examples/librispeech/asr1/cmd.sh
+++ b/examples/librispeech/asr1/cmd.sh
--- a/examples/librispeech/asr1/conf/augmentation.json
+++ b/examples/librispeech/asr1/conf/augmentation.json
--- a/examples/librispeech/asr1/conf/chunk_conformer.yaml
+++ b/examples/librispeech/asr1/conf/chunk_conformer.yaml
@ -15,7 +15,7 @@ collator:
  unit_type: 'spm'
  spm_model_prefix: 'data/bpe_unigram_5000'
  mean_std_filepath: ""
-  augmentation_config: conf/augmentation.json
+  augmentation_config: conf/preprocess.yaml
  batch_size: 16
  raw_wav: True  # use raw_wav or kaldi feature
  spectrum_type: fbank #linear, mfcc, fbank
@ -38,7 +38,7 @@ collator:
 # network architecture
 model:
-    cmvn_file: "data/mean_std.json"
+    cmvn_file: 
    cmvn_file_type: "json"
    # encoder related
    encoder: conformer
--- a/examples/librispeech/asr1/conf/chunk_transformer.yaml
+++ b/examples/librispeech/asr1/conf/chunk_transformer.yaml
@ -15,7 +15,7 @@ collator:
  unit_type: 'spm'
  spm_model_prefix: 'data/bpe_unigram_5000'
  mean_std_filepath: ""
-  augmentation_config: conf/augmentation.json
+  augmentation_config: conf/preprocess.yaml
  batch_size: 64
  raw_wav: True  # use raw_wav or kaldi feature
  spectrum_type: fbank #linear, mfcc, fbank
@ -38,7 +38,7 @@ collator:
 # network architecture
 model:
-    cmvn_file: "data/mean_std.json"
+    cmvn_file: 
    cmvn_file_type: "json"
    # encoder related
    encoder: transformer
--- a/examples/librispeech/asr1/conf/conformer.yaml
+++ b/examples/librispeech/asr1/conf/conformer.yaml
@ -15,7 +15,7 @@ collator:
  unit_type: 'spm'
  spm_model_prefix: 'data/bpe_unigram_5000'
  mean_std_filepath: ""
-  augmentation_config: conf/augmentation.json
+  augmentation_config: conf/preprocess.yaml
  batch_size: 16
  raw_wav: True  # use raw_wav or kaldi feature
  spectrum_type: fbank #linear, mfcc, fbank
@ -38,7 +38,7 @@ collator:
 # network architecture
 model:
-    cmvn_file: "data/mean_std.json"
+    cmvn_file: 
    cmvn_file_type: "json"
    # encoder related
    encoder: conformer
--- a/examples/librispeech/asr1/conf/preprocess.yaml
+++ b/examples/librispeech/asr1/conf/preprocess.yaml
@ -0,0 +1,25 @@
 process:
  # extract kaldi fbank from PCM
  - type: fbank_kaldi
    fs: 16000
    n_mels: 80
    n_shift: 160
    win_length: 400
    dither: true
  - type: cmvn_json
    cmvn_path: data/mean_std.json
  # these three processes are a.k.a. SpecAugment
  - type: time_warp
    max_time_warp: 5
    inplace: true
    mode: PIL
  - type: freq_mask
    F: 30
    n_mask: 2
    inplace: true
    replace_with_zero: false
  - type: time_mask
    T: 40
    n_mask: 2
    inplace: true
    replace_with_zero: false
--- a/examples/librispeech/asr1/conf/transformer.yaml
+++ b/examples/librispeech/asr1/conf/transformer.yaml
@ -15,7 +15,7 @@ collator:
  unit_type: 'spm'
  spm_model_prefix: 'data/bpe_unigram_5000'
  mean_std_filepath: ""
-  augmentation_config: conf/augmentation.json
+  augmentation_config: conf/preprocess.yaml
  batch_size: 32
  raw_wav: True  # use raw_wav or kaldi feature
  spectrum_type: fbank #linear, mfcc, fbank
@ -38,7 +38,7 @@ collator:
 # network architecture
 model:
-    cmvn_file: "data/mean_std.json"
+    cmvn_file: 
    cmvn_file_type: "json"
    # encoder related
    encoder: transformer
--- a/examples/librispeech/asr1/local/align.sh
+++ b/examples/librispeech/asr1/local/align.sh
--- a/examples/librispeech/asr1/local/data.sh
+++ b/examples/librispeech/asr1/local/data.sh
@ -8,6 +8,11 @@ nbpe=5000
 bpemode=unigram
 bpeprefix="data/bpe_${bpemode}_${nbpe}"
 stride_ms=10
 window_ms=25
 sample_rate=16000
 feat_dim=80
 source ${MAIN_ROOT}/utils/parse_options.sh
@ -27,21 +32,21 @@ if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
        exit 1
    fi
-    for set in train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other; do
+    for sub in train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other; do
-        mv data/manifest.${set} data/manifest.${set}.raw
+        mv data/manifest.${sub} data/manifest.${sub}.raw
    done
    rm -rf data/manifest.train.raw data/manifest.dev.raw  data/manifest.test.raw
-    for set in train-clean-100 train-clean-360 train-other-500; do
+    for sub in train-clean-100 train-clean-360 train-other-500; do
-        cat data/manifest.${set}.raw >> data/manifest.train.raw
+        cat data/manifest.${sub}.raw >> data/manifest.train.raw
    done
-    for set in dev-clean dev-other; do
+    for sub in dev-clean dev-other; do
-        cat data/manifest.${set}.raw >> data/manifest.dev.raw
+        cat data/manifest.${sub}.raw >> data/manifest.dev.raw
    done
-    for set in test-clean test-other; do
+    for sub in test-clean test-other; do
-        cat data/manifest.${set}.raw >> data/manifest.test.raw
+        cat data/manifest.${sub}.raw >> data/manifest.test.raw
    done
 fi
@ -52,11 +57,11 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    --manifest_path="data/manifest.train.raw" \
    --num_samples=-1 \
    --spectrum_type="fbank" \
-    --feat_dim=80 \
+    --feat_dim=${feat_dim} \
    --delta_delta=false \
-    --sample_rate=16000 \
+    --sample_rate=${sample_rate} \
-    --stride_ms=10.0 \
+    --stride_ms=${stride_ms} \
-    --window_ms=25.0 \
+    --window_ms=${window_ms} \
    --use_dB_normalization=False \
    --num_workers=${num_workers} \
    --output_path="data/mean_std.json"
@ -85,16 +90,15 @@ fi
 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    # format manifest with tokenids, vocab size
-    for set in train dev test dev-clean dev-other test-clean test-other; do
+    for sub in train dev test dev-clean dev-other test-clean test-other; do
    {
        python3 ${MAIN_ROOT}/utils/format_data.py \
        --feat_type "raw" \
        --cmvn_path "data/mean_std.json" \
        --unit_type "spm" \
        --spm_model_prefix ${bpeprefix} \
        --vocab_path="data/vocab.txt" \
-        --manifest_path="data/manifest.${set}.raw" \
+        --manifest_path="data/manifest.${sub}.raw" \
-        --output_path="data/manifest.${set}"
+        --output_path="data/manifest.${sub}"
        if [ $? -ne 0 ]; then
            echo "Formt mnaifest failed. Terminated."
@ -103,6 +107,16 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    }&
    done
    wait
    for sub in train dev; do
        mv data/manifest.${sub} data/manifest.${sub}.fmt
    done
 fi
 if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    for sub in train dev; do
        remove_longshortdata.py --maxframes 3000 --maxchars 400 --stride_ms ${stride_ms} data/manifest.${sub}.fmt data/manifest.${sub}
    done
 fi
 echo "LibriSpeech Data preparation done."
--- a/examples/librispeech/asr1/local/download_lm_en.sh
+++ b/examples/librispeech/asr1/local/download_lm_en.sh
--- a/examples/librispeech/asr1/local/export.sh
+++ b/examples/librispeech/asr1/local/export.sh
--- a/examples/librispeech/asr1/local/test.sh
+++ b/examples/librispeech/asr1/local/test.sh
--- a/examples/librispeech/asr1/local/test_hub.sh
+++ b/examples/librispeech/asr1/local/test_hub.sh
--- a/examples/librispeech/asr1/local/train.sh
+++ b/examples/librispeech/asr1/local/train.sh
--- a/examples/librispeech/asr1/path.sh
+++ b/examples/librispeech/asr1/path.sh
--- a/examples/librispeech/asr1/run.sh
+++ b/examples/librispeech/asr1/run.sh
--- a/examples/librispeech/asr1/utils
+++ b/examples/librispeech/asr1/utils
--- a/examples/librispeech/asr2/.gitignore
+++ b/examples/librispeech/asr2/.gitignore
--- a/examples/librispeech/asr2/README.md
+++ b/examples/librispeech/asr2/README.md
--- a/examples/librispeech/asr2/cmd.sh
+++ b/examples/librispeech/asr2/cmd.sh
--- a/examples/librispeech/asr2/conf/augmentation.json
+++ b/examples/librispeech/asr2/conf/augmentation.json
--- a/examples/librispeech/asr2/conf/decode/decode.yaml
+++ b/examples/librispeech/asr2/conf/decode/decode.yaml
--- a/examples/librispeech/asr2/conf/decode/decode_att.yaml
+++ b/examples/librispeech/asr2/conf/decode/decode_att.yaml
--- a/examples/librispeech/asr2/conf/decode/decode_ctc.yaml
+++ b/examples/librispeech/asr2/conf/decode/decode_ctc.yaml
--- a/examples/librispeech/asr2/conf/decode/decode_wo_lm.yaml
+++ b/examples/librispeech/asr2/conf/decode/decode_wo_lm.yaml
--- a/examples/librispeech/asr2/conf/fbank.conf
+++ b/examples/librispeech/asr2/conf/fbank.conf
--- a/examples/librispeech/asr2/conf/lm/transformer.yaml
+++ b/examples/librispeech/asr2/conf/lm/transformer.yaml
--- a/examples/librispeech/asr2/conf/pitch.conf
+++ b/examples/librispeech/asr2/conf/pitch.conf
--- a/examples/librispeech/asr2/conf/transformer.yaml
+++ b/examples/librispeech/asr2/conf/transformer.yaml
--- a/Show More
+++ b/Show More