Merge pull request #1012 from zh794390558/datapipe

[asr] independent dataloader
4 years ago · 6750770e54
parent 8fd976426b 2f4f744071
commit 6750770e54
228 changed files with 2125 additions and 305 deletions
--- a/examples/aishell/README.md
+++ b/examples/aishell/README.md
@ -1,7 +1,9 @@
 # ASR

-* s0 for deepspeech2
-* s1 for u2/transformer/conformer
+* asr0 - deepspeech2 Streaming/Non-Streaming
+* asr1 - transformer/conformer Streaming/Non-Streaming
+* asr2 - transformer/conformer Streaming/Non-Streaming with Kaldi feature
+

 ## Data

--- a/examples/aishell/asr0/.gitignore
+++ b/examples/aishell/asr0/.gitignore
--- a/examples/aishell/asr0/README.md
+++ b/examples/aishell/asr0/README.md
--- a/examples/aishell/asr0/conf/augmentation.json
+++ b/examples/aishell/asr0/conf/augmentation.json
--- a/examples/aishell/asr0/conf/deepspeech2.yaml
+++ b/examples/aishell/asr0/conf/deepspeech2.yaml
--- a/examples/aishell/asr0/conf/deepspeech2_online.yaml
+++ b/examples/aishell/asr0/conf/deepspeech2_online.yaml
--- a/examples/aishell/asr0/local/data.sh
+++ b/examples/aishell/asr0/local/data.sh
@ -32,8 +32,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    --manifest_path="data/manifest.train.raw" \
    --spectrum_type="linear" \
    --delta_delta=false \
-    --stride_ms=10.0 \
-    --window_ms=20.0 \
+    --stride_ms=10 \
+    --window_ms=20 \
    --sample_rate=16000 \
    --use_dB_normalization=True \
    --num_samples=2000 \
@ -66,7 +66,6 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    for dataset in train dev test; do
    {
        python3 ${MAIN_ROOT}/utils/format_data.py \
-                --feat_type "raw" \
                --cmvn_path "data/mean_std.json" \
                --unit_type "char" \
                --vocab_path="data/vocab.txt" \
--- a/examples/aishell/asr0/local/download_lm_ch.sh
+++ b/examples/aishell/asr0/local/download_lm_ch.sh
--- a/examples/aishell/asr0/local/export.sh
+++ b/examples/aishell/asr0/local/export.sh
--- a/examples/aishell/asr0/local/test.sh
+++ b/examples/aishell/asr0/local/test.sh
--- a/examples/aishell/asr0/local/test_export.sh
+++ b/examples/aishell/asr0/local/test_export.sh
--- a/examples/aishell/asr0/local/test_hub.sh
+++ b/examples/aishell/asr0/local/test_hub.sh
--- a/examples/aishell/asr0/local/train.sh
+++ b/examples/aishell/asr0/local/train.sh
--- a/examples/aishell/asr0/path.sh
+++ b/examples/aishell/asr0/path.sh
--- a/examples/aishell/asr0/run.sh
+++ b/examples/aishell/asr0/run.sh
--- a/examples/aishell/asr1/.gitignore
+++ b/examples/aishell/asr1/.gitignore
--- a/examples/aishell/asr1/README.md
+++ b/examples/aishell/asr1/README.md
@ -19,3 +19,13 @@ Need set `decoding.decoding_chunk_size=16` when decoding.
 | conformer | 47.06M | conf/chunk_conformer.yaml | spec_aug + shift | test | ctc_greedy_search | 16, -1 | - | 0.070806 |  
 | conformer | 47.06M | conf/chunk_conformer.yaml | spec_aug + shift | test | ctc_prefix_beam_search | 16, -1 | - | 0.070739 |  
 | conformer | 47.06M | conf/chunk_conformer.yaml | spec_aug + shift | test | attention_rescoring | 16, -1 |  - | 0.059400 |  
+
+
+## Transformer 
+
+| Model | Params | Config | Augmentation| Test set | Decode method | Loss | WER |  
+| --- | --- | --- | --- | --- | --- | --- | --- |  
+| transformer | 31.95M  | conf/transformer.yaml | spec_aug | test | attention | 3.858648955821991 | 0.057293 |  
+| transformer | 31.95M  | conf/transformer.yaml | spec_aug | test | ctc_greedy_search | 3.858648955821991 | 0.061837 |  
+| transformer | 31.95M  | conf/transformer.yaml | spec_aug | test | ctc_prefix_beam_search | 3.858648955821991 | 0.061685 |  
+| transformer | 31.95M  | conf/transformer.yaml | spec_aug | test | attention_rescoring | 3.858648955821991 | 0.053844 |  
--- a/examples/aishell/asr1/conf/augmentation.json
+++ b/examples/aishell/asr1/conf/augmentation.json
--- a/examples/aishell/asr1/conf/chunk_conformer.yaml
+++ b/examples/aishell/asr1/conf/chunk_conformer.yaml
@ -15,7 +15,7 @@ collator:
  vocab_filepath: data/vocab.txt 
  unit_type: 'char'
  spm_model_prefix: ''
-  augmentation_config: conf/augmentation.json
+  augmentation_config: conf/preprocess.yaml
  batch_size: 32
  raw_wav: True  # use raw_wav or kaldi feature
  spectrum_type: fbank #linear, mfcc, fbank
@ -38,7 +38,7 @@ collator:

 # network architecture
 model:
-    cmvn_file: "data/mean_std.json"
+    cmvn_file: 
    cmvn_file_type: "json"
    # encoder related
    encoder: conformer
--- a/examples/aishell/asr1/conf/conformer.yaml
+++ b/examples/aishell/asr1/conf/conformer.yaml
@ -15,7 +15,7 @@ collator:
  vocab_filepath: data/vocab.txt 
  unit_type: 'char'
  spm_model_prefix: ''
-  augmentation_config: conf/augmentation.json
+  augmentation_config: conf/preprocess.yaml
  batch_size: 64
  raw_wav: True  # use raw_wav or kaldi feature
  spectrum_type: fbank #linear, mfcc, fbank
@ -37,7 +37,7 @@ collator:

 # network architecture
 model:
-    cmvn_file: "data/mean_std.json"
+    cmvn_file: 
    cmvn_file_type: "json"
    # encoder related
    encoder: conformer
--- a/examples/aishell/asr1/conf/preprocess.yaml
+++ b/examples/aishell/asr1/conf/preprocess.yaml
@ -0,0 +1,29 @@
+process:
+  # extract kaldi fbank from PCM
+  - type: fbank_kaldi
+    fs: 16000
+    n_mels: 80
+    n_shift: 160
+    win_length: 400
+    dither: true
+  - type: cmvn_json
+    cmvn_path: data/mean_std.json
+  # these three processes are a.k.a. SpecAugment
+  - type: time_warp
+    max_time_warp: 5
+    inplace: true
+    mode: PIL
+  - type: freq_mask
+    F: 30
+    n_mask: 2
+    inplace: true
+    replace_with_zero: false
+  - type: time_mask
+    T: 40
+    n_mask: 2
+    inplace: true
+    replace_with_zero: false
+
+
+
+
--- a/examples/aishell/asr1/conf/transformer.yaml
+++ b/examples/aishell/asr1/conf/transformer.yaml
@ -0,0 +1,112 @@
+# https://yaml.org/type/float.html
+data:
+  train_manifest: data/manifest.train
+  dev_manifest: data/manifest.dev
+  test_manifest: data/manifest.test
+  min_input_len: 0.5
+  max_input_len: 20.0 # second
+  min_output_len: 0.0
+  max_output_len: 400.0
+  min_output_input_ratio: 0.05
+  max_output_input_ratio: 10.0
+
+
+collator:
+  vocab_filepath: data/vocab.txt 
+  unit_type: 'char'
+  spm_model_prefix: ''
+  augmentation_config: conf/preprocess.yaml
+  batch_size: 64
+  raw_wav: True  # use raw_wav or kaldi feature
+  spectrum_type: fbank #linear, mfcc, fbank
+  feat_dim: 80
+  delta_delta: False
+  dither: 1.0
+  target_sample_rate: 16000
+  max_freq: None
+  n_fft: None
+  stride_ms: 10.0
+  window_ms: 25.0
+  use_dB_normalization: True 
+  target_dB: -20
+  random_seed: 0
+  keep_transcription_text: False
+  sortagrad: True 
+  shuffle_method: batch_shuffle
+  num_workers: 2
+
+# network architecture
+model:
+    cmvn_file: 
+    cmvn_file_type: "json"
+    # encoder related
+    encoder: transformer
+    encoder_conf:
+        output_size: 256    # dimension of attention
+        attention_heads: 4
+        linear_units: 2048  # the number of units of position-wise feed forward
+        num_blocks: 12      # the number of encoder blocks
+        dropout_rate: 0.1
+        positional_dropout_rate: 0.1
+        attention_dropout_rate: 0.0
+        input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8
+        normalize_before: true
+
+    # decoder related
+    decoder: transformer
+    decoder_conf:
+        attention_heads: 4
+        linear_units: 2048
+        num_blocks: 6
+        dropout_rate: 0.1
+        positional_dropout_rate: 0.1
+        self_attention_dropout_rate: 0.0
+        src_attention_dropout_rate: 0.0
+
+    # hybrid CTC/attention
+    model_conf:
+        ctc_weight: 0.3
+        ctc_dropoutrate: 0.0
+        ctc_grad_norm_type: null 
+        lsm_weight: 0.1     # label smoothing option
+        length_normalized_loss: false
+
+
+training:
+  n_epoch: 120 
+  accum_grad: 2
+  global_grad_clip: 5.0
+  optim: adam
+  optim_conf:
+    lr: 0.002
+    weight_decay: 1e-6
+  scheduler: warmuplr     # pytorch v1.1.0+ required
+  scheduler_conf:
+    warmup_steps: 25000
+    lr_decay: 1.0
+  log_interval: 100
+  checkpoint:
+    kbest_n: 50
+    latest_n: 5
+
+
+decoding:
+  batch_size: 128
+  error_rate_type: cer 
+  decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
+  lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm
+  alpha: 2.5
+  beta: 0.3
+  beam_size: 10
+  cutoff_prob: 1.0
+  cutoff_top_n: 0
+  num_proc_bsearch: 8
+  ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
+  decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
+      # <0: for decoding, use full chunk.
+      # >0: for decoding, use fixed chunk size as set.
+      # 0: used for training, it's prohibited here. 
+  num_decoding_left_chunks: -1  # number of left chunks for decoding. Defaults to -1.
+  simulate_streaming: False  # simulate streaming inference. Defaults to False.
+
+
--- a/examples/aishell/asr1/local/aishell_train_lms.sh
+++ b/examples/aishell/asr1/local/aishell_train_lms.sh
--- a/examples/aishell/asr1/local/align.sh
+++ b/examples/aishell/asr1/local/align.sh
--- a/examples/aishell/asr1/local/data.sh
+++ b/examples/aishell/asr1/local/data.sh
@ -33,8 +33,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    --spectrum_type="fbank" \
    --feat_dim=80 \
    --delta_delta=false \
-    --stride_ms=10.0 \
-    --window_ms=25.0 \
+    --stride_ms=10 \
+    --window_ms=25 \
    --sample_rate=16000 \
    --use_dB_normalization=False \
    --num_samples=-1 \
@ -67,7 +67,6 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    for dataset in train dev test; do
    {
        python3 ${MAIN_ROOT}/utils/format_data.py \
-            --feat_type "raw" \
            --cmvn_path "data/mean_std.json" \
            --unit_type "char" \
            --vocab_path="data/vocab.txt" \
--- a/examples/aishell/asr1/local/export.sh
+++ b/examples/aishell/asr1/local/export.sh
--- a/examples/aishell/asr1/local/test.sh
+++ b/examples/aishell/asr1/local/test.sh
--- a/examples/aishell/asr1/local/test_hub.sh
+++ b/examples/aishell/asr1/local/test_hub.sh
@ -23,8 +23,6 @@ fi
 #    exit 1
 #fi

-
-
 for type in  attention_rescoring; do
    echo "decoding ${type}"
    batch_size=1
--- a/examples/aishell/asr1/local/tlg.sh
+++ b/examples/aishell/asr1/local/tlg.sh
--- a/examples/aishell/asr1/local/train.sh
+++ b/examples/aishell/asr1/local/train.sh
--- a/examples/aishell/asr1/path.sh
+++ b/examples/aishell/asr1/path.sh
--- a/examples/aishell/asr1/run.sh
+++ b/examples/aishell/asr1/run.sh
--- a/examples/aishell/asr1/utils
+++ b/examples/aishell/asr1/utils
--- a/examples/callcenter/asr1/.gitignore
+++ b/examples/callcenter/asr1/.gitignore
--- a/examples/callcenter/asr1/README.md
+++ b/examples/callcenter/asr1/README.md
--- a/examples/callcenter/asr1/conf/augmentation.json
+++ b/examples/callcenter/asr1/conf/augmentation.json
--- a/examples/callcenter/asr1/conf/chunk_conformer.yaml
+++ b/examples/callcenter/asr1/conf/chunk_conformer.yaml
@ -15,7 +15,7 @@ collator:
  vocab_filepath: data/vocab.txt 
  unit_type: 'char'
  spm_model_prefix: ''
-  augmentation_config: conf/augmentation.json
+  augmentation_config: conf/preprocess.yaml
  batch_size: 32
  raw_wav: True  # use raw_wav or kaldi feature
  spectrum_type: fbank #linear, mfcc, fbank
@ -38,7 +38,7 @@ collator:

 # network architecture
 model:
-    cmvn_file: "data/mean_std.json"
+    cmvn_file: 
    cmvn_file_type: "json"
    # encoder related
    encoder: conformer
--- a/examples/callcenter/asr1/conf/conformer.yaml
+++ b/examples/callcenter/asr1/conf/conformer.yaml
@ -15,7 +15,7 @@ collator:
  vocab_filepath: data/vocab.txt 
  unit_type: 'char'
  spm_model_prefix: ''
-  augmentation_config: conf/augmentation.json
+  augmentation_config: conf/preprocess.yaml
  batch_size: 32
  raw_wav: True  # use raw_wav or kaldi feature
  spectrum_type: fbank #linear, mfcc, fbank
@ -37,7 +37,7 @@ collator:

 # network architecture
 model:
-    cmvn_file: "data/mean_std.json"
+    cmvn_file: 
    cmvn_file_type: "json"
    # encoder related
    encoder: conformer
--- a/examples/callcenter/asr1/conf/preprocess.yaml
+++ b/examples/callcenter/asr1/conf/preprocess.yaml
@ -0,0 +1,29 @@
+process:
+  # extract kaldi fbank from PCM
+  - type: fbank_kaldi
+    fs: 16000
+    n_mels: 80
+    n_shift: 160
+    win_length: 400
+    dither: true
+  - type: cmvn_json
+    cmvn_path: data/mean_std.json
+  # these three processes are a.k.a. SpecAugment
+  - type: time_warp
+    max_time_warp: 5
+    inplace: true
+    mode: PIL
+  - type: freq_mask
+    F: 30
+    n_mask: 2
+    inplace: true
+    replace_with_zero: false
+  - type: time_mask
+    T: 40
+    n_mask: 2
+    inplace: true
+    replace_with_zero: false
+
+
+
+
--- a/examples/callcenter/asr1/local/align.sh
+++ b/examples/callcenter/asr1/local/align.sh
--- a/examples/callcenter/asr1/local/data.sh
+++ b/examples/callcenter/asr1/local/data.sh
@ -21,8 +21,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    --spectrum_type="fbank" \
    --feat_dim=80 \
    --delta_delta=false \
-    --stride_ms=10.0 \
-    --window_ms=25.0 \
+    --stride_ms=10 \
+    --window_ms=25 \
    --sample_rate=8000 \
    --use_dB_normalization=False \
    --num_samples=-1 \
@ -55,7 +55,6 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    for dataset in train dev test; do
    {
        python3 ${MAIN_ROOT}/utils/format_data.py \
-            --feat_type "raw" \
            --cmvn_path "data/mean_std.json" \
            --unit_type "char" \
            --vocab_path="data/vocab.txt" \
--- a/examples/callcenter/asr1/local/download_lm_ch.sh
+++ b/examples/callcenter/asr1/local/download_lm_ch.sh
--- a/examples/callcenter/asr1/local/export.sh
+++ b/examples/callcenter/asr1/local/export.sh
--- a/examples/callcenter/asr1/local/test.sh
+++ b/examples/callcenter/asr1/local/test.sh
--- a/examples/callcenter/asr1/local/train.sh
+++ b/examples/callcenter/asr1/local/train.sh
--- a/examples/callcenter/asr1/path.sh
+++ b/examples/callcenter/asr1/path.sh
--- a/examples/callcenter/asr1/run.sh
+++ b/examples/callcenter/asr1/run.sh
--- a/examples/dataset/aidatatang_200zh/aidatatang_200zh.py
+++ b/examples/dataset/aidatatang_200zh/aidatatang_200zh.py
@ -22,6 +22,7 @@ import argparse
 import codecs
 import json
 import os
+from pathlib import Path

 import soundfile

@ -79,6 +80,7 @@ def create_manifest(data_dir, manifest_path_prefix):

                audio_path = os.path.abspath(os.path.join(subfolder, fname))
                audio_id = os.path.basename(fname)[:-4]
+                utt2spk = Path(audio_path).parent.name

                audio_data, samplerate = soundfile.read(audio_path)
                duration = float(len(audio_data) / samplerate)
@ -87,6 +89,7 @@ def create_manifest(data_dir, manifest_path_prefix):
                    json.dumps(
                        {
                            'utt': audio_id,
+                            'utt2spk': str(utt2spk),
                            'feat': audio_path,
                            'feat_shape': (duration, ),  # second
                            'text': text,
--- a/examples/dataset/aishell/aishell.py
+++ b/examples/dataset/aishell/aishell.py
@ -22,6 +22,7 @@ import argparse
 import codecs
 import json
 import os
+from pathlib import Path

 import soundfile

@ -81,6 +82,8 @@ def create_manifest(data_dir, manifest_path_prefix):
                # if no transcription for audio then skipped
                if audio_id not in transcript_dict:
                    continue
+               
+                utt2spk = Path(audio_path).parent.name
                audio_data, samplerate = soundfile.read(audio_path)
                duration = float(len(audio_data) / samplerate)
                text = transcript_dict[audio_id]
@ -88,6 +91,7 @@ def create_manifest(data_dir, manifest_path_prefix):
                    json.dumps(
                        {
                            'utt': audio_id,
+                            'utt2spk': str(utt2spk),
                            'feat': audio_path,
                            'feat_shape': (duration, ),  # second
                            'text': text
--- a/examples/dataset/librispeech/librispeech.py
+++ b/examples/dataset/librispeech/librispeech.py
@ -78,7 +78,7 @@ def create_manifest(data_dir, manifest_path):
    print("Creating manifest %s ..." % manifest_path)
    json_lines = []
    total_sec = 0.0
-    total_text = 0.0
+    total_char = 0.0
    total_num = 0

    for subfolder, _, filelist in sorted(os.walk(data_dir)):
@ -89,25 +89,28 @@ def create_manifest(data_dir, manifest_path):
            text_filepath = os.path.join(subfolder, text_filelist[0])
            for line in io.open(text_filepath, encoding="utf8"):
                segments = line.strip().split()
+                nchars = len(segments[1:])
                text = ' '.join(segments[1:]).lower()

                audio_filepath = os.path.abspath(
                    os.path.join(subfolder, segments[0] + '.flac'))
                audio_data, samplerate = soundfile.read(audio_filepath)
                duration = float(len(audio_data)) / samplerate
+
+                utt = os.path.splitext(os.path.basename(audio_filepath))[0]
+                utt2spk = '-'.join(utt.split('-')[:2])
+
                json_lines.append(
                    json.dumps({
-                        'utt':
-                        os.path.splitext(os.path.basename(audio_filepath))[0],
-                        'feat':
-                        audio_filepath,
-                        'feat_shape': (duration, ),  #second
-                        'text':
-                        text
+                        'utt': utt,
+                        'utt2spk': utt2spk,
+                        'feat': audio_filepath,
+                        'feat_shape': (duration, ),  # second
+                        'text': text,
                    }))

                total_sec += duration
-                total_text += len(text)
+                total_char += nchars
                total_num += 1

    with codecs.open(manifest_path, 'w', 'utf-8') as out_file:
@ -122,8 +125,8 @@ def create_manifest(data_dir, manifest_path):
        print(f"{subset}:", file=f)
        print(f"{total_num} utts", file=f)
        print(f"{total_sec / (60*60)} h", file=f)
-        print(f"{total_text} text", file=f)
-        print(f"{total_text / total_sec} text/sec", file=f)
+        print(f"{total_char} char", file=f)
+        print(f"{total_char / total_sec} char/sec", file=f)
        print(f"{total_sec / total_num} sec/utt", file=f)


--- a/examples/dataset/mini_librispeech/mini_librispeech.py
+++ b/examples/dataset/mini_librispeech/mini_librispeech.py
@ -74,15 +74,16 @@ def create_manifest(data_dir, manifest_path):
                audio_filepath = os.path.join(subfolder, segments[0] + '.flac')
                audio_data, samplerate = soundfile.read(audio_filepath)
                duration = float(len(audio_data)) / samplerate
+
+                utt = os.path.splitext(os.path.basename(audio_filepath))[0]
+                utt2spk = '-'.join(utt.split('-')[:2])
                json_lines.append(
                    json.dumps({
-                        'utt':
-                        os.path.splitext(os.path.basename(audio_filepath))[0],
-                        'feat':
-                        audio_filepath,
+                        'utt': utt,
+                        'utt2spk': utt2spk,
+                        'feat': audio_filepath,
                        'feat_shape': (duration, ),  #second
-                        'text':
-                        text
+                        'text': text,
                    }))

                total_sec += duration
--- a/examples/dataset/ted_en_zh/ted_en_zh.py
+++ b/examples/dataset/ted_en_zh/ted_en_zh.py
@ -72,14 +72,17 @@ def create_manifest(data_dir, manifest_path_prefix):
                    continue
                audio_data, samplerate = soundfile.read(audio_path)
                duration = float(len(audio_data) / samplerate)
+
+
+                translation_str = " ".join(translation.split())
+                trancription_str = " ".join(trancription.split())
                json_lines.append(
                    json.dumps(
                        {
                            'utt': utt,
                            'feat': audio_path,
                            'feat_shape': (duration, ),  # second
-                            'text': " ".join(translation.split()),
-                            'text1': " ".join(trancription.split())
+                            'text': [translation_str, trancription_str],  
                        },
                        ensure_ascii=False))

--- a/examples/dataset/thchs30/thchs30.py
+++ b/examples/dataset/thchs30/thchs30.py
@ -113,6 +113,8 @@ def create_manifest(data_dir, manifest_path_prefix):
                assert os.path.exists(audio_path) and os.path.exists(text_path)

                audio_id = os.path.basename(audio_path)[:-4]
+                spk = audio_id.split('_')[0]
+
                word_text, syllable_text, phone_text = read_trn(text_path)
                audio_data, samplerate = soundfile.read(audio_path)
                duration = float(len(audio_data) / samplerate)
@ -122,6 +124,7 @@ def create_manifest(data_dir, manifest_path_prefix):
                    json.dumps(
                        {
                            'utt': audio_id,
+                            'utt2spk': spk,
                            'feat': audio_path,
                            'feat_shape': (duration, ),  # second
                            'text': word_text,  # charactor
--- a/examples/dataset/timit/timit.py
+++ b/examples/dataset/timit/timit.py
@ -180,12 +180,12 @@ def create_manifest(data_dir, manifest_path_prefix):
                json.dumps(
                    {
                        'utt': utt_id,
+                        'utt2spk': spk,
+                        'utt2gender': gender,
                        'feat': str(audio_path),
                        'feat_shape': (duration, ),  # second
                        'text': word_text,  # word
                        'phone': phone_text,
-                        'spk': spk,
-                        'gender': gender,
                    },
                    ensure_ascii=False))

--- a/examples/dataset/timit/timit_kaldi_standard_split.py
+++ b/examples/dataset/timit/timit_kaldi_standard_split.py
@ -24,6 +24,7 @@ import json
 import os

 import soundfile
+from pathlib import Path

 parser = argparse.ArgumentParser(description=__doc__)
 parser.add_argument(
@ -67,10 +68,17 @@ def create_manifest(data_dir, manifest_path_prefix):
            audio_data, samplerate = soundfile.read(audio_path)
            duration = float(len(audio_data) / samplerate)
            text = phn_dict[audio_id]
+
+            gender_spk = str(Path(audio_path).parent.stem)
+            spk = gender_spk[1:]
+            gender = gender_spk[0]
+            utt_id = '_'.join([spk, gender, audio_id])
            json_lines.append(
                json.dumps(
                    {
                        'utt': audio_id,
+                        'utt2spk': spk,
+                        'utt2gender': gender,
                        'feat': audio_path,
                        'feat_shape': (duration, ),  # second
                        'text': text
--- a/examples/dataset/voxforge/voxforge.py
+++ b/examples/dataset/voxforge/voxforge.py
@ -175,9 +175,12 @@ def generate_manifest(data_dir, manifest_path):

            audio_data, samplerate = soundfile.read(u)
            duration = float(len(audio_data)) / samplerate
+
+            utt = os.path.splitext(os.path.basename(u))[0]
            json_lines.append(
                json.dumps({
-                    'utt': os.path.splitext(os.path.basename(u))[0],
+                    'utt': utt,
+                    'utt2spk': speaker,
                    'feat': u,
                    'feat_shape': (duration, ),  #second
                    'text': trans.lower()
--- a/examples/librispeech/README.md
+++ b/examples/librispeech/README.md
@ -1,8 +1,9 @@
 # ASR

-* s0 is for deepspeech2 offline
-* s1 is for transformer/conformer/U2
-* s2 is for transformer/conformer/U2 w/ kaldi feat, need install Kaldi
+* asr0 - deepspeech2 Streaming/Non-Streaming
+* asr1 - transformer/conformer Streaming/Non-Streaming
+* asr2 - transformer/conformer Streaming/Non-Streaming with Kaldi feature
+

 ## Data
 | Data Subset | Duration in Seconds |
--- a/examples/librispeech/asr0/README.md
+++ b/examples/librispeech/asr0/README.md
--- a/examples/librispeech/asr0/conf/augmentation.json
+++ b/examples/librispeech/asr0/conf/augmentation.json
--- a/examples/librispeech/asr0/conf/deepspeech2.yaml
+++ b/examples/librispeech/asr0/conf/deepspeech2.yaml
--- a/examples/librispeech/asr0/conf/deepspeech2_online.yaml
+++ b/examples/librispeech/asr0/conf/deepspeech2_online.yaml
--- a/examples/librispeech/asr0/local/data.sh
+++ b/examples/librispeech/asr0/local/data.sh
@ -50,8 +50,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    --spectrum_type="linear" \
    --delta_delta=false \
    --sample_rate=16000 \
-    --stride_ms=10.0 \
-    --window_ms=20.0 \
+    --stride_ms=10 \
+    --window_ms=20 \
    --use_dB_normalization=True \
    --num_workers=${num_workers} \
    --output_path="data/mean_std.json"
@ -81,7 +81,6 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    for set in train dev test dev-clean dev-other test-clean test-other; do
    {
        python3 ${MAIN_ROOT}/utils/format_data.py \
-        --feat_type "raw" \
        --cmvn_path "data/mean_std.json" \
        --unit_type ${unit_type} \
        --vocab_path="data/vocab.txt" \
--- a/examples/librispeech/asr0/local/download_lm_en.sh
+++ b/examples/librispeech/asr0/local/download_lm_en.sh
--- a/examples/librispeech/asr0/local/export.sh
+++ b/examples/librispeech/asr0/local/export.sh
--- a/examples/librispeech/asr0/local/test.sh
+++ b/examples/librispeech/asr0/local/test.sh
--- a/examples/librispeech/asr0/local/test_hub.sh
+++ b/examples/librispeech/asr0/local/test_hub.sh
--- a/examples/librispeech/asr0/local/train.sh
+++ b/examples/librispeech/asr0/local/train.sh
--- a/examples/librispeech/asr0/path.sh
+++ b/examples/librispeech/asr0/path.sh
--- a/examples/librispeech/asr0/run.sh
+++ b/examples/librispeech/asr0/run.sh
--- a/examples/librispeech/asr1/.gitignore
+++ b/examples/librispeech/asr1/.gitignore
--- a/examples/librispeech/asr1/README.md
+++ b/examples/librispeech/asr1/README.md
@ -21,7 +21,7 @@
 ## Transformer
 | Model | Params | Config | Augmentation| Test set | Decode method | Loss | WER |  
 | --- | --- | --- | --- | --- | --- | --- | --- |
-| transformer | 32.52 M | conf/transformer.yaml | spec_aug + shift | test-clean | attention | 7.404532432556152 | 0.056204 |  
-| transformer | 32.52 M | conf/transformer.yaml | spec_aug + shift | test-clean | ctc_greedy_search | 7.404532432556152 | 0.058658 |  
-| transformer | 32.52 M | conf/transformer.yaml | spec_aug + shift | test-clean | ctc_prefix_beam_search | 7.404532432556152 | 0.058278 |  
-| transformer | 32.52 M | conf/transformer.yaml | spec_aug + shift | test-clean | attention_rescoring | 7.404532432556152 | 0.045591 |  
+| transformer | 32.52 M | conf/transformer.yaml | spec_aug  | test-clean | attention | 6.805267604192098 | 0.049795 |  
+| transformer | 32.52 M | conf/transformer.yaml | spec_aug  | test-clean | ctc_greedy_search | 6.805267604192098 | 0.054892 |  
+| transformer | 32.52 M | conf/transformer.yaml | spec_aug  | test-clean | ctc_prefix_beam_search | 6.805267604192098 | 0.054531 |  
+| transformer | 32.52 M | conf/transformer.yaml | spec_aug  | test-clean | attention_rescoring | 6.805267604192098 | 0.042244 |  
--- a/examples/librispeech/asr1/cmd.sh
+++ b/examples/librispeech/asr1/cmd.sh
--- a/examples/librispeech/asr1/conf/augmentation.json
+++ b/examples/librispeech/asr1/conf/augmentation.json
--- a/examples/librispeech/asr1/conf/chunk_conformer.yaml
+++ b/examples/librispeech/asr1/conf/chunk_conformer.yaml
@ -15,7 +15,7 @@ collator:
  unit_type: 'spm'
  spm_model_prefix: 'data/bpe_unigram_5000'
  mean_std_filepath: ""
-  augmentation_config: conf/augmentation.json
+  augmentation_config: conf/preprocess.yaml
  batch_size: 16
  raw_wav: True  # use raw_wav or kaldi feature
  spectrum_type: fbank #linear, mfcc, fbank
@ -38,7 +38,7 @@ collator:

 # network architecture
 model:
-    cmvn_file: "data/mean_std.json"
+    cmvn_file: 
    cmvn_file_type: "json"
    # encoder related
    encoder: conformer
--- a/examples/librispeech/asr1/conf/chunk_transformer.yaml
+++ b/examples/librispeech/asr1/conf/chunk_transformer.yaml
@ -15,7 +15,7 @@ collator:
  unit_type: 'spm'
  spm_model_prefix: 'data/bpe_unigram_5000'
  mean_std_filepath: ""
-  augmentation_config: conf/augmentation.json
+  augmentation_config: conf/preprocess.yaml
  batch_size: 64
  raw_wav: True  # use raw_wav or kaldi feature
  spectrum_type: fbank #linear, mfcc, fbank
@ -38,7 +38,7 @@ collator:

 # network architecture
 model:
-    cmvn_file: "data/mean_std.json"
+    cmvn_file: 
    cmvn_file_type: "json"
    # encoder related
    encoder: transformer
--- a/examples/librispeech/asr1/conf/conformer.yaml
+++ b/examples/librispeech/asr1/conf/conformer.yaml
@ -15,7 +15,7 @@ collator:
  unit_type: 'spm'
  spm_model_prefix: 'data/bpe_unigram_5000'
  mean_std_filepath: ""
-  augmentation_config: conf/augmentation.json
+  augmentation_config: conf/preprocess.yaml
  batch_size: 16
  raw_wav: True  # use raw_wav or kaldi feature
  spectrum_type: fbank #linear, mfcc, fbank
@ -38,7 +38,7 @@ collator:

 # network architecture
 model:
-    cmvn_file: "data/mean_std.json"
+    cmvn_file: 
    cmvn_file_type: "json"
    # encoder related
    encoder: conformer
--- a/examples/librispeech/asr1/conf/preprocess.yaml
+++ b/examples/librispeech/asr1/conf/preprocess.yaml
@ -0,0 +1,25 @@
+process:
+  # extract kaldi fbank from PCM
+  - type: fbank_kaldi
+    fs: 16000
+    n_mels: 80
+    n_shift: 160
+    win_length: 400
+    dither: true
+  - type: cmvn_json
+    cmvn_path: data/mean_std.json
+  # these three processes are a.k.a. SpecAugment
+  - type: time_warp
+    max_time_warp: 5
+    inplace: true
+    mode: PIL
+  - type: freq_mask
+    F: 30
+    n_mask: 2
+    inplace: true
+    replace_with_zero: false
+  - type: time_mask
+    T: 40
+    n_mask: 2
+    inplace: true
+    replace_with_zero: false
--- a/examples/librispeech/asr1/conf/transformer.yaml
+++ b/examples/librispeech/asr1/conf/transformer.yaml
@ -15,7 +15,7 @@ collator:
  unit_type: 'spm'
  spm_model_prefix: 'data/bpe_unigram_5000'
  mean_std_filepath: ""
-  augmentation_config: conf/augmentation.json
+  augmentation_config: conf/preprocess.yaml
  batch_size: 32
  raw_wav: True  # use raw_wav or kaldi feature
  spectrum_type: fbank #linear, mfcc, fbank
@ -38,7 +38,7 @@ collator:

 # network architecture
 model:
-    cmvn_file: "data/mean_std.json"
+    cmvn_file: 
    cmvn_file_type: "json"
    # encoder related
    encoder: transformer
--- a/examples/librispeech/asr1/local/align.sh
+++ b/examples/librispeech/asr1/local/align.sh
--- a/examples/librispeech/asr1/local/data.sh
+++ b/examples/librispeech/asr1/local/data.sh
@ -8,6 +8,11 @@ nbpe=5000
 bpemode=unigram
 bpeprefix="data/bpe_${bpemode}_${nbpe}"

+stride_ms=10
+window_ms=25
+sample_rate=16000
+feat_dim=80
+
 source ${MAIN_ROOT}/utils/parse_options.sh


@ -27,21 +32,21 @@ if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
        exit 1
    fi

-    for set in train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other; do
-        mv data/manifest.${set} data/manifest.${set}.raw
+    for sub in train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other; do
+        mv data/manifest.${sub} data/manifest.${sub}.raw
    done

    rm -rf data/manifest.train.raw data/manifest.dev.raw  data/manifest.test.raw
-    for set in train-clean-100 train-clean-360 train-other-500; do
-        cat data/manifest.${set}.raw >> data/manifest.train.raw
+    for sub in train-clean-100 train-clean-360 train-other-500; do
+        cat data/manifest.${sub}.raw >> data/manifest.train.raw
    done

-    for set in dev-clean dev-other; do
-        cat data/manifest.${set}.raw >> data/manifest.dev.raw
+    for sub in dev-clean dev-other; do
+        cat data/manifest.${sub}.raw >> data/manifest.dev.raw
    done

-    for set in test-clean test-other; do
-        cat data/manifest.${set}.raw >> data/manifest.test.raw
+    for sub in test-clean test-other; do
+        cat data/manifest.${sub}.raw >> data/manifest.test.raw
    done
 fi

@ -52,11 +57,11 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    --manifest_path="data/manifest.train.raw" \
    --num_samples=-1 \
    --spectrum_type="fbank" \
-    --feat_dim=80 \
+    --feat_dim=${feat_dim} \
    --delta_delta=false \
-    --sample_rate=16000 \
-    --stride_ms=10.0 \
-    --window_ms=25.0 \
+    --sample_rate=${sample_rate} \
+    --stride_ms=${stride_ms} \
+    --window_ms=${window_ms} \
    --use_dB_normalization=False \
    --num_workers=${num_workers} \
    --output_path="data/mean_std.json"
@ -85,16 +90,15 @@ fi

 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    # format manifest with tokenids, vocab size
-    for set in train dev test dev-clean dev-other test-clean test-other; do
+    for sub in train dev test dev-clean dev-other test-clean test-other; do
    {
        python3 ${MAIN_ROOT}/utils/format_data.py \
-        --feat_type "raw" \
        --cmvn_path "data/mean_std.json" \
        --unit_type "spm" \
        --spm_model_prefix ${bpeprefix} \
        --vocab_path="data/vocab.txt" \
-        --manifest_path="data/manifest.${set}.raw" \
-        --output_path="data/manifest.${set}"
+        --manifest_path="data/manifest.${sub}.raw" \
+        --output_path="data/manifest.${sub}"

        if [ $? -ne 0 ]; then
            echo "Formt mnaifest failed. Terminated."
@ -103,6 +107,16 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    }&
    done
    wait
+
+    for sub in train dev; do
+        mv data/manifest.${sub} data/manifest.${sub}.fmt
+    done
+fi
+
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+    for sub in train dev; do
+        remove_longshortdata.py --maxframes 3000 --maxchars 400 --stride_ms ${stride_ms} data/manifest.${sub}.fmt data/manifest.${sub}
+    done
 fi

 echo "LibriSpeech Data preparation done."
--- a/examples/librispeech/asr1/local/download_lm_en.sh
+++ b/examples/librispeech/asr1/local/download_lm_en.sh
--- a/examples/librispeech/asr1/local/export.sh
+++ b/examples/librispeech/asr1/local/export.sh
--- a/examples/librispeech/asr1/local/test.sh
+++ b/examples/librispeech/asr1/local/test.sh
--- a/examples/librispeech/asr1/local/test_hub.sh
+++ b/examples/librispeech/asr1/local/test_hub.sh
--- a/examples/librispeech/asr1/local/train.sh
+++ b/examples/librispeech/asr1/local/train.sh
--- a/examples/librispeech/asr1/path.sh
+++ b/examples/librispeech/asr1/path.sh
--- a/examples/librispeech/asr1/run.sh
+++ b/examples/librispeech/asr1/run.sh
--- a/examples/librispeech/asr1/utils
+++ b/examples/librispeech/asr1/utils
--- a/examples/librispeech/asr2/.gitignore
+++ b/examples/librispeech/asr2/.gitignore
--- a/examples/librispeech/asr2/README.md
+++ b/examples/librispeech/asr2/README.md
--- a/examples/librispeech/asr2/cmd.sh
+++ b/examples/librispeech/asr2/cmd.sh
--- a/examples/librispeech/asr2/conf/augmentation.json
+++ b/examples/librispeech/asr2/conf/augmentation.json
--- a/examples/librispeech/asr2/conf/decode/decode.yaml
+++ b/examples/librispeech/asr2/conf/decode/decode.yaml
--- a/examples/librispeech/asr2/conf/decode/decode_att.yaml
+++ b/examples/librispeech/asr2/conf/decode/decode_att.yaml
--- a/examples/librispeech/asr2/conf/decode/decode_ctc.yaml
+++ b/examples/librispeech/asr2/conf/decode/decode_ctc.yaml
--- a/examples/librispeech/asr2/conf/decode/decode_wo_lm.yaml
+++ b/examples/librispeech/asr2/conf/decode/decode_wo_lm.yaml
--- a/examples/librispeech/asr2/conf/fbank.conf
+++ b/examples/librispeech/asr2/conf/fbank.conf
--- a/examples/librispeech/asr2/conf/lm/transformer.yaml
+++ b/examples/librispeech/asr2/conf/lm/transformer.yaml
--- a/examples/librispeech/asr2/conf/pitch.conf
+++ b/examples/librispeech/asr2/conf/pitch.conf
--- a/examples/librispeech/asr2/conf/transformer.yaml
+++ b/examples/librispeech/asr2/conf/transformer.yaml
--- a/Show More
+++ b/Show More