diff --git a/README.md b/README.md index 2f9d9928..ec2d0f30 100644 --- a/README.md +++ b/README.md @@ -128,9 +128,9 @@ For **Text-To-Speech**, try pretrained FastSpeech2 + Parallel WaveGAN on CSMSC: ```shell cd examples/csmsc/tts3 # download the pretrained models and unaip them -wget https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip +wget https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_ckpt_0.4.zip unzip pwg_baker_ckpt_0.4.zip -wget https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_baker_ckpt_0.4.zip +wget https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_ckpt_0.4.zip unzip fastspeech2_nosil_baker_ckpt_0.4.zip # source the environment source path.sh diff --git a/demos/metaverse/run.sh b/demos/metaverse/run.sh index ea7f683c..ba7d7980 100755 --- a/demos/metaverse/run.sh +++ b/demos/metaverse/run.sh @@ -25,9 +25,9 @@ fi if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then # download pretrained tts models and unzip - wget -P download https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip + wget -P download https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_ckpt_0.4.zip unzip -d download download/pwg_baker_ckpt_0.4.zip - wget -P download https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_baker_ckpt_0.4.zip + wget -P download https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_ckpt_0.4.zip unzip -d download download/fastspeech2_nosil_baker_ckpt_0.4.zip fi diff --git a/demos/story_talker/run.sh b/demos/story_talker/run.sh index 069ec12e..44259cd3 100755 --- a/demos/story_talker/run.sh +++ b/demos/story_talker/run.sh @@ -19,9 +19,9 @@ fi if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then # download pretrained tts models and unzip - wget -P download https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip + wget -P download https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_ckpt_0.4.zip unzip -d download download/pwg_baker_ckpt_0.4.zip - wget -P download https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_baker_ckpt_0.4.zip + wget -P download https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_ckpt_0.4.zip unzip -d download download/fastspeech2_nosil_baker_ckpt_0.4.zip fi diff --git a/demos/style_fs2/run.sh b/demos/style_fs2/run.sh index f035dd1b..6f6d6068 100755 --- a/demos/style_fs2/run.sh +++ b/demos/style_fs2/run.sh @@ -14,9 +14,9 @@ mkdir -p download if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then # download pretrained tts models and unzip - wget -P download https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip + wget -P download https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_ckpt_0.4.zip unzip -d download download/pwg_baker_ckpt_0.4.zip - wget -P download https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_baker_ckpt_0.4.zip + wget -P download https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_ckpt_0.4.zip unzip -d download download/fastspeech2_nosil_baker_ckpt_0.4.zip fi diff --git a/docs/source/released_model.md b/docs/source/released_model.md index 78f5c92f..ca04f6a7 100644 --- a/docs/source/released_model.md +++ b/docs/source/released_model.md @@ -1,4 +1,3 @@ - # Released Models ## Speech-to-Text Models @@ -32,27 +31,28 @@ Language Model | Training Data | Token-based | Size | Descriptions ### Acoustic 
Models Model Type | Dataset| Example Link | Pretrained Models|Static Models|Siize(static) :-------------:| :------------:| :-----: | :-----:| :-----:| :-----: -Tacotron2|LJSpeech|[tacotron2-vctk](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/tts0)|[tacotron2_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_ljspeech_ckpt_0.3.zip)||| -TransformerTTS| LJSpeech| [transformer-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/tts1)|[transformer_tts_ljspeech_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/transformer_tts_ljspeech_ckpt_0.4.zip)||| -SpeedySpeech| CSMSC | [speedyspeech-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts2) |[speedyspeech_nosil_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/speedyspeech_nosil_baker_ckpt_0.5.zip)|[speedyspeech_nosil_baker_static_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/speedyspeech_nosil_baker_static_0.5.zip)|12MB| -FastSpeech2| CSMSC |[fastspeech2-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts3)|[fastspeech2_nosil_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_baker_ckpt_0.4.zip)|[fastspeech2_nosil_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_baker_static_0.4.zip)|157MB| -FastSpeech2| AISHELL-3 |[fastspeech2-aishell3](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/tts3)|[fastspeech2_nosil_aishell3_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_aishell3_ckpt_0.4.zip)||| -FastSpeech2| LJSpeech |[fastspeech2-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/tts3)|[fastspeech2_nosil_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_ljspeech_ckpt_0.5.zip)||| -FastSpeech2| VCTK |[fastspeech2-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/vctk/tts3)|[fastspeech2_nosil_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_vctk_ckpt_0.5.zip)||| +Tacotron2|LJSpeech|[tacotron2-vctk](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/tts0)|[tacotron2_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_ljspeech_ckpt_0.3.zip)||| +TransformerTTS| LJSpeech| [transformer-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/tts1)|[transformer_tts_ljspeech_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/transformer_tts/transformer_tts_ljspeech_ckpt_0.4.zip)||| +SpeedySpeech| CSMSC | [speedyspeech-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts2) |[speedyspeech_nosil_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_nosil_baker_ckpt_0.5.zip)|[speedyspeech_nosil_baker_static_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_nosil_baker_static_0.5.zip)|12MB| +FastSpeech2| CSMSC 
|[fastspeech2-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts3)|[fastspeech2_nosil_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_ckpt_0.4.zip)|[fastspeech2_nosil_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_static_0.4.zip)|157MB| +FastSpeech2| AISHELL-3 |[fastspeech2-aishell3](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/tts3)|[fastspeech2_nosil_aishell3_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_aishell3_ckpt_0.4.zip)||| +FastSpeech2| LJSpeech |[fastspeech2-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/tts3)|[fastspeech2_nosil_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_ljspeech_ckpt_0.5.zip)||| +FastSpeech2| VCTK |[fastspeech2-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/vctk/tts3)|[fastspeech2_nosil_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_vctk_ckpt_0.5.zip)||| ### Vocoders Model Type | Dataset| Example Link | Pretrained Models| Static Models|Size(static) :-------------:| :------------:| :-----: | :-----:| :-----:| :-----: -WaveFlow| LJSpeech |[waveflow-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/voc0)|[waveflow_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_ljspeech_ckpt_0.3.zip)||| -Parallel WaveGAN| CSMSC |[PWGAN-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc1)|[pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip)|[pwg_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_static_0.4.zip)|5.1MB| -Parallel WaveGAN| LJSpeech |[PWGAN-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/voc1)|[pwg_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_ljspeech_ckpt_0.5.zip)||| -Parallel WaveGAN|AISHELL-3 |[PWGAN-aishell3](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/voc1)|[pwg_aishell3_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_aishell3_ckpt_0.5.zip)||| -Parallel WaveGAN| VCTK |[PWGAN-vctk](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/vctk/voc1)|[pwg_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_vctk_ckpt_0.5.zip)||| -|Multi Band MelGAN |CSMSC|[MB MelGAN-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc3) | [mb_melgan_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/mb_melgan_baker_ckpt_0.5.zip)|[mb_melgan_baker_static_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/mb_melgan_baker_static_0.5.zip) |8.2MB| +WaveFlow| LJSpeech |[waveflow-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/voc0)|[waveflow_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/waveflow/waveflow_ljspeech_ckpt_0.3.zip)||| +Parallel WaveGAN| CSMSC 
|[PWGAN-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc1)|[pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_ckpt_0.4.zip)|[pwg_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_static_0.4.zip)|5.1MB| +Parallel WaveGAN| LJSpeech |[PWGAN-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/voc1)|[pwg_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_ljspeech_ckpt_0.5.zip)||| +Parallel WaveGAN|AISHELL-3 |[PWGAN-aishell3](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/voc1)|[pwg_aishell3_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_aishell3_ckpt_0.5.zip)||| +Parallel WaveGAN| VCTK |[PWGAN-vctk](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/vctk/voc1)|[pwg_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_vctk_ckpt_0.5.zip)||| +|Multi Band MelGAN |CSMSC|[MB MelGAN-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc3) | [mb_melgan_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_baker_ckpt_0.5.zip)
[mb_melgan_baker_finetune_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_baker_finetune_ckpt_0.5.zip)|[mb_melgan_baker_static_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_baker_static_0.5.zip) |8.2MB| ### Voice Cloning Model Type | Dataset| Example Link | Pretrained Models :-------------:| :------------:| :-----: | :-----: -GE2E| AISHELL-3, etc. |[ge2e](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/ge2e)|[ge2e_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/ge2e_ckpt_0.3.zip) -GE2E + Tactron2| AISHELL-3 |[ge2e-tactron2-aishell3](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/vc0)|[tacotron2_aishell3_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_aishell3_ckpt_0.3.zip) +GE2E| AISHELL-3, etc. |[ge2e](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/ge2e)|[ge2e_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/ge2e/ge2e_ckpt_0.3.zip) +GE2E + Tactron2| AISHELL-3 |[ge2e-tactron2-aishell3](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/vc0)|[tacotron2_aishell3_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_aishell3_ckpt_0.3.zip) +GE2E + FastSpeech2 | AISHELL-3 |[ge2e-fastspeech2-aishell3](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/vc1)|[fastspeech2_nosil_aishell3_vc1_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_aishell3_vc1_ckpt_0.5.zip) diff --git a/docs/source/tts/demo.rst b/docs/source/tts/demo.rst index f47c0892..4c2f86b1 100644 --- a/docs/source/tts/demo.rst +++ b/docs/source/tts/demo.rst @@ -52,7 +52,7 @@ Audio samples generated from ground-truth spectrograms with a vocoder. @@ -72,7 +72,7 @@ Audio samples generated from ground-truth spectrograms with a vocoder. @@ -91,7 +91,7 @@ Audio samples generated from ground-truth spectrograms with a vocoder. @@ -110,7 +110,7 @@ Audio samples generated from ground-truth spectrograms with a vocoder. @@ -129,7 +129,7 @@ Audio samples generated from ground-truth spectrograms with a vocoder. @@ -281,7 +281,7 @@ Audio samples generated by a TTS system. Text is first transformed into spectrog @@ -300,7 +300,7 @@ Audio samples generated by a TTS system. Text is first transformed into spectrog @@ -320,7 +320,7 @@ Audio samples generated by a TTS system. Text is first transformed into spectrog @@ -341,7 +341,7 @@ Audio samples generated by a TTS system. Text is first transformed into spectrog @@ -361,7 +361,7 @@ Audio samples generated by a TTS system. Text is first transformed into spectrog @@ -381,7 +381,7 @@ Audio samples generated by a TTS system. Text is first transformed into spectrog @@ -401,7 +401,7 @@ Audio samples generated by a TTS system. Text is first transformed into spectrog @@ -421,7 +421,7 @@ Audio samples generated by a TTS system. Text is first transformed into spectrog @@ -441,7 +441,7 @@ Audio samples generated by a TTS system. 
Text is first transformed into spectrog diff --git a/examples/aishell/README.md b/examples/aishell/README.md index 82ef91da..a9bba074 100644 --- a/examples/aishell/README.md +++ b/examples/aishell/README.md @@ -1,7 +1,9 @@ # ASR -* s0 for deepspeech2 -* s1 for u2/transformer/conformer +* asr0 - deepspeech2 Streaming/Non-Streaming +* asr1 - transformer/conformer Streaming/Non-Streaming +* asr2 - transformer/conformer Streaming/Non-Streaming with Kaldi feature + ## Data diff --git a/examples/aishell/s0/.gitignore b/examples/aishell/asr0/.gitignore similarity index 100% rename from examples/aishell/s0/.gitignore rename to examples/aishell/asr0/.gitignore diff --git a/examples/aishell/s0/README.md b/examples/aishell/asr0/README.md similarity index 100% rename from examples/aishell/s0/README.md rename to examples/aishell/asr0/README.md diff --git a/examples/aishell/s0/conf/augmentation.json b/examples/aishell/asr0/conf/augmentation.json similarity index 100% rename from examples/aishell/s0/conf/augmentation.json rename to examples/aishell/asr0/conf/augmentation.json diff --git a/examples/aishell/s0/conf/deepspeech2.yaml b/examples/aishell/asr0/conf/deepspeech2.yaml similarity index 100% rename from examples/aishell/s0/conf/deepspeech2.yaml rename to examples/aishell/asr0/conf/deepspeech2.yaml diff --git a/examples/aishell/s0/conf/deepspeech2_online.yaml b/examples/aishell/asr0/conf/deepspeech2_online.yaml similarity index 100% rename from examples/aishell/s0/conf/deepspeech2_online.yaml rename to examples/aishell/asr0/conf/deepspeech2_online.yaml diff --git a/examples/aishell/s0/local/data.sh b/examples/aishell/asr0/local/data.sh similarity index 96% rename from examples/aishell/s0/local/data.sh rename to examples/aishell/asr0/local/data.sh index f4fccbe6..23f04f2a 100755 --- a/examples/aishell/s0/local/data.sh +++ b/examples/aishell/asr0/local/data.sh @@ -32,8 +32,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then --manifest_path="data/manifest.train.raw" \ --spectrum_type="linear" \ --delta_delta=false \ - --stride_ms=10.0 \ - --window_ms=20.0 \ + --stride_ms=10 \ + --window_ms=20 \ --sample_rate=16000 \ --use_dB_normalization=True \ --num_samples=2000 \ @@ -66,7 +66,6 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then for dataset in train dev test; do { python3 ${MAIN_ROOT}/utils/format_data.py \ - --feat_type "raw" \ --cmvn_path "data/mean_std.json" \ --unit_type "char" \ --vocab_path="data/vocab.txt" \ diff --git a/examples/aishell/s0/local/download_lm_ch.sh b/examples/aishell/asr0/local/download_lm_ch.sh similarity index 100% rename from examples/aishell/s0/local/download_lm_ch.sh rename to examples/aishell/asr0/local/download_lm_ch.sh diff --git a/examples/aishell/s0/local/export.sh b/examples/aishell/asr0/local/export.sh similarity index 100% rename from examples/aishell/s0/local/export.sh rename to examples/aishell/asr0/local/export.sh diff --git a/examples/aishell/s0/local/test.sh b/examples/aishell/asr0/local/test.sh similarity index 100% rename from examples/aishell/s0/local/test.sh rename to examples/aishell/asr0/local/test.sh diff --git a/examples/aishell/s0/local/test_export.sh b/examples/aishell/asr0/local/test_export.sh similarity index 100% rename from examples/aishell/s0/local/test_export.sh rename to examples/aishell/asr0/local/test_export.sh diff --git a/examples/aishell/s0/local/test_hub.sh b/examples/aishell/asr0/local/test_hub.sh similarity index 100% rename from examples/aishell/s0/local/test_hub.sh rename to examples/aishell/asr0/local/test_hub.sh diff 
--git a/examples/aishell/s0/local/train.sh b/examples/aishell/asr0/local/train.sh similarity index 100% rename from examples/aishell/s0/local/train.sh rename to examples/aishell/asr0/local/train.sh diff --git a/examples/aishell/s0/path.sh b/examples/aishell/asr0/path.sh similarity index 100% rename from examples/aishell/s0/path.sh rename to examples/aishell/asr0/path.sh diff --git a/examples/aishell/s0/run.sh b/examples/aishell/asr0/run.sh similarity index 100% rename from examples/aishell/s0/run.sh rename to examples/aishell/asr0/run.sh diff --git a/examples/aishell/s1/.gitignore b/examples/aishell/asr1/.gitignore similarity index 100% rename from examples/aishell/s1/.gitignore rename to examples/aishell/asr1/.gitignore diff --git a/examples/aishell/s1/README.md b/examples/aishell/asr1/README.md similarity index 67% rename from examples/aishell/s1/README.md rename to examples/aishell/asr1/README.md index 0096c73e..8c53f95f 100644 --- a/examples/aishell/s1/README.md +++ b/examples/aishell/asr1/README.md @@ -19,3 +19,13 @@ Need set `decoding.decoding_chunk_size=16` when decoding. | conformer | 47.06M | conf/chunk_conformer.yaml | spec_aug + shift | test | ctc_greedy_search | 16, -1 | - | 0.070806 | | conformer | 47.06M | conf/chunk_conformer.yaml | spec_aug + shift | test | ctc_prefix_beam_search | 16, -1 | - | 0.070739 | | conformer | 47.06M | conf/chunk_conformer.yaml | spec_aug + shift | test | attention_rescoring | 16, -1 | - | 0.059400 | + + +## Transformer + +| Model | Params | Config | Augmentation| Test set | Decode method | Loss | WER | +| --- | --- | --- | --- | --- | --- | --- | --- | +| transformer | 31.95M | conf/transformer.yaml | spec_aug | test | attention | 3.858648955821991 | 0.057293 | +| transformer | 31.95M | conf/transformer.yaml | spec_aug | test | ctc_greedy_search | 3.858648955821991 | 0.061837 | +| transformer | 31.95M | conf/transformer.yaml | spec_aug | test | ctc_prefix_beam_search | 3.858648955821991 | 0.061685 | +| transformer | 31.95M | conf/transformer.yaml | spec_aug | test | attention_rescoring | 3.858648955821991 | 0.053844 | \ No newline at end of file diff --git a/examples/aishell/s1/conf/augmentation.json b/examples/aishell/asr1/conf/augmentation.json similarity index 100% rename from examples/aishell/s1/conf/augmentation.json rename to examples/aishell/asr1/conf/augmentation.json diff --git a/examples/aishell/s1/conf/chunk_conformer.yaml b/examples/aishell/asr1/conf/chunk_conformer.yaml similarity index 97% rename from examples/aishell/s1/conf/chunk_conformer.yaml rename to examples/aishell/asr1/conf/chunk_conformer.yaml index 8682538b..336a6c46 100644 --- a/examples/aishell/s1/conf/chunk_conformer.yaml +++ b/examples/aishell/asr1/conf/chunk_conformer.yaml @@ -15,7 +15,7 @@ collator: vocab_filepath: data/vocab.txt unit_type: 'char' spm_model_prefix: '' - augmentation_config: conf/augmentation.json + augmentation_config: conf/preprocess.yaml batch_size: 32 raw_wav: True # use raw_wav or kaldi feature spectrum_type: fbank #linear, mfcc, fbank @@ -38,7 +38,7 @@ collator: # network architecture model: - cmvn_file: "data/mean_std.json" + cmvn_file: cmvn_file_type: "json" # encoder related encoder: conformer diff --git a/examples/aishell/s1/conf/conformer.yaml b/examples/aishell/asr1/conf/conformer.yaml similarity index 97% rename from examples/aishell/s1/conf/conformer.yaml rename to examples/aishell/asr1/conf/conformer.yaml index 71cd044e..0e9d79d8 100644 --- a/examples/aishell/s1/conf/conformer.yaml +++ b/examples/aishell/asr1/conf/conformer.yaml @@ 
-15,7 +15,7 @@ collator: vocab_filepath: data/vocab.txt unit_type: 'char' spm_model_prefix: '' - augmentation_config: conf/augmentation.json + augmentation_config: conf/preprocess.yaml batch_size: 64 raw_wav: True # use raw_wav or kaldi feature spectrum_type: fbank #linear, mfcc, fbank @@ -37,7 +37,7 @@ collator: # network architecture model: - cmvn_file: "data/mean_std.json" + cmvn_file: cmvn_file_type: "json" # encoder related encoder: conformer diff --git a/examples/aishell/asr1/conf/preprocess.yaml b/examples/aishell/asr1/conf/preprocess.yaml new file mode 100644 index 00000000..dd4cfd27 --- /dev/null +++ b/examples/aishell/asr1/conf/preprocess.yaml @@ -0,0 +1,29 @@ +process: + # extract kaldi fbank from PCM + - type: fbank_kaldi + fs: 16000 + n_mels: 80 + n_shift: 160 + win_length: 400 + dither: true + - type: cmvn_json + cmvn_path: data/mean_std.json + # these three processes are a.k.a. SpecAugument + - type: time_warp + max_time_warp: 5 + inplace: true + mode: PIL + - type: freq_mask + F: 30 + n_mask: 2 + inplace: true + replace_with_zero: false + - type: time_mask + T: 40 + n_mask: 2 + inplace: true + replace_with_zero: false + + + + diff --git a/examples/aishell/asr1/conf/transformer.yaml b/examples/aishell/asr1/conf/transformer.yaml new file mode 100644 index 00000000..c021f66b --- /dev/null +++ b/examples/aishell/asr1/conf/transformer.yaml @@ -0,0 +1,112 @@ +# https://yaml.org/type/float.html +data: + train_manifest: data/manifest.train + dev_manifest: data/manifest.dev + test_manifest: data/manifest.test + min_input_len: 0.5 + max_input_len: 20.0 # second + min_output_len: 0.0 + max_output_len: 400.0 + min_output_input_ratio: 0.05 + max_output_input_ratio: 10.0 + + +collator: + vocab_filepath: data/vocab.txt + unit_type: 'char' + spm_model_prefix: '' + augmentation_config: conf/preprocess.yaml + batch_size: 64 + raw_wav: True # use raw_wav or kaldi feature + spectrum_type: fbank #linear, mfcc, fbank + feat_dim: 80 + delta_delta: False + dither: 1.0 + target_sample_rate: 16000 + max_freq: None + n_fft: None + stride_ms: 10.0 + window_ms: 25.0 + use_dB_normalization: True + target_dB: -20 + random_seed: 0 + keep_transcription_text: False + sortagrad: True + shuffle_method: batch_shuffle + num_workers: 2 + +# network architecture +model: + cmvn_file: + cmvn_file_type: "json" + # encoder related + encoder: transformer + encoder_conf: + output_size: 256 # dimension of attention + attention_heads: 4 + linear_units: 2048 # the number of units of position-wise feed forward + num_blocks: 12 # the number of encoder blocks + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.0 + input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 + normalize_before: true + + # decoder related + decoder: transformer + decoder_conf: + attention_heads: 4 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.0 + src_attention_dropout_rate: 0.0 + + # hybrid CTC/attention + model_conf: + ctc_weight: 0.3 + ctc_dropoutrate: 0.0 + ctc_grad_norm_type: null + lsm_weight: 0.1 # label smoothing option + length_normalized_loss: false + + +training: + n_epoch: 120 + accum_grad: 2 + global_grad_clip: 5.0 + optim: adam + optim_conf: + lr: 0.002 + weight_decay: 1e-6 + scheduler: warmuplr # pytorch v1.1.0+ required + scheduler_conf: + warmup_steps: 25000 + lr_decay: 1.0 + log_interval: 100 + checkpoint: + kbest_n: 50 + latest_n: 5 + + +decoding: + batch_size: 128 + error_rate_type: cer + 
decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' + lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm + alpha: 2.5 + beta: 0.3 + beam_size: 10 + cutoff_prob: 1.0 + cutoff_top_n: 0 + num_proc_bsearch: 8 + ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. + decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. + # <0: for decoding, use full chunk. + # >0: for decoding, use fixed chunk size as set. + # 0: used for training, it's prohibited here. + num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. + simulate_streaming: False # simulate streaming inference. Defaults to False. + + diff --git a/examples/aishell/s1/local/aishell_train_lms.sh b/examples/aishell/asr1/local/aishell_train_lms.sh similarity index 100% rename from examples/aishell/s1/local/aishell_train_lms.sh rename to examples/aishell/asr1/local/aishell_train_lms.sh diff --git a/examples/aishell/s1/local/align.sh b/examples/aishell/asr1/local/align.sh similarity index 100% rename from examples/aishell/s1/local/align.sh rename to examples/aishell/asr1/local/align.sh diff --git a/examples/aishell/s1/local/data.sh b/examples/aishell/asr1/local/data.sh similarity index 96% rename from examples/aishell/s1/local/data.sh rename to examples/aishell/asr1/local/data.sh index 2b9f69ae..76e28075 100755 --- a/examples/aishell/s1/local/data.sh +++ b/examples/aishell/asr1/local/data.sh @@ -33,8 +33,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then --spectrum_type="fbank" \ --feat_dim=80 \ --delta_delta=false \ - --stride_ms=10.0 \ - --window_ms=25.0 \ + --stride_ms=10 \ + --window_ms=25 \ --sample_rate=16000 \ --use_dB_normalization=False \ --num_samples=-1 \ @@ -67,7 +67,6 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then for dataset in train dev test; do { python3 ${MAIN_ROOT}/utils/format_data.py \ - --feat_type "raw" \ --cmvn_path "data/mean_std.json" \ --unit_type "char" \ --vocab_path="data/vocab.txt" \ diff --git a/examples/aishell/s1/local/export.sh b/examples/aishell/asr1/local/export.sh similarity index 100% rename from examples/aishell/s1/local/export.sh rename to examples/aishell/asr1/local/export.sh diff --git a/examples/aishell/s1/local/test.sh b/examples/aishell/asr1/local/test.sh similarity index 100% rename from examples/aishell/s1/local/test.sh rename to examples/aishell/asr1/local/test.sh diff --git a/examples/aishell/s1/local/test_hub.sh b/examples/aishell/asr1/local/test_hub.sh similarity index 99% rename from examples/aishell/s1/local/test_hub.sh rename to examples/aishell/asr1/local/test_hub.sh index 99b141c8..6e78ec78 100755 --- a/examples/aishell/s1/local/test_hub.sh +++ b/examples/aishell/asr1/local/test_hub.sh @@ -23,8 +23,6 @@ fi # exit 1 #fi - - for type in attention_rescoring; do echo "decoding ${type}" batch_size=1 diff --git a/examples/aishell/s1/local/tlg.sh b/examples/aishell/asr1/local/tlg.sh similarity index 100% rename from examples/aishell/s1/local/tlg.sh rename to examples/aishell/asr1/local/tlg.sh diff --git a/examples/aishell/s1/local/train.sh b/examples/aishell/asr1/local/train.sh similarity index 100% rename from examples/aishell/s1/local/train.sh rename to examples/aishell/asr1/local/train.sh diff --git a/examples/aishell/s1/path.sh b/examples/aishell/asr1/path.sh similarity index 100% rename from examples/aishell/s1/path.sh rename to examples/aishell/asr1/path.sh diff --git a/examples/aishell/s1/run.sh b/examples/aishell/asr1/run.sh similarity index 100% 
rename from examples/aishell/s1/run.sh rename to examples/aishell/asr1/run.sh diff --git a/examples/aishell/s1/utils b/examples/aishell/asr1/utils similarity index 100% rename from examples/aishell/s1/utils rename to examples/aishell/asr1/utils diff --git a/examples/aishell3/tts3/README.md b/examples/aishell3/tts3/README.md index fe4887b9..056f35ba 100644 --- a/examples/aishell3/tts3/README.md +++ b/examples/aishell3/tts3/README.md @@ -97,7 +97,7 @@ optional arguments: ### Synthesize We use [parallel wavegan](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/voc1) as the neural vocoder. -Download pretrained parallel wavegan model from [pwg_aishell3_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_aishell3_ckpt_0.5.zip) and unzip it. +Download pretrained parallel wavegan model from [pwg_aishell3_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_aishell3_ckpt_0.5.zip) and unzip it. ```bash unzip pwg_aishell3_ckpt_0.5.zip ``` @@ -202,7 +202,7 @@ optional arguments: 6. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. ## Pretrained Model -Pretrained FastSpeech2 model with no silence in the edge of audios. [fastspeech2_nosil_aishell3_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_aishell3_ckpt_0.4.zip) +Pretrained FastSpeech2 model with no silence in the edge of audios. [fastspeech2_nosil_aishell3_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_aishell3_ckpt_0.4.zip) FastSpeech2 checkpoint contains files listed below. diff --git a/examples/aishell3/vc0/README.md b/examples/aishell3/vc0/README.md index 2f1b37ee..376d4a33 100644 --- a/examples/aishell3/vc0/README.md +++ b/examples/aishell3/vc0/README.md @@ -41,7 +41,7 @@ We use Montreal Force Aligner 1.0. The label in aishell3 include pinyin,so th We use [lexicon.txt](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/lexicon.txt) as the lexicon. -You can download the alignment results from here [alignment_aishell3.tar.gz](https://paddlespeech.bj.bcebos.com/Parakeet/alignment_aishell3.tar.gz), or train your own MFA model reference to [use_mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/use_mfa) (use MFA1.x now) of our repo. +You can download the alignment results from here [alignment_aishell3.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/AISHELL-3/alignment_aishell3.tar.gz), or train your own MFA model reference to [use_mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/use_mfa) (use MFA1.x now) of our repo. ```bash if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then @@ -86,4 +86,4 @@ In addition, in order to accelerate the convergence of the model, we add `guided CUDA_VISIBLE_DEVICES=${gpus} ./local/voice_cloning.sh ${ge2e_params_path} ${tacotron2_params_path} ${waveflow_params_path} ${vc_input} ${vc_output} ``` ## Pretrained Model -[tacotron2_aishell3_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_aishell3_ckpt_0.3.zip). +[tacotron2_aishell3_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_aishell3_ckpt_0.3.zip). 
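The pretrained-model links updated above all move to the same relocated layout: a `released_models/<model-family>/` directory inserted before the archive name. A minimal shell sketch of fetching and unpacking one of these archives, using the `pwgan` family directory and the `download/` target directory that appear in the links and demo scripts above:

```bash
# Fetch a pretrained Parallel WaveGAN vocoder from the relocated URL layout
# and unpack it next to the other downloads.
wget -P download https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_aishell3_ckpt_0.5.zip
unzip -d download download/pwg_aishell3_ckpt_0.5.zip
```

The same pattern applies to the fastspeech2, speedyspeech, mb_melgan, tacotron2, waveflow and ge2e archives referenced elsewhere in this change.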
diff --git a/examples/aishell3/vc1/README.md b/examples/aishell3/vc1/README.md index 834942fa..ae53443e 100644 --- a/examples/aishell3/vc1/README.md +++ b/examples/aishell3/vc1/README.md @@ -22,7 +22,7 @@ You can download from here [aishell3_alignment_tone.tar.gz](https://paddlespeech ## Pretrained GE2E model We use pretrained GE2E model to generate spwaker embedding for each sentence. -Download pretrained GE2E model from here [ge2e_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/ge2e_ckpt_0.3.zip), and `unzip` it. +Download pretrained GE2E model from here [ge2e_ckpt_0.3.zip](https://bj.bcebos.com/paddlespeech/Parakeet/released_models/ge2e/ge2e_ckpt_0.3.zip), and `unzip` it. ## Get Started Assume the path to the dataset is `~/datasets/data_aishell3`. @@ -84,7 +84,7 @@ The training step is very similar to that one of [tts3](https://github.com/Paddl ### Synthesize We use [parallel wavegan](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/voc1) as the neural vocoder. -Download pretrained parallel wavegan model from [pwg_aishell3_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_aishell3_ckpt_0.5.zip) and unzip it. +Download pretrained parallel wavegan model from [pwg_aishell3_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_aishell3_ckpt_0.5.zip) and unzip it. ```bash unzip pwg_aishell3_ckpt_0.5.zip ``` @@ -115,7 +115,7 @@ ref_audio CUDA_VISIBLE_DEVICES=${gpus} ./local/voice_cloning.sh ${conf_path} ${train_output_path} ${ckpt_name} ${ge2e_params_path} ${ref_audio_dir} ``` ## Pretrained Model -[fastspeech2_nosil_aishell3_vc1_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_aishell3_vc1_ckpt_0.5.zip) +[fastspeech2_nosil_aishell3_vc1_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_aishell3_vc1_ckpt_0.5.zip) FastSpeech2 checkpoint contains files listed below. (There is no need for `speaker_id_map.txt` here ) diff --git a/examples/aishell3/voc1/README.md b/examples/aishell3/voc1/README.md index d67af726..bc28bba1 100644 --- a/examples/aishell3/voc1/README.md +++ b/examples/aishell3/voc1/README.md @@ -132,7 +132,7 @@ optional arguments: 5. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. ## Pretrained Models -Pretrained models can be downloaded here [pwg_aishell3_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_aishell3_ckpt_0.5.zip). +Pretrained models can be downloaded here [pwg_aishell3_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_aishell3_ckpt_0.5.zip). Parallel WaveGAN checkpoint contains files listed below. 
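The vc1 recipe above passes its voice-cloning arguments positionally to `./local/voice_cloning.sh`. A sketch of a concrete invocation under stated assumptions; every value below (config name, experiment directory, checkpoint and GE2E parameter file names, reference-audio directory) is a placeholder rather than something taken from the recipe:

```bash
# Hypothetical argument values for the vc1 voice-cloning step shown above;
# substitute the paths produced by your own training run or the pretrained zips.
gpus=0
conf_path=conf/default.yaml                          # placeholder config
train_output_path=exp/default                        # placeholder experiment dir
ckpt_name=snapshot_iter_xxxxx.pdz                    # placeholder FastSpeech2 checkpoint
ge2e_params_path=ge2e_ckpt_0.3/step-xxxxx.pdparams   # placeholder GE2E parameters
ref_audio_dir=ref_audio                              # directory holding the reference wavs
CUDA_VISIBLE_DEVICES=${gpus} ./local/voice_cloning.sh ${conf_path} ${train_output_path} ${ckpt_name} ${ge2e_params_path} ${ref_audio_dir}
```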
diff --git a/examples/callcenter/s1/.gitignore b/examples/callcenter/asr1/.gitignore similarity index 100% rename from examples/callcenter/s1/.gitignore rename to examples/callcenter/asr1/.gitignore diff --git a/examples/callcenter/s1/README.md b/examples/callcenter/asr1/README.md similarity index 100% rename from examples/callcenter/s1/README.md rename to examples/callcenter/asr1/README.md diff --git a/examples/callcenter/s1/conf/augmentation.json b/examples/callcenter/asr1/conf/augmentation.json similarity index 100% rename from examples/callcenter/s1/conf/augmentation.json rename to examples/callcenter/asr1/conf/augmentation.json diff --git a/examples/callcenter/s1/conf/chunk_conformer.yaml b/examples/callcenter/asr1/conf/chunk_conformer.yaml similarity index 97% rename from examples/callcenter/s1/conf/chunk_conformer.yaml rename to examples/callcenter/asr1/conf/chunk_conformer.yaml index a853658a..b18b46fe 100644 --- a/examples/callcenter/s1/conf/chunk_conformer.yaml +++ b/examples/callcenter/asr1/conf/chunk_conformer.yaml @@ -15,7 +15,7 @@ collator: vocab_filepath: data/vocab.txt unit_type: 'char' spm_model_prefix: '' - augmentation_config: conf/augmentation.json + augmentation_config: conf/preprocess.yaml batch_size: 32 raw_wav: True # use raw_wav or kaldi feature spectrum_type: fbank #linear, mfcc, fbank @@ -38,7 +38,7 @@ collator: # network architecture model: - cmvn_file: "data/mean_std.json" + cmvn_file: cmvn_file_type: "json" # encoder related encoder: conformer diff --git a/examples/callcenter/s1/conf/conformer.yaml b/examples/callcenter/asr1/conf/conformer.yaml similarity index 97% rename from examples/callcenter/s1/conf/conformer.yaml rename to examples/callcenter/asr1/conf/conformer.yaml index bd4f4578..47c438a6 100644 --- a/examples/callcenter/s1/conf/conformer.yaml +++ b/examples/callcenter/asr1/conf/conformer.yaml @@ -15,7 +15,7 @@ collator: vocab_filepath: data/vocab.txt unit_type: 'char' spm_model_prefix: '' - augmentation_config: conf/augmentation.json + augmentation_config: conf/preprocess.yaml batch_size: 32 raw_wav: True # use raw_wav or kaldi feature spectrum_type: fbank #linear, mfcc, fbank @@ -37,7 +37,7 @@ collator: # network architecture model: - cmvn_file: "data/mean_std.json" + cmvn_file: cmvn_file_type: "json" # encoder related encoder: conformer diff --git a/examples/callcenter/asr1/conf/preprocess.yaml b/examples/callcenter/asr1/conf/preprocess.yaml new file mode 100644 index 00000000..dd4cfd27 --- /dev/null +++ b/examples/callcenter/asr1/conf/preprocess.yaml @@ -0,0 +1,29 @@ +process: + # extract kaldi fbank from PCM + - type: fbank_kaldi + fs: 16000 + n_mels: 80 + n_shift: 160 + win_length: 400 + dither: true + - type: cmvn_json + cmvn_path: data/mean_std.json + # these three processes are a.k.a. 
SpecAugument + - type: time_warp + max_time_warp: 5 + inplace: true + mode: PIL + - type: freq_mask + F: 30 + n_mask: 2 + inplace: true + replace_with_zero: false + - type: time_mask + T: 40 + n_mask: 2 + inplace: true + replace_with_zero: false + + + + diff --git a/examples/callcenter/s1/local/align.sh b/examples/callcenter/asr1/local/align.sh similarity index 100% rename from examples/callcenter/s1/local/align.sh rename to examples/callcenter/asr1/local/align.sh diff --git a/examples/callcenter/s1/local/data.sh b/examples/callcenter/asr1/local/data.sh similarity index 96% rename from examples/callcenter/s1/local/data.sh rename to examples/callcenter/asr1/local/data.sh index 634bb8d0..c40c752a 100755 --- a/examples/callcenter/s1/local/data.sh +++ b/examples/callcenter/asr1/local/data.sh @@ -21,8 +21,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then --spectrum_type="fbank" \ --feat_dim=80 \ --delta_delta=false \ - --stride_ms=10.0 \ - --window_ms=25.0 \ + --stride_ms=10 \ + --window_ms=25 \ --sample_rate=8000 \ --use_dB_normalization=False \ --num_samples=-1 \ @@ -55,7 +55,6 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then for dataset in train dev test; do { python3 ${MAIN_ROOT}/utils/format_data.py \ - --feat_type "raw" \ --cmvn_path "data/mean_std.json" \ --unit_type "char" \ --vocab_path="data/vocab.txt" \ diff --git a/examples/callcenter/s1/local/download_lm_ch.sh b/examples/callcenter/asr1/local/download_lm_ch.sh similarity index 100% rename from examples/callcenter/s1/local/download_lm_ch.sh rename to examples/callcenter/asr1/local/download_lm_ch.sh diff --git a/examples/callcenter/s1/local/export.sh b/examples/callcenter/asr1/local/export.sh similarity index 100% rename from examples/callcenter/s1/local/export.sh rename to examples/callcenter/asr1/local/export.sh diff --git a/examples/callcenter/s1/local/test.sh b/examples/callcenter/asr1/local/test.sh similarity index 100% rename from examples/callcenter/s1/local/test.sh rename to examples/callcenter/asr1/local/test.sh diff --git a/examples/callcenter/s1/local/train.sh b/examples/callcenter/asr1/local/train.sh similarity index 100% rename from examples/callcenter/s1/local/train.sh rename to examples/callcenter/asr1/local/train.sh diff --git a/examples/callcenter/s1/path.sh b/examples/callcenter/asr1/path.sh similarity index 100% rename from examples/callcenter/s1/path.sh rename to examples/callcenter/asr1/path.sh diff --git a/examples/callcenter/s1/run.sh b/examples/callcenter/asr1/run.sh similarity index 100% rename from examples/callcenter/s1/run.sh rename to examples/callcenter/asr1/run.sh diff --git a/examples/csmsc/tts2/README.md b/examples/csmsc/tts2/README.md index 61c4972b..5ebf3cf4 100644 --- a/examples/csmsc/tts2/README.md +++ b/examples/csmsc/tts2/README.md @@ -90,7 +90,7 @@ optional arguments: ### Synthesize We use [parallel wavegan](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc1) as the neural vocoder. -Download pretrained parallel wavegan model from [pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip) and unzip it. +Download pretrained parallel wavegan model from [pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_ckpt_0.4.zip) and unzip it. 
```bash unzip pwg_baker_ckpt_0.4.zip ``` @@ -208,9 +208,9 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/inference.sh ${train_output_path} ``` ## Pretrained Model -Pretrained SpeedySpeech model with no silence in the edge of audios[speedyspeech_nosil_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/speedyspeech_nosil_baker_ckpt_0.5.zip). +Pretrained SpeedySpeech model with no silence in the edge of audios[speedyspeech_nosil_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_nosil_baker_ckpt_0.5.zip). -Static model can be downloaded here [speedyspeech_nosil_baker_static_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/speedyspeech_nosil_baker_static_0.5.zip). +Static model can be downloaded here [speedyspeech_nosil_baker_static_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_nosil_baker_static_0.5.zip). SpeedySpeech checkpoint contains files listed below. ```text diff --git a/examples/csmsc/tts3/README.md b/examples/csmsc/tts3/README.md index 6570d33d..104964c8 100644 --- a/examples/csmsc/tts3/README.md +++ b/examples/csmsc/tts3/README.md @@ -88,7 +88,7 @@ optional arguments: ### Synthesize We use [parallel wavegan](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc1) as the neural vocoder. -Download pretrained parallel wavegan model from [pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip) and unzip it. +Download pretrained parallel wavegan model from [pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_ckpt_0.4.zip) and unzip it. ```bash unzip pwg_baker_ckpt_0.4.zip ``` @@ -199,9 +199,9 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/inference.sh ${train_output_path} ``` ## Pretrained Model -Pretrained FastSpeech2 model with no silence in the edge of audios [fastspeech2_nosil_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_baker_ckpt_0.4.zip). +Pretrained FastSpeech2 model with no silence in the edge of audios [fastspeech2_nosil_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_ckpt_0.4.zip). -Static model can be downloaded here [fastspeech2_nosil_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_baker_static_0.4.zip). +Static model can be downloaded here [fastspeech2_nosil_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_static_0.4.zip). FastSpeech2 checkpoint contains files listed below. ```text diff --git a/examples/csmsc/voc1/README.md b/examples/csmsc/voc1/README.md index b9c8a465..86114a42 100644 --- a/examples/csmsc/voc1/README.md +++ b/examples/csmsc/voc1/README.md @@ -122,9 +122,9 @@ optional arguments: 5. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. ## Pretrained Models -Pretrained model can be downloaded here [pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip). +Pretrained model can be downloaded here [pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_ckpt_0.4.zip). -Static model can be downloaded here [pwg_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_static_0.4.zip). +Static model can be downloaded here [pwg_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_static_0.4.zip). 
Parallel WaveGAN checkpoint contains files listed below. diff --git a/examples/csmsc/voc3/README.md b/examples/csmsc/voc3/README.md index a72f60f1..4925b649 100644 --- a/examples/csmsc/voc3/README.md +++ b/examples/csmsc/voc3/README.md @@ -113,7 +113,7 @@ The length of mel-spectrograms should align with the length of wavs, so we shoul But since we are fine-tuning, we should use the statistics computed during training step. -You should first download pretrained `FastSpeech2` model from [fastspeech2_nosil_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_baker_ckpt_0.4.zip) and `unzip` it. +You should first download pretrained `FastSpeech2` model from [fastspeech2_nosil_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_ckpt_0.4.zip) and `unzip` it. Assume the path to the dump-dir of training step is `dump`. Assume the path to the duration result of CSMSC is `durations.txt` (generated during training step's preprocessing). @@ -147,11 +147,11 @@ TODO: The hyperparameter of `finetune.yaml` is not good enough, a smaller `learning_rate` should be used (more `milestones` should be set). ## Pretrained Models -Pretrained model can be downloaded here [mb_melgan_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/mb_melgan_baker_ckpt_0.5.zip). +Pretrained model can be downloaded here [mb_melgan_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_baker_ckpt_0.5.zip). -Finetuned model can ben downloaded here [mb_melgan_baker_finetune_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/mb_melgan_baker_finetune_ckpt_0.5.zip). +Finetuned model can ben downloaded here [mb_melgan_baker_finetune_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_baker_finetune_ckpt_0.5.zip). -Static model can be downloaded here [mb_melgan_baker_static_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/mb_melgan_baker_static_0.5.zip) +Static model can be downloaded here [mb_melgan_baker_static_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_baker_static_0.5.zip) Multi Band MelGAN checkpoint contains files listed below. 
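The voc3 fine-tuning notes above reuse the statistics computed when the FastSpeech2 acoustic model was trained, so that archive has to be fetched before fine-tuning MB MelGAN. A short sketch; the name of the directory produced by `unzip` is an assumption to verify after unpacking:

```bash
# Download the pretrained FastSpeech2 model referenced above; the statistics
# computed during its training are used to normalize mels for fine-tuning.
wget https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_ckpt_0.4.zip
unzip fastspeech2_nosil_baker_ckpt_0.4.zip
ls fastspeech2_nosil_baker_ckpt_0.4/   # assumed extraction directory
```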
diff --git a/examples/dataset/aidatatang_200zh/aidatatang_200zh.py b/examples/dataset/aidatatang_200zh/aidatatang_200zh.py index e32f619e..85f478c2 100644 --- a/examples/dataset/aidatatang_200zh/aidatatang_200zh.py +++ b/examples/dataset/aidatatang_200zh/aidatatang_200zh.py @@ -22,6 +22,7 @@ import argparse import codecs import json import os +from pathlib import Path import soundfile @@ -79,6 +80,7 @@ def create_manifest(data_dir, manifest_path_prefix): audio_path = os.path.abspath(os.path.join(subfolder, fname)) audio_id = os.path.basename(fname)[:-4] + utt2spk = Path(audio_path).parent.name audio_data, samplerate = soundfile.read(audio_path) duration = float(len(audio_data) / samplerate) @@ -87,6 +89,7 @@ def create_manifest(data_dir, manifest_path_prefix): json.dumps( { 'utt': audio_id, + 'utt2spk': str(utt2spk), 'feat': audio_path, 'feat_shape': (duration, ), # second 'text': text, diff --git a/examples/dataset/aishell/aishell.py b/examples/dataset/aishell/aishell.py index 66e06901..7431fc08 100644 --- a/examples/dataset/aishell/aishell.py +++ b/examples/dataset/aishell/aishell.py @@ -22,6 +22,7 @@ import argparse import codecs import json import os +from pathlib import Path import soundfile @@ -81,6 +82,8 @@ def create_manifest(data_dir, manifest_path_prefix): # if no transcription for audio then skipped if audio_id not in transcript_dict: continue + + utt2spk = Path(audio_path).parent.name audio_data, samplerate = soundfile.read(audio_path) duration = float(len(audio_data) / samplerate) text = transcript_dict[audio_id] @@ -88,6 +91,7 @@ def create_manifest(data_dir, manifest_path_prefix): json.dumps( { 'utt': audio_id, + 'utt2spk': str(utt2spk), 'feat': audio_path, 'feat_shape': (duration, ), # second 'text': text diff --git a/examples/dataset/librispeech/librispeech.py b/examples/dataset/librispeech/librispeech.py index e85bbb3a..69f0db59 100644 --- a/examples/dataset/librispeech/librispeech.py +++ b/examples/dataset/librispeech/librispeech.py @@ -78,7 +78,7 @@ def create_manifest(data_dir, manifest_path): print("Creating manifest %s ..." 
% manifest_path) json_lines = [] total_sec = 0.0 - total_text = 0.0 + total_char = 0.0 total_num = 0 for subfolder, _, filelist in sorted(os.walk(data_dir)): @@ -89,25 +89,28 @@ def create_manifest(data_dir, manifest_path): text_filepath = os.path.join(subfolder, text_filelist[0]) for line in io.open(text_filepath, encoding="utf8"): segments = line.strip().split() + nchars = len(segments[1:]) text = ' '.join(segments[1:]).lower() audio_filepath = os.path.abspath( os.path.join(subfolder, segments[0] + '.flac')) audio_data, samplerate = soundfile.read(audio_filepath) duration = float(len(audio_data)) / samplerate + + utt = os.path.splitext(os.path.basename(audio_filepath))[0] + utt2spk = '-'.join(utt.split('-')[:2]) + json_lines.append( json.dumps({ - 'utt': - os.path.splitext(os.path.basename(audio_filepath))[0], - 'feat': - audio_filepath, - 'feat_shape': (duration, ), #second - 'text': - text + 'utt': utt, + 'utt2spk': utt2spk, + 'feat': audio_filepath, + 'feat_shape': (duration, ), # second + 'text': text, })) total_sec += duration - total_text += len(text) + total_char += nchars total_num += 1 with codecs.open(manifest_path, 'w', 'utf-8') as out_file: @@ -122,8 +125,8 @@ def create_manifest(data_dir, manifest_path): print(f"{subset}:", file=f) print(f"{total_num} utts", file=f) print(f"{total_sec / (60*60)} h", file=f) - print(f"{total_text} text", file=f) - print(f"{total_text / total_sec} text/sec", file=f) + print(f"{total_char} char", file=f) + print(f"{total_char / total_sec} char/sec", file=f) print(f"{total_sec / total_num} sec/utt", file=f) diff --git a/examples/dataset/mini_librispeech/mini_librispeech.py b/examples/dataset/mini_librispeech/mini_librispeech.py index 65fee81a..730c73a8 100644 --- a/examples/dataset/mini_librispeech/mini_librispeech.py +++ b/examples/dataset/mini_librispeech/mini_librispeech.py @@ -74,15 +74,16 @@ def create_manifest(data_dir, manifest_path): audio_filepath = os.path.join(subfolder, segments[0] + '.flac') audio_data, samplerate = soundfile.read(audio_filepath) duration = float(len(audio_data)) / samplerate + + utt = os.path.splitext(os.path.basename(audio_filepath))[0] + utt2spk = '-'.join(utt.split('-')[:2]) json_lines.append( json.dumps({ - 'utt': - os.path.splitext(os.path.basename(audio_filepath))[0], - 'feat': - audio_filepath, + 'utt': utt, + 'utt2spk': utt2spk, + 'feat': audio_filepath, 'feat_shape': (duration, ), #second - 'text': - text + 'text': text, })) total_sec += duration diff --git a/examples/dataset/ted_en_zh/ted_en_zh.py b/examples/dataset/ted_en_zh/ted_en_zh.py index 14bef01d..9a3ba3b3 100644 --- a/examples/dataset/ted_en_zh/ted_en_zh.py +++ b/examples/dataset/ted_en_zh/ted_en_zh.py @@ -72,14 +72,16 @@ def create_manifest(data_dir, manifest_path_prefix): continue audio_data, samplerate = soundfile.read(audio_path) duration = float(len(audio_data) / samplerate) + + translation_str = " ".join(translation.split()) + trancription_str = " ".join(trancription.split()) json_lines.append( json.dumps( { 'utt': utt, 'feat': audio_path, 'feat_shape': (duration, ), # second - 'text': " ".join(translation.split()), - 'text1': " ".join(trancription.split()) + 'text': [translation_str, trancription_str], }, ensure_ascii=False)) diff --git a/examples/dataset/thchs30/thchs30.py b/examples/dataset/thchs30/thchs30.py index 77a264cb..cdfc0a75 100644 --- a/examples/dataset/thchs30/thchs30.py +++ b/examples/dataset/thchs30/thchs30.py @@ -113,6 +113,8 @@ def create_manifest(data_dir, manifest_path_prefix): assert os.path.exists(audio_path) and 
os.path.exists(text_path) audio_id = os.path.basename(audio_path)[:-4] + spk = audio_id.split('_')[0] + word_text, syllable_text, phone_text = read_trn(text_path) audio_data, samplerate = soundfile.read(audio_path) duration = float(len(audio_data) / samplerate) @@ -122,6 +124,7 @@ def create_manifest(data_dir, manifest_path_prefix): json.dumps( { 'utt': audio_id, + 'utt2spk': spk, 'feat': audio_path, 'feat_shape': (duration, ), # second 'text': word_text, # charactor diff --git a/examples/dataset/timit/timit.py b/examples/dataset/timit/timit.py index 311d445c..c4a9f066 100644 --- a/examples/dataset/timit/timit.py +++ b/examples/dataset/timit/timit.py @@ -180,12 +180,12 @@ def create_manifest(data_dir, manifest_path_prefix): json.dumps( { 'utt': utt_id, + 'utt2spk': spk, + 'utt2gender': gender, 'feat': str(audio_path), 'feat_shape': (duration, ), # second 'text': word_text, # word 'phone': phone_text, - 'spk': spk, - 'gender': gender, }, ensure_ascii=False)) diff --git a/examples/dataset/timit/timit_kaldi_standard_split.py b/examples/dataset/timit/timit_kaldi_standard_split.py index 2b494c06..473fc856 100644 --- a/examples/dataset/timit/timit_kaldi_standard_split.py +++ b/examples/dataset/timit/timit_kaldi_standard_split.py @@ -22,6 +22,7 @@ import argparse import codecs import json import os +from pathlib import Path import soundfile @@ -67,10 +68,17 @@ def create_manifest(data_dir, manifest_path_prefix): audio_data, samplerate = soundfile.read(audio_path) duration = float(len(audio_data) / samplerate) text = phn_dict[audio_id] + + gender_spk = str(Path(audio_path).parent.stem) + spk = gender_spk[1:] + gender = gender_spk[0] + utt_id = '_'.join([spk, gender, audio_id]) json_lines.append( json.dumps( { 'utt': audio_id, + 'utt2spk': spk, + 'utt2gender': gender, 'feat': audio_path, 'feat_shape': (duration, ), # second 'text': text diff --git a/examples/dataset/voxforge/voxforge.py b/examples/dataset/voxforge/voxforge.py index 36282bd6..373791bf 100644 --- a/examples/dataset/voxforge/voxforge.py +++ b/examples/dataset/voxforge/voxforge.py @@ -175,9 +175,12 @@ def generate_manifest(data_dir, manifest_path): audio_data, samplerate = soundfile.read(u) duration = float(len(audio_data)) / samplerate + + utt = os.path.splitext(os.path.basename(u))[0] json_lines.append( json.dumps({ - 'utt': os.path.splitext(os.path.basename(u))[0], + 'utt': utt, + 'utt2spk': speaker, 'feat': u, 'feat_shape': (duration, ), #second 'text': trans.lower() diff --git a/examples/librispeech/README.md b/examples/librispeech/README.md index 5943cf1d..74441fd0 100644 --- a/examples/librispeech/README.md +++ b/examples/librispeech/README.md @@ -1,8 +1,9 @@ # ASR -* s0 is for deepspeech2 offline -* s1 is for transformer/conformer/U2 -* s2 is for transformer/conformer/U2 w/ kaldi feat, need install Kaldi +* asr0 - deepspeech2 Streaming/Non-Streaming +* asr1 - transformer/conformer Streaming/Non-Streaming +* asr2 - transformer/conformer Streaming/Non-Streaming with Kaldi feature + ## Data | Data Subset | Duration in Seconds | diff --git a/examples/librispeech/s0/README.md b/examples/librispeech/asr0/README.md similarity index 100% rename from examples/librispeech/s0/README.md rename to examples/librispeech/asr0/README.md diff --git a/examples/librispeech/s0/conf/augmentation.json b/examples/librispeech/asr0/conf/augmentation.json similarity index 100% rename from examples/librispeech/s0/conf/augmentation.json rename to examples/librispeech/asr0/conf/augmentation.json diff --git a/examples/librispeech/s0/conf/deepspeech2.yaml 
b/examples/librispeech/asr0/conf/deepspeech2.yaml similarity index 100% rename from examples/librispeech/s0/conf/deepspeech2.yaml rename to examples/librispeech/asr0/conf/deepspeech2.yaml diff --git a/examples/librispeech/s0/conf/deepspeech2_online.yaml b/examples/librispeech/asr0/conf/deepspeech2_online.yaml similarity index 100% rename from examples/librispeech/s0/conf/deepspeech2_online.yaml rename to examples/librispeech/asr0/conf/deepspeech2_online.yaml diff --git a/examples/librispeech/s0/local/data.sh b/examples/librispeech/asr0/local/data.sh similarity index 97% rename from examples/librispeech/s0/local/data.sh rename to examples/librispeech/asr0/local/data.sh index fd2b0c01..0f276cec 100755 --- a/examples/librispeech/s0/local/data.sh +++ b/examples/librispeech/asr0/local/data.sh @@ -50,8 +50,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then --spectrum_type="linear" \ --delta_delta=false \ --sample_rate=16000 \ - --stride_ms=10.0 \ - --window_ms=20.0 \ + --stride_ms=10 \ + --window_ms=20 \ --use_dB_normalization=True \ --num_workers=${num_workers} \ --output_path="data/mean_std.json" @@ -81,7 +81,6 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then for set in train dev test dev-clean dev-other test-clean test-other; do { python3 ${MAIN_ROOT}/utils/format_data.py \ - --feat_type "raw" \ --cmvn_path "data/mean_std.json" \ --unit_type ${unit_type} \ --vocab_path="data/vocab.txt" \ diff --git a/examples/librispeech/s0/local/download_lm_en.sh b/examples/librispeech/asr0/local/download_lm_en.sh similarity index 100% rename from examples/librispeech/s0/local/download_lm_en.sh rename to examples/librispeech/asr0/local/download_lm_en.sh diff --git a/examples/librispeech/s0/local/export.sh b/examples/librispeech/asr0/local/export.sh similarity index 100% rename from examples/librispeech/s0/local/export.sh rename to examples/librispeech/asr0/local/export.sh diff --git a/examples/librispeech/s0/local/test.sh b/examples/librispeech/asr0/local/test.sh similarity index 100% rename from examples/librispeech/s0/local/test.sh rename to examples/librispeech/asr0/local/test.sh diff --git a/examples/librispeech/s0/local/test_hub.sh b/examples/librispeech/asr0/local/test_hub.sh similarity index 100% rename from examples/librispeech/s0/local/test_hub.sh rename to examples/librispeech/asr0/local/test_hub.sh diff --git a/examples/librispeech/s0/local/train.sh b/examples/librispeech/asr0/local/train.sh similarity index 100% rename from examples/librispeech/s0/local/train.sh rename to examples/librispeech/asr0/local/train.sh diff --git a/examples/librispeech/s0/path.sh b/examples/librispeech/asr0/path.sh similarity index 100% rename from examples/librispeech/s0/path.sh rename to examples/librispeech/asr0/path.sh diff --git a/examples/librispeech/s0/run.sh b/examples/librispeech/asr0/run.sh similarity index 100% rename from examples/librispeech/s0/run.sh rename to examples/librispeech/asr0/run.sh diff --git a/examples/librispeech/s1/.gitignore b/examples/librispeech/asr1/.gitignore similarity index 100% rename from examples/librispeech/s1/.gitignore rename to examples/librispeech/asr1/.gitignore diff --git a/examples/librispeech/s1/README.md b/examples/librispeech/asr1/README.md similarity index 74% rename from examples/librispeech/s1/README.md rename to examples/librispeech/asr1/README.md index b7ec93eb..73f0863e 100644 --- a/examples/librispeech/s1/README.md +++ b/examples/librispeech/asr1/README.md @@ -21,7 +21,7 @@ ## Transformer | Model | Params | Config | Augmentation| Test set | Decode 
method | Loss | WER | | --- | --- | --- | --- | --- | --- | --- | --- | -| transformer | 32.52 M | conf/transformer.yaml | spec_aug + shift | test-clean | attention | 7.404532432556152 | 0.056204 | -| transformer | 32.52 M | conf/transformer.yaml | spec_aug + shift | test-clean | ctc_greedy_search | 7.404532432556152 | 0.058658 | -| transformer | 32.52 M | conf/transformer.yaml | spec_aug + shift | test-clean | ctc_prefix_beam_search | 7.404532432556152 | 0.058278 | -| transformer | 32.52 M | conf/transformer.yaml | spec_aug + shift | test-clean | attention_rescoring | 7.404532432556152 | 0.045591 | +| transformer | 32.52 M | conf/transformer.yaml | spec_aug | test-clean | attention | 6.805267604192098, | 0.049795 | +| transformer | 32.52 M | conf/transformer.yaml | spec_aug | test-clean | ctc_greedy_search | 6.805267604192098, | 0.054892 | +| transformer | 32.52 M | conf/transformer.yaml | spec_aug | test-clean | ctc_prefix_beam_search | 6.805267604192098, | 0.054531 | +| transformer | 32.52 M | conf/transformer.yaml | spec_aug | test-clean | attention_rescoring | 6.805267604192098, | 0.042244 | diff --git a/examples/librispeech/s1/cmd.sh b/examples/librispeech/asr1/cmd.sh similarity index 100% rename from examples/librispeech/s1/cmd.sh rename to examples/librispeech/asr1/cmd.sh diff --git a/examples/librispeech/s1/conf/augmentation.json b/examples/librispeech/asr1/conf/augmentation.json similarity index 100% rename from examples/librispeech/s1/conf/augmentation.json rename to examples/librispeech/asr1/conf/augmentation.json diff --git a/examples/librispeech/s1/conf/chunk_conformer.yaml b/examples/librispeech/asr1/conf/chunk_conformer.yaml similarity index 97% rename from examples/librispeech/s1/conf/chunk_conformer.yaml rename to examples/librispeech/asr1/conf/chunk_conformer.yaml index 4d0e6ceb..2bfb0fb6 100644 --- a/examples/librispeech/s1/conf/chunk_conformer.yaml +++ b/examples/librispeech/asr1/conf/chunk_conformer.yaml @@ -15,7 +15,7 @@ collator: unit_type: 'spm' spm_model_prefix: 'data/bpe_unigram_5000' mean_std_filepath: "" - augmentation_config: conf/augmentation.json + augmentation_config: conf/preprocess.yaml batch_size: 16 raw_wav: True # use raw_wav or kaldi feature spectrum_type: fbank #linear, mfcc, fbank @@ -38,7 +38,7 @@ collator: # network architecture model: - cmvn_file: "data/mean_std.json" + cmvn_file: cmvn_file_type: "json" # encoder related encoder: conformer diff --git a/examples/librispeech/s1/conf/chunk_transformer.yaml b/examples/librispeech/asr1/conf/chunk_transformer.yaml similarity index 97% rename from examples/librispeech/s1/conf/chunk_transformer.yaml rename to examples/librispeech/asr1/conf/chunk_transformer.yaml index c7b53f95..fe533777 100644 --- a/examples/librispeech/s1/conf/chunk_transformer.yaml +++ b/examples/librispeech/asr1/conf/chunk_transformer.yaml @@ -15,7 +15,7 @@ collator: unit_type: 'spm' spm_model_prefix: 'data/bpe_unigram_5000' mean_std_filepath: "" - augmentation_config: conf/augmentation.json + augmentation_config: conf/preprocess.yaml batch_size: 64 raw_wav: True # use raw_wav or kaldi feature spectrum_type: fbank #linear, mfcc, fbank @@ -38,7 +38,7 @@ collator: # network architecture model: - cmvn_file: "data/mean_std.json" + cmvn_file: cmvn_file_type: "json" # encoder related encoder: transformer diff --git a/examples/librispeech/s1/conf/conformer.yaml b/examples/librispeech/asr1/conf/conformer.yaml similarity index 97% rename from examples/librispeech/s1/conf/conformer.yaml rename to examples/librispeech/asr1/conf/conformer.yaml 
index 3bc942dc..c844baaa 100644 --- a/examples/librispeech/s1/conf/conformer.yaml +++ b/examples/librispeech/asr1/conf/conformer.yaml @@ -15,7 +15,7 @@ collator: unit_type: 'spm' spm_model_prefix: 'data/bpe_unigram_5000' mean_std_filepath: "" - augmentation_config: conf/augmentation.json + augmentation_config: conf/preprocess.yaml batch_size: 16 raw_wav: True # use raw_wav or kaldi feature spectrum_type: fbank #linear, mfcc, fbank @@ -38,7 +38,7 @@ collator: # network architecture model: - cmvn_file: "data/mean_std.json" + cmvn_file: cmvn_file_type: "json" # encoder related encoder: conformer diff --git a/examples/librispeech/asr1/conf/preprocess.yaml b/examples/librispeech/asr1/conf/preprocess.yaml new file mode 100644 index 00000000..021ca4c5 --- /dev/null +++ b/examples/librispeech/asr1/conf/preprocess.yaml @@ -0,0 +1,25 @@ +process: + # extract kaldi fbank from PCM + - type: fbank_kaldi + fs: 16000 + n_mels: 80 + n_shift: 160 + win_length: 400 + dither: true + - type: cmvn_json + cmvn_path: data/mean_std.json + # these three processes are a.k.a. SpecAugument + - type: time_warp + max_time_warp: 5 + inplace: true + mode: PIL + - type: freq_mask + F: 30 + n_mask: 2 + inplace: true + replace_with_zero: false + - type: time_mask + T: 40 + n_mask: 2 + inplace: true + replace_with_zero: false diff --git a/examples/librispeech/s1/conf/transformer.yaml b/examples/librispeech/asr1/conf/transformer.yaml similarity index 97% rename from examples/librispeech/s1/conf/transformer.yaml rename to examples/librispeech/asr1/conf/transformer.yaml index 3cc17004..5a158f3e 100644 --- a/examples/librispeech/s1/conf/transformer.yaml +++ b/examples/librispeech/asr1/conf/transformer.yaml @@ -15,7 +15,7 @@ collator: unit_type: 'spm' spm_model_prefix: 'data/bpe_unigram_5000' mean_std_filepath: "" - augmentation_config: conf/augmentation.json + augmentation_config: conf/preprocess.yaml batch_size: 32 raw_wav: True # use raw_wav or kaldi feature spectrum_type: fbank #linear, mfcc, fbank @@ -38,7 +38,7 @@ collator: # network architecture model: - cmvn_file: "data/mean_std.json" + cmvn_file: cmvn_file_type: "json" # encoder related encoder: transformer diff --git a/examples/librispeech/s1/local/align.sh b/examples/librispeech/asr1/local/align.sh similarity index 100% rename from examples/librispeech/s1/local/align.sh rename to examples/librispeech/asr1/local/align.sh diff --git a/examples/librispeech/s1/local/data.sh b/examples/librispeech/asr1/local/data.sh similarity index 66% rename from examples/librispeech/s1/local/data.sh rename to examples/librispeech/asr1/local/data.sh index 56fec846..35f4e635 100755 --- a/examples/librispeech/s1/local/data.sh +++ b/examples/librispeech/asr1/local/data.sh @@ -8,6 +8,11 @@ nbpe=5000 bpemode=unigram bpeprefix="data/bpe_${bpemode}_${nbpe}" +stride_ms=10 +window_ms=25 +sample_rate=16000 +feat_dim=80 + source ${MAIN_ROOT}/utils/parse_options.sh @@ -27,21 +32,21 @@ if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then exit 1 fi - for set in train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other; do - mv data/manifest.${set} data/manifest.${set}.raw + for sub in train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other; do + mv data/manifest.${sub} data/manifest.${sub}.raw done rm -rf data/manifest.train.raw data/manifest.dev.raw data/manifest.test.raw - for set in train-clean-100 train-clean-360 train-other-500; do - cat data/manifest.${set}.raw >> data/manifest.train.raw + for sub in train-clean-100 
train-clean-360 train-other-500; do + cat data/manifest.${sub}.raw >> data/manifest.train.raw done - for set in dev-clean dev-other; do - cat data/manifest.${set}.raw >> data/manifest.dev.raw + for sub in dev-clean dev-other; do + cat data/manifest.${sub}.raw >> data/manifest.dev.raw done - for set in test-clean test-other; do - cat data/manifest.${set}.raw >> data/manifest.test.raw + for sub in test-clean test-other; do + cat data/manifest.${sub}.raw >> data/manifest.test.raw done fi @@ -52,11 +57,11 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then --manifest_path="data/manifest.train.raw" \ --num_samples=-1 \ --spectrum_type="fbank" \ - --feat_dim=80 \ + --feat_dim=${feat_dim} \ --delta_delta=false \ - --sample_rate=16000 \ - --stride_ms=10.0 \ - --window_ms=25.0 \ + --sample_rate=${sample_rate} \ + --stride_ms=${stride_ms} \ + --window_ms=${window_ms} \ --use_dB_normalization=False \ --num_workers=${num_workers} \ --output_path="data/mean_std.json" @@ -85,16 +90,15 @@ fi if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then # format manifest with tokenids, vocab size - for set in train dev test dev-clean dev-other test-clean test-other; do + for sub in train dev test dev-clean dev-other test-clean test-other; do { python3 ${MAIN_ROOT}/utils/format_data.py \ - --feat_type "raw" \ --cmvn_path "data/mean_std.json" \ --unit_type "spm" \ --spm_model_prefix ${bpeprefix} \ --vocab_path="data/vocab.txt" \ - --manifest_path="data/manifest.${set}.raw" \ - --output_path="data/manifest.${set}" + --manifest_path="data/manifest.${sub}.raw" \ + --output_path="data/manifest.${sub}" if [ $? -ne 0 ]; then echo "Formt mnaifest failed. Terminated." @@ -103,6 +107,16 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then }& done wait + + for sub in train dev; do + mv data/manifest.${sub} data/manifest.${sub}.fmt + done +fi + +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + for sub in train dev; do + remove_longshortdata.py --maxframes 3000 --maxchars 400 --stride_ms ${stride_ms} data/manifest.${sub}.fmt data/manifest.${sub} + done fi echo "LibriSpeech Data preparation done." 
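For reference, the new stage 3 added to `examples/librispeech/asr1/local/data.sh` above filters over-long utterances out of the formatted manifests (`data/manifest.train.fmt`, `data/manifest.dev.fmt`) before training, via `remove_longshortdata.py --maxframes 3000 --maxchars 400 --stride_ms ${stride_ms}`. Below is a minimal sketch of that kind of filter, not the actual utility: the manifest field names (`duration`, `text`) and the frame computation are illustrative assumptions and may differ from what `remove_longshortdata.py` really does.

```python
# Hypothetical sketch of a long/short-utterance filter like the one invoked in stage 3.
# Assumptions: each manifest line is a JSON object with a "duration" (seconds) and a
# "text" transcript; frame count is approximated as duration * 1000 / stride_ms.
import json
import sys


def filter_manifest(in_path, out_path, maxframes=3000, maxchars=400, stride_ms=10):
    """Copy manifest entries whose frame and character counts are within the limits."""
    kept, dropped = 0, 0
    with open(in_path) as fin, open(out_path, "w") as fout:
        for line in fin:
            record = json.loads(line)
            n_frames = record["duration"] * 1000.0 / stride_ms  # frames at a 10 ms hop
            n_chars = len(record["text"])
            if n_frames <= maxframes and n_chars <= maxchars:
                fout.write(line)
                kept += 1
            else:
                dropped += 1
    print(f"kept {kept}, dropped {dropped}", file=sys.stderr)


if __name__ == "__main__":
    filter_manifest("data/manifest.train.fmt", "data/manifest.train")
```

With `--maxframes 3000` and the script's `stride_ms=10`, this caps training utterances at roughly 30 seconds of audio, which keeps batch padding and memory use bounded.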
diff --git a/examples/librispeech/s1/local/download_lm_en.sh b/examples/librispeech/asr1/local/download_lm_en.sh similarity index 100% rename from examples/librispeech/s1/local/download_lm_en.sh rename to examples/librispeech/asr1/local/download_lm_en.sh diff --git a/examples/librispeech/s1/local/export.sh b/examples/librispeech/asr1/local/export.sh similarity index 100% rename from examples/librispeech/s1/local/export.sh rename to examples/librispeech/asr1/local/export.sh diff --git a/examples/librispeech/s1/local/test.sh b/examples/librispeech/asr1/local/test.sh similarity index 100% rename from examples/librispeech/s1/local/test.sh rename to examples/librispeech/asr1/local/test.sh diff --git a/examples/librispeech/s1/local/test_hub.sh b/examples/librispeech/asr1/local/test_hub.sh similarity index 100% rename from examples/librispeech/s1/local/test_hub.sh rename to examples/librispeech/asr1/local/test_hub.sh diff --git a/examples/librispeech/s1/local/train.sh b/examples/librispeech/asr1/local/train.sh similarity index 100% rename from examples/librispeech/s1/local/train.sh rename to examples/librispeech/asr1/local/train.sh diff --git a/examples/librispeech/s1/path.sh b/examples/librispeech/asr1/path.sh similarity index 100% rename from examples/librispeech/s1/path.sh rename to examples/librispeech/asr1/path.sh diff --git a/examples/librispeech/s1/run.sh b/examples/librispeech/asr1/run.sh similarity index 100% rename from examples/librispeech/s1/run.sh rename to examples/librispeech/asr1/run.sh diff --git a/examples/librispeech/s1/utils b/examples/librispeech/asr1/utils similarity index 100% rename from examples/librispeech/s1/utils rename to examples/librispeech/asr1/utils diff --git a/examples/librispeech/s2/.gitignore b/examples/librispeech/asr2/.gitignore similarity index 100% rename from examples/librispeech/s2/.gitignore rename to examples/librispeech/asr2/.gitignore diff --git a/examples/librispeech/s2/README.md b/examples/librispeech/asr2/README.md similarity index 100% rename from examples/librispeech/s2/README.md rename to examples/librispeech/asr2/README.md diff --git a/examples/librispeech/s2/cmd.sh b/examples/librispeech/asr2/cmd.sh similarity index 100% rename from examples/librispeech/s2/cmd.sh rename to examples/librispeech/asr2/cmd.sh diff --git a/examples/librispeech/s2/conf/augmentation.json b/examples/librispeech/asr2/conf/augmentation.json similarity index 100% rename from examples/librispeech/s2/conf/augmentation.json rename to examples/librispeech/asr2/conf/augmentation.json diff --git a/examples/librispeech/s2/conf/decode/decode.yaml b/examples/librispeech/asr2/conf/decode/decode.yaml similarity index 100% rename from examples/librispeech/s2/conf/decode/decode.yaml rename to examples/librispeech/asr2/conf/decode/decode.yaml diff --git a/examples/librispeech/s2/conf/decode/decode_att.yaml b/examples/librispeech/asr2/conf/decode/decode_att.yaml similarity index 100% rename from examples/librispeech/s2/conf/decode/decode_att.yaml rename to examples/librispeech/asr2/conf/decode/decode_att.yaml diff --git a/examples/librispeech/s2/conf/decode/decode_ctc.yaml b/examples/librispeech/asr2/conf/decode/decode_ctc.yaml similarity index 100% rename from examples/librispeech/s2/conf/decode/decode_ctc.yaml rename to examples/librispeech/asr2/conf/decode/decode_ctc.yaml diff --git a/examples/librispeech/s2/conf/decode/decode_wo_lm.yaml b/examples/librispeech/asr2/conf/decode/decode_wo_lm.yaml similarity index 100% rename from examples/librispeech/s2/conf/decode/decode_wo_lm.yaml 
rename to examples/librispeech/asr2/conf/decode/decode_wo_lm.yaml diff --git a/examples/librispeech/s2/conf/fbank.conf b/examples/librispeech/asr2/conf/fbank.conf similarity index 100% rename from examples/librispeech/s2/conf/fbank.conf rename to examples/librispeech/asr2/conf/fbank.conf diff --git a/examples/librispeech/s2/conf/lm/transformer.yaml b/examples/librispeech/asr2/conf/lm/transformer.yaml similarity index 100% rename from examples/librispeech/s2/conf/lm/transformer.yaml rename to examples/librispeech/asr2/conf/lm/transformer.yaml diff --git a/examples/librispeech/s2/conf/pitch.conf b/examples/librispeech/asr2/conf/pitch.conf similarity index 100% rename from examples/librispeech/s2/conf/pitch.conf rename to examples/librispeech/asr2/conf/pitch.conf diff --git a/examples/librispeech/s2/conf/transformer.yaml b/examples/librispeech/asr2/conf/transformer.yaml similarity index 100% rename from examples/librispeech/s2/conf/transformer.yaml rename to examples/librispeech/asr2/conf/transformer.yaml diff --git a/examples/librispeech/s2/local/align.sh b/examples/librispeech/asr2/local/align.sh similarity index 100% rename from examples/librispeech/s2/local/align.sh rename to examples/librispeech/asr2/local/align.sh diff --git a/examples/librispeech/s2/local/cacu_perplexity.sh b/examples/librispeech/asr2/local/cacu_perplexity.sh similarity index 100% rename from examples/librispeech/s2/local/cacu_perplexity.sh rename to examples/librispeech/asr2/local/cacu_perplexity.sh diff --git a/examples/librispeech/s2/local/data.sh b/examples/librispeech/asr2/local/data.sh similarity index 100% rename from examples/librispeech/s2/local/data.sh rename to examples/librispeech/asr2/local/data.sh diff --git a/examples/librispeech/s2/local/data_prep.sh b/examples/librispeech/asr2/local/data_prep.sh similarity index 100% rename from examples/librispeech/s2/local/data_prep.sh rename to examples/librispeech/asr2/local/data_prep.sh diff --git a/examples/librispeech/s2/local/download_lm_en.sh b/examples/librispeech/asr2/local/download_lm_en.sh similarity index 100% rename from examples/librispeech/s2/local/download_lm_en.sh rename to examples/librispeech/asr2/local/download_lm_en.sh diff --git a/examples/librispeech/s2/local/espnet_json_to_manifest.py b/examples/librispeech/asr2/local/espnet_json_to_manifest.py similarity index 100% rename from examples/librispeech/s2/local/espnet_json_to_manifest.py rename to examples/librispeech/asr2/local/espnet_json_to_manifest.py diff --git a/examples/librispeech/s2/local/export.sh b/examples/librispeech/asr2/local/export.sh similarity index 100% rename from examples/librispeech/s2/local/export.sh rename to examples/librispeech/asr2/local/export.sh diff --git a/examples/librispeech/s2/local/recog.sh b/examples/librispeech/asr2/local/recog.sh similarity index 100% rename from examples/librispeech/s2/local/recog.sh rename to examples/librispeech/asr2/local/recog.sh diff --git a/examples/librispeech/s2/local/test.sh b/examples/librispeech/asr2/local/test.sh similarity index 100% rename from examples/librispeech/s2/local/test.sh rename to examples/librispeech/asr2/local/test.sh diff --git a/examples/librispeech/s2/local/train.sh b/examples/librispeech/asr2/local/train.sh similarity index 100% rename from examples/librispeech/s2/local/train.sh rename to examples/librispeech/asr2/local/train.sh diff --git a/examples/librispeech/s2/path.sh b/examples/librispeech/asr2/path.sh similarity index 100% rename from examples/librispeech/s2/path.sh rename to 
examples/librispeech/asr2/path.sh diff --git a/examples/librispeech/s2/run.sh b/examples/librispeech/asr2/run.sh similarity index 100% rename from examples/librispeech/s2/run.sh rename to examples/librispeech/asr2/run.sh diff --git a/examples/librispeech/s2/steps b/examples/librispeech/asr2/steps similarity index 100% rename from examples/librispeech/s2/steps rename to examples/librispeech/asr2/steps diff --git a/examples/librispeech/s2/utils b/examples/librispeech/asr2/utils similarity index 100% rename from examples/librispeech/s2/utils rename to examples/librispeech/asr2/utils diff --git a/examples/ljspeech/tts0/README.md b/examples/ljspeech/tts0/README.md index 09fd0c13..305add20 100644 --- a/examples/ljspeech/tts0/README.md +++ b/examples/ljspeech/tts0/README.md @@ -80,6 +80,6 @@ optional arguments: ## Pretrained Models Pretrained Models can be downloaded from links below. We provide 2 models with different configurations. -1. This model use a binary classifier to predict the stop token. [tacotron2_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_ljspeech_ckpt_0.3.zip) +1. This model use a binary classifier to predict the stop token. [tacotron2_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_ljspeech_ckpt_0.3.zip) -2. This model does not have a stop token predictor. It uses the attention peak position to decided whether all the contents have been uttered. Also guided attention loss is used to speed up training. This model is trained with `configs/alternative.yaml`.[tacotron2_ljspeech_ckpt_0.3_alternative.zip](https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_ljspeech_ckpt_0.3_alternative.zip) +2. This model does not have a stop token predictor. It uses the attention peak position to decided whether all the contents have been uttered. Also guided attention loss is used to speed up training. This model is trained with `configs/alternative.yaml`.[tacotron2_ljspeech_ckpt_0.3_alternative.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_ljspeech_ckpt_0.3_alternative.zip) diff --git a/examples/ljspeech/tts1/README.md b/examples/ljspeech/tts1/README.md index 12e43e2e..8a43ecd9 100644 --- a/examples/ljspeech/tts1/README.md +++ b/examples/ljspeech/tts1/README.md @@ -79,7 +79,7 @@ optional arguments: ## Synthesize We use [waveflow](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/voc0) as the neural vocoder. -Download Pretrained WaveFlow Model with residual channel equals 128 from [waveflow_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_ljspeech_ckpt_0.3.zip) and unzip it. +Download Pretrained WaveFlow Model with residual channel equals 128 from [waveflow_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/waveflow/waveflow_ljspeech_ckpt_0.3.zip) and unzip it. ```bash unzip waveflow_ljspeech_ckpt_0.3.zip ``` @@ -173,7 +173,7 @@ optional arguments: 6. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. ## Pretrained Model -Pretrained Model can be downloaded here. [transformer_tts_ljspeech_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/transformer_tts_ljspeech_ckpt_0.4.zip) +Pretrained Model can be downloaded here. [transformer_tts_ljspeech_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/transformer_tts/transformer_tts_ljspeech_ckpt_0.4.zip) TransformerTTS checkpoint contains files listed below. 
```text diff --git a/examples/ljspeech/tts3/README.md b/examples/ljspeech/tts3/README.md index cda53541..5bdaf4b8 100644 --- a/examples/ljspeech/tts3/README.md +++ b/examples/ljspeech/tts3/README.md @@ -87,7 +87,7 @@ optional arguments: ### Synthesize We use [parallel wavegan](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/voc1) as the neural vocoder. -Download pretrained parallel wavegan model from [pwg_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_ljspeech_ckpt_0.5.zip) and unzip it. +Download pretrained parallel wavegan model from [pwg_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_ljspeech_ckpt_0.5.zip) and unzip it. ```bash unzip pwg_ljspeech_ckpt_0.5.zip ``` @@ -191,7 +191,7 @@ optional arguments: 6. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. ## Pretrained Model -Pretrained FastSpeech2 model with no silence in the edge of audios. [fastspeech2_nosil_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_ljspeech_ckpt_0.5.zip) +Pretrained FastSpeech2 model with no silence in the edge of audios. [fastspeech2_nosil_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_ljspeech_ckpt_0.5.zip) FastSpeech2 checkpoint contains files listed below. ```text diff --git a/examples/ljspeech/voc0/README.md b/examples/ljspeech/voc0/README.md index 09856c36..0d4e6c51 100644 --- a/examples/ljspeech/voc0/README.md +++ b/examples/ljspeech/voc0/README.md @@ -48,4 +48,4 @@ Synthesize waveform. 6. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. ## Pretrained Model -Pretrained Model with residual channel equals 128 can be downloaded here. [waveflow_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_ljspeech_ckpt_0.3.zip). +Pretrained Model with residual channel equals 128 can be downloaded here. [waveflow_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/waveflow/waveflow_ljspeech_ckpt_0.3.zip). diff --git a/examples/ljspeech/voc1/README.md b/examples/ljspeech/voc1/README.md index 0506d5d8..24f6dbca 100644 --- a/examples/ljspeech/voc1/README.md +++ b/examples/ljspeech/voc1/README.md @@ -123,7 +123,7 @@ optional arguments: 5. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. ## Pretrained Models -Pretrained models can be downloaded here. [pwg_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_ljspeech_ckpt_0.5.zip) +Pretrained models can be downloaded here. [pwg_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_ljspeech_ckpt_0.5.zip) Parallel WaveGAN checkpoint contains files listed below. 
diff --git a/examples/other/1xt2x/aishell/local/data.sh b/examples/other/1xt2x/aishell/local/data.sh index 0bf35e1f..85574260 100755 --- a/examples/other/1xt2x/aishell/local/data.sh +++ b/examples/other/1xt2x/aishell/local/data.sh @@ -50,7 +50,6 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then for dataset in train dev test; do { python3 ${MAIN_ROOT}/utils/format_data.py \ - --feat_type "raw" \ --cmvn_path "data/mean_std.npz" \ --unit_type "char" \ --vocab_path="data/vocab.txt" \ diff --git a/examples/other/1xt2x/baidu_en8k/local/data.sh b/examples/other/1xt2x/baidu_en8k/local/data.sh index f0bde77f..8e378ff0 100755 --- a/examples/other/1xt2x/baidu_en8k/local/data.sh +++ b/examples/other/1xt2x/baidu_en8k/local/data.sh @@ -65,7 +65,6 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then for set in train dev test dev-clean dev-other test-clean test-other; do { python3 ${MAIN_ROOT}/utils/format_data.py \ - --feat_type "raw" \ --cmvn_path "data/mean_std.npz" \ --unit_type ${unit_type} \ --vocab_path="data/vocab.txt" \ diff --git a/examples/other/1xt2x/librispeech/local/data.sh b/examples/other/1xt2x/librispeech/local/data.sh index 6f9bc556..7387472d 100755 --- a/examples/other/1xt2x/librispeech/local/data.sh +++ b/examples/other/1xt2x/librispeech/local/data.sh @@ -63,7 +63,6 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then for set in train dev test dev-clean dev-other test-clean test-other; do { python3 ${MAIN_ROOT}/utils/format_data.py \ - --feat_type "raw" \ --cmvn_path "data/mean_std.npz" \ --unit_type ${unit_type} \ --vocab_path="data/vocab.txt" \ diff --git a/examples/other/ge2e/README.md b/examples/other/ge2e/README.md index d86c8c13..d58ca513 100644 --- a/examples/other/ge2e/README.md +++ b/examples/other/ge2e/README.md @@ -95,7 +95,7 @@ In `${BIN_DIR}/inference.py`: ## Pretrained Model The pretrained model is first trained to 1560k steps at Librispeech-other-500 and voxceleb1. Then trained at aidatatang_200h and magic_data to 3000k steps. -Download URL [ge2e_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/ge2e_ckpt_0.3.zip). +Download URL [ge2e_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/ge2e/ge2e_ckpt_0.3.zip). 
## References diff --git a/examples/ted_en_zh/README.md b/examples/ted_en_zh/README.md index 5664b06b..6d6886da 100644 --- a/examples/ted_en_zh/README.md +++ b/examples/ted_en_zh/README.md @@ -1,3 +1,3 @@ # TED En -> Zh -* t0 for u2 speech translation +* st0 - conformer/transformer speech translation diff --git a/examples/ted_en_zh/st0/.gitignore b/examples/ted_en_zh/st0/.gitignore new file mode 100644 index 00000000..469c6171 --- /dev/null +++ b/examples/ted_en_zh/st0/.gitignore @@ -0,0 +1,3 @@ +TED-En-Zh +data +exp diff --git a/examples/ted_en_zh/t0/README.md b/examples/ted_en_zh/st0/README.md similarity index 100% rename from examples/ted_en_zh/t0/README.md rename to examples/ted_en_zh/st0/README.md diff --git a/examples/ted_en_zh/t0/conf/transformer.yaml b/examples/ted_en_zh/st0/conf/transformer.yaml similarity index 100% rename from examples/ted_en_zh/t0/conf/transformer.yaml rename to examples/ted_en_zh/st0/conf/transformer.yaml diff --git a/examples/ted_en_zh/t0/conf/transformer_joint_noam.yaml b/examples/ted_en_zh/st0/conf/transformer_joint_noam.yaml similarity index 100% rename from examples/ted_en_zh/t0/conf/transformer_joint_noam.yaml rename to examples/ted_en_zh/st0/conf/transformer_joint_noam.yaml diff --git a/examples/ted_en_zh/t0/local/data.sh b/examples/ted_en_zh/st0/local/data.sh similarity index 91% rename from examples/ted_en_zh/t0/local/data.sh rename to examples/ted_en_zh/st0/local/data.sh index b080a5b4..d3acbd44 100755 --- a/examples/ted_en_zh/t0/local/data.sh +++ b/examples/ted_en_zh/st0/local/data.sh @@ -9,7 +9,7 @@ stop_stage=100 nbpe=8000 bpemode=unigram bpeprefix="data/bpe_${bpemode}_${nbpe}" -data_dir=./TED_EnZh +data_dir=./TED-En-Zh source ${MAIN_ROOT}/utils/parse_options.sh @@ -21,7 +21,7 @@ mkdir -p data if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then if [ ! -e ${data_dir} ]; then - echo "Error: Dataset is not avaiable. Please download and unzip the dataset" + echo "Error: ${data_dir} Dataset is not avaiable. Please download and unzip the dataset" echo "Download Link: https://pan.baidu.com/s/18L-59wgeS96WkObISrytQQ Passwd: bva0" echo "The tree of the directory should be:" echo "." 
@@ -54,8 +54,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then --feat_dim=80 \ --delta_delta=false \ --sample_rate=16000 \ - --stride_ms=10.0 \ - --window_ms=25.0 \ + --stride_ms=10 \ + --window_ms=25 \ --use_dB_normalization=False \ --num_workers=${num_workers} \ --output_path="data/mean_std.json" @@ -88,8 +88,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then # format manifest with tokenids, vocab size for set in train dev test; do { - python3 ${MAIN_ROOT}/utils/format_triplet_data.py \ - --feat_type "raw" \ + python3 ${MAIN_ROOT}/utils/format_data.py \ --cmvn_path "data/mean_std.json" \ --unit_type "spm" \ --spm_model_prefix ${bpeprefix} \ diff --git a/examples/ted_en_zh/t0/local/test.sh b/examples/ted_en_zh/st0/local/test.sh similarity index 100% rename from examples/ted_en_zh/t0/local/test.sh rename to examples/ted_en_zh/st0/local/test.sh diff --git a/examples/ted_en_zh/t0/local/train.sh b/examples/ted_en_zh/st0/local/train.sh similarity index 100% rename from examples/ted_en_zh/t0/local/train.sh rename to examples/ted_en_zh/st0/local/train.sh diff --git a/examples/ted_en_zh/t0/path.sh b/examples/ted_en_zh/st0/path.sh similarity index 100% rename from examples/ted_en_zh/t0/path.sh rename to examples/ted_en_zh/st0/path.sh diff --git a/examples/ted_en_zh/t0/run.sh b/examples/ted_en_zh/st0/run.sh similarity index 93% rename from examples/ted_en_zh/t0/run.sh rename to examples/ted_en_zh/st0/run.sh index ed9ab5f8..fb4bc338 100755 --- a/examples/ted_en_zh/t0/run.sh +++ b/examples/ted_en_zh/st0/run.sh @@ -22,7 +22,7 @@ fi if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then # train model, all `ckpt` under `exp` dir - CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt} + CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt} fi if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then diff --git a/examples/ted_en_zh/t0/.gitignore b/examples/ted_en_zh/t0/.gitignore deleted file mode 100644 index 123e5174..00000000 --- a/examples/ted_en_zh/t0/.gitignore +++ /dev/null @@ -1,3 +0,0 @@ -TED_EnZh -data -exp diff --git a/examples/thchs30/README.md b/examples/thchs30/README.md index 7b3cc3d9..9a0026a0 100644 --- a/examples/thchs30/README.md +++ b/examples/thchs30/README.md @@ -1,3 +1,3 @@ # thchs30 -* a0 for mfa alignment +* align0 - mfa alignment diff --git a/examples/thchs30/a0/README.md b/examples/thchs30/align0/README.md similarity index 100% rename from examples/thchs30/a0/README.md rename to examples/thchs30/align0/README.md diff --git a/examples/thchs30/a0/data/dict/syllable.lexicon b/examples/thchs30/align0/data/dict/syllable.lexicon similarity index 100% rename from examples/thchs30/a0/data/dict/syllable.lexicon rename to examples/thchs30/align0/data/dict/syllable.lexicon diff --git a/examples/thchs30/a0/local/data.sh b/examples/thchs30/align0/local/data.sh similarity index 100% rename from examples/thchs30/a0/local/data.sh rename to examples/thchs30/align0/local/data.sh diff --git a/examples/thchs30/a0/local/gen_word2phone.py b/examples/thchs30/align0/local/gen_word2phone.py similarity index 100% rename from examples/thchs30/a0/local/gen_word2phone.py rename to examples/thchs30/align0/local/gen_word2phone.py diff --git a/examples/thchs30/a0/local/reorganize_thchs30.py b/examples/thchs30/align0/local/reorganize_thchs30.py similarity index 100% rename from examples/thchs30/a0/local/reorganize_thchs30.py rename to examples/thchs30/align0/local/reorganize_thchs30.py diff --git a/examples/thchs30/a0/path.sh b/examples/thchs30/align0/path.sh similarity 
index 100% rename from examples/thchs30/a0/path.sh rename to examples/thchs30/align0/path.sh diff --git a/examples/thchs30/a0/run.sh b/examples/thchs30/align0/run.sh similarity index 100% rename from examples/thchs30/a0/run.sh rename to examples/thchs30/align0/run.sh diff --git a/examples/timit/README.md b/examples/timit/README.md index b7c8b754..51fcfd57 100644 --- a/examples/timit/README.md +++ b/examples/timit/README.md @@ -1,3 +1,7 @@ # TIMIT -* s1 u2 model with phone unit +asr model with phone unit + +* asr0 - deepspeech2 Streaming/Non-Streaming +* asr1 - transformer/conformer Streaming/Non-Streaming +* asr2 - transformer/conformer Streaming/Non-Streaming with Kaldi feature diff --git a/examples/timit/s1/.gitignore b/examples/timit/asr1/.gitignore similarity index 100% rename from examples/timit/s1/.gitignore rename to examples/timit/asr1/.gitignore diff --git a/examples/timit/s1/README.md b/examples/timit/asr1/README.md similarity index 100% rename from examples/timit/s1/README.md rename to examples/timit/asr1/README.md diff --git a/examples/timit/s1/conf/augmentation.json b/examples/timit/asr1/conf/augmentation.json similarity index 100% rename from examples/timit/s1/conf/augmentation.json rename to examples/timit/asr1/conf/augmentation.json diff --git a/examples/timit/s1/conf/dev_spk.list b/examples/timit/asr1/conf/dev_spk.list similarity index 100% rename from examples/timit/s1/conf/dev_spk.list rename to examples/timit/asr1/conf/dev_spk.list diff --git a/examples/timit/s1/conf/phones.60-48-39.map b/examples/timit/asr1/conf/phones.60-48-39.map similarity index 100% rename from examples/timit/s1/conf/phones.60-48-39.map rename to examples/timit/asr1/conf/phones.60-48-39.map diff --git a/examples/timit/asr1/conf/preprocess.yaml b/examples/timit/asr1/conf/preprocess.yaml new file mode 100644 index 00000000..dd4cfd27 --- /dev/null +++ b/examples/timit/asr1/conf/preprocess.yaml @@ -0,0 +1,29 @@ +process: + # extract kaldi fbank from PCM + - type: fbank_kaldi + fs: 16000 + n_mels: 80 + n_shift: 160 + win_length: 400 + dither: true + - type: cmvn_json + cmvn_path: data/mean_std.json + # these three processes are a.k.a. 
SpecAugument + - type: time_warp + max_time_warp: 5 + inplace: true + mode: PIL + - type: freq_mask + F: 30 + n_mask: 2 + inplace: true + replace_with_zero: false + - type: time_mask + T: 40 + n_mask: 2 + inplace: true + replace_with_zero: false + + + + diff --git a/examples/timit/s1/conf/test_spk.list b/examples/timit/asr1/conf/test_spk.list similarity index 100% rename from examples/timit/s1/conf/test_spk.list rename to examples/timit/asr1/conf/test_spk.list diff --git a/examples/timit/s1/conf/transformer.yaml b/examples/timit/asr1/conf/transformer.yaml similarity index 97% rename from examples/timit/s1/conf/transformer.yaml rename to examples/timit/asr1/conf/transformer.yaml index d3ced898..1d18468b 100644 --- a/examples/timit/s1/conf/transformer.yaml +++ b/examples/timit/asr1/conf/transformer.yaml @@ -14,7 +14,7 @@ collator: vocab_filepath: data/vocab.txt unit_type: "word" mean_std_filepath: "" - augmentation_config: "" + augmentation_config: conf/preprocess.yaml batch_size: 64 raw_wav: True # use raw_wav or kaldi feature spectrum_type: fbank #linear, mfcc, fbank @@ -37,7 +37,7 @@ collator: # network architecture model: - cmvn_file: "data/mean_std.json" + cmvn_file: cmvn_file_type: "json" # encoder related encoder: transformer diff --git a/examples/timit/s1/local/align.sh b/examples/timit/asr1/local/align.sh similarity index 100% rename from examples/timit/s1/local/align.sh rename to examples/timit/asr1/local/align.sh diff --git a/examples/timit/s1/local/data.sh b/examples/timit/asr1/local/data.sh similarity index 96% rename from examples/timit/s1/local/data.sh rename to examples/timit/asr1/local/data.sh index ad4ddde3..e588e48d 100755 --- a/examples/timit/s1/local/data.sh +++ b/examples/timit/asr1/local/data.sh @@ -35,8 +35,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then --feat_dim=80 \ --delta_delta=false \ --sample_rate=16000 \ - --stride_ms=10.0 \ - --window_ms=25.0 \ + --stride_ms=10 \ + --window_ms=25 \ --use_dB_normalization=False \ --num_workers=${num_workers} \ --output_path="data/mean_std.json" @@ -66,7 +66,6 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then for set in train dev test; do { python3 ${MAIN_ROOT}/utils/format_data.py \ - --feat_type "raw" \ --cmvn_path "data/mean_std.json" \ --unit_type ${unit_type} \ --vocab_path="data/vocab.txt" \ diff --git a/examples/timit/s1/local/export.sh b/examples/timit/asr1/local/export.sh similarity index 100% rename from examples/timit/s1/local/export.sh rename to examples/timit/asr1/local/export.sh diff --git a/examples/timit/s1/local/test.sh b/examples/timit/asr1/local/test.sh similarity index 100% rename from examples/timit/s1/local/test.sh rename to examples/timit/asr1/local/test.sh diff --git a/examples/timit/s1/local/timit_data_prep.sh b/examples/timit/asr1/local/timit_data_prep.sh similarity index 100% rename from examples/timit/s1/local/timit_data_prep.sh rename to examples/timit/asr1/local/timit_data_prep.sh diff --git a/examples/timit/s1/local/timit_norm_trans.pl b/examples/timit/asr1/local/timit_norm_trans.pl similarity index 100% rename from examples/timit/s1/local/timit_norm_trans.pl rename to examples/timit/asr1/local/timit_norm_trans.pl diff --git a/examples/timit/s1/local/train.sh b/examples/timit/asr1/local/train.sh similarity index 100% rename from examples/timit/s1/local/train.sh rename to examples/timit/asr1/local/train.sh diff --git a/examples/timit/s1/path.sh b/examples/timit/asr1/path.sh similarity index 100% rename from examples/timit/s1/path.sh rename to examples/timit/asr1/path.sh diff 
--git a/examples/timit/s1/run.sh b/examples/timit/asr1/run.sh similarity index 100% rename from examples/timit/s1/run.sh rename to examples/timit/asr1/run.sh diff --git a/examples/tiny/README.md b/examples/tiny/README.md index 6766f59a..f36baae6 100644 --- a/examples/tiny/README.md +++ b/examples/tiny/README.md @@ -1,2 +1,3 @@ -* s0 for deepspeech2 -* s1 for U2 +* asr0 - deepspeech2 Streaming/Non-Streaming +* asr1 - transformer/conformer Streaming/Non-Streaming +* asr2 - transformer/conformer Streaming/Non-Streaming with Kaldi feature diff --git a/examples/tiny/s0/.gitignore b/examples/tiny/asr0/.gitignore similarity index 100% rename from examples/tiny/s0/.gitignore rename to examples/tiny/asr0/.gitignore diff --git a/examples/tiny/s0/README.md b/examples/tiny/asr0/README.md similarity index 100% rename from examples/tiny/s0/README.md rename to examples/tiny/asr0/README.md diff --git a/examples/tiny/s0/conf/augmentation.json b/examples/tiny/asr0/conf/augmentation.json similarity index 100% rename from examples/tiny/s0/conf/augmentation.json rename to examples/tiny/asr0/conf/augmentation.json diff --git a/examples/tiny/s0/conf/deepspeech2.yaml b/examples/tiny/asr0/conf/deepspeech2.yaml similarity index 100% rename from examples/tiny/s0/conf/deepspeech2.yaml rename to examples/tiny/asr0/conf/deepspeech2.yaml diff --git a/examples/tiny/s0/conf/deepspeech2_online.yaml b/examples/tiny/asr0/conf/deepspeech2_online.yaml similarity index 100% rename from examples/tiny/s0/conf/deepspeech2_online.yaml rename to examples/tiny/asr0/conf/deepspeech2_online.yaml diff --git a/examples/tiny/s0/local/data.sh b/examples/tiny/asr0/local/data.sh similarity index 96% rename from examples/tiny/s0/local/data.sh rename to examples/tiny/asr0/local/data.sh index 711ebee4..f1fb8cb1 100755 --- a/examples/tiny/s0/local/data.sh +++ b/examples/tiny/asr0/local/data.sh @@ -34,8 +34,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then --spectrum_type="linear" \ --delta_delta=false \ --sample_rate=16000 \ - --stride_ms=10.0 \ - --window_ms=20.0 \ + --stride_ms=10 \ + --window_ms=20 \ --use_dB_normalization=False \ --num_workers=2 \ --output_path="data/mean_std.json" @@ -63,7 +63,6 @@ fi if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then # format manifest with tokenids, vocab size python3 ${MAIN_ROOT}/utils/format_data.py \ - --feat_type "raw" \ --cmvn_path "data/mean_std.json" \ --unit_type ${unit_type} \ --vocab_path="data/vocab.txt" \ diff --git a/examples/tiny/s0/local/download_lm_en.sh b/examples/tiny/asr0/local/download_lm_en.sh similarity index 100% rename from examples/tiny/s0/local/download_lm_en.sh rename to examples/tiny/asr0/local/download_lm_en.sh diff --git a/examples/tiny/s0/local/export.sh b/examples/tiny/asr0/local/export.sh similarity index 100% rename from examples/tiny/s0/local/export.sh rename to examples/tiny/asr0/local/export.sh diff --git a/examples/tiny/s0/local/test.sh b/examples/tiny/asr0/local/test.sh similarity index 100% rename from examples/tiny/s0/local/test.sh rename to examples/tiny/asr0/local/test.sh diff --git a/examples/tiny/s0/local/train.sh b/examples/tiny/asr0/local/train.sh similarity index 100% rename from examples/tiny/s0/local/train.sh rename to examples/tiny/asr0/local/train.sh diff --git a/examples/tiny/s0/path.sh b/examples/tiny/asr0/path.sh similarity index 100% rename from examples/tiny/s0/path.sh rename to examples/tiny/asr0/path.sh diff --git a/examples/tiny/s0/run.sh b/examples/tiny/asr0/run.sh similarity index 100% rename from examples/tiny/s0/run.sh rename to 
examples/tiny/asr0/run.sh diff --git a/examples/tiny/s1/.gitignore b/examples/tiny/asr1/.gitignore similarity index 100% rename from examples/tiny/s1/.gitignore rename to examples/tiny/asr1/.gitignore diff --git a/examples/tiny/s1/conf/augmentation.json b/examples/tiny/asr1/conf/augmentation.json similarity index 100% rename from examples/tiny/s1/conf/augmentation.json rename to examples/tiny/asr1/conf/augmentation.json diff --git a/examples/tiny/s1/conf/chunk_confermer.yaml b/examples/tiny/asr1/conf/chunk_confermer.yaml similarity index 98% rename from examples/tiny/s1/conf/chunk_confermer.yaml rename to examples/tiny/asr1/conf/chunk_confermer.yaml index c5186669..6bed27f5 100644 --- a/examples/tiny/s1/conf/chunk_confermer.yaml +++ b/examples/tiny/asr1/conf/chunk_confermer.yaml @@ -15,7 +15,7 @@ collator: vocab_filepath: data/vocab.txt unit_type: 'spm' spm_model_prefix: 'data/bpe_unigram_200' - augmentation_config: conf/augmentation.json + augmentation_config: conf/preprocess.yaml batch_size: 4 raw_wav: True # use raw_wav or kaldi feature spectrum_type: fbank #linear, mfcc, fbank diff --git a/examples/tiny/s1/conf/chunk_transformer.yaml b/examples/tiny/asr1/conf/chunk_transformer.yaml similarity index 98% rename from examples/tiny/s1/conf/chunk_transformer.yaml rename to examples/tiny/asr1/conf/chunk_transformer.yaml index 29c30b26..7aed1b19 100644 --- a/examples/tiny/s1/conf/chunk_transformer.yaml +++ b/examples/tiny/asr1/conf/chunk_transformer.yaml @@ -15,7 +15,7 @@ collator: vocab_filepath: data/vocab.txt unit_type: 'spm' spm_model_prefix: 'data/bpe_unigram_200' - augmentation_config: conf/augmentation.json + augmentation_config: conf/preprocess.yaml batch_size: 4 raw_wav: True # use raw_wav or kaldi feature spectrum_type: fbank #linear, mfcc, fbank diff --git a/examples/tiny/s1/conf/conformer.yaml b/examples/tiny/asr1/conf/conformer.yaml similarity index 98% rename from examples/tiny/s1/conf/conformer.yaml rename to examples/tiny/asr1/conf/conformer.yaml index 8487da77..2c09b3ae 100644 --- a/examples/tiny/s1/conf/conformer.yaml +++ b/examples/tiny/asr1/conf/conformer.yaml @@ -15,7 +15,7 @@ collator: vocab_filepath: data/vocab.txt unit_type: 'spm' spm_model_prefix: 'data/bpe_unigram_200' - augmentation_config: conf/augmentation.json + augmentation_config: conf/preprocess.yaml batch_size: 4 raw_wav: True # use raw_wav or kaldi feature spectrum_type: fbank #linear, mfcc, fbank diff --git a/examples/tiny/asr1/conf/preprocess.yaml b/examples/tiny/asr1/conf/preprocess.yaml new file mode 100644 index 00000000..dd4cfd27 --- /dev/null +++ b/examples/tiny/asr1/conf/preprocess.yaml @@ -0,0 +1,29 @@ +process: + # extract kaldi fbank from PCM + - type: fbank_kaldi + fs: 16000 + n_mels: 80 + n_shift: 160 + win_length: 400 + dither: true + - type: cmvn_json + cmvn_path: data/mean_std.json + # these three processes are a.k.a. 
SpecAugument + - type: time_warp + max_time_warp: 5 + inplace: true + mode: PIL + - type: freq_mask + F: 30 + n_mask: 2 + inplace: true + replace_with_zero: false + - type: time_mask + T: 40 + n_mask: 2 + inplace: true + replace_with_zero: false + + + + diff --git a/examples/tiny/s1/conf/transformer.yaml b/examples/tiny/asr1/conf/transformer.yaml similarity index 96% rename from examples/tiny/s1/conf/transformer.yaml rename to examples/tiny/asr1/conf/transformer.yaml index cc9b5c51..1378e848 100644 --- a/examples/tiny/s1/conf/transformer.yaml +++ b/examples/tiny/asr1/conf/transformer.yaml @@ -11,11 +11,11 @@ data: max_output_input_ratio: 10.0 collator: - mean_std_filepath: "" + mean_std_filepath: data/mean_std.json vocab_filepath: data/vocab.txt unit_type: 'spm' spm_model_prefix: 'data/bpe_unigram_200' - augmentation_config: conf/augmentation.json + augmentation_config: conf/preprocess.yaml batch_size: 4 raw_wav: True # use raw_wav or kaldi feature spectrum_type: fbank #linear, mfcc, fbank @@ -37,7 +37,7 @@ collator: # network architecture model: - cmvn_file: "data/mean_std.json" + cmvn_file: cmvn_file_type: "json" # encoder related encoder: transformer diff --git a/examples/tiny/s1/local/align.sh b/examples/tiny/asr1/local/align.sh similarity index 100% rename from examples/tiny/s1/local/align.sh rename to examples/tiny/asr1/local/align.sh diff --git a/examples/tiny/s1/local/data.sh b/examples/tiny/asr1/local/data.sh similarity index 96% rename from examples/tiny/s1/local/data.sh rename to examples/tiny/asr1/local/data.sh index b25f993f..87539d5e 100755 --- a/examples/tiny/s1/local/data.sh +++ b/examples/tiny/asr1/local/data.sh @@ -38,8 +38,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then --feat_dim=80 \ --delta_delta=false \ --sample_rate=16000 \ - --stride_ms=10.0 \ - --window_ms=25.0 \ + --stride_ms=10 \ + --window_ms=25 \ --use_dB_normalization=False \ --num_workers=2 \ --output_path="data/mean_std.json" @@ -69,7 +69,6 @@ fi if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then # format manifest with tokenids, vocab size python3 ${MAIN_ROOT}/utils/format_data.py \ - --feat_type "raw" \ --cmvn_path "data/mean_std.json" \ --unit_type "spm" \ --spm_model_prefix ${bpeprefix} \ diff --git a/examples/tiny/s1/local/export.sh b/examples/tiny/asr1/local/export.sh similarity index 100% rename from examples/tiny/s1/local/export.sh rename to examples/tiny/asr1/local/export.sh diff --git a/examples/tiny/s1/local/test.sh b/examples/tiny/asr1/local/test.sh similarity index 100% rename from examples/tiny/s1/local/test.sh rename to examples/tiny/asr1/local/test.sh diff --git a/examples/tiny/s1/local/train.sh b/examples/tiny/asr1/local/train.sh similarity index 100% rename from examples/tiny/s1/local/train.sh rename to examples/tiny/asr1/local/train.sh diff --git a/examples/tiny/s1/path.sh b/examples/tiny/asr1/path.sh similarity index 100% rename from examples/tiny/s1/path.sh rename to examples/tiny/asr1/path.sh diff --git a/examples/tiny/s1/run.sh b/examples/tiny/asr1/run.sh similarity index 100% rename from examples/tiny/s1/run.sh rename to examples/tiny/asr1/run.sh diff --git a/examples/vctk/tts3/README.md b/examples/vctk/tts3/README.md index 334372f9..894d6b14 100644 --- a/examples/vctk/tts3/README.md +++ b/examples/vctk/tts3/README.md @@ -90,7 +90,7 @@ optional arguments: ### Synthesize We use [parallel wavegan](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/vctk/voc1) as the neural vocoder. 
-Download pretrained parallel wavegan model from [pwg_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_vctk_ckpt_0.5.zip)and unzip it. +Download pretrained parallel wavegan model from [pwg_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_vctk_ckpt_0.5.zip)and unzip it. ```bash unzip pwg_vctk_ckpt_0.5.zip ``` @@ -196,7 +196,7 @@ optional arguments: 6. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. ## Pretrained Model -Pretrained FastSpeech2 model with no silence in the edge of audios. [fastspeech2_nosil_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_vctk_ckpt_0.5.zip) +Pretrained FastSpeech2 model with no silence in the edge of audios. [fastspeech2_nosil_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_vctk_ckpt_0.5.zip) FastSpeech2 checkpoint contains files listed below. ```text diff --git a/examples/vctk/voc1/README.md b/examples/vctk/voc1/README.md index 5063b869..8692f010 100644 --- a/examples/vctk/voc1/README.md +++ b/examples/vctk/voc1/README.md @@ -127,7 +127,7 @@ optional arguments: 5. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. ## Pretrained Models -Pretrained models can be downloaded here [pwg_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_vctk_ckpt_0.5.zip). +Pretrained models can be downloaded here [pwg_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_vctk_ckpt_0.5.zip). Parallel WaveGAN checkpoint contains files listed below. diff --git a/examples/wenetspeech/README.md b/examples/wenetspeech/README.md new file mode 100644 index 00000000..cbd01eb8 --- /dev/null +++ b/examples/wenetspeech/README.md @@ -0,0 +1,58 @@ +* asr0 - deepspeech2 Streaming/Non-Streaming +* asr1 - transformer/conformer Streaming/Non-Streaming +* asr2 - transformer/conformer Streaming/Non-Streaming with Kaldi feature + +# [WenetSpeech](https://github.com/wenet-e2e/WenetSpeech) + +A 10000+ Hours Multi-domain Chinese Corpus for Speech Recognition + +## Description + +### Creation + +All the data are collected from YouTube and Podcast. Optical character recognition (OCR) and automatic speech recognition (ASR) techniques are adopted to label each YouTube and Podcast recording, respectively. To improve the quality of the corpus, we use a novel end-to-end label error detection method to further validate and filter the data. + +### Categories + +In summary, WenetSpeech groups all data into 3 categories, as the following table shows: + +| Set | Hours | Confidence | Usage | +|------------|-------|-------------|---------------------------------------| +| High Label | 10005 | >=0.95 | Supervised Training | +| Weak Label | 2478 | [0.6, 0.95] | Semi-supervised or noise training | +| Unlabel | 9952 | / | Unsupervised training or Pre-training | +| In Total | 22435 | / | All above | + +### High Label Data + +We classify the high label into 10 groups according to its domain, speaking style, and scenarios. 
+ +| Domain | Youtube | Podcast | Total | +|-------------|---------|---------|--------| +| audiobook | 0 | 250.9 | 250.9 | +| commentary | 112.6 | 135.7 | 248.3 | +| documentary | 386.7 | 90.5 | 477.2 | +| drama | 4338.2 | 0 | 4338.2 | +| interview | 324.2 | 614 | 938.2 | +| news | 0 | 868 | 868 | +| reading | 0 | 1110.2 | 1110.2 | +| talk | 204 | 90.7 | 294.7 | +| variety | 603.3 | 224.5 | 827.8 | +| others | 144 | 507.5 | 651.5 | +| Total | 6113 | 3892 | 10005 | + +As shown in the following table, we provide 3 training subsets, namely `S`, `M` and `L` for building ASR systems on different data scales. + +| Training Subsets | Confidence | Hours | +|------------------|-------------|-------| +| L | [0.95, 1.0] | 10005 | +| M | 1.0 | 1000 | +| S | 1.0 | 100 | + +### Evaluation Sets + +| Evaluation Sets | Hours | Source | Description | +|-----------------|-------|--------------|-----------------------------------------------------------------------------------------| +| DEV | 20 | Internet | Specially designed for some speech tools which require cross-validation set in training | +| TEST\_NET | 23 | Internet | Match test | +| TEST\_MEETING | 15 | Real meeting | Mismatch test which is a far-field, conversational, spontaneous, and meeting dataset | diff --git a/examples/wenetspeech/asr1/.gitignore b/examples/wenetspeech/asr1/.gitignore new file mode 100644 index 00000000..02a22922 --- /dev/null +++ b/examples/wenetspeech/asr1/.gitignore @@ -0,0 +1,3 @@ +data +exp +*.profile diff --git a/examples/wenetspeech/asr1/README.md b/examples/wenetspeech/asr1/README.md new file mode 100644 index 00000000..c08b94e2 --- /dev/null +++ b/examples/wenetspeech/asr1/README.md @@ -0,0 +1,14 @@ +## Pack Model + +pack model to tar.gz, e.g. + +```bash +./utils/pack_model.sh --preprocess_conf conf/preprocess.yaml --dict data/vocab.txt conf/conformer.yaml '' data/mean_std.json exp/conformer/checkpoints/wenetspeec +h.pdparams + +``` + +show model.tar.gz +``` +tar tf model.tar.gz +``` diff --git a/examples/wenetspeech/asr1/RESULTS.md b/examples/wenetspeech/asr1/RESULTS.md new file mode 100644 index 00000000..5c2b8143 --- /dev/null +++ b/examples/wenetspeech/asr1/RESULTS.md @@ -0,0 +1,24 @@ +# WenetSpeech + + +## Conformer + +| Model | Params | Config | Augmentation| Test set | Decode method | Loss | WER | +| --- | --- | --- | --- | --- | --- | --- | --- | +| conformer | 32.52 M | conf/conformer.yaml | spec_aug | dev | attention | | | +| conformer | 32.52 M | conf/conformer.yaml | spec_aug | test net | ctc_greedy_search | | | +| conformer | 32.52 M | conf/conformer.yaml | spec_aug | test meeting | ctc_prefix_beam_search | | | +| conformer | 32.52 M | conf/conformer.yaml | spec_aug | test net | attention_rescoring | | | + + + +## Conformer Pretrain Model + +Pretrain model from http://mobvoi-speech-public.ufile.ucloud.cn/public/wenet/wenetspeech/20211025_conformer_exp.tar.gz + +| Model | Params | Config | Augmentation| Test set | Decode method | Loss | WER | +| --- | --- | --- | --- | --- | --- | --- | --- | +| conformer | 32.52 M | conf/conformer.yaml | spec_aug | aishell1 | attention | - | 0.048456 | +| conformer | 32.52 M | conf/conformer.yaml | spec_aug | aishell1 | ctc_greedy_search | - | 0.052534 | +| conformer | 32.52 M | conf/conformer.yaml | spec_aug | aishell1 | ctc_prefix_beam_search | - | 0.052915 | +| conformer | 32.52 M | conf/conformer.yaml | spec_aug | aishell1 | attention_rescoring | - | 0.047904 | diff --git a/examples/wenetspeech/asr1/conf/conformer.yaml b/examples/wenetspeech/asr1/conf/conformer.yaml 
new file mode 100644 index 00000000..0340dc85 --- /dev/null +++ b/examples/wenetspeech/asr1/conf/conformer.yaml @@ -0,0 +1,113 @@ +# network architecture +model: + # encoder related + encoder: conformer + encoder_conf: + output_size: 512 # dimension of attention + attention_heads: 8 + linear_units: 2048 # the number of units of position-wise feed forward + num_blocks: 12 # the number of encoder blocks + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.0 + input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 + normalize_before: True + use_cnn_module: True + cnn_module_kernel: 15 + cnn_module_norm: layer_norm + activation_type: swish + pos_enc_layer_type: rel_pos + selfattention_layer_type: rel_selfattn + + # decoder related + decoder: transformer + decoder_conf: + attention_heads: 8 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.0 + src_attention_dropout_rate: 0.0 + + # hybrid CTC/attention + model_conf: + ctc_weight: 0.3 + ctc_dropoutrate: 0.0 + ctc_grad_norm_type: null + lsm_weight: 0.1 # label smoothing option + length_normalized_loss: false + +# https://yaml.org/type/float.html +data: + train_manifest: data/manifest.train + dev_manifest: data/manifest.dev + test_manifest: data/manifest.test + min_input_len: 0.1 # second + max_input_len: 12.0 # second + min_output_len: 1.0 + max_output_len: 400.0 + min_output_input_ratio: 0.05 + max_output_input_ratio: 10.0 + +collator: + vocab_filepath: data/vocab.txt + unit_type: 'char' + spm_model_prefix: '' + augmentation_config: conf/preprocess.yaml + batch_size: 64 + raw_wav: True # use raw_wav or kaldi feature + spectrum_type: fbank #linear, mfcc, fbank + feat_dim: 80 + delta_delta: False + dither: 1.0 + target_sample_rate: 16000 + max_freq: None + n_fft: None + stride_ms: 10.0 + window_ms: 25.0 + use_dB_normalization: True + target_dB: -20 + random_seed: 0 + keep_transcription_text: False + sortagrad: True + shuffle_method: batch_shuffle + num_workers: 2 + + +training: + n_epoch: 240 + accum_grad: 16 + global_grad_clip: 5.0 + log_interval: 100 + checkpoint: + kbest_n: 50 + latest_n: 5 + optim: adam + optim_conf: + lr: 0.001 + weight_decay: 1e-6 + scheduler: warmuplr # pytorch v1.1.0+ required + scheduler_conf: + warmup_steps: 5000 + lr_decay: 1.0 + + +decoding: + batch_size: 128 + error_rate_type: cer + decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' + lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm + alpha: 2.5 + beta: 0.3 + beam_size: 10 + cutoff_prob: 1.0 + cutoff_top_n: 0 + num_proc_bsearch: 8 + ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. + decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. + # <0: for decoding, use full chunk. + # >0: for decoding, use fixed chunk size as set. + # 0: used for training, it's prohibited here. + num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. + simulate_streaming: False # simulate streaming inference. Defaults to False. 
\ No newline at end of file diff --git a/examples/wenetspeech/asr1/conf/preprocess.yaml b/examples/wenetspeech/asr1/conf/preprocess.yaml new file mode 100644 index 00000000..dd4cfd27 --- /dev/null +++ b/examples/wenetspeech/asr1/conf/preprocess.yaml @@ -0,0 +1,29 @@ +process: + # extract kaldi fbank from PCM + - type: fbank_kaldi + fs: 16000 + n_mels: 80 + n_shift: 160 + win_length: 400 + dither: true + - type: cmvn_json + cmvn_path: data/mean_std.json + # these three processes are a.k.a. SpecAugument + - type: time_warp + max_time_warp: 5 + inplace: true + mode: PIL + - type: freq_mask + F: 30 + n_mask: 2 + inplace: true + replace_with_zero: false + - type: time_mask + T: 40 + n_mask: 2 + inplace: true + replace_with_zero: false + + + + diff --git a/examples/wenetspeech/asr1/local/data.sh b/examples/wenetspeech/asr1/local/data.sh new file mode 100755 index 00000000..67b3d5a5 --- /dev/null +++ b/examples/wenetspeech/asr1/local/data.sh @@ -0,0 +1,129 @@ +#!/bin/bash + +# Copyright 2021 Mobvoi Inc(Author: Di Wu, Binbin Zhang) +# NPU, ASLP Group (Author: Qijie Shao) + +stage=-1 +stop_stage=100 + +# Use your own data path. You need to download the WenetSpeech dataset by yourself. +wenetspeech_data_dir=./wenetspeech +# Make sure you have 1.2T for ${shards_dir} +shards_dir=./wenetspeech_shards + +#wenetspeech training set +set=L +train_set=train_`echo $set | tr 'A-Z' 'a-z'` +dev_set=dev +test_sets="test_net test_meeting" + +cmvn=true +cmvn_sampling_divisor=20 # 20 means 5% of the training data to estimate cmvn + + +. ${MAIN_ROOT}/utils/parse_options.sh || exit 1; +set -u +set -o pipefail + + +mkdir -p data +TARGET_DIR=${MAIN_ROOT}/examples/dataset +mkdir -p ${TARGET_DIR} + +if [ ${stage} -le -2 ] && [ ${stop_stage} -ge -2 ]; then + # download data + echo "Please follow https://github.com/wenet-e2e/WenetSpeech to download the data." + exit 0; +fi + +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + echo "Data preparation" + local/wenetspeech_data_prep.sh \ + --train-subset $set \ + $wenetspeech_data_dir \ + data || exit 1; +fi + +if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then + # generate manifests + python3 ${TARGET_DIR}/aishell/aishell.py \ + --manifest_prefix="data/manifest" \ + --target_dir="${TARGET_DIR}/aishell" + + if [ $? -ne 0 ]; then + echo "Prepare Aishell failed. Terminated." + exit 1 + fi + + for dataset in train dev test; do + mv data/manifest.${dataset} data/manifest.${dataset}.raw + done +fi + +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + # compute mean and stddev for normalizer + if $cmvn; then + full_size=`cat data/${train_set}/wav.scp | wc -l` + sampling_size=$((full_size / cmvn_sampling_divisor)) + shuf -n $sampling_size data/$train_set/wav.scp \ + > data/$train_set/wav.scp.sampled + num_workers=$(nproc) + + python3 ${MAIN_ROOT}/utils/compute_mean_std.py \ + --manifest_path="data/manifest.train.raw" \ + --spectrum_type="fbank" \ + --feat_dim=80 \ + --delta_delta=false \ + --stride_ms=10 \ + --window_ms=25 \ + --sample_rate=16000 \ + --use_dB_normalization=False \ + --num_samples=-1 \ + --num_workers=${num_workers} \ + --output_path="data/mean_std.json" + + if [ $? -ne 0 ]; then + echo "Compute mean and stddev failed. Terminated." 
+            exit 1
+        fi
+    fi
+fi
+
+dict=data/dict/lang_char.txt
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    # build vocabulary
+    python3 ${MAIN_ROOT}/utils/build_vocab.py \
+        --unit_type="char" \
+        --count_threshold=0 \
+        --vocab_path="data/vocab.txt" \
+        --manifest_paths "data/manifest.train.raw"
+
+    if [ $? -ne 0 ]; then
+        echo "Build vocabulary failed. Terminated."
+        exit 1
+    fi
+fi
+
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+    # format manifest with tokenids, vocab size
+    for dataset in train dev test; do
+    {
+        python3 ${MAIN_ROOT}/utils/format_data.py \
+            --cmvn_path "data/mean_std.json" \
+            --unit_type "char" \
+            --vocab_path="data/vocab.txt" \
+            --manifest_path="data/manifest.${dataset}.raw" \
+            --output_path="data/manifest.${dataset}"
+
+        if [ $? -ne 0 ]; then
+            echo "Format manifest failed. Terminated."
+            exit 1
+        fi
+    } &
+    done
+    wait
+fi
+
+echo "WenetSpeech data preparation done."
+exit 0
diff --git a/examples/wenetspeech/asr1/local/extract_meta.py b/examples/wenetspeech/asr1/local/extract_meta.py
new file mode 100644
index 00000000..0e1b2727
--- /dev/null
+++ b/examples/wenetspeech/asr1/local/extract_meta.py
@@ -0,0 +1,113 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Copyright 2021 Xiaomi Corporation (Author: Yongqing Wang)
+#                Mobvoi Inc(Author: Di Wu, Binbin Zhang)
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+import json
+import os
+import sys
+
+
+def get_args():
+    parser = argparse.ArgumentParser(description="""
+    This script is used to process raw json dataset of WenetSpeech,
+    where the long wav is split into segments and
+    data of wenet format is generated.
+ """) + parser.add_argument('input_json', help="""Input json file of WenetSpeech""") + parser.add_argument('output_dir', help="""Output dir for prepared data""") + + args = parser.parse_args() + return args + + +def meta_analysis(input_json, output_dir): + input_dir = os.path.dirname(input_json) + + if not os.path.exists(output_dir): + os.makedirs(output_dir) + + try: + with open(input_json, 'r') as injson: + json_data = json.load(injson) + except Exception: + sys.exit(f'Failed to load input json file: {input_json}') + else: + if json_data['audios'] is not None: + with open(f'{output_dir}/text', 'w') as utt2text, \ + open(f'{output_dir}/segments', 'w') as segments, \ + open(f'{output_dir}/utt2dur', 'w') as utt2dur, \ + open(f'{output_dir}/wav.scp', 'w') as wavscp, \ + open(f'{output_dir}/utt2subsets', 'w') as utt2subsets, \ + open(f'{output_dir}/reco2dur', 'w') as reco2dur: + for long_audio in json_data['audios']: + try: + long_audio_path = os.path.realpath( + os.path.join(input_dir, long_audio['path'])) + aid = long_audio['aid'] + segments_lists = long_audio['segments'] + duration = long_audio['duration'] + assert (os.path.exists(long_audio_path)) + except AssertionError: + print(f'''Warning: {aid} something is wrong, + maybe AssertionError, skipped''') + continue + except Exception: + print(f'''Warning: {aid} something is wrong, maybe the + error path: {long_audio_path}, skipped''') + continue + else: + wavscp.write(f'{aid}\t{long_audio_path}\n') + reco2dur.write(f'{aid}\t{duration}\n') + for segment_file in segments_lists: + try: + sid = segment_file['sid'] + start_time = segment_file['begin_time'] + end_time = segment_file['end_time'] + dur = end_time - start_time + text = segment_file['text'] + segment_subsets = segment_file["subsets"] + except Exception: + print(f'''Warning: {segment_file} something + is wrong, skipped''') + continue + else: + utt2text.write(f'{sid}\t{text}\n') + segments.write( + f'{sid}\t{aid}\t{start_time}\t{end_time}\n') + utt2dur.write(f'{sid}\t{dur}\n') + segment_sub_names = " ".join(segment_subsets) + utt2subsets.write( + f'{sid}\t{segment_sub_names}\n') + + +def main(): + args = get_args() + + meta_analysis(args.input_json, args.output_dir) + + +if __name__ == '__main__': + main() diff --git a/examples/wenetspeech/asr1/local/process_opus.py b/examples/wenetspeech/asr1/local/process_opus.py new file mode 100644 index 00000000..f1b9287e --- /dev/null +++ b/examples/wenetspeech/asr1/local/process_opus.py @@ -0,0 +1,99 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Copyright 2021 NPU, ASLP Group (Author: Qijie Shao) +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
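`extract_meta.py` expects each entry under the `audios` key of `WenetSpeech.json` to carry an `aid`, a relative `path`, a `duration` and a list of `segments`, each with `sid`, `begin_time`, `end_time`, `text` and `subsets`. A trimmed example of that layout (field values made up for illustration):

```python
# One long recording with a single segment, as consumed by meta_analysis().
sample = {
    "audios": [{
        "aid": "Y0000000000_example",
        "path": "audio/train/youtube/B00000/Y0000000000_example.opus",
        "duration": 2494.57,
        "segments": [{
            "sid": "Y0000000000_example_S00000",
            "begin_time": 20.08,
            "end_time": 24.40,
            "text": "example transcript",
            "subsets": ["L"],
        }],
    }]
}
# Each audio becomes one wav.scp/reco2dur line; each segment becomes one line
# in text, segments, utt2dur and utt2subsets.
```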
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# process_opus.py: segmentation and downsampling of opus audio +# usage: python3 process_opus.py wav.scp segments output_wav.scp +import os +import sys + +from pydub import AudioSegment + + +def read_file(wav_scp, segments): + wav_scp_dict = {} + with open(wav_scp, 'r', encoding='UTF-8') as fin: + for line_str in fin: + wav_id, path = line_str.strip().split() + wav_scp_dict[wav_id] = path + + utt_list = [] + seg_path_list = [] + start_time_list = [] + end_time_list = [] + with open(segments, 'r', encoding='UTF-8') as fin: + for line_str in fin: + arr = line_str.strip().split() + assert len(arr) == 4 + utt_list.append(arr[0]) + seg_path_list.append(wav_scp_dict[arr[1]]) + start_time_list.append(float(arr[2])) + end_time_list.append(float(arr[3])) + return utt_list, seg_path_list, start_time_list, end_time_list + + +# TODO(Qijie): Fix the process logic +def output(output_wav_scp, utt_list, seg_path_list, start_time_list, + end_time_list): + num_utts = len(utt_list) + step = int(num_utts * 0.01) + with open(output_wav_scp, 'w', encoding='UTF-8') as fout: + previous_wav_path = "" + for i in range(num_utts): + utt_id = utt_list[i] + current_wav_path = seg_path_list[i] + output_dir = (os.path.dirname(current_wav_path)) \ + .replace("audio", 'audio_seg') + seg_wav_path = os.path.join(output_dir, utt_id + '.wav') + + # if not os.path.exists(output_dir): + # os.makedirs(output_dir) + + if current_wav_path != previous_wav_path: + source_wav = AudioSegment.from_file(current_wav_path) + previous_wav_path = current_wav_path + + start = int(start_time_list[i] * 1000) + end = int(end_time_list[i] * 1000) + target_audio = source_wav[start:end].set_frame_rate(16000) + target_audio.export(seg_wav_path, format="wav") + + fout.write("{} {}\n".format(utt_id, seg_wav_path)) + if i % step == 0: + print("seg wav finished: {}%".format(int(i / step))) + + +def main(): + wav_scp = sys.argv[1] + segments = sys.argv[2] + output_wav_scp = sys.argv[3] + + utt_list, seg_path_list, start_time_list, end_time_list \ + = read_file(wav_scp, segments) + output(output_wav_scp, utt_list, seg_path_list, start_time_list, + end_time_list) + + +if __name__ == '__main__': + main() diff --git a/examples/wenetspeech/asr1/local/test.sh b/examples/wenetspeech/asr1/local/test.sh new file mode 100755 index 00000000..47bd2f63 --- /dev/null +++ b/examples/wenetspeech/asr1/local/test.sh @@ -0,0 +1,69 @@ +#!/bin/bash + +if [ $# != 2 ];then + echo "usage: ${0} config_path ckpt_path_prefix" + exit -1 +fi + +ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') +echo "using $ngpu gpus..." + +config_path=$1 +ckpt_prefix=$2 + +chunk_mode=false +if [[ ${config_path} =~ ^.*chunk_.*yaml$ ]];then + chunk_mode=true +fi + +# download language model +#bash local/download_lm_ch.sh +#if [ $? 
-ne 0 ]; then +# exit 1 +#fi + + +for type in attention ctc_greedy_search; do + echo "decoding ${type}" + if [ ${chunk_mode} == true ];then + # stream decoding only support batchsize=1 + batch_size=1 + else + batch_size=64 + fi + output_dir=${ckpt_prefix} + mkdir -p ${output_dir} + python3 -u ${BIN_DIR}/test.py \ + --nproc ${ngpu} \ + --config ${config_path} \ + --result_file ${output_dir}/${type}.rsl \ + --checkpoint_path ${ckpt_prefix} \ + --opts decoding.decoding_method ${type} \ + --opts decoding.batch_size ${batch_size} + + if [ $? -ne 0 ]; then + echo "Failed in evaluation!" + exit 1 + fi +done + +for type in ctc_prefix_beam_search attention_rescoring; do + echo "decoding ${type}" + batch_size=1 + output_dir=${ckpt_prefix} + mkdir -p ${output_dir} + python3 -u ${BIN_DIR}/test.py \ + --nproc ${ngpu} \ + --config ${config_path} \ + --result_file ${output_dir}/${type}.rsl \ + --checkpoint_path ${ckpt_prefix} \ + --opts decoding.decoding_method ${type} \ + --opts decoding.batch_size ${batch_size} + + if [ $? -ne 0 ]; then + echo "Failed in evaluation!" + exit 1 + fi +done + +exit 0 diff --git a/examples/wenetspeech/asr1/local/wenetspeech_data_prep.sh b/examples/wenetspeech/asr1/local/wenetspeech_data_prep.sh new file mode 100755 index 00000000..85853053 --- /dev/null +++ b/examples/wenetspeech/asr1/local/wenetspeech_data_prep.sh @@ -0,0 +1,135 @@ +#!/usr/bin/env bash + +# Copyright 2021 Xiaomi Corporation (Author: Yongqing Wang) +# Seasalt AI, Inc (Author: Guoguo Chen) +# Mobvoi Inc(Author: Di Wu, Binbin Zhang) +# NPU, ASLP Group (Author: Qijie Shao) + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -e +set -o pipefail + +stage=1 +prefix= +train_subset=L + +. 
./tools/parse_options.sh || exit 1;
+
+filter_by_id () {
+    idlist=$1
+    input=$2
+    output=$3
+    field=1
+    if [ $# -eq 4 ]; then
+        field=$4
+    fi
+    cat $input | perl -se '
+        open(F, "<$idlist") || die "Could not open id-list file $idlist";
+        while(<F>) {
+            @A = split;
+            @A>=1 || die "Invalid id-list file line $_";
+            $seen{$A[0]} = 1;
+        }
+        while(<>) {
+            @A = split;
+            @A > 0 || die "Invalid file line $_";
+            @A >= $field || die "Invalid file line $_";
+            if ($seen{$A[$field-1]}) {
+                print $_;
+            }
+        }' -- -idlist="$idlist" -field="$field" > $output ||\
+    (echo "$0: filter_by_id() error: $input" && exit 1) || exit 1;
+}
+
+subset_data_dir () {
+    utt_list=$1
+    src_dir=$2
+    dest_dir=$3
+    mkdir -p $dest_dir || exit 1;
+    # wav.scp text segments utt2dur
+    filter_by_id $utt_list $src_dir/utt2dur $dest_dir/utt2dur ||\
+        (echo "$0: subset_data_dir() error: $src_dir/utt2dur" && exit 1) || exit 1;
+    filter_by_id $utt_list $src_dir/text $dest_dir/text ||\
+        (echo "$0: subset_data_dir() error: $src_dir/text" && exit 1) || exit 1;
+    filter_by_id $utt_list $src_dir/segments $dest_dir/segments ||\
+        (echo "$0: subset_data_dir() error: $src_dir/segments" && exit 1) || exit 1;
+    awk '{print $2}' $dest_dir/segments | sort | uniq > $dest_dir/reco
+    filter_by_id $dest_dir/reco $src_dir/wav.scp $dest_dir/wav.scp ||\
+        (echo "$0: subset_data_dir() error: $src_dir/wav.scp" && exit 1) || exit 1;
+    rm -f $dest_dir/reco
+}
+
+if [ $# -ne 2 ]; then
+    echo "Usage: $0 [options] <wenetspeech-dataset-dir> <data-dir>"
+    echo " e.g.: $0 --train-subset L /disk1/audio_data/wenetspeech/ data/"
+    echo ""
+    echo "This script takes the WenetSpeech source directory, and prepares the"
+    echo "WeNet format data directory."
+    echo "  --prefix <prefix>         # Prefix for output data directory."
+    echo "  --stage <stage>           # Processing stage."
+    echo "  --train-subset <L|M|S|W>  # Train subset to be created."
+    exit 1
+fi
+
+wenetspeech_dir=$1
+data_dir=$2
+
+declare -A subsets
+subsets=(
+    [L]="train_l"
+    [M]="train_m"
+    [S]="train_s"
+    [W]="train_w"
+    [DEV]="dev"
+    [TEST_NET]="test_net"
+    [TEST_MEETING]="test_meeting")
+
+prefix=${prefix:+${prefix}_}
+
+corpus_dir=$data_dir/${prefix}corpus/
+if [ $stage -le 1 ]; then
+    echo "$0: Extract meta into $corpus_dir"
+    # Sanity check.
+    [ ! -f $wenetspeech_dir/WenetSpeech.json ] &&\
+        echo "$0: Please download $wenetspeech_dir/WenetSpeech.json!" && exit 1;
+    [ ! -d $wenetspeech_dir/audio ] &&\
+        echo "$0: Please download $wenetspeech_dir/audio!" && exit 1;
+
+    [ ! -d $corpus_dir ] && mkdir -p $corpus_dir
+
+    # Files to be created:
+    # wav.scp text segments utt2dur
+    python3 local/extract_meta.py \
+        $wenetspeech_dir/WenetSpeech.json $corpus_dir || exit 1;
+fi
+
+if [ $stage -le 2 ]; then
+    echo "$0: Split data to train, dev, test_net, and test_meeting"
+    [ ! -f $corpus_dir/utt2subsets ] &&\
+        echo "$0: No such file $corpus_dir/utt2subsets!" && exit 1;
+    for label in $train_subset DEV TEST_NET TEST_MEETING; do
+        if [ ! ${subsets[$label]+set} ]; then
+            echo "$0: Subset $label is not defined in WenetSpeech.json." && exit 1;
+        fi
+        subset=${subsets[$label]}
+        [ !
-d $data_dir/${prefix}$subset ] && mkdir -p $data_dir/${prefix}$subset + cat $corpus_dir/utt2subsets | \ + awk -v s=$label '{for (i=2;i<=NF;i++) if($i==s) print $0;}' \ + > $corpus_dir/${prefix}${subset}_utt_list|| exit 1; + subset_data_dir $corpus_dir/${prefix}${subset}_utt_list \ + $corpus_dir $data_dir/${prefix}$subset || exit 1; + done +fi + +echo "$0: Done" \ No newline at end of file diff --git a/examples/wenetspeech/asr1/path.sh b/examples/wenetspeech/asr1/path.sh new file mode 100644 index 00000000..666b29bc --- /dev/null +++ b/examples/wenetspeech/asr1/path.sh @@ -0,0 +1,15 @@ +export MAIN_ROOT=`realpath ${PWD}/../../../` + +export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} +export LC_ALL=C + +export PYTHONDONTWRITEBYTECODE=1 +# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C +export PYTHONIOENCODING=UTF-8 +export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} + +export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib/ + +# model exp +MODEL=u2 +export BIN_DIR=${MAIN_ROOT}/paddlespeech/s2t/exps/${MODEL}/bin diff --git a/examples/wenetspeech/asr1/run.sh b/examples/wenetspeech/asr1/run.sh new file mode 100644 index 00000000..8c4a12cb --- /dev/null +++ b/examples/wenetspeech/asr1/run.sh @@ -0,0 +1,55 @@ +#!/bin/bash + +. path.sh || exit 1; +set -e + +gpus=0,1,2,3,4,5,6,7 +stage=0 +stop_stage=100 +conf_path=conf/conformer.yaml + +average_checkpoint=true +avg_num=10 + +. ${MAIN_ROOT}/utils/parse_options.sh || exit 1; + +avg_ckpt=avg_${avg_num} +ckpt=$(basename ${conf_path} | awk -F'.' '{print $1}') +echo "checkpoint name ${ckpt}" + +audio_file="data/tmp.wav" + +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + # prepare data + bash ./local/data.sh || exit -1 +fi + +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + # train model, all `ckpt` under `exp` dir + CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt} +fi + +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + # avg n best model + avg.sh best exp/${ckpt}/checkpoints ${avg_num} +fi + +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + # test ckpt avg_n + CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 +fi + +if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then + # ctc alignment of test data + CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 +fi + +if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then + # export ckpt avg_n + CUDA_VISIBLE_DEVICES=0 ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit +fi + +if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then + # test a single .wav file + CUDA_VISIBLE_DEVICES=0 ./local/test_hub.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${audio_file} || exit -1 +fi diff --git a/examples/wenetspeech/asr1/utils b/examples/wenetspeech/asr1/utils new file mode 120000 index 00000000..973afe67 --- /dev/null +++ b/examples/wenetspeech/asr1/utils @@ -0,0 +1 @@ +../../../utils \ No newline at end of file diff --git a/paddlespeech/s2t/exps/deepspeech2/model.py b/paddlespeech/s2t/exps/deepspeech2/model.py index 177d710b..e827414d 100644 --- a/paddlespeech/s2t/exps/deepspeech2/model.py +++ b/paddlespeech/s2t/exps/deepspeech2/model.py @@ -409,7 +409,7 @@ class DeepSpeech2ExportTester(DeepSpeech2Tester): @paddle.no_grad() def test(self): logger.info(f"Test Total Examples: {len(self.test_loader.dataset)}") - if self.args.enable_auto_log == True: + if self.args.enable_auto_log is True: from 
paddlespeech.s2t.utils.log import Autolog self.autolog = Autolog( batch_size=self.config.decoding.batch_size, @@ -438,7 +438,7 @@ class DeepSpeech2ExportTester(DeepSpeech2Tester): msg += "Final error rate [%s] (%d/%d) = %f" % ( error_rate_type, num_ins, num_ins, errors_sum / len_refs) logger.info(msg) - if self.args.enable_auto_log == True: + if self.args.enable_auto_log is True: self.autolog.report() def compute_result_transcripts(self, audio, audio_len, vocab_list, cfg): @@ -512,7 +512,7 @@ class DeepSpeech2ExportTester(DeepSpeech2Tester): x_len_list = np.split(x_len_batch, batch_size, axis=0) for x, x_len in zip(x_list, x_len_list): - if self.args.enable_auto_log == True: + if self.args.enable_auto_log is True: self.autolog.times.start() x_len = x_len[0] assert (chunk_size <= x_len) @@ -547,7 +547,7 @@ class DeepSpeech2ExportTester(DeepSpeech2Tester): probs_chunk_list = [] probs_chunk_lens_list = [] - if self.args.enable_auto_log == True: + if self.args.enable_auto_log is True: # record the model preprocessing time self.autolog.times.stamp() @@ -606,7 +606,7 @@ class DeepSpeech2ExportTester(DeepSpeech2Tester): [output_probs, output_probs_padding], axis=1) output_probs_list.append(output_probs) output_lens_list.append(output_lens) - if self.args.enable_auto_log == True: + if self.args.enable_auto_log is True: # record the model inference time self.autolog.times.stamp() # record the post processing time @@ -641,12 +641,12 @@ class DeepSpeech2ExportTester(DeepSpeech2Tester): audio_len_handle.reshape(x_len.shape) audio_len_handle.copy_from_cpu(x_len) - if self.args.enable_auto_log == True: + if self.args.enable_auto_log is True: self.autolog.times.start() # record the prefix processing time self.autolog.times.stamp() self.predictor.run() - if self.args.enable_auto_log == True: + if self.args.enable_auto_log is True: # record the model inference time self.autolog.times.stamp() # record the post processing time diff --git a/paddlespeech/s2t/exps/u2/model.py b/paddlespeech/s2t/exps/u2/model.py index 22d4238a..27bc47d2 100644 --- a/paddlespeech/s2t/exps/u2/model.py +++ b/paddlespeech/s2t/exps/u2/model.py @@ -24,13 +24,10 @@ import jsonlines import numpy as np import paddle from paddle import distributed as dist -from paddle.io import DataLoader from yacs.config import CfgNode -from paddlespeech.s2t.io.collator import SpeechCollator -from paddlespeech.s2t.io.dataset import ManifestDataset -from paddlespeech.s2t.io.sampler import SortagradBatchSampler -from paddlespeech.s2t.io.sampler import SortagradDistributedBatchSampler +from paddlespeech.s2t.frontend.featurizer import TextFeaturizer +from paddlespeech.s2t.io.dataloader import BatchDataLoader from paddlespeech.s2t.models.u2 import U2Model from paddlespeech.s2t.training.optimizer import OptimizerFactory from paddlespeech.s2t.training.reporter import ObsScope @@ -213,7 +210,7 @@ class U2Trainer(Trainer): msg += f"{v:>.8f}" if isinstance(v, float) else f"{v}" msg += f" {k.split(',')[1]}" if len( - k.split(',')) == 2 else f"" + k.split(',')) == 2 else "" msg += "," msg = msg[:-1] # remove the last "," if (batch_index + 1 @@ -249,92 +246,103 @@ class U2Trainer(Trainer): def setup_dataloader(self): config = self.config.clone() - config.defrost() - config.collator.keep_transcription_text = False - # train/valid dataset, return token ids - config.data.manifest = config.data.train_manifest - train_dataset = ManifestDataset.from_config(config) - - config.data.manifest = config.data.dev_manifest - dev_dataset = ManifestDataset.from_config(config) - - 
collate_fn_train = SpeechCollator.from_config(config) - - config.collator.augmentation_config = "" - collate_fn_dev = SpeechCollator.from_config(config) - - if self.parallel: - batch_sampler = SortagradDistributedBatchSampler( - train_dataset, + if self.train: + # train/valid dataset, return token ids + self.train_loader = BatchDataLoader( + json_file=config.data.train_manifest, + train_mode=True, + sortagrad=False, batch_size=config.collator.batch_size, - num_replicas=None, - rank=None, - shuffle=True, - drop_last=True, - sortagrad=config.collator.sortagrad, - shuffle_method=config.collator.shuffle_method) - else: - batch_sampler = SortagradBatchSampler( - train_dataset, - shuffle=True, + maxlen_in=float('inf'), + maxlen_out=float('inf'), + minibatches=0, + mini_batch_size=self.args.nprocs, + batch_count='auto', + batch_bins=0, + batch_frames_in=0, + batch_frames_out=0, + batch_frames_inout=0, + preprocess_conf=config.collator. + augmentation_config, # aug will be off when train_mode=False + n_iter_processes=config.collator.num_workers, + subsampling_factor=1, + num_encs=1) + + self.valid_loader = BatchDataLoader( + json_file=config.data.dev_manifest, + train_mode=False, + sortagrad=False, batch_size=config.collator.batch_size, - drop_last=True, - sortagrad=config.collator.sortagrad, - shuffle_method=config.collator.shuffle_method) - self.train_loader = DataLoader( - train_dataset, - batch_sampler=batch_sampler, - collate_fn=collate_fn_train, - num_workers=config.collator.num_workers, ) - self.valid_loader = DataLoader( - dev_dataset, - batch_size=config.collator.batch_size, - shuffle=False, - drop_last=False, - collate_fn=collate_fn_dev, - num_workers=config.collator.num_workers, ) - - # test dataset, return raw text - config.data.manifest = config.data.test_manifest - # filter test examples, will cause less examples, but no mismatch with training - # and can use large batch size , save training time, so filter test egs now. - config.data.min_input_len = 0.0 # second - config.data.max_input_len = float('inf') # second - config.data.min_output_len = 0.0 # tokens - config.data.max_output_len = float('inf') # tokens - config.data.min_output_input_ratio = 0.00 - config.data.max_output_input_ratio = float('inf') - - test_dataset = ManifestDataset.from_config(config) - # return text ord id - config.collator.keep_transcription_text = True - config.collator.augmentation_config = "" - self.test_loader = DataLoader( - test_dataset, - batch_size=config.decoding.batch_size, - shuffle=False, - drop_last=False, - collate_fn=SpeechCollator.from_config(config), - num_workers=config.collator.num_workers, ) - # return text token id - config.collator.keep_transcription_text = False - self.align_loader = DataLoader( - test_dataset, - batch_size=config.decoding.batch_size, - shuffle=False, - drop_last=False, - collate_fn=SpeechCollator.from_config(config), - num_workers=config.collator.num_workers, ) - logger.info("Setup train/valid/test/align Dataloader!") + maxlen_in=float('inf'), + maxlen_out=float('inf'), + minibatches=0, + mini_batch_size=self.args.nprocs, + batch_count='auto', + batch_bins=0, + batch_frames_in=0, + batch_frames_out=0, + batch_frames_inout=0, + preprocess_conf=config.collator. 
+ augmentation_config, # aug will be off when train_mode=False + n_iter_processes=config.collator.num_workers, + subsampling_factor=1, + num_encs=1) + logger.info("Setup train/valid Dataloader!") + else: + # test dataset, return raw text + self.test_loader = BatchDataLoader( + json_file=config.data.test_manifest, + train_mode=False, + sortagrad=False, + batch_size=config.decoding.batch_size, + maxlen_in=float('inf'), + maxlen_out=float('inf'), + minibatches=0, + mini_batch_size=1, + batch_count='auto', + batch_bins=0, + batch_frames_in=0, + batch_frames_out=0, + batch_frames_inout=0, + preprocess_conf=config.collator. + augmentation_config, # aug will be off when train_mode=False + n_iter_processes=1, + subsampling_factor=1, + num_encs=1) + + self.align_loader = BatchDataLoader( + json_file=config.data.test_manifest, + train_mode=False, + sortagrad=False, + batch_size=config.decoding.batch_size, + maxlen_in=float('inf'), + maxlen_out=float('inf'), + minibatches=0, + mini_batch_size=1, + batch_count='auto', + batch_bins=0, + batch_frames_in=0, + batch_frames_out=0, + batch_frames_inout=0, + preprocess_conf=config.collator. + augmentation_config, # aug will be off when train_mode=False + n_iter_processes=1, + subsampling_factor=1, + num_encs=1) + logger.info("Setup test/align Dataloader!") def setup_model(self): config = self.config model_conf = config.model with UpdateConfig(model_conf): - model_conf.input_dim = self.train_loader.collate_fn.feature_size - model_conf.output_dim = self.train_loader.collate_fn.vocab_size + if self.train: + model_conf.input_dim = self.train_loader.feat_dim + model_conf.output_dim = self.train_loader.vocab_size + else: + model_conf.input_dim = self.test_loader.feat_dim + model_conf.output_dim = self.test_loader.vocab_size model = U2Model.from_config(model_conf) @@ -343,6 +351,11 @@ class U2Trainer(Trainer): logger.info(f"{model}") layer_tools.print_params(model, logger.info) + self.model = model + logger.info("Setup model!") + + if not self.train: + return train_config = config.training optim_type = train_config.optim @@ -383,10 +396,9 @@ class U2Trainer(Trainer): optimzer_args = optimizer_args(config, model.parameters(), lr_scheduler) optimizer = OptimizerFactory.from_args(optim_type, optimzer_args) - self.model = model self.optimizer = optimizer self.lr_scheduler = lr_scheduler - logger.info("Setup model/optimizer/lr_scheduler!") + logger.info("Setup optimizer/lr_scheduler!") class U2Tester(U2Trainer): @@ -421,14 +433,19 @@ class U2Tester(U2Trainer): def __init__(self, config, args): super().__init__(config, args) + self.text_feature = TextFeaturizer( + unit_type=self.config.collator.unit_type, + vocab_filepath=self.config.collator.vocab_filepath, + spm_model_prefix=self.config.collator.spm_model_prefix) + self.vocab_list = self.text_feature.vocab_list - def ordid2token(self, texts, texts_len): + def id2token(self, texts, texts_len, text_feature): """ ord() id to chr() chr """ trans = [] for text, n in zip(texts, texts_len): n = n.numpy().item() ids = text[:n] - trans.append(''.join([chr(i) for i in ids])) + trans.append(text_feature.defeaturize(ids.numpy().tolist())) return trans def compute_metrics(self, @@ -444,12 +461,11 @@ class U2Tester(U2Trainer): error_rate_func = error_rate.cer if cfg.error_rate_type == 'cer' else error_rate.wer start_time = time.time() - text_feature = self.test_loader.collate_fn.text_feature - target_transcripts = self.ordid2token(texts, texts_len) + target_transcripts = self.id2token(texts, texts_len, self.text_feature) 
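The new `id2token` replaces `ordid2token` (which abused `chr()` on raw ids): token ids are now mapped back to text through `TextFeaturizer.defeaturize`. A simplified sketch of that lookup with a made-up character vocabulary (not the real `TextFeaturizer` API):

```python
def ids_to_text(ids, vocab_list, specials=("<blank>", "<unk>", "<eos>")):
    """Look token ids up in the vocabulary and join them into a transcript."""
    tokens = [vocab_list[i] for i in ids]
    return "".join(t for t in tokens if t not in specials)

vocab_list = ["<blank>", "<unk>", "h", "i", "<eos>"]
print(ids_to_text([2, 3], vocab_list))  # -> "hi"
```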
result_transcripts, result_tokenids = self.model.decode( audio, audio_len, - text_feature=text_feature, + text_feature=self.text_feature, decoding_method=cfg.decoding_method, lang_model_path=cfg.lang_model_path, beam_alpha=cfg.alpha, @@ -499,7 +515,7 @@ class U2Tester(U2Trainer): self.model.eval() logger.info(f"Test Total Examples: {len(self.test_loader.dataset)}") - stride_ms = self.test_loader.collate_fn.stride_ms + stride_ms = self.config.collator.stride_ms error_rate_type = None errors_sum, len_refs, num_ins = 0.0, 0, 0 num_frames = 0.0 @@ -558,8 +574,7 @@ class U2Tester(U2Trainer): def align(self): ctc_utils.ctc_align(self.config, self.model, self.align_loader, self.config.decoding.batch_size, - self.align_loader.collate_fn.stride_ms, - self.align_loader.collate_fn.vocab_list, + self.config.collator.stride_ms, self.vocab_list, self.args.result_file) def load_inferspec(self): @@ -573,7 +588,7 @@ class U2Tester(U2Trainer): infer_model = U2InferModel.from_pretrained(self.test_loader, self.config.model.clone(), self.args.checkpoint_path) - feat_dim = self.test_loader.collate_fn.feature_size + feat_dim = self.test_loader.feat_dim input_spec = [ paddle.static.InputSpec(shape=[1, None, feat_dim], dtype='float32'), # audio, [B,T,D] diff --git a/paddlespeech/s2t/exps/u2_kaldi/model.py b/paddlespeech/s2t/exps/u2_kaldi/model.py index 0d8508c2..d82034c8 100644 --- a/paddlespeech/s2t/exps/u2_kaldi/model.py +++ b/paddlespeech/s2t/exps/u2_kaldi/model.py @@ -392,6 +392,7 @@ class U2Tester(U2Trainer): unit_type=self.config.collator.unit_type, vocab_filepath=self.config.collator.vocab_filepath, spm_model_prefix=self.config.collator.spm_model_prefix) + self.vocab_list = self.text_feature.vocab_list def id2token(self, texts, texts_len, text_feature): """ ord() id to chr() chr """ @@ -529,8 +530,7 @@ class U2Tester(U2Trainer): def align(self): ctc_utils.ctc_align(self.config, self.model, self.align_loader, self.config.decoding.batch_size, - self.align_loader.collate_fn.stride_ms, - self.align_loader.collate_fn.vocab_list, + self.config.collator.stride_ms, self.vocab_list, self.args.result_file) def load_inferspec(self): diff --git a/paddlespeech/s2t/frontend/audio.py b/paddlespeech/s2t/frontend/audio.py index 13dc3a44..65dccad3 100644 --- a/paddlespeech/s2t/frontend/audio.py +++ b/paddlespeech/s2t/frontend/audio.py @@ -24,6 +24,8 @@ import soundfile import soxbindings as sox from scipy import signal +from .utility import convert_samples_from_float32 +from .utility import convert_samples_to_float32 from .utility import subfile_from_tar @@ -689,15 +691,7 @@ class AudioSegment(): Audio sample type is usually integer or float-point. Integers will be scaled to [-1, 1] in float32. """ - float32_samples = samples.astype('float32') - if samples.dtype in np.sctypes['int']: - bits = np.iinfo(samples.dtype).bits - float32_samples *= (1. / 2**(bits - 1)) - elif samples.dtype in np.sctypes['float']: - pass - else: - raise TypeError("Unsupported sample type: %s." % samples.dtype) - return float32_samples + return convert_samples_to_float32(samples) def _convert_samples_from_float32(self, samples, dtype): """Convert sample type from float32 to dtype. @@ -708,20 +702,4 @@ class AudioSegment(): This is for writing a audio file. """ - dtype = np.dtype(dtype) - output_samples = samples.copy() - if dtype in np.sctypes['int']: - bits = np.iinfo(dtype).bits - output_samples *= (2**(bits - 1) / 1.) 
- min_val = np.iinfo(dtype).min - max_val = np.iinfo(dtype).max - output_samples[output_samples > max_val] = max_val - output_samples[output_samples < min_val] = min_val - elif samples.dtype in np.sctypes['float']: - min_val = np.finfo(dtype).min - max_val = np.finfo(dtype).max - output_samples[output_samples > max_val] = max_val - output_samples[output_samples < min_val] = min_val - else: - raise TypeError("Unsupported sample type: %s." % samples.dtype) - return output_samples.astype(dtype) + return convert_samples_from_float32(samples, dtype) diff --git a/paddlespeech/s2t/frontend/featurizer/text_featurizer.py b/paddlespeech/s2t/frontend/featurizer/text_featurizer.py index 7f3bd9e1..21f512e9 100644 --- a/paddlespeech/s2t/frontend/featurizer/text_featurizer.py +++ b/paddlespeech/s2t/frontend/featurizer/text_featurizer.py @@ -92,7 +92,9 @@ class TextFeaturizer(): tokens = self.tokenize(text) ids = [] for token in tokens: - token = token if token in self.vocab_dict else self.unk + if token not in self.vocab_dict: + logger.debug(f"Text Token: {token} -> {self.unk}") + token = self.unk ids.append(self.vocab_dict[token]) return ids diff --git a/paddlespeech/s2t/frontend/utility.py b/paddlespeech/s2t/frontend/utility.py index 089890d2..703f2127 100644 --- a/paddlespeech/s2t/frontend/utility.py +++ b/paddlespeech/s2t/frontend/utility.py @@ -30,7 +30,8 @@ logger = Log(__name__).getlog() __all__ = [ "load_dict", "load_cmvn", "read_manifest", "rms_to_db", "rms_to_dbfs", "max_dbfs", "mean_dbfs", "gain_db_to_ratio", "normalize_audio", "SOS", - "EOS", "UNK", "BLANK", "MASKCTC", "SPACE" + "EOS", "UNK", "BLANK", "MASKCTC", "SPACE", "convert_samples_to_float32", + "convert_samples_from_float32" ] IGNORE_ID = -1 @@ -342,3 +343,50 @@ def load_cmvn(cmvn_file: str, filetype: str): else: raise ValueError(f"cmvn file type no support: {filetype}") return cmvn[0], cmvn[1] + + +def convert_samples_to_float32(samples): + """Convert sample type to float32. + + Audio sample type is usually integer or float-point. + Integers will be scaled to [-1, 1] in float32. + + PCM16 -> PCM32 + """ + float32_samples = samples.astype('float32') + if samples.dtype in np.sctypes['int']: + bits = np.iinfo(samples.dtype).bits + float32_samples *= (1. / 2**(bits - 1)) + elif samples.dtype in np.sctypes['float']: + pass + else: + raise TypeError("Unsupported sample type: %s." % samples.dtype) + return float32_samples + + +def convert_samples_from_float32(samples, dtype): + """Convert sample type from float32 to dtype. + + Audio sample type is usually integer or float-point. For integer + type, float32 will be rescaled from [-1, 1] to the maximum range + supported by the integer type. + + PCM32 -> PCM16 + """ + dtype = np.dtype(dtype) + output_samples = samples.copy() + if dtype in np.sctypes['int']: + bits = np.iinfo(dtype).bits + output_samples *= (2**(bits - 1) / 1.) + min_val = np.iinfo(dtype).min + max_val = np.iinfo(dtype).max + output_samples[output_samples > max_val] = max_val + output_samples[output_samples < min_val] = min_val + elif samples.dtype in np.sctypes['float']: + min_val = np.finfo(dtype).min + max_val = np.finfo(dtype).max + output_samples[output_samples > max_val] = max_val + output_samples[output_samples < min_val] = min_val + else: + raise TypeError("Unsupported sample type: %s." 
% samples.dtype) + return output_samples.astype(dtype) diff --git a/paddlespeech/s2t/io/collator.py b/paddlespeech/s2t/io/collator.py index cb7349d0..5f233549 100644 --- a/paddlespeech/s2t/io/collator.py +++ b/paddlespeech/s2t/io/collator.py @@ -199,8 +199,8 @@ class SpeechCollatorBase(): for idx, item in enumerate(batch): utts.append(item['utt']) - audio = item['feat'] - text = item['text'] + audio = item['input'][0]['feat'] + text = item['output'][0]['text'] audio, text = self.process_utterance(audio, text) audios.append(audio) # [T, D] @@ -343,9 +343,10 @@ class TripletSpeechCollator(SpeechCollator): for idx, item in enumerate(batch): utts.append(item['utt']) - audio = item['feat'] - translation = item['text'] - transcription = item['text1'] + audio = item['input'][0]['feat'] + translation = item['output'][0]['text'] + transcription = item['output'][1]['text'] + audio, translation, transcription = self.process_utterance( audio, translation, transcription) diff --git a/paddlespeech/s2t/io/dataset.py b/paddlespeech/s2t/io/dataset.py index c503107a..61eeb00f 100644 --- a/paddlespeech/s2t/io/dataset.py +++ b/paddlespeech/s2t/io/dataset.py @@ -103,7 +103,7 @@ class ManifestDataset(Dataset): min_output_len=min_output_len, max_output_input_ratio=max_output_input_ratio, min_output_input_ratio=min_output_input_ratio) - self._manifest.sort(key=lambda x: x["feat_shape"][0]) + self._manifest.sort(key=lambda x: x["input"][0]["shape"][0]) def __len__(self): return len(self._manifest) @@ -188,34 +188,16 @@ class AudioDataset(Dataset): if sort: data = sorted(data, key=lambda x: x["feat_shape"][0]) if raw_wav: - assert data[0]['feat'].split(':')[0].splitext()[-1] not in ('.ark', - '.scp') - data = map(lambda x: (float(x['feat_shape'][0]) * 1000 / stride_ms)) + path_suffix = data[0]['feat'].split(':')[0].splitext()[-1] + assert path_suffix not in ('.ark', '.scp') + # m second to n frame + data = list( + map(lambda x: (float(x['feat_shape'][0]) * 1000 / stride_ms), + data)) self.input_dim = data[0]['feat_shape'][1] self.output_dim = data[0]['token_shape'][1] - # with open(data_file, 'r') as f: - # for line in f: - # arr = line.strip().split('\t') - # if len(arr) != 7: - # continue - # key = arr[0].split(':')[1] - # tokenid = arr[5].split(':')[1] - # output_dim = int(arr[6].split(':')[1].split(',')[1]) - # if raw_wav: - # wav_path = ':'.join(arr[1].split(':')[1:]) - # duration = int(float(arr[2].split(':')[1]) * 1000 / 10) - # data.append((key, wav_path, duration, tokenid)) - # else: - # feat_ark = ':'.join(arr[1].split(':')[1:]) - # feat_info = arr[2].split(':')[1].split(',') - # feat_dim = int(feat_info[1].strip()) - # num_frames = int(feat_info[0].strip()) - # data.append((key, feat_ark, num_frames, tokenid)) - # self.input_dim = feat_dim - # self.output_dim = output_dim - valid_data = [] for i in range(len(data)): length = data[i]['feat_shape'][0] @@ -223,17 +205,17 @@ class AudioDataset(Dataset): # remove too lang or too short utt for both input and output # to prevent from out of memory if length > max_length or length < min_length: - # logging.warn('ignore utterance {} feature {}'.format( - # data[i][0], length)) pass elif token_length > token_max_length or token_length < token_min_length: pass else: valid_data.append(data[i]) + logger.info(f"raw dataset len: {len(data)}") data = valid_data + num_data = len(data) + logger.info(f"dataset len after filter: {num_data}") self.minibatch = [] - num_data = len(data) # Dynamic batch size if batch_type == 'dynamic': assert (max_frames_in_batch > 0) @@ -258,7 
+240,9 @@ class AudioDataset(Dataset): cur = end def __len__(self): + """number of example(batch)""" return len(self.minibatch) def __getitem__(self, idx): + """batch example of idx""" return self.minibatch[idx] diff --git a/paddlespeech/s2t/io/reader.py b/paddlespeech/s2t/io/reader.py index e810662d..38ff1396 100644 --- a/paddlespeech/s2t/io/reader.py +++ b/paddlespeech/s2t/io/reader.py @@ -18,8 +18,10 @@ import kaldiio import numpy as np import soundfile -from paddlespeech.s2t.frontend.augmentor.augmentation import AugmentationPipeline as Transformation +from .utility import feat_type +from paddlespeech.s2t.transform.transformation import Transformation from paddlespeech.s2t.utils.log import Log +# from paddlespeech.s2t.frontend.augmentor.augmentation import AugmentationPipeline as Transformation __all__ = ["LoadInputsAndTargets"] @@ -322,20 +324,7 @@ class LoadInputsAndTargets(): "Not supported: loader_type={}".format(filetype)) def file_type(self, filepath): - suffix = filepath.split(":")[0].split('.')[-1].lower() - if suffix == 'ark': - return 'mat' - elif suffix == 'scp': - return 'scp' - elif suffix == 'npy': - return 'npy' - elif suffix == 'npz': - return 'npz' - elif suffix in ['wav', 'flac']: - # PCM16 - return 'sound' - else: - raise ValueError(f"Not support filetype: {suffix}") + return feat_type(filepath) class SoundHDF5File(): diff --git a/paddlespeech/s2t/io/utility.py b/paddlespeech/s2t/io/utility.py index 392031ba..1a90e3d0 100644 --- a/paddlespeech/s2t/io/utility.py +++ b/paddlespeech/s2t/io/utility.py @@ -17,7 +17,7 @@ import numpy as np from paddlespeech.s2t.utils.log import Log -__all__ = ["pad_list", "pad_sequence"] +__all__ = ["pad_list", "pad_sequence", "feat_type"] logger = Log(__name__).getlog() @@ -85,3 +85,20 @@ def pad_sequence(sequences: List[np.ndarray], out_tensor[:length, i, ...] = tensor return out_tensor + + +def feat_type(filepath): + suffix = filepath.split(":")[0].split('.')[-1].lower() + if suffix == 'ark': + return 'mat' + elif suffix == 'scp': + return 'scp' + elif suffix == 'npy': + return 'npy' + elif suffix == 'npz': + return 'npz' + elif suffix in ['wav', 'flac']: + # PCM16 + return 'sound' + else: + raise ValueError(f"Not support filetype: {suffix}") diff --git a/paddlespeech/s2t/models/u2/u2.py b/paddlespeech/s2t/models/u2/u2.py index 9977cecc..4f833372 100644 --- a/paddlespeech/s2t/models/u2/u2.py +++ b/paddlespeech/s2t/models/u2/u2.py @@ -860,7 +860,7 @@ class U2Model(U2DecodeModel): int, nn.Layer, nn.Layer, nn.Layer: vocab size, encoder, decoder, ctc """ # cmvn - if 'cmvn_file' in configs and configs['cmvn_file'] is not None: + if 'cmvn_file' in configs and configs['cmvn_file']: mean, istd = load_cmvn(configs['cmvn_file'], configs['cmvn_file_type']) global_cmvn = GlobalCMVN( @@ -934,8 +934,8 @@ class U2Model(U2DecodeModel): DeepSpeech2Model: The model built from pretrained result. """ with UpdateConfig(config): - config.input_dim = dataloader.collate_fn.feature_size - config.output_dim = dataloader.collate_fn.vocab_size + config.input_dim = dataloader.feat_dim + config.output_dim = dataloader.vocab_size model = cls.from_config(config) diff --git a/paddlespeech/s2t/modules/attention.py b/paddlespeech/s2t/modules/attention.py index 80eaf975..3d5f8cd1 100644 --- a/paddlespeech/s2t/modules/attention.py +++ b/paddlespeech/s2t/modules/attention.py @@ -1,4 +1,5 @@ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2019 Mobvoi Inc. All Rights Reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/paddlespeech/s2t/modules/cmvn.py b/paddlespeech/s2t/modules/cmvn.py index 6e97f824..67f71b66 100644 --- a/paddlespeech/s2t/modules/cmvn.py +++ b/paddlespeech/s2t/modules/cmvn.py @@ -1,4 +1,5 @@ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2019 Mobvoi Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/paddlespeech/s2t/modules/conformer_convolution.py b/paddlespeech/s2t/modules/conformer_convolution.py index 7601a5cc..7ec92554 100644 --- a/paddlespeech/s2t/modules/conformer_convolution.py +++ b/paddlespeech/s2t/modules/conformer_convolution.py @@ -1,4 +1,5 @@ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2019 Mobvoi Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/paddlespeech/s2t/modules/decoder.py b/paddlespeech/s2t/modules/decoder.py index b0ab869a..6b4d9591 100644 --- a/paddlespeech/s2t/modules/decoder.py +++ b/paddlespeech/s2t/modules/decoder.py @@ -1,4 +1,5 @@ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2019 Mobvoi Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/paddlespeech/s2t/modules/decoder_layer.py b/paddlespeech/s2t/modules/decoder_layer.py index 4d516068..520b18de 100644 --- a/paddlespeech/s2t/modules/decoder_layer.py +++ b/paddlespeech/s2t/modules/decoder_layer.py @@ -1,4 +1,5 @@ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2019 Mobvoi Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/paddlespeech/s2t/modules/embedding.py b/paddlespeech/s2t/modules/embedding.py index 9207658f..5d4e9175 100644 --- a/paddlespeech/s2t/modules/embedding.py +++ b/paddlespeech/s2t/modules/embedding.py @@ -1,4 +1,5 @@ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2019 Mobvoi Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/paddlespeech/s2t/modules/encoder.py b/paddlespeech/s2t/modules/encoder.py index 0cde5b9f..5c8ba081 100644 --- a/paddlespeech/s2t/modules/encoder.py +++ b/paddlespeech/s2t/modules/encoder.py @@ -1,4 +1,5 @@ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2019 Mobvoi Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/paddlespeech/s2t/modules/encoder_layer.py b/paddlespeech/s2t/modules/encoder_layer.py index 29d5a2d8..d39c0695 100644 --- a/paddlespeech/s2t/modules/encoder_layer.py +++ b/paddlespeech/s2t/modules/encoder_layer.py @@ -1,4 +1,5 @@ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2019 Mobvoi Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
diff --git a/paddlespeech/s2t/modules/loss.py b/paddlespeech/s2t/modules/loss.py index 5750f5a0..c7d9bd45 100644 --- a/paddlespeech/s2t/modules/loss.py +++ b/paddlespeech/s2t/modules/loss.py @@ -1,4 +1,5 @@ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2019 Mobvoi Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/paddlespeech/s2t/modules/mask.py b/paddlespeech/s2t/modules/mask.py index 6576cb92..d6b63761 100644 --- a/paddlespeech/s2t/modules/mask.py +++ b/paddlespeech/s2t/modules/mask.py @@ -1,4 +1,5 @@ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2019 Mobvoi Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/paddlespeech/s2t/modules/positionwise_feed_forward.py b/paddlespeech/s2t/modules/positionwise_feed_forward.py index 347264e9..e2619cd4 100644 --- a/paddlespeech/s2t/modules/positionwise_feed_forward.py +++ b/paddlespeech/s2t/modules/positionwise_feed_forward.py @@ -1,4 +1,5 @@ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2019 Mobvoi Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/paddlespeech/s2t/modules/subsampling.py b/paddlespeech/s2t/modules/subsampling.py index 759bd540..99a8300f 100644 --- a/paddlespeech/s2t/modules/subsampling.py +++ b/paddlespeech/s2t/modules/subsampling.py @@ -1,4 +1,5 @@ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2019 Mobvoi Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/paddlespeech/s2t/transform/cmvn.py b/paddlespeech/s2t/transform/cmvn.py index 4d2d2324..aa1e6b44 100644 --- a/paddlespeech/s2t/transform/cmvn.py +++ b/paddlespeech/s2t/transform/cmvn.py @@ -13,6 +13,7 @@ # limitations under the License. 
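The `GlobalCMVN` transform added to `transform/cmvn.py` below reads the JSON statistics written by `utils/compute_mean_std.py` (`frame_num`, `mean_stat`, `var_stat`) and turns them into a per-dimension mean and a floored standard deviation. A small sketch of that arithmetic with made-up numbers:

```python
import numpy as np

cmvn_stats = {"frame_num": 3, "mean_stat": [3.0, 6.0], "var_stat": [5.0, 14.0]}

count = cmvn_stats["frame_num"]
mean = np.array(cmvn_stats["mean_stat"]) / count           # E[x]
var = np.array(cmvn_stats["var_stat"]) / count - mean**2   # E[x^2] - E[x]^2
std = np.maximum(np.sqrt(var), 1.0e-20)                    # std_floor as in the class

feats = np.random.randn(100, 2)        # (time, dim)
normalized = (feats - mean) / std      # what __call__ applies per frame
```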
# Modified from espnet(https://github.com/espnet/espnet) import io +import json import h5py import kaldiio @@ -157,3 +158,40 @@ class UtteranceCMVN(): x = np.divide(x, std) return x + + +class GlobalCMVN(): + "Apply Global CMVN" + + def __init__(self, + cmvn_path, + norm_means=True, + norm_vars=True, + std_floor=1.0e-20): + self.cmvn_path = cmvn_path + self.norm_means = norm_means + self.norm_vars = norm_vars + self.std_floor = std_floor + + with open(cmvn_path) as f: + cmvn_stats = json.load(f) + self.count = cmvn_stats['frame_num'] + self.mean = np.array(cmvn_stats['mean_stat']) / self.count + self.square_sums = np.array(cmvn_stats['var_stat']) + self.var = self.square_sums / self.count - self.mean**2 + self.std = np.maximum(np.sqrt(self.var), self.std_floor) + + def __repr__(self): + return f"""{self.__class__.__name__}( + cmvn_path={self.cmvn_path}, + norm_means={self.norm_means}, + norm_vars={self.norm_vars},)""" + + def __call__(self, x, uttid=None): + # x: [Time, Dim] + if self.norm_means: + x = np.subtract(x, self.mean) + + if self.norm_vars: + x = np.divide(x, self.std) + return x diff --git a/paddlespeech/s2t/transform/perturb.py b/paddlespeech/s2t/transform/perturb.py index 153d494b..873adb0b 100644 --- a/paddlespeech/s2t/transform/perturb.py +++ b/paddlespeech/s2t/transform/perturb.py @@ -16,6 +16,7 @@ import librosa import numpy import scipy import soundfile +import soxbindings as sox from paddlespeech.s2t.io.reader import SoundHDF5File @@ -82,7 +83,6 @@ class SpeedPerturbation(): def __call__(self, x, uttid=None, train=True): if not train: return x - x = x.astype(numpy.float32) if self.accept_uttid: ratio = self.utt2ratio[uttid] @@ -108,6 +108,110 @@ class SpeedPerturbation(): return y +class SpeedPerturbationSox(): + """SpeedPerturbationSox + + The speed perturbation in kaldi uses sox-speed instead of sox-tempo, + and sox-speed just to resample the input, + i.e pitch and tempo are changed both. + + To speed up or slow down the sound of a file, + use speed to modify the pitch and the duration of the file. + This raises the speed and reduces the time. + The default factor is 1.0 which makes no change to the audio. + 2.0 doubles speed, thus time length is cut by a half and pitch is one interval higher. + + "Why use speed option instead of tempo -s in SoX for speed perturbation" + https://groups.google.com/forum/#!topic/kaldi-help/8OOG7eE4sZ8 + + tempo option: + sox -t wav input.wav -t wav output.tempo0.9.wav tempo -s 0.9 + + speed option: + sox -t wav input.wav -t wav output.speed0.9.wav speed 0.9 + + If we use speed option like above, the pitch of audio also will be changed, + but the tempo option does not change the pitch. 
+ """ + + def __init__( + self, + lower=0.9, + upper=1.1, + utt2ratio=None, + keep_length=True, + sr=16000, + seed=None, ): + self.sr = sr + self.keep_length = keep_length + self.state = numpy.random.RandomState(seed) + + if utt2ratio is not None: + self.utt2ratio = {} + # Use the scheduled ratio for each utterances + self.utt2ratio_file = utt2ratio + self.lower = None + self.upper = None + self.accept_uttid = True + + with open(utt2ratio, "r") as f: + for line in f: + utt, ratio = line.rstrip().split(None, 1) + ratio = float(ratio) + self.utt2ratio[utt] = ratio + else: + self.utt2ratio = None + # The ratio is given on runtime randomly + self.lower = lower + self.upper = upper + + def __repr__(self): + if self.utt2ratio is None: + return f"""{self.__class__.__name__}( + lower={self.lower}, + upper={self.upper}, + keep_length={self.keep_length}, + sample_rate={self.sr})""" + + else: + return f"""{self.__class__.__name__}( + utt2ratio={self.utt2ratio_file}, + sample_rate={self.sr})""" + + def __call__(self, x, uttid=None, train=True): + if not train: + return x + + x = x.astype(numpy.float32) + if self.accept_uttid: + ratio = self.utt2ratio[uttid] + else: + ratio = self.state.uniform(self.lower, self.upper) + + tfm = sox.Transformer() + tfm.set_globals(multithread=False) + tfm.speed(ratio) + y = tfm.build_array(input_array=x, sample_rate_in=self.sr) + + if self.keep_length: + diff = abs(len(x) - len(y)) + if len(y) > len(x): + # Truncate noise + y = y[diff // 2:-((diff + 1) // 2)] + elif len(y) < len(x): + # Assume the time-axis is the first: (Time, Channel) + pad_width = [(diff // 2, (diff + 1) // 2)] + [ + (0, 0) for _ in range(y.ndim - 1) + ] + y = numpy.pad( + y, pad_width=pad_width, constant_values=0, mode="constant") + + if y.ndim == 2 and x.ndim == 1: + # (T, C) -> (T) + y = y.sequence(1) + return y + + class BandpassPerturbation(): """BandpassPerturbation diff --git a/paddlespeech/s2t/transform/spec_augment.py b/paddlespeech/s2t/transform/spec_augment.py index 83e4e2e7..5ce95085 100644 --- a/paddlespeech/s2t/transform/spec_augment.py +++ b/paddlespeech/s2t/transform/spec_augment.py @@ -34,6 +34,9 @@ def time_warp(x, max_time_warp=80, inplace=False, mode="PIL"): :returns numpy.ndarray: time warped spectrogram (time, freq) """ window = max_time_warp + if window == 0: + return x + if mode == "PIL": t = x.shape[0] if t - window <= window: diff --git a/paddlespeech/s2t/transform/spectrogram.py b/paddlespeech/s2t/transform/spectrogram.py index df3130da..da91ef92 100644 --- a/paddlespeech/s2t/transform/spectrogram.py +++ b/paddlespeech/s2t/transform/spectrogram.py @@ -14,6 +14,7 @@ # Modified from espnet(https://github.com/espnet/espnet) import librosa import numpy as np +from python_speech_features import logfbank def stft(x, @@ -304,3 +305,94 @@ class IStft(): win_length=self.win_length, window=self.window, center=self.center, ) + + +class LogMelSpectrogramKaldi(): + def __init__( + self, + fs=16000, + n_mels=80, + n_fft=512, # fft point + n_shift=160, # unit:sample, 10ms + win_length=400, # unit:sample, 25ms + window="povey", + fmin=20, + fmax=None, + eps=1e-10, + dither=False): + self.fs = fs + self.n_mels = n_mels + self.n_fft = n_fft + if n_shift > win_length: + raise ValueError("Stride size must not be greater than " + "window size.") + self.n_shift = n_shift / fs # unit: ms + self.win_length = win_length / fs # unit: ms + + self.window = window + self.fmin = fmin + if fmax is None: + fmax_ = fmax if fmax else self.fs / 2 + elif fmax > int(self.fs / 2): + raise ValueError("fmax must 
not be greater than half of " + "sample rate.") + self.fmax = fmax_ + + self.eps = eps + self.remove_dc_offset = True + self.preemph = 0.97 + self.dither = dither + + def __repr__(self): + return ( + "{name}(fs={fs}, n_mels={n_mels}, n_fft={n_fft}, " + "n_shift={n_shift}, win_length={win_length}, preemph={preemph}, window={window}, " + "fmin={fmin}, fmax={fmax}, eps={eps}, dither={dither}))".format( + name=self.__class__.__name__, + fs=self.fs, + n_mels=self.n_mels, + n_fft=self.n_fft, + n_shift=self.n_shift, + preemph=self.preemph, + win_length=self.win_length, + window=self.window, + fmin=self.fmin, + fmax=self.fmax, + eps=self.eps, + dither=self.dither, )) + + def __call__(self, x): + """ + + Args: + x (np.ndarray): shape (Ti,) + + Raises: + ValueError: not support (Ti, C) + + Returns: + np.ndarray: (T, D) + """ + if x.ndim != 1: + raise ValueError("Not support x: [Time, Channel]") + + if x.dtype in np.sctypes['float']: + # PCM32 -> PCM16 + bits = np.iinfo(np.int16).bits + x = x * 2**(bits - 1) + + # logfbank need PCM16 input + y = logfbank( + signal=x, + samplerate=self.fs, + winlen=self.win_length, # unit ms + winstep=self.n_shift, # unit ms + nfilt=self.n_mels, + nfft=self.n_fft, + lowfreq=self.fmin, + highfreq=self.fmax, + dither=self.dither, + remove_dc_offset=self.remove_dc_offset, + preemph=self.preemph, + wintype=self.window) + return y diff --git a/paddlespeech/s2t/transform/transformation.py b/paddlespeech/s2t/transform/transformation.py index 1aee4b36..381b0cdc 100644 --- a/paddlespeech/s2t/transform/transformation.py +++ b/paddlespeech/s2t/transform/transformation.py @@ -45,7 +45,8 @@ import_alias = dict( stft2fbank="paddlespeech.s2t.transform.spectrogram:Stft2LogMelSpectrogram", wpe="paddlespeech.s2t.transform.wpe:WPE", channel_selector="paddlespeech.s2t.transform.channel_selector:ChannelSelector", -) + fbank_kaldi="paddlespeech.s2t.transform.spectrogram:LogMelSpectrogramKaldi", + cmvn_json="paddlespeech.s2t.transform.cmvn:GlobalCMVN") class Transformation(): diff --git a/tests/chains/speedyspeech/prepare.sh b/tests/chains/speedyspeech/prepare.sh index fb6ef285..1ddcd677 100755 --- a/tests/chains/speedyspeech/prepare.sh +++ b/tests/chains/speedyspeech/prepare.sh @@ -32,7 +32,7 @@ trainer_list=$(func_parser_value "${lines[14]}") # MODE be one of ['lite_train_infer' 'whole_infer' 'whole_train_infer'] if [ ${MODE} = "lite_train_infer" ];then # pretrain lite train data - wget -nc -P ./pretrain_models/ https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip + wget -nc -P ./pretrain_models/ https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_ckpt_0.4.zip (cd ./pretrain_models && unzip pwg_baker_ckpt_0.4.zip) # download data rm -rf ./train_data/mini_BZNSYP @@ -40,7 +40,7 @@ if [ ${MODE} = "lite_train_infer" ];then cd ./train_data/ && tar xzf mini_BZNSYP.tar.gz cd ../ elif [ ${MODE} = "whole_train_infer" ];then - wget -nc -P ./pretrain_models/ https://paddlespeech.bj.bcebos.com/Parakeet/speedyspeech_nosil_baker_ckpt_0.5.zip + wget -nc -P ./pretrain_models/ https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_nosil_baker_ckpt_0.5.zip wget -nc -P ./pretrain_models/ https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip (cd ./pretrain_models && unzip speedyspeech_nosil_baker_ckpt_0.5.zip && unzip pwg_baker_ckpt_0.4.zip) rm -rf ./train_data/processed_BZNSYP diff --git a/utils/compute_mean_std.py b/utils/compute_mean_std.py index 296d272a..e47554dc 100755 --- a/utils/compute_mean_std.py +++ 
@@ -33,8 +33,8 @@ add_arg('spectrum_type', str,
         choices=['linear', 'mfcc', 'fbank'])
 add_arg('feat_dim', int, 13, "Audio feature dim.")
 add_arg('delta_delta', bool, False, "Audio feature with delta delta.")
-add_arg('stride_ms', float, 10.0, "stride length in ms.")
-add_arg('window_ms', float, 20.0, "stride length in ms.")
+add_arg('stride_ms', int, 10, "stride length in ms.")
+add_arg('window_ms', int, 20, "window length in ms.")
 add_arg('sample_rate', int, 16000, "target sample rate.")
 add_arg('use_dB_normalization', bool, True, "do dB normalization.")
 add_arg('target_dB', int, -20, "target dB.")
@@ -61,8 +61,8 @@ def main():
         spectrum_type=args.spectrum_type,
         feat_dim=args.feat_dim,
         delta_delta=args.delta_delta,
-        stride_ms=args.stride_ms,
-        window_ms=args.window_ms,
+        stride_ms=float(args.stride_ms),
+        window_ms=float(args.window_ms),
         n_fft=None,
         max_freq=None,
         target_sample_rate=args.sample_rate,
diff --git a/utils/format_data.py b/utils/format_data.py
index 6fe36997..2fa1924a 100755
--- a/utils/format_data.py
+++ b/utils/format_data.py
@@ -20,13 +20,13 @@ import json
 from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
 from paddlespeech.s2t.frontend.utility import load_cmvn
 from paddlespeech.s2t.frontend.utility import read_manifest
+from paddlespeech.s2t.io.utility import feat_type
 from paddlespeech.s2t.utils.utility import add_arguments
 from paddlespeech.s2t.utils.utility import print_arguments

 parser = argparse.ArgumentParser(description=__doc__)
 add_arg = functools.partial(add_arguments, argparser=parser)
 # yapf: disable
-add_arg('feat_type', str, "raw", "speech feature type, e.g. raw(wav, flac), mat(ark), scp")
 add_arg('cmvn_path', str,
         'examples/librispeech/data/mean_std.json',
         "Filepath of cmvn.")
@@ -62,27 +62,76 @@ def main():
     vocab_size = text_feature.vocab_size
     print(f"Vocab size: {vocab_size}")

+    # each output line is a JSON object (jsonlines format) like this:
+    # {
+    #     "input": [{"name": "input1", "shape": (100, 83), "feat": "xxx.ark:123"}],
+    #     "output": [{"name": "target1", "shape": (40, 5002), "text": "a b c de"}],
+    #     "utt2spk": "111-2222",
+    #     "utt": "111-2222-333"
+    # }
     count = 0
     for manifest_path in args.manifest_paths:
         manifest_jsons = read_manifest(manifest_path)
         for line_json in manifest_jsons:
+            output_json = {
+                "input": [],
+                "output": [],
+                'utt': line_json['utt'],
+                'utt2spk': line_json.get('utt2spk', 'global'),
+            }
+
+            # output
             line = line_json['text']
-            tokens = text_feature.tokenize(line)
-            tokenids = text_feature.featurize(line)
-            line_json['token'] = tokens
-            line_json['token_id'] = tokenids
-            line_json['token_shape'] = (len(tokenids), vocab_size)
-            feat_shape = line_json['feat_shape']
-            assert isinstance(feat_shape, (list, tuple)), type(feat_shape)
-            if args.feat_type == 'raw':
-                feat_shape.append(feat_dim)
-                line_json['filetype'] = 'sound'
-            else:  # kaldi
-                raise NotImplementedError('no support kaldi feat now!')
-            fout.write(json.dumps(line_json) + '\n')
+            if isinstance(line, str):
+                # only one target
+                tokens = text_feature.tokenize(line)
+                tokenids = text_feature.featurize(line)
+                output_json['output'].append({
+                    'name': 'target1',
+                    'shape': (len(tokenids), vocab_size),
+                    'text': line,
+                    'token': ' '.join(tokens),
+                    'tokenid': ' '.join(map(str, tokenids)),
+                })
+            else:
+                # isinstance(line, list), multiple targets in one vocab
+                for i, item in enumerate(line, 1):
+                    tokens = text_feature.tokenize(item)
+                    tokenids = text_feature.featurize(item)
+                    output_json['output'].append({
+                        'name': f'target{i}',
+                        'shape': (len(tokenids), vocab_size),
+                        'text': item,
+                        'token': ' '.join(tokens),
+                        'tokenid': ' '.join(map(str, tokenids)),
+                    })
+
+            # input
+            line = line_json['feat']
+            if isinstance(line, str):
+                # only one input
+                feat_shape = line_json['feat_shape']
+                assert isinstance(feat_shape, (list, tuple)), type(feat_shape)
+                filetype = feat_type(line)
+                if filetype == 'sound':
+                    feat_shape.append(feat_dim)
+                else:  # kaldi
+                    raise NotImplementedError('no support kaldi feat now!')
+
+                output_json['input'].append({
+                    "name": "input1",
+                    "shape": feat_shape,
+                    "feat": line,
+                    "filetype": filetype,
+                })
+            else:
+                # isinstance(line, list), multiple inputs
+                raise NotImplementedError("not support multi input now!")
+
+            fout.write(json.dumps(output_json) + '\n')
             count += 1
-    print(f"Examples number: {count}")
+    print(f"{args.manifest_paths} Examples number: {count}")
     fout.close()
diff --git a/utils/format_triplet_data.py b/utils/format_triplet_data.py
index 79b3d2cb..e0b5ece3 100755
--- a/utils/format_triplet_data.py
+++ b/utils/format_triplet_data.py
@@ -20,13 +20,13 @@ import json
 from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
 from paddlespeech.s2t.frontend.utility import load_cmvn
 from paddlespeech.s2t.frontend.utility import read_manifest
+from paddlespeech.s2t.io.utility import feat_type
 from paddlespeech.s2t.utils.utility import add_arguments
 from paddlespeech.s2t.utils.utility import print_arguments

 parser = argparse.ArgumentParser(description=__doc__)
 add_arg = functools.partial(add_arguments, argparser=parser)
 # yapf: disable
-add_arg('feat_type', str, "raw", "speech feature type, e.g. raw(wav, flac), kaldi")
 add_arg('cmvn_path', str,
         'examples/librispeech/data/mean_std.json',
         "Filepath of cmvn.")
@@ -79,9 +79,11 @@ def main():
             line_json['token1'] = tokens
             line_json['token_id1'] = tokenids
             line_json['token_shape1'] = (len(tokenids), vocab_size)
+
             feat_shape = line_json['feat_shape']
             assert isinstance(feat_shape, (list, tuple)), type(feat_shape)
-            if args.feat_type == 'raw':
+            filetype = feat_type(line_json['feat'])
+            if filetype == 'sound':
                 feat_shape.append(feat_dim)
             else:  # kaldi
                 raise NotImplementedError('no support kaldi feat now!')
diff --git a/utils/pack_model.sh b/utils/pack_model.sh
new file mode 100755
index 00000000..8acd59a6
--- /dev/null
+++ b/utils/pack_model.sh
@@ -0,0 +1,164 @@
+#!/usr/bin/env bash
+
+# Copyright 2019 Johns Hopkins University (Shinji Watanabe)
+# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
+
+[ -f ./path.sh ] && . ./path.sh
+
+results=""
+# e.g., "exp/tr_it_pytorch_train/decode_dt_it_decode/result.wrd.txt
+#        exp/tr_it_pytorch_train/decode_et_it_decode/result.wrd.txt"
+lm=""
+dict=""
+etc=""
+outfile="model"
+preprocess_conf=""
+
+help_message=$(cat <<EOF
+Usage: $0 --lm <lm> --dict <dict> <tr_conf> <dec_conf> <cmvn> <e2e>, for example:
+<lm>: exp/train_rnnlm/rnnlm.model.best
+<dict>: data/lang_char
+<tr_conf>: conf/train.yaml
+<dec_conf>: conf/decode.yaml
+<cmvn>: data/tr_it/cmvn.ark
+<e2e>: exp/tr_it_pytorch_train/results/model.last10.avg.best
+EOF
+)
+
+. utils/parse_options.sh
+
+if [ $# != 4 ]; then
+    echo "${help_message}"
+    exit 1
+fi
+
+tr_conf=$1
+dec_conf=$2
+cmvn=$3
+e2e=$4
+
+echo " - Model files (archived to ${outfile}.tar.gz by \`\$ pack_model.sh\`)"
+echo " - model link: (put the model link manually.)"
+
+# configs
+if [ -e ${tr_conf} ]; then
+    tar cfh ${outfile}.tar ${tr_conf}
+    echo -n " - training config file: \`"
+    echo ${tr_conf} | sed -e "s/$/\`/"
+else
+    echo "missing ${tr_conf}"
+    exit 1
+fi
+if [ -e ${dec_conf} ]; then
+    tar rfh ${outfile}.tar ${dec_conf}
+    echo -n " - decoding config file: \`"
+    echo ${dec_conf} | sed -e "s/$/\`/"
+else
+    echo "missing ${dec_conf}"
+    exit 1
+fi
+# NOTE(kan-bayashi): preprocess conf is optional
+if [ -n "${preprocess_conf}" ]; then
+    tar rfh ${outfile}.tar ${preprocess_conf}
+    echo -n " - preprocess config file: \`"
+    echo ${preprocess_conf} | sed -e "s/$/\`/"
+fi
+
+# cmvn
+if [ -e ${cmvn} ]; then
+    tar rfh ${outfile}.tar ${cmvn}
+    echo -n " - cmvn file: \`"
+    echo ${cmvn} | sed -e "s/$/\`/"
+else
+    echo "missing ${cmvn}"
+    exit 1
+fi
+
+# e2e
+if [ -e ${e2e} ]; then
+    tar rfh ${outfile}.tar ${e2e}
+    echo -n " - e2e file: \`"
+    echo ${e2e} | sed -e "s/$/\`/"
+
+    e2e_conf=$(dirname ${e2e})/model.json
+    if [ ! -e ${e2e_conf} ]; then
+        echo missing ${e2e_conf}
+        #exit 1
+    else
+        echo -n " - e2e JSON file: \`"
+        echo ${e2e_conf} | sed -e "s/$/\`/"
+        tar rfh ${outfile}.tar ${e2e_conf}
+    fi
+else
+    echo "missing ${e2e}"
+    exit 1
+fi
+
+# lm
+if [ -n "${lm}" ]; then
+    if [ -e ${lm} ]; then
+        tar rfh ${outfile}.tar ${lm}
+        echo -n " - lm file: \`"
+        echo ${lm} | sed -e "s/$/\`/"
+
+        lm_conf=$(dirname ${lm})/model.json
+        if [ ! -e ${lm_conf} ]; then
+            echo missing ${lm_conf}
+            exit 1
+        else
+            echo -n " - lm JSON file: \`"
+            echo ${lm_conf} | sed -e "s/$/\`/"
+            tar rfh ${outfile}.tar ${lm_conf}
+        fi
+    else
+        echo "missing ${lm}"
+        exit 1
+    fi
+fi
+
+# dict
+if [ -n "${dict}" ]; then
+    if [ -e ${dict} ]; then
+        tar rfh ${outfile}.tar ${dict}
+        echo -n " - dict file: \`"
+        echo ${dict} | sed -e "s/$/\`/"
+    else
+        echo "missing ${dict}"
+        exit 1
+    fi
+fi
+
+# etc
+for x in ${etc}; do
+    if [ -e ${x} ]; then
+        tar rfh ${outfile}.tar ${x}
+        echo -n " - etc file: \`"
+        echo ${x} | sed -e "s/$/\`/"
+    else
+        echo "missing ${x}"
+        exit 1
+    fi
+done
+
+# finally compress the tar file
+gzip -f ${outfile}.tar
+
+# results
+if [ -n "${results}" ]; then
+    echo " - Results (paste them by yourself or obtained by \`\$ pack_model.sh --results <results>\`)"
+    echo "\`\`\`"
+fi
+for x in ${results}; do
+    if [ -e ${x} ]; then
+        echo "${x}"
+        grep -e Avg -e SPKR -m 2 ${x}
+    else
+        echo "missing ${x}"
+        exit 1
+    fi
done
+if [ -n "${results}" ]; then
+    echo "\`\`\`"
+fi
+
+exit 0
diff --git a/utils/remove_longshortdata.py b/utils/remove_longshortdata.py
new file mode 100755
index 00000000..131b4a58
--- /dev/null
+++ b/utils/remove_longshortdata.py
@@ -0,0 +1,106 @@
+#!/usr/bin/env python3
+"""Remove utterances that are too long or too short from a formatted manifest."""
+import argparse
+import logging
+
+import jsonlines
+
+from paddlespeech.s2t.utils.cli_utils import get_commandline_args
+
+# manifest after format: one JSON object (jsonlines format) per line, like this
+# {
+#     "input": [{"name": "input1", "shape": (100, 83), "feat": "xxx.ark:123"}],
+#     "output": [{"name": "target1", "shape": (40, 5002), "text": "a b c de"}],
+#     "utt2spk": "111-2222",
+#     "utt": "111-2222-333"
+# }
+
+
+def get_parser():
+    parser = argparse.ArgumentParser(
+        description="remove too long or too short utterances from a formatted manifest",
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter, )
+    parser.add_argument(
+        "--verbose", "-V", default=0, type=int, help="Verbose option")
+    parser.add_argument(
+        "--iaxis",
+        default=0,
+        type=int,
+        help="multi inputs index, 0 is the first")
+    parser.add_argument(
+        "--oaxis",
+        default=0,
+        type=int,
+        help="multi outputs index, 0 is the first")
+    parser.add_argument("--maxframes", default=2000, type=int, help="max frames")
+    parser.add_argument("--minframes", default=10, type=int, help="min frames")
+    parser.add_argument("--maxchars", default=200, type=int, help="max characters in transcript")
+    parser.add_argument("--minchars", default=0, type=int, help="min characters in transcript")
+    parser.add_argument(
+        "--stride_ms", default=10, type=int, help="stride in ms unit.")
+    parser.add_argument(
+        "rspecifier",
+        type=str,
+        help="jsonl format manifest. e.g. manifest.jsonl")
+    parser.add_argument(
+        "wspecifier_or_wxfilename",
+        type=str,
+        help="Write specifier. e.g. manifest.jsonl")
+    return parser
+
+
+def filter_input(args, line):
+    tmp = line['input'][args.iaxis]
+    if args.sound:
+        # convert duration in seconds to number of frames
+        nframe = tmp['shape'][0] * 1000 / args.stride_ms
+    else:
+        nframe = tmp['shape'][0]
+
+    if nframe < args.minframes or nframe > args.maxframes:
+        return True
+    else:
+        return False
+
+
+def filter_output(args, line):
+    nchars = len(line['output'][args.oaxis]['text'])
+    if nchars < args.minchars or nchars > args.maxchars:
+        return True
+    else:
+        return False
+
+
+def main():
+    args = get_parser().parse_args()
+
+    logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
+    if args.verbose > 0:
+        logging.basicConfig(level=logging.INFO, format=logfmt)
+    else:
+        logging.basicConfig(level=logging.WARN, format=logfmt)
+    logging.info(get_commandline_args())
+
+    with jsonlines.open(args.rspecifier, 'r') as reader:
+        lines = list(reader)
+    logging.info(f"Examples: {len(lines)}")
+    feat = lines[0]['input'][args.iaxis]['feat']
+    args.sound = False
+    if feat.split('.')[-1].split(':')[0] not in ('ark', 'scp'):
+        args.sound = True  # raw sound input: shape[0] is the duration in seconds
+
+    count = 0
+    filtered = 0
+    with jsonlines.open(args.wspecifier_or_wxfilename, 'w') as writer:
+        for line in lines:
+            if filter_input(args, line) or filter_output(args, line):
+                filtered += 1
+                continue
+            writer.write(line)
+            count += 1
+    logging.info(f"Examples after filter: {count}/{filtered}")
+
+
+if __name__ == '__main__':
+    main()
diff --git a/utils/show_results.sh b/utils/show_results.sh
new file mode 100755
index 00000000..42f80ee6
--- /dev/null
+++ b/utils/show_results.sh
@@ -0,0 +1,74 @@
+#!/usr/bin/env bash
+mindepth=0
+maxdepth=1
+
+. utils/parse_options.sh
+
+if [ $# -gt 1 ]; then
+    echo "Usage: $0 --mindepth 0 --maxdepth 1 [exp]" 1>&2
+    echo ""
+    echo "Show the system environments and the evaluation results in Markdown format."
+    echo 'The default of <exp> is "exp/".'
+    exit 1
+fi
+
+[ -f ./path.sh ] && . ./path.sh
+set -euo pipefail
+if [ $# -eq 1 ]; then
+    exp=$1
+else
+    exp=exp
+fi
+
+
+cat << EOF
+
+# RESULTS
+## Environments
+- date: \`$(LC_ALL=C date)\`
+EOF
+
+python3 << EOF
+import sys, paddle
+pyversion = sys.version.replace('\n', ' ')
+
+print(f"""- python version: \`{pyversion}\`
+- paddle version: \`paddle {paddle.__version__}\`""")
+EOF
+
+cat << EOF
+- Git hash: \`$(git rev-parse HEAD)\`
+  - Commit date: \`$(git log -1 --format='%cd')\`
+
+EOF
+
+while IFS= read -r expdir; do
+    if ls ${expdir}/decode_*/result.txt &> /dev/null; then
+        # 1. Show the result table for CER
+        cat << EOF
+## $(basename ${expdir})
+### CER
+
+|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err|
+|---|---|---|---|---|---|---|---|---|
+EOF
+        grep -e Avg ${expdir}/decode_*/result.txt \
+            | sed -e "s#${expdir}/\([^/]*\)/result.txt:#|\1#g" \
+            | sed -e 's#Sum/Avg##g' | tr '|' ' ' | tr -s ' ' '|'
+        echo
+
+        # 2. Show the result table for WER
+        if ls ${expdir}/decode_*/result.wrd.txt &> /dev/null; then
+            cat << EOF
+### WER
+
+|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err|
+|---|---|---|---|---|---|---|---|---|
+EOF
+            grep -e Avg ${expdir}/decode_*/result.wrd.txt \
+                | sed -e "s#${expdir}/\([^/]*\)/result.wrd.txt:#|\1#g" \
+                | sed -e 's#Sum/Avg##g' | tr '|' ' ' | tr -s ' ' '|'
+            echo
+        fi
+    fi
+done < <(find ${exp} -mindepth ${mindepth} -maxdepth ${maxdepth} -type d)
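For reference, a minimal usage sketch of the three utilities added above; the manifest, config, and checkpoint paths (`data/manifest.train.jsonl`, `data/mean_std.json`, `exp/train/checkpoints/avg_30.pdparams`, etc.) are illustrative placeholders, not files created by this patch:

```shell
# drop utterances that are too long or too short from a formatted jsonline manifest
python3 utils/remove_longshortdata.py \
    --maxframes 2000 --minframes 10 \
    --maxchars 200 --minchars 0 \
    data/manifest.train.jsonl data/manifest.train.filtered.jsonl

# pack a trained model together with its configs and cmvn stats into model.tar.gz
./utils/pack_model.sh --dict data/lang_char \
    conf/train.yaml conf/decode.yaml data/mean_std.json exp/train/checkpoints/avg_30.pdparams

# render decoding results under exp/ as a Markdown report
./utils/show_results.sh exp > RESULTS.md
```

`show_results.sh` writes the Markdown report to stdout, so it can be redirected straight into a `RESULTS.md` file.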