diff --git a/deepspeech/frontend/featurizer/text_featurizer.py b/deepspeech/frontend/featurizer/text_featurizer.py
index ac129b0f7..34220432b 100644
--- a/deepspeech/frontend/featurizer/text_featurizer.py
+++ b/deepspeech/frontend/featurizer/text_featurizer.py
@@ -140,7 +140,7 @@ class TextFeaturizer():
         Returns:
            str: text string.
         """
-        tokens = tokens.replace(SPACE, " ")
+        tokens = [t.replace(SPACE, " ") for t in tokens ]
         return "".join(tokens)
 
     def word_tokenize(self, text):
diff --git a/examples/aishell/README.md b/examples/aishell/README.md
index c2534b9e3..5e5c5ca90 100644
--- a/examples/aishell/README.md
+++ b/examples/aishell/README.md
@@ -1,4 +1,11 @@
 # ASR
 
-* s0 for deepspeech2 offline
-* s1 for u2
+* s0 for deepspeech2
+* s1 for u2/transformer/conformer
+
+## Data
+| Data Subset | Duration in Seconds |
+| --- | --- |
+| data/manifest.train |  1.23 ~ 14.53125 |
+| data/manifest.dev  | 1.645 ~ 12.533 |  
+| data/manifest.test | 1.859125 ~ 14.6999375 |
diff --git a/examples/aishell/s0/README.md b/examples/aishell/s0/README.md
index ee0f1405e..a4617c3b4 100644
--- a/examples/aishell/s0/README.md
+++ b/examples/aishell/s0/README.md
@@ -1,11 +1,5 @@
 # Aishell-1
 
-## Data
-| Data Subset | Duration in Seconds |
-| data/manifest.train |  1.23 ~ 14.53125 |
-| data/manifest.dev  | 1.645 ~ 12.533 |  
-| data/manifest.test | 1.859125 ~ 14.6999375 |
-
 ## Deepspeech2
 
 | Model | Params | Release | Config | Test set | Loss | CER |  
diff --git a/examples/aishell/s0/local/data.sh b/examples/aishell/s0/local/data.sh
index 1312a12fc..f4fccbe6e 100755
--- a/examples/aishell/s0/local/data.sh
+++ b/examples/aishell/s0/local/data.sh
@@ -26,22 +26,6 @@ if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
 fi
 
 if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
-    # download data, generate manifests
-    # build vocabulary
-    python3 ${MAIN_ROOT}/utils/build_vocab.py \
-    --unit_type="char" \
-    --count_threshold=0 \
-    --vocab_path="data/vocab.txt" \
-    --manifest_paths "data/manifest.train.raw" "data/manifest.dev.raw"
-
-    if [ $? -ne 0 ]; then
-        echo "Build vocabulary failed. Terminated."
-        exit 1
-    fi
-fi
-
-
-if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     # compute mean and stddev for normalizer
     num_workers=$(nproc)
     python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
@@ -62,6 +46,20 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     fi
 fi
 
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    # build the character-level vocabulary
+    # from the raw train and dev manifests
+    python3 ${MAIN_ROOT}/utils/build_vocab.py \
+    --unit_type="char" \
+    --count_threshold=0 \
+    --vocab_path="data/vocab.txt" \
+    --manifest_paths "data/manifest.train.raw" "data/manifest.dev.raw"
+
+    if [ $? -ne 0 ]; then
+        echo "Build vocabulary failed. Terminated."
+        exit 1
+    fi
+fi
 
 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
     # format manifest with tokenids, vocab size
diff --git a/examples/aishell/s1/README.md b/examples/aishell/s1/README.md
index 07cc569ed..0096c73e3 100644
--- a/examples/aishell/s1/README.md
+++ b/examples/aishell/s1/README.md
@@ -11,6 +11,7 @@
 
 
 ## Chunk Conformer
+Set `decoding.decoding_chunk_size=16` when decoding.
 
 | Model | Params | Config | Augmentation| Test set | Decode method | Chunk Size & Left Chunks | Loss | WER |  
 | --- | --- | --- | --- | --- | --- | --- | --- | --- |  
@@ -18,10 +19,3 @@
 | conformer | 47.06M | conf/chunk_conformer.yaml | spec_aug + shift | test | ctc_greedy_search | 16, -1 | - | 0.070806 |  
 | conformer | 47.06M | conf/chunk_conformer.yaml | spec_aug + shift | test | ctc_prefix_beam_search | 16, -1 | - | 0.070739 |  
 | conformer | 47.06M | conf/chunk_conformer.yaml | spec_aug + shift | test | attention_rescoring | 16, -1 |  - | 0.059400 |  
-
-
-## Transformer
-
-| Model | Params | Config | Augmentation| Test set | Decode method | Loss | WER |  
-| --- | --- | --- | --- | --- | --- | --- | ---|  
-| transformer | - | conf/transformer.yaml | spec_aug + shift | test | attention | - | - |  
diff --git a/examples/aishell/s1/conf/augmentation.json b/examples/aishell/s1/conf/augmentation.json
index d0409b142..31c481c8d 100644
--- a/examples/aishell/s1/conf/augmentation.json
+++ b/examples/aishell/s1/conf/augmentation.json
@@ -19,17 +19,17 @@
   {
     "type": "specaug",
     "params": {
+      "W": 0,
+      "warp_mode": "PIL",
       "F": 10,
-      "T": 50,
       "n_freq_masks": 2,
+      "T": 50,
       "n_time_masks": 2,
       "p": 1.0,
-      "W": 80,
       "adaptive_number_ratio": 0,
       "adaptive_size_ratio": 0,
       "max_n_time_masks": 20,
-      "replace_with_zero": true,
-      "warp_mode": "PIL"
+      "replace_with_zero": true
     },
     "prob": 1.0
   }
diff --git a/examples/aishell/s1/local/data.sh b/examples/aishell/s1/local/data.sh
index c05c3ea25..2b9f69ae4 100755
--- a/examples/aishell/s1/local/data.sh
+++ b/examples/aishell/s1/local/data.sh
@@ -26,22 +26,6 @@ if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
 fi
 
 if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
-    # download data, generate manifests
-    # build vocabulary
-    python3 ${MAIN_ROOT}/utils/build_vocab.py \
-    --unit_type="char" \
-    --count_threshold=0 \
-    --vocab_path="data/vocab.txt" \
-    --manifest_paths "data/manifest.train.raw"
-
-    if [ $? -ne 0 ]; then
-        echo "Build vocabulary failed. Terminated."
-        exit 1
-    fi
-fi
-
-
-if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     # compute mean and stddev for normalizer
     num_workers=$(nproc)
     python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
@@ -63,6 +47,20 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     fi
 fi
 
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    # build the character-level vocabulary
+    # from the raw training manifest
+    python3 ${MAIN_ROOT}/utils/build_vocab.py \
+    --unit_type="char" \
+    --count_threshold=0 \
+    --vocab_path="data/vocab.txt" \
+    --manifest_paths "data/manifest.train.raw"
+
+    if [ $? -ne 0 ]; then
+        echo "Build vocabulary failed. Terminated."
+        exit 1
+    fi
+fi
 
 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
     # format manifest with tokenids, vocab size
diff --git a/examples/aishell/s1/path.sh b/examples/aishell/s1/path.sh
index dd3ccd8e0..0b9b0f8fc 100644
--- a/examples/aishell/s1/path.sh
+++ b/examples/aishell/s1/path.sh
@@ -25,5 +25,5 @@ export PATH=${PATH}:${SRILM}/bin:${SRILM}/bin/i686-m64
 export KALDI_ROOT=${MAIN_ROOT}/tools/kaldi
 [ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh
 export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH
-[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!"
-. $KALDI_ROOT/tools/config/common_path.sh || true
+[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present, can not using Kaldi!"
+[ -f $KALDI_ROOT/tools/config/common_path.sh ] && . $KALDI_ROOT/tools/config/common_path.sh
diff --git a/examples/callcenter/s1/local/data.sh b/examples/callcenter/s1/local/data.sh
index b2a495b45..634bb8d0e 100755
--- a/examples/callcenter/s1/local/data.sh
+++ b/examples/callcenter/s1/local/data.sh
@@ -14,22 +14,6 @@ if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
 fi
 
 if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
-    # download data, generate manifests
-    # build vocabulary
-    python3 ${MAIN_ROOT}/utils/build_vocab.py \
-    --unit_type="char" \
-    --count_threshold=0 \
-    --vocab_path="data/vocab.txt" \
-    --manifest_paths "data/manifest.train.raw"
-
-    if [ $? -ne 0 ]; then
-        echo "Build vocabulary failed. Terminated."
-        exit 1
-    fi
-fi
-
-
-if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     # compute mean and stddev for normalizer
     num_workers=$(nproc)
     python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
@@ -51,6 +35,20 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     fi
 fi
 
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    # build the character-level vocabulary
+    # from the raw training manifest
+    python3 ${MAIN_ROOT}/utils/build_vocab.py \
+    --unit_type="char" \
+    --count_threshold=0 \
+    --vocab_path="data/vocab.txt" \
+    --manifest_paths "data/manifest.train.raw"
+
+    if [ $? -ne 0 ]; then
+        echo "Build vocabulary failed. Terminated."
+        exit 1
+    fi
+fi
 
 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
     # format manifest with tokenids, vocab size
diff --git a/examples/librispeech/README.md b/examples/librispeech/README.md
index 2718988f8..57f506a49 100644
--- a/examples/librispeech/README.md
+++ b/examples/librispeech/README.md
@@ -4,3 +4,10 @@
 * s1 is for transformer/conformer/U2
 * s2 is for transformer/conformer/U2 w/ kaldi feat
 need install Kaldi
+
+## Data
+| Data Subset | Duration in Seconds |
+| --- | --- |
+| data/manifest.train |  0.83s ~ 29.735s |
+| data/manifest.dev | 1.065s ~ 35.155s |  
+| data/manifest.test-clean | 1.285s ~ 34.955s |
diff --git a/examples/librispeech/s0/local/data.sh b/examples/librispeech/s0/local/data.sh
index e3f7b325c..fd2b0c013 100755
--- a/examples/librispeech/s0/local/data.sh
+++ b/examples/librispeech/s0/local/data.sh
@@ -42,21 +42,6 @@ if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
 fi
 
 if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
-    # build vocabulary
-    python3 ${MAIN_ROOT}/utils/build_vocab.py \
-    --unit_type ${unit_type} \
-    --count_threshold=0 \
-    --vocab_path="data/vocab.txt" \
-    --manifest_paths="data/manifest.train.raw"
-
-    if [ $? -ne 0 ]; then
-        echo "Build vocabulary failed. Terminated."
-        exit 1
-    fi
-fi
-
-
-if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     # compute mean and stddev for normalizer
     num_workers=$(nproc)
     python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
@@ -77,6 +62,19 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     fi
 fi
 
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    # build vocabulary
+    python3 ${MAIN_ROOT}/utils/build_vocab.py \
+    --unit_type ${unit_type} \
+    --count_threshold=0 \
+    --vocab_path="data/vocab.txt" \
+    --manifest_paths="data/manifest.train.raw"
+
+    if [ $? -ne 0 ]; then
+        echo "Build vocabulary failed. Terminated."
+        exit 1
+    fi
+fi
 
 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
     # format manifest with tokenids, vocab size
diff --git a/examples/librispeech/s1/README.md b/examples/librispeech/s1/README.md
index 506caa86e..a0b99e752 100644
--- a/examples/librispeech/s1/README.md
+++ b/examples/librispeech/s1/README.md
@@ -1,13 +1,5 @@
 # LibriSpeech
 
-## Data
-| Data Subset | Duration in Seconds |
-| --- | --- |
-| data/manifest.train |  0.83s ~ 29.735s |
-| data/manifest.dev | 1.065 ~ 35.155s |  
-| data/manifest.test-clean | 1.285s ~ 34.955s |
-
-
 ## Conformer
 | Model | Params | Config | Augmentation| Test set | Decode method | Loss | WER |  
 | --- | --- | --- | --- | --- | --- | --- | --- |
@@ -21,6 +13,7 @@
 | --- | --- | --- | --- | --- | --- | --- | --- |
 | conformer | 47.63 M | conf/conformer.yaml | spec_aug + shift | test-clean-all | attention | 6.35 | 0.057117 |  
 
+
 ## Chunk Conformer
 | Model | Params | Config | Augmentation| Test set | Decode method | Chunk Size & Left Chunks | Loss | WER |  
 | --- | --- | --- | --- | --- | --- | --- | --- | --- |  
diff --git a/examples/librispeech/s1/local/data.sh b/examples/librispeech/s1/local/data.sh
index 2b6af2295..56fec8463 100755
--- a/examples/librispeech/s1/local/data.sh
+++ b/examples/librispeech/s1/local/data.sh
@@ -46,23 +46,6 @@ if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
 fi
 
 if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
-    # build vocabulary
-    python3 ${MAIN_ROOT}/utils/build_vocab.py \
-    --unit_type "spm" \
-    --spm_vocab_size=${nbpe} \
-    --spm_mode ${bpemode} \
-    --spm_model_prefix ${bpeprefix} \
-    --vocab_path="data/vocab.txt" \
-    --manifest_paths="data/manifest.train.raw"
-
-    if [ $? -ne 0 ]; then
-        echo "Build vocabulary failed. Terminated."
-        exit 1
-    fi
-fi
-
-
-if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     # compute mean and stddev for normalizer
     num_workers=$(nproc)
     python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
@@ -84,6 +67,21 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     fi
 fi
 
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    # build vocabulary
+    python3 ${MAIN_ROOT}/utils/build_vocab.py \
+    --unit_type "spm" \
+    --spm_vocab_size=${nbpe} \
+    --spm_mode ${bpemode} \
+    --spm_model_prefix ${bpeprefix} \
+    --vocab_path="data/vocab.txt" \
+    --manifest_paths="data/manifest.train.raw"
+
+    if [ $? -ne 0 ]; then
+        echo "Build vocabulary failed. Terminated."
+        exit 1
+    fi
+fi
 
 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
     # format manifest with tokenids, vocab size
diff --git a/examples/librispeech/s2/local/data.sh b/examples/librispeech/s2/local/data.sh
index 2b6af2295..56fec8463 100755
--- a/examples/librispeech/s2/local/data.sh
+++ b/examples/librispeech/s2/local/data.sh
@@ -46,23 +46,6 @@ if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
 fi
 
 if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
-    # build vocabulary
-    python3 ${MAIN_ROOT}/utils/build_vocab.py \
-    --unit_type "spm" \
-    --spm_vocab_size=${nbpe} \
-    --spm_mode ${bpemode} \
-    --spm_model_prefix ${bpeprefix} \
-    --vocab_path="data/vocab.txt" \
-    --manifest_paths="data/manifest.train.raw"
-
-    if [ $? -ne 0 ]; then
-        echo "Build vocabulary failed. Terminated."
-        exit 1
-    fi
-fi
-
-
-if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     # compute mean and stddev for normalizer
     num_workers=$(nproc)
     python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
@@ -84,6 +67,21 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     fi
 fi
 
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    # build vocabulary
+    python3 ${MAIN_ROOT}/utils/build_vocab.py \
+    --unit_type "spm" \
+    --spm_vocab_size=${nbpe} \
+    --spm_mode ${bpemode} \
+    --spm_model_prefix ${bpeprefix} \
+    --vocab_path="data/vocab.txt" \
+    --manifest_paths="data/manifest.train.raw"
+
+    if [ $? -ne 0 ]; then
+        echo "Build vocabulary failed. Terminated."
+        exit 1
+    fi
+fi
 
 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
     # format manifest with tokenids, vocab size
diff --git a/examples/ted_en_zh/t0/local/data.sh b/examples/ted_en_zh/t0/local/data.sh
index 3aae24fdb..b080a5b49 100755
--- a/examples/ted_en_zh/t0/local/data.sh
+++ b/examples/ted_en_zh/t0/local/data.sh
@@ -44,27 +44,7 @@ if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
     echo "Complete raw data pre-process."
 fi
 
-
 if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
-    # build vocabulary
-    python3 ${MAIN_ROOT}/utils/build_vocab.py \
-    --unit_type "spm" \
-    --spm_vocab_size=${nbpe} \
-    --spm_mode ${bpemode} \
-    --spm_model_prefix ${bpeprefix} \
-    --vocab_path="data/vocab.txt" \
-    --text_keys 'text' 'text1' \
-    --manifest_paths="data/manifest.train.raw"
-
-
-    if [ $? -ne 0 ]; then
-        echo "Build vocabulary failed. Terminated."
-        exit 1
-    fi
-fi
-
-
-if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     # compute mean and stddev for normalizer
     num_workers=$(nproc)
     python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
@@ -86,6 +66,23 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     fi
 fi
 
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    # build vocabulary
+    python3 ${MAIN_ROOT}/utils/build_vocab.py \
+    --unit_type "spm" \
+    --spm_vocab_size=${nbpe} \
+    --spm_mode ${bpemode} \
+    --spm_model_prefix ${bpeprefix} \
+    --vocab_path="data/vocab.txt" \
+    --text_keys 'text' 'text1' \
+    --manifest_paths="data/manifest.train.raw"
+
+
+    if [ $? -ne 0 ]; then
+        echo "Build vocabulary failed. Terminated."
+        exit 1
+    fi
+fi
 
 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
     # format manifest with tokenids, vocab size
diff --git a/examples/timit/s1/local/data.sh b/examples/timit/s1/local/data.sh
index f4be90482..ad4ddde3f 100755
--- a/examples/timit/s1/local/data.sh
+++ b/examples/timit/s1/local/data.sh
@@ -24,22 +24,8 @@ if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
     fi
 fi
 
-if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
-    # build vocabulary
-    python3 ${MAIN_ROOT}/utils/build_vocab.py \
-    --unit_type ${unit_type} \
-    --count_threshold=0 \
-    --vocab_path="data/vocab.txt" \
-    --manifest_paths="data/manifest.train.raw"
 
-    if [ $? -ne 0 ]; then
-        echo "Build vocabulary failed. Terminated."
-        exit 1
-    fi
-fi
-
-
-if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
     # compute mean and stddev for normalizer
     num_workers=$(nproc)
     python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
@@ -61,6 +47,19 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     fi
 fi
 
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    # build vocabulary
+    python3 ${MAIN_ROOT}/utils/build_vocab.py \
+    --unit_type ${unit_type} \
+    --count_threshold=0 \
+    --vocab_path="data/vocab.txt" \
+    --manifest_paths="data/manifest.train.raw"
+
+    if [ $? -ne 0 ]; then
+        echo "Build vocabulary failed. Terminated."
+        exit 1
+    fi
+fi
 
 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
     # format manifest with tokenids, vocab size
diff --git a/examples/tiny/s0/local/data.sh b/examples/tiny/s0/local/data.sh
index fabf2e404..711ebee40 100755
--- a/examples/tiny/s0/local/data.sh
+++ b/examples/tiny/s0/local/data.sh
@@ -27,21 +27,6 @@ if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
 fi
 
 if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
-    # build vocabulary
-    python3 ${MAIN_ROOT}/utils/build_vocab.py \
-    --unit_type ${unit_type} \
-    --count_threshold=0 \
-    --vocab_path="data/vocab.txt" \
-    --manifest_paths="data/manifest.tiny.raw"
-    
-    if [ $? -ne 0 ]; then
-        echo "Build vocabulary failed. Terminated."
-        exit 1
-    fi
-fi
-
-
-if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     # compute mean and stddev for normalizer
     python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
     --manifest_path="data/manifest.tiny.raw" \
@@ -61,6 +46,19 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     fi
 fi
 
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    # build vocabulary
+    python3 ${MAIN_ROOT}/utils/build_vocab.py \
+    --unit_type ${unit_type} \
+    --count_threshold=0 \
+    --vocab_path="data/vocab.txt" \
+    --manifest_paths="data/manifest.tiny.raw"
+    
+    if [ $? -ne 0 ]; then
+        echo "Build vocabulary failed. Terminated."
+        exit 1
+    fi
+fi
 
 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
     # format manifest with tokenids, vocab size
diff --git a/examples/tiny/s1/local/data.sh b/examples/tiny/s1/local/data.sh
index b5dbd5812..b25f993f6 100755
--- a/examples/tiny/s1/local/data.sh
+++ b/examples/tiny/s1/local/data.sh
@@ -30,23 +30,6 @@ if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
 fi
 
 if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
-    # build vocabulary
-    python3 ${MAIN_ROOT}/utils/build_vocab.py \
-    --unit_type "spm" \
-    --spm_vocab_size=${nbpe} \
-    --spm_mode ${bpemode} \
-    --spm_model_prefix ${bpeprefix} \
-    --vocab_path="data/vocab.txt" \
-    --manifest_paths="data/manifest.tiny.raw"
-    
-    if [ $? -ne 0 ]; then
-        echo "Build vocabulary failed. Terminated."
-        exit 1
-    fi
-fi
-
-
-if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     # compute mean and stddev for normalizer
     python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
     --manifest_path="data/manifest.tiny.raw" \
@@ -67,6 +50,21 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     fi
 fi
 
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    # build vocabulary
+    python3 ${MAIN_ROOT}/utils/build_vocab.py \
+    --unit_type "spm" \
+    --spm_vocab_size=${nbpe} \
+    --spm_mode ${bpemode} \
+    --spm_model_prefix ${bpeprefix} \
+    --vocab_path="data/vocab.txt" \
+    --manifest_paths="data/manifest.tiny.raw"
+    
+    if [ $? -ne 0 ]; then
+        echo "Build vocabulary failed. Terminated."
+        exit 1
+    fi
+fi
 
 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
     # format manifest with tokenids, vocab size