diff --git a/deepspeech/frontend/featurizer/text_featurizer.py b/deepspeech/frontend/featurizer/text_featurizer.py index ac129b0f7..34220432b 100644 --- a/deepspeech/frontend/featurizer/text_featurizer.py +++ b/deepspeech/frontend/featurizer/text_featurizer.py @@ -140,7 +140,7 @@ class TextFeaturizer(): Returns: str: text string. """ - tokens = tokens.replace(SPACE, " ") + tokens = [t.replace(SPACE, " ") for t in tokens ] return "".join(tokens) def word_tokenize(self, text): diff --git a/examples/aishell/README.md b/examples/aishell/README.md index c2534b9e3..5e5c5ca90 100644 --- a/examples/aishell/README.md +++ b/examples/aishell/README.md @@ -1,4 +1,11 @@ # ASR -* s0 for deepspeech2 offline -* s1 for u2 +* s0 for deepspeech2 +* s1 for u2/transformer/conformer + +## Data + +| Data Subset | Duration in Seconds | +| data/manifest.train | 1.23 ~ 14.53125 | +| data/manifest.dev | 1.645 ~ 12.533 | +| data/manifest.test | 1.859125 ~ 14.6999375 | diff --git a/examples/aishell/s0/README.md b/examples/aishell/s0/README.md index ee0f1405e..a4617c3b4 100644 --- a/examples/aishell/s0/README.md +++ b/examples/aishell/s0/README.md @@ -1,11 +1,5 @@ # Aishell-1 -## Data -| Data Subset | Duration in Seconds | -| data/manifest.train | 1.23 ~ 14.53125 | -| data/manifest.dev | 1.645 ~ 12.533 | -| data/manifest.test | 1.859125 ~ 14.6999375 | - ## Deepspeech2 | Model | Params | Release | Config | Test set | Loss | CER | diff --git a/examples/aishell/s0/local/data.sh b/examples/aishell/s0/local/data.sh index 1312a12fc..f4fccbe6e 100755 --- a/examples/aishell/s0/local/data.sh +++ b/examples/aishell/s0/local/data.sh @@ -26,22 +26,6 @@ if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then fi if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then - # download data, generate manifests - # build vocabulary - python3 ${MAIN_ROOT}/utils/build_vocab.py \ - --unit_type="char" \ - --count_threshold=0 \ - --vocab_path="data/vocab.txt" \ - --manifest_paths "data/manifest.train.raw" "data/manifest.dev.raw" - - if [ $? -ne 0 ]; then - echo "Build vocabulary failed. Terminated." - exit 1 - fi -fi - - -if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then # compute mean and stddev for normalizer num_workers=$(nproc) python3 ${MAIN_ROOT}/utils/compute_mean_std.py \ @@ -62,6 +46,20 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then fi fi +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + # download data, generate manifests + # build vocabulary + python3 ${MAIN_ROOT}/utils/build_vocab.py \ + --unit_type="char" \ + --count_threshold=0 \ + --vocab_path="data/vocab.txt" \ + --manifest_paths "data/manifest.train.raw" "data/manifest.dev.raw" + + if [ $? -ne 0 ]; then + echo "Build vocabulary failed. Terminated." + exit 1 + fi +fi if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then # format manifest with tokenids, vocab size diff --git a/examples/aishell/s1/README.md b/examples/aishell/s1/README.md index 07cc569ed..0096c73e3 100644 --- a/examples/aishell/s1/README.md +++ b/examples/aishell/s1/README.md @@ -11,6 +11,7 @@ ## Chunk Conformer +Need set `decoding.decoding_chunk_size=16` when decoding. | Model | Params | Config | Augmentation| Test set | Decode method | Chunk Size & Left Chunks | Loss | WER | | --- | --- | --- | --- | --- | --- | --- | --- | --- | @@ -18,10 +19,3 @@ | conformer | 47.06M | conf/chunk_conformer.yaml | spec_aug + shift | test | ctc_greedy_search | 16, -1 | - | 0.070806 | | conformer | 47.06M | conf/chunk_conformer.yaml | spec_aug + shift | test | ctc_prefix_beam_search | 16, -1 | - | 0.070739 | | conformer | 47.06M | conf/chunk_conformer.yaml | spec_aug + shift | test | attention_rescoring | 16, -1 | - | 0.059400 | - - -## Transformer - -| Model | Params | Config | Augmentation| Test set | Decode method | Loss | WER | -| --- | --- | --- | --- | --- | --- | --- | ---| -| transformer | - | conf/transformer.yaml | spec_aug + shift | test | attention | - | - | diff --git a/examples/aishell/s1/conf/augmentation.json b/examples/aishell/s1/conf/augmentation.json index d0409b142..31c481c8d 100644 --- a/examples/aishell/s1/conf/augmentation.json +++ b/examples/aishell/s1/conf/augmentation.json @@ -19,17 +19,17 @@ { "type": "specaug", "params": { + "W": 0, + "warp_mode": "PIL", "F": 10, - "T": 50, "n_freq_masks": 2, + "T": 50, "n_time_masks": 2, "p": 1.0, - "W": 80, "adaptive_number_ratio": 0, "adaptive_size_ratio": 0, "max_n_time_masks": 20, - "replace_with_zero": true, - "warp_mode": "PIL" + "replace_with_zero": true }, "prob": 1.0 } diff --git a/examples/aishell/s1/local/data.sh b/examples/aishell/s1/local/data.sh index c05c3ea25..2b9f69ae4 100755 --- a/examples/aishell/s1/local/data.sh +++ b/examples/aishell/s1/local/data.sh @@ -26,22 +26,6 @@ if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then fi if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then - # download data, generate manifests - # build vocabulary - python3 ${MAIN_ROOT}/utils/build_vocab.py \ - --unit_type="char" \ - --count_threshold=0 \ - --vocab_path="data/vocab.txt" \ - --manifest_paths "data/manifest.train.raw" - - if [ $? -ne 0 ]; then - echo "Build vocabulary failed. Terminated." - exit 1 - fi -fi - - -if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then # compute mean and stddev for normalizer num_workers=$(nproc) python3 ${MAIN_ROOT}/utils/compute_mean_std.py \ @@ -63,6 +47,20 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then fi fi +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + # download data, generate manifests + # build vocabulary + python3 ${MAIN_ROOT}/utils/build_vocab.py \ + --unit_type="char" \ + --count_threshold=0 \ + --vocab_path="data/vocab.txt" \ + --manifest_paths "data/manifest.train.raw" + + if [ $? -ne 0 ]; then + echo "Build vocabulary failed. Terminated." + exit 1 + fi +fi if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then # format manifest with tokenids, vocab size diff --git a/examples/aishell/s1/path.sh b/examples/aishell/s1/path.sh index dd3ccd8e0..0b9b0f8fc 100644 --- a/examples/aishell/s1/path.sh +++ b/examples/aishell/s1/path.sh @@ -25,5 +25,5 @@ export PATH=${PATH}:${SRILM}/bin:${SRILM}/bin/i686-m64 export KALDI_ROOT=${MAIN_ROOT}/tools/kaldi [ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH -[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" -. $KALDI_ROOT/tools/config/common_path.sh || true +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present, can not using Kaldi!" +[ -f $KALDI_ROOT/tools/config/common_path.sh ] && . $KALDI_ROOT/tools/config/common_path.sh diff --git a/examples/callcenter/s1/local/data.sh b/examples/callcenter/s1/local/data.sh index b2a495b45..634bb8d0e 100755 --- a/examples/callcenter/s1/local/data.sh +++ b/examples/callcenter/s1/local/data.sh @@ -14,22 +14,6 @@ if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then fi if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then - # download data, generate manifests - # build vocabulary - python3 ${MAIN_ROOT}/utils/build_vocab.py \ - --unit_type="char" \ - --count_threshold=0 \ - --vocab_path="data/vocab.txt" \ - --manifest_paths "data/manifest.train.raw" - - if [ $? -ne 0 ]; then - echo "Build vocabulary failed. Terminated." - exit 1 - fi -fi - - -if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then # compute mean and stddev for normalizer num_workers=$(nproc) python3 ${MAIN_ROOT}/utils/compute_mean_std.py \ @@ -51,6 +35,20 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then fi fi +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + # download data, generate manifests + # build vocabulary + python3 ${MAIN_ROOT}/utils/build_vocab.py \ + --unit_type="char" \ + --count_threshold=0 \ + --vocab_path="data/vocab.txt" \ + --manifest_paths "data/manifest.train.raw" + + if [ $? -ne 0 ]; then + echo "Build vocabulary failed. Terminated." + exit 1 + fi +fi if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then # format manifest with tokenids, vocab size diff --git a/examples/librispeech/README.md b/examples/librispeech/README.md index 2718988f8..57f506a49 100644 --- a/examples/librispeech/README.md +++ b/examples/librispeech/README.md @@ -4,3 +4,10 @@ * s1 is for transformer/conformer/U2 * s2 is for transformer/conformer/U2 w/ kaldi feat need install Kaldi + +## Data +| Data Subset | Duration in Seconds | +| --- | --- | +| data/manifest.train | 0.83s ~ 29.735s | +| data/manifest.dev | 1.065 ~ 35.155s | +| data/manifest.test-clean | 1.285s ~ 34.955s | diff --git a/examples/librispeech/s0/local/data.sh b/examples/librispeech/s0/local/data.sh index e3f7b325c..fd2b0c013 100755 --- a/examples/librispeech/s0/local/data.sh +++ b/examples/librispeech/s0/local/data.sh @@ -42,21 +42,6 @@ if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then fi if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then - # build vocabulary - python3 ${MAIN_ROOT}/utils/build_vocab.py \ - --unit_type ${unit_type} \ - --count_threshold=0 \ - --vocab_path="data/vocab.txt" \ - --manifest_paths="data/manifest.train.raw" - - if [ $? -ne 0 ]; then - echo "Build vocabulary failed. Terminated." - exit 1 - fi -fi - - -if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then # compute mean and stddev for normalizer num_workers=$(nproc) python3 ${MAIN_ROOT}/utils/compute_mean_std.py \ @@ -77,6 +62,19 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then fi fi +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + # build vocabulary + python3 ${MAIN_ROOT}/utils/build_vocab.py \ + --unit_type ${unit_type} \ + --count_threshold=0 \ + --vocab_path="data/vocab.txt" \ + --manifest_paths="data/manifest.train.raw" + + if [ $? -ne 0 ]; then + echo "Build vocabulary failed. Terminated." + exit 1 + fi +fi if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then # format manifest with tokenids, vocab size diff --git a/examples/librispeech/s1/README.md b/examples/librispeech/s1/README.md index 506caa86e..a0b99e752 100644 --- a/examples/librispeech/s1/README.md +++ b/examples/librispeech/s1/README.md @@ -1,13 +1,5 @@ # LibriSpeech -## Data -| Data Subset | Duration in Seconds | -| --- | --- | -| data/manifest.train | 0.83s ~ 29.735s | -| data/manifest.dev | 1.065 ~ 35.155s | -| data/manifest.test-clean | 1.285s ~ 34.955s | - - ## Conformer | Model | Params | Config | Augmentation| Test set | Decode method | Loss | WER | | --- | --- | --- | --- | --- | --- | --- | --- | @@ -21,6 +13,7 @@ | --- | --- | --- | --- | --- | --- | --- | --- | | conformer | 47.63 M | conf/conformer.yaml | spec_aug + shift | test-clean-all | attention | 6.35 | 0.057117 | + ## Chunk Conformer | Model | Params | Config | Augmentation| Test set | Decode method | Chunk Size & Left Chunks | Loss | WER | | --- | --- | --- | --- | --- | --- | --- | --- | --- | diff --git a/examples/librispeech/s1/local/data.sh b/examples/librispeech/s1/local/data.sh index 2b6af2295..56fec8463 100755 --- a/examples/librispeech/s1/local/data.sh +++ b/examples/librispeech/s1/local/data.sh @@ -46,23 +46,6 @@ if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then fi if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then - # build vocabulary - python3 ${MAIN_ROOT}/utils/build_vocab.py \ - --unit_type "spm" \ - --spm_vocab_size=${nbpe} \ - --spm_mode ${bpemode} \ - --spm_model_prefix ${bpeprefix} \ - --vocab_path="data/vocab.txt" \ - --manifest_paths="data/manifest.train.raw" - - if [ $? -ne 0 ]; then - echo "Build vocabulary failed. Terminated." - exit 1 - fi -fi - - -if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then # compute mean and stddev for normalizer num_workers=$(nproc) python3 ${MAIN_ROOT}/utils/compute_mean_std.py \ @@ -84,6 +67,21 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then fi fi +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + # build vocabulary + python3 ${MAIN_ROOT}/utils/build_vocab.py \ + --unit_type "spm" \ + --spm_vocab_size=${nbpe} \ + --spm_mode ${bpemode} \ + --spm_model_prefix ${bpeprefix} \ + --vocab_path="data/vocab.txt" \ + --manifest_paths="data/manifest.train.raw" + + if [ $? -ne 0 ]; then + echo "Build vocabulary failed. Terminated." + exit 1 + fi +fi if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then # format manifest with tokenids, vocab size diff --git a/examples/librispeech/s2/local/data.sh b/examples/librispeech/s2/local/data.sh index 2b6af2295..56fec8463 100755 --- a/examples/librispeech/s2/local/data.sh +++ b/examples/librispeech/s2/local/data.sh @@ -46,23 +46,6 @@ if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then fi if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then - # build vocabulary - python3 ${MAIN_ROOT}/utils/build_vocab.py \ - --unit_type "spm" \ - --spm_vocab_size=${nbpe} \ - --spm_mode ${bpemode} \ - --spm_model_prefix ${bpeprefix} \ - --vocab_path="data/vocab.txt" \ - --manifest_paths="data/manifest.train.raw" - - if [ $? -ne 0 ]; then - echo "Build vocabulary failed. Terminated." - exit 1 - fi -fi - - -if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then # compute mean and stddev for normalizer num_workers=$(nproc) python3 ${MAIN_ROOT}/utils/compute_mean_std.py \ @@ -84,6 +67,21 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then fi fi +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + # build vocabulary + python3 ${MAIN_ROOT}/utils/build_vocab.py \ + --unit_type "spm" \ + --spm_vocab_size=${nbpe} \ + --spm_mode ${bpemode} \ + --spm_model_prefix ${bpeprefix} \ + --vocab_path="data/vocab.txt" \ + --manifest_paths="data/manifest.train.raw" + + if [ $? -ne 0 ]; then + echo "Build vocabulary failed. Terminated." + exit 1 + fi +fi if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then # format manifest with tokenids, vocab size diff --git a/examples/ted_en_zh/t0/local/data.sh b/examples/ted_en_zh/t0/local/data.sh index 3aae24fdb..b080a5b49 100755 --- a/examples/ted_en_zh/t0/local/data.sh +++ b/examples/ted_en_zh/t0/local/data.sh @@ -44,27 +44,7 @@ if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then echo "Complete raw data pre-process." fi - if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then - # build vocabulary - python3 ${MAIN_ROOT}/utils/build_vocab.py \ - --unit_type "spm" \ - --spm_vocab_size=${nbpe} \ - --spm_mode ${bpemode} \ - --spm_model_prefix ${bpeprefix} \ - --vocab_path="data/vocab.txt" \ - --text_keys 'text' 'text1' \ - --manifest_paths="data/manifest.train.raw" - - - if [ $? -ne 0 ]; then - echo "Build vocabulary failed. Terminated." - exit 1 - fi -fi - - -if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then # compute mean and stddev for normalizer num_workers=$(nproc) python3 ${MAIN_ROOT}/utils/compute_mean_std.py \ @@ -86,6 +66,23 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then fi fi +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + # build vocabulary + python3 ${MAIN_ROOT}/utils/build_vocab.py \ + --unit_type "spm" \ + --spm_vocab_size=${nbpe} \ + --spm_mode ${bpemode} \ + --spm_model_prefix ${bpeprefix} \ + --vocab_path="data/vocab.txt" \ + --text_keys 'text' 'text1' \ + --manifest_paths="data/manifest.train.raw" + + + if [ $? -ne 0 ]; then + echo "Build vocabulary failed. Terminated." + exit 1 + fi +fi if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then # format manifest with tokenids, vocab size diff --git a/examples/timit/s1/local/data.sh b/examples/timit/s1/local/data.sh index f4be90482..ad4ddde3f 100755 --- a/examples/timit/s1/local/data.sh +++ b/examples/timit/s1/local/data.sh @@ -24,22 +24,8 @@ if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then fi fi -if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then - # build vocabulary - python3 ${MAIN_ROOT}/utils/build_vocab.py \ - --unit_type ${unit_type} \ - --count_threshold=0 \ - --vocab_path="data/vocab.txt" \ - --manifest_paths="data/manifest.train.raw" - if [ $? -ne 0 ]; then - echo "Build vocabulary failed. Terminated." - exit 1 - fi -fi - - -if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then # compute mean and stddev for normalizer num_workers=$(nproc) python3 ${MAIN_ROOT}/utils/compute_mean_std.py \ @@ -61,6 +47,19 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then fi fi +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + # build vocabulary + python3 ${MAIN_ROOT}/utils/build_vocab.py \ + --unit_type ${unit_type} \ + --count_threshold=0 \ + --vocab_path="data/vocab.txt" \ + --manifest_paths="data/manifest.train.raw" + + if [ $? -ne 0 ]; then + echo "Build vocabulary failed. Terminated." + exit 1 + fi +fi if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then # format manifest with tokenids, vocab size diff --git a/examples/tiny/s0/local/data.sh b/examples/tiny/s0/local/data.sh index fabf2e404..711ebee40 100755 --- a/examples/tiny/s0/local/data.sh +++ b/examples/tiny/s0/local/data.sh @@ -27,21 +27,6 @@ if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then fi if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then - # build vocabulary - python3 ${MAIN_ROOT}/utils/build_vocab.py \ - --unit_type ${unit_type} \ - --count_threshold=0 \ - --vocab_path="data/vocab.txt" \ - --manifest_paths="data/manifest.tiny.raw" - - if [ $? -ne 0 ]; then - echo "Build vocabulary failed. Terminated." - exit 1 - fi -fi - - -if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then # compute mean and stddev for normalizer python3 ${MAIN_ROOT}/utils/compute_mean_std.py \ --manifest_path="data/manifest.tiny.raw" \ @@ -61,6 +46,19 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then fi fi +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + # build vocabulary + python3 ${MAIN_ROOT}/utils/build_vocab.py \ + --unit_type ${unit_type} \ + --count_threshold=0 \ + --vocab_path="data/vocab.txt" \ + --manifest_paths="data/manifest.tiny.raw" + + if [ $? -ne 0 ]; then + echo "Build vocabulary failed. Terminated." + exit 1 + fi +fi if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then # format manifest with tokenids, vocab size diff --git a/examples/tiny/s1/local/data.sh b/examples/tiny/s1/local/data.sh index b5dbd5812..b25f993f6 100755 --- a/examples/tiny/s1/local/data.sh +++ b/examples/tiny/s1/local/data.sh @@ -30,23 +30,6 @@ if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then fi if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then - # build vocabulary - python3 ${MAIN_ROOT}/utils/build_vocab.py \ - --unit_type "spm" \ - --spm_vocab_size=${nbpe} \ - --spm_mode ${bpemode} \ - --spm_model_prefix ${bpeprefix} \ - --vocab_path="data/vocab.txt" \ - --manifest_paths="data/manifest.tiny.raw" - - if [ $? -ne 0 ]; then - echo "Build vocabulary failed. Terminated." - exit 1 - fi -fi - - -if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then # compute mean and stddev for normalizer python3 ${MAIN_ROOT}/utils/compute_mean_std.py \ --manifest_path="data/manifest.tiny.raw" \ @@ -67,6 +50,21 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then fi fi +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + # build vocabulary + python3 ${MAIN_ROOT}/utils/build_vocab.py \ + --unit_type "spm" \ + --spm_vocab_size=${nbpe} \ + --spm_mode ${bpemode} \ + --spm_model_prefix ${bpeprefix} \ + --vocab_path="data/vocab.txt" \ + --manifest_paths="data/manifest.tiny.raw" + + if [ $? -ne 0 ]; then + echo "Build vocabulary failed. Terminated." + exit 1 + fi +fi if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then # format manifest with tokenids, vocab size