diff --git a/examples/aishell/asr0/conf/deepspeech2.yaml b/examples/aishell/asr0/conf/deepspeech2.yaml index 8fb4e6e8..bdfa4219 100644 --- a/examples/aishell/asr0/conf/deepspeech2.yaml +++ b/examples/aishell/asr0/conf/deepspeech2.yaml @@ -14,7 +14,7 @@ collator: batch_size: 64 # one gpu mean_std_filepath: data/mean_std.json unit_type: char - vocab_filepath: data/vocab.txt + vocab_filepath: data/lang_char/vocab.txt augmentation_config: conf/augmentation.json random_seed: 0 spm_model_prefix: diff --git a/examples/aishell/asr0/conf/deepspeech2_online.yaml b/examples/aishell/asr0/conf/deepspeech2_online.yaml index 29ec2379..010d8f15 100644 --- a/examples/aishell/asr0/conf/deepspeech2_online.yaml +++ b/examples/aishell/asr0/conf/deepspeech2_online.yaml @@ -14,7 +14,7 @@ collator: batch_size: 64 # one gpu mean_std_filepath: data/mean_std.json unit_type: char - vocab_filepath: data/vocab.txt + vocab_filepath: data/lang_char/vocab.txt augmentation_config: conf/augmentation.json random_seed: 0 spm_model_prefix: diff --git a/examples/aishell/asr0/local/data.sh b/examples/aishell/asr0/local/data.sh index 23f04f2a..1032cedc 100755 --- a/examples/aishell/asr0/local/data.sh +++ b/examples/aishell/asr0/local/data.sh @@ -3,9 +3,12 @@ stage=-1 stop_stage=100 -source ${MAIN_ROOT}/utils/parse_options.sh +dict_dir=data/lang_char + +. ${MAIN_ROOT}/utils/parse_options.sh || exit -1; mkdir -p data +mkdir -p ${dict_dir} TARGET_DIR=${MAIN_ROOT}/examples/dataset mkdir -p ${TARGET_DIR} @@ -52,7 +55,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then python3 ${MAIN_ROOT}/utils/build_vocab.py \ --unit_type="char" \ --count_threshold=0 \ - --vocab_path="data/vocab.txt" \ + --vocab_path="${dict_dir}/vocab.txt" \ --manifest_paths "data/manifest.train.raw" "data/manifest.dev.raw" if [ $? -ne 0 ]; then @@ -68,7 +71,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then python3 ${MAIN_ROOT}/utils/format_data.py \ --cmvn_path "data/mean_std.json" \ --unit_type "char" \ - --vocab_path="data/vocab.txt" \ + --vocab_path="${dict_dir}/vocab.txt" \ --manifest_path="data/manifest.${dataset}.raw" \ --output_path="data/manifest.${dataset}" diff --git a/examples/aishell/asr1/conf/chunk_conformer.yaml b/examples/aishell/asr1/conf/chunk_conformer.yaml index 336a6c46..e07cd07c 100644 --- a/examples/aishell/asr1/conf/chunk_conformer.yaml +++ b/examples/aishell/asr1/conf/chunk_conformer.yaml @@ -12,7 +12,7 @@ data: collator: - vocab_filepath: data/vocab.txt + vocab_filepath: data/lang_char/vocab.txt unit_type: 'char' spm_model_prefix: '' augmentation_config: conf/preprocess.yaml diff --git a/examples/aishell/asr1/conf/conformer.yaml b/examples/aishell/asr1/conf/conformer.yaml index 0e9d79d8..154f44a2 100644 --- a/examples/aishell/asr1/conf/conformer.yaml +++ b/examples/aishell/asr1/conf/conformer.yaml @@ -12,7 +12,7 @@ data: collator: - vocab_filepath: data/vocab.txt + vocab_filepath: data/lang_char/vocab.txt unit_type: 'char' spm_model_prefix: '' augmentation_config: conf/preprocess.yaml diff --git a/examples/aishell/asr1/conf/transformer.yaml b/examples/aishell/asr1/conf/transformer.yaml index c021f66b..d13f9e2f 100644 --- a/examples/aishell/asr1/conf/transformer.yaml +++ b/examples/aishell/asr1/conf/transformer.yaml @@ -12,7 +12,7 @@ data: collator: - vocab_filepath: data/vocab.txt + vocab_filepath: data/lang_char/vocab.txt unit_type: 'char' spm_model_prefix: '' augmentation_config: conf/preprocess.yaml diff --git a/examples/aishell/asr1/local/data.sh b/examples/aishell/asr1/local/data.sh index 76e28075..41843231 100755 --- a/examples/aishell/asr1/local/data.sh +++ b/examples/aishell/asr1/local/data.sh @@ -2,10 +2,12 @@ stage=-1 stop_stage=100 +dict_dir=data/lang_char -source ${MAIN_ROOT}/utils/parse_options.sh +. ${MAIN_ROOT}/utils/parse_options.sh || exit -1; mkdir -p data +mkdir -p ${dict_dir} TARGET_DIR=${MAIN_ROOT}/examples/dataset mkdir -p ${TARGET_DIR} @@ -53,7 +55,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then python3 ${MAIN_ROOT}/utils/build_vocab.py \ --unit_type="char" \ --count_threshold=0 \ - --vocab_path="data/vocab.txt" \ + --vocab_path="${dict_dir}/vocab.txt" \ --manifest_paths "data/manifest.train.raw" if [ $? -ne 0 ]; then @@ -69,7 +71,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then python3 ${MAIN_ROOT}/utils/format_data.py \ --cmvn_path "data/mean_std.json" \ --unit_type "char" \ - --vocab_path="data/vocab.txt" \ + --vocab_path="${dict_dir}/vocab.txt" \ --manifest_path="data/manifest.${dataset}.raw" \ --output_path="data/manifest.${dataset}" diff --git a/examples/callcenter/asr1/conf/chunk_conformer.yaml b/examples/callcenter/asr1/conf/chunk_conformer.yaml index b18b46fe..d20d2b9a 100644 --- a/examples/callcenter/asr1/conf/chunk_conformer.yaml +++ b/examples/callcenter/asr1/conf/chunk_conformer.yaml @@ -12,7 +12,7 @@ data: collator: - vocab_filepath: data/vocab.txt + vocab_filepath: data/lang_char/vocab.txt unit_type: 'char' spm_model_prefix: '' augmentation_config: conf/preprocess.yaml diff --git a/examples/callcenter/asr1/conf/conformer.yaml b/examples/callcenter/asr1/conf/conformer.yaml index 47c438a6..f86cd4a3 100644 --- a/examples/callcenter/asr1/conf/conformer.yaml +++ b/examples/callcenter/asr1/conf/conformer.yaml @@ -12,7 +12,7 @@ data: collator: - vocab_filepath: data/vocab.txt + vocab_filepath: data/lang_char/vocab.txt unit_type: 'char' spm_model_prefix: '' augmentation_config: conf/preprocess.yaml diff --git a/examples/callcenter/asr1/local/data.sh b/examples/callcenter/asr1/local/data.sh index c40c752a..fe2d3429 100755 --- a/examples/callcenter/asr1/local/data.sh +++ b/examples/callcenter/asr1/local/data.sh @@ -2,10 +2,12 @@ stage=-1 stop_stage=100 +dict_dir=data/lang_char source ${MAIN_ROOT}/utils/parse_options.sh mkdir -p data +mkdir -p ${dict_dir} if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then for dataset in train dev test; do @@ -41,7 +43,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then python3 ${MAIN_ROOT}/utils/build_vocab.py \ --unit_type="char" \ --count_threshold=0 \ - --vocab_path="data/vocab.txt" \ + --vocab_path="${dict_dir}/vocab.txt" \ --manifest_paths "data/manifest.train.raw" if [ $? -ne 0 ]; then @@ -57,7 +59,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then python3 ${MAIN_ROOT}/utils/format_data.py \ --cmvn_path "data/mean_std.json" \ --unit_type "char" \ - --vocab_path="data/vocab.txt" \ + --vocab_path="${dict_dir}/vocab.txt" \ --manifest_path="data/manifest.${dataset}.raw" \ --output_path="data/manifest.${dataset}" diff --git a/examples/librispeech/asr0/conf/deepspeech2.yaml b/examples/librispeech/asr0/conf/deepspeech2.yaml index 8afaabf4..70fa3fcb 100644 --- a/examples/librispeech/asr0/conf/deepspeech2.yaml +++ b/examples/librispeech/asr0/conf/deepspeech2.yaml @@ -14,7 +14,7 @@ collator: batch_size: 20 mean_std_filepath: data/mean_std.json unit_type: char - vocab_filepath: data/vocab.txt + vocab_filepath: data/lang_char/vocab.txt augmentation_config: conf/augmentation.json random_seed: 0 spm_model_prefix: diff --git a/examples/librispeech/asr0/conf/deepspeech2_online.yaml b/examples/librispeech/asr0/conf/deepspeech2_online.yaml index d6ab9523..3e07862d 100644 --- a/examples/librispeech/asr0/conf/deepspeech2_online.yaml +++ b/examples/librispeech/asr0/conf/deepspeech2_online.yaml @@ -14,7 +14,7 @@ collator: batch_size: 15 mean_std_filepath: data/mean_std.json unit_type: char - vocab_filepath: data/vocab.txt + vocab_filepath: data/lang_char/vocab.txt augmentation_config: conf/augmentation.json random_seed: 0 spm_model_prefix: diff --git a/examples/librispeech/asr0/local/data.sh b/examples/librispeech/asr0/local/data.sh index 0f276cec..fa2c9b2f 100755 --- a/examples/librispeech/asr0/local/data.sh +++ b/examples/librispeech/asr0/local/data.sh @@ -4,10 +4,12 @@ stage=-1 stop_stage=100 unit_type=char +dict_dir=data/lang_char source ${MAIN_ROOT}/utils/parse_options.sh mkdir -p data +mkdir -p ${dict_dir} TARGET_DIR=${MAIN_ROOT}/examples/dataset mkdir -p ${TARGET_DIR} @@ -67,7 +69,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then python3 ${MAIN_ROOT}/utils/build_vocab.py \ --unit_type ${unit_type} \ --count_threshold=0 \ - --vocab_path="data/vocab.txt" \ + --vocab_path="${dict_dir}/vocab.txt" \ --manifest_paths="data/manifest.train.raw" if [ $? -ne 0 ]; then @@ -83,7 +85,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then python3 ${MAIN_ROOT}/utils/format_data.py \ --cmvn_path "data/mean_std.json" \ --unit_type ${unit_type} \ - --vocab_path="data/vocab.txt" \ + --vocab_path="${dict_dir}/vocab.txt" \ --manifest_path="data/manifest.${set}.raw" \ --output_path="data/manifest.${set}" diff --git a/examples/librispeech/asr1/conf/chunk_conformer.yaml b/examples/librispeech/asr1/conf/chunk_conformer.yaml index 2bfb0fb6..4a574190 100644 --- a/examples/librispeech/asr1/conf/chunk_conformer.yaml +++ b/examples/librispeech/asr1/conf/chunk_conformer.yaml @@ -11,9 +11,9 @@ data: max_output_input_ratio: 100.0 collator: - vocab_filepath: data/vocab.txt + vocab_filepath: data/lang_char/vocab.txt unit_type: 'spm' - spm_model_prefix: 'data/bpe_unigram_5000' + spm_model_prefix: 'data/lang_char/bpe_unigram_5000' mean_std_filepath: "" augmentation_config: conf/preprocess.yaml batch_size: 16 diff --git a/examples/librispeech/asr1/conf/conformer.yaml b/examples/librispeech/asr1/conf/conformer.yaml index c844baaa..684b6297 100644 --- a/examples/librispeech/asr1/conf/conformer.yaml +++ b/examples/librispeech/asr1/conf/conformer.yaml @@ -11,9 +11,9 @@ data: max_output_input_ratio: 100.0 collator: - vocab_filepath: data/vocab.txt + vocab_filepath: data/lang_char/vocab.txt unit_type: 'spm' - spm_model_prefix: 'data/bpe_unigram_5000' + spm_model_prefix: 'data/lang_char/bpe_unigram_5000' mean_std_filepath: "" augmentation_config: conf/preprocess.yaml batch_size: 16 diff --git a/examples/librispeech/asr1/conf/transformer.yaml b/examples/librispeech/asr1/conf/transformer.yaml index 5a158f3e..1806f3fd 100644 --- a/examples/librispeech/asr1/conf/transformer.yaml +++ b/examples/librispeech/asr1/conf/transformer.yaml @@ -11,9 +11,9 @@ data: max_output_input_ratio: 100.0 collator: - vocab_filepath: data/vocab.txt + vocab_filepath: data/lang_char/vocab.txt unit_type: 'spm' - spm_model_prefix: 'data/bpe_unigram_5000' + spm_model_prefix: 'data/lang_char/bpe_unigram_5000' mean_std_filepath: "" augmentation_config: conf/preprocess.yaml batch_size: 32 diff --git a/examples/librispeech/asr1/local/data.sh b/examples/librispeech/asr1/local/data.sh index 35f4e635..a0bf9a2d 100755 --- a/examples/librispeech/asr1/local/data.sh +++ b/examples/librispeech/asr1/local/data.sh @@ -2,11 +2,12 @@ stage=-1 stop_stage=100 +dict_dir=data/lang_char # bpemode (unigram or bpe) nbpe=5000 bpemode=unigram -bpeprefix="data/bpe_${bpemode}_${nbpe}" +bpeprefix="${dict_dir}/bpe_${bpemode}_${nbpe}" stride_ms=10 window_ms=25 @@ -17,6 +18,7 @@ source ${MAIN_ROOT}/utils/parse_options.sh mkdir -p data +mkdir -p ${dict_dir} TARGET_DIR=${MAIN_ROOT}/examples/dataset mkdir -p ${TARGET_DIR} @@ -79,7 +81,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then --spm_vocab_size=${nbpe} \ --spm_mode ${bpemode} \ --spm_model_prefix ${bpeprefix} \ - --vocab_path="data/vocab.txt" \ + --vocab_path="${dict_dir}/vocab.txt" \ --manifest_paths="data/manifest.train.raw" if [ $? -ne 0 ]; then @@ -96,7 +98,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then --cmvn_path "data/mean_std.json" \ --unit_type "spm" \ --spm_model_prefix ${bpeprefix} \ - --vocab_path="data/vocab.txt" \ + --vocab_path="${dict_dir}/vocab.txt" \ --manifest_path="data/manifest.${sub}.raw" \ --output_path="data/manifest.${sub}" diff --git a/examples/ted_en_zh/st0/conf/transformer.yaml b/examples/ted_en_zh/st0/conf/transformer.yaml index 8a7e10f0..5a05fa46 100644 --- a/examples/ted_en_zh/st0/conf/transformer.yaml +++ b/examples/ted_en_zh/st0/conf/transformer.yaml @@ -11,9 +11,9 @@ data: max_output_input_ratio: 20.0 collator: - vocab_filepath: data/vocab.txt + vocab_filepath: data/lang_char/vocab.txt unit_type: 'spm' - spm_model_prefix: data/bpe_unigram_8000 + spm_model_prefix: data/lang_char/bpe_unigram_8000 mean_std_filepath: "" # augmentation_config: conf/augmentation.json batch_size: 10 diff --git a/examples/ted_en_zh/st0/conf/transformer_joint_noam.yaml b/examples/ted_en_zh/st0/conf/transformer_joint_noam.yaml index 9c1ac91a..8256f716 100644 --- a/examples/ted_en_zh/st0/conf/transformer_joint_noam.yaml +++ b/examples/ted_en_zh/st0/conf/transformer_joint_noam.yaml @@ -11,9 +11,9 @@ data: max_output_input_ratio: 20.0 collator: - vocab_filepath: data/vocab.txt + vocab_filepath: data/lang_char/vocab.txt unit_type: 'spm' - spm_model_prefix: data/bpe_unigram_8000 + spm_model_prefix: data/lang_char/bpe_unigram_8000 mean_std_filepath: "" # augmentation_config: conf/augmentation.json batch_size: 10 diff --git a/examples/ted_en_zh/st0/local/data.sh b/examples/ted_en_zh/st0/local/data.sh index d3acbd44..fb4efbe3 100755 --- a/examples/ted_en_zh/st0/local/data.sh +++ b/examples/ted_en_zh/st0/local/data.sh @@ -4,19 +4,22 @@ set -e stage=-1 stop_stage=100 +dict_dir=data/lang_char # bpemode (unigram or bpe) nbpe=8000 bpemode=unigram -bpeprefix="data/bpe_${bpemode}_${nbpe}" +bpeprefix="${dict_dir}/bpe_${bpemode}_${nbpe}" data_dir=./TED-En-Zh -source ${MAIN_ROOT}/utils/parse_options.sh +. ${MAIN_ROOT}/utils/parse_options.sh || exit -1; + TARGET_DIR=${MAIN_ROOT}/examples/dataset mkdir -p ${TARGET_DIR} mkdir -p data +mkdir -p ${dict_dir} if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then @@ -73,11 +76,10 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then --spm_vocab_size=${nbpe} \ --spm_mode ${bpemode} \ --spm_model_prefix ${bpeprefix} \ - --vocab_path="data/vocab.txt" \ + --vocab_path="${dict_dir}/vocab.txt" \ --text_keys 'text' 'text1' \ --manifest_paths="data/manifest.train.raw" - if [ $? -ne 0 ]; then echo "Build vocabulary failed. Terminated." exit 1 @@ -92,7 +94,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then --cmvn_path "data/mean_std.json" \ --unit_type "spm" \ --spm_model_prefix ${bpeprefix} \ - --vocab_path="data/vocab.txt" \ + --vocab_path="${dict_dir}/vocab.txt" \ --manifest_path="data/manifest.${set}.raw" \ --output_path="data/manifest.${set}" diff --git a/examples/ted_en_zh/st1/conf/transformer.yaml b/examples/ted_en_zh/st1/conf/transformer.yaml index d9637286..d553bde7 100644 --- a/examples/ted_en_zh/st1/conf/transformer.yaml +++ b/examples/ted_en_zh/st1/conf/transformer.yaml @@ -11,9 +11,9 @@ data: max_output_input_ratio: 20.0 collator: - vocab_filepath: data/vocab.txt + vocab_filepath: data/lang_char/vocab.txt unit_type: 'spm' - spm_model_prefix: data/bpe_unigram_8000 + spm_model_prefix: data/lang_char/bpe_unigram_8000 mean_std_filepath: "" # augmentation_config: conf/augmentation.json batch_size: 10 diff --git a/examples/ted_en_zh/st1/conf/transformer_mtl_noam.yaml b/examples/ted_en_zh/st1/conf/transformer_mtl_noam.yaml index ea38d6ee..b4fb5107 100644 --- a/examples/ted_en_zh/st1/conf/transformer_mtl_noam.yaml +++ b/examples/ted_en_zh/st1/conf/transformer_mtl_noam.yaml @@ -11,7 +11,7 @@ data: max_output_input_ratio: 20.0 collator: - vocab_filepath: data/vocab.txt + vocab_filepath: data/lang_char/vocab.txt unit_type: 'spm' spm_model_prefix: data/train_sp.en-zh-nlpr.zh-nlpr_bpe8000_tc mean_std_filepath: "" diff --git a/examples/ted_en_zh/st1/local/data.sh b/examples/ted_en_zh/st1/local/data.sh index b080a5b4..2e9d05d1 100755 --- a/examples/ted_en_zh/st1/local/data.sh +++ b/examples/ted_en_zh/st1/local/data.sh @@ -4,11 +4,12 @@ set -e stage=-1 stop_stage=100 +dict_dir=data/lang_char # bpemode (unigram or bpe) nbpe=8000 bpemode=unigram -bpeprefix="data/bpe_${bpemode}_${nbpe}" +bpeprefix="${dict_dir}/bpe_${bpemode}_${nbpe}" data_dir=./TED_EnZh @@ -17,6 +18,7 @@ source ${MAIN_ROOT}/utils/parse_options.sh TARGET_DIR=${MAIN_ROOT}/examples/dataset mkdir -p ${TARGET_DIR} mkdir -p data +mkdir -p ${dict_dir} if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then @@ -73,7 +75,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then --spm_vocab_size=${nbpe} \ --spm_mode ${bpemode} \ --spm_model_prefix ${bpeprefix} \ - --vocab_path="data/vocab.txt" \ + --vocab_path="${dict_dir}/vocab.txt" \ --text_keys 'text' 'text1' \ --manifest_paths="data/manifest.train.raw" @@ -93,7 +95,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then --cmvn_path "data/mean_std.json" \ --unit_type "spm" \ --spm_model_prefix ${bpeprefix} \ - --vocab_path="data/vocab.txt" \ + --vocab_path="${dict_dir}/vocab.txt" \ --manifest_path="data/manifest.${set}.raw" \ --output_path="data/manifest.${set}" diff --git a/examples/timit/asr1/conf/transformer.yaml b/examples/timit/asr1/conf/transformer.yaml index 1d18468b..89ae2fd3 100644 --- a/examples/timit/asr1/conf/transformer.yaml +++ b/examples/timit/asr1/conf/transformer.yaml @@ -11,7 +11,7 @@ data: max_output_input_ratio: 1000.0 collator: - vocab_filepath: data/vocab.txt + vocab_filepath: data/lang_char/vocab.txt unit_type: "word" mean_std_filepath: "" augmentation_config: conf/preprocess.yaml diff --git a/examples/timit/asr1/local/data.sh b/examples/timit/asr1/local/data.sh index e588e48d..fb720932 100755 --- a/examples/timit/asr1/local/data.sh +++ b/examples/timit/asr1/local/data.sh @@ -3,15 +3,19 @@ stage=-1 stop_stage=100 +dict_dir=data/lang_char + unit_type=word TIMIT_path= -source ${MAIN_ROOT}/utils/parse_options.sh +. ${MAIN_ROOT}/utils/parse_options.sh || exit -1; mkdir -p data +mkdir -p ${dict_dir} TARGET_DIR=${MAIN_ROOT}/examples/dataset mkdir -p ${TARGET_DIR} + if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then # download data, generate manifests python3 ${TARGET_DIR}/timit/timit_kaldi_standard_split.py \ @@ -52,7 +56,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then python3 ${MAIN_ROOT}/utils/build_vocab.py \ --unit_type ${unit_type} \ --count_threshold=0 \ - --vocab_path="data/vocab.txt" \ + --vocab_path="${dict_dir}/vocab.txt" \ --manifest_paths="data/manifest.train.raw" if [ $? -ne 0 ]; then @@ -68,7 +72,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then python3 ${MAIN_ROOT}/utils/format_data.py \ --cmvn_path "data/mean_std.json" \ --unit_type ${unit_type} \ - --vocab_path="data/vocab.txt" \ + --vocab_path="${dict_dir}/vocab.txt" \ --manifest_path="data/manifest.${set}.raw" \ --output_path="data/manifest.${set}" diff --git a/examples/tiny/asr0/conf/deepspeech2.yaml b/examples/tiny/asr0/conf/deepspeech2.yaml index 58899a15..ba453aad 100644 --- a/examples/tiny/asr0/conf/deepspeech2.yaml +++ b/examples/tiny/asr0/conf/deepspeech2.yaml @@ -14,7 +14,7 @@ data: collator: mean_std_filepath: data/mean_std.json unit_type: char - vocab_filepath: data/vocab.txt + vocab_filepath: data/lang_char/vocab.txt augmentation_config: conf/augmentation.json random_seed: 0 spm_model_prefix: diff --git a/examples/tiny/asr0/conf/deepspeech2_online.yaml b/examples/tiny/asr0/conf/deepspeech2_online.yaml index 334b1d31..36c774e3 100644 --- a/examples/tiny/asr0/conf/deepspeech2_online.yaml +++ b/examples/tiny/asr0/conf/deepspeech2_online.yaml @@ -14,7 +14,7 @@ data: collator: mean_std_filepath: data/mean_std.json unit_type: char - vocab_filepath: data/vocab.txt + vocab_filepath: data/lang_char/vocab.txt augmentation_config: conf/augmentation.json random_seed: 0 spm_model_prefix: diff --git a/examples/tiny/asr0/local/data.sh b/examples/tiny/asr0/local/data.sh index f1fb8cb1..2a544ef8 100755 --- a/examples/tiny/asr0/local/data.sh +++ b/examples/tiny/asr0/local/data.sh @@ -4,10 +4,12 @@ stage=-1 stop_stage=100 unit_type=char +dict_dir=data/lang_char -source ${MAIN_ROOT}/utils/parse_options.sh +. ${MAIN_ROOT}/utils/parse_options.sh || exit -1; mkdir -p data +mkdir -p ${dict_dir} TARGET_DIR=${MAIN_ROOT}/examples/dataset mkdir -p ${TARGET_DIR} @@ -51,7 +53,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then python3 ${MAIN_ROOT}/utils/build_vocab.py \ --unit_type ${unit_type} \ --count_threshold=0 \ - --vocab_path="data/vocab.txt" \ + --vocab_path="${dict_dir}/vocab.txt" \ --manifest_paths="data/manifest.tiny.raw" if [ $? -ne 0 ]; then @@ -65,7 +67,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then python3 ${MAIN_ROOT}/utils/format_data.py \ --cmvn_path "data/mean_std.json" \ --unit_type ${unit_type} \ - --vocab_path="data/vocab.txt" \ + --vocab_path="${dict_dir}/vocab.txt" \ --manifest_path="data/manifest.tiny.raw" \ --output_path="data/manifest.tiny" diff --git a/examples/tiny/asr1/conf/chunk_confermer.yaml b/examples/tiny/asr1/conf/chunk_confermer.yaml index 6bed27f5..6183a903 100644 --- a/examples/tiny/asr1/conf/chunk_confermer.yaml +++ b/examples/tiny/asr1/conf/chunk_confermer.yaml @@ -12,7 +12,7 @@ data: collator: mean_std_filepath: "" - vocab_filepath: data/vocab.txt + vocab_filepath: data/lang_char/vocab.txt unit_type: 'spm' spm_model_prefix: 'data/bpe_unigram_200' augmentation_config: conf/preprocess.yaml diff --git a/examples/tiny/asr1/conf/chunk_transformer.yaml b/examples/tiny/asr1/conf/chunk_transformer.yaml index 7aed1b19..01d383fb 100644 --- a/examples/tiny/asr1/conf/chunk_transformer.yaml +++ b/examples/tiny/asr1/conf/chunk_transformer.yaml @@ -12,7 +12,7 @@ data: collator: mean_std_filepath: "" - vocab_filepath: data/vocab.txt + vocab_filepath: data/lang_char/vocab.txt unit_type: 'spm' spm_model_prefix: 'data/bpe_unigram_200' augmentation_config: conf/preprocess.yaml diff --git a/examples/tiny/asr1/conf/conformer.yaml b/examples/tiny/asr1/conf/conformer.yaml index 2c09b3ae..a3fee690 100644 --- a/examples/tiny/asr1/conf/conformer.yaml +++ b/examples/tiny/asr1/conf/conformer.yaml @@ -12,7 +12,7 @@ data: collator: mean_std_filepath: "" - vocab_filepath: data/vocab.txt + vocab_filepath: data/lang_char/vocab.txt unit_type: 'spm' spm_model_prefix: 'data/bpe_unigram_200' augmentation_config: conf/preprocess.yaml diff --git a/examples/tiny/asr1/conf/transformer.yaml b/examples/tiny/asr1/conf/transformer.yaml index 1378e848..5a87d6d2 100644 --- a/examples/tiny/asr1/conf/transformer.yaml +++ b/examples/tiny/asr1/conf/transformer.yaml @@ -12,7 +12,7 @@ data: collator: mean_std_filepath: data/mean_std.json - vocab_filepath: data/vocab.txt + vocab_filepath: data/lang_char/vocab.txt unit_type: 'spm' spm_model_prefix: 'data/bpe_unigram_200' augmentation_config: conf/preprocess.yaml diff --git a/examples/tiny/asr1/local/data.sh b/examples/tiny/asr1/local/data.sh index 87539d5e..1ef9f776 100755 --- a/examples/tiny/asr1/local/data.sh +++ b/examples/tiny/asr1/local/data.sh @@ -3,14 +3,17 @@ stage=-1 stop_stage=100 +dict_dir=data/lang_char + # bpemode (unigram or bpe) nbpe=200 bpemode=unigram -bpeprefix="data/bpe_${bpemode}_${nbpe}" +bpeprefix="${dict_dir}/bpe_${bpemode}_${nbpe}" -source ${MAIN_ROOT}/utils/parse_options.sh +. ${MAIN_ROOT}/utils/parse_options.sh || exit -1; mkdir -p data +mkdir -p ${dict_dir} TARGET_DIR=${MAIN_ROOT}/examples/dataset mkdir -p ${TARGET_DIR} @@ -57,7 +60,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then --spm_vocab_size=${nbpe} \ --spm_mode ${bpemode} \ --spm_model_prefix ${bpeprefix} \ - --vocab_path="data/vocab.txt" \ + --vocab_path="${dict_dir}/vocab.txt" \ --manifest_paths="data/manifest.tiny.raw" if [ $? -ne 0 ]; then @@ -72,7 +75,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then --cmvn_path "data/mean_std.json" \ --unit_type "spm" \ --spm_model_prefix ${bpeprefix} \ - --vocab_path="data/vocab.txt" \ + --vocab_path="${dict_dir}/vocab.txt" \ --manifest_path="data/manifest.tiny.raw" \ --output_path="data/manifest.tiny" diff --git a/examples/wenetspeech/asr1/conf/conformer.yaml b/examples/wenetspeech/asr1/conf/conformer.yaml index 0340dc85..a3a42ec6 100644 --- a/examples/wenetspeech/asr1/conf/conformer.yaml +++ b/examples/wenetspeech/asr1/conf/conformer.yaml @@ -51,7 +51,7 @@ data: max_output_input_ratio: 10.0 collator: - vocab_filepath: data/vocab.txt + vocab_filepath: data/lang_char/vocab.txt unit_type: 'char' spm_model_prefix: '' augmentation_config: conf/preprocess.yaml