From 6a50211c8042ce15392f37403edb54e59dd9a568 Mon Sep 17 00:00:00 2001
From: Junkun
Date: Thu, 25 Nov 2021 21:20:37 -0800
Subject: [PATCH] data process for ted-en-zh st1

---
 examples/ted_en_zh/st1/local/data.sh | 214 +++++++++++++++++++--------
 examples/ted_en_zh/st1/path.sh       |  10 +-
 2 files changed, 161 insertions(+), 63 deletions(-)

diff --git a/examples/ted_en_zh/st1/local/data.sh b/examples/ted_en_zh/st1/local/data.sh
index aa958cfd..72d141e7 100755
--- a/examples/ted_en_zh/st1/local/data.sh
+++ b/examples/ted_en_zh/st1/local/data.sh
@@ -2,16 +2,18 @@
 
 set -e
 
-stage=-1
+stage=1
 stop_stage=100
 dict_dir=data/lang_char
 
 # bpemode (unigram or bpe)
 nbpe=8000
-bpemode=unigram
+bpemode=bpe
 bpeprefix="${dict_dir}/bpe_${bpemode}_${nbpe}"
 
 data_dir=./TED_EnZh
-
+target_dir=data/ted_en_zh
+dumpdir=data/dump
+do_delta=false
 
 source ${MAIN_ROOT}/utils/parse_options.sh
 
@@ -38,75 +40,163 @@ if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
         exit 1
     fi
 
-    # generate manifests
-    python3 ${TARGET_DIR}/ted_en_zh/ted_en_zh.py \
-    --manifest_prefix="data/manifest" \
-    --src_dir="${data_dir}"
+    # # extract data
+    # echo "data Extraction"
+    # python3 local/ted_en_zh.py \
+    #     --tgt-dir=${target_dir} \
+    #     --src-dir=${data_dir}
 
-    echo "Complete raw data pre-process."
 fi
-
+prep_dir=${target_dir}/data_prep
 if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
-    # compute mean and stddev for normalizer
-    num_workers=$(nproc)
-    python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
-    --manifest_path="data/manifest.train.raw" \
-    --num_samples=-1 \
-    --spectrum_type="fbank" \
-    --feat_dim=80 \
-    --delta_delta=false \
-    --sample_rate=16000 \
-    --stride_ms=10.0 \
-    --window_ms=25.0 \
-    --use_dB_normalization=False \
-    --num_workers=${num_workers} \
-    --output_path="data/mean_std.json"
-
-    if [ $? -ne 0 ]; then
-        echo "Compute mean and stddev failed. Terminated."
-        exit 1
-    fi
+    ### Task dependent. You have to make the data preparation part by yourself.
+    ### But you can utilize Kaldi recipes in most cases.
+    echo "stage 0: Data preparation"
+    for set in train dev test; do
+        dst=${target_dir}/${set}
+        for lang in en zh; do
+            if [ ${lang} = 'en' ]; then
+                echo "remove punctuation ${lang}"
+                # remove punctuation
+                local/remove_punctuation.pl < ${dst}/${lang}.org > ${dst}/${lang}.raw
+            else
+                cp ${dst}/${lang}.org ${dst}/${lang}.raw
+            fi
+
+            paste -d " " ${dst}/.yaml ${dst}/${lang}.raw | sort > ${dst}/text.${lang}
+        done
+
+        # error check
+        n=$(cat ${dst}/.yaml | wc -l)
+        n_en=$(cat ${dst}/en.raw | wc -l)
+        n_tgt=$(cat ${dst}/zh.raw | wc -l)
+        [ ${n} -ne ${n_en} ] && echo "Error: expected ${n} data lines, found ${n_en}" && exit 1;
+        [ ${n} -ne ${n_tgt} ] && echo "Error: expected ${n} data lines, found ${n_tgt}" && exit 1;
+
+        echo "done text processing"
+        cat ${dst}/wav.scp.org | uniq | sort -k1,1 -u > ${dst}/wav.scp
+        cat ${dst}/utt2spk.org | uniq | sort -k1,1 -u > ${dst}/utt2spk
+        cat ${dst}/utt2spk | utils/utt2spk_to_spk2utt.pl | sort -k1,1 -u > ${dst}/spk2utt
+
+        rm -rf ${prep_dir}/${set}.en-zh
+        mkdir -p ${prep_dir}/${set}.en-zh
+        echo "remove duplicate lines..."
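+        # A sketch of the dedup logic below, assuming every utterance id
+        # begins with "ted-en-zh": `uniq -c` prefixes each id with its
+        # occurrence count, so ids seen exactly once match '1 ted-en-zh' and
+        # go to reclist, while repeated ids are recorded in duplicate_lines.
+        # `cut -d '1' -f 2-` strips everything up to the first '1' (the
+        # count), and sed removes the leftover leading whitespace.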
+        cut -d ' ' -f 1 ${dst}/text.en | sort | uniq -c | sort -n -k1 -r | grep -v '1 ted-en-zh' \
+            | sed 's/^[ \t]*//' > ${dst}/duplicate_lines
+        cut -d ' ' -f 1 ${dst}/text.en | sort | uniq -c | sort -n -k1 -r | grep '1 ted-en-zh' \
+            | cut -d '1' -f 2- | sed 's/^[ \t]*//' > ${dst}/reclist
+        reduce_data_dir.sh ${dst} ${dst}/reclist ${prep_dir}/${set}.en-zh
+        echo "done wav processing"
+
+        for l in en zh; do
+            cp ${dst}/text.${l} ${prep_dir}/${set}.en-zh/text.${l}
+        done
+        utils/fix_data_dir.sh --utt_extra_files \
+            "text.en text.zh" \
+            ${prep_dir}/${set}.en-zh
+    done
 fi
 
+feat_tr_dir=${dumpdir}/train/delta${do_delta}; mkdir -p ${feat_tr_dir}
+feat_dt_dir=${dumpdir}/dev/delta${do_delta}; mkdir -p ${feat_dt_dir}
+feat_trans_dir=${dumpdir}/test/delta${do_delta}; mkdir -p ${feat_trans_dir}
 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
-    # build vocabulary
-    python3 ${MAIN_ROOT}/utils/build_vocab.py \
-    --unit_type "spm" \
-    --spm_vocab_size=${nbpe} \
-    --spm_mode ${bpemode} \
-    --spm_model_prefix ${bpeprefix} \
-    --vocab_path="${dict_dir}/vocab.txt" \
-    --text_keys 'text' 'text1' \
-    --manifest_paths="data/manifest.train.raw"
-
-    if [ $? -ne 0 ]; then
-        echo "Build vocabulary failed. Terminated."
-        exit 1
-    fi
+    ### Task dependent. You have to design training and dev sets by yourself.
+    ### But you can utilize Kaldi recipes in most cases.
+    echo "stage 1: Feature Generation"
+    fbankdir=data/fbank
+    # Generate the fbank features; by default 80-dimensional fbanks with pitch on each frame
+    for x in train dev test; do
+        steps/make_fbank_pitch.sh --cmd "$train_cmd" --nj 32 --write_utt2num_frames true \
+            ${prep_dir}/${x}.en-zh exp/make_fbank/${x} ${fbankdir}
+    done
+
+    echo "speed perturbation"
+    utils/perturb_data_dir_speed.sh 0.9 ${prep_dir}/train.en-zh ${prep_dir}/temp1.en-zh
+    utils/perturb_data_dir_speed.sh 1.0 ${prep_dir}/train.en-zh ${prep_dir}/temp2.en-zh
+    utils/perturb_data_dir_speed.sh 1.1 ${prep_dir}/train.en-zh ${prep_dir}/temp3.en-zh
+    utils/combine_data.sh --extra-files utt2uniq ${prep_dir}/train_sp.en-zh \
+        ${prep_dir}/temp1.en-zh ${prep_dir}/temp2.en-zh ${prep_dir}/temp3.en-zh
+    rm -r ${prep_dir}/temp*.en-zh
+    utils/fix_data_dir.sh ${prep_dir}/train_sp.en-zh
+
+    steps/make_fbank_pitch.sh --cmd "$train_cmd" --nj 32 --write_utt2num_frames true \
+        ${prep_dir}/train_sp.en-zh exp/make_fbank/train_sp.en-zh ${fbankdir}
+
+    # map the perturbed utterance ids (sp0.9-*, sp1.0-*, sp1.1-*) back to the
+    # original transcriptions
+    for lang in en zh; do
+        cat /dev/null > ${prep_dir}/train_sp.en-zh/text.${lang}
+        for p in "sp0.9-" "sp1.0-" "sp1.1-"; do
+            awk -v p=${p} '{printf("%s %s%s\n", $1, p, $1);}' ${prep_dir}/train.en-zh/utt2spk > ${prep_dir}/train_sp.en-zh/utt_map
+            utils/apply_map.pl -f 1 ${prep_dir}/train_sp.en-zh/utt_map < ${prep_dir}/train.en-zh/text.${lang} >> ${prep_dir}/train_sp.en-zh/text.${lang}
+        done
+    done
+
+    for x in train_sp dev test; do
+        local/divide_lang.sh ${prep_dir}/${x}.en-zh zh
+    done
+
+    for x in train_sp dev; do
+        # remove utt having more than 3000 frames
+        # remove utt having more than 400 characters
+        for lang in zh en; do
+            remove_longshortdata.sh --maxframes 3000 --maxchars 400 ${prep_dir}/${x}.en-zh.${lang} ${prep_dir}/${x}.en-zh.${lang}.tmp
+        done
+        # keep only the utterances that survive the filtering in both languages
+        cut -f 1 -d " " ${prep_dir}/${x}.en-zh.en.tmp/text > ${prep_dir}/${x}.en-zh.en.tmp/reclist1
+        cut -f 1 -d " " ${prep_dir}/${x}.en-zh.zh.tmp/text > ${prep_dir}/${x}.en-zh.zh.tmp/reclist2
+        comm -12 ${prep_dir}/${x}.en-zh.en.tmp/reclist1 ${prep_dir}/${x}.en-zh.zh.tmp/reclist2 > ${prep_dir}/${x}.en-zh.en.tmp/reclist
+
+        for lang in zh en; do
+            reduce_data_dir.sh ${prep_dir}/${x}.en-zh.${lang}.tmp ${prep_dir}/${x}.en-zh.en.tmp/reclist ${prep_dir}/${x}.en-zh.${lang}
+            utils/fix_data_dir.sh ${prep_dir}/${x}.en-zh.${lang}
+        done
+        rm -rf ${prep_dir}/${x}.en-zh.*.tmp
+    done
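+    # Global CMVN statistics are estimated on the speed-perturbed training set
+    # only; dump.sh then applies that same cmvn.ark when dumping train, dev,
+    # and test, so dev/test features are normalized with training statistics
+    # (the usual Kaldi/ESPnet convention).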
+    compute-cmvn-stats scp:${prep_dir}/train_sp.en-zh/feats.scp ${prep_dir}/train_sp.en-zh/cmvn.ark
+
+    dump.sh --cmd "$train_cmd" --nj 80 --do_delta $do_delta \
+        ${prep_dir}/train_sp.en-zh/feats.scp ${prep_dir}/train_sp.en-zh/cmvn.ark ${prep_dir}/dump_feats/train_sp.en-zh ${feat_tr_dir}
+    dump.sh --cmd "$train_cmd" --nj 32 --do_delta $do_delta \
+        ${prep_dir}/dev.en-zh/feats.scp ${prep_dir}/train_sp.en-zh/cmvn.ark ${prep_dir}/dump_feats/dev.en-zh ${feat_dt_dir}
+    dump.sh --cmd "$train_cmd" --nj 32 --do_delta $do_delta \
+        ${prep_dir}/test.en-zh/feats.scp ${prep_dir}/train_sp.en-zh/cmvn.ark ${prep_dir}/dump_feats/test.en-zh ${feat_trans_dir}
 fi
 
+dict=${dict_dir}/ted_en_zh_${bpemode}${nbpe}_joint.txt
+nlsyms=${dict_dir}/ted_en_zh_non_lang_syms.txt
+bpemodel=${dict_dir}/ted_en_zh_${bpemode}${nbpe}
 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
-    # format manifest with tokenids, vocab size
-    for set in train dev test; do
-    {
-        python3 ${MAIN_ROOT}/utils/format_triplet_data.py \
-        --feat_type "raw" \
-        --cmvn_path "data/mean_std.json" \
-        --unit_type "spm" \
-        --spm_model_prefix ${bpeprefix} \
-        --vocab_path="${dict_dir}/vocab.txt" \
-        --manifest_path="data/manifest.${set}.raw" \
-        --output_path="data/manifest.${set}"
-
-        if [ $? -ne 0 ]; then
-            echo "Format manifest failed. Terminated."
-            exit 1
-        fi
-    }&
+    echo "stage 2: Dictionary and Json Data Preparation"
+    # echo "make a non-linguistic symbol list for all languages"
+    # grep sp1.0 ${prep_dir}/train_sp.en-zh.*/text | cut -f 2- -d' ' | grep -o -P '&[^;]*;' | sort | uniq > ${nlsyms}
+    # cat ${nlsyms}
+
+    echo "make a joint source and target dictionary"
+    echo "<unk> 1" > ${dict} # <unk> must be 1, 0 will be used for "blank" in CTC
+    offset=$(wc -l < ${dict})
+    grep sp1.0 ${prep_dir}/train_sp.en-zh.*/text | cut -f 2- -d' ' | grep -v -e '^\s*$' > ${dict_dir}/input.txt
+    spm_train --input=${dict_dir}/input.txt --vocab_size=${nbpe} --model_type=${bpemode} --model_prefix=${bpemodel} --input_sentence_size=100000000 --character_coverage=1.0
+    spm_encode --model=${bpemodel}.model --output_format=piece < ${dict_dir}/input.txt | tr ' ' '\n' | sort | uniq | awk -v offset=${offset} '{print $0 " " NR+offset}' >> ${dict}
+    wc -l ${dict}
+
+    echo "make json files"
+    data2json.sh --nj 16 --feat ${feat_tr_dir}/feats.scp --text ${prep_dir}/train_sp.en-zh.zh/text --bpecode ${bpemodel}.model --lang zh \
+        ${prep_dir}/train_sp.en-zh.zh ${dict} > ${feat_tr_dir}/data_${bpemode}${nbpe}.json
+    data2json.sh --feat ${feat_dt_dir}/feats.scp --text ${prep_dir}/dev.en-zh.zh/text --bpecode ${bpemodel}.model --lang zh \
+        ${prep_dir}/dev.en-zh.zh ${dict} > ${feat_dt_dir}/data_${bpemode}${nbpe}.json
+    data2json.sh --feat ${feat_trans_dir}/feats.scp --text ${prep_dir}/test.en-zh.zh/text --bpecode ${bpemodel}.model --lang zh \
+        ${prep_dir}/test.en-zh.zh ${dict} > ${feat_trans_dir}/data_${bpemode}${nbpe}.json
+
+    echo "update json (add source references)"
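+    # update_json.sh is assumed to behave as in ESPnet-style ST recipes: it
+    # attaches the English (source) transcription from the *.en-zh.en data
+    # directories as an extra reference to each target-side json, so every
+    # utterance carries both the zh translation and the en transcription.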
+    for x in train_sp dev; do
+        if [ ${x} = "train_sp" ]; then
+            feat_dir=${feat_tr_dir}
+        else
+            feat_dir=${feat_dt_dir}
+        fi
+        data_dir=${prep_dir}/${x}.en-zh.en
+        update_json.sh --text ${data_dir}/text --bpecode ${bpemodel}.model \
+            ${feat_dir}/data_${bpemode}${nbpe}.json ${data_dir} ${dict}
     done
-    wait
 fi
-
 echo "Ted En-Zh Data preparation done."
 exit 0

diff --git a/examples/ted_en_zh/st1/path.sh b/examples/ted_en_zh/st1/path.sh
index fd537917..ee4c9779 100644
--- a/examples/ted_en_zh/st1/path.sh
+++ b/examples/ted_en_zh/st1/path.sh
@@ -1,6 +1,6 @@
 export MAIN_ROOT=`realpath ${PWD}/../../../`
 
-export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
+export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PWD}/utils:${PATH}
 export LC_ALL=C
 
 export PYTHONDONTWRITEBYTECODE=1
@@ -13,3 +13,11 @@ export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib/
 
 MODEL=u2_st
 export BIN_DIR=${MAIN_ROOT}/paddlespeech/s2t/exps/${MODEL}/bin
+
+# Kaldi
+export KALDI_ROOT=${MAIN_ROOT}/tools/kaldi
+[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh
+export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH
+[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present; cannot use Kaldi!"
+[ -f $KALDI_ROOT/tools/config/common_path.sh ] && . $KALDI_ROOT/tools/config/common_path.sh
+export train_cmd="run.pl"
\ No newline at end of file