diff --git a/examples/ted_en_zh/st1/local/data.sh b/examples/ted_en_zh/st1/local/data.sh
index 72d141e7d..8b829a8a1 100755
--- a/examples/ted_en_zh/st1/local/data.sh
+++ b/examples/ted_en_zh/st1/local/data.sh
@@ -2,7 +2,7 @@

 set -e

-stage=1
+stage=3
 stop_stage=100

 dict_dir=data/lang_char
@@ -14,6 +14,7 @@ data_dir=./TED_EnZh
 target_dir=data/ted_en_zh
 dumpdir=data/dump
 do_delta=false
+nj=20

 source ${MAIN_ROOT}/utils/parse_options.sh

@@ -40,11 +41,11 @@ if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
         exit 1
     fi

-    # # extract data
-    # echo "data Extraction"
-    # python3 local/ted_en_zh.py \
-    # --tgt-dir=${target_dir} \
-    # --src-dir=${data_dir}
+    # extract data
+    echo "data Extraction"
+    python3 local/ted_en_zh.py \
+        --tgt-dir=${target_dir} \
+        --src-dir=${data_dir}
 fi

 prep_dir=${target_dir}/data_prep
@@ -99,7 +100,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
     done
 fi

-feat_tr_dir=${dumpdir}/train/delta${do_delta}; mkdir -p ${feat_tr_dir}
+feat_tr_dir=${dumpdir}/train_sp/delta${do_delta}; mkdir -p ${feat_tr_dir}
 feat_dt_dir=${dumpdir}/dev/delta${do_delta}; mkdir -p ${feat_dt_dir}
 feat_trans_dir=${dumpdir}/test/delta${do_delta}; mkdir -p ${feat_trans_dir}
 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
@@ -109,7 +110,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     fbankdir=data/fbank
     # Generate the fbank features; by default 80-dimensional fbanks with pitch on each frame
     for x in train dev test; do
-        steps/make_fbank_pitch.sh --cmd "$train_cmd" --nj 32 --write_utt2num_frames true \
+        steps/make_fbank_pitch.sh --cmd "$train_cmd" --nj ${nj} --write_utt2num_frames true \
            ${prep_dir}/${x}.en-zh data/make_fbank/${x} ${fbankdir}
     done

@@ -123,7 +124,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     rm -r ${prep_dir}/temp*.en-zh
     utils/fix_data_dir.sh ${prep_dir}/train_sp.en-zh

-    steps/make_fbank_pitch.sh --cmd "$train_cmd" --nj 32 --write_utt2num_frames true \
+    steps/make_fbank_pitch.sh --cmd "$train_cmd" --nj ${nj} --write_utt2num_frames true \
        ${prep_dir}/train_sp.en-zh exp/make_fbank/train_sp.en-zh ${fbankdir}

     for lang in en zh; do
@@ -155,14 +156,14 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
        rm -rf ${prep_dir}/${x}.en-zh.*.tmp
     done

-    compute-cmvn-stats scp:${prep_dir}/train_sp.en-zh/feats.scp ${prep_dir}/train_sp.en-zh/cmvn.ark
+    compute-cmvn-stats scp:${prep_dir}/train_sp.en-zh.zh/feats.scp ${prep_dir}/train_sp.en-zh.zh/cmvn.ark

-    dump.sh --cmd "$train_cmd" --nj 80 --do_delta $do_delta \
-        ${prep_dir}/train_sp.en-zh/feats.scp ${prep_dir}/train_sp.en-zh/cmvn.ark ${prep_dir}/dump_feats/train_sp.en-zh ${feat_tr_dir}
-    dump.sh --cmd "$train_cmd" --nj 32 --do_delta $do_delta \
-        ${prep_dir}/dev.en-zh/feats.scp ${prep_dir}/train_sp.en-zh/cmvn.ark ${prep_dir}/dump_feats/dev.en-zh ${feat_dt_dir}
-    dump.sh --cmd "$train_cmd" --nj 32 --do_delta $do_delta \
-        ${prep_dir}/test.en-zh/feats.scp ${prep_dir}/train_sp.en-zh/cmvn.ark ${prep_dir}/dump_feats/test.en-zh ${feat_trans_dir}
+    dump.sh --cmd "$train_cmd" --nj ${nj} --do_delta $do_delta \
+        ${prep_dir}/train_sp.en-zh.zh/feats.scp ${prep_dir}/train_sp.en-zh.zh/cmvn.ark ${prep_dir}/dump_feats/train_sp.en-zh.zh ${feat_tr_dir}
+    dump.sh --cmd "$train_cmd" --nj ${nj} --do_delta $do_delta \
+        ${prep_dir}/dev.en-zh.zh/feats.scp ${prep_dir}/train_sp.en-zh.zh/cmvn.ark ${prep_dir}/dump_feats/dev.en-zh.zh ${feat_dt_dir}
+    dump.sh --cmd "$train_cmd" --nj ${nj} --do_delta $do_delta \
+        ${prep_dir}/test.en-zh.zh/feats.scp ${prep_dir}/train_sp.en-zh.zh/cmvn.ark ${prep_dir}/dump_feats/test.en-zh.zh ${feat_trans_dir}
 fi

 dict=${dict_dir}/ted_en_zh_${bpemode}${nbpe}_joint.txt
@@ -170,9 +171,6 @@ nlsyms=${dict_dir}/ted_en_zh_non_lang_syms.txt
 bpemodel=${dict_dir}/ted_en_zh_${bpemode}${nbpe}

 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
     echo "stage 2: Dictionary and Json Data Preparation"
-    # echo "make a non-linguistic symbol list for all languages"
-    # grep sp1.0 ${prep_dir}/train_sp.en-zh.*/text | cut -f 2- -d' ' | grep -o -P '&[^;];'| sort | uniq > ${nlsyms}
-    # cat ${nlsyms}
     echo "make a joint source and target dictionary"
     echo "<unk> 1" > ${dict} # <unk> must be 1, 0 will be used for "blank" in CTC
@@ -183,20 +181,27 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
     wc -l ${dict}
     echo "make json files"
-    data2json.sh --nj 16 --feat ${feat_tr_dir}/feats.scp --text ${prep_dir}/train_sp.en-zh.zh/text --bpecode ${bpemodel}.model --lang zh \
+    data2json.sh --nj ${nj} --feat ${feat_tr_dir}/feats.scp --text ${prep_dir}/train_sp.en-zh.zh/text --bpecode ${bpemodel}.model --lang zh \
        ${prep_dir}/train_sp.en-zh.zh ${dict} > ${feat_tr_dir}/data_${bpemode}${nbpe}.json
     data2json.sh --feat ${feat_dt_dir}/feats.scp --text ${prep_dir}/dev.en-zh.zh/text --bpecode ${bpemodel}.model --lang zh \
        ${prep_dir}/dev.en-zh.zh ${dict} > ${feat_dt_dir}/data_${bpemode}${nbpe}.json
-    data2json.sh --feat ${feat_dt_dir}/feats.scp --text ${prep_dir}/test.en-zh.zh/text --bpecode ${bpemodel}.model --lang zh \
+    data2json.sh --feat ${feat_trans_dir}/feats.scp --text ${prep_dir}/test.en-zh.zh/text --bpecode ${bpemodel}.model --lang zh \
        ${prep_dir}/test.en-zh.zh ${dict} > ${feat_trans_dir}/data_${bpemode}${nbpe}.json
     echo "update json (add source references)"
     # update json (add source references)
-    for x in ${train_set} ${train_dev}; do
+    for x in train_sp dev; do
        feat_dir=${dumpdir}/${x}/delta${do_delta}
-        data_dir=data/$(echo ${x} | cut -f 1 -d ".").en-zh.en
-        update_json.sh --text ${data_dir}/text.${src_case} --bpecode ${bpemodel}.model \
+        data_dir=${prep_dir}/$(echo ${x} | cut -f 1 -d ".").en-zh.en
+        update_json.sh --text ${data_dir}/text --bpecode ${bpemodel}.model \
            ${feat_dir}/data_${bpemode}${nbpe}.json ${data_dir} ${dict}
     done
 fi
+
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+    echo "stage 3: Format the Json Data"
+    python3 local/espnet_json_to_manifest.py --json-file ${feat_tr_dir}/data_${bpemode}${nbpe}.json --manifest-file data/manifest.train
+    python3 local/espnet_json_to_manifest.py --json-file ${feat_dt_dir}/data_${bpemode}${nbpe}.json --manifest-file data/manifest.dev
+    python3 local/espnet_json_to_manifest.py --json-file ${feat_trans_dir}/data_${bpemode}${nbpe}.json --manifest-file data/manifest.test
+fi

 echo "Ted En-Zh Data preparation done."
 exit 0
diff --git a/examples/ted_en_zh/st1/local/data_prep.sh b/examples/ted_en_zh/st1/local/data_prep.sh
new file mode 100755
index 000000000..339cee1eb
--- /dev/null
+++ b/examples/ted_en_zh/st1/local/data_prep.sh
@@ -0,0 +1,54 @@
+#!/bin/bash
+
+# Copyright 2019 Kyoto University (Hirofumi Inaguma)
+# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
+
+export LC_ALL=C
+
+data_dir=${1}
+
+for set in train dev test; do
+# for set in train; do
+    dst=${target_dir}/${set}
+    for lang in en zh; do
+
+        if [ ${lang} = 'en' ]; then
+            echo "remove punctuation $lang"
+            # remove punctuation
+            local/remove_punctuation.pl < ${dst}/${lang}.org > ${dst}/${lang}.raw
+        else
+            cp ${dst}/${lang}.org ${dst}/${lang}.raw
+        fi
+
+        paste -d " " ${dst}/.yaml ${dst}/${lang}.raw | sort > ${dst}/text.${lang}
+
+
+    done
+    # error check
+    n=$(cat ${dst}/.yaml | wc -l)
+    n_en=$(cat ${dst}/en.raw | wc -l)
+    n_tgt=$(cat ${dst}/zh.raw | wc -l)
+    [ ${n} -ne ${n_en} ] && echo "Warning: expected ${n} data files, found ${n_en}" && exit 1;
+    [ ${n} -ne ${n_tgt} ] && echo "Warning: expected ${n} data files, found ${n_tgt}" && exit 1;
+
+    echo "done text processing"
+    cat ${dst}/wav.scp.org | uniq | sort -k1,1 -u > ${dst}/wav.scp
+    cat ${dst}/utt2spk.org | uniq | sort -k1,1 -u > ${dst}/utt2spk
+
+    cat ${dst}/utt2spk | utt2spk_to_spk2utt.pl | sort -k1,1 -u > ${dst}/spk2utt
+    rm -rf ${target_dir}/data_prep/${set}.en-zh
+    mkdir -p ${target_dir}/data_prep/${set}.en-zh
+    echo "remove duplicate lines..."
+    cut -d ' ' -f 1 ${dst}/text.en | sort | uniq -c | sort -n -k1 -r | grep -v '1 ted-en-zh' \
+        | sed 's/^[ \t]*//' > ${dst}/duplicate_lines
+    cut -d ' ' -f 1 ${dst}/text.en | sort | uniq -c | sort -n -k1 -r | grep '1 ted-en-zh' \
+        | cut -d '1' -f 2- | sed 's/^[ \t]*//' > ${dst}/reclist
+    reduce_data_dir.sh ${dst} ${dst}/reclist ${target_dir}/data_prep/${set}.en-zh
+    echo "done wav processing"
+    for l in en zh; do
+        cp ${dst}/text.${l} ${target_dir}/data_prep/${set}.en-zh/text.${l}
+    done
+    fix_data_dir.sh --utt_extra_files \
+        "text.en text.zh" \
+        ${target_dir}/data_prep/${set}.en-zh
+done
\ No newline at end of file
diff --git a/examples/ted_en_zh/st1/local/divide_lang.sh b/examples/ted_en_zh/st1/local/divide_lang.sh
new file mode 100755
index 000000000..4e5f85c86
--- /dev/null
+++ b/examples/ted_en_zh/st1/local/divide_lang.sh
@@ -0,0 +1,48 @@
+#!/bin/bash
+
+# Copyright 2019 Kyoto University (Hirofumi Inaguma)
+#           2021 PaddlePaddle
+# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
+
+. ./path.sh
+
+if [ "$#" -ne 2 ]; then
+    echo "Usage: $0 <set> <lang>"
+    echo "e.g.: $0 dev zh"
+    exit 1
+fi
+
+set=$1
+lang=$2
+export LC_ALL=en_US.UTF-8
+# Copy stuff into its final locations [this has been moved from the format_data script]
+# for En
+mkdir -p ${set}.en
+for f in spk2utt utt2spk segments wav.scp feats.scp utt2num_frames; do
+    if [ -f ${set}/${f} ]; then
+        sort ${set}/${f} > ${set}.en/${f}
+    fi
+done
+sort ${set}/text.en | sed $'s/[^[:print:]]//g' > ${set}.en/text
+
+utils/fix_data_dir.sh ${set}.en
+if [ -f ${set}.en/feats.scp ]; then
+    utils/validate_data_dir.sh ${set}.en || exit 1;
+else
+    utils/validate_data_dir.sh --no-feats --no-wav ${set}.en || exit 1;
+fi
+
+# for target language
+mkdir -p ${set}.${lang}
+for f in spk2utt utt2spk segments wav.scp feats.scp utt2num_frames; do
+    if [ -f ${set}/${f} ]; then
+        sort ${set}/${f} > ${set}.${lang}/${f}
+    fi
+done
+sort ${set}/text.${lang} | sed $'s/[^[:print:]]//g' > ${set}.${lang}/text
+utils/fix_data_dir.sh ${set}.${lang}
+if [ -f ${set}.${lang}/feats.scp ]; then
+    utils/validate_data_dir.sh ${set}.${lang} || exit 1;
+else
+    utils/validate_data_dir.sh --no-feats --no-wav ${set}.${lang} || exit 1;
+fi
diff --git a/examples/ted_en_zh/st1/local/espnet_json_to_manifest.py b/examples/ted_en_zh/st1/local/espnet_json_to_manifest.py
new file mode 100644
index 000000000..60d254367
--- /dev/null
+++ b/examples/ted_en_zh/st1/local/espnet_json_to_manifest.py
@@ -0,0 +1,27 @@
+#!/usr/bin/env python
+import argparse
+import json
+
+
+def main(args):
+    with open(args.json_file, 'r') as fin:
+        data_json = json.load(fin)
+
+    with open(args.manifest_file, 'w') as fout:
+        for key, value in data_json['utts'].items():
+            value['utt'] = key
+            fout.write(json.dumps(value, ensure_ascii=False))
+            fout.write("\n")
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument(
+        '--json-file', type=str, default=None, help="espnet data json file.")
+    parser.add_argument(
+        '--manifest-file',
+        type=str,
+        default='manifest.train',
+        help='manifest data json line file.')
+    args = parser.parse_args()
+    main(args)
diff --git a/examples/ted_en_zh/st1/local/remove_punctuation.pl b/examples/ted_en_zh/st1/local/remove_punctuation.pl
new file mode 100755
index 000000000..89e19c6f4
--- /dev/null
+++ b/examples/ted_en_zh/st1/local/remove_punctuation.pl
@@ -0,0 +1,25 @@
+#!/usr/bin/perl
+
+use warnings;
+use strict;
+
+binmode(STDIN,":utf8");
+binmode(STDOUT,":utf8");
+
+while(<STDIN>) {
+  $_ = " $_ ";
+
+  # remove punctuation except apostrophe
+  s/<space>/spacemark/g;  # for scoring
+  s/'/apostrophe/g;
+  s/[[:punct:]]//g;
+  s/apostrophe/'/g;
+  s/spacemark/<space>/g;  # for scoring
+
+  # remove whitespace
+  s/\s+/ /g;
+  s/^\s+//;
+  s/\s+$//;
+
+  print "$_\n";
+}
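
For reference, each line that the new stage 3 writes to data/manifest.{train,dev,test} is one espnet utterance entry re-serialized as a single JSON object, with the utterance id added under the "utt" key by local/espnet_json_to_manifest.py. Below is a minimal sketch of consuming such a manifest; only the "utt" field is guaranteed by that script, while any other fields simply pass through from the espnet data json and are not assumed here.

#!/usr/bin/env python
# Illustrative only: iterate over a jsonline manifest produced by stage 3.
import json


def read_manifest(path):
    """Yield one dict per non-empty line of a jsonline manifest file."""
    with open(path, encoding='utf-8') as fin:
        for line in fin:
            line = line.strip()
            if line:
                yield json.loads(line)


if __name__ == '__main__':
    for entry in read_manifest('data/manifest.dev'):
        print(entry['utt'])  # utterance id added by espnet_json_to_manifest.py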