diff --git a/examples/ted_en_zh/st1/local/data_prep.sh b/examples/ted_en_zh/st1/local/data_prep.sh deleted file mode 100755 index 339cee1eb..000000000 --- a/examples/ted_en_zh/st1/local/data_prep.sh +++ /dev/null @@ -1,54 +0,0 @@ -#!/bin/bash - -# Copyright 2019 Kyoto University (Hirofumi Inaguma) -# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) - -export LC_ALL=C - -data_dir=${1} - -for set in train dev test; do -# for set in train; do - dst=${target_dir}/${set} - for lang in en zh; do - - if [ ${lang} = 'en' ]; then - echo "remove punctuation $lang" - # remove punctuation - local/remove_punctuation.pl < ${dst}/${lang}.org > ${dst}/${lang}.raw - else - cp ${dst}/${lang}.org ${dst}/${lang}.raw - fi - - paste -d " " ${dst}/.yaml ${dst}/${lang}.raw | sort > ${dst}/text.${lang} - - - done - # error check - n=$(cat ${dst}/.yaml | wc -l) - n_en=$(cat ${dst}/en.raw | wc -l) - n_tgt=$(cat ${dst}/zh.raw | wc -l) - [ ${n} -ne ${n_en} ] && echo "Warning: expected ${n} data data files, found ${n_en}" && exit 1; - [ ${n} -ne ${n_tgt} ] && echo "Warning: expected ${n} data data files, found ${n_tgt}" && exit 1; - - echo "done text processing" - cat ${dst}/wav.scp.org | uniq | sort -k1,1 -u > ${dst}/wav.scp - cat ${dst}/utt2spk.org | uniq | sort -k1,1 -u > ${dst}/utt2spk - - cat ${dst}/utt2spk | utt2spk_to_spk2utt.pl | sort -k1,1 -u > ${dst}/spk2utt - rm -rf ${target_dir}/data_prep/${set}.en-zh - mkdir -p ${target_dir}/data_prep/${set}.en-zh - echo "remove duplicate lines..." - cut -d ' ' -f 1 ${dst}/text.en | sort | uniq -c | sort -n -k1 -r | grep -v '1 ted-en-zh' \ - | sed 's/^[ \t]*//' > ${dst}/duplicate_lines - cut -d ' ' -f 1 ${dst}/text.en | sort | uniq -c | sort -n -k1 -r | grep '1 ted-en-zh' \ - | cut -d '1' -f 2- | sed 's/^[ \t]*//' > ${dst}/reclist - reduce_data_dir.sh ${dst} ${dst}/reclist ${target_dir}/data_prep/${set}.en-zh - echo "done wav processing" - for l in en zh; do - cp ${dst}/text.${l} ${target_dir}/data_prep/${set}.en-zh/text.${l} - done - fix_data_dir.sh --utt_extra_files \ - "text.en text.zh" \ - ${target_dir}/data_prep/${set}.en-zh -done \ No newline at end of file diff --git a/examples/ted_en_zh/st1/path.sh b/examples/ted_en_zh/st1/path.sh index ee4c9779f..867cdb48a 100644 --- a/examples/ted_en_zh/st1/path.sh +++ b/examples/ted_en_zh/st1/path.sh @@ -19,5 +19,4 @@ export KALDI_ROOT=${MAIN_ROOT}/tools/kaldi [ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH [ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present, can not using Kaldi!" -[ -f $KALDI_ROOT/tools/config/common_path.sh ] && . $KALDI_ROOT/tools/config/common_path.sh -export train_cmd="run.pl" \ No newline at end of file +[ -f $KALDI_ROOT/tools/config/common_path.sh ] && . $KALDI_ROOT/tools/config/common_path.sh \ No newline at end of file diff --git a/examples/ted_en_zh/st1/run.sh b/examples/ted_en_zh/st1/run.sh index a1c99af30..f6362a8b3 100755 --- a/examples/ted_en_zh/st1/run.sh +++ b/examples/ted_en_zh/st1/run.sh @@ -1,12 +1,13 @@ #!/bin/bash set -e -source path.sh +. ./path.sh || exit 1; +. ./cmd.sh || exit 1; gpus=0,1,2,3 stage=1 stop_stage=4 conf_path=conf/transformer_mtl_noam.yaml -ckpt= # paddle.98 # (finetune from FAT-ST pretrained model) +ckpt_path= # paddle.98 # (finetune from FAT-ST pretrained model) avg_num=5 data_path=./TED_EnZh # path to unzipped data source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;