From 417a8b79962c870606c3c417c007079849b9778e Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Mon, 16 Aug 2021 03:37:55 +0000 Subject: [PATCH] fix ted egs --- examples/ted_en_zh/t0/.gitignore | 3 +++ examples/ted_en_zh/t0/README.md | 10 ++++++++ examples/ted_en_zh/t0/local/data.sh | 38 ++++++++++++++--------------- examples/ted_en_zh/t0/run.sh | 2 +- 4 files changed, 33 insertions(+), 20 deletions(-) create mode 100644 examples/ted_en_zh/t0/.gitignore create mode 100644 examples/ted_en_zh/t0/README.md diff --git a/examples/ted_en_zh/t0/.gitignore b/examples/ted_en_zh/t0/.gitignore new file mode 100644 index 00000000..469c6171 --- /dev/null +++ b/examples/ted_en_zh/t0/.gitignore @@ -0,0 +1,3 @@ +TED-En-Zh +data +exp diff --git a/examples/ted_en_zh/t0/README.md b/examples/ted_en_zh/t0/README.md new file mode 100644 index 00000000..e2443d36 --- /dev/null +++ b/examples/ted_en_zh/t0/README.md @@ -0,0 +1,10 @@ + +# TED En-Zh + +## Dataset + +| Data Subset | Duration in Seconds | +| --- | --- | +| data/manifest.train | 0.942 ~ 60 | +| data/manifest.dev | 1.151 ~ 39 | +| data/manifest.test | 1.1 ~ 42.746 | diff --git a/examples/ted_en_zh/t0/local/data.sh b/examples/ted_en_zh/t0/local/data.sh index 0a5c58aa..32cfd9d7 100755 --- a/examples/ted_en_zh/t0/local/data.sh +++ b/examples/ted_en_zh/t0/local/data.sh @@ -7,37 +7,37 @@ stop_stage=100 nbpe=8000 bpemode=unigram bpeprefix="data/bpe_${bpemode}_${nbpe}" -DATA_DIR= +data_dir=/mnt/dataset/TED_EnZh source ${MAIN_ROOT}/utils/parse_options.sh - -mkdir -p data TARGET_DIR=${MAIN_ROOT}/examples/dataset mkdir -p ${TARGET_DIR} +mkdir -p data -if [ ! -d ${SOURCE_DIR} ]; then - echo "Error: Dataset is not avaiable. Please download and unzip the dataset" - echo "Download Link: https://pan.baidu.com/s/18L-59wgeS96WkObISrytQQ Passwd: bva0" - echo "The tree of the directory should be:" - echo "." - echo "|-- En-Zh" - echo "|-- test-segment" - echo " |-- tst2010" - echo " |-- ..." 
- echo "|-- train-split" - echo " |-- train-segment" - echo "|-- README.md" - - exit 1 -fi if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then + if [ ! -e ${data_dir} ]; then + echo "Error: Dataset is not available. Please download and unzip the dataset" + echo "Download Link: https://pan.baidu.com/s/18L-59wgeS96WkObISrytQQ Passwd: bva0" + echo "The tree of the directory should be:" + echo "." + echo "|-- En-Zh" + echo "|-- test-segment" + echo " |-- tst2010" + echo " |-- ..." + echo "|-- train-split" + echo " |-- train-segment" + echo "|-- README.md" + + exit 1 + fi + # generate manifests python3 ${TARGET_DIR}/ted_en_zh/ted_en_zh.py \ --manifest_prefix="data/manifest" \ - --src_dir="${DATA_DIR}" + --src_dir="${data_dir}" echo "Complete raw data pre-process." fi diff --git a/examples/ted_en_zh/t0/run.sh b/examples/ted_en_zh/t0/run.sh index 89048f3d..26fadb60 100755 --- a/examples/ted_en_zh/t0/run.sh +++ b/examples/ted_en_zh/t0/run.sh @@ -16,7 +16,7 @@ echo "checkpoint name ${ckpt}" if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then # prepare data - bash ./local/data.sh --DATA_DIR ${data_path} || exit -1 + bash ./local/data.sh --data_dir ${data_path} || exit -1 fi if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then