Merge pull request #755 from PaddlePaddle/ted

fix ted egs
pull/761/head
Hui Zhang 3 years ago committed by GitHub
commit 433c1dc41a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -0,0 +1,3 @@
TED-En-Zh
data
exp

@ -0,0 +1,10 @@
# TED En-Zh
## Dataset
| Data Subset | Duration in Seconds |
| --- | --- |
| data/manifest.train | 0.942 ~ 60 |
| data/manifest.dev | 1.151 ~ 39 |
| data/manifest.test | 1.1 ~ 42.746 |

@ -7,37 +7,37 @@ stop_stage=100
nbpe=8000 nbpe=8000
bpemode=unigram bpemode=unigram
bpeprefix="data/bpe_${bpemode}_${nbpe}" bpeprefix="data/bpe_${bpemode}_${nbpe}"
DATA_DIR= data_dir=/mnt/dataset/TED_EnZh
source ${MAIN_ROOT}/utils/parse_options.sh source ${MAIN_ROOT}/utils/parse_options.sh
mkdir -p data
TARGET_DIR=${MAIN_ROOT}/examples/dataset TARGET_DIR=${MAIN_ROOT}/examples/dataset
mkdir -p ${TARGET_DIR} mkdir -p ${TARGET_DIR}
mkdir -p data
if [ ! -d ${SOURCE_DIR} ]; then
echo "Error: Dataset is not avaiable. Please download and unzip the dataset"
echo "Download Link: https://pan.baidu.com/s/18L-59wgeS96WkObISrytQQ Passwd: bva0"
echo "The tree of the directory should be:"
echo "."
echo "|-- En-Zh"
echo "|-- test-segment"
echo " |-- tst2010"
echo " |-- ..."
echo "|-- train-split"
echo " |-- train-segment"
echo "|-- README.md"
exit 1
fi
if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
if [ ! -e ${data_dir} ]; then
echo "Error: Dataset is not avaiable. Please download and unzip the dataset"
echo "Download Link: https://pan.baidu.com/s/18L-59wgeS96WkObISrytQQ Passwd: bva0"
echo "The tree of the directory should be:"
echo "."
echo "|-- En-Zh"
echo "|-- test-segment"
echo " |-- tst2010"
echo " |-- ..."
echo "|-- train-split"
echo " |-- train-segment"
echo "|-- README.md"
exit 1
fi
# generate manifests # generate manifests
python3 ${TARGET_DIR}/ted_en_zh/ted_en_zh.py \ python3 ${TARGET_DIR}/ted_en_zh/ted_en_zh.py \
--manifest_prefix="data/manifest" \ --manifest_prefix="data/manifest" \
--src_dir="${DATA_DIR}" --src_dir="${data_dir}"
echo "Complete raw data pre-process." echo "Complete raw data pre-process."
fi fi

@ -16,7 +16,7 @@ echo "checkpoint name ${ckpt}"
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# prepare data # prepare data
bash ./local/data.sh --DATA_DIR ${data_path} || exit -1 bash ./local/data.sh --data_dir ${data_path} || exit -1
fi fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then

Loading…
Cancel
Save