Merge pull request #755 from PaddlePaddle/ted

fix ted egs
pull/761/head
Hui Zhang 3 years ago committed by GitHub
commit 433c1dc41a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -0,0 +1,3 @@
TED-En-Zh
data
exp

@ -0,0 +1,10 @@
# TED En-Zh
## Dataset
| Data Subset | Duration in Seconds |
| --- | --- |
| data/manifest.train | 0.942 ~ 60 |
| data/manifest.dev | 1.151 ~ 39 |
| data/manifest.test | 1.1 ~ 42.746 |

@ -7,17 +7,18 @@ stop_stage=100
nbpe=8000
bpemode=unigram
bpeprefix="data/bpe_${bpemode}_${nbpe}"
DATA_DIR=
data_dir=/mnt/dataset/TED_EnZh
source ${MAIN_ROOT}/utils/parse_options.sh
mkdir -p data
TARGET_DIR=${MAIN_ROOT}/examples/dataset
mkdir -p ${TARGET_DIR}
mkdir -p data
if [ ! -d ${SOURCE_DIR} ]; then
if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
if [ ! -e ${data_dir} ]; then
echo "Error: Dataset is not available. Please download and unzip the dataset"
echo "Download Link: https://pan.baidu.com/s/18L-59wgeS96WkObISrytQQ Passwd: bva0"
echo "The tree of the directory should be:"
@ -31,13 +32,12 @@ if [ ! -d ${SOURCE_DIR} ]; then
echo "|-- README.md"
exit 1
fi
fi
if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
# generate manifests
python3 ${TARGET_DIR}/ted_en_zh/ted_en_zh.py \
--manifest_prefix="data/manifest" \
--src_dir="${DATA_DIR}"
--src_dir="${data_dir}"
echo "Complete raw data pre-process."
fi

@ -16,7 +16,7 @@ echo "checkpoint name ${ckpt}"
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# prepare data
bash ./local/data.sh --DATA_DIR ${data_path} || exit -1
bash ./local/data.sh --data_dir ${data_path} || exit -1
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then

Loading…
Cancel
Save