You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
54 lines
1.7 KiB
54 lines
1.7 KiB
#! /usr/bin/env bash
#
# Prepare the THCHS-30 data set in stages: generate manifests, dump them,
# collect the lexicon files and reorganize the corpus.
#
# Positional arguments:
#   $1  lexicon name, forwarded as --script-type to the reorganization step
# Required environment:
#   MAIN_ROOT  root directory that contains utils/ and dataset/

# First and last stage to run (inclusive). These are only the defaults that
# are in effect before parse_options.sh is sourced.
stage=-1
stop_stage=100

# MAIN_ROOT is never assigned in this script. Without this guard an unset
# value makes every path below silently resolve against "/".
: "${MAIN_ROOT:?MAIN_ROOT must be set to the project root}"

# NOTE(review): parse_options.sh presumably overrides the variables above from
# command-line options and removes them from "$@" - confirm against that file.
source "${MAIN_ROOT}/utils/parse_options.sh" || exit 1

mkdir -p data || exit 1
TARGET_DIR=${MAIN_ROOT}/dataset
mkdir -p "${TARGET_DIR}" || exit 1

# First positional argument (empty when none is given).
LEXICON_NAME=$1

# Stage -1: download data, generate manifests.
if [ "${stage}" -le -1 ] && [ "${stop_stage}" -ge -1 ]; then
    # Test the command itself instead of reading $? afterwards, so that no
    # statement inserted in between can clobber the exit status.
    if ! python3 "${TARGET_DIR}/thchs30/thchs30.py" \
        --manifest_prefix="data/manifest" \
        --target_dir="${TARGET_DIR}/thchs30"; then
        # Diagnostics go to stderr so they are not mixed with normal output.
        echo "Prepare THCHS-30 failed. Terminated." >&2
        exit 1
    fi
fi

# Stage 0: dump manifest to data/.
if [ "${stage}" -le 0 ] && [ "${stop_stage}" -ge 0 ]; then
    # Abort on failure so that later stages do not run on missing output.
    if ! python3 "${MAIN_ROOT}/utils/dump_manifest.py" \
        --manifest-path=data/manifest.train \
        --output-dir=data; then
        echo "Dump manifest failed. Terminated." >&2
        exit 1
    fi
fi

# Stage 1: collect the lexicon files shipped with THCHS-30 under data/dict.
if [ "${stage}" -le 1 ] && [ "${stop_stage}" -ge 1 ]; then
    # Nothing in this script creates data/dict, so make sure the destination
    # exists before copying into it (no-op when it is already there).
    mkdir -p data/dict || exit 1

    # copy files to data/dict to gen word.lexicon
    cp "${TARGET_DIR}/thchs30/data_thchs30/lm_word/lexicon.txt" \
        data/dict/lm_word_lexicon_1 || exit 1
    cp "${TARGET_DIR}/thchs30/resource/dict/lexicon.txt" \
        data/dict/lm_word_lexicon_2 || exit 1

    # copy phone.lexicon to data/dict
    cp "${TARGET_DIR}/thchs30/data_thchs30/lm_phone/lexicon.txt" \
        data/dict/phone.lexicon || exit 1
fi

# Stage 2: gen word.lexicon from the two word lexicons copied into data/dict.
if [ "${stage}" -le 2 ] && [ "${stop_stage}" -ge 2 ]; then
    # Both input paths are passed as ONE space-separated option value.
    # Abort on failure instead of falling through to the next stage.
    if ! python local/gen_word2phone.py \
        --lexicon-files="data/dict/lm_word_lexicon_1 data/dict/lm_word_lexicon_2" \
        --output-path=data/dict/word.lexicon; then
        echo "Generate word.lexicon failed. Terminated." >&2
        exit 1
    fi
fi

# Stage 3: reorganize dataset for MFA.
if [ "${stage}" -le 3 ] && [ "${stop_stage}" -ge 3 ]; then
    # NOTE(review): the guard looks at ${EXP_DIR}/thchs30_corpus while the
    # output is written to data/thchs30_corpus, and EXP_DIR is not assigned in
    # this script. Confirm which directory is meant before relying on the skip.
    if [ ! -d "${EXP_DIR}/thchs30_corpus" ]; then
        echo "reorganizing thchs30 corpus..."
        # Only report completion when the reorganization actually succeeded.
        if ! python local/reorganize_thchs30.py \
            --root-dir=data \
            --output-dir=data/thchs30_corpus \
            --script-type="${LEXICON_NAME}"; then
            echo "Reorganize thchs30 corpus failed. Terminated." >&2
            exit 1
        fi
        echo "reorganization done."
    fi
fi

# Reached only when no stage above terminated the script; report completion
# and return a success status to the caller.
printf '%s\n' "THCHS-30 data preparation done."
exit 0