diff --git a/examples/chinese_g2p/data/README.md b/examples/chinese_g2p/data/README.md new file mode 100644 index 000000000..2e25312c2 --- /dev/null +++ b/examples/chinese_g2p/data/README.md @@ -0,0 +1,3 @@ +# Download Baker dataset + +The Baker dataset has to be downloaded manually and moved to this folder, because you will have to pass a CAPTCHA in a browser to download the dataset. diff --git a/examples/chinese_g2p/local/data_download.sh b/examples/chinese_g2p/local/data_download.sh new file mode 100644 index 000000000..13fe0ac84 --- /dev/null +++ b/examples/chinese_g2p/local/data_download.sh @@ -0,0 +1,21 @@ +#!/usr/bin/env bash + +. ${MAIN_ROOT}/utils/utility.sh + +DOWNLOAD_DIR=$(dirname $0)/../data +mkdir -p ${DOWNLOAD_DIR} + +# you may need to pass the authentification to download the data via a browser +URL=https://online-of-baklong.oss-cn-huhehaote.aliyuncs.com/story_resource/BZNSYP.rar + +MD5="c4350563bf7dc298f7dd364b2607be83" +TARGET=${DOWNLOAD_DIR}/BZNSYP.rar + +echo "Download Baker TTS dataset..." +download ${URL} ${MD5} ${TARGET} +if [ $? -ne 0 ]; then + echo "Fail to downlaod Baker TTS dataset!" 
# Matches a '#' followed by one digit; presumably the prosodic-boundary tags
# embedded in the label text (verify against the dataset's label format).
_PROSODY_MARK = re.compile(r'#\d')


def extract_pinyin(source, target, use_jieba=False):
    """Transcribe the text lines of a prosody label file into pinyin.

    Only the even-indexed lines (0-based) of ``source`` are processed; each
    must hold a sentence id, whitespace, and then the text. Odd-indexed
    lines are skipped. For every processed line, the line itself is copied
    to ``target``, followed by a second line made of a tab and the
    space-separated syllables returned by ``lazy_pinyin``.

    Args:
        source: Path of the label file to read (UTF-8).
        target: Path of the file to write (UTF-8, overwritten).
        use_jieba: If true, segment the text into words with ``jieba.lcut``
            before the pinyin conversion.

    Raises:
        ValueError: If an even-indexed line does not contain both a sentence
            id and a text field.
    """
    with open(source, 'rt', encoding='utf-8') as reader, \
            open(target, 'wt', encoding='utf-8') as writer:
        for index, line in enumerate(reader):
            if index % 2:
                # Odd-indexed lines are not transcribed.
                continue

            # Split only once so that whitespace inside the text does not
            # break the "<id> <text>" parsing.
            fields = line.strip().split(maxsplit=1)
            if len(fields) != 2:
                raise ValueError(
                    f'{source}: line {index + 1} is not "<id> <text>": '
                    f'{line!r}')
            raw_text = _PROSODY_MARK.sub('', fields[1])

            if use_jieba:
                # lazy_pinyin also accepts a list of pre-segmented words.
                raw_text = jieba.lcut(raw_text)
            syllables = lazy_pinyin(
                raw_text,
                errors='ignore',
                style=Style.TONE3,
                neutral_tone_with_five=True)

            # Keep the transcription on its own line even when the last
            # input line has no trailing newline.
            writer.write(line if line.endswith('\n') else line + '\n')
            writer.write('\t' + ' '.join(syllables) + '\n')
#!/usr/bin/env bash
# Recipe driver: optionally runs the dataset download step, extracts the
# prosody label file from the archive, and converts its text to pinyin twice
# (pypinyin alone, and jieba word segmentation + pypinyin).
source path.sh

stage=0
stop_stage=100

# Use an in-range, non-zero status when option parsing fails.
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1

if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    bash local/data_download.sh || exit 1
fi

EXP_DIR="exp"
mkdir -p "${EXP_DIR}"

ARCHIVE="data/BZNSYP.rar"

echo "Extracting Prosody Labeling"
LABEL_FILE='ProsodyLabeling/000001-010000.txt'
FILENAME='000001-010000.txt'
# "unrar e" extracts without the archived directory, so the file lands in the
# current directory and is then moved into the experiment directory.
unrar e "${ARCHIVE}" "${LABEL_FILE}" && mv "${FILENAME}" "${EXP_DIR}"

if [ ! -f "${EXP_DIR}/${FILENAME}" ]; then
    echo "File extraction failed!"
    # A bare "exit" would return the status of the echo above (0) and report
    # success to the caller; fail explicitly instead.
    exit 1
fi

# convert transcription in chinese into pinyin with pypinyin or jieba+pypinyin
python3 local/extract_pinyin.py "${EXP_DIR}/${FILENAME}" "${EXP_DIR}/pypinyin_result.txt" || exit 1
python3 local/extract_pinyin.py --use-jieba "${EXP_DIR}/${FILENAME}" "${EXP_DIR}/pypinyin_with_jieba_result.txt" || exit 1

echo "done"
exit 0