parent
d0635c6592
commit
654c455272
@ -0,0 +1,3 @@
|
|||||||
|
# Download Baker dataset
|
||||||
|
|
||||||
|
The Baker dataset has to be downloaded manually and moved to this folder, because you have to pass a CAPTCHA in a browser before the dataset can be downloaded.
|
@ -0,0 +1,21 @@
|
|||||||
|
#!/usr/bin/env bash

# Download the Baker TTS dataset archive (BZNSYP.rar) into ../data, relative
# to the directory containing this script.
#
# Requires MAIN_ROOT to be set by the caller. The `download` command used
# below is not defined in this script; presumably it is provided by
# utils/utility.sh -- confirm there. It is invoked as: download URL MD5 TARGET.
#
# Exit status: 0 on success, 1 when the download step reports a failure.

. "${MAIN_ROOT}/utils/utility.sh"

DOWNLOAD_DIR="$(dirname "$0")/../data"
mkdir -p "${DOWNLOAD_DIR}"

# you may need to pass the authentication to download the data via a browser
URL=https://online-of-baklong.oss-cn-huhehaote.aliyuncs.com/story_resource/BZNSYP.rar

MD5="c4350563bf7dc298f7dd364b2607be83"
TARGET="${DOWNLOAD_DIR}/BZNSYP.rar"

echo "Download Baker TTS dataset..."
if ! download "${URL}" "${MD5}" "${TARGET}"; then
    echo "Fail to downlaod Baker TTS dataset!"
    # A bare `exit` would return the status of the `echo` above (0) and hide
    # the failure from the calling script, so exit with an explicit non-zero.
    exit 1
fi

exit 0
|
@ -0,0 +1,33 @@
|
|||||||
|
import argparse
|
||||||
|
import re
|
||||||
|
import jieba
|
||||||
|
import pypinyin
|
||||||
|
from pypinyin import lazy_pinyin, Style
|
||||||
|
|
||||||
|
def extract_pinyin(source, target, use_jieba=False):
    """Write pinyin transcriptions for the text lines of a prosody label file.

    Only lines with an even 0-based index in ``source`` are processed; the
    lines in between are skipped (presumably they hold the dataset's own
    pinyin annotation -- confirm against the label file). Each processed line
    is copied verbatim to ``target`` and followed by a new line consisting of
    a tab and the space-separated pinyin syllables.

    Args:
        source: Path of the UTF-8 label file. Every processed line must be
            ``<sentence id><whitespace><text>``.
        target: Path of the UTF-8 output file; it is overwritten.
        use_jieba: When true, segment the text into words with ``jieba.lcut``
            before converting it to pinyin.

    Raises:
        ValueError: If a processed line does not contain both an id and text.
    """
    # Prosody boundary marks look like "#1" .. "#4"; compile the pattern once.
    prosody_mark = re.compile(r'#\d')

    with open(source, 'rt', encoding='utf-8') as f, \
            open(target, 'wt', encoding='utf-8') as g:
        for i, line in enumerate(f):
            if i % 2:
                continue

            # Split off the sentence id only; maxsplit=1 keeps any whitespace
            # inside the text from breaking the two-field parse.
            fields = line.strip().split(maxsplit=1)
            if len(fields) != 2:
                raise ValueError(
                    f"{source}: line {i + 1}: expected '<id> <text>', "
                    f"got {line!r}")

            raw_text = prosody_mark.sub('', fields[1])
            if use_jieba:
                raw_text = jieba.lcut(raw_text)

            # errors='ignore' drops characters that have no pinyin reading.
            syllables = lazy_pinyin(
                raw_text,
                errors='ignore',
                style=Style.TONE3,
                neutral_tone_with_five=True)
            transcription = ' '.join(syllables)

            g.write(line)
            g.write(f'\t{transcription}\n')
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Command-line entry point: two positional paths plus an optional flag
    # that turns on jieba word segmentation.
    cli = argparse.ArgumentParser(description="extract baker pinyin labels")

    positional_help = (
        ("input", "source file of baker's prosody label file"),
        ("output", "target file to write pinyin lables"),
    )
    for arg_name, arg_help in positional_help:
        cli.add_argument(arg_name, type=str, help=arg_help)
    cli.add_argument(
        "--use-jieba",
        action='store_true',
        help="use jieba for word segmentation.")

    options = cli.parse_args()
    # Echo the parsed options so the run records how it was invoked.
    print(options)

    extract_pinyin(
        options.input, options.output, use_jieba=options.use_jieba)
|
@ -0,0 +1,8 @@
|
|||||||
|
# Environment setup for this example. The values are exported, so they are
# visible to the shell that sources this file and to its child processes.

# Repository root, taken to be two levels above the current working directory.
MAIN_ROOT=${PWD}/../../
export MAIN_ROOT

# Make the repository root and its utils directory searchable for commands.
PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
export PATH

LC_ALL=C
export LC_ALL

# With LC_ALL=C, Python would otherwise raise UnicodeDecodeError on non-ASCII
# text; force UTF-8 for its standard streams.
PYTHONIOENCODING=UTF-8
export PYTHONIOENCODING

# Let Python import modules from the repository root.
PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
export PYTHONPATH
|
@ -0,0 +1,2 @@
|
|||||||
|
jieba
|
||||||
|
pypinyin
|
@ -0,0 +1,44 @@
|
|||||||
|
#!/usr/bin/env bash

# Baker pinyin extraction recipe.
#
#   stage 0 : download the dataset archive via local/data_download.sh
#   always  : extract the prosody label file from the archive into exp/ and
#             convert its text to pinyin twice (pypinyin alone, and pypinyin
#             on jieba-segmented text).
#
# Exit status: 0 on success, non-zero as soon as any step fails.

source path.sh

stage=0
stop_stage=100

# NOTE(review): parse_options.sh presumably lets command-line options
# override the variables above (e.g. stage / stop_stage) -- confirm there.
source "${MAIN_ROOT}/utils/parse_options.sh" || exit -1

if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    bash local/data_download.sh
    if [ $? -ne 0 ]; then
        exit 1
    fi
fi

EXP_DIR="exp"
mkdir -p "${EXP_DIR}"

ARCHIVE="data/BZNSYP.rar"

echo "Extracting Prosody Labeling"
LABEL_FILE='ProsodyLabeling/000001-010000.txt'
FILENAME='000001-010000.txt'
# `unrar e` extracts without the archive's directory structure, so the file
# lands in the current directory and is then moved into EXP_DIR.
unrar e "${ARCHIVE}" "${LABEL_FILE}"
mv "${FILENAME}" "${EXP_DIR}"

if [ ! -f "${EXP_DIR}/${FILENAME}" ]; then
    echo "File extraction failed!"
    # A bare `exit` would return the status of the `echo` above (0); report
    # the failure explicitly.
    exit 1
fi

# convert transcription in chinese into pinyin with pypinyin or jieba+pypinyin
# Stop on a failed conversion instead of printing "done" and exiting 0.
python3 local/extract_pinyin.py "${EXP_DIR}/${FILENAME}" "${EXP_DIR}/pypinyin_result.txt" || exit 1
python3 local/extract_pinyin.py --use-jieba "${EXP_DIR}/${FILENAME}" "${EXP_DIR}/pypinyin_with_jieba_result.txt" || exit 1

echo "done"
exit 0
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in new issue