From 29009089ec4380af7e13f76e918b6a8c09b3a1a0 Mon Sep 17 00:00:00 2001 From: iclementine Date: Tue, 18 May 2021 16:55:25 +0800 Subject: [PATCH] 1. remove script for data downloading, since Baker dataset is not easily downloaded via terminal; 2. remove pypinyin as an extra requirement; it is already required by the main project; 3. clean code. --- examples/chinese_g2p/README.md | 3 ++ examples/chinese_g2p/data/README.md | 3 -- examples/chinese_g2p/local/data_download.sh | 21 -------- examples/chinese_g2p/local/extract_pinyin.py | 51 +++++++++++++------ examples/chinese_g2p/local/prepare_dataset.sh | 31 +++++++++++ examples/chinese_g2p/requirements.txt | 1 - examples/chinese_g2p/run.sh | 35 ++++--------- 7 files changed, 79 insertions(+), 66 deletions(-) create mode 100644 examples/chinese_g2p/README.md delete mode 100644 examples/chinese_g2p/data/README.md delete mode 100644 examples/chinese_g2p/local/data_download.sh create mode 100644 examples/chinese_g2p/local/prepare_dataset.sh diff --git a/examples/chinese_g2p/README.md b/examples/chinese_g2p/README.md new file mode 100644 index 000000000..8855d37a9 --- /dev/null +++ b/examples/chinese_g2p/README.md @@ -0,0 +1,3 @@ +# Download Baker dataset + +Baker dataset has to be downloaded manually and moved to 'data/', because you will have to pass the CAPTCHA from a browser to download the dataset. diff --git a/examples/chinese_g2p/data/README.md b/examples/chinese_g2p/data/README.md deleted file mode 100644 index 2e25312c2..000000000 --- a/examples/chinese_g2p/data/README.md +++ /dev/null @@ -1,3 +0,0 @@ -# Download Baker dataset - -Baker dataset has to be downloaded mannually and move to this folder, because you will have to pass the CATTCHA from a browswe to download the dataset. 
diff --git a/examples/chinese_g2p/local/data_download.sh b/examples/chinese_g2p/local/data_download.sh deleted file mode 100644 index 13fe0ac84..000000000 --- a/examples/chinese_g2p/local/data_download.sh +++ /dev/null @@ -1,21 +0,0 @@ -#!/usr/bin/env bash - -. ${MAIN_ROOT}/utils/utility.sh - -DOWNLOAD_DIR=$(dirname $0)/../data -mkdir -p ${DOWNLOAD_DIR} - -# you may need to pass the authentification to download the data via a browser -URL=https://online-of-baklong.oss-cn-huhehaote.aliyuncs.com/story_resource/BZNSYP.rar - -MD5="c4350563bf7dc298f7dd364b2607be83" -TARGET=${DOWNLOAD_DIR}/BZNSYP.rar - -echo "Download Baker TTS dataset..." -download ${URL} ${MD5} ${TARGET} -if [ $? -ne 0 ]; then - echo "Fail to downlaod Baker TTS dataset!" - exit -fi - -exit 0 diff --git a/examples/chinese_g2p/local/extract_pinyin.py b/examples/chinese_g2p/local/extract_pinyin.py index 4b806862e..5f2c663bd 100644 --- a/examples/chinese_g2p/local/extract_pinyin.py +++ b/examples/chinese_g2p/local/extract_pinyin.py @@ -1,33 +1,54 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
import argparse import re + import jieba -import pypinyin -from pypinyin import lazy_pinyin, Style +from pypinyin import lazy_pinyin +from pypinyin import Style + def extract_pinyin(source, target, use_jieba=False): - with open(source, 'rt', encoding='utf-8') as f: - with open(target, 'wt', encoding='utf-8') as g: - for i, line in enumerate(f): + with open(source, 'rt', encoding='utf-8') as fin: + with open(target, 'wt', encoding='utf-8') as fout: + for i, line in enumerate(fin): if i % 2 == 0: - g.write(line) + fout.write(line) sentence_id, raw_text = line.strip().split() raw_text = re.sub(r'#\d', '', raw_text) if use_jieba: raw_text = jieba.lcut(raw_text) - syllables = lazy_pinyin(raw_text, errors='ignore', style=Style.TONE3, neutral_tone_with_five=True) + syllables = lazy_pinyin( + raw_text, + errors='ignore', + style=Style.TONE3, + neutral_tone_with_five=True) transcription = ' '.join(syllables) - g.write(f'\t{transcription}\n') + fout.write(f'\t{transcription}\n') else: continue - - - + if __name__ == "__main__": parser = argparse.ArgumentParser(description="extract baker pinyin labels") - parser.add_argument("input", type=str, help="source file of baker's prosody label file") - parser.add_argument("output", type=str, help="target file to write pinyin lables") - parser.add_argument("--use-jieba", action='store_true', help="use jieba for word segmentation.") + parser.add_argument( + "input", type=str, help="source file of baker's prosody label file") + parser.add_argument( + "output", type=str, help="target file to write pinyin lables") + parser.add_argument( + "--use-jieba", + action='store_true', + help="use jieba for word segmentation.") args = parser.parse_args() - print(args) extract_pinyin(args.input, args.output, use_jieba=args.use_jieba) diff --git a/examples/chinese_g2p/local/prepare_dataset.sh b/examples/chinese_g2p/local/prepare_dataset.sh new file mode 100644 index 000000000..7ef811e51 --- /dev/null +++ 
b/examples/chinese_g2p/local/prepare_dataset.sh @@ -0,0 +1,31 @@ +echo "Extracting Prosody Labeling" + +exp_dir="exp" +data_dir="data" +source ${MAIN_ROOT}/utils/parse_options.sh || exit -1 + +archive=${data_dir}/"BZNSYP.rar" +if [ ! -f ${archive} ]; then + echo "Baker Dataset not found! Download it first to the data_dir." + exit -1 +fi + +MD5='c4350563bf7dc298f7dd364b2607be83' +md5_result=$(md5sum ${archive} | awk -F[' '] '{print $1}') +if [ ${md5_result} != ${MD5} ]; then + echo "MD5 mismatch! The Archive has been changed." + exit -1 +fi + + +label_file='ProsodyLabeling/000001-010000.txt' +filename='000001-010000.txt' +unrar e ${archive} ${label_file} +mv ${filename} ${exp_dir} + +if [ ! -f ${exp_dir}/${filename} ];then + echo "File extraction failed!" + exit +fi + +exit 0 diff --git a/examples/chinese_g2p/requirements.txt b/examples/chinese_g2p/requirements.txt index 3d5d90d32..c84f42278 100644 --- a/examples/chinese_g2p/requirements.txt +++ b/examples/chinese_g2p/requirements.txt @@ -1,2 +1 @@ jieba -pypinyin diff --git a/examples/chinese_g2p/run.sh b/examples/chinese_g2p/run.sh index ea394b005..b5f55c19f 100644 --- a/examples/chinese_g2p/run.sh +++ b/examples/chinese_g2p/run.sh @@ -1,37 +1,20 @@ #!/usr/bin/env bash source path.sh -stage=0 +stage=-1 stop_stage=100 +exp_dir="exp" +data_dir="data" source ${MAIN_ROOT}/utils/parse_options.sh || exit -1 - -if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then - bash local/data_download.sh - if [ $? -ne 0 ]; then - exit 1 - fi -fi - -EXP_DIR="exp" -mkdir -p ${EXP_DIR} - -ARCHIVE="data/BZNSYP.rar" - -echo "Extracting Prosody Labeling" -LABEL_FILE='ProsodyLabeling/000001-010000.txt' -FILENAME='000001-010000.txt' -unrar e ${ARCHIVE} ${LABEL_FILE} -mv ${FILENAME} ${EXP_DIR} - -if [ ! -f ${EXP_DIR}/${FILENAME} ];then - echo "File extraction failed!" 
- exit -fi +mkdir -p ${exp_dir} +bash local/prepare_dataset.sh --exp-dir ${exp_dir} --data-dir ${data_dir} # convert transcription in chinese into pinyin with pypinyin or jieba+pypinyin -python3 local/extract_pinyin.py ${EXP_DIR}/${FILENAME} ${EXP_DIR}/"pypinyin_result.txt" -python3 local/extract_pinyin.py --use-jieba ${EXP_DIR}/${FILENAME} ${EXP_DIR}/"pypinyin_with_jieba_result.txt" +filename="000001-010000.txt" +echo "Processing transcriptions..." +python3 local/extract_pinyin.py ${exp_dir}/${filename} ${exp_dir}/"pypinyin_result.txt" +python3 local/extract_pinyin.py --use-jieba ${exp_dir}/${filename} ${exp_dir}/"pypinyin_with_jieba_result.txt" echo "done" exit 0