2. remove pypinyin as an extra requirement; it is already required by the main project; 3. clean code. pull/623/head
parent
173d92a45b
commit
29009089ec
@ -0,0 +1,3 @@
|
||||
# Download Baker dataset
|
||||
|
||||
The Baker dataset has to be downloaded manually and moved to 'data/', because you have to pass a CAPTCHA in a browser to download the dataset.
|
@ -1,3 +0,0 @@
|
||||
# Download Baker dataset
|
||||
|
||||
The Baker dataset has to be downloaded manually and moved to this folder, because you have to pass a CAPTCHA in a browser to download the dataset.
|
@ -1,21 +0,0 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
. ${MAIN_ROOT}/utils/utility.sh
|
||||
|
||||
DOWNLOAD_DIR=$(dirname $0)/../data
|
||||
mkdir -p ${DOWNLOAD_DIR}
|
||||
|
||||
# you may need to pass authentication in a browser to download the data
|
||||
URL=https://online-of-baklong.oss-cn-huhehaote.aliyuncs.com/story_resource/BZNSYP.rar
|
||||
|
||||
MD5="c4350563bf7dc298f7dd364b2607be83"
|
||||
TARGET=${DOWNLOAD_DIR}/BZNSYP.rar
|
||||
|
||||
echo "Download Baker TTS dataset..."
|
||||
download ${URL} ${MD5} ${TARGET}
|
||||
if [ $? -ne 0 ]; then
|
||||
echo "Fail to downlaod Baker TTS dataset!"
|
||||
exit
|
||||
fi
|
||||
|
||||
exit 0
|
@ -1,33 +1,54 @@
|
||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import argparse
|
||||
import re
|
||||
|
||||
import jieba
|
||||
import pypinyin
|
||||
from pypinyin import lazy_pinyin, Style
|
||||
from pypinyin import lazy_pinyin
|
||||
from pypinyin import Style
|
||||
|
||||
|
||||
def extract_pinyin(source, target, use_jieba=False):
|
||||
with open(source, 'rt', encoding='utf-8') as f:
|
||||
with open(target, 'wt', encoding='utf-8') as g:
|
||||
for i, line in enumerate(f):
|
||||
with open(source, 'rt', encoding='utf-8') as fin:
|
||||
with open(target, 'wt', encoding='utf-8') as fout:
|
||||
for i, line in enumerate(fin):
|
||||
if i % 2 == 0:
|
||||
g.write(line)
|
||||
fout.write(line)
|
||||
sentence_id, raw_text = line.strip().split()
|
||||
raw_text = re.sub(r'#\d', '', raw_text)
|
||||
if use_jieba:
|
||||
raw_text = jieba.lcut(raw_text)
|
||||
syllables = lazy_pinyin(raw_text, errors='ignore', style=Style.TONE3, neutral_tone_with_five=True)
|
||||
syllables = lazy_pinyin(
|
||||
raw_text,
|
||||
errors='ignore',
|
||||
style=Style.TONE3,
|
||||
neutral_tone_with_five=True)
|
||||
transcription = ' '.join(syllables)
|
||||
g.write(f'\t{transcription}\n')
|
||||
fout.write(f'\t{transcription}\n')
|
||||
else:
|
||||
continue
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(description="extract baker pinyin labels")
|
||||
parser.add_argument("input", type=str, help="source file of baker's prosody label file")
|
||||
parser.add_argument("output", type=str, help="target file to write pinyin lables")
|
||||
parser.add_argument("--use-jieba", action='store_true', help="use jieba for word segmentation.")
|
||||
parser.add_argument(
|
||||
"input", type=str, help="source file of baker's prosody label file")
|
||||
parser.add_argument(
|
||||
"output", type=str, help="target file to write pinyin lables")
|
||||
parser.add_argument(
|
||||
"--use-jieba",
|
||||
action='store_true',
|
||||
help="use jieba for word segmentation.")
|
||||
args = parser.parse_args()
|
||||
print(args)
|
||||
extract_pinyin(args.input, args.output, use_jieba=args.use_jieba)
|
||||
|
@ -0,0 +1,31 @@
|
||||
echo "Extracting Prosody Labeling"
|
||||
|
||||
exp_dir="exp"
|
||||
data_dir="data"
|
||||
source ${MAIN_ROOT}/utils/parse_options.sh || exit -1
|
||||
|
||||
archive=${data_dir}/"BZNSYP.rar"
|
||||
if [ ! -f ${archive} ]; then
|
||||
echo "Baker Dataset not found! Download it first to the data_dir."
|
||||
exit -1
|
||||
fi
|
||||
|
||||
MD5='c4350563bf7dc298f7dd364b2607be83'
|
||||
md5_result=$(md5sum ${archive} | awk -F[' '] '{print $1}')
|
||||
if [ ${md5_result} != ${MD5} ]; then
|
||||
echo "MD5 mismatch! The Archive has been changed."
|
||||
exit -1
|
||||
fi
|
||||
|
||||
|
||||
label_file='ProsodyLabeling/000001-010000.txt'
|
||||
filename='000001-010000.txt'
|
||||
unrar e ${archive} ${label_file}
|
||||
mv ${filename} ${exp_dir}
|
||||
|
||||
if [ ! -f ${exp_dir}/${filename} ];then
|
||||
echo "File extraction failed!"
|
||||
exit
|
||||
fi
|
||||
|
||||
exit 0
|
@ -1,37 +1,20 @@
|
||||
#!/usr/bin/env bash
|
||||
source path.sh
|
||||
|
||||
stage=0
|
||||
stage=-1
|
||||
stop_stage=100
|
||||
|
||||
exp_dir="exp"
|
||||
data_dir="data"
|
||||
source ${MAIN_ROOT}/utils/parse_options.sh || exit -1
|
||||
|
||||
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
|
||||
bash local/data_download.sh
|
||||
if [ $? -ne 0 ]; then
|
||||
exit 1
|
||||
fi
|
||||
fi
|
||||
|
||||
EXP_DIR="exp"
|
||||
mkdir -p ${EXP_DIR}
|
||||
|
||||
ARCHIVE="data/BZNSYP.rar"
|
||||
|
||||
echo "Extracting Prosody Labeling"
|
||||
LABEL_FILE='ProsodyLabeling/000001-010000.txt'
|
||||
FILENAME='000001-010000.txt'
|
||||
unrar e ${ARCHIVE} ${LABEL_FILE}
|
||||
mv ${FILENAME} ${EXP_DIR}
|
||||
|
||||
if [ ! -f ${EXP_DIR}/${FILENAME} ];then
|
||||
echo "File extraction failed!"
|
||||
exit
|
||||
fi
|
||||
mkdir -p ${exp_dir}
|
||||
bash local/prepare_dataset.sh --exp-dir ${exp_dir} --data-dir ${data_dir}
|
||||
|
||||
# convert the Chinese transcriptions into pinyin with pypinyin or jieba+pypinyin
|
||||
python3 local/extract_pinyin.py ${EXP_DIR}/${FILENAME} ${EXP_DIR}/"pypinyin_result.txt"
|
||||
python3 local/extract_pinyin.py --use-jieba ${EXP_DIR}/${FILENAME} ${EXP_DIR}/"pypinyin_with_jieba_result.txt"
|
||||
filename="000001-010000.txt"
|
||||
echo "Processing transcriptions..."
|
||||
python3 local/extract_pinyin.py ${exp_dir}/${filename} ${exp_dir}/"pypinyin_result.txt"
|
||||
python3 local/extract_pinyin.py --use-jieba ${exp_dir}/${filename} ${exp_dir}/"pypinyin_with_jieba_result.txt"
|
||||
|
||||
echo "done"
|
||||
exit 0
|
||||
|
Loading…
Reference in new issue