add a g2p example (#623)
* add an example to convert transcription to pinyin with pypinyin and jieba
* format code
* 1. remove the script for data downloading, since the Baker dataset is not easily downloaded via terminal; 2. remove pypinyin as an extra requirement; it is already required by the main project; 3. clean code.
* change output format
parent
9cc750bf29
commit
075635d2b4
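
For readers skimming the diff, here is a minimal sketch (not part of this commit) of the conversion this example performs; the sample sentence is illustrative:

```python
# Minimal sketch: pypinyin turns Chinese text into tone-numbered pinyin, and
# jieba can segment words first, which helps pypinyin pick the right reading
# for polyphonic characters.
import jieba
from pypinyin import Style, lazy_pinyin

text = '卡尔普陪外孙玩滑梯'  # illustrative sentence

# Character-level conversion.
print(' '.join(lazy_pinyin(text, style=Style.TONE3, neutral_tone_with_five=True)))

# Word-level conversion: segment with jieba, then convert each word.
words = jieba.lcut(text)
print(' '.join(lazy_pinyin(words, style=Style.TONE3, neutral_tone_with_five=True)))
```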
@@ -0,0 +1,5 @@
# Download Baker dataset

The Baker dataset has to be downloaded manually and moved to 'data/', because you have to pass a CAPTCHA in a browser to download it.

Download URL: https://test.data-baker.com/#/data/index/source.
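Because the download is manual, it can be worth checking the archive before running the recipe. A hedged sketch; the expected MD5 is the one hard-coded in local/prepare_dataset.sh below:

```python
# Verify the manually downloaded archive (MD5 value from local/prepare_dataset.sh).
import hashlib

EXPECTED_MD5 = 'c4350563bf7dc298f7dd364b2607be83'

def md5_of(path, chunk_size=1 << 20):
    digest = hashlib.md5()
    with open(path, 'rb') as f:
        # Read in chunks so large archives do not need to fit in memory.
        for block in iter(lambda: f.read(chunk_size), b''):
            digest.update(block)
    return digest.hexdigest()

assert md5_of('data/BZNSYP.rar') == EXPECTED_MD5, 'archive corrupted or changed'
```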
@@ -0,0 +1,53 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import re

import jieba
from pypinyin import lazy_pinyin
from pypinyin import Style


def extract_pinyin(source, target, use_jieba=False):
    """Convert Baker transcriptions to pinyin, optionally segmenting with jieba."""
    with open(source, 'rt', encoding='utf-8') as fin:
        with open(target, 'wt', encoding='utf-8') as fout:
            for i, line in enumerate(fin):
                if i % 2 == 0:
                    # Even lines hold the sentence id and the raw text.
                    sentence_id, raw_text = line.strip().split()
                    # Strip prosody markers such as #1, #2 from the text.
                    raw_text = re.sub(r'#\d', '', raw_text)
                    if use_jieba:
                        raw_text = jieba.lcut(raw_text)
                    syllables = lazy_pinyin(
                        raw_text,
                        errors='ignore',
                        style=Style.TONE3,
                        neutral_tone_with_five=True)
                    transcription = ' '.join(syllables)
                    fout.write(f'{sentence_id}\t{transcription}\n')
                else:
                    # Odd lines hold the hand-labeled pinyin; skip them here.
                    continue


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="convert Baker transcriptions to pinyin with pypinyin")
    parser.add_argument(
        "input", type=str, help="source file of baker's prosody label file")
    parser.add_argument(
        "output", type=str, help="target file to write pinyin labels")
    parser.add_argument(
        "--use-jieba",
        action='store_true',
        help="use jieba for word segmentation.")
    args = parser.parse_args()
    extract_pinyin(args.input, args.output, use_jieba=args.use_jieba)
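For reference, Style.TONE3 appends the tone digit to each syllable, and neutral_tone_with_five=True writes the neutral tone as 5. A quick illustration (expected outputs shown in comments; worth verifying against your pypinyin version):

```python
from pypinyin import Style, lazy_pinyin

# Style.TONE3 puts the tone number after the syllable.
print(lazy_pinyin('你好', style=Style.TONE3))  # expected: ['ni3', 'hao3']

# Without neutral_tone_with_five, the neutral tone carries no digit at all.
print(lazy_pinyin('妈妈', style=Style.TONE3))  # expected: ['ma1', 'ma']
print(lazy_pinyin('妈妈', style=Style.TONE3,
                  neutral_tone_with_five=True))  # expected: ['ma1', 'ma5']
```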
@@ -0,0 +1,37 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse


def extract_pinyin_labels(source, target):
    """Extract pinyin labels from Baker's prosody labeling."""
    with open(source, 'rt', encoding='utf-8') as fin:
        with open(target, 'wt', encoding='utf-8') as fout:
            for i, line in enumerate(fin):
                if i % 2 == 0:
                    # Even lines: sentence id and raw text; keep only the id.
                    sentence_id, raw_text = line.strip().split()
                    fout.write(f'{sentence_id}\t')
                else:
                    # Odd lines: the hand-labeled pinyin transcription.
                    transcription = line.strip()
                    fout.write(f'{transcription}\n')


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="extract baker pinyin labels")
    parser.add_argument(
        "input", type=str, help="source file of baker's prosody label file")
    parser.add_argument(
        "output", type=str, help="target file to write pinyin labels")
    args = parser.parse_args()
    extract_pinyin_labels(args.input, args.output)
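For context, Baker's prosody label file alternates a text line (sentence id plus prosody-marked text) with a hand-labeled pinyin line, which is why both scripts branch on i % 2. A sketch with assumed, illustrative content:

```python
# Illustrative pair of input lines (content assumed) and the merged
# tab-separated line that extract_pinyin_labels writes.
pair = [
    '000001\t卡尔普#2陪外孙#1玩滑梯',               # even line: id + prosody-marked text
    '\tka2 er2 pu3 pei2 wai4 sun1 wan2 hua2 ti1',  # odd line: hand-labeled pinyin
]
sentence_id = pair[0].strip().split()[0]
transcription = pair[1].strip()
print(f'{sentence_id}\t{transcription}')
# -> 000001    ka2 er2 pu3 pei2 wai4 sun1 wan2 hua2 ti1
```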
@@ -0,0 +1,32 @@
echo "Extracting Prosody Labeling"

exp_dir="exp"
data_dir="data"
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1

archive="${data_dir}/BZNSYP.rar"
if [ ! -f "${archive}" ]; then
    echo "Baker dataset not found! Download it first to the data_dir."
    exit 1
fi

MD5='c4350563bf7dc298f7dd364b2607be83'
md5_result=$(md5sum "${archive}" | awk '{print $1}')
if [ "${md5_result}" != "${MD5}" ]; then
    echo "MD5 mismatch! The archive has been changed."
    exit 1
fi

label_file='ProsodyLabeling/000001-010000.txt'
filename='000001-010000.txt'
unrar e "${archive}" "${label_file}"
cp "${filename}" "${exp_dir}"
rm -f "${filename}"

if [ ! -f "${exp_dir}/${filename}" ]; then
    echo "File extraction failed!"
    exit 1
fi

exit 0
@@ -0,0 +1,8 @@
export MAIN_ROOT=${PWD}/../../

export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
export LC_ALL=C

# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
@@ -0,0 +1 @@
jieba
@@ -0,0 +1,22 @@
#!/usr/bin/env bash
source path.sh

stage=-1
stop_stage=100

exp_dir="exp"
data_dir="data"
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
mkdir -p ${exp_dir}
bash local/prepare_dataset.sh --exp-dir ${exp_dir} --data-dir ${data_dir}

# Convert the Chinese transcriptions into pinyin with pypinyin or jieba + pypinyin.
filename="000001-010000.txt"
echo "Processing transcriptions..."

python3 local/extract_pinyin_label.py ${exp_dir}/${filename} ${exp_dir}/pinyin_baker.txt
python3 local/convert_transcription.py ${exp_dir}/${filename} ${exp_dir}/result_pypinyin.txt
python3 local/convert_transcription.py --use-jieba ${exp_dir}/${filename} ${exp_dir}/result_pypinyin_with_jieba.txt

echo "done"
exit 0
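The three output files share the same format (sentence id, a tab, then space-separated pinyin), so a natural follow-up, not part of this commit and with the metric choice an assumption, is to score pypinyin against the hand labels:

```python
# Hedged sketch: syllable-level agreement between pypinyin output and the
# reference labels extracted from Baker's prosody file.
def load(path):
    with open(path, encoding='utf-8') as f:
        entries = (line.rstrip('\n').split('\t', 1) for line in f if line.strip())
        return {sid: text.split() for sid, text in entries}

ref = load('exp/pinyin_baker.txt')
hyp = load('exp/result_pypinyin.txt')

total = correct = 0
for sid, ref_syls in ref.items():
    hyp_syls = hyp.get(sid, [])
    # Compare aligned positions; any length mismatch counts as errors.
    total += max(len(ref_syls), len(hyp_syls))
    correct += sum(r == h for r, h in zip(ref_syls, hyp_syls))

print(f'syllable agreement: {correct / total:.2%}')
```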