add a g2p example (#623)
* add an example to convert transcription to pinyin with pypinyin and jieba
* format code
* 1. remove the script for data downloading, since the Baker dataset is not easily downloaded via terminal; 2. remove pypinyin as an extra requirement; it is already required by the main project; 3. clean code.
* change output format
parent
9cc750bf29
commit
075635d2b4
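
For readers skimming the diff, here is a minimal sketch (not part of this commit) of the conversion this example performs; the sample sentence is illustrative:

```python
# Minimal sketch: pypinyin turns Chinese text into tone-numbered pinyin, and
# jieba can segment words first, which helps pypinyin pick the right reading
# for polyphonic characters.
import jieba
from pypinyin import Style, lazy_pinyin

text = '卡尔普陪外孙玩滑梯'  # illustrative sentence

# Character-level conversion.
print(' '.join(lazy_pinyin(text, style=Style.TONE3, neutral_tone_with_five=True)))

# Word-level conversion: segment with jieba, then convert each word.
words = jieba.lcut(text)
print(' '.join(lazy_pinyin(words, style=Style.TONE3, neutral_tone_with_five=True)))
```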
@@ -0,0 +1,5 @@
# Download Baker dataset

The Baker dataset has to be downloaded manually and moved to 'data/', because you have to pass a CAPTCHA in a browser to download it.

Download URL: https://test.data-baker.com/#/data/index/source.
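Because the download is manual, it can be worth checking the archive before running the recipe. A hedged sketch; the expected MD5 is the one hard-coded in local/prepare_dataset.sh below:

```python
# Verify the manually downloaded archive (MD5 value from local/prepare_dataset.sh).
import hashlib

EXPECTED_MD5 = 'c4350563bf7dc298f7dd364b2607be83'

def md5_of(path, chunk_size=1 << 20):
    digest = hashlib.md5()
    with open(path, 'rb') as f:
        # Read in chunks so large archives do not need to fit in memory.
        for block in iter(lambda: f.read(chunk_size), b''):
            digest.update(block)
    return digest.hexdigest()

assert md5_of('data/BZNSYP.rar') == EXPECTED_MD5, 'archive corrupted or changed'
```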
@@ -0,0 +1,53 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import re

import jieba
from pypinyin import lazy_pinyin
from pypinyin import Style


def extract_pinyin(source, target, use_jieba=False):
    """Convert Baker transcriptions to pinyin, optionally segmenting with jieba."""
    with open(source, 'rt', encoding='utf-8') as fin:
        with open(target, 'wt', encoding='utf-8') as fout:
            for i, line in enumerate(fin):
                if i % 2 == 0:
                    # Even lines hold the sentence id and the raw text.
                    sentence_id, raw_text = line.strip().split()
                    # Strip prosody markers such as #1, #2 from the text.
                    raw_text = re.sub(r'#\d', '', raw_text)
                    if use_jieba:
                        raw_text = jieba.lcut(raw_text)
                    syllables = lazy_pinyin(
                        raw_text,
                        errors='ignore',
                        style=Style.TONE3,
                        neutral_tone_with_five=True)
                    transcription = ' '.join(syllables)
                    fout.write(f'{sentence_id}\t{transcription}\n')
                else:
                    # Odd lines hold the hand-labeled pinyin; skip them here.
                    continue


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="convert Baker transcriptions to pinyin with pypinyin")
    parser.add_argument(
        "input", type=str, help="source file of baker's prosody label file")
    parser.add_argument(
        "output", type=str, help="target file to write pinyin labels")
    parser.add_argument(
        "--use-jieba",
        action='store_true',
        help="use jieba for word segmentation.")
    args = parser.parse_args()
    extract_pinyin(args.input, args.output, use_jieba=args.use_jieba)
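For reference, Style.TONE3 appends the tone digit to each syllable, and neutral_tone_with_five=True writes the neutral tone as 5. A quick illustration (expected outputs shown in comments; worth verifying against your pypinyin version):

```python
from pypinyin import Style, lazy_pinyin

# Style.TONE3 puts the tone number after the syllable.
print(lazy_pinyin('你好', style=Style.TONE3))  # expected: ['ni3', 'hao3']

# Without neutral_tone_with_five, the neutral tone carries no digit at all.
print(lazy_pinyin('妈妈', style=Style.TONE3))  # expected: ['ma1', 'ma']
print(lazy_pinyin('妈妈', style=Style.TONE3,
                  neutral_tone_with_five=True))  # expected: ['ma1', 'ma5']
```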
@@ -0,0 +1,37 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse


def extract_pinyin_labels(source, target):
    """Extract pinyin labels from Baker's prosody labeling."""
    with open(source, 'rt', encoding='utf-8') as fin:
        with open(target, 'wt', encoding='utf-8') as fout:
            for i, line in enumerate(fin):
                if i % 2 == 0:
                    # Even lines: sentence id and raw text; keep only the id.
                    sentence_id, raw_text = line.strip().split()
                    fout.write(f'{sentence_id}\t')
                else:
                    # Odd lines: the hand-labeled pinyin transcription.
                    transcription = line.strip()
                    fout.write(f'{transcription}\n')


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="extract baker pinyin labels")
    parser.add_argument(
        "input", type=str, help="source file of baker's prosody label file")
    parser.add_argument(
        "output", type=str, help="target file to write pinyin labels")
    args = parser.parse_args()
    extract_pinyin_labels(args.input, args.output)
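For context, Baker's prosody label file alternates a text line (sentence id plus prosody-marked text) with a hand-labeled pinyin line, which is why both scripts branch on i % 2. A sketch with assumed, illustrative content:

```python
# Illustrative pair of input lines (content assumed) and the merged
# tab-separated line that extract_pinyin_labels writes.
pair = [
    '000001\t卡尔普#2陪外孙#1玩滑梯',               # even line: id + prosody-marked text
    '\tka2 er2 pu3 pei2 wai4 sun1 wan2 hua2 ti1',  # odd line: hand-labeled pinyin
]
sentence_id = pair[0].strip().split()[0]
transcription = pair[1].strip()
print(f'{sentence_id}\t{transcription}')
# -> 000001    ka2 er2 pu3 pei2 wai4 sun1 wan2 hua2 ti1
```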
@@ -0,0 +1,32 @@
echo "Extracting Prosody Labeling"

exp_dir="exp"
data_dir="data"
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1

archive="${data_dir}/BZNSYP.rar"
if [ ! -f "${archive}" ]; then
    echo "Baker dataset not found! Download it first to the data_dir."
    exit 1
fi

MD5='c4350563bf7dc298f7dd364b2607be83'
md5_result=$(md5sum "${archive}" | awk '{print $1}')
if [ "${md5_result}" != "${MD5}" ]; then
    echo "MD5 mismatch! The archive has been changed."
    exit 1
fi

label_file='ProsodyLabeling/000001-010000.txt'
filename='000001-010000.txt'
unrar e "${archive}" "${label_file}"
cp "${filename}" "${exp_dir}"
rm -f "${filename}"

if [ ! -f "${exp_dir}/${filename}" ]; then
    echo "File extraction failed!"
    exit 1
fi

exit 0
@@ -0,0 +1,8 @@
export MAIN_ROOT=${PWD}/../../

export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
export LC_ALL=C

# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
@@ -0,0 +1 @@
jieba
@@ -0,0 +1,22 @@
#!/usr/bin/env bash
source path.sh

stage=-1
stop_stage=100

exp_dir="exp"
data_dir="data"
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
mkdir -p ${exp_dir}
bash local/prepare_dataset.sh --exp-dir ${exp_dir} --data-dir ${data_dir}

# Convert the Chinese transcriptions into pinyin with pypinyin or jieba + pypinyin.
filename="000001-010000.txt"
echo "Processing transcriptions..."

python3 local/extract_pinyin_label.py ${exp_dir}/${filename} ${exp_dir}/pinyin_baker.txt
python3 local/convert_transcription.py ${exp_dir}/${filename} ${exp_dir}/result_pypinyin.txt
python3 local/convert_transcription.py --use-jieba ${exp_dir}/${filename} ${exp_dir}/result_pypinyin_with_jieba.txt

echo "done"
exit 0
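The three output files share the same format (sentence id, a tab, then space-separated pinyin), so a natural follow-up, not part of this commit and with the metric choice an assumption, is to score pypinyin against the hand labels:

```python
# Hedged sketch: syllable-level agreement between pypinyin output and the
# reference labels extracted from Baker's prosody file.
def load(path):
    with open(path, encoding='utf-8') as f:
        entries = (line.rstrip('\n').split('\t', 1) for line in f if line.strip())
        return {sid: text.split() for sid, text in entries}

ref = load('exp/pinyin_baker.txt')
hyp = load('exp/result_pypinyin.txt')

total = correct = 0
for sid, ref_syls in ref.items():
    hyp_syls = hyp.get(sid, [])
    # Compare aligned positions; any length mismatch counts as errors.
    total += max(len(ref_syls), len(hyp_syls))
    correct += sum(r == h for r, h in zip(ref_syls, hyp_syls))

print(f'syllable agreement: {correct / total:.2%}')
```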