From 2fe92b4b34bbddecca2954b9f1e2093e3631ffae Mon Sep 17 00:00:00 2001 From: chenfeiyu Date: Wed, 19 May 2021 15:06:06 +0800 Subject: [PATCH] change output format --- examples/chinese_g2p/README.md | 2 + ...act_pinyin.py => convert_transcription.py} | 3 +- .../chinese_g2p/local/extract_pinyin_label.py | 37 +++++++++++++++++++ examples/chinese_g2p/local/prepare_dataset.sh | 3 +- examples/chinese_g2p/run.sh | 6 ++- 5 files changed, 46 insertions(+), 5 deletions(-) rename examples/chinese_g2p/local/{extract_pinyin.py => convert_transcription.py} (95%) create mode 100644 examples/chinese_g2p/local/extract_pinyin_label.py diff --git a/examples/chinese_g2p/README.md b/examples/chinese_g2p/README.md index 8855d37a9..e3fdfe684 100644 --- a/examples/chinese_g2p/README.md +++ b/examples/chinese_g2p/README.md @@ -1,3 +1,5 @@ # Download Baker dataset Baker dataset has to be downloaded mannually and moved to 'data/', because you will have to pass the CATTCHA from a browswe to download the dataset. + +Download URL https://test.data-baker.com/#/data/index/source. 
diff --git a/examples/chinese_g2p/local/extract_pinyin.py b/examples/chinese_g2p/local/convert_transcription.py similarity index 95% rename from examples/chinese_g2p/local/extract_pinyin.py rename to examples/chinese_g2p/local/convert_transcription.py index 5f2c663bd..b133ad2c5 100644 --- a/examples/chinese_g2p/local/extract_pinyin.py +++ b/examples/chinese_g2p/local/convert_transcription.py @@ -24,7 +24,6 @@ def extract_pinyin(source, target, use_jieba=False): with open(target, 'wt', encoding='utf-8') as fout: for i, line in enumerate(fin): if i % 2 == 0: - fout.write(line) sentence_id, raw_text = line.strip().split() raw_text = re.sub(r'#\d', '', raw_text) if use_jieba: @@ -35,7 +34,7 @@ def extract_pinyin(source, target, use_jieba=False): style=Style.TONE3, neutral_tone_with_five=True) transcription = ' '.join(syllables) - fout.write(f'\t{transcription}\n') + fout.write(f'{sentence_id}\t{transcription}\n') else: continue diff --git a/examples/chinese_g2p/local/extract_pinyin_label.py b/examples/chinese_g2p/local/extract_pinyin_label.py new file mode 100644 index 000000000..be7b287f4 --- /dev/null +++ b/examples/chinese_g2p/local/extract_pinyin_label.py @@ -0,0 +1,37 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import argparse
+
+
+def extract_pinyin_lables(source, target):
+    """Join each Baker sentence id with the pinyin line that follows it."""
+    with open(source, 'rt', encoding='utf-8') as fin, \
+            open(target, 'wt', encoding='utf-8') as fout:
+        for i, line in enumerate(fin):
+            if i % 2 == 0:
+                # Even lines are "<id> <text>"; only the id is written out.
+                sentence_id = line.split(maxsplit=1)[0]
+                fout.write(f'{sentence_id}\t')
+            else:
+                fout.write(f'{line.strip()}\n')  # odd lines carry the pinyin
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="extract baker pinyin labels")
+    parser.add_argument(
+        "input", type=str, help="source file of baker's prosody label file")
+    parser.add_argument(
+        "output", type=str, help="target file to write pinyin lables")
+    args = parser.parse_args()
+    extract_pinyin_lables(args.input, args.output)
diff --git a/examples/chinese_g2p/local/prepare_dataset.sh b/examples/chinese_g2p/local/prepare_dataset.sh
index 7ef811e51..fe9948ed3 100644
--- a/examples/chinese_g2p/local/prepare_dataset.sh
+++ b/examples/chinese_g2p/local/prepare_dataset.sh
@@ -21,7 +21,8 @@ fi
 label_file='ProsodyLabeling/000001-010000.txt'
 filename='000001-010000.txt'
 unrar e ${archive} ${label_file}
-mv ${filename} ${exp_dir}
+cp ${filename} ${exp_dir}
+rm -f ${filename}
 
 if [ ! -f ${exp_dir}/${filename} ];then
     echo "File extraction failed!"
diff --git a/examples/chinese_g2p/run.sh b/examples/chinese_g2p/run.sh
index b5f55c19f..6bde2e264 100644
--- a/examples/chinese_g2p/run.sh
+++ b/examples/chinese_g2p/run.sh
@@ -13,8 +13,10 @@ bash local/prepare_dataset.sh --exp-dir ${exp_dir} --data-dir ${data_dir}
 # convert transcription in chinese into pinyin with pypinyin or jieba+pypinyin
 filename="000001-010000.txt"
 echo "Processing transcriptions..."
-python3 local/extract_pinyin.py ${exp_dir}/${filename} ${exp_dir}/"pypinyin_result.txt" -python3 local/extract_pinyin.py --use-jieba ${exp_dir}/${filename} ${exp_dir}/"pypinyin_with_jieba_result.txt" + +python3 local/extract_pinyin_label.py ${exp_dir}/${filename} ${exp_dir}/"pinyin_baker.py" +python3 local/convert_transcription.py ${exp_dir}/${filename} ${exp_dir}/"result_pypinyin.txt" +python3 local/convert_transcription.py --use-jieba ${exp_dir}/${filename} ${exp_dir}/"result_pypinyin_with_jieba.txt" echo "done" exit 0