change output format

pull/623/head
chenfeiyu 4 years ago
parent 29009089ec
commit 2fe92b4b34

@ -1,3 +1,5 @@
# Download Baker dataset
The Baker dataset has to be downloaded manually and moved to 'data/', because you have to pass a CAPTCHA in a browser before the download is allowed.
Download URL: https://test.data-baker.com/#/data/index/source

@ -24,7 +24,6 @@ def extract_pinyin(source, target, use_jieba=False):
with open(target, 'wt', encoding='utf-8') as fout:
for i, line in enumerate(fin):
if i % 2 == 0:
fout.write(line)
sentence_id, raw_text = line.strip().split()
raw_text = re.sub(r'#\d', '', raw_text)
if use_jieba:
@ -35,7 +34,7 @@ def extract_pinyin(source, target, use_jieba=False):
style=Style.TONE3,
neutral_tone_with_five=True)
transcription = ' '.join(syllables)
fout.write(f'\t{transcription}\n')
fout.write(f'{sentence_id}\t{transcription}\n')
else:
continue

@ -0,0 +1,37 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
def extract_pinyin_lables(source, target):
    """Extract pinyin labels from Baker's prosody labeling.

    The source file is consumed as alternating lines: every line with an
    even 0-based index starts with a sentence id (followed by the labeled
    text, which is ignored here), and the line after it holds the
    transcription. For each pair, one line of the form
    ``<sentence_id><TAB><transcription>`` is written to the target file.

    Args:
        source: Path of the prosody label file, read as UTF-8.
        target: Path of the output file, written as UTF-8 (overwritten).

    Raises:
        ValueError: If a sentence-id line is blank.
    """
    with open(source, 'rt', encoding='utf-8') as fin, \
            open(target, 'wt', encoding='utf-8') as fout:
        for i, line in enumerate(fin):
            if i % 2 == 0:
                # Only the leading id is needed; splitting once keeps the
                # parse working when the labeled text itself contains
                # whitespace (a plain two-name unpack would fail there).
                fields = line.split(maxsplit=1)
                if not fields:
                    raise ValueError(
                        f'line {i + 1} of {source}: expected a sentence id, '
                        'got a blank line')
                fout.write(f'{fields[0]}\t')
            else:
                # Transcription line: drop surrounding whitespace (the
                # leading tab and the newline) and terminate the record.
                fout.write(f'{line.strip()}\n')
if __name__ == "__main__":
    # Command-line entry point: two positional paths (input label file,
    # output label file) are parsed and handed to the extraction routine.
    cli = argparse.ArgumentParser(description="extract baker pinyin labels")
    positional_args = (
        ("input", "source file of baker's prosody label file"),
        ("output", "target file to write pinyin lables"),
    )
    for arg_name, arg_help in positional_args:
        cli.add_argument(arg_name, type=str, help=arg_help)
    options = cli.parse_args()
    extract_pinyin_lables(options.input, options.output)

@ -21,7 +21,8 @@ fi
label_file='ProsodyLabeling/000001-010000.txt'
filename='000001-010000.txt'
unrar e ${archive} ${label_file}
mv ${filename} ${exp_dir}
cp ${filename} ${exp_dir}
rm -f ${filename}
if [ ! -f ${exp_dir}/${filename} ];then
echo "File extraction failed!"

@ -13,8 +13,10 @@ bash local/prepare_dataset.sh --exp-dir ${exp_dir} --data-dir ${data_dir}
# Convert the Chinese transcriptions into pinyin with pypinyin or jieba+pypinyin.
# Every command below reads ${exp_dir}/${filename} and writes its result
# next to it in ${exp_dir}.
filename="000001-010000.txt"
echo "Processing transcriptions..."
# Run once without and once with --use-jieba, writing to separate result files.
python3 local/extract_pinyin.py ${exp_dir}/${filename} ${exp_dir}/"pypinyin_result.txt"
python3 local/extract_pinyin.py --use-jieba ${exp_dir}/${filename} ${exp_dir}/"pypinyin_with_jieba_result.txt"
# NOTE(review): the output is named "pinyin_baker.py" although it is presumably
# a plain-text label file rather than a Python module - confirm the extension.
python3 local/extract_pinyin_label.py ${exp_dir}/${filename} ${exp_dir}/"pinyin_baker.py"
# Same pair of runs (without / with --use-jieba) using convert_transcription.py.
python3 local/convert_transcription.py ${exp_dir}/${filename} ${exp_dir}/"result_pypinyin.txt"
python3 local/convert_transcription.py --use-jieba ${exp_dir}/${filename} ${exp_dir}/"result_pypinyin_with_jieba.txt"
echo "done"
exit 0

Loading…
Cancel
Save