diff --git a/examples/chinese_g2p/local/convert_transcription.py b/examples/chinese_g2p/local/convert_transcription.py index b133ad2c..f9ec04aa 100644 --- a/examples/chinese_g2p/local/convert_transcription.py +++ b/examples/chinese_g2p/local/convert_transcription.py @@ -34,7 +34,7 @@ def extract_pinyin(source, target, use_jieba=False): style=Style.TONE3, neutral_tone_with_five=True) transcription = ' '.join(syllables) - fout.write(f'{sentence_id}\t{transcription}\n') + fout.write(f'{sentence_id} {transcription}\n') else: continue diff --git a/examples/chinese_g2p/local/extract_pinyin_label.py b/examples/chinese_g2p/local/extract_pinyin_label.py index be7b287f..49243f71 100644 --- a/examples/chinese_g2p/local/extract_pinyin_label.py +++ b/examples/chinese_g2p/local/extract_pinyin_label.py @@ -21,7 +21,7 @@ def extract_pinyin_lables(source, target): for i, line in enumerate(fin): if i % 2 == 0: sentence_id, raw_text = line.strip().split() - fout.write(f'{sentence_id}\t') + fout.write(f'{sentence_id} ') else: transcription = line.strip() fout.write(f'{transcription}\n') diff --git a/examples/chinese_g2p/local/ignore_sandhi.py b/examples/chinese_g2p/local/ignore_sandhi.py new file mode 100644 index 00000000..cda1bd14 --- /dev/null +++ b/examples/chinese_g2p/local/ignore_sandhi.py @@ -0,0 +1,100 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
import argparse
from itertools import zip_longest
from pathlib import Path
from typing import List, Union


def erized(syllable: str) -> bool:
    """Return whether the syllable carries the erhua (r-suffix) effect.

    An optional trailing tone digit is ignored, so toned and untoned
    spellings are both accepted. A syllable that starts with "er" (the
    standalone syllable) is never considered erized.

    Example
    --------
    huar1 -> True
    guanr -> True
    er2 -> False

    Raises
    ------
    AssertionError
        If the syllable has fewer than two characters, which means there
        is something wrong in the data.
    """
    # Raised explicitly (not via ``assert``) so the check survives ``python -O``.
    if len(syllable) < 2:
        raise AssertionError(f"invalid syllable {syllable}")
    # Drop the tone digit, if any, before looking at the final letter.
    base = syllable[:-1] if syllable[-1].isdigit() else syllable
    return not syllable.startswith("er") and base.endswith("r")


def ignore_sandhi(reference: List[str], generated: List[str]) -> List[str]:
    """Return the reference syllables with tone-sandhi changes undone.

    ``reference`` is a sequence of syllables from human annotation, which
    makes sandhi explicit. ``generated`` is a sequence of syllables from
    some simple g2p program, which does not consider sandhi. Wherever the
    two agree except that the reference has tone 2 and the generated one
    has tone 3, the generated syllable is used; every other syllable is
    taken from the reference unchanged.

    An erized reference syllable consumes two generated syllables; this
    assumes the g2p program emits the base syllable and "er" separately.

    Example
    --------
    ['lao2', 'hu3'], ['lao3', 'hu3'] -> ['lao3', 'hu3']
    ['huar1', 'lao2', 'hu3'], ['hua1', 'er2', 'lao3', 'hu3']
        -> ['huar1', 'lao3', 'hu3']

    Raises
    ------
    AssertionError
        If the two sequences cannot be aligned (length mismatch) or a
        reference syllable is malformed.
    """
    mismatch = ("length of transcriptions mismatch, There may be some "
                "characters that are ignored in the generated transcription.")

    j = 0  # cursor into ``generated``; may run ahead of the reference cursor

    # sandhi ignored in the result while other errors are not included
    result = []
    for ref_syllable in reference:
        if erized(ref_syllable):
            result.append(ref_syllable)
            j += 2
            continue

        if j >= len(generated):
            raise AssertionError(mismatch)
        # Index with j, not the reference position: the cursors diverge
        # after every erized syllable.
        gen_syllable = generated[j]
        same_base = ref_syllable[:-1] == gen_syllable[:-1]
        if same_base and ref_syllable[-1] == '2' and gen_syllable[-1] == '3':
            result.append(gen_syllable)
        else:
            result.append(ref_syllable)
        j += 1

    if j != len(generated):
        raise AssertionError(mismatch)
    return result


def convert_transcriptions(reference: Union[str, Path],
                           generated: Union[str, Path],
                           output: Union[str, Path]):
    """Write the reference transcriptions with sandhi ignored to ``output``.

    The reference and generated files are read line by line in lockstep.
    Each line is ``<sentence_id> <syllable> <syllable> ...``; any run of
    whitespace (space or tab) after the sentence id is accepted. The
    sentence id of the reference line is used for the output line.

    If a pair of lines cannot be aligned, a message is printed and the
    reference transcription is written unchanged. Blank reference lines
    are skipped. If the files have different numbers of lines, a message
    is printed and processing stops at the first unpaired line.
    """
    # Explicit encoding so the result does not depend on the platform default.
    with open(reference, 'rt', encoding='utf-8') as f_ref, \
            open(generated, 'rt', encoding='utf-8') as f_gen, \
            open(output, 'wt', encoding='utf-8') as f_out:
        for line_no, (ref, gen) in enumerate(
                zip_longest(f_ref, f_gen), start=1):
            if ref is None or gen is None:
                print(
                    f"line {line_no}: reference and generated files have "
                    "different numbers of lines. Remaining lines are ignored."
                )
                break

            ref_fields = ref.strip().split(maxsplit=1)
            if not ref_fields:
                # Nothing to convert on a blank line.
                continue
            sentence_id = ref_fields[0]
            ref_transcription = ref_fields[1] if len(ref_fields) > 1 else ""

            gen_fields = gen.strip().split(maxsplit=1)
            gen_transcription = gen_fields[1] if len(gen_fields) > 1 else ""

            try:
                result = ' '.join(
                    ignore_sandhi(ref_transcription.split(),
                                  gen_transcription.split()))
            except (AssertionError, IndexError):
                print(
                    f"sentence_id: {sentence_id} There is some annotation error in the reference or generated transcription. Use the reference."
                )
                result = ref_transcription
            f_out.write(f"{sentence_id} {result}\n")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="reference transcription but ignore sandhi.")
    parser.add_argument(
        "--reference",
        type=str,
        required=True,
        help="path to the reference transcription of baker dataset.")
    parser.add_argument(
        "--generated",
        type=str,
        required=True,
        help="path to the generated transcription.")
    parser.add_argument(
        "--output", type=str, required=True, help="path to save result.")
    args = parser.parse_args()
    convert_transcriptions(args.reference, args.generated, args.output)