# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """Gen Chinese characters to THCHS30-30 phone lexicon using THCHS30-30's lexicon file1: THCHS-30/data_thchs30/lm_word/lexicon.txt file2: THCHS-30/resource/dict/lexicon.txt """ import argparse from collections import defaultdict from pathlib import Path from typing import List from typing import Union # key: (cn, ('ee', 'er4')),value: count cn_phones_counter = defaultdict(int) # key: cn, value: list of (phones, num) cn_counter = defaultdict(list) # key: cn, value: list of (phones, probabilities) cn_counter_p = defaultdict(list) def is_Chinese(ch): if '\u4e00' <= ch <= '\u9fff': return True return False def proc_line(line: str): line = line.strip() if is_Chinese(line[0]): line_list = line.split() cn_list = line_list[0] phone_list = line_list[1:] if len(cn_list) == len(phone_list) / 2: new_phone_list = [(phone_list[i], phone_list[i + 1]) for i in range(0, len(phone_list), 2)] assert len(cn_list) == len(new_phone_list) for idx, cn in enumerate(cn_list): phones = new_phone_list[idx] cn_phones_counter[(cn, phones)] += 1 """ example lines of output the first column is a Chinese character the second is the probability of this pronunciation and the rest are the phones of this pronunciation 一 0.22 ii i1↩ 一 0.45 ii i4↩ 一 0.32 ii i2↩ 一 0.01 ii i5 """ def gen_lexicon(lexicon_files: List[Union[str, Path]], output_path: Union[str, Path]): for file_path in lexicon_files: with open(file_path, "r") as f1: for line in f1: proc_line(line) for key in cn_phones_counter: cn = key[0] cn_counter[cn].append((key[1], cn_phones_counter[key])) for key in cn_counter: phone_count_list = cn_counter[key] count_sum = sum([x[1] for x in phone_count_list]) for item in phone_count_list: p = item[1] / count_sum p = round(p, 2) if p > 0: cn_counter_p[key].append((item[0], p)) with open(output_path, "w") as wf: for key in cn_counter_p: phone_p_list = cn_counter_p[key] for item in phone_p_list: phones, p = item wf.write(key + " " + str(p) + " " + " ".join(phones) + "\n") if __name__ == "__main__": parser = argparse.ArgumentParser( description="Gen Chinese characters to phone lexicon for THCHS-30 dataset" ) # A line of word_lexicon: # 一丁点 ii i4 d ing1 d ian3 # the first is word, and the rest are the phones of the word, and the len of phones is twice of the word's len parser.add_argument( "--lexicon-files", type=str, default="data/dict/lm_word_lexicon_1 data/dict/lm_word_lexicon_2", help="lm_word_lexicon files") parser.add_argument( "--output-path", type=str, default="data/dict/word.lexicon", help="path to save output word2phone lexicon") args = parser.parse_args() lexicon_files = args.lexicon_files.split(" ") output_path = Path(args.output_path).expanduser() gen_lexicon(lexicon_files, output_path)