import argparse import os import re import shutil import ToJyutping def check(str): my_re = re.compile(r'[A-Za-z]', re.S) res = re.findall(my_re, str) if len(res): return True else: return False INITIALS = [ 'aa', 'aai', 'aak', 'aap', 'aat', 'aau', 'ai', 'au', 'ap', 'at', 'ak', 'a', 'p', 'b', 'e', 'ts', 't', 'dz', 'd', 'kw', 'k', 'gw', 'g', 'f', 'h', 'l', 'm', 'ng', 'n', 's', 'y', 'w', 'c', 'z', 'j', 'ong', 'on', 'ou', 'oi', 'ok', 'o', 'uk', 'ung' ] def get_lines(canton): for init in INITIALS: if canton.startswith(init): c, v = canton[:len(init)], canton[len(init):] return canton + ' ' + c + ' ' + v return canton + ' ' + canton if __name__ == "__main__": parser = argparse.ArgumentParser( description="Generate lexicon for Cantonese pinyin to phoneme for MFA") parser.add_argument( "--output_lexicon", type=str, help="Path to save lexicon.") parser.add_argument( "--output_wavlabs", type=str, help="Path of wavs and labs for MFA training.") parser.add_argument( "--inputs", type=str, nargs="+", help="Path to the cantonese datasets.") args = parser.parse_args() os.mkdir(args.output_wavlabs) utterance_info = [] all_canton = [] for input_ in args.inputs: utt = "UTTRANSINFO.txt" if "Guangzhou_Cantonese_Scripted_Speech_Corpus_Daily_Use_Sentence" in input_ else "UTTERANCEINFO.txt" input_utttxt = os.path.join(input_, utt) with open(input_utttxt, 'r') as f: utterance_info = f.readlines()[1:] for utterance_line in utterance_info: _, wav_name, spk, _, text = utterance_line.split('\t') text = text.strip().replace(' ', '') # check the characters and drop the short text. if not check(text) and len(text) > 2: source_path = os.path.join(input_, 'WAV', spk, wav_name) out_spk_path = os.path.join(args.output_wavlabs, spk) os.makedirs(out_spk_path, exist_ok=True) target_path = os.path.join(out_spk_path, wav_name) shutil.copy(source_path, target_path) lab_name = wav_name.split('.')[0] + '.lab' lab_target_path = os.path.join(out_spk_path, lab_name) canton_list = ToJyutping.get_jyutping_text(text) with open(lab_target_path, 'w') as f: f.write(canton_list) canton_list = canton_list.split(' ') all_canton.extend(canton_list) all_canton = sorted(list(set(all_canton))) with open(args.output_lexicon, 'w') as f: for canton in all_canton: f.write(get_lines(canton) + '\n')