parent
047092de8e
commit
c75906462e
@ -0,0 +1,80 @@
|
||||
import argparse
|
||||
import os
|
||||
import re
|
||||
import shutil
|
||||
|
||||
import ToJyutping
|
||||
|
||||
|
||||
def check(str):
    """Report whether the text contains at least one ASCII Latin letter.

    Args:
        str: Text to inspect. The name shadows the builtin; it is kept so
            existing keyword callers continue to work.

    Returns:
        True when any character in ``A-Z`` or ``a-z`` occurs, otherwise False.
    """
    return re.search(r'[A-Za-z]', str) is not None
|
||||
|
||||
|
||||
# Syllable-initial consonant spellings that get_lines() may split off the
# front of a romanized syllable.
consonants = [
    'p', 'b', 't', 'd', 'ts', 'dz', 'k', 'g', 'kw', 'gw', 'f', 'h', 'l', 'm',
    'ng', 'n', 's', 'y', 'w', 'c', 'z', 'j'
]


def get_lines(canton):
    """Build one lexicon line for a romanized Cantonese syllable.

    The syllable is split into its leading consonant (when it starts with an
    entry of ``consonants``) and the remainder.

    Args:
        canton: Romanized syllable, e.g. ``'gwok3'``.

    Returns:
        ``'<syllable> <consonant> <remainder>'`` when a leading consonant is
        found, otherwise ``'<syllable> <syllable>'``.
    """
    # Try the longest spellings first: in declaration order 'k' and 'g' would
    # match before 'kw' and 'gw' (likewise 't'/'d' before 'ts'/'dz'), so the
    # two-letter consonants could never be selected. sorted() is stable, so
    # entries of equal length keep their declared order.
    for consonant in sorted(consonants, key=len, reverse=True):
        if canton.startswith(consonant):
            initial = canton[:len(consonant)]
            final = canton[len(consonant):]
            return ' '.join((canton, initial, final))
    # No known leading consonant: the whole syllable is its own single phone.
    return ' '.join((canton, canton))
|
||||
|
||||
|
||||
def _parse_args():
    """Parse the command-line options of the lexicon / wav-lab generator."""
    parser = argparse.ArgumentParser(
        description="Generate lexicon for Cantonese pinyin to phoneme for MFA")
    # All three options are mandatory: without them the script would only
    # fail later with an opaque TypeError.
    parser.add_argument(
        "--output_lexicon",
        type=str,
        required=True,
        help="Path to save lexicon.")
    parser.add_argument(
        "--output_wavlabs",
        type=str,
        required=True,
        help="Path of wavs and labs for MFA training.")
    parser.add_argument(
        "--inputs",
        type=str,
        nargs="+",
        required=True,
        help="Path to the cantonese datasets.")
    return parser.parse_args()


def main():
    """Copy usable utterances next to Jyutping ``.lab`` files and write a lexicon.

    For every dataset directory given via ``--inputs`` the utterance index is
    read, each accepted utterance's wav is copied to
    ``<output_wavlabs>/<speaker>/`` together with a ``.lab`` file holding its
    Jyutping transcription, and every distinct syllable seen is written to
    ``--output_lexicon`` as one ``get_lines`` entry per line.
    """
    args = _parse_args()

    # exist_ok so a rerun after an interrupted run does not crash here.
    os.makedirs(args.output_wavlabs, exist_ok=True)

    all_canton = set()
    for input_ in args.inputs:
        # The daily-use corpus names its utterance index differently.
        if "Guangzhou_Cantonese_Scripted_Speech_Corpus_Daily_Use_Sentence" in input_:
            utt = "UTTRANSINFO.txt"
        else:
            utt = "UTTERANCEINFO.txt"
        input_utttxt = os.path.join(input_, utt)

        # NOTE(review): the index files are assumed to be UTF-8 encoded;
        # confirm against the corpus distribution.
        with open(input_utttxt, 'r', encoding='utf-8') as f:
            utterance_info = f.readlines()[1:]  # first line is skipped

        for utterance_line in utterance_info:
            if not utterance_line.strip():
                # A blank line would break the 5-field unpacking below.
                continue
            _, wav_name, spk, _, text = utterance_line.split('\t')
            text = text.strip().replace(' ', '')
            # check the characters and drop the short text.
            if check(text) or len(text) <= 2:
                continue

            canton_text = ToJyutping.get_jyutping_text(text)
            # split() without a separator never yields empty tokens, so no
            # blank entry can reach the lexicon.
            syllables = canton_text.split()
            if not syllables:
                # No transcription available: an empty .lab is useless for
                # alignment, so the utterance is left out entirely.
                continue

            source_path = os.path.join(input_, 'WAV', spk, wav_name)
            out_spk_path = os.path.join(args.output_wavlabs, spk)
            os.makedirs(out_spk_path, exist_ok=True)
            target_path = os.path.join(out_spk_path, wav_name)
            shutil.copy(source_path, target_path)

            # splitext drops only the final extension, so a wav whose name
            # contains extra dots still gets a .lab with the same basename.
            lab_name = os.path.splitext(wav_name)[0] + '.lab'
            lab_target_path = os.path.join(out_spk_path, lab_name)
            with open(lab_target_path, 'w', encoding='utf-8') as f:
                f.write(canton_text)

            all_canton.update(syllables)

    with open(args.output_lexicon, 'w', encoding='utf-8') as f:
        # Sorted so the lexicon is identical from run to run.
        for canton in sorted(all_canton):
            f.write(get_lines(canton) + '\n')


if __name__ == "__main__":
    main()
|
@ -0,0 +1,34 @@
|
||||
# Prepare the Cantonese lexicon and wav/lab training data, fetch and unpack
# Montreal Forced Aligner v1.0.1 when it is not present, then train and align.
# Each stage is skipped when its output already exists.
EXP_DIR=exp

mkdir -p "$EXP_DIR"
LEXICON_NAME='canton'
if [ ! -f "$EXP_DIR/$LEXICON_NAME.lexicon" ]; then
    echo "generating lexicon and training data..."
    python local/generate_canton_lexicon_wavlabs.py \
        --output_lexicon "$EXP_DIR/$LEXICON_NAME.lexicon" \
        --output_wavlabs "${EXP_DIR}/${LEXICON_NAME}_wavlabs" \
        --inputs ~/datasets/Guangzhou_Cantonese_Scripted_Speech_Corpus_Daily_Use_Sentence ~/datasets/Guangzhou_Cantonese_Scripted_Speech_Corpus_in_Vehicle
    echo "lexicon and training data done"
fi

MFA_DOWNLOAD_DIR=local/

if [ ! -f "$MFA_DOWNLOAD_DIR/montreal-forced-aligner_linux.tar.gz" ]; then
    echo "downloading mfa..."
    (cd "$MFA_DOWNLOAD_DIR" && wget https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner/releases/download/v1.0.1/montreal-forced-aligner_linux.tar.gz)
    echo "download mfa done!"
fi

if [ ! -d "$MFA_DOWNLOAD_DIR/montreal-forced-aligner" ]; then
    echo "extracting mfa..."
    (cd "$MFA_DOWNLOAD_DIR" && tar xvf "montreal-forced-aligner_linux.tar.gz")
    echo "extraction done!"
fi

# Prepend the bundled MFA binaries instead of replacing PATH: overwriting it
# would hide every system command from the rest of this script and from the
# processes MFA launches.
export PATH="$MFA_DOWNLOAD_DIR/montreal-forced-aligner/bin:$PATH"
if [ ! -d "$EXP_DIR/canton_alignment" ]; then
    echo "Start MFA training..."
    mfa_train_and_align "${EXP_DIR}/${LEXICON_NAME}_wavlabs" "$EXP_DIR/$LEXICON_NAME.lexicon" "$EXP_DIR/canton_alignment" -o "$EXP_DIR/canton_model" --clean --verbose --temp_directory "$EXP_DIR/.mfa_train_and_align"
    echo "training done!"
    echo "results: $EXP_DIR/canton_alignment"
    echo "model: $EXP_DIR/canton_model"
fi
|
||||
|
Loading…
Reference in new issue