[TTS]Cantonese TTS MFA pipeline (#2899)

Co-authored-by: TianYuan <white-sky@qq.com>
3 years ago · c75906462e
parent 047092de8e
commit c75906462e
5 changed files with 125 additions and 1 deletions
--- a/docs/requirements.txt
+++ b/docs/requirements.txt
@ -38,6 +38,7 @@ sphinx-markdown-tables
 sphinx_rtd_theme
 textgrid
 timer
 ToJyutping
 typeguard
 webrtcvad
 websockets
--- a/examples/other/mfa/README.md
+++ b/examples/other/mfa/README.md
@ -7,3 +7,10 @@ Run the following script to get started, for more detail, please see `run.sh`.
 # Rhythm tags for MFA
 If you want to get rhythm tags with durations from the MFA tool, add the flag `--rhy-with-duration` to the first two commands in `run.sh`.
 Note that only the CSMSC dataset is supported so far, and `#` is replaced with `sp` in the rhythm tags for MFA.
 # MFA for Cantonese language
 First, download the [Guangzhou_Cantonese_Scripted_Speech_Corpus_Daily_Use_Sentence](https://magichub.com/datasets/guangzhou-cantonese-scripted-speech-corpus-daily-use-sentence/) and [Guangzhou_Cantonese_Scripted_Speech_Corpus_in_Vehicle](https://magichub.com/datasets/guangzhou-cantonese-scripted-speech-corpus-in-the-vehicle/) datasets and place them under `~/datasets/`.
 Then,
 ```bash
 ./run_canton.sh
 ```
--- a/examples/other/mfa/local/generate_canton_lexicon_wavlabs.py
+++ b/examples/other/mfa/local/generate_canton_lexicon_wavlabs.py
@ -0,0 +1,80 @@
 import argparse
 import os
 import re
 import shutil
 import ToJyutping
 def check(str):
     """Tell whether the given text contains at least one ASCII letter.

     Args:
         str: Text to inspect. The name shadows the builtin but is kept so
             that existing keyword callers keep working.

     Returns:
         True if any character in ``A-Z`` or ``a-z`` occurs, else False.
     """
     # A single match is enough; there is no need to collect every letter.
     return bool(re.search(r'[A-Za-z]', str))
 # Syllable initials recognised when splitting a romanised syllable into
 # "initial + final". Some entries are prefixes of others (e.g. 'g'/'gw',
 # 'k'/'kw', 't'/'ts', 'd'/'dz', 'n'/'ng'), so matching must prefer the
 # longest one.
 consonants = [
     'p', 'b', 't', 'd', 'ts', 'dz', 'k', 'g', 'kw', 'gw', 'f', 'h', 'l', 'm',
     'ng', 'n', 's', 'y', 'w', 'c', 'z', 'j'
 ]


 def get_lines(canton):
     """Build the lexicon line for one romanised Cantonese syllable.

     Args:
         canton: A syllable string such as ``"gwok3"``.

     Returns:
         ``"<syllable> <initial> <final>"`` when the syllable starts with one
         of ``consonants`` and something is left after it, otherwise
         ``"<syllable> <syllable>"`` (the whole syllable is a single phone).
     """
     # Longest initials first: a plain first-match scan would let 'g' shadow
     # 'gw' (and likewise 'k'/'kw', 't'/'ts', 'd'/'dz'), splitting "gwok3" as
     # "g wok3". sorted() is stable, so equal-length entries keep list order.
     for consonant in sorted(consonants, key=len, reverse=True):
         if canton.startswith(consonant):
             final = canton[len(consonant):]
             if final:
                 return canton + ' ' + consonant + ' ' + final
             # Nothing follows the initial: avoid emitting an empty phone.
             break
     return canton + ' ' + canton
 if __name__ == "__main__":
     # Script entry point: for every input corpus, copy the usable utterances
     # into a per-speaker wav/lab tree for MFA training, then write a lexicon
     # line for every distinct Jyutping syllable that was produced.
     parser = argparse.ArgumentParser(
         description="Generate lexicon for Cantonese pinyin to phoneme for MFA")
     parser.add_argument(
         "--output_lexicon",
         type=str,
         required=True,
         help="Path to save lexicon.")
     parser.add_argument(
         "--output_wavlabs",
         type=str,
         required=True,
         help="Path of wavs and labs for MFA training.")
     parser.add_argument(
         "--inputs",
         type=str,
         nargs="+",
         required=True,
         help="Path to the cantonese datasets.")
     args = parser.parse_args()

     # makedirs(exist_ok=True) rather than mkdir: the script can be re-run
     # after an interrupted run, and missing parent directories are created.
     os.makedirs(args.output_wavlabs, exist_ok=True)

     all_canton = set()
     for input_ in args.inputs:
         # The transcript index file is named differently in the two corpora.
         utt = "UTTRANSINFO.txt" if "Guangzhou_Cantonese_Scripted_Speech_Corpus_Daily_Use_Sentence" in input_ else "UTTERANCEINFO.txt"
         input_utttxt = os.path.join(input_, utt)
         # Read explicitly as UTF-8 instead of the locale default, since the
         # transcripts hold Chinese text.
         # NOTE(review): assumes the corpus index files are UTF-8 - confirm.
         with open(input_utttxt, 'r', encoding='utf-8') as f:
             # The first line is skipped (presumably a column header).
             utterance_info = f.readlines()[1:]

         for utterance_line in utterance_info:
             # Tolerate blank lines (e.g. a trailing newline at end of file),
             # which would otherwise break the 5-field unpacking below.
             if not utterance_line.strip():
                 continue
             _, wav_name, spk, _, text = utterance_line.split('\t')
             text = text.strip().replace(' ', '')
             # check the characters and drop the short text.
             if check(text) or len(text) <= 2:
                 continue

             source_path = os.path.join(input_, 'WAV', spk, wav_name)
             out_spk_path = os.path.join(args.output_wavlabs, spk)
             os.makedirs(out_spk_path, exist_ok=True)
             target_path = os.path.join(out_spk_path, wav_name)
             shutil.copy(source_path, target_path)

             # splitext strips only the final extension, so the .lab file keeps
             # the same stem as the wav even if the name contains extra dots.
             lab_name = os.path.splitext(wav_name)[0] + '.lab'
             lab_target_path = os.path.join(out_spk_path, lab_name)
             canton_text = ToJyutping.get_jyutping_text(text)
             with open(lab_target_path, 'w', encoding='utf-8') as f:
                 f.write(canton_text)
             # split() without an argument drops empty tokens, so an empty or
             # irregularly spaced transcription never adds '' to the lexicon.
             all_canton.update(canton_text.split())

     # Sorted so the lexicon file is identical from run to run.
     with open(args.output_lexicon, 'w', encoding='utf-8') as f:
         for canton in sorted(all_canton):
             f.write(get_lines(canton) + '\n')
--- a/examples/other/mfa/run_canton.sh
+++ b/examples/other/mfa/run_canton.sh
@ -0,0 +1,34 @@
 # Train an MFA acoustic model and align the Cantonese corpora.
 # Steps: generate lexicon + wav/lab data, fetch and unpack MFA v1.0.1,
 # then run mfa_train_and_align. Each step is skipped if its output exists.

 # Stop at the first failing step so training never starts on missing or
 # partial inputs.
 set -e

 EXP_DIR=exp
 mkdir -p "$EXP_DIR"
 LEXICON_NAME='canton'
 if [ ! -f "$EXP_DIR/$LEXICON_NAME.lexicon" ]; then
     echo "generating lexicon and training data..."
     python local/generate_canton_lexicon_wavlabs.py \
         --output_lexicon "$EXP_DIR/$LEXICON_NAME.lexicon" \
         --output_wavlabs "$EXP_DIR/$LEXICON_NAME"_wavlabs \
         --inputs ~/datasets/Guangzhou_Cantonese_Scripted_Speech_Corpus_Daily_Use_Sentence ~/datasets/Guangzhou_Cantonese_Scripted_Speech_Corpus_in_Vehicle
     echo "lexicon and training data done"
 fi

 MFA_DOWNLOAD_DIR=local/
 if [ ! -f "$MFA_DOWNLOAD_DIR/montreal-forced-aligner_linux.tar.gz" ]; then
     echo "downloading mfa..."
     (cd "$MFA_DOWNLOAD_DIR" && wget https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner/releases/download/v1.0.1/montreal-forced-aligner_linux.tar.gz)
     echo "download mfa done!"
 fi
 if [ ! -d "$MFA_DOWNLOAD_DIR/montreal-forced-aligner" ]; then
     echo "extracting mfa..."
     (cd "$MFA_DOWNLOAD_DIR" && tar xvf "montreal-forced-aligner_linux.tar.gz")
     echo "extraction done!"
 fi

 # Prepend the MFA binaries instead of replacing PATH, so the rest of the
 # system PATH stays available to this script and to child processes.
 export PATH="$MFA_DOWNLOAD_DIR/montreal-forced-aligner/bin:$PATH"

 if [ ! -d "$EXP_DIR/canton_alignment" ]; then
     echo "Start MFA training..."
     mfa_train_and_align "$EXP_DIR/$LEXICON_NAME"_wavlabs "$EXP_DIR/$LEXICON_NAME.lexicon" "$EXP_DIR/canton_alignment" -o "$EXP_DIR/canton_model" --clean --verbose --temp_directory "$EXP_DIR/.mfa_train_and_align"
     echo "training done!"
     echo "results: $EXP_DIR/canton_alignment"
     echo "model: $EXP_DIR/canton_model"
 fi
--- a/setup.py
+++ b/setup.py
@ -58,6 +58,7 @@ base = [
    "sacrebleu",
    "textgrid",
    "timer",
    "ToJyutping",
    "typeguard",
    "webrtcvad",
    "yacs~=0.1.8",
@ -294,7 +295,8 @@ setup_info = dict(
    },
    # Package info
-    packages=find_packages(include=['paddlespeech*'], exclude=['utils', 'third_party']),
+    packages=find_packages(
        include=['paddlespeech*'], exclude=['utils', 'third_party']),
    zip_safe=True,
    classifiers=[
        'Development Status :: 5 - Production/Stable',