[TTS]Cantonese TTS MFA pipeline (#2899)

Co-authored-by: TianYuan <white-sky@qq.com>
1 year ago · c75906462e
parent 047092de8e
commit c75906462e
5 changed files with 125 additions and 1 deletions
--- a/docs/requirements.txt
+++ b/docs/requirements.txt
@ -38,6 +38,7 @@ sphinx-markdown-tables
 sphinx_rtd_theme
 textgrid
 timer
+ToJyutping
 typeguard
 webrtcvad
 websockets
--- a/examples/other/mfa/README.md
+++ b/examples/other/mfa/README.md
@ -7,3 +7,10 @@ Run the following script to get started, for more detail, please see `run.sh`.
 # Rhythm tags for MFA
 If you want to get rhythm tags with duration through MFA tool, you may add flag `--rhy-with-duration` in the first two commands in `run.sh`
 Note that only CSMSC dataset is supported so far, and we replace `#` with `sp` in rhythm tags for MFA.
+
+# MFA for Cantonese language
+First, download the [Guangzhou_Cantonese_Scripted_Speech_Corpus_Daily_Use_Sentence](https://magichub.com/datasets/guangzhou-cantonese-scripted-speech-corpus-daily-use-sentence/) and [Guangzhou_Cantonese_Scripted_Speech_Corpus_in_Vehicle](https://magichub.com/datasets/guangzhou-cantonese-scripted-speech-corpus-in-the-vehicle/) datasets and place them under `~/datasets/`.
+Then run:
+```bash
+./run_canton.sh
+```
--- a/examples/other/mfa/local/generate_canton_lexicon_wavlabs.py
+++ b/examples/other/mfa/local/generate_canton_lexicon_wavlabs.py
@ -0,0 +1,80 @@
+import argparse
+import os
+import re
+import shutil
+
+import ToJyutping
+
+
+def check(str):
+    """Tell whether the given text contains at least one ASCII letter.
+
+    Args:
+        str: Text to inspect.
+
+    Returns:
+        True if any character in ``A-Z`` or ``a-z`` occurs in the text,
+        False otherwise (including for empty text).
+    """
+    # One hit is enough to decide; there is no need to collect every match.
+    return re.search(r'[A-Za-z]', str) is not None
+
+
+# Initial consonants recognised at the start of a romanised syllable.
+consonants = [
+    'p', 'b', 't', 'd', 'ts', 'dz', 'k', 'g', 'kw', 'gw', 'f', 'h', 'l', 'm',
+    'ng', 'n', 's', 'y', 'w', 'c', 'z', 'j'
+]
+
+# Longest initials first, so that e.g. 'gw' is tried before 'g'. In list order
+# the two-letter entries that share their first letter with a one-letter
+# entry ('ts', 'dz', 'kw', 'gw') could never match.
+_consonants_longest_first = sorted(consonants, key=len, reverse=True)
+
+
+def get_lines(canton):
+    """Build one lexicon line for a romanised Cantonese syllable.
+
+    The syllable is split into its initial consonant (the longest entry of
+    ``consonants`` it starts with) and the remainder.
+
+    Args:
+        canton: A single romanised syllable, e.g. ``"gwong2"``.
+
+    Returns:
+        ``"<syllable> <initial> <rest>"`` when the syllable starts with a
+        known initial, otherwise ``"<syllable> <syllable>"``.
+    """
+    for consonant in _consonants_longest_first:
+        if canton.startswith(consonant):
+            c, v = canton[:len(consonant)], canton[len(consonant):]
+            return canton + ' ' + c + ' ' + v
+    # No known initial: the whole syllable is treated as a single phoneme.
+    return canton + ' ' + canton
+
+
+if __name__ == "__main__":
+    # Builds the MFA training corpus (wav + .lab pairs, one folder per
+    # speaker) and the syllable-to-phoneme lexicon from the input datasets.
+    parser = argparse.ArgumentParser(
+        description="Generate lexicon for Cantonese pinyin to phoneme for MFA")
+    parser.add_argument(
+        "--output_lexicon",
+        type=str,
+        required=True,
+        help="Path to save lexicon.")
+    parser.add_argument(
+        "--output_wavlabs",
+        type=str,
+        required=True,
+        help="Path of wavs and labs for MFA training.")
+    parser.add_argument(
+        "--inputs",
+        type=str,
+        nargs="+",
+        required=True,
+        help="Path to the cantonese datasets.")
+    args = parser.parse_args()
+
+    # Tolerate an existing output directory (e.g. when restarting after an
+    # interrupted run) and create missing parent directories.
+    os.makedirs(args.output_wavlabs, exist_ok=True)
+
+    all_canton = set()
+    for input_ in args.inputs:
+        # The two corpora name their utterance index file differently.
+        utt = "UTTRANSINFO.txt" if "Guangzhou_Cantonese_Scripted_Speech_Corpus_Daily_Use_Sentence" in input_ else "UTTERANCEINFO.txt"
+        input_utttxt = os.path.join(input_, utt)
+
+        # The index holds Chinese text, so do not rely on the locale's default
+        # encoding. NOTE(review): assumes the index files are UTF-8 - confirm.
+        with open(input_utttxt, 'r', encoding='utf-8') as f:
+            # The first line is skipped (presumably a column header).
+            utterance_info = f.readlines()[1:]
+
+        for utterance_line in utterance_info:
+            if not utterance_line.strip():
+                # A blank line has no fields to unpack.
+                continue
+            _, wav_name, spk, _, text = utterance_line.split('\t')
+            text = text.strip().replace(' ', '')
+            # Drop utterances containing ASCII letters and very short texts.
+            if check(text) or len(text) <= 2:
+                continue
+
+            source_path = os.path.join(input_, 'WAV', spk, wav_name)
+            out_spk_path = os.path.join(args.output_wavlabs, spk)
+            os.makedirs(out_spk_path, exist_ok=True)
+            target_path = os.path.join(out_spk_path, wav_name)
+
+            shutil.copy(source_path, target_path)
+
+            # The transcript for MFA sits next to the wav with a .lab suffix.
+            lab_name = wav_name.split('.')[0] + '.lab'
+            lab_target_path = os.path.join(out_spk_path, lab_name)
+            canton_list = ToJyutping.get_jyutping_text(text)
+            with open(lab_target_path, 'w', encoding='utf-8') as f:
+                f.write(canton_list)
+
+            # split() without arguments never yields empty tokens, which
+            # would otherwise end up as empty entries in the lexicon.
+            all_canton.update(canton_list.split())
+
+    with open(args.output_lexicon, 'w', encoding='utf-8') as f:
+        # Sorted so that the lexicon is identical from run to run.
+        for canton in sorted(all_canton):
+            f.write(get_lines(canton) + '\n')
--- a/examples/other/mfa/run_canton.sh
+++ b/examples/other/mfa/run_canton.sh
@ -0,0 +1,34 @@
+#!/bin/bash
+# Train a Montreal Forced Aligner (MFA) model on the Cantonese corpora and
+# align them. Steps whose outputs already exist are skipped.
+
+# Stop at the first failing step instead of continuing with missing inputs.
+set -e
+
+EXP_DIR=exp
+
+mkdir -p "$EXP_DIR"
+LEXICON_NAME='canton'
+if [ ! -f "$EXP_DIR/$LEXICON_NAME.lexicon" ]; then
+    echo "generating lexicon and training data..."
+    python local/generate_canton_lexicon_wavlabs.py --output_lexicon "$EXP_DIR/$LEXICON_NAME.lexicon" --output_wavlabs "$EXP_DIR/$LEXICON_NAME"_wavlabs --inputs ~/datasets/Guangzhou_Cantonese_Scripted_Speech_Corpus_Daily_Use_Sentence ~/datasets/Guangzhou_Cantonese_Scripted_Speech_Corpus_in_Vehicle
+    echo "lexicon and training data done"
+fi
+
+
+MFA_DOWNLOAD_DIR=local/
+
+if [ ! -f "$MFA_DOWNLOAD_DIR/montreal-forced-aligner_linux.tar.gz" ]; then
+    echo "downloading mfa..."
+    (cd "$MFA_DOWNLOAD_DIR" && wget https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner/releases/download/v1.0.1/montreal-forced-aligner_linux.tar.gz)
+    echo "download mfa done!"
+fi
+
+if [ ! -d "$MFA_DOWNLOAD_DIR/montreal-forced-aligner" ]; then
+    echo "extracting mfa..."
+    (cd "$MFA_DOWNLOAD_DIR" && tar xvf "montreal-forced-aligner_linux.tar.gz")
+    echo "extraction done!"
+fi
+
+# Prepend the MFA binaries rather than replacing PATH, so that the system
+# tools stay reachable for this script and for anything MFA spawns.
+export PATH="$MFA_DOWNLOAD_DIR/montreal-forced-aligner/bin:$PATH"
+if [ ! -d "$EXP_DIR/canton_alignment" ]; then
+    echo "Start MFA training..."
+    mfa_train_and_align "$EXP_DIR/$LEXICON_NAME"_wavlabs "$EXP_DIR/$LEXICON_NAME.lexicon" "$EXP_DIR/canton_alignment" -o "$EXP_DIR/canton_model" --clean --verbose --temp_directory "$EXP_DIR/.mfa_train_and_align"
+    echo "training done!"
+    echo "results: $EXP_DIR/canton_alignment"
+    echo "model: $EXP_DIR/canton_model"
+fi
+
--- a/setup.py
+++ b/setup.py
@ -58,6 +58,7 @@ base = [
    "sacrebleu",
    "textgrid",
    "timer",
+    "ToJyutping",
    "typeguard",
    "webrtcvad",
    "yacs~=0.1.8",
@ -294,7 +295,8 @@ setup_info = dict(
    },

    # Package info
-    packages=find_packages(include=['paddlespeech*'], exclude=['utils', 'third_party']),
+    packages=find_packages(
+        include=['paddlespeech*'], exclude=['utils', 'third_party']),
    zip_safe=True,
    classifiers=[
        'Development Status :: 5 - Production/Stable',