From 2c75c923b9a64f1a6e5c92babbfc71f693abf1af Mon Sep 17 00:00:00 2001
From: TianYuan <white-sky@qq.com>
Date: Mon, 23 Aug 2021 08:07:35 +0000
Subject: [PATCH] fix_mfa

---
 deepspeech/decoders/swig/setup.py             |  5 +-
 examples/aishell/s0/conf/augmentation.json    |  2 +-
 examples/thchs30/a0/local/data.sh             | 38 +++++++------
 examples/thchs30/a0/local/gen_word2phone.py   | 56 +++++++++++++------
 .../thchs30/a0/local/reorganize_thchs30.py    |  9 +--
 examples/thchs30/a0/run.sh                    | 15 +++--
 tools/extras/install_mfa.sh                   |  2 +-
 7 files changed, 79 insertions(+), 48 deletions(-)

diff --git a/deepspeech/decoders/swig/setup.py b/deepspeech/decoders/swig/setup.py
index 86af475a..3da5ce8b 100644
--- a/deepspeech/decoders/swig/setup.py
+++ b/deepspeech/decoders/swig/setup.py
@@ -84,8 +84,9 @@ FILES = glob.glob('kenlm/util/*.cc') \
 FILES += glob.glob('openfst-1.6.3/src/lib/*.cc')
 
 FILES = [
-    fn for fn in FILES if not (fn.endswith('main.cc') or fn.endswith('test.cc')
-                               or fn.endswith('unittest.cc'))
+    fn for fn in FILES
+    if not (fn.endswith('main.cc') or fn.endswith('test.cc') or fn.endswith(
+        'unittest.cc'))
 ]
 
 LIBS = ['stdc++']
diff --git a/examples/aishell/s0/conf/augmentation.json b/examples/aishell/s0/conf/augmentation.json
index 39afe4e6..ac8a1c53 100644
--- a/examples/aishell/s0/conf/augmentation.json
+++ b/examples/aishell/s0/conf/augmentation.json
@@ -33,4 +33,4 @@
     },
     "prob": 1.0
   }
-]
\ No newline at end of file
+]
diff --git a/examples/thchs30/a0/local/data.sh b/examples/thchs30/a0/local/data.sh
index 169367ac..8614a041 100644
--- a/examples/thchs30/a0/local/data.sh
+++ b/examples/thchs30/a0/local/data.sh
@@ -20,27 +20,33 @@ if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
         echo "Prepare THCHS-30 failed. Terminated."
         exit 1
     fi
-    
 fi
 
-# dump manifest to data/
-python3 ${MAIN_ROOT}/utils/dump_manifest.py --manifest-path=data/manifest.train --output-dir=data
-
-# copy files to data/dict to gen word.lexicon
-cp  ${TARGET_DIR}/thchs30/data_thchs30/lm_word/lexicon.txt data/dict/lm_word_lexicon_1
-cp  ${TARGET_DIR}/thchs30/resource/dict/lexicon.txt data/dict/lm_word_lexicon_2
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    # dump manifest to data/
+    python3 ${MAIN_ROOT}/utils/dump_manifest.py --manifest-path=data/manifest.train --output-dir=data
+fi
 
-# copy phone.lexicon to data/dict
-cp  ${TARGET_DIR}/thchs30/data_thchs30/lm_phone/lexicon.txt data/dict/phone.lexicon
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    # copy files to data/dict to gen word.lexicon
+    cp  ${TARGET_DIR}/thchs30/data_thchs30/lm_word/lexicon.txt data/dict/lm_word_lexicon_1
+    cp  ${TARGET_DIR}/thchs30/resource/dict/lexicon.txt data/dict/lm_word_lexicon_2
+    # copy phone.lexicon to data/dict
+    cp  ${TARGET_DIR}/thchs30/data_thchs30/lm_phone/lexicon.txt data/dict/phone.lexicon
+fi
 
-# gen word.lexicon
-python local/gen_word2phone.py  --root-dir=data/dict --output-dir=data/dict
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+    # gen word.lexicon
+    python local/gen_word2phone.py  --lexicon-files="data/dict/lm_word_lexicon_1 data/dict/lm_word_lexicon_2" --output-path=data/dict/word.lexicon
+fi
 
-# reorganize dataset for MFA
-if [ ! -d $EXP_DIR/thchs30_corpus ]; then
-    echo "reorganizing thchs30 corpus..."
-    python local/reorganize_thchs30.py --root-dir=data --output-dir=data/thchs30_corpus --script-type=$LEXICON_NAME
-    echo "reorganization done."
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+    # reorganize dataset for MFA; skip when the output corpus dir already exists
+    if [ ! -d data/thchs30_corpus ]; then
+        echo "reorganizing thchs30 corpus..."
+        python local/reorganize_thchs30.py --root-dir=data --output-dir=data/thchs30_corpus --script-type=$LEXICON_NAME
+        echo "reorganization done."
+    fi
 fi
 
 echo "THCHS-30  data preparation done."
diff --git a/examples/thchs30/a0/local/gen_word2phone.py b/examples/thchs30/a0/local/gen_word2phone.py
index cd584fcd..9bc0249b 100644
--- a/examples/thchs30/a0/local/gen_word2phone.py
+++ b/examples/thchs30/a0/local/gen_word2phone.py
@@ -18,6 +18,7 @@ file2: THCHS-30/resource/dict/lexicon.txt
 import argparse
 from collections import defaultdict
 from pathlib import Path
+from typing import List
 from typing import Union
 
 # key: (cn, ('ee', 'er4'))，value: count
@@ -34,7 +35,7 @@ def is_Chinese(ch):
     return False
 
 
-def proc_line(line):
+def proc_line(line: str):
     line = line.strip()
     if is_Chinese(line[0]):
         line_list = line.split()
@@ -49,20 +50,25 @@ def proc_line(line):
                 cn_phones_counter[(cn, phones)] += 1
 
 
-def gen_lexicon(root_dir: Union[str, Path], output_dir: Union[str, Path]):
-    root_dir = Path(root_dir).expanduser()
-    output_dir = Path(output_dir).expanduser()
-    output_dir.mkdir(parents=True, exist_ok=True)
-    file1 = root_dir / "lm_word_lexicon_1"
-    file2 = root_dir / "lm_word_lexicon_2"
-    write_file = output_dir / "word.lexicon"
+"""
+example lines of output
+the first column is a Chinese character
+the second is the probability of this pronunciation
+and the rest are the phones of this pronunciation
+一 0.22 ii i1
+一 0.45 ii i4
+一 0.32 ii i2
+一 0.01 ii i5
+"""
+
+
+def gen_lexicon(lexicon_files: List[Union[str, Path]],
+                output_path: Union[str, Path]):
+    for file_path in lexicon_files:  # accumulate counts from every lexicon
+        with open(file_path, "r", encoding="utf-8") as f1:
+            for line in f1:
+                proc_line(line)
 
-    with open(file1, "r") as f1:
-        for line in f1:
-            proc_line(line)
-    with open(file2, "r") as f2:
-        for line in f2:
-            proc_line(line)
     for key in cn_phones_counter:
         cn = key[0]
         cn_counter[cn].append((key[1], cn_phones_counter[key]))
@@ -75,7 +81,8 @@ def gen_lexicon(root_dir: Union[str, Path], output_dir: Union[str, Path]):
             p = round(p, 2)
             if p > 0:
                 cn_counter_p[key].append((item[0], p))
-    with open(write_file, "w") as wf:
+
+    with open(output_path, "w", encoding="utf-8") as wf:
         for key in cn_counter_p:
             phone_p_list = cn_counter_p[key]
             for item in phone_p_list:
@@ -87,8 +94,21 @@ if __name__ == "__main__":
     parser = argparse.ArgumentParser(
         description="Gen Chinese characters to phone lexicon for THCHS-30 dataset"
     )
+    # A line of word_lexicon:
+    # 一丁点 ii i4 d ing1 d ian3
+    # the first field is the word and the rest are its phones; the number of phones is twice the number of characters in the word
+    parser.add_argument(
+        "--lexicon-files",
+        type=str,
+        default="data/dict/lm_word_lexicon_1 data/dict/lm_word_lexicon_2",
+        help="lm_word_lexicon files")
     parser.add_argument(
-        "--root-dir", type=str, help="dir to thchs30 lm_word_lexicons")
-    parser.add_argument("--output-dir", type=str, help="path to save outputs")
+        "--output-path",
+        type=str,
+        default="data/dict/word.lexicon",
+        help="path to save output word2phone lexicon")
     args = parser.parse_args()
-    gen_lexicon(args.root_dir, args.output_dir)
+    lexicon_files = [Path(p).expanduser() for p in args.lexicon_files.split()]
+    output_path = Path(args.output_path).expanduser()
+    output_path.parent.mkdir(parents=True, exist_ok=True)  # gen_lexicon no longer creates it
+    gen_lexicon(lexicon_files, output_path)
diff --git a/examples/thchs30/a0/local/reorganize_thchs30.py b/examples/thchs30/a0/local/reorganize_thchs30.py
index 9df6bc6a..c7c6248b 100644
--- a/examples/thchs30/a0/local/reorganize_thchs30.py
+++ b/examples/thchs30/a0/local/reorganize_thchs30.py
@@ -58,8 +58,6 @@ def write_lab(root_dir: Union[str, Path],
 def reorganize_thchs30(root_dir: Union[str, Path],
                        output_dir: Union[str, Path]=None,
                        script_type='phone'):
-    root_dir = Path(root_dir).expanduser()
-    output_dir = Path(output_dir).expanduser()
     output_dir.mkdir(parents=True, exist_ok=True)
     link_wav(root_dir, output_dir)
     write_lab(root_dir, output_dir, script_type)
@@ -72,12 +70,15 @@ if __name__ == "__main__":
     parser.add_argument(
         "--output-dir",
         type=str,
-        help="path to save outputs(audio and transcriptions)")
+        help="path to save outputs (audio and transcriptions)")
 
     parser.add_argument(
         "--script-type",
         type=str,
         default="phone",
         help="type of lab ('word'/'syllable'/'phone')")
+
     args = parser.parse_args()
-    reorganize_thchs30(args.root_dir, args.output_dir, args.script_type)
+    root_dir = Path(args.root_dir).expanduser()
+    output_dir = Path(args.output_dir).expanduser()
+    reorganize_thchs30(root_dir, output_dir, args.script_type)
diff --git a/examples/thchs30/a0/run.sh b/examples/thchs30/a0/run.sh
index 53f96b37..5081b612 100755
--- a/examples/thchs30/a0/run.sh
+++ b/examples/thchs30/a0/run.sh
@@ -14,14 +14,17 @@ source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
 # gen lexicon relink gen dump
 if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
     # prepare data
-    bash ./local/data.sh $LEXICON_NAME|| exit -1
+    echo "Start prepare thchs30 data for MFA ..."
+    bash ./local/data.sh $LEXICON_NAME || exit -1
 fi
 
-# run MFA
-if [ ! -d "$EXP_DIR/thchs30_alignment" ]; then
-    echo "Start MFA training..."
-    mfa_train_and_align data/thchs30_corpus data/dict/$LEXICON_NAME.lexicon $EXP_DIR/thchs30_alignment -o $EXP_DIR/thchs30_model --clean --verbose --temp_directory exp/.mfa_train_and_align --num_jobs $NUM_JOBS
-    echo "training done! \nresults: $EXP_DIR/thchs30_alignment \nmodel: $EXP_DIR/thchs30_model\n"
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    # run MFA (skipped when the alignment dir already exists)
+    if [ ! -d "$EXP_DIR/thchs30_alignment" ]; then
+        echo "Start MFA training ..."
+        mfa_train_and_align data/thchs30_corpus data/dict/$LEXICON_NAME.lexicon $EXP_DIR/thchs30_alignment -o $EXP_DIR/thchs30_model --clean --verbose --temp_directory exp/.mfa_train_and_align --num_jobs $NUM_JOBS
+        printf "MFA training done!\nresults: %s\nmodel: %s\n\n" "$EXP_DIR/thchs30_alignment" "$EXP_DIR/thchs30_model"
+    fi
 fi
 
 
diff --git a/tools/extras/install_mfa.sh b/tools/extras/install_mfa.sh
index b0a4cf99..ae126fa6 100755
--- a/tools/extras/install_mfa.sh
+++ b/tools/extras/install_mfa.sh
@@ -4,7 +4,7 @@
 
 test -d Montreal-Forced-Aligner || git clone https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner.git
 
-pushd Montreal-Forced-Aligner && git checkout v2.0.0a7 &&  python setup.py install
+(cd Montreal-Forced-Aligner && python setup.py install)  # subshell: cwd is restored even if install fails
 
 test -d kaldi || { echo "need install kaldi first"; exit 1;}