From 2c75c923b9a64f1a6e5c92babbfc71f693abf1af Mon Sep 17 00:00:00 2001 From: TianYuan Date: Mon, 23 Aug 2021 08:07:35 +0000 Subject: [PATCH] fix_mfa --- deepspeech/decoders/swig/setup.py | 5 +- examples/aishell/s0/conf/augmentation.json | 2 +- examples/thchs30/a0/local/data.sh | 38 +++++++------ examples/thchs30/a0/local/gen_word2phone.py | 56 +++++++++++++------ .../thchs30/a0/local/reorganize_thchs30.py | 9 +-- examples/thchs30/a0/run.sh | 15 +++-- tools/extras/install_mfa.sh | 2 +- 7 files changed, 79 insertions(+), 48 deletions(-) diff --git a/deepspeech/decoders/swig/setup.py b/deepspeech/decoders/swig/setup.py index 86af475a..3da5ce8b 100644 --- a/deepspeech/decoders/swig/setup.py +++ b/deepspeech/decoders/swig/setup.py @@ -84,8 +84,9 @@ FILES = glob.glob('kenlm/util/*.cc') \ FILES += glob.glob('openfst-1.6.3/src/lib/*.cc') FILES = [ - fn for fn in FILES if not (fn.endswith('main.cc') or fn.endswith('test.cc') - or fn.endswith('unittest.cc')) + fn for fn in FILES + if not (fn.endswith('main.cc') or fn.endswith('test.cc') or fn.endswith( + 'unittest.cc')) ] LIBS = ['stdc++'] diff --git a/examples/aishell/s0/conf/augmentation.json b/examples/aishell/s0/conf/augmentation.json index 39afe4e6..ac8a1c53 100644 --- a/examples/aishell/s0/conf/augmentation.json +++ b/examples/aishell/s0/conf/augmentation.json @@ -33,4 +33,4 @@ }, "prob": 1.0 } -] \ No newline at end of file +] diff --git a/examples/thchs30/a0/local/data.sh b/examples/thchs30/a0/local/data.sh index 169367ac..8614a041 100644 --- a/examples/thchs30/a0/local/data.sh +++ b/examples/thchs30/a0/local/data.sh @@ -20,27 +20,33 @@ if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then echo "Prepare THCHS-30 failed. Terminated." exit 1 fi - fi -# dump manifest to data/ -python3 ${MAIN_ROOT}/utils/dump_manifest.py --manifest-path=data/manifest.train --output-dir=data - -# copy files to data/dict to gen word.lexicon -cp ${TARGET_DIR}/thchs30/data_thchs30/lm_word/lexicon.txt data/dict/lm_word_lexicon_1 -cp ${TARGET_DIR}/thchs30/resource/dict/lexicon.txt data/dict/lm_word_lexicon_2 +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + # dump manifest to data/ + python3 ${MAIN_ROOT}/utils/dump_manifest.py --manifest-path=data/manifest.train --output-dir=data +fi -# copy phone.lexicon to data/dict -cp ${TARGET_DIR}/thchs30/data_thchs30/lm_phone/lexicon.txt data/dict/phone.lexicon +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + # copy files to data/dict to gen word.lexicon + cp ${TARGET_DIR}/thchs30/data_thchs30/lm_word/lexicon.txt data/dict/lm_word_lexicon_1 + cp ${TARGET_DIR}/thchs30/resource/dict/lexicon.txt data/dict/lm_word_lexicon_2 + # copy phone.lexicon to data/dict + cp ${TARGET_DIR}/thchs30/data_thchs30/lm_phone/lexicon.txt data/dict/phone.lexicon +fi -# gen word.lexicon -python local/gen_word2phone.py --root-dir=data/dict --output-dir=data/dict +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + # gen word.lexicon + python local/gen_word2phone.py --lexicon-files="data/dict/lm_word_lexicon_1 data/dict/lm_word_lexicon_2" --output-path=data/dict/word.lexicon +fi -# reorganize dataset for MFA -if [ ! -d $EXP_DIR/thchs30_corpus ]; then - echo "reorganizing thchs30 corpus..." - python local/reorganize_thchs30.py --root-dir=data --output-dir=data/thchs30_corpus --script-type=$LEXICON_NAME - echo "reorganization done." +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + # reorganize dataset for MFA + if [ ! -d $EXP_DIR/thchs30_corpus ]; then + echo "reorganizing thchs30 corpus..." + python local/reorganize_thchs30.py --root-dir=data --output-dir=data/thchs30_corpus --script-type=$LEXICON_NAME + echo "reorganization done." + fi fi echo "THCHS-30 data preparation done." diff --git a/examples/thchs30/a0/local/gen_word2phone.py b/examples/thchs30/a0/local/gen_word2phone.py index cd584fcd..9bc0249b 100644 --- a/examples/thchs30/a0/local/gen_word2phone.py +++ b/examples/thchs30/a0/local/gen_word2phone.py @@ -18,6 +18,7 @@ file2: THCHS-30/resource/dict/lexicon.txt import argparse from collections import defaultdict from pathlib import Path +from typing import List from typing import Union # key: (cn, ('ee', 'er4')),value: count @@ -34,7 +35,7 @@ def is_Chinese(ch): return False -def proc_line(line): +def proc_line(line: str): line = line.strip() if is_Chinese(line[0]): line_list = line.split() @@ -49,20 +50,25 @@ def proc_line(line): cn_phones_counter[(cn, phones)] += 1 -def gen_lexicon(root_dir: Union[str, Path], output_dir: Union[str, Path]): - root_dir = Path(root_dir).expanduser() - output_dir = Path(output_dir).expanduser() - output_dir.mkdir(parents=True, exist_ok=True) - file1 = root_dir / "lm_word_lexicon_1" - file2 = root_dir / "lm_word_lexicon_2" - write_file = output_dir / "word.lexicon" +""" +example lines of output +the first column is a Chinese character +the second is the probability of this pronunciation +and the rest are the phones of this pronunciation +一 0.22 ii i1↩ +一 0.45 ii i4↩ +一 0.32 ii i2↩ +一 0.01 ii i5 +""" + + +def gen_lexicon(lexicon_files: List[Union[str, Path]], + output_path: Union[str, Path]): + for file_path in lexicon_files: + with open(file_path, "r") as f1: + for line in f1: + proc_line(line) - with open(file1, "r") as f1: - for line in f1: - proc_line(line) - with open(file2, "r") as f2: - for line in f2: - proc_line(line) for key in cn_phones_counter: cn = key[0] cn_counter[cn].append((key[1], cn_phones_counter[key])) @@ -75,7 +81,8 @@ def gen_lexicon(root_dir: Union[str, Path], output_dir: Union[str, Path]): p = round(p, 2) if p > 0: cn_counter_p[key].append((item[0], p)) - with open(write_file, "w") as wf: + + with open(output_path, "w") as wf: for key in cn_counter_p: phone_p_list = cn_counter_p[key] for item in phone_p_list: @@ -87,8 +94,21 @@ if __name__ == "__main__": parser = argparse.ArgumentParser( description="Gen Chinese characters to phone lexicon for THCHS-30 dataset" ) + # A line of word_lexicon: + # 一丁点 ii i4 d ing1 d ian3 + # the first is word, and the rest are the phones of the word, and the len of phones is twice of the word's len + parser.add_argument( + "--lexicon-files", + type=str, + default="data/dict/lm_word_lexicon_1 data/dict/lm_word_lexicon_2", + help="lm_word_lexicon files") parser.add_argument( - "--root-dir", type=str, help="dir to thchs30 lm_word_lexicons") - parser.add_argument("--output-dir", type=str, help="path to save outputs") + "--output-path", + type=str, + default="data/dict/word.lexicon", + help="path to save output word2phone lexicon") args = parser.parse_args() - gen_lexicon(args.root_dir, args.output_dir) + lexicon_files = args.lexicon_files.split(" ") + output_path = Path(args.output_path).expanduser() + + gen_lexicon(lexicon_files, output_path) diff --git a/examples/thchs30/a0/local/reorganize_thchs30.py b/examples/thchs30/a0/local/reorganize_thchs30.py index 9df6bc6a..c7c6248b 100644 --- a/examples/thchs30/a0/local/reorganize_thchs30.py +++ b/examples/thchs30/a0/local/reorganize_thchs30.py @@ -58,8 +58,6 @@ def write_lab(root_dir: Union[str, Path], def reorganize_thchs30(root_dir: Union[str, Path], output_dir: Union[str, Path]=None, script_type='phone'): - root_dir = Path(root_dir).expanduser() - output_dir = Path(output_dir).expanduser() output_dir.mkdir(parents=True, exist_ok=True) link_wav(root_dir, output_dir) write_lab(root_dir, output_dir, script_type) @@ -72,12 +70,15 @@ if __name__ == "__main__": parser.add_argument( "--output-dir", type=str, - help="path to save outputs(audio and transcriptions)") + help="path to save outputs (audio and transcriptions)") parser.add_argument( "--script-type", type=str, default="phone", help="type of lab ('word'/'syllable'/'phone')") + args = parser.parse_args() - reorganize_thchs30(args.root_dir, args.output_dir, args.script_type) + root_dir = Path(args.root_dir).expanduser() + output_dir = Path(args.output_dir).expanduser() + reorganize_thchs30(root_dir, output_dir, args.script_type) diff --git a/examples/thchs30/a0/run.sh b/examples/thchs30/a0/run.sh index 53f96b37..5081b612 100755 --- a/examples/thchs30/a0/run.sh +++ b/examples/thchs30/a0/run.sh @@ -14,14 +14,17 @@ source ${MAIN_ROOT}/utils/parse_options.sh || exit 1; # gen lexicon relink gen dump if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then # prepare data - bash ./local/data.sh $LEXICON_NAME|| exit -1 + echo "Start prepare thchs30 data for MFA ..." + bash ./local/data.sh $LEXICON_NAME || exit -1 fi -# run MFA -if [ ! -d "$EXP_DIR/thchs30_alignment" ]; then - echo "Start MFA training..." - mfa_train_and_align data/thchs30_corpus data/dict/$LEXICON_NAME.lexicon $EXP_DIR/thchs30_alignment -o $EXP_DIR/thchs30_model --clean --verbose --temp_directory exp/.mfa_train_and_align --num_jobs $NUM_JOBS - echo "training done! \nresults: $EXP_DIR/thchs30_alignment \nmodel: $EXP_DIR/thchs30_model\n" +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + # run MFA + if [ ! -d "$EXP_DIR/thchs30_alignment" ]; then + echo "Start MFA training ..." + mfa_train_and_align data/thchs30_corpus data/dict/$LEXICON_NAME.lexicon $EXP_DIR/thchs30_alignment -o $EXP_DIR/thchs30_model --clean --verbose --temp_directory exp/.mfa_train_and_align --num_jobs $NUM_JOBS + echo "MFA training done! \nresults: $EXP_DIR/thchs30_alignment \nmodel: $EXP_DIR/thchs30_model\n" + fi fi diff --git a/tools/extras/install_mfa.sh b/tools/extras/install_mfa.sh index b0a4cf99..ae126fa6 100755 --- a/tools/extras/install_mfa.sh +++ b/tools/extras/install_mfa.sh @@ -4,7 +4,7 @@ test -d Montreal-Forced-Aligner || git clone https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner.git -pushd Montreal-Forced-Aligner && git checkout v2.0.0a7 && python setup.py install +pushd Montreal-Forced-Aligner && python setup.py install && popd test -d kaldi || { echo "need install kaldi first"; exit 1;}