pull/781/head
TianYuan 3 years ago
parent 9ac6d65a2a
commit 2c75c923b9

@ -84,8 +84,9 @@ FILES = glob.glob('kenlm/util/*.cc') \
FILES += glob.glob('openfst-1.6.3/src/lib/*.cc') FILES += glob.glob('openfst-1.6.3/src/lib/*.cc')
FILES = [ FILES = [
fn for fn in FILES if not (fn.endswith('main.cc') or fn.endswith('test.cc') fn for fn in FILES
or fn.endswith('unittest.cc')) if not (fn.endswith('main.cc') or fn.endswith('test.cc') or fn.endswith(
'unittest.cc'))
] ]
LIBS = ['stdc++'] LIBS = ['stdc++']

@ -33,4 +33,4 @@
}, },
"prob": 1.0 "prob": 1.0
} }
] ]

@ -20,27 +20,33 @@ if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
echo "Prepare THCHS-30 failed. Terminated." echo "Prepare THCHS-30 failed. Terminated."
exit 1 exit 1
fi fi
fi fi
# dump manifest to data/ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
python3 ${MAIN_ROOT}/utils/dump_manifest.py --manifest-path=data/manifest.train --output-dir=data # dump manifest to data/
python3 ${MAIN_ROOT}/utils/dump_manifest.py --manifest-path=data/manifest.train --output-dir=data
# copy files to data/dict to gen word.lexicon fi
cp ${TARGET_DIR}/thchs30/data_thchs30/lm_word/lexicon.txt data/dict/lm_word_lexicon_1
cp ${TARGET_DIR}/thchs30/resource/dict/lexicon.txt data/dict/lm_word_lexicon_2
# copy phone.lexicon to data/dict if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
cp ${TARGET_DIR}/thchs30/data_thchs30/lm_phone/lexicon.txt data/dict/phone.lexicon # copy files to data/dict to gen word.lexicon
cp ${TARGET_DIR}/thchs30/data_thchs30/lm_word/lexicon.txt data/dict/lm_word_lexicon_1
cp ${TARGET_DIR}/thchs30/resource/dict/lexicon.txt data/dict/lm_word_lexicon_2
# copy phone.lexicon to data/dict
cp ${TARGET_DIR}/thchs30/data_thchs30/lm_phone/lexicon.txt data/dict/phone.lexicon
fi
# gen word.lexicon if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
python local/gen_word2phone.py --root-dir=data/dict --output-dir=data/dict # gen word.lexicon
python local/gen_word2phone.py --lexicon-files="data/dict/lm_word_lexicon_1 data/dict/lm_word_lexicon_2" --output-path=data/dict/word.lexicon
fi
# reorganize dataset for MFA if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
if [ ! -d $EXP_DIR/thchs30_corpus ]; then # reorganize dataset for MFA
echo "reorganizing thchs30 corpus..." if [ ! -d $EXP_DIR/thchs30_corpus ]; then
python local/reorganize_thchs30.py --root-dir=data --output-dir=data/thchs30_corpus --script-type=$LEXICON_NAME echo "reorganizing thchs30 corpus..."
echo "reorganization done." python local/reorganize_thchs30.py --root-dir=data --output-dir=data/thchs30_corpus --script-type=$LEXICON_NAME
echo "reorganization done."
fi
fi fi
echo "THCHS-30 data preparation done." echo "THCHS-30 data preparation done."

@ -18,6 +18,7 @@ file2: THCHS-30/resource/dict/lexicon.txt
import argparse import argparse
from collections import defaultdict from collections import defaultdict
from pathlib import Path from pathlib import Path
from typing import List
from typing import Union from typing import Union
# key: (cn, ('ee', 'er4'))value: count # key: (cn, ('ee', 'er4'))value: count
@ -34,7 +35,7 @@ def is_Chinese(ch):
return False return False
def proc_line(line): def proc_line(line: str):
line = line.strip() line = line.strip()
if is_Chinese(line[0]): if is_Chinese(line[0]):
line_list = line.split() line_list = line.split()
@ -49,20 +50,25 @@ def proc_line(line):
cn_phones_counter[(cn, phones)] += 1 cn_phones_counter[(cn, phones)] += 1
def gen_lexicon(root_dir: Union[str, Path], output_dir: Union[str, Path]): """
root_dir = Path(root_dir).expanduser() example lines of output
output_dir = Path(output_dir).expanduser() the first column is a Chinese character
output_dir.mkdir(parents=True, exist_ok=True) the second is the probability of this pronunciation
file1 = root_dir / "lm_word_lexicon_1" and the rest are the phones of this pronunciation
file2 = root_dir / "lm_word_lexicon_2" 0.22 ii i1
write_file = output_dir / "word.lexicon" 0.45 ii i4
0.32 ii i2
0.01 ii i5
"""
def gen_lexicon(lexicon_files: List[Union[str, Path]],
output_path: Union[str, Path]):
for file_path in lexicon_files:
with open(file_path, "r") as f1:
for line in f1:
proc_line(line)
with open(file1, "r") as f1:
for line in f1:
proc_line(line)
with open(file2, "r") as f2:
for line in f2:
proc_line(line)
for key in cn_phones_counter: for key in cn_phones_counter:
cn = key[0] cn = key[0]
cn_counter[cn].append((key[1], cn_phones_counter[key])) cn_counter[cn].append((key[1], cn_phones_counter[key]))
@ -75,7 +81,8 @@ def gen_lexicon(root_dir: Union[str, Path], output_dir: Union[str, Path]):
p = round(p, 2) p = round(p, 2)
if p > 0: if p > 0:
cn_counter_p[key].append((item[0], p)) cn_counter_p[key].append((item[0], p))
with open(write_file, "w") as wf:
with open(output_path, "w") as wf:
for key in cn_counter_p: for key in cn_counter_p:
phone_p_list = cn_counter_p[key] phone_p_list = cn_counter_p[key]
for item in phone_p_list: for item in phone_p_list:
@ -87,8 +94,21 @@ if __name__ == "__main__":
parser = argparse.ArgumentParser( parser = argparse.ArgumentParser(
description="Gen Chinese characters to phone lexicon for THCHS-30 dataset" description="Gen Chinese characters to phone lexicon for THCHS-30 dataset"
) )
# A line of word_lexicon:
# 一丁点 ii i4 d ing1 d ian3
# the first field is the word and the rest are its phones; the number of phones is twice the number of characters in the word
parser.add_argument(
"--lexicon-files",
type=str,
default="data/dict/lm_word_lexicon_1 data/dict/lm_word_lexicon_2",
help="lm_word_lexicon files")
parser.add_argument( parser.add_argument(
"--root-dir", type=str, help="dir to thchs30 lm_word_lexicons") "--output-path",
parser.add_argument("--output-dir", type=str, help="path to save outputs") type=str,
default="data/dict/word.lexicon",
help="path to save output word2phone lexicon")
args = parser.parse_args() args = parser.parse_args()
gen_lexicon(args.root_dir, args.output_dir) lexicon_files = args.lexicon_files.split(" ")
output_path = Path(args.output_path).expanduser()
gen_lexicon(lexicon_files, output_path)

@ -58,8 +58,6 @@ def write_lab(root_dir: Union[str, Path],
def reorganize_thchs30(root_dir: Union[str, Path], def reorganize_thchs30(root_dir: Union[str, Path],
output_dir: Union[str, Path]=None, output_dir: Union[str, Path]=None,
script_type='phone'): script_type='phone'):
root_dir = Path(root_dir).expanduser()
output_dir = Path(output_dir).expanduser()
output_dir.mkdir(parents=True, exist_ok=True) output_dir.mkdir(parents=True, exist_ok=True)
link_wav(root_dir, output_dir) link_wav(root_dir, output_dir)
write_lab(root_dir, output_dir, script_type) write_lab(root_dir, output_dir, script_type)
@ -72,12 +70,15 @@ if __name__ == "__main__":
parser.add_argument( parser.add_argument(
"--output-dir", "--output-dir",
type=str, type=str,
help="path to save outputs(audio and transcriptions)") help="path to save outputs (audio and transcriptions)")
parser.add_argument( parser.add_argument(
"--script-type", "--script-type",
type=str, type=str,
default="phone", default="phone",
help="type of lab ('word'/'syllable'/'phone')") help="type of lab ('word'/'syllable'/'phone')")
args = parser.parse_args() args = parser.parse_args()
reorganize_thchs30(args.root_dir, args.output_dir, args.script_type) root_dir = Path(args.root_dir).expanduser()
output_dir = Path(args.output_dir).expanduser()
reorganize_thchs30(root_dir, output_dir, args.script_type)

@ -14,14 +14,17 @@ source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
# gen lexicon relink gen dump # gen lexicon relink gen dump
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# prepare data # prepare data
bash ./local/data.sh $LEXICON_NAME|| exit -1 echo "Start prepare thchs30 data for MFA ..."
bash ./local/data.sh $LEXICON_NAME || exit -1
fi fi
# run MFA if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
if [ ! -d "$EXP_DIR/thchs30_alignment" ]; then # run MFA
echo "Start MFA training..." if [ ! -d "$EXP_DIR/thchs30_alignment" ]; then
mfa_train_and_align data/thchs30_corpus data/dict/$LEXICON_NAME.lexicon $EXP_DIR/thchs30_alignment -o $EXP_DIR/thchs30_model --clean --verbose --temp_directory exp/.mfa_train_and_align --num_jobs $NUM_JOBS echo "Start MFA training ..."
echo "training done! \nresults: $EXP_DIR/thchs30_alignment \nmodel: $EXP_DIR/thchs30_model\n" mfa_train_and_align data/thchs30_corpus data/dict/$LEXICON_NAME.lexicon $EXP_DIR/thchs30_alignment -o $EXP_DIR/thchs30_model --clean --verbose --temp_directory exp/.mfa_train_and_align --num_jobs $NUM_JOBS
echo "MFA training done! \nresults: $EXP_DIR/thchs30_alignment \nmodel: $EXP_DIR/thchs30_model\n"
fi
fi fi

@ -4,7 +4,7 @@
test -d Montreal-Forced-Aligner || git clone https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner.git test -d Montreal-Forced-Aligner || git clone https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner.git
pushd Montreal-Forced-Aligner && git checkout v2.0.0a7 && python setup.py install pushd Montreal-Forced-Aligner && python setup.py install && popd
test -d kaldi || { echo "need install kaldi first"; exit 1;} test -d kaldi || { echo "need install kaldi first"; exit 1;}

Loading…
Cancel
Save