pull/781/head
TianYuan 3 years ago
parent 9ac6d65a2a
commit 2c75c923b9

@ -84,8 +84,9 @@ FILES = glob.glob('kenlm/util/*.cc') \
FILES += glob.glob('openfst-1.6.3/src/lib/*.cc')
FILES = [
fn for fn in FILES if not (fn.endswith('main.cc') or fn.endswith('test.cc')
or fn.endswith('unittest.cc'))
fn for fn in FILES
if not (fn.endswith('main.cc') or fn.endswith('test.cc') or fn.endswith(
'unittest.cc'))
]
LIBS = ['stdc++']

@ -20,27 +20,33 @@ if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
echo "Prepare THCHS-30 failed. Terminated."
exit 1
fi
fi
# dump manifest to data/
python3 ${MAIN_ROOT}/utils/dump_manifest.py --manifest-path=data/manifest.train --output-dir=data
# copy files to data/dict to gen word.lexicon
cp ${TARGET_DIR}/thchs30/data_thchs30/lm_word/lexicon.txt data/dict/lm_word_lexicon_1
cp ${TARGET_DIR}/thchs30/resource/dict/lexicon.txt data/dict/lm_word_lexicon_2
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    # Stage 0: dump the training manifest (data/manifest.train) into data/.
    # MAIN_ROOT is quoted so that a checkout path containing spaces still resolves.
    python3 "${MAIN_ROOT}/utils/dump_manifest.py" --manifest-path=data/manifest.train --output-dir=data
fi
# copy phone.lexicon to data/dict
cp ${TARGET_DIR}/thchs30/data_thchs30/lm_phone/lexicon.txt data/dict/phone.lexicon
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    # Stage 1: collect the THCHS-30 lexicon files under data/dict.
    # cp does not create missing parent directories, so make sure the
    # destination exists before copying into it.
    mkdir -p data/dict
    # the two word-level lexicons are the inputs used to generate word.lexicon
    cp "${TARGET_DIR}/thchs30/data_thchs30/lm_word/lexicon.txt" data/dict/lm_word_lexicon_1
    cp "${TARGET_DIR}/thchs30/resource/dict/lexicon.txt" data/dict/lm_word_lexicon_2
    # the phone-level lexicon is copied unchanged as phone.lexicon
    cp "${TARGET_DIR}/thchs30/data_thchs30/lm_phone/lexicon.txt" data/dict/phone.lexicon
fi
# gen word.lexicon
python local/gen_word2phone.py --root-dir=data/dict --output-dir=data/dict
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# Stage 2: generate data/dict/word.lexicon from the two word-level lexicon
# files; --lexicon-files takes both paths as one space-separated string.
python local/gen_word2phone.py --lexicon-files="data/dict/lm_word_lexicon_1 data/dict/lm_word_lexicon_2" --output-path=data/dict/word.lexicon
fi
# reorganize dataset for MFA
if [ ! -d $EXP_DIR/thchs30_corpus ]; then
echo "reorganizing thchs30 corpus..."
python local/reorganize_thchs30.py --root-dir=data --output-dir=data/thchs30_corpus --script-type=$LEXICON_NAME
echo "reorganization done."
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    # Stage 3: reorganize the dataset into the corpus layout used for MFA.
    # The guard tests the directory that is actually written below
    # (data/thchs30_corpus), so an existing corpus is detected and the
    # reorganization is not repeated on a re-run.
    if [ ! -d data/thchs30_corpus ]; then
        echo "reorganizing thchs30 corpus..."
        python local/reorganize_thchs30.py --root-dir=data --output-dir=data/thchs30_corpus --script-type="$LEXICON_NAME"
        echo "reorganization done."
    fi
fi
echo "THCHS-30 data preparation done."

@ -18,6 +18,7 @@ file2: THCHS-30/resource/dict/lexicon.txt
import argparse
from collections import defaultdict
from pathlib import Path
from typing import List
from typing import Union
# key: (cn, ('ee', 'er4')), value: count
@ -34,7 +35,7 @@ def is_Chinese(ch):
return False
def proc_line(line):
def proc_line(line: str):
line = line.strip()
if is_Chinese(line[0]):
line_list = line.split()
@ -49,20 +50,25 @@ def proc_line(line):
cn_phones_counter[(cn, phones)] += 1
def gen_lexicon(root_dir: Union[str, Path], output_dir: Union[str, Path]):
root_dir = Path(root_dir).expanduser()
output_dir = Path(output_dir).expanduser()
output_dir.mkdir(parents=True, exist_ok=True)
file1 = root_dir / "lm_word_lexicon_1"
file2 = root_dir / "lm_word_lexicon_2"
write_file = output_dir / "word.lexicon"
"""
example lines of output
the first column is a Chinese character
the second is the probability of this pronunciation
and the rest are the phones of this pronunciation
0.22 ii i1
0.45 ii i4
0.32 ii i2
0.01 ii i5
"""
def gen_lexicon(lexicon_files: List[Union[str, Path]],
output_path: Union[str, Path]):
for file_path in lexicon_files:
with open(file_path, "r") as f1:
for line in f1:
proc_line(line)
with open(file1, "r") as f1:
for line in f1:
proc_line(line)
with open(file2, "r") as f2:
for line in f2:
proc_line(line)
for key in cn_phones_counter:
cn = key[0]
cn_counter[cn].append((key[1], cn_phones_counter[key]))
@ -75,7 +81,8 @@ def gen_lexicon(root_dir: Union[str, Path], output_dir: Union[str, Path]):
p = round(p, 2)
if p > 0:
cn_counter_p[key].append((item[0], p))
with open(write_file, "w") as wf:
with open(output_path, "w") as wf:
for key in cn_counter_p:
phone_p_list = cn_counter_p[key]
for item in phone_p_list:
@ -87,8 +94,21 @@ if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Gen Chinese characters to phone lexicon for THCHS-30 dataset"
)
# Example line of a word-level lexicon file:
# 一丁点 ii i4 d ing1 d ian3
# The first field is the word and the remaining fields are its phones; the number of phones is twice the number of characters in the word.
parser.add_argument(
"--lexicon-files",
type=str,
default="data/dict/lm_word_lexicon_1 data/dict/lm_word_lexicon_2",
help="lm_word_lexicon files")
parser.add_argument(
"--root-dir", type=str, help="dir to thchs30 lm_word_lexicons")
parser.add_argument("--output-dir", type=str, help="path to save outputs")
"--output-path",
type=str,
default="data/dict/word.lexicon",
help="path to save output word2phone lexicon")
args = parser.parse_args()
gen_lexicon(args.root_dir, args.output_dir)
lexicon_files = args.lexicon_files.split(" ")
output_path = Path(args.output_path).expanduser()
gen_lexicon(lexicon_files, output_path)

@ -58,8 +58,6 @@ def write_lab(root_dir: Union[str, Path],
def reorganize_thchs30(root_dir: Union[str, Path],
output_dir: Union[str, Path]=None,
script_type='phone'):
root_dir = Path(root_dir).expanduser()
output_dir = Path(output_dir).expanduser()
output_dir.mkdir(parents=True, exist_ok=True)
link_wav(root_dir, output_dir)
write_lab(root_dir, output_dir, script_type)
@ -72,12 +70,15 @@ if __name__ == "__main__":
parser.add_argument(
"--output-dir",
type=str,
help="path to save outputs(audio and transcriptions)")
help="path to save outputs (audio and transcriptions)")
parser.add_argument(
"--script-type",
type=str,
default="phone",
help="type of lab ('word'/'syllable'/'phone')")
args = parser.parse_args()
reorganize_thchs30(args.root_dir, args.output_dir, args.script_type)
root_dir = Path(args.root_dir).expanduser()
output_dir = Path(args.output_dir).expanduser()
reorganize_thchs30(root_dir, output_dir, args.script_type)

@ -14,14 +14,17 @@ source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
# gen lexicon relink gen dump
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# prepare data
bash ./local/data.sh $LEXICON_NAME|| exit -1
echo "Start prepare thchs30 data for MFA ..."
bash ./local/data.sh $LEXICON_NAME || exit -1
fi
# run MFA
if [ ! -d "$EXP_DIR/thchs30_alignment" ]; then
echo "Start MFA training..."
mfa_train_and_align data/thchs30_corpus data/dict/$LEXICON_NAME.lexicon $EXP_DIR/thchs30_alignment -o $EXP_DIR/thchs30_model --clean --verbose --temp_directory exp/.mfa_train_and_align --num_jobs $NUM_JOBS
echo "training done! \nresults: $EXP_DIR/thchs30_alignment \nmodel: $EXP_DIR/thchs30_model\n"
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    # Stage 1: run MFA training and alignment; skipped when the alignment
    # output directory already exists.
    if [ ! -d "$EXP_DIR/thchs30_alignment" ]; then
        echo "Start MFA training ..."
        mfa_train_and_align data/thchs30_corpus "data/dict/$LEXICON_NAME.lexicon" "$EXP_DIR/thchs30_alignment" -o "$EXP_DIR/thchs30_model" --clean --verbose --temp_directory exp/.mfa_train_and_align --num_jobs "$NUM_JOBS"
        # printf expands \n portably; bash's builtin echo prints it literally
        # unless -e is given, which left "\n" in the summary message.
        printf 'MFA training done!\nresults: %s\nmodel: %s\n' "$EXP_DIR/thchs30_alignment" "$EXP_DIR/thchs30_model"
    fi
fi

@ -4,7 +4,7 @@
test -d Montreal-Forced-Aligner || git clone https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner.git
pushd Montreal-Forced-Aligner && git checkout v2.0.0a7 && python setup.py install
pushd Montreal-Forced-Aligner && python setup.py install && popd
test -d kaldi || { echo "need install kaldi first"; exit 1;}

Loading…
Cancel
Save