add MFA example for THCHS30

pull/698/head
TianYuan 3 years ago
parent 1c9c122b3b
commit 7d4eff2b86

@@ -84,8 +84,9 @@ FILES = glob.glob('kenlm/util/*.cc') \
 FILES += glob.glob('openfst-1.6.3/src/lib/*.cc')
 FILES = [
-    fn for fn in FILES if not (fn.endswith('main.cc') or fn.endswith('test.cc')
-                               or fn.endswith('unittest.cc'))
+    fn for fn in FILES
+    if not (fn.endswith('main.cc') or fn.endswith('test.cc') or fn.endswith(
+        'unittest.cc'))
 ]
 LIBS = ['stdc++']
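For reference, the reformatted comprehension above filters exactly the same files as before; a quick sanity check with hypothetical file names:

FILES = ['kenlm/util/a.cc', 'kenlm/util/b_main.cc', 'kenlm/util/c_unittest.cc']
kept = [
    fn for fn in FILES
    if not (fn.endswith('main.cc') or fn.endswith('test.cc') or fn.endswith(
        'unittest.cc'))
]
assert kept == ['kenlm/util/a.cc']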

@@ -130,7 +130,7 @@ def create_manifest(data_dir, manifest_path_prefix):
                 ensure_ascii=False))
         total_sec += duration
-        total_text += len(text)
+        total_text += len(word_text)
         total_num += 1
     manifest_path = manifest_path_prefix + '.' + dtype

@@ -0,0 +1,2 @@
This is the MFA example for the THCHS-30 dataset.
cd into a0 and run run.sh to get started.

@@ -0,0 +1,26 @@
#!/usr/bin/env bash
stage=-1
stop_stage=100
source ${MAIN_ROOT}/utils/parse_options.sh
mkdir -p data
TARGET_DIR=${MAIN_ROOT}/examples/dataset
mkdir -p ${TARGET_DIR}
if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
# download data, generate manifests
python3 ${TARGET_DIR}/thchs30/thchs30.py \
--manifest_prefix="data/manifest" \
--target_dir="${TARGET_DIR}/thchs30"
if [ $? -ne 0 ]; then
echo "Prepare THCHS-30 failed. Terminated."
exit 1
fi
fi
echo "THCHS-30 data preparation done."
exit 0
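Each line of the manifests generated above is a single JSON object. A minimal sketch of reading one back; 'utt' and 'feat' are the fields the MFA scripts below rely on, and the concrete values here are hypothetical:

import json

# One hypothetical manifest line; 'phone'/'syllable'/'text' hold the
# transcripts that dump_lab() writes out as .lab files.
line = '{"utt": "A2_0", "feat": "/path/to/A2_0.wav", "phone": "..."}'
meta = json.loads(line)
print(meta["utt"], meta["feat"])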

@@ -0,0 +1,96 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Gen Chinese characters to THCHS30-30 phone lexicon using THCHS30-30's lexicon
file1: THCHS-30/data_thchs30/lm_word/lexicon.txt
file2: THCHS-30/resource/dict/lexicon.txt
"""
import argparse
from collections import defaultdict
from pathlib import Path
from typing import Union
# key: (cn, ('ee', 'er4')), value: count
cn_phones_counter = defaultdict(int)
# key: cn, value: list of (phones, num)
cn_counter = defaultdict(list)
# key: cn, value: list of (phones, probabilities)
cn_counter_p = defaultdict(list)
def is_Chinese(ch):
if '\u4e00' <= ch <= '\u9fff':
return True
return False
def proc_line(line):
line = line.strip()
# skip blank lines and entries that do not start with a Chinese character
if line and is_Chinese(line[0]):
line_list = line.split()
cn_list = line_list[0]
phone_list = line_list[1:]
if len(cn_list) == len(phone_list) / 2:
new_phone_list = [(phone_list[i], phone_list[i + 1])
for i in range(0, len(phone_list), 2)]
assert len(cn_list) == len(new_phone_list)
for idx, cn in enumerate(cn_list):
phones = new_phone_list[idx]
cn_phones_counter[(cn, phones)] += 1
def gen_lexicon(root_dir: Union[str, Path], output_dir: Union[str, Path]):
root_dir = Path(root_dir).expanduser()
output_dir = Path(output_dir).expanduser()
output_dir.mkdir(parents=True, exist_ok=True)
file1 = root_dir / "data_thchs30/lm_word/lexicon.txt"
file2 = root_dir / "resource/dict/lexicon.txt"
write_file = output_dir / "thchs30_cn2phone"
with open(file1, "r", encoding="utf-8") as f1:
for line in f1:
proc_line(line)
with open(file2, "r", encoding="utf-8") as f2:
for line in f2:
proc_line(line)
for key in cn_phones_counter:
cn = key[0]
cn_counter[cn].append((key[1], cn_phones_counter[key]))
for key in cn_counter:
phone_count_list = cn_counter[key]
count_sum = sum([x[1] for x in phone_count_list])
for item in phone_count_list:
p = item[1] / count_sum
p = round(p, 2)
if p > 0:
cn_counter_p[key].append((item[0], p))
with open(write_file, "w", encoding="utf-8") as wf:
for key in cn_counter_p:
phone_p_list = cn_counter_p[key]
for item in phone_p_list:
phones, p = item
wf.write(key + " " + str(p) + " " + " ".join(phones) + "\n")
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Gen Chinese characters to phone lexicon for THCHS-30 dataset"
)
parser.add_argument("--root-dir", type=str, help="path to thchs30 dataset.")
parser.add_argument(
"--output-dir",
type=str,
help="path to save outputs(audio and transcriptions)")
args = parser.parse_args()
gen_lexicon(args.root_dir, args.output_dir)
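A minimal worked example of the counting and probability logic above; the lexicon lines are hypothetical, but the format matches THCHS-30's word lexicons (a word followed by initial/final phone pairs):

from collections import defaultdict

# Three hypothetical lexicon lines: 好 occurs with two pronunciations.
lines = ["你好 n i3 h ao3", "好人 h ao3 r en2", "好恶 h ao4 uu u4"]
counter = defaultdict(int)
for line in lines:
    word, *phones = line.split()
    pairs = [(phones[i], phones[i + 1]) for i in range(0, len(phones), 2)]
    for cn, pron in zip(word, pairs):
        counter[(cn, pron)] += 1
# 好 was seen twice as ('h', 'ao3') and once as ('h', 'ao4'), so the
# generated lexicon gets two weighted entries:
#   好 0.67 h ao3
#   好 0.33 h ao4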

@@ -0,0 +1,112 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Recorganize THCHS-30 for MFA
read manifest.train from root-dir
Link *.wav to output-dir
dump *.lab from manifest.train, such as: textsyllable and phone
Manifest file is a json-format file with each line containing the
meta data (i.e. audio filepath, transcript and audio duration)
"""
import argparse
import os
from pathlib import Path
from typing import Union
from deepspeech.frontend.utility import read_manifest
def link_wav(root_dir: Union[str, Path], output_dir: Union[str, Path]):
manifest_path = root_dir / "manifest.train"
manifest_jsons = read_manifest(manifest_path)
for line_json in manifest_jsons:
wav_path = line_json['feat']
wav_name = wav_path.split("/")[-1]
new_wav_path = output_dir / wav_name
os.symlink(wav_path, new_wav_path)
def link_lexicon(root_dir: Union[str, Path],
output_dir: Union[str, Path],
script_type='phone'):
manifest_path = root_dir / "manifest.train"
manifest_jsons = read_manifest(manifest_path)
line_json = manifest_jsons[0]
wav_path = line_json['feat']
if script_type == 'phone':
# find lm_phone/lexicon.txt in THCHS-30 (two levels above the wav files)
grader_father = os.path.abspath(
os.path.dirname(wav_path) + os.path.sep + "..")
grader_father = Path(grader_father).expanduser()
lexicon_name = "lexicon.txt"
lexicon_father_dir = "lm_phone"
lexicon_path = grader_father / lexicon_father_dir / lexicon_name
elif script_type == 'syllable':
# find thchs30_pinyin2phone in dir of this py file
py_dir_path = os.path.split(os.path.realpath(__file__))[0]
py_dir_path = Path(py_dir_path).expanduser()
lexicon_path = py_dir_path / "thchs30_pinyin2phone"
else:
# script_type == 'text'
# find thchs30_cn2phone in dir of this py file
py_dir_path = os.path.split(os.path.realpath(__file__))[0]
py_dir_path = Path(py_dir_path).expanduser()
lexicon_path = py_dir_path / "thchs30_cn2phone"
new_lexicon_name = script_type + ".lexicon"
new_lexicon_path = os.path.dirname(output_dir) + "/" + new_lexicon_name
os.symlink(lexicon_path, new_lexicon_path)
def dump_lab(root_dir: Union[str, Path],
output_dir: Union[str, Path],
script_type='phone'):
# script_type can be one of {'text', 'syllable', 'phone'}
manifest_path = root_dir / "manifest.train"
manifest_jsons = read_manifest(manifest_path)
for line_json in manifest_jsons:
utt_id = line_json['utt']
transcript_name = utt_id + ".lab"
transcript_path = output_dir / transcript_name
with open(transcript_path, 'wt', encoding='utf-8') as wf:
wf.write(line_json[script_type] + "\n")
def reorganize_thchs30(root_dir: Union[str, Path],
output_dir: Union[str, Path]=None,
script_type='phone'):
root_dir = Path(root_dir).expanduser()
output_dir = Path(output_dir).expanduser()
output_dir.mkdir(parents=True, exist_ok=True)
link_wav(root_dir, output_dir)
dump_lab(root_dir, output_dir, script_type)
link_lexicon(root_dir, output_dir, script_type)
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Reorganize THCHS-30 dataset for MFA")
parser.add_argument("--root-dir", type=str, help="path to thchs30 dataset.")
parser.add_argument(
"--output-dir",
type=str,
help="path to save outputs(audio and transcriptions)")
parser.add_argument(
"--script-type",
type=str,
default="phone",
help="type of lab (text'/'syllable'/'phone')")
args = parser.parse_args()
reorganize_thchs30(args.root_dir, args.output_dir, args.script_type)
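A minimal usage sketch of the function above; the paths mirror how run.sh invokes the script below, and the module name is an assumption:

from recorganize_thchs30 import reorganize_thchs30  # module name assumed

reorganize_thchs30(
    root_dir="./data",                # directory containing manifest.train
    output_dir="exp/thchs30_corpus",  # corpus directory consumed by MFA
    script_type="phone")              # one of 'text', 'syllable', 'phone'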

File diff suppressed because it is too large

File diff suppressed because it is too large

@@ -0,0 +1,14 @@
export MAIN_ROOT=${PWD}/../../../
export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
export LC_ALL=C
# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib/
MODEL=deepspeech2
export BIN_DIR=${MAIN_ROOT}/deepspeech/exps/${MODEL}/bin

@@ -0,0 +1,40 @@
#!/bin/bash
set -e
source path.sh
stage=0
stop_stage=100
EXP_DIR=exp
# LEXICON_NAME in {'phone', 'syllable', 'text'}
LEXICON_NAME='phone'
# use half of the machine's logical CPU cores as the number of MFA jobs
NUM_JOBS=`grep 'processor' /proc/cpuinfo | sort -u | wc -l`
NUM_JOBS=$((NUM_JOBS/2))
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
# download dataset, unzip and generate manifest
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# prepare data
bash ./local/data.sh || exit 1
fi
# reorganize dataset for MFA
if [ ! -d $EXP_DIR/thchs30_corpus ]; then
echo "reorganizing thchs30 corpus..."
python local/recorganize_thchs30.py --root-dir=./data --output-dir=$EXP_DIR/thchs30_corpus --script-type=$LEXICON_NAME
echo "reorganization done."
fi
# MFA is in tools
export PATH="${MAIN_ROOT}/tools/montreal-forced-aligner/bin"
if [ ! -d "$EXP_DIR/thchs30_alignment" ]; then
echo "Start MFA training..."
mfa_train_and_align $EXP_DIR/thchs30_corpus "$EXP_DIR/$LEXICON_NAME.lexicon" $EXP_DIR/thchs30_alignment -o $EXP_DIR/thchs30_model --clean --verbose --temp_directory exp/.mfa_train_and_align --num_jobs $NUM_JOBS
echo "training done! \nresults: $EXP_DIR/thchs30_alignment \nmodel: $EXP_DIR/thchs30_model\n"
fi
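MFA writes the alignments as Praat TextGrid files under $EXP_DIR/thchs30_alignment. A minimal sketch for inspecting one of them, assuming the third-party textgrid Python package is installed; the exact output file path is hypothetical:

import textgrid  # pip install textgrid

tg = textgrid.TextGrid.fromFile("exp/thchs30_alignment/A2/A2_0.TextGrid")
for tier in tg:  # typically one word tier and one phone tier
    for interval in tier:
        print(tier.name, interval.minTime, interval.maxTime, interval.mark)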

@@ -1,7 +1,8 @@
 SHELL:= /bin/bash
 PYTHON:= python3.7
 .PHONY: all clean
-all: virtualenv kenlm.done sox.done soxbindings.done
+all: virtualenv kenlm.done sox.done soxbindings.done mfa.done
 virtualenv:
 	test -d venv || virtualenv -p $(PYTHON) venv
@@ -19,7 +20,7 @@ kenlm.done:
 	apt-get install -y gcc-5 g++-5 && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-5 50 && update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-5 50
 	test -d kenlm || wget -O - https://kheafield.com/code/kenlm.tar.gz | tar xz
 	mkdir -p kenlm/build && cd kenlm/build && cmake .. && make -j4 && make install
-	cd kenlm && python setup.py install
+	source venv/bin/activate; cd kenlm && python setup.py install
 	touch kenlm.done
 sox.done:
@@ -32,4 +33,9 @@ sox.done:
 soxbindings.done:
 	test -d soxbindings || git clone https://github.com/pseeth/soxbindings.git
 	source venv/bin/activate; cd soxbindings && python setup.py install
-	touch soxbindings.done
+	touch soxbindings.done
+mfa.done:
+	test -d montreal-forced-aligner || wget https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner/releases/download/v1.0.1/montreal-forced-aligner_linux.tar.gz
+	test -d montreal-forced-aligner || tar xvf montreal-forced-aligner_linux.tar.gz
+	touch mfa.done