restructure thchs30/a0

pull/698/head
TianYuan 4 years ago
parent c0ee57d400
commit 00017301c6

@ -1,2 +1,7 @@
this is the example of MFA for thchs30 dataset
cd a0 run run.sh to get start
MFA 对齐所使用的字典
MFA 字典的格式可以参考: https://montreal-forced-aligner.readthedocs.io/en/latest/dictionary.html
phone.lexicon 直接使用的是 THCHS-30/data_thchs30/lm_phone/lexicon.txt
word.lexicon 是一个带概率的字典, 生成规则请参考 local/gen_word2phone.py

@ -8,9 +8,10 @@ source ${MAIN_ROOT}/utils/parse_options.sh
mkdir -p data
TARGET_DIR=${MAIN_ROOT}/examples/dataset
mkdir -p ${TARGET_DIR}
LEXICON_NAME=$1
if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
# download data, generate manifests
if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
python3 ${TARGET_DIR}/thchs30/thchs30.py \
--manifest_prefix="data/manifest" \
--target_dir="${TARGET_DIR}/thchs30"
@ -22,5 +23,25 @@ if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
fi
# dump manifest to data/
python3 ${MAIN_ROOT}/utils/dump_manifest.py --manifest-path=data/manifest.train --output-dir=data
# copy files to data/dict to gen word.lexicon
cp ${TARGET_DIR}/thchs30/data_thchs30/lm_word/lexicon.txt data/dict/lm_word_lexicon_1
cp ${TARGET_DIR}/thchs30/resource/dict/lexicon.txt data/dict/lm_word_lexicon_2
# copy phone.lexicon to data/dict
cp ${TARGET_DIR}/thchs30/data_thchs30/lm_phone/lexicon.txt data/dict/phone.lexicon
# gen word.lexicon
python local/gen_word2phone.py --root-dir=data/dict --output-dir=data/dict
# reorganize dataset for MFA
if [ ! -d $EXP_DIR/thchs30_corpus ]; then
echo "reorganizing thchs30 corpus..."
python local/reorganize_thchs30.py --root-dir=data --output-dir=data/thchs30_corpus --script-type=$LEXICON_NAME
echo "reorganization done."
fi
echo "THCHS-30 data preparation done."
exit 0

@ -53,9 +53,9 @@ def gen_lexicon(root_dir: Union[str, Path], output_dir: Union[str, Path]):
root_dir = Path(root_dir).expanduser()
output_dir = Path(output_dir).expanduser()
output_dir.mkdir(parents=True, exist_ok=True)
file1 = root_dir / "data_thchs30/lm_word/lexicon.txt"
file2 = root_dir / "resource/dict/lexicon.txt"
write_file = output_dir / "thchs30_cn2phone"
file1 = root_dir / "lm_word_lexicon_1"
file2 = root_dir / "lm_word_lexicon_2"
write_file = output_dir / "word.lexicon"
with open(file1, "r") as f1:
for line in f1:
@ -87,10 +87,8 @@ if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Gen Chinese characters to phone lexicon for THCHS-30 dataset"
)
parser.add_argument("--root-dir", type=str, help="path to thchs30 dataset.")
parser.add_argument(
"--output-dir",
type=str,
help="path to save outputs(audio and transcriptions)")
"--root-dir", type=str, help="dir to thchs30 lm_word_lexicons")
parser.add_argument("--output-dir", type=str, help="path to save outputs")
args = parser.parse_args()
gen_lexicon(args.root_dir, args.output_dir)

@ -1,112 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Recorganize THCHS-30 for MFA
read manifest.train from root-dir
Link *.wav to output-dir
dump *.lab from manifest.train, such as: textsyllable and phone
Manifest file is a json-format file with each line containing the
meta data (i.e. audio filepath, transcript and audio duration)
"""
import argparse
import os
from pathlib import Path
from typing import Union
from deepspeech.frontend.utility import read_manifest
def link_wav(root_dir: Union[str, Path], output_dir: Union[str, Path]):
manifest_path = root_dir / "manifest.train"
manifest_jsons = read_manifest(manifest_path)
for line_json in manifest_jsons:
wav_path = line_json['feat']
wav_name = wav_path.split("/")[-1]
new_wav_path = output_dir / wav_name
os.symlink(wav_path, new_wav_path)
def link_lexicon(root_dir: Union[str, Path],
output_dir: Union[str, Path],
script_type='phone'):
manifest_path = root_dir / "manifest.train"
manifest_jsons = read_manifest(manifest_path)
line_json = manifest_jsons[0]
wav_path = line_json['feat']
if script_type == 'phone':
# find lexicon.txt in THCHS-30
grader_father = os.path.abspath(
os.path.dirname(wav_path) + os.path.sep + "..")
grader_father = Path(grader_father).expanduser()
lexicon_name = "lexicon.txt"
lexicon_father_dir = "lm_phone"
lexicon_path = grader_father / lexicon_father_dir / lexicon_name
elif script_type == 'syllable':
# find thchs30_pinyin2phone in dir of this py file
py_dir_path = os.path.split(os.path.realpath(__file__))[0]
py_dir_path = Path(py_dir_path).expanduser()
lexicon_path = py_dir_path / "thchs30_pinyin2phone"
else:
# script_type == 'text'
# find thchs30_cn2phone in dir of this py file
py_dir_path = os.path.split(os.path.realpath(__file__))[0]
py_dir_path = Path(py_dir_path).expanduser()
lexicon_path = py_dir_path / "thchs30_cn2phone"
new_lexicon_name = script_type + ".lexicon"
new_lexicon_path = os.path.dirname(output_dir) + "/" + new_lexicon_name
os.symlink(lexicon_path, new_lexicon_path)
def dump_lab(root_dir: Union[str, Path],
output_dir: Union[str, Path],
script_type='phone'):
# script_type can in {'text', 'syllable', 'phone'}
manifest_path = root_dir / "manifest.train"
manifest_jsons = read_manifest(manifest_path)
for line_json in manifest_jsons:
utt_id = line_json['utt']
transcript_name = utt_id + ".lab"
transcript_path = output_dir / transcript_name
with open(transcript_path, 'wt') as wf:
wf.write(line_json[script_type] + "\n")
def reorganize_thchs30(root_dir: Union[str, Path],
output_dir: Union[str, Path]=None,
script_type='phone'):
root_dir = Path(root_dir).expanduser()
output_dir = Path(output_dir).expanduser()
output_dir.mkdir(parents=True, exist_ok=True)
link_wav(root_dir, output_dir)
dump_lab(root_dir, output_dir, script_type)
link_lexicon(root_dir, output_dir, script_type)
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Reorganize THCHS-30 dataset for MFA")
parser.add_argument("--root-dir", type=str, help="path to thchs30 dataset.")
parser.add_argument(
"--output-dir",
type=str,
help="path to save outputs(audio and transcriptions)")
parser.add_argument(
"--script-type",
type=str,
default="phone",
help="type of lab (text'/'syllable'/'phone')")
args = parser.parse_args()
reorganize_thchs30(args.root_dir, args.output_dir, args.script_type)

@ -0,0 +1,83 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Recorganize THCHS-30 for MFA
read manifest.train from root-dir
Link *.wav to output-dir
dump *.lab from manifest.train, such as: textsyllable and phone
Manifest file is a json-format file with each line containing the
meta data (i.e. audio filepath, transcript and audio duration)
"""
import argparse
import os
from pathlib import Path
from typing import Union
def link_wav(root_dir: Union[str, Path], output_dir: Union[str, Path]):
wav_scp_path = root_dir / 'wav.scp'
with open(wav_scp_path, 'r') as rf:
for line in rf:
utt, feat = line.strip().split()
wav_path = feat
wav_name = wav_path.split("/")[-1]
new_wav_path = output_dir / wav_name
os.symlink(wav_path, new_wav_path)
def write_lab(root_dir: Union[str, Path],
output_dir: Union[str, Path],
script_type='phone'):
# script_type can in {'word', 'syllable', 'phone'}
json_name = 'text.' + script_type
json_path = root_dir / json_name
with open(json_path, 'r') as rf:
for line in rf:
line = line.strip().split()
utt_id = line[0]
context = ' '.join(line[1:])
transcript_name = utt_id + '.lab'
transcript_path = output_dir / transcript_name
with open(transcript_path, 'wt') as wf:
if script_type == 'word':
# add space between chinese char
context = ''.join([f + ' ' for f in context])[:-1]
wf.write(context + "\n")
def reorganize_thchs30(root_dir: Union[str, Path],
output_dir: Union[str, Path]=None,
script_type='phone'):
root_dir = Path(root_dir).expanduser()
output_dir = Path(output_dir).expanduser()
output_dir.mkdir(parents=True, exist_ok=True)
link_wav(root_dir, output_dir)
write_lab(root_dir, output_dir, script_type)
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Reorganize THCHS-30 dataset for MFA")
parser.add_argument("--root-dir", type=str, help="path to thchs30 dataset.")
parser.add_argument(
"--output-dir",
type=str,
help="path to save outputs(audio and transcriptions)")
parser.add_argument(
"--script-type",
type=str,
default="phone",
help="type of lab ('word'/'syllable'/'phone')")
args = parser.parse_args()
reorganize_thchs30(args.root_dir, args.output_dir, args.script_type)

File diff suppressed because it is too large Load Diff

@ -9,6 +9,5 @@ export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib/
MODEL=deepspeech2
export BIN_DIR=${MAIN_ROOT}/deepspeech/exps/${MODEL}/bin
# MFA is in tools
export PATH=${MAIN_ROOT}/tools/montreal-forced-aligner/bin:$PATH

@ -4,33 +4,26 @@ source path.sh
stage=0
stop_stage=100
EXP_DIR=exp
# LEXICON_NAME in {'phone', 'syllable', 'text'}
# LEXICON_NAME in {'phone', 'syllable', 'word'}
LEXICON_NAME='phone'
# get machine's cpu core number
NUM_JOBS=`grep 'processor' /proc/cpuinfo | sort -u | wc -l`
NUM_JOBS=$((NUM_JOBS/2))
# set MFA num_jobs as half of machine's cpu core number
NUM_JOBS=$((`nproc`/2))
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
# download dataset、unzip and generate manifest
# gen lexicon relink gen dump
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# prepare data
bash ./local/data.sh || exit -1
bash ./local/data.sh $LEXICON_NAME|| exit -1
fi
# reorganize dataset for MFA
if [ ! -d $EXP_DIR/thchs30_corpus ]; then
echo "reorganizing thchs30 corpus..."
python local/recorganize_thchs30.py --root-dir=./data --output-dir=$EXP_DIR/thchs30_corpus --script-type=$LEXICON_NAME
echo "reorganization done."
fi
# MFA is in tools
export PATH="${MAIN_ROOT}/tools/montreal-forced-aligner/bin"
# run MFA
if [ ! -d "$EXP_DIR/thchs30_alignment" ]; then
echo "Start MFA training..."
mfa_train_and_align $EXP_DIR/thchs30_corpus "$EXP_DIR/$LEXICON_NAME.lexicon" $EXP_DIR/thchs30_alignment -o $EXP_DIR/thchs30_model --clean --verbose --temp_directory exp/.mfa_train_and_align --num_jobs $NUM_JOBS
mfa_train_and_align data/thchs30_corpus "data/$LEXICON_NAME.lexicon" $EXP_DIR/thchs30_alignment -o $EXP_DIR/thchs30_model --clean --verbose --temp_directory exp/.mfa_train_and_align --num_jobs $NUM_JOBS
echo "training done! \nresults: $EXP_DIR/thchs30_alignment \nmodel: $EXP_DIR/thchs30_model\n"
fi
mfa_train_and_align data/thchs30_corpus data/dict/$LEXICON_NAME.lexicon $EXP_DIR/thchs30_alignment -o $EXP_DIR/thchs30_model --clean --verbose --temp_directory exp/.mfa_train_and_align --num_jobs $NUM_JOBS

@ -0,0 +1,63 @@
#!/usr/bin/env python3
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""format manifest into wav.scp text.word [text.syllable text.phone]"""
import argparse
from pathlib import Path
from typing import Union
from deepspeech.frontend.utility import read_manifest
key_whitelist = set(['feat', 'text', 'syllable', 'phone'])
filename = {
'text': 'text.word',
'syllable': 'text.syllable',
'phone': 'text.phone',
'feat': 'wav.scp',
}
def dump_manifest(manifest_path, output_dir: Union[str, Path]):
output_dir = Path(output_dir).expanduser()
manifest_path = Path(manifest_path).expanduser()
manifest_jsons = read_manifest(manifest_path)
first_line = manifest_jsons[0]
file_map = {}
for k in first_line.keys():
if k not in key_whitelist:
continue
file_map[k] = open(output_dir / filename[k], 'w')
for line_json in manifest_jsons:
for k in line_json.keys():
if k not in key_whitelist:
continue
file_map[k].write(line_json['utt'] + ' ' + line_json[k] + '\n')
for _, file in file_map.items():
file.close()
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="dump manifest to wav.scp text.word ...")
parser.add_argument("--manifest-path", type=str, help="path to manifest")
parser.add_argument(
"--output-dir",
type=str,
help="path to save outputs(audio and transcriptions)")
args = parser.parse_args()
dump_manifest(args.manifest_path, args.output_dir)
Loading…
Cancel
Save