commit
ab5411ec16
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,47 @@
|
||||
#! /usr/bin/env bash
# Prepare THCHS-30 for MFA training.
#
# Steps: download the corpus and build manifests, dump the training manifest
# into data/, assemble the lexicons under data/dict, and lay the corpus out
# under data/thchs30_corpus.
#
# Usage: local/data.sh <lexicon_name>
#   lexicon_name  script type forwarded to reorganize_thchs30.py
#                 ('phone', 'syllable' or 'word'; defaults to 'phone').
# Requires MAIN_ROOT in the environment (path.sh exports it).

stage=-1
stop_stage=100

source "${MAIN_ROOT}/utils/parse_options.sh"

TARGET_DIR=${MAIN_ROOT}/examples/dataset
CORPUS_DIR=data/thchs30_corpus
# Read the positional argument only after parse_options.sh has been sourced.
LEXICON_NAME=${1:-phone}

# data/dict must exist before the lexicon files are copied into it.
mkdir -p data/dict
mkdir -p "${TARGET_DIR}"

# download data, generate manifests
if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
    if ! python3 "${TARGET_DIR}/thchs30/thchs30.py" \
        --manifest_prefix="data/manifest" \
        --target_dir="${TARGET_DIR}/thchs30"; then
        echo "Prepare THCHS-30 failed. Terminated."
        exit 1
    fi
fi

# dump manifest to data/
python3 "${MAIN_ROOT}/utils/dump_manifest.py" --manifest-path=data/manifest.train --output-dir=data || exit 1

# copy files to data/dict to gen word.lexicon
cp "${TARGET_DIR}/thchs30/data_thchs30/lm_word/lexicon.txt" data/dict/lm_word_lexicon_1 || exit 1
cp "${TARGET_DIR}/thchs30/resource/dict/lexicon.txt" data/dict/lm_word_lexicon_2 || exit 1

# copy phone.lexicon to data/dict
cp "${TARGET_DIR}/thchs30/data_thchs30/lm_phone/lexicon.txt" data/dict/phone.lexicon || exit 1

# gen word.lexicon
python3 local/gen_word2phone.py --root-dir=data/dict --output-dir=data/dict || exit 1

# reorganize dataset for MFA
# Check the directory that is actually written below, so a finished
# reorganization is skipped on re-run.
if [ ! -d "${CORPUS_DIR}" ]; then
    echo "reorganizing thchs30 corpus..."
    python3 local/reorganize_thchs30.py --root-dir=data --output-dir="${CORPUS_DIR}" --script-type="${LEXICON_NAME}" || exit 1
    echo "reorganization done."
fi

echo "THCHS-30 data preparation done."
exit 0
|
@ -0,0 +1,83 @@
|
||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Reorganize THCHS-30 for MFA
|
||||
read manifest.train from root-dir
|
||||
Link *.wav to output-dir
|
||||
dump *.lab from manifest.train, such as: text, syllable and phone
|
||||
Manifest file is a json-format file with each line containing the
|
||||
meta data (i.e. audio filepath, transcript and audio duration)
|
||||
"""
|
||||
import argparse
|
||||
import os
|
||||
from pathlib import Path
|
||||
from typing import Union
|
||||
|
||||
|
||||
def link_wav(root_dir: Union[str, Path], output_dir: Union[str, Path]):
    """Symlink every wav listed in ``root_dir/wav.scp`` into ``output_dir``.

    Each non-empty line of ``wav.scp`` has the form ``<utt_id> <wav_path>``.
    The link is created in ``output_dir`` under the final path component of
    ``wav_path`` and points at ``wav_path`` exactly as written in the file.

    Args:
        root_dir: directory that contains ``wav.scp``.
        output_dir: existing directory that receives the symlinks.

    Raises:
        ValueError: if a non-empty line does not contain both fields.
        FileExistsError: if the link name is taken by something that is not
            a symlink.
    """
    # Accept plain strings as the annotation promises; "/" needs Path objects.
    root_dir = Path(root_dir)
    output_dir = Path(output_dir)
    wav_scp_path = root_dir / 'wav.scp'
    with open(wav_scp_path, 'r', encoding='utf-8') as rf:
        for line in rf:
            line = line.strip()
            if not line:
                continue
            # Split once so a wav path containing spaces stays intact.
            _, wav_path = line.split(maxsplit=1)
            new_wav_path = output_dir / Path(wav_path).name
            # Replace a link left by a previous run instead of failing;
            # real files are never removed.
            if new_wav_path.is_symlink():
                new_wav_path.unlink()
            os.symlink(wav_path, new_wav_path)
|
||||
|
||||
|
||||
def write_lab(root_dir: Union[str, Path],
              output_dir: Union[str, Path],
              script_type='phone'):
    """Write one ``<utt_id>.lab`` transcript per line of ``text.<script_type>``.

    Each non-empty line of ``root_dir/text.<script_type>`` is
    ``<utt_id> <token> <token> ...``. The tokens are written, separated by
    single spaces and followed by a newline, to ``output_dir/<utt_id>.lab``.
    For ``script_type == 'word'`` the transcript is split into individual
    characters, again separated by single spaces.

    Args:
        root_dir: directory that contains ``text.<script_type>``.
        output_dir: existing directory that receives the ``.lab`` files.
        script_type: one of 'word', 'syllable', 'phone'.
    """
    # Accept plain strings as the annotation promises; "/" needs Path objects.
    root_dir = Path(root_dir)
    output_dir = Path(output_dir)
    text_path = root_dir / ('text.' + script_type)
    # The transcripts contain Chinese text, so do not rely on the locale's
    # default encoding.
    with open(text_path, 'r', encoding='utf-8') as rf:
        for line in rf:
            fields = line.split()
            if not fields:
                continue
            utt_id, tokens = fields[0], fields[1:]
            if script_type == 'word':
                # add space between chinese char; drop the original
                # separators first so every gap is exactly one space
                context = ' '.join(''.join(tokens))
            else:
                context = ' '.join(tokens)
            transcript_path = output_dir / (utt_id + '.lab')
            with open(transcript_path, 'w', encoding='utf-8') as wf:
                wf.write(context + "\n")
|
||||
|
||||
|
||||
def reorganize_thchs30(root_dir: Union[str, Path],
                       output_dir: Union[str, Path]=None,
                       script_type='phone'):
    """Build an MFA corpus directory from the dumped THCHS-30 files.

    Creates ``output_dir`` if needed, links the wavs listed in
    ``root_dir/wav.scp`` into it and writes the matching ``.lab``
    transcripts taken from ``root_dir/text.<script_type>``.

    Args:
        root_dir: directory holding ``wav.scp`` and ``text.<script_type>``.
        output_dir: destination directory; when omitted,
            ``root_dir/thchs30_corpus`` is used.
        script_type: one of 'word', 'syllable', 'phone'.
    """
    root_dir = Path(root_dir).expanduser()
    if output_dir is None:
        # The declared default used to crash in Path(None); fall back to a
        # directory next to the input files.
        output_dir = root_dir / 'thchs30_corpus'
    else:
        output_dir = Path(output_dir).expanduser()
    output_dir.mkdir(parents=True, exist_ok=True)
    link_wav(root_dir, output_dir)
    write_lab(root_dir, output_dir, script_type)
|
||||
|
||||
|
||||
# Command-line entry point: parse the directories and script type, then
# reorganize the dataset.
if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Reorganize THCHS-30 dataset for MFA")
    # Both directories are mandatory: without them the script would only
    # fail later with an opaque TypeError.
    parser.add_argument(
        "--root-dir",
        type=str,
        required=True,
        help="path to thchs30 dataset.")
    parser.add_argument(
        "--output-dir",
        type=str,
        required=True,
        help="path to save outputs(audio and transcriptions)")

    parser.add_argument(
        "--script-type",
        type=str,
        default="phone",
        help="type of lab ('word'/'syllable'/'phone')")
    args = parser.parse_args()
    reorganize_thchs30(args.root_dir, args.output_dir, args.script_type)
|
@ -0,0 +1,13 @@
|
||||
# Environment setup for this example. Meant to be sourced from the example
# directory, because MAIN_ROOT is derived from the current working directory.
export MAIN_ROOT=${PWD}/../../../

export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
export LC_ALL=C

# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
# Append the previous value only when it is non-empty, so no dangling ':'
# (an empty search-path entry) is left behind.
export PYTHONPATH=${MAIN_ROOT}${PYTHONPATH:+:${PYTHONPATH}}

# Same guard for the loader path: prefix the old value only when it is set.
export LD_LIBRARY_PATH=${LD_LIBRARY_PATH:+${LD_LIBRARY_PATH}:}/usr/local/lib/

# MFA is in tools
export PATH=${MAIN_ROOT}/tools/montreal-forced-aligner/bin:$PATH
|
@ -0,0 +1,32 @@
|
||||
#!/bin/bash
# Prepare THCHS-30 and run Montreal Forced Aligner (MFA) training/alignment.
set -e
source path.sh

stage=0
stop_stage=100
EXP_DIR=exp
# LEXICON_NAME in {'phone', 'syllable', 'word'}
LEXICON_NAME='phone'
# Set MFA num_jobs to half of the machine's CPU cores, but never below one:
# the integer division yields 0 on a single-core machine.
NUM_JOBS=$(( $(nproc) / 2 ))
if [ "${NUM_JOBS}" -lt 1 ]; then
    NUM_JOBS=1
fi
source "${MAIN_ROOT}/utils/parse_options.sh" || exit 1;

# download the dataset, unzip it and generate the manifest,
# then build the lexicons and the MFA corpus layout
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    # prepare data
    bash ./local/data.sh "${LEXICON_NAME}" || exit 1
fi

# run MFA
# Skipped when an alignment directory from a previous run already exists.
if [ ! -d "${EXP_DIR}/thchs30_alignment" ]; then
    echo "Start MFA training..."
    mfa_train_and_align data/thchs30_corpus "data/dict/${LEXICON_NAME}.lexicon" "${EXP_DIR}/thchs30_alignment" \
        -o "${EXP_DIR}/thchs30_model" --clean --verbose \
        --temp_directory "${EXP_DIR}/.mfa_train_and_align" --num_jobs "${NUM_JOBS}"
    # Plain echo does not expand "\n", so print one line per item.
    echo "training done!"
    echo "results: ${EXP_DIR}/thchs30_alignment"
    echo "model: ${EXP_DIR}/thchs30_model"
fi
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
@ -0,0 +1,63 @@
|
||||
#!/usr/bin/env python3
|
||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""format manifest into wav.scp text.word [text.syllable text.phone]"""
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
from typing import Union
|
||||
|
||||
from deepspeech.frontend.utility import read_manifest
|
||||
|
||||
# Manifest keys that are dumped; every other key of an entry is ignored.
key_whitelist = {'feat', 'text', 'syllable', 'phone'}

# Output file name used for each dumped manifest key.
filename = {
    'text': 'text.word',
    'syllable': 'text.syllable',
    'phone': 'text.phone',
    'feat': 'wav.scp',
}
|
||||
|
||||
|
||||
def dump_manifest(manifest_path, output_dir: Union[str, Path]):
    """Split a manifest into one ``<utt> <value>`` text file per dumped key.

    For every manifest entry and every key of it that is in
    ``key_whitelist``, the line ``<entry['utt']> <entry[key]>`` is appended
    to ``output_dir/<filename[key]>``. Output files are opened in write
    mode, so existing files of the same name are overwritten.

    Args:
        manifest_path: path of the manifest, passed to ``read_manifest``
            (assumed to return an iterable of dicts -- confirm against
            ``deepspeech.frontend.utility``).
        output_dir: directory for the output files; created if missing.
    """
    output_dir = Path(output_dir).expanduser()
    manifest_path = Path(manifest_path).expanduser()
    manifest_jsons = read_manifest(manifest_path)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Files are opened on first use of a key, so an empty manifest or a key
    # that is missing from the first entry no longer raises.
    file_map = {}
    try:
        for line_json in manifest_jsons:
            for k, value in line_json.items():
                if k not in key_whitelist:
                    continue
                if k not in file_map:
                    file_map[k] = open(
                        output_dir / filename[k], 'w', encoding='utf-8')
                file_map[k].write(line_json['utt'] + ' ' + value + '\n')
    finally:
        # Close every handle even when writing fails part-way.
        for fobj in file_map.values():
            fobj.close()
|
||||
|
||||
|
||||
# Command-line entry point: parse the manifest path and output directory,
# then dump the manifest.
if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="dump manifest to wav.scp text.word ...")
    # Both arguments are mandatory: without them the script would only fail
    # later with an opaque TypeError.
    parser.add_argument(
        "--manifest-path",
        type=str,
        required=True,
        help="path to manifest")
    parser.add_argument(
        "--output-dir",
        type=str,
        required=True,
        help="directory to write wav.scp and text.* files")
    args = parser.parse_args()
    dump_manifest(args.manifest_path, args.output_dir)
|
Loading…
Reference in new issue