parent
c0ee57d400
commit
00017301c6
@ -1,2 +1,7 @@
|
|||||||
This is an example of using MFA (Montreal Forced Aligner) on the THCHS-30 dataset.
|
This is an example of using MFA (Montreal Forced Aligner) on the THCHS-30 dataset.
|
||||||
To get started, cd into a0 and run run.sh.
|
To get started, cd into a0 and run run.sh.
|
||||||
|
|
||||||
|
MFA 对齐所使用的字典
|
||||||
|
MFA 字典的格式可以参考: https://montreal-forced-aligner.readthedocs.io/en/latest/dictionary.html
|
||||||
|
phone.lexicon 直接使用的是 THCHS-30/data_thchs30/lm_phone/lexicon.txt
|
||||||
|
word.lexicon 是一个带概率的字典, 生成规则请参考 local/gen_word2phone.py
|
||||||
|
@ -1,112 +0,0 @@
|
|||||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
|
||||||
#
|
|
||||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
# you may not use this file except in compliance with the License.
|
|
||||||
# You may obtain a copy of the License at
|
|
||||||
#
|
|
||||||
# http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
#
|
|
||||||
# Unless required by applicable law or agreed to in writing, software
|
|
||||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
# See the License for the specific language governing permissions and
|
|
||||||
# limitations under the License.
|
|
||||||
"""Reorganize THCHS-30 for MFA
|
|
||||||
read manifest.train from root-dir
|
|
||||||
Link *.wav to output-dir
|
|
||||||
dump *.lab from manifest.train, such as: text、syllable and phone
|
|
||||||
Manifest file is a json-format file with each line containing the
|
|
||||||
meta data (i.e. audio filepath, transcript and audio duration)
|
|
||||||
"""
|
|
||||||
import argparse
|
|
||||||
import os
|
|
||||||
from pathlib import Path
|
|
||||||
from typing import Union
|
|
||||||
|
|
||||||
from deepspeech.frontend.utility import read_manifest
|
|
||||||
|
|
||||||
|
|
||||||
def link_wav(root_dir: Union[str, Path], output_dir: Union[str, Path]):
    """Symlink every wav referenced by ``manifest.train`` into ``output_dir``.

    Args:
        root_dir: directory that contains ``manifest.train``.
        output_dir: existing directory that receives one symlink per entry,
            named after the basename of the entry's ``feat`` path.
    """
    # The annotations allow plain strings, but ``str / str`` raises, so
    # normalize both arguments to Path first.
    root_dir = Path(root_dir)
    output_dir = Path(output_dir)
    manifest_jsons = read_manifest(root_dir / "manifest.train")
    for line_json in manifest_jsons:
        wav_path = line_json['feat']
        new_wav_path = output_dir / Path(wav_path).name
        # Replace a link left over from a previous run instead of aborting
        # with FileExistsError; anything that is not a symlink is untouched.
        if new_wav_path.is_symlink():
            new_wav_path.unlink()
        os.symlink(wav_path, new_wav_path)
|
|
||||||
|
|
||||||
|
|
||||||
def link_lexicon(root_dir: Union[str, Path],
                 output_dir: Union[str, Path],
                 script_type='phone'):
    """Symlink the lexicon matching ``script_type`` next to ``output_dir``.

    The link is created in the parent directory of ``output_dir`` and is
    named ``<script_type>.lexicon``.

    Args:
        root_dir: directory that contains ``manifest.train``.
        output_dir: directory holding the reorganized wav/lab files.
        script_type: ``'phone'`` links ``lm_phone/lexicon.txt`` found two
            directory levels above the first wav of the manifest;
            ``'syllable'`` links ``thchs30_pinyin2phone`` and any other
            value links ``thchs30_cn2phone``, both looked up in the
            directory of this script.
    """
    root_dir = Path(root_dir)
    manifest_jsons = read_manifest(root_dir / "manifest.train")
    # Only the first entry is needed, to locate the dataset on disk.
    wav_path = manifest_jsons[0]['feat']

    if script_type == 'phone':
        # Make the path absolute *before* walking up, so that a bare file
        # name does not resolve to the filesystem root.
        dataset_dir = Path(os.path.abspath(wav_path)).parent.parent
        lexicon_path = dataset_dir / "lm_phone" / "lexicon.txt"
    else:
        script_dir = Path(os.path.realpath(__file__)).parent
        if script_type == 'syllable':
            lexicon_path = script_dir / "thchs30_pinyin2phone"
        else:
            # script_type == 'text'
            lexicon_path = script_dir / "thchs30_cn2phone"

    # Path.parent yields "." for a single relative component, whereas
    # os.path.dirname(...) + "/" produced a path at the filesystem root.
    new_lexicon_path = Path(output_dir).parent / (script_type + ".lexicon")
    # Replace a link left over from a previous run.
    if new_lexicon_path.is_symlink():
        new_lexicon_path.unlink()
    os.symlink(lexicon_path, new_lexicon_path)
|
|
||||||
|
|
||||||
|
|
||||||
def dump_lab(root_dir: Union[str, Path],
             output_dir: Union[str, Path],
             script_type='phone'):
    """Write one ``<utt>.lab`` transcript per entry of ``manifest.train``.

    Args:
        root_dir: directory that contains ``manifest.train``.
        output_dir: existing directory that receives the ``.lab`` files.
        script_type: manifest field written as the transcript; one of
            ``'text'``, ``'syllable'`` or ``'phone'``.
    """
    # Accept plain strings as the annotations promise.
    root_dir = Path(root_dir)
    output_dir = Path(output_dir)
    manifest_jsons = read_manifest(root_dir / "manifest.train")
    for line_json in manifest_jsons:
        transcript_path = output_dir / (line_json['utt'] + ".lab")
        # Explicit UTF-8: transcripts may contain Chinese text and must not
        # depend on the locale's default encoding.
        with open(transcript_path, 'wt', encoding='utf-8') as wf:
            wf.write(line_json[script_type] + "\n")
|
|
||||||
|
|
||||||
|
|
||||||
def reorganize_thchs30(root_dir: Union[str, Path],
                       output_dir: Union[str, Path]=None,
                       script_type='phone'):
    """Lay out THCHS-30 wavs, transcripts and lexicon for MFA.

    Args:
        root_dir: directory that contains ``manifest.train``.
        output_dir: directory to create and fill with wav links and
            ``.lab`` files. Required despite the ``None`` default.
        script_type: transcript type forwarded to ``dump_lab`` and
            ``link_lexicon``.

    Raises:
        TypeError: if ``output_dir`` is not given.
    """
    if output_dir is None:
        # Path(None) would raise an opaque TypeError; say what is missing.
        raise TypeError("output_dir must be given")
    root_dir = Path(root_dir).expanduser()
    output_dir = Path(output_dir).expanduser()
    output_dir.mkdir(parents=True, exist_ok=True)
    link_wav(root_dir, output_dir)
    dump_lab(root_dir, output_dir, script_type)
    link_lexicon(root_dir, output_dir, script_type)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
    # Command-line entry point: parse the options and run the reorganization.
    parser = argparse.ArgumentParser(
        description="Reorganize THCHS-30 dataset for MFA")
    # Both directories are mandatory; let argparse report a missing one
    # instead of failing later inside Path(None).
    parser.add_argument(
        "--root-dir",
        type=str,
        required=True,
        help="path to thchs30 dataset.")
    parser.add_argument(
        "--output-dir",
        type=str,
        required=True,
        help="path to save outputs(audio and transcriptions)")

    parser.add_argument(
        "--script-type",
        type=str,
        default="phone",
        # Reject unknown types up front; dump_lab indexes the manifest
        # entry with this value.
        choices=["text", "syllable", "phone"],
        help="type of lab ('text'/'syllable'/'phone')")
    args = parser.parse_args()
    reorganize_thchs30(args.root_dir, args.output_dir, args.script_type)
|
|
@ -0,0 +1,83 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
"""Reorganize THCHS-30 for MFA
|
||||||
|
read wav.scp and text.<script-type> from root-dir
|
||||||
|
Link *.wav to output-dir
|
||||||
|
dump *.lab from text.<script-type>, where script-type is word, syllable or phone
|
||||||
|
Manifest file is a json-format file with each line containing the
|
||||||
|
meta data (i.e. audio filepath, transcript and audio duration)
|
||||||
|
"""
|
||||||
|
import argparse
|
||||||
|
import os
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Union
|
||||||
|
|
||||||
|
|
||||||
|
def link_wav(root_dir: Union[str, Path], output_dir: Union[str, Path]):
    """Symlink every wav listed in ``root_dir/wav.scp`` into ``output_dir``.

    Each non-empty line of ``wav.scp`` has the form ``<utt_id> <wav_path>``.
    The link is named after the basename of ``<wav_path>``.

    Args:
        root_dir: directory that contains ``wav.scp``.
        output_dir: existing directory that receives the symlinks.

    Raises:
        ValueError: if a non-empty line has no path field.
    """
    # The annotations allow plain strings, but ``str / str`` raises.
    root_dir = Path(root_dir)
    output_dir = Path(output_dir)
    with open(root_dir / 'wav.scp', 'r', encoding='utf-8') as rf:
        for line in rf:
            line = line.strip()
            if not line:
                # Tolerate blank (e.g. trailing) lines.
                continue
            # maxsplit=1 keeps wav paths that contain spaces intact.
            _, wav_path = line.split(maxsplit=1)
            new_wav_path = output_dir / Path(wav_path).name
            # Replace a link left over from a previous run instead of
            # aborting with FileExistsError.
            if new_wav_path.is_symlink():
                new_wav_path.unlink()
            os.symlink(wav_path, new_wav_path)
|
||||||
|
|
||||||
|
|
||||||
|
def write_lab(root_dir: Union[str, Path],
              output_dir: Union[str, Path],
              script_type='phone'):
    """Write one ``<utt_id>.lab`` transcript per line of ``text.<script_type>``.

    Each non-empty input line is ``<utt_id> <token> <token> ...``. Tokens are
    written separated by single spaces; for ``script_type == 'word'`` the
    transcript is split further so every character is its own token.

    Args:
        root_dir: directory that contains ``text.<script_type>``.
        output_dir: existing directory that receives the ``.lab`` files.
        script_type: one of ``'word'``, ``'syllable'`` or ``'phone'``.
    """
    root_dir = Path(root_dir)
    output_dir = Path(output_dir)
    text_path = root_dir / ('text.' + script_type)
    # Explicit UTF-8 on both ends: the transcripts contain Chinese text.
    with open(text_path, 'r', encoding='utf-8') as rf:
        for line in rf:
            fields = line.split()
            if not fields:
                # Tolerate blank (e.g. trailing) lines.
                continue
            utt_id, tokens = fields[0], fields[1:]
            if script_type == 'word':
                # Add a space between Chinese characters. Joining the tokens
                # first avoids runs of spaces when the input is already
                # word-segmented.
                context = ' '.join(''.join(tokens))
            else:
                context = ' '.join(tokens)
            transcript_path = output_dir / (utt_id + '.lab')
            with open(transcript_path, 'wt', encoding='utf-8') as wf:
                wf.write(context + "\n")
|
||||||
|
|
||||||
|
|
||||||
|
def reorganize_thchs30(root_dir: Union[str, Path],
                       output_dir: Union[str, Path]=None,
                       script_type='phone'):
    """Lay out THCHS-30 wavs and transcripts for MFA.

    Args:
        root_dir: directory that contains ``wav.scp`` and
            ``text.<script_type>``.
        output_dir: directory to create and fill with wav links and
            ``.lab`` files. Required despite the ``None`` default.
        script_type: transcript type forwarded to ``write_lab``.

    Raises:
        TypeError: if ``output_dir`` is not given.
    """
    if output_dir is None:
        # Path(None) would raise an opaque TypeError; say what is missing.
        raise TypeError("output_dir must be given")
    root_dir = Path(root_dir).expanduser()
    output_dir = Path(output_dir).expanduser()
    output_dir.mkdir(parents=True, exist_ok=True)
    link_wav(root_dir, output_dir)
    write_lab(root_dir, output_dir, script_type)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Command-line entry point: parse the options and run the reorganization.
    parser = argparse.ArgumentParser(
        description="Reorganize THCHS-30 dataset for MFA")
    # Both directories are mandatory; let argparse report a missing one
    # instead of failing later inside Path(None).
    parser.add_argument(
        "--root-dir",
        type=str,
        required=True,
        help="path to thchs30 dataset.")
    parser.add_argument(
        "--output-dir",
        type=str,
        required=True,
        help="path to save outputs(audio and transcriptions)")

    parser.add_argument(
        "--script-type",
        type=str,
        default="phone",
        help="type of lab ('word'/'syllable'/'phone')")
    args = parser.parse_args()
    reorganize_thchs30(args.root_dir, args.output_dir, args.script_type)
|
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,63 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
"""format manifest into wav.scp text.word [text.syllable text.phone]"""
|
||||||
|
import argparse
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Union
|
||||||
|
|
||||||
|
from deepspeech.frontend.utility import read_manifest
|
||||||
|
|
||||||
|
# Manifest fields that get exported, mapped to the file each one is
# written to inside the output directory.
filename = {
    'text': 'text.word',
    'syllable': 'text.syllable',
    'phone': 'text.phone',
    'feat': 'wav.scp',
}
# Only fields with a known output file are dumped. Deriving the set from the
# mapping keeps the two from drifting apart.
key_whitelist = set(filename)
|
||||||
|
|
||||||
|
|
||||||
|
def dump_manifest(manifest_path, output_dir: Union[str, Path]):
    """Split a manifest into per-field ``<utt> <value>`` text files.

    For every whitelisted field found in the manifest entries, one line
    ``<utt> <value>`` per entry is written to the file named by
    ``filename[field]`` inside ``output_dir``.

    Args:
        manifest_path: path of the manifest, as read by ``read_manifest``.
        output_dir: directory that receives the files; created if missing.
    """
    from contextlib import ExitStack

    output_dir = Path(output_dir).expanduser()
    manifest_path = Path(manifest_path).expanduser()
    # NOTE(review): assumes read_manifest returns an iterable of dicts that
    # each carry an 'utt' key -- confirm against its definition.
    manifest_jsons = read_manifest(manifest_path)
    output_dir.mkdir(parents=True, exist_ok=True)

    # ExitStack closes every opened file even if writing fails midway.
    with ExitStack() as stack:
        file_map = {}
        for line_json in manifest_jsons:
            for k in line_json:
                if k not in key_whitelist:
                    continue
                if k not in file_map:
                    # Open lazily, so a field missing from the first entry
                    # (or an empty manifest) does not raise.
                    file_map[k] = stack.enter_context(
                        open(output_dir / filename[k], 'w', encoding='utf-8'))
                file_map[k].write(line_json['utt'] + ' ' + line_json[k] + '\n')
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Command-line entry point: parse the options and dump the manifest.
    parser = argparse.ArgumentParser(
        description="dump manifest to wav.scp text.word ...")
    # Both options are mandatory; let argparse report a missing one instead
    # of failing later inside Path(None).
    parser.add_argument(
        "--manifest-path",
        type=str,
        required=True,
        help="path to manifest")
    parser.add_argument(
        "--output-dir",
        type=str,
        required=True,
        help="path to save outputs(audio and transcriptions)")
    args = parser.parse_args()
    dump_manifest(args.manifest_path, args.output_dir)
|
Loading…
Reference in new issue