PaddleSpeech/examples/thchs30/a0/local/reorganize_thchs30.py

# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Reorganize THCHS-30 for MFA.

Read wav.scp and text.<script-type> from root-dir.
Link the *.wav files listed in wav.scp to output-dir.
Dump one *.lab transcript per utterance to output-dir from
text.<script-type>, where script-type is one of: word, syllable or phone.
Each line of text.<script-type> holds an utterance id followed by its
transcript; each line of wav.scp holds an utterance id followed by the
audio file path.
"""
import argparse
import os
from pathlib import Path
from typing import Union


def link_wav(root_dir: Union[str, Path], output_dir: Union[str, Path]):
    """Symlink every wav listed in ``root_dir/wav.scp`` into ``output_dir``.

    Each non-blank line of ``wav.scp`` is read as ``<utt_id> <wav_path>``.
    The link is named after the last ``/``-separated component of
    ``<wav_path>`` and points at ``<wav_path>`` exactly as written in the file.

    Args:
        root_dir: Directory containing ``wav.scp``.
        output_dir: Existing directory in which the links are created.

    Raises:
        ValueError: If a non-blank line has no wav path after the utt id.
        FileExistsError: If a link name is already taken by something that
            is not a symlink.
    """
    # Accept plain strings as the annotations promise; ``str / str`` fails.
    root_dir = Path(root_dir)
    output_dir = Path(output_dir)
    wav_scp_path = root_dir / 'wav.scp'
    with open(wav_scp_path, 'r', encoding='utf-8') as rf:
        for line_no, line in enumerate(rf, start=1):
            # maxsplit=1 keeps wav paths that contain spaces in one piece.
            fields = line.strip().split(maxsplit=1)
            if not fields:
                # Blank lines (e.g. a trailing newline) carry no entry.
                continue
            if len(fields) != 2:
                raise ValueError(
                    f"{wav_scp_path}:{line_no}: expected '<utt_id> <wav_path>', "
                    f"got {line.strip()!r}")
            wav_path = fields[1]
            wav_name = wav_path.split("/")[-1]
            new_wav_path = output_dir / wav_name
            # Replace links left by a previous run so the script can be
            # re-run; regular files are never removed.
            if new_wav_path.is_symlink():
                new_wav_path.unlink()
            os.symlink(wav_path, new_wav_path)


def write_lab(root_dir: Union[str, Path],
              output_dir: Union[str, Path],
              script_type='phone'):
    """Write one ``<utt_id>.lab`` transcript per line of ``text.<script_type>``.

    Each non-blank line of ``root_dir/text.<script_type>`` is read as an
    utterance id followed by whitespace-separated transcript tokens. The
    tokens are written, separated by single spaces and followed by a newline,
    to ``output_dir/<utt_id>.lab``. For ``script_type == 'word'`` every
    character of every token is written as its own space-separated item.

    Args:
        root_dir: Directory containing ``text.<script_type>``.
        output_dir: Existing directory that receives the ``.lab`` files.
        script_type: Suffix of the transcript file to read; the script's
            help text lists 'word', 'syllable' and 'phone'.
    """
    # Accept plain strings as the annotations promise; ``str / str`` fails.
    root_dir = Path(root_dir)
    output_dir = Path(output_dir)
    text_path = root_dir / ('text.' + script_type)
    # Explicit UTF-8 so Chinese transcripts do not depend on the locale.
    with open(text_path, 'r', encoding='utf-8') as rf:
        for line in rf:
            fields = line.split()
            if not fields:
                # Blank lines carry no utterance.
                continue
            utt_id, tokens = fields[0], fields[1:]
            if script_type == 'word':
                # Put a space between Chinese characters. Working per token
                # keeps exactly one space between characters instead of
                # also spacing out the separators between tokens.
                context = ' '.join(char for token in tokens for char in token)
            else:
                context = ' '.join(tokens)
            transcript_path = output_dir / (utt_id + '.lab')
            with open(transcript_path, 'wt', encoding='utf-8') as wf:
                wf.write(context + "\n")


def reorganize_thchs30(root_dir: Union[str, Path],
                       output_dir: Union[str, Path]=None,
                       script_type='phone'):
    """Create ``output_dir`` and fill it with wav links and ``.lab`` files.

    Args:
        root_dir: Directory holding ``wav.scp`` and ``text.<script_type>``.
        output_dir: Directory to populate; created, with parents, if missing.
            Must be given even though the signature defaults it to None.
        script_type: Which ``text.<script_type>`` file to dump as ``.lab``.

    Raises:
        ValueError: If ``output_dir`` is None.
    """
    if output_dir is None:
        # Fail with a clear message rather than AttributeError on None.mkdir.
        raise ValueError("output_dir must be provided")
    # Accept plain strings as the annotations promise.
    root_dir = Path(root_dir)
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    link_wav(root_dir, output_dir)
    write_lab(root_dir, output_dir, script_type)


if __name__ == "__main__":
    # Command-line entry point: parse the paths and script type, then
    # reorganize the dataset into the output directory.
    parser = argparse.ArgumentParser(
        description="Reorganize THCHS-30 dataset for MFA")
    # Both paths are mandatory: without them Path(None) below would raise a
    # TypeError instead of a readable usage error.
    parser.add_argument(
        "--root-dir",
        type=str,
        required=True,
        help="path to thchs30 dataset.")
    parser.add_argument(
        "--output-dir",
        type=str,
        required=True,
        help="path to save outputs (audio and transcriptions)")

    parser.add_argument(
        "--script-type",
        type=str,
        default="phone",
        help="type of lab ('word'/'syllable'/'phone')")

    args = parser.parse_args()
    # expanduser() lets callers pass paths such as ~/data/thchs30.
    root_dir = Path(args.root_dir).expanduser()
    output_dir = Path(args.output_dir).expanduser()
    reorganize_thchs30(root_dir, output_dir, args.script_type)
restructure thchs30/a0 3 years ago			`# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.`
			`#`
			`# Licensed under the Apache License, Version 2.0 (the "License");`
			`# you may not use this file except in compliance with the License.`
			`# You may obtain a copy of the License at`
			`#`
			`# http://www.apache.org/licenses/LICENSE-2.0`
			`#`
			`# Unless required by applicable law or agreed to in writing, software`
			`# distributed under the License is distributed on an "AS IS" BASIS,`
			`# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`# See the License for the specific language governing permissions and`
			`# limitations under the License.`
			`"""Recorganize THCHS-30 for MFA`
			`read manifest.train from root-dir`
			`Link *.wav to output-dir`
			`dump *.lab from manifest.train, such as: text、syllable and phone`
			`Manifest file is a json-format file with each line containing the`
			`meta data (i.e. audio filepath, transcript and audio duration)`
			`"""`
			`import argparse`
			`import os`
			`from pathlib import Path`
			`from typing import Union`


			`def link_wav(root_dir: Union[str, Path], output_dir: Union[str, Path]):`
			`wav_scp_path = root_dir / 'wav.scp'`
			`with open(wav_scp_path, 'r') as rf:`
			`for line in rf:`
			`utt, feat = line.strip().split()`
			`wav_path = feat`
			`wav_name = wav_path.split("/")[-1]`
			`new_wav_path = output_dir / wav_name`
			`os.symlink(wav_path, new_wav_path)`


			`def write_lab(root_dir: Union[str, Path],`
			`output_dir: Union[str, Path],`
			`script_type='phone'):`
			`# script_type can in {'word', 'syllable', 'phone'}`
			`json_name = 'text.' + script_type`
			`json_path = root_dir / json_name`
			`with open(json_path, 'r') as rf:`
			`for line in rf:`
			`line = line.strip().split()`
			`utt_id = line[0]`
			`context = ' '.join(line[1:])`
			`transcript_name = utt_id + '.lab'`
			`transcript_path = output_dir / transcript_name`
			`with open(transcript_path, 'wt') as wf:`
			`if script_type == 'word':`
			`# add space between chinese char`
			`context = ''.join([f + ' ' for f in context])[:-1]`
			`wf.write(context + "\n")`


			`def reorganize_thchs30(root_dir: Union[str, Path],`
			`output_dir: Union[str, Path]=None,`
			`script_type='phone'):`
			`output_dir.mkdir(parents=True, exist_ok=True)`
			`link_wav(root_dir, output_dir)`
			`write_lab(root_dir, output_dir, script_type)`


			`if __name__ == "__main__":`
			`parser = argparse.ArgumentParser(`
			`description="Reorganize THCHS-30 dataset for MFA")`
			`parser.add_argument("--root-dir", type=str, help="path to thchs30 dataset.")`
			`parser.add_argument(`
			`"--output-dir",`
			`type=str,`
fix_mfa 3 years ago			`help="path to save outputs (audio and transcriptions)")`
restructure thchs30/a0 3 years ago
			`parser.add_argument(`
			`"--script-type",`
			`type=str,`
			`default="phone",`
			`help="type of lab ('word'/'syllable'/'phone')")`
fix_mfa 3 years ago
restructure thchs30/a0 3 years ago			`args = parser.parse_args()`
fix_mfa 3 years ago			`root_dir = Path(args.root_dir).expanduser()`
			`output_dir = Path(args.output_dir).expanduser()`
			`reorganize_thchs30(root_dir, output_dir, args.script_type)`