PaddleSpeech/examples/ted_en_zh/st1/local/ted_en_zh.py

# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import codecs
import os


# org_split = 'train-split/train-segment'
# text_file = 'En-Zh/train.en-zh'
# data_split = 'train'
def data_process(src_dir, tgt_dir, wav_dir_list, text_file_list,
                 data_split_list):

    for org_split, text_file, data_split in zip(wav_dir_list, text_file_list,
                                                data_split_list):
        local_data_split_dir = os.path.join(tgt_dir, data_split)

        os.makedirs(local_data_split_dir, exist_ok=True)
        utts = []
        utt2spk = {}
        with open(os.path.join(local_data_split_dir, 'wav.scp.org'), 'w') as wav_wf, \
            open(os.path.join(local_data_split_dir, 'utt2spk.org'), 'w') as utt2spk_wf:
            for files in os.listdir(os.path.join(src_dir, org_split)):
                files = files.strip()
                file_path = os.path.join(src_dir, org_split, files)
                size = os.path.getsize(file_path)
                if size <= 30000:
                    continue
                utt = files.split('.')[0]
                audio_name = utt.split('_')[0]
                #format the name of utterance
                while len(audio_name) < 6:
                    utt = '0' + utt
                    audio_name = '0' + audio_name
                utt = 'ted-en-zh-' + utt
                utts.append(utt)
                spk = utt.split('_')[0]
                utt2spk[utt] = spk
                assert len(spk) == 16, "%r" % spk
                print(utt, 'cat', os.path.abspath(file_path), '|', file=wav_wf)
            for utt in sorted(utts):
                print(utt, utt2spk[utt], file=utt2spk_wf)

        with open(os.path.join(local_data_split_dir, 'en.org'), 'w') as en_wf, \
            open(os.path.join(local_data_split_dir, 'zh.org'), 'w') as zh_wf, \
            open(os.path.join(local_data_split_dir, '.yaml'), 'w') as yaml_wf, \
            codecs.open(os.path.join(src_dir, text_file), 'r', encoding='utf-8',
                        errors='ignore') as rf:
            count = 0
            for line in rf:
                line = line.strip()
                line_spl = line.split('\t')
                assert len(line_spl) == 3, "%r" % line
                wav, en, zh = line_spl
                assert wav.endswith('wav'), "%r" % wav[-3:]
                utt = wav.split('.')[0]
                audio_name = utt.split('_')[0]
                while len(audio_name) < 6:
                    utt = '0' + utt
                    audio_name = '0' + audio_name
                utt = 'ted-en-zh-' + utt
                print(utt, file=yaml_wf)
                print(en.lower(), file=en_wf)
                print(zh, file=zh_wf)
                count += 1
            print('%s set lines count: %d' % (data_split, count))


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description=__doc__)

    parser.add_argument(
        "--src-dir",
        default="",
        type=str,
        help="Directory to kaldi splited data. (default: %(default)s)")
    parser.add_argument(
        "--tgt-dir",
        default="local/ted_en_zh",
        type=str,
        help="Directory to save processed data. (default: %(default)s)")
    args = parser.parse_args()

    wav_dir_list = [
        'train-split/train-segment', 'test-segment/tst2014',
        'test-segment/tst2015'
    ]
    text_file_list = [
        'En-Zh/train.en-zh', 'En-Zh/tst2014.en-zh', 'En-Zh/tst2015.en-zh'
    ]
    data_split_list = ['train', 'dev', 'test']
    data_process(args.src_dir, args.tgt_dir, wav_dir_list, text_file_list,
                 data_split_list)