process scripts and configs

4 years ago · 48207c1410
parent ea35558ee0
commit 48207c1410
5 changed files with 109 additions and 0 deletions
--- a/examples/ted_en_zh/st1/conf/fbank.conf
+++ b/examples/ted_en_zh/st1/conf/fbank.conf
@ -0,0 +1,2 @@
 --sample-frequency=16000 
 --num-mel-bins=80
--- a/examples/ted_en_zh/st1/conf/pitch.conf
+++ b/examples/ted_en_zh/st1/conf/pitch.conf
@ -0,0 +1 @@
 --sample-frequency=16000
--- a/examples/ted_en_zh/st1/local/ted_en_zh.py
+++ b/examples/ted_en_zh/st1/local/ted_en_zh.py
@ -0,0 +1,104 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import argparse
 import codecs
 import os
 # org_split = 'train-split/train-segment'
 # text_file = 'En-Zh/train.en-zh'
 # data_split = 'train'
 def data_process(src_dir, tgt_dir, wav_dir_list, text_file_list,
                 data_split_list):
    for org_split, text_file, data_split in zip(wav_dir_list, text_file_list,
                                                data_split_list):
        local_data_split_dir = os.path.join(tgt_dir, data_split)
        os.makedirs(local_data_split_dir, exist_ok=True)
        utts = []
        utt2spk = {}
        with open(os.path.join(local_data_split_dir, 'wav.scp.org'), 'w') as wav_wf, \
            open(os.path.join(local_data_split_dir, 'utt2spk.org'), 'w') as utt2spk_wf:
            for files in os.listdir(os.path.join(src_dir, org_split)):
                files = files.strip()
                file_path = os.path.join(src_dir, org_split, files)
                size = os.path.getsize(file_path)
                if size <= 30000:
                    continue
                utt = files.split('.')[0]
                audio_name = utt.split('_')[0]
                #format the name of utterance 
                while len(audio_name) < 6:
                    utt = '0' + utt
                    audio_name = '0' + audio_name
                utt = 'ted-en-zh-' + utt
                utts.append(utt)
                spk = utt.split('_')[0]
                utt2spk[utt] = spk
                assert len(spk) == 16, "%r" % spk
                print(utt, 'cat', os.path.abspath(file_path), '|', file=wav_wf)
            for utt in sorted(utts):
                print(utt, utt2spk[utt], file=utt2spk_wf)
        with open(os.path.join(local_data_split_dir, 'en.org'), 'w') as en_wf, \
            open(os.path.join(local_data_split_dir, 'zh.org'), 'w') as zh_wf, \
            open(os.path.join(local_data_split_dir, '.yaml'), 'w') as yaml_wf, \
            codecs.open(os.path.join(src_dir, text_file), 'r', encoding='utf-8',
                        errors='ignore') as rf:
            count = 0
            for line in rf:
                line = line.strip()
                line_spl = line.split('\t')
                assert len(line_spl) == 3, "%r" % line
                wav, en, zh = line_spl
                assert wav.endswith('wav'), "%r" % wav[-3:]
                utt = wav.split('.')[0]
                audio_name = utt.split('_')[0]
                while len(audio_name) < 6:
                    utt = '0' + utt
                    audio_name = '0' + audio_name
                utt = 'ted-en-zh-' + utt
                print(utt, file=yaml_wf)
                print(en.lower(), file=en_wf)
                print(zh, file=zh_wf)
                count += 1
            print('%s set lines count: %d' % (data_split, count))
 if __name__ == '__main__':
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--src-dir",
        default="",
        type=str,
        help="Directory to kaldi splited data. (default: %(default)s)")
    parser.add_argument(
        "--tgt-dir",
        default="local/ted_en_zh",
        type=str,
        help="Directory to save processed data. (default: %(default)s)")
    args = parser.parse_args()
    wav_dir_list = [
        'train-split/train-segment', 'test-segment/tst2014',
        'test-segment/tst2015'
    ]
    text_file_list = [
        'En-Zh/train.en-zh', 'En-Zh/tst2014.en-zh', 'En-Zh/tst2015.en-zh'
    ]
    data_split_list = ['train', 'dev', 'test']
    data_process(args.src_dir, args.tgt_dir, wav_dir_list, text_file_list,
                 data_split_list)
--- a/examples/ted_en_zh/st1/steps
+++ b/examples/ted_en_zh/st1/steps
@ -0,0 +1 @@
 ../../../tools/kaldi/egs/wsj/s5/steps
--- a/examples/ted_en_zh/st1/utils
+++ b/examples/ted_en_zh/st1/utils
@ -0,0 +1 @@
 ../../../tools/kaldi/egs/wsj/s5/utils
		`@ -0,0 +1,2 @@`
							`--sample-frequency=16000`
							`--num-mel-bins=80`