From 48207c14107a0de7d0c54d008220b1be832ba615 Mon Sep 17 00:00:00 2001 From: Junkun Date: Mon, 29 Nov 2021 11:34:21 -0800 Subject: [PATCH] process scripts and configs --- examples/ted_en_zh/st1/conf/fbank.conf | 2 + examples/ted_en_zh/st1/conf/pitch.conf | 1 + examples/ted_en_zh/st1/local/ted_en_zh.py | 104 ++++++++++++++++++++++ examples/ted_en_zh/st1/steps | 1 + examples/ted_en_zh/st1/utils | 1 + 5 files changed, 109 insertions(+) create mode 100644 examples/ted_en_zh/st1/conf/fbank.conf create mode 100644 examples/ted_en_zh/st1/conf/pitch.conf create mode 100644 examples/ted_en_zh/st1/local/ted_en_zh.py create mode 120000 examples/ted_en_zh/st1/steps create mode 120000 examples/ted_en_zh/st1/utils diff --git a/examples/ted_en_zh/st1/conf/fbank.conf b/examples/ted_en_zh/st1/conf/fbank.conf new file mode 100644 index 00000000..82ac7bd0 --- /dev/null +++ b/examples/ted_en_zh/st1/conf/fbank.conf @@ -0,0 +1,2 @@ +--sample-frequency=16000 +--num-mel-bins=80 diff --git a/examples/ted_en_zh/st1/conf/pitch.conf b/examples/ted_en_zh/st1/conf/pitch.conf new file mode 100644 index 00000000..e959a19d --- /dev/null +++ b/examples/ted_en_zh/st1/conf/pitch.conf @@ -0,0 +1 @@ +--sample-frequency=16000 diff --git a/examples/ted_en_zh/st1/local/ted_en_zh.py b/examples/ted_en_zh/st1/local/ted_en_zh.py new file mode 100644 index 00000000..f30573b7 --- /dev/null +++ b/examples/ted_en_zh/st1/local/ted_en_zh.py @@ -0,0 +1,104 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import argparse +import codecs +import os + + +# org_split = 'train-split/train-segment' +# text_file = 'En-Zh/train.en-zh' +# data_split = 'train' +def data_process(src_dir, tgt_dir, wav_dir_list, text_file_list, + data_split_list): + + for org_split, text_file, data_split in zip(wav_dir_list, text_file_list, + data_split_list): + local_data_split_dir = os.path.join(tgt_dir, data_split) + + os.makedirs(local_data_split_dir, exist_ok=True) + utts = [] + utt2spk = {} + with open(os.path.join(local_data_split_dir, 'wav.scp.org'), 'w') as wav_wf, \ + open(os.path.join(local_data_split_dir, 'utt2spk.org'), 'w') as utt2spk_wf: + for files in os.listdir(os.path.join(src_dir, org_split)): + files = files.strip() + file_path = os.path.join(src_dir, org_split, files) + size = os.path.getsize(file_path) + if size <= 30000: + continue + utt = files.split('.')[0] + audio_name = utt.split('_')[0] + #format the name of utterance + while len(audio_name) < 6: + utt = '0' + utt + audio_name = '0' + audio_name + utt = 'ted-en-zh-' + utt + utts.append(utt) + spk = utt.split('_')[0] + utt2spk[utt] = spk + assert len(spk) == 16, "%r" % spk + print(utt, 'cat', os.path.abspath(file_path), '|', file=wav_wf) + for utt in sorted(utts): + print(utt, utt2spk[utt], file=utt2spk_wf) + + with open(os.path.join(local_data_split_dir, 'en.org'), 'w') as en_wf, \ + open(os.path.join(local_data_split_dir, 'zh.org'), 'w') as zh_wf, \ + open(os.path.join(local_data_split_dir, '.yaml'), 'w') as yaml_wf, \ + codecs.open(os.path.join(src_dir, text_file), 'r', encoding='utf-8', + errors='ignore') as rf: + count = 0 + for line in rf: + line = line.strip() + line_spl = line.split('\t') + assert len(line_spl) == 3, "%r" % line + wav, en, zh = line_spl + assert wav.endswith('wav'), "%r" % wav[-3:] + utt = wav.split('.')[0] + audio_name = utt.split('_')[0] + while len(audio_name) < 6: + utt = '0' + utt + audio_name = '0' + audio_name + utt = 'ted-en-zh-' + utt + print(utt, file=yaml_wf) + print(en.lower(), file=en_wf) + print(zh, file=zh_wf) + count += 1 + print('%s set lines count: %d' % (data_split, count)) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description=__doc__) + + parser.add_argument( + "--src-dir", + default="", + type=str, + help="Directory to kaldi splited data. (default: %(default)s)") + parser.add_argument( + "--tgt-dir", + default="local/ted_en_zh", + type=str, + help="Directory to save processed data. (default: %(default)s)") + args = parser.parse_args() + + wav_dir_list = [ + 'train-split/train-segment', 'test-segment/tst2014', + 'test-segment/tst2015' + ] + text_file_list = [ + 'En-Zh/train.en-zh', 'En-Zh/tst2014.en-zh', 'En-Zh/tst2015.en-zh' + ] + data_split_list = ['train', 'dev', 'test'] + data_process(args.src_dir, args.tgt_dir, wav_dir_list, text_file_list, + data_split_list) diff --git a/examples/ted_en_zh/st1/steps b/examples/ted_en_zh/st1/steps new file mode 120000 index 00000000..91f2d234 --- /dev/null +++ b/examples/ted_en_zh/st1/steps @@ -0,0 +1 @@ +../../../tools/kaldi/egs/wsj/s5/steps \ No newline at end of file diff --git a/examples/ted_en_zh/st1/utils b/examples/ted_en_zh/st1/utils new file mode 120000 index 00000000..f49247da --- /dev/null +++ b/examples/ted_en_zh/st1/utils @@ -0,0 +1 @@ +../../../tools/kaldi/egs/wsj/s5/utils \ No newline at end of file