You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
PaddleSpeech/examples/voxceleb/sv0/local/make_voxceleb_kaldi_trial.py

87 lines
3.2 KiB

#!/usr/bin/python3
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Make VoxCeleb1 trial of kaldi format
this script creat the test trial from kaldi trial voxceleb1_test_v2.txt or official trial veri_test2.txt
to kaldi trial format
"""
import argparse
import codecs
import os
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
"--voxceleb_trial",
default="voxceleb1_test_v2",
type=str,
help="VoxCeleb trial file. Default we use the kaldi trial voxceleb1_test_v2.txt"
)
parser.add_argument(
"--trial",
default="data/test/trial",
type=str,
help="Kaldi format trial file")
args = parser.parse_args()
def main(voxceleb_trial, trial):
"""
VoxCeleb provide several trial file, which format is different with kaldi format.
VoxCeleb format's meaning is as following:
--------------------------------
target_or_nontarget path1 path2
--------------------------------
target_or_nontarget is an integer: 1 target path1 is equal to path2
0 nontarget path1 is unequal to path2
path1: spkr_id/rec_id/name
path2: spkr_id/rec_id/name
Kaldi format's meaning is as following:
---------------------------------------
utt_id1 utt_id2 target_or_nontarget
---------------------------------------
utt_id1: utterance identification or speaker identification
utt_id2: utterance identification or speaker identification
target_or_nontarget is an string: 'target' utt_id1 is equal to utt_id2
'nontarget' utt_id2 is unequal to utt_id2
"""
print("Start convert the voxceleb trial to kaldi format")
if not os.path.exists(voxceleb_trial):
raise RuntimeError(
"{} does not exist. Pleas input the correct file path".format(
voxceleb_trial))
trial_dirname = os.path.dirname(trial)
if not os.path.exists(trial_dirname):
os.mkdir(trial_dirname)
with codecs.open(voxceleb_trial, 'r', encoding='utf-8') as f, \
codecs.open(trial, 'w', encoding='utf-8') as w:
for line in f:
target_or_nontarget, path1, path2 = line.strip().split()
utt_id1 = "-".join(path1.split("/"))
utt_id2 = "-".join(path2.split("/"))
target = "nontarget"
if int(target_or_nontarget):
target = "target"
w.write("{} {} {}\n".format(utt_id1, utt_id2, target))
print("Convert the voxceleb trial to kaldi format successfully")
if __name__ == "__main__":
main(args.voxceleb_trial, args.trial)