Merge pull request #1417 from Honei/develop
[vector] voxceleb trial: convert voxceleb trial to kaldi format trialpull/1439/head
commit
8eb708e754
@ -0,0 +1,8 @@
|
||||
|
||||
For dataset information, refer to [VoxCeleb](https://www.robots.ox.ac.uk/~vgg/data/voxceleb/index.html#about)
|
||||
|
||||
sv0 - speaker verification with softmax backend etc., all Python code
|
||||
more info refer to the sv0/readme.txt
|
||||
|
||||
sv1 - depends on Kaldi, speaker verification with plda/sc backend,
|
||||
more info refer to the sv1/readme.txt
|
@ -0,0 +1,81 @@
|
||||
#!/usr/bin/python3
|
||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""
|
||||
Make VoxCeleb1 trial of kaldi format
|
||||
this script creat the test trial from kaldi trial voxceleb1_test_v2.txt or official trial veri_test2.txt
|
||||
to kaldi trial format
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import codecs
|
||||
import os
|
||||
|
||||
# Command-line interface: the input VoxCeleb trial list and the output
# Kaldi-format trial file. Arguments are parsed at module load time and the
# result is kept in the module-level ``args``.
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
    "--voxceleb_trial",
    type=str,
    default="voxceleb1_test_v2",
    help="VoxCeleb trial file. Default we use the kaldi trial voxceleb1_test_v2.txt",
)
parser.add_argument(
    "--trial",
    type=str,
    default="data/test/trial",
    help="Kaldi format trial file",
)
args = parser.parse_args()
|
||||
|
||||
def main(voxceleb_trial, trial):
    """Convert a VoxCeleb trial list into a Kaldi-format trial file.

    VoxCeleb provides several trial files whose format differs from Kaldi's.

    VoxCeleb format, one trial per line:
    --------------------------------
    target_or_nontarget path1 path2
    --------------------------------
    target_or_nontarget is an integer: 1 target    path1 is equal to path2
                                       0 nontarget path1 is unequal to path2
    path1: spkr_id/rec_id/name
    path2: spkr_id/rec_id/name

    Kaldi format, one trial per line:
    ---------------------------------------
    utt_id1 utt_id2 target_or_nontarget
    ---------------------------------------
    utt_id1: utterance identification or speaker identification
    utt_id2: utterance identification or speaker identification
    target_or_nontarget is a string: 'target'    utt_id1 is equal to utt_id2
                                     'nontarget' utt_id1 is unequal to utt_id2

    Each utterance id is built from the VoxCeleb path by replacing every
    "/" with "-".

    Args:
        voxceleb_trial: path of the VoxCeleb trial file to read.
        trial: path of the Kaldi trial file to write. Missing parent
            directories are created.

    Raises:
        RuntimeError: if ``voxceleb_trial`` does not exist.
        ValueError: if a non-blank line does not have exactly three fields,
            or its label is not an integer.
    """
    print("Start convert the voxceleb trial to kaldi format")
    if not os.path.exists(voxceleb_trial):
        raise RuntimeError("{} does not exist. Pleas input the correct file path".format(voxceleb_trial))

    # os.mkdir cannot create nested directories (e.g. the default
    # "data/test") and fails on the empty dirname of a bare file name, so
    # use makedirs and only when there is a directory component at all.
    trial_dirname = os.path.dirname(trial)
    if trial_dirname:
        os.makedirs(trial_dirname, exist_ok=True)

    with codecs.open(voxceleb_trial, 'r', encoding='utf-8') as f, \
            codecs.open(trial, 'w', encoding='utf-8') as w:
        for line_no, line in enumerate(f, start=1):
            fields = line.split()
            if not fields:
                # Tolerate blank lines (commonly a trailing empty line).
                continue
            if len(fields) != 3:
                raise ValueError(
                    "{}:{}: expected 3 fields 'label path1 path2', got {}".format(
                        voxceleb_trial, line_no, len(fields)))
            target_or_nontarget, path1, path2 = fields

            utt_id1 = path1.replace("/", "-")
            utt_id2 = path2.replace("/", "-")
            # Any non-zero integer label marks a target trial.
            target = "target" if int(target_or_nontarget) else "nontarget"
            w.write("{} {} {}\n".format(utt_id1, utt_id2, target))
    print("Convert the voxceleb trial to kaldi format successfully")
|
||||
|
||||
if __name__ == "__main__":
    # Run the conversion with the paths taken from the parsed command line.
    main(voxceleb_trial=args.voxceleb_trial, trial=args.trial)
|
Loading…
Reference in new issue