diff --git a/examples/voxceleb/README.md b/examples/voxceleb/README.md new file mode 100644 index 00000000..2c8ad138 --- /dev/null +++ b/examples/voxceleb/README.md @@ -0,0 +1,8 @@ + +For dataset information, refer to [VoxCeleb](https://www.robots.ox.ac.uk/~vgg/data/voxceleb/index.html#about) + +sv0 - speaker verification with a softmax backend etc.; all Python code. + For more information, refer to sv0/readme.txt + +sv1 - depends on Kaldi; speaker verification with a plda/sc backend. + For more information, refer to sv1/readme.txt diff --git a/examples/voxceleb/sv0/local/make_voxceleb_kaldi_trial.py b/examples/voxceleb/sv0/local/make_voxceleb_kaldi_trial.py new file mode 100644 index 00000000..55900516 --- /dev/null +++ b/examples/voxceleb/sv0/local/make_voxceleb_kaldi_trial.py @@ -0,0 +1,81 @@ +#!/usr/bin/python3 +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Make VoxCeleb1 trial of kaldi format +this script creat the test trial from kaldi trial voxceleb1_test_v2.txt or official trial veri_test2.txt +to kaldi trial format +""" + +import argparse +import codecs +import os + +parser = argparse.ArgumentParser(description=__doc__) +parser.add_argument("--voxceleb_trial", + default="voxceleb1_test_v2", + type=str, + help="VoxCeleb trial file. 
def _parse_args():
    """Build the command-line parser and parse ``sys.argv``.

    Parsing lives in a function (called only from the ``__main__`` guard) so
    that importing this module does not consume the importer's arguments.

    Returns:
        argparse.Namespace with ``voxceleb_trial`` and ``trial`` attributes.
    """
    import argparse

    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--voxceleb_trial",
        default="voxceleb1_test_v2",
        type=str,
        help="VoxCeleb trial file. "
        "Default we use the kaldi trial voxceleb1_test_v2.txt")
    parser.add_argument(
        "--trial",
        default="data/test/trial",
        type=str,
        help="Kaldi format trial file")
    return parser.parse_args()


def main(voxceleb_trial, trial):
    """Convert a VoxCeleb-format trial list into a Kaldi-format trial list.

    Input (VoxCeleb) format, one whitespace-separated trial per line::

        target_or_nontarget path1 path2

    ``target_or_nontarget`` is an integer: a non-zero value (``1``) marks a
    target trial, ``0`` a non-target trial. ``path1`` / ``path2`` look like
    ``spkr_id/rec_id/name``.

    Output (Kaldi) format, one trial per line::

        utt_id1 utt_id2 target_or_nontarget

    ``utt_id1`` / ``utt_id2`` are the input paths with every ``/`` replaced
    by ``-``; ``target_or_nontarget`` is the string ``target`` or
    ``nontarget``.

    Args:
        voxceleb_trial: path of the existing VoxCeleb trial file to read.
        trial: path of the Kaldi trial file to write. Missing parent
            directories are created; a bare file name is written to the
            current working directory.

    Raises:
        RuntimeError: if ``voxceleb_trial`` does not exist.
        ValueError: if a non-blank line does not have exactly three fields,
            or its first field is not an integer.
    """
    print("Start convert the voxceleb trial to kaldi format")
    if not os.path.exists(voxceleb_trial):
        raise RuntimeError("{} does not exist. Pleas input the correct file path".format(voxceleb_trial))

    trial_dirname = os.path.dirname(trial)
    # dirname is "" for a bare file name; makedirs also builds nested parents
    # (e.g. "data/test") that a single os.mkdir call cannot create.
    if trial_dirname:
        os.makedirs(trial_dirname, exist_ok=True)

    # newline='\n' keeps the written bytes identical on every platform.
    with open(voxceleb_trial, 'r', encoding='utf-8') as f, \
            open(trial, 'w', encoding='utf-8', newline='\n') as w:
        for line_no, line in enumerate(f, start=1):
            fields = line.split()
            if not fields:
                # Tolerate blank lines (commonly a trailing empty line).
                continue
            if len(fields) != 3:
                raise ValueError(
                    "{}:{}: expected 3 fields "
                    "'target_or_nontarget path1 path2', got {}: {!r}".format(
                        voxceleb_trial, line_no, len(fields), line))
            target_or_nontarget, path1, path2 = fields

            utt_id1 = path1.replace("/", "-")
            utt_id2 = path2.replace("/", "-")
            target = "target" if int(target_or_nontarget) else "nontarget"
            w.write("{} {} {}\n".format(utt_id1, utt_id2, target))
    print("Convert the voxceleb trial to kaldi format successfully")


if __name__ == "__main__":
    args = _parse_args()
    main(args.voxceleb_trial, args.trial)