Merge pull request #1417 from Honei/develop

[vector] voxceleb trial: convert voxceleb trial to kaldi format trial
4 years ago · 8eb708e754
parent 70ebbfd89f d7a09ff71c
commit 8eb708e754
2 changed files with 89 additions and 0 deletions
--- a/examples/voxceleb/README.md
+++ b/examples/voxceleb/README.md
@ -0,0 +1,8 @@
+
+For dataset information, refer to [VoxCeleb](https://www.robots.ox.ac.uk/~vgg/data/voxceleb/index.html#about).
+
+sv0 - speaker verification with softmax backend etc., all Python code;
+      for more info, refer to sv0/readme.txt
+
+sv1 - depends on Kaldi, speaker verification with plda/sc backend;
+      for more info, refer to sv1/readme.txt
--- a/examples/voxceleb/sv0/local/make_voxceleb_kaldi_trial.py
+++ b/examples/voxceleb/sv0/local/make_voxceleb_kaldi_trial.py
@ -0,0 +1,81 @@
+#!/usr/bin/python3
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Make VoxCeleb1 trial of kaldi format
+this script creat the test trial from kaldi trial voxceleb1_test_v2.txt or official trial veri_test2.txt 
+to kaldi trial format
+"""
+
+import argparse
+import codecs
+import os
+
+parser = argparse.ArgumentParser(description=__doc__)
+parser.add_argument("--voxceleb_trial",
+                    default="voxceleb1_test_v2",
+                    type=str,
+                    help="VoxCeleb trial file. Default we use the kaldi trial voxceleb1_test_v2.txt")
+parser.add_argument("--trial",
+                    default="data/test/trial",
+                    type=str,
+                    help="Kaldi format trial file")
+args = parser.parse_args()
+
+def main(voxceleb_trial, trial):
+    """
+        VoxCeleb provide several trial file, which format is different with kaldi format.
+
+        VoxCeleb format's meaning is as following:
+        --------------------------------
+        target_or_nontarget path1 path2
+        --------------------------------
+        target_or_nontarget is an integer: 1 target                 path1 is equal to path2
+                                           0 nontarget              path1 is unequal to path2    
+        path1: spkr_id/rec_id/name
+        path2: spkr_id/rec_id/name
+
+        Kaldi format's meaning is as following:
+        ---------------------------------------
+        utt_id1 utt_id2 target_or_nontarget
+        ---------------------------------------
+        utt_id1: utterance identification or speaker identification
+        utt_id2: utterance identification or speaker identification
+        target_or_nontarget is an string: 'target' utt_id1 is equal to  utt_id2
+                                        'nontarget' utt_id2 is unequal to utt_id2
+    """
+    print("Start convert the voxceleb trial to kaldi format")
+    if not os.path.exists(voxceleb_trial):
+        raise RuntimeError("{} does not exist. Pleas input the correct file path".format(voxceleb_trial))
+
+    trial_dirname = os.path.dirname(trial)
+    if not os.path.exists(trial_dirname):
+        os.mkdir(trial_dirname)
+
+    with codecs.open(voxceleb_trial, 'r', encoding='utf-8') as f, \
+         codecs.open(trial, 'w', encoding='utf-8') as w:
+         for line in f:
+            target_or_nontarget, path1, path2 = line.strip().split()
+             
+            utt_id1 = "-".join(path1.split("/"))
+            utt_id2 = "-".join(path2.split("/"))
+            target = "nontarget"
+            if int(target_or_nontarget):
+                target = "target"
+            w.write("{} {} {}\n".format(utt_id1, utt_id2, target))
+    print("Convert the voxceleb trial to kaldi format successfully")
+
+if __name__ == "__main__":
+    main(args.voxceleb_trial, args.trial)