From ec24a169ee483e6739f4af07a5d5a0c8cf5284a5 Mon Sep 17 00:00:00 2001
From: xiongxinlei <xiongxinlei@baidu.com>
Date: Wed, 30 Mar 2022 23:34:03 +0800
Subject: [PATCH] convert jsonfile to csv file

---
 examples/voxceleb/sv0/conf/ecapa_tdnn.yaml    |   3 +
 examples/voxceleb/sv0/local/data.sh           |  86 +++++++--
 .../sv0/local/make_csv_dataset_from_json.py   | 170 ++++++++++++++++++
 paddleaudio/paddleaudio/datasets/voxceleb.py  |   2 +-
 4 files changed, 241 insertions(+), 20 deletions(-)
 create mode 100644 examples/voxceleb/sv0/local/make_csv_dataset_from_json.py

diff --git a/examples/voxceleb/sv0/conf/ecapa_tdnn.yaml b/examples/voxceleb/sv0/conf/ecapa_tdnn.yaml
index e58dca82..bfe90ae7 100644
--- a/examples/voxceleb/sv0/conf/ecapa_tdnn.yaml
+++ b/examples/voxceleb/sv0/conf/ecapa_tdnn.yaml
@@ -8,7 +8,10 @@ batch_size: 16
 num_workers: 2
 num_speakers: 7205 # 1211 vox1, 5994 vox2, 7205 vox1+2, test speakers: 41
 shuffle: True
+split_ratio: 0.9
+chunk_duration: 3.0 # seconds
 random_chunk: True
+verification_file: data/vox1/veri_test2.txt
 
 ###########################################################
 #                FEATURE EXTRACTION SETTING               #
diff --git a/examples/voxceleb/sv0/local/data.sh b/examples/voxceleb/sv0/local/data.sh
index a3ff1c48..9ba6dc5c 100755
--- a/examples/voxceleb/sv0/local/data.sh
+++ b/examples/voxceleb/sv0/local/data.sh
@@ -12,8 +12,8 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-stage=1
-stop_stage=100
+stage=5
+stop_stage=5
 
 . ${MAIN_ROOT}/utils/parse_options.sh || exit -1;
 
@@ -30,29 +30,77 @@ dir=$1
 conf_path=$2
 mkdir -p ${dir}
 
-if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
-    # data prepare for vox1 and vox2, vox2 must be converted from m4a to wav
-    # we should use the local/convert.sh convert m4a to wav
-    python3 local/data_prepare.py \
-                        --data-dir ${dir} \
-                        --config ${conf_path}
-fi 
-
+# Generally the `MAIN_ROOT` refers to the root of PaddleSpeech,
+# which is defined in the path.sh
+# And we will download the voxceleb datasets into ${MAIN_ROOT}/dataset
 TARGET_DIR=${MAIN_ROOT}/dataset
 mkdir -p ${TARGET_DIR}
 
 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
-    # download data, generate manifests
+   # download data, generate manifests
+   echo "Start to download vox1 dataset and generate the manifest files "
     python3 ${TARGET_DIR}/voxceleb/voxceleb1.py \
       --manifest_prefix="data/vox1/manifest" \
       --target_dir="${TARGET_DIR}/voxceleb/vox1/"
 
-    if [ $? -ne 0 ]; then
-        echo "Prepare voxceleb failed. Terminated."
-        exit 1
-    fi
+   if [ $? -ne 0 ]; then
+      echo "Prepare voxceleb failed. Terminated."
+      exit 1
+   fi
+
+fi
+
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+   # download voxceleb2 data
+   echo "start to download vox2 dataset"
+   python3 ${TARGET_DIR}/voxceleb/voxceleb2.py \
+      --download \
+      --target_dir="${TARGET_DIR}/voxceleb/vox2/"
+
+   if [ $? -ne 0 ]; then
+      echo "Prepare voxceleb failed. Terminated."
+      exit 1
+   fi
+
+fi
+
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+   # convert the m4a to wav
+   echo "start to convert the m4a to wav"
+   bash local/convert.sh ${TARGET_DIR}/voxceleb/vox2/test/ || exit 1;
+   echo "m4a convert to wav operation finished"
+fi
+
+if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
+   # generate the vox2 manifest file 
+   echo "start generate the vox2 manifest files"
+   python3 ${TARGET_DIR}/voxceleb/voxceleb2.py \
+      --generate \
+      --manifest_prefix="data/vox2/manifest" \
+      --target_dir="${TARGET_DIR}/voxceleb/vox2/"
+
+   if [ $? -ne 0 ]; then
+      echo "Prepare voxceleb failed. Terminated."
+      exit 1
+   fi
+fi
+
+if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
+   # convert the json manifest files to csv files
+   echo "convert the json format to csv format to be compatible with training process"
+   python3 local/make_csv_dataset_from_json.py\
+      --train "data/vox1/manifest.dev" \
+      --test "data/vox1/manifest.test" \
+      --target_dir "data/vox/" \
+      --config ${conf_path}
+
+   if [ $? -ne 0 ]; then
+      echo "Prepare voxceleb failed. Terminated."
+      exit 1
+   fi
+fi
+
+
+
+
 
-   #  for dataset in train dev test; do
-   #      mv data/manifest.${dataset} data/manifest.${dataset}.raw
-   #  done
-fi
\ No newline at end of file
diff --git a/examples/voxceleb/sv0/local/make_csv_dataset_from_json.py b/examples/voxceleb/sv0/local/make_csv_dataset_from_json.py
new file mode 100644
index 00000000..42f3e4b8
--- /dev/null
+++ b/examples/voxceleb/sv0/local/make_csv_dataset_from_json.py
@@ -0,0 +1,170 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Convert the PaddleSpeech jsonline format to csv format
+Currently, the Speaker Identification training process needs csv format.
+"""
+import argparse
+import os
+import jsonlines
+import collections
+import json
+import csv
+from yacs.config import CfgNode
+import tqdm
+from paddleaudio import load as load_audio
+import random
+from paddlespeech.vector.training.seeding import seed_everything
+# voxceleb meta info for each training utterance segment
+# we extract a segment from an utterance to train
+# and the segment's period is between the start and stop time points in the original wav file
+# the fields of the meta are:
+# id: the utterance segment name
+# duration: utterance segment time
+# wav: utterance file path
+# start: start point in the original wav file
+# stop: stop point in the original wav file
+# spk_id: the utterance segment's speaker name
+meta_info = collections.namedtuple(
+        'META_INFO', ('id', 'duration', 'wav', 'start', 'stop', 'spk_id'))
+
+def get_chunks(seg_dur, audio_id, audio_duration):
+    """Return the ids "<audio_id>_<start>_<stop>" of every whole seg_dur-long chunk."""
+    # start/stop share the unit of seg_dur and audio_duration; prepare_csv multiplies
+    # them by the sample rate. A trailing remainder shorter than seg_dur is dropped.
+    starts = (idx * seg_dur for idx in range(int(audio_duration / seg_dur)))
+    names = [audio_id + "_" + str(beg) + "_" + str(beg + seg_dur) for beg in starts]
+    return names
+
+def prepare_csv(wav_files, output_file, config, split_chunks=True):
+    """Write one csv row (id, duration, wav, start, stop, spk_id) per segment.
+
+    wav_files: iterable of json manifest lines with 'utt', 'feat', 'feat_shape'.
+    output_file: csv path to write; its parent directory is created if missing.
+    config: object whose chunk_duration gives the chunk length (split_chunks only).
+    split_chunks: True cuts each utterance into chunks, False keeps it whole.
+    """
+    # exist_ok avoids the check-then-create race; skip when there is no parent dir
+    if os.path.dirname(output_file):
+        os.makedirs(os.path.dirname(output_file), exist_ok=True)
+    csv_lines = []
+    header = ["id", "duration", "wav", "start", "stop", "spk_id"]
+    for item in wav_files:
+        item = json.loads(item.strip())
+        audio_id = item['utt'].replace(".wav", "")
+        audio_duration, wav_file = item['feat_shape'][0], item['feat']
+        spk_id = audio_id.split('-')[0]
+        waveform, sr = load_audio(wav_file)
+        if not split_chunks:
+            csv_lines.append([audio_id, audio_duration, wav_file, 0, waveform.shape[0], spk_id])
+            continue
+        for chunk in get_chunks(config.chunk_duration, audio_id, audio_duration):
+            s, e = chunk.split("_")[-2:]  # start and end timestamps
+            # the row keeps the whole utterance duration; start/stop are sample indices
+            csv_lines.append([chunk, audio_duration, wav_file,
+                              int(float(s) * sr), int(float(e) * sr), spk_id])
+
+    # newline="" is required by the csv module, else "\r\n" becomes "\r\r\n" on Windows
+    with open(output_file, mode="w", newline="") as csv_f:
+        csv_writer = csv.writer(csv_f, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
+        csv_writer.writerows([header] + csv_lines)
+
+def get_enroll_test_list(filelist, verification_file):
+    """Pick the manifest lines whose 'utt' occurs in the verification trials.
+
+    Each trial line holds three space-separated fields; the first is ignored,
+    the other two are the enroll and test paths. A path becomes an utterance
+    id by replacing every '/' with '-'. Returns (enroll lines, test lines), sorted.
+    """
+    print(f"verification file: {verification_file}")
+    wanted_enroll, wanted_test = set(), set()
+    with open(verification_file, 'r') as trials:
+        for trial in trials:
+            _, enroll_path, test_path = trial.strip().split(' ')
+            wanted_enroll.add(enroll_path.replace('/', '-'))
+            wanted_test.add(test_path.replace('/', '-'))
+
+    picked_enroll, picked_test = [], []
+    for manifest in filelist:
+        with open(manifest, 'r') as records:
+            for record in records:
+                utt = json.loads(record.strip())['utt']
+                if utt in wanted_enroll:
+                    picked_enroll.append(record)
+                if utt in wanted_test:
+                    picked_test.append(record)
+    return sorted(picked_enroll), sorted(picked_test)
+
+def get_train_dev_list(filelist, target_dir, split_ratio):
+    """Shuffle all manifest lines and cut them into train and dev lists.
+
+    Also writes "<spk_id> <label>" lines to <target_dir>/meta/spk_id2label.txt,
+    labels being the ranks of the sorted 'utt2spk' values. The first
+    int(split_ratio * total) shuffled lines are train, the remainder dev.
+    """
+    meta_dir = os.path.join(target_dir, "meta")
+    if not os.path.exists(meta_dir):
+        os.makedirs(meta_dir)
+
+    records = []
+    for manifest in filelist:
+        with open(manifest, 'r') as fin:
+            records.extend(raw.strip() for raw in fin)
+    spk_ids = sorted({json.loads(record)['utt2spk'] for record in records})
+    with open(os.path.join(meta_dir, "spk_id2label.txt"), 'w') as fout:
+        fout.writelines(f'{spk} {idx}\n' for idx, spk in enumerate(spk_ids))
+
+    random.shuffle(records)
+    cut = int(split_ratio * len(records))
+    return records[:cut], records[cut:]
+    
+def prepare_data(args, config):
+    """Call seed_everything(config.seed), then write csv/{enroll,test,train,dev}.csv under args.target_dir."""
+    import paddle  # the module never imports paddle at top level, so the next line was a NameError
+    paddle.set_device("cpu")
+    seed_everything(config.seed)
+    csv_dir = os.path.join(args.target_dir, "csv")
+    enroll_files, test_files = get_enroll_test_list([args.test], verification_file=config.verification_file)
+    prepare_csv(enroll_files, os.path.join(csv_dir, "enroll.csv"), config, split_chunks=False)
+    prepare_csv(test_files, os.path.join(csv_dir, "test.csv"), config, split_chunks=False)
+    train_files, dev_files = get_train_dev_list(args.train, target_dir=args.target_dir, split_ratio=config.split_ratio)
+    prepare_csv(train_files, os.path.join(csv_dir, "train.csv"), config)
+    prepare_csv(dev_files, os.path.join(csv_dir, "dev.csv"), config)
+
+if __name__ == "__main__":
+    # every option is mandatory; --train accepts one or more manifest paths
+    cli = argparse.ArgumentParser(description=__doc__)
+    cli.add_argument(
+        "--train",
+        nargs='+',
+        required=True,
+        help="The jsonline files list for train")
+    cli.add_argument("--test", required=True, help="The jsonline file for test")
+    cli.add_argument(
+        "--target_dir",
+        required=True,
+        help="The target directory stores the csv files and meta file")
+    cli.add_argument(
+        "--config",
+        type=str,
+        default=None,
+        required=True,
+        help="configuration file")
+    args = cli.parse_args()
+    # merge the yaml file given by --config into an initially empty CfgNode
+    config = CfgNode(new_allowed=True)
+    if args.config:
+        config.merge_from_file(args.config)
+
+    prepare_data(args, config)
\ No newline at end of file
diff --git a/paddleaudio/paddleaudio/datasets/voxceleb.py b/paddleaudio/paddleaudio/datasets/voxceleb.py
index 3f72b5f2..07f44e0c 100644
--- a/paddleaudio/paddleaudio/datasets/voxceleb.py
+++ b/paddleaudio/paddleaudio/datasets/voxceleb.py
@@ -261,7 +261,7 @@ class VoxCeleb(Dataset):
                      output_file: str,
                      split_chunks: bool=True):
         print(f'Generating csv: {output_file}')
-        header = ["ID", "duration", "wav", "start", "stop", "spk_id"]
+        header = ["id", "duration", "wav", "start", "stop", "spk_id"]
         # Note: this may occurs c++ execption, but the program will execute fine
         # so we can ignore the execption 
         with Pool(cpu_count()) as p: