From 57c11dcab01df60f31314f01bde990be204d7324 Mon Sep 17 00:00:00 2001
From: xiongxinlei
Date: Sat, 2 Apr 2022 23:05:45 +0800
Subject: [PATCH] add some annotations, test=doc

---
 .../local/make_vox_csv_dataset_from_json.py |  2 +-
 paddlespeech/vector/io/dataset.py           | 55 ++++++++++++++++---
 2 files changed, 48 insertions(+), 9 deletions(-)

diff --git a/examples/voxceleb/sv0/local/make_vox_csv_dataset_from_json.py b/examples/voxceleb/sv0/local/make_vox_csv_dataset_from_json.py
index 576a3c8b..6c33aba5 100644
--- a/examples/voxceleb/sv0/local/make_vox_csv_dataset_from_json.py
+++ b/examples/voxceleb/sv0/local/make_vox_csv_dataset_from_json.py
@@ -53,7 +53,7 @@ def prepare_csv(wav_files, output_file, config, split_chunks=True):
     # wav: utterance file path
     # start: start point in the original wav file
     # stop: stop point in the original wav file
-    # spk_id: the utterance segment's speaker name
+    # lab_id: the utterance segment's speaker name
     for item in tqdm.tqdm(wav_files, total=len(wav_files)):
         item = json.loads(item.strip())
         audio_id = item['utt'].replace(".wav", "")
diff --git a/paddlespeech/vector/io/dataset.py b/paddlespeech/vector/io/dataset.py
index ea2106cd..e70c8d3c 100644
--- a/paddlespeech/vector/io/dataset.py
+++ b/paddlespeech/vector/io/dataset.py
@@ -30,6 +30,16 @@ logger = Log(__name__).getlog()
 
 @dataclass
 class meta_info:
+    """The audio meta info in the vector CSVDataset.
+
+    Args:
+        utt_id (str): the utterance segment name
+        duration (float): the utterance segment duration
+        wav (str): utterance file path
+        start (int): start point in the original wav file
+        stop (int): stop point in the original wav file
+        lab_id (str): the utterance segment's label id
+    """
     utt_id: str
     duration: float
    wav: str
@@ -39,18 +49,30 @@ class meta_info:
 
 
 class CSVDataset(Dataset):
-    # meta_info = collections.namedtuple(
-    #     'META_INFO', ('id', 'duration', 'wav', 'start', 'stop', 'spk_id'))
-
     def __init__(self, csv_path, spk_id2label_path=None, config=None):
+        """Implement the CSV Dataset.
+
+        Args:
+            csv_path (str): csv dataset file path
+            spk_id2label_path (str): file path of the utterance label to integer id map
+            config (CfgNode): yaml config
+        """
         super().__init__()
         self.csv_path = csv_path
         self.spk_id2label_path = spk_id2label_path
         self.config = config
+        self.spk_id2label = {}
+        self.label2spk_id = {}
         self.data = self.load_data_csv()
-        self.spk_id2label = self.load_speaker_to_label()
+        self.load_speaker_to_label()
 
     def load_data_csv(self):
+        """Load the csv dataset content and store it in the data property.
+        The csv dataset has six fields:
+        utt_id (or audio_id), audio duration, wav file path,
+        segment start point, segment stop point and utterance label.
+        Note: for training, each utterance label must have a mapping to an integer id in spk_id2label_path.
+        """
         data = []
 
         with open(self.csv_path, 'r') as rf:
@@ -64,18 +86,28 @@ class CSVDataset(Dataset):
         return data
 
     def load_speaker_to_label(self):
+        """Load the utterance label map content.
+        In the vector domain, we call the utterance label the speaker label.
+        It is the real speaker label in speaker verification,
+        and the language label in language identification.
+        """
         if not self.spk_id2label_path:
             logger.warning("No speaker id to label file")
             return
-        spk_id2label = {}
+        self.spk_id2label = {}
+        self.label2spk_id = {}
         with open(self.spk_id2label_path, 'r') as f:
             for line in f.readlines():
                 spk_id, label = line.strip().split(' ')
-                spk_id2label[spk_id] = int(label)
-
-        return spk_id2label
+                self.spk_id2label[spk_id] = int(label)
+                self.label2spk_id[int(label)] = spk_id
 
     def convert_to_record(self, idx: int):
+        """Convert the dataset sample at the given index to a training record.
+
+        Args:
+            idx (int): the requested index in the whole dataset
+        """
         sample = self.data[idx]
 
         record = {}
@@ -104,7 +136,14 @@ class CSVDataset(Dataset):
         return record
 
     def __getitem__(self, idx):
+        """Return the sample at the specific index.
+
+        Args:
+            idx (int): the requested index in the whole dataset
+        """
         return self.convert_to_record(idx)
 
     def __len__(self):
+        """Return the dataset length.
+        """
         return len(self.data)
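
A minimal usage sketch of the annotated CSVDataset (the csv and spk_id2label
file paths below are placeholders, assumed to have been produced by the
VoxCeleb data preparation scripts):

    from paddlespeech.vector.io.dataset import CSVDataset

    # Build the training dataset from a prepared csv file and the
    # utterance-label -> integer-id map file.
    train_dataset = CSVDataset(
        csv_path="data/vox/csv/train.csv",
        spk_id2label_path="data/vox/meta/spk_id2label.txt")

    print(len(train_dataset))  # number of utterance segments
    record = train_dataset[0]  # dict built by convert_to_record()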