From 16108de71ef00634e70665542929cba8cd3317d2 Mon Sep 17 00:00:00 2001 From: xiongxinlei Date: Thu, 24 Feb 2022 22:14:05 +0800 Subject: [PATCH 01/41] add voxceleb1 dataset prepare process --- examples/voxceleb/sv0/local/data.sh | 25 +++++++++++++++++++++++++ examples/voxceleb/sv0/run.sh | 14 ++++++++++++++ examples/voxceleb/sv0/utils | 1 + 3 files changed, 40 insertions(+) create mode 100755 examples/voxceleb/sv0/local/data.sh create mode 100755 examples/voxceleb/sv0/run.sh create mode 120000 examples/voxceleb/sv0/utils diff --git a/examples/voxceleb/sv0/local/data.sh b/examples/voxceleb/sv0/local/data.sh new file mode 100755 index 00000000..0c2d008d --- /dev/null +++ b/examples/voxceleb/sv0/local/data.sh @@ -0,0 +1,25 @@ +stage=-1 +stop_stage=100 +TARGET_DIR=${MAIN_ROOT}/dataset + +. utils/parse_options.sh || exit -1; + +src=$1 +mkdir -p data/{dev,test} +if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then + # download data, generate manifests + # create data/{dev,test} directory to store the manifest files + /home/users/xiongxinlei/.conda/envs/xxl_base/bin/python3 ${TARGET_DIR}/voxceleb/voxceleb1.py \ + --manifest_prefix="data/manifest" \ + --target_dir="${src}" + + if [ $? -ne 0 ]; then + echo "Prepare Voxceleb failed. Terminated." + exit 1 + fi + mv data/manifest.dev data/dev + mv data/voxceleb1.dev.meta data/dev + + mv data/manifest.test data/test + mv data/voxceleb1.test.meta data/test +fi \ No newline at end of file diff --git a/examples/voxceleb/sv0/run.sh b/examples/voxceleb/sv0/run.sh new file mode 100755 index 00000000..281f7b40 --- /dev/null +++ b/examples/voxceleb/sv0/run.sh @@ -0,0 +1,14 @@ +#!/bin/bash +set -e + +stage=0 +voxceleb1_root=/mnt/dataset_12/sv/voxCeleb1_v2/ + +if [ $stage -le 0 ]; then + echo "======================================================================================================" + echo "=========================== Stage 0: Prepare the VoxCeleb1 dataset ===================================" + echo "======================================================================================================" + # prepare the data elapsed about 20s + # the script will create the data/{dev,test} + local/data.sh ${voxceleb1_root}|| exit 1; +fi diff --git a/examples/voxceleb/sv0/utils b/examples/voxceleb/sv0/utils new file mode 120000 index 00000000..256f914a --- /dev/null +++ b/examples/voxceleb/sv0/utils @@ -0,0 +1 @@ +../../../utils/ \ No newline at end of file From 35b7968ed10525a42bb8b1f3d82d6b203801f982 Mon Sep 17 00:00:00 2001 From: xiongxinlei Date: Fri, 25 Feb 2022 19:38:22 +0800 Subject: [PATCH 02/41] remove invalid directory --- examples/voxceleb/sv0/local/data.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/voxceleb/sv0/local/data.sh b/examples/voxceleb/sv0/local/data.sh index 0c2d008d..6df9c3b8 100755 --- a/examples/voxceleb/sv0/local/data.sh +++ b/examples/voxceleb/sv0/local/data.sh @@ -9,7 +9,7 @@ mkdir -p data/{dev,test} if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then # download data, generate manifests # create data/{dev,test} directory to store the manifest files - /home/users/xiongxinlei/.conda/envs/xxl_base/bin/python3 ${TARGET_DIR}/voxceleb/voxceleb1.py \ + python3 ${TARGET_DIR}/voxceleb/voxceleb1.py \ --manifest_prefix="data/manifest" \ --target_dir="${src}" @@ -22,4 +22,4 @@ if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then mv data/manifest.test data/test mv data/voxceleb1.test.meta data/test -fi \ No newline at end of file +fi From 6f7e9656febf8e399ef09b749da7641bee438dc5 Mon Sep 17 
00:00:00 2001
From: xiongxinlei
Date: Fri, 25 Feb 2022 20:05:25 +0800
Subject: [PATCH 03/41] add kaldi feats ark dataset

---
 paddlespeech/vector/datasets/dataset.py | 143 ++++++++++++++++++++++++
 paddlespeech/vector/utils/data_utils.py | 125 +++++++++++++++++++++
 paddlespeech/vector/utils/utils.py      | 132 ++++++++++++++++++++++
 3 files changed, 400 insertions(+)
 create mode 100644 paddlespeech/vector/datasets/dataset.py
 create mode 100644 paddlespeech/vector/utils/data_utils.py
 create mode 100644 paddlespeech/vector/utils/utils.py

diff --git a/paddlespeech/vector/datasets/dataset.py b/paddlespeech/vector/datasets/dataset.py
new file mode 100644
index 00000000..e7030053
--- /dev/null
+++ b/paddlespeech/vector/datasets/dataset.py
@@ -0,0 +1,143 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import sys
+import random
+import numpy as np
+import kaldi_python_io as k_io
+from paddle.io import Dataset
+from paddlespeech.vector.utils.data_utils import batch_pad_right
+import paddlespeech.vector.utils as utils
+from paddlespeech.vector.utils.utils import read_map_file
+
+def ark_collate_fn(batch):
+    """
+    Custom collate function for the kaldi feats dataset
+
+    Args:
+        batch: a list of (inputs, labels) pairs produced by the dataset
+
+    Returns:
+        a tuple of (padded data, relative lengths, targets) numpy arrays
+    """
+
+    data = []
+    target = []
+    for items in batch:
+        for x, y in zip(items[0], items[1]):
+            data.append(np.array(x))
+            target.append(y)
+
+    data, lengths = batch_pad_right(data)
+    return np.array(data, dtype=np.float32), \
+           np.array(lengths, dtype=np.float32), \
+           np.array(target, dtype=np.long).reshape((len(target), 1))
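+
+# For example (shapes are illustrative only, not from the original patch):
+# collating two dataset items whose feature chunks are (80, 200) and (80, 150)
+# yields data of shape (2, 80, 200), lengths of [1.0, 0.75] and a target of
+# shape (2, 1).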
+ """ + return self._collate_fn + + def _random_chunk(self, length): + chunk_size = random.randint(self.min_chunk_size, self.max_chunk_size) + if chunk_size >= length: + return 0, length + start = random.randint(0, length - chunk_size) + end = start + chunk_size + + return start, end + + def _select_by_speaker(self, index): + if self.scp_reader is None or not self.utt_info: + return [] + index = index % (len(self.utt_info)) + inputs = [] + labels = [] + item_size = random.randint(self.min_item_size, self.max_item_size) + for loop_idx in range(item_size): + try: + utt_index = random.randint(0, len(self.utt_info[index][1])) \ + % len(self.utt_info[index][1]) + key = self.utt_info[index][1][utt_index] + except: + print(index, utt_index, len(self.utt_info[index][1])) + sys.exit(-1) + x = self.scp_reader[key] + x = np.transpose(x) + bg, end = self._random_chunk(x.shape[-1]) + inputs.append(x[:, bg: end]) + labels.append(self.utt_info[index][0]) + return inputs, labels + + def _select_by_utt(self, index): + if self.scp_reader is None or len(self.utt_info) == 0: + return {} + index = index % (len(self.utt_info)) + key = self.utt_info[index][0] + x = self.scp_reader[key] + x = np.transpose(x) + bg, end = self._random_chunk(x.shape[-1]) + + y = self.utt_info[index][1] + + return [x[:, bg: end]], [y] + + def __getitem__(self, index): + if self._is_select_by_speaker: + return self._select_by_speaker(index) + else: + return self._select_by_utt(index) + + def __len__(self): + return len(self.utt_info) * self.repeat + + def __iter__(self): + self._start = 0 + return self + + def __next__(self): + if self._start < len(self): + ret = self[self._start] + self._start += 1 + return ret + else: + raise StopIteration + +return KaldiArkDataset diff --git a/paddlespeech/vector/utils/data_utils.py b/paddlespeech/vector/utils/data_utils.py new file mode 100644 index 00000000..4a33a795 --- /dev/null +++ b/paddlespeech/vector/utils/data_utils.py @@ -0,0 +1,125 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +data utilities +""" +import os +import sys +import numpy +import paddle + + +def pad_right_to(array, target_shape, mode="constant", value=0): + """ + This function takes a numpy array of arbitrary shape and pads it to target + shape by appending values on the right. + + Args: + array: input numpy array. Input array whose dimension we need to pad. + target_shape : (list, tuple). Target shape we want for the target array its len must be equal to array.ndim + mode : str. Pad mode, please refer to numpy.pad documentation. + value : float. Pad value, please refer to numpy.pad documentation. + + Returns: + array: numpy.array. Padded array. + valid_vals : list. List containing proportion for each dimension of original, non-padded values. + """ + assert len(target_shape) == array.ndim + pads = [] # this contains the abs length of the padding for each dimension. + valid_vals = [] # thic contains the relative lengths for each dimension. 
diff --git a/paddlespeech/vector/utils/data_utils.py b/paddlespeech/vector/utils/data_utils.py
new file mode 100644
index 00000000..4a33a795
--- /dev/null
+++ b/paddlespeech/vector/utils/data_utils.py
@@ -0,0 +1,125 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+data utilities
+"""
+import os
+import sys
+import numpy
+import paddle
+
+
+def pad_right_to(array, target_shape, mode="constant", value=0):
+    """
+    This function takes a numpy array of arbitrary shape and pads it to the
+    target shape by appending values on the right.
+
+    Args:
+        array: input numpy array whose dimensions we need to pad.
+        target_shape: (list, tuple). Target shape; its length must equal array.ndim.
+        mode: str. Pad mode, please refer to the numpy.pad documentation.
+        value: float. Pad value, please refer to the numpy.pad documentation.
+
+    Returns:
+        array: numpy.array. Padded array.
+        valid_vals: list. Proportion of original, non-padded values per dimension.
+    """
+    assert len(target_shape) == array.ndim
+    pads = []  # this contains the absolute length of the padding for each dimension.
+    valid_vals = []  # this contains the relative lengths for each dimension.
+    i = 0  # iterating over target_shape ndims
+    while i < len(target_shape):
+        assert (
+            target_shape[i] >= array.shape[i]
+        ), "Target shape must be >= original shape for every dim"
+        pads.append([0, target_shape[i] - array.shape[i]])
+        valid_vals.append(array.shape[i] / target_shape[i])
+        i += 1
+
+    array = numpy.pad(array, pads, mode=mode, constant_values=value)
+
+    return array, valid_vals
+
+
+def batch_pad_right(arrays, mode="constant", value=0):
+    """Given a list of numpy arrays it batches them together by padding to the right
+    on each dimension in order to get the same length for all.
+
+    Args:
+        arrays: list. List of arrays we wish to pad together.
+        mode: str. Padding mode, see the numpy.pad documentation.
+        value: float. Padding value, see the numpy.pad documentation.
+
+    Returns:
+        array: numpy.array. Padded array.
+        valid_vals: list. Proportion of original, non-padded values per dimension.
+    """
+
+    if not len(arrays):
+        raise IndexError("arrays list must not be empty")
+
+    if len(arrays) == 1:
+        # if there is only one array in the batch we simply unsqueeze it.
+        return numpy.expand_dims(arrays[0], axis=0), numpy.array([1.0])
+
+    if not (
+        all(
+            [arrays[i].ndim == arrays[0].ndim for i in range(1, len(arrays))]
+        )
+    ):
+        raise IndexError("All arrays must have same number of dimensions")
+
+    # FIXME we limit the support here: we allow padding of only the last dimension
+    # need to remove this when feat extraction is updated to handle multichannel.
+    max_shape = []
+    for dim in range(arrays[0].ndim):
+        if dim != (arrays[0].ndim - 1):
+            if not all(
+                [x.shape[dim] == arrays[0].shape[dim] for x in arrays[1:]]
+            ):
+                raise EnvironmentError(
+                    "arrays should have same dimensions except for last one"
+                )
+        max_shape.append(max([x.shape[dim] for x in arrays]))
+
+    batched = []
+    valid = []
+    for t in arrays:
+        # for each array we apply pad_right_to
+        padded, valid_percent = pad_right_to(
+            t, max_shape, mode=mode, value=value
+        )
+        batched.append(padded)
+        valid.append(valid_percent[-1])
+
+    batched = numpy.stack(batched)
+
+    return batched, numpy.array(valid)
+
+
+def length_to_mask(length, max_len=None, dtype=None):
+    """Creates a binary mask for each sequence.
+    """
+    assert len(length.shape) == 1
+
+    if max_len is None:
+        max_len = paddle.cast(paddle.max(length), dtype="int64")
+    # using arange to generate the mask
+    mask = paddle.arange(max_len, dtype=length.dtype).expand([paddle.shape(length)[0], max_len]) < length.unsqueeze(1)
+
+    if dtype is None:
+        dtype = length.dtype
+
+    mask = paddle.cast(mask, dtype=dtype)
+    return mask
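+
+
+if __name__ == "__main__":
+    # Quick self-check sketch for the helpers above (illustrative values only):
+    feats = [numpy.ones((80, 200)), numpy.ones((80, 150))]
+    batched, valid = batch_pad_right(feats)
+    print(batched.shape)  # (2, 80, 200)
+    print(valid)  # [1.0, 0.75] -- proportion of real frames per item
+    lengths = paddle.to_tensor([3, 5])
+    print(length_to_mask(lengths, max_len=5))  # rows with 3 and 5 ones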
+ +""" +utilities +""" +import os +import sys +import paddle +import numpy as np + +from sidt import _logger as log + + +def exit_if_not_exist(in_path): + """ + Check the existence of a file or directory, if not exit, exit the program. + + Args: + in_path: input dicrector + """ + if not is_exist(in_path): + sys.exit(-1) + + +def is_exist(in_path): + """ + Check the existence of a file or directory + + Args: + in_path: input dicrector + + Returns: + True or False + """ + if not os.path.exists(in_path): + log.error("No such file or directory: %s" % (in_path)) + return False + + return True + + +def get_latest_file(target_dir): + """ + Get the latest file in target directory + + Args: + target_dir: target directory + + Returns: + latest_file: a string or None + """ + items = os.listdir(target_dir) + items.sort(key=lambda fn: os.path.getmtime(os.path.join(target_dir, fn)) \ + if not os.path.isdir(os.path.join(target_dir, fn)) else 0) + latest_file = None if not items else os.path.join(target_dir, items[-1]) + return latest_file + + +def avg_models(models): + """ + merge multiple models + """ + checkpoint_dict = paddle.load(models[0]) + final_state_dict = checkpoint_dict + + if len(models) > 1: + for model in models[1:]: + checkpoint_dict = paddle.load(model) + for k, v in checkpoint_dict.items(): + final_state_dict[k] += v + for k in final_state_dict.keys(): + final_state_dict[k] /= float(len(models)) + if np.any(np.isnan(final_state_dict[k])): + print("Nan in %s" % (k)) + + return final_state_dict + +def Q_from_tokens(token_num): + """ + get prior model, data from uniform, would support others(guassian) in future + """ + freq = [1] * token_num + Q = paddle.to_tensor(freq, dtype = 'float64') + return Q / Q.sum() + + +def read_map_file(map_file, key_func=None, value_func=None, values_func=None): + """ Read map file. First colume is key, the rest columes are values. 
From d7da629302d40e7f8b1e2d488e40a369b556acfe Mon Sep 17 00:00:00 2001
From: xiongxinlei
Date: Sat, 26 Feb 2022 15:27:51 +0800
Subject: [PATCH 04/41] add kaldi feats egs dataset

---
 paddlespeech/vector/__init__.py             |  31 +++++
 paddlespeech/vector/datasets/ark_dataset.py | 142 ++++++++++++++++++++
 paddlespeech/vector/datasets/egs_dataset.py |  91 +++++++++++++
 paddlespeech/vector/utils/data_utils.py     |   0
 paddlespeech/vector/utils/utils.py          |   2 +-
 5 files changed, 265 insertions(+), 1 deletion(-)
 create mode 100755 paddlespeech/vector/datasets/ark_dataset.py
 create mode 100644 paddlespeech/vector/datasets/egs_dataset.py
 mode change 100644 => 100755 paddlespeech/vector/utils/data_utils.py
 mode change 100644 => 100755 paddlespeech/vector/utils/utils.py

diff --git a/paddlespeech/vector/__init__.py b/paddlespeech/vector/__init__.py
index 185a92b8..2a3588ec 100644
--- a/paddlespeech/vector/__init__.py
+++ b/paddlespeech/vector/__init__.py
@@ -11,3 +11,34 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
+"""
+__init__ file for sidt package.
+"""
+
+import logging as sidt_logging
+import colorlog
+
+LOG_COLOR_CONFIG = {
+    'DEBUG': 'white',
+    'INFO': 'white',
+    'WARNING': 'yellow',
+    'ERROR': 'red',
+    'CRITICAL': 'purple',
+}
+
+# set up the global logger
+colored_formatter = colorlog.ColoredFormatter(
+    '%(log_color)s [%(levelname)s] [%(asctime)s] [%(filename)s:%(lineno)d] - %(message)s',
+    datefmt="%Y-%m-%d %H:%M:%S",
+    log_colors=LOG_COLOR_CONFIG)  # log output format
+_logger = sidt_logging.getLogger("sidt")
+handler = colorlog.StreamHandler()
+handler.setLevel(sidt_logging.INFO)
+handler.setFormatter(colored_formatter)
+_logger.addHandler(handler)
+_logger.setLevel(sidt_logging.INFO)
+
+from .trainer.trainer import Trainer
+from .dataset.ark_dataset import create_kaldi_ark_dataset
+from .dataset.egs_dataset import create_kaldi_egs_dataset
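+
+# Example (illustrative): downstream modules obtain this logger via
+#     from paddlespeech.vector import _logger as log
+#     log.info("message printed with the colorized format above")
+# which is how ark_dataset.py and egs_dataset.py below report progress.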
diff --git a/paddlespeech/vector/datasets/ark_dataset.py b/paddlespeech/vector/datasets/ark_dataset.py
new file mode 100755
index 00000000..7a00e7ba
--- /dev/null
+++ b/paddlespeech/vector/datasets/ark_dataset.py
@@ -0,0 +1,142 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import sys
+import random
+import numpy as np
+import kaldi_python_io as k_io
+from paddle.io import Dataset
+from paddlespeech.vector.utils.data_utils import batch_pad_right
+import paddlespeech.vector.utils as utils
+from paddlespeech.vector.utils.utils import read_map_file
+from paddlespeech.vector import _logger as log
+
+def ark_collate_fn(batch):
+    """
+    Custom collate function for the kaldi feats dataset
+
+    Args:
+        batch: a list of (inputs, labels) pairs produced by the dataset
+
+    Returns:
+        a tuple of (padded data, relative lengths, targets) numpy arrays
+    """
+
+    data = []
+    target = []
+    for items in batch:
+        for x, y in zip(items[0], items[1]):
+            data.append(np.array(x))
+            target.append(y)
+
+    data, lengths = batch_pad_right(data)
+    return np.array(data, dtype=np.float32), \
+           np.array(lengths, dtype=np.float32), \
+           np.array(target, dtype=np.long).reshape((len(target), 1))
+
+
+class KaldiArkDataset(Dataset):
+    """
+    Dataset used to load kaldi ark/scp files.
+    """
+    def __init__(self, scp_file, label2utt, min_item_size=1,
+                 max_item_size=1, repeat=50, min_chunk_size=200,
+                 max_chunk_size=400, select_by_speaker=True):
+        self.scp_file = scp_file
+        self.scp_reader = None
+        self.repeat = repeat
+        self.min_item_size = min_item_size
+        self.max_item_size = max_item_size
+        self.min_chunk_size = min_chunk_size
+        self.max_chunk_size = max_chunk_size
+        self._collate_fn = ark_collate_fn
+        self._is_select_by_speaker = select_by_speaker
+        if utils.is_exist(self.scp_file):
+            self.scp_reader = k_io.ScriptReader(self.scp_file)
+
+        label2utts, utt2label = read_map_file(label2utt, key_func=int)
+        self.utt_info = list(label2utts.items()) if self._is_select_by_speaker else list(utt2label.items())
+
+    @property
+    def collate_fn(self):
+        """
+        Return the collate function.
+ """ + return self._collate_fn + + def _random_chunk(self, length): + chunk_size = random.randint(self.min_chunk_size, self.max_chunk_size) + if chunk_size >= length: + return 0, length + start = random.randint(0, length - chunk_size) + end = start + chunk_size + + return start, end + + def _select_by_speaker(self, index): + if self.scp_reader is None or not self.utt_info: + return [] + index = index % (len(self.utt_info)) + inputs = [] + labels = [] + item_size = random.randint(self.min_item_size, self.max_item_size) + for loop_idx in range(item_size): + try: + utt_index = random.randint(0, len(self.utt_info[index][1])) \ + % len(self.utt_info[index][1]) + key = self.utt_info[index][1][utt_index] + except: + print(index, utt_index, len(self.utt_info[index][1])) + sys.exit(-1) + x = self.scp_reader[key] + x = np.transpose(x) + bg, end = self._random_chunk(x.shape[-1]) + inputs.append(x[:, bg: end]) + labels.append(self.utt_info[index][0]) + return inputs, labels + + def _select_by_utt(self, index): + if self.scp_reader is None or len(self.utt_info) == 0: + return {} + index = index % (len(self.utt_info)) + key = self.utt_info[index][0] + x = self.scp_reader[key] + x = np.transpose(x) + bg, end = self._random_chunk(x.shape[-1]) + + y = self.utt_info[index][1] + + return [x[:, bg: end]], [y] + + def __getitem__(self, index): + if self._is_select_by_speaker: + return self._select_by_speaker(index) + else: + return self._select_by_utt(index) + + def __len__(self): + return len(self.utt_info) * self.repeat + + def __iter__(self): + self._start = 0 + return self + + def __next__(self): + if self._start < len(self): + ret = self[self._start] + self._start += 1 + return ret + else: + raise StopIteration diff --git a/paddlespeech/vector/datasets/egs_dataset.py b/paddlespeech/vector/datasets/egs_dataset.py new file mode 100644 index 00000000..53130d5f --- /dev/null +++ b/paddlespeech/vector/datasets/egs_dataset.py @@ -0,0 +1,91 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Load nnet3 training egs which generated by kaldi +""" + +import random +import numpy as np +import kaldi_python_io as k_io +from paddle.io import Dataset +import paddlespeech.vector.utils.utils as utils +from paddlespeech.vector import _logger as log +class KaldiEgsDataset(Dataset): + """ + Dataset used to load kaldi nnet3 egs files. 
+ """ + def __init__(self, egs_list_file, egs_idx, transforms=None): + self.scp_reader = None + self.subset_idx = egs_idx - 1 + self.transforms = transforms + if not utils.is_exist(egs_list_file): + return + + self.egs_files = [] + with open(egs_list_file, 'r') as in_fh: + for line in in_fh: + if line.strip(): + self.egs_files.append(line.strip()) + + self.next_subset() + + def next_subset(self, target_index=None, delta_index=None): + """ + Use next specific subset + + Args: + target_index: target egs index + delta_index: incremental value of egs index + """ + if self.egs_files: + if target_index: + self.subset_idx = target_index + else: + delta_index = delta_index if delta_index else 1 + self.subset_idx += delta_index + log.info("egs dataset subset index: %d" % (self.subset_idx)) + egs_file = self.egs_files[self.subset_idx % len(self.egs_files)] + if utils.is_exist(egs_file): + self.scp_reader = k_io.Nnet3EgsScriptReader(egs_file) + else: + log.warning("No such file or directory: %s" % (egs_file)) + + def __getitem__(self, index): + if self.scp_reader is None: + return {} + index %= len(self) + in_dict, out_dict = self.scp_reader[index] + x = np.array(in_dict['matrix']) + x = np.transpose(x) + y = np.array(out_dict['matrix'][0][0][0], dtype=np.int).reshape((1,)) + if self.transforms is not None: + idx = random.randint(0, len(self.transforms) - 1) + x = self.transforms[idx](x) + return x, y + + def __len__(self): + return len(self.scp_reader) + + def __iter__(self): + self._start = 0 + return self + + def __next__(self): + if self._start < len(self): + ret = self[self._start] + self._start += 1 + return ret + else: + raise StopIteration \ No newline at end of file diff --git a/paddlespeech/vector/utils/data_utils.py b/paddlespeech/vector/utils/data_utils.py old mode 100644 new mode 100755 diff --git a/paddlespeech/vector/utils/utils.py b/paddlespeech/vector/utils/utils.py old mode 100644 new mode 100755 index c46e42c2..a28cb526 --- a/paddlespeech/vector/utils/utils.py +++ b/paddlespeech/vector/utils/utils.py @@ -20,7 +20,7 @@ import sys import paddle import numpy as np -from sidt import _logger as log +from paddlespeech.vector import _logger as log def exit_if_not_exist(in_path): From 70d3b01c0dc76a70dfe2a93ab6184fb1a69757d9 Mon Sep 17 00:00:00 2001 From: xiongxinlei Date: Sat, 26 Feb 2022 17:35:03 +0800 Subject: [PATCH 05/41] remove invalid code --- paddlespeech/vector/__init__.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/paddlespeech/vector/__init__.py b/paddlespeech/vector/__init__.py index 2a3588ec..5c846193 100644 --- a/paddlespeech/vector/__init__.py +++ b/paddlespeech/vector/__init__.py @@ -39,6 +39,3 @@ handler.setFormatter(colored_formatter) _logger.addHandler(handler) _logger.setLevel(sidt_logging.INFO) -from .trainer.trainer import Trainer -from .dataset.ark_dataset import create_kaldi_ark_dataset -from .dataset.egs_dataset import create_kaldi_egs_dataset From 7ef60ebae2e4c14c9f2bd1954c420987a5d8369a Mon Sep 17 00:00:00 2001 From: xiongxinlei Date: Wed, 2 Mar 2022 20:36:39 +0800 Subject: [PATCH 06/41] add voxceleb1 data prepare --- dataset/voxceleb/voxceleb1.py | 463 +++++++++++++++++---------- examples/voxceleb/sv0/local/train.py | 31 ++ examples/voxceleb/sv0/path.sh | 11 + examples/voxceleb/sv0/run.sh | 10 + 4 files changed, 342 insertions(+), 173 deletions(-) create mode 100644 examples/voxceleb/sv0/local/train.py create mode 100755 examples/voxceleb/sv0/path.sh create mode 100755 examples/voxceleb/sv0/run.sh diff --git a/dataset/voxceleb/voxceleb1.py 
b/dataset/voxceleb/voxceleb1.py index e50c91bc..0c9c68dc 100644 --- a/dataset/voxceleb/voxceleb1.py +++ b/dataset/voxceleb/voxceleb1.py @@ -11,182 +11,299 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""Prepare VoxCeleb1 dataset - -create manifest files. -Manifest file is a json-format file with each line containing the -meta data (i.e. audio filepath, transcript and audio duration) -of each audio file in the data set. - -researchers should download the voxceleb1 dataset yourselves -through google form to get the username & password and unpack the data -""" -import argparse -import codecs + +import collections +import csv import glob -import json import os -import subprocess -from pathlib import Path +import random +from typing import Dict, List, Tuple -import soundfile +from paddle.io import Dataset +from tqdm import tqdm +from pathos.multiprocessing import Pool -from utils.utility import check_md5sum +from paddleaudio.backends import load as load_audio +from paddleaudio.utils import DATA_HOME, decompress, download_and_decompress +from paddleaudio.datasets.dataset import feat_funcs +from utils.utility import unpack from utils.utility import download -from utils.utility import unzip - -# all the data will be download in the current data/voxceleb directory default -DATA_HOME = os.path.expanduser('.') - -# if you use the http://www.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/ as the download base url -# you need to get the username & password via the google form - -# if you use the https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a as the download base url, -# you need use --no-check-certificate to connect the target download url - -BASE_URL = "https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a" - -# dev data -DEV_LIST = { - "vox1_dev_wav_partaa": "e395d020928bc15670b570a21695ed96", - "vox1_dev_wav_partab": "bbfaaccefab65d82b21903e81a8a8020", - "vox1_dev_wav_partac": "017d579a2a96a077f40042ec33e51512", - "vox1_dev_wav_partad": "7bb1e9f70fddc7a678fa998ea8b3ba19", -} -DEV_TARGET_DATA = "vox1_dev_wav_parta* vox1_dev_wav.zip ae63e55b951748cc486645f532ba230b" - -# test data -TEST_LIST = {"vox1_test_wav.zip": "185fdc63c3c739954633d50379a3d102"} -TEST_TARGET_DATA = "vox1_test_wav.zip vox1_test_wav.zip 185fdc63c3c739954633d50379a3d102" - -# kaldi trial -# this trial file is organized by kaldi according the official file, -# which is a little different with the official trial veri_test2.txt -KALDI_BASE_URL = "http://www.openslr.org/resources/49/" -TRIAL_LIST = {"voxceleb1_test_v2.txt": "29fc7cc1c5d59f0816dc15d6e8be60f7"} -TRIAL_TARGET_DATA = "voxceleb1_test_v2.txt voxceleb1_test_v2.txt 29fc7cc1c5d59f0816dc15d6e8be60f7" - -parser = argparse.ArgumentParser(description=__doc__) -parser.add_argument( - "--target_dir", - default=DATA_HOME + "/voxceleb1/", - type=str, - help="Directory to save the voxceleb1 dataset. (default: %(default)s)") -parser.add_argument( - "--manifest_prefix", - default="manifest", - type=str, - help="Filepath prefix for output manifests. (default: %(default)s)") - -args = parser.parse_args() - - -def create_manifest(data_dir, manifest_path_prefix): - print("Creating manifest %s ..." 
% manifest_path_prefix) - json_lines = [] - data_path = os.path.join(data_dir, "wav", "**", "*.wav") - total_sec = 0.0 - total_text = 0.0 - total_num = 0 - speakers = set() - for audio_path in glob.glob(data_path, recursive=True): - audio_id = "-".join(audio_path.split("/")[-3:]) - utt2spk = audio_path.split("/")[-3] - duration = soundfile.info(audio_path).duration - text = "" - json_lines.append( - json.dumps( - { - "utt": audio_id, - "utt2spk": str(utt2spk), - "feat": audio_path, - "feat_shape": (duration, ), - "text": text # compatible with asr data format - }, - ensure_ascii=False)) - - total_sec += duration - total_text += len(text) - total_num += 1 - speakers.add(utt2spk) - - # data_dir_name refer to dev or test - # voxceleb1 is given explicit in the path - data_dir_name = Path(data_dir).name - manifest_path_prefix = manifest_path_prefix + "." + data_dir_name - with codecs.open(manifest_path_prefix, 'w', encoding='utf-8') as f: - for line in json_lines: - f.write(line + "\n") - - manifest_dir = os.path.dirname(manifest_path_prefix) - meta_path = os.path.join(manifest_dir, "voxceleb1." + - data_dir_name) + ".meta" - with codecs.open(meta_path, 'w', encoding='utf-8') as f: - print(f"{total_num} utts", file=f) - print(f"{len(speakers)} speakers", file=f) - print(f"{total_sec / (60 * 60)} h", file=f) - print(f"{total_text} text", file=f) - print(f"{total_text / total_sec} text/sec", file=f) - print(f"{total_sec / total_num} sec/utt", file=f) - - -def prepare_dataset(base_url, data_list, target_dir, manifest_path, - target_data): - if not os.path.exists(target_dir): - os.mkdir(target_dir) - - # wav directory already exists, it need do nothing - if not os.path.exists(os.path.join(target_dir, "wav")): - # download all dataset part - for zip_part in data_list.keys(): - download_url = " --no-check-certificate " + base_url + "/" + zip_part - download( - url=download_url, - md5sum=data_list[zip_part], - target_dir=target_dir) - - # pack the all part to target zip file - all_target_part, target_name, target_md5sum = target_data.split() - target_name = os.path.join(target_dir, target_name) - if not os.path.exists(target_name): - pack_part_cmd = "cat {}/{} > {}".format(target_dir, all_target_part, - target_name) - subprocess.call(pack_part_cmd, shell=True) - - # check the target zip file md5sum - if not check_md5sum(target_name, target_md5sum): - raise RuntimeError("{} MD5 checkssum failed".format(target_name)) - else: - print("Check {} md5sum successfully".format(target_name)) - - # unzip the all zip file - if target_name.endswith(".zip"): - unzip(target_name, target_dir) - - # create the manifest file - create_manifest(data_dir=target_dir, manifest_path_prefix=manifest_path) - - -def main(): - if args.target_dir.startswith('~'): - args.target_dir = os.path.expanduser(args.target_dir) - prepare_dataset( - base_url=BASE_URL, - data_list=DEV_LIST, - target_dir=os.path.join(args.target_dir, "dev"), - manifest_path=args.manifest_prefix, - target_data=DEV_TARGET_DATA) - - prepare_dataset( - base_url=BASE_URL, - data_list=TEST_LIST, - target_dir=os.path.join(args.target_dir, "test"), - manifest_path=args.manifest_prefix, - target_data=TEST_TARGET_DATA) - - print("Manifest prepare done!") - - -if __name__ == '__main__': - main() +__all__ = ['VoxCeleb1'] + + +class VoxCeleb1(Dataset): + source_url = 'https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/' + archieves_audio_dev = [ + { + 'url': source_url + 'vox1_dev_wav_partaa', + 'md5': 'e395d020928bc15670b570a21695ed96', + }, + { + 'url': source_url + 
'vox1_dev_wav_partab', + 'md5': 'bbfaaccefab65d82b21903e81a8a8020', + }, + { + 'url': source_url + 'vox1_dev_wav_partac', + 'md5': '017d579a2a96a077f40042ec33e51512', + }, + { + 'url': source_url + 'vox1_dev_wav_partad', + 'md5': '7bb1e9f70fddc7a678fa998ea8b3ba19', + }, + ] + archieves_audio_test = [ + { + 'url': source_url + 'vox1_test_wav.zip', + 'md5': '185fdc63c3c739954633d50379a3d102', + }, + ] + archieves_meta = [ + { + 'url': 'https://www.robots.ox.ac.uk/~vgg/data/voxceleb/meta/veri_test2.txt', + 'md5': 'b73110731c9223c1461fe49cb48dddfc', + }, + ] + + + num_speakers = 1211 # 1211 vox1, 5994 vox2, 7205 vox1+2, test speakers: 41 + sample_rate = 16000 + meta_info = collections.namedtuple( + 'META_INFO', ('id', 'duration', 'wav', 'start', 'stop', 'spk_id')) + base_path = os.path.join(DATA_HOME, 'vox1') + wav_path = os.path.join(base_path, 'wav') + subsets = ['train', 'dev', 'enrol', 'test'] + + def __init__(self, + subset: str = 'train', + feat_type: str = 'raw', + random_chunk: bool = True, + chunk_duration: float = 3.0, # seconds + split_ratio: float = 0.9, # train split ratio + seed: int = 0, + target_dir: str = None, + **kwargs): + + assert subset in self.subsets, \ + 'Dataset subset must be one in {}, but got {}'.format(self.subsets, subset) + + self.subset = subset + self.spk_id2label = {} + self.feat_type = feat_type + self.feat_config = kwargs + self.random_chunk = random_chunk + self.chunk_duration = chunk_duration + self.split_ratio = split_ratio + self.target_dir = target_dir if target_dir else self.base_path + self.csv_path = os.path.join(target_dir, 'csv') if target_dir else os.path.join(self.base_path, 'csv') + self.meta_path = os.path.join(target_dir, 'meta') if target_dir else os.path.join(base_path, 'meta') + self.veri_test_file = os.path.join(self.meta_path, 'veri_test2.txt') + # self._data = self._get_data()[:1000] # KP: Small dataset test. + self._data = self._get_data() + super(VoxCeleb1, self).__init__() + + # Set up a seed to reproduce training or predicting result. + # random.seed(seed) + + def _get_data(self): + # Download audio files. + # We need the users to decompress all vox1/dev/wav and vox1/test/wav/ to vox1/wav/ dir + # so, we check the vox1/wav dir status + print("wav base path: {}".format(self.wav_path)) + if not os.path.isdir(self.wav_path): + print("start to download the voxceleb1 dataset") + download_and_decompress( # multi-zip parts concatenate to vox1_dev_wav.zip + self.archieves_audio_dev, self.base_path, decompress=False) + download_and_decompress( # download the vox1_test_wav.zip and unzip + self.archieves_audio_test, self.base_path, decompress=True) + + # Download all parts and concatenate the files into one zip file. + dev_zipfile = os.path.join(self.base_path, 'vox1_dev_wav.zip') + print(f'Concatenating all parts to: {dev_zipfile}') + os.system( + f'cat {os.path.join(self.base_path, "vox1_dev_wav_parta*")} > {dev_zipfile}' + ) + + # Extract all audio files of dev and test set. + decompress(dev_zipfile, self.base_path) + + # Download meta files. + if not os.path.isdir(self.meta_path): + download_and_decompress( + self.archieves_meta, self.meta_path, decompress=False) + + # Data preparation. 
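+        # Each generated csv row is "id,duration,wav,start,stop,spk_id",
+        # for example (illustrative values):
+        #   id10001-1zcIwhmdeo4-00001_0.0_3.0,8.12,/path/to/xxx.wav,0,48000,id10001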
+ if not os.path.isdir(self.csv_path): + os.makedirs(self.csv_path) + self.prepare_data() + + data = [] + with open(os.path.join(self.csv_path, f'{self.subset}.csv'), 'r') as rf: + for line in rf.readlines()[1:]: + audio_id, duration, wav, start, stop, spk_id = line.strip( + ).split(',') + data.append( + self.meta_info(audio_id, float(duration), wav, int(start), + int(stop), spk_id)) + + with open(os.path.join(self.meta_path, 'spk_id2label.txt'), 'r') as f: + for line in f.readlines(): + spk_id, label = line.strip().split(' ') + self.spk_id2label[spk_id] = int(label) + + return data + + def _convert_to_record(self, idx: int): + sample = self._data[idx] + + record = {} + # To show all fields in a namedtuple: `type(sample)._fields` + for field in type(sample)._fields: + record[field] = getattr(sample, field) + + waveform, sr = load_audio(record['wav']) + + # random select a chunk audio samples from the audio + if self.random_chunk: + num_wav_samples = waveform.shape[0] + num_chunk_samples = int(self.chunk_duration * sr) + start = random.randint(0, num_wav_samples - num_chunk_samples - 1) + stop = start + num_chunk_samples + else: + start = record['start'] + stop = record['stop'] + + waveform = waveform[start:stop] + + assert self.feat_type in feat_funcs.keys(), \ + f"Unknown feat_type: {self.feat_type}, it must be one in {list(feat_funcs.keys())}" + feat_func = feat_funcs[self.feat_type] + feat = feat_func( + waveform, sr=sr, **self.feat_config) if feat_func else waveform + + record.update({'feat': feat}) + if self.subset in ['train', + 'dev']: # Labels are available in train and dev. + record.update({'label': self.spk_id2label[record['spk_id']]}) + + return record + + @staticmethod + def _get_chunks(seg_dur, audio_id, audio_duration): + num_chunks = int(audio_duration / seg_dur) # all in milliseconds + + chunk_lst = [ + audio_id + "_" + str(i * seg_dur) + "_" + str(i * seg_dur + seg_dur) + for i in range(num_chunks) + ] + return chunk_lst + + def _get_audio_info(self, wav_file: str, + split_chunks: bool) -> List[List[str]]: + waveform, sr = load_audio(wav_file) + spk_id, sess_id, utt_id = wav_file.split("/")[-3:] + audio_id = '-'.join([spk_id, sess_id, utt_id.split(".")[0]]) + audio_duration = waveform.shape[0] / sr + + ret = [] + if split_chunks: # Split into pieces of self.chunk_duration seconds. + uniq_chunks_list = self._get_chunks(self.chunk_duration, audio_id, + audio_duration) + + for chunk in uniq_chunks_list: + s, e = chunk.split("_")[-2:] # Timestamps of start and end + start_sample = int(float(s) * sr) + end_sample = int(float(e) * sr) + # id, duration, wav, start, stop, spk_id + ret.append([ + chunk, audio_duration, wav_file, start_sample, end_sample, + spk_id + ]) + else: # Keep whole audio. 
+ ret.append([ + audio_id, audio_duration, wav_file, 0, waveform.shape[0], spk_id + ]) + return ret + + def generate_csv(self, + wav_files: List[str], + output_file: str, + split_chunks: bool = True): + print(f'Generating csv: {output_file}') + header = ["id", "duration", "wav", "start", "stop", "spk_id"] + + with Pool(64) as p: + infos = list( + tqdm( + p.imap(lambda x: self._get_audio_info(x, split_chunks), wav_files), total=len(wav_files))) + + csv_lines = [] + for info in infos: + csv_lines.extend(info) + + with open(output_file, mode="w") as csv_f: + csv_writer = csv.writer( + csv_f, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL) + csv_writer.writerow(header) + for line in csv_lines: + csv_writer.writerow(line) + + def prepare_data(self): + # Audio of speakers in veri_test_file should not be included in training set. + print("start to prepare the data csv file") + enrol_files = set() + test_files = set() + # get the enroll and test audio file path + with open(self.veri_test_file, 'r') as f: + for line in f.readlines(): + _, enrol_file, test_file = line.strip().split(' ') + enrol_files.add(os.path.join(self.wav_path, enrol_file)) + test_files.add(os.path.join(self.wav_path, test_file)) + enrol_files = sorted(enrol_files) + test_files = sorted(test_files) + + # get the enroll and test speakers + test_spks = set() + for file in (enrol_files + test_files): + spk = file.split('/wav/')[1].split('/')[0] + test_spks.add(spk) + + # get all the train and dev audios file path + audio_files = [] + speakers = set() + for path in [self.wav_path]: + for file in glob.glob(os.path.join(path, "**", "*.wav"), recursive=True): + spk = file.split('/wav/')[1].split('/')[0] + if spk in test_spks: + continue + speakers.add(spk) + audio_files.append(file) + + print("start to generate the {}".format(os.path.join(self.meta_path, 'spk_id2label.txt'))) + # encode the train and dev speakers label to spk_id2label.txt + with open(os.path.join(self.meta_path, 'spk_id2label.txt'), 'w') as f: + for label, spk_id in enumerate(sorted(speakers)): # 1211 vox1, 5994 vox2, 7205 vox1+2 + f.write(f'{spk_id} {label}\n') + + audio_files = sorted(audio_files) + random.shuffle(audio_files) + split_idx = int(self.split_ratio * len(audio_files)) + # split_ratio to train + train_files, dev_files = audio_files[:split_idx], audio_files[split_idx:] + + self.generate_csv(train_files, + os.path.join(self.csv_path, 'train.csv')) + self.generate_csv(dev_files, + os.path.join(self.csv_path, 'dev.csv')) + self.generate_csv(enrol_files, + os.path.join(self.csv_path, 'enrol.csv'), + split_chunks=False) + self.generate_csv(test_files, + os.path.join(self.csv_path, 'test.csv'), + split_chunks=False) + + def __getitem__(self, idx): + return self._convert_to_record(idx) + + def __len__(self): + return len(self._data) diff --git a/examples/voxceleb/sv0/local/train.py b/examples/voxceleb/sv0/local/train.py new file mode 100644 index 00000000..e8619cca --- /dev/null +++ b/examples/voxceleb/sv0/local/train.py @@ -0,0 +1,31 @@ +import argparse +import paddle +from dataset.voxceleb.voxceleb1 import VoxCeleb1 + + +def main(args): + paddle.set_device(args.device) + + # stage1: we must call the paddle.distributed.init_parallel_env() api at the begining + paddle.distributed.init_parallel_env() + nranks = paddle.distributed.get_world_size() + local_rank = paddle.distributed.get_rank() + + # stage2: data prepare + train_ds = VoxCeleb1('train', target_dir=args.data_dir) + +if __name__ == "__main__": + # yapf: disable + parser = 
argparse.ArgumentParser(__doc__) + parser.add_argument('--device', + choices=['cpu', 'gpu'], + default="cpu", + help="Select which device to train model, defaults to gpu.") + parser.add_argument("--data-dir", + default="./data/", + type=str, + help="data directory") + args = parser.parse_args() + # yapf: enable + + main(args) \ No newline at end of file diff --git a/examples/voxceleb/sv0/path.sh b/examples/voxceleb/sv0/path.sh new file mode 100755 index 00000000..38a242a4 --- /dev/null +++ b/examples/voxceleb/sv0/path.sh @@ -0,0 +1,11 @@ +export MAIN_ROOT=`realpath ${PWD}/../../../` + +export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} +export LC_ALL=C + +export PYTHONDONTWRITEBYTECODE=1 +# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C +export PYTHONIOENCODING=UTF-8 +export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} + +export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib/ diff --git a/examples/voxceleb/sv0/run.sh b/examples/voxceleb/sv0/run.sh new file mode 100755 index 00000000..c24cbff4 --- /dev/null +++ b/examples/voxceleb/sv0/run.sh @@ -0,0 +1,10 @@ +#!/bin/bash +. ./path.sh +set -e +export PPAUDIO_HOME=/home/users/xiongxinlei/exprts/v3 +dir=./data/ +mkdir -p ${dir} +# you can set the variable DATA_HOME to specifiy the downloaded the vox1 and vox2 dataset +/home/users/xiongxinlei/.conda/envs/xxl_base/bin/python3 \ + local/train.py \ + --data-dir ${dir} \ No newline at end of file From 0780d181d29d8470ba3579aa0d0ef9465c5ad264 Mon Sep 17 00:00:00 2001 From: xiongxinlei Date: Wed, 2 Mar 2022 20:55:31 +0800 Subject: [PATCH 07/41] remove personal code test=doc --- dataset/voxceleb/voxceleb1.py | 106 ++++++++++++++++----------- examples/voxceleb/sv0/local/train.py | 24 +++++- examples/voxceleb/sv0/run.sh | 4 +- 3 files changed, 85 insertions(+), 49 deletions(-) diff --git a/dataset/voxceleb/voxceleb1.py b/dataset/voxceleb/voxceleb1.py index 0c9c68dc..b2d5f5c3 100644 --- a/dataset/voxceleb/voxceleb1.py +++ b/dataset/voxceleb/voxceleb1.py @@ -11,23 +11,26 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- import collections import csv import glob import os import random -from typing import Dict, List, Tuple +from typing import Dict +from typing import List +from typing import Tuple from paddle.io import Dataset -from tqdm import tqdm from pathos.multiprocessing import Pool +from tqdm import tqdm from paddleaudio.backends import load as load_audio -from paddleaudio.utils import DATA_HOME, decompress, download_and_decompress from paddleaudio.datasets.dataset import feat_funcs -from utils.utility import unpack +from paddleaudio.utils import DATA_HOME +from paddleaudio.utils import decompress +from paddleaudio.utils import download_and_decompress from utils.utility import download +from utils.utility import unpack __all__ = ['VoxCeleb1'] @@ -60,12 +63,13 @@ class VoxCeleb1(Dataset): ] archieves_meta = [ { - 'url': 'https://www.robots.ox.ac.uk/~vgg/data/voxceleb/meta/veri_test2.txt', - 'md5': 'b73110731c9223c1461fe49cb48dddfc', + 'url': + 'https://www.robots.ox.ac.uk/~vgg/data/voxceleb/meta/veri_test2.txt', + 'md5': + 'b73110731c9223c1461fe49cb48dddfc', }, ] - num_speakers = 1211 # 1211 vox1, 5994 vox2, 7205 vox1+2, test speakers: 41 sample_rate = 16000 meta_info = collections.namedtuple( @@ -74,15 +78,16 @@ class VoxCeleb1(Dataset): wav_path = os.path.join(base_path, 'wav') subsets = ['train', 'dev', 'enrol', 'test'] - def __init__(self, - subset: str = 'train', - feat_type: str = 'raw', - random_chunk: bool = True, - chunk_duration: float = 3.0, # seconds - split_ratio: float = 0.9, # train split ratio - seed: int = 0, - target_dir: str = None, - **kwargs): + def __init__( + self, + subset: str='train', + feat_type: str='raw', + random_chunk: bool=True, + chunk_duration: float=3.0, # seconds + split_ratio: float=0.9, # train split ratio + seed: int=0, + target_dir: str=None, + **kwargs): assert subset in self.subsets, \ 'Dataset subset must be one in {}, but got {}'.format(self.subsets, subset) @@ -95,8 +100,12 @@ class VoxCeleb1(Dataset): self.chunk_duration = chunk_duration self.split_ratio = split_ratio self.target_dir = target_dir if target_dir else self.base_path - self.csv_path = os.path.join(target_dir, 'csv') if target_dir else os.path.join(self.base_path, 'csv') - self.meta_path = os.path.join(target_dir, 'meta') if target_dir else os.path.join(base_path, 'meta') + self.csv_path = os.path.join( + target_dir, 'csv') if target_dir else os.path.join(self.base_path, + 'csv') + self.meta_path = os.path.join( + target_dir, 'meta') if target_dir else os.path.join(base_path, + 'meta') self.veri_test_file = os.path.join(self.meta_path, 'veri_test2.txt') # self._data = self._get_data()[:1000] # KP: Small dataset test. self._data = self._get_data() @@ -112,10 +121,14 @@ class VoxCeleb1(Dataset): print("wav base path: {}".format(self.wav_path)) if not os.path.isdir(self.wav_path): print("start to download the voxceleb1 dataset") - download_and_decompress( # multi-zip parts concatenate to vox1_dev_wav.zip - self.archieves_audio_dev, self.base_path, decompress=False) - download_and_decompress( # download the vox1_test_wav.zip and unzip - self.archieves_audio_test, self.base_path, decompress=True) + download_and_decompress( # multi-zip parts concatenate to vox1_dev_wav.zip + self.archieves_audio_dev, + self.base_path, + decompress=False) + download_and_decompress( # download the vox1_test_wav.zip and unzip + self.archieves_audio_test, + self.base_path, + decompress=True) # Download all parts and concatenate the files into one zip file. 
dev_zipfile = os.path.join(self.base_path, 'vox1_dev_wav.zip') @@ -131,7 +144,7 @@ class VoxCeleb1(Dataset): if not os.path.isdir(self.meta_path): download_and_decompress( self.archieves_meta, self.meta_path, decompress=False) - + # Data preparation. if not os.path.isdir(self.csv_path): os.makedirs(self.csv_path) @@ -143,8 +156,9 @@ class VoxCeleb1(Dataset): audio_id, duration, wav, start, stop, spk_id = line.strip( ).split(',') data.append( - self.meta_info(audio_id, float(duration), wav, int(start), - int(stop), spk_id)) + self.meta_info(audio_id, + float(duration), wav, + int(start), int(stop), spk_id)) with open(os.path.join(self.meta_path, 'spk_id2label.txt'), 'r') as f: for line in f.readlines(): @@ -228,14 +242,16 @@ class VoxCeleb1(Dataset): def generate_csv(self, wav_files: List[str], output_file: str, - split_chunks: bool = True): + split_chunks: bool=True): print(f'Generating csv: {output_file}') header = ["id", "duration", "wav", "start", "stop", "spk_id"] with Pool(64) as p: infos = list( tqdm( - p.imap(lambda x: self._get_audio_info(x, split_chunks), wav_files), total=len(wav_files))) + p.imap(lambda x: self._get_audio_info(x, split_chunks), + wav_files), + total=len(wav_files))) csv_lines = [] for info in infos: @@ -272,35 +288,39 @@ class VoxCeleb1(Dataset): audio_files = [] speakers = set() for path in [self.wav_path]: - for file in glob.glob(os.path.join(path, "**", "*.wav"), recursive=True): + for file in glob.glob( + os.path.join(path, "**", "*.wav"), recursive=True): spk = file.split('/wav/')[1].split('/')[0] if spk in test_spks: continue speakers.add(spk) audio_files.append(file) - print("start to generate the {}".format(os.path.join(self.meta_path, 'spk_id2label.txt'))) + print("start to generate the {}".format( + os.path.join(self.meta_path, 'spk_id2label.txt'))) # encode the train and dev speakers label to spk_id2label.txt with open(os.path.join(self.meta_path, 'spk_id2label.txt'), 'w') as f: - for label, spk_id in enumerate(sorted(speakers)): # 1211 vox1, 5994 vox2, 7205 vox1+2 + for label, spk_id in enumerate( + sorted(speakers)): # 1211 vox1, 5994 vox2, 7205 vox1+2 f.write(f'{spk_id} {label}\n') audio_files = sorted(audio_files) random.shuffle(audio_files) split_idx = int(self.split_ratio * len(audio_files)) # split_ratio to train - train_files, dev_files = audio_files[:split_idx], audio_files[split_idx:] - - self.generate_csv(train_files, - os.path.join(self.csv_path, 'train.csv')) - self.generate_csv(dev_files, - os.path.join(self.csv_path, 'dev.csv')) - self.generate_csv(enrol_files, - os.path.join(self.csv_path, 'enrol.csv'), - split_chunks=False) - self.generate_csv(test_files, - os.path.join(self.csv_path, 'test.csv'), - split_chunks=False) + train_files, dev_files = audio_files[:split_idx], audio_files[ + split_idx:] + + self.generate_csv(train_files, os.path.join(self.csv_path, 'train.csv')) + self.generate_csv(dev_files, os.path.join(self.csv_path, 'dev.csv')) + self.generate_csv( + enrol_files, + os.path.join(self.csv_path, 'enrol.csv'), + split_chunks=False) + self.generate_csv( + test_files, + os.path.join(self.csv_path, 'test.csv'), + split_chunks=False) def __getitem__(self, idx): return self._convert_to_record(idx) diff --git a/examples/voxceleb/sv0/local/train.py b/examples/voxceleb/sv0/local/train.py index e8619cca..c0cb1e17 100644 --- a/examples/voxceleb/sv0/local/train.py +++ b/examples/voxceleb/sv0/local/train.py @@ -1,5 +1,20 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. import argparse + import paddle + from dataset.voxceleb.voxceleb1 import VoxCeleb1 @@ -14,12 +29,13 @@ def main(args): # stage2: data prepare train_ds = VoxCeleb1('train', target_dir=args.data_dir) + if __name__ == "__main__": # yapf: disable parser = argparse.ArgumentParser(__doc__) - parser.add_argument('--device', - choices=['cpu', 'gpu'], - default="cpu", + parser.add_argument('--device', + choices=['cpu', 'gpu'], + default="cpu", help="Select which device to train model, defaults to gpu.") parser.add_argument("--data-dir", default="./data/", @@ -28,4 +44,4 @@ if __name__ == "__main__": args = parser.parse_args() # yapf: enable - main(args) \ No newline at end of file + main(args) diff --git a/examples/voxceleb/sv0/run.sh b/examples/voxceleb/sv0/run.sh index c24cbff4..a8debfc6 100755 --- a/examples/voxceleb/sv0/run.sh +++ b/examples/voxceleb/sv0/run.sh @@ -5,6 +5,6 @@ export PPAUDIO_HOME=/home/users/xiongxinlei/exprts/v3 dir=./data/ mkdir -p ${dir} # you can set the variable DATA_HOME to specifiy the downloaded the vox1 and vox2 dataset -/home/users/xiongxinlei/.conda/envs/xxl_base/bin/python3 \ +python3 \ local/train.py \ - --data-dir ${dir} \ No newline at end of file + --data-dir ${dir} From 3a943ca95b13818409efa6253b05fb2831ab2419 Mon Sep 17 00:00:00 2001 From: xiongxinlei Date: Wed, 2 Mar 2022 20:59:45 +0800 Subject: [PATCH 08/41] repair the variable name bug --- examples/voxceleb/sv0/run.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/voxceleb/sv0/run.sh b/examples/voxceleb/sv0/run.sh index a8debfc6..a96c3827 100755 --- a/examples/voxceleb/sv0/run.sh +++ b/examples/voxceleb/sv0/run.sh @@ -1,10 +1,10 @@ #!/bin/bash . ./path.sh set -e -export PPAUDIO_HOME=/home/users/xiongxinlei/exprts/v3 + dir=./data/ mkdir -p ${dir} -# you can set the variable DATA_HOME to specifiy the downloaded the vox1 and vox2 dataset +# you can set the variable PPAUDIO_HOME to specifiy the downloaded the vox1 and vox2 dataset python3 \ local/train.py \ --data-dir ${dir} From dc28ebe4eecc7b0cd72b3f4991830fcf5f907b52 Mon Sep 17 00:00:00 2001 From: xiongxinlei Date: Thu, 3 Mar 2022 14:55:37 +0800 Subject: [PATCH 09/41] move the csv vox format to paddleaudio, test=doc --- dataset/voxceleb/voxceleb1.py | 487 +++++++++++-------------------- paddleaudio/datasets/voxceleb.py | 329 +++++++++++++++++++++ 2 files changed, 504 insertions(+), 312 deletions(-) create mode 100644 paddleaudio/datasets/voxceleb.py diff --git a/dataset/voxceleb/voxceleb1.py b/dataset/voxceleb/voxceleb1.py index b2d5f5c3..c6fc0695 100644 --- a/dataset/voxceleb/voxceleb1.py +++ b/dataset/voxceleb/voxceleb1.py @@ -11,319 +11,182 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import collections -import csv +"""Prepare VoxCeleb1 dataset + +create manifest files. 
+Manifest file is a json-format file with each line containing the +meta data (i.e. audio filepath, transcript and audio duration) +of each audio file in the data set. + +researchers should download the voxceleb1 dataset yourselves +through google form to get the username & password and unpack the data +""" +import argparse +import codecs import glob +import json import os -import random -from typing import Dict -from typing import List -from typing import Tuple - -from paddle.io import Dataset -from pathos.multiprocessing import Pool -from tqdm import tqdm - -from paddleaudio.backends import load as load_audio -from paddleaudio.datasets.dataset import feat_funcs -from paddleaudio.utils import DATA_HOME -from paddleaudio.utils import decompress -from paddleaudio.utils import download_and_decompress +import subprocess +from pathlib import Path + +import soundfile + +from utils.utility import check_md5sum from utils.utility import download -from utils.utility import unpack - -__all__ = ['VoxCeleb1'] - - -class VoxCeleb1(Dataset): - source_url = 'https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/' - archieves_audio_dev = [ - { - 'url': source_url + 'vox1_dev_wav_partaa', - 'md5': 'e395d020928bc15670b570a21695ed96', - }, - { - 'url': source_url + 'vox1_dev_wav_partab', - 'md5': 'bbfaaccefab65d82b21903e81a8a8020', - }, - { - 'url': source_url + 'vox1_dev_wav_partac', - 'md5': '017d579a2a96a077f40042ec33e51512', - }, - { - 'url': source_url + 'vox1_dev_wav_partad', - 'md5': '7bb1e9f70fddc7a678fa998ea8b3ba19', - }, - ] - archieves_audio_test = [ - { - 'url': source_url + 'vox1_test_wav.zip', - 'md5': '185fdc63c3c739954633d50379a3d102', - }, - ] - archieves_meta = [ - { - 'url': - 'https://www.robots.ox.ac.uk/~vgg/data/voxceleb/meta/veri_test2.txt', - 'md5': - 'b73110731c9223c1461fe49cb48dddfc', - }, - ] - - num_speakers = 1211 # 1211 vox1, 5994 vox2, 7205 vox1+2, test speakers: 41 - sample_rate = 16000 - meta_info = collections.namedtuple( - 'META_INFO', ('id', 'duration', 'wav', 'start', 'stop', 'spk_id')) - base_path = os.path.join(DATA_HOME, 'vox1') - wav_path = os.path.join(base_path, 'wav') - subsets = ['train', 'dev', 'enrol', 'test'] - - def __init__( - self, - subset: str='train', - feat_type: str='raw', - random_chunk: bool=True, - chunk_duration: float=3.0, # seconds - split_ratio: float=0.9, # train split ratio - seed: int=0, - target_dir: str=None, - **kwargs): - - assert subset in self.subsets, \ - 'Dataset subset must be one in {}, but got {}'.format(self.subsets, subset) - - self.subset = subset - self.spk_id2label = {} - self.feat_type = feat_type - self.feat_config = kwargs - self.random_chunk = random_chunk - self.chunk_duration = chunk_duration - self.split_ratio = split_ratio - self.target_dir = target_dir if target_dir else self.base_path - self.csv_path = os.path.join( - target_dir, 'csv') if target_dir else os.path.join(self.base_path, - 'csv') - self.meta_path = os.path.join( - target_dir, 'meta') if target_dir else os.path.join(base_path, - 'meta') - self.veri_test_file = os.path.join(self.meta_path, 'veri_test2.txt') - # self._data = self._get_data()[:1000] # KP: Small dataset test. - self._data = self._get_data() - super(VoxCeleb1, self).__init__() - - # Set up a seed to reproduce training or predicting result. - # random.seed(seed) - - def _get_data(self): - # Download audio files. 
- # We need the users to decompress all vox1/dev/wav and vox1/test/wav/ to vox1/wav/ dir - # so, we check the vox1/wav dir status - print("wav base path: {}".format(self.wav_path)) - if not os.path.isdir(self.wav_path): - print("start to download the voxceleb1 dataset") - download_and_decompress( # multi-zip parts concatenate to vox1_dev_wav.zip - self.archieves_audio_dev, - self.base_path, - decompress=False) - download_and_decompress( # download the vox1_test_wav.zip and unzip - self.archieves_audio_test, - self.base_path, - decompress=True) - - # Download all parts and concatenate the files into one zip file. - dev_zipfile = os.path.join(self.base_path, 'vox1_dev_wav.zip') - print(f'Concatenating all parts to: {dev_zipfile}') - os.system( - f'cat {os.path.join(self.base_path, "vox1_dev_wav_parta*")} > {dev_zipfile}' - ) - - # Extract all audio files of dev and test set. - decompress(dev_zipfile, self.base_path) - - # Download meta files. - if not os.path.isdir(self.meta_path): - download_and_decompress( - self.archieves_meta, self.meta_path, decompress=False) - - # Data preparation. - if not os.path.isdir(self.csv_path): - os.makedirs(self.csv_path) - self.prepare_data() - - data = [] - with open(os.path.join(self.csv_path, f'{self.subset}.csv'), 'r') as rf: - for line in rf.readlines()[1:]: - audio_id, duration, wav, start, stop, spk_id = line.strip( - ).split(',') - data.append( - self.meta_info(audio_id, - float(duration), wav, - int(start), int(stop), spk_id)) - - with open(os.path.join(self.meta_path, 'spk_id2label.txt'), 'r') as f: - for line in f.readlines(): - spk_id, label = line.strip().split(' ') - self.spk_id2label[spk_id] = int(label) - - return data - - def _convert_to_record(self, idx: int): - sample = self._data[idx] - - record = {} - # To show all fields in a namedtuple: `type(sample)._fields` - for field in type(sample)._fields: - record[field] = getattr(sample, field) - - waveform, sr = load_audio(record['wav']) - - # random select a chunk audio samples from the audio - if self.random_chunk: - num_wav_samples = waveform.shape[0] - num_chunk_samples = int(self.chunk_duration * sr) - start = random.randint(0, num_wav_samples - num_chunk_samples - 1) - stop = start + num_chunk_samples +from utils.utility import unzip + +# all the data will be download in the current data/voxceleb directory default +DATA_HOME = os.path.expanduser('.') + +# if you use the http://www.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/ as the download base url +# you need to get the username & password via the google form + +# if you use the https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a as the download base url, +# you need use --no-check-certificate to connect the target download url + +BASE_URL = "https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a" + +# dev data +DEV_LIST = { + "vox1_dev_wav_partaa": "e395d020928bc15670b570a21695ed96", + "vox1_dev_wav_partab": "bbfaaccefab65d82b21903e81a8a8020", + "vox1_dev_wav_partac": "017d579a2a96a077f40042ec33e51512", + "vox1_dev_wav_partad": "7bb1e9f70fddc7a678fa998ea8b3ba19", +} +DEV_TARGET_DATA = "vox1_dev_wav_parta* vox1_dev_wav.zip ae63e55b951748cc486645f532ba230b" + +# test data +TEST_LIST = {"vox1_test_wav.zip": "185fdc63c3c739954633d50379a3d102"} +TEST_TARGET_DATA = "vox1_test_wav.zip vox1_test_wav.zip 185fdc63c3c739954633d50379a3d102" + +# kaldi trial +# this trial file is organized by kaldi according the official file, +# which is a little different with the official trial veri_test2.txt +KALDI_BASE_URL = 
"http://www.openslr.org/resources/49/" +TRIAL_LIST = {"voxceleb1_test_v2.txt": "29fc7cc1c5d59f0816dc15d6e8be60f7"} +TRIAL_TARGET_DATA = "voxceleb1_test_v2.txt voxceleb1_test_v2.txt 29fc7cc1c5d59f0816dc15d6e8be60f7" + +parser = argparse.ArgumentParser(description=__doc__) +parser.add_argument( + "--target_dir", + default=DATA_HOME + "/voxceleb1/", + type=str, + help="Directory to save the voxceleb1 dataset. (default: %(default)s)") +parser.add_argument( + "--manifest_prefix", + default="manifest", + type=str, + help="Filepath prefix for output manifests. (default: %(default)s)") + +args = parser.parse_args() + + +def create_manifest(data_dir, manifest_path_prefix): + print("Creating manifest %s ..." % manifest_path_prefix) + json_lines = [] + data_path = os.path.join(data_dir, "wav", "**", "*.wav") + total_sec = 0.0 + total_text = 0.0 + total_num = 0 + speakers = set() + for audio_path in glob.glob(data_path, recursive=True): + audio_id = "-".join(audio_path.split("/")[-3:]) + utt2spk = audio_path.split("/")[-3] + duration = soundfile.info(audio_path).duration + text = "" + json_lines.append( + json.dumps( + { + "utt": audio_id, + "utt2spk": str(utt2spk), + "feat": audio_path, + "feat_shape": (duration, ), + "text": text # compatible with asr data format + }, + ensure_ascii=False)) + + total_sec += duration + total_text += len(text) + total_num += 1 + speakers.add(utt2spk) + + # data_dir_name refer to dev or test + # voxceleb1 is given explicit in the path + data_dir_name = Path(data_dir).name + manifest_path_prefix = manifest_path_prefix + "." + data_dir_name + with codecs.open(manifest_path_prefix, 'w', encoding='utf-8') as f: + for line in json_lines: + f.write(line + "\n") + + manifest_dir = os.path.dirname(manifest_path_prefix) + meta_path = os.path.join(manifest_dir, "voxceleb1." 
+ + data_dir_name) + ".meta" + with codecs.open(meta_path, 'w', encoding='utf-8') as f: + print(f"{total_num} utts", file=f) + print(f"{len(speakers)} speakers", file=f) + print(f"{total_sec / (60 * 60)} h", file=f) + print(f"{total_text} text", file=f) + print(f"{total_text / total_sec} text/sec", file=f) + print(f"{total_sec / total_num} sec/utt", file=f) + + +def prepare_dataset(base_url, data_list, target_dir, manifest_path, + target_data): + if not os.path.exists(target_dir): + os.mkdir(target_dir) + + # wav directory already exists, it need do nothing + if not os.path.exists(os.path.join(target_dir, "wav")): + # download all dataset part + for zip_part in data_list.keys(): + download_url = " --no-check-certificate " + base_url + "/" + zip_part + download( + url=download_url, + md5sum=data_list[zip_part], + target_dir=target_dir) + + # pack the all part to target zip file + all_target_part, target_name, target_md5sum = target_data.split() + target_name = os.path.join(target_dir, target_name) + if not os.path.exists(target_name): + pack_part_cmd = "cat {}/{} > {}".format(target_dir, all_target_part, + target_name) + subprocess.call(pack_part_cmd, shell=True) + + # check the target zip file md5sum + if not check_md5sum(target_name, target_md5sum): + raise RuntimeError("{} MD5 checkssum failed".format(target_name)) else: - start = record['start'] - stop = record['stop'] - - waveform = waveform[start:stop] - - assert self.feat_type in feat_funcs.keys(), \ - f"Unknown feat_type: {self.feat_type}, it must be one in {list(feat_funcs.keys())}" - feat_func = feat_funcs[self.feat_type] - feat = feat_func( - waveform, sr=sr, **self.feat_config) if feat_func else waveform - - record.update({'feat': feat}) - if self.subset in ['train', - 'dev']: # Labels are available in train and dev. - record.update({'label': self.spk_id2label[record['spk_id']]}) - - return record - - @staticmethod - def _get_chunks(seg_dur, audio_id, audio_duration): - num_chunks = int(audio_duration / seg_dur) # all in milliseconds - - chunk_lst = [ - audio_id + "_" + str(i * seg_dur) + "_" + str(i * seg_dur + seg_dur) - for i in range(num_chunks) - ] - return chunk_lst - - def _get_audio_info(self, wav_file: str, - split_chunks: bool) -> List[List[str]]: - waveform, sr = load_audio(wav_file) - spk_id, sess_id, utt_id = wav_file.split("/")[-3:] - audio_id = '-'.join([spk_id, sess_id, utt_id.split(".")[0]]) - audio_duration = waveform.shape[0] / sr - - ret = [] - if split_chunks: # Split into pieces of self.chunk_duration seconds. - uniq_chunks_list = self._get_chunks(self.chunk_duration, audio_id, - audio_duration) - - for chunk in uniq_chunks_list: - s, e = chunk.split("_")[-2:] # Timestamps of start and end - start_sample = int(float(s) * sr) - end_sample = int(float(e) * sr) - # id, duration, wav, start, stop, spk_id - ret.append([ - chunk, audio_duration, wav_file, start_sample, end_sample, - spk_id - ]) - else: # Keep whole audio. 
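For reference, each line that `create_manifest` above writes is a standalone JSON object; a single entry would look roughly like the sketch below (the speaker id, session id, path and duration are invented for illustration):

```python
# One manifest line from create_manifest, shown as the equivalent Python
# dict; every concrete value here is hypothetical:
{
    "utt": "id10001-1zcIwhmdeo4-00001.wav",  # "-".join of the last three path parts
    "utt2spk": "id10001",                    # the speaker directory name
    "feat": "data/voxceleb1/dev/wav/id10001/1zcIwhmdeo4/00001.wav",
    "feat_shape": [8.12],                    # audio duration in seconds
    "text": ""                               # empty, kept for ASR-format compatibility
}
```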
- ret.append([ - audio_id, audio_duration, wav_file, 0, waveform.shape[0], spk_id - ]) - return ret - - def generate_csv(self, - wav_files: List[str], - output_file: str, - split_chunks: bool=True): - print(f'Generating csv: {output_file}') - header = ["id", "duration", "wav", "start", "stop", "spk_id"] - - with Pool(64) as p: - infos = list( - tqdm( - p.imap(lambda x: self._get_audio_info(x, split_chunks), - wav_files), - total=len(wav_files))) - - csv_lines = [] - for info in infos: - csv_lines.extend(info) - - with open(output_file, mode="w") as csv_f: - csv_writer = csv.writer( - csv_f, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL) - csv_writer.writerow(header) - for line in csv_lines: - csv_writer.writerow(line) - - def prepare_data(self): - # Audio of speakers in veri_test_file should not be included in training set. - print("start to prepare the data csv file") - enrol_files = set() - test_files = set() - # get the enroll and test audio file path - with open(self.veri_test_file, 'r') as f: - for line in f.readlines(): - _, enrol_file, test_file = line.strip().split(' ') - enrol_files.add(os.path.join(self.wav_path, enrol_file)) - test_files.add(os.path.join(self.wav_path, test_file)) - enrol_files = sorted(enrol_files) - test_files = sorted(test_files) - - # get the enroll and test speakers - test_spks = set() - for file in (enrol_files + test_files): - spk = file.split('/wav/')[1].split('/')[0] - test_spks.add(spk) - - # get all the train and dev audios file path - audio_files = [] - speakers = set() - for path in [self.wav_path]: - for file in glob.glob( - os.path.join(path, "**", "*.wav"), recursive=True): - spk = file.split('/wav/')[1].split('/')[0] - if spk in test_spks: - continue - speakers.add(spk) - audio_files.append(file) - - print("start to generate the {}".format( - os.path.join(self.meta_path, 'spk_id2label.txt'))) - # encode the train and dev speakers label to spk_id2label.txt - with open(os.path.join(self.meta_path, 'spk_id2label.txt'), 'w') as f: - for label, spk_id in enumerate( - sorted(speakers)): # 1211 vox1, 5994 vox2, 7205 vox1+2 - f.write(f'{spk_id} {label}\n') - - audio_files = sorted(audio_files) - random.shuffle(audio_files) - split_idx = int(self.split_ratio * len(audio_files)) - # split_ratio to train - train_files, dev_files = audio_files[:split_idx], audio_files[ - split_idx:] - - self.generate_csv(train_files, os.path.join(self.csv_path, 'train.csv')) - self.generate_csv(dev_files, os.path.join(self.csv_path, 'dev.csv')) - self.generate_csv( - enrol_files, - os.path.join(self.csv_path, 'enrol.csv'), - split_chunks=False) - self.generate_csv( - test_files, - os.path.join(self.csv_path, 'test.csv'), - split_chunks=False) - - def __getitem__(self, idx): - return self._convert_to_record(idx) - - def __len__(self): - return len(self._data) + print("Check {} md5sum successfully".format(target_name)) + + # unzip the all zip file + if target_name.endswith(".zip"): + unzip(target_name, target_dir) + + # create the manifest file + create_manifest(data_dir=target_dir, manifest_path_prefix=manifest_path) + + +def main(): + if args.target_dir.startswith('~'): + args.target_dir = os.path.expanduser(args.target_dir) + + prepare_dataset( + base_url=BASE_URL, + data_list=DEV_LIST, + target_dir=os.path.join(args.target_dir, "dev"), + manifest_path=args.manifest_prefix, + target_data=DEV_TARGET_DATA) + + prepare_dataset( + base_url=BASE_URL, + data_list=TEST_LIST, + target_dir=os.path.join(args.target_dir, "test"), + manifest_path=args.manifest_prefix, + 
target_data=TEST_TARGET_DATA) + + print("Manifest prepare done!") + + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/paddleaudio/datasets/voxceleb.py b/paddleaudio/datasets/voxceleb.py new file mode 100644 index 00000000..70cf3e7a --- /dev/null +++ b/paddleaudio/datasets/voxceleb.py @@ -0,0 +1,329 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import collections +import csv +import glob +import os +import random +from typing import Dict +from typing import List +from typing import Tuple + +from paddle.io import Dataset +from pathos.multiprocessing import Pool +from tqdm import tqdm + +from paddleaudio.backends import load as load_audio +from paddleaudio.datasets.dataset import feat_funcs +from paddleaudio.utils import DATA_HOME +from paddleaudio.utils import decompress +from paddleaudio.utils import download_and_decompress +from utils.utility import download +from utils.utility import unpack + +__all__ = ['VoxCeleb1'] + + +class VoxCeleb1(Dataset): + source_url = 'https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/' + archieves_audio_dev = [ + { + 'url': source_url + 'vox1_dev_wav_partaa', + 'md5': 'e395d020928bc15670b570a21695ed96', + }, + { + 'url': source_url + 'vox1_dev_wav_partab', + 'md5': 'bbfaaccefab65d82b21903e81a8a8020', + }, + { + 'url': source_url + 'vox1_dev_wav_partac', + 'md5': '017d579a2a96a077f40042ec33e51512', + }, + { + 'url': source_url + 'vox1_dev_wav_partad', + 'md5': '7bb1e9f70fddc7a678fa998ea8b3ba19', + }, + ] + archieves_audio_test = [ + { + 'url': source_url + 'vox1_test_wav.zip', + 'md5': '185fdc63c3c739954633d50379a3d102', + }, + ] + archieves_meta = [ + { + 'url': + 'https://www.robots.ox.ac.uk/~vgg/data/voxceleb/meta/veri_test2.txt', + 'md5': + 'b73110731c9223c1461fe49cb48dddfc', + }, + ] + + num_speakers = 1211 # 1211 vox1, 5994 vox2, 7205 vox1+2, test speakers: 41 + sample_rate = 16000 + meta_info = collections.namedtuple( + 'META_INFO', ('id', 'duration', 'wav', 'start', 'stop', 'spk_id')) + base_path = os.path.join(DATA_HOME, 'vox1') + wav_path = os.path.join(base_path, 'wav') + subsets = ['train', 'dev', 'enrol', 'test'] + + def __init__( + self, + subset: str='train', + feat_type: str='raw', + random_chunk: bool=True, + chunk_duration: float=3.0, # seconds + split_ratio: float=0.9, # train split ratio + seed: int=0, + target_dir: str=None, + **kwargs): + + assert subset in self.subsets, \ + 'Dataset subset must be one in {}, but got {}'.format(self.subsets, subset) + + self.subset = subset + self.spk_id2label = {} + self.feat_type = feat_type + self.feat_config = kwargs + self.random_chunk = random_chunk + self.chunk_duration = chunk_duration + self.split_ratio = split_ratio + self.target_dir = target_dir if target_dir else self.base_path + self.csv_path = os.path.join( + target_dir, 'csv') if target_dir else os.path.join(self.base_path, + 'csv') + self.meta_path = os.path.join( + target_dir, 'meta') if target_dir else os.path.join(self.base_path, + 
'meta') + self.veri_test_file = os.path.join(self.meta_path, 'veri_test2.txt') + # self._data = self._get_data()[:1000] # KP: Small dataset test. + self._data = self._get_data() + super(VoxCeleb1, self).__init__() + + # Set up a seed to reproduce training or predicting result. + # random.seed(seed) + + def _get_data(self): + # Download audio files. + # We need the users to decompress all vox1/dev/wav and vox1/test/wav/ to vox1/wav/ dir + # so, we check the vox1/wav dir status + print("wav base path: {}".format(self.wav_path)) + if not os.path.isdir(self.wav_path): + print("start to download the voxceleb1 dataset") + download_and_decompress( # multi-zip parts concatenate to vox1_dev_wav.zip + self.archieves_audio_dev, + self.base_path, + decompress=False) + download_and_decompress( # download the vox1_test_wav.zip and unzip + self.archieves_audio_test, + self.base_path, + decompress=True) + + # Download all parts and concatenate the files into one zip file. + dev_zipfile = os.path.join(self.base_path, 'vox1_dev_wav.zip') + print(f'Concatenating all parts to: {dev_zipfile}') + os.system( + f'cat {os.path.join(self.base_path, "vox1_dev_wav_parta*")} > {dev_zipfile}' + ) + + # Extract all audio files of dev and test set. + decompress(dev_zipfile, self.base_path) + + # Download meta files. + if not os.path.isdir(self.meta_path): + download_and_decompress( + self.archieves_meta, self.meta_path, decompress=False) + + # Data preparation. + if not os.path.isdir(self.csv_path): + os.makedirs(self.csv_path) + self.prepare_data() + + data = [] + with open(os.path.join(self.csv_path, f'{self.subset}.csv'), 'r') as rf: + for line in rf.readlines()[1:]: + audio_id, duration, wav, start, stop, spk_id = line.strip( + ).split(',') + data.append( + self.meta_info(audio_id, + float(duration), wav, + int(start), int(stop), spk_id)) + + with open(os.path.join(self.meta_path, 'spk_id2label.txt'), 'r') as f: + for line in f.readlines(): + spk_id, label = line.strip().split(' ') + self.spk_id2label[spk_id] = int(label) + + return data + + def _convert_to_record(self, idx: int): + sample = self._data[idx] + + record = {} + # To show all fields in a namedtuple: `type(sample)._fields` + for field in type(sample)._fields: + record[field] = getattr(sample, field) + + waveform, sr = load_audio(record['wav']) + + # random select a chunk audio samples from the audio + if self.random_chunk: + num_wav_samples = waveform.shape[0] + num_chunk_samples = int(self.chunk_duration * sr) + start = random.randint(0, num_wav_samples - num_chunk_samples - 1) + stop = start + num_chunk_samples + else: + start = record['start'] + stop = record['stop'] + + waveform = waveform[start:stop] + + assert self.feat_type in feat_funcs.keys(), \ + f"Unknown feat_type: {self.feat_type}, it must be one in {list(feat_funcs.keys())}" + feat_func = feat_funcs[self.feat_type] + feat = feat_func( + waveform, sr=sr, **self.feat_config) if feat_func else waveform + + record.update({'feat': feat}) + if self.subset in ['train', + 'dev']: # Labels are available in train and dev. 
+ record.update({'label': self.spk_id2label[record['spk_id']]}) + + return record + + @staticmethod + def _get_chunks(seg_dur, audio_id, audio_duration): + num_chunks = int(audio_duration / seg_dur) # all in milliseconds + + chunk_lst = [ + audio_id + "_" + str(i * seg_dur) + "_" + str(i * seg_dur + seg_dur) + for i in range(num_chunks) + ] + return chunk_lst + + def _get_audio_info(self, wav_file: str, + split_chunks: bool) -> List[List[str]]: + waveform, sr = load_audio(wav_file) + spk_id, sess_id, utt_id = wav_file.split("/")[-3:] + audio_id = '-'.join([spk_id, sess_id, utt_id.split(".")[0]]) + audio_duration = waveform.shape[0] / sr + + ret = [] + if split_chunks: # Split into pieces of self.chunk_duration seconds. + uniq_chunks_list = self._get_chunks(self.chunk_duration, audio_id, + audio_duration) + + for chunk in uniq_chunks_list: + s, e = chunk.split("_")[-2:] # Timestamps of start and end + start_sample = int(float(s) * sr) + end_sample = int(float(e) * sr) + # id, duration, wav, start, stop, spk_id + ret.append([ + chunk, audio_duration, wav_file, start_sample, end_sample, + spk_id + ]) + else: # Keep whole audio. + ret.append([ + audio_id, audio_duration, wav_file, 0, waveform.shape[0], spk_id + ]) + return ret + + def generate_csv(self, + wav_files: List[str], + output_file: str, + split_chunks: bool=True): + print(f'Generating csv: {output_file}') + header = ["id", "duration", "wav", "start", "stop", "spk_id"] + + with Pool(64) as p: + infos = list( + tqdm( + p.imap(lambda x: self._get_audio_info(x, split_chunks), + wav_files), + total=len(wav_files))) + + csv_lines = [] + for info in infos: + csv_lines.extend(info) + + with open(output_file, mode="w") as csv_f: + csv_writer = csv.writer( + csv_f, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL) + csv_writer.writerow(header) + for line in csv_lines: + csv_writer.writerow(line) + + def prepare_data(self): + # Audio of speakers in veri_test_file should not be included in training set. 
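To make the chunking scheme above concrete, here is a minimal sketch of what `_get_chunks` returns for a hypothetical 10-second utterance with the default 3.0-second `chunk_duration`; note the values are seconds, not milliseconds as the in-code comment suggests, since they are later multiplied by the sample rate:

```python
seg_dur, audio_id, audio_duration = 3.0, "id10001-1zcIwhmdeo4-00001", 10.0
num_chunks = int(audio_duration / seg_dur)  # 3 full chunks; the 1 s remainder is dropped
chunk_lst = [
    audio_id + "_" + str(i * seg_dur) + "_" + str(i * seg_dur + seg_dur)
    for i in range(num_chunks)
]
# ['id10001-1zcIwhmdeo4-00001_0.0_3.0',
#  'id10001-1zcIwhmdeo4-00001_3.0_6.0',
#  'id10001-1zcIwhmdeo4-00001_6.0_9.0']
```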
+ print("start to prepare the data csv file") + enrol_files = set() + test_files = set() + # get the enroll and test audio file path + with open(self.veri_test_file, 'r') as f: + for line in f.readlines(): + _, enrol_file, test_file = line.strip().split(' ') + enrol_files.add(os.path.join(self.wav_path, enrol_file)) + test_files.add(os.path.join(self.wav_path, test_file)) + enrol_files = sorted(enrol_files) + test_files = sorted(test_files) + + # get the enroll and test speakers + test_spks = set() + for file in (enrol_files + test_files): + spk = file.split('/wav/')[1].split('/')[0] + test_spks.add(spk) + + # get all the train and dev audios file path + audio_files = [] + speakers = set() + for path in [self.wav_path]: + for file in glob.glob( + os.path.join(path, "**", "*.wav"), recursive=True): + spk = file.split('/wav/')[1].split('/')[0] + if spk in test_spks: + continue + speakers.add(spk) + audio_files.append(file) + + print("start to generate the {}".format( + os.path.join(self.meta_path, 'spk_id2label.txt'))) + # encode the train and dev speakers label to spk_id2label.txt + with open(os.path.join(self.meta_path, 'spk_id2label.txt'), 'w') as f: + for label, spk_id in enumerate( + sorted(speakers)): # 1211 vox1, 5994 vox2, 7205 vox1+2 + f.write(f'{spk_id} {label}\n') + + audio_files = sorted(audio_files) + random.shuffle(audio_files) + split_idx = int(self.split_ratio * len(audio_files)) + # split_ratio to train + train_files, dev_files = audio_files[:split_idx], audio_files[ + split_idx:] + + self.generate_csv(train_files, os.path.join(self.csv_path, 'train.csv')) + self.generate_csv(dev_files, os.path.join(self.csv_path, 'dev.csv')) + self.generate_csv( + enrol_files, + os.path.join(self.csv_path, 'enrol.csv'), + split_chunks=False) + self.generate_csv( + test_files, + os.path.join(self.csv_path, 'test.csv'), + split_chunks=False) + + def __getitem__(self, idx): + return self._convert_to_record(idx) + + def __len__(self): + return len(self._data) From 57c4f4a68cf5b722bfaf6ee0f90c9f1768e7dded Mon Sep 17 00:00:00 2001 From: xiongxinlei Date: Thu, 3 Mar 2022 16:37:26 +0800 Subject: [PATCH 10/41] add sid learning rate and training model --- examples/voxceleb/sv0/local/train.py | 34 ++++++++++++- paddlespeech/vector/layers/lr.py | 45 +++++++++++++++++ paddlespeech/vector/training/sid_model.py | 60 +++++++++++++++++++++++ 3 files changed, 138 insertions(+), 1 deletion(-) create mode 100644 paddlespeech/vector/layers/lr.py create mode 100644 paddlespeech/vector/training/sid_model.py diff --git a/examples/voxceleb/sv0/local/train.py b/examples/voxceleb/sv0/local/train.py index c0cb1e17..8dea5fff 100644 --- a/examples/voxceleb/sv0/local/train.py +++ b/examples/voxceleb/sv0/local/train.py @@ -15,10 +15,14 @@ import argparse import paddle -from dataset.voxceleb.voxceleb1 import VoxCeleb1 +from paddleaudio.datasets.voxceleb import VoxCeleb1 +from paddlespeech.vector.layers.lr import CyclicLRScheduler +from paddlespeech.vector.models.ecapa_tdnn import EcapaTdnn +from paddlespeech.vector.training.sid_model import SpeakerIdetification def main(args): + # stage0: set the training device, cpu or gpu paddle.set_device(args.device) # stage1: we must call the paddle.distributed.init_parallel_env() api at the begining @@ -27,8 +31,32 @@ def main(args): local_rank = paddle.distributed.get_rank() # stage2: data prepare + # note: some cmd must do in rank==0 train_ds = VoxCeleb1('train', target_dir=args.data_dir) + # stage3: build the dnn backbone model network + model_conf = { + "input_size": 80, + 
"channels": [1024, 1024, 1024, 1024, 3072], + "kernel_sizes": [5, 3, 3, 3, 1], + "dilations": [1, 2, 3, 4, 1], + "attention_channels": 128, + "lin_neurons": 192, + } + ecapa_tdnn = EcapaTdnn(**model_conf) + + # stage4: build the speaker verification train instance with backbone model + model = SpeakerIdetification( + backbone=ecapa_tdnn, num_class=VoxCeleb1.num_speakers) + + # stage5: build the optimizer, we now only construct the AdamW optimizer + lr_schedule = CyclicLRScheduler( + base_lr=args.learning_rate, max_lr=1e-3, step_size=140000 // nranks) + optimizer = paddle.optimizer.AdamW( + learning_rate=lr_schedule, parameters=model.parameters()) + + # stage6: build the loss function, we now only support LogSoftmaxWrapper + if __name__ == "__main__": # yapf: disable @@ -41,6 +69,10 @@ if __name__ == "__main__": default="./data/", type=str, help="data directory") + parser.add_argument("--learning_rate", + type=float, + default=1e-8, + help="Learning rate used to train with warmup.") args = parser.parse_args() # yapf: enable diff --git a/paddlespeech/vector/layers/lr.py b/paddlespeech/vector/layers/lr.py new file mode 100644 index 00000000..3dcac057 --- /dev/null +++ b/paddlespeech/vector/layers/lr.py @@ -0,0 +1,45 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from paddle.optimizer.lr import LRScheduler + + +class CyclicLRScheduler(LRScheduler): + def __init__(self, + base_lr: float=1e-8, + max_lr: float=1e-3, + step_size: int=10000): + + super(CyclicLRScheduler, self).__init__() + + self.current_step = -1 + self.base_lr = base_lr + self.max_lr = max_lr + self.step_size = step_size + + def step(self): + if not hasattr(self, 'current_step'): + return + + self.current_step += 1 + if self.current_step >= 2 * self.step_size: + self.current_step %= 2 * self.step_size + + self.last_lr = self.get_lr() + + def get_lr(self): + p = self.current_step / (2 * self.step_size) # Proportion in one cycle. + if p < 0.5: # Increase + return self.base_lr + p / 0.5 * (self.max_lr - self.base_lr) + else: # Decrease + return self.max_lr - (p / 0.5 - 1) * (self.max_lr - self.base_lr) diff --git a/paddlespeech/vector/training/sid_model.py b/paddlespeech/vector/training/sid_model.py new file mode 100644 index 00000000..8a46c3cd --- /dev/null +++ b/paddlespeech/vector/training/sid_model.py @@ -0,0 +1,60 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import paddle +import paddle.nn as nn +import paddle.nn.functional as F + + +class SpeakerIdetification(nn.Layer): + def __init__( + self, + backbone, + num_class, + lin_blocks=0, + lin_neurons=192, + dropout=0.1, ): + + super(SpeakerIdetification, self).__init__() + self.backbone = backbone + if dropout > 0: + self.dropout = nn.Dropout(dropout) + else: + self.dropout = None + input_size = self.backbone.emb_size + self.blocks = nn.LayerList() + for i in range(lin_blocks): + self.blocks.extend([ + nn.BatchNorm1D(input_size), + nn.Linear(in_features=input_size, out_features=lin_neurons), + ]) + input_size = lin_neurons + + self.weight = paddle.create_parameter( + shape=(input_size, num_class), + dtype='float32', + attr=paddle.ParamAttr(initializer=nn.initializer.XavierUniform()), ) + + def forward(self, x, lengths=None): + # x.shape: (N, C, L) + x = self.backbone(x, lengths).squeeze( + -1) # (N, emb_size, 1) -> (N, emb_size) + if self.dropout is not None: + x = self.dropout(x) + + for fc in self.blocks: + x = fc(x) + + logits = F.linear(F.normalize(x), F.normalize(self.weight, axis=0)) + + return logits From 6af2bc3d5badbaa865db9d2e5b371ea2eecb9a0d Mon Sep 17 00:00:00 2001 From: xiongxinlei Date: Thu, 3 Mar 2022 16:49:46 +0800 Subject: [PATCH 11/41] add sid loss wraper for voxceleb, test=doc --- examples/voxceleb/sv0/local/train.py | 39 +++++++++++++++- paddleaudio/utils/download.py | 26 ++++++++--- paddlespeech/vector/layers/loss.py | 70 ++++++++++++++++++++++++++++ 3 files changed, 128 insertions(+), 7 deletions(-) create mode 100644 paddlespeech/vector/layers/loss.py diff --git a/examples/voxceleb/sv0/local/train.py b/examples/voxceleb/sv0/local/train.py index 8dea5fff..1d9a78f9 100644 --- a/examples/voxceleb/sv0/local/train.py +++ b/examples/voxceleb/sv0/local/train.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
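In SpeakerIdetification.forward above, normalizing both the embedding and the weight columns before F.linear turns every logit into a cosine similarity. A NumPy-only sketch of that computation with toy shapes (192-dim embeddings, 1211 VoxCeleb1 training speakers):

```python
import numpy as np

x = np.random.randn(4, 192).astype("float32")       # (N, emb_size) embeddings
w = np.random.randn(192, 1211).astype("float32")    # (emb_size, num_class) weights
x_n = x / np.linalg.norm(x, axis=1, keepdims=True)  # row-normalize embeddings
w_n = w / np.linalg.norm(w, axis=0, keepdims=True)  # column-normalize class weights
logits = x_n @ w_n  # every entry is cos(theta), so it lies in [-1, 1]
assert np.abs(logits).max() <= 1.0 + 1e-5
```

Bounded cosine logits are what allow the margin losses added in the next patch to treat the logits directly as cos(theta).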
+import os import argparse import paddle @@ -19,7 +20,7 @@ from paddleaudio.datasets.voxceleb import VoxCeleb1 from paddlespeech.vector.layers.lr import CyclicLRScheduler from paddlespeech.vector.models.ecapa_tdnn import EcapaTdnn from paddlespeech.vector.training.sid_model import SpeakerIdetification - +from paddlespeech.vector.layers.loss import AdditiveAngularMargin, LogSoftmaxWrapper def main(args): # stage0: set the training device, cpu or gpu @@ -33,6 +34,7 @@ def main(args): # stage2: data prepare # note: some cmd must do in rank==0 train_ds = VoxCeleb1('train', target_dir=args.data_dir) + dev_ds = VoxCeleb1('dev', target_dir=args.data_dir) # stage3: build the dnn backbone model network model_conf = { @@ -56,8 +58,38 @@ def main(args): learning_rate=lr_schedule, parameters=model.parameters()) # stage6: build the loss function, we now only support LogSoftmaxWrapper + criterion = LogSoftmaxWrapper( + loss_fn=AdditiveAngularMargin(margin=0.2, scale=30)) + + + # stage7: confirm training start epoch + # if pre-trained model exists, start epoch confirmed by the pre-trained model + start_epoch = 0 + if args.load_checkpoint: + args.load_checkpoint = os.path.abspath( + os.path.expanduser(args.load_checkpoint)) + try: + # load model checkpoint + state_dict = paddle.load( + os.path.join(args.load_checkpoint, 'model.pdparams')) + model.set_state_dict(state_dict) + # load optimizer checkpoint + state_dict = paddle.load( + os.path.join(args.load_checkpoint, 'model.pdopt')) + optimizer.set_state_dict(state_dict) + if local_rank == 0: + print(f'Checkpoint loaded from {args.load_checkpoint}') + except FileExistsError: + if local_rank == 0: + print('Train from scratch.') + try: + start_epoch = int(args.load_checkpoint[-1]) + print(f'Restore training from epoch {start_epoch}.') + except ValueError: + pass + if __name__ == "__main__": # yapf: disable parser = argparse.ArgumentParser(__doc__) @@ -73,6 +105,11 @@ if __name__ == "__main__": type=float, default=1e-8, help="Learning rate used to train with warmup.") + parser.add_argument("--load_checkpoint", + type=str, + default=None, + help="Directory to load model checkpoint to contiune trainning.") + args = parser.parse_args() # yapf: enable diff --git a/paddleaudio/utils/download.py b/paddleaudio/utils/download.py index 45a8e57b..a0c02ee1 100644 --- a/paddleaudio/utils/download.py +++ b/paddleaudio/utils/download.py @@ -23,15 +23,29 @@ from .log import logger download.logger = logger -def decompress(file: str): +def decompress(file: str, path: str=os.PathLike): """ - Extracts all files from a compressed file. + Extracts all files from a compressed file to specific path. """ assert os.path.isfile(file), "File: {} not exists.".format(file) - download._decompress(file) + if path is None: + print("decompress the data: {}".format(file)) + download._decompress(file) + else: + print("decompress the data: {} to {}".format(file, path)) + if not os.path.isdir(path): + os.makedirs(path) -def download_and_decompress(archives: List[Dict[str, str]], path: str): + tmp_file = os.path.join(path, os.path.basename(file)) + os.rename(file, tmp_file) + download._decompress(tmp_file) + os.rename(tmp_file, file) + + +def download_and_decompress(archives: List[Dict[str, str]], + path: str, + decompress: bool=True): """ Download archieves and decompress to specific path. 
""" @@ -41,8 +55,8 @@ def download_and_decompress(archives: List[Dict[str, str]], path: str): for archive in archives: assert 'url' in archive and 'md5' in archive, \ 'Dictionary keys of "url" and "md5" are required in the archive, but got: {list(archieve.keys())}' - - download.get_path_from_url(archive['url'], path, archive['md5']) + download.get_path_from_url( + archive['url'], path, archive['md5'], decompress=decompress) def load_state_dict_from_url(url: str, path: str, md5: str=None): diff --git a/paddlespeech/vector/layers/loss.py b/paddlespeech/vector/layers/loss.py new file mode 100644 index 00000000..bf632b13 --- /dev/null +++ b/paddlespeech/vector/layers/loss.py @@ -0,0 +1,70 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + + +class AngularMargin(nn.Layer): + def __init__(self, margin=0.0, scale=1.0): + super(AngularMargin, self).__init__() + self.margin = margin + self.scale = scale + + def forward(self, outputs, targets): + outputs = outputs - self.margin * targets + return self.scale * outputs + + +class AdditiveAngularMargin(AngularMargin): + def __init__(self, margin=0.0, scale=1.0, easy_margin=False): + super(AdditiveAngularMargin, self).__init__(margin, scale) + self.easy_margin = easy_margin + + self.cos_m = math.cos(self.margin) + self.sin_m = math.sin(self.margin) + self.th = math.cos(math.pi - self.margin) + self.mm = math.sin(math.pi - self.margin) * self.margin + + def forward(self, outputs, targets): + cosine = outputs.astype('float32') + sine = paddle.sqrt(1.0 - paddle.pow(cosine, 2)) + phi = cosine * self.cos_m - sine * self.sin_m # cos(theta + m) + if self.easy_margin: + phi = paddle.where(cosine > 0, phi, cosine) + else: + phi = paddle.where(cosine > self.th, phi, cosine - self.mm) + outputs = (targets * phi) + ((1.0 - targets) * cosine) + return self.scale * outputs + + +class LogSoftmaxWrapper(nn.Layer): + def __init__(self, loss_fn): + super(LogSoftmaxWrapper, self).__init__() + self.loss_fn = loss_fn + self.criterion = paddle.nn.KLDivLoss(reduction="sum") + + def forward(self, outputs, targets, length=None): + targets = F.one_hot(targets, outputs.shape[1]) + try: + predictions = self.loss_fn(outputs, targets) + except TypeError: + predictions = self.loss_fn(outputs) + + predictions = F.log_softmax(predictions, axis=1) + loss = self.criterion(predictions, targets) / targets.sum() + return loss \ No newline at end of file From 7668f61422df4895706663ee7f563c0de83149d9 Mon Sep 17 00:00:00 2001 From: xiongxinlei Date: Thu, 3 Mar 2022 17:05:20 +0800 Subject: [PATCH 12/41] add sid dataloader for training, test=doc --- examples/voxceleb/sv0/local/train.py | 37 ++++++++++++++++++++++----- paddlespeech/vector/datasets/batch.py | 20 +++++++++++++++ 2 files changed, 50 insertions(+), 7 deletions(-) create mode 100644 paddlespeech/vector/datasets/batch.py diff --git a/examples/voxceleb/sv0/local/train.py 
b/examples/voxceleb/sv0/local/train.py index 1d9a78f9..bddb94bb 100644 --- a/examples/voxceleb/sv0/local/train.py +++ b/examples/voxceleb/sv0/local/train.py @@ -11,16 +11,21 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import os import argparse +import os import paddle +from paddle.io import DataLoader +from paddle.io import DistributedBatchSampler from paddleaudio.datasets.voxceleb import VoxCeleb1 +from paddlespeech.vector.datasets.batch import waveform_collate_fn +from paddlespeech.vector.layers.loss import AdditiveAngularMargin +from paddlespeech.vector.layers.loss import LogSoftmaxWrapper from paddlespeech.vector.layers.lr import CyclicLRScheduler from paddlespeech.vector.models.ecapa_tdnn import EcapaTdnn from paddlespeech.vector.training.sid_model import SpeakerIdetification -from paddlespeech.vector.layers.loss import AdditiveAngularMargin, LogSoftmaxWrapper + def main(args): # stage0: set the training device, cpu or gpu @@ -61,7 +66,6 @@ def main(args): criterion = LogSoftmaxWrapper( loss_fn=AdditiveAngularMargin(margin=0.2, scale=30)) - # stage7: confirm training start epoch # if pre-trained model exists, start epoch confirmed by the pre-trained model start_epoch = 0 @@ -89,7 +93,19 @@ def main(args): print(f'Restore training from epoch {start_epoch}.') except ValueError: pass - + + # stage8: we build the batch sampler for paddle.DataLoader + train_sampler = DistributedBatchSampler( + train_ds, batch_size=args.batch_size, shuffle=True, drop_last=False) + train_loader = DataLoader( + train_ds, + batch_sampler=train_sampler, + num_workers=args.num_workers, + collate_fn=waveform_collate_fn, + return_list=True, + use_buffer_reader=True, ) + + if __name__ == "__main__": # yapf: disable parser = argparse.ArgumentParser(__doc__) @@ -105,10 +121,17 @@ if __name__ == "__main__": type=float, default=1e-8, help="Learning rate used to train with warmup.") - parser.add_argument("--load_checkpoint", - type=str, - default=None, + parser.add_argument("--load_checkpoint", + type=str, + default=None, help="Directory to load model checkpoint to contiune trainning.") + parser.add_argument("--batch_size", + type=int, default=64, + help="Total examples' number in batch for training.") + parser.add_argument("--num_workers", + type=int, + default=0, + help="Number of workers in dataloader.") args = parser.parse_args() # yapf: enable diff --git a/paddlespeech/vector/datasets/batch.py b/paddlespeech/vector/datasets/batch.py new file mode 100644 index 00000000..a9e5d6ee --- /dev/null +++ b/paddlespeech/vector/datasets/batch.py @@ -0,0 +1,20 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
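As a sanity check on the AdditiveAngularMargin loss introduced in the previous patch: it replaces the target-class cosine cos(theta) with cos(theta + m), computed in closed form without an arccos. A small sketch with made-up values (margin and scale match the train.py call):

```python
import math

cosine, m, s = 0.6, 0.2, 30.0  # cos(theta), margin, scale
phi_direct = math.cos(math.acos(cosine) + m)
# the closed form used by AdditiveAngularMargin.forward:
phi = cosine * math.cos(m) - math.sqrt(1.0 - cosine**2) * math.sin(m)
assert abs(phi - phi_direct) < 1e-9
target_logit = s * phi      # penalized logit for the true class
other_logit = s * cosine    # non-target classes keep the plain cosine
```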
+ + +def waveform_collate_fn(batch): + waveforms = np.stack([item['feat'] for item in batch]) + labels = np.stack([item['label'] for item in batch]) + + return {'waveforms': waveforms, 'labels': labels} From 4648059b5f4c271458d85351b2442be3f61b0e60 Mon Sep 17 00:00:00 2001 From: xiongxinlei Date: Thu, 3 Mar 2022 18:20:44 +0800 Subject: [PATCH 13/41] add training process for sid, test=doc --- examples/voxceleb/sv0/local/train.py | 146 +++++++++++++++++++++++++- paddlespeech/vector/datasets/batch.py | 13 +++ 2 files changed, 154 insertions(+), 5 deletions(-) diff --git a/examples/voxceleb/sv0/local/train.py b/examples/voxceleb/sv0/local/train.py index bddb94bb..f68f7373 100644 --- a/examples/voxceleb/sv0/local/train.py +++ b/examples/voxceleb/sv0/local/train.py @@ -14,11 +14,15 @@ import argparse import os +import numpy as np import paddle from paddle.io import DataLoader from paddle.io import DistributedBatchSampler from paddleaudio.datasets.voxceleb import VoxCeleb1 +from paddleaudio.features.core import melspectrogram +from paddleaudio.utils.time import Timer +from paddlespeech.vector.datasets.batch import feature_normalize from paddlespeech.vector.datasets.batch import waveform_collate_fn from paddlespeech.vector.layers.loss import AdditiveAngularMargin from paddlespeech.vector.layers.loss import LogSoftmaxWrapper @@ -26,6 +30,13 @@ from paddlespeech.vector.layers.lr import CyclicLRScheduler from paddlespeech.vector.models.ecapa_tdnn import EcapaTdnn from paddlespeech.vector.training.sid_model import SpeakerIdetification +# feat configuration +cpu_feat_conf = { + 'n_mels': 80, + 'window_size': 400, + 'hop_length': 160, +} + def main(args): # stage0: set the training device, cpu or gpu @@ -42,9 +53,10 @@ def main(args): dev_ds = VoxCeleb1('dev', target_dir=args.data_dir) # stage3: build the dnn backbone model network + #"channels": [1024, 1024, 1024, 1024, 3072], model_conf = { "input_size": 80, - "channels": [1024, 1024, 1024, 1024, 3072], + "channels": [512, 512, 512, 512, 1536], "kernel_sizes": [5, 3, 3, 3, 1], "dilations": [1, 2, 3, 4, 1], "attention_channels": 128, @@ -105,6 +117,122 @@ def main(args): return_list=True, use_buffer_reader=True, ) + # stage9: start to train + # we will comment the training process + steps_per_epoch = len(train_sampler) + timer = Timer(steps_per_epoch * args.epochs) + timer.start() + + for epoch in range(start_epoch + 1, args.epochs + 1): + # at the begining, model must set to train mode + model.train() + + avg_loss = 0 + num_corrects = 0 + num_samples = 0 + for batch_idx, batch in enumerate(train_loader): + waveforms, labels = batch['waveforms'], batch['labels'] + + feats = [] + for waveform in waveforms.numpy(): + feat = melspectrogram(x=waveform, **cpu_feat_conf) + feats.append(feat) + feats = paddle.to_tensor(np.asarray(feats)) + feats = feature_normalize( + feats, mean_norm=True, std_norm=False) # Features normalization + logits = model(feats) + + loss = criterion(logits, labels) + loss.backward() + optimizer.step() + if isinstance(optimizer._learning_rate, + paddle.optimizer.lr.LRScheduler): + optimizer._learning_rate.step() + optimizer.clear_grad() + + # Calculate loss + avg_loss += loss.numpy()[0] + + # Calculate metrics + preds = paddle.argmax(logits, axis=1) + num_corrects += (preds == labels).numpy().sum() + num_samples += feats.shape[0] + + timer.count() + + if (batch_idx + 1) % args.log_freq == 0 and local_rank == 0: + lr = optimizer.get_lr() + avg_loss /= args.log_freq + avg_acc = num_corrects / num_samples + + print_msg = 'Epoch={}/{}, 
Step={}/{}'.format( + epoch, args.epochs, batch_idx + 1, steps_per_epoch) + print_msg += ' loss={:.4f}'.format(avg_loss) + print_msg += ' acc={:.4f}'.format(avg_acc) + print_msg += ' lr={:.4E} step/sec={:.2f} | ETA {}'.format( + lr, timer.timing, timer.eta) + print(print_msg) + + avg_loss = 0 + num_corrects = 0 + num_samples = 0 + + if epoch % args.save_freq == 0 and batch_idx + 1 == steps_per_epoch: + if local_rank != 0: + paddle.distributed.barrier( + ) # Wait for valid step in main process + continue # Resume trainning on other process + + dev_sampler = paddle.io.BatchSampler( + dev_ds, + batch_size=args.batch_size // 4, + shuffle=False, + drop_last=False) + dev_loader = paddle.io.DataLoader( + dev_ds, + batch_sampler=dev_sampler, + collate_fn=waveform_collate_fn, + num_workers=args.num_workers, + return_list=True, ) + + model.eval() + num_corrects = 0 + num_samples = 0 + print('Evaluate on validation dataset') + with paddle.no_grad(): + for batch_idx, batch in enumerate(dev_loader): + waveforms, labels = batch['waveforms'], batch['labels'] + # feats = feature_extractor(waveforms) + feats = [] + for waveform in waveforms.numpy(): + feat = melspectrogram(x=waveform, **cpu_feat_conf) + feats.append(feat) + feats = paddle.to_tensor(np.asarray(feats)) + feats = feature_normalize( + feats, mean_norm=True, std_norm=False) + logits = model(feats) + + preds = paddle.argmax(logits, axis=1) + num_corrects += (preds == labels).numpy().sum() + num_samples += feats.shape[0] + + print_msg = '[Evaluation result]' + print_msg += ' dev_acc={:.4f}'.format(num_corrects / num_samples) + + print(print_msg) + + # Save model + save_dir = os.path.join(args.checkpoint_dir, + 'epoch_{}'.format(epoch)) + print('Saving model checkpoint to {}'.format(save_dir)) + paddle.save(model.state_dict(), + os.path.join(save_dir, 'model.pdparams')) + paddle.save(optimizer.state_dict(), + os.path.join(save_dir, 'model.pdopt')) + + if nranks > 1: + paddle.distributed.barrier() # Main process + if __name__ == "__main__": # yapf: disable @@ -117,21 +245,29 @@ if __name__ == "__main__": default="./data/", type=str, help="data directory") - parser.add_argument("--learning_rate", + parser.add_argument("--learning-rate", type=float, default=1e-8, help="Learning rate used to train with warmup.") - parser.add_argument("--load_checkpoint", + parser.add_argument("--load-checkpoint", type=str, default=None, help="Directory to load model checkpoint to contiune trainning.") - parser.add_argument("--batch_size", + parser.add_argument("--batch-size", type=int, default=64, help="Total examples' number in batch for training.") - parser.add_argument("--num_workers", + parser.add_argument("--num-workers", type=int, default=0, help="Number of workers in dataloader.") + parser.add_argument("--epochs", + type=int, + default=50, + help="Number of epoches for fine-tuning.") + parser.add_argument("--log_freq", + type=int, + default=10, + help="Log the training infomation every n steps.") args = parser.parse_args() # yapf: enable diff --git a/paddlespeech/vector/datasets/batch.py b/paddlespeech/vector/datasets/batch.py index a9e5d6ee..9db615f6 100644 --- a/paddlespeech/vector/datasets/batch.py +++ b/paddlespeech/vector/datasets/batch.py @@ -11,6 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
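The feature_normalize calls in the loop above use mean_norm=True, std_norm=False, i.e. a per-utterance, per-mel-bin mean subtraction along the time axis; a runnable toy check of that behavior:

```python
import numpy as np

feats = np.random.randn(2, 80, 300).astype("float32")  # (N, n_mels, num_frames)
mean = feats.mean(axis=-1, keepdims=True)              # per utterance and mel bin
normed = feats - mean                                  # std_norm=False keeps the scale
assert np.allclose(normed.mean(axis=-1), 0.0, atol=1e-4)
```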
+import numpy as np
+import paddle
 
 
 def waveform_collate_fn(batch):
@@ -18,3 +20,14 @@ def waveform_collate_fn(batch):
     labels = np.stack([item['label'] for item in batch])
 
     return {'waveforms': waveforms, 'labels': labels}
+
+
+def feature_normalize(feats: paddle.Tensor,
+                      mean_norm: bool=True,
+                      std_norm: bool=True):
+    # Features normalization if needed
+    mean = feats.mean(axis=-1, keepdim=True) if mean_norm else 0
+    std = feats.std(axis=-1, keepdim=True) if std_norm else 1
+    feats = (feats - mean) / std
+
+    return feats
From 1f74af110b54127bd7b2b76a3c0664c909dbea98 Mon Sep 17 00:00:00 2001
From: xiongxinlei
Date: Thu, 3 Mar 2022 22:17:14 +0800
Subject: [PATCH 14/41] add training log info and comment, test=doc

---
 examples/voxceleb/sv0/local/train.py | 52 +++++++++++++++------
 paddlespeech/vector/training/time.py | 67 ++++++++++++++++++++++++++++
 2 files changed, 106 insertions(+), 13 deletions(-)
 create mode 100644 paddlespeech/vector/training/time.py

diff --git a/examples/voxceleb/sv0/local/train.py b/examples/voxceleb/sv0/local/train.py
index f68f7373..f86b0a86 100644
--- a/examples/voxceleb/sv0/local/train.py
+++ b/examples/voxceleb/sv0/local/train.py
@@ -16,12 +16,13 @@
 import os
 
 import numpy as np
 import paddle
+from paddle.io import BatchSampler
 from paddle.io import DataLoader
 from paddle.io import DistributedBatchSampler
 
 from paddleaudio.datasets.voxceleb import VoxCeleb1
 from paddleaudio.features.core import melspectrogram
-from paddleaudio.utils.time import Timer
+from paddlespeech.vector.training.time import Timer
 from paddlespeech.vector.datasets.batch import feature_normalize
 from paddlespeech.vector.datasets.batch import waveform_collate_fn
 from paddlespeech.vector.layers.loss import AdditiveAngularMargin
 from paddlespeech.vector.layers.loss import LogSoftmaxWrapper
@@ -37,7 +38,6 @@ cpu_feat_conf = {
     'hop_length': 160,
 }
 
-
 def main(args):
     # stage0: set the training device, cpu or gpu
     paddle.set_device(args.device)
@@ -82,6 +82,7 @@ def main(args):
     # if a pre-trained model exists, the start epoch is confirmed by the pre-trained model
     start_epoch = 0
     if args.load_checkpoint:
+        print("load the checkpoint")
        args.load_checkpoint = os.path.abspath(
            os.path.expanduser(args.load_checkpoint))
        try:
@@ -131,18 +132,30 @@ def main(args):
         num_corrects = 0
         num_samples = 0
         for batch_idx, batch in enumerate(train_loader):
+            # stage 9-1: batch data is the audio sample points and speaker id labels
             waveforms, labels = batch['waveforms'], batch['labels']
 
+            # stage 9-2: audio sample augmentation, which is done on the raw sample points
+            # todo
+
+            # stage 9-3: extract the audio feats, such as fbank, mfcc, spectrogram
             feats = []
             for waveform in waveforms.numpy():
                 feat = melspectrogram(x=waveform, **cpu_feat_conf)
                 feats.append(feat)
             feats = paddle.to_tensor(np.asarray(feats))
+
+            # stage 9-4: feature normalization, which helps convergence and improves the performance
             feats = feature_normalize(
                 feats, mean_norm=True, std_norm=False)  # Features normalization
+
+            # stage 9-5: model forward, such as ecapa-tdnn, x-vector
             logits = model(feats)
 
+            # stage 9-6: loss function criterion, such as AngularMargin, AdditiveAngularMargin
             loss = criterion(logits, labels)
+
+            # stage 9-7: update the gradient and clear the gradient cache
             loss.backward()
             optimizer.step()
             if isinstance(optimizer._learning_rate,
                           paddle.optimizer.lr.LRScheduler):
                 optimizer._learning_rate.step()
             optimizer.clear_grad()
 
-            # Calculate loss
+            # stage 9-8: Calculate average loss per batch
             avg_loss += loss.numpy()[0]
 
-            # Calculate metrics
+            # stage 9-9: Calculate metrics, which is one-best accuracy
             preds = paddle.argmax(logits,
axis=1) num_corrects += (preds == labels).numpy().sum() num_samples += feats.shape[0] + timer.count() # step plus one in timer - timer.count() - + # stage 9-10: print the log information only on 0-rank per log-freq batchs if (batch_idx + 1) % args.log_freq == 0 and local_rank == 0: lr = optimizer.get_lr() avg_loss /= args.log_freq avg_acc = num_corrects / num_samples - print_msg = 'Epoch={}/{}, Step={}/{}'.format( + print_msg = 'Train Epoch={}/{}, Step={}/{}'.format( epoch, args.epochs, batch_idx + 1, steps_per_epoch) print_msg += ' loss={:.4f}'.format(avg_loss) print_msg += ' acc={:.4f}'.format(avg_acc) @@ -177,36 +190,42 @@ def main(args): num_corrects = 0 num_samples = 0 + # stage 9-11: save the model parameters only on 0-rank per save-freq batchs if epoch % args.save_freq == 0 and batch_idx + 1 == steps_per_epoch: if local_rank != 0: paddle.distributed.barrier( ) # Wait for valid step in main process continue # Resume trainning on other process - dev_sampler = paddle.io.BatchSampler( + # stage 9-12: construct the valid dataset dataloader + dev_sampler = BatchSampler( dev_ds, batch_size=args.batch_size // 4, shuffle=False, drop_last=False) - dev_loader = paddle.io.DataLoader( + dev_loader = DataLoader( dev_ds, batch_sampler=dev_sampler, collate_fn=waveform_collate_fn, num_workers=args.num_workers, return_list=True, ) + # set the model to eval mode model.eval() num_corrects = 0 num_samples = 0 + + # stage 9-13: evaluation the valid dataset batch data print('Evaluate on validation dataset') with paddle.no_grad(): for batch_idx, batch in enumerate(dev_loader): waveforms, labels = batch['waveforms'], batch['labels'] - # feats = feature_extractor(waveforms) + feats = [] for waveform in waveforms.numpy(): feat = melspectrogram(x=waveform, **cpu_feat_conf) feats.append(feat) + feats = paddle.to_tensor(np.asarray(feats)) feats = feature_normalize( feats, mean_norm=True, std_norm=False) @@ -218,10 +237,9 @@ def main(args): print_msg = '[Evaluation result]' print_msg += ' dev_acc={:.4f}'.format(num_corrects / num_samples) - print(print_msg) - # Save model + # stage 9-14: Save model parameters save_dir = os.path.join(args.checkpoint_dir, 'epoch_{}'.format(epoch)) print('Saving model checkpoint to {}'.format(save_dir)) @@ -264,10 +282,18 @@ if __name__ == "__main__": type=int, default=50, help="Number of epoches for fine-tuning.") - parser.add_argument("--log_freq", + parser.add_argument("--log-freq", type=int, default=10, help="Log the training infomation every n steps.") + parser.add_argument("--save-freq", + type=int, + default=1, + help="Save checkpoint every n epoch.") + parser.add_argument("--checkpoint-dir", + type=str, + default='./checkpoint', + help="Directory to save model checkpoints.") args = parser.parse_args() # yapf: enable diff --git a/paddlespeech/vector/training/time.py b/paddlespeech/vector/training/time.py new file mode 100644 index 00000000..3a4e183d --- /dev/null +++ b/paddlespeech/vector/training/time.py @@ -0,0 +1,67 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +import math +import time + + +class Timer(object): + '''Calculate runing speed and estimated time of arrival(ETA)''' + + def __init__(self, total_step: int): + self.total_step = total_step + self.last_start_step = 0 + self.current_step = 0 + self._is_running = True + + def start(self): + self.last_time = time.time() + self.start_time = time.time() + + def stop(self): + self._is_running = False + self.end_time = time.time() + + def count(self) -> int: + if not self.current_step >= self.total_step: + self.current_step += 1 + return self.current_step + + @property + def timing(self) -> float: + run_steps = self.current_step - self.last_start_step + self.last_start_step = self.current_step + time_used = time.time() - self.last_time + self.last_time = time.time() + return time_used / run_steps + + @property + def is_running(self) -> bool: + return self._is_running + + @property + def eta(self) -> str: + if not self.is_running: + return '00:00:00' + scale = self.total_step / self.current_step + remaining_time = (time.time() - self.start_time) * scale + return seconds_to_hms(remaining_time) + + +def seconds_to_hms(seconds: int) -> str: + '''Convert the number of seconds to hh:mm:ss''' + h = math.floor(seconds / 3600) + m = math.floor((seconds - h * 3600) / 60) + s = int(seconds - h * 3600 - m * 60) + hms_str = '{:0>2}:{:0>2}:{:0>2}'.format(h, m, s) + return hms_str From 97ec01260b4188778256ef0a509ff77e8048f88a Mon Sep 17 00:00:00 2001 From: xiongxinlei Date: Fri, 4 Mar 2022 16:25:55 +0800 Subject: [PATCH 15/41] add speaker verification using cosine score, test=doc --- .../sv0/local/speaker_verification_cosine.py | 238 ++++++++++++++++++ examples/voxceleb/sv0/run.sh | 34 ++- 2 files changed, 267 insertions(+), 5 deletions(-) create mode 100644 examples/voxceleb/sv0/local/speaker_verification_cosine.py diff --git a/examples/voxceleb/sv0/local/speaker_verification_cosine.py b/examples/voxceleb/sv0/local/speaker_verification_cosine.py new file mode 100644 index 00000000..5665b5ee --- /dev/null +++ b/examples/voxceleb/sv0/local/speaker_verification_cosine.py @@ -0,0 +1,238 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
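A hedged usage sketch for the Timer added above, mirroring how train.py drives it; the total step count and the sleep are toy stand-ins for real training steps:

```python
import time

from paddlespeech.vector.training.time import Timer  # module path created by this patch

timer = Timer(total_step=10)  # train.py passes steps_per_epoch * args.epochs
timer.start()
for _ in range(10):
    time.sleep(0.01)   # stand-in for one training step
    timer.count()      # advance the step counter
print(timer.timing)    # seconds per step since the last query
print(timer.eta)       # remaining time formatted as 'hh:mm:ss'
timer.stop()
```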
+ +import argparse +import ast +import os + +import numpy as np +import paddle +from paddle.io import BatchSampler +from paddle.io import DataLoader +import paddle.nn.functional as F +from paddlespeech.vector.training.metrics import compute_eer +from paddleaudio.datasets.voxceleb import VoxCeleb1 +from paddlespeech.vector.models.ecapa_tdnn import EcapaTdnn +from paddlespeech.vector.training.sid_model import SpeakerIdetification +from tqdm import tqdm + + +def pad_right_2d(x, target_length, axis=-1, mode='constant', **kwargs): + x = np.asarray(x) + assert len( + x.shape) == 2, f'Only 2D arrays supported, but got shape: {x.shape}' + + w = target_length - x.shape[axis] + assert w >= 0, f'Target length {target_length} is less than origin length {x.shape[axis]}' + + if axis == 0: + pad_width = [[0, w], [0, 0]] + else: + pad_width = [[0, 0], [0, w]] + + return np.pad(x, pad_width, mode=mode, **kwargs) + + +def feature_normalize(batch, mean_norm: bool = True, std_norm: bool = True): + ids = [item['id'] for item in batch] + lengths = np.asarray([item['feat'].shape[1] for item in batch]) + feats = list( + map(lambda x: pad_right_2d(x, lengths.max()), + [item['feat'] for item in batch])) + feats = np.stack(feats) + + # Features normalization if needed + for i in range(len(feats)): + feat = feats[i][:, :lengths[i]] # Excluding pad values. + mean = feat.mean(axis=-1, keepdims=True) if mean_norm else 0 + std = feat.std(axis=-1, keepdims=True) if std_norm else 1 + feats[i][:, :lengths[i]] = (feat - mean) / std + assert feats[i][:, lengths[i]:].sum( + ) == 0 # Padding valus should all be 0. + + # Converts into ratios. + lengths = (lengths / lengths.max()).astype(np.float32) + + return {'ids': ids, 'feats': feats, 'lengths': lengths} + + +def main(args): + # stage0: set the training device, cpu or gpu + paddle.set_device(args.device) + + # stage1: build the dnn backbone model network + ##"channels": [1024, 1024, 1024, 1024, 3072], + model_conf = { + "input_size": 80, + "channels": [512, 512, 512, 512, 1536], + "kernel_sizes": [5, 3, 3, 3, 1], + "dilations": [1, 2, 3, 4, 1], + "attention_channels": 128, + "lin_neurons": 192, + } + ecapa_tdnn = EcapaTdnn(**model_conf) + + # stage2: build the speaker verification eval instance with backbone model + model = SpeakerIdetification( + backbone=ecapa_tdnn, num_class=VoxCeleb1.num_speakers) + + # stage3: load the pre-trained model + args.load_checkpoint = os.path.abspath( + os.path.expanduser(args.load_checkpoint)) + + # load model checkpoint to sid model + state_dict = paddle.load( + os.path.join(args.load_checkpoint, 'model.pdparams')) + model.set_state_dict(state_dict) + print(f'Checkpoint loaded from {args.load_checkpoint}') + + # stage4: construct the enroll and test dataloader + enrol_ds = VoxCeleb1(subset='enrol', + feat_type='melspectrogram', + random_chunk=False, + n_mels=80, + window_size=400, + hop_length=160) + enrol_sampler = BatchSampler( + enrol_ds, + batch_size=args.batch_size, + shuffle=True) # Shuffle to make embedding normalization more robust. 
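pad_right_2d above right-pads each 2-D feature matrix with zeros to the batch maximum before stacking; a quick runnable check with toy sizes:

```python
import numpy as np

feat = np.ones((80, 3), dtype="float32")  # (n_mels, num_frames)
padded = np.pad(feat, [[0, 0], [0, 2]], mode="constant")  # pad time axis 3 -> 5
assert padded.shape == (80, 5)
assert padded[:, 3:].sum() == 0  # pad values are zeros, which the eval-side
                                 # feature_normalize later asserts on
```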
+    enrol_loader = DataLoader(enrol_ds,
+                              batch_sampler=enrol_sampler,
+                              collate_fn=lambda x: feature_normalize(
+                                  x, mean_norm=True, std_norm=False),
+                              num_workers=args.num_workers,
+                              return_list=True,)
+
+    test_ds = VoxCeleb1(subset='test',
+                        feat_type='melspectrogram',
+                        random_chunk=False,
+                        n_mels=80,
+                        window_size=400,
+                        hop_length=160)
+
+    test_sampler = BatchSampler(test_ds,
+                                batch_size=args.batch_size,
+                                shuffle=True)
+    test_loader = DataLoader(test_ds,
+                             batch_sampler=test_sampler,
+                             collate_fn=lambda x: feature_normalize(
+                                 x, mean_norm=True, std_norm=False),
+                             num_workers=args.num_workers,
+                             return_list=True,)
+    # stage6: we must set the model to eval mode
+    model.eval()
+
+    # stage7: global embedding norm to improve the performance
+    if args.global_embedding_norm:
+        embedding_mean = None
+        embedding_std = None
+        mean_norm = args.embedding_mean_norm
+        std_norm = args.embedding_std_norm
+        batch_count = 0
+
+    # stage8: Compute embeddings of audios in enrol and test dataset from model.
+    id2embedding = {}
+    # Run multi times to make embedding normalization more stable.
+    for i in range(2):
+        for dl in [enrol_loader, test_loader]:
+            print(
+                f'Loop {i+1}: Computing embeddings on {dl.dataset.subset} dataset'
+            )
+            with paddle.no_grad():
+                for batch_idx, batch in enumerate(tqdm(dl)):
+
+                    # stage 8-1: extract the audio embedding
+                    ids, feats, lengths = batch['ids'], batch['feats'], batch[
+                        'lengths']
+                    embeddings = model.backbone(feats, lengths).squeeze(
+                        -1).numpy()  # (N, emb_size, 1) -> (N, emb_size)
+
+                    # Global embedding normalization.
+                    if args.global_embedding_norm:
+                        batch_count += 1
+                        mean = embeddings.mean(axis=0) if mean_norm else 0
+                        std = embeddings.std(axis=0) if std_norm else 1
+                        # Update global mean and std.
+                        if embedding_mean is None and embedding_std is None:
+                            embedding_mean, embedding_std = mean, std
+                        else:
+                            weight = 1 / batch_count  # Weight decay by batches.
+                            embedding_mean = (
+                                1 - weight) * embedding_mean + weight * mean
+                            embedding_std = (
+                                1 - weight) * embedding_std + weight * std
+                        # Apply global embedding normalization.
+                        embeddings = (embeddings - embedding_mean) / embedding_std
+
+                    # Update embedding dict.
+                    id2embedding.update(dict(zip(ids, embeddings)))
+
+    # stage 9: Compute cosine scores.
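+    # For reference, the scoring math used below: the verification score of an
+    # (enrol, test) trial is the cosine similarity
+    #     score(e, t) = dot(e, t) / (||e|| * ||t||),  in [-1, 1],
+    # and compute_eer sweeps a threshold over these scores to find the
+    # operating point where the false-accept rate equals the false-reject
+    # rate, returning both that equal error rate and the threshold.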
+    labels = []
+    enrol_ids = []
+    test_ids = []
+    with open(VoxCeleb1.veri_test_file, 'r') as f:
+        for line in f.readlines():
+            label, enrol_id, test_id = line.strip().split(' ')
+            labels.append(int(label))
+            enrol_ids.append(enrol_id.split('.')[0].replace('/', '-'))
+            test_ids.append(test_id.split('.')[0].replace('/', '-'))
+
+    cos_sim_func = paddle.nn.CosineSimilarity(axis=1)
+    enrol_embeddings, test_embeddings = map(lambda ids: paddle.to_tensor(
+        np.asarray([id2embedding[id] for id in ids], dtype='float32')),
+                                            [enrol_ids, test_ids
+                                             ])  # (N, emb_size)
+    scores = cos_sim_func(enrol_embeddings, test_embeddings)
+    EER, threshold = compute_eer(np.asarray(labels), scores.numpy())
+    print(
+        f'EER of verification test: {EER*100:.4f}%, score threshold: {threshold:.5f}'
+    )
+
+if __name__ == "__main__":
+    # yapf: disable
+    parser = argparse.ArgumentParser(__doc__)
+    parser.add_argument('--device',
+                        choices=['cpu', 'gpu'],
+                        default="gpu",
+                        help="Select which device to train model, defaults to gpu.")
+    parser.add_argument("--batch-size",
+                        type=int,
+                        default=16,
+                        help="Total examples' number in batch for training.")
+    parser.add_argument("--num-workers",
+                        type=int,
+                        default=0,
+                        help="Number of workers in dataloader.")
+    parser.add_argument("--load-checkpoint",
+                        type=str,
+                        default='',
+                        help="Directory to load model checkpoint to continue training.")
+    parser.add_argument("--global-embedding-norm",
+                        type=bool,
+                        default=True,
+                        help="Apply global normalization on speaker embeddings.")
+    parser.add_argument("--embedding-mean-norm",
+                        type=bool,
+                        default=True,
+                        help="Apply mean normalization on speaker embeddings.")
+    parser.add_argument("--embedding-std-norm",
+                        type=bool,
+                        default=False,
+                        help="Apply std normalization on speaker embeddings.")
+    args = parser.parse_args()
+    # yapf: enable
+
+    main(args)
\ No newline at end of file

diff --git a/examples/voxceleb/sv0/run.sh b/examples/voxceleb/sv0/run.sh
index a96c3827..c3b31ce5 100755
--- a/examples/voxceleb/sv0/run.sh
+++ b/examples/voxceleb/sv0/run.sh
@@ -2,9 +2,33 @@
 . ./path.sh
 set -e
 
-dir=./data/
-mkdir -p ${dir}
+#######################################################################
+# stage 1: train the speaker identification model
+# stage 2: test the speaker verification with cosine scoring
+# stage 3: extract the training embeddings to train the LDA and PLDA
+######################################################################
 
 # you can set the variable PPAUDIO_HOME to specify where the vox1 and vox2 datasets are downloaded
-python3 \
-    local/train.py \
-    --data-dir ${dir}
+# by default the datasets are stored under ~/.paddleaudio/
+# export PPAUDIO_HOME=
+
+stage=2
+dir=data/                  # data directory
+exp_dir=exp/ecapa-tdnn/    # experiment directory
+mkdir -p ${dir}
+
+if [ $stage -le 1 ]; then
+    # stage 1: train the speaker identification model
+    python3 \
+        -m paddle.distributed.launch --gpus=0,1,2,3 \
+        local/train.py --device "gpu" --checkpoint-dir ${exp_dir} \
+        --save-freq 10 --data-dir ${dir} --batch-size 256 --epochs 60
+fi
+
+if [ $stage -le 2 ]; then
+    # stage 2: test the speaker verification with cosine scoring
+    python3 \
+        local/speaker_verification_cosine.py \
+        --load-checkpoint ${exp_dir}/epoch_40/
+fi
+

From 016ed6d69cedff84c1913cc9650ea88c47b52dcc Mon Sep 17 00:00:00 2001
From: xiongxinlei
Date: Fri, 4 Mar 2022 17:20:37 +0800
Subject: [PATCH 16/41] repair the code according to the part comment,
 test=doc

---
 .../sv0/local/speaker_verification_cosine.py  | 108 +++++++++---------
 examples/voxceleb/sv0/local/train.py          |  19 +--
 paddleaudio/datasets/voxceleb.py              |   3 +
 paddleaudio/utils/download.py                 |  34 +++---
 paddlespeech/vector/{datasets => io}/batch.py |   0
 .../vector/{layers => modules}/loss.py        |   3 +-
 paddlespeech/vector/{layers => modules}/lr.py |   0
 .../vector/{training => modules}/sid_model.py |   0
 paddlespeech/vector/training/metrics.py       |  28 +++++
 paddlespeech/vector/utils/download.py         |  72 ++++++++++++
 .../vector/{training => utils}/time.py        |   0
 11 files changed, 182 insertions(+), 85 deletions(-)
 rename paddlespeech/vector/{datasets => io}/batch.py (100%)
 rename paddlespeech/vector/{layers => modules}/loss.py (99%)
 rename paddlespeech/vector/{layers => modules}/lr.py (100%)
 rename paddlespeech/vector/{training => modules}/sid_model.py (100%)
 create mode 100644 paddlespeech/vector/training/metrics.py
 create mode 100644 paddlespeech/vector/utils/download.py
 rename paddlespeech/vector/{training => utils}/time.py (100%)

diff --git a/examples/voxceleb/sv0/local/speaker_verification_cosine.py b/examples/voxceleb/sv0/local/speaker_verification_cosine.py
index 5665b5ee..1959e85c 100644
--- a/examples/voxceleb/sv0/local/speaker_verification_cosine.py
+++ b/examples/voxceleb/sv0/local/speaker_verification_cosine.py
@@ -11,21 +11,21 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 import argparse
 import ast
 import os
 
 import numpy as np
 import paddle
+import paddle.nn.functional as F
 from paddle.io import BatchSampler
 from paddle.io import DataLoader
-import paddle.nn.functional as F
-from paddlespeech.vector.training.metrics import compute_eer
+from tqdm import tqdm
+
 from paddleaudio.datasets.voxceleb import VoxCeleb1
 from paddlespeech.vector.models.ecapa_tdnn import EcapaTdnn
-from paddlespeech.vector.training.sid_model import SpeakerIdetification
-from tqdm import tqdm
+from paddlespeech.vector.modules.sid_model import SpeakerIdetification
+from paddlespeech.vector.training.metrics import compute_eer
 
 
 def pad_right_2d(x, target_length, axis=-1, mode='constant', **kwargs):
@@ -44,7 +44,7 @@ def pad_right_2d(x, target_length, axis=-1, mode='constant', **kwargs):
     return np.pad(x, pad_width, mode=mode, **kwargs)
 
 
-def feature_normalize(batch, mean_norm: bool = True, std_norm: bool = True):
+def feature_normalize(batch, mean_norm: bool=True, std_norm: bool=True):
     ids = [item['id'] for item in batch]
     lengths = np.asarray([item['feat'].shape[1] for item in batch])
     feats = list(
@@ -58,8 +58,8 @@ def feature_normalize(batch, mean_norm: bool=True, std_norm: bool=True):
         mean = feat.mean(axis=-1, keepdims=True) if mean_norm else 0
         std = feat.std(axis=-1, keepdims=True) if std_norm else 1
         feats[i][:, :lengths[i]] = (feat - mean) / std
-        assert feats[i][:, lengths[i]:].sum(
-        ) == 0  # Padding values should all be 0.
+        assert feats[i][:, lengths[
+            i]:].sum() == 0  # Padding values should all be 0.
 
     # Converts into ratios.
     lengths = (lengths / lengths.max()).astype(np.float32)
@@ -98,16 +98,16 @@ def main(args):
     print(f'Checkpoint loaded from {args.load_checkpoint}')
 
     # stage4: construct the enroll and test dataloader
-    enrol_ds = VoxCeleb1(subset='enrol',
-                         feat_type='melspectrogram',
-                         random_chunk=False,
-                         n_mels=80,
-                         window_size=400,
-                         hop_length=160)
+    enrol_ds = VoxCeleb1(
+        subset='enrol',
+        feat_type='melspectrogram',
+        random_chunk=False,
+        n_mels=80,
+        window_size=400,
+        hop_length=160)
     enrol_sampler = BatchSampler(
-        enrol_ds,
-        batch_size=args.batch_size,
-        shuffle=True)  # Shuffle to make embedding normalization more robust.
+        enrol_ds, batch_size=args.batch_size,
+        shuffle=True)  # Shuffle to make embedding normalization more robust.
     enrol_loader = DataLoader(enrol_ds,
                               batch_sampler=enrol_sampler,
                               collate_fn=lambda x: feature_normalize(
@@ -115,16 +115,16 @@ def main(args):
                               num_workers=args.num_workers,
                               return_list=True,)
 
-    test_ds = VoxCeleb1(subset='test',
-                        feat_type='melspectrogram',
-                        random_chunk=False,
-                        n_mels=80,
-                        window_size=400,
-                        hop_length=160)
+    test_ds = VoxCeleb1(
+        subset='test',
+        feat_type='melspectrogram',
+        random_chunk=False,
+        n_mels=80,
+        window_size=400,
+        hop_length=160)
 
-    test_sampler = BatchSampler(test_ds,
-                                batch_size=args.batch_size,
-                                shuffle=True)
+    test_sampler = BatchSampler(
+        test_ds, batch_size=args.batch_size, shuffle=True)
     test_loader = DataLoader(test_ds,
                              batch_sampler=test_sampler,
                              collate_fn=lambda x: feature_normalize(
@@ -169,12 +169,13 @@ def main(args):
                             embedding_mean, embedding_std = mean, std
                         else:
                             weight = 1 / batch_count  # Weight decay by batches.
-                            embedding_mean = (
-                                1 - weight) * embedding_mean + weight * mean
-                            embedding_std = (
-                                1 - weight) * embedding_std + weight * std
+                            embedding_mean = (1 - weight
+                                              ) * embedding_mean + weight * mean
+                            embedding_std = (1 - weight
+                                             ) * embedding_std + weight * std
                         # Apply global embedding normalization.
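                         # With weight = 1 / batch_count, the update above is
                         # just the cumulative average of the per-batch stats:
                         #     mean_k = (1 - 1/k) * mean_{k-1} + (1/k) * m_k
                         # so every batch seen so far contributes equally to
                         # the global statistics applied below.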
-                        embeddings = (embeddings - embedding_mean) / embedding_std
+                        embeddings = (
+                            embeddings - embedding_mean) / embedding_std
 
                     # Update embedding dict.
                     id2embedding.update(dict(zip(ids, embeddings)))
@@ -201,38 +202,39 @@ def main(args):
         f'EER of verification test: {EER*100:.4f}%, score threshold: {threshold:.5f}'
     )
 
+
 if __name__ == "__main__":
     # yapf: disable
     parser = argparse.ArgumentParser(__doc__)
-    parser.add_argument('--device', 
-                        choices=['cpu', 'gpu'], 
-                        default="gpu", 
+    parser.add_argument('--device',
+                        choices=['cpu', 'gpu'],
+                        default="gpu",
                         help="Select which device to train model, defaults to gpu.")
-    parser.add_argument("--batch-size", 
-                        type=int, 
-                        default=16, 
+    parser.add_argument("--batch-size",
+                        type=int,
+                        default=16,
                         help="Total examples' number in batch for training.")
-    parser.add_argument("--num-workers", 
-                        type=int, 
-                        default=0, 
+    parser.add_argument("--num-workers",
+                        type=int,
+                        default=0,
                         help="Number of workers in dataloader.")
-    parser.add_argument("--load-checkpoint", 
-                        type=str, 
-                        default='', 
+    parser.add_argument("--load-checkpoint",
+                        type=str,
+                        default='',
                         help="Directory to load model checkpoint to continue training.")
-    parser.add_argument("--global-embedding-norm", 
-                        type=bool, 
-                        default=True, 
+    parser.add_argument("--global-embedding-norm",
+                        type=bool,
+                        default=True,
                         help="Apply global normalization on speaker embeddings.")
-    parser.add_argument("--embedding-mean-norm", 
-                        type=bool, 
-                        default=True, 
+    parser.add_argument("--embedding-mean-norm",
+                        type=bool,
+                        default=True,
                         help="Apply mean normalization on speaker embeddings.")
-    parser.add_argument("--embedding-std-norm", 
-                        type=bool, 
-                        default=False, 
+    parser.add_argument("--embedding-std-norm",
+                        type=bool,
+                        default=False,
                         help="Apply std normalization on speaker embeddings.")
     args = parser.parse_args()
     # yapf: enable
 
-    main(args)
\ No newline at end of file
+    main(args)
diff --git a/examples/voxceleb/sv0/local/train.py b/examples/voxceleb/sv0/local/train.py
index f86b0a86..4eabf94c 100644
--- a/examples/voxceleb/sv0/local/train.py
+++ b/examples/voxceleb/sv0/local/train.py
@@ -22,22 +22,23 @@ from paddle.io import DistributedBatchSampler
 
 from paddleaudio.datasets.voxceleb import VoxCeleb1
 from paddleaudio.features.core import melspectrogram
-from paddlespeech.vector.training.time import Timer
-from paddlespeech.vector.datasets.batch import feature_normalize
-from paddlespeech.vector.datasets.batch import waveform_collate_fn
-from paddlespeech.vector.layers.loss import AdditiveAngularMargin
-from paddlespeech.vector.layers.loss import LogSoftmaxWrapper
-from paddlespeech.vector.layers.lr import CyclicLRScheduler
+from paddlespeech.vector.io.batch import feature_normalize
+from paddlespeech.vector.io.batch import waveform_collate_fn
 from paddlespeech.vector.models.ecapa_tdnn import EcapaTdnn
-from paddlespeech.vector.training.sid_model import SpeakerIdetification
+from paddlespeech.vector.modules.loss import AdditiveAngularMargin
+from paddlespeech.vector.modules.loss import LogSoftmaxWrapper
+from paddlespeech.vector.modules.lr import CyclicLRScheduler
+from paddlespeech.vector.modules.sid_model import SpeakerIdetification
+from paddlespeech.vector.utils.time import Timer
 
 # feat configuration
 cpu_feat_conf = {
     'n_mels': 80,
-    'window_size': 400,
-    'hop_length': 160,
+    'window_size': 400,  # samples (25 ms at 16 kHz)
+    'hop_length': 160,  # samples (10 ms at 16 kHz)
 }
 
+
 def main(args):
     # stage0: set the training device, cpu or gpu
     paddle.set_device(args.device)
diff --git a/paddleaudio/datasets/voxceleb.py b/paddleaudio/datasets/voxceleb.py
index 70cf3e7a..760db721 100644
--- a/paddleaudio/datasets/voxceleb.py
+++ b/paddleaudio/datasets/voxceleb.py
@@ -76,6 +76,9 @@ class VoxCeleb1(Dataset):
         'META_INFO', ('id', 'duration', 'wav', 'start', 'stop', 'spk_id'))
     base_path = os.path.join(DATA_HOME, 'vox1')
     wav_path = os.path.join(base_path, 'wav')
+    meta_path = os.path.join(base_path, 'meta')
+    veri_test_file = os.path.join(meta_path, 'veri_test2.txt')
+    csv_path = os.path.join(base_path, 'csv')
     subsets = ['train', 'dev', 'enrol', 'test']
 
     def __init__(
diff --git a/paddleaudio/utils/download.py b/paddleaudio/utils/download.py
index a0c02ee1..0535249b 100644
--- a/paddleaudio/utils/download.py
+++ b/paddleaudio/utils/download.py
@@ -22,30 +22,22 @@ from .log import logger
 
 download.logger = logger
 
+__all__ = [
+    'decompress',
+    'download_and_decompress',
+    'load_state_dict_from_url',
+]
 
-def decompress(file: str, path: str=os.PathLike):
+
+def decompress(file: str):
     """
-    Extracts all files from a compressed file to specific path.
+    Extracts all files from a compressed file.
     """
     assert os.path.isfile(file), "File: {} does not exist.".format(file)
+    download._decompress(file)
 
-    if path is None:
-        print("decompress the data: {}".format(file))
-        download._decompress(file)
-    else:
-        print("decompress the data: {} to {}".format(file, path))
-        if not os.path.isdir(path):
-            os.makedirs(path)
-
-        tmp_file = os.path.join(path, os.path.basename(file))
-        os.rename(file, tmp_file)
-        download._decompress(tmp_file)
-        os.rename(tmp_file, file)
-
-def download_and_decompress(archives: List[Dict[str, str]],
-                            path: str,
-                            decompress: bool=True):
+
+def download_and_decompress(archives: List[Dict[str, str]], path: str):
     """
     Download archives and decompress to specific path.
     """
@@ -55,8 +47,8 @@ def download_and_decompress(archives: List[Dict[str, str]], path: str):
     for archive in archives:
         assert 'url' in archive and 'md5' in archive, \
             f'Dictionary keys of "url" and "md5" are required in the archive, but got: {list(archive.keys())}'
-        download.get_path_from_url(
-            archive['url'], path, archive['md5'], decompress=decompress)
+
+        download.get_path_from_url(archive['url'], path, archive['md5'])
 
 
 def load_state_dict_from_url(url: str, path: str, md5: str=None):
@@ -67,4 +59,4 @@ def load_state_dict_from_url(url: str, path: str, md5: str=None):
         os.makedirs(path)
 
     download.get_path_from_url(url, path, md5)
-    return load_state_dict(os.path.join(path, os.path.basename(url)))
+    return load_state_dict(os.path.join(path, os.path.basename(url)))
\ No newline at end of file
diff --git a/paddlespeech/vector/datasets/batch.py b/paddlespeech/vector/io/batch.py
similarity index 100%
rename from paddlespeech/vector/datasets/batch.py
rename to paddlespeech/vector/io/batch.py
diff --git a/paddlespeech/vector/layers/loss.py b/paddlespeech/vector/modules/loss.py
similarity index 99%
rename from paddlespeech/vector/layers/loss.py
rename to paddlespeech/vector/modules/loss.py
index bf632b13..1aa0599a 100644
--- a/paddlespeech/vector/layers/loss.py
+++ b/paddlespeech/vector/modules/loss.py
@@ -11,7 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 import math
 
 import paddle
@@ -67,4 +66,4 @@ class LogSoftmaxWrapper(nn.Layer):
         predictions = F.log_softmax(predictions, axis=1)
         loss = self.criterion(predictions, targets) / targets.sum()
 
-        return loss
\ No newline at end of file
+        return loss
diff --git a/paddlespeech/vector/layers/lr.py b/paddlespeech/vector/modules/lr.py
similarity index 100%
rename from paddlespeech/vector/layers/lr.py
rename to paddlespeech/vector/modules/lr.py
diff --git a/paddlespeech/vector/training/sid_model.py b/paddlespeech/vector/modules/sid_model.py
similarity index 100%
rename from paddlespeech/vector/training/sid_model.py
rename to paddlespeech/vector/modules/sid_model.py
diff --git a/paddlespeech/vector/training/metrics.py b/paddlespeech/vector/training/metrics.py
new file mode 100644
index 00000000..65dc7a3c
--- /dev/null
+++ b/paddlespeech/vector/training/metrics.py
@@ -0,0 +1,28 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import List
+
+import numpy as np
+from sklearn.metrics import roc_curve
+
+
+def compute_eer(labels: np.ndarray, scores: np.ndarray) -> List[float]:
+    '''
+    Compute EER and return score threshold.
+    '''
+    fpr, tpr, threshold = roc_curve(y_true=labels, y_score=scores)
+    fnr = 1 - tpr
+    eer_threshold = threshold[np.nanargmin(np.absolute((fnr - fpr)))]
+    eer = fpr[np.nanargmin(np.absolute((fnr - fpr)))]
+    return eer, eer_threshold
diff --git a/paddlespeech/vector/utils/download.py b/paddlespeech/vector/utils/download.py
new file mode 100644
index 00000000..476bfea7
--- /dev/null
+++ b/paddlespeech/vector/utils/download.py
@@ -0,0 +1,72 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+from typing import Dict
+from typing import List
+
+from paddle.framework import load as load_state_dict
+from paddle.utils import download
+
+__all__ = [
+    'decompress',
+    'download_and_decompress',
+    'load_state_dict_from_url',
+]
+
+
+def decompress(file: str, path: str=None):
+    """
+    Extracts all files from a compressed file to specific path.
+ """ + assert os.path.isfile(file), "File: {} not exists.".format(file) + + if path is None: + print("decompress the data: {}".format(file)) + download._decompress(file) + else: + print("decompress the data: {} to {}".format(file, path)) + if not os.path.isdir(path): + os.makedirs(path) + + tmp_file = os.path.join(path, os.path.basename(file)) + os.rename(file, tmp_file) + download._decompress(tmp_file) + os.rename(tmp_file, file) + + +def download_and_decompress(archives: List[Dict[str, str]], + path: str, + decompress: bool=True): + """ + Download archieves and decompress to specific path. + """ + if not os.path.isdir(path): + os.makedirs(path) + + for archive in archives: + assert 'url' in archive and 'md5' in archive, \ + 'Dictionary keys of "url" and "md5" are required in the archive, but got: {list(archieve.keys())}' + download.get_path_from_url( + archive['url'], path, archive['md5'], decompress=decompress) + + +def load_state_dict_from_url(url: str, path: str, md5: str=None): + """ + Download and load a state dict from url + """ + if not os.path.isdir(path): + os.makedirs(path) + + download.get_path_from_url(url, path, md5) + return load_state_dict(os.path.join(path, os.path.basename(url))) diff --git a/paddlespeech/vector/training/time.py b/paddlespeech/vector/utils/time.py similarity index 100% rename from paddlespeech/vector/training/time.py rename to paddlespeech/vector/utils/time.py From ac4967e204e14f6b96efc69132deeeaa89d8e4cd Mon Sep 17 00:00:00 2001 From: xiongxinlei Date: Sun, 6 Mar 2022 17:54:19 +0800 Subject: [PATCH 17/41] optimize the data prepare process --- paddlespeech/vector/utils/time.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/paddlespeech/vector/utils/time.py b/paddlespeech/vector/utils/time.py index 3a4e183d..8e85b0e1 100644 --- a/paddlespeech/vector/utils/time.py +++ b/paddlespeech/vector/utils/time.py @@ -53,8 +53,7 @@ class Timer(object): def eta(self) -> str: if not self.is_running: return '00:00:00' - scale = self.total_step / self.current_step - remaining_time = (time.time() - self.start_time) * scale + remaining_time = time.time() - self.start_time return seconds_to_hms(remaining_time) From 2d89c80e6f85cc9cae841baef529a769715eb51f Mon Sep 17 00:00:00 2001 From: xiongxinlei Date: Mon, 7 Mar 2022 21:56:38 +0800 Subject: [PATCH 18/41] add waveform augment pipeline, test=doc --- .../sv0/local/speaker_verification_cosine.py | 76 +- examples/voxceleb/sv0/local/train.py | 46 +- paddleaudio/datasets/rirs_noises.py | 207 ++++ paddleaudio/datasets/voxceleb.py | 18 +- paddlespeech/vector/io/augment.py | 899 ++++++++++++++++++ paddlespeech/vector/io/signal_processing.py | 219 +++++ paddlespeech/vector/models/ecapa_tdnn.py | 93 ++ paddlespeech/vector/training/seeding.py | 28 + 8 files changed, 1543 insertions(+), 43 deletions(-) create mode 100644 paddleaudio/datasets/rirs_noises.py create mode 100644 paddlespeech/vector/io/augment.py create mode 100644 paddlespeech/vector/io/signal_processing.py create mode 100644 paddlespeech/vector/training/seeding.py diff --git a/examples/voxceleb/sv0/local/speaker_verification_cosine.py b/examples/voxceleb/sv0/local/speaker_verification_cosine.py index 1959e85c..b0adcf66 100644 --- a/examples/voxceleb/sv0/local/speaker_verification_cosine.py +++ b/examples/voxceleb/sv0/local/speaker_verification_cosine.py @@ -23,9 +23,13 @@ from paddle.io import DataLoader from tqdm import tqdm from paddleaudio.datasets.voxceleb import VoxCeleb1 +from paddlespeech.s2t.utils.log import Log from 
paddlespeech.vector.models.ecapa_tdnn import EcapaTdnn from paddlespeech.vector.modules.sid_model import SpeakerIdetification from paddlespeech.vector.training.metrics import compute_eer +from paddlespeech.vector.training.seeding import seed_everything + +logger = Log(__name__).getlog() def pad_right_2d(x, target_length, axis=-1, mode='constant', **kwargs): @@ -67,9 +71,19 @@ def feature_normalize(batch, mean_norm: bool=True, std_norm: bool=True): return {'ids': ids, 'feats': feats, 'lengths': lengths} +# feat configuration +cpu_feat_conf = { + 'n_mels': 80, + 'window_size': 400, #ms + 'hop_length': 160, #ms +} + + def main(args): # stage0: set the training device, cpu or gpu paddle.set_device(args.device) + # set the random seed, it is a must for multiprocess training + seed_everything(args.seed) # stage1: build the dnn backbone model network ##"channels": [1024, 1024, 1024, 1024, 3072], @@ -95,19 +109,18 @@ def main(args): state_dict = paddle.load( os.path.join(args.load_checkpoint, 'model.pdparams')) model.set_state_dict(state_dict) - print(f'Checkpoint loaded from {args.load_checkpoint}') + logger.info(f'Checkpoint loaded from {args.load_checkpoint}') # stage4: construct the enroll and test dataloader enrol_ds = VoxCeleb1( subset='enrol', + target_dir=args.data_dir, feat_type='melspectrogram', random_chunk=False, - n_mels=80, - window_size=400, - hop_length=160) + **cpu_feat_conf) enrol_sampler = BatchSampler( enrol_ds, batch_size=args.batch_size, - shuffle=True) # Shuffle to make embedding normalization more robust. + shuffle=False) # Shuffle to make embedding normalization more robust. enrol_loader = DataLoader(enrol_ds, batch_sampler=enrol_sampler, collate_fn=lambda x: feature_normalize( @@ -117,14 +130,13 @@ def main(args): test_ds = VoxCeleb1( subset='test', + target_dir=args.data_dir, feat_type='melspectrogram', random_chunk=False, - n_mels=80, - window_size=400, - hop_length=160) + **cpu_feat_conf) test_sampler = BatchSampler( - test_ds, batch_size=args.batch_size, shuffle=True) + test_ds, batch_size=args.batch_size, shuffle=False) test_loader = DataLoader(test_ds, batch_sampler=test_sampler, collate_fn=lambda x: feature_normalize( @@ -136,10 +148,10 @@ def main(args): # stage7: global embedding norm to imporve the performance if args.global_embedding_norm: - embedding_mean = None - embedding_std = None - mean_norm = args.embedding_mean_norm - std_norm = args.embedding_std_norm + global_embedding_mean = None + global_embedding_std = None + mean_norm_flag = args.embedding_mean_norm + std_norm_flag = args.embedding_std_norm batch_count = 0 # stage8: Compute embeddings of audios in enrol and test dataset from model. @@ -147,7 +159,7 @@ def main(args): # Run multi times to make embedding normalization more stable. for i in range(2): for dl in [enrol_loader, test_loader]: - print( + logger.info( f'Loop {[i+1]}: Computing embeddings on {dl.dataset.subset} dataset' ) with paddle.no_grad(): @@ -162,20 +174,24 @@ def main(args): # Global embedding normalization. if args.global_embedding_norm: batch_count += 1 - mean = embeddings.mean(axis=0) if mean_norm else 0 - std = embeddings.std(axis=0) if std_norm else 1 + current_mean = embeddings.mean( + axis=0) if mean_norm_flag else 0 + current_std = embeddings.std( + axis=0) if std_norm_flag else 1 # Update global mean and std. 
-                        if embedding_mean is None and embedding_std is None:
-                            embedding_mean, embedding_std = mean, std
+                        if global_embedding_mean is None and global_embedding_std is None:
+                            global_embedding_mean, global_embedding_std = current_mean, current_std
                         else:
                             weight = 1 / batch_count  # Weight decay by batches.
-                            embedding_mean = (1 - weight
-                                              ) * embedding_mean + weight * mean
-                            embedding_std = (1 - weight
-                                             ) * embedding_std + weight * std
+                            global_embedding_mean = (
+                                1 - weight
+                            ) * global_embedding_mean + weight * current_mean
+                            global_embedding_std = (
+                                1 - weight
+                            ) * global_embedding_std + weight * current_std
                         # Apply global embedding normalization.
-                        embeddings = (
-                            embeddings - embedding_mean) / embedding_std
+                        embeddings = (embeddings - global_embedding_mean
+                                      ) / global_embedding_std
 
                     # Update embedding dict.
                     id2embedding.update(dict(zip(ids, embeddings)))
@@ -198,7 +214,7 @@ def main(args):
     scores = cos_sim_func(enrol_embeddings, test_embeddings)
     EER, threshold = compute_eer(np.asarray(labels), scores.numpy())
-    print(
+    logger.info(
         f'EER of verification test: {EER*100:.4f}%, score threshold: {threshold:.5f}'
     )
@@ -210,10 +226,18 @@ if __name__ == "__main__":
                         choices=['cpu', 'gpu'],
                         default="gpu",
                         help="Select which device to train model, defaults to gpu.")
+    parser.add_argument("--seed",
+                        default=0,
+                        type=int,
+                        help="random seed for paddle, numpy and python random package")
+    parser.add_argument("--data-dir",
+                        default="./data/",
+                        type=str,
+                        help="data directory")
     parser.add_argument("--batch-size",
                         type=int,
                         default=16,
-                        help="Total examples' number in batch for training.")
+                        help="Total examples' number in batch for extracting the embedding.")
     parser.add_argument("--num-workers",
                         type=int,
                         default=0,
diff --git a/examples/voxceleb/sv0/local/train.py b/examples/voxceleb/sv0/local/train.py
index 4eabf94c..745d5eab 100644
--- a/examples/voxceleb/sv0/local/train.py
+++ b/examples/voxceleb/sv0/local/train.py
@@ -22,6 +22,9 @@ from paddle.io import DistributedBatchSampler
 
 from paddleaudio.datasets.voxceleb import VoxCeleb1
 from paddleaudio.features.core import melspectrogram
+from paddlespeech.s2t.utils.log import Log
+from paddlespeech.vector.io.augment import build_augment_pipeline
+from paddlespeech.vector.io.augment import waveform_augment
 from paddlespeech.vector.io.batch import feature_normalize
 from paddlespeech.vector.io.batch import waveform_collate_fn
 from paddlespeech.vector.models.ecapa_tdnn import EcapaTdnn
@@ -29,8 +32,11 @@ from paddlespeech.vector.modules.loss import AdditiveAngularMargin
 from paddlespeech.vector.modules.loss import LogSoftmaxWrapper
 from paddlespeech.vector.modules.lr import CyclicLRScheduler
 from paddlespeech.vector.modules.sid_model import SpeakerIdetification
+from paddlespeech.vector.training.seeding import seed_everything
 from paddlespeech.vector.utils.time import Timer
 
+logger = Log(__name__).getlog()
+
 # feat configuration
 cpu_feat_conf = {
     'n_mels': 80,
@@ -47,12 +53,19 @@ def main(args):
     paddle.distributed.init_parallel_env()
     nranks = paddle.distributed.get_world_size()
     local_rank = paddle.distributed.get_rank()
+    # set the random seed, it is a must for multiprocess training
+    seed_everything(args.seed)
 
-    # stage2: data prepare
-    # note: some cmd must do in rank==0
+    # stage2: data preparation, such as the vox1 and vox2 data and the augmentation pipeline
+    # note: some commands must run only in rank==0, so we will refactor the data preparation code
     train_ds = VoxCeleb1('train', target_dir=args.data_dir)
     dev_ds = VoxCeleb1('dev', target_dir=args.data_dir)
 
+    if args.augment:
+        augment_pipeline = build_augment_pipeline(target_dir=args.data_dir)
+    else:
+        augment_pipeline = []
+
     # stage3: build the dnn backbone model network
     #"channels": [1024, 1024, 1024, 1024, 3072],
     model_conf = {
@@ -83,7 +96,7 @@ def main(args):
     # if pre-trained model exists, start epoch confirmed by the pre-trained model
     start_epoch = 0
     if args.load_checkpoint:
-        print("load the check point")
+        logger.info("load the checkpoint")
         args.load_checkpoint = os.path.abspath(
             os.path.expanduser(args.load_checkpoint))
         try:
@@ -97,14 +110,14 @@ def main(args):
                 os.path.join(args.load_checkpoint, 'model.pdopt'))
             optimizer.set_state_dict(state_dict)
             if local_rank == 0:
-                print(f'Checkpoint loaded from {args.load_checkpoint}')
+                logger.info(f'Checkpoint loaded from {args.load_checkpoint}')
         except FileExistsError:
             if local_rank == 0:
-                print('Train from scratch.')
+                logger.info('Train from scratch.')
 
         try:
             start_epoch = int(args.load_checkpoint[-1])
-            print(f'Restore training from epoch {start_epoch}.')
+            logger.info(f'Restore training from epoch {start_epoch}.')
         except ValueError:
             pass
 
@@ -137,7 +150,10 @@ def main(args):
             waveforms, labels = batch['waveforms'], batch['labels']
 
             # stage 9-2: audio sample augmentation, applied on the raw waveform samples
-            # todo
+            if len(augment_pipeline) != 0:
+                waveforms = waveform_augment(waveforms, augment_pipeline)
+                labels = paddle.concat(
+                    [labels for i in range(len(augment_pipeline) + 1)])
 
             # stage 9-3: extract the audio feats, such as fbank, mfcc, spectrogram
             feats = []
@@ -185,7 +201,7 @@ def main(args):
                 print_msg += ' acc={:.4f}'.format(avg_acc)
                 print_msg += ' lr={:.4E} step/sec={:.2f} | ETA {}'.format(
                     lr, timer.timing, timer.eta)
-                print(print_msg)
+                logger.info(print_msg)
 
                 avg_loss = 0
                 num_corrects = 0
@@ -217,7 +233,7 @@ def main(args):
                 num_samples = 0
 
                 # stage 9-13: evaluate on the validation dataset batches
-                print('Evaluate on validation dataset')
+                logger.info('Evaluate on validation dataset')
                 with paddle.no_grad():
                     for batch_idx, batch in enumerate(dev_loader):
                         waveforms, labels = batch['waveforms'], batch['labels']
@@ -238,12 +254,12 @@ def main(args):
 
                 print_msg = '[Evaluation result]'
                 print_msg += ' dev_acc={:.4f}'.format(num_corrects / num_samples)
-                print(print_msg)
+                logger.info(print_msg)
 
                 # stage 9-14: Save model parameters
                 save_dir = os.path.join(args.checkpoint_dir,
                                         'epoch_{}'.format(epoch))
-                print('Saving model checkpoint to {}'.format(save_dir))
+                logger.info('Saving model checkpoint to {}'.format(save_dir))
                 paddle.save(model.state_dict(),
                             os.path.join(save_dir, 'model.pdparams'))
                 paddle.save(optimizer.state_dict(),
@@ -260,6 +276,10 @@ if __name__ == "__main__":
                         choices=['cpu', 'gpu'],
                         default="cpu",
                         help="Select which device to train model, defaults to gpu.")
+    parser.add_argument("--seed",
+                        default=0,
+                        type=int,
+                        help="random seed for paddle, numpy and python random package")
     parser.add_argument("--data-dir",
                         default="./data/",
                         type=str,
@@ -295,6 +315,10 @@ if __name__ == "__main__":
                         type=str,
                         default='./checkpoint',
                         help="Directory to save model checkpoints.")
+    parser.add_argument("--augment",
+                        action="store_true",
+                        default=False,
+                        help="Apply audio augmentation.")
     args = parser.parse_args()
     # yapf: enable
 
diff --git a/paddleaudio/datasets/rirs_noises.py b/paddleaudio/datasets/rirs_noises.py
new file mode 100644
index 00000000..fa9e7f09
--- /dev/null
+++ b/paddleaudio/datasets/rirs_noises.py
@@ -0,0 +1,207 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import collections +import csv +import glob +import os +import random +from typing import Dict +from typing import List +from typing import Tuple + +from paddle.io import Dataset +from tqdm import tqdm + +from paddleaudio.backends import load as load_audio +from paddleaudio.backends import save_wav +from paddleaudio.datasets.dataset import feat_funcs +from paddleaudio.utils import DATA_HOME +from paddleaudio.utils import decompress +from paddlespeech.s2t.utils.log import Log +from paddlespeech.vector.utils.download import download_and_decompress + +logger = Log(__name__).getlog() + +__all__ = ['OpenRIRNoise'] + + +class OpenRIRNoise(Dataset): + archieves = [ + { + 'url': 'http://www.openslr.org/resources/28/rirs_noises.zip', + 'md5': 'e6f48e257286e05de56413b4779d8ffb', + }, + ] + + sample_rate = 16000 + meta_info = collections.namedtuple('META_INFO', ('id', 'duration', 'wav')) + base_path = os.path.join(DATA_HOME, 'open_rir_noise') + wav_path = os.path.join(base_path, 'RIRS_NOISES') + csv_path = os.path.join(base_path, 'csv') + subsets = ['rir', 'noise'] + + def __init__(self, + subset: str='rir', + feat_type: str='raw', + target_dir=None, + random_chunk: bool=True, + chunk_duration: float=3.0, + seed: int=0, + **kwargs): + + assert subset in self.subsets, \ + 'Dataset subset must be one in {}, but got {}'.format(self.subsets, subset) + + self.subset = subset + self.feat_type = feat_type + self.feat_config = kwargs + self.random_chunk = random_chunk + self.chunk_duration = chunk_duration + + self.csv_path = os.path.join(target_dir, "open_rir_noise", + "csv") if target_dir else self.csv_path + self._data = self._get_data() + super(OpenRIRNoise, self).__init__() + + # Set up a seed to reproduce training or predicting result. + # random.seed(seed) + + def _get_data(self): + # Download audio files. + logger.info(f"rirs noises base path: {self.base_path}") + if not os.path.isdir(self.base_path): + download_and_decompress( + self.archieves, self.base_path, decompress=True) + else: + logger.info( + f"{self.base_path} already exists, we will not download and decompress again" + ) + + # Data preparation. 
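+        # Each subset is materialized as a csv manifest with the columns
+        # (id, duration, wav): prepare_data() below writes rir.csv and
+        # noise.csv under self.csv_path, and _get_data() reads them back.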
+ logger.info(f"prepare the csv to {self.csv_path}") + if not os.path.isdir(self.csv_path): + os.makedirs(self.csv_path) + self.prepare_data() + + data = [] + with open(os.path.join(self.csv_path, f'{self.subset}.csv'), 'r') as rf: + for line in rf.readlines()[1:]: + audio_id, duration, wav = line.strip().split(',') + data.append(self.meta_info(audio_id, float(duration), wav)) + + random.shuffle(data) + return data + + def _convert_to_record(self, idx: int): + sample = self._data[idx] + + record = {} + # To show all fields in a namedtuple: `type(sample)._fields` + for field in type(sample)._fields: + record[field] = getattr(sample, field) + + waveform, sr = load_audio(record['wav']) + + assert self.feat_type in feat_funcs.keys(), \ + f"Unknown feat_type: {self.feat_type}, it must be one in {list(feat_funcs.keys())}" + feat_func = feat_funcs[self.feat_type] + feat = feat_func( + waveform, sr=sr, **self.feat_config) if feat_func else waveform + + record.update({'feat': feat}) + return record + + @staticmethod + def _get_chunks(seg_dur, audio_id, audio_duration): + num_chunks = int(audio_duration / seg_dur) # all in milliseconds + + chunk_lst = [ + audio_id + "_" + str(i * seg_dur) + "_" + str(i * seg_dur + seg_dur) + for i in range(num_chunks) + ] + return chunk_lst + + def _get_audio_info(self, wav_file: str, + split_chunks: bool) -> List[List[str]]: + waveform, sr = load_audio(wav_file) + audio_id = wav_file.split("/open_rir_noise/")[-1].split(".")[0] + audio_duration = waveform.shape[0] / sr + + ret = [] + if split_chunks and audio_duration > self.chunk_duration: # Split into pieces of self.chunk_duration seconds. + uniq_chunks_list = self._get_chunks(self.chunk_duration, audio_id, + audio_duration) + + for idx, chunk in enumerate(uniq_chunks_list): + s, e = chunk.split("_")[-2:] # Timestamps of start and end + start_sample = int(float(s) * sr) + end_sample = int(float(e) * sr) + new_wav_file = os.path.join(self.base_path, + audio_id + f'_chunk_{idx+1:02}.wav') + save_wav(waveform[start_sample:end_sample], sr, new_wav_file) + # id, duration, new_wav + ret.append([chunk, self.chunk_duration, new_wav_file]) + else: # Keep whole audio. 
+ ret.append([audio_id, audio_duration, wav_file]) + return ret + + def generate_csv(self, + wav_files: List[str], + output_file: str, + split_chunks: bool=True): + logger.info(f'Generating csv: {output_file}') + header = ["id", "duration", "wav"] + + infos = list( + tqdm( + map(self._get_audio_info, wav_files, [split_chunks] * len( + wav_files)), + total=len(wav_files))) + + csv_lines = [] + for info in infos: + csv_lines.extend(info) + + with open(output_file, mode="w") as csv_f: + csv_writer = csv.writer( + csv_f, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL) + csv_writer.writerow(header) + for line in csv_lines: + csv_writer.writerow(line) + + def prepare_data(self): + rir_list = os.path.join(self.wav_path, "real_rirs_isotropic_noises", + "rir_list") + rir_files = [] + with open(rir_list, 'r') as f: + for line in f.readlines(): + rir_file = line.strip().split(' ')[-1] + rir_files.append(os.path.join(self.base_path, rir_file)) + + noise_list = os.path.join(self.wav_path, "pointsource_noises", + "noise_list") + noise_files = [] + with open(noise_list, 'r') as f: + for line in f.readlines(): + noise_file = line.strip().split(' ')[-1] + noise_files.append(os.path.join(self.base_path, noise_file)) + + self.generate_csv(rir_files, os.path.join(self.csv_path, 'rir.csv')) + self.generate_csv(noise_files, os.path.join(self.csv_path, 'noise.csv')) + + def __getitem__(self, idx): + return self._convert_to_record(idx) + + def __len__(self): + return len(self._data) diff --git a/paddleaudio/datasets/voxceleb.py b/paddleaudio/datasets/voxceleb.py index 760db721..28f6dfc6 100644 --- a/paddleaudio/datasets/voxceleb.py +++ b/paddleaudio/datasets/voxceleb.py @@ -29,9 +29,12 @@ from paddleaudio.datasets.dataset import feat_funcs from paddleaudio.utils import DATA_HOME from paddleaudio.utils import decompress from paddleaudio.utils import download_and_decompress +from paddlespeech.s2t.utils.log import Log from utils.utility import download from utils.utility import unpack +logger = Log(__name__).getlog() + __all__ = ['VoxCeleb1'] @@ -121,9 +124,9 @@ class VoxCeleb1(Dataset): # Download audio files. # We need the users to decompress all vox1/dev/wav and vox1/test/wav/ to vox1/wav/ dir # so, we check the vox1/wav dir status - print("wav base path: {}".format(self.wav_path)) + logger.info(f"wav base path: {self.wav_path}") if not os.path.isdir(self.wav_path): - print("start to download the voxceleb1 dataset") + logger.info(f"start to download the voxceleb1 dataset") download_and_decompress( # multi-zip parts concatenate to vox1_dev_wav.zip self.archieves_audio_dev, self.base_path, @@ -135,7 +138,7 @@ class VoxCeleb1(Dataset): # Download all parts and concatenate the files into one zip file. 
dev_zipfile = os.path.join(self.base_path, 'vox1_dev_wav.zip') - print(f'Concatenating all parts to: {dev_zipfile}') + logger.info(f'Concatenating all parts to: {dev_zipfile}') os.system( f'cat {os.path.join(self.base_path, "vox1_dev_wav_parta*")} > {dev_zipfile}' ) @@ -154,6 +157,9 @@ class VoxCeleb1(Dataset): self.prepare_data() data = [] + logger.info( + f"read the {self.subset} from {os.path.join(self.csv_path, f'{self.subset}.csv')}" + ) with open(os.path.join(self.csv_path, f'{self.subset}.csv'), 'r') as rf: for line in rf.readlines()[1:]: audio_id, duration, wav, start, stop, spk_id = line.strip( @@ -246,7 +252,7 @@ class VoxCeleb1(Dataset): wav_files: List[str], output_file: str, split_chunks: bool=True): - print(f'Generating csv: {output_file}') + logger.info(f'Generating csv: {output_file}') header = ["id", "duration", "wav", "start", "stop", "spk_id"] with Pool(64) as p: @@ -269,7 +275,7 @@ class VoxCeleb1(Dataset): def prepare_data(self): # Audio of speakers in veri_test_file should not be included in training set. - print("start to prepare the data csv file") + logger.info("start to prepare the data csv file") enrol_files = set() test_files = set() # get the enroll and test audio file path @@ -299,7 +305,7 @@ class VoxCeleb1(Dataset): speakers.add(spk) audio_files.append(file) - print("start to generate the {}".format( + logger.info("start to generate the {}".format( os.path.join(self.meta_path, 'spk_id2label.txt'))) # encode the train and dev speakers label to spk_id2label.txt with open(os.path.join(self.meta_path, 'spk_id2label.txt'), 'w') as f: diff --git a/paddlespeech/vector/io/augment.py b/paddlespeech/vector/io/augment.py new file mode 100644 index 00000000..d6bbc8a9 --- /dev/null +++ b/paddlespeech/vector/io/augment.py @@ -0,0 +1,899 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import math +import os +from typing import List + +import numpy as np +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from paddleaudio.backends import load as load_audio +from paddleaudio.datasets.rirs_noises import OpenRIRNoise +from paddlespeech.s2t.utils.log import Log +from paddlespeech.vector.io.signal_processing import compute_amplitude +from paddlespeech.vector.io.signal_processing import convolve1d +from paddlespeech.vector.io.signal_processing import dB_to_amplitude +from paddlespeech.vector.io.signal_processing import notch_filter +from paddlespeech.vector.io.signal_processing import reverberate + +logger = Log(__name__).getlog() + + +# TODO: Complete type-hint and doc string. 
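+# A minimal usage sketch (illustrative only; names and shapes follow the
+# classes defined in this module):
+#
+#     dropper = DropFreq()                 # randomly notch out a frequency band
+#     chunker = DropChunk(drop_start=100)  # randomly zero out short time chunks
+#     waveforms = dropper(waveforms)           # waveforms: [batch, time]
+#     waveforms = chunker(waveforms, lengths)  # lengths: ratios in (0, 1]
+#
+# DropFreq and DropChunk keep the input shape, while Resample and
+# SpeedPerturb rescale the time axis by new_freq / orig_freq.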
+class DropFreq(nn.Layer): + def __init__( + self, + drop_freq_low=1e-14, + drop_freq_high=1, + drop_count_low=1, + drop_count_high=2, + drop_width=0.05, + drop_prob=1, ): + super(DropFreq, self).__init__() + self.drop_freq_low = drop_freq_low + self.drop_freq_high = drop_freq_high + self.drop_count_low = drop_count_low + self.drop_count_high = drop_count_high + self.drop_width = drop_width + self.drop_prob = drop_prob + + def forward(self, waveforms): + # Don't drop (return early) 1-`drop_prob` portion of the batches + dropped_waveform = waveforms.clone() + if paddle.rand([1]) > self.drop_prob: + return dropped_waveform + + # Add channels dimension + if len(waveforms.shape) == 2: + dropped_waveform = dropped_waveform.unsqueeze(-1) + + # Pick number of frequencies to drop + drop_count = paddle.randint( + low=self.drop_count_low, high=self.drop_count_high + 1, shape=[1]) + + # Pick a frequency to drop + drop_range = self.drop_freq_high - self.drop_freq_low + drop_frequency = ( + paddle.rand([drop_count]) * drop_range + self.drop_freq_low) + + # Filter parameters + filter_length = 101 + pad = filter_length // 2 + + # Start with delta function + drop_filter = paddle.zeros([1, filter_length, 1]) + drop_filter[0, pad, 0] = 1 + + # Subtract each frequency + for frequency in drop_frequency: + notch_kernel = notch_filter(frequency, filter_length, + self.drop_width) + drop_filter = convolve1d(drop_filter, notch_kernel, pad) + + # Apply filter + dropped_waveform = convolve1d(dropped_waveform, drop_filter, pad) + + # Remove channels dimension if added + return dropped_waveform.squeeze(-1) + + +class DropChunk(nn.Layer): + def __init__( + self, + drop_length_low=100, + drop_length_high=1000, + drop_count_low=1, + drop_count_high=10, + drop_start=0, + drop_end=None, + drop_prob=1, + noise_factor=0.0, ): + super(DropChunk, self).__init__() + self.drop_length_low = drop_length_low + self.drop_length_high = drop_length_high + self.drop_count_low = drop_count_low + self.drop_count_high = drop_count_high + self.drop_start = drop_start + self.drop_end = drop_end + self.drop_prob = drop_prob + self.noise_factor = noise_factor + + # Validate low < high + if drop_length_low > drop_length_high: + raise ValueError("Low limit must not be more than high limit") + if drop_count_low > drop_count_high: + raise ValueError("Low limit must not be more than high limit") + + # Make sure the length doesn't exceed end - start + if drop_end is not None and drop_end >= 0: + if drop_start > drop_end: + raise ValueError("Low limit must not be more than high limit") + + drop_range = drop_end - drop_start + self.drop_length_low = min(drop_length_low, drop_range) + self.drop_length_high = min(drop_length_high, drop_range) + + def forward(self, waveforms, lengths): + # Reading input list + lengths = (lengths * waveforms.shape[1]).astype('int64') + batch_size = waveforms.shape[0] + dropped_waveform = waveforms.clone() + + # Don't drop (return early) 1-`drop_prob` portion of the batches + if paddle.rand([1]) > self.drop_prob: + return dropped_waveform + + # Store original amplitude for computing white noise amplitude + clean_amplitude = compute_amplitude(waveforms, lengths.unsqueeze(1)) + + # Pick a number of times to drop + drop_times = paddle.randint( + low=self.drop_count_low, + high=self.drop_count_high + 1, + shape=[batch_size], ) + + # Iterate batch to set mask + for i in range(batch_size): + if drop_times[i] == 0: + continue + + # Pick lengths + length = paddle.randint( + low=self.drop_length_low, + high=self.drop_length_high + 
1, + shape=[drop_times[i]], ) + + # Compute range of starting locations + start_min = self.drop_start + if start_min < 0: + start_min += lengths[i] + start_max = self.drop_end + if start_max is None: + start_max = lengths[i] + if start_max < 0: + start_max += lengths[i] + start_max = max(0, start_max - length.max()) + + # Pick starting locations + start = paddle.randint( + low=start_min, + high=start_max + 1, + shape=[drop_times[i]], ) + + end = start + length + + # Update waveform + if not self.noise_factor: + for j in range(drop_times[i]): + dropped_waveform[i, start[j]:end[j]] = 0.0 + else: + # Uniform distribution of -2 to +2 * avg amplitude should + # preserve the average for normalization + noise_max = 2 * clean_amplitude[i] * self.noise_factor + for j in range(drop_times[i]): + # zero-center the noise distribution + noise_vec = paddle.rand([length[j]], dtype='float32') + + noise_vec = 2 * noise_max * noise_vec - noise_max + dropped_waveform[i, int(start[j]):int(end[j])] = noise_vec + + return dropped_waveform + + +class Resample(nn.Layer): + def __init__( + self, + orig_freq=16000, + new_freq=16000, + lowpass_filter_width=6, ): + super(Resample, self).__init__() + self.orig_freq = orig_freq + self.new_freq = new_freq + self.lowpass_filter_width = lowpass_filter_width + + # Compute rate for striding + self._compute_strides() + assert self.orig_freq % self.conv_stride == 0 + assert self.new_freq % self.conv_transpose_stride == 0 + + def _compute_strides(self): + # Compute new unit based on ratio of in/out frequencies + base_freq = math.gcd(self.orig_freq, self.new_freq) + input_samples_in_unit = self.orig_freq // base_freq + self.output_samples = self.new_freq // base_freq + + # Store the appropriate stride based on the new units + self.conv_stride = input_samples_in_unit + self.conv_transpose_stride = self.output_samples + + def forward(self, waveforms): + if not hasattr(self, "first_indices"): + self._indices_and_weights(waveforms) + + # Don't do anything if the frequencies are the same + if self.orig_freq == self.new_freq: + return waveforms + + unsqueezed = False + if len(waveforms.shape) == 2: + waveforms = waveforms.unsqueeze(1) + unsqueezed = True + elif len(waveforms.shape) == 3: + waveforms = waveforms.transpose([0, 2, 1]) + else: + raise ValueError("Input must be 2 or 3 dimensions") + + # Do resampling + resampled_waveform = self._perform_resample(waveforms) + + if unsqueezed: + resampled_waveform = resampled_waveform.squeeze(1) + else: + resampled_waveform = resampled_waveform.transpose([0, 2, 1]) + + return resampled_waveform + + def _perform_resample(self, waveforms): + # Compute output size and initialize + batch_size, num_channels, wave_len = waveforms.shape + window_size = self.weights.shape[1] + tot_output_samp = self._output_samples(wave_len) + resampled_waveform = paddle.zeros((batch_size, num_channels, + tot_output_samp)) + + # eye size: (num_channels, num_channels, 1) + eye = paddle.eye(num_channels).unsqueeze(2) + + # Iterate over the phases in the polyphase filter + for i in range(self.first_indices.shape[0]): + wave_to_conv = waveforms + first_index = int(self.first_indices[i].item()) + if first_index >= 0: + # trim the signal as the filter will not be applied + # before the first_index + wave_to_conv = wave_to_conv[:, :, first_index:] + + # pad the right of the signal to allow partial convolutions + # meaning compute values for partial windows (e.g. 
end of the + # window is outside the signal length) + max_index = (tot_output_samp - 1) // self.output_samples + end_index = max_index * self.conv_stride + window_size + current_wave_len = wave_len - first_index + right_padding = max(0, end_index + 1 - current_wave_len) + left_padding = max(0, -first_index) + wave_to_conv = paddle.nn.functional.pad( + wave_to_conv, [left_padding, right_padding], data_format='NCL') + conv_wave = paddle.nn.functional.conv1d( + x=wave_to_conv, + # weight=self.weights[i].repeat(num_channels, 1, 1), + weight=self.weights[i].expand((num_channels, 1, -1)), + stride=self.conv_stride, + groups=num_channels, ) + + # we want conv_wave[:, i] to be at + # output[:, i + n*conv_transpose_stride] + dilated_conv_wave = paddle.nn.functional.conv1d_transpose( + conv_wave, eye, stride=self.conv_transpose_stride) + + # pad dilated_conv_wave so it reaches the output length if needed. + left_padding = i + previous_padding = left_padding + dilated_conv_wave.shape[-1] + right_padding = max(0, tot_output_samp - previous_padding) + dilated_conv_wave = paddle.nn.functional.pad( + dilated_conv_wave, [left_padding, right_padding], + data_format='NCL') + dilated_conv_wave = dilated_conv_wave[:, :, :tot_output_samp] + + resampled_waveform += dilated_conv_wave + + return resampled_waveform + + def _output_samples(self, input_num_samp): + samp_in = int(self.orig_freq) + samp_out = int(self.new_freq) + + tick_freq = abs(samp_in * samp_out) // math.gcd(samp_in, samp_out) + ticks_per_input_period = tick_freq // samp_in + + # work out the number of ticks in the time interval + # [ 0, input_num_samp/samp_in ). + interval_length = input_num_samp * ticks_per_input_period + if interval_length <= 0: + return 0 + ticks_per_output_period = tick_freq // samp_out + + # Get the last output-sample in the closed interval, + # i.e. replacing [ ) with [ ]. Note: integer division rounds down. + # See http://en.wikipedia.org/wiki/Interval_(mathematics) for an + # explanation of the notation. + last_output_samp = interval_length // ticks_per_output_period + + # We need the last output-sample in the open interval, so if it + # takes us to the end of the interval exactly, subtract one. + if last_output_samp * ticks_per_output_period == interval_length: + last_output_samp -= 1 + + # First output-sample index is zero, so the number of output samples + # is the last output-sample plus one. 
+ num_output_samp = last_output_samp + 1 + + return num_output_samp + + def _indices_and_weights(self, waveforms): + # Lowpass filter frequency depends on smaller of two frequencies + min_freq = min(self.orig_freq, self.new_freq) + lowpass_cutoff = 0.99 * 0.5 * min_freq + + assert lowpass_cutoff * 2 <= min_freq + window_width = self.lowpass_filter_width / (2.0 * lowpass_cutoff) + + assert lowpass_cutoff < min(self.orig_freq, self.new_freq) / 2 + output_t = paddle.arange(start=0.0, end=self.output_samples) + output_t /= self.new_freq + min_t = output_t - window_width + max_t = output_t + window_width + + min_input_index = paddle.ceil(min_t * self.orig_freq) + max_input_index = paddle.floor(max_t * self.orig_freq) + num_indices = max_input_index - min_input_index + 1 + + max_weight_width = num_indices.max() + j = paddle.arange(max_weight_width, dtype='float32') + input_index = min_input_index.unsqueeze(1) + j.unsqueeze(0) + delta_t = (input_index / self.orig_freq) - output_t.unsqueeze(1) + + weights = paddle.zeros_like(delta_t) + inside_window_indices = delta_t.abs().less_than( + paddle.to_tensor(window_width)) + + # raised-cosine (Hanning) window with width `window_width` + weights[inside_window_indices] = 0.5 * (1 + paddle.cos( + 2 * math.pi * lowpass_cutoff / self.lowpass_filter_width * + delta_t.masked_select(inside_window_indices))) + + t_eq_zero_indices = delta_t.equal(paddle.zeros_like(delta_t)) + t_not_eq_zero_indices = delta_t.not_equal(paddle.zeros_like(delta_t)) + + # sinc filter function + weights = paddle.where( + t_not_eq_zero_indices, + weights * paddle.sin(2 * math.pi * lowpass_cutoff * delta_t) / + (math.pi * delta_t), weights) + + # limit of the function at t = 0 + weights = paddle.where(t_eq_zero_indices, weights * 2 * lowpass_cutoff, + weights) + + # size (output_samples, max_weight_width) + weights /= self.orig_freq + + self.first_indices = min_input_index + self.weights = weights + + +class SpeedPerturb(nn.Layer): + def __init__( + self, + orig_freq, + speeds=[90, 100, 110], + perturb_prob=1.0, ): + super(SpeedPerturb, self).__init__() + self.orig_freq = orig_freq + self.speeds = speeds + self.perturb_prob = perturb_prob + + # Initialize index of perturbation + self.samp_index = 0 + + # Initialize resamplers + self.resamplers = [] + for speed in self.speeds: + config = { + "orig_freq": self.orig_freq, + "new_freq": self.orig_freq * speed // 100, + } + self.resamplers.append(Resample(**config)) + + def forward(self, waveform): + # Don't perturb (return early) 1-`perturb_prob` portion of the batches + if paddle.rand([1]) > self.perturb_prob: + return waveform.clone() + + # Perform a random perturbation + self.samp_index = paddle.randint(len(self.speeds), shape=[1]).item() + perturbed_waveform = self.resamplers[self.samp_index](waveform) + + return perturbed_waveform + + +class AddNoise(nn.Layer): + def __init__( + self, + noise_dataset=None, # None for white noise + num_workers=0, + snr_low=0, + snr_high=0, + mix_prob=1.0, + start_index=None, + normalize=False, ): + super(AddNoise, self).__init__() + + self.num_workers = num_workers + self.snr_low = snr_low + self.snr_high = snr_high + self.mix_prob = mix_prob + self.start_index = start_index + self.normalize = normalize + self.noise_dataset = noise_dataset + self.noise_dataloader = None + + def forward(self, waveforms, lengths=None): + if lengths is None: + lengths = paddle.ones([len(waveforms)]) + + # Copy clean waveform to initialize noisy waveform + noisy_waveform = waveforms.clone() + lengths = (lengths * 
waveforms.shape[1]).astype('int64').unsqueeze(1) + + # Don't add noise (return early) 1-`mix_prob` portion of the batches + if paddle.rand([1]) > self.mix_prob: + return noisy_waveform + + # Compute the average amplitude of the clean waveforms + clean_amplitude = compute_amplitude(waveforms, lengths) + + # Pick an SNR and use it to compute the mixture amplitude factors + SNR = paddle.rand((len(waveforms), 1)) + SNR = SNR * (self.snr_high - self.snr_low) + self.snr_low + noise_amplitude_factor = 1 / (dB_to_amplitude(SNR) + 1) + new_noise_amplitude = noise_amplitude_factor * clean_amplitude + + # Scale clean signal appropriately + noisy_waveform *= 1 - noise_amplitude_factor + + # Loop through clean samples and create mixture + if self.noise_dataset is None: + white_noise = paddle.normal(shape=waveforms.shape) + noisy_waveform += new_noise_amplitude * white_noise + else: + tensor_length = waveforms.shape[1] + noise_waveform, noise_length = self._load_noise( + lengths, + tensor_length, ) + + # Rescale and add + noise_amplitude = compute_amplitude(noise_waveform, noise_length) + noise_waveform *= new_noise_amplitude / (noise_amplitude + 1e-14) + noisy_waveform += noise_waveform + + # Normalizing to prevent clipping + if self.normalize: + abs_max, _ = paddle.max( + paddle.abs(noisy_waveform), axis=1, keepdim=True) + noisy_waveform = noisy_waveform / abs_max.clip(min=1.0) + + return noisy_waveform + + def _load_noise(self, lengths, max_length): + """ + Load a batch of noises + + args + lengths(Paddle.Tensor): Num samples of waveforms with shape (N, 1). + max_length(int): Width of a batch. + """ + lengths = lengths.squeeze(1) + batch_size = len(lengths) + + # Load a noise batch + if self.noise_dataloader is None: + + def noise_collate_fn(batch): + def pad(x, target_length, mode='constant', **kwargs): + x = np.asarray(x) + w = target_length - x.shape[0] + assert w >= 0, f'Target length {target_length} is less than origin length {x.shape[0]}' + return np.pad(x, [0, w], mode=mode, **kwargs) + + ids = [item['id'] for item in batch] + lengths = np.asarray([item['feat'].shape[0] for item in batch]) + waveforms = list( + map(lambda x: pad(x, max(max_length, lengths.max().item())), + [item['feat'] for item in batch])) + waveforms = np.stack(waveforms) + return {'ids': ids, 'feats': waveforms, 'lengths': lengths} + + # Create noise data loader. 
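+            # Note: the loader is built lazily on the first call so that its
+            # batch size can match the incoming clean batch, and
+            # noise_collate_fn above zero-pads every noise clip to at least
+            # `max_length` samples before mixing.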
+ self.noise_dataloader = paddle.io.DataLoader( + self.noise_dataset, + batch_size=batch_size, + shuffle=True, + num_workers=self.num_workers, + collate_fn=noise_collate_fn, + return_list=True, ) + self.noise_data = iter(self.noise_dataloader) + + noise_batch, noise_len = self._load_noise_batch_of_size(batch_size) + + # Select a random starting location in the waveform + start_index = self.start_index + if self.start_index is None: + start_index = 0 + max_chop = (noise_len - lengths).min().clip(min=1) + start_index = paddle.randint(high=max_chop, shape=[1]) + + # Truncate noise_batch to max_length + noise_batch = noise_batch[:, start_index:start_index + max_length] + noise_len = (noise_len - start_index).clip(max=max_length).unsqueeze(1) + return noise_batch, noise_len + + def _load_noise_batch_of_size(self, batch_size): + """Concatenate noise batches, then chop to correct size""" + noise_batch, noise_lens = self._load_noise_batch() + + # Expand + while len(noise_batch) < batch_size: + noise_batch = paddle.concat((noise_batch, noise_batch)) + noise_lens = paddle.concat((noise_lens, noise_lens)) + + # Contract + if len(noise_batch) > batch_size: + noise_batch = noise_batch[:batch_size] + noise_lens = noise_lens[:batch_size] + + return noise_batch, noise_lens + + def _load_noise_batch(self): + """Load a batch of noises, restarting iteration if necessary.""" + try: + batch = next(self.noise_data) + except StopIteration: + self.noise_data = iter(self.noise_dataloader) + batch = next(self.noise_data) + + noises, lens = batch['feats'], batch['lengths'] + return noises, lens + + +class AddReverb(nn.Layer): + def __init__( + self, + rir_dataset, + reverb_prob=1.0, + rir_scale_factor=1.0, + num_workers=0, ): + super(AddReverb, self).__init__() + self.rir_dataset = rir_dataset + self.reverb_prob = reverb_prob + self.rir_scale_factor = rir_scale_factor + + # Create rir data loader. + def rir_collate_fn(batch): + def pad(x, target_length, mode='constant', **kwargs): + x = np.asarray(x) + w = target_length - x.shape[0] + assert w >= 0, f'Target length {target_length} is less than origin length {x.shape[0]}' + return np.pad(x, [0, w], mode=mode, **kwargs) + + ids = [item['id'] for item in batch] + lengths = np.asarray([item['feat'].shape[0] for item in batch]) + waveforms = list( + map(lambda x: pad(x, lengths.max().item()), + [item['feat'] for item in batch])) + waveforms = np.stack(waveforms) + return {'ids': ids, 'feats': waveforms, 'lengths': lengths} + + self.rir_dataloader = paddle.io.DataLoader( + self.rir_dataset, + collate_fn=rir_collate_fn, + num_workers=num_workers, + shuffle=True, + return_list=True, ) + + self.rir_data = iter(self.rir_dataloader) + + def forward(self, waveforms, lengths=None): + """ + Arguments + --------- + waveforms : tensor + Shape should be `[batch, time]` or `[batch, time, channels]`. + lengths : tensor + Shape should be a single dimension, `[batch]`. + + Returns + ------- + Tensor of shape `[batch, time]` or `[batch, time, channels]`. 
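+
+        Example
+        -------
+        An illustrative sketch (assumes an `rir_dataset` that follows this
+        module's dataset conventions)::
+
+            add_reverb = AddReverb(rir_dataset)
+            wet = add_reverb(paddle.randn([4, 16000]))  # [batch, time]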
+ """ + + if lengths is None: + lengths = paddle.ones([len(waveforms)]) + + # Don't add reverb (return early) 1-`reverb_prob` portion of the time + if paddle.rand([1]) > self.reverb_prob: + return waveforms.clone() + + # Add channels dimension if necessary + channel_added = False + if len(waveforms.shape) == 2: + waveforms = waveforms.unsqueeze(-1) + channel_added = True + + # Load and prepare RIR + rir_waveform = self._load_rir() + + # Compress or dilate RIR + if self.rir_scale_factor != 1: + rir_waveform = F.interpolate( + rir_waveform.transpose([0, 2, 1]), + scale_factor=self.rir_scale_factor, + mode="linear", + align_corners=False, + data_format='NCW', ) + # (N, C, L) -> (N, L, C) + rir_waveform = rir_waveform.transpose([0, 2, 1]) + + rev_waveform = reverberate( + waveforms, + rir_waveform, + self.rir_dataset.sample_rate, + rescale_amp="avg") + + # Remove channels dimension if added + if channel_added: + return rev_waveform.squeeze(-1) + + return rev_waveform + + def _load_rir(self): + try: + batch = next(self.rir_data) + except StopIteration: + self.rir_data = iter(self.rir_dataloader) + batch = next(self.rir_data) + + rir_waveform = batch['feats'] + + # Make sure RIR has correct channels + if len(rir_waveform.shape) == 2: + rir_waveform = rir_waveform.unsqueeze(-1) + + return rir_waveform + + +class AddBabble(nn.Layer): + def __init__( + self, + speaker_count=3, + snr_low=0, + snr_high=0, + mix_prob=1, ): + super(AddBabble, self).__init__() + self.speaker_count = speaker_count + self.snr_low = snr_low + self.snr_high = snr_high + self.mix_prob = mix_prob + + def forward(self, waveforms, lengths=None): + if lengths is None: + lengths = paddle.ones([len(waveforms)]) + + babbled_waveform = waveforms.clone() + lengths = (lengths * waveforms.shape[1]).unsqueeze(1) + batch_size = len(waveforms) + + # Don't mix (return early) 1-`mix_prob` portion of the batches + if paddle.rand([1]) > self.mix_prob: + return babbled_waveform + + # Pick an SNR and use it to compute the mixture amplitude factors + clean_amplitude = compute_amplitude(waveforms, lengths) + SNR = paddle.rand((batch_size, 1)) + SNR = SNR * (self.snr_high - self.snr_low) + self.snr_low + noise_amplitude_factor = 1 / (dB_to_amplitude(SNR) + 1) + new_noise_amplitude = noise_amplitude_factor * clean_amplitude + + # Scale clean signal appropriately + babbled_waveform *= 1 - noise_amplitude_factor + + # For each speaker in the mixture, roll and add + babble_waveform = waveforms.roll((1, ), axis=0) + babble_len = lengths.roll((1, ), axis=0) + for i in range(1, self.speaker_count): + babble_waveform += waveforms.roll((1 + i, ), axis=0) + babble_len = paddle.concat( + [babble_len, babble_len.roll((1, ), axis=0)], axis=-1).max( + axis=-1, keepdim=True) + + # Rescale and add to mixture + babble_amplitude = compute_amplitude(babble_waveform, babble_len) + babble_waveform *= new_noise_amplitude / (babble_amplitude + 1e-14) + babbled_waveform += babble_waveform + + return babbled_waveform + + +class TimeDomainSpecAugment(nn.Layer): + def __init__( + self, + perturb_prob=1.0, + drop_freq_prob=1.0, + drop_chunk_prob=1.0, + speeds=[95, 100, 105], + sample_rate=16000, + drop_freq_count_low=0, + drop_freq_count_high=3, + drop_chunk_count_low=0, + drop_chunk_count_high=5, + drop_chunk_length_low=1000, + drop_chunk_length_high=2000, + drop_chunk_noise_factor=0, ): + super(TimeDomainSpecAugment, self).__init__() + self.speed_perturb = SpeedPerturb( + perturb_prob=perturb_prob, + orig_freq=sample_rate, + speeds=speeds, ) + self.drop_freq = DropFreq( + 
drop_prob=drop_freq_prob, + drop_count_low=drop_freq_count_low, + drop_count_high=drop_freq_count_high, ) + self.drop_chunk = DropChunk( + drop_prob=drop_chunk_prob, + drop_count_low=drop_chunk_count_low, + drop_count_high=drop_chunk_count_high, + drop_length_low=drop_chunk_length_low, + drop_length_high=drop_chunk_length_high, + noise_factor=drop_chunk_noise_factor, ) + + def forward(self, waveforms, lengths=None): + if lengths is None: + lengths = paddle.ones([len(waveforms)]) + + with paddle.no_grad(): + # Augmentation + waveforms = self.speed_perturb(waveforms) + waveforms = self.drop_freq(waveforms) + waveforms = self.drop_chunk(waveforms, lengths) + + return waveforms + + +class EnvCorrupt(nn.Layer): + def __init__( + self, + reverb_prob=1.0, + babble_prob=1.0, + noise_prob=1.0, + rir_dataset=None, + noise_dataset=None, + num_workers=0, + babble_speaker_count=0, + babble_snr_low=0, + babble_snr_high=0, + noise_snr_low=0, + noise_snr_high=0, + rir_scale_factor=1.0, ): + super(EnvCorrupt, self).__init__() + + # Initialize corrupters + if rir_dataset is not None and reverb_prob > 0.0: + self.add_reverb = AddReverb( + rir_dataset=rir_dataset, + num_workers=num_workers, + reverb_prob=reverb_prob, + rir_scale_factor=rir_scale_factor, ) + + if babble_speaker_count > 0 and babble_prob > 0.0: + self.add_babble = AddBabble( + speaker_count=babble_speaker_count, + snr_low=babble_snr_low, + snr_high=babble_snr_high, + mix_prob=babble_prob, ) + + if noise_dataset is not None and noise_prob > 0.0: + self.add_noise = AddNoise( + noise_dataset=noise_dataset, + num_workers=num_workers, + snr_low=noise_snr_low, + snr_high=noise_snr_high, + mix_prob=noise_prob, ) + + def forward(self, waveforms, lengths=None): + if lengths is None: + lengths = paddle.ones([len(waveforms)]) + + # Augmentation + with paddle.no_grad(): + if hasattr(self, "add_reverb"): + try: + waveforms = self.add_reverb(waveforms, lengths) + except Exception: + pass + if hasattr(self, "add_babble"): + waveforms = self.add_babble(waveforms, lengths) + if hasattr(self, "add_noise"): + waveforms = self.add_noise(waveforms, lengths) + + return waveforms + + +def build_augment_pipeline(target_dir=None) -> List[paddle.nn.Layer]: + """build augment pipeline + Note: this pipeline cannot be used in the paddle.DataLoader + + Returns: + List[paddle.nn.Layer]: all augment process + """ + logger.info("start to build the augment pipeline") + noise_dataset = OpenRIRNoise('noise', target_dir=target_dir) + rir_dataset = OpenRIRNoise('rir') + + wavedrop = TimeDomainSpecAugment( + sample_rate=16000, + speeds=[100], ) + speed_perturb = TimeDomainSpecAugment( + sample_rate=16000, + speeds=[95, 100, 105], ) + add_noise = EnvCorrupt( + noise_dataset=noise_dataset, + reverb_prob=0.0, + noise_prob=1.0, + noise_snr_low=0, + noise_snr_high=15, + rir_scale_factor=1.0, ) + add_rev = EnvCorrupt( + rir_dataset=rir_dataset, + reverb_prob=1.0, + noise_prob=0.0, + rir_scale_factor=1.0, ) + add_rev_noise = EnvCorrupt( + noise_dataset=noise_dataset, + rir_dataset=rir_dataset, + reverb_prob=1.0, + noise_prob=1.0, + noise_snr_low=0, + noise_snr_high=15, + rir_scale_factor=1.0, ) + + return [wavedrop, speed_perturb, add_noise, add_rev, add_rev_noise] + + +def waveform_augment(waveforms: paddle.Tensor, + augment_pipeline: List[paddle.nn.Layer]) -> paddle.Tensor: + """process the augment pipeline and return all the waveforms + + Args: + waveforms (paddle.Tensor): _description_ + augment_pipeline (List[paddle.nn.Layer]): _description_ + + Returns: + paddle.Tensor: _description_ 
+ """ + waveforms_aug_list = [waveforms] + for aug in augment_pipeline: + waveforms_aug = aug(waveforms) # (N, L) + if waveforms_aug.shape[1] >= waveforms.shape[1]: + # Trunc + waveforms_aug = waveforms_aug[:, :waveforms.shape[1]] + else: + # Pad + lengths_to_pad = waveforms.shape[1] - waveforms_aug.shape[1] + waveforms_aug = F.pad( + waveforms_aug.unsqueeze(-1), [0, lengths_to_pad], + data_format='NLC').squeeze(-1) + waveforms_aug_list.append(waveforms_aug) + + return paddle.concat(waveforms_aug_list, axis=0) diff --git a/paddlespeech/vector/io/signal_processing.py b/paddlespeech/vector/io/signal_processing.py new file mode 100644 index 00000000..a61bf554 --- /dev/null +++ b/paddlespeech/vector/io/signal_processing.py @@ -0,0 +1,219 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import math + +import numpy as np +import paddle + +# TODO: Complete type-hint and doc string. + + +def blackman_window(win_len, dtype=np.float32): + arcs = np.pi * np.arange(win_len) / float(win_len) + win = np.asarray( + [0.42 - 0.5 * np.cos(2 * arc) + 0.08 * np.cos(4 * arc) for arc in arcs], + dtype=dtype) + return paddle.to_tensor(win) + + +def compute_amplitude(waveforms, lengths=None, amp_type="avg", scale="linear"): + if len(waveforms.shape) == 1: + waveforms = waveforms.unsqueeze(0) + + assert amp_type in ["avg", "peak"] + assert scale in ["linear", "dB"] + + if amp_type == "avg": + if lengths is None: + out = paddle.mean(paddle.abs(waveforms), axis=1, keepdim=True) + else: + wav_sum = paddle.sum(paddle.abs(waveforms), axis=1, keepdim=True) + out = wav_sum / lengths + elif amp_type == "peak": + out = paddle.max(paddle.abs(waveforms), axis=1, keepdim=True) + else: + raise NotImplementedError + + if scale == "linear": + return out + elif scale == "dB": + return paddle.clip(20 * paddle.log10(out), min=-80) + else: + raise NotImplementedError + + +def dB_to_amplitude(SNR): + return 10**(SNR / 20) + + +def convolve1d( + waveform, + kernel, + padding=0, + pad_type="constant", + stride=1, + groups=1, ): + if len(waveform.shape) != 3: + raise ValueError("Convolve1D expects a 3-dimensional tensor") + + # Padding can be a tuple (left_pad, right_pad) or an int + if isinstance(padding, list): + waveform = paddle.nn.functional.pad( + x=waveform, + pad=padding, + mode=pad_type, + data_format='NLC', ) + + # Move time dimension last, which pad and fft and conv expect. + # (N, L, C) -> (N, C, L) + waveform = waveform.transpose([0, 2, 1]) + kernel = kernel.transpose([0, 2, 1]) + + convolved = paddle.nn.functional.conv1d( + x=waveform, + weight=kernel, + stride=stride, + groups=groups, + padding=padding if not isinstance(padding, list) else 0, ) + + # Return time dimension to the second dimension. 
+    return convolved.transpose([0, 2, 1])
+
+
+def notch_filter(notch_freq, filter_width=101, notch_width=0.05):
+    # Check inputs
+    assert 0 < notch_freq <= 1
+    assert filter_width % 2 != 0
+    pad = filter_width // 2
+    inputs = paddle.arange(filter_width, dtype='float32') - pad
+
+    # Avoid frequencies that are too low
+    notch_freq += notch_width
+
+    # Define sinc function, avoiding division by zero
+    def sinc(x):
+        def _sinc(x):
+            return paddle.sin(x) / x
+
+        # The zero is at the middle index
+        res = paddle.concat(
+            [_sinc(x[:pad]), paddle.ones([1]), _sinc(x[pad + 1:])])
+        return res
+
+    # Compute a low-pass filter with cutoff frequency notch_freq.
+    hlpf = sinc(3 * (notch_freq - notch_width) * inputs)
+    hlpf *= blackman_window(filter_width)
+    hlpf /= paddle.sum(hlpf)
+
+    # Compute a high-pass filter with cutoff frequency notch_freq.
+    hhpf = sinc(3 * (notch_freq + notch_width) * inputs)
+    hhpf *= blackman_window(filter_width)
+    hhpf /= -paddle.sum(hhpf)
+    hhpf[pad] += 1
+
+    # Adding filters creates notch filter
+    return (hlpf + hhpf).reshape([1, -1, 1])
+
+
+def reverberate(waveforms,
+                rir_waveform,
+                sample_rate,
+                impulse_duration=0.3,
+                rescale_amp="avg"):
+    orig_shape = waveforms.shape
+
+    if len(waveforms.shape) > 3 or len(rir_waveform.shape) > 3:
+        raise NotImplementedError
+
+    # if inputs are mono tensors we reshape to 1, samples
+    if len(waveforms.shape) == 1:
+        waveforms = waveforms.unsqueeze(0).unsqueeze(-1)
+    elif len(waveforms.shape) == 2:
+        waveforms = waveforms.unsqueeze(-1)
+
+    if len(rir_waveform.shape) == 1:  # convolve1d expects a 3d tensor !
+        rir_waveform = rir_waveform.unsqueeze(0).unsqueeze(-1)
+    elif len(rir_waveform.shape) == 2:
+        rir_waveform = rir_waveform.unsqueeze(-1)
+
+    # Compute the average amplitude of the clean
+    orig_amplitude = compute_amplitude(waveforms, waveforms.shape[1],
+                                       rescale_amp)
+
+    # Compute index of the direct signal, so we can preserve alignment
+    impulse_index_start = rir_waveform.abs().argmax(axis=1).item()
+    impulse_index_end = min(
+        impulse_index_start + int(sample_rate * impulse_duration),
+        rir_waveform.shape[1])
+    rir_waveform = rir_waveform[:, impulse_index_start:impulse_index_end, :]
+    rir_waveform = rir_waveform / paddle.norm(rir_waveform, p=2)
+    rir_waveform = paddle.flip(rir_waveform, [1])
+
+    waveforms = convolve1d(
+        waveform=waveforms,
+        kernel=rir_waveform,
+        padding=[rir_waveform.shape[1] - 1, 0], )
+
+    # Rescale to the peak amplitude of the clean waveform
+    waveforms = rescale(waveforms, waveforms.shape[1], orig_amplitude,
+                        rescale_amp)
+
+    if len(orig_shape) == 1:
+        waveforms = waveforms.squeeze(0).squeeze(-1)
+    if len(orig_shape) == 2:
+        waveforms = waveforms.squeeze(-1)
+
+    return waveforms
+
+
+def rescale(waveforms, lengths, target_lvl, amp_type="avg", scale="linear"):
+    assert amp_type in ["peak", "avg"]
+    assert scale in ["linear", "dB"]
+
+    batch_added = False
+    if len(waveforms.shape) == 1:
+        batch_added = True
+        waveforms = waveforms.unsqueeze(0)
+
+    waveforms = normalize(waveforms, lengths, amp_type)
+
+    if scale == "linear":
+        out = target_lvl * waveforms
+    elif scale == "dB":
+        out = dB_to_amplitude(target_lvl) * waveforms
+    else:
+        raise NotImplementedError("Invalid scale, choose between dB and linear")
+
+    if batch_added:
+        out = out.squeeze(0)
+
+    return out
+
+
+def normalize(waveforms, lengths=None, amp_type="avg", 
eps=1e-14): + assert amp_type in ["avg", "peak"] + + batch_added = False + if len(waveforms.shape) == 1: + batch_added = True + waveforms = waveforms.unsqueeze(0) + + den = compute_amplitude(waveforms, lengths, amp_type) + eps + if batch_added: + waveforms = waveforms.squeeze(0) + return waveforms / den diff --git a/paddlespeech/vector/models/ecapa_tdnn.py b/paddlespeech/vector/models/ecapa_tdnn.py index e493b800..4c960e11 100644 --- a/paddlespeech/vector/models/ecapa_tdnn.py +++ b/paddlespeech/vector/models/ecapa_tdnn.py @@ -19,6 +19,16 @@ import paddle.nn.functional as F def length_to_mask(length, max_len=None, dtype=None): + """_summary_ + + Args: + length (_type_): _description_ + max_len (_type_, optional): _description_. Defaults to None. + dtype (_type_, optional): _description_. Defaults to None. + + Returns: + _type_: _description_ + """ assert len(length.shape) == 1 if max_len is None: @@ -47,6 +57,19 @@ class Conv1d(nn.Layer): groups=1, bias=True, padding_mode="reflect", ): + """_summary_ + + Args: + in_channels (_type_): _description_ + out_channels (_type_): _description_ + kernel_size (_type_): _description_ + stride (int, optional): _description_. Defaults to 1. + padding (str, optional): _description_. Defaults to "same". + dilation (int, optional): _description_. Defaults to 1. + groups (int, optional): _description_. Defaults to 1. + bias (bool, optional): _description_. Defaults to True. + padding_mode (str, optional): _description_. Defaults to "reflect". + """ super().__init__() self.kernel_size = kernel_size @@ -66,6 +89,17 @@ class Conv1d(nn.Layer): bias_attr=bias, ) def forward(self, x): + """_summary_ + + Args: + x (_type_): _description_ + + Raises: + ValueError: _description_ + + Returns: + _type_: _description_ + """ if self.padding == "same": x = self._manage_padding(x, self.kernel_size, self.dilation, self.stride) @@ -75,6 +109,17 @@ class Conv1d(nn.Layer): return self.conv(x) def _manage_padding(self, x, kernel_size: int, dilation: int, stride: int): + """_summary_ + + Args: + x (_type_): _description_ + kernel_size (int): _description_ + dilation (int): _description_ + stride (int): _description_ + + Returns: + _type_: _description_ + """ L_in = x.shape[-1] # Detecting input shape padding = self._get_padding_elem(L_in, stride, kernel_size, dilation) # Time padding @@ -88,6 +133,17 @@ class Conv1d(nn.Layer): stride: int, kernel_size: int, dilation: int): + """_summary_ + + Args: + L_in (int): _description_ + stride (int): _description_ + kernel_size (int): _description_ + dilation (int): _description_ + + Returns: + _type_: _description_ + """ if stride > 1: n_steps = math.ceil(((L_in - kernel_size * dilation) / stride) + 1) L_out = stride * (n_steps - 1) + kernel_size * dilation @@ -134,6 +190,15 @@ class TDNNBlock(nn.Layer): kernel_size, dilation, activation=nn.ReLU, ): + """Implementation of TDNN network + + Args: + in_channels (int): input channels or input embedding dimensions + out_channels (int): output channels or output embedding dimensions + kernel_size (int): the kernel size of the TDNN network block + dilation (int): the dilation of the TDNN network block + activation (paddle class, optional): the activation layers. Defaults to nn.ReLU. 
+ """ super().__init__() self.conv = Conv1d( in_channels=in_channels, @@ -149,6 +214,15 @@ class TDNNBlock(nn.Layer): class Res2NetBlock(nn.Layer): def __init__(self, in_channels, out_channels, scale=8, dilation=1): + """Implementation of Res2Net Block with dilation + The paper is refered as "Res2Net: A New Multi-scale Backbone Architecture", + whose url is https://arxiv.org/abs/1904.01169 + Args: + in_channels (int): input channels or input dimensions + out_channels (int): output channels or output dimensions + scale (int, optional): _description_. Defaults to 8. + dilation (int, optional): _description_. Defaults to 1. + """ super().__init__() assert in_channels % scale == 0 assert out_channels % scale == 0 @@ -179,6 +253,14 @@ class Res2NetBlock(nn.Layer): class SEBlock(nn.Layer): def __init__(self, in_channels, se_channels, out_channels): + """Implementation of SEBlock + The paper is refered as "Squeeze-and-Excitation Networks" + whose url is https://arxiv.org/abs/1709.01507 + Args: + in_channels (int): input channels or input data dimensions + se_channels (_type_): _description_ + out_channels (int): output channels or output data dimensions + """ super().__init__() self.conv1 = Conv1d( @@ -275,6 +357,17 @@ class SERes2NetBlock(nn.Layer): kernel_size=1, dilation=1, activation=nn.ReLU, ): + """Implementation of Squeeze-Extraction Res2Blocks in ECAPA-TDNN network model + + Args: + in_channels (int): input channels or input data dimensions + out_channels (_type_): _description_ + res2net_scale (int, optional): _description_. Defaults to 8. + se_channels (int, optional): _description_. Defaults to 128. + kernel_size (int, optional): _description_. Defaults to 1. + dilation (int, optional): _description_. Defaults to 1. + activation (_type_, optional): _description_. Defaults to nn.ReLU. + """ super().__init__() self.out_channels = out_channels self.tdnn1 = TDNNBlock( diff --git a/paddlespeech/vector/training/seeding.py b/paddlespeech/vector/training/seeding.py new file mode 100644 index 00000000..0778a27d --- /dev/null +++ b/paddlespeech/vector/training/seeding.py @@ -0,0 +1,28 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import random
+
+import numpy as np
+import paddle
+
+from paddlespeech.s2t.utils.log import Log
+
+logger = Log(__name__).getlog()
+
+
+def seed_everything(seed: int):
+    """Seed paddle, random and np.random to help reproducibility."""
+    paddle.seed(seed)
+    random.seed(seed)
+    np.random.seed(seed)
+    logger.info(f"Set the seed of paddle, random, np.random to {seed}.")
From 7db7eb8993a1dcc986de859f4b0f0f9cbf6b589f Mon Sep 17 00:00:00 2001
From: xiongxinlei
Date: Mon, 7 Mar 2022 22:58:15 +0800
Subject: [PATCH 19/41] add extract audio embedding api, test=doc

---
 .../sv0/local/extract_speaker_embedding.py    | 115 ++++++++++++++++++
 1 file changed, 115 insertions(+)
 create mode 100644 examples/voxceleb/sv0/local/extract_speaker_embedding.py

diff --git a/examples/voxceleb/sv0/local/extract_speaker_embedding.py b/examples/voxceleb/sv0/local/extract_speaker_embedding.py
new file mode 100644
index 00000000..8eb24e1d
--- /dev/null
+++ b/examples/voxceleb/sv0/local/extract_speaker_embedding.py
@@ -0,0 +1,115 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+import ast
+import os
+
+import numpy as np
+import paddle
+import paddle.nn.functional as F
+from paddle.io import BatchSampler
+from paddle.io import DataLoader
+from tqdm import tqdm
+
+from paddleaudio.datasets.voxceleb import VoxCeleb1
+from paddleaudio.features.core import melspectrogram
+from paddleaudio.backends import load as load_audio
+from paddlespeech.vector.io.batch import feature_normalize
+from paddlespeech.s2t.utils.log import Log
+from paddlespeech.vector.models.ecapa_tdnn import EcapaTdnn
+from paddlespeech.vector.modules.sid_model import SpeakerIdetification
+from paddlespeech.vector.training.metrics import compute_eer
+from paddlespeech.vector.training.seeding import seed_everything
+
+logger = Log(__name__).getlog()
+
+# feat configuration
+cpu_feat_conf = {
+    'n_mels': 80,
+    'window_size': 400,  #ms
+    'hop_length': 160,  #ms
+}
+
+def extract_audio_embedding(args):
+    # stage 0: set the training device, cpu or gpu
+    paddle.set_device(args.device)
+    # set the random seed, it is a must for multiprocess training
+    seed_everything(args.seed)
+
+    # stage 1: build the dnn backbone model network
+    ##"channels": [1024, 1024, 1024, 1024, 3072],
+    model_conf = {
+        "input_size": 80,
+        "channels": [512, 512, 512, 512, 1536],
+        "kernel_sizes": [5, 3, 3, 3, 1],
+        "dilations": [1, 2, 3, 4, 1],
+        "attention_channels": 128,
+        "lin_neurons": 192,
+    }
+    ecapa_tdnn = EcapaTdnn(**model_conf)
+
+    # stage 2: load the pre-trained model
+    args.load_checkpoint = os.path.abspath(
+        os.path.expanduser(args.load_checkpoint))
+
+    # load model checkpoint to sid model
+    state_dict = paddle.load(
+        os.path.join(args.load_checkpoint, 'model.pdparams'))
+    model.set_state_dict(state_dict)
+    logger.info(f'Checkpoint loaded from {args.load_checkpoint}')
+
+    # stage 3: we must set the model to eval mode
+    model.eval()
+
+    # stage 4: read the audio data and extract the embedding
+    waveform, sr 
= load_audio(args.audio_path) + feat = melspectrogram(x=waveform, **cpu_feat_conf) + feat = paddle.to_tensor(feat).unsqueeze(0) + lengths = paddle.ones([1]) # in paddle inference model, the lengths is all one without padding + feat = feature_normalize(feat, mean_norm=True, std_norm=False) + embedding = ecapa_tdnn(feat, lengths + ).squeeze().numpy() # (1, emb_size, 1) -> (emb_size) + + # stage 5: do global norm with external mean and std + # todo + return embedding + + +if __name__ == "__main__": + # yapf: disable + parser = argparse.ArgumentParser(__doc__) + parser.add_argument('--device', + choices=['cpu', 'gpu'], + default="gpu", + help="Select which device to train model, defaults to gpu.") + parser.add_argument("--seed", + default=0, + type=int, + help="random seed for paddle, numpy and python random package") + parser.add_argument("--load-checkpoint", + type=str, + default='', + help="Directory to load model checkpoint to contiune trainning.") + parser.add_argument("--global-embedding-norm", + type=str, + default=None, + help="Apply global normalization on speaker embeddings.") + parser.add_argument("--audio-path", + default="./data/demo.wav", + type=str, + help="Single audio file path") + args = parser.parse_args() + # yapf: enable + + extract_audio_embedding(args) From 386ef3f161531d6e9c7bace4d1097f72f71be5e2 Mon Sep 17 00:00:00 2001 From: xiongxinlei Date: Tue, 8 Mar 2022 16:50:04 +0800 Subject: [PATCH 20/41] add voxceleb augment unit test, test=doc --- paddlespeech/vector/io/augment.py | 3 +- tests/unit/vector/conftest.py | 11 +++ tests/unit/vector/test_augment.py | 138 ++++++++++++++++++++++++++++++ 3 files changed, 151 insertions(+), 1 deletion(-) create mode 100644 tests/unit/vector/conftest.py create mode 100644 tests/unit/vector/test_augment.py diff --git a/paddlespeech/vector/io/augment.py b/paddlespeech/vector/io/augment.py index d6bbc8a9..af7aeb22 100644 --- a/paddlespeech/vector/io/augment.py +++ b/paddlespeech/vector/io/augment.py @@ -178,7 +178,8 @@ class DropChunk(nn.Layer): # Update waveform if not self.noise_factor: for j in range(drop_times[i]): - dropped_waveform[i, start[j]:end[j]] = 0.0 + if start[j] < end[j]: + dropped_waveform[i, start[j]:end[j]] = 0.0 else: # Uniform distribution of -2 to +2 * avg amplitude should # preserve the average for normalization diff --git a/tests/unit/vector/conftest.py b/tests/unit/vector/conftest.py new file mode 100644 index 00000000..7cac519b --- /dev/null +++ b/tests/unit/vector/conftest.py @@ -0,0 +1,11 @@ +def pytest_addoption(parser): + parser.addoption("--device", action="store", default="cpu") + + +def pytest_generate_tests(metafunc): + # This is called for every test. Only get/set command line arguments + # if the argument is specified in the list of test "fixturenames". + option_value = metafunc.config.option.device + if "device" in metafunc.fixturenames and option_value is not None: + metafunc.parametrize("device", [option_value]) + diff --git a/tests/unit/vector/test_augment.py b/tests/unit/vector/test_augment.py new file mode 100644 index 00000000..21d75bb3 --- /dev/null +++ b/tests/unit/vector/test_augment.py @@ -0,0 +1,138 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os + +import numpy as np +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.io import BatchSampler +from paddle.io import DataLoader +from paddle.io import Dataset + + +def test_add_noise(tmpdir, device): + paddle.device.set_device(device) + from paddlespeech.vector.io.augment import AddNoise + + test_waveform = paddle.sin( + paddle.arange(16000.0, dtype="float32")).unsqueeze(0) + test_noise = paddle.cos( + paddle.arange(16000.0, dtype="float32")).unsqueeze(0) + wav_lens = paddle.ones([1], dtype="float32") + + # Edge cases + no_noise = AddNoise(mix_prob=0.0) + assert no_noise(test_waveform, wav_lens).allclose(test_waveform) + + +def test_speed_perturb(device): + paddle.device.set_device(device) + from paddlespeech.vector.io.augment import SpeedPerturb + + test_waveform = paddle.sin( + paddle.arange(16000.0, dtype="float32")).unsqueeze(0) + + # Edge cases + no_perturb = SpeedPerturb(16000, perturb_prob=0.0) + assert no_perturb(test_waveform).allclose(test_waveform) + no_perturb = SpeedPerturb(16000, speeds=[100]) + assert no_perturb(test_waveform).allclose(test_waveform) + + # # Half speed + half_speed = SpeedPerturb(16000, speeds=[50]) + assert half_speed(test_waveform).allclose(test_waveform[:, ::2], atol=3e-1) + + +def test_babble(device): + paddle.device.set_device(device) + from paddlespeech.vector.io.augment import AddBabble + + test_waveform = paddle.stack( + (paddle.sin(paddle.arange(16000.0, dtype="float32")), + paddle.cos(paddle.arange(16000.0, dtype="float32")), )) + lengths = paddle.ones([2]) + + # Edge cases + no_babble = AddBabble(mix_prob=0.0) + assert no_babble(test_waveform, lengths).allclose(test_waveform) + no_babble = AddBabble(speaker_count=1, snr_low=1000, snr_high=1000) + assert no_babble(test_waveform, lengths).allclose(test_waveform) + + # One babbler just averages the two speakers + babble = AddBabble(speaker_count=1).to(device) + expected = (test_waveform + test_waveform.roll(1, 0)) / 2 + assert babble(test_waveform, lengths).allclose(expected, atol=1e-4) + + +def test_drop_freq(device): + paddle.device.set_device(device) + from paddlespeech.vector.io.augment import DropFreq + + test_waveform = paddle.sin( + paddle.arange(16000.0, dtype="float32")).unsqueeze(0) + + # Edge cases + no_drop = DropFreq(drop_prob=0.0) + assert no_drop(test_waveform).allclose(test_waveform) + no_drop = DropFreq(drop_count_low=0, drop_count_high=0) + assert no_drop(test_waveform).allclose(test_waveform) + + # Check case where frequency range *does not* include signal frequency + drop_diff_freq = DropFreq(drop_freq_low=0.5, drop_freq_high=0.9) + assert drop_diff_freq(test_waveform).allclose(test_waveform, atol=1e-1) + + # Check case where frequency range *does* include signal frequency + drop_same_freq = DropFreq(drop_freq_low=0.28, drop_freq_high=0.28) + assert drop_same_freq(test_waveform).allclose( + paddle.zeros([1, 16000]), atol=4e-1) + + +def test_drop_chunk(device): + paddle.device.set_device(device) + from paddlespeech.vector.io.augment import DropChunk + + test_waveform = paddle.sin( + paddle.arange(16000.0, 
dtype="float32")).unsqueeze(0) + lengths = paddle.ones([1]) + + # Edge cases + no_drop = DropChunk(drop_prob=0.0) + assert no_drop(test_waveform, lengths).allclose(test_waveform) + no_drop = DropChunk(drop_length_low=0, drop_length_high=0) + assert no_drop(test_waveform, lengths).allclose(test_waveform) + no_drop = DropChunk(drop_count_low=0, drop_count_high=0) + assert no_drop(test_waveform, lengths).allclose(test_waveform) + no_drop = DropChunk(drop_start=0, drop_end=0) + assert no_drop(test_waveform, lengths).allclose(test_waveform) + + # Specify all parameters to ensure it is deterministic + dropper = DropChunk( + drop_length_low=100, + drop_length_high=100, + drop_count_low=1, + drop_count_high=1, + drop_start=100, + drop_end=200, + noise_factor=0.0, ) + expected_waveform = test_waveform.clone() + expected_waveform[:, 100:200] = 0.0 + + assert dropper(test_waveform, lengths).allclose(expected_waveform) + + # Make sure amplitude is similar before and after + dropper = DropChunk(noise_factor=1.0) + drop_amplitude = dropper(test_waveform, lengths).abs().mean() + orig_amplitude = test_waveform.abs().mean() + assert drop_amplitude.allclose(orig_amplitude, atol=1e-2) From 14efbf5b15e45299f16eab594fcd1155a8b74742 Mon Sep 17 00:00:00 2001 From: xiongxinlei Date: Tue, 8 Mar 2022 21:04:52 +0800 Subject: [PATCH 21/41] check extract embedding result, test=doc --- .../sv0/local/extract_speaker_embedding.py | 28 +++++++++++---- .../sv0/local/speaker_verification_cosine.py | 4 +-- examples/voxceleb/sv0/local/train.py | 14 ++++---- examples/voxceleb/sv0/run.sh | 35 +++++++++++++++---- paddleaudio/datasets/voxceleb.py | 9 ++--- paddlespeech/vector/io/batch.py | 17 ++++++--- 6 files changed, 76 insertions(+), 31 deletions(-) diff --git a/examples/voxceleb/sv0/local/extract_speaker_embedding.py b/examples/voxceleb/sv0/local/extract_speaker_embedding.py index 8eb24e1d..e7dad140 100644 --- a/examples/voxceleb/sv0/local/extract_speaker_embedding.py +++ b/examples/voxceleb/sv0/local/extract_speaker_embedding.py @@ -22,11 +22,11 @@ from paddle.io import BatchSampler from paddle.io import DataLoader from tqdm import tqdm +from paddleaudio.backends import load as load_audio from paddleaudio.datasets.voxceleb import VoxCeleb1 from paddleaudio.features.core import melspectrogram -from paddleaudio.backends import load as load_audio -from paddlespeech.vector.io.batch import feature_normalize from paddlespeech.s2t.utils.log import Log +from paddlespeech.vector.io.batch import feature_normalize from paddlespeech.vector.models.ecapa_tdnn import EcapaTdnn from paddlespeech.vector.modules.sid_model import SpeakerIdetification from paddlespeech.vector.training.metrics import compute_eer @@ -41,6 +41,7 @@ cpu_feat_conf = { 'hop_length': 160, #ms } + def extract_audio_embedding(args): # stage 0: set the training device, cpu or gpu paddle.set_device(args.device) @@ -59,6 +60,8 @@ def extract_audio_embedding(args): } ecapa_tdnn = EcapaTdnn(**model_conf) + # stage4: build the speaker verification train instance with backbone model + model = SpeakerIdetification(backbone=ecapa_tdnn, num_class=1211) # stage 2: load the pre-trained model args.load_checkpoint = os.path.abspath( os.path.expanduser(args.load_checkpoint)) @@ -71,18 +74,29 @@ def extract_audio_embedding(args): # stage 3: we must set the model to eval mode model.eval() - + # stage 4: read the audio data and extract the embedding + # wavform is one dimension numpy array waveform, sr = load_audio(args.audio_path) + + # feat type is numpy array, whose shape is [dim, 
time] + # we need convert the audio feat to one-batch shape [batch, dim, time], where the batch is one + # so the final shape is [1, dim, time] feat = melspectrogram(x=waveform, **cpu_feat_conf) feat = paddle.to_tensor(feat).unsqueeze(0) - lengths = paddle.ones([1]) # in paddle inference model, the lengths is all one without padding - feat = feature_normalize(feat, mean_norm=True, std_norm=False) - embedding = ecapa_tdnn(feat, lengths - ).squeeze().numpy() # (1, emb_size, 1) -> (emb_size) + + # in inference period, the lengths is all one without padding + lengths = paddle.ones([1]) + feat = feature_normalize( + feat, mean_norm=True, std_norm=False, convert_to_numpy=True) + + # model backbone network forward the feats and get the embedding + embedding = model.backbone( + feat, lengths).squeeze().numpy() # (1, emb_size, 1) -> (emb_size) # stage 5: do global norm with external mean and std # todo + # np.save("audio-embedding", embedding) return embedding diff --git a/examples/voxceleb/sv0/local/speaker_verification_cosine.py b/examples/voxceleb/sv0/local/speaker_verification_cosine.py index b0adcf66..417e8aa3 100644 --- a/examples/voxceleb/sv0/local/speaker_verification_cosine.py +++ b/examples/voxceleb/sv0/local/speaker_verification_cosine.py @@ -120,7 +120,7 @@ def main(args): **cpu_feat_conf) enrol_sampler = BatchSampler( enrol_ds, batch_size=args.batch_size, - shuffle=False) # Shuffle to make embedding normalization more robust. + shuffle=True) # Shuffle to make embedding normalization more robust. enrol_loader = DataLoader(enrol_ds, batch_sampler=enrol_sampler, collate_fn=lambda x: feature_normalize( @@ -136,7 +136,7 @@ def main(args): **cpu_feat_conf) test_sampler = BatchSampler( - test_ds, batch_size=args.batch_size, shuffle=False) + test_ds, batch_size=args.batch_size, shuffle=True) test_loader = DataLoader(test_ds, batch_sampler=test_sampler, collate_fn=lambda x: feature_normalize( diff --git a/examples/voxceleb/sv0/local/train.py b/examples/voxceleb/sv0/local/train.py index 745d5eab..3fe67c8e 100644 --- a/examples/voxceleb/sv0/local/train.py +++ b/examples/voxceleb/sv0/local/train.py @@ -56,10 +56,10 @@ def main(args): # set the random seed, it is a must for multiprocess training seed_everything(args.seed) - # stage2: data prepare, such vox1 and vox2 data, and augment data and pipline + # stage2: data prepare, such vox1 and vox2 data, and augment noise data and pipline # note: some cmd must do in rank==0, so wo will refactor the data prepare code - train_ds = VoxCeleb1('train', target_dir=args.data_dir) - dev_ds = VoxCeleb1('dev', target_dir=args.data_dir) + train_dataset = VoxCeleb1('train', target_dir=args.data_dir) + dev_dataset = VoxCeleb1('dev', target_dir=args.data_dir) if args.augment: augment_pipeline = build_augment_pipeline(target_dir=args.data_dir) @@ -123,9 +123,9 @@ def main(args): # stage8: we build the batch sampler for paddle.DataLoader train_sampler = DistributedBatchSampler( - train_ds, batch_size=args.batch_size, shuffle=True, drop_last=False) + train_dataset, batch_size=args.batch_size, shuffle=True, drop_last=False) train_loader = DataLoader( - train_ds, + train_dataset, batch_sampler=train_sampler, num_workers=args.num_workers, collate_fn=waveform_collate_fn, @@ -216,12 +216,12 @@ def main(args): # stage 9-12: construct the valid dataset dataloader dev_sampler = BatchSampler( - dev_ds, + dev_dataset, batch_size=args.batch_size // 4, shuffle=False, drop_last=False) dev_loader = DataLoader( - dev_ds, + dev_dataset, batch_sampler=dev_sampler, 
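+        # the dev loader reuses the training collate_fn so batches are
+        # padded the same way in both phases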
collate_fn=waveform_collate_fn,
         num_workers=args.num_workers,
diff --git a/examples/voxceleb/sv0/run.sh b/examples/voxceleb/sv0/run.sh
index c3b31ce5..34a1cbd4 100755
--- a/examples/voxceleb/sv0/run.sh
+++ b/examples/voxceleb/sv0/run.sh
@@ -3,6 +3,8 @@ set -e
 
 #######################################################################
+# stage 0: data preparation, including the voxceleb1 download and generating the {train,dev,enroll,test}.csv files
+# voxceleb2 data is in m4a format, so users must convert the m4a files to wav themselves, as described in README.md
 # stage 1: train the speaker identification model
 # stage 2: test speaker identification
 # stage 3: extract the training embedding to train the LDA and PLDA
@@ -12,23 +14,42 @@ set -e
 # default the dataset is the ~/.paddleaudio/
 # export PPAUDIO_HOME=
 
-stage=2
-dir=data/ # data directory
-exp_dir=exp/ecapa-tdnn/ # experiment directory
+stage=0
+dir=data.bak/ # data directory
+exp_dir=exp/ecapa-tdnn/ # experiment directory
 mkdir -p ${dir}
+mkdir -p ${exp_dir}
+
+# if [ $stage -le 0 ]; then
+#     # stage 0: data prepare for vox1 and vox2, vox2 must be converted from m4a to wav
+#     # todo
+# fi
 
 if [ $stage -le 1 ]; then
    # stage 1: train the speaker identification model
    python3 \
    -m paddle.distributed.launch --gpus=0,1,2,3 \
-   local/train.py --device "gpu" --checkpoint-dir ${exp_dir} \
-   --save-freq 10 --data-dir ${dir} --batch-size 256 --epochs 60
+   local/train.py --device "gpu" --checkpoint-dir ${exp_dir} --augment \
+   --save-freq 10 --data-dir ${dir} --batch-size 64 --epochs 100
fi

if [ $stage -le 2 ]; then
    # stage 2: test the trained model with the cosine scoring back-end
+    # you can set the variable PPAUDIO_HOME to specify the path to the downloaded vox1 and vox2 datasets
+    python3 \
+        local/speaker_verification_cosine.py\
+        --batch-size 4 --data-dir ${dir} --load-checkpoint ${exp_dir}/epoch_10/
+fi
+
+if [ $stage -le 3 ]; then
+    # stage 3: extract the speaker embedding of a single audio file
+    # you can set the variable PPAUDIO_HOME to specify the path to the downloaded vox1 and vox2 datasets
    python3 \
-        local/speaker_verification_cosine.py \
-        --load-checkpoint ${exp_dir}/epoch_40/
+        local/extract_speaker_embedding.py\
+        --audio-path "demo/csv/00001.wav" --load-checkpoint ${exp_dir}/epoch_60/
fi
+# if [ $stage -le 3 ]; then
+#     # stage 2: extract the training embedding to train the LDA and PLDA
+#     # todo: extract the training embedding
+# fi
diff --git a/paddleaudio/datasets/voxceleb.py b/paddleaudio/datasets/voxceleb.py
index 28f6dfc6..c97e825e 100644
--- a/paddleaudio/datasets/voxceleb.py
+++ b/paddleaudio/datasets/voxceleb.py
@@ -28,7 +28,7 @@ from paddleaudio.backends import load as load_audio
 from paddleaudio.datasets.dataset import feat_funcs
 from paddleaudio.utils import DATA_HOME
 from paddleaudio.utils import decompress
-from paddleaudio.utils import download_and_decompress
+from paddlespeech.vector.utils.download import download_and_decompress
 from paddlespeech.s2t.utils.log import Log
 from utils.utility import download
 from utils.utility import unpack
@@ -106,13 +106,14 @@ class VoxCeleb1(Dataset):
         self.chunk_duration = chunk_duration
         self.split_ratio = split_ratio
         self.target_dir = target_dir if target_dir else self.base_path
-        self.csv_path = os.path.join(
+        VoxCeleb1.csv_path = os.path.join(
             target_dir, 'csv') if target_dir else os.path.join(self.base_path,
                                                                'csv')
-        self.meta_path = os.path.join(
+        VoxCeleb1.meta_path = os.path.join(
             target_dir, 'meta') if target_dir else os.path.join(self.base_path,
                                                                 'meta')
-        self.veri_test_file = os.path.join(self.meta_path, 'veri_test2.txt')
+        
VoxCeleb1.veri_test_file = os.path.join(self.meta_path, + 'veri_test2.txt') # self._data = self._get_data()[:1000] # KP: Small dataset test. self._data = self._get_data() super(VoxCeleb1, self).__init__() diff --git a/paddlespeech/vector/io/batch.py b/paddlespeech/vector/io/batch.py index 9db615f6..879cde3a 100644 --- a/paddlespeech/vector/io/batch.py +++ b/paddlespeech/vector/io/batch.py @@ -24,10 +24,19 @@ def waveform_collate_fn(batch): def feature_normalize(feats: paddle.Tensor, mean_norm: bool=True, - std_norm: bool=True): + std_norm: bool=True, + convert_to_numpy: bool=False): # Features normalization if needed - mean = feats.mean(axis=-1, keepdim=True) if mean_norm else 0 - std = feats.std(axis=-1, keepdim=True) if std_norm else 1 - feats = (feats - mean) / std + # numpy.mean is a little with paddle.mean about 1e-6 + if convert_to_numpy: + feats_np = feats.numpy() + mean = feats_np.mean(axis=-1, keepdims=True) if mean_norm else 0 + std = feats_np.std(axis=-1, keepdims=True) if std_norm else 1 + feats_np = (feats_np - mean) / std + feats = paddle.to_tensor(feats_np, dtype=feats.dtype) + else: + mean = feats.mean(axis=-1, keepdim=True) if mean_norm else 0 + std = feats.std(axis=-1, keepdim=True) if std_norm else 1 + feats = (feats - mean) / std return feats From 60d73bb7bd5af81001c0b90837fc3fb01cd65da9 Mon Sep 17 00:00:00 2001 From: xiongxinlei Date: Wed, 9 Mar 2022 12:10:02 +0800 Subject: [PATCH 22/41] add state 0 to prepare the voxcele data and augment data --- examples/voxceleb/README.md | 53 ++++++++++++++++++ examples/voxceleb/sv0/local/data_prepare.py | 60 +++++++++++++++++++++ examples/voxceleb/sv0/run.sh | 8 +-- paddleaudio/datasets/rirs_noises.py | 5 +- paddleaudio/datasets/voxceleb.py | 37 +++++++------ paddlespeech/vector/io/augment.py | 2 +- 6 files changed, 142 insertions(+), 23 deletions(-) create mode 100644 examples/voxceleb/sv0/local/data_prepare.py diff --git a/examples/voxceleb/README.md b/examples/voxceleb/README.md index 2c8ad138..59fb491c 100644 --- a/examples/voxceleb/README.md +++ b/examples/voxceleb/README.md @@ -6,3 +6,56 @@ sv0 - speaker verfication with softmax backend etc, all python code sv1 - dependence on kaldi, speaker verfication with plda/sc backend, more info refer to the sv1/readme.txt + + +## VoxCeleb2 preparation + +VoxCeleb2 audio files are released in m4a format. All the VoxCeleb2 m4a audio files must be converted in wav files before feeding them in PaddleSpeech. +Please, follow these steps to prepare the dataset correctly: + +1. Download Voxceleb2. +You can find download instructions here: http://www.robots.ox.ac.uk/~vgg/data/voxceleb/ + +2. Convert .m4a to wav +VoxCeleb2 stores files with the m4a audio format. To use them in PaddleSpeech, you have to convert all the m4a audio files into wav files. + +``` shell +ffmpeg -y -i %s -ac 1 -vn -acodec pcm_s16le -ar 16000 %s +``` + +``` shell +# copy this to root directory of data and +# chmod a+x convert.sh +# ./convert.sh +# https://unix.stackexchange.com/questions/103920/parallelize-a-bash-for-loop + +open_sem(){ + mkfifo pipe-$$ + exec 3<>pipe-$$ + rm pipe-$$ + local i=$1 + for((;i>0;i--)); do + printf %s 000 >&3 + done +} +run_with_lock(){ + local x + read -u 3 -n 3 x && ((0==x)) || exit $x + ( + ( "$@"; ) + printf '%.3d' $? >&3 + )& +} + +N=32 # number of vCPU +open_sem $N +for f in $(find . 
-name "*.m4a"); do + run_with_lock ffmpeg -loglevel panic -i "$f" -ar 16000 "${f%.*}.wav" +done +``` + +You can do the conversion using ffmpeg https://gist.github.com/seungwonpark/4f273739beef2691cd53b5c39629d830). This operation might take several hours and should be only once. + +3. Put all the wav files in a folder called `wav`. You should have something like `voxceleb2/wav/id*/*.wav` (e.g, `voxceleb2/wav/id00012/21Uxsk56VDQ/00001.wav`) + +4. \ No newline at end of file diff --git a/examples/voxceleb/sv0/local/data_prepare.py b/examples/voxceleb/sv0/local/data_prepare.py new file mode 100644 index 00000000..ca707fc2 --- /dev/null +++ b/examples/voxceleb/sv0/local/data_prepare.py @@ -0,0 +1,60 @@ +import argparse +import os + +import numpy as np +import paddle +from paddle.io import BatchSampler +from paddle.io import DataLoader +from paddle.io import DistributedBatchSampler + +from paddleaudio.datasets.voxceleb import VoxCeleb1 +from paddleaudio.features.core import melspectrogram +from paddlespeech.s2t.utils.log import Log +from paddlespeech.vector.io.augment import build_augment_pipeline +from paddlespeech.vector.io.augment import waveform_augment +from paddlespeech.vector.io.batch import feature_normalize +from paddlespeech.vector.io.batch import waveform_collate_fn +from paddlespeech.vector.models.ecapa_tdnn import EcapaTdnn +from paddlespeech.vector.modules.loss import AdditiveAngularMargin +from paddlespeech.vector.modules.loss import LogSoftmaxWrapper +from paddlespeech.vector.modules.lr import CyclicLRScheduler +from paddlespeech.vector.modules.sid_model import SpeakerIdetification +from paddlespeech.vector.training.seeding import seed_everything +from paddlespeech.vector.utils.time import Timer + +logger = Log(__name__).getlog() + +def main(args): + # stage0: set the cpu device, all data prepare process will be done in cpu mode + paddle.set_device("cpu") + # set the random seed, it is a must for multiprocess training + seed_everything(args.seed) + + # stage 1: generate the voxceleb csv file + # Note: this may occurs c++ execption, but the program will execute fine + # so we can ignore the execption + train_dataset = VoxCeleb1('train', target_dir=args.data_dir) + dev_dataset = VoxCeleb1('dev', target_dir=args.data_dir) + + # stage 2: generate the augment noise csv file + if args.augment: + augment_pipeline = build_augment_pipeline(target_dir=args.data_dir) + +if __name__ == "__main__": + # yapf: disable + parser = argparse.ArgumentParser(__doc__) + parser.add_argument("--seed", + default=0, + type=int, + help="random seed for paddle, numpy and python random package") + parser.add_argument("--data-dir", + default="./data/", + type=str, + help="data directory") + parser.add_argument("--augment", + action="store_true", + default=False, + help="Apply audio augments.") + args = parser.parse_args() + # yapf: enable + main(args) \ No newline at end of file diff --git a/examples/voxceleb/sv0/run.sh b/examples/voxceleb/sv0/run.sh index 34a1cbd4..7ad3a36f 100755 --- a/examples/voxceleb/sv0/run.sh +++ b/examples/voxceleb/sv0/run.sh @@ -20,10 +20,10 @@ exp_dir=exp/ecapa-tdnn/ # experiment directory mkdir -p ${dir} mkdir -p ${exp_dir} -# if [ $stage -le 0 ]; then -# # stage 0: data prepare for vox1 and vox2, vox2 must be converted from m4a to wav -# # todo -# fi +if [ $stage -le 0 ]; then + # stage 0: data prepare for vox1 and vox2, vox2 must be converted from m4a to wav + python3 local/data_prepare.py --data-dir ${dir} --augment +fi if [ $stage -le 1 ]; then # stage 1: train the speaker 
identification model
diff --git a/paddleaudio/datasets/rirs_noises.py b/paddleaudio/datasets/rirs_noises.py
index fa9e7f09..6af9fd9d 100644
--- a/paddleaudio/datasets/rirs_noises.py
+++ b/paddleaudio/datasets/rirs_noises.py
@@ -69,8 +69,9 @@ class OpenRIRNoise(Dataset):
         self.random_chunk = random_chunk
         self.chunk_duration = chunk_duration
 
-        self.csv_path = os.path.join(target_dir, "open_rir_noise",
-                                     "csv") if target_dir else self.csv_path
+        OpenRIRNoise.csv_path = os.path.join(
+            target_dir, "open_rir_noise",
+            "csv") if target_dir else self.csv_path
 
         self._data = self._get_data()
         super(OpenRIRNoise, self).__init__()
diff --git a/paddleaudio/datasets/voxceleb.py b/paddleaudio/datasets/voxceleb.py
index c97e825e..0011340e 100644
--- a/paddleaudio/datasets/voxceleb.py
+++ b/paddleaudio/datasets/voxceleb.py
@@ -16,6 +16,7 @@ import csv
 import glob
 import os
 import random
+from multiprocessing import cpu_count
 from typing import Dict
 from typing import List
 from typing import Tuple
@@ -28,8 +29,8 @@ from paddleaudio.backends import load as load_audio
 from paddleaudio.datasets.dataset import feat_funcs
 from paddleaudio.utils import DATA_HOME
 from paddleaudio.utils import decompress
-from paddlespeech.vector.utils.download import download_and_decompress
 from paddlespeech.s2t.utils.log import Log
+from paddlespeech.vector.utils.download import download_and_decompress
 from utils.utility import download
 from utils.utility import unpack
@@ -105,14 +106,15 @@ class VoxCeleb1(Dataset):
         self.random_chunk = random_chunk
         self.chunk_duration = chunk_duration
         self.split_ratio = split_ratio
-        self.target_dir = target_dir if target_dir else self.base_path
+        self.target_dir = target_dir if target_dir else VoxCeleb1.base_path
+
+        # if we set the target dir, we redirect the voxceleb data info from the base path to the target dir
         VoxCeleb1.csv_path = os.path.join(
-            target_dir, 'csv') if target_dir else os.path.join(self.base_path,
-                                                               'csv')
+            target_dir, "voxceleb", 'csv') if target_dir else VoxCeleb1.csv_path
         VoxCeleb1.meta_path = os.path.join(
-            target_dir, 'meta') if target_dir else os.path.join(self.base_path,
-                                                                'meta')
-        VoxCeleb1.veri_test_file = os.path.join(self.meta_path,
+            target_dir, "voxceleb",
+            'meta') if target_dir else VoxCeleb1.meta_path
+        VoxCeleb1.veri_test_file = os.path.join(VoxCeleb1.meta_path,
                                                 'veri_test2.txt')
         # self._data = self._get_data()[:1000]  # KP: Small dataset test.
         self._data = self._get_data()
         super(VoxCeleb1, self).__init__()
@@ -255,8 +257,9 @@ class VoxCeleb1(Dataset):
                      split_chunks: bool=True):
         logger.info(f'Generating csv: {output_file}')
         header = ["id", "duration", "wav", "start", "stop", "spk_id"]
-
-        with Pool(64) as p:
+        # Note: this may raise a C++ exception, but the program will execute fine,
+        # so we can ignore the exception
+        with Pool(cpu_count()) as p:
             infos = list(
                 tqdm(
                     p.imap(lambda x: self._get_audio_info(x, split_chunks),
@@ -277,20 +280,20 @@ class VoxCeleb1(Dataset):
     def prepare_data(self):
         # Audio of speakers in veri_test_file should not be included in training set. 
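+        # keeping the enroll/test speakers out of the train/dev splits
+        # prevents the speaker classifier from seeing evaluation identities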
logger.info("start to prepare the data csv file") - enrol_files = set() + enroll_files = set() test_files = set() # get the enroll and test audio file path with open(self.veri_test_file, 'r') as f: for line in f.readlines(): _, enrol_file, test_file = line.strip().split(' ') - enrol_files.add(os.path.join(self.wav_path, enrol_file)) + enroll_files.add(os.path.join(self.wav_path, enrol_file)) test_files.add(os.path.join(self.wav_path, test_file)) - enrol_files = sorted(enrol_files) + enroll_files = sorted(enroll_files) test_files = sorted(test_files) # get the enroll and test speakers test_spks = set() - for file in (enrol_files + test_files): + for file in (enroll_files + test_files): spk = file.split('/wav/')[1].split('/')[0] test_spks.add(spk) @@ -306,8 +309,9 @@ class VoxCeleb1(Dataset): speakers.add(spk) audio_files.append(file) - logger.info("start to generate the {}".format( - os.path.join(self.meta_path, 'spk_id2label.txt'))) + logger.info( + f"start to generate the {os.path.join(self.meta_path, 'spk_id2label.txt')}" + ) # encode the train and dev speakers label to spk_id2label.txt with open(os.path.join(self.meta_path, 'spk_id2label.txt'), 'w') as f: for label, spk_id in enumerate( @@ -323,8 +327,9 @@ class VoxCeleb1(Dataset): self.generate_csv(train_files, os.path.join(self.csv_path, 'train.csv')) self.generate_csv(dev_files, os.path.join(self.csv_path, 'dev.csv')) + self.generate_csv( - enrol_files, + enroll_files, os.path.join(self.csv_path, 'enrol.csv'), split_chunks=False) self.generate_csv( diff --git a/paddlespeech/vector/io/augment.py b/paddlespeech/vector/io/augment.py index af7aeb22..366c0cff 100644 --- a/paddlespeech/vector/io/augment.py +++ b/paddlespeech/vector/io/augment.py @@ -840,7 +840,7 @@ def build_augment_pipeline(target_dir=None) -> List[paddle.nn.Layer]: """ logger.info("start to build the augment pipeline") noise_dataset = OpenRIRNoise('noise', target_dir=target_dir) - rir_dataset = OpenRIRNoise('rir') + rir_dataset = OpenRIRNoise('rir', target_dir=target_dir) wavedrop = TimeDomainSpecAugment( sample_rate=16000, From 0e87037f2c9fc67a87c665e7a2f1c8058666646b Mon Sep 17 00:00:00 2001 From: xiongxinlei Date: Wed, 9 Mar 2022 14:03:47 +0800 Subject: [PATCH 23/41] refactor to compilance paddleaudio --- examples/voxceleb/sv0/local/data_prepare.py | 15 +- .../sv0/local/extract_speaker_embedding.py | 129 ------- .../sv0/local/speaker_verification_cosine.py | 264 -------------- examples/voxceleb/sv0/local/train.py | 326 ------------------ examples/voxceleb/sv0/path.sh | 3 + examples/voxceleb/sv0/run.sh | 12 +- paddleaudio/paddleaudio/datasets/__init__.py | 2 + .../{ => paddleaudio}/datasets/rirs_noises.py | 10 +- .../{ => paddleaudio}/datasets/voxceleb.py | 12 +- paddleaudio/paddleaudio/metric/__init__.py | 1 + .../paddleaudio/metric/eer.py | 0 paddlespeech/vector/io/augment.py | 4 +- paddlespeech/vector/io/batch.py | 38 ++ .../{modules/lr.py => training/scheduler.py} | 0 14 files changed, 63 insertions(+), 753 deletions(-) delete mode 100644 examples/voxceleb/sv0/local/extract_speaker_embedding.py delete mode 100644 examples/voxceleb/sv0/local/speaker_verification_cosine.py delete mode 100644 examples/voxceleb/sv0/local/train.py rename paddleaudio/{ => paddleaudio}/datasets/rirs_noises.py (97%) rename paddleaudio/{ => paddleaudio}/datasets/voxceleb.py (97%) rename paddlespeech/vector/training/metrics.py => paddleaudio/paddleaudio/metric/eer.py (100%) rename paddlespeech/vector/{modules/lr.py => training/scheduler.py} (100%) diff --git 
a/examples/voxceleb/sv0/local/data_prepare.py b/examples/voxceleb/sv0/local/data_prepare.py index ca707fc2..1a0a6392 100644 --- a/examples/voxceleb/sv0/local/data_prepare.py +++ b/examples/voxceleb/sv0/local/data_prepare.py @@ -3,24 +3,11 @@ import os import numpy as np import paddle -from paddle.io import BatchSampler -from paddle.io import DataLoader -from paddle.io import DistributedBatchSampler -from paddleaudio.datasets.voxceleb import VoxCeleb1 -from paddleaudio.features.core import melspectrogram +from paddleaudio.paddleaudio.datasets.voxceleb import VoxCeleb1 from paddlespeech.s2t.utils.log import Log from paddlespeech.vector.io.augment import build_augment_pipeline -from paddlespeech.vector.io.augment import waveform_augment -from paddlespeech.vector.io.batch import feature_normalize -from paddlespeech.vector.io.batch import waveform_collate_fn -from paddlespeech.vector.models.ecapa_tdnn import EcapaTdnn -from paddlespeech.vector.modules.loss import AdditiveAngularMargin -from paddlespeech.vector.modules.loss import LogSoftmaxWrapper -from paddlespeech.vector.modules.lr import CyclicLRScheduler -from paddlespeech.vector.modules.sid_model import SpeakerIdetification from paddlespeech.vector.training.seeding import seed_everything -from paddlespeech.vector.utils.time import Timer logger = Log(__name__).getlog() diff --git a/examples/voxceleb/sv0/local/extract_speaker_embedding.py b/examples/voxceleb/sv0/local/extract_speaker_embedding.py deleted file mode 100644 index e7dad140..00000000 --- a/examples/voxceleb/sv0/local/extract_speaker_embedding.py +++ /dev/null @@ -1,129 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-import argparse -import ast -import os - -import numpy as np -import paddle -import paddle.nn.functional as F -from paddle.io import BatchSampler -from paddle.io import DataLoader -from tqdm import tqdm - -from paddleaudio.backends import load as load_audio -from paddleaudio.datasets.voxceleb import VoxCeleb1 -from paddleaudio.features.core import melspectrogram -from paddlespeech.s2t.utils.log import Log -from paddlespeech.vector.io.batch import feature_normalize -from paddlespeech.vector.models.ecapa_tdnn import EcapaTdnn -from paddlespeech.vector.modules.sid_model import SpeakerIdetification -from paddlespeech.vector.training.metrics import compute_eer -from paddlespeech.vector.training.seeding import seed_everything - -logger = Log(__name__).getlog() - -# feat configuration -cpu_feat_conf = { - 'n_mels': 80, - 'window_size': 400, #ms - 'hop_length': 160, #ms -} - - -def extract_audio_embedding(args): - # stage 0: set the training device, cpu or gpu - paddle.set_device(args.device) - # set the random seed, it is a must for multiprocess training - seed_everything(args.seed) - - # stage 1: build the dnn backbone model network - ##"channels": [1024, 1024, 1024, 1024, 3072], - model_conf = { - "input_size": 80, - "channels": [512, 512, 512, 512, 1536], - "kernel_sizes": [5, 3, 3, 3, 1], - "dilations": [1, 2, 3, 4, 1], - "attention_channels": 128, - "lin_neurons": 192, - } - ecapa_tdnn = EcapaTdnn(**model_conf) - - # stage4: build the speaker verification train instance with backbone model - model = SpeakerIdetification(backbone=ecapa_tdnn, num_class=1211) - # stage 2: load the pre-trained model - args.load_checkpoint = os.path.abspath( - os.path.expanduser(args.load_checkpoint)) - - # load model checkpoint to sid model - state_dict = paddle.load( - os.path.join(args.load_checkpoint, 'model.pdparams')) - model.set_state_dict(state_dict) - logger.info(f'Checkpoint loaded from {args.load_checkpoint}') - - # stage 3: we must set the model to eval mode - model.eval() - - # stage 4: read the audio data and extract the embedding - # wavform is one dimension numpy array - waveform, sr = load_audio(args.audio_path) - - # feat type is numpy array, whose shape is [dim, time] - # we need convert the audio feat to one-batch shape [batch, dim, time], where the batch is one - # so the final shape is [1, dim, time] - feat = melspectrogram(x=waveform, **cpu_feat_conf) - feat = paddle.to_tensor(feat).unsqueeze(0) - - # in inference period, the lengths is all one without padding - lengths = paddle.ones([1]) - feat = feature_normalize( - feat, mean_norm=True, std_norm=False, convert_to_numpy=True) - - # model backbone network forward the feats and get the embedding - embedding = model.backbone( - feat, lengths).squeeze().numpy() # (1, emb_size, 1) -> (emb_size) - - # stage 5: do global norm with external mean and std - # todo - # np.save("audio-embedding", embedding) - return embedding - - -if __name__ == "__main__": - # yapf: disable - parser = argparse.ArgumentParser(__doc__) - parser.add_argument('--device', - choices=['cpu', 'gpu'], - default="gpu", - help="Select which device to train model, defaults to gpu.") - parser.add_argument("--seed", - default=0, - type=int, - help="random seed for paddle, numpy and python random package") - parser.add_argument("--load-checkpoint", - type=str, - default='', - help="Directory to load model checkpoint to contiune trainning.") - parser.add_argument("--global-embedding-norm", - type=str, - default=None, - help="Apply global normalization on speaker embeddings.") - 
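The extractor above (deleted here and re-added under paddlespeech/vector/exps later in this series) relies on the simple shape contract its comments describe: melspectrogram returns [dim, time], the backbone consumes [batch, dim, time] plus per-utterance length ratios, and a single unpadded utterance therefore uses batch=1 with lengths of all ones. A toy numpy sketch of that contract, with made-up shapes; the mean-only normalization mirrors mean_norm=True, std_norm=False:

    import numpy as np

    feat = np.random.rand(80, 200).astype("float32")  # [n_mels, frames], as from melspectrogram
    batch = feat[None, :, :]                          # [1, n_mels, frames]: a one-utterance batch
    lengths = np.ones([1], dtype="float32")           # length ratios; 1.0 means no padding
    batch = batch - batch.mean(axis=-1, keepdims=True)  # subtract per-mel mean over time only
    assert batch.shape == (1, 80, 200) and lengths.shape == (1,)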
parser.add_argument("--audio-path", - default="./data/demo.wav", - type=str, - help="Single audio file path") - args = parser.parse_args() - # yapf: enable - - extract_audio_embedding(args) diff --git a/examples/voxceleb/sv0/local/speaker_verification_cosine.py b/examples/voxceleb/sv0/local/speaker_verification_cosine.py deleted file mode 100644 index 417e8aa3..00000000 --- a/examples/voxceleb/sv0/local/speaker_verification_cosine.py +++ /dev/null @@ -1,264 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import argparse -import ast -import os - -import numpy as np -import paddle -import paddle.nn.functional as F -from paddle.io import BatchSampler -from paddle.io import DataLoader -from tqdm import tqdm - -from paddleaudio.datasets.voxceleb import VoxCeleb1 -from paddlespeech.s2t.utils.log import Log -from paddlespeech.vector.models.ecapa_tdnn import EcapaTdnn -from paddlespeech.vector.modules.sid_model import SpeakerIdetification -from paddlespeech.vector.training.metrics import compute_eer -from paddlespeech.vector.training.seeding import seed_everything - -logger = Log(__name__).getlog() - - -def pad_right_2d(x, target_length, axis=-1, mode='constant', **kwargs): - x = np.asarray(x) - assert len( - x.shape) == 2, f'Only 2D arrays supported, but got shape: {x.shape}' - - w = target_length - x.shape[axis] - assert w >= 0, f'Target length {target_length} is less than origin length {x.shape[axis]}' - - if axis == 0: - pad_width = [[0, w], [0, 0]] - else: - pad_width = [[0, 0], [0, w]] - - return np.pad(x, pad_width, mode=mode, **kwargs) - - -def feature_normalize(batch, mean_norm: bool=True, std_norm: bool=True): - ids = [item['id'] for item in batch] - lengths = np.asarray([item['feat'].shape[1] for item in batch]) - feats = list( - map(lambda x: pad_right_2d(x, lengths.max()), - [item['feat'] for item in batch])) - feats = np.stack(feats) - - # Features normalization if needed - for i in range(len(feats)): - feat = feats[i][:, :lengths[i]] # Excluding pad values. - mean = feat.mean(axis=-1, keepdims=True) if mean_norm else 0 - std = feat.std(axis=-1, keepdims=True) if std_norm else 1 - feats[i][:, :lengths[i]] = (feat - mean) / std - assert feats[i][:, lengths[ - i]:].sum() == 0 # Padding valus should all be 0. - - # Converts into ratios. 
- lengths = (lengths / lengths.max()).astype(np.float32) - - return {'ids': ids, 'feats': feats, 'lengths': lengths} - - -# feat configuration -cpu_feat_conf = { - 'n_mels': 80, - 'window_size': 400, #ms - 'hop_length': 160, #ms -} - - -def main(args): - # stage0: set the training device, cpu or gpu - paddle.set_device(args.device) - # set the random seed, it is a must for multiprocess training - seed_everything(args.seed) - - # stage1: build the dnn backbone model network - ##"channels": [1024, 1024, 1024, 1024, 3072], - model_conf = { - "input_size": 80, - "channels": [512, 512, 512, 512, 1536], - "kernel_sizes": [5, 3, 3, 3, 1], - "dilations": [1, 2, 3, 4, 1], - "attention_channels": 128, - "lin_neurons": 192, - } - ecapa_tdnn = EcapaTdnn(**model_conf) - - # stage2: build the speaker verification eval instance with backbone model - model = SpeakerIdetification( - backbone=ecapa_tdnn, num_class=VoxCeleb1.num_speakers) - - # stage3: load the pre-trained model - args.load_checkpoint = os.path.abspath( - os.path.expanduser(args.load_checkpoint)) - - # load model checkpoint to sid model - state_dict = paddle.load( - os.path.join(args.load_checkpoint, 'model.pdparams')) - model.set_state_dict(state_dict) - logger.info(f'Checkpoint loaded from {args.load_checkpoint}') - - # stage4: construct the enroll and test dataloader - enrol_ds = VoxCeleb1( - subset='enrol', - target_dir=args.data_dir, - feat_type='melspectrogram', - random_chunk=False, - **cpu_feat_conf) - enrol_sampler = BatchSampler( - enrol_ds, batch_size=args.batch_size, - shuffle=True) # Shuffle to make embedding normalization more robust. - enrol_loader = DataLoader(enrol_ds, - batch_sampler=enrol_sampler, - collate_fn=lambda x: feature_normalize( - x, mean_norm=True, std_norm=False), - num_workers=args.num_workers, - return_list=True,) - - test_ds = VoxCeleb1( - subset='test', - target_dir=args.data_dir, - feat_type='melspectrogram', - random_chunk=False, - **cpu_feat_conf) - - test_sampler = BatchSampler( - test_ds, batch_size=args.batch_size, shuffle=True) - test_loader = DataLoader(test_ds, - batch_sampler=test_sampler, - collate_fn=lambda x: feature_normalize( - x, mean_norm=True, std_norm=False), - num_workers=args.num_workers, - return_list=True,) - # stage6: we must set the model to eval mode - model.eval() - - # stage7: global embedding norm to imporve the performance - if args.global_embedding_norm: - global_embedding_mean = None - global_embedding_std = None - mean_norm_flag = args.embedding_mean_norm - std_norm_flag = args.embedding_std_norm - batch_count = 0 - - # stage8: Compute embeddings of audios in enrol and test dataset from model. - id2embedding = {} - # Run multi times to make embedding normalization more stable. - for i in range(2): - for dl in [enrol_loader, test_loader]: - logger.info( - f'Loop {[i+1]}: Computing embeddings on {dl.dataset.subset} dataset' - ) - with paddle.no_grad(): - for batch_idx, batch in enumerate(tqdm(dl)): - - # stage 8-1: extrac the audio embedding - ids, feats, lengths = batch['ids'], batch['feats'], batch[ - 'lengths'] - embeddings = model.backbone(feats, lengths).squeeze( - -1).numpy() # (N, emb_size, 1) -> (N, emb_size) - - # Global embedding normalization. - if args.global_embedding_norm: - batch_count += 1 - current_mean = embeddings.mean( - axis=0) if mean_norm_flag else 0 - current_std = embeddings.std( - axis=0) if std_norm_flag else 1 - # Update global mean and std. 
- if global_embedding_mean is None and global_embedding_std is None: - global_embedding_mean, global_embedding_std = current_mean, current_std - else: - weight = 1 / batch_count # Weight decay by batches. - global_embedding_mean = ( - 1 - weight - ) * global_embedding_mean + weight * current_mean - global_embedding_std = ( - 1 - weight - ) * global_embedding_std + weight * current_std - # Apply global embedding normalization. - embeddings = (embeddings - global_embedding_mean - ) / global_embedding_std - - # Update embedding dict. - id2embedding.update(dict(zip(ids, embeddings))) - - # stage 9: Compute cosine scores. - labels = [] - enrol_ids = [] - test_ids = [] - with open(VoxCeleb1.veri_test_file, 'r') as f: - for line in f.readlines(): - label, enrol_id, test_id = line.strip().split(' ') - labels.append(int(label)) - enrol_ids.append(enrol_id.split('.')[0].replace('/', '-')) - test_ids.append(test_id.split('.')[0].replace('/', '-')) - - cos_sim_func = paddle.nn.CosineSimilarity(axis=1) - enrol_embeddings, test_embeddings = map(lambda ids: paddle.to_tensor( - np.asarray([id2embedding[id] for id in ids], dtype='float32')), - [enrol_ids, test_ids - ]) # (N, emb_size) - scores = cos_sim_func(enrol_embeddings, test_embeddings) - EER, threshold = compute_eer(np.asarray(labels), scores.numpy()) - logger.info( - f'EER of verification test: {EER*100:.4f}%, score threshold: {threshold:.5f}' - ) - - -if __name__ == "__main__": - # yapf: disable - parser = argparse.ArgumentParser(__doc__) - parser.add_argument('--device', - choices=['cpu', 'gpu'], - default="gpu", - help="Select which device to train model, defaults to gpu.") - parser.add_argument("--seed", - default=0, - type=int, - help="random seed for paddle, numpy and python random package") - parser.add_argument("--data-dir", - default="./data/", - type=str, - help="data directory") - parser.add_argument("--batch-size", - type=int, - default=16, - help="Total examples' number in batch for extract the embedding.") - parser.add_argument("--num-workers", - type=int, - default=0, - help="Number of workers in dataloader.") - parser.add_argument("--load-checkpoint", - type=str, - default='', - help="Directory to load model checkpoint to contiune trainning.") - parser.add_argument("--global-embedding-norm", - type=bool, - default=True, - help="Apply global normalization on speaker embeddings.") - parser.add_argument("--embedding-mean-norm", - type=bool, - default=True, - help="Apply mean normalization on speaker embeddings.") - parser.add_argument("--embedding-std-norm", - type=bool, - default=False, - help="Apply std normalization on speaker embeddings.") - args = parser.parse_args() - # yapf: enable - - main(args) diff --git a/examples/voxceleb/sv0/local/train.py b/examples/voxceleb/sv0/local/train.py deleted file mode 100644 index 3fe67c8e..00000000 --- a/examples/voxceleb/sv0/local/train.py +++ /dev/null @@ -1,326 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-import argparse -import os - -import numpy as np -import paddle -from paddle.io import BatchSampler -from paddle.io import DataLoader -from paddle.io import DistributedBatchSampler - -from paddleaudio.datasets.voxceleb import VoxCeleb1 -from paddleaudio.features.core import melspectrogram -from paddlespeech.s2t.utils.log import Log -from paddlespeech.vector.io.augment import build_augment_pipeline -from paddlespeech.vector.io.augment import waveform_augment -from paddlespeech.vector.io.batch import feature_normalize -from paddlespeech.vector.io.batch import waveform_collate_fn -from paddlespeech.vector.models.ecapa_tdnn import EcapaTdnn -from paddlespeech.vector.modules.loss import AdditiveAngularMargin -from paddlespeech.vector.modules.loss import LogSoftmaxWrapper -from paddlespeech.vector.modules.lr import CyclicLRScheduler -from paddlespeech.vector.modules.sid_model import SpeakerIdetification -from paddlespeech.vector.training.seeding import seed_everything -from paddlespeech.vector.utils.time import Timer - -logger = Log(__name__).getlog() - -# feat configuration -cpu_feat_conf = { - 'n_mels': 80, - 'window_size': 400, #ms - 'hop_length': 160, #ms -} - - -def main(args): - # stage0: set the training device, cpu or gpu - paddle.set_device(args.device) - - # stage1: we must call the paddle.distributed.init_parallel_env() api at the begining - paddle.distributed.init_parallel_env() - nranks = paddle.distributed.get_world_size() - local_rank = paddle.distributed.get_rank() - # set the random seed, it is a must for multiprocess training - seed_everything(args.seed) - - # stage2: data prepare, such vox1 and vox2 data, and augment noise data and pipline - # note: some cmd must do in rank==0, so wo will refactor the data prepare code - train_dataset = VoxCeleb1('train', target_dir=args.data_dir) - dev_dataset = VoxCeleb1('dev', target_dir=args.data_dir) - - if args.augment: - augment_pipeline = build_augment_pipeline(target_dir=args.data_dir) - else: - augment_pipeline = [] - - # stage3: build the dnn backbone model network - #"channels": [1024, 1024, 1024, 1024, 3072], - model_conf = { - "input_size": 80, - "channels": [512, 512, 512, 512, 1536], - "kernel_sizes": [5, 3, 3, 3, 1], - "dilations": [1, 2, 3, 4, 1], - "attention_channels": 128, - "lin_neurons": 192, - } - ecapa_tdnn = EcapaTdnn(**model_conf) - - # stage4: build the speaker verification train instance with backbone model - model = SpeakerIdetification( - backbone=ecapa_tdnn, num_class=VoxCeleb1.num_speakers) - - # stage5: build the optimizer, we now only construct the AdamW optimizer - lr_schedule = CyclicLRScheduler( - base_lr=args.learning_rate, max_lr=1e-3, step_size=140000 // nranks) - optimizer = paddle.optimizer.AdamW( - learning_rate=lr_schedule, parameters=model.parameters()) - - # stage6: build the loss function, we now only support LogSoftmaxWrapper - criterion = LogSoftmaxWrapper( - loss_fn=AdditiveAngularMargin(margin=0.2, scale=30)) - - # stage7: confirm training start epoch - # if pre-trained model exists, start epoch confirmed by the pre-trained model - start_epoch = 0 - if args.load_checkpoint: - logger.info("load the check point") - args.load_checkpoint = os.path.abspath( - os.path.expanduser(args.load_checkpoint)) - try: - # load model checkpoint - state_dict = paddle.load( - os.path.join(args.load_checkpoint, 'model.pdparams')) - model.set_state_dict(state_dict) - - # load optimizer checkpoint - state_dict = paddle.load( - os.path.join(args.load_checkpoint, 'model.pdopt')) - 
optimizer.set_state_dict(state_dict) - if local_rank == 0: - logger.info(f'Checkpoint loaded from {args.load_checkpoint}') - except FileExistsError: - if local_rank == 0: - logger.info('Train from scratch.') - - try: - start_epoch = int(args.load_checkpoint[-1]) - logger.info(f'Restore training from epoch {start_epoch}.') - except ValueError: - pass - - # stage8: we build the batch sampler for paddle.DataLoader - train_sampler = DistributedBatchSampler( - train_dataset, batch_size=args.batch_size, shuffle=True, drop_last=False) - train_loader = DataLoader( - train_dataset, - batch_sampler=train_sampler, - num_workers=args.num_workers, - collate_fn=waveform_collate_fn, - return_list=True, - use_buffer_reader=True, ) - - # stage9: start to train - # we will comment the training process - steps_per_epoch = len(train_sampler) - timer = Timer(steps_per_epoch * args.epochs) - timer.start() - - for epoch in range(start_epoch + 1, args.epochs + 1): - # at the begining, model must set to train mode - model.train() - - avg_loss = 0 - num_corrects = 0 - num_samples = 0 - for batch_idx, batch in enumerate(train_loader): - # stage 9-1: batch data is audio sample points and speaker id label - waveforms, labels = batch['waveforms'], batch['labels'] - - # stage 9-2: audio sample augment method, which is done on the audio sample point - if len(augment_pipeline) != 0: - waveforms = waveform_augment(waveforms, augment_pipeline) - labels = paddle.concat( - [labels for i in range(len(augment_pipeline) + 1)]) - - # stage 9-3: extract the audio feats,such fbank, mfcc, spectrogram - feats = [] - for waveform in waveforms.numpy(): - feat = melspectrogram(x=waveform, **cpu_feat_conf) - feats.append(feat) - feats = paddle.to_tensor(np.asarray(feats)) - - # stage 9-4: feature normalize, which help converge and imporve the performance - feats = feature_normalize( - feats, mean_norm=True, std_norm=False) # Features normalization - - # stage 9-5: model forward, such ecapa-tdnn, x-vector - logits = model(feats) - - # stage 9-6: loss function criterion, such AngularMargin, AdditiveAngularMargin - loss = criterion(logits, labels) - - # stage 9-7: update the gradient and clear the gradient cache - loss.backward() - optimizer.step() - if isinstance(optimizer._learning_rate, - paddle.optimizer.lr.LRScheduler): - optimizer._learning_rate.step() - optimizer.clear_grad() - - # stage 9-8: Calculate average loss per batch - avg_loss += loss.numpy()[0] - - # stage 9-9: Calculate metrics, which is one-best accuracy - preds = paddle.argmax(logits, axis=1) - num_corrects += (preds == labels).numpy().sum() - num_samples += feats.shape[0] - timer.count() # step plus one in timer - - # stage 9-10: print the log information only on 0-rank per log-freq batchs - if (batch_idx + 1) % args.log_freq == 0 and local_rank == 0: - lr = optimizer.get_lr() - avg_loss /= args.log_freq - avg_acc = num_corrects / num_samples - - print_msg = 'Train Epoch={}/{}, Step={}/{}'.format( - epoch, args.epochs, batch_idx + 1, steps_per_epoch) - print_msg += ' loss={:.4f}'.format(avg_loss) - print_msg += ' acc={:.4f}'.format(avg_acc) - print_msg += ' lr={:.4E} step/sec={:.2f} | ETA {}'.format( - lr, timer.timing, timer.eta) - logger.info(print_msg) - - avg_loss = 0 - num_corrects = 0 - num_samples = 0 - - # stage 9-11: save the model parameters only on 0-rank per save-freq batchs - if epoch % args.save_freq == 0 and batch_idx + 1 == steps_per_epoch: - if local_rank != 0: - paddle.distributed.barrier( - ) # Wait for valid step in main process - continue # Resume 
trainning on other process - - # stage 9-12: construct the valid dataset dataloader - dev_sampler = BatchSampler( - dev_dataset, - batch_size=args.batch_size // 4, - shuffle=False, - drop_last=False) - dev_loader = DataLoader( - dev_dataset, - batch_sampler=dev_sampler, - collate_fn=waveform_collate_fn, - num_workers=args.num_workers, - return_list=True, ) - - # set the model to eval mode - model.eval() - num_corrects = 0 - num_samples = 0 - - # stage 9-13: evaluation the valid dataset batch data - logger.info('Evaluate on validation dataset') - with paddle.no_grad(): - for batch_idx, batch in enumerate(dev_loader): - waveforms, labels = batch['waveforms'], batch['labels'] - - feats = [] - for waveform in waveforms.numpy(): - feat = melspectrogram(x=waveform, **cpu_feat_conf) - feats.append(feat) - - feats = paddle.to_tensor(np.asarray(feats)) - feats = feature_normalize( - feats, mean_norm=True, std_norm=False) - logits = model(feats) - - preds = paddle.argmax(logits, axis=1) - num_corrects += (preds == labels).numpy().sum() - num_samples += feats.shape[0] - - print_msg = '[Evaluation result]' - print_msg += ' dev_acc={:.4f}'.format(num_corrects / num_samples) - logger.info(print_msg) - - # stage 9-14: Save model parameters - save_dir = os.path.join(args.checkpoint_dir, - 'epoch_{}'.format(epoch)) - logger.info('Saving model checkpoint to {}'.format(save_dir)) - paddle.save(model.state_dict(), - os.path.join(save_dir, 'model.pdparams')) - paddle.save(optimizer.state_dict(), - os.path.join(save_dir, 'model.pdopt')) - - if nranks > 1: - paddle.distributed.barrier() # Main process - - -if __name__ == "__main__": - # yapf: disable - parser = argparse.ArgumentParser(__doc__) - parser.add_argument('--device', - choices=['cpu', 'gpu'], - default="cpu", - help="Select which device to train model, defaults to gpu.") - parser.add_argument("--seed", - default=0, - type=int, - help="random seed for paddle, numpy and python random package") - parser.add_argument("--data-dir", - default="./data/", - type=str, - help="data directory") - parser.add_argument("--learning-rate", - type=float, - default=1e-8, - help="Learning rate used to train with warmup.") - parser.add_argument("--load-checkpoint", - type=str, - default=None, - help="Directory to load model checkpoint to contiune trainning.") - parser.add_argument("--batch-size", - type=int, default=64, - help="Total examples' number in batch for training.") - parser.add_argument("--num-workers", - type=int, - default=0, - help="Number of workers in dataloader.") - parser.add_argument("--epochs", - type=int, - default=50, - help="Number of epoches for fine-tuning.") - parser.add_argument("--log-freq", - type=int, - default=10, - help="Log the training infomation every n steps.") - parser.add_argument("--save-freq", - type=int, - default=1, - help="Save checkpoint every n epoch.") - parser.add_argument("--checkpoint-dir", - type=str, - default='./checkpoint', - help="Directory to save model checkpoints.") - parser.add_argument("--augment", - action="store_true", - default=False, - help="Apply audio augments.") - - args = parser.parse_args() - # yapf: enable - - main(args) diff --git a/examples/voxceleb/sv0/path.sh b/examples/voxceleb/sv0/path.sh index 38a242a4..6d19f994 100755 --- a/examples/voxceleb/sv0/path.sh +++ b/examples/voxceleb/sv0/path.sh @@ -9,3 +9,6 @@ export PYTHONIOENCODING=UTF-8 export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib/ + +MODEL=ecapa-tdnn +export 
BIN_DIR=${MAIN_ROOT}/paddlespeech/vector/exps/${MODEL} \ No newline at end of file diff --git a/examples/voxceleb/sv0/run.sh b/examples/voxceleb/sv0/run.sh index a2336fb6..a6346cd5 100755 --- a/examples/voxceleb/sv0/run.sh +++ b/examples/voxceleb/sv0/run.sh @@ -30,23 +30,21 @@ if [ $stage -le 1 ]; then # stage 1: train the speaker identification model python3 \ -m paddle.distributed.launch --gpus=0,1,2,3 \ - local/train.py --device "gpu" --checkpoint-dir ${exp_dir} --augment \ + ${BIN_DIR}/train.py --device "gpu" --checkpoint-dir ${exp_dir} --augment \ --save-freq 10 --data-dir ${dir} --batch-size 64 --epochs 100 fi if [ $stage -le 2 ]; then - # stage 1: train the speaker identification model - # you can set the variable PPAUDIO_HOME to specifiy the downloaded the vox1 and vox2 dataset + # stage 2: get the speaker verification scores with cosine function python3 \ - local/speaker_verification_cosine.py\ + ${BIN_DIR}/speaker_verification_cosine.py\ --batch-size 4 --data-dir ${dir} --load-checkpoint ${exp_dir}/epoch_10/ fi if [ $stage -le 3 ]; then - # stage 1: train the speaker identification model - # you can set the variable PPAUDIO_HOME to specifiy the downloaded the vox1 and vox2 dataset + # stage 3: extract the audio embedding python3 \ - local/extract_speaker_embedding.py\ + ${BIN_DIR}/extract_speaker_embedding.py\ --audio-path "demo/csv/00001.wav" --load-checkpoint ${exp_dir}/epoch_60/ fi diff --git a/paddleaudio/paddleaudio/datasets/__init__.py b/paddleaudio/paddleaudio/datasets/__init__.py index 5c5f0369..cbf9b3ae 100644 --- a/paddleaudio/paddleaudio/datasets/__init__.py +++ b/paddleaudio/paddleaudio/datasets/__init__.py @@ -15,3 +15,5 @@ from .esc50 import ESC50 from .gtzan import GTZAN from .tess import TESS from .urban_sound import UrbanSound8K +from .voxceleb import VoxCeleb1 +from .rirs_noises import OpenRIRNoise diff --git a/paddleaudio/datasets/rirs_noises.py b/paddleaudio/paddleaudio/datasets/rirs_noises.py similarity index 97% rename from paddleaudio/datasets/rirs_noises.py rename to paddleaudio/paddleaudio/datasets/rirs_noises.py index 6af9fd9d..df5dec61 100644 --- a/paddleaudio/datasets/rirs_noises.py +++ b/paddleaudio/paddleaudio/datasets/rirs_noises.py @@ -23,11 +23,11 @@ from typing import Tuple from paddle.io import Dataset from tqdm import tqdm -from paddleaudio.backends import load as load_audio -from paddleaudio.backends import save_wav -from paddleaudio.datasets.dataset import feat_funcs -from paddleaudio.utils import DATA_HOME -from paddleaudio.utils import decompress +from ..backends import load as load_audio +from ..backends import save as save_wav +from .dataset import feat_funcs +from ..utils import DATA_HOME +from ..utils import decompress from paddlespeech.s2t.utils.log import Log from paddlespeech.vector.utils.download import download_and_decompress diff --git a/paddleaudio/datasets/voxceleb.py b/paddleaudio/paddleaudio/datasets/voxceleb.py similarity index 97% rename from paddleaudio/datasets/voxceleb.py rename to paddleaudio/paddleaudio/datasets/voxceleb.py index 0011340e..4989accb 100644 --- a/paddleaudio/datasets/voxceleb.py +++ b/paddleaudio/paddleaudio/datasets/voxceleb.py @@ -25,10 +25,10 @@ from paddle.io import Dataset from pathos.multiprocessing import Pool from tqdm import tqdm -from paddleaudio.backends import load as load_audio -from paddleaudio.datasets.dataset import feat_funcs -from paddleaudio.utils import DATA_HOME -from paddleaudio.utils import decompress +from .dataset import feat_funcs +from ..backends import load as load_audio +from
..utils import DATA_HOME +from ..utils import decompress from paddlespeech.s2t.utils.log import Log from paddlespeech.vector.utils.download import download_and_decompress from utils.utility import download @@ -83,7 +83,7 @@ class VoxCeleb1(Dataset): meta_path = os.path.join(base_path, 'meta') veri_test_file = os.path.join(meta_path, 'veri_test2.txt') csv_path = os.path.join(base_path, 'csv') - subsets = ['train', 'dev', 'enrol', 'test'] + subsets = ['train', 'dev', 'enroll', 'test'] def __init__( self, @@ -330,7 +330,7 @@ class VoxCeleb1(Dataset): self.generate_csv( enroll_files, - os.path.join(self.csv_path, 'enrol.csv'), + os.path.join(self.csv_path, 'enroll.csv'), split_chunks=False) self.generate_csv( test_files, diff --git a/paddleaudio/paddleaudio/metric/__init__.py b/paddleaudio/paddleaudio/metric/__init__.py index a96530ff..b435571d 100644 --- a/paddleaudio/paddleaudio/metric/__init__.py +++ b/paddleaudio/paddleaudio/metric/__init__.py @@ -13,3 +13,4 @@ # limitations under the License. from .dtw import dtw_distance from .mcd import mcd_distance +from .eer import compute_eer diff --git a/paddlespeech/vector/training/metrics.py b/paddleaudio/paddleaudio/metric/eer.py similarity index 100% rename from paddlespeech/vector/training/metrics.py rename to paddleaudio/paddleaudio/metric/eer.py diff --git a/paddlespeech/vector/io/augment.py b/paddlespeech/vector/io/augment.py index 366c0cff..76312978 100644 --- a/paddlespeech/vector/io/augment.py +++ b/paddlespeech/vector/io/augment.py @@ -20,8 +20,8 @@ import paddle import paddle.nn as nn import paddle.nn.functional as F -from paddleaudio.backends import load as load_audio -from paddleaudio.datasets.rirs_noises import OpenRIRNoise +from paddleaudio.paddleaudio import load as load_audio +from paddleaudio.paddleaudio.datasets.rirs_noises import OpenRIRNoise from paddlespeech.s2t.utils.log import Log from paddlespeech.vector.io.signal_processing import compute_amplitude from paddlespeech.vector.io.signal_processing import convolve1d diff --git a/paddlespeech/vector/io/batch.py b/paddlespeech/vector/io/batch.py index 879cde3a..811775e2 100644 --- a/paddlespeech/vector/io/batch.py +++ b/paddlespeech/vector/io/batch.py @@ -40,3 +40,41 @@ def feature_normalize(feats: paddle.Tensor, feats = (feats - mean) / std return feats + + +def pad_right_2d(x, target_length, axis=-1, mode='constant', **kwargs): + x = np.asarray(x) + assert len( + x.shape) == 2, f'Only 2D arrays supported, but got shape: {x.shape}' + + w = target_length - x.shape[axis] + assert w >= 0, f'Target length {target_length} is less than origin length {x.shape[axis]}' + + if axis == 0: + pad_width = [[0, w], [0, 0]] + else: + pad_width = [[0, 0], [0, w]] + + return np.pad(x, pad_width, mode=mode, **kwargs) + +def batch_feature_normalize(batch, mean_norm: bool=True, std_norm: bool=True): + ids = [item['id'] for item in batch] + lengths = np.asarray([item['feat'].shape[1] for item in batch]) + feats = list( + map(lambda x: pad_right_2d(x, lengths.max()), + [item['feat'] for item in batch])) + feats = np.stack(feats) + + # Feature normalization if needed + for i in range(len(feats)): + feat = feats[i][:, :lengths[i]] # Excluding pad values. + mean = feat.mean(axis=-1, keepdims=True) if mean_norm else 0 + std = feat.std(axis=-1, keepdims=True) if std_norm else 1 + feats[i][:, :lengths[i]] = (feat - mean) / std + assert feats[i][:, lengths[ + i]:].sum() == 0 # Padding values should all be 0. + + # Converts into ratios.
+ lengths = (lengths / lengths.max()).astype(np.float32) + + return {'ids': ids, 'feats': feats, 'lengths': lengths} \ No newline at end of file diff --git a/paddlespeech/vector/modules/lr.py b/paddlespeech/vector/training/scheduler.py similarity index 100% rename from paddlespeech/vector/modules/lr.py rename to paddlespeech/vector/training/scheduler.py From 993d6783d7ca15d190892037e92111d2e50d3326 Mon Sep 17 00:00:00 2001 From: xiongxinlei Date: Wed, 9 Mar 2022 15:39:57 +0800 Subject: [PATCH 24/41] remove unused code, test=doc --- examples/voxceleb/sv0/local/data.sh | 25 ---- paddlespeech/vector/__init__.py | 30 +--- paddlespeech/vector/datasets/ark_dataset.py | 142 ------------------- paddlespeech/vector/datasets/dataset.py | 143 -------------------- paddlespeech/vector/datasets/egs_dataset.py | 91 ------------- paddlespeech/vector/utils/data_utils.py | 125 ----------------- paddlespeech/vector/utils/utils.py | 132 ------------------ 7 files changed, 1 insertion(+), 687 deletions(-) delete mode 100755 examples/voxceleb/sv0/local/data.sh delete mode 100755 paddlespeech/vector/datasets/ark_dataset.py delete mode 100644 paddlespeech/vector/datasets/dataset.py delete mode 100644 paddlespeech/vector/datasets/egs_dataset.py delete mode 100755 paddlespeech/vector/utils/data_utils.py delete mode 100755 paddlespeech/vector/utils/utils.py diff --git a/examples/voxceleb/sv0/local/data.sh b/examples/voxceleb/sv0/local/data.sh deleted file mode 100755 index 6df9c3b8..00000000 --- a/examples/voxceleb/sv0/local/data.sh +++ /dev/null @@ -1,25 +0,0 @@ -stage=-1 -stop_stage=100 -TARGET_DIR=${MAIN_ROOT}/dataset - -. utils/parse_options.sh || exit -1; - -src=$1 -mkdir -p data/{dev,test} -if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then - # download data, generate manifests - # create data/{dev,test} directory to store the manifest files - python3 ${TARGET_DIR}/voxceleb/voxceleb1.py \ - --manifest_prefix="data/manifest" \ - --target_dir="${src}" - - if [ $? -ne 0 ]; then - echo "Prepare Voxceleb failed. Terminated." - exit 1 - fi - mv data/manifest.dev data/dev - mv data/voxceleb1.dev.meta data/dev - - mv data/manifest.test data/test - mv data/voxceleb1.test.meta data/test -fi diff --git a/paddlespeech/vector/__init__.py b/paddlespeech/vector/__init__.py index 5c846193..61d5aa21 100644 --- a/paddlespeech/vector/__init__.py +++ b/paddlespeech/vector/__init__.py @@ -10,32 +10,4 @@ # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and -# limitations under the License. - -""" -__init__ file for sidt package. -""" - -import logging as sidt_logging -import colorlog - -LOG_COLOR_CONFIG = { - 'DEBUG': 'white', - 'INFO': 'white', - 'WARNING': 'yellow', - 'ERROR': 'red', - 'CRITICAL': 'purple', -} - -# 设置全局的logger -colored_formatter = colorlog.ColoredFormatter( - '%(log_color)s [%(levelname)s] [%(asctime)s] [%(filename)s:%(lineno)d] - %(message)s', - datefmt="%Y-%m-%d %H:%M:%S", - log_colors=LOG_COLOR_CONFIG) # 日志输出格式 -_logger = sidt_logging.getLogger("sidt") -handler = colorlog.StreamHandler() -handler.setLevel(sidt_logging.INFO) -handler.setFormatter(colored_formatter) -_logger.addHandler(handler) -_logger.setLevel(sidt_logging.INFO) - +# limitations under the License. 
\ No newline at end of file diff --git a/paddlespeech/vector/datasets/ark_dataset.py b/paddlespeech/vector/datasets/ark_dataset.py deleted file mode 100755 index 7a00e7ba..00000000 --- a/paddlespeech/vector/datasets/ark_dataset.py +++ /dev/null @@ -1,142 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import sys -import random -import numpy as np -import kaldi_python_io as k_io -from paddle.io import Dataset -from paddlespeech.vector.utils.data_utils import batch_pad_right -import paddlespeech.vector.utils as utils -from paddlespeech.vector.utils.utils import read_map_file -from paddlespeech.vector import _logger as log - -def ark_collate_fn(batch): - """ - Custom collate function] for kaldi feats dataset - - Args: - min_chunk_size: min chunk size of a utterance - max_chunk_size: max chunk size of a utterance - - Returns: - ark_collate_fn: collate funtion for dataloader - """ - - data = [] - target = [] - for items in batch: - for x, y in zip(items[0], items[1]): - data.append(np.array(x)) - target.append(y) - - data, lengths = batch_pad_right(data) - return np.array(data, dtype=np.float32), \ - np.array(lengths, dtype=np.float32), \ - np.array(target, dtype=np.long).reshape((len(target), 1)) - - -class KaldiArkDataset(Dataset): - """ - Dataset used to load kaldi ark/scp files. - """ - def __init__(self, scp_file, label2utt, min_item_size=1, - max_item_size=1, repeat=50, min_chunk_size=200, - max_chunk_size=400, select_by_speaker=True): - self.scp_file = scp_file - self.scp_reader = None - self.repeat = repeat - self.min_item_size = min_item_size - self.max_item_size = max_item_size - self.min_chunk_size = min_chunk_size - self.max_chunk_size = max_chunk_size - self._collate_fn = ark_collate_fn - self._is_select_by_speaker = select_by_speaker - if utils.is_exist(self.scp_file): - self.scp_reader = k_io.ScriptReader(self.scp_file) - - label2utts, utt2label = read_map_file(label2utt, key_func=int) - self.utt_info = list(label2utts.items()) if self._is_select_by_speaker else list(utt2label.items()) - - @property - def collate_fn(self): - """ - Return a collate funtion. 
- """ - return self._collate_fn - - def _random_chunk(self, length): - chunk_size = random.randint(self.min_chunk_size, self.max_chunk_size) - if chunk_size >= length: - return 0, length - start = random.randint(0, length - chunk_size) - end = start + chunk_size - - return start, end - - def _select_by_speaker(self, index): - if self.scp_reader is None or not self.utt_info: - return [] - index = index % (len(self.utt_info)) - inputs = [] - labels = [] - item_size = random.randint(self.min_item_size, self.max_item_size) - for loop_idx in range(item_size): - try: - utt_index = random.randint(0, len(self.utt_info[index][1])) \ - % len(self.utt_info[index][1]) - key = self.utt_info[index][1][utt_index] - except: - print(index, utt_index, len(self.utt_info[index][1])) - sys.exit(-1) - x = self.scp_reader[key] - x = np.transpose(x) - bg, end = self._random_chunk(x.shape[-1]) - inputs.append(x[:, bg: end]) - labels.append(self.utt_info[index][0]) - return inputs, labels - - def _select_by_utt(self, index): - if self.scp_reader is None or len(self.utt_info) == 0: - return {} - index = index % (len(self.utt_info)) - key = self.utt_info[index][0] - x = self.scp_reader[key] - x = np.transpose(x) - bg, end = self._random_chunk(x.shape[-1]) - - y = self.utt_info[index][1] - - return [x[:, bg: end]], [y] - - def __getitem__(self, index): - if self._is_select_by_speaker: - return self._select_by_speaker(index) - else: - return self._select_by_utt(index) - - def __len__(self): - return len(self.utt_info) * self.repeat - - def __iter__(self): - self._start = 0 - return self - - def __next__(self): - if self._start < len(self): - ret = self[self._start] - self._start += 1 - return ret - else: - raise StopIteration diff --git a/paddlespeech/vector/datasets/dataset.py b/paddlespeech/vector/datasets/dataset.py deleted file mode 100644 index e7030053..00000000 --- a/paddlespeech/vector/datasets/dataset.py +++ /dev/null @@ -1,143 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import sys -import random -import numpy as np -import kaldi_python_io as k_io -from paddle.io import Dataset -from paddlespeech.vector.utils.data_utils import batch_pad_right -import paddlespeech.vector.utils as utils -from paddlespeech.vector.utils.utils import read_map_file - -def ark_collate_fn(batch): - """ - Custom collate function for kaldi feats dataset - - Args: - min_chunk_size: min chunk size of a utterance - max_chunk_size: max chunk size of a utterance - - Returns: - ark_collate_fn: collate funtion for dataloader - """ - - data = [] - target = [] - for items in batch: - for x, y in zip(items[0], items[1]): - data.append(np.array(x)) - target.append(y) - - data, lengths = batch_pad_right(data) - return np.array(data, dtype=np.float32), \ - np.array(lengths, dtype=np.float32), \ - np.array(target, dtype=np.long).reshape((len(target), 1)) - - -class KaldiArkDataset(Dataset): - """ - Dataset used to load kaldi ark/scp files. 
- """ - def __init__(self, scp_file, label2utt, min_item_size=1, - max_item_size=1, repeat=50, min_chunk_size=200, - max_chunk_size=400, select_by_speaker=True): - self.scp_file = scp_file - self.scp_reader = None - self.repeat = repeat - self.min_item_size = min_item_size - self.max_item_size = max_item_size - self.min_chunk_size = min_chunk_size - self.max_chunk_size = max_chunk_size - self._collate_fn = ark_collate_fn - self._is_select_by_speaker = select_by_speaker - if utils.is_exist(self.scp_file): - self.scp_reader = k_io.ScriptReader(self.scp_file) - - label2utts, utt2label = read_map_file(label2utt, key_func=int) - self.utt_info = list(label2utts.items()) if self._is_select_by_speaker else list(utt2label.items()) - - @property - def collate_fn(self): - """ - Return a collate funtion. - """ - return self._collate_fn - - def _random_chunk(self, length): - chunk_size = random.randint(self.min_chunk_size, self.max_chunk_size) - if chunk_size >= length: - return 0, length - start = random.randint(0, length - chunk_size) - end = start + chunk_size - - return start, end - - def _select_by_speaker(self, index): - if self.scp_reader is None or not self.utt_info: - return [] - index = index % (len(self.utt_info)) - inputs = [] - labels = [] - item_size = random.randint(self.min_item_size, self.max_item_size) - for loop_idx in range(item_size): - try: - utt_index = random.randint(0, len(self.utt_info[index][1])) \ - % len(self.utt_info[index][1]) - key = self.utt_info[index][1][utt_index] - except: - print(index, utt_index, len(self.utt_info[index][1])) - sys.exit(-1) - x = self.scp_reader[key] - x = np.transpose(x) - bg, end = self._random_chunk(x.shape[-1]) - inputs.append(x[:, bg: end]) - labels.append(self.utt_info[index][0]) - return inputs, labels - - def _select_by_utt(self, index): - if self.scp_reader is None or len(self.utt_info) == 0: - return {} - index = index % (len(self.utt_info)) - key = self.utt_info[index][0] - x = self.scp_reader[key] - x = np.transpose(x) - bg, end = self._random_chunk(x.shape[-1]) - - y = self.utt_info[index][1] - - return [x[:, bg: end]], [y] - - def __getitem__(self, index): - if self._is_select_by_speaker: - return self._select_by_speaker(index) - else: - return self._select_by_utt(index) - - def __len__(self): - return len(self.utt_info) * self.repeat - - def __iter__(self): - self._start = 0 - return self - - def __next__(self): - if self._start < len(self): - ret = self[self._start] - self._start += 1 - return ret - else: - raise StopIteration - -return KaldiArkDataset diff --git a/paddlespeech/vector/datasets/egs_dataset.py b/paddlespeech/vector/datasets/egs_dataset.py deleted file mode 100644 index 53130d5f..00000000 --- a/paddlespeech/vector/datasets/egs_dataset.py +++ /dev/null @@ -1,91 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -""" -Load nnet3 training egs which generated by kaldi -""" - -import random -import numpy as np -import kaldi_python_io as k_io -from paddle.io import Dataset -import paddlespeech.vector.utils.utils as utils -from paddlespeech.vector import _logger as log -class KaldiEgsDataset(Dataset): - """ - Dataset used to load kaldi nnet3 egs files. - """ - def __init__(self, egs_list_file, egs_idx, transforms=None): - self.scp_reader = None - self.subset_idx = egs_idx - 1 - self.transforms = transforms - if not utils.is_exist(egs_list_file): - return - - self.egs_files = [] - with open(egs_list_file, 'r') as in_fh: - for line in in_fh: - if line.strip(): - self.egs_files.append(line.strip()) - - self.next_subset() - - def next_subset(self, target_index=None, delta_index=None): - """ - Use next specific subset - - Args: - target_index: target egs index - delta_index: incremental value of egs index - """ - if self.egs_files: - if target_index: - self.subset_idx = target_index - else: - delta_index = delta_index if delta_index else 1 - self.subset_idx += delta_index - log.info("egs dataset subset index: %d" % (self.subset_idx)) - egs_file = self.egs_files[self.subset_idx % len(self.egs_files)] - if utils.is_exist(egs_file): - self.scp_reader = k_io.Nnet3EgsScriptReader(egs_file) - else: - log.warning("No such file or directory: %s" % (egs_file)) - - def __getitem__(self, index): - if self.scp_reader is None: - return {} - index %= len(self) - in_dict, out_dict = self.scp_reader[index] - x = np.array(in_dict['matrix']) - x = np.transpose(x) - y = np.array(out_dict['matrix'][0][0][0], dtype=np.int).reshape((1,)) - if self.transforms is not None: - idx = random.randint(0, len(self.transforms) - 1) - x = self.transforms[idx](x) - return x, y - - def __len__(self): - return len(self.scp_reader) - - def __iter__(self): - self._start = 0 - return self - - def __next__(self): - if self._start < len(self): - ret = self[self._start] - self._start += 1 - return ret - else: - raise StopIteration \ No newline at end of file diff --git a/paddlespeech/vector/utils/data_utils.py b/paddlespeech/vector/utils/data_utils.py deleted file mode 100755 index 4a33a795..00000000 --- a/paddlespeech/vector/utils/data_utils.py +++ /dev/null @@ -1,125 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -data utilities -""" -import os -import sys -import numpy -import paddle - - -def pad_right_to(array, target_shape, mode="constant", value=0): - """ - This function takes a numpy array of arbitrary shape and pads it to target - shape by appending values on the right. - - Args: - array: input numpy array. Input array whose dimension we need to pad. - target_shape : (list, tuple). Target shape we want for the target array its len must be equal to array.ndim - mode : str. Pad mode, please refer to numpy.pad documentation. - value : float. Pad value, please refer to numpy.pad documentation. - - Returns: - array: numpy.array. Padded array. - valid_vals : list. 
List containing proportion for each dimension of original, non-padded values. - """ - assert len(target_shape) == array.ndim - pads = [] # this contains the abs length of the padding for each dimension. - valid_vals = [] # thic contains the relative lengths for each dimension. - i = 0 # iterating over target_shape ndims - while i < len(target_shape): - assert ( - target_shape[i] >= array.shape[i] - ), "Target shape must be >= original shape for every dim" - pads.append([0, target_shape[i] - array.shape[i]]) - valid_vals.append(array.shape[i] / target_shape[i]) - i += 1 - - array = numpy.pad(array, pads, mode=mode, constant_values=value) - - return array, valid_vals - - -def batch_pad_right(arrays, mode="constant", value=0): - """Given a list of numpy arrays it batches them together by padding to the right - on each dimension in order to get same length for all. - - Args: - arrays : list. List of array we wish to pad together. - mode : str. Padding mode see numpy.pad documentation. - value : float. Padding value see numpy.pad documentation. - - Returns: - array : numpy.array. Padded array. - valid_vals : list. List containing proportion for each dimension of original, non-padded values. - """ - - if not len(arrays): - raise IndexError("arrays list must not be empty") - - if len(arrays) == 1: - # if there is only one array in the batch we simply unsqueeze it. - return numpy.expand_dims(arrays[0], axis=0), numpy.array([1.0]) - - if not ( - any( - [arrays[i].ndim == arrays[0].ndim for i in range(1, len(arrays))] - ) - ): - raise IndexError("All arrays must have same number of dimensions") - - # FIXME we limit the support here: we allow padding of only the last dimension - # need to remove this when feat extraction is updated to handle multichannel. - max_shape = [] - for dim in range(arrays[0].ndim): - if dim != (arrays[0].ndim - 1): - if not all( - [x.shape[dim] == arrays[0].shape[dim] for x in arrays[1:]] - ): - raise EnvironmentError( - "arrays should have same dimensions except for last one" - ) - max_shape.append(max([x.shape[dim] for x in arrays])) - - batched = [] - valid = [] - for t in arrays: - # for each array we apply pad_right_to - padded, valid_percent = pad_right_to( - t, max_shape, mode=mode, value=value - ) - batched.append(padded) - valid.append(valid_percent[-1]) - - batched = numpy.stack(batched) - - return batched, numpy.array(valid) - - -def length_to_mask(length, max_len=None, dtype=None): - """Creates a binary mask for each sequence. - """ - assert len(length.shape) == 1 - - if max_len is None: - max_len = paddle.cast(paddle.max(length), dtype="int64") # using arange to generate mask - mask = paddle.arange(max_len, dtype=length.dtype).expand([paddle.shape(length)[0], max_len]) < length.unsqueeze(1) - - if dtype is None: - dtype = length.dtype - - mask = paddle.cast(mask, dtype=dtype) - return mask diff --git a/paddlespeech/vector/utils/utils.py b/paddlespeech/vector/utils/utils.py deleted file mode 100755 index a28cb526..00000000 --- a/paddlespeech/vector/utils/utils.py +++ /dev/null @@ -1,132 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -utilities -""" -import os -import sys -import paddle -import numpy as np - -from paddlespeech.vector import _logger as log - - -def exit_if_not_exist(in_path): - """ - Check the existence of a file or directory, if not exit, exit the program. - - Args: - in_path: input dicrector - """ - if not is_exist(in_path): - sys.exit(-1) - - -def is_exist(in_path): - """ - Check the existence of a file or directory - - Args: - in_path: input dicrector - - Returns: - True or False - """ - if not os.path.exists(in_path): - log.error("No such file or directory: %s" % (in_path)) - return False - - return True - - -def get_latest_file(target_dir): - """ - Get the latest file in target directory - - Args: - target_dir: target directory - - Returns: - latest_file: a string or None - """ - items = os.listdir(target_dir) - items.sort(key=lambda fn: os.path.getmtime(os.path.join(target_dir, fn)) \ - if not os.path.isdir(os.path.join(target_dir, fn)) else 0) - latest_file = None if not items else os.path.join(target_dir, items[-1]) - return latest_file - - -def avg_models(models): - """ - merge multiple models - """ - checkpoint_dict = paddle.load(models[0]) - final_state_dict = checkpoint_dict - - if len(models) > 1: - for model in models[1:]: - checkpoint_dict = paddle.load(model) - for k, v in checkpoint_dict.items(): - final_state_dict[k] += v - for k in final_state_dict.keys(): - final_state_dict[k] /= float(len(models)) - if np.any(np.isnan(final_state_dict[k])): - print("Nan in %s" % (k)) - - return final_state_dict - -def Q_from_tokens(token_num): - """ - get prior model, data from uniform, would support others(guassian) in future - """ - freq = [1] * token_num - Q = paddle.to_tensor(freq, dtype = 'float64') - return Q / Q.sum() - - -def read_map_file(map_file, key_func=None, value_func=None, values_func=None): - """ Read map file. First colume is key, the rest columes are values. 
- - Args: - map_file: map file - key_func: convert function for key - value_func: convert function for each value - values_func: convert function for values - - Returns: - dict: key 2 value - dict: value 2 key - """ - if not is_exist(map_file): - sys.exit(0) - - key2val = {} - val2key = {} - with open(map_file, 'r') as f: - for line in f: - line = line.strip() - if not line: - continue - items = line.split() - assert len(items) >= 2 - key = items[0] if not key_func else key_func(items[0]) - values = items[1:] if not value_func else [value_func(item) for item in items[1:]] - if values_func: - values = values_func(values) - key2val[key] = values - for value in values: - val2key[value] = key - - return key2val, val2key From 584a2c0e39ab73b4a5826077528eccb4edf7afbd Mon Sep 17 00:00:00 2001 From: xiongxinlei Date: Wed, 9 Mar 2022 20:46:57 +0800 Subject: [PATCH 25/41] add ecapa-tdnn config yaml file --- examples/voxceleb/sv0/conf/ecapa_tdnn.yaml | 35 ++ examples/voxceleb/sv0/run.sh | 6 +- .../ecapa-tdnn/extract_speaker_embedding.py | 112 +++++++ .../ecapa-tdnn/speaker_verification_cosine.py | 207 ++++++++++++ paddlespeech/vector/exps/ecapa-tdnn/train.py | 298 ++++++++++++++++++ 5 files changed, 656 insertions(+), 2 deletions(-) create mode 100644 examples/voxceleb/sv0/conf/ecapa_tdnn.yaml create mode 100644 paddlespeech/vector/exps/ecapa-tdnn/extract_speaker_embedding.py create mode 100644 paddlespeech/vector/exps/ecapa-tdnn/speaker_verification_cosine.py create mode 100644 paddlespeech/vector/exps/ecapa-tdnn/train.py diff --git a/examples/voxceleb/sv0/conf/ecapa_tdnn.yaml b/examples/voxceleb/sv0/conf/ecapa_tdnn.yaml new file mode 100644 index 00000000..33304054 --- /dev/null +++ b/examples/voxceleb/sv0/conf/ecapa_tdnn.yaml @@ -0,0 +1,35 @@ +########################################################### +# FEATURE EXTRACTION SETTING # +########################################################### +# currently, we only support fbank +feature: + n_mels: 80 + window_size: 400 #25ms, sample rate 16000, 25 * 16000 / 1000 = 400 + hop_length: 160 #10ms, sample rate 16000, 10 * 16000 / 1000 = 160 + + +########################################################### +# MODEL SETTING # +########################################################### +# currently, we only support ecapa-tdnn in the ecapa_tdnn.yaml +# if we want to use another model, please choose another configuration yaml file +model: + input_size: 80 + ##"channels": [1024, 1024, 1024, 1024, 3072], + # "channels": [512, 512, 512, 512, 1536], + channels: [512, 512, 512, 512, 1536] + kernel_sizes: [5, 3, 3, 3, 1] + dilations: [1, 2, 3, 4, 1] + attention_channels: 128 + lin_neurons: 192 + +########################################### + # Training # +########################################### +seed: 0 +epochs: 10 +batch_size: 32 +num_workers: 2 +save_freq: 10 +log_freq: 10 +learning_rate: 1e-8 diff --git a/examples/voxceleb/sv0/run.sh b/examples/voxceleb/sv0/run.sh index a6346cd5..2c0e55a6 100755 --- a/examples/voxceleb/sv0/run.sh +++ b/examples/voxceleb/sv0/run.sh @@ -31,20 +31,22 @@ if [ $stage -le 1 ]; then # stage 1: train the speaker identification model python3 \ -m paddle.distributed.launch --gpus=0,1,2,3 \ ${BIN_DIR}/train.py --device "gpu" --checkpoint-dir ${exp_dir} --augment \ - --save-freq 10 --data-dir ${dir} --batch-size 64 --epochs 100 + --data-dir ${dir} --config conf/ecapa_tdnn.yaml fi if [ $stage -le 2 ]; then # stage 2: get the speaker verification scores with cosine function python3 \ ${BIN_DIR}/speaker_verification_cosine.py\ - --batch-size 4 --data-dir ${dir} --load-checkpoint
${exp_dir}/epoch_10/
+        --config conf/ecapa_tdnn.yaml \
+        --data-dir ${dir} --load-checkpoint ${exp_dir}/epoch_10/
 fi

 if [ $stage -le 3 ]; then
     # stage 3: extract the audio embedding
     python3 \
     ${BIN_DIR}/extract_speaker_embedding.py\
+        --config conf/ecapa_tdnn.yaml \
         --audio-path "demo/csv/00001.wav" --load-checkpoint ${exp_dir}/epoch_60/
 fi

diff --git a/paddlespeech/vector/exps/ecapa-tdnn/extract_speaker_embedding.py b/paddlespeech/vector/exps/ecapa-tdnn/extract_speaker_embedding.py
new file mode 100644
index 00000000..78498c61
--- /dev/null
+++ b/paddlespeech/vector/exps/ecapa-tdnn/extract_speaker_embedding.py
@@ -0,0 +1,112 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+import os
+
+import numpy as np
+import paddle
+from yacs.config import CfgNode
+
+from paddleaudio.paddleaudio.backends import load as load_audio
+from paddleaudio.paddleaudio.compliance.librosa import melspectrogram
+from paddlespeech.s2t.utils.log import Log
+from paddlespeech.vector.io.batch import feature_normalize
+from paddlespeech.vector.models.ecapa_tdnn import EcapaTdnn
+from paddlespeech.vector.modules.sid_model import SpeakerIdetification
+from paddlespeech.vector.training.seeding import seed_everything
+
+logger = Log(__name__).getlog()
+
+def extract_audio_embedding(args, config):
+    # stage 0: set the training device, cpu or gpu
+    paddle.set_device(args.device)
+    # set the random seed, it is a must for multiprocess training
+    seed_everything(config.seed)
+
+    # stage 1: build the dnn backbone model network
+    ecapa_tdnn = EcapaTdnn(**config.model)
+
+    # stage 2: build the speaker verification inference instance with the backbone model
+    model = SpeakerIdetification(backbone=ecapa_tdnn, num_class=1211)
+
+    # stage 3: load the pre-trained model
+    args.load_checkpoint = os.path.abspath(
+        os.path.expanduser(args.load_checkpoint))
+
+    # load the model checkpoint into the sid model
+    state_dict = paddle.load(
+        os.path.join(args.load_checkpoint, 'model.pdparams'))
+    model.set_state_dict(state_dict)
+    logger.info(f'Checkpoint loaded from {args.load_checkpoint}')
+
+    # stage 4: we must set the model to eval mode
+    model.eval()
+
+    # stage 5: read the audio data and extract the embedding
+    # the waveform is a one-dimensional numpy array
+    waveform, sr = load_audio(args.audio_path)
+
+    # the feat is a numpy array whose shape is [dim, time]
+    # we need to convert the audio feat to the one-batch shape [batch, dim, time], where the batch is one
+    # so the final shape is [1, dim, time]
+    feat = melspectrogram(x=waveform, **config.feature)
+    feat = paddle.to_tensor(feat).unsqueeze(0)
+
+    # at inference time the lengths are all ones, since there is no padding
+    lengths = paddle.ones([1])
+    feat = feature_normalize(
+        feat, mean_norm=True, std_norm=False, convert_to_numpy=True)
+
+    # the model backbone network forwards the feats and gets the embedding
+    embedding = model.backbone(
+        feat, lengths).squeeze().numpy()  # (1, emb_size, 1) -> (emb_size)
+
+    # stage 6: do global norm with external mean
and std + # todo + return embedding + + +if __name__ == "__main__": + # yapf: disable + parser = argparse.ArgumentParser(__doc__) + parser.add_argument('--device', + choices=['cpu', 'gpu'], + default="gpu", + help="Select which device to train model, defaults to gpu.") + parser.add_argument("--config", + default=None, + type=str, + help="configuration file") + parser.add_argument("--load-checkpoint", + type=str, + default='', + help="Directory to load model checkpoint to contiune trainning.") + parser.add_argument("--global-embedding-norm", + type=str, + default=None, + help="Apply global normalization on speaker embeddings.") + parser.add_argument("--audio-path", + default="./data/demo.wav", + type=str, + help="Single audio file path") + args = parser.parse_args() + # yapf: enable + # https://yaml.org/type/float.html + config = CfgNode(new_allowed=True) + if args.config: + config.merge_from_file(args.config) + + config.freeze() + print(config) + + extract_audio_embedding(args, config) diff --git a/paddlespeech/vector/exps/ecapa-tdnn/speaker_verification_cosine.py b/paddlespeech/vector/exps/ecapa-tdnn/speaker_verification_cosine.py new file mode 100644 index 00000000..4d85bd62 --- /dev/null +++ b/paddlespeech/vector/exps/ecapa-tdnn/speaker_verification_cosine.py @@ -0,0 +1,207 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
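Before the full evaluation script, it helps to see what the cosine scoring it implements amounts to. A minimal sketch, assuming `emb1` and `emb2` stand in for embeddings returned by `extract_audio_embedding` above; the `0.6` threshold is purely illustrative, not a tuned value:

```python
import numpy as np

def cosine_score(emb1: np.ndarray, emb2: np.ndarray) -> float:
    """Cosine similarity between two speaker embeddings, in [-1, 1]."""
    denom = np.linalg.norm(emb1) * np.linalg.norm(emb2) + 1e-12
    return float(np.dot(emb1, emb2) / denom)

# stand-ins for two embeddings produced by the extractor (lin_neurons: 192)
emb1 = np.random.randn(192).astype("float32")
emb2 = np.random.randn(192).astype("float32")

score = cosine_score(emb1, emb2)
same_speaker = score > 0.6  # illustrative threshold; tune on a dev set
```

The script below does exactly this over every enroll/test pair in the verification trial list, then sweeps the threshold to report EER.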
+import argparse
+import ast
+import os
+
+import numpy as np
+import paddle
+from yacs.config import CfgNode
+import paddle.nn.functional as F
+from paddle.io import BatchSampler
+from paddle.io import DataLoader
+from tqdm import tqdm
+
+from paddleaudio.paddleaudio.datasets import VoxCeleb1
+from paddlespeech.s2t.utils.log import Log
+from paddleaudio.paddleaudio.metric import compute_eer
+from paddlespeech.vector.io.batch import batch_feature_normalize
+from paddlespeech.vector.models.ecapa_tdnn import EcapaTdnn
+from paddlespeech.vector.modules.sid_model import SpeakerIdetification
+from paddlespeech.vector.training.seeding import seed_everything
+
+logger = Log(__name__).getlog()
+
+def main(args, config):
+    # stage0: set the training device, cpu or gpu
+    paddle.set_device(args.device)
+    # set the random seed, it is a must for multiprocess training
+    seed_everything(config.seed)
+
+    # stage1: build the dnn backbone model network
+    ecapa_tdnn = EcapaTdnn(**config.model)
+
+    # stage2: build the speaker verification eval instance with backbone model
+    model = SpeakerIdetification(
+        backbone=ecapa_tdnn, num_class=VoxCeleb1.num_speakers)
+
+    # stage3: load the pre-trained model
+    args.load_checkpoint = os.path.abspath(
+        os.path.expanduser(args.load_checkpoint))
+
+    # load the model checkpoint into the sid model
+    state_dict = paddle.load(
+        os.path.join(args.load_checkpoint, 'model.pdparams'))
+    model.set_state_dict(state_dict)
+    logger.info(f'Checkpoint loaded from {args.load_checkpoint}')
+
+    # stage4: construct the enroll and test dataloader
+    enroll_dataset = VoxCeleb1(
+        subset='enroll',
+        target_dir=args.data_dir,
+        feat_type='melspectrogram',
+        random_chunk=False,
+        **config.feature)
+    enroll_sampler = BatchSampler(
+        enroll_dataset, batch_size=config.batch_size,
+        shuffle=True)  # Shuffle to make embedding normalization more robust.
+    enrol_loader = DataLoader(enroll_dataset,
+                              batch_sampler=enroll_sampler,
+                              collate_fn=lambda x: batch_feature_normalize(
+                                  x, mean_norm=True, std_norm=False),
+                              num_workers=config.num_workers,
+                              return_list=True,)
+
+    test_dataset = VoxCeleb1(
+        subset='test',
+        target_dir=args.data_dir,
+        feat_type='melspectrogram',
+        random_chunk=False,
+        **config.feature)
+
+    test_sampler = BatchSampler(
+        test_dataset, batch_size=config.batch_size, shuffle=True)
+    test_loader = DataLoader(test_dataset,
+                             batch_sampler=test_sampler,
+                             collate_fn=lambda x: batch_feature_normalize(
+                                 x, mean_norm=True, std_norm=False),
+                             num_workers=config.num_workers,
+                             return_list=True,)
+
+    # stage6: we must set the model to eval mode
+    model.eval()
+
+    # stage7: global embedding norm to improve the performance
+    if args.global_embedding_norm:
+        global_embedding_mean = None
+        global_embedding_std = None
+        mean_norm_flag = args.embedding_mean_norm
+        std_norm_flag = args.embedding_std_norm
+        batch_count = 0
+
+    # stage8: compute the embeddings of the audios in the enroll and test datasets from the model
+    id2embedding = {}
+    # Run multiple times to make the embedding normalization more stable.
+    for i in range(2):
+        for dl in [enrol_loader, test_loader]:
+            logger.info(
+                f'Loop {i + 1}: Computing embeddings on {dl.dataset.subset} dataset'
+            )
+            with paddle.no_grad():
+                for batch_idx, batch in enumerate(tqdm(dl)):
+
+                    # stage 8-1: extract the audio embedding
+                    ids, feats, lengths = batch['ids'], batch['feats'], batch[
+                        'lengths']
+                    embeddings = model.backbone(feats, lengths).squeeze(
+                        -1).numpy()  # (N, emb_size, 1) -> (N, emb_size)
+
+                    # Global embedding normalization.
+ if args.global_embedding_norm: + batch_count += 1 + current_mean = embeddings.mean( + axis=0) if mean_norm_flag else 0 + current_std = embeddings.std( + axis=0) if std_norm_flag else 1 + # Update global mean and std. + if global_embedding_mean is None and global_embedding_std is None: + global_embedding_mean, global_embedding_std = current_mean, current_std + else: + weight = 1 / batch_count # Weight decay by batches. + global_embedding_mean = ( + 1 - weight + ) * global_embedding_mean + weight * current_mean + global_embedding_std = ( + 1 - weight + ) * global_embedding_std + weight * current_std + # Apply global embedding normalization. + embeddings = (embeddings - global_embedding_mean + ) / global_embedding_std + + # Update embedding dict. + id2embedding.update(dict(zip(ids, embeddings))) + + # stage 9: Compute cosine scores. + labels = [] + enrol_ids = [] + test_ids = [] + with open(VoxCeleb1.veri_test_file, 'r') as f: + for line in f.readlines(): + label, enrol_id, test_id = line.strip().split(' ') + labels.append(int(label)) + enrol_ids.append(enrol_id.split('.')[0].replace('/', '-')) + test_ids.append(test_id.split('.')[0].replace('/', '-')) + + cos_sim_func = paddle.nn.CosineSimilarity(axis=1) + enrol_embeddings, test_embeddings = map(lambda ids: paddle.to_tensor( + np.asarray([id2embedding[id] for id in ids], dtype='float32')), + [enrol_ids, test_ids + ]) # (N, emb_size) + scores = cos_sim_func(enrol_embeddings, test_embeddings) + EER, threshold = compute_eer(np.asarray(labels), scores.numpy()) + logger.info( + f'EER of verification test: {EER*100:.4f}%, score threshold: {threshold:.5f}' + ) + + +if __name__ == "__main__": + # yapf: disable + parser = argparse.ArgumentParser(__doc__) + parser.add_argument('--device', + choices=['cpu', 'gpu'], + default="gpu", + help="Select which device to train model, defaults to gpu.") + parser.add_argument("--config", + default=None, + type=str, + help="configuration file") + parser.add_argument("--data-dir", + default="./data/", + type=str, + help="data directory") + parser.add_argument("--load-checkpoint", + type=str, + default='', + help="Directory to load model checkpoint to contiune trainning.") + parser.add_argument("--global-embedding-norm", + type=bool, + default=True, + help="Apply global normalization on speaker embeddings.") + parser.add_argument("--embedding-mean-norm", + type=bool, + default=True, + help="Apply mean normalization on speaker embeddings.") + parser.add_argument("--embedding-std-norm", + type=bool, + default=False, + help="Apply std normalization on speaker embeddings.") + args = parser.parse_args() + # yapf: enable + # https://yaml.org/type/float.html + config = CfgNode(new_allowed=True) + if args.config: + config.merge_from_file(args.config) + + config.freeze() + print(config) + main(args, config) diff --git a/paddlespeech/vector/exps/ecapa-tdnn/train.py b/paddlespeech/vector/exps/ecapa-tdnn/train.py new file mode 100644 index 00000000..08a4ac1c --- /dev/null +++ b/paddlespeech/vector/exps/ecapa-tdnn/train.py @@ -0,0 +1,298 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+import os
+
+import numpy as np
+import paddle
+from paddle.io import BatchSampler
+from paddle.io import DataLoader
+from paddle.io import DistributedBatchSampler
+from yacs.config import CfgNode
+from paddleaudio.paddleaudio.compliance.librosa import melspectrogram
+from paddleaudio.paddleaudio.datasets.voxceleb import VoxCeleb1
+from paddlespeech.s2t.utils.log import Log
+from paddlespeech.vector.io.augment import build_augment_pipeline
+from paddlespeech.vector.io.augment import waveform_augment
+from paddlespeech.vector.io.batch import feature_normalize
+from paddlespeech.vector.io.batch import waveform_collate_fn
+from paddlespeech.vector.models.ecapa_tdnn import EcapaTdnn
+from paddlespeech.vector.modules.loss import AdditiveAngularMargin
+from paddlespeech.vector.modules.loss import LogSoftmaxWrapper
+from paddlespeech.vector.training.scheduler import CyclicLRScheduler
+from paddlespeech.vector.modules.sid_model import SpeakerIdetification
+from paddlespeech.vector.training.seeding import seed_everything
+from paddlespeech.vector.utils.time import Timer
+
+logger = Log(__name__).getlog()
+
+def main(args, config):
+    # stage0: set the training device, cpu or gpu
+    paddle.set_device(args.device)
+
+    # stage1: we must call the paddle.distributed.init_parallel_env() api at the beginning
+    paddle.distributed.init_parallel_env()
+    nranks = paddle.distributed.get_world_size()
+    local_rank = paddle.distributed.get_rank()
+    # set the random seed, it is a must for multiprocess training
+    seed_everything(config.seed)
+
+    # stage2: data prepare, such as vox1 and vox2 data, and the augmentation noise data and pipeline
+    # note: some commands must run only on rank==0, so we will refactor the data prepare code
+    train_dataset = VoxCeleb1('train', target_dir=args.data_dir)
+    dev_dataset = VoxCeleb1('dev', target_dir=args.data_dir)
+
+    if args.augment:
+        augment_pipeline = build_augment_pipeline(target_dir=args.data_dir)
+    else:
+        augment_pipeline = []
+
+    # stage3: build the dnn backbone model network
+    ecapa_tdnn = EcapaTdnn(**config.model)
+
+    # stage4: build the speaker verification training instance with the backbone model
+    model = SpeakerIdetification(
+        backbone=ecapa_tdnn, num_class=VoxCeleb1.num_speakers)
+
+    # stage5: build the optimizer, we now only construct the AdamW optimizer
+    lr_schedule = CyclicLRScheduler(
+        base_lr=config.learning_rate, max_lr=1e-3, step_size=140000 // nranks)
+    optimizer = paddle.optimizer.AdamW(
+        learning_rate=lr_schedule, parameters=model.parameters())
+
+    # stage6: build the loss function, we now only support LogSoftmaxWrapper
+    criterion = LogSoftmaxWrapper(
+        loss_fn=AdditiveAngularMargin(margin=0.2, scale=30))
+
+    # stage7: confirm the training start epoch
+    # if a pre-trained model exists, the start epoch is determined from the pre-trained model
+    start_epoch = 0
+    if args.load_checkpoint:
+        logger.info("load the checkpoint")
+        args.load_checkpoint = os.path.abspath(
+            os.path.expanduser(args.load_checkpoint))
+        try:
+            # load model checkpoint
+            state_dict = paddle.load(
+                os.path.join(args.load_checkpoint, 'model.pdparams'))
+            model.set_state_dict(state_dict)
+
+            # load
optimizer checkpoint + state_dict = paddle.load( + os.path.join(args.load_checkpoint, 'model.pdopt')) + optimizer.set_state_dict(state_dict) + if local_rank == 0: + logger.info(f'Checkpoint loaded from {args.load_checkpoint}') + except FileExistsError: + if local_rank == 0: + logger.info('Train from scratch.') + + try: + start_epoch = int(args.load_checkpoint[-1]) + logger.info(f'Restore training from epoch {start_epoch}.') + except ValueError: + pass + + # stage8: we build the batch sampler for paddle.DataLoader + train_sampler = DistributedBatchSampler( + train_dataset, + batch_size=config.batch_size, + shuffle=True, + drop_last=False) + train_loader = DataLoader( + train_dataset, + batch_sampler=train_sampler, + num_workers=config.num_workers, + collate_fn=waveform_collate_fn, + return_list=True, + use_buffer_reader=True, ) + + # stage9: start to train + # we will comment the training process + steps_per_epoch = len(train_sampler) + timer = Timer(steps_per_epoch * config.epochs) + timer.start() + + for epoch in range(start_epoch + 1, config.epochs + 1): + # at the begining, model must set to train mode + model.train() + + avg_loss = 0 + num_corrects = 0 + num_samples = 0 + for batch_idx, batch in enumerate(train_loader): + # stage 9-1: batch data is audio sample points and speaker id label + waveforms, labels = batch['waveforms'], batch['labels'] + + # stage 9-2: audio sample augment method, which is done on the audio sample point + if len(augment_pipeline) != 0: + waveforms = waveform_augment(waveforms, augment_pipeline) + labels = paddle.concat( + [labels for i in range(len(augment_pipeline) + 1)]) + + # stage 9-3: extract the audio feats,such fbank, mfcc, spectrogram + feats = [] + for waveform in waveforms.numpy(): + feat = melspectrogram(x=waveform, **config.feature) + feats.append(feat) + feats = paddle.to_tensor(np.asarray(feats)) + + # stage 9-4: feature normalize, which help converge and imporve the performance + feats = feature_normalize( + feats, mean_norm=True, std_norm=False) # Features normalization + + # stage 9-5: model forward, such ecapa-tdnn, x-vector + logits = model(feats) + + # stage 9-6: loss function criterion, such AngularMargin, AdditiveAngularMargin + loss = criterion(logits, labels) + + # stage 9-7: update the gradient and clear the gradient cache + loss.backward() + optimizer.step() + if isinstance(optimizer._learning_rate, + paddle.optimizer.lr.LRScheduler): + optimizer._learning_rate.step() + optimizer.clear_grad() + + # stage 9-8: Calculate average loss per batch + avg_loss += loss.numpy()[0] + + # stage 9-9: Calculate metrics, which is one-best accuracy + preds = paddle.argmax(logits, axis=1) + num_corrects += (preds == labels).numpy().sum() + num_samples += feats.shape[0] + timer.count() # step plus one in timer + + # stage 9-10: print the log information only on 0-rank per log-freq batchs + if (batch_idx + 1) % config.log_freq == 0 and local_rank == 0: + lr = optimizer.get_lr() + avg_loss /= config.log_freq + avg_acc = num_corrects / num_samples + + print_msg = 'Train Epoch={}/{}, Step={}/{}'.format( + epoch, config.epochs, batch_idx + 1, steps_per_epoch) + print_msg += ' loss={:.4f}'.format(avg_loss) + print_msg += ' acc={:.4f}'.format(avg_acc) + print_msg += ' lr={:.4E} step/sec={:.2f} | ETA {}'.format( + lr, timer.timing, timer.eta) + logger.info(print_msg) + + avg_loss = 0 + num_corrects = 0 + num_samples = 0 + + # stage 9-11: save the model parameters only on 0-rank per save-freq batchs + if epoch % config.save_freq == 0 and batch_idx + 1 == 
steps_per_epoch: + if local_rank != 0: + paddle.distributed.barrier( + ) # Wait for valid step in main process + continue # Resume trainning on other process + + # stage 9-12: construct the valid dataset dataloader + dev_sampler = BatchSampler( + dev_dataset, + batch_size=config.batch_size // 4, + shuffle=False, + drop_last=False) + dev_loader = DataLoader( + dev_dataset, + batch_sampler=dev_sampler, + collate_fn=waveform_collate_fn, + num_workers=config.num_workers, + return_list=True, ) + + # set the model to eval mode + model.eval() + num_corrects = 0 + num_samples = 0 + + # stage 9-13: evaluation the valid dataset batch data + logger.info('Evaluate on validation dataset') + with paddle.no_grad(): + for batch_idx, batch in enumerate(dev_loader): + waveforms, labels = batch['waveforms'], batch['labels'] + + feats = [] + for waveform in waveforms.numpy(): + # feat = melspectrogram(x=waveform, **cpu_feat_conf) + feat = melspectrogram(x=waveform, **config.feature) + feats.append(feat) + + feats = paddle.to_tensor(np.asarray(feats)) + feats = feature_normalize( + feats, mean_norm=True, std_norm=False) + logits = model(feats) + + preds = paddle.argmax(logits, axis=1) + num_corrects += (preds == labels).numpy().sum() + num_samples += feats.shape[0] + + print_msg = '[Evaluation result]' + print_msg += ' dev_acc={:.4f}'.format(num_corrects / num_samples) + logger.info(print_msg) + + # stage 9-14: Save model parameters + save_dir = os.path.join(args.checkpoint_dir, + 'epoch_{}'.format(epoch)) + logger.info('Saving model checkpoint to {}'.format(save_dir)) + paddle.save(model.state_dict(), + os.path.join(save_dir, 'model.pdparams')) + paddle.save(optimizer.state_dict(), + os.path.join(save_dir, 'model.pdopt')) + + if nranks > 1: + paddle.distributed.barrier() # Main process + + +if __name__ == "__main__": + # yapf: disable + parser = argparse.ArgumentParser(__doc__) + parser.add_argument('--device', + choices=['cpu', 'gpu'], + default="cpu", + help="Select which device to train model, defaults to gpu.") + parser.add_argument("--config", + default=None, + type=str, + help="configuration file") + parser.add_argument("--data-dir", + default="./data/", + type=str, + help="data directory") + parser.add_argument("--load-checkpoint", + type=str, + default=None, + help="Directory to load model checkpoint to contiune trainning.") + parser.add_argument("--checkpoint-dir", + type=str, + default='./checkpoint', + help="Directory to save model checkpoints.") + parser.add_argument("--augment", + action="store_true", + default=False, + help="Apply audio augments.") + + args = parser.parse_args() + # yapf: enable + + # https://yaml.org/type/float.html + config = CfgNode(new_allowed=True) + if args.config: + config.merge_from_file(args.config) + + config.freeze() + print(config) + + main(args, config) From 8ed5c287a323b0d59b3ef44855f579f8a03102f9 Mon Sep 17 00:00:00 2001 From: xiongxinlei Date: Thu, 10 Mar 2022 16:54:48 +0800 Subject: [PATCH 26/41] add vox2 data into VoxCeleb class --- examples/voxceleb/README.md | 33 ------- examples/voxceleb/sv0/local/data_prepare.py | 33 ++++++- examples/voxceleb/sv0/path.sh | 16 +++- examples/voxceleb/sv0/run.sh | 34 ++++++- paddleaudio/paddleaudio/datasets/__init__.py | 2 +- paddleaudio/paddleaudio/datasets/voxceleb.py | 44 ++++++--- .../extract_speaker_embedding.py | 3 +- .../speaker_verification_cosine.py | 17 ++-- .../exps/{ecapa-tdnn => ecapa_tdnn}/train.py | 14 +-- paddlespeech/vector/io/augment.py | 1 + paddlespeech/vector/models/ecapa_tdnn.py | 96 +++++++------------ 
11 files changed, 158 insertions(+), 135 deletions(-) rename paddlespeech/vector/exps/{ecapa-tdnn => ecapa_tdnn}/extract_speaker_embedding.py (99%) rename paddlespeech/vector/exps/{ecapa-tdnn => ecapa_tdnn}/speaker_verification_cosine.py (96%) rename paddlespeech/vector/exps/{ecapa-tdnn => ecapa_tdnn}/train.py (97%) diff --git a/examples/voxceleb/README.md b/examples/voxceleb/README.md index 59fb491c..fc847cd8 100644 --- a/examples/voxceleb/README.md +++ b/examples/voxceleb/README.md @@ -23,39 +23,6 @@ VoxCeleb2 stores files with the m4a audio format. To use them in PaddleSpeech, ffmpeg -y -i %s -ac 1 -vn -acodec pcm_s16le -ar 16000 %s ``` -``` shell -# copy this to root directory of data and -# chmod a+x convert.sh -# ./convert.sh -# https://unix.stackexchange.com/questions/103920/parallelize-a-bash-for-loop - -open_sem(){ - mkfifo pipe-$$ - exec 3<>pipe-$$ - rm pipe-$$ - local i=$1 - for((;i>0;i--)); do - printf %s 000 >&3 - done -} -run_with_lock(){ - local x - read -u 3 -n 3 x && ((0==x)) || exit $x - ( - ( "$@"; ) - printf '%.3d' $? >&3 - )& -} - -N=32 # number of vCPU -open_sem $N -for f in $(find . -name "*.m4a"); do - run_with_lock ffmpeg -loglevel panic -i "$f" -ar 16000 "${f%.*}.wav" -done -``` - You can do the conversion using ffmpeg https://gist.github.com/seungwonpark/4f273739beef2691cd53b5c39629d830). This operation might take several hours and should be only once. 3. Put all the wav files in a folder called `wav`. You should have something like `voxceleb2/wav/id*/*.wav` (e.g, `voxceleb2/wav/id00012/21Uxsk56VDQ/00001.wav`) - -4. \ No newline at end of file diff --git a/examples/voxceleb/sv0/local/data_prepare.py b/examples/voxceleb/sv0/local/data_prepare.py index 1a0a6392..b906b5da 100644 --- a/examples/voxceleb/sv0/local/data_prepare.py +++ b/examples/voxceleb/sv0/local/data_prepare.py @@ -1,17 +1,32 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
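The prepare step below assumes the VoxCeleb2 audio has already been converted from m4a to wav, as described in the README above. For reference, a minimal single-process sketch of that conversion loop; the `voxceleb2` root path is a placeholder, `ffmpeg` must be on the PATH, and the flags are the ones the README gives:

```python
import pathlib
import subprocess

# walk the m4a files under a hypothetical voxceleb2 root and convert in place,
# using the README's ffmpeg flags: mono, no video, 16-bit PCM, 16 kHz
root = pathlib.Path("voxceleb2")
for m4a in root.rglob("*.m4a"):
    wav = m4a.with_suffix(".wav")
    if wav.exists():
        continue  # skip files converted by an earlier run
    subprocess.run([
        "ffmpeg", "-y", "-i", str(m4a),
        "-ac", "1", "-vn", "-acodec", "pcm_s16le", "-ar", "16000", str(wav)
    ], check=True)
```

The helper script deleted from the README did the same with a semaphore-based parallel loop; since the full conversion takes hours, a parallel variant is worth using in practice.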
import argparse import os import numpy as np import paddle -from paddleaudio.paddleaudio.datasets.voxceleb import VoxCeleb1 +from paddleaudio.paddleaudio.datasets.voxceleb import VoxCeleb from paddlespeech.s2t.utils.log import Log from paddlespeech.vector.io.augment import build_augment_pipeline from paddlespeech.vector.training.seeding import seed_everything logger = Log(__name__).getlog() + def main(args): + # stage0: set the cpu device, all data prepare process will be done in cpu mode paddle.set_device("cpu") # set the random seed, it is a must for multiprocess training @@ -19,14 +34,18 @@ def main(args): # stage 1: generate the voxceleb csv file # Note: this may occurs c++ execption, but the program will execute fine - # so we can ignore the execption - train_dataset = VoxCeleb1('train', target_dir=args.data_dir) - dev_dataset = VoxCeleb1('dev', target_dir=args.data_dir) + # so we ignore the execption + # we explicitly pass the vox2 base path to data prepare and generate the audio info + train_dataset = VoxCeleb( + 'train', target_dir=args.data_dir, vox2_base_path=args.vox2_base_path) + dev_dataset = VoxCeleb( + 'dev', target_dir=args.data_dir, vox2_base_path=args.vox2_base_path) # stage 2: generate the augment noise csv file if args.augment: augment_pipeline = build_augment_pipeline(target_dir=args.data_dir) + if __name__ == "__main__": # yapf: disable parser = argparse.ArgumentParser(__doc__) @@ -38,10 +57,14 @@ if __name__ == "__main__": default="./data/", type=str, help="data directory") + parser.add_argument("--vox2-base-path", + default=None, + type=str, + help="vox2 base path, where is store the wav audio") parser.add_argument("--augment", action="store_true", default=False, help="Apply audio augments.") args = parser.parse_args() # yapf: enable - main(args) \ No newline at end of file + main(args) diff --git a/examples/voxceleb/sv0/path.sh b/examples/voxceleb/sv0/path.sh index 6d19f994..2be098e0 100755 --- a/examples/voxceleb/sv0/path.sh +++ b/examples/voxceleb/sv0/path.sh @@ -1,3 +1,17 @@ +#!/bin/bash +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. export MAIN_ROOT=`realpath ${PWD}/../../../` export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} @@ -10,5 +24,5 @@ export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib/ -MODEL=ecapa-tdnn +MODEL=ecapa_tdnn export BIN_DIR=${MAIN_ROOT}/paddlespeech/vector/exps/${MODEL} \ No newline at end of file diff --git a/examples/voxceleb/sv0/run.sh b/examples/voxceleb/sv0/run.sh index 2c0e55a6..769332eb 100755 --- a/examples/voxceleb/sv0/run.sh +++ b/examples/voxceleb/sv0/run.sh @@ -1,4 +1,17 @@ #!/bin/bash +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. . ./path.sh set -e @@ -11,19 +24,30 @@ set -e # stage 3: extract the training embeding to train the LDA and PLDA ###################################################################### -# you can set the variable PPAUDIO_HOME to specifiy the downloaded the vox1 and vox2 dataset -# default the dataset is the ~/.paddleaudio/ +# we can set the variable PPAUDIO_HOME to specifiy the root directory of the downloaded vox1 and vox2 dataset +# default the dataset will be stored in the ~/.paddleaudio/ +# the vox2 dataset is stored in m4a format, we need to convert the audio from m4a to wav yourself +# and put all of them to ${PPAUDIO_HOME}/datasets/vox2 +# we will find the wav from ${PPAUDIO_HOME}/datasets/vox1/wav and ${PPAUDIO_HOME}/datasets/vox2/wav # export PPAUDIO_HOME= stage=0 -dir=data.bak/ # data directory -exp_dir=exp/ecapa-tdnn/ # experiment directory +# data directory +# if we set the variable ${dir}, we will store the wav info to this directory +# otherwise, we will store the wav info to vox1 and vox2 directory respectively +dir=data/ +exp_dir=exp/ecapa-tdnn/ # experiment directory + +# vox2 wav path, we must convert the m4a format to wav format +# and store them in the ${PPAUDIO_HOME}/datasets/vox2/wav/ directory +vox2_base_path=${PPAUDIO_HOME}/datasets/vox2/wav/ mkdir -p ${dir} mkdir -p ${exp_dir} if [ $stage -le 0 ]; then # stage 0: data prepare for vox1 and vox2, vox2 must be converted from m4a to wav - python3 local/data_prepare.py --data-dir ${dir} --augment + python3 local/data_prepare.py \ + --data-dir ${dir} --augment --vox2-base-path ${vox2_base_path} fi if [ $stage -le 1 ]; then diff --git a/paddleaudio/paddleaudio/datasets/__init__.py b/paddleaudio/paddleaudio/datasets/__init__.py index cbf9b3ae..6f44e977 100644 --- a/paddleaudio/paddleaudio/datasets/__init__.py +++ b/paddleaudio/paddleaudio/datasets/__init__.py @@ -15,5 +15,5 @@ from .esc50 import ESC50 from .gtzan import GTZAN from .tess import TESS from .urban_sound import UrbanSound8K -from .voxceleb import VoxCeleb1 +from .voxceleb import VoxCeleb from .rirs_noises import OpenRIRNoise diff --git a/paddleaudio/paddleaudio/datasets/voxceleb.py b/paddleaudio/paddleaudio/datasets/voxceleb.py index 4989accb..f8d634f2 100644 --- a/paddleaudio/paddleaudio/datasets/voxceleb.py +++ b/paddleaudio/paddleaudio/datasets/voxceleb.py @@ -25,10 +25,10 @@ from paddle.io import Dataset from pathos.multiprocessing import Pool from tqdm import tqdm -from .dataset import feat_funcs from ..backends import load as load_audio from ..utils import DATA_HOME from ..utils import decompress +from .dataset import feat_funcs from paddlespeech.s2t.utils.log import Log from paddlespeech.vector.utils.download import download_and_decompress from utils.utility import download @@ -36,10 +36,10 @@ from utils.utility import unpack logger = Log(__name__).getlog() -__all__ = ['VoxCeleb1'] +__all__ = ['VoxCeleb'] -class VoxCeleb1(Dataset): +class VoxCeleb(Dataset): source_url = 'https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/' archieves_audio_dev = [ { @@ -94,8 +94,18 @@ class VoxCeleb1(Dataset): split_ratio: float=0.9, # train split ratio seed: 
int=0, target_dir: str=None, + vox2_base_path=None, **kwargs): - + """VoxCeleb data prepare and get the specific dataset audio info + + Args: + subset (str, optional): dataset name, such as train, dev, enroll or test. Defaults to 'train'. + feat_type (str, optional): feat type, such raw, melspectrogram(fbank) or mfcc . Defaults to 'raw'. + random_chunk (bool, optional): random select a duration from audio. Defaults to True. + chunk_duration (float, optional): chunk duration if random_chunk flag is set. Defaults to 3.0. + target_dir (str, optional): data dir, audio info will be stored in this directory. Defaults to None. + vox2_base_path (_type_, optional): vox2 directory. vox2 data must be converted from m4a to wav. Defaults to None. + """ assert subset in self.subsets, \ 'Dataset subset must be one in {}, but got {}'.format(self.subsets, subset) @@ -106,19 +116,20 @@ class VoxCeleb1(Dataset): self.random_chunk = random_chunk self.chunk_duration = chunk_duration self.split_ratio = split_ratio - self.target_dir = target_dir if target_dir else VoxCeleb1.base_path + self.target_dir = target_dir if target_dir else VoxCeleb.base_path + self.vox2_base_path = vox2_base_path # if we set the target dir, we will change the vox data info data from base path to target dir - VoxCeleb1.csv_path = os.path.join( - target_dir, "voxceleb", 'csv') if target_dir else VoxCeleb1.csv_path - VoxCeleb1.meta_path = os.path.join( + VoxCeleb.csv_path = os.path.join( + target_dir, "voxceleb", 'csv') if target_dir else VoxCeleb.csv_path + VoxCeleb.meta_path = os.path.join( target_dir, "voxceleb", - 'meta') if target_dir else VoxCeleb1.meta_path - VoxCeleb1.veri_test_file = os.path.join(VoxCeleb1.meta_path, - 'veri_test2.txt') + 'meta') if target_dir else VoxCeleb.meta_path + VoxCeleb.veri_test_file = os.path.join(VoxCeleb.meta_path, + 'veri_test2.txt') # self._data = self._get_data()[:1000] # KP: Small dataset test. self._data = self._get_data() - super(VoxCeleb1, self).__init__() + super(VoxCeleb, self).__init__() # Set up a seed to reproduce training or predicting result. 
# random.seed(seed) @@ -300,7 +311,14 @@ class VoxCeleb1(Dataset): # get all the train and dev audios file path audio_files = [] speakers = set() - for path in [self.wav_path]: + for path in [self.wav_path, self.vox2_base_path]: + # if vox2 directory is not set and vox2 is not a directory + # we will not process this directory + if not path or not os.path.exists(path): + logger.warning( + f"{path} is an invalid path, please check again, " + "and we will ignore the vox2 base path") + continue for file in glob.glob( os.path.join(path, "**", "*.wav"), recursive=True): spk = file.split('/wav/')[1].split('/')[0] diff --git a/paddlespeech/vector/exps/ecapa-tdnn/extract_speaker_embedding.py b/paddlespeech/vector/exps/ecapa_tdnn/extract_speaker_embedding.py similarity index 99% rename from paddlespeech/vector/exps/ecapa-tdnn/extract_speaker_embedding.py rename to paddlespeech/vector/exps/ecapa_tdnn/extract_speaker_embedding.py index 78498c61..44cbd204 100644 --- a/paddlespeech/vector/exps/ecapa-tdnn/extract_speaker_embedding.py +++ b/paddlespeech/vector/exps/ecapa_tdnn/extract_speaker_embedding.py @@ -28,6 +28,7 @@ from paddlespeech.vector.training.seeding import seed_everything logger = Log(__name__).getlog() + def extract_audio_embedding(args, config): # stage 0: set the training device, cpu or gpu paddle.set_device(args.device) @@ -83,7 +84,7 @@ if __name__ == "__main__": choices=['cpu', 'gpu'], default="gpu", help="Select which device to train model, defaults to gpu.") - parser.add_argument("--config", + parser.add_argument("--config", default=None, type=str, help="configuration file") diff --git a/paddlespeech/vector/exps/ecapa-tdnn/speaker_verification_cosine.py b/paddlespeech/vector/exps/ecapa_tdnn/speaker_verification_cosine.py similarity index 96% rename from paddlespeech/vector/exps/ecapa-tdnn/speaker_verification_cosine.py rename to paddlespeech/vector/exps/ecapa_tdnn/speaker_verification_cosine.py index 4d85bd62..01a3506a 100644 --- a/paddlespeech/vector/exps/ecapa-tdnn/speaker_verification_cosine.py +++ b/paddlespeech/vector/exps/ecapa_tdnn/speaker_verification_cosine.py @@ -17,15 +17,15 @@ import os import numpy as np import paddle -from yacs.config import CfgNode import paddle.nn.functional as F from paddle.io import BatchSampler from paddle.io import DataLoader from tqdm import tqdm +from yacs.config import CfgNode -from paddleaudio.paddleaudio.datasets import VoxCeleb1 -from paddlespeech.s2t.utils.log import Log +from paddleaudio.paddleaudio.datasets import VoxCeleb from paddleaudio.paddleaudio.metric import compute_eer +from paddlespeech.s2t.utils.log import Log from paddlespeech.vector.io.batch import batch_feature_normalize from paddlespeech.vector.models.ecapa_tdnn import EcapaTdnn from paddlespeech.vector.modules.sid_model import SpeakerIdetification @@ -33,6 +33,7 @@ from paddlespeech.vector.training.seeding import seed_everything logger = Log(__name__).getlog() + def main(args, config): # stage0: set the training device, cpu or gpu paddle.set_device(args.device) @@ -44,7 +45,7 @@ def main(args, config): # stage2: build the speaker verification eval instance with backbone model model = SpeakerIdetification( - backbone=ecapa_tdnn, num_class=VoxCeleb1.num_speakers) + backbone=ecapa_tdnn, num_class=VoxCeleb.num_speakers) # stage3: load the pre-trained model args.load_checkpoint = os.path.abspath( @@ -57,7 +58,7 @@ def main(args, config): logger.info(f'Checkpoint loaded from {args.load_checkpoint}') # stage4: construct the enroll and test dataloader - enroll_dataset = VoxCeleb1( 
+ enroll_dataset = VoxCeleb( subset='enroll', target_dir=args.data_dir, feat_type='melspectrogram', @@ -73,7 +74,7 @@ def main(args, config): num_workers=config.num_workers, return_list=True,) - test_dataset = VoxCeleb1( + test_dataset = VoxCeleb( subset='test', target_dir=args.data_dir, feat_type='melspectrogram', @@ -145,7 +146,7 @@ def main(args, config): labels = [] enrol_ids = [] test_ids = [] - with open(VoxCeleb1.veri_test_file, 'r') as f: + with open(VoxCeleb.veri_test_file, 'r') as f: for line in f.readlines(): label, enrol_id, test_id = line.strip().split(' ') labels.append(int(label)) @@ -171,7 +172,7 @@ if __name__ == "__main__": choices=['cpu', 'gpu'], default="gpu", help="Select which device to train model, defaults to gpu.") - parser.add_argument("--config", + parser.add_argument("--config", default=None, type=str, help="configuration file") diff --git a/paddlespeech/vector/exps/ecapa-tdnn/train.py b/paddlespeech/vector/exps/ecapa_tdnn/train.py similarity index 97% rename from paddlespeech/vector/exps/ecapa-tdnn/train.py rename to paddlespeech/vector/exps/ecapa_tdnn/train.py index 08a4ac1c..6e6e5ab2 100644 --- a/paddlespeech/vector/exps/ecapa-tdnn/train.py +++ b/paddlespeech/vector/exps/ecapa_tdnn/train.py @@ -20,8 +20,9 @@ from paddle.io import BatchSampler from paddle.io import DataLoader from paddle.io import DistributedBatchSampler from yacs.config import CfgNode + from paddleaudio.paddleaudio.compliance.librosa import melspectrogram -from paddleaudio.paddleaudio.datasets.voxceleb import VoxCeleb1 +from paddleaudio.paddleaudio.datasets.voxceleb import VoxCeleb from paddlespeech.s2t.utils.log import Log from paddlespeech.vector.io.augment import build_augment_pipeline from paddlespeech.vector.io.augment import waveform_augment @@ -30,13 +31,14 @@ from paddlespeech.vector.io.batch import waveform_collate_fn from paddlespeech.vector.models.ecapa_tdnn import EcapaTdnn from paddlespeech.vector.modules.loss import AdditiveAngularMargin from paddlespeech.vector.modules.loss import LogSoftmaxWrapper -from paddlespeech.vector.training.scheduler import CyclicLRScheduler from paddlespeech.vector.modules.sid_model import SpeakerIdetification +from paddlespeech.vector.training.scheduler import CyclicLRScheduler from paddlespeech.vector.training.seeding import seed_everything from paddlespeech.vector.utils.time import Timer logger = Log(__name__).getlog() + def main(args, config): # stage0: set the training device, cpu or gpu paddle.set_device(args.device) @@ -50,8 +52,8 @@ def main(args, config): # stage2: data prepare, such vox1 and vox2 data, and augment noise data and pipline # note: some cmd must do in rank==0, so wo will refactor the data prepare code - train_dataset = VoxCeleb1('train', target_dir=args.data_dir) - dev_dataset = VoxCeleb1('dev', target_dir=args.data_dir) + train_dataset = VoxCeleb('train', target_dir=args.data_dir) + dev_dataset = VoxCeleb('dev', target_dir=args.data_dir) if args.augment: augment_pipeline = build_augment_pipeline(target_dir=args.data_dir) @@ -63,7 +65,7 @@ def main(args, config): # stage4: build the speaker verification train instance with backbone model model = SpeakerIdetification( - backbone=ecapa_tdnn, num_class=VoxCeleb1.num_speakers) + backbone=ecapa_tdnn, num_class=VoxCeleb.num_speakers) # stage5: build the optimizer, we now only construct the AdamW optimizer lr_schedule = CyclicLRScheduler( @@ -263,7 +265,7 @@ if __name__ == "__main__": choices=['cpu', 'gpu'], default="cpu", help="Select which device to train model, defaults to gpu.") - 
parser.add_argument("--config", + parser.add_argument("--config", default=None, type=str, help="configuration file") diff --git a/paddlespeech/vector/io/augment.py b/paddlespeech/vector/io/augment.py index 76312978..1b9d1fbd 100644 --- a/paddlespeech/vector/io/augment.py +++ b/paddlespeech/vector/io/augment.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +# this is modified from https://github.com/speechbrain/speechbrain/blob/085be635c07f16d42cd1295045bc46c407f1e15b/speechbrain/lobes/augment.py import math import os from typing import List diff --git a/paddlespeech/vector/models/ecapa_tdnn.py b/paddlespeech/vector/models/ecapa_tdnn.py index 4c960e11..0e7287cd 100644 --- a/paddlespeech/vector/models/ecapa_tdnn.py +++ b/paddlespeech/vector/models/ecapa_tdnn.py @@ -19,16 +19,6 @@ import paddle.nn.functional as F def length_to_mask(length, max_len=None, dtype=None): - """_summary_ - - Args: - length (_type_): _description_ - max_len (_type_, optional): _description_. Defaults to None. - dtype (_type_, optional): _description_. Defaults to None. - - Returns: - _type_: _description_ - """ assert len(length.shape) == 1 if max_len is None: @@ -60,15 +50,15 @@ class Conv1d(nn.Layer): """_summary_ Args: - in_channels (_type_): _description_ - out_channels (_type_): _description_ - kernel_size (_type_): _description_ - stride (int, optional): _description_. Defaults to 1. - padding (str, optional): _description_. Defaults to "same". - dilation (int, optional): _description_. Defaults to 1. - groups (int, optional): _description_. Defaults to 1. - bias (bool, optional): _description_. Defaults to True. - padding_mode (str, optional): _description_. Defaults to "reflect". + in_channels (int): intput channel or input data dimensions + out_channels (int): output channel or output data dimensions + kernel_size (int): kernel size of 1-d convolution + stride (int, optional): strid in 1-d convolution . Defaults to 1. + padding (str, optional): padding value. Defaults to "same". + dilation (int, optional): dilation in 1-d convolution. Defaults to 1. + groups (int, optional): groups in 1-d convolution. Defaults to 1. + bias (bool, optional): bias in 1-d convolution . Defaults to True. + padding_mode (str, optional): padding mode. Defaults to "reflect". 
""" super().__init__() @@ -89,17 +79,6 @@ class Conv1d(nn.Layer): bias_attr=bias, ) def forward(self, x): - """_summary_ - - Args: - x (_type_): _description_ - - Raises: - ValueError: _description_ - - Returns: - _type_: _description_ - """ if self.padding == "same": x = self._manage_padding(x, self.kernel_size, self.dilation, self.stride) @@ -109,17 +88,6 @@ class Conv1d(nn.Layer): return self.conv(x) def _manage_padding(self, x, kernel_size: int, dilation: int, stride: int): - """_summary_ - - Args: - x (_type_): _description_ - kernel_size (int): _description_ - dilation (int): _description_ - stride (int): _description_ - - Returns: - _type_: _description_ - """ L_in = x.shape[-1] # Detecting input shape padding = self._get_padding_elem(L_in, stride, kernel_size, dilation) # Time padding @@ -133,17 +101,6 @@ class Conv1d(nn.Layer): stride: int, kernel_size: int, dilation: int): - """_summary_ - - Args: - L_in (int): _description_ - stride (int): _description_ - kernel_size (int): _description_ - dilation (int): _description_ - - Returns: - _type_: _description_ - """ if stride > 1: n_steps = math.ceil(((L_in - kernel_size * dilation) / stride) + 1) L_out = stride * (n_steps - 1) + kernel_size * dilation @@ -220,8 +177,8 @@ class Res2NetBlock(nn.Layer): Args: in_channels (int): input channels or input dimensions out_channels (int): output channels or output dimensions - scale (int, optional): _description_. Defaults to 8. - dilation (int, optional): _description_. Defaults to 1. + scale (int, optional): scale in res2net bolck. Defaults to 8. + dilation (int, optional): dilation of 1-d convolution in TDNN block. Defaults to 1. """ super().__init__() assert in_channels % scale == 0 @@ -358,15 +315,16 @@ class SERes2NetBlock(nn.Layer): dilation=1, activation=nn.ReLU, ): """Implementation of Squeeze-Extraction Res2Blocks in ECAPA-TDNN network model - + The paper is refered "Squeeze-and-Excitation Networks" + whose url is: https://arxiv.org/pdf/1709.01507.pdf Args: in_channels (int): input channels or input data dimensions - out_channels (_type_): _description_ - res2net_scale (int, optional): _description_. Defaults to 8. - se_channels (int, optional): _description_. Defaults to 128. - kernel_size (int, optional): _description_. Defaults to 1. - dilation (int, optional): _description_. Defaults to 1. - activation (_type_, optional): _description_. Defaults to nn.ReLU. + out_channels (int): output channels or output data dimensions + res2net_scale (int, optional): scale in the res2net block. Defaults to 8. + se_channels (int, optional): embedding dimensions of res2net block. Defaults to 128. + kernel_size (int, optional): kernel size of 1-d convolution in TDNN block. Defaults to 1. + dilation (int, optional): dilation of 1-d convolution in TDNN block. Defaults to 1. + activation (paddle.nn.class, optional): activation function. Defaults to nn.ReLU. """ super().__init__() self.out_channels = out_channels @@ -419,7 +377,21 @@ class EcapaTdnn(nn.Layer): res2net_scale=8, se_channels=128, global_context=True, ): - + """Implementation of ECAPA-TDNN backbone model network + The paper is refered as "ECAPA-TDNN: Emphasized Channel Attention, Propagation and Aggregation in TDNN Based Speaker Verification" + whose url is: https://arxiv.org/abs/2005.07143 + Args: + input_size (_type_): input fature dimension + lin_neurons (int, optional): speaker embedding size. Defaults to 192. + activation (paddle.nn.class, optional): activation function. Defaults to nn.ReLU. 
+ channels (list, optional): inter embedding dimension. Defaults to [512, 512, 512, 512, 1536]. + kernel_sizes (list, optional): kernel size of 1-d convolution in TDNN block . Defaults to [5, 3, 3, 3, 1]. + dilations (list, optional): dilations of 1-d convolution in TDNN block. Defaults to [1, 2, 3, 4, 1]. + attention_channels (int, optional): attention dimensions. Defaults to 128. + res2net_scale (int, optional): scale value in res2net. Defaults to 8. + se_channels (int, optional): dimensions of squeeze-excitation block. Defaults to 128. + global_context (bool, optional): global context flag. Defaults to True. + """ super().__init__() assert len(channels) == len(kernel_sizes) assert len(channels) == len(dilations) From 311fa87a1193e784913434d502df3d5942e50b1f Mon Sep 17 00:00:00 2001 From: xiongxinlei Date: Sun, 13 Mar 2022 22:56:26 +0800 Subject: [PATCH 27/41] add some comments to the code --- examples/voxceleb/sv0/conf/ecapa_tdnn.yaml | 20 ++++-- examples/voxceleb/sv0/run.sh | 3 +- paddleaudio/paddleaudio/metric/__init__.py | 3 +- paddleaudio/paddleaudio/metric/eer.py | 66 +++++++++++++++++++ ...ct_speaker_embedding.py => extract_emb.py} | 0 .../ecapa_tdnn/speaker_verification_cosine.py | 14 ++-- paddlespeech/vector/exps/ecapa_tdnn/train.py | 6 +- paddlespeech/vector/io/augment.py | 3 +- paddlespeech/vector/io/batch.py | 3 + 9 files changed, 99 insertions(+), 19 deletions(-) rename paddlespeech/vector/exps/ecapa_tdnn/{extract_speaker_embedding.py => extract_emb.py} (100%) diff --git a/examples/voxceleb/sv0/conf/ecapa_tdnn.yaml b/examples/voxceleb/sv0/conf/ecapa_tdnn.yaml index 33304054..720326f8 100644 --- a/examples/voxceleb/sv0/conf/ecapa_tdnn.yaml +++ b/examples/voxceleb/sv0/conf/ecapa_tdnn.yaml @@ -1,3 +1,12 @@ +########################################### +# Data # +########################################### +batch_size: 32 +num_workers: 2 +num_speakers: 7205 # 1211 vox1, 5994 vox2, 7205 vox1+2, test speakers: 41 +shuffle: True +random_chunk: True + ########################################################### # FEATURE EXTRACTION SETTING # ########################################################### @@ -7,7 +16,6 @@ feature: window_size: 400 #25ms, sample rate 16000, 25 * 16000 / 1000 = 400 hop_length: 160 #10ms, sample rate 16000, 10 * 16000 / 1000 = 160 - ########################################################### # MODEL SETTING # ########################################################### @@ -15,9 +23,8 @@ feature: # if we want use another model, please choose another configuration yaml file model: input_size: 80 - ##"channels": [1024, 1024, 1024, 1024, 3072], # "channels": [512, 512, 512, 512, 1536], - channels: [512, 512, 512, 512, 1536] + channels: [1024, 1024, 1024, 1024, 3072] kernel_sizes: [5, 3, 3, 3, 1] dilations: [1, 2, 3, 4, 1] attention_channels: 128 @@ -26,10 +33,9 @@ model: ########################################### # Training # ########################################### -seed: 0 +seed: 1986 # according from speechbrain configuration epochs: 10 -batch_size: 32 -num_workers: 2 save_freq: 10 -log_freq: 10 +log_interval: 10 learning_rate: 1e-8 + diff --git a/examples/voxceleb/sv0/run.sh b/examples/voxceleb/sv0/run.sh index 769332eb..c5dc3dd2 100755 --- a/examples/voxceleb/sv0/run.sh +++ b/examples/voxceleb/sv0/run.sh @@ -47,7 +47,8 @@ mkdir -p ${exp_dir} if [ $stage -le 0 ]; then # stage 0: data prepare for vox1 and vox2, vox2 must be converted from m4a to wav python3 local/data_prepare.py \ - --data-dir ${dir} --augment --vox2-base-path ${vox2_base_path} + --data-dir 
${dir} --augment --vox2-base-path ${vox2_base_path} \ + --config conf/ecapa_tdnn.yaml fi if [ $stage -le 1 ]; then diff --git a/paddleaudio/paddleaudio/metric/__init__.py b/paddleaudio/paddleaudio/metric/__init__.py index b435571d..8e5ca9f7 100644 --- a/paddleaudio/paddleaudio/metric/__init__.py +++ b/paddleaudio/paddleaudio/metric/__init__.py @@ -12,5 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. from .dtw import dtw_distance -from .mcd import mcd_distance from .eer import compute_eer +from .eer import compute_minDCF +from .mcd import mcd_distance diff --git a/paddleaudio/paddleaudio/metric/eer.py b/paddleaudio/paddleaudio/metric/eer.py index 65dc7a3c..7738987e 100644 --- a/paddleaudio/paddleaudio/metric/eer.py +++ b/paddleaudio/paddleaudio/metric/eer.py @@ -14,6 +14,7 @@ from typing import List import numpy as np +import paddle from sklearn.metrics import roc_curve @@ -26,3 +27,68 @@ def compute_eer(labels: np.ndarray, scores: np.ndarray) -> List[float]: eer_threshold = threshold[np.nanargmin(np.absolute((fnr - fpr)))] eer = fpr[np.nanargmin(np.absolute((fnr - fpr)))] return eer, eer_threshold + + +def compute_minDCF(positive_scores, + negative_scores, + c_miss=1.0, + c_fa=1.0, + p_target=0.01): + """ + This is modified from SpeechBrain + https://github.com/speechbrain/speechbrain/blob/085be635c07f16d42cd1295045bc46c407f1e15b/speechbrain/utils/metric_stats.py#L509 + Computes the minDCF metric normally used to evaluate speaker verification + systems. The min_DCF is the minimum of the following C_det function computed + within the defined threshold range: + + C_det = c_miss * p_miss * p_target + c_fa * p_fa * (1 -p_target) + + where p_miss is the missing probability and p_fa is the probability of having + a false alarm. + + Args: + positive_scores (Paddle.Tensor): The scores from entries of the same class. + negative_scores (Paddle.Tensor): The scores from entries of different classes. + c_miss (float, optional): Cost assigned to a missing error (default 1.0). + c_fa (float, optional): Cost assigned to a false alarm (default 1.0). + p_target (float, optional): Prior probability of having a target (default 0.01). 
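+
+    Example (illustrative numbers only, not from a real system): with the
+    defaults c_miss=1.0, c_fa=1.0, p_target=0.01, a threshold at which
+    p_miss=0.05 and p_fa=0.01 costs
+    C_det = 1.0 * 0.05 * 0.01 + 1.0 * 0.01 * (1 - 0.01) = 0.0104,
+    and min_DCF is the smallest such cost over all candidate thresholds.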
+ + Returns: + _type_: min dcf + """ + # Computing candidate thresholds + if len(positive_scores.shape) > 1: + positive_scores = positive_scores.squeeze() + + if len(negative_scores.shape) > 1: + negative_scores = negative_scores.squeeze() + + thresholds = paddle.sort(paddle.concat([positive_scores, negative_scores])) + thresholds = paddle.unique(thresholds) + + # Adding intermediate thresholds + interm_thresholds = (thresholds[0:-1] + thresholds[1:]) / 2 + thresholds = paddle.sort(paddle.concat([thresholds, interm_thresholds])) + + # Computing False Rejection Rate (miss detection) + positive_scores = paddle.concat( + len(thresholds) * [positive_scores.unsqueeze(0)]) + pos_scores_threshold = positive_scores.transpose(perm=[1, 0]) <= thresholds + p_miss = (pos_scores_threshold.sum(0) + ).astype("float32") / positive_scores.shape[1] + del positive_scores + del pos_scores_threshold + + # Computing False Acceptance Rate (false alarm) + negative_scores = paddle.concat( + len(thresholds) * [negative_scores.unsqueeze(0)]) + neg_scores_threshold = negative_scores.transpose(perm=[1, 0]) > thresholds + p_fa = (neg_scores_threshold.sum(0) + ).astype("float32") / negative_scores.shape[1] + del negative_scores + del neg_scores_threshold + + c_det = c_miss * p_miss * p_target + c_fa * p_fa * (1 - p_target) + c_min = paddle.min(c_det, axis=0) + min_index = paddle.argmin(c_det, axis=0) + return float(c_min), float(thresholds[min_index]) diff --git a/paddlespeech/vector/exps/ecapa_tdnn/extract_speaker_embedding.py b/paddlespeech/vector/exps/ecapa_tdnn/extract_emb.py similarity index 100% rename from paddlespeech/vector/exps/ecapa_tdnn/extract_speaker_embedding.py rename to paddlespeech/vector/exps/ecapa_tdnn/extract_emb.py diff --git a/paddlespeech/vector/exps/ecapa_tdnn/speaker_verification_cosine.py b/paddlespeech/vector/exps/ecapa_tdnn/speaker_verification_cosine.py index 01a3506a..781bf2a5 100644 --- a/paddlespeech/vector/exps/ecapa_tdnn/speaker_verification_cosine.py +++ b/paddlespeech/vector/exps/ecapa_tdnn/speaker_verification_cosine.py @@ -45,7 +45,7 @@ def main(args, config): # stage2: build the speaker verification eval instance with backbone model model = SpeakerIdetification( - backbone=ecapa_tdnn, num_class=VoxCeleb.num_speakers) + backbone=ecapa_tdnn, num_class=config.num_speakers) # stage3: load the pre-trained model args.load_checkpoint = os.path.abspath( @@ -93,6 +93,7 @@ def main(args, config): model.eval() # stage7: global embedding norm to imporve the performance + print("global embedding norm: {}".format(args.global_embedding_norm)) if args.global_embedding_norm: global_embedding_mean = None global_embedding_std = None @@ -118,6 +119,8 @@ def main(args, config): -1).numpy() # (N, emb_size, 1) -> (N, emb_size) # Global embedding normalization. 
+ # if we use the global embedding norm + # eer can reduece about relative 10% if args.global_embedding_norm: batch_count += 1 current_mean = embeddings.mean( @@ -150,8 +153,8 @@ def main(args, config): for line in f.readlines(): label, enrol_id, test_id = line.strip().split(' ') labels.append(int(label)) - enrol_ids.append(enrol_id.split('.')[0].replace('/', '-')) - test_ids.append(test_id.split('.')[0].replace('/', '-')) + enrol_ids.append(enrol_id.split('.')[0].replace('/', '--')) + test_ids.append(test_id.split('.')[0].replace('/', '--')) cos_sim_func = paddle.nn.CosineSimilarity(axis=1) enrol_embeddings, test_embeddings = map(lambda ids: paddle.to_tensor( @@ -185,11 +188,10 @@ if __name__ == "__main__": default='', help="Directory to load model checkpoint to contiune trainning.") parser.add_argument("--global-embedding-norm", - type=bool, - default=True, + default=False, + action="store_true", help="Apply global normalization on speaker embeddings.") parser.add_argument("--embedding-mean-norm", - type=bool, default=True, help="Apply mean normalization on speaker embeddings.") parser.add_argument("--embedding-std-norm", diff --git a/paddlespeech/vector/exps/ecapa_tdnn/train.py b/paddlespeech/vector/exps/ecapa_tdnn/train.py index 6e6e5ab2..cb20ef16 100644 --- a/paddlespeech/vector/exps/ecapa_tdnn/train.py +++ b/paddlespeech/vector/exps/ecapa_tdnn/train.py @@ -178,9 +178,9 @@ def main(args, config): timer.count() # step plus one in timer # stage 9-10: print the log information only on 0-rank per log-freq batchs - if (batch_idx + 1) % config.log_freq == 0 and local_rank == 0: + if (batch_idx + 1) % config.log_interval == 0 and local_rank == 0: lr = optimizer.get_lr() - avg_loss /= config.log_freq + avg_loss /= config.log_interval avg_acc = num_corrects / num_samples print_msg = 'Train Epoch={}/{}, Step={}/{}'.format( @@ -196,7 +196,7 @@ def main(args, config): num_samples = 0 # stage 9-11: save the model parameters only on 0-rank per save-freq batchs - if epoch % config.save_freq == 0 and batch_idx + 1 == steps_per_epoch: + if epoch % config.save_interval == 0 and batch_idx + 1 == steps_per_epoch: if local_rank != 0: paddle.distributed.barrier( ) # Wait for valid step in main process diff --git a/paddlespeech/vector/io/augment.py b/paddlespeech/vector/io/augment.py index 1b9d1fbd..f40ce41b 100644 --- a/paddlespeech/vector/io/augment.py +++ b/paddlespeech/vector/io/augment.py @@ -11,7 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -# this is modified from https://github.com/speechbrain/speechbrain/blob/085be635c07f16d42cd1295045bc46c407f1e15b/speechbrain/lobes/augment.py +# this is modified from SpeechBrain +# https://github.com/speechbrain/speechbrain/blob/085be635c07f16d42cd1295045bc46c407f1e15b/speechbrain/lobes/augment.py import math import os from typing import List diff --git a/paddlespeech/vector/io/batch.py b/paddlespeech/vector/io/batch.py index 811775e2..85f2ab8b 100644 --- a/paddlespeech/vector/io/batch.py +++ b/paddlespeech/vector/io/batch.py @@ -75,6 +75,9 @@ def batch_feature_normalize(batch, mean_norm: bool=True, std_norm: bool=True): i]:].sum() == 0 # Padding valus should all be 0. # Converts into ratios. 
+    # the utterance of the max length doesn't need padding
+    # the remaining utterances need padding, and all of them will be padded to the max length
+    # we convert the original length of each utterance to its ratio of the max length
     lengths = (lengths / lengths.max()).astype(np.float32)

     return {'ids': ids, 'feats': feats, 'lengths': lengths}
\ No newline at end of file

From 7eb8fa72a1e50fd3a0338ec2ec5a9ac7b3bb56d2 Mon Sep 17 00:00:00 2001
From: xiongxinlei
Date: Sun, 13 Mar 2022 22:59:33 +0800
Subject: [PATCH 28/41] convert save_freq to save_interval, test=doc

---
 examples/voxceleb/sv0/conf/ecapa_tdnn.yaml | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/examples/voxceleb/sv0/conf/ecapa_tdnn.yaml b/examples/voxceleb/sv0/conf/ecapa_tdnn.yaml
index 720326f8..d7f66380 100644
--- a/examples/voxceleb/sv0/conf/ecapa_tdnn.yaml
+++ b/examples/voxceleb/sv0/conf/ecapa_tdnn.yaml
@@ -35,7 +35,6 @@ model:
 ###########################################
 seed: 1986  # following the speechbrain configuration
 epochs: 10
-save_freq: 10
+save_interval: 10
 log_interval: 10
 learning_rate: 1e-8
-

From 506d26a9578f39808de010867217edcc48273fee Mon Sep 17 00:00:00 2001
From: xiongxinlei
Date: Mon, 14 Mar 2022 21:18:29 +0800
Subject: [PATCH 29/41] change the code style to s2t code style, test=doc

---
 examples/voxceleb/sv0/conf/ecapa_tdnn.yaml    | 26 +++++--
 examples/voxceleb/sv0/local/data.sh           | 18 +++++
 examples/voxceleb/sv0/local/data_prepare.py   | 39 +++++-----
 examples/voxceleb/sv0/local/emb.sh            | 13 ++++
 examples/voxceleb/sv0/local/test.sh           |  8 +++
 examples/voxceleb/sv0/local/train.sh          | 22 ++++++
 examples/voxceleb/sv0/run.sh                  | 50 ++++++-------
 .../paddleaudio/datasets/rirs_noises.py       | 15 ++--
 paddleaudio/paddleaudio/datasets/voxceleb.py  | 30 ++++----
 paddleaudio/paddleaudio/utils/download.py     |  8 ++-
 .../vector/exps/ecapa_tdnn/extract_emb.py     | 24 ++++--
 ...speaker_verification_cosine.py => test.py} | 58 +++++++--------
 paddlespeech/vector/exps/ecapa_tdnn/train.py  | 25 +++++--
 paddlespeech/vector/io/augment.py             | 16 +++--
 paddlespeech/vector/utils/download.py         | 72 -------------------
 15 files changed, 216 insertions(+), 208 deletions(-)
 create mode 100755 examples/voxceleb/sv0/local/data.sh
 create mode 100755 examples/voxceleb/sv0/local/emb.sh
 create mode 100644 examples/voxceleb/sv0/local/test.sh
 create mode 100755 examples/voxceleb/sv0/local/train.sh
 rename paddlespeech/vector/exps/ecapa_tdnn/{speaker_verification_cosine.py => test.py} (82%)
 delete mode 100644 paddlespeech/vector/utils/download.py

diff --git a/examples/voxceleb/sv0/conf/ecapa_tdnn.yaml b/examples/voxceleb/sv0/conf/ecapa_tdnn.yaml
index d7f66380..0f4bf189 100644
--- a/examples/voxceleb/sv0/conf/ecapa_tdnn.yaml
+++ b/examples/voxceleb/sv0/conf/ecapa_tdnn.yaml
@@ -1,7 +1,10 @@
 ###########################################
 # Data #
 ###########################################
-batch_size: 32
+# we should explicitly specify the wav path of vox2 audio data converted from m4a
+vox2_base_path:
+augment: True
+batch_size: 16
 num_workers: 2
 num_speakers: 7205 # 1211 vox1, 5994 vox2, 7205 vox1+2, test speakers: 41
 shuffle: True
@@ -11,10 +14,10 @@ random_chunk: True
 # FEATURE EXTRACTION SETTING #
 ###########################################################
 # currently, we only support fbank
-feature:
-  n_mels: 80
-  window_size: 400 #25ms, sample rate 16000, 25 * 16000 / 1000 = 400
-  hop_length: 160 #10ms, sample rate 16000, 10 * 16000 / 1000 = 160
+sample_rate: 16000
+n_mels: 80
+window_size: 400 #25ms, sample rate 16000, 25 * 16000 / 1000 = 400
+hop_length: 160 #10ms, sample rate 16000, 10 * 16000 / 1000 = 160

 ###########################################################
 # MODEL SETTING #
@@ -35,6 +38,15 @@ model:
 ###########################################
 seed: 1986  # following the speechbrain configuration
 epochs: 10
-save_interval: 10
-log_interval: 10
+save_interval: 1
+log_interval: 1
 learning_rate: 1e-8
+
+
+###########################################
+# Testing #
+###########################################
+global_embedding_norm: True
+embedding_mean_norm: True
+embedding_std_norm: False
+
diff --git a/examples/voxceleb/sv0/local/data.sh b/examples/voxceleb/sv0/local/data.sh
new file mode 100755
index 00000000..ec9c4c58
--- /dev/null
+++ b/examples/voxceleb/sv0/local/data.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+
+stage=-1
+stop_stage=100
+
+. ${MAIN_ROOT}/utils/parse_options.sh || exit -1;
+
+dir=$1
+conf_path=$2
+mkdir -p ${dir}
+
+if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
+    # data prepare for vox1 and vox2, vox2 must be converted from m4a to wav
+    # we should use local/convert.sh to convert m4a to wav
+    python3 local/data_prepare.py \
+        --data-dir ${dir} \
+        --config ${conf_path}
+fi
\ No newline at end of file
diff --git a/examples/voxceleb/sv0/local/data_prepare.py b/examples/voxceleb/sv0/local/data_prepare.py
index b906b5da..19ba41b8 100644
--- a/examples/voxceleb/sv0/local/data_prepare.py
+++ b/examples/voxceleb/sv0/local/data_prepare.py
@@ -14,10 +14,10 @@
 import argparse
 import os

-import numpy as np
 import paddle
+from yacs.config import CfgNode

-from paddleaudio.paddleaudio.datasets.voxceleb import VoxCeleb
+from paddleaudio.datasets.voxceleb import VoxCeleb
 from paddlespeech.s2t.utils.log import Log
 from paddlespeech.vector.io.augment import build_augment_pipeline
 from paddlespeech.vector.training.seeding import seed_everything
@@ -25,46 +25,47 @@ from paddlespeech.vector.training.seeding import seed_everything
 logger = Log(__name__).getlog()


-def main(args):
+def main(args, config):
     # stage0: set the cpu device, all data prepare process will be done in cpu mode
     paddle.set_device("cpu")
     # set the random seed, it is a must for multiprocess training
-    seed_everything(args.seed)
+    seed_everything(config.seed)

     # stage 1: generate the voxceleb csv file
     # Note: this may raise a C++ exception, but the program will execute fine
     # so we ignore the exception
     # we explicitly pass the vox2 base path to data prepare and generate the audio info
+    logger.info("start to generate the voxceleb dataset info")
     train_dataset = VoxCeleb(
-        'train', target_dir=args.data_dir, vox2_base_path=args.vox2_base_path)
-    dev_dataset = VoxCeleb(
-        'dev', target_dir=args.data_dir, vox2_base_path=args.vox2_base_path)
+        'train', target_dir=args.data_dir, vox2_base_path=config.vox2_base_path)

     # stage 2: generate the augment noise csv file
-    if args.augment:
+    if config.augment:
+        logger.info("start to generate the augment dataset info")
         augment_pipeline = build_augment_pipeline(target_dir=args.data_dir)


 if __name__ == "__main__":
     # yapf: disable
     parser = argparse.ArgumentParser(__doc__)
-    parser.add_argument("--seed",
-                        default=0,
-                        type=int,
-                        help="random seed for paddle, numpy and python random package")
     parser.add_argument("--data-dir",
                         default="./data/",
                         type=str,
                         help="data directory")
-    parser.add_argument("--vox2-base-path",
+    parser.add_argument("--config",
                         default=None,
                         type=str,
-                        help="vox2 base path, where is store the wav audio")
-    parser.add_argument("--augment",
-                        action="store_true",
-                        default=False,
-                        help="Apply audio
augments.") + help="configuration file") args = parser.parse_args() # yapf: enable - main(args) + + # https://yaml.org/type/float.html + config = CfgNode(new_allowed=True) + if args.config: + config.merge_from_file(args.config) + + config.freeze() + print(config) + + main(args, config) diff --git a/examples/voxceleb/sv0/local/emb.sh b/examples/voxceleb/sv0/local/emb.sh new file mode 100755 index 00000000..482e658e --- /dev/null +++ b/examples/voxceleb/sv0/local/emb.sh @@ -0,0 +1,13 @@ +#!/bin/bash +. ./path.sh + +exp_dir=exp/ecapa-tdnn-vox12-big//epoch_10/ # experiment directory +conf_path=conf/ecapa_tdnn.yaml +audio_path="demo/voxceleb/00001.wav" + +source ${MAIN_ROOT}/utils/parse_options.sh || exit 1; + +# extract the audio embedding +python3 ${BIN_DIR}/extract_emb.py --device "gpu" \ + --config ${conf_path} \ + --audio-path ${audio_path} --load-checkpoint ${exp_dir} \ No newline at end of file diff --git a/examples/voxceleb/sv0/local/test.sh b/examples/voxceleb/sv0/local/test.sh new file mode 100644 index 00000000..d8a1a0ba --- /dev/null +++ b/examples/voxceleb/sv0/local/test.sh @@ -0,0 +1,8 @@ +dir=$1 +exp_dir=$2 +conf_path=$3 + +python3 ${BIN_DIR}/test.py \ + --config ${conf_path} \ + --data-dir ${dir} \ + --load-checkpoint ${exp_dir} \ No newline at end of file diff --git a/examples/voxceleb/sv0/local/train.sh b/examples/voxceleb/sv0/local/train.sh new file mode 100755 index 00000000..385e8caa --- /dev/null +++ b/examples/voxceleb/sv0/local/train.sh @@ -0,0 +1,22 @@ +#!/bin/bash + +dir=$1 +exp_dir=$2 +conf_path=$3 + +ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') +echo "using $ngpu gpus..." + +# train the speaker identification task with voxceleb data +# Note: we will store the log file in exp/log directory +python3 -m paddle.distributed.launch --gpus=$CUDA_VISIBLE_DEVICES \ + ${BIN_DIR}/train.py --device "gpu" --checkpoint-dir ${exp_dir} --augment \ + --data-dir ${dir} --config ${conf_path} + + +if [ $? -ne 0 ]; then + echo "Failed in training!" 
+    exit 1
+fi
+
+exit 0
\ No newline at end of file
diff --git a/examples/voxceleb/sv0/run.sh b/examples/voxceleb/sv0/run.sh
index c5dc3dd2..e38027e9 100755
--- a/examples/voxceleb/sv0/run.sh
+++ b/examples/voxceleb/sv0/run.sh
@@ -18,7 +18,7 @@ set -e
 #######################################################################
 # stage 0: data prepare, including voxceleb1 download and generate {train,dev,enroll,test}.csv
-# voxceleb2 data is m4a format, so we need user to convert the m4a to wav yourselves as described in Readme.md
+# voxceleb2 data is in m4a format, so users need to convert it to wav themselves, as described in Readme.md, with the script local/convert.sh
 # stage 1: train the speaker identification model
 # stage 2: test speaker identification
 # stage 3: extract the training embedding to train the LDA and PLDA
@@ -30,49 +30,39 @@ set -e
 # and put all of them to ${PPAUDIO_HOME}/datasets/vox2
 # we will find the wav from ${PPAUDIO_HOME}/datasets/vox1/wav and ${PPAUDIO_HOME}/datasets/vox2/wav
 # export PPAUDIO_HOME=
-
 stage=0
+stop_stage=50
+
 # data directory
 # if we set the variable ${dir}, we will store the wav info to this directory
 # otherwise, we will store the wav info to vox1 and vox2 directory respectively
-dir=data/
-exp_dir=exp/ecapa-tdnn/ # experiment directory
-
 # vox2 wav path, we must convert the m4a format to wav format
-# and store them in the ${PPAUDIO_HOME}/datasets/vox2/wav/ directory
-vox2_base_path=${PPAUDIO_HOME}/datasets/vox2/wav/
-mkdir -p ${dir}
+# dir=data-demo/ # data info directory
+dir=demo/ # data info directory
+
+exp_dir=exp/ecapa-tdnn-vox12-big// # experiment directory
+conf_path=conf/ecapa_tdnn.yaml
+gpus=0,1,2,3
+
+source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
+
 mkdir -p ${exp_dir}

-if [ $stage -le 0 ]; then
+if [ $stage -le 0 ] && [ ${stop_stage} -ge 0 ]; then
     # stage 0: data prepare for vox1 and vox2, vox2 must be converted from m4a to wav
-    python3 local/data_prepare.py \
-        --data-dir ${dir} --augment --vox2-base-path ${vox2_base_path} \
-        --config conf/ecapa_tdnn.yaml
+    # and we should specify the vox2 data in the data.sh
+    bash ./local/data.sh ${dir} ${conf_path}|| exit -1;
 fi

-if [ $stage -le 1 ]; then
+if [ $stage -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     # stage 1: train the speaker identification model
-    python3 \
-        -m paddle.distributed.launch --gpus=0,1,2,3 \
-        ${BIN_DIR}/train.py --device "gpu" --checkpoint-dir ${exp_dir} --augment \
-        --data-dir ${dir} --config conf/ecapa_tdnn.yaml
+    CUDA_VISIBLE_DEVICES=${gpus} bash ./local/train.sh ${dir} ${exp_dir} ${conf_path}
 fi

 if [ $stage -le 2 ]; then
-    # stage 1: get the speaker verification scores with cosine function
-    python3 \
-        ${BIN_DIR}/speaker_verification_cosine.py\
-        --config conf/ecapa_tdnn.yaml \
-        --data-dir ${dir} --load-checkpoint ${exp_dir}/epoch_10/
-fi
-
-if [ $stage -le 3 ]; then
-    # stage 3: extract the audio embedding
-    python3 \
-        ${BIN_DIR}/extract_speaker_embedding.py\
-        --config conf/ecapa_tdnn.yaml \
-        --audio-path "demo/csv/00001.wav" --load-checkpoint ${exp_dir}/epoch_60/
+    # stage 2: get the speaker verification scores with cosine function
+    # now we only support use cosine to get the scores
+    CUDA_VISIBLE_DEVICES=0 bash ./local/test.sh ${dir} ${exp_dir} ${conf_path}
 fi

 # if [ $stage -le 3 ]; then
diff --git a/paddleaudio/paddleaudio/datasets/rirs_noises.py b/paddleaudio/paddleaudio/datasets/rirs_noises.py
index df5dec61..80bb2d74 100644
--- a/paddleaudio/paddleaudio/datasets/rirs_noises.py
+++ b/paddleaudio/paddleaudio/datasets/rirs_noises.py
@@ -25,13 +25,10
@@ from tqdm import tqdm from ..backends import load as load_audio from ..backends import save as save_wav -from .dataset import feat_funcs from ..utils import DATA_HOME from ..utils import decompress -from paddlespeech.s2t.utils.log import Log -from paddlespeech.vector.utils.download import download_and_decompress - -logger = Log(__name__).getlog() +from ..utils.download import download_and_decompress +from .dataset import feat_funcs __all__ = ['OpenRIRNoise'] @@ -80,17 +77,17 @@ class OpenRIRNoise(Dataset): def _get_data(self): # Download audio files. - logger.info(f"rirs noises base path: {self.base_path}") + print(f"rirs noises base path: {self.base_path}") if not os.path.isdir(self.base_path): download_and_decompress( self.archieves, self.base_path, decompress=True) else: - logger.info( + print( f"{self.base_path} already exists, we will not download and decompress again" ) # Data preparation. - logger.info(f"prepare the csv to {self.csv_path}") + print(f"prepare the csv to {self.csv_path}") if not os.path.isdir(self.csv_path): os.makedirs(self.csv_path) self.prepare_data() @@ -161,7 +158,7 @@ class OpenRIRNoise(Dataset): wav_files: List[str], output_file: str, split_chunks: bool=True): - logger.info(f'Generating csv: {output_file}') + print(f'Generating csv: {output_file}') header = ["id", "duration", "wav"] infos = list( diff --git a/paddleaudio/paddleaudio/datasets/voxceleb.py b/paddleaudio/paddleaudio/datasets/voxceleb.py index f8d634f2..b9b8c271 100644 --- a/paddleaudio/paddleaudio/datasets/voxceleb.py +++ b/paddleaudio/paddleaudio/datasets/voxceleb.py @@ -28,13 +28,8 @@ from tqdm import tqdm from ..backends import load as load_audio from ..utils import DATA_HOME from ..utils import decompress +from ..utils.download import download_and_decompress from .dataset import feat_funcs -from paddlespeech.s2t.utils.log import Log -from paddlespeech.vector.utils.download import download_and_decompress -from utils.utility import download -from utils.utility import unpack - -logger = Log(__name__).getlog() __all__ = ['VoxCeleb'] @@ -138,9 +133,9 @@ class VoxCeleb(Dataset): # Download audio files. # We need the users to decompress all vox1/dev/wav and vox1/test/wav/ to vox1/wav/ dir # so, we check the vox1/wav dir status - logger.info(f"wav base path: {self.wav_path}") + print(f"wav base path: {self.wav_path}") if not os.path.isdir(self.wav_path): - logger.info(f"start to download the voxceleb1 dataset") + print(f"start to download the voxceleb1 dataset") download_and_decompress( # multi-zip parts concatenate to vox1_dev_wav.zip self.archieves_audio_dev, self.base_path, @@ -152,7 +147,7 @@ class VoxCeleb(Dataset): # Download all parts and concatenate the files into one zip file. dev_zipfile = os.path.join(self.base_path, 'vox1_dev_wav.zip') - logger.info(f'Concatenating all parts to: {dev_zipfile}') + print(f'Concatenating all parts to: {dev_zipfile}') os.system( f'cat {os.path.join(self.base_path, "vox1_dev_wav_parta*")} > {dev_zipfile}' ) @@ -162,6 +157,7 @@ class VoxCeleb(Dataset): # Download meta files. 
if not os.path.isdir(self.meta_path): + print("prepare the meta data") download_and_decompress( self.archieves_meta, self.meta_path, decompress=False) @@ -171,7 +167,7 @@ class VoxCeleb(Dataset): self.prepare_data() data = [] - logger.info( + print( f"read the {self.subset} from {os.path.join(self.csv_path, f'{self.subset}.csv')}" ) with open(os.path.join(self.csv_path, f'{self.subset}.csv'), 'r') as rf: @@ -266,8 +262,8 @@ class VoxCeleb(Dataset): wav_files: List[str], output_file: str, split_chunks: bool=True): - logger.info(f'Generating csv: {output_file}') - header = ["id", "duration", "wav", "start", "stop", "spk_id"] + print(f'Generating csv: {output_file}') + header = ["ID", "duration", "wav", "start", "stop", "spk_id"] # Note: this may occurs c++ execption, but the program will execute fine # so we can ignore the execption with Pool(cpu_count()) as p: @@ -290,7 +286,7 @@ class VoxCeleb(Dataset): def prepare_data(self): # Audio of speakers in veri_test_file should not be included in training set. - logger.info("start to prepare the data csv file") + print("start to prepare the data csv file") enroll_files = set() test_files = set() # get the enroll and test audio file path @@ -311,13 +307,13 @@ class VoxCeleb(Dataset): # get all the train and dev audios file path audio_files = [] speakers = set() + print("Getting file list...") for path in [self.wav_path, self.vox2_base_path]: # if vox2 directory is not set and vox2 is not a directory # we will not process this directory if not path or not os.path.exists(path): - logger.warning( - f"{path} is an invalid path, please check again, " - "and we will ignore the vox2 base path") + print(f"{path} is an invalid path, please check again, " + "and we will ignore the vox2 base path") continue for file in glob.glob( os.path.join(path, "**", "*.wav"), recursive=True): @@ -327,7 +323,7 @@ class VoxCeleb(Dataset): speakers.add(spk) audio_files.append(file) - logger.info( + print( f"start to generate the {os.path.join(self.meta_path, 'spk_id2label.txt')}" ) # encode the train and dev speakers label to spk_id2label.txt diff --git a/paddleaudio/paddleaudio/utils/download.py b/paddleaudio/paddleaudio/utils/download.py index 4658352f..07d5eea8 100644 --- a/paddleaudio/paddleaudio/utils/download.py +++ b/paddleaudio/paddleaudio/utils/download.py @@ -37,7 +37,9 @@ def decompress(file: str): download._decompress(file) -def download_and_decompress(archives: List[Dict[str, str]], path: str): +def download_and_decompress(archives: List[Dict[str, str]], + path: str, + decompress: bool=True): """ Download archieves and decompress to specific path. 
""" @@ -47,8 +49,8 @@ def download_and_decompress(archives: List[Dict[str, str]], path: str): for archive in archives: assert 'url' in archive and 'md5' in archive, \ 'Dictionary keys of "url" and "md5" are required in the archive, but got: {list(archieve.keys())}' - - download.get_path_from_url(archive['url'], path, archive['md5']) + download.get_path_from_url( + archive['url'], path, archive['md5'], decompress=decompress) def load_state_dict_from_url(url: str, path: str, md5: str=None): diff --git a/paddlespeech/vector/exps/ecapa_tdnn/extract_emb.py b/paddlespeech/vector/exps/ecapa_tdnn/extract_emb.py index 44cbd204..0d09d211 100644 --- a/paddlespeech/vector/exps/ecapa_tdnn/extract_emb.py +++ b/paddlespeech/vector/exps/ecapa_tdnn/extract_emb.py @@ -14,12 +14,13 @@ import argparse import os +import time import numpy as np import paddle from yacs.config import CfgNode -from paddleaudio.paddleaudio.backends import load as load_audio -from paddleaudio.paddleaudio.compliance.librosa import melspectrogram +from paddleaudio.backends import load as load_audio +from paddleaudio.compliance.librosa import melspectrogram from paddlespeech.s2t.utils.log import Log from paddlespeech.vector.io.batch import feature_normalize from paddlespeech.vector.models.ecapa_tdnn import EcapaTdnn @@ -39,7 +40,7 @@ def extract_audio_embedding(args, config): ecapa_tdnn = EcapaTdnn(**config.model) # stage4: build the speaker verification train instance with backbone model - model = SpeakerIdetification(backbone=ecapa_tdnn, num_class=1211) + model = SpeakerIdetification(backbone=ecapa_tdnn, num_class=config.num_speakers) # stage 2: load the pre-trained model args.load_checkpoint = os.path.abspath( os.path.expanduser(args.load_checkpoint)) @@ -60,7 +61,12 @@ def extract_audio_embedding(args, config): # feat type is numpy array, whose shape is [dim, time] # we need convert the audio feat to one-batch shape [batch, dim, time], where the batch is one # so the final shape is [1, dim, time] - feat = melspectrogram(x=waveform, **config.feature) + start_time = time.time() + feat = melspectrogram(x=waveform, + sr=config.sample_rate, + n_mels=config.n_mels, + window_size=config.window_size, + hop_length=config.hop_length) feat = paddle.to_tensor(feat).unsqueeze(0) # in inference period, the lengths is all one without padding @@ -71,9 +77,13 @@ def extract_audio_embedding(args, config): # model backbone network forward the feats and get the embedding embedding = model.backbone( feat, lengths).squeeze().numpy() # (1, emb_size, 1) -> (emb_size) + elapsed_time = time.time() - start_time + audio_length = waveform.shape[0] / sr + # stage 5: do global norm with external mean and std - # todo + rtf = elapsed_time / audio_length + logger.info(f"{args.device} rft={rtf}") return embedding @@ -92,10 +102,6 @@ if __name__ == "__main__": type=str, default='', help="Directory to load model checkpoint to contiune trainning.") - parser.add_argument("--global-embedding-norm", - type=str, - default=None, - help="Apply global normalization on speaker embeddings.") parser.add_argument("--audio-path", default="./data/demo.wav", type=str, diff --git a/paddlespeech/vector/exps/ecapa_tdnn/speaker_verification_cosine.py b/paddlespeech/vector/exps/ecapa_tdnn/test.py similarity index 82% rename from paddlespeech/vector/exps/ecapa_tdnn/speaker_verification_cosine.py rename to paddlespeech/vector/exps/ecapa_tdnn/test.py index 781bf2a5..03757033 100644 --- a/paddlespeech/vector/exps/ecapa_tdnn/speaker_verification_cosine.py +++ 
@@ -23,8 +23,8 @@
 from paddle.io import DataLoader
 from tqdm import tqdm
 from yacs.config import CfgNode

-from paddleaudio.paddleaudio.datasets import VoxCeleb
-from paddleaudio.paddleaudio.metric import compute_eer
+from paddleaudio.datasets import VoxCeleb
+from paddleaudio.metric import compute_eer
 from paddlespeech.s2t.utils.log import Log
 from paddlespeech.vector.io.batch import batch_feature_normalize
 from paddlespeech.vector.models.ecapa_tdnn import EcapaTdnn
@@ -48,6 +48,9 @@ def main(args, config):
         backbone=ecapa_tdnn, num_class=config.num_speakers)

     # stage3: load the pre-trained model
+    # we get the last model from the epoch and save_interval
+    last_save_epoch = (config.epochs // config.save_interval) * config.save_interval
+    args.load_checkpoint = os.path.join(args.load_checkpoint, "epoch_" + str(last_save_epoch))
     args.load_checkpoint = os.path.abspath(
         os.path.expanduser(args.load_checkpoint))
@@ -63,7 +66,9 @@ def main(args, config):
         target_dir=args.data_dir,
         feat_type='melspectrogram',
         random_chunk=False,
-        **config.feature)
+        n_mels=config.n_mels,
+        window_size=config.window_size,
+        hop_length=config.hop_length)
     enroll_sampler = BatchSampler(
         enroll_dataset, batch_size=config.batch_size,
         shuffle=True)  # Shuffle to make embedding normalization more robust.
@@ -73,13 +78,14 @@ def main(args, config):
             x, mean_norm=True, std_norm=False),
         num_workers=config.num_workers,
         return_list=True,)
-
     test_dataset = VoxCeleb(
         subset='test',
         target_dir=args.data_dir,
         feat_type='melspectrogram',
         random_chunk=False,
-        **config.feature)
+        n_mels=config.n_mels,
+        window_size=config.window_size,
+        hop_length=config.hop_length)
     test_sampler = BatchSampler(
         test_dataset, batch_size=config.batch_size, shuffle=True)
@@ -89,19 +95,19 @@ def main(args, config):
             x, mean_norm=True, std_norm=False),
         num_workers=config.num_workers,
         return_list=True,)
-    # stage6: we must set the model to eval mode
+    # stage5: we must set the model to eval mode
     model.eval()

-    # stage7: global embedding norm to imporve the performance
-    print("global embedding norm: {}".format(args.global_embedding_norm))
-    if args.global_embedding_norm:
+    # stage6: global embedding norm to improve the performance
+    logger.info(f"global embedding norm: {config.global_embedding_norm}")
+    if config.global_embedding_norm:
         global_embedding_mean = None
         global_embedding_std = None
-        mean_norm_flag = args.embedding_mean_norm
-        std_norm_flag = args.embedding_std_norm
+        mean_norm_flag = config.embedding_mean_norm
+        std_norm_flag = config.embedding_std_norm
         batch_count = 0

-    # stage8: Compute embeddings of audios in enrol and test dataset from model.
+    # stage7: Compute embeddings of audios in enrol and test dataset from model.
     id2embedding = {}
     # Run multi times to make embedding normalization more stable.
     for i in range(2):
@@ -121,7 +127,7 @@ def main(args, config):
             # Global embedding normalization.
             # if we use the global embedding norm,
             # eer can reduce by about 10% relative
-            if args.global_embedding_norm:
+            if config.global_embedding_norm:
                 batch_count += 1
                 current_mean = embeddings.mean(
                     axis=0) if mean_norm_flag else 0
@@ -145,21 +151,22 @@ def main(args, config):
     # Update embedding dict.
     id2embedding.update(dict(zip(ids, embeddings)))

-    # stage 9: Compute cosine scores.
+    # stage 8: Compute cosine scores.
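# Aside: a toy version of the cosine scoring below (random embeddings and a
# two-line trial list; the real code reads VoxCeleb.veri_test_file instead).
import numpy as np

def cosine_score(enroll, test):
    return float(np.dot(enroll, test) /
                 (np.linalg.norm(enroll) * np.linalg.norm(test)))

id2embedding = {uttid: np.random.randn(192) for uttid in ("id1-a", "id2-b")}
trials = [(1, "id1-a", "id1-a"), (0, "id1-a", "id2-b")]
labels = [label for label, _, _ in trials]
scores = [cosine_score(id2embedding[e], id2embedding[t])
          for _, e, t in trials]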
     labels = []
-    enrol_ids = []
+    enroll_ids = []
     test_ids = []
+    logger.info(f"read the trial from {VoxCeleb.veri_test_file}")
     with open(VoxCeleb.veri_test_file, 'r') as f:
         for line in f.readlines():
-            label, enrol_id, test_id = line.strip().split(' ')
+            label, enroll_id, test_id = line.strip().split(' ')
             labels.append(int(label))
-            enrol_ids.append(enrol_id.split('.')[0].replace('/', '--'))
-            test_ids.append(test_id.split('.')[0].replace('/', '--'))
+            enroll_ids.append(enroll_id.split('.')[0].replace('/', '-'))
+            test_ids.append(test_id.split('.')[0].replace('/', '-'))

     cos_sim_func = paddle.nn.CosineSimilarity(axis=1)
     enrol_embeddings, test_embeddings = map(lambda ids: paddle.to_tensor(
-        np.asarray([id2embedding[id] for id in ids], dtype='float32')),
-        [enrol_ids, test_ids
+        np.asarray([id2embedding[uttid] for uttid in ids], dtype='float32')),
+        [enroll_ids, test_ids
         ])  # (N, emb_size)
     scores = cos_sim_func(enrol_embeddings, test_embeddings)
     EER, threshold = compute_eer(np.asarray(labels), scores.numpy())
@@ -187,17 +194,6 @@ if __name__ == "__main__":
                        type=str,
                        default='',
                        help="Directory to load model checkpoint to continue training.")
-    parser.add_argument("--global-embedding-norm",
-                        default=False,
-                        action="store_true",
-                        help="Apply global normalization on speaker embeddings.")
-    parser.add_argument("--embedding-mean-norm",
-                        default=True,
-                        help="Apply mean normalization on speaker embeddings.")
-    parser.add_argument("--embedding-std-norm",
-                        type=bool,
-                        default=False,
-                        help="Apply std normalization on speaker embeddings.")
     args = parser.parse_args()
     # yapf: enable

     # https://yaml.org/type/float.html
diff --git a/paddlespeech/vector/exps/ecapa_tdnn/train.py b/paddlespeech/vector/exps/ecapa_tdnn/train.py
index cb20ef16..0d62c69d 100644
--- a/paddlespeech/vector/exps/ecapa_tdnn/train.py
+++ b/paddlespeech/vector/exps/ecapa_tdnn/train.py
@@ -21,8 +21,8 @@
 from paddle.io import DataLoader
 from paddle.io import DistributedBatchSampler
 from yacs.config import CfgNode

-from paddleaudio.paddleaudio.compliance.librosa import melspectrogram
-from paddleaudio.paddleaudio.datasets.voxceleb import VoxCeleb
+from paddleaudio.compliance.librosa import melspectrogram
+from paddleaudio.datasets.voxceleb import VoxCeleb
 from paddlespeech.s2t.utils.log import Log
 from paddlespeech.vector.io.augment import build_augment_pipeline
 from paddlespeech.vector.io.augment import waveform_augment
@@ -68,6 +68,8 @@ def main(args, config):
         backbone=ecapa_tdnn, num_class=VoxCeleb.num_speakers)

     # stage5: build the optimizer, we now only construct the AdamW optimizer
+    # 140000 is the number of single-gpu steps
+    # so, in multi-gpu mode, we reduce the step_size to 140000//nranks to enable CyclicLRScheduler
     lr_schedule = CyclicLRScheduler(
         base_lr=config.learning_rate, max_lr=1e-3, step_size=140000 // nranks)
     optimizer = paddle.optimizer.AdamW(
@@ -138,6 +140,10 @@ def main(args, config):
             waveforms, labels = batch['waveforms'], batch['labels']

             # stage 9-2: audio sample augment method, which is done on the audio sample point
+            # the original waveform and the augmented waveforms are concatenated in one batch
+            # e.g. with five augment methods in the augment pipeline,
+            # the final data size is batch_size * (five + one)
+            # -> five augmented waveform batches plus one original waveform batch
             if len(augment_pipeline) != 0:
                 waveforms = waveform_augment(waveforms, augment_pipeline)
                 labels = paddle.concat(
@@ -146,7 +152,11 @@ def main(args, config):
             # stage 9-3: extract the audio feats, such as fbank, mfcc, spectrogram
             feats = []
             for waveform in waveforms.numpy():
-                feat = melspectrogram(x=waveform, **config.feature)
+                feat = melspectrogram(x=waveform,
+                                      sr=config.sample_rate,
+                                      n_mels=config.n_mels,
+                                      window_size=config.window_size,
+                                      hop_length=config.hop_length)
                 feats.append(feat)
             feats = paddle.to_tensor(np.asarray(feats))
@@ -205,7 +215,7 @@ def main(args, config):
         # stage 9-12: construct the valid dataset dataloader
         dev_sampler = BatchSampler(
             dev_dataset,
-            batch_size=config.batch_size // 4,
+            batch_size=config.batch_size,
             shuffle=False,
             drop_last=False)
         dev_loader = DataLoader(
@@ -228,8 +238,11 @@ def main(args, config):
                 feats = []
                 for waveform in waveforms.numpy():
-                    # feat = melspectrogram(x=waveform, **cpu_feat_conf)
-                    feat = melspectrogram(x=waveform, **config.feature)
+                    feat = melspectrogram(x=waveform,
+                                          sr=config.sample_rate,
+                                          n_mels=config.n_mels,
+                                          window_size=config.window_size,
+                                          hop_length=config.hop_length)
                     feats.append(feat)
                 feats = paddle.to_tensor(np.asarray(feats))
diff --git a/paddlespeech/vector/io/augment.py b/paddlespeech/vector/io/augment.py
index f40ce41b..6e508c37 100644
--- a/paddlespeech/vector/io/augment.py
+++ b/paddlespeech/vector/io/augment.py
@@ -22,8 +22,8 @@
 import paddle
 import paddle.nn as nn
 import paddle.nn.functional as F

-from paddleaudio.paddleaudio import load as load_audio
-from paddleaudio.paddleaudio.datasets.rirs_noises import OpenRIRNoise
+from paddleaudio import load as load_audio
+from paddleaudio.datasets.rirs_noises import OpenRIRNoise
 from paddlespeech.s2t.utils.log import Log
 from paddlespeech.vector.io.signal_processing import compute_amplitude
 from paddlespeech.vector.io.signal_processing import convolve1d
@@ -879,14 +879,18 @@ def waveform_augment(waveforms: paddle.Tensor,
     """process the augment pipeline and return all the waveforms

     Args:
-        waveforms (paddle.Tensor): _description_
-        augment_pipeline (List[paddle.nn.Layer]): _description_
+        waveforms (paddle.Tensor): original batch waveform
+        augment_pipeline (List[paddle.nn.Layer]): augment pipeline to apply
     Returns:
-        paddle.Tensor: _description_
+        paddle.Tensor: all the audio waveforms, including the original and the augmented ones
     """
+    # stage 0: store the original waveforms
     waveforms_aug_list = [waveforms]
+
+    # augment the original batch waveform
     for aug in augment_pipeline:
+        # stage 1: augment the data
         waveforms_aug = aug(waveforms)  # (N, L)
         if waveforms_aug.shape[1] >= waveforms.shape[1]:
             # Trunc
@@ -897,6 +901,8 @@ def waveform_augment(waveforms: paddle.Tensor,
             waveforms_aug = F.pad(
                 waveforms_aug.unsqueeze(-1), [0, lengths_to_pad],
                 data_format='NLC').squeeze(-1)
+        # stage 2: append the augmented waveform into the list
         waveforms_aug_list.append(waveforms_aug)

+    # get all the waveforms
     return paddle.concat(waveforms_aug_list, axis=0)
diff --git a/paddlespeech/vector/utils/download.py b/paddlespeech/vector/utils/download.py
deleted file mode 100644
index 476bfea7..00000000
--- a/paddlespeech/vector/utils/download.py
+++ /dev/null
@@ -1,72 +0,0 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# -# Licensed under the Apache License, Version 2.0 (the "License" -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import os -from typing import Dict -from typing import List - -from paddle.framework import load as load_state_dict -from paddle.utils import download - -__all__ = [ - 'decompress', - 'download_and_decompress', - 'load_state_dict_from_url', -] - - -def decompress(file: str, path: str=os.PathLike): - """ - Extracts all files from a compressed file to specific path. - """ - assert os.path.isfile(file), "File: {} not exists.".format(file) - - if path is None: - print("decompress the data: {}".format(file)) - download._decompress(file) - else: - print("decompress the data: {} to {}".format(file, path)) - if not os.path.isdir(path): - os.makedirs(path) - - tmp_file = os.path.join(path, os.path.basename(file)) - os.rename(file, tmp_file) - download._decompress(tmp_file) - os.rename(tmp_file, file) - - -def download_and_decompress(archives: List[Dict[str, str]], - path: str, - decompress: bool=True): - """ - Download archieves and decompress to specific path. - """ - if not os.path.isdir(path): - os.makedirs(path) - - for archive in archives: - assert 'url' in archive and 'md5' in archive, \ - 'Dictionary keys of "url" and "md5" are required in the archive, but got: {list(archieve.keys())}' - download.get_path_from_url( - archive['url'], path, archive['md5'], decompress=decompress) - - -def load_state_dict_from_url(url: str, path: str, md5: str=None): - """ - Download and load a state dict from url - """ - if not os.path.isdir(path): - os.makedirs(path) - - download.get_path_from_url(url, path, md5) - return load_state_dict(os.path.join(path, os.path.basename(url))) From d28ccfa96b7195068e335bddd53941eacb9203f1 Mon Sep 17 00:00:00 2001 From: xiongxinlei Date: Mon, 21 Mar 2022 00:06:16 +0800 Subject: [PATCH 30/41] add vector cli component, test=doc --- examples/voxceleb/sv0/conf/ecapa_tdnn.yaml | 4 +- examples/voxceleb/sv0/local/data.sh | 27 +- examples/voxceleb/sv0/local/emb.sh | 50 ++- examples/voxceleb/sv0/local/test.sh | 42 ++- examples/voxceleb/sv0/local/train.sh | 49 ++- examples/voxceleb/sv0/run.sh | 12 +- paddleaudio/paddleaudio/metric/eer.py | 14 +- paddlespeech/cli/__init__.py | 1 + paddlespeech/cli/vector/__init__.py | 14 + paddlespeech/cli/vector/infer.py | 345 ++++++++++++++++++ .../vector/exps/ecapa_tdnn/extract_emb.py | 6 +- paddlespeech/vector/exps/ecapa_tdnn/test.py | 7 +- paddlespeech/vector/exps/ecapa_tdnn/train.py | 49 ++- paddlespeech/vector/io/batch.py | 92 ++++- paddlespeech/vector/modules/loss.py | 24 ++ paddlespeech/vector/modules/sid_model.py | 26 ++ 16 files changed, 712 insertions(+), 50 deletions(-) create mode 100644 paddlespeech/cli/vector/__init__.py create mode 100644 paddlespeech/cli/vector/infer.py diff --git a/examples/voxceleb/sv0/conf/ecapa_tdnn.yaml b/examples/voxceleb/sv0/conf/ecapa_tdnn.yaml index 0f4bf189..e58dca82 100644 --- a/examples/voxceleb/sv0/conf/ecapa_tdnn.yaml +++ b/examples/voxceleb/sv0/conf/ecapa_tdnn.yaml @@ -14,10 +14,10 @@ random_chunk: True # FEATURE EXTRACTION SETTING # 
###########################################################
 # currently, we only support fbank
-sample_rate: 16000
+sr: 16000 # sample rate
 n_mels: 80
 window_size: 400 #25ms, sample rate 16000, 25 * 16000 / 1000 = 400
-hop_length: 160 #10ms, sample rate 16000, 10 * 16000 / 1000 = 160
+hop_size: 160 #10ms, sample rate 16000, 10 * 16000 / 1000 = 160

 ###########################################################
 # MODEL SETTING #
diff --git a/examples/voxceleb/sv0/local/data.sh b/examples/voxceleb/sv0/local/data.sh
index ec9c4c58..42629c69 100755
--- a/examples/voxceleb/sv0/local/data.sh
+++ b/examples/voxceleb/sv0/local/data.sh
@@ -1,15 +1,36 @@
 #!/bin/bash
-
-stage=-1
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+stage=0
 stop_stage=100

 . ${MAIN_ROOT}/utils/parse_options.sh || exit -1;

+if [ $# -ne 2 ] ; then
+    echo "Usage: $0 [options] ";
+    echo "e.g.: $0 ./data/ conf/ecapa_tdnn.yaml"
+    echo "Options: "
+    echo "  --stage                  # Used to run a partially-completed data process from somewhere in the middle."
+    echo "  --stop-stage             # Used to run a partially-completed data process stop stage in the middle"
+    exit 1;
+fi
+
 dir=$1
 conf_path=$2
 mkdir -p ${dir}

-if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
     # data prepare for vox1 and vox2, vox2 must be converted from m4a to wav
     # we should use local/convert.sh to convert m4a to wav
     python3 local/data_prepare.py \
diff --git a/examples/voxceleb/sv0/local/emb.sh b/examples/voxceleb/sv0/local/emb.sh
index 482e658e..31d79e52 100755
--- a/examples/voxceleb/sv0/local/emb.sh
+++ b/examples/voxceleb/sv0/local/emb.sh
@@ -1,13 +1,51 @@
 #!/bin/bash
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 . ./path.sh

+stage=0
+stop_stage=100
 exp_dir=exp/ecapa-tdnn-vox12-big/ # experiment directory
 conf_path=conf/ecapa_tdnn.yaml
 audio_path="demo/voxceleb/00001.wav"
+use_gpu=true
+
+. ${MAIN_ROOT}/utils/parse_options.sh || exit -1;
+
+if [ $# -ne 0 ] ; then
+    echo "Usage: $0 [options]";
+    echo "e.g.: $0 ./data/ exp/voxceleb12/ conf/ecapa_tdnn.yaml"
+    echo "Options: "
+    echo "  --use-gpu                # specify if gpu is to be used for training"
+    echo "  --stage                  # Used to run a partially-completed data process from somewhere in the middle."
+    echo "  --stop-stage             # Used to run a partially-completed data process stop stage in the middle"
+    echo "  --exp-dir                # experiment directory, which has the model.pdparams"
+    echo "  --conf-path              # configuration file for extracting the embedding"
+    echo "  --audio-path             # audio-path, which will be processed to extract the embedding"
+    exit 1;
+fi

-source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
+# set the test device
+device="cpu"
+if ${use_gpu}; then
+    device="gpu"
+fi

-# extract the audio embedding
-python3 ${BIN_DIR}/extract_emb.py --device "gpu" \
-        --config ${conf_path} \
-        --audio-path ${audio_path} --load-checkpoint ${exp_dir}
\ No newline at end of file
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    # extract the audio embedding
+    python3 ${BIN_DIR}/extract_emb.py --device ${device} \
+            --config ${conf_path} \
+            --audio-path ${audio_path} --load-checkpoint ${exp_dir}
+fi
\ No newline at end of file
diff --git a/examples/voxceleb/sv0/local/test.sh b/examples/voxceleb/sv0/local/test.sh
index d8a1a0ba..4460a165 100644
--- a/examples/voxceleb/sv0/local/test.sh
+++ b/examples/voxceleb/sv0/local/test.sh
@@ -1,8 +1,42 @@
+#!/bin/bash
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+stage=1
+stop_stage=100
+use_gpu=true # if true, we run on GPU.
+
+. ${MAIN_ROOT}/utils/parse_options.sh || exit -1;
+
+if [ $# -ne 3 ] ; then
+    echo "Usage: $0 [options] ";
+    echo "e.g.: $0 ./data/ exp/voxceleb12/ conf/ecapa_tdnn.yaml"
+    echo "Options: "
+    echo "  --use-gpu                # specify if gpu is to be used for training"
+    echo "  --stage                  # Used to run a partially-completed data process from somewhere in the middle."
+    echo "  --stop-stage             # Used to run a partially-completed data process stop stage in the middle"
+    exit 1;
+fi
+
 dir=$1
 exp_dir=$2
 conf_path=$3

-python3 ${BIN_DIR}/test.py \
-       --config ${conf_path} \
-       --data-dir ${dir} \
-       --load-checkpoint ${exp_dir}
\ No newline at end of file
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    # test the model and compute the eer metrics
+    python3 ${BIN_DIR}/test.py \
+            --data-dir ${dir} \
+            --load-checkpoint ${exp_dir} \
+            --config ${conf_path}
+fi
diff --git a/examples/voxceleb/sv0/local/train.sh b/examples/voxceleb/sv0/local/train.sh
index 385e8caa..5477d0a3 100755
--- a/examples/voxceleb/sv0/local/train.sh
+++ b/examples/voxceleb/sv0/local/train.sh
@@ -1,18 +1,57 @@
 #!/bin/bash
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+stage=0
+stop_stage=100
+use_gpu=true # if true, we run on GPU.
+
+. ${MAIN_ROOT}/utils/parse_options.sh || exit -1;
+
+if [ $# -ne 3 ] ; then
+    echo "Usage: $0 [options] ";
+    echo "e.g.: $0 ./data/ exp/voxceleb12/ conf/ecapa_tdnn.yaml"
+    echo "Options: "
+    echo "  --use-gpu                # specify if gpu is to be used for training"
+    echo "  --stage                  # Used to run a partially-completed data process from somewhere in the middle."
+    echo "  --stop-stage             # Used to run a partially-completed data process stop stage in the middle"
+    exit 1;
+fi

 dir=$1
 exp_dir=$2
 conf_path=$3

+# get the number of gpus for training
 ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
 echo "using $ngpu gpus..."

-# train the speaker identification task with voxceleb data
-# Note: we will store the log file in exp/log directory
-python3 -m paddle.distributed.launch --gpus=$CUDA_VISIBLE_DEVICES \
-       ${BIN_DIR}/train.py --device "gpu" --checkpoint-dir ${exp_dir} --augment \
-       --data-dir ${dir} --config ${conf_path}
+# set the training device
+device="cpu"
+if ${use_gpu}; then
+    device="gpu"
+fi
+
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    # train the speaker identification task with voxceleb data
+    # and we will create the trained model parameters in ${exp_dir}/model.pdparams as the soft link
+    # Note: we will store the log file in exp/log directory
+    python3 -m paddle.distributed.launch --gpus=$CUDA_VISIBLE_DEVICES \
+           ${BIN_DIR}/train.py --device ${device} --checkpoint-dir ${exp_dir} \
+           --data-dir ${dir} --config ${conf_path}
+fi

 if [ $? -ne 0 ]; then
     echo "Failed in training!"
diff --git a/examples/voxceleb/sv0/run.sh b/examples/voxceleb/sv0/run.sh
index e38027e9..bbc9e3db 100755
--- a/examples/voxceleb/sv0/run.sh
+++ b/examples/voxceleb/sv0/run.sh
@@ -36,11 +36,10 @@ stop_stage=50

 # data directory
 # if we set the variable ${dir}, we will store the wav info to this directory
 # otherwise, we will store the wav info to vox1 and vox2 directory respectively
-# vox2 wav path, we must convert the m4a format to wav format
-# dir=data-demo/ # data info directory
-dir=demo/ # data info directory
+# vox2 wav path, we must convert the m4a format to wav format
+dir=data/ # data info directory

-exp_dir=exp/ecapa-tdnn-vox12-big// # experiment directory
+exp_dir=exp/ecapa-tdnn-vox12-big/ # experiment directory
 conf_path=conf/ecapa_tdnn.yaml
 gpus=0,1,2,3
@@ -50,16 +49,15 @@ mkdir -p ${exp_dir}

 if [ $stage -le 0 ] && [ ${stop_stage} -ge 0 ]; then
     # stage 0: data prepare for vox1 and vox2, vox2 must be converted from m4a to wav
-    # and we should specify the vox2 data in the data.sh
     bash ./local/data.sh ${dir} ${conf_path}|| exit -1;
-fi
+fi

 if [ $stage -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     # stage 1: train the speaker identification model
     CUDA_VISIBLE_DEVICES=${gpus} bash ./local/train.sh ${dir} ${exp_dir} ${conf_path}
 fi

-if [ $stage -le 2 ]; then
+if [ $stage -le 2 ] && [ ${stop_stage} -ge 2 ]; then
     # stage 2: get the speaker verification scores with cosine function
     # now we only support use cosine to get the scores
     CUDA_VISIBLE_DEVICES=0 bash ./local/test.sh ${dir} ${exp_dir} ${conf_path}
 fi
diff --git a/paddleaudio/paddleaudio/metric/eer.py b/paddleaudio/paddleaudio/metric/eer.py
index 7738987e..a1166d3f 100644
--- a/paddleaudio/paddleaudio/metric/eer.py
+++ b/paddleaudio/paddleaudio/metric/eer.py
@@ -19,9 +19,15 @@ from sklearn.metrics import roc_curve

 def compute_eer(labels: np.ndarray, scores: np.ndarray) -> List[float]:
-    '''
-    Compute EER and return score threshold.
-    '''
+    """Compute EER and return score threshold.
+
+    Args:
+        labels (np.ndarray): the trial label, shape: [N], one-dimension, N refers to the number of samples
+        scores (np.ndarray): the trial scores, shape: [N], one-dimension, N refers to the number of samples
+
+    Returns:
+        List[float]: eer and the specific threshold
+    """
     fpr, tpr, threshold = roc_curve(y_true=labels, y_score=scores)
     fnr = 1 - tpr
     eer_threshold = threshold[np.nanargmin(np.absolute((fnr - fpr)))]
@@ -54,7 +60,7 @@ def compute_minDCF(positive_scores,
         p_target (float, optional): Prior probability of having a target (default 0.01).

     Returns:
-        _type_: min dcf
+        List[float]: min dcf and the specific threshold
     """
     # Computing candidate thresholds
     if len(positive_scores.shape) > 1:
diff --git a/paddlespeech/cli/__init__.py b/paddlespeech/cli/__init__.py
index b526a384..ddf0359b 100644
--- a/paddlespeech/cli/__init__.py
+++ b/paddlespeech/cli/__init__.py
@@ -21,5 +21,6 @@ from .st import STExecutor
 from .stats import StatsExecutor
 from .text import TextExecutor
 from .tts import TTSExecutor
+from .vector import VectorExecutor

 _locale._getdefaultlocale = (lambda *args: ['en_US', 'utf8'])
diff --git a/paddlespeech/cli/vector/__init__.py b/paddlespeech/cli/vector/__init__.py
new file mode 100644
index 00000000..038596af
--- /dev/null
+++ b/paddlespeech/cli/vector/__init__.py
@@ -0,0 +1,14 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from .infer import VectorExecutor
diff --git a/paddlespeech/cli/vector/infer.py b/paddlespeech/cli/vector/infer.py
new file mode 100644
index 00000000..de4d6621
--- /dev/null
+++ b/paddlespeech/cli/vector/infer.py
@@ -0,0 +1,345 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
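# Aside: the intended usage of compute_eer documented above, as a runnable
# sketch with toy labels/scores (sklearn is already a dependency of eer.py).
import numpy as np
from sklearn.metrics import roc_curve

labels = np.array([1, 1, 0, 0])
scores = np.array([0.9, 0.62, 0.4, 0.1])
fpr, tpr, threshold = roc_curve(y_true=labels, y_score=scores)
fnr = 1 - tpr
idx = np.nanargmin(np.absolute(fnr - fpr))
eer, eer_threshold = fpr[idx], threshold[idx]  # rates cross at the EER point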
+import argparse +import os +import sys +from collections import OrderedDict +from typing import List +from typing import Optional +from typing import Union + +import librosa +import numpy as np +import paddle +import soundfile +from yacs.config import CfgNode + +from paddleaudio.backends import load as load_audio +from paddleaudio.compliance.librosa import melspectrogram +from ..download import get_path_from_url +from ..executor import BaseExecutor +from ..log import logger +from ..utils import cli_register +from ..utils import download_and_decompress +from ..utils import MODEL_HOME +from ..utils import stats_wrapper +from paddlespeech.vector.io.batch import feature_normalize +from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer +from paddlespeech.s2t.transform.transformation import Transformation +from paddlespeech.s2t.utils.dynamic_import import dynamic_import +from paddlespeech.s2t.utils.utility import UpdateConfig +from paddlespeech.vector.modules.sid_model import SpeakerIdetification + +pretrained_models = { + # The tags for pretrained_models should be "{model_name}[_{dataset}][-{lang}][-...]". + # e.g. "conformer_wenetspeech-zh-16k" and "panns_cnn6-32k". + # Command line and python api use "{model_name}[_{dataset}]" as --model, usage: + # "paddlespeech asr --model conformer_wenetspeech --lang zh --sr 16000 --input ./input.wav" + "ecapa_tdnn-16k": { + 'url': + 'https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1_conformer_wenetspeech_ckpt_0.1.1.model.tar.gz', + 'md5': + '76cb19ed857e6623856b7cd7ebbfeda4', + 'cfg_path': + 'model.yaml', + 'ckpt_path': + 'exp/conformer/checkpoints/wenetspeech', + }, +} + +model_alias = { + "ecapa_tdnn": "paddlespeech.vector.models.ecapa_tdnn:EcapaTdnn", +} + + +@cli_register( + name="paddlespeech.vector", + description="Speech to vector embedding infer command.") +class VectorExecutor(BaseExecutor): + def __init__(self): + super(VectorExecutor, self).__init__() + + self.parser = argparse.ArgumentParser( + prog="paddlespeech.vector", add_help=True) + self.parser.add_argument( + "--model", + type=str, + default="ecapa_tdnn-voxceleb12", + choices=["ecapa_tdnn"], + help="Choose model type of asr task.") + self.parser.add_argument( + "--task", + type=str, + default="spk", + choices=["spk"], + help="task type in vector domain") + self.parser.add_argument( + "--input", type=str, default=None, help="Audio file to recognize.") + self.parser.add_argument( + "--sample_rate", + type=int, + default=16000, + choices=[16000, 8000], + help="Choose the audio sample rate of the model. 8000 or 16000") + self.parser.add_argument( + "--ckpt_path", + type=str, + default=None, + help="Checkpoint file of model.") + self.parser.add_argument( + '--config', + type=str, + default=None, + help='Config of asr task. 
Use default config when it is None.')
+        self.parser.add_argument(
+            "--device",
+            type=str,
+            default=paddle.get_device(),
+            help="Choose device to execute model inference.")
+        self.parser.add_argument(
+            '-d',
+            '--job_dump_result',
+            action='store_true',
+            help='Save job result into file.')
+
+        self.parser.add_argument(
+            '-v',
+            '--verbose',
+            action='store_true',
+            help='Increase logger verbosity of current task.')
+
+    def execute(self, argv: List[str]) -> bool:
+        """Command line entry for vector model
+
+        Args:
+            argv (List[str]): command line args list
+
+        Returns:
+            bool:
+                False: some audio file failed to process
+                True: all audio files processed successfully
+        """
+        # stage 0: parse the args and get the required args
+        parser_args = self.parser.parse_args(argv)
+        model = parser_args.model
+        sample_rate = parser_args.sample_rate
+        config = parser_args.config
+        ckpt_path = parser_args.ckpt_path
+        device = parser_args.device
+
+        # stage 1: configure the verbose flag
+        if not parser_args.verbose:
+            self.disable_task_loggers()
+
+        # stage 2: read the input data and store them as a list
+        task_source = self.get_task_source(parser_args.input)
+        logger.info(f"task source: {task_source}")
+
+        # stage 3: process the audio one by one
+        task_result = OrderedDict()
+        has_exceptions = False
+        for id_, input_ in task_source.items():
+            try:
+                res = self(input_, model, sample_rate, config, ckpt_path,
+                           device)
+                task_result[id_] = res
+            except Exception as e:
+                has_exceptions = True
+                task_result[id_] = f'{e.__class__.__name__}: {e}'
+
+        logger.info("task result as follows: ")
+        logger.info(f"{task_result}")
+
+        # stage 4: process all the task results
+        self.process_task_results(parser_args.input, task_result,
+                                  parser_args.job_dump_result)
+
+        # stage 5: return the exception flag
+        # if it returns False, some audio file failed to process
+        if has_exceptions:
+            return False
+        else:
+            return True
+
+    @stats_wrapper
+    def __call__(self,
+                 audio_file: os.PathLike,
+                 model: str='ecapa_tdnn-voxceleb12',
+                 sample_rate: int=16000,
+                 config: os.PathLike=None,
+                 ckpt_path: os.PathLike=None,
+                 force_yes: bool=False,
+                 device=paddle.get_device()):
+        audio_file = os.path.abspath(audio_file)
+        if not self._check(audio_file, sample_rate):
+            sys.exit(-1)
+
+        logger.info(f"device type: {device}")
+        paddle.device.set_device(device)
+        self._init_from_path(model, sample_rate, config, ckpt_path)
+        self.preprocess(model, audio_file)
+        self.infer(model)
+        res = self.postprocess()
+
+        return res
+
+    def _get_pretrained_path(self, tag: str) -> os.PathLike:
+        support_models = list(pretrained_models.keys())
+        assert tag in pretrained_models, \
+            'The model "{}" you want to use has not been supported, \
+                please choose other models.\n \
+                The supported models include\n\t\t{}'.format(tag, "\n\t\t".join(support_models))
+
+        res_path = os.path.join(MODEL_HOME, tag)
+
+    def _init_from_path(self,
+                        model_type: str='ecapa_tdnn-voxceleb12',
+                        sample_rate: int=16000,
+                        cfg_path: Optional[os.PathLike]=None,
+                        ckpt_path: Optional[os.PathLike]=None):
+        if hasattr(self, "model"):
+            logger.info("Model has been initialized")
+            return
+
+        # stage 1: get the model and config path
+        if cfg_path is None or ckpt_path is None:
+            sample_rate_str = "16k" if sample_rate == 16000 else "8k"
+            tag = model_type + "-" + sample_rate_str
+            res_path = self._get_pretrained_path(tag)
+        else:
+            self.cfg_path = os.path.abspath(cfg_path)
+            self.ckpt_path = os.path.abspath(ckpt_path + ".pdparams")
+            self.res_path = os.path.dirname(
+                os.path.dirname(os.path.abspath(self.cfg_path)))
+
+        logger.info(f"start to read the ckpt from {self.ckpt_path}")
+        logger.info(f"read the config from {self.cfg_path}")
+        logger.info(f"get the res path {self.res_path}")
+
+        # stage 2: read the config and init the model body
+        self.config = CfgNode(new_allowed=True)
+        self.config.merge_from_file(self.cfg_path)
+
+        # stage 3: get the model name to instance the model network with dynamic_import
+        # Note: we use the '-' to get the model name instead of '_'
+        logger.info("start to dynamic import the model class")
+        model_name = model_type[:model_type.rindex('-')]
+        logger.info(f"model name {model_name}")
+        model_class = dynamic_import(model_name, model_alias)
+        model_conf = self.config.model
+        backbone = model_class(**model_conf)
+        model = SpeakerIdetification(
+            backbone=backbone, num_class=self.config.num_speakers)
+        self.model = model
+        self.model.eval()
+
+        # stage 4: load the model parameters
+        logger.info("start to set the model parameters to model")
+        model_dict = paddle.load(self.ckpt_path)
+        self.model.set_state_dict(model_dict)
+
+        logger.info("create the model instance successfully")
+
+    @paddle.no_grad()
+    def infer(self, model_type: str):
+
+        feats = self._inputs["feats"]
+        lengths = self._inputs["lengths"]
+        logger.info("start to do backbone network model forward")
+        logger.info(
+            f"feats shape: {feats.shape}, lengths shape: {lengths.shape}")
+        # embedding from (1, emb_size, 1) -> (emb_size)
+        embedding = self.model.backbone(feats, lengths).squeeze().numpy()
+        logger.info(f"embedding size: {embedding.shape}")
+
+        self._outputs["embedding"] = embedding
+
+    def postprocess(self) -> Union[str, os.PathLike]:
+        return self._outputs["embedding"]
+
+    def preprocess(self, model_type: str, input_file: Union[str, os.PathLike]):
+        audio_file = input_file
+        if isinstance(audio_file, (str, os.PathLike)):
+            logger.info(f"Preprocess audio file: {audio_file}")
+
+        # stage 1: load the audio
+        waveform, sr = load_audio(audio_file)
+        logger.info(f"load the audio sample points, shape is: {waveform.shape}")
+
+        # stage 2: get the audio feat
+        try:
+            feat = melspectrogram(
+                x=waveform,
+                sr=self.config.sr,
+                n_mels=self.config.n_mels,
+                window_size=self.config.window_size,
+                hop_length=self.config.hop_size)
+            logger.info(f"extract the audio feat, shape is: {feat.shape}")
+        except Exception as e:
+            logger.info(f"feat extraction raises an exception: {e}")
+            sys.exit(-1)
+
+        feat = paddle.to_tensor(feat).unsqueeze(0)
+        # in inference period, the lengths is all one without padding
+        lengths = paddle.ones([1])
+        feat = feature_normalize(feat, mean_norm=True, std_norm=False)
+
+        logger.info(f"feats shape: {feat.shape}")
+        self._inputs["feats"] = feat
+        self._inputs["lengths"] = lengths
+
+        logger.info("extract the audio feat successfully")
+
+    def _check(self, audio_file: str, sample_rate: int):
+        self.sample_rate = sample_rate
+        if self.sample_rate != 16000 and self.sample_rate != 8000:
+            logger.error(
+                "invalid sample rate, please input --sr 8000 or --sr 16000")
+            return False
+
+        if isinstance(audio_file, (str, os.PathLike)):
+            if not os.path.isfile(audio_file):
+                logger.error("Please input the right audio file path")
+                return False
+
+        logger.info("checking the audio file format......")
+        try:
+            audio, audio_sample_rate = soundfile.read(
+                audio_file, dtype="float32", always_2d=True)
+        except Exception as e:
+            logger.exception(e)
+            logger.error(
+                "can not open the audio file, please check the audio file format is 'wav'.
\n \ + you can try to use sox to change the file format.\n \ + For example: \n \ + sample rate: 16k \n \ + sox input_audio.xx --rate 16k --bits 16 --channels 1 output_audio.wav \n \ + sample rate: 8k \n \ + sox input_audio.xx --rate 8k --bits 16 --channels 1 output_audio.wav \n \ + ") + return False + + logger.info(f"The sample rate is {audio_sample_rate}") + + if audio_sample_rate != self.sample_rate: + logger.error("The sample rate of the input file is not {}.\n \ + The program will resample the wav file to {}.\n \ + If the result does not meet your expectations,\n \ + Please input the 16k 16 bit 1 channel wav file. \ + ".format(self.sample_rate, self.sample_rate)) + sys.exit(-1) + else: + logger.info("The audio file format is right") + + return True diff --git a/paddlespeech/vector/exps/ecapa_tdnn/extract_emb.py b/paddlespeech/vector/exps/ecapa_tdnn/extract_emb.py index 0d09d211..6dfcf06d 100644 --- a/paddlespeech/vector/exps/ecapa_tdnn/extract_emb.py +++ b/paddlespeech/vector/exps/ecapa_tdnn/extract_emb.py @@ -63,16 +63,16 @@ def extract_audio_embedding(args, config): # so the final shape is [1, dim, time] start_time = time.time() feat = melspectrogram(x=waveform, - sr=config.sample_rate, + sr=config.sr, n_mels=config.n_mels, window_size=config.window_size, - hop_length=config.hop_length) + hop_length=config.hop_size) feat = paddle.to_tensor(feat).unsqueeze(0) # in inference period, the lengths is all one without padding lengths = paddle.ones([1]) feat = feature_normalize( - feat, mean_norm=True, std_norm=False, convert_to_numpy=True) + feat, mean_norm=True, std_norm=False) # model backbone network forward the feats and get the embedding embedding = model.backbone( diff --git a/paddlespeech/vector/exps/ecapa_tdnn/test.py b/paddlespeech/vector/exps/ecapa_tdnn/test.py index 03757033..76832fd8 100644 --- a/paddlespeech/vector/exps/ecapa_tdnn/test.py +++ b/paddlespeech/vector/exps/ecapa_tdnn/test.py @@ -49,8 +49,6 @@ def main(args, config): # stage3: load the pre-trained model # we get the last model from the epoch and save_interval - last_save_epoch = (config.epochs // config.save_interval) * config.save_interval - args.load_checkpoint = os.path.join(args.load_checkpoint, "epoch_" + str(last_save_epoch)) args.load_checkpoint = os.path.abspath( os.path.expanduser(args.load_checkpoint)) @@ -61,6 +59,7 @@ def main(args, config): logger.info(f'Checkpoint loaded from {args.load_checkpoint}') # stage4: construct the enroll and test dataloader + enroll_dataset = VoxCeleb( subset='enroll', target_dir=args.data_dir, @@ -68,7 +67,7 @@ def main(args, config): random_chunk=False, n_mels=config.n_mels, window_size=config.window_size, - hop_length=config.hop_length) + hop_length=config.hop_size) enroll_sampler = BatchSampler( enroll_dataset, batch_size=config.batch_size, shuffle=True) # Shuffle to make embedding normalization more robust. 
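A note on how the enroll and test embeddings built by these dataloaders are consumed: trial pairs are scored with cosine similarity before the EER is computed. A minimal sketch of that scoring step -- illustrative only, not part of this patch; the function name is ours, and the inputs are assumed to be the 1-D numpy embeddings that model.backbone(...).squeeze().numpy() returns:

import numpy as np

def cosine_score(enroll_emb: np.ndarray, test_emb: np.ndarray) -> float:
    # cosine similarity in [-1, 1]; a decision threshold (and the EER)
    # is swept over these scores across all trial pairs
    denom = np.linalg.norm(enroll_emb) * np.linalg.norm(test_emb) + 1e-12
    return float(np.dot(enroll_emb, test_emb) / denom)
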
@@ -85,7 +84,7 @@ def main(args, config): random_chunk=False, n_mels=config.n_mels, window_size=config.window_size, - hop_length=config.hop_length) + hop_length=config.hop_size) test_sampler = BatchSampler( test_dataset, batch_size=config.batch_size, shuffle=True) diff --git a/paddlespeech/vector/exps/ecapa_tdnn/train.py b/paddlespeech/vector/exps/ecapa_tdnn/train.py index 0d62c69d..fb02d486 100644 --- a/paddlespeech/vector/exps/ecapa_tdnn/train.py +++ b/paddlespeech/vector/exps/ecapa_tdnn/train.py @@ -15,6 +15,7 @@ import argparse import os import numpy as np +import time import paddle from paddle.io import BatchSampler from paddle.io import DataLoader @@ -35,6 +36,7 @@ from paddlespeech.vector.modules.sid_model import SpeakerIdetification from paddlespeech.vector.training.scheduler import CyclicLRScheduler from paddlespeech.vector.training.seeding import seed_everything from paddlespeech.vector.utils.time import Timer +from paddlespeech.vector.io.batch import batch_pad_right logger = Log(__name__).getlog() @@ -55,7 +57,7 @@ def main(args, config): train_dataset = VoxCeleb('train', target_dir=args.data_dir) dev_dataset = VoxCeleb('dev', target_dir=args.data_dir) - if args.augment: + if config.augment: augment_pipeline = build_augment_pipeline(target_dir=args.data_dir) else: augment_pipeline = [] @@ -126,6 +128,7 @@ def main(args, config): # we will comment the training process steps_per_epoch = len(train_sampler) timer = Timer(steps_per_epoch * config.epochs) + last_saved_epoch = "" timer.start() for epoch in range(start_epoch + 1, config.epochs + 1): @@ -135,9 +138,19 @@ def main(args, config): avg_loss = 0 num_corrects = 0 num_samples = 0 + train_reader_cost = 0.0 + train_feat_cost = 0.0 + train_run_cost = 0.0 + + reader_start = time.time() for batch_idx, batch in enumerate(train_loader): + train_reader_cost += time.time() - reader_start + # stage 9-1: batch data is audio sample points and speaker id label + feat_start = time.time() waveforms, labels = batch['waveforms'], batch['labels'] + waveforms, lengths = batch_pad_right(waveforms.numpy()) + waveforms = paddle.to_tensor(waveforms) # stage 9-2: audio sample augment method, which is done on the audio sample point # the original wavefrom and the augmented waveform is concatented in a batch @@ -153,18 +166,20 @@ def main(args, config): feats = [] for waveform in waveforms.numpy(): feat = melspectrogram(x=waveform, - sr=config.sample_rate, + sr=config.sr, n_mels=config.n_mels, window_size=config.window_size, - hop_length=config.hop_length) + hop_length=config.hop_size) feats.append(feat) feats = paddle.to_tensor(np.asarray(feats)) # stage 9-4: feature normalize, which help converge and imporve the performance feats = feature_normalize( feats, mean_norm=True, std_norm=False) # Features normalization + train_feat_cost += time.time() - feat_start # stage 9-5: model forward, such ecapa-tdnn, x-vector + train_start = time.time() logits = model(feats) # stage 9-6: loss function criterion, such AngularMargin, AdditiveAngularMargin @@ -177,6 +192,7 @@ def main(args, config): paddle.optimizer.lr.LRScheduler): optimizer._learning_rate.step() optimizer.clear_grad() + train_run_cost += time.time() - train_start # stage 9-8: Calculate average loss per batch avg_loss += loss.numpy()[0] @@ -186,7 +202,7 @@ def main(args, config): num_corrects += (preds == labels).numpy().sum() num_samples += feats.shape[0] timer.count() # step plus one in timer - + # stage 9-10: print the log information only on 0-rank per log-freq batchs if (batch_idx + 1) % 
config.log_interval == 0 and local_rank == 0:
                 lr = optimizer.get_lr()
@@ -197,6 +213,9 @@ def main(args, config):
                     epoch, config.epochs, batch_idx + 1, steps_per_epoch)
                 print_msg += ' loss={:.4f}'.format(avg_loss)
                 print_msg += ' acc={:.4f}'.format(avg_acc)
+                print_msg += ' avg_reader_cost: {:.5f} sec,'.format(train_reader_cost / config.log_interval)
+                print_msg += ' avg_feat_cost: {:.5f} sec,'.format(train_feat_cost / config.log_interval)
+                print_msg += ' avg_train_cost: {:.5f} sec,'.format(train_run_cost / config.log_interval)
                 print_msg += ' lr={:.4E} step/sec={:.2f} | ETA {}'.format(
                     lr, timer.timing, timer.eta)
                 logger.info(print_msg)
@@ -204,6 +223,11 @@ def main(args, config):
                 avg_loss = 0
                 num_corrects = 0
                 num_samples = 0
+                train_reader_cost = 0.0
+                train_feat_cost = 0.0
+                train_run_cost = 0.0
+
+            reader_start = time.time()
 
             # stage 9-11: save the model parameters only on 0-rank per save-freq batchs
             if epoch % config.save_interval == 0 and batch_idx + 1 == steps_per_epoch:
@@ -239,10 +263,10 @@ def main(args, config):
                     feats = []
                     for waveform in waveforms.numpy():
                         feat = melspectrogram(x=waveform,
-                                              sr=config.sample_rate,
+                                              sr=config.sr,
                                               n_mels=config.n_mels,
                                               window_size=config.window_size,
-                                              hop_length=config.hop_length)
+                                              hop_length=config.hop_size)
                         feats.append(feat)
 
                     feats = paddle.to_tensor(np.asarray(feats))
@@ -261,6 +285,7 @@ def main(args, config):
 
             # stage 9-14: Save model parameters
             save_dir = os.path.join(args.checkpoint_dir,
                                     'epoch_{}'.format(epoch))
+            last_saved_epoch = os.path.join('epoch_{}'.format(epoch), "model.pdparams")
             logger.info('Saving model checkpoint to {}'.format(save_dir))
             paddle.save(model.state_dict(),
                         os.path.join(save_dir, 'model.pdparams'))
@@ -270,6 +295,14 @@ def main(args, config):
     if nranks > 1:
         paddle.distributed.barrier()  # Main process
 
+    # stage 10: create the final trained model.pdparams with a soft link
+    if local_rank == 0:
+        final_model = os.path.join(args.checkpoint_dir, "model.pdparams")
+        logger.info(f"we will create the final model: {final_model}")
+        if os.path.islink(final_model):
+            logger.info(f"An {final_model} already exists, we will remove it and create it again")
+            os.unlink(final_model)
+        os.symlink(last_saved_epoch, final_model)
 
 if __name__ == "__main__":
     # yapf: disable
@@ -294,10 +327,6 @@ if __name__ == "__main__":
                         type=str,
                         default='./checkpoint',
                         help="Directory to save model checkpoints.")
-    parser.add_argument("--augment",
-                        action="store_true",
-                        default=False,
-                        help="Apply audio augments.")
 
     args = parser.parse_args()
     # yapf: enable
diff --git a/paddlespeech/vector/io/batch.py b/paddlespeech/vector/io/batch.py
index 85f2ab8b..25522ebb 100644
--- a/paddlespeech/vector/io/batch.py
+++ b/paddlespeech/vector/io/batch.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 import numpy as np
 import paddle
-
+import numpy
 
 def waveform_collate_fn(batch):
     waveforms = np.stack([item['feat'] for item in batch])
@@ -80,4 +80,92 @@ def batch_feature_normalize(batch, mean_norm: bool=True, std_norm: bool=True):
     # we convert the original length of each utterance to the ratio of the max length
     lengths = (lengths / lengths.max()).astype(np.float32)
 
-    return {'ids': ids, 'feats': feats, 'lengths': lengths}
\ No newline at end of file
+    return {'ids': ids, 'feats': feats, 'lengths': lengths}
+
+
+def pad_right_to(array, target_shape, mode="constant", value=0):
+    """
+    This function takes a numpy array of arbitrary shape and pads it to target
+    shape by appending values on the right.
+
+    Args:
+        array: input numpy array.
Input array whose dimensions we need to pad.
+        target_shape : (list, tuple). Target shape we want for the target array; its length must be equal to array.ndim.
+        mode : str. Pad mode, please refer to numpy.pad documentation.
+        value : float. Pad value, please refer to numpy.pad documentation.
+
+    Returns:
+        array: numpy.array. Padded array.
+        valid_vals : list. List containing proportion for each dimension of original, non-padded values.
+    """
+    assert len(target_shape) == array.ndim
+    pads = [] # this contains the abs length of the padding for each dimension.
+    valid_vals = [] # thic contains the relative lengths for each dimension.
+    i = 0 # iterating over target_shape ndims
+    while i < len(target_shape):
+        assert (
+            target_shape[i] >= array.shape[i]
+        ), "Target shape must be >= original shape for every dim"
+        pads.append([0, target_shape[i] - array.shape[i]])
+        valid_vals.append(array.shape[i] / target_shape[i])
+        i += 1
+
+    array = numpy.pad(array, pads, mode=mode, constant_values=value)
+
+    return array, valid_vals
+
+
+def batch_pad_right(arrays, mode="constant", value=0):
+    """Given a list of numpy arrays it batches them together by padding to the right
+    on each dimension in order to get the same length for all.
+
+    Args:
+        arrays : list. List of arrays we wish to pad together.
+        mode : str. Padding mode; see numpy.pad documentation.
+        value : float. Padding value; see numpy.pad documentation.
+
+    Returns:
+        array : numpy.array. Padded array.
+        valid_vals : list. List containing proportion for each dimension of original, non-padded values.
+    """
+
+    if not len(arrays):
+        raise IndexError("arrays list must not be empty")
+
+    if len(arrays) == 1:
+        # if there is only one array in the batch we simply unsqueeze it.
+        return numpy.expand_dims(arrays[0], axis=0), numpy.array([1.0])
+
+    if not (
+        any(
+            [arrays[i].ndim == arrays[0].ndim for i in range(1, len(arrays))]
+        )
+    ):
+        raise IndexError("All arrays must have same number of dimensions")
+
+    # FIXME we limit the support here: we allow padding of only the last dimension
+    # need to remove this when feat extraction is updated to handle multichannel.
+    max_shape = []
+    for dim in range(arrays[0].ndim):
+        if dim != (arrays[0].ndim - 1):
+            if not all(
+                [x.shape[dim] == arrays[0].shape[dim] for x in arrays[1:]]
+            ):
+                raise EnvironmentError(
+                    "arrays should have same dimensions except for last one"
+                )
+        max_shape.append(max([x.shape[dim] for x in arrays]))
+
+    batched = []
+    valid = []
+    for t in arrays:
+        # for each array we apply pad_right_to
+        padded, valid_percent = pad_right_to(
+            t, max_shape, mode=mode, value=value
+        )
+        batched.append(padded)
+        valid.append(valid_percent[-1])
+
+    batched = numpy.stack(batched)
+
+    return batched, numpy.array(valid)
diff --git a/paddlespeech/vector/modules/loss.py b/paddlespeech/vector/modules/loss.py
index 1aa0599a..1c80dda4 100644
--- a/paddlespeech/vector/modules/loss.py
+++ b/paddlespeech/vector/modules/loss.py
@@ -11,6 +11,8 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+# This is modified from SpeechBrain
+# https://github.com/speechbrain/speechbrain/blob/085be635c07f16d42cd1295045bc46c407f1e15b/speechbrain/nnet/losses.py
 import math
 
 import paddle
@@ -20,6 +22,14 @@ import paddle.nn.functional as F
 
 class AngularMargin(nn.Layer):
     def __init__(self, margin=0.0, scale=1.0):
+        """An implementation of Angular Margin (AM) proposed in the following
+        paper: '''Margin Matters: Towards More Discriminative Deep Neural Network
+        Embeddings for Speaker Recognition''' (https://arxiv.org/abs/1906.07317)
+
+        Args:
+            margin (float, optional): The margin for cosine similarity. Defaults to 0.0.
+            scale (float, optional): The scale for cosine similarity. Defaults to 1.0.
+        """
         super(AngularMargin, self).__init__()
         self.margin = margin
         self.scale = scale
@@ -31,6 +41,15 @@ class AngularMargin(nn.Layer):
 
 class AdditiveAngularMargin(AngularMargin):
     def __init__(self, margin=0.0, scale=1.0, easy_margin=False):
+        """An implementation of Additive Angular Margin (AAM) proposed
+        in the following paper: '''Margin Matters: Towards More Discriminative Deep Neural Network Embeddings for Speaker Recognition'''
+        (https://arxiv.org/abs/1906.07317)
+
+        Args:
+            margin (float, optional): margin factor. Defaults to 0.0.
+            scale (float, optional): scale factor. Defaults to 1.0.
+            easy_margin (bool, optional): easy_margin flag. Defaults to False.
+        """
         super(AdditiveAngularMargin, self).__init__(margin, scale)
         self.easy_margin = easy_margin
 
@@ -53,6 +72,11 @@ class AdditiveAngularMargin(AngularMargin):
 
 class LogSoftmaxWrapper(nn.Layer):
     def __init__(self, loss_fn):
+        """Speaker identification loss function wrapper,
+        including all compositions of the loss transformation
+        Args:
+            loss_fn (nn.Layer): the loss criterion to be wrapped
+        """
         super(LogSoftmaxWrapper, self).__init__()
         self.loss_fn = loss_fn
         self.criterion = paddle.nn.KLDivLoss(reduction="sum")
diff --git a/paddlespeech/vector/modules/sid_model.py b/paddlespeech/vector/modules/sid_model.py
index 8a46c3cd..dc13b2e0 100644
--- a/paddlespeech/vector/modules/sid_model.py
+++ b/paddlespeech/vector/modules/sid_model.py
@@ -24,13 +24,25 @@ class SpeakerIdetification(nn.Layer):
             lin_blocks=0,
             lin_neurons=192,
             dropout=0.1, ):
+        """_summary_
+        Args:
+            backbone (Paddle.nn.Layer class): the speaker identification backbone network model
+            num_class (_type_): the speaker class num in the training dataset
+            lin_blocks (int, optional): the linear layer transform between the embedding and the final linear layer. Defaults to 0.
+            lin_neurons (int, optional): the output dimension of final linear layer. Defaults to 192.
+            dropout (float, optional): the dropout factor on the embedding. Defaults to 0.1.
+        """
         super(SpeakerIdetification, self).__init__()
+        # speaker identification backbone network model
+        # the output of the backbone network is the target embedding
         self.backbone = backbone
         if dropout > 0:
             self.dropout = nn.Dropout(dropout)
         else:
             self.dropout = None
+
+        # construct the speaker classifier
         input_size = self.backbone.emb_size
         self.blocks = nn.LayerList()
         for i in range(lin_blocks):
@@ -40,12 +52,26 @@ class SpeakerIdetification(nn.Layer):
             ])
             input_size = lin_neurons
 
+        # the final layer
         self.weight = paddle.create_parameter(
             shape=(input_size, num_class),
             dtype='float32',
             attr=paddle.ParamAttr(initializer=nn.initializer.XavierUniform()), )
 
     def forward(self, x, lengths=None):
+        """Do the speaker identification model forward,
+        including the speaker embedding model and the classifier model network
+
+        Args:
+            x (Paddle.Tensor): input audio feats,
+                shape=[batch, dimension, times]
+            lengths (_type_, optional): input audio length.
+                shape=[batch, times]
+                Defaults to None.
+
+        Returns:
+            _type_: _description_
+        """
         # x.shape: (N, C, L)
         x = self.backbone(x, lengths).squeeze(
             -1)  # (N, emb_size, 1) -> (N, emb_size)

From 9c6735f921d767f116e47f1671ec6290d72315dd Mon Sep 17 00:00:00 2001
From: xiongxinlei
Date: Mon, 21 Mar 2022 17:40:40 +0800
Subject: [PATCH 31/41] add vector voxceleb12 base mode url, test=doc

---
 paddlespeech/cli/vector/infer.py | 33 ++++++++++++++++++++++----------
 1 file changed, 23 insertions(+), 10 deletions(-)

diff --git a/paddlespeech/cli/vector/infer.py b/paddlespeech/cli/vector/infer.py
index de4d6621..205d61f9 100644
--- a/paddlespeech/cli/vector/infer.py
+++ b/paddlespeech/cli/vector/infer.py
@@ -42,19 +42,19 @@ from paddlespeech.s2t.utils.utility import UpdateConfig
 from paddlespeech.vector.modules.sid_model import SpeakerIdetification
 
 pretrained_models = {
-    # The tags for pretrained_models should be "{model_name}[_{dataset}][-{lang}][-...]".
-    # e.g. "conformer_wenetspeech-zh-16k" and "panns_cnn6-32k".
-    # Command line and python api use "{model_name}[_{dataset}]" as --model, usage:
-    # "paddlespeech asr --model conformer_wenetspeech --lang zh --sr 16000 --input ./input.wav"
-    "ecapa_tdnn-16k": {
+    # The tags for pretrained_models should be "{model_name}[-{dataset}][-{sr}][-...]".
+    # e.g. "ecapa_tdnn-voxceleb12-16k".
+    # Command line and python api use "{model_name}[-{dataset}]" as --model, usage:
+    # "paddlespeech vector --task spk --model ecapa_tdnn-voxceleb12-16k --sr 16000 --input ./input.wav"
+    "ecapa_tdnn-voxceleb12-16k": {
         'url':
-        'https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1_conformer_wenetspeech_ckpt_0.1.1.model.tar.gz',
+        'https://paddlespeech.bj.bcebos.com/vector/voxceleb/sv0_ecapa_tdnn_voxceleb12_ckpt_0_1_0.tar.gz',
         'md5':
-        '76cb19ed857e6623856b7cd7ebbfeda4',
+        '85ff08ce0ef406b8c6d7b5ffc5b2b48f',
         'cfg_path':
-        'model.yaml',
+        'conf/model.yaml',
         'ckpt_path':
-        'exp/conformer/checkpoints/wenetspeech',
+        'model/model',
     },
 }
 
@@ -202,6 +202,14 @@ class VectorExecutor(BaseExecutor):
             The supported models include \n\t\t{}'.format(tag, "\n\t\t".join(support_models))
 
         res_path = os.path.join(MODEL_HOME, tag)
+        decompressed_path = download_and_decompress(pretrained_models[tag],
+                                                    res_path)
+
+        decompressed_path = os.path.abspath(decompressed_path)
+        logger.info(
+            'Use pretrained model stored in: {}'.format(decompressed_path))
+
+        return decompressed_path
 
     def _init_from_path(self,
                         model_type: str='ecapa_tdnn-voxceleb12',
@@ -216,7 +224,12 @@ class VectorExecutor(BaseExecutor):
         if cfg_path is None or ckpt_path is None:
             sample_rate_str = "16k" if sample_rate == 16000 else "8k"
             tag = model_type + "-" + sample_rate_str
+            logger.info(f"load the pretrained model: {tag}")
             res_path = self._get_pretrained_path(tag)
+            self.res_path = res_path
+
+            self.cfg_path = os.path.join(res_path, pretrained_models[tag]['cfg_path'])
+            self.ckpt_path = os.path.join(res_path, pretrained_models[tag]['ckpt_path'] + '.pdparams')
         else:
             self.cfg_path = os.path.abspath(cfg_path)
             self.ckpt_path = os.path.abspath(ckpt_path + ".pdparams")
@@ -226,7 +239,7 @@ class VectorExecutor(BaseExecutor):
         logger.info(f"start to read the ckpt from {self.ckpt_path}")
         logger.info(f"read the config from {self.cfg_path}")
         logger.info(f"get the res path {self.res_path}")
-        
+
         # stage 2: read the config and init the model body
         self.config = CfgNode(new_allowed=True)
         self.config.merge_from_file(self.cfg_path)

From b9eafddd9494f6f62fbbffbd149b08e4cc36dccf Mon Sep 17 00:00:00 2001
From: xiongxinlei
Date: Mon, 21 Mar 2022 17:49:39 +0800
Subject: [PATCH 32/41] change - to _ to distinguish field

---
 paddlespeech/cli/vector/infer.py | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/paddlespeech/cli/vector/infer.py b/paddlespeech/cli/vector/infer.py
index 205d61f9..c942c850 100644
--- a/paddlespeech/cli/vector/infer.py
+++ b/paddlespeech/cli/vector/infer.py
@@ -43,10 +43,10 @@ from paddlespeech.vector.modules.sid_model import SpeakerIdetification
 
 pretrained_models = {
     # The tags for pretrained_models should be "{model_name}[-{dataset}][-{sr}][-...]".
-    # e.g. "ecapa_tdnn-voxceleb12-16k".
+    # e.g. "EcapaTdnn_voxceleb12-16k".
     # Command line and python api use "{model_name}[-{dataset}]" as --model, usage:
-    # "paddlespeech vector --task spk --model ecapa_tdnn-voxceleb12-16k --sr 16000 --input ./input.wav"
-    "ecapa_tdnn-voxceleb12-16k": {
+    # "paddlespeech vector --task spk --model EcapaTdnn_voxceleb12-voxceleb12-16k --sr 16000 --input ./input.wav"
+    "EcapaTdnn_voxceleb12-16k": {
         'url':
         'https://paddlespeech.bj.bcebos.com/vector/voxceleb/sv0_ecapa_tdnn_voxceleb12_ckpt_0_1_0.tar.gz',
         'md5':
@@ -59,7 +59,7 @@ pretrained_models = {
 }
 
 model_alias = {
-    "ecapa_tdnn": "paddlespeech.vector.models.ecapa_tdnn:EcapaTdnn",
+    "EcapaTdnn": "paddlespeech.vector.models.ecapa_tdnn:EcapaTdnn",
 }
 
@@ -75,8 +75,8 @@ class VectorExecutor(BaseExecutor):
         self.parser.add_argument(
             "--model",
             type=str,
-            default="ecapa_tdnn-voxceleb12",
-            choices=["ecapa_tdnn"],
+            default="EcapaTdnn_voxceleb12",
+            choices=["EcapaTdnn_voxceleb12"],
             help="Choose model type of asr task.")
         self.parser.add_argument(
             "--task",
@@ -175,7 +175,7 @@ class VectorExecutor(BaseExecutor):
     @stats_wrapper
     def __call__(self,
                  audio_file: os.PathLike,
-                 model: str='ecapa_tdnn-voxceleb12',
+                 model: str='EcapaTdnn-voxceleb12',
                  sample_rate: int=16000,
                  config: os.PathLike=None,
                  ckpt_path: os.PathLike=None,
@@ -212,7 +212,7 @@ class VectorExecutor(BaseExecutor):
         return decompressed_path
 
     def _init_from_path(self,
-                        model_type: str='ecapa_tdnn-voxceleb12',
+                        model_type: str='EcapaTdnn_voxceleb12',
                         sample_rate: int=16000,
                         cfg_path: Optional[os.PathLike]=None,
                         ckpt_path: Optional[os.PathLike]=None):
@@ -247,7 +247,7 @@ class VectorExecutor(BaseExecutor):
         # stage 3: get the model name to instantiate the model network with dynamic_import
         # Note: we use the '-' to get the model name instead of '_'
         logger.info("start to dynamic import the model class")
-        model_name = model_type[:model_type.rindex('-')]
+        model_name = model_type[:model_type.rindex('_')]
         logger.info(f"model name {model_name}")
         model_class = dynamic_import(model_name, model_alias)
         model_conf = self.config.model

From 9874fb7d75029fd015d49f4d61ec55d6e625bf1a Mon Sep 17 00:00:00 2001
From: xiongxinlei
Date: Tue, 22 Mar 2022 14:25:27 +0800
Subject: [PATCH 33/41] add some comments in code

---
 paddlespeech/cli/vector/infer.py         | 46 +++++++++++-------------
 paddlespeech/vector/io/batch.py          | 29 +++++++--------
 paddlespeech/vector/modules/sid_model.py | 11 +++---
 3 files changed, 39 insertions(+), 47 deletions(-)

diff --git a/paddlespeech/cli/vector/infer.py b/paddlespeech/cli/vector/infer.py
index c942c850..f1a0e79c 100644
--- a/paddlespeech/cli/vector/infer.py
+++ b/paddlespeech/cli/vector/infer.py
@@ -19,34 +19,28 @@ from typing import List
 from typing import Optional
 from typing import Union
 
-import librosa
-import numpy as np
 import paddle
 import soundfile
 from yacs.config import CfgNode
 
-from paddleaudio.backends import load as load_audio
-from paddleaudio.compliance.librosa import melspectrogram
-from ..download import get_path_from_url
 from ..executor import BaseExecutor
 from ..log import logger
 from ..utils import cli_register
 from ..utils import download_and_decompress
 from ..utils import MODEL_HOME
 from ..utils import stats_wrapper
-from paddlespeech.vector.io.batch import feature_normalize
-from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
-from paddlespeech.s2t.transform.transformation import Transformation
+from paddleaudio.backends import load as load_audio
+from paddleaudio.compliance.librosa import melspectrogram
 from paddlespeech.s2t.utils.dynamic_import import dynamic_import
-from paddlespeech.s2t.utils.utility import UpdateConfig
+from paddlespeech.vector.io.batch import feature_normalize
 from paddlespeech.vector.modules.sid_model import SpeakerIdetification
 
 pretrained_models = {
     # The tags for pretrained_models should be "{model_name}[-{dataset}][-{sr}][-...]".
-    # e.g. "EcapaTdnn_voxceleb12-16k".
+    # e.g. "ecapatdnn_voxceleb12-16k".
     # Command line and python api use "{model_name}[-{dataset}]" as --model, usage:
-    # "paddlespeech vector --task spk --model EcapaTdnn_voxceleb12-voxceleb12-16k --sr 16000 --input ./input.wav"
-    "EcapaTdnn_voxceleb12-16k": {
+    # "paddlespeech vector --task spk --model ecapatdnn_voxceleb12-voxceleb12-16k --sr 16000 --input ./input.wav"
+    "ecapatdnn_voxceleb12-16k": {
         'url':
         'https://paddlespeech.bj.bcebos.com/vector/voxceleb/sv0_ecapa_tdnn_voxceleb12_ckpt_0_1_0.tar.gz',
         'md5':
@@ -59,7 +53,7 @@ pretrained_models = {
 }
 
 model_alias = {
-    "EcapaTdnn": "paddlespeech.vector.models.ecapa_tdnn:EcapaTdnn",
+    "ecapatdnn": "paddlespeech.vector.models.ecapa_tdnn:EcapaTdnn",
 }
 
@@ -75,8 +69,8 @@ class VectorExecutor(BaseExecutor):
         self.parser.add_argument(
             "--model",
             type=str,
-            default="EcapaTdnn_voxceleb12",
-            choices=["EcapaTdnn_voxceleb12"],
+            default="ecapatdnn_voxceleb12",
+            choices=["ecapatdnn_voxceleb12"],
             help="Choose model type of asr task.")
         self.parser.add_argument(
             "--task",
@@ -90,7 +84,7 @@ class VectorExecutor(BaseExecutor):
             "--sample_rate",
             type=int,
             default=16000,
-            choices=[16000, 8000],
+            choices=[16000],
             help="Choose the audio sample rate of the model. 8000 or 16000")
         self.parser.add_argument(
             "--ckpt_path",
@@ -175,7 +169,7 @@ class VectorExecutor(BaseExecutor):
     @stats_wrapper
     def __call__(self,
                  audio_file: os.PathLike,
-                 model: str='EcapaTdnn-voxceleb12',
+                 model: str='ecapatdnn-voxceleb12',
                  sample_rate: int=16000,
                  config: os.PathLike=None,
                  ckpt_path: os.PathLike=None,
@@ -197,9 +191,9 @@ class VectorExecutor(BaseExecutor):
     def _get_pretrained_path(self, tag: str) -> os.PathLike:
         support_models = list(pretrained_models.keys())
         assert tag in pretrained_models, \
-            'The model "{}" you want to use has not been supported, \
-            please choose other models.\n \
-            The supported models include \n\t\t{}'.format(tag, "\n\t\t".join(support_models))
+            'The model "{}" you want to use has not been supported,'\
+            'please choose other models.\n' \
+            'The supported models include\n\t\t{}'.format(tag, "\n\t\t".join(support_models))
 
         res_path = os.path.join(MODEL_HOME, tag)
         decompressed_path = download_and_decompress(pretrained_models[tag],
@@ -212,7 +206,7 @@ class VectorExecutor(BaseExecutor):
         return decompressed_path
 
     def _init_from_path(self,
-                        model_type: str='EcapaTdnn_voxceleb12',
+                        model_type: str='ecapatdnn_voxceleb12',
                         sample_rate: int=16000,
                         cfg_path: Optional[os.PathLike]=None,
                         ckpt_path: Optional[os.PathLike]=None):
@@ -228,8 +222,10 @@ class VectorExecutor(BaseExecutor):
             res_path = self._get_pretrained_path(tag)
             self.res_path = res_path
 
-            self.cfg_path = os.path.join(res_path, pretrained_models[tag]['cfg_path'])
-            self.ckpt_path = os.path.join(res_path, pretrained_models[tag]['ckpt_path'] + '.pdparams')
+            self.cfg_path = os.path.join(res_path,
+                                         pretrained_models[tag]['cfg_path'])
+            self.ckpt_path = os.path.join(
+                res_path, pretrained_models[tag]['ckpt_path'] + '.pdparams')
         else:
             self.cfg_path = os.path.abspath(cfg_path)
             self.ckpt_path = os.path.abspath(ckpt_path + ".pdparams")
@@ -239,7 +235,7 @@ class VectorExecutor(BaseExecutor):
         logger.info(f"start to read the ckpt from {self.ckpt_path}")
         logger.info(f"read the config from {self.cfg_path}")
         logger.info(f"get the res path {self.res_path}")
-        
+
         # stage 2: read the config and init the model body
         self.config = CfgNode(new_allowed=True)
         self.config.merge_from_file(self.cfg_path)
@@ -269,7 +265,7 @@ class VectorExecutor(BaseExecutor):
 
         feats = self._inputs["feats"]
         lengths = self._inputs["lengths"]
-        logger.info(f"start to do backbone network model forward")
+        logger.info("start to do backbone network model forward")
         logger.info(
             f"feats shape:{feats.shape}, lengths shape: {lengths.shape}")
         # embedding from (1, emb_size, 1) -> (emb_size)
diff --git a/paddlespeech/vector/io/batch.py b/paddlespeech/vector/io/batch.py
index 25522ebb..92ca990c 100644
--- a/paddlespeech/vector/io/batch.py
+++ b/paddlespeech/vector/io/batch.py
@@ -11,9 +11,10 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import numpy
 import numpy as np
 import paddle
-import numpy
+
 
 def waveform_collate_fn(batch):
     waveforms = np.stack([item['feat'] for item in batch])
@@ -57,6 +58,7 @@ def pad_right_2d(x, target_length, axis=-1, mode='constant', **kwargs):
 
     return np.pad(x, pad_width, mode=mode, **kwargs)
 
+
 def batch_feature_normalize(batch, mean_norm: bool=True, std_norm: bool=True):
     ids = [item['id'] for item in batch]
     lengths = np.asarray([item['feat'].shape[1] for item in batch])
@@ -100,12 +102,11 @@ def pad_right_to(array, target_shape, mode="constant", value=0):
     """
     assert len(target_shape) == array.ndim
     pads = [] # this contains the abs length of the padding for each dimension.
-    valid_vals = [] # thic contains the relative lengths for each dimension.
-    i = 0 # iterating over target_shape ndims
+    valid_vals = [] # this contains the relative lengths for each dimension.
+    i = 0  # iterating over target_shape ndims
     while i < len(target_shape):
-        assert (
-            target_shape[i] >= array.shape[i]
-        ), "Target shape must be >= original shape for every dim"
+        assert (target_shape[i] >= array.shape[i]
+                ), "Target shape must be >= original shape for every dim"
         pads.append([0, target_shape[i] - array.shape[i]])
         valid_vals.append(array.shape[i] / target_shape[i])
         i += 1
@@ -136,11 +137,8 @@ def batch_pad_right(arrays, mode="constant", value=0):
         # if there is only one array in the batch we simply unsqueeze it.
         return numpy.expand_dims(arrays[0], axis=0), numpy.array([1.0])
 
-    if not (
-        any(
-            [arrays[i].ndim == arrays[0].ndim for i in range(1, len(arrays))]
-        )
-    ):
+    if not (any(
+            [arrays[i].ndim == arrays[0].ndim for i in range(1, len(arrays))])):
         raise IndexError("All arrays must have same number of dimensions")
 
     # FIXME we limit the support here: we allow padding of only the last dimension
@@ -149,11 +147,9 @@ def batch_pad_right(arrays, mode="constant", value=0):
     for dim in range(arrays[0].ndim):
         if dim != (arrays[0].ndim - 1):
             if not all(
-                [x.shape[dim] == arrays[0].shape[dim] for x in arrays[1:]]
-            ):
+                    [x.shape[dim] == arrays[0].shape[dim] for x in arrays[1:]]):
                 raise EnvironmentError(
-                    "arrays should have same dimensions except for last one"
-                )
+                    "arrays should have same dimensions except for last one")
         max_shape.append(max([x.shape[dim] for x in arrays]))
 
     batched = []
@@ -161,8 +157,7 @@ def batch_pad_right(arrays, mode="constant", value=0):
     for t in arrays:
         # for each array we apply pad_right_to
         padded, valid_percent = pad_right_to(
-            t, max_shape, mode=mode, value=value
-        )
+            t, max_shape, mode=mode, value=value)
         batched.append(padded)
         valid.append(valid_percent[-1])
 
diff --git a/paddlespeech/vector/modules/sid_model.py b/paddlespeech/vector/modules/sid_model.py
index dc13b2e0..4045f75d 100644
--- a/paddlespeech/vector/modules/sid_model.py
+++ b/paddlespeech/vector/modules/sid_model.py
@@ -24,7 +24,8 @@ class SpeakerIdetification(nn.Layer):
             lin_blocks=0,
             lin_neurons=192,
             dropout=0.1, ):
-        """_summary_
+        """The speaker identification model, which includes the speaker backbone network
+        and a linear transform to speaker class num in training
 
         Args:
             backbone (Paddle.nn.Layer class): the speaker identification backbone network model
@@ -41,7 +42,7 @@ class SpeakerIdetification(nn.Layer):
             self.dropout = nn.Dropout(dropout)
         else:
             self.dropout = None
-        
+
         # construct the speaker classifier
         input_size = self.backbone.emb_size
         self.blocks = nn.LayerList()
@@ -63,14 +64,14 @@ class SpeakerIdetification(nn.Layer):
         """Do the speaker identification model forward,
         including the speaker embedding model and the classifier model network
 
         Args:
-            x (Paddle.Tensor): input audio feats,
+            x (paddle.Tensor): input audio feats,
                 shape=[batch, dimension, times]
-            lengths (_type_, optional): input audio length.
+            lengths (paddle.Tensor, optional): input audio length.
                 shape=[batch, times]
                 Defaults to None.
 
         Returns:
-            _type_: _description_
+            paddle.Tensor: return the logits of the feats
         """
         # x.shape: (N, C, L)
         x = self.backbone(x, lengths).squeeze(
             -1)  # (N, emb_size, 1) -> (N, emb_size)

From d85d1deef53b72dcf1c822bc26f431021027c9d9 Mon Sep 17 00:00:00 2001
From: xiongxinlei
Date: Tue, 22 Mar 2022 14:29:45 +0800
Subject: [PATCH 34/41] exec pre-commit in paddlespeech vector, test=doc

---
 paddlespeech/vector/__init__.py               |  2 +-
 .../vector/exps/ecapa_tdnn/extract_emb.py     | 21 ++++-----
 paddlespeech/vector/exps/ecapa_tdnn/train.py  | 47 +++++++++++--------
 3 files changed, 39 insertions(+), 31 deletions(-)

diff --git a/paddlespeech/vector/__init__.py b/paddlespeech/vector/__init__.py
index 61d5aa21..185a92b8 100644
--- a/paddlespeech/vector/__init__.py
+++ b/paddlespeech/vector/__init__.py
@@ -10,4 +10,4 @@
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
-# limitations under the License.
\ No newline at end of file
+# limitations under the License.
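For reference, the batch_pad_right helper reformatted above right-pads every array on its last axis and reports how much of each padded axis is valid. A small usage sketch (illustrative only, not part of the patch; the shapes are arbitrary examples):

import numpy as np
from paddlespeech.vector.io.batch import batch_pad_right

a = np.random.randn(80, 120)  # e.g. (n_mels, frames) of a short utterance
b = np.random.randn(80, 200)  # a longer utterance
batched, valid = batch_pad_right([a, b])
print(batched.shape)  # (2, 80, 200): right-padded on the last axis
print(valid)          # [0.6 1. ]: per-utterance ratio of valid (non-padded) frames
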
diff --git a/paddlespeech/vector/exps/ecapa_tdnn/extract_emb.py b/paddlespeech/vector/exps/ecapa_tdnn/extract_emb.py index 6dfcf06d..e30a50e4 100644 --- a/paddlespeech/vector/exps/ecapa_tdnn/extract_emb.py +++ b/paddlespeech/vector/exps/ecapa_tdnn/extract_emb.py @@ -13,9 +13,8 @@ # limitations under the License. import argparse import os - import time -import numpy as np + import paddle from yacs.config import CfgNode @@ -40,7 +39,8 @@ def extract_audio_embedding(args, config): ecapa_tdnn = EcapaTdnn(**config.model) # stage4: build the speaker verification train instance with backbone model - model = SpeakerIdetification(backbone=ecapa_tdnn, num_class=config.num_speakers) + model = SpeakerIdetification( + backbone=ecapa_tdnn, num_class=config.num_speakers) # stage 2: load the pre-trained model args.load_checkpoint = os.path.abspath( os.path.expanduser(args.load_checkpoint)) @@ -62,17 +62,17 @@ def extract_audio_embedding(args, config): # we need convert the audio feat to one-batch shape [batch, dim, time], where the batch is one # so the final shape is [1, dim, time] start_time = time.time() - feat = melspectrogram(x=waveform, - sr=config.sr, - n_mels=config.n_mels, - window_size=config.window_size, - hop_length=config.hop_size) + feat = melspectrogram( + x=waveform, + sr=config.sr, + n_mels=config.n_mels, + window_size=config.window_size, + hop_length=config.hop_size) feat = paddle.to_tensor(feat).unsqueeze(0) # in inference period, the lengths is all one without padding lengths = paddle.ones([1]) - feat = feature_normalize( - feat, mean_norm=True, std_norm=False) + feat = feature_normalize(feat, mean_norm=True, std_norm=False) # model backbone network forward the feats and get the embedding embedding = model.backbone( @@ -80,7 +80,6 @@ def extract_audio_embedding(args, config): elapsed_time = time.time() - start_time audio_length = waveform.shape[0] / sr - # stage 5: do global norm with external mean and std rtf = elapsed_time / audio_length logger.info(f"{args.device} rft={rtf}") diff --git a/paddlespeech/vector/exps/ecapa_tdnn/train.py b/paddlespeech/vector/exps/ecapa_tdnn/train.py index fb02d486..257b97ab 100644 --- a/paddlespeech/vector/exps/ecapa_tdnn/train.py +++ b/paddlespeech/vector/exps/ecapa_tdnn/train.py @@ -13,9 +13,9 @@ # limitations under the License. 
import argparse import os +import time import numpy as np -import time import paddle from paddle.io import BatchSampler from paddle.io import DataLoader @@ -27,6 +27,7 @@ from paddleaudio.datasets.voxceleb import VoxCeleb from paddlespeech.s2t.utils.log import Log from paddlespeech.vector.io.augment import build_augment_pipeline from paddlespeech.vector.io.augment import waveform_augment +from paddlespeech.vector.io.batch import batch_pad_right from paddlespeech.vector.io.batch import feature_normalize from paddlespeech.vector.io.batch import waveform_collate_fn from paddlespeech.vector.models.ecapa_tdnn import EcapaTdnn @@ -36,7 +37,6 @@ from paddlespeech.vector.modules.sid_model import SpeakerIdetification from paddlespeech.vector.training.scheduler import CyclicLRScheduler from paddlespeech.vector.training.seeding import seed_everything from paddlespeech.vector.utils.time import Timer -from paddlespeech.vector.io.batch import batch_pad_right logger = Log(__name__).getlog() @@ -145,7 +145,7 @@ def main(args, config): reader_start = time.time() for batch_idx, batch in enumerate(train_loader): train_reader_cost += time.time() - reader_start - + # stage 9-1: batch data is audio sample points and speaker id label feat_start = time.time() waveforms, labels = batch['waveforms'], batch['labels'] @@ -165,11 +165,12 @@ def main(args, config): # stage 9-3: extract the audio feats,such fbank, mfcc, spectrogram feats = [] for waveform in waveforms.numpy(): - feat = melspectrogram(x=waveform, - sr=config.sr, - n_mels=config.n_mels, - window_size=config.window_size, - hop_length=config.hop_size) + feat = melspectrogram( + x=waveform, + sr=config.sr, + n_mels=config.n_mels, + window_size=config.window_size, + hop_length=config.hop_size) feats.append(feat) feats = paddle.to_tensor(np.asarray(feats)) @@ -202,7 +203,7 @@ def main(args, config): num_corrects += (preds == labels).numpy().sum() num_samples += feats.shape[0] timer.count() # step plus one in timer - + # stage 9-10: print the log information only on 0-rank per log-freq batchs if (batch_idx + 1) % config.log_interval == 0 and local_rank == 0: lr = optimizer.get_lr() @@ -213,9 +214,12 @@ def main(args, config): epoch, config.epochs, batch_idx + 1, steps_per_epoch) print_msg += ' loss={:.4f}'.format(avg_loss) print_msg += ' acc={:.4f}'.format(avg_acc) - print_msg += ' avg_reader_cost: {:.5f} sec,'.format(train_reader_cost / config.log_interval) - print_msg += ' avg_feat_cost: {:.5f} sec,'.format(train_feat_cost / config.log_interval) - print_msg += ' avg_train_cost: {:.5f} sec,'.format(train_run_cost / config.log_interval) + print_msg += ' avg_reader_cost: {:.5f} sec,'.format( + train_reader_cost / config.log_interval) + print_msg += ' avg_feat_cost: {:.5f} sec,'.format( + train_feat_cost / config.log_interval) + print_msg += ' avg_train_cost: {:.5f} sec,'.format( + train_run_cost / config.log_interval) print_msg += ' lr={:.4E} step/sec={:.2f} | ETA {}'.format( lr, timer.timing, timer.eta) logger.info(print_msg) @@ -262,11 +266,12 @@ def main(args, config): feats = [] for waveform in waveforms.numpy(): - feat = melspectrogram(x=waveform, - sr=config.sr, - n_mels=config.n_mels, - window_size=config.window_size, - hop_length=config.hop_size) + feat = melspectrogram( + x=waveform, + sr=config.sr, + n_mels=config.n_mels, + window_size=config.window_size, + hop_length=config.hop_size) feats.append(feat) feats = paddle.to_tensor(np.asarray(feats)) @@ -285,7 +290,8 @@ def main(args, config): # stage 9-14: Save model parameters save_dir = 
os.path.join(args.checkpoint_dir,
                                     'epoch_{}'.format(epoch))
-            last_saved_epoch = os.path.join('epoch_{}'.format(epoch), "model.pdparams")
+            last_saved_epoch = os.path.join('epoch_{}'.format(epoch),
+                                            "model.pdparams")
             logger.info('Saving model checkpoint to {}'.format(save_dir))
             paddle.save(model.state_dict(),
                         os.path.join(save_dir, 'model.pdparams'))
@@ -300,10 +306,13 @@ def main(args, config):
         final_model = os.path.join(args.checkpoint_dir, "model.pdparams")
         logger.info(f"we will create the final model: {final_model}")
         if os.path.islink(final_model):
-            logger.info(f"An {final_model} already exists, we will remove it and create it again")
+            logger.info(
+                f"An {final_model} already exists, we will remove it and create it again"
+            )
             os.unlink(final_model)
         os.symlink(last_saved_epoch, final_model)
 
+
 if __name__ == "__main__":
     # yapf: disable
     parser = argparse.ArgumentParser(__doc__)

From 5221c2797f0e27f0e92893c7b2864f064a3174e3 Mon Sep 17 00:00:00 2001
From: xiongxinlei
Date: Wed, 23 Mar 2022 15:01:00 +0800
Subject: [PATCH 35/41] add voxceleb dataset and trial info, test=doc

---
 examples/voxceleb/README.md | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

diff --git a/examples/voxceleb/README.md b/examples/voxceleb/README.md
index fc847cd8..a2e58e00 100644
--- a/examples/voxceleb/README.md
+++ b/examples/voxceleb/README.md
@@ -26,3 +26,31 @@ ffmpeg -y -i %s -ac 1 -vn -acodec pcm_s16le -ar 16000 %s
 You can do the conversion using ffmpeg  https://gist.github.com/seungwonpark/4f273739beef2691cd53b5c39629d830). This operation might take several hours and should be only once.
 
 3. Put all the wav files in a folder called `wav`. You should have something like `voxceleb2/wav/id*/*.wav` (e.g, `voxceleb2/wav/id00012/21Uxsk56VDQ/00001.wav`)
+
+
+## VoxCeleb dataset summary
+
+
+| dataset | vox1 - dev | vox1 - test | vox2 - dev | vox2 - test |
+|---------|------------|-------------|------------|-------------|
+| spks    | 1211       | 40          | 5994       | 118         |
+| utts    | 148642     | 4874        | 1092009    | 36273       |
+| time(h) | 340.4      | 11.2        | 2360.2     | 79.9        |
+
+
+## Trial summary
+
+| trial | filename | nums | positive | negative |
+|--------|-----------|--------|-------|------|
+| VoxCeleb1 | veri_test.txt | 37720 | 18860 | 18860 |
+| VoxCeleb1(cleaned) | veri_test2.txt | 37611 | 18802 | 18809 |
+| VoxCeleb1-H | list_test_hard.txt | 552536 | 276270 | 276266 |
+| VoxCeleb1-H(cleaned) | list_test_hard2.txt | 550894 | 275488 | 275406 |
+| VoxCeleb1-E | list_test_all.txt | 581480 | 290743 | 290737 |
+| VoxCeleb1-E(cleaned) | list_test_all2.txt | 579818 | 289921 | 289897 |
+
+
+
+
+

From e2684e71f226f8777941f9aef5eef788050fb065 Mon Sep 17 00:00:00 2001
From: xiongxinlei
Date: Wed, 23 Mar 2022 18:25:32 +0800
Subject: [PATCH 36/41] refactor the data prepare process

---
 examples/voxceleb/sv0/local/data.sh | 23 +++++++++++++++++++++--
 1 file changed, 21 insertions(+), 2 deletions(-)

diff --git a/examples/voxceleb/sv0/local/data.sh b/examples/voxceleb/sv0/local/data.sh
index 42629c69..a3ff1c48 100755
--- a/examples/voxceleb/sv0/local/data.sh
+++ b/examples/voxceleb/sv0/local/data.sh
@@ -12,7 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-stage=0
+stage=1
 stop_stage=100
 
 .
${MAIN_ROOT}/utils/parse_options.sh || exit -1; @@ -36,4 +36,23 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then python3 local/data_prepare.py \ --data-dir ${dir} \ --config ${conf_path} -fi \ No newline at end of file +fi + +TARGET_DIR=${MAIN_ROOT}/dataset +mkdir -p ${TARGET_DIR} + +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + # download data, generate manifests + python3 ${TARGET_DIR}/voxceleb/voxceleb1.py \ + --manifest_prefix="data/vox1/manifest" \ + --target_dir="${TARGET_DIR}/voxceleb/vox1/" + + if [ $? -ne 0 ]; then + echo "Prepare voxceleb failed. Terminated." + exit 1 + fi + + # for dataset in train dev test; do + # mv data/manifest.${dataset} data/manifest.${dataset}.raw + # done +fi \ No newline at end of file From 62cbce69152baf953282573435f3c164dd0bee24 Mon Sep 17 00:00:00 2001 From: xiongxinlei Date: Thu, 24 Mar 2022 16:25:26 +0800 Subject: [PATCH 37/41] add vectorwrapper to extract audio embedding --- dataset/voxceleb/voxceleb1.py | 47 +++-- dataset/voxceleb/voxceleb2.py | 163 ++++++++++++++++++ .../vector/exps/ecapa_tdnn/extract_emb.py | 93 ++++++++++ 3 files changed, 293 insertions(+), 10 deletions(-) create mode 100644 dataset/voxceleb/voxceleb2.py diff --git a/dataset/voxceleb/voxceleb1.py b/dataset/voxceleb/voxceleb1.py index c6fc0695..d0978d9d 100644 --- a/dataset/voxceleb/voxceleb1.py +++ b/dataset/voxceleb/voxceleb1.py @@ -59,12 +59,17 @@ DEV_TARGET_DATA = "vox1_dev_wav_parta* vox1_dev_wav.zip ae63e55b951748cc486645f5 TEST_LIST = {"vox1_test_wav.zip": "185fdc63c3c739954633d50379a3d102"} TEST_TARGET_DATA = "vox1_test_wav.zip vox1_test_wav.zip 185fdc63c3c739954633d50379a3d102" -# kaldi trial -# this trial file is organized by kaldi according the official file, -# which is a little different with the official trial veri_test2.txt -KALDI_BASE_URL = "http://www.openslr.org/resources/49/" -TRIAL_LIST = {"voxceleb1_test_v2.txt": "29fc7cc1c5d59f0816dc15d6e8be60f7"} -TRIAL_TARGET_DATA = "voxceleb1_test_v2.txt voxceleb1_test_v2.txt 29fc7cc1c5d59f0816dc15d6e8be60f7" +# voxceleb trial + +TRIAL_BASE_URL = "https://www.robots.ox.ac.uk/~vgg/data/voxceleb/meta/" +TRIAL_LIST = { + "veri_test.txt": "29fc7cc1c5d59f0816dc15d6e8be60f7", # voxceleb1 + "veri_test2.txt": "b73110731c9223c1461fe49cb48dddfc", # voxceleb1(cleaned) + "list_test_hard.txt": "21c341b6b2168eea2634df0fb4b8fff1", # voxceleb1-H + "list_test_hard2.txt": "857790e09d579a68eb2e339a090343c8", # voxceleb1-H(cleaned) + "list_test_all.txt": "b9ecf7aa49d4b656aa927a8092844e4a", # voxceleb1-E + "list_test_all2.txt": "a53e059deb562ffcfc092bf5d90d9f3a" # voxceleb1-E(cleaned) + } parser = argparse.ArgumentParser(description=__doc__) parser.add_argument( @@ -82,7 +87,7 @@ args = parser.parse_args() def create_manifest(data_dir, manifest_path_prefix): - print("Creating manifest %s ..." % manifest_path_prefix) + print(f"Creating manifest {manifest_path_prefix} from {data_dir}") json_lines = [] data_path = os.path.join(data_dir, "wav", "**", "*.wav") total_sec = 0.0 @@ -114,6 +119,9 @@ def create_manifest(data_dir, manifest_path_prefix): # voxceleb1 is given explicit in the path data_dir_name = Path(data_dir).name manifest_path_prefix = manifest_path_prefix + "." 
+ data_dir_name + if not os.path.exists(os.path.dirname(manifest_path_prefix)): + os.makedirs(os.path.dirname(manifest_path_prefix)) + with codecs.open(manifest_path_prefix, 'w', encoding='utf-8') as f: for line in json_lines: f.write(line + "\n") @@ -133,11 +141,13 @@ def create_manifest(data_dir, manifest_path_prefix): def prepare_dataset(base_url, data_list, target_dir, manifest_path, target_data): if not os.path.exists(target_dir): - os.mkdir(target_dir) + os.makedirs(target_dir) # wav directory already exists, it need do nothing + # we will download the voxceleb1 data to ${target_dir}/vox1/dev/ or ${target_dir}/vox1/test directory if not os.path.exists(os.path.join(target_dir, "wav")): # download all dataset part + print("start to download the vox1 dev zip package") for zip_part in data_list.keys(): download_url = " --no-check-certificate " + base_url + "/" + zip_part download( @@ -166,11 +176,20 @@ def prepare_dataset(base_url, data_list, target_dir, manifest_path, # create the manifest file create_manifest(data_dir=target_dir, manifest_path_prefix=manifest_path) +def prepare_trial(base_url, data_list, target_dir): + if not os.path.exists(target_dir): + os.makedirs(target_dir) + for trial, md5sum in data_list.items(): + target_trial = os.path.join(target_dir, trial) + if not os.path.exists(os.path.join(target_dir, trial)): + download_url = " --no-check-certificate " + base_url + "/" + trial + download(url=download_url, md5sum=md5sum, target_dir=target_dir) def main(): if args.target_dir.startswith('~'): args.target_dir = os.path.expanduser(args.target_dir) - + + # prepare the vox1 dev data prepare_dataset( base_url=BASE_URL, data_list=DEV_LIST, @@ -178,6 +197,7 @@ def main(): manifest_path=args.manifest_prefix, target_data=DEV_TARGET_DATA) + # prepare the vox1 test data prepare_dataset( base_url=BASE_URL, data_list=TEST_LIST, @@ -185,8 +205,15 @@ def main(): manifest_path=args.manifest_prefix, target_data=TEST_TARGET_DATA) + # prepare the vox1 trial + prepare_trial( + base_url=TRIAL_BASE_URL, + data_list=TRIAL_LIST, + target_dir=os.path.dirname(args.manifest_prefix) + ) + print("Manifest prepare done!") if __name__ == '__main__': - main() \ No newline at end of file + main() diff --git a/dataset/voxceleb/voxceleb2.py b/dataset/voxceleb/voxceleb2.py new file mode 100644 index 00000000..ef7bb230 --- /dev/null +++ b/dataset/voxceleb/voxceleb2.py @@ -0,0 +1,163 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Prepare VoxCeleb2 dataset + +Download and unpack the voxceleb2 data files. 
+VoxCeleb2 data is stored in the m4a format,
+so we need to convert the m4a files to wav with the convert.sh script
+"""
+import argparse
+import codecs
+import glob
+import json
+import os
+import subprocess
+from pathlib import Path
+
+import soundfile
+
+from utils.utility import check_md5sum
+from utils.utility import download
+from utils.utility import unzip
+
+# all the data will be downloaded into the current data/voxceleb directory by default
+DATA_HOME = os.path.expanduser('.')
+
+BASE_URL = "--no-check-certificate https://www.robots.ox.ac.uk/~vgg/data/voxceleb/data/"
+
+# dev data
+DEV_DATA_URL = BASE_URL + '/vox2_aac.zip'
+DEV_MD5SUM = "bbc063c46078a602ca71605645c2a402"
+
+
+# test data
+TEST_DATA_URL = BASE_URL + '/vox2_test_aac.zip'
+TEST_MD5SUM = "0d2b3ea430a821c33263b5ea37ede312"
+
+parser = argparse.ArgumentParser(description=__doc__)
+parser.add_argument(
+    "--target_dir",
+    default=DATA_HOME + "/voxceleb2/",
+    type=str,
+    help="Directory to save the voxceleb2 dataset. (default: %(default)s)")
+parser.add_argument(
+    "--manifest_prefix",
+    default="manifest",
+    type=str,
+    help="Filepath prefix for output manifests. (default: %(default)s)")
+parser.add_argument("--download",
+                    default=False,
+                    action="store_true",
+                    help="Download the voxceleb2 dataset. (default: %(default)s)")
+parser.add_argument("--generate",
+                    default=False,
+                    action="store_true",
+                    help="Generate the manifest files. (default: %(default)s)")
+
+args = parser.parse_args()
+
+
+def create_manifest(data_dir, manifest_path_prefix):
+    print("Creating manifest %s ..." % manifest_path_prefix)
+    json_lines = []
+    data_path = os.path.join(data_dir, "**", "*.wav")
+    total_sec = 0.0
+    total_text = 0.0
+    total_num = 0
+    speakers = set()
+    for audio_path in glob.glob(data_path, recursive=True):
+        audio_id = "-".join(audio_path.split("/")[-3:])
+        utt2spk = audio_path.split("/")[-3]
+        duration = soundfile.info(audio_path).duration
+        text = ""
+        json_lines.append(
+            json.dumps(
+                {
+                    "utt": audio_id,
+                    "utt2spk": str(utt2spk),
+                    "feat": audio_path,
+                    "feat_shape": (duration, ),
+                    "text": text  # compatible with asr data format
+                },
+                ensure_ascii=False))
+
+        total_sec += duration
+        total_text += len(text)
+        total_num += 1
+        speakers.add(utt2spk)
+
+    # data_dir_name refers to dev or test
+    # voxceleb2 is given explicitly in the path
+    data_dir_name = Path(data_dir).name
+    manifest_path_prefix = manifest_path_prefix + "." + data_dir_name
+
+    if not os.path.exists(os.path.dirname(manifest_path_prefix)):
+        os.makedirs(os.path.dirname(manifest_path_prefix))
+    with codecs.open(manifest_path_prefix, 'w', encoding='utf-8') as f:
+        for line in json_lines:
+            f.write(line + "\n")
+
+    manifest_dir = os.path.dirname(manifest_path_prefix)
+    meta_path = os.path.join(manifest_dir, "voxceleb2." + data_dir_name) + ".meta"
+    with codecs.open(meta_path, 'w', encoding='utf-8') as f:
+        print(f"{total_num} utts", file=f)
+        print(f"{len(speakers)} speakers", file=f)
+        print(f"{total_sec / (60 * 60)} h", file=f)
+        print(f"{total_text} text", file=f)
+        print(f"{total_text / total_sec} text/sec", file=f)
+        print(f"{total_sec / total_num} sec/utt", file=f)
+
+
+def download_dataset(url, md5sum, target_dir, dataset):
+    if not os.path.exists(target_dir):
+        os.makedirs(target_dir)
+
+    # if the wav directory already exists, nothing needs to be done
+    print("target dir {}".format(os.path.join(target_dir, dataset)))
+    # unzipping the dev dataset creates the dev dir and unpacks the m4a files into it,
+    # but the test dataset unzips into aac,
+    # so we create ${target_dir}/test and unzip the m4a files into the test dir
+    if not os.path.exists(os.path.join(target_dir, dataset)):
+        filepath = download(url, md5sum, target_dir)
+        if dataset == "test":
+            unzip(filepath, os.path.join(target_dir, "test"))
+
+
+def main():
+    if args.target_dir.startswith('~'):
+        args.target_dir = os.path.expanduser(args.target_dir)
+
+    # download and unpack the vox2-dev data
+    print("download: {}".format(args.download))
+    if args.download:
+        download_dataset(
+            url=DEV_DATA_URL,
+            md5sum=DEV_MD5SUM,
+            target_dir=args.target_dir,
+            dataset="dev")
+
+        download_dataset(
+            url=TEST_DATA_URL,
+            md5sum=TEST_MD5SUM,
+            target_dir=args.target_dir,
+            dataset="test")
+
+        print("VoxCeleb2 download is done!")
+
+    if args.generate:
+        create_manifest(args.target_dir, manifest_path_prefix=args.manifest_prefix)
+
+if __name__ == '__main__':
+    main()
diff --git a/paddlespeech/vector/exps/ecapa_tdnn/extract_emb.py b/paddlespeech/vector/exps/ecapa_tdnn/extract_emb.py
index e30a50e4..ec24be51 100644
--- a/paddlespeech/vector/exps/ecapa_tdnn/extract_emb.py
+++ b/paddlespeech/vector/exps/ecapa_tdnn/extract_emb.py
@@ -28,6 +28,91 @@ from paddlespeech.vector.training.seeding import seed_everything
 
 logger = Log(__name__).getlog()
 
+class VectorWrapper:
+    """ VectorWrapper extract the audio embedding,
+        and single audio will get only an embedding
+    """
+    def __init__(self,
+                device,
+                config_path,
+                model_path,):
+        super(VectorWrapper, self).__init__()
+        # stage 0: config the
+        self.device = device
+        self.config_path = config_path
+        self.model_path = model_path
+
+        # stage 1: set the run host device
+        paddle.device.set_device(device)
+
+        # stage 2: read the yaml config and set the seed factor
+        self.read_yaml_config(self.config_path)
+        seed_everything(self.config.seed)
+
+        # stage 3: init the speaker verification model
+        self.init_vector_model(self.config, self.model_path)
+
+    def read_yaml_config(self, config_path):
+        """Read the yaml config from the config path
+
+        Args:
+            config_path (str): yaml config path
+        """
+        config = CfgNode(new_allowed=True)
+
+        if config_path:
+            config.merge_from_file(config_path)
+
+        config.freeze()
+        self.config = config
+
+    def init_vector_model(self, config, model_path):
+        """Init the vector model from yaml config
+
+        Args:
+            config (CfgNode): yaml config
+            model_path (str): pretrained model path and the stored model is named as model.pdparams
+        """
+        # get the backbone network instance
+        ecapa_tdnn = EcapaTdnn(**config.model)
+
+        # get the sid instance
+        model = SpeakerIdetification(backbone=ecapa_tdnn, num_class=config.num_speakers)
+
+        # read the model parameters to sid model
+        model_path = os.path.abspath(os.path.expanduser(model_path))
+        state_dict = paddle.load(os.path.join(model_path, "model.pdparams"))
+        model.set_state_dict(state_dict)
+ + model.eval() + self.model = model + + def extract_audio_embedding(self, audio_path): + """Extract the audio embedding + + Args: + audio_path (str): audio path, which will be extracted the embedding + + Returns: + embedding (numpy.array) : audio embedding + """ + waveform, sr = load_audio(audio_path) + feat = melspectrogram(x=waveform, + sr=self.config.sr, + n_mels=self.config.n_mels, + window_size=self.config.window_size, + hop_length=self.config.hop_size) + # conver the audio feat to batch shape, which means batch_size is equal to one + feat = paddle.to_tensor(feat).unsqueeze(0) + + # in inference period, the lengths is all one without padding + lengths = paddle.ones([1]) + feat = feature_normalize(feat, mean_norm=True, std_norm=False) + + # model backbone network forward the feats and get the embedding + embedding = self.model.backbone(feat, lengths).squeeze().numpy() # (1, emb_size, 1) -> (emb_size) + + return embedding def extract_audio_embedding(args, config): # stage 0: set the training device, cpu or gpu @@ -83,6 +168,7 @@ def extract_audio_embedding(args, config): # stage 5: do global norm with external mean and std rtf = elapsed_time / audio_length logger.info(f"{args.device} rft={rtf}") + paddle.save(embedding, "emb1") return embedding @@ -116,3 +202,10 @@ if __name__ == "__main__": print(config) extract_audio_embedding(args, config) + + # use the VectorWrapper to extract the audio embedding + vector_inst = VectorWrapper(device="gpu", + config_path=args.config, + model_path=args.load_checkpoint) + + embedding = vector_inst.extract_audio_embedding(args.audio_path) From 0bb67d8b8e09985d2275af497d8053ba479b455e Mon Sep 17 00:00:00 2001 From: xiongxinlei Date: Thu, 24 Mar 2022 16:55:23 +0800 Subject: [PATCH 38/41] add vector cli unit test, test=doc --- paddlespeech/cli/vector/infer.py | 2 +- tests/unit/cli/test_cli.sh | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/paddlespeech/cli/vector/infer.py b/paddlespeech/cli/vector/infer.py index f1a0e79c..91974761 100644 --- a/paddlespeech/cli/vector/infer.py +++ b/paddlespeech/cli/vector/infer.py @@ -39,7 +39,7 @@ pretrained_models = { # The tags for pretrained_models should be "{model_name}[-{dataset}][-{sr}][-...]". # e.g. "ecapatdnn_voxceleb12-16k". 
From 0bb67d8b8e09985d2275af497d8053ba479b455e Mon Sep 17 00:00:00 2001
From: xiongxinlei
Date: Thu, 24 Mar 2022 16:55:23 +0800
Subject: [PATCH 38/41] add vector cli unit test, test=doc

---
 paddlespeech/cli/vector/infer.py | 2 +-
 tests/unit/cli/test_cli.sh      | 4 ++++
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/paddlespeech/cli/vector/infer.py b/paddlespeech/cli/vector/infer.py
index f1a0e79c..91974761 100644
--- a/paddlespeech/cli/vector/infer.py
+++ b/paddlespeech/cli/vector/infer.py
@@ -39,7 +39,7 @@ pretrained_models = {
     # The tags for pretrained_models should be "{model_name}[-{dataset}][-{sr}][-...]".
     # e.g. "ecapatdnn_voxceleb12-16k".
     # Command line and python api use "{model_name}[-{dataset}]" as --model, usage:
-    #   "paddlespeech vector --task spk --model ecapatdnn_voxceleb12-voxceleb12-16k --sr 16000 --input ./input.wav"
+    #   "paddlespeech vector --task spk --model ecapatdnn_voxceleb12-16k --sr 16000 --input ./input.wav"
     "ecapatdnn_voxceleb12-16k": {
         'url':
         'https://paddlespeech.bj.bcebos.com/vector/voxceleb/sv0_ecapa_tdnn_voxceleb12_ckpt_0_1_0.tar.gz',

diff --git a/tests/unit/cli/test_cli.sh b/tests/unit/cli/test_cli.sh
index 748e5608..ec9f8d13 100755
--- a/tests/unit/cli/test_cli.sh
+++ b/tests/unit/cli/test_cli.sh
@@ -28,3 +28,7 @@ paddlespeech tts --am tacotron2_ljspeech --voc pwgan_ljspeech --lang en --input
 
 # Speech Translation (only support linux)
 paddlespeech st --input ./en.wav
+
+# Speaker Verification
+wget -c https://paddlespeech.bj.bcebos.com/vector/audio/85236145389.wav
+paddlespeech vector --task spk --input 85236145389.wav
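The one-line fix above brings the usage string in line with the documented tag convention, `{model_name}[-{dataset}][-{sr}]`. For illustration, a hypothetical helper that decomposes such a tag — a sketch only, not the CLI's actual tag resolver:

```python
# Illustrative decomposition of a "{model_name}[-{dataset}][-{sr}]" tag.
# A sketch only; the paddlespeech CLI resolves tags its own way.
def parse_model_tag(tag: str):
    """Split e.g. 'ecapatdnn_voxceleb12-16k' into (name, dataset, sample rate)."""
    model, _, sr = tag.partition("-")        # -> 'ecapatdnn_voxceleb12', '16k'
    name, _, dataset = model.partition("_")  # -> 'ecapatdnn', 'voxceleb12'
    return name, dataset, sr


assert parse_model_tag("ecapatdnn_voxceleb12-16k") == ("ecapatdnn", "voxceleb12", "16k")
```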
From 0f78d25f7619a7d99de404be935e92004f2cd413 Mon Sep 17 00:00:00 2001
From: xiongxinlei
Date: Thu, 24 Mar 2022 18:52:30 +0800
Subject: [PATCH 39/41] add vector cli batch and pipeline test demo, test=doc

---
 .../vector/exps/ecapa_tdnn/extract_emb.py | 96 +------------------
 tests/unit/cli/test_cli.sh               | 95 +++++++++---------
 2 files changed, 54 insertions(+), 137 deletions(-)

diff --git a/paddlespeech/vector/exps/ecapa_tdnn/extract_emb.py b/paddlespeech/vector/exps/ecapa_tdnn/extract_emb.py
index ec24be51..686de936 100644
--- a/paddlespeech/vector/exps/ecapa_tdnn/extract_emb.py
+++ b/paddlespeech/vector/exps/ecapa_tdnn/extract_emb.py
@@ -28,91 +28,6 @@ from paddlespeech.vector.training.seeding import seed_everything
 
 logger = Log(__name__).getlog()
 
-class VectorWrapper:
-    """VectorWrapper extracts the audio embedding;
-       a single piece of audio yields exactly one embedding.
-    """
-    def __init__(self,
-                 device,
-                 config_path,
-                 model_path,):
-        super(VectorWrapper, self).__init__()
-        # stage 0: store the device and the config/model paths
-        self.device = device
-        self.config_path = config_path
-        self.model_path = model_path
-
-        # stage 1: set the run host device
-        paddle.device.set_device(device)
-
-        # stage 2: read the yaml config and set the seed factor
-        self.read_yaml_config(self.config_path)
-        seed_everything(self.config.seed)
-
-        # stage 3: init the speaker verification model
-        self.init_vector_model(self.config, self.model_path)
-
-    def read_yaml_config(self, config_path):
-        """Read the yaml config from the config path
-
-        Args:
-            config_path (str): yaml config path
-        """
-        config = CfgNode(new_allowed=True)
-
-        if config_path:
-            config.merge_from_file(config_path)
-
-        config.freeze()
-        self.config = config
-
-    def init_vector_model(self, config, model_path):
-        """Init the vector model from the yaml config
-
-        Args:
-            config (CfgNode): yaml config
-            model_path (str): pretrained model dir; the stored model is named model.pdparams
-        """
-        # get the backbone network instance
-        ecapa_tdnn = EcapaTdnn(**config.model)
-
-        # get the sid instance
-        model = SpeakerIdetification(backbone=ecapa_tdnn, num_class=config.num_speakers)
-
-        # read the model parameters into the sid model
-        model_path = os.path.abspath(os.path.expanduser(model_path))
-        state_dict = paddle.load(os.path.join(model_path, "model.pdparams"))
-        model.set_state_dict(state_dict)
-
-        model.eval()
-        self.model = model
-
-    def extract_audio_embedding(self, audio_path):
-        """Extract the audio embedding
-
-        Args:
-            audio_path (str): path of the audio from which the embedding is extracted
-
-        Returns:
-            embedding (numpy.array): audio embedding
-        """
-        waveform, sr = load_audio(audio_path)
-        feat = melspectrogram(x=waveform,
-                              sr=self.config.sr,
-                              n_mels=self.config.n_mels,
-                              window_size=self.config.window_size,
-                              hop_length=self.config.hop_size)
-        # convert the audio feat to batch shape, which means batch_size is equal to one
-        feat = paddle.to_tensor(feat).unsqueeze(0)
-
-        # at inference time the lengths are all one, since there is no padding
-        lengths = paddle.ones([1])
-        feat = feature_normalize(feat, mean_norm=True, std_norm=False)
-
-        # the model backbone network forwards the feats to get the embedding
-        embedding = self.model.backbone(feat, lengths).squeeze().numpy()  # (1, emb_size, 1) -> (emb_size)
-
-        return embedding
 
 def extract_audio_embedding(args, config):
     # stage 0: set the training device, cpu or gpu
@@ -168,7 +83,7 @@ def extract_audio_embedding(args, config):
     # stage 5: do global norm with external mean and std
     rtf = elapsed_time / audio_length
     logger.info(f"{args.device} rft={rtf}")
-    paddle.save(embedding, "emb1")
+
     return embedding
 
@@ -177,7 +92,7 @@ if __name__ == "__main__":
     parser = argparse.ArgumentParser(__doc__)
     parser.add_argument('--device',
                         choices=['cpu', 'gpu'],
-                        default="gpu",
+                        default="cpu",
                         help="Select which device to train model, defaults to gpu.")
     parser.add_argument("--config",
                         default=None,
@@ -202,10 +117,3 @@ if __name__ == "__main__":
     print(config)
 
     extract_audio_embedding(args, config)
-
-    # use the VectorWrapper to extract the audio embedding
-    vector_inst = VectorWrapper(device="gpu",
-                                config_path=args.config,
-                                model_path=args.load_checkpoint)
-
-    embedding = vector_inst.extract_audio_embedding(args.audio_path)

diff --git a/tests/unit/cli/test_cli.sh b/tests/unit/cli/test_cli.sh
index fc3b8248..f2437eaf 100755
--- a/tests/unit/cli/test_cli.sh
+++ b/tests/unit/cli/test_cli.sh
@@ -1,49 +1,58 @@
 #!/bin/bash
 set -e
 
-# Audio classification
-wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/cat.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/dog.wav
-paddlespeech cls --input ./cat.wav --topk 10
-
-# Punctuation_restoration
-paddlespeech text --input 今天的天气真不错啊你下午有空吗我想约你一起去吃饭
-
-# Speech_recognition
-wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav
-paddlespeech asr --input ./zh.wav
-paddlespeech asr --model transformer_librispeech --lang en --input ./en.wav
-
-# Text To Speech
-paddlespeech tts --input "你好,欢迎使用百度飞桨深度学习框架!"
-paddlespeech tts --am speedyspeech_csmsc --input "你好,欢迎使用百度飞桨深度学习框架!"
-paddlespeech tts --voc mb_melgan_csmsc --input "你好,欢迎使用百度飞桨深度学习框架!"
-paddlespeech tts --voc style_melgan_csmsc --input "你好,欢迎使用百度飞桨深度学习框架!"
-paddlespeech tts --voc hifigan_csmsc --input "你好,欢迎使用百度飞桨深度学习框架!"
-paddlespeech tts --am fastspeech2_aishell3 --voc pwgan_aishell3 --input "你好,欢迎使用百度飞桨深度学习框架!" --spk_id 0
-paddlespeech tts --am fastspeech2_aishell3 --voc hifigan_aishell3 --input "你好,欢迎使用百度飞桨深度学习框架!" --spk_id 0
-paddlespeech tts --am fastspeech2_ljspeech --voc pwgan_ljspeech --lang en --input "Life was like a box of chocolates, you never know what you're gonna get."
-paddlespeech tts --am fastspeech2_ljspeech --voc hifigan_ljspeech --lang en --input "Life was like a box of chocolates, you never know what you're gonna get."
-paddlespeech tts --am fastspeech2_vctk --voc pwgan_vctk --input "Life was like a box of chocolates, you never know what you're gonna get." --lang en --spk_id 0
-paddlespeech tts --am fastspeech2_vctk --voc hifigan_vctk --input "Life was like a box of chocolates, you never know what you're gonna get." --lang en --spk_id 0
-paddlespeech tts --am tacotron2_csmsc --input "你好,欢迎使用百度飞桨深度学习框架!"
-paddlespeech tts --am tacotron2_csmsc --voc wavernn_csmsc --input "你好,欢迎使用百度飞桨深度学习框架!"
-paddlespeech tts --am tacotron2_ljspeech --voc pwgan_ljspeech --lang en --input "Life was like a box of chocolates, you never know what you're gonna get."
-
-
-# Speech Translation (only support linux)
-paddlespeech st --input ./en.wav
-
-
-# batch process
-echo -e "1 欢迎光临。\n2 谢谢惠顾。" | paddlespeech tts
-
-# shell pipeline
-paddlespeech asr --input ./zh.wav | paddlespeech text --task punc
-
-# stats
-paddlespeech stats --task asr
-paddlespeech stats --task tts
-paddlespeech stats --task cls
+# # Audio classification
+# wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/cat.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/dog.wav
+# paddlespeech cls --input ./cat.wav --topk 10
+
+# # Punctuation_restoration
+# paddlespeech text --input 今天的天气真不错啊你下午有空吗我想约你一起去吃饭
+
+# # Speech_recognition
+# wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav
+# paddlespeech asr --input ./zh.wav
+# paddlespeech asr --model transformer_librispeech --lang en --input ./en.wav
+
+# # Text To Speech
+# paddlespeech tts --input "你好,欢迎使用百度飞桨深度学习框架!"
+# paddlespeech tts --am speedyspeech_csmsc --input "你好,欢迎使用百度飞桨深度学习框架!"
+# paddlespeech tts --voc mb_melgan_csmsc --input "你好,欢迎使用百度飞桨深度学习框架!"
+# paddlespeech tts --voc style_melgan_csmsc --input "你好,欢迎使用百度飞桨深度学习框架!"
+# paddlespeech tts --voc hifigan_csmsc --input "你好,欢迎使用百度飞桨深度学习框架!"
+# paddlespeech tts --am fastspeech2_aishell3 --voc pwgan_aishell3 --input "你好,欢迎使用百度飞桨深度学习框架!" --spk_id 0
+# paddlespeech tts --am fastspeech2_aishell3 --voc hifigan_aishell3 --input "你好,欢迎使用百度飞桨深度学习框架!" --spk_id 0
+# paddlespeech tts --am fastspeech2_ljspeech --voc pwgan_ljspeech --lang en --input "Life was like a box of chocolates, you never know what you're gonna get."
+# paddlespeech tts --am fastspeech2_ljspeech --voc hifigan_ljspeech --lang en --input "Life was like a box of chocolates, you never know what you're gonna get."
+# paddlespeech tts --am fastspeech2_vctk --voc pwgan_vctk --input "Life was like a box of chocolates, you never know what you're gonna get." --lang en --spk_id 0
+# paddlespeech tts --am fastspeech2_vctk --voc hifigan_vctk --input "Life was like a box of chocolates, you never know what you're gonna get." --lang en --spk_id 0
+# paddlespeech tts --am tacotron2_csmsc --input "你好,欢迎使用百度飞桨深度学习框架!"
+# paddlespeech tts --am tacotron2_csmsc --voc wavernn_csmsc --input "你好,欢迎使用百度飞桨深度学习框架!"
+# paddlespeech tts --am tacotron2_ljspeech --voc pwgan_ljspeech --lang en --input "Life was like a box of chocolates, you never know what you're gonna get."
+
+
+# # Speech Translation (only support linux)
+# paddlespeech st --input ./en.wav
+
+
+# # batch process
+# echo -e "1 欢迎光临。\n2 谢谢惠顾。" | paddlespeech tts
+
+# # shell pipeline
+# paddlespeech asr --input ./zh.wav | paddlespeech text --task punc
+
+# # stats
+# paddlespeech stats --task asr
+# paddlespeech stats --task tts
+# paddlespeech stats --task cls
 
 # Speaker Verification
 wget -c https://paddlespeech.bj.bcebos.com/vector/audio/85236145389.wav
 paddlespeech vector --task spk --input 85236145389.wav
+
+echo "demo 85236145389.wav" > vec.job
+paddlespeech vector --task spk --input vec.job
+
+echo "demo 85236145389.wav" | paddlespeech vector --task spk
+rm 85236145389.wav
+rm vec.job
+
+
From 3054659901ebb198ad8556500e38bb7558b22653 Mon Sep 17 00:00:00 2001
From: xiongxinlei
Date: Thu, 24 Mar 2022 18:59:36 +0800
Subject: [PATCH 40/41] remove debug info, test=doc

---
 tests/unit/cli/test_cli.sh | 86 +++++++++++++++++++-------------------
 1 file changed, 43 insertions(+), 43 deletions(-)

diff --git a/tests/unit/cli/test_cli.sh b/tests/unit/cli/test_cli.sh
index f2437eaf..6fbb1570 100755
--- a/tests/unit/cli/test_cli.sh
+++ b/tests/unit/cli/test_cli.sh
@@ -1,48 +1,48 @@
 #!/bin/bash
 set -e
 
-# # Audio classification
-# wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/cat.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/dog.wav
-# paddlespeech cls --input ./cat.wav --topk 10
-
-# # Punctuation_restoration
-# paddlespeech text --input 今天的天气真不错啊你下午有空吗我想约你一起去吃饭
-
-# # Speech_recognition
-# wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav
-# paddlespeech asr --input ./zh.wav
-# paddlespeech asr --model transformer_librispeech --lang en --input ./en.wav
-
-# # Text To Speech
-# paddlespeech tts --input "你好,欢迎使用百度飞桨深度学习框架!"
-# paddlespeech tts --am speedyspeech_csmsc --input "你好,欢迎使用百度飞桨深度学习框架!"
-# paddlespeech tts --voc mb_melgan_csmsc --input "你好,欢迎使用百度飞桨深度学习框架!"
-# paddlespeech tts --voc style_melgan_csmsc --input "你好,欢迎使用百度飞桨深度学习框架!"
-# paddlespeech tts --voc hifigan_csmsc --input "你好,欢迎使用百度飞桨深度学习框架!"
-# paddlespeech tts --am fastspeech2_aishell3 --voc pwgan_aishell3 --input "你好,欢迎使用百度飞桨深度学习框架!" --spk_id 0
-# paddlespeech tts --am fastspeech2_aishell3 --voc hifigan_aishell3 --input "你好,欢迎使用百度飞桨深度学习框架!" --spk_id 0
-# paddlespeech tts --am fastspeech2_ljspeech --voc pwgan_ljspeech --lang en --input "Life was like a box of chocolates, you never know what you're gonna get."
-# paddlespeech tts --am fastspeech2_ljspeech --voc hifigan_ljspeech --lang en --input "Life was like a box of chocolates, you never know what you're gonna get."
-# paddlespeech tts --am fastspeech2_vctk --voc pwgan_vctk --input "Life was like a box of chocolates, you never know what you're gonna get." --lang en --spk_id 0
-# paddlespeech tts --am fastspeech2_vctk --voc hifigan_vctk --input "Life was like a box of chocolates, you never know what you're gonna get." --lang en --spk_id 0
-# paddlespeech tts --am tacotron2_csmsc --input "你好,欢迎使用百度飞桨深度学习框架!"
-# paddlespeech tts --am tacotron2_csmsc --voc wavernn_csmsc --input "你好,欢迎使用百度飞桨深度学习框架!"
-# paddlespeech tts --am tacotron2_ljspeech --voc pwgan_ljspeech --lang en --input "Life was like a box of chocolates, you never know what you're gonna get."
-
-
-# # Speech Translation (only support linux)
-# paddlespeech st --input ./en.wav
-
-
-# # batch process
-# echo -e "1 欢迎光临。\n2 谢谢惠顾。" | paddlespeech tts
-
-# # shell pipeline
-# paddlespeech asr --input ./zh.wav | paddlespeech text --task punc
-
-# # stats
-# paddlespeech stats --task asr
-# paddlespeech stats --task tts
-# paddlespeech stats --task cls
+# Audio classification
+wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/cat.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/dog.wav
+paddlespeech cls --input ./cat.wav --topk 10
+
+# Punctuation_restoration
+paddlespeech text --input 今天的天气真不错啊你下午有空吗我想约你一起去吃饭
+
+# Speech_recognition
+wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav
+paddlespeech asr --input ./zh.wav
+paddlespeech asr --model transformer_librispeech --lang en --input ./en.wav
+
+# Text To Speech
+paddlespeech tts --input "你好,欢迎使用百度飞桨深度学习框架!"
+paddlespeech tts --am speedyspeech_csmsc --input "你好,欢迎使用百度飞桨深度学习框架!"
+paddlespeech tts --voc mb_melgan_csmsc --input "你好,欢迎使用百度飞桨深度学习框架!"
+paddlespeech tts --voc style_melgan_csmsc --input "你好,欢迎使用百度飞桨深度学习框架!"
+paddlespeech tts --voc hifigan_csmsc --input "你好,欢迎使用百度飞桨深度学习框架!"
+paddlespeech tts --am fastspeech2_aishell3 --voc pwgan_aishell3 --input "你好,欢迎使用百度飞桨深度学习框架!" --spk_id 0
+paddlespeech tts --am fastspeech2_aishell3 --voc hifigan_aishell3 --input "你好,欢迎使用百度飞桨深度学习框架!" --spk_id 0
+paddlespeech tts --am fastspeech2_ljspeech --voc pwgan_ljspeech --lang en --input "Life was like a box of chocolates, you never know what you're gonna get."
+paddlespeech tts --am fastspeech2_ljspeech --voc hifigan_ljspeech --lang en --input "Life was like a box of chocolates, you never know what you're gonna get."
+paddlespeech tts --am fastspeech2_vctk --voc pwgan_vctk --input "Life was like a box of chocolates, you never know what you're gonna get." --lang en --spk_id 0
+paddlespeech tts --am fastspeech2_vctk --voc hifigan_vctk --input "Life was like a box of chocolates, you never know what you're gonna get." --lang en --spk_id 0
+paddlespeech tts --am tacotron2_csmsc --input "你好,欢迎使用百度飞桨深度学习框架!"
+paddlespeech tts --am tacotron2_csmsc --voc wavernn_csmsc --input "你好,欢迎使用百度飞桨深度学习框架!"
+paddlespeech tts --am tacotron2_ljspeech --voc pwgan_ljspeech --lang en --input "Life was like a box of chocolates, you never know what you're gonna get."
+
+
+# Speech Translation (only support linux)
+paddlespeech st --input ./en.wav
+
+
+# batch process
+echo -e "1 欢迎光临。\n2 谢谢惠顾。" | paddlespeech tts
+
+# shell pipeline
+paddlespeech asr --input ./zh.wav | paddlespeech text --task punc
+
+# stats
+paddlespeech stats --task asr
+paddlespeech stats --task tts
+paddlespeech stats --task cls
 
 # Speaker Verification
 wget -c https://paddlespeech.bj.bcebos.com/vector/audio/85236145389.wav
 paddlespeech vector --task spk --input 85236145389.wav

From faf6b8daf855fd06d96f80b2d592579ba4b61ab5 Mon Sep 17 00:00:00 2001
From: xiongxinlei
Date: Thu, 24 Mar 2022 19:19:05 +0800
Subject: [PATCH 41/41] add the vec cli test audio name, test=doc

---
 tests/unit/cli/test_cli.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/unit/cli/test_cli.sh b/tests/unit/cli/test_cli.sh
index 6fbb1570..96ab84d6 100755
--- a/tests/unit/cli/test_cli.sh
+++ b/tests/unit/cli/test_cli.sh
@@ -48,10 +48,10 @@ paddlespeech stats --task cls
 wget -c https://paddlespeech.bj.bcebos.com/vector/audio/85236145389.wav
 paddlespeech vector --task spk --input 85236145389.wav
 
-echo "demo 85236145389.wav" > vec.job
+echo -e "demo1 85236145389.wav \n demo2 85236145389.wav" > vec.job
 paddlespeech vector --task spk --input vec.job
 
-echo "demo 85236145389.wav" | paddlespeech vector --task spk
+echo -e "demo3 85236145389.wav \n demo4 85236145389.wav" | paddlespeech vector --task spk
 rm 85236145389.wav
 rm vec.job
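Patch 41 settles the batch input format: each job line is an `utt_id wav_path` pair, whether it arrives via `vec.job` or through a shell pipe. A minimal illustrative parser for that format — a sketch only, not the paddlespeech CLI's own input reader:

```python
# Illustrative reader for the vec.job batch format: "<utt_id> <wav_path>" per line.
# A sketch under that assumption; the CLI implements its own input handling.
import sys


def read_job_lines(lines):
    """Yield (utt_id, wav_path) tuples from 'utt_id wav_path' lines, skipping blanks."""
    for line in lines:
        line = line.strip()
        if not line:
            continue
        utt_id, wav_path = line.split(maxsplit=1)
        yield utt_id, wav_path.strip()


if __name__ == "__main__":
    # works for both `python read_job.py < vec.job` and `echo ... | python read_job.py`
    for utt_id, wav_path in read_job_lines(sys.stdin):
        print(utt_id, wav_path)
```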