Merge pull request #1630 from Honei/vox12

[vec]voxceleb convert dataset format to paddlespeech
4 years ago · 48e0177767
parent fc72295334 4af007c3fc
commit 48e0177767
18 changed files with 1316 additions and 139 deletions
--- a/dataset/rir_noise/rir_noise.py
+++ b/dataset/rir_noise/rir_noise.py
@ -34,14 +34,14 @@ from utils.utility import unzip
 DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')
-URL_ROOT = 'http://www.openslr.org/resources/28'
+URL_ROOT = '--no-check-certificate http://www.openslr.org/resources/28'
 DATA_URL = URL_ROOT + '/rirs_noises.zip'
 MD5_DATA = 'e6f48e257286e05de56413b4779d8ffb'
 parser = argparse.ArgumentParser(description=__doc__)
 parser.add_argument(
    "--target_dir",
-    default=DATA_HOME + "/Aishell",
+    default=DATA_HOME + "/rirs_noise",
    type=str,
    help="Directory to save the dataset. (default: %(default)s)")
 parser.add_argument(
@ -81,6 +81,10 @@ def create_manifest(data_dir, manifest_path_prefix):
                        },
                        ensure_ascii=False))
        manifest_path = manifest_path_prefix + '.' + dtype
        if not os.path.exists(os.path.dirname(manifest_path)):
            os.makedirs(os.path.dirname(manifest_path))
        with codecs.open(manifest_path, 'w', 'utf-8') as fout:
            for line in json_lines:
                fout.write(line + '\n')
--- a/dataset/voxceleb/voxceleb1.py
+++ b/dataset/voxceleb/voxceleb1.py
@ -149,7 +149,7 @@ def prepare_dataset(base_url, data_list, target_dir, manifest_path,
    # we will download the voxceleb1 data to ${target_dir}/vox1/dev/ or ${target_dir}/vox1/test directory 
    if not os.path.exists(os.path.join(target_dir, "wav")):
        # download all dataset part
-        print("start to download the vox1 dev zip package")
+        print(f"start to download the vox1 zip package to {target_dir}")
        for zip_part in data_list.keys():
            download_url = " --no-check-certificate " + base_url + "/" + zip_part
            download(
--- a/dataset/voxceleb/voxceleb2.py
+++ b/dataset/voxceleb/voxceleb2.py
@ -22,10 +22,12 @@ import codecs
 import glob
 import json
 import os
 import subprocess
 from pathlib import Path
 import soundfile
 from utils.utility import check_md5sum
 from utils.utility import download
 from utils.utility import unzip
@ -35,12 +37,22 @@ DATA_HOME = os.path.expanduser('.')
 BASE_URL = "--no-check-certificate https://www.robots.ox.ac.uk/~vgg/data/voxceleb/data/"
 # dev data
-DEV_DATA_URL = BASE_URL + '/vox2_aac.zip'
+DEV_LIST = {
-DEV_MD5SUM = "bbc063c46078a602ca71605645c2a402"
+    "vox2_dev_aac_partaa": "da070494c573e5c0564b1d11c3b20577",
    "vox2_dev_aac_partab": "17fe6dab2b32b48abaf1676429cdd06f",
    "vox2_dev_aac_partac": "1de58e086c5edf63625af1cb6d831528",
    "vox2_dev_aac_partad": "5a043eb03e15c5a918ee6a52aad477f9",
    "vox2_dev_aac_partae": "cea401b624983e2d0b2a87fb5d59aa60",
    "vox2_dev_aac_partaf": "fc886d9ba90ab88e7880ee98effd6ae9",
    "vox2_dev_aac_partag": "d160ecc3f6ee3eed54d55349531cb42e",
    "vox2_dev_aac_partah": "6b84a81b9af72a9d9eecbb3b1f602e65",
 }
 DEV_TARGET_DATA = "vox2_dev_aac_parta* vox2_dev_aac.zip bbc063c46078a602ca71605645c2a402"
 # test data
-TEST_DATA_URL = BASE_URL + '/vox2_test_aac.zip'
+TEST_LIST = {"vox2_test_aac.zip": "0d2b3ea430a821c33263b5ea37ede312"}
-TEST_MD5SUM = "0d2b3ea430a821c33263b5ea37ede312"
+TEST_TARGET_DATA = "vox2_test_aac.zip vox2_test_aac.zip 0d2b3ea430a821c33263b5ea37ede312"
 parser = argparse.ArgumentParser(description=__doc__)
 parser.add_argument(
@ -68,6 +80,14 @@ args = parser.parse_args()
 def create_manifest(data_dir, manifest_path_prefix):
    """Generate the voxceleb2 dataset manifest file.
    We will create the ${manifest_path_prefix}.vox2 as the final manifest file 
    The dev and test wav info will be put in one manifest file.
    Args:
        data_dir (str): voxceleb2 wav directory, which include dev and test subdataset
        manifest_path_prefix (str): manifest file prefix
    """
    print("Creating manifest %s ..." % manifest_path_prefix)
    json_lines = []
    data_path = os.path.join(data_dir, "**", "*.wav")
@ -119,7 +139,19 @@ def create_manifest(data_dir, manifest_path_prefix):
        print(f"{total_sec / total_num} sec/utt", file=f)
-def download_dataset(url, md5sum, target_dir, dataset):
+def download_dataset(base_url, data_list, target_data, target_dir, dataset):
    """Download the voxceleb2 zip package
    Args:
        base_url (str): the voxceleb2 dataset download baseline url
        data_list (dict): the dataset part zip package and the md5 value
        target_data (str): the final dataset zip info
        target_dir (str): the dataset stored directory
        dataset (str): the dataset name, dev or test
    Raises:
        RuntimeError: the md5sum occurs error
    """
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)
@ -129,9 +161,34 @@ def download_dataset(url, md5sum, target_dir, dataset):
    # but the test dataset will unzip to aac
    # so, we create the ${target_dir}/test and unzip the m4a to test dir
    if not os.path.exists(os.path.join(target_dir, dataset)):
-        filepath = download(url, md5sum, target_dir)
+        print(f"start to download the vox2 zip package to {target_dir}")
        for zip_part in data_list.keys():
            download_url = " --no-check-certificate " + base_url + "/" + zip_part
            download(
                url=download_url,
                md5sum=data_list[zip_part],
                target_dir=target_dir)
        # pack the all part to target zip file
        all_target_part, target_name, target_md5sum = target_data.split()
        target_name = os.path.join(target_dir, target_name)
        if not os.path.exists(target_name):
            pack_part_cmd = "cat {}/{} > {}".format(target_dir, all_target_part,
                                                    target_name)
            subprocess.call(pack_part_cmd, shell=True)
        # check the target zip file md5sum
        if not check_md5sum(target_name, target_md5sum):
            raise RuntimeError("{} MD5 checkssum failed".format(target_name))
        else:
            print("Check {} md5sum successfully".format(target_name))
        if dataset == "test":
-            unzip(filepath, os.path.join(target_dir, "test"))
+            # we need make the test directory
            unzip(target_name, os.path.join(target_dir, "test"))
        else:
            # unzip the dev zip package, which will create the dev directory
            unzip(target_name, target_dir)
 def main():
@ -142,14 +199,16 @@ def main():
    print("download: {}".format(args.download))
    if args.download:
        download_dataset(
-            url=DEV_DATA_URL,
+            base_url=BASE_URL,
-            md5sum=DEV_MD5SUM,
+            data_list=DEV_LIST,
            target_data=DEV_TARGET_DATA,
            target_dir=args.target_dir,
            dataset="dev")
        download_dataset(
-            url=TEST_DATA_URL,
+            base_url=BASE_URL,
-            md5sum=TEST_MD5SUM,
+            data_list=TEST_LIST,
            target_data=TEST_TARGET_DATA,
            target_dir=args.target_dir,
            dataset="test")
--- a/examples/voxceleb/sv0/conf/ecapa_tdnn.yaml
+++ b/examples/voxceleb/sv0/conf/ecapa_tdnn.yaml
@ -1,14 +1,16 @@
 ###########################################
 #                Data                 #
 ###########################################
 # we should explicitly specify the wav path of vox2 audio data converted from m4a
 vox2_base_path: 
 augment: True
-batch_size: 16
+batch_size: 32
 num_workers: 2
-num_speakers: 7205 # 1211 vox1, 5994 vox2, 7205 vox1+2, test speakers: 41
+num_speakers: 1211 # 1211 vox1, 5994 vox2, 7205 vox1+2, test speakers: 41
 shuffle: True
 skip_prep: False
 split_ratio: 0.9
 chunk_duration: 3.0 # seconds
 random_chunk: True
 verification_file: data/vox1/veri_test2.txt
 ###########################################################
 #                FEATURE EXTRACTION SETTING               #
@ -26,7 +28,6 @@ hop_size: 160        #10ms, sample rate 16000, 10 * 16000 / 1000 = 160
 # if we want use another model, please choose another configuration yaml file
 model:
  input_size: 80
  # "channels": [512, 512, 512, 512, 1536],
  channels: [1024, 1024, 1024, 1024, 3072]
  kernel_sizes: [5, 3, 3, 3, 1]
  dilations: [1, 2, 3, 4, 1]
@ -38,8 +39,8 @@ model:
 ###########################################
 seed: 1986 # following the speechbrain configuration
 epochs: 10
-save_interval: 1
+save_interval: 10
-log_interval: 1
+log_interval: 10
 learning_rate: 1e-8
--- a/examples/voxceleb/sv0/conf/ecapa_tdnn_small.yaml
+++ b/examples/voxceleb/sv0/conf/ecapa_tdnn_small.yaml
@ -0,0 +1,53 @@
 ###########################################
 #                Data                 #
 ###########################################
 augment: True
 batch_size: 16
 num_workers: 2
 num_speakers: 1211 # 1211 vox1, 5994 vox2, 7205 vox1+2, test speakers: 41
 shuffle: True
 skip_prep: False
 split_ratio: 0.9
 chunk_duration: 3.0 # seconds
 random_chunk: True
 verification_file: data/vox1/veri_test2.txt
 ###########################################################
 #                FEATURE EXTRACTION SETTING               #
 ###########################################################
 # currently, we only support fbank
 sr: 16000           # sample rate
 n_mels: 80
 window_size: 400     #25ms, sample rate 16000, 25 * 16000 / 1000 = 400 
 hop_size: 160        #10ms, sample rate 16000, 10 * 16000 / 1000 = 160
 ###########################################################
 #                       MODEL SETTING                     #
 ###########################################################
 # currently, we only support ecapa-tdnn in the ecapa_tdnn.yaml
 # if we want use another model, please choose another configuration yaml file
 model:
  input_size: 80
  channels: [512, 512, 512, 512, 1536]
  kernel_sizes: [5, 3, 3, 3, 1]
  dilations: [1, 2, 3, 4, 1]
  attention_channels: 128
  lin_neurons: 192
 ###########################################
 #                Training                 #
 ###########################################
 seed: 1986 # following the speechbrain configuration
 epochs: 100
 save_interval: 10
 log_interval: 10
 learning_rate: 1e-8
 ###########################################
 #                Testing                  #
 ###########################################
 global_embedding_norm: True
 embedding_mean_norm: True
 embedding_std_norm: False
--- a/examples/voxceleb/sv0/local/data.sh
+++ b/examples/voxceleb/sv0/local/data.sh
@ -12,7 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-stage=1
+stage=0
 stop_stage=100
 . ${MAIN_ROOT}/utils/parse_options.sh || exit -1;
@ -30,29 +30,114 @@ dir=$1
 conf_path=$2
 mkdir -p ${dir}
-if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+# Generally the `MAIN_ROOT` refers to the root of PaddleSpeech,
-    # data prepare for vox1 and vox2, vox2 must be converted from m4a to wav
+# which is defined in the path.sh
-    # we should use the local/convert.sh convert m4a to wav
+# And we will download the voxceleb data and rirs noise to ${MAIN_ROOT}/dataset
    python3 local/data_prepare.py \
                        --data-dir ${dir} \
                        --config ${conf_path}
 fi 
 TARGET_DIR=${MAIN_ROOT}/dataset
 mkdir -p ${TARGET_DIR}
 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
-    # download data, generate manifests
+   # download data, generate manifests
-    python3 ${TARGET_DIR}/voxceleb/voxceleb1.py \
+   # we will generate the manifest.{dev,test} file from ${TARGET_DIR}/voxceleb/vox1/{dev,test} directory
-      --manifest_prefix="data/vox1/manifest" \
+   # and generate the meta info and download the trial file
   # manifest.dev: 148642
   # manifest.test: 4847
   echo "Start to download vox1 dataset and generate the manifest files "
   python3 ${TARGET_DIR}/voxceleb/voxceleb1.py \
      --manifest_prefix="${dir}/vox1/manifest" \
      --target_dir="${TARGET_DIR}/voxceleb/vox1/"
-    if [ $? -ne 0 ]; then
+   if [ $? -ne 0 ]; then
-        echo "Prepare voxceleb failed. Terminated."
+      echo "Prepare voxceleb1 failed. Terminated."
-        exit 1
+      exit 1
-    fi
+   fi
 fi
 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
   # Stage 2: download the voxceleb2 data.
   # voxceleb2.py is run with --download, which fetches the zip packages and unzips them;
   # the m4a files end up in ${TARGET_DIR}/voxceleb/vox2/{dev,test}
   echo "start to download vox2 dataset"
   python3 ${TARGET_DIR}/voxceleb/voxceleb2.py \
      --download \
      --target_dir="${TARGET_DIR}/voxceleb/vox2/"
   # abort the whole data preparation if the download script reported an error
   if [ $? -ne 0 ]; then
      echo "Download voxceleb2 dataset failed. Terminated."
      exit 1
   fi
 fi
 if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
   # Stage 3: convert the vox2 m4a files to wav.
   # The original m4a files are not deleted.
   # NOTE(review): only the vox2/test/ directory is passed to convert.sh here;
   #               confirm whether vox2/dev/ is handled elsewhere.
   echo "start to convert the m4a to wav"
   # Test the command directly: the former `cmd || exit 1` followed by a `$?` check
   # exited before the error message below could ever be printed.
   if ! bash local/convert.sh ${TARGET_DIR}/voxceleb/vox2/test/; then
      echo "Convert voxceleb2 dataset from m4a to wav failed. Terminated."
      exit 1
   fi
   echo "m4a convert to wav operation finished"
 fi
 if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
   # generate the vox2 manifest file from wav file
   # we will generate the ${dir}/vox2/manifest.vox2
   # because we use all the vox2 dataset to train, so collect all the vox2 data in one file
   echo "start generate the vox2 manifest files"
   python3 ${TARGET_DIR}/voxceleb/voxceleb2.py \
      --generate \
      --manifest_prefix="${dir}/vox2/manifest" \
      --target_dir="${TARGET_DIR}/voxceleb/vox2/"
-   #  for dataset in train dev test; do
+   if [ $? -ne 0 ]; then
-   #      mv data/manifest.${dataset} data/manifest.${dataset}.raw
+      echo "Prepare voxceleb2 dataset failed. Terminated."
-   #  done
+      exit 1
-fi
+   fi
 fi
 if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
   # Stage 5: generate the vox csv files.
   # The training system currently reads its dataset from csv files, so the
   # json manifests of vox1 (dev) and vox2 are used for training and the
   # vox1 test manifest is used for enroll/test.
   # The script writes its csv files under ${dir}/vox/csv and the speaker
   # label map under ${dir}/vox/meta.
   echo "convert the json format to csv format to be compatible with training process"
   python3 local/make_vox_csv_dataset_from_json.py\
      --train "${dir}/vox1/manifest.dev" "${dir}/vox2/manifest.vox2"\
      --test "${dir}/vox1/manifest.test" \
      --target_dir "${dir}/vox/" \
      --config ${conf_path}
   # abort the whole data preparation if the conversion reported an error
   if [ $? -ne 0 ]; then
      echo "Prepare voxceleb failed. Terminated."
      exit 1
   fi
 fi
 if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
   # Stage 6: generate the open rir noise manifest file.
   # rir_noise.py works in ${TARGET_DIR}/rir_noise/ and writes manifests
   # with the prefix ${dir}/rir_noise/manifest.
   echo "generate the open rir noise manifest file"
   python3 ${TARGET_DIR}/rir_noise/rir_noise.py\
      --manifest_prefix="${dir}/rir_noise/manifest" \
      --target_dir="${TARGET_DIR}/rir_noise/"
   # abort the whole data preparation if the script reported an error
   if [ $? -ne 0 ]; then
      echo "Prepare rir_noise failed. Terminated."
      exit 1
   fi
 fi
 if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then
   # Stage 7: generate the open rir noise csv files (not the manifest, which is stage 6).
   # The script reads the wav lists below ${TARGET_DIR}/rir_noise/ and writes
   # rir.csv and noise.csv under ${dir}/rir_noise/csv.
   echo "generate the open rir noise csv file"
   python3 local/make_rirs_noise_csv_dataset_from_json.py \
      --noise_dir="${TARGET_DIR}/rir_noise/" \
      --data_dir="${dir}/rir_noise/" \
      --config ${conf_path}
   # abort the whole data preparation if the script reported an error
   if [ $? -ne 0 ]; then
      echo "Prepare rir_noise failed. Terminated."
      exit 1
   fi
 fi
--- a/examples/voxceleb/sv0/local/make_rirs_noise_csv_dataset_from_json.py
+++ b/examples/voxceleb/sv0/local/make_rirs_noise_csv_dataset_from_json.py
@ -0,0 +1,167 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
 Convert the PaddleSpeech jsonline format data to csv format data in voxceleb experiment.
 Currently, Speaker Identificaton Training process use csv format.
 """
 import argparse
 import csv
 import os
 from typing import List
 import tqdm
 from yacs.config import CfgNode
 from paddleaudio import load as load_audio
 from paddlespeech.s2t.utils.log import Log
 from paddlespeech.vector.utils.vector_utils import get_chunks
 logger = Log(__name__).getlog()
 def get_chunks_list(wav_file: str,
                     split_chunks: bool,
                     base_path: str,
                     chunk_duration: float=3.0) -> List[List[str]]:
    """Build the csv rows that describe a single noise audio file.
    Every row has the layout shared by all vector csv data:
    id, duration, wav, start, stop, label. The label is always the
    string 'noise'.
    Args:
        wav_file (str): path of the wav file to describe
        split_chunks (bool): if True and the audio is longer than
            chunk_duration, emit one row per chunk instead of one row
            for the whole file
        base_path (str): the audio base path; not used by this function
        chunk_duration (float): the chunk duration in seconds
    Returns:
        List[List[str]]: one row per chunk, or a single whole-file row
    """
    samples, sample_rate = load_audio(wav_file)
    num_samples = samples.shape[0]
    # The id is the path part after "/rir_noise/", cut at the first dot.
    utt_id = wav_file.split("/rir_noise/")[-1].split(".")[0]
    duration = num_samples / sample_rate
    # Short audio (or splitting disabled): keep the whole file as one row.
    if not (split_chunks and duration > chunk_duration):
        return [[utt_id, duration, wav_file, 0, num_samples, "noise"]]
    rows = []
    for chunk_id in get_chunks(chunk_duration, utt_id, duration):
        # The last two "_"-separated fields are the start and end timestamps.
        start_sec, end_sec = chunk_id.split("_")[-2:]
        first_sample = int(float(start_sec) * sample_rate)
        last_sample = int(float(end_sec) * sample_rate)
        # The label stays a string here; it is converted to an integer in training.
        rows.append([
            chunk_id, duration, wav_file, first_sample, last_sample, "noise"
        ])
    return rows
 def generate_csv(wav_files,
                  output_file: str,
                  base_path: str,
                  split_chunks: bool=True):
    """Prepare the csv file according the wav files
    Args:
        wav_files (list): all the audio list to prepare the csv file
        output_file (str): the output csv file; its parent directory is
            created when it does not exist yet
        base_path (str): the audio base path, forwarded to get_chunks_list
        split_chunks (bool): audio split flag, forwarded to get_chunks_list
    """
    logger.info(f'Generating csv: {output_file}')
    header = ["utt_id", "duration", "wav", "start", "stop", "label"]
    csv_lines = []
    for item in tqdm.tqdm(wav_files):
        csv_lines.extend(
            get_chunks_list(
                item, base_path=base_path, split_chunks=split_chunks))
    # dirname is '' for a bare file name and os.makedirs('') would raise,
    # so only create the directory when there is one to create.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    # newline="" lets the csv module control the line endings itself;
    # without it every row ends in "\r\r\n" on Windows.
    with open(output_file, mode="w", newline="") as csv_f:
        csv_writer = csv.writer(
            csv_f, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL)
        csv_writer.writerow(header)
        csv_writer.writerows(csv_lines)
 def prepare_data(args, config):
    """Create rir.csv and noise.csv from the RIRS_NOISES list files
    Args:
        args (argparse.Namespace): scripts args; noise_dir and data_dir are read
        config (CfgNode): yaml configuration content; only skip_prep is read
    """
    # nothing to do when the external config sets the skip_prep flag
    if config.skip_prep:
        return
    noise_root = args.noise_dir
    rirs_root = os.path.join(noise_root, "RIRS_NOISES")
    logger.info(f"base path: {noise_root}")
    logger.info(f"wav path: {rirs_root}")
    def read_wav_paths(list_file):
        # The wav path is the last space-separated field of each line and
        # is joined with the noise root directory.
        with open(list_file, 'r') as f:
            return [
                os.path.join(noise_root, entry.strip().split(' ')[-1])
                for entry in f.readlines()
            ]
    rir_files = read_wav_paths(
        os.path.join(rirs_root, "real_rirs_isotropic_noises", "rir_list"))
    noise_files = read_wav_paths(
        os.path.join(rirs_root, "pointsource_noises", "noise_list"))
    csv_dir = os.path.join(args.data_dir, 'csv')
    logger.info(f"csv path: {csv_dir}")
    generate_csv(
        rir_files, os.path.join(csv_dir, 'rir.csv'), base_path=noise_root)
    generate_csv(
        noise_files, os.path.join(csv_dir, 'noise.csv'), base_path=noise_root)
 if __name__ == "__main__":
    parser = argparse.ArgumentParser(description=__doc__)
    # All three options are mandatory; they only differ in help text
    # (and --config additionally declares its type).
    for flag, extra in (
        ("--noise_dir", {
            "help": "The noise dataset dataset directory."
        }),
        ("--data_dir", {
            "help": "The target directory stores the csv files"
        }),
        ("--config", {
            "type": str,
            "help": "configuration file"
        }), ):
        parser.add_argument(flag, default=None, required=True, **extra)
    args = parser.parse_args()
    # load the yaml config file on top of an empty, extensible config node
    config = CfgNode(new_allowed=True)
    if args.config:
        config.merge_from_file(args.config)
    # write the rir/noise csv files
    prepare_data(args, config)
--- a/examples/voxceleb/sv0/local/make_vox_csv_dataset_from_json.py
+++ b/examples/voxceleb/sv0/local/make_vox_csv_dataset_from_json.py
@ -0,0 +1,251 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
 Convert the PaddleSpeech jsonline format data to csv format data in voxceleb experiment.
 Currently, Speaker Identificaton Training process use csv format.
 """
 import argparse
 import csv
 import json
 import os
 import random
 import tqdm
 from yacs.config import CfgNode
 from paddleaudio import load as load_audio
 from paddlespeech.s2t.utils.log import Log
 from paddlespeech.vector.utils.vector_utils import get_chunks
 logger = Log(__name__).getlog()
 def prepare_csv(wav_files, output_file, config, split_chunks=True):
    """Prepare the csv file according the wav files
    Args:
        wav_files (list): jsonline strings, one per utterance; the keys
            'utt', 'feat' and 'feat_shape' are read from each of them
        output_file (str): the output csv file; its parent directory is
            created when it does not exist yet
        config (CfgNode): yaml configuration content; chunk_duration is read
        split_chunks (bool, optional): audio split flag. Defaults to True.
    """
    # dirname is '' for a bare file name and os.makedirs('') would raise,
    # so only create the directory when there is one to create.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    csv_lines = []
    header = ["utt_id", "duration", "wav", "start", "stop", "label"]
    # voxceleb meta info for each training utterance segment
    # we extract a segment from an utterance to train
    # and the segment's period is between the start and stop points in the original wav file
    # each field in the meta info means as follows:
    # utt_id: the utterance segment name, which is unique in the training dataset
    # duration: the total utterance time (taken from feat_shape[0]; presumably seconds)
    # wav: utterance file path, which should be an absolute path
    # start: start point in the original wav file, in samples
    # stop: stop point in the original wav file, in samples
    # label: the utterance segment's label name,
    #        which is the speaker name in the speaker verification domain
    for item in tqdm.tqdm(wav_files, total=len(wav_files)):
        item = json.loads(item.strip())
        audio_id = item['utt'].replace(".wav",
                                       "")  # we remove the wav suffix name
        audio_duration = item['feat_shape'][0]
        wav_file = item['feat']
        label = audio_id.split('-')[
            0]  # speaker name in speaker verification domain
        # the audio is loaded to get the sample rate and the sample count
        waveform, sr = load_audio(wav_file)
        if split_chunks:
            uniq_chunks_list = get_chunks(config.chunk_duration, audio_id,
                                          audio_duration)
            for chunk in uniq_chunks_list:
                s, e = chunk.split("_")[-2:]  # Timestamps of start and end
                start_sample = int(float(s) * sr)
                end_sample = int(float(e) * sr)
                # id, duration, wav, start, stop, label
                # in vector, the label is the speaker id
                csv_lines.append([
                    chunk, audio_duration, wav_file, start_sample, end_sample,
                    label
                ])
        else:
            # keep the whole utterance as a single row
            csv_lines.append([
                audio_id, audio_duration, wav_file, 0, waveform.shape[0], label
            ])
    # newline="" lets the csv module control the line endings itself;
    # without it every row ends in "\r\r\n" on Windows.
    with open(output_file, mode="w", newline="") as csv_f:
        csv_writer = csv.writer(
            csv_f, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        csv_writer.writerow(header)
        csv_writer.writerows(csv_lines)
 def get_enroll_test_list(dataset_list, verification_file):
    """Get the enroll and test utterance list from all the voxceleb1 test utterance dataset.
       Generally, we get the enroll and test utterances from the verification file.
       The verification file format as follows:
       target/nontarget enroll-utt test-utt,
       we set 0 as nontarget and 1 as target, eg:
       0 a.wav b.wav
       1 a.wav a.wav
       Fields may be separated by any run of whitespace and blank lines
       are skipped, both in the verification file and in the datasets.
    Args:
        dataset_list (list): all the dataset to get the test utterances
        verification_file (str): voxceleb1 trial file
    Returns:
        tuple: (enroll_files, test_files), two sorted lists holding the
            unmodified jsonlines of the selected utterances
    Raises:
        ValueError: a non-blank trial line does not have exactly 3 fields
    """
    logger.info(f"verification file: {verification_file}")
    enroll_audios = set()
    test_audios = set()
    with open(verification_file, 'r') as f:
        for line in f:
            fields = line.split()
            if not fields:
                # tolerate blank lines, e.g. an extra newline at the end of the file
                continue
            _, enroll_file, test_file = fields
            # trial paths use '/', the manifest utt ids use '-' as separator
            enroll_audios.add(enroll_file.replace('/', '-'))
            test_audios.add(test_file.replace('/', '-'))
    enroll_files = []
    test_files = []
    for dataset in dataset_list:
        with open(dataset, 'r') as f:
            for line in f:
                if not line.strip():
                    continue
                # audio_id may be in enroll and test at the same time
                # eg: 1 a.wav a.wav
                # the audio a.wav is enroll and test file at the same time
                audio_id = json.loads(line)['utt']
                if audio_id in enroll_audios:
                    enroll_files.append(line)
                if audio_id in test_audios:
                    test_files.append(line)
    return sorted(enroll_files), sorted(test_files)
 def get_train_dev_list(dataset_list, target_dir, split_ratio):
    """Split all the training utterances into a train list and a dev list.
       The first split_ratio part of the shuffled utterances is the train
       dataset, the remaining part (ratio is 1 - split_ratio) is the dev dataset.
       The sorted speaker names are also written, one "name id" pair per
       line, to ${target_dir}/meta/label2id.txt
    Args:
        dataset_list (list): the jsonline files holding all the utterances
        target_dir (str): the directory in which the meta directory is created
        split_ratio (float): train dataset ratio in all utterance list
    Returns:
        tuple: (train_files, dev_files), two lists of stripped jsonlines
    """
    logger.info("start to get train and dev utt list")
    meta_dir = os.path.join(target_dir, "meta")
    if not os.path.exists(meta_dir):
        os.makedirs(meta_dir)
    utterances = []
    speaker_names = set()
    for manifest in dataset_list:
        with open(manifest, 'r') as fin:
            for raw in fin:
                entry = raw.strip()
                # the label is the speaker name stored under 'utt2spk'
                speaker_names.add(json.loads(entry)['utt2spk'])
                utterances.append(entry)
    ordered_speakers = sorted(speaker_names)
    logger.info(
        f"we get {len(ordered_speakers)} speakers from all the train dataset")
    label_file = os.path.join(target_dir, "meta", "label2id.txt")
    with open(label_file, 'w') as fout:
        fout.writelines(f'{name} {idx}\n'
                        for idx, name in enumerate(ordered_speakers))
    logger.info(f'we store the speakers to {label_file}')
    split_idx = int(split_ratio * len(utterances))
    # sort first so that the shuffle only depends on the random seed,
    # not on the order in which the datasets were read
    utterances.sort()
    random.shuffle(utterances)
    train_files = utterances[:split_idx]
    dev_files = utterances[split_idx:]
    logger.info(
        f"we get train utterances: {len(train_files)}, dev utterance: {len(dev_files)}"
    )
    return train_files, dev_files
 def prepare_data(args, config):
    """Create the enroll, test, train and dev csv files from the jsonline files
    Args:
        args (argparse.Namespace): scripts args; train, test and target_dir are read
        config (CfgNode): yaml configuration content; seed, skip_prep,
            verification_file and split_ratio are read here
    """
    # the seed is set first, even when the preparation is skipped
    random.seed(config.seed)
    # nothing to do when the external config sets the skip_prep flag
    if config.skip_prep:
        return
    csv_dir = os.path.join(args.target_dir, "csv")
    # stage 1: the enroll and test csv files, never split into chunks
    logger.info("start to prepare the data csv file")
    enroll_files, test_files = get_enroll_test_list(
        [args.test], verification_file=config.verification_file)
    for utt_lines, csv_name in ((enroll_files, "enroll.csv"),
                                (test_files, "test.csv")):
        prepare_csv(
            utt_lines,
            os.path.join(csv_dir, csv_name),
            config,
            split_chunks=False)
    # stage 2: the train and dev csv files
    #          config.split_ratio of the utterances go to train, the rest to dev;
    #          get_train_dev_list also writes the speaker file label2id.txt
    logger.info("start to prepare the data csv file")
    train_files, dev_files = get_train_dev_list(
        args.train, target_dir=args.target_dir, split_ratio=config.split_ratio)
    for utt_lines, csv_name in ((train_files, "train.csv"),
                                (dev_files, "dev.csv")):
        prepare_csv(utt_lines, os.path.join(csv_dir, csv_name), config)
 if __name__ == "__main__":
    parser = argparse.ArgumentParser(description=__doc__)
    # every option is mandatory; --train accepts one or more files
    option_specs = (
        ("--train", dict(
            required=True,
            nargs='+',
            help="The jsonline files list for train.")),
        ("--test", dict(
            required=True, help="The jsonline file for test")),
        ("--target_dir", dict(
            default=None,
            required=True,
            help="The target directory stores the csv files and meta file.")),
        ("--config", dict(
            default=None,
            required=True,
            type=str,
            help="configuration file")), )
    for option, spec in option_specs:
        parser.add_argument(option, **spec)
    args = parser.parse_args()
    # load the yaml config file on top of an empty, extensible config node
    config = CfgNode(new_allowed=True)
    if args.config:
        config.merge_from_file(args.config)
    # write the csv files from the jsonline files
    prepare_data(args, config)
--- a/examples/voxceleb/sv0/run.sh
+++ b/examples/voxceleb/sv0/run.sh
@ -18,24 +18,22 @@ set -e
 #######################################################################
 # stage 0: data prepare, including voxceleb1 download and generate {train,dev,enroll,test}.csv
-#          voxceleb2 data is m4a format, so we need user to convert the m4a to wav yourselves as described in Readme.md with the script local/convert.sh
+#          voxceleb2 data is in m4a format, so you need to convert the m4a files to wav yourself with the script local/convert.sh
 # stage 1: train the speaker identification model
 # stage 2: test speaker identification 
-# stage 3: extract the training embeding to train the LDA and PLDA
+# stage 3: (todo) extract the training embedding to train the LDA and PLDA
 ######################################################################
 # we can set the variable PPAUDIO_HOME to specify the root directory of the downloaded vox1 and vox2 dataset 
 # default the dataset will be stored in the ~/.paddleaudio/
 # the vox2 dataset is stored in m4a format, we need to convert the audio from m4a to wav yourself
-# and put all of them to ${PPAUDIO_HOME}/datasets/vox2
+# and put all of them to ${MAIN_ROOT}/datasets/vox2
-# we will find the wav from ${PPAUDIO_HOME}/datasets/vox1/wav and ${PPAUDIO_HOME}/datasets/vox2/wav
+# we will find the wav from ${MAIN_ROOT}/datasets/vox1/{dev,test}/wav and ${MAIN_ROOT}/datasets/vox2/wav
-# export PPAUDIO_HOME=
+
 stage=0
 stop_stage=50
 # data directory
 # if we set the variable ${dir}, we will store the wav info to this directory
-# otherwise, we will store the wav info to vox1 and vox2 directory respectively
+# otherwise, we will store the wav info to data/vox1 and data/vox2 directory respectively
 # vox2 wav path, we must convert the m4a format to wav format    
 dir=data/                                 # data info directory   
@ -64,6 +62,6 @@ if [ $stage -le 2 ] && [ ${stop_stage} -ge 2 ]; then
 fi
 # if [ $stage -le 3 ]; then
-#      # stage 2: extract the training embeding to train the LDA and PLDA
+#      # stage 3: extract the training embedding to train the LDA and PLDA
 #      # todo: extract the training embedding
 # fi 
--- a/paddleaudio/paddleaudio/datasets/voxceleb.py
+++ b/paddleaudio/paddleaudio/datasets/voxceleb.py
@ -261,7 +261,7 @@ class VoxCeleb(Dataset):
                     output_file: str,
                     split_chunks: bool=True):
        print(f'Generating csv: {output_file}')
-        header = ["ID", "duration", "wav", "start", "stop", "spk_id"]
+        header = ["id", "duration", "wav", "start", "stop", "spk_id"]
        # Note: this may occurs c++ execption, but the program will execute fine
        # so we can ignore the execption 
        with Pool(cpu_count()) as p:
--- a/paddlespeech/vector/exps/ecapa_tdnn/test.py
+++ b/paddlespeech/vector/exps/ecapa_tdnn/test.py
@ -21,10 +21,11 @@ from paddle.io import DataLoader
 from tqdm import tqdm
 from yacs.config import CfgNode
 from paddleaudio.datasets import VoxCeleb
 from paddleaudio.metric import compute_eer
 from paddlespeech.s2t.utils.log import Log
 from paddlespeech.vector.io.batch import batch_feature_normalize
 from paddlespeech.vector.io.dataset import CSVDataset
 from paddlespeech.vector.io.embedding_norm import InputNormalization
 from paddlespeech.vector.models.ecapa_tdnn import EcapaTdnn
 from paddlespeech.vector.modules.sid_model import SpeakerIdetification
 from paddlespeech.vector.training.seeding import seed_everything
@ -32,6 +33,91 @@ from paddlespeech.vector.training.seeding import seed_everything
 logger = Log(__name__).getlog()
 def compute_dataset_embedding(data_loader, model, mean_var_norm_emb, config,
                              id2embedding):
    """Compute the embedding of every utterance served by a data loader.

    The embeddings are stored in ``id2embedding`` in place; the function
    returns nothing.

    Args:
        data_loader: loader whose batches are dicts with the keys 'ids',
            'feats' and 'lengths'. Its ``dataset.csv_path`` is only used
            for the log message.
        model: model whose ``backbone(feats, lengths)`` produces the
            embeddings of a batch.
        mean_var_norm_emb: callable invoked as
            ``mean_var_norm_emb(embeddings, lengths)`` to normalize the
            embeddings. Pass None to skip the normalization.
        config: configuration object, ``config.global_embedding_norm``
            enables the global embedding normalization.
        id2embedding (dict): utterance id to embedding map, updated with
            the embeddings of this dataset.
    """
    logger.info(
        f'Computing embeddings on {data_loader.dataset.csv_path} dataset')
    with paddle.no_grad():
        for batch in tqdm(data_loader):
            # stage 8-1: extract the audio embedding
            ids, feats, lengths = batch['ids'], batch['feats'], batch['lengths']
            embeddings = model.backbone(feats, lengths).squeeze(
                -1)  # (N, emb_size, 1) -> (N, emb_size)

            # Global embedding normalization.
            # Using the global embedding norm can reduce the EER
            # by about 10% relative.
            if config.global_embedding_norm and mean_var_norm_emb:
                # the normalizer is called with one length of 1.0 per embedding
                lengths = paddle.ones([embeddings.shape[0]])
                embeddings = mean_var_norm_emb(embeddings, lengths)

            # Update embedding dict.
            id2embedding.update(dict(zip(ids, embeddings)))
 def compute_verification_scores(id2embedding, train_cohort, config):
    """Score every trial listed in ``config.verification_file``.

    Each non-empty line of the trial file holds three whitespace separated
    fields: ``label enroll_utt test_utt``. An utterance name is mapped to
    its embedding key by dropping everything from the first '.' and
    replacing '/' with '-'.

    The base score of a trial is the cosine similarity between the enroll
    and the test embedding. When ``config`` contains "score_norm", the
    score is normalized with the statistics of the cosine scores against
    ``train_cohort``:

    * "s-norm": mean of the enroll-side and the test-side normalization
    * "z-norm": enroll-side normalization
    * "t-norm": test-side normalization

    Any other ``config.score_norm`` value leaves the base score unchanged.

    Args:
        id2embedding (dict): utterance id to embedding map.
        train_cohort: stacked cohort embeddings, only read when "score_norm"
            is in ``config``.
        config: configuration with ``verification_file`` and, optionally,
            ``score_norm`` and ``cohort_size``.

    Returns:
        tuple: ``(scores, labels)``, two lists with one entry per trial.
        The labels are ints. A score is a float unless it was normalized,
        in which case it is the result of the arithmetic with the cohort
        statistics (presumably a paddle.Tensor, verify against the caller).

    Raises:
        KeyError: if a trial refers to an utterance missing in ``id2embedding``.
        ValueError: if a non-empty line does not hold exactly three fields.
    """
    logger.info(f"read the trial from {config.verification_file}")
    cos_sim_func = paddle.nn.CosineSimilarity(axis=-1)
    # loop invariant, so it is evaluated only once
    use_score_norm = "score_norm" in config

    labels = []
    scores = []
    with open(config.verification_file, 'r') as f:
        # iterate lazily instead of loading the whole trial list at once
        for line in f:
            line = line.strip()
            if not line:
                # tolerate blank lines, e.g. an empty line at the end of file
                continue
            label, enroll_id, test_id = line.split()
            enroll_id = enroll_id.split('.')[0].replace('/', '-')
            test_id = test_id.split('.')[0].replace('/', '-')
            labels.append(int(label))

            enroll_emb = id2embedding[enroll_id]
            test_emb = id2embedding[test_id]
            score = cos_sim_func(enroll_emb, test_emb).item()

            if use_score_norm:
                # Getting norm stats for enroll impostors
                mean_e_c, std_e_c = _cohort_stats(enroll_emb, train_cohort,
                                                  cos_sim_func, config)
                # Getting norm stats for test impostors
                mean_t_c, std_t_c = _cohort_stats(test_emb, train_cohort,
                                                  cos_sim_func, config)

                if config.score_norm == "s-norm":
                    score_e = (score - mean_e_c) / std_e_c
                    score_t = (score - mean_t_c) / std_t_c
                    score = 0.5 * (score_e + score_t)
                elif config.score_norm == "z-norm":
                    score = (score - mean_e_c) / std_e_c
                elif config.score_norm == "t-norm":
                    score = (score - mean_t_c) / std_t_c

            scores.append(score)

    return scores, labels


 def _cohort_stats(embedding, train_cohort, cos_sim_func, config):
    """Return the (mean, std) of the cosine scores of one embedding against the cohort.

    When ``config`` contains "cohort_size", only the ``config.cohort_size``
    highest cohort scores contribute to the statistics.
    """
    # repeat the embedding once per cohort entry to score all of them at once
    tiled = paddle.tile(embedding, repeat_times=[train_cohort.shape[0], 1])
    cohort_scores = cos_sim_func(tiled, train_cohort)
    if "cohort_size" in config:
        cohort_scores, _ = paddle.topk(
            cohort_scores, k=config.cohort_size, axis=0)
    return paddle.mean(cohort_scores, axis=0), paddle.std(cohort_scores, axis=0)
 def main(args, config):
    # stage0: set the training device, cpu or gpu
    paddle.set_device(args.device)
@ -58,9 +144,8 @@ def main(args, config):
    # stage4: construct the enroll and test dataloader
-    enroll_dataset = VoxCeleb(
+    enroll_dataset = CSVDataset(
-        subset='enroll',
+        os.path.join(args.data_dir, "vox/csv/enroll.csv"),
        target_dir=args.data_dir,
        feat_type='melspectrogram',
        random_chunk=False,
        n_mels=config.n_mels,
@ -68,16 +153,15 @@ def main(args, config):
        hop_length=config.hop_size)
    enroll_sampler = BatchSampler(
        enroll_dataset, batch_size=config.batch_size,
-        shuffle=True)  # Shuffle to make embedding normalization more robust.
+        shuffle=False)  # Shuffle to make embedding normalization more robust.
-    enrol_loader = DataLoader(enroll_dataset,
+    enroll_loader = DataLoader(enroll_dataset,
                    batch_sampler=enroll_sampler,
                    collate_fn=lambda x: batch_feature_normalize(
-                            x, mean_norm=True, std_norm=False),
+                                x, mean_norm=True, std_norm=False),
                    num_workers=config.num_workers,
                    return_list=True,)
-    test_dataset = VoxCeleb(
+    test_dataset = CSVDataset(
-        subset='test',
+        os.path.join(args.data_dir, "vox/csv/test.csv"),
        target_dir=args.data_dir,
        feat_type='melspectrogram',
        random_chunk=False,
        n_mels=config.n_mels,
@ -85,7 +169,7 @@ def main(args, config):
        hop_length=config.hop_size)
    test_sampler = BatchSampler(
-        test_dataset, batch_size=config.batch_size, shuffle=True)
+        test_dataset, batch_size=config.batch_size, shuffle=False)
    test_loader = DataLoader(test_dataset,
                            batch_sampler=test_sampler,
                            collate_fn=lambda x: batch_feature_normalize(
@ -97,75 +181,65 @@ def main(args, config):
    # stage6: global embedding norm to imporve the performance
    logger.info(f"global embedding norm: {config.global_embedding_norm}")
    if config.global_embedding_norm:
        global_embedding_mean = None
        global_embedding_std = None
        mean_norm_flag = config.embedding_mean_norm
        std_norm_flag = config.embedding_std_norm
        batch_count = 0
    # stage7: Compute embeddings of audios in enrol and test dataset from model.
    if config.global_embedding_norm:
        mean_var_norm_emb = InputNormalization(
            norm_type="global",
            mean_norm=config.embedding_mean_norm,
            std_norm=config.embedding_std_norm)
    if "score_norm" in config:
        logger.info(f"we will do score norm: {config.score_norm}")
        train_dataset = CSVDataset(
            os.path.join(args.data_dir, "vox/csv/train.csv"),
            feat_type='melspectrogram',
            n_train_snts=config.n_train_snts,
            random_chunk=False,
            n_mels=config.n_mels,
            window_size=config.window_size,
            hop_length=config.hop_size)
        train_sampler = BatchSampler(
            train_dataset, batch_size=config.batch_size, shuffle=False)
        train_loader = DataLoader(train_dataset,
                            batch_sampler=train_sampler,
                            collate_fn=lambda x: batch_feature_normalize(
                                x, mean_norm=True, std_norm=False),
                            num_workers=config.num_workers,
                            return_list=True,)
    id2embedding = {}
    # Run multi times to make embedding normalization more stable.
-    for i in range(2):
+    logger.info("First loop for enroll and test dataset")
-        for dl in [enrol_loader, test_loader]:
+    compute_dataset_embedding(enroll_loader, model, mean_var_norm_emb, config,
-            logger.info(
+                              id2embedding)
-                f'Loop {[i+1]}: Computing embeddings on {dl.dataset.subset} dataset'
+    compute_dataset_embedding(test_loader, model, mean_var_norm_emb, config,
-            )
+                              id2embedding)
-            with paddle.no_grad():
+
-                for batch_idx, batch in enumerate(tqdm(dl)):
+    logger.info("Second loop for enroll and test dataset")
-
+    compute_dataset_embedding(enroll_loader, model, mean_var_norm_emb, config,
-                    # stage 8-1: extrac the audio embedding
+                              id2embedding)
-                    ids, feats, lengths = batch['ids'], batch['feats'], batch[
+    compute_dataset_embedding(test_loader, model, mean_var_norm_emb, config,
-                        'lengths']
+                              id2embedding)
-                    embeddings = model.backbone(feats, lengths).squeeze(
+    mean_var_norm_emb.save(
-                        -1).numpy()  # (N, emb_size, 1) -> (N, emb_size)
+        os.path.join(args.load_checkpoint, "mean_var_norm_emb"))
                    # Global embedding normalization.
                    # if we use the global embedding norm
                    # eer can reduece about relative 10%
                    if config.global_embedding_norm:
                        batch_count += 1
                        current_mean = embeddings.mean(
                            axis=0) if mean_norm_flag else 0
                        current_std = embeddings.std(
                            axis=0) if std_norm_flag else 1
                        # Update global mean and std.
                        if global_embedding_mean is None and global_embedding_std is None:
                            global_embedding_mean, global_embedding_std = current_mean, current_std
                        else:
                            weight = 1 / batch_count  # Weight decay by batches.
                            global_embedding_mean = (
                                1 - weight
                            ) * global_embedding_mean + weight * current_mean
                            global_embedding_std = (
                                1 - weight
                            ) * global_embedding_std + weight * current_std
                        # Apply global embedding normalization.
                        embeddings = (embeddings - global_embedding_mean
                                      ) / global_embedding_std
                    # Update embedding dict.
                    id2embedding.update(dict(zip(ids, embeddings)))
    # stage 8: Compute cosine scores.
-    labels = []
+    train_cohort = None
-    enroll_ids = []
+    if "score_norm" in config:
-    test_ids = []
+        train_embeddings = {}
-    logger.info(f"read the trial from {VoxCeleb.veri_test_file}")
+        # cohort embedding not do mean and std norm
-    with open(VoxCeleb.veri_test_file, 'r') as f:
+        compute_dataset_embedding(train_loader, model, None, config,
-        for line in f.readlines():
+                                  train_embeddings)
-            label, enroll_id, test_id = line.strip().split(' ')
+        train_cohort = paddle.stack(list(train_embeddings.values()))
-            labels.append(int(label))
+
-            enroll_ids.append(enroll_id.split('.')[0].replace('/', '-'))
+    # compute the scores
-            test_ids.append(test_id.split('.')[0].replace('/', '-'))
+    scores, labels = compute_verification_scores(id2embedding, train_cohort,
-
+                                                 config)
-    cos_sim_func = paddle.nn.CosineSimilarity(axis=1)
+
-    enrol_embeddings, test_embeddings = map(lambda ids: paddle.to_tensor(
+    # compute the EER and threshold
-        np.asarray([id2embedding[uttid] for uttid in ids], dtype='float32')),
+    scores = paddle.to_tensor(scores)
                                            [enroll_ids, test_ids
                                             ])  # (N, emb_size)
    scores = cos_sim_func(enrol_embeddings, test_embeddings)
    EER, threshold = compute_eer(np.asarray(labels), scores.numpy())
    logger.info(
        f'EER of verification test: {EER*100:.4f}%, score threshold: {threshold:.5f}'
--- a/paddlespeech/vector/exps/ecapa_tdnn/train.py
+++ b/paddlespeech/vector/exps/ecapa_tdnn/train.py
@ -23,13 +23,13 @@ from paddle.io import DistributedBatchSampler
 from yacs.config import CfgNode
 from paddleaudio.compliance.librosa import melspectrogram
 from paddleaudio.datasets.voxceleb import VoxCeleb
 from paddlespeech.s2t.utils.log import Log
 from paddlespeech.vector.io.augment import build_augment_pipeline
 from paddlespeech.vector.io.augment import waveform_augment
 from paddlespeech.vector.io.batch import batch_pad_right
 from paddlespeech.vector.io.batch import feature_normalize
 from paddlespeech.vector.io.batch import waveform_collate_fn
 from paddlespeech.vector.io.dataset import CSVDataset
 from paddlespeech.vector.models.ecapa_tdnn import EcapaTdnn
 from paddlespeech.vector.modules.loss import AdditiveAngularMargin
 from paddlespeech.vector.modules.loss import LogSoftmaxWrapper
@ -54,8 +54,12 @@ def main(args, config):
    # stage2: data prepare, such vox1 and vox2 data, and augment noise data and pipline
    # note: some cmd must do in rank==0, so wo will refactor the data prepare code
-    train_dataset = VoxCeleb('train', target_dir=args.data_dir)
+    train_dataset = CSVDataset(
-    dev_dataset = VoxCeleb('dev', target_dir=args.data_dir)
+        csv_path=os.path.join(args.data_dir, "vox/csv/train.csv"),
        label2id_path=os.path.join(args.data_dir, "vox/meta/label2id.txt"))
    dev_dataset = CSVDataset(
        csv_path=os.path.join(args.data_dir, "vox/csv/dev.csv"),
        label2id_path=os.path.join(args.data_dir, "vox/meta/label2id.txt"))
    if config.augment:
        augment_pipeline = build_augment_pipeline(target_dir=args.data_dir)
@ -67,7 +71,7 @@ def main(args, config):
    # stage4: build the speaker verification train instance with backbone model
    model = SpeakerIdetification(
-        backbone=ecapa_tdnn, num_class=VoxCeleb.num_speakers)
+        backbone=ecapa_tdnn, num_class=config.num_speakers)
    # stage5: build the optimizer, we now only construct the AdamW optimizer
    #         140000 is single gpu steps
@ -193,15 +197,15 @@ def main(args, config):
                          paddle.optimizer.lr.LRScheduler):
                optimizer._learning_rate.step()
            optimizer.clear_grad()
            train_run_cost += time.time() - train_start
            # stage 9-8: Calculate average loss per batch
-            avg_loss += loss.numpy()[0]
+            avg_loss = loss.item()
            # stage 9-9: Calculate metrics, which is one-best accuracy
            preds = paddle.argmax(logits, axis=1)
            num_corrects += (preds == labels).numpy().sum()
            num_samples += feats.shape[0]
            train_run_cost += time.time() - train_start
            timer.count()  # step plus one in timer
            # stage 9-10: print the log information only on 0-rank per log-freq batchs
@ -220,8 +224,9 @@ def main(args, config):
                    train_feat_cost / config.log_interval)
                print_msg += ' avg_train_cost: {:.5f} sec,'.format(
                    train_run_cost / config.log_interval)
-                print_msg += ' lr={:.4E} step/sec={:.2f} | ETA {}'.format(
+
-                    lr, timer.timing, timer.eta)
+                print_msg += ' lr={:.4E} step/sec={:.2f} ips:{:.5f}| ETA {}'.format(
                    lr, timer.timing, timer.ips, timer.eta)
                logger.info(print_msg)
                avg_loss = 0
--- a/paddlespeech/vector/io/augment.py
+++ b/paddlespeech/vector/io/augment.py
@ -14,6 +14,7 @@
 # this is modified from SpeechBrain
 # https://github.com/speechbrain/speechbrain/blob/085be635c07f16d42cd1295045bc46c407f1e15b/speechbrain/lobes/augment.py
 import math
 import os
 from typing import List
 import numpy as np
@ -21,8 +22,8 @@ import paddle
 import paddle.nn as nn
 import paddle.nn.functional as F
 from paddleaudio.datasets.rirs_noises import OpenRIRNoise
 from paddlespeech.s2t.utils.log import Log
 from paddlespeech.vector.io.dataset import CSVDataset
 from paddlespeech.vector.io.signal_processing import compute_amplitude
 from paddlespeech.vector.io.signal_processing import convolve1d
 from paddlespeech.vector.io.signal_processing import dB_to_amplitude
@ -509,7 +510,7 @@ class AddNoise(nn.Layer):
                    assert w >= 0, f'Target length {target_length} is less than origin length {x.shape[0]}'
                    return np.pad(x, [0, w], mode=mode, **kwargs)
-                ids = [item['id'] for item in batch]
+                ids = [item['utt_id'] for item in batch]
                lengths = np.asarray([item['feat'].shape[0] for item in batch])
                waveforms = list(
                    map(lambda x: pad(x, max(max_length, lengths.max().item())),
@ -589,7 +590,7 @@ class AddReverb(nn.Layer):
                assert w >= 0, f'Target length {target_length} is less than origin length {x.shape[0]}'
                return np.pad(x, [0, w], mode=mode, **kwargs)
-            ids = [item['id'] for item in batch]
+            ids = [item['utt_id'] for item in batch]
            lengths = np.asarray([item['feat'].shape[0] for item in batch])
            waveforms = list(
                map(lambda x: pad(x, lengths.max().item()),
@ -839,8 +840,10 @@ def build_augment_pipeline(target_dir=None) -> List[paddle.nn.Layer]:
        List[paddle.nn.Layer]: all augment process
    """
    logger.info("start to build the augment pipeline")
-    noise_dataset = OpenRIRNoise('noise', target_dir=target_dir)
+    noise_dataset = CSVDataset(csv_path=os.path.join(target_dir,
-    rir_dataset = OpenRIRNoise('rir', target_dir=target_dir)
+                                                     "rir_noise/csv/noise.csv"))
    rir_dataset = CSVDataset(csv_path=os.path.join(target_dir,
                                                   "rir_noise/csv/rir.csv"))
    wavedrop = TimeDomainSpecAugment(
        sample_rate=16000,
--- a/paddlespeech/vector/io/batch.py
+++ b/paddlespeech/vector/io/batch.py
@ -17,6 +17,17 @@ import paddle
 def waveform_collate_fn(batch):
    """Wrap the waveform into a batch form
    Args:
        batch (list): the waveform list from the dataloader
                      the item of data include several field
                      feat: the utterance waveform data
                      label: the utterance label encoding data
    Returns:
        dict: the batch data to dataloader
    """
    waveforms = np.stack([item['feat'] for item in batch])
    labels = np.stack([item['label'] for item in batch])
@ -27,6 +38,18 @@ def feature_normalize(feats: paddle.Tensor,
                      mean_norm: bool=True,
                      std_norm: bool=True,
                      convert_to_numpy: bool=False):
    """Do one utterance feature normalization
    Args:
        feats (paddle.Tensor): the original utterance feat, such as fbank, mfcc
        mean_norm (bool, optional): mean norm flag. Defaults to True.
        std_norm (bool, optional): std norm flag. Defaults to True.
        convert_to_numpy (bool, optional): convert the paddle.tensor to numpy 
                                           and do feature norm with numpy. Defaults to False.
    Returns:
        paddle.Tensor : the normalized feats
    """
    # Features normalization if needed
    # numpy.mean is a little with paddle.mean about 1e-6
    if convert_to_numpy:
@ -60,7 +83,17 @@ def pad_right_2d(x, target_length, axis=-1, mode='constant', **kwargs):
 def batch_feature_normalize(batch, mean_norm: bool=True, std_norm: bool=True):
-    ids = [item['id'] for item in batch]
+    """Do batch utterance features normalization
    Args:
        batch (list): the batch feature from dataloader
        mean_norm (bool, optional): mean normalization flag. Defaults to True.
        std_norm (bool, optional): std normalization flag. Defaults to True.
    Returns:
        dict: the normalized batch features
    """
    ids = [item['utt_id'] for item in batch]
    lengths = np.asarray([item['feat'].shape[1] for item in batch])
    feats = list(
        map(lambda x: pad_right_2d(x, lengths.max()),
--- a/paddlespeech/vector/io/dataset.py
+++ b/paddlespeech/vector/io/dataset.py
@ -0,0 +1,192 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from dataclasses import dataclass
 from dataclasses import fields
 from paddle.io import Dataset
 from paddleaudio import load as load_audio
 from paddleaudio.compliance.librosa import melspectrogram
 from paddlespeech.s2t.utils.log import Log
 logger = Log(__name__).getlog()
 # the audio meta info in the vector CSVDataset
 # utt_id: the utterance segment name
 # duration: utterance segment time
 # wav: utterance file path
 # start: start point in the original wav file
 # stop: stop point in the original wav file
 # label: the utterance segment's label id
@dataclass
 class meta_info:
    """The audio meta info of one utterance segment in the vector CSVDataset.

    Args:
        utt_id (str): the utterance segment name
        duration (float): utterance segment time
        wav (str): utterance file path
        start (int): start point in the original wav file
        stop (int): stop point in the original wav file
        label (str): the utterance segment's label id
    """
    utt_id: str
    duration: float
    wav: str
    start: int
    stop: int
    label: str
 # Feature types supported by the csv dataset, mapped to the function
 # that extracts the feature from a waveform.
 # raw: return the pcm data sample point, None means no extraction is applied
 # melspectrogram: fbank feature
 feat_funcs = {
    'raw': None,
    'melspectrogram': melspectrogram,
 }
 class CSVDataset(Dataset):
    """Dataset that serves the utterance segments listed in a csv file.

    Every csv row is kept as a ``meta_info`` entry. Requesting an item loads
    the audio of that entry and returns it as a dict record.
    """

    def __init__(self,
                 csv_path,
                 label2id_path=None,
                 config=None,
                 random_chunk=True,
                 feat_type: str="raw",
                 n_train_snts: int=-1,
                 **kwargs):
        """Implement the CSV Dataset

        Args:
            csv_path (str): csv dataset file path
            label2id_path (str): the utterance label to integer id map file path
            config (CfgNode): yaml config. When it is set and
                              ``config.random_chunk`` is true, a random chunk of
                              ``config.chunk_duration`` seconds is cut from each audio.
            random_chunk (bool): stored on the instance. Note that the chunking
                                 itself is driven by ``config.random_chunk``.
            feat_type (str): dataset feature type. if it is raw, it return pcm data.
            n_train_snts (int): select the n_train_snts sample from the dataset. 
                                if n_train_snts = -1, dataset will load all the sample.
                                Default value is -1.
            kwargs : feature type args
        """
        super().__init__()
        self.csv_path = csv_path
        self.label2id_path = label2id_path
        self.config = config
        self.random_chunk = random_chunk
        self.feat_type = feat_type
        self.n_train_snts = n_train_snts
        self.feat_config = kwargs
        self.id2label = {}
        self.label2id = {}
        self.data = self.load_data_csv()
        self.load_speaker_to_label()

    def load_data_csv(self):
        """Load the csv dataset content and store them in the data property

        The first line of the csv file is a header and is skipped. Every other
        non-empty line has six comma separated fields:
        audio_id or utt_id, audio duration, wav file path, segment start point,
        segment stop point and utterance label.

        Note in training period, the utterance label must has a map to integer id in label2id_path 

        Returns:
            list: the csv data with meta_info type
        """
        data = []
        with open(self.csv_path, 'r') as rf:
            # skip the header line
            next(rf, None)
            for line in rf:
                line = line.strip()
                if not line:
                    # tolerate blank lines, e.g. an empty line at the end of file
                    continue
                audio_id, duration, wav, start, stop, spk_id = line.split(',')
                data.append(
                    meta_info(audio_id,
                              float(duration), wav,
                              int(start), int(stop), spk_id))

        # keep only the first n_train_snts samples when a positive limit is set
        if self.n_train_snts > 0:
            data = data[:self.n_train_snts]

        return data

    def load_speaker_to_label(self):
        """Load the utterance label map content.

        In vector domain, we call the utterance label as speaker label.
        The speaker label is real speaker label in speaker verification domain,
        and in language identification is language label.

        Every non-empty line of the map file holds ``label_name label_id``
        separated by one space. Both ``label2id`` and ``id2label`` are filled.
        Without a label2id_path only a warning is logged.
        """
        if not self.label2id_path:
            logger.warning("No speaker id to label file")
            return

        with open(self.label2id_path, 'r') as f:
            for line in f:
                line = line.strip()
                if not line:
                    # tolerate blank lines, e.g. an empty line at the end of file
                    continue
                label_name, label_id = line.split(' ')
                self.label2id[label_name] = int(label_id)
                self.id2label[int(label_id)] = label_name

    def convert_to_record(self, idx: int):
        """convert the dataset sample to training record the CSV Dataset

        Args:
            idx (int) : the request index in all the dataset

        Returns:
            dict: the meta_info fields of the sample plus the key 'feat'.
                  When a label map was loaded, 'label' holds the integer id
                  instead of the label name.
        """
        # imported locally: the random chunking below needs the `random` module
        import random

        sample = self.data[idx]

        # copy every dataclass field of the sample into the record
        record = {
            field.name: getattr(sample, field.name)
            for field in fields(sample)
        }

        waveform, sr = load_audio(record['wav'])

        # random select a chunk audio samples from the audio
        if self.config and self.config.random_chunk:
            num_wav_samples = waveform.shape[0]
            num_chunk_samples = int(self.config.chunk_duration * sr)
            start = random.randint(0, num_wav_samples - num_chunk_samples - 1)
            stop = start + num_chunk_samples
        else:
            start = record['start']
            stop = record['stop']

        # we only return the waveform as feat
        waveform = waveform[start:stop]

        # all available feature type is in feat_funcs
        assert self.feat_type in feat_funcs.keys(), \
            f"Unknown feat_type: {self.feat_type}, it must be one in {list(feat_funcs.keys())}"
        feat_func = feat_funcs[self.feat_type]
        # a feature function of None means the waveform itself is the feature
        feat = feat_func(
            waveform, sr=sr, **self.feat_config) if feat_func else waveform

        record.update({'feat': feat})
        if self.label2id:
            record.update({'label': self.label2id[record['label']]})

        return record

    def __getitem__(self, idx):
        """Return the specific index sample

        Args:
            idx (int) : the request index in all the dataset
        """
        return self.convert_to_record(idx)

    def __len__(self):
        """Return the dataset length

        Returns:
            int: the length num of the dataset
        """
        return len(self.data)
--- a/paddlespeech/vector/io/embedding_norm.py
+++ b/paddlespeech/vector/io/embedding_norm.py
@ -0,0 +1,214 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from typing import Dict
 import paddle
 class InputNormalization:
    """Mean/std normalization of features or embeddings with running stats.

    Only ``norm_type == "global"`` changes the input: a running average of
    the per-batch mean and std is maintained while ``self.training`` is True
    and is used to normalize every batch. For any other ``norm_type`` the
    input is returned unchanged. The ``spk_dict_*`` attributes are stored,
    saved and restored, but are not updated by ``__call__``.
    """
    spk_dict_mean: Dict[int, paddle.Tensor]
    spk_dict_std: Dict[int, paddle.Tensor]
    spk_dict_count: Dict[int, int]

    def __init__(
            self,
            mean_norm=True,
            std_norm=True,
            norm_type="global", ):
        """Do feature or embedding mean and std norm

        Args:
            mean_norm (bool, optional): mean norm flag. Defaults to True.
            std_norm (bool, optional): std norm flag. Defaults to True.
            norm_type (str, optional): norm type. Defaults to "global".
        """
        super().__init__()
        # While True, every "global" call also updates the running stats.
        self.training = True
        self.mean_norm = mean_norm
        self.std_norm = std_norm
        self.norm_type = norm_type
        # Placeholders; replaced by the first batch statistics when training.
        self.glob_mean = paddle.to_tensor([0], dtype="float32")
        self.glob_std = paddle.to_tensor([0], dtype="float32")
        self.spk_dict_mean = {}
        self.spk_dict_std = {}
        self.spk_dict_count = {}
        self.weight = 1.0
        # Number of batches folded into the running statistics so far.
        self.count = 0
        # Lower bound applied to the std for numerical stability.
        self.eps = 1e-10

    def __call__(self,
                 x,
                 lengths,
                 spk_ids=paddle.to_tensor([], dtype="float32")):
        """Normalize a batch using the running global statistics.

        Args:
            x (paddle.Tensor): A batch of tensors.
            lengths (paddle.Tensor): A batch of tensors containing the relative length of each
                                    sentence (e.g, [0.7, 0.9, 1.0]). It is used to avoid
                                    computing stats on zero-padded steps.
            spk_ids (paddle.Tensor, optional): tensor containing the ids of each speaker
                                        (e.g, [0 10 6]). Accepted for interface compatibility;
                                        it is not used by the current implementation.

        Returns:
            paddle.Tensor: The normalized feature or embedding when
                ``norm_type == "global"``; otherwise ``x`` unchanged.
        """
        num_items = x.shape[0]
        item_means = []
        item_stds = []
        for item_idx in range(num_items):
            # Number of non-padded steps along axis 1 for this item.
            actual_size = paddle.round(lengths[item_idx] *
                                       x.shape[1]).astype("int32")
            # NOTE(review): the slice gets a leading axis via unsqueeze(0) and
            # the statistics are reduced over axis 0, i.e. over a size-1
            # axis -- confirm this is the intended reduction.
            item_mean, item_std = self._compute_current_stats(
                x[item_idx, 0:actual_size, ...].unsqueeze(0))
            item_means.append(item_mean)
            item_stds.append(item_std)

        if self.norm_type == "global":
            current_mean = paddle.mean(paddle.stack(item_means), axis=0)
            current_std = paddle.mean(paddle.stack(item_stds), axis=0)

            if self.training:
                if self.count == 0:
                    self.glob_mean = current_mean
                    self.glob_std = current_std
                else:
                    # Running average with weight 1 / (count + 1).
                    self.weight = 1 / (self.count + 1)
                    self.glob_mean = (
                        1 - self.weight
                    ) * self.glob_mean + self.weight * current_mean
                    self.glob_std = (
                        1 - self.weight
                    ) * self.glob_std + self.weight * current_std
                # ``detach`` returns a new tensor, so the result must be
                # assigned back for the call to have any effect.
                self.glob_mean = self.glob_mean.detach()
                self.glob_std = self.glob_std.detach()
                self.count = self.count + 1

            x = (x - self.glob_mean) / (self.glob_std)
        return x

    def _compute_current_stats(self, x):
        """Compute the mean and std of ``x`` along axis 0.

        Args:
            x (paddle.Tensor): A batch of tensors.

        Returns:
            tuple: ``(current_mean, current_std)``. The mean is a constant 0
                when ``mean_norm`` is False, the std is a constant 1 when
                ``std_norm`` is False, and the std is never below ``eps``.
        """
        # Compute current mean
        if self.mean_norm:
            current_mean = paddle.mean(x, axis=0).detach()
        else:
            current_mean = paddle.to_tensor([0.0], dtype="float32")

        # Compute current std
        if self.std_norm:
            current_std = paddle.std(x, axis=0).detach()
        else:
            current_std = paddle.to_tensor([1.0], dtype="float32")

        # Improving numerical stability of std
        current_std = paddle.maximum(current_std,
                                     self.eps * paddle.ones_like(current_std))
        return current_mean, current_std

    def _statistics_dict(self):
        """Build the dictionary containing the normalization statistics.

        Returns:
            dict: the count, global mean/std and per-speaker dictionaries.
        """
        return {
            "count": self.count,
            "glob_mean": self.glob_mean,
            "glob_std": self.glob_std,
            "spk_dict_mean": self.spk_dict_mean,
            "spk_dict_std": self.spk_dict_std,
            "spk_dict_count": self.spk_dict_count,
        }

    def _load_statistics_dict(self, state):
        """Load the normalization statistics from a dictionary.

        Args:
            state (dict): A dictionary containing the normalization
                statistics, laid out as produced by ``_statistics_dict``.

        Returns:
            dict: the ``state`` argument, unchanged.
        """
        self.count = state["count"]
        self.glob_mean = state["glob_mean"]
        self.glob_std = state["glob_std"]

        # Shallow copies, so later changes to this object's dictionaries do
        # not modify the caller's ``state``.
        self.spk_dict_mean = {
            spk: state["spk_dict_mean"][spk]
            for spk in state["spk_dict_mean"]
        }
        self.spk_dict_std = {
            spk: state["spk_dict_std"][spk]
            for spk in state["spk_dict_std"]
        }
        self.spk_dict_count = state["spk_dict_count"]
        return state

    def to(self, device):
        """Put the stored statistic tensors on the given device.

        Args:
            device: device specifier forwarded to each tensor's ``to``.

        Returns:
            InputNormalization: ``self``, to allow chaining.
        """
        # This class has no base class providing ``to``; only the tensors
        # held by this object are moved.
        self.glob_mean = self.glob_mean.to(device)
        self.glob_std = self.glob_std.to(device)
        for spk in self.spk_dict_mean:
            self.spk_dict_mean[spk] = self.spk_dict_mean[spk].to(device)
            self.spk_dict_std[spk] = self.spk_dict_std[spk].to(device)
        return self

    def save(self, path):
        """Save statistic dictionary.

        Args:
            path (str): A path where to save the dictionary.
        """
        stats = self._statistics_dict()
        paddle.save(stats, path)

    def _load(self, path, end_of_epoch=False, device=None):
        """Load statistic dictionary.

        Args:
            path (str): The path of the statistic dictionary.
            end_of_epoch (bool, optional): Unused; kept for interface
                compatibility.
            device (str, optional): When given, the loaded statistics are
                moved to this device with ``to`` after loading.
        """
        del end_of_epoch  # Unused here.
        # ``paddle.load`` has no ``map_location`` option, so the device is
        # applied after loading instead of being passed through.
        stats = paddle.load(path)
        self._load_statistics_dict(stats)
        if device is not None:
            self.to(device)
--- a/paddlespeech/vector/utils/time.py
+++ b/paddlespeech/vector/utils/time.py
@ -23,6 +23,7 @@ class Timer(object):
        self.last_start_step = 0
        self.current_step = 0
        self._is_running = True
        self.cur_ips = 0
    def start(self):
        self.last_time = time.time()
@ -43,12 +44,17 @@ class Timer(object):
        self.last_start_step = self.current_step
        time_used = time.time() - self.last_time
        self.last_time = time.time()
        self.cur_ips = run_steps / time_used
        return time_used / run_steps
    @property
    def is_running(self) -> bool:
        """Return the timer's running flag (the value of ``_is_running``)."""
        return self._is_running
    @property
    def ips(self) -> float:
        """Return the most recently stored ``cur_ips`` value."""
        return self.cur_ips
    @property
    def eta(self) -> str:
        if not self.is_running:
--- a/paddlespeech/vector/utils/vector_utils.py
+++ b/paddlespeech/vector/utils/vector_utils.py
@ -0,0 +1,32 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 def get_chunks(seg_dur, audio_id, audio_duration):
    """Name every whole fixed-length segment of an utterance.

    Each name has the form ``<audio_id>_<start>_<stop>``, where start and
    stop are the segment boundaries in seconds. A trailing remainder shorter
    than ``seg_dur`` is not included.

    Args:
        seg_dur (float): segment chunk duration, seconds
        audio_id (str): utterance name
        audio_duration (float): utterance duration, seconds

    Returns:
        List: the names of all chunk segments, in time order
    """
    # Only complete segments count; everything here is in seconds.
    total_segments = int(audio_duration / seg_dur)
    names = []
    for seg_idx in range(total_segments):
        begin = seg_idx * seg_dur
        end = seg_idx * seg_dur + seg_dur
        names.append(audio_id + "_" + str(begin) + "_" + str(end))
    return names