PaddleSpeech/audio/paddleaudio/datasets/voxceleb.py

# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import collections
import csv
import glob
import os
import random
from multiprocessing import cpu_count
from typing import List

from paddle.io import Dataset
from pathos.multiprocessing import Pool
from tqdm import tqdm

from ..backends.soundfile_backend import soundfile_load as load_audio
from ..utils import DATA_HOME
from ..utils import decompress
from ..utils.download import download_and_decompress
from .dataset import feat_funcs

__all__ = ['VoxCeleb']


class VoxCeleb(Dataset):
    """VoxCeleb dataset driven by per-subset csv manifests.

    On first use the constructor downloads VoxCeleb1 (when the wav directory
    is missing), downloads the verification trial list, and generates one csv
    manifest per subset (``train``, ``dev``, ``enroll``, ``test``) plus a
    ``spk_id2label.txt`` speaker-to-label mapping. Indexing the dataset loads
    the audio referenced by a manifest row and returns it as a record dict.

    NOTE: ``__init__`` rewrites the *class-level* ``csv_path``, ``meta_path``
    and ``veri_test_file`` attributes when ``target_dir`` is given, so those
    paths are shared by every instance created afterwards.
    """
    source_url = 'https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/'
    archieves_audio_dev = [
        {
            'url': source_url + 'vox1_dev_wav_partaa',
            'md5': 'e395d020928bc15670b570a21695ed96',
        },
        {
            'url': source_url + 'vox1_dev_wav_partab',
            'md5': 'bbfaaccefab65d82b21903e81a8a8020',
        },
        {
            'url': source_url + 'vox1_dev_wav_partac',
            'md5': '017d579a2a96a077f40042ec33e51512',
        },
        {
            'url': source_url + 'vox1_dev_wav_partad',
            'md5': '7bb1e9f70fddc7a678fa998ea8b3ba19',
        },
    ]
    archieves_audio_test = [
        {
            'url': source_url + 'vox1_test_wav.zip',
            'md5': '185fdc63c3c739954633d50379a3d102',
        },
    ]
    archieves_meta = [
        {
            'url':
            'https://www.robots.ox.ac.uk/~vgg/data/voxceleb/meta/veri_test2.txt',
            'md5':
            'b73110731c9223c1461fe49cb48dddfc',
        },
    ]

    num_speakers = 1211  # 1211 vox1, 5994 vox2, 7205 vox1+2, test speakers: 41
    sample_rate = 16000
    meta_info = collections.namedtuple(
        'META_INFO', ('id', 'duration', 'wav', 'start', 'stop', 'spk_id'))
    base_path = os.path.join(DATA_HOME, 'vox1')
    wav_path = os.path.join(base_path, 'wav')
    meta_path = os.path.join(base_path, 'meta')
    veri_test_file = os.path.join(meta_path, 'veri_test2.txt')
    csv_path = os.path.join(base_path, 'csv')
    subsets = ['train', 'dev', 'enroll', 'test']

    def __init__(
            self,
            subset: str='train',
            feat_type: str='raw',
            random_chunk: bool=True,
            chunk_duration: float=3.0,  # seconds
            split_ratio: float=0.9,  # train split ratio
            seed: int=0,
            target_dir: str=None,
            vox2_base_path=None,
            **kwargs):
        """VoxCeleb data prepare and get the specific dataset audio info

        Args:
            subset (str, optional): dataset name, such as train, dev, enroll or test. Defaults to 'train'.
            feat_type (str, optional): feat type, such raw, melspectrogram(fbank) or mfcc . Defaults to 'raw'.
            random_chunk (bool, optional): random select a duration from audio. Defaults to True.
            chunk_duration (float, optional): chunk duration in seconds; used both when the
                manifests are generated and when a random chunk is cut. Defaults to 3.0.
            split_ratio (float, optional): fraction of the non-test audio files assigned to the
                train manifest; the remainder goes to dev. Defaults to 0.9.
            seed (int, optional): seed of the private random generator that shuffles the files
                before the train/dev split. The global ``random`` state is not touched, so
                random chunk selection is unaffected. Defaults to 0.
            target_dir (str, optional): data dir, audio info will be stored in this directory. Defaults to None.
            vox2_base_path (_type_, optional): vox2 directory. vox2 data must be converted from m4a to wav. Defaults to None.
            **kwargs: forwarded unchanged to the feature function selected by ``feat_type``.
        """
        assert subset in self.subsets, \
            'Dataset subset must be one in {}, but got {}'.format(self.subsets, subset)

        self.subset = subset
        self.spk_id2label = {}
        self.feat_type = feat_type
        self.feat_config = kwargs
        self.random_chunk = random_chunk
        self.chunk_duration = chunk_duration
        self.split_ratio = split_ratio
        self.seed = seed
        self.target_dir = target_dir if target_dir else VoxCeleb.base_path
        self.vox2_base_path = vox2_base_path

        # If a target dir is given, the csv manifests and the meta data live
        # under <target_dir>/voxceleb instead of the default base path.
        # These are class attributes on purpose (existing behaviour).
        if target_dir:
            VoxCeleb.csv_path = os.path.join(target_dir, "voxceleb", 'csv')
            VoxCeleb.meta_path = os.path.join(target_dir, "voxceleb", 'meta')
        VoxCeleb.veri_test_file = os.path.join(VoxCeleb.meta_path,
                                               'veri_test2.txt')

        self._data = self._get_data()
        super(VoxCeleb, self).__init__()

    @staticmethod
    def _concatenate_parts(pattern: str, output_file: str) -> None:
        """Concatenate every file matching ``pattern`` (sorted by name) into ``output_file``.

        Raises:
            FileNotFoundError: if no file matches ``pattern``.
        """
        import shutil  # local import: only needed on the one-off download path

        part_files = sorted(glob.glob(pattern))
        if not part_files:
            raise FileNotFoundError(f'No archive parts match: {pattern}')
        with open(output_file, 'wb') as merged:
            for part_file in part_files:
                with open(part_file, 'rb') as part:
                    shutil.copyfileobj(part, merged)

    def _get_data(self):
        """Make sure audio, meta data and manifests exist, then load the subset manifest.

        Also fills ``self.spk_id2label`` from ``spk_id2label.txt``.

        Returns:
            list: one ``meta_info`` namedtuple per row of ``<subset>.csv``.
        """
        # Download audio files.
        # We need the users to decompress all vox1/dev/wav and vox1/test/wav/ to vox1/wav/ dir
        # so, we check the vox1/wav dir status
        print(f"wav base path: {self.wav_path}")
        if not os.path.isdir(self.wav_path):
            print("start to download the voxceleb1 dataset")
            download_and_decompress(  # multi-zip parts concatenate to vox1_dev_wav.zip
                self.archieves_audio_dev,
                self.base_path,
                decompress=False)
            download_and_decompress(  # download the vox1_test_wav.zip and unzip
                self.archieves_audio_test,
                self.base_path,
                decompress=True)

            # Concatenate the downloaded parts into one zip file. Done in
            # Python rather than through a shell `cat` so that paths with
            # spaces work and a failure raises instead of being ignored.
            dev_zipfile = os.path.join(self.base_path, 'vox1_dev_wav.zip')
            print(f'Concatenating all parts to: {dev_zipfile}')
            self._concatenate_parts(
                os.path.join(self.base_path, "vox1_dev_wav_parta*"),
                dev_zipfile)

            # Extract all audio files of dev and test set.
            decompress(dev_zipfile, self.base_path)

        # Download meta files.
        if not os.path.isdir(self.meta_path):
            print("prepare the meta data")
            download_and_decompress(
                self.archieves_meta, self.meta_path, decompress=False)

        # Data preparation.
        if not os.path.isdir(self.csv_path):
            os.makedirs(self.csv_path)
            self.prepare_data()

        subset_csv = os.path.join(self.csv_path, f'{self.subset}.csv')
        print(f"read the {self.subset} from {subset_csv}")
        data = []
        # Read with the csv module so that fields quoted by the csv writer in
        # generate_csv (e.g. a wav path containing a comma) parse correctly.
        with open(subset_csv, 'r', newline='') as rf:
            reader = csv.reader(rf)
            next(reader, None)  # skip the header row
            for row in reader:
                if not row:  # tolerate blank lines
                    continue
                audio_id, duration, wav, start, stop, spk_id = row
                data.append(
                    self.meta_info(audio_id,
                                   float(duration), wav,
                                   int(start), int(stop), spk_id))

        with open(os.path.join(self.meta_path, 'spk_id2label.txt'), 'r') as f:
            for line in f:
                spk_id, label = line.strip().split(' ')
                self.spk_id2label[spk_id] = int(label)

        return data

    def _convert_to_record(self, idx: int):
        """Load the audio of manifest row ``idx`` and build its record dict.

        The record holds every ``meta_info`` field plus ``feat`` (the cropped
        waveform, or the output of the selected feature function) and, for the
        train and dev subsets, ``label``.
        """
        sample = self._data[idx]
        record = dict(sample._asdict())

        waveform, sr = load_audio(record['wav'])

        if self.random_chunk:
            # Randomly select a chunk of chunk_duration seconds.
            num_wav_samples = waveform.shape[0]
            num_chunk_samples = int(self.chunk_duration * sr)
            # Clamp the upper bound: for audio no longer than the chunk the
            # unclamped bound is negative and random.randint raises
            # ValueError. In that case the chunk starts at 0 and the slice
            # below simply returns the (shorter) whole audio.
            last_start = max(0, num_wav_samples - num_chunk_samples - 1)
            start = random.randint(0, last_start)
            stop = start + num_chunk_samples
        else:
            start = record['start']
            stop = record['stop']

        waveform = waveform[start:stop]

        assert self.feat_type in feat_funcs.keys(), \
            f"Unknown feat_type: {self.feat_type}, it must be one in {list(feat_funcs.keys())}"
        feat_func = feat_funcs[self.feat_type]
        # A falsy entry in feat_funcs means "return the waveform unchanged".
        feat = feat_func(
            waveform, sr=sr, **self.feat_config) if feat_func else waveform

        record.update({'feat': feat})
        if self.subset in ['train',
                           'dev']:  # Labels are available in train and dev.
            record.update({'label': self.spk_id2label[record['spk_id']]})

        return record

    @staticmethod
    def _get_chunks(seg_dur, audio_id, audio_duration):
        """Return ids ``<audio_id>_<start>_<end>`` of consecutive full-length chunks.

        ``seg_dur`` and ``audio_duration`` are in the same unit (seconds, as
        called from ``_get_audio_info``); a trailing remainder shorter than
        ``seg_dur`` is dropped.
        """
        num_chunks = int(audio_duration / seg_dur)

        chunk_lst = [
            audio_id + "_" + str(i * seg_dur) + "_" + str(i * seg_dur + seg_dur)
            for i in range(num_chunks)
        ]
        return chunk_lst

    def _get_audio_info(self, wav_file: str,
                        split_chunks: bool) -> List[List[str]]:
        """Build the manifest rows of one wav file.

        Args:
            wav_file: path ending in ``<spk_id>/<session_id>/<utt>.wav`` ('/'-separated).
            split_chunks: if True emit one row per chunk of ``self.chunk_duration``
                seconds, otherwise a single row covering the whole audio.

        Returns:
            Rows of ``[id, duration, wav, start, stop, spk_id]`` where start/stop
            are sample indices and duration is the whole file's length in seconds.
        """
        waveform, sr = load_audio(wav_file)
        spk_id, sess_id, utt_id = wav_file.split("/")[-3:]
        audio_id = '-'.join([spk_id, sess_id, utt_id.split(".")[0]])
        audio_duration = waveform.shape[0] / sr

        if not split_chunks:  # Keep whole audio.
            return [[
                audio_id, audio_duration, wav_file, 0, waveform.shape[0], spk_id
            ]]

        # Split into pieces of self.chunk_duration seconds.
        ret = []
        for chunk in self._get_chunks(self.chunk_duration, audio_id,
                                      audio_duration):
            s, e = chunk.split("_")[-2:]  # Timestamps of start and end
            # id, duration, wav, start, stop, spk_id
            ret.append([
                chunk, audio_duration, wav_file, int(float(s) * sr),
                int(float(e) * sr), spk_id
            ])
        return ret

    def generate_csv(self,
                     wav_files: List[str],
                     output_file: str,
                     split_chunks: bool=True):
        """Write the manifest of ``wav_files`` to ``output_file``.

        The per-file rows are computed by ``_get_audio_info`` in a process
        pool with one worker per CPU.

        Args:
            wav_files: audio file paths to describe.
            output_file: destination csv path (overwritten).
            split_chunks: forwarded to ``_get_audio_info``.
        """
        print(f'Generating csv: {output_file}')
        header = ["id", "duration", "wav", "start", "stop", "spk_id"]
        # Note: this may raise a C++ exception message, but the program still
        # runs fine, so the message can be ignored.
        with Pool(cpu_count()) as p:
            infos = list(
                tqdm(
                    p.imap(lambda x: self._get_audio_info(x, split_chunks),
                           wav_files),
                    total=len(wav_files)))

        # newline="" is what the csv module requires for files it writes;
        # without it rows end in "\r\r\n" on platforms that translate "\n".
        with open(output_file, mode="w", newline="") as csv_f:
            csv_writer = csv.writer(
                csv_f, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL)
            csv_writer.writerow(header)
            for info in infos:
                csv_writer.writerows(info)

    def prepare_data(self):
        """Generate ``spk_id2label.txt`` and the train/dev/enroll/test manifests.

        Enroll and test files come from the verification trial list; every
        speaker appearing there is excluded from train and dev. The remaining
        files (VoxCeleb1 plus the optional VoxCeleb2 directory) are shuffled
        with a private generator seeded by ``self.seed`` and split by
        ``self.split_ratio``.
        """
        # Audio of speakers in veri_test_file should not be included in training set.
        print("start to prepare the data csv file")
        enroll_files = set()
        test_files = set()
        # get the enroll and test audio file path
        with open(self.veri_test_file, 'r') as f:
            for line in f:
                fields = line.split()
                if not fields:  # tolerate blank lines in the trial list
                    continue
                _, enrol_file, test_file = fields
                enroll_files.add(os.path.join(self.wav_path, enrol_file))
                test_files.add(os.path.join(self.wav_path, test_file))
        enroll_files = sorted(enroll_files)
        test_files = sorted(test_files)

        # get the enroll and test speakers
        # (the speaker id is the path component right after '/wav/')
        test_spks = {
            file.split('/wav/')[1].split('/')[0]
            for file in (enroll_files + test_files)
        }

        # get all the train and dev audios file path
        audio_files = []
        speakers = set()
        print("Getting file list...")
        for path in [self.wav_path, self.vox2_base_path]:
            # if vox2 directory is not set and vox2 is not a directory
            # we will not process this directory
            if not path or not os.path.exists(path):
                print(f"{path} is an invalid path, please check again, "
                      "and we will ignore the vox2 base path")
                continue
            for file in glob.glob(
                    os.path.join(path, "**", "*.wav"), recursive=True):
                # Requires a '/wav/' component in the path, as above.
                spk = file.split('/wav/')[1].split('/')[0]
                if spk in test_spks:
                    continue
                speakers.add(spk)
                audio_files.append(file)

        print(
            f"start to generate the {os.path.join(self.meta_path, 'spk_id2label.txt')}"
        )
        # encode the train and dev speakers label to spk_id2label.txt
        with open(os.path.join(self.meta_path, 'spk_id2label.txt'), 'w') as f:
            for label, spk_id in enumerate(
                    sorted(speakers)):  # 1211 vox1, 5994 vox2, 7205 vox1+2
                f.write(f'{spk_id} {label}\n')

        # Sort first so the shuffle starts from a filesystem-independent
        # order, then shuffle with a private seeded generator: the split is
        # reproducible and the global random state is left alone.
        audio_files = sorted(audio_files)
        random.Random(self.seed).shuffle(audio_files)
        split_idx = int(self.split_ratio * len(audio_files))
        # split_ratio to train
        train_files = audio_files[:split_idx]
        dev_files = audio_files[split_idx:]

        self.generate_csv(train_files, os.path.join(self.csv_path, 'train.csv'))
        self.generate_csv(dev_files, os.path.join(self.csv_path, 'dev.csv'))

        self.generate_csv(
            enroll_files,
            os.path.join(self.csv_path, 'enroll.csv'),
            split_chunks=False)
        self.generate_csv(
            test_files,
            os.path.join(self.csv_path, 'test.csv'),
            split_chunks=False)

    def __getitem__(self, idx):
        """Return the record dict of manifest row ``idx``."""
        return self._convert_to_record(idx)

    def __len__(self):
        """Return the number of rows in the loaded manifest."""
        return len(self._data)
move the csv vox format to paddleaudio, test=doc 3 years ago			`# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.`
			`#`
			`# Licensed under the Apache License, Version 2.0 (the "License");`
			`# you may not use this file except in compliance with the License.`
			`# You may obtain a copy of the License at`
			`#`
			`# http://www.apache.org/licenses/LICENSE-2.0`
			`#`
			`# Unless required by applicable law or agreed to in writing, software`
			`# distributed under the License is distributed on an "AS IS" BASIS,`
			`# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`# See the License for the specific language governing permissions and`
			`# limitations under the License.`
			`import collections`
			`import csv`
			`import glob`
			`import os`
			`import random`
add state 0 to prepare the voxcele data and augment data 3 years ago			`from multiprocessing import cpu_count`
move the csv vox format to paddleaudio, test=doc 3 years ago			`from typing import List`

			`from paddle.io import Dataset`
			`from pathos.multiprocessing import Pool`
			`from tqdm import tqdm`

[audio] mv paddlespeech/audio to paddleaudio (#2706) * split paddlespeech/audio to paddleaudio. * add sox io ,sox effect, kaldi native fbank to paddleaudio. 2 years ago			`from ..backends.soundfile_backend import soundfile_load as load_audio`
refactor to compilance paddleaudio 3 years ago			`from ..utils import DATA_HOME`
			`from ..utils import decompress`
change the code style to s2t code style, test=doc 3 years ago			`from ..utils.download import download_and_decompress`
add vox2 data into VoxCeleb class 3 years ago			`from .dataset import feat_funcs`
add waveform augment pipeline, test=doc 3 years ago
add vox2 data into VoxCeleb class 3 years ago			`__all__ = ['VoxCeleb']`
move the csv vox format to paddleaudio, test=doc 3 years ago

add vox2 data into VoxCeleb class 3 years ago			`class VoxCeleb(Dataset):`
move the csv vox format to paddleaudio, test=doc 3 years ago			`source_url = 'https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/'`
			`archieves_audio_dev = [`
			`{`
			`'url': source_url + 'vox1_dev_wav_partaa',`
			`'md5': 'e395d020928bc15670b570a21695ed96',`
			`},`
			`{`
			`'url': source_url + 'vox1_dev_wav_partab',`
			`'md5': 'bbfaaccefab65d82b21903e81a8a8020',`
			`},`
			`{`
			`'url': source_url + 'vox1_dev_wav_partac',`
			`'md5': '017d579a2a96a077f40042ec33e51512',`
			`},`
			`{`
			`'url': source_url + 'vox1_dev_wav_partad',`
			`'md5': '7bb1e9f70fddc7a678fa998ea8b3ba19',`
			`},`
			`]`
			`archieves_audio_test = [`
			`{`
			`'url': source_url + 'vox1_test_wav.zip',`
			`'md5': '185fdc63c3c739954633d50379a3d102',`
			`},`
			`]`
			`archieves_meta = [`
			`{`
			`'url':`
			`'https://www.robots.ox.ac.uk/~vgg/data/voxceleb/meta/veri_test2.txt',`
			`'md5':`
			`'b73110731c9223c1461fe49cb48dddfc',`
			`},`
			`]`

			`num_speakers = 1211 # 1211 vox1, 5994 vox2, 7205 vox1+2, test speakers: 41`
			`sample_rate = 16000`
			`meta_info = collections.namedtuple(`
			`'META_INFO', ('id', 'duration', 'wav', 'start', 'stop', 'spk_id'))`
			`base_path = os.path.join(DATA_HOME, 'vox1')`
			`wav_path = os.path.join(base_path, 'wav')`
repair the code according to the part comment, test=doc 3 years ago			`meta_path = os.path.join(base_path, 'meta')`
			`veri_test_file = os.path.join(meta_path, 'veri_test2.txt')`
			`csv_path = os.path.join(base_path, 'csv')`
refactor to compilance paddleaudio 3 years ago			`subsets = ['train', 'dev', 'enroll', 'test']`
move the csv vox format to paddleaudio, test=doc 3 years ago
			`def __init__(`
			`self,`
			`subset: str='train',`
			`feat_type: str='raw',`
			`random_chunk: bool=True,`
			`chunk_duration: float=3.0, # seconds`
			`split_ratio: float=0.9, # train split ratio`
			`seed: int=0,`
			`target_dir: str=None,`
add vox2 data into VoxCeleb class 3 years ago			`vox2_base_path=None,`
move the csv vox format to paddleaudio, test=doc 3 years ago			`**kwargs):`
add vox2 data into VoxCeleb class 3 years ago			`"""VoxCeleb data prepare and get the specific dataset audio info`

			`Args:`
			`subset (str, optional): dataset name, such as train, dev, enroll or test. Defaults to 'train'.`
			`feat_type (str, optional): feat type, such raw, melspectrogram(fbank) or mfcc . Defaults to 'raw'.`
			`random_chunk (bool, optional): random select a duration from audio. Defaults to True.`
			`chunk_duration (float, optional): chunk duration if random_chunk flag is set. Defaults to 3.0.`
			`target_dir (str, optional): data dir, audio info will be stored in this directory. Defaults to None.`
			`vox2_base_path (_type_, optional): vox2 directory. vox2 data must be converted from m4a to wav. Defaults to None.`
			`"""`
move the csv vox format to paddleaudio, test=doc 3 years ago			`assert subset in self.subsets, \`
			`'Dataset subset must be one in {}, but got {}'.format(self.subsets, subset)`

			`self.subset = subset`
			`self.spk_id2label = {}`
			`self.feat_type = feat_type`
			`self.feat_config = kwargs`
			`self.random_chunk = random_chunk`
			`self.chunk_duration = chunk_duration`
			`self.split_ratio = split_ratio`
add vox2 data into VoxCeleb class 3 years ago			`self.target_dir = target_dir if target_dir else VoxCeleb.base_path`
			`self.vox2_base_path = vox2_base_path`
add state 0 to prepare the voxcele data and augment data 3 years ago
			`# if we set the target dir, we will change the vox data info data from base path to target dir`
add vox2 data into VoxCeleb class 3 years ago			`VoxCeleb.csv_path = os.path.join(`
			`target_dir, "voxceleb", 'csv') if target_dir else VoxCeleb.csv_path`
			`VoxCeleb.meta_path = os.path.join(`
add state 0 to prepare the voxcele data and augment data 3 years ago			`target_dir, "voxceleb",`
add vox2 data into VoxCeleb class 3 years ago			`'meta') if target_dir else VoxCeleb.meta_path`
			`VoxCeleb.veri_test_file = os.path.join(VoxCeleb.meta_path,`
			`'veri_test2.txt')`
move the csv vox format to paddleaudio, test=doc 3 years ago			`# self._data = self._get_data()[:1000] # KP: Small dataset test.`
			`self._data = self._get_data()`
add vox2 data into VoxCeleb class 3 years ago			`super(VoxCeleb, self).__init__()`
move the csv vox format to paddleaudio, test=doc 3 years ago
			`# Set up a seed to reproduce training or predicting result.`
			`# random.seed(seed)`

			`def _get_data(self):`
			`# Download audio files.`
			`# We need the users to decompress all vox1/dev/wav and vox1/test/wav/ to vox1/wav/ dir`
			`# so, we check the vox1/wav dir status`
change the code style to s2t code style, test=doc 3 years ago			`print(f"wav base path: {self.wav_path}")`
move the csv vox format to paddleaudio, test=doc 3 years ago			`if not os.path.isdir(self.wav_path):`
change the vector output to numpy.array 3 years ago			`print("start to download the voxceleb1 dataset")`
move the csv vox format to paddleaudio, test=doc 3 years ago			`download_and_decompress( # multi-zip parts concatenate to vox1_dev_wav.zip`
			`self.archieves_audio_dev,`
			`self.base_path,`
			`decompress=False)`
			`download_and_decompress( # download the vox1_test_wav.zip and unzip`
			`self.archieves_audio_test,`
			`self.base_path,`
			`decompress=True)`

			`# Download all parts and concatenate the files into one zip file.`
			`dev_zipfile = os.path.join(self.base_path, 'vox1_dev_wav.zip')`
change the code style to s2t code style, test=doc 3 years ago			`print(f'Concatenating all parts to: {dev_zipfile}')`
move the csv vox format to paddleaudio, test=doc 3 years ago			`os.system(`
			`f'cat {os.path.join(self.base_path, "vox1_dev_wav_parta*")} > {dev_zipfile}'`
			`)`

			`# Extract all audio files of dev and test set.`
			`decompress(dev_zipfile, self.base_path)`

			`# Download meta files.`
			`if not os.path.isdir(self.meta_path):`
change the code style to s2t code style, test=doc 3 years ago			`print("prepare the meta data")`
move the csv vox format to paddleaudio, test=doc 3 years ago			`download_and_decompress(`
			`self.archieves_meta, self.meta_path, decompress=False)`

			`# Data preparation.`
			`if not os.path.isdir(self.csv_path):`
			`os.makedirs(self.csv_path)`
			`self.prepare_data()`

			`data = []`
change the code style to s2t code style, test=doc 3 years ago			`print(`
add waveform augment pipeline, test=doc 3 years ago			`f"read the {self.subset} from {os.path.join(self.csv_path, f'{self.subset}.csv')}"`
			`)`
move the csv vox format to paddleaudio, test=doc 3 years ago			`with open(os.path.join(self.csv_path, f'{self.subset}.csv'), 'r') as rf:`
			`for line in rf.readlines()[1:]:`
			`audio_id, duration, wav, start, stop, spk_id = line.strip(`
			`).split(',')`
			`data.append(`
			`self.meta_info(audio_id,`
			`float(duration), wav,`
			`int(start), int(stop), spk_id))`

			`with open(os.path.join(self.meta_path, 'spk_id2label.txt'), 'r') as f:`
			`for line in f.readlines():`
			`spk_id, label = line.strip().split(' ')`
			`self.spk_id2label[spk_id] = int(label)`

			`return data`

			`def _convert_to_record(self, idx: int):`
			`sample = self._data[idx]`

			`record = {}`
			# To show all fields in a namedtuple: `type(sample)._fields`
			`for field in type(sample)._fields:`
			`record[field] = getattr(sample, field)`

			`waveform, sr = load_audio(record['wav'])`

			`# random select a chunk audio samples from the audio`
			`if self.random_chunk:`
			`num_wav_samples = waveform.shape[0]`
			`num_chunk_samples = int(self.chunk_duration * sr)`
			`start = random.randint(0, num_wav_samples - num_chunk_samples - 1)`
			`stop = start + num_chunk_samples`
			`else:`
			`start = record['start']`
			`stop = record['stop']`

			`waveform = waveform[start:stop]`

			`assert self.feat_type in feat_funcs.keys(), \`
			`f"Unknown feat_type: {self.feat_type}, it must be one in {list(feat_funcs.keys())}"`
			`feat_func = feat_funcs[self.feat_type]`
			`feat = feat_func(`
			`waveform, sr=sr, **self.feat_config) if feat_func else waveform`

			`record.update({'feat': feat})`
			`if self.subset in ['train',`
			`'dev']: # Labels are available in train and dev.`
			`record.update({'label': self.spk_id2label[record['spk_id']]})`

			`return record`

			`@staticmethod`
			`def _get_chunks(seg_dur, audio_id, audio_duration):`
			`num_chunks = int(audio_duration / seg_dur) # all in milliseconds`

			`chunk_lst = [`
			`audio_id + "_" + str(i * seg_dur) + "_" + str(i * seg_dur + seg_dur)`
			`for i in range(num_chunks)`
			`]`
			`return chunk_lst`

			`def _get_audio_info(self, wav_file: str,`
			`split_chunks: bool) -> List[List[str]]:`
			`waveform, sr = load_audio(wav_file)`
			`spk_id, sess_id, utt_id = wav_file.split("/")[-3:]`
			`audio_id = '-'.join([spk_id, sess_id, utt_id.split(".")[0]])`
			`audio_duration = waveform.shape[0] / sr`

			`ret = []`
			`if split_chunks: # Split into pieces of self.chunk_duration seconds.`
			`uniq_chunks_list = self._get_chunks(self.chunk_duration, audio_id,`
			`audio_duration)`

			`for chunk in uniq_chunks_list:`
			`s, e = chunk.split("_")[-2:] # Timestamps of start and end`
			`start_sample = int(float(s) * sr)`
			`end_sample = int(float(e) * sr)`
			`# id, duration, wav, start, stop, spk_id`
			`ret.append([`
			`chunk, audio_duration, wav_file, start_sample, end_sample,`
			`spk_id`
			`])`
			`else: # Keep whole audio.`
			`ret.append([`
			`audio_id, audio_duration, wav_file, 0, waveform.shape[0], spk_id`
			`])`
			`return ret`

			`def generate_csv(self,`
			`wav_files: List[str],`
			`output_file: str,`
			`split_chunks: bool=True):`
change the code style to s2t code style, test=doc 3 years ago			`print(f'Generating csv: {output_file}')`
convert jsonfile to csv file 3 years ago			`header = ["id", "duration", "wav", "start", "stop", "spk_id"]`
add state 0 to prepare the voxcele data and augment data 3 years ago			`# Note: this may occurs c++ execption, but the program will execute fine`
			`# so we can ignore the execption`
			`with Pool(cpu_count()) as p:`
move the csv vox format to paddleaudio, test=doc 3 years ago			`infos = list(`
			`tqdm(`
			`p.imap(lambda x: self._get_audio_info(x, split_chunks),`
			`wav_files),`
			`total=len(wav_files)))`

			`csv_lines = []`
			`for info in infos:`
			`csv_lines.extend(info)`

			`with open(output_file, mode="w") as csv_f:`
			`csv_writer = csv.writer(`
			`csv_f, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL)`
			`csv_writer.writerow(header)`
			`for line in csv_lines:`
			`csv_writer.writerow(line)`

			`def prepare_data(self):`
			`# Audio of speakers in veri_test_file should not be included in training set.`
change the code style to s2t code style, test=doc 3 years ago			`print("start to prepare the data csv file")`
add state 0 to prepare the voxcele data and augment data 3 years ago			`enroll_files = set()`
move the csv vox format to paddleaudio, test=doc 3 years ago			`test_files = set()`
			`# get the enroll and test audio file path`
			`with open(self.veri_test_file, 'r') as f:`
			`for line in f.readlines():`
			`_, enrol_file, test_file = line.strip().split(' ')`
add state 0 to prepare the voxcele data and augment data 3 years ago			`enroll_files.add(os.path.join(self.wav_path, enrol_file))`
move the csv vox format to paddleaudio, test=doc 3 years ago			`test_files.add(os.path.join(self.wav_path, test_file))`
add state 0 to prepare the voxcele data and augment data 3 years ago			`enroll_files = sorted(enroll_files)`
move the csv vox format to paddleaudio, test=doc 3 years ago			`test_files = sorted(test_files)`

			`# get the enroll and test speakers`
			`test_spks = set()`
add state 0 to prepare the voxcele data and augment data 3 years ago			`for file in (enroll_files + test_files):`
move the csv vox format to paddleaudio, test=doc 3 years ago			`spk = file.split('/wav/')[1].split('/')[0]`
			`test_spks.add(spk)`

			`# get all the train and dev audios file path`
			`audio_files = []`
			`speakers = set()`
change the code style to s2t code style, test=doc 3 years ago			`print("Getting file list...")`
add vox2 data into VoxCeleb class 3 years ago			`for path in [self.wav_path, self.vox2_base_path]:`
			`# if vox2 directory is not set and vox2 is not a directory`
			`# we will not process this directory`
			`if not path or not os.path.exists(path):`
change the code style to s2t code style, test=doc 3 years ago			`print(f"{path} is an invalid path, please check again, "`
			`"and we will ignore the vox2 base path")`
add vox2 data into VoxCeleb class 3 years ago			`continue`
move the csv vox format to paddleaudio, test=doc 3 years ago			`for file in glob.glob(`
			`os.path.join(path, "*", ".wav"), recursive=True):`
			`spk = file.split('/wav/')[1].split('/')[0]`
			`if spk in test_spks:`
			`continue`
			`speakers.add(spk)`
			`audio_files.append(file)`

change the code style to s2t code style, test=doc 3 years ago			`print(`
add state 0 to prepare the voxcele data and augment data 3 years ago			`f"start to generate the {os.path.join(self.meta_path, 'spk_id2label.txt')}"`
			`)`
move the csv vox format to paddleaudio, test=doc 3 years ago			`# encode the train and dev speakers label to spk_id2label.txt`
			`with open(os.path.join(self.meta_path, 'spk_id2label.txt'), 'w') as f:`
			`for label, spk_id in enumerate(`
			`sorted(speakers)): # 1211 vox1, 5994 vox2, 7205 vox1+2`
			`f.write(f'{spk_id} {label}\n')`

			`audio_files = sorted(audio_files)`
			`random.shuffle(audio_files)`
			`split_idx = int(self.split_ratio * len(audio_files))`
			`# split_ratio to train`
			`train_files, dev_files = audio_files[:split_idx], audio_files[`
			`split_idx:]`

			`self.generate_csv(train_files, os.path.join(self.csv_path, 'train.csv'))`
			`self.generate_csv(dev_files, os.path.join(self.csv_path, 'dev.csv'))`
add state 0 to prepare the voxcele data and augment data 3 years ago
move the csv vox format to paddleaudio, test=doc 3 years ago			`self.generate_csv(`
add state 0 to prepare the voxcele data and augment data 3 years ago			`enroll_files,`
refactor to compilance paddleaudio 3 years ago			`os.path.join(self.csv_path, 'enroll.csv'),`
move the csv vox format to paddleaudio, test=doc 3 years ago			`split_chunks=False)`
			`self.generate_csv(`
			`test_files,`
			`os.path.join(self.csv_path, 'test.csv'),`
			`split_chunks=False)`

			`def __getitem__(self, idx):`
			`return self._convert_to_record(idx)`

			`def __len__(self):`
			`return len(self._data)`