# PaddleSpeech/paddlespeech/vector/io/dataset.py

# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import random
from dataclasses import dataclass
from dataclasses import fields

from paddle.io import Dataset

from paddlespeech.audio import load as load_audio
from paddlespeech.audio.compliance.librosa import melspectrogram
from paddlespeech.s2t.utils.log import Log
# module-level logger for this file, obtained through the project's Log helper
logger = Log(__name__).getlog()

# the audio meta info in the vector CSVDataset
# utt_id: the utterance segment name
# duration: utterance segment time
# wav: utterance file path
# start: start point in the original wav file
# stop: stop point in the original wav file
# label: the utterance segment's label id


@dataclass
class meta_info:
    """The audio meta info of one sample in the vector CSVDataset.

    One instance is built per csv row by ``CSVDataset.load_data_csv``.

    Args:
        utt_id (str): the utterance segment name
        duration (float): utterance segment time
            (presumably in seconds -- TODO confirm against the csv producer)
        wav (str): utterance file path
        start (int): start point in the original wav file; used as the lower
            slice bound on the loaded waveform in ``CSVDataset.convert_to_record``
        stop (int): stop point in the original wav file; used as the upper
            slice bound on the loaded waveform in ``CSVDataset.convert_to_record``
        label (str): the utterance segment's label id
    """
    utt_id: str
    duration: float
    wav: str
    start: int
    stop: int
    label: str


# Feature types supported by the csv dataset, keyed by the `feat_type` name.
#   raw:            no feature function (None), the pcm sample points are returned as-is
#   melspectrogram: the fbank feature computed by `melspectrogram`
feat_funcs = dict(raw=None, melspectrogram=melspectrogram)


class CSVDataset(Dataset):
    """Dataset of audio segments described by a csv manifest file.

    Every csv row is parsed into a ``meta_info`` entry when the dataset is
    created. Indexing the dataset loads the referenced audio file, cuts the
    requested segment and returns a dict holding the meta fields plus the
    extracted ``feat``.
    """

    def __init__(self,
                 csv_path,
                 label2id_path=None,
                 config=None,
                 random_chunk=True,
                 feat_type: str="raw",
                 n_train_snts: int=-1,
                 **kwargs):
        """Implement the CSV Dataset

        Args:
            csv_path (str): csv dataset file path
            label2id_path (str): the utterance label to integer id map file path
            config (CfgNode): yaml config. When it is set, ``config.random_chunk``
                                and ``config.chunk_duration`` are read in
                                ``convert_to_record`` to select a random chunk.
            random_chunk (bool): stored as ``self.random_chunk``.
                                NOTE: the chunk selection in ``convert_to_record``
                                is driven by ``config.random_chunk``, not by this flag.
            feat_type (str): dataset feature type. if it is raw, it return pcm data.
            n_train_snts (int): select the n_train_snts sample from the dataset. 
                                if n_train_snts = -1, dataset will load all the sample.
                                Default value is -1.
            kwargs : feature type args, forwarded to the feature function
        """
        super().__init__()
        self.csv_path = csv_path
        self.label2id_path = label2id_path
        self.config = config
        self.random_chunk = random_chunk
        self.feat_type = feat_type
        self.n_train_snts = n_train_snts
        self.feat_config = kwargs
        self.id2label = {}
        self.label2id = {}
        self.data = self.load_data_csv()
        self.load_speaker_to_label()

    def load_data_csv(self):
        """Load the csv dataset content and return it as a list of samples.

        Each csv row has six comma separated fields, that is
        audio_id or utt_id, audio duration, wav file path, segment start point,
        segment stop point and utterance label.
        The first line of the file is treated as the header and skipped;
        blank lines are ignored.
        Note in training period, the utterance label must has a map to integer id in label2id_path 

        Returns:
            list: the csv data with meta_info type, truncated to the first
                  n_train_snts samples when n_train_snts > 0
        """
        data = []

        with open(self.csv_path, 'r') as rf:
            # the first line is the csv header, not a sample
            next(rf, None)
            for line in rf:
                line = line.strip()
                if not line:
                    # a blank line (e.g. trailing newline at the end of the
                    # file) carries no sample and would break the unpacking
                    continue
                audio_id, duration, wav, start, stop, spk_id = line.split(',')
                data.append(
                    meta_info(audio_id,
                              float(duration), wav,
                              int(start), int(stop), spk_id))

        if self.n_train_snts > 0:
            # slicing past the end is harmless, so no explicit min() is needed
            data = data[:self.n_train_snts]

        return data

    def load_speaker_to_label(self):
        """Load the utterance label map content.
        In vector domain, we call the utterance label as speaker label.
        The speaker label is real speaker label in speaker verification domain,
        and in language identification is language label.

        Each non-blank line of the map file is "<label_name> <label_id>".
        The content fills both ``self.label2id`` and ``self.id2label``.
        When no label2id_path is set, a warning is logged and both maps stay empty.
        """
        if not self.label2id_path:
            logger.warning("No speaker id to label file")
            return

        with open(self.label2id_path, 'r') as f:
            for line in f:
                line = line.strip()
                if not line:
                    # ignore blank lines instead of failing on the unpacking
                    continue
                label_name, label_id = line.split(' ')
                label_id = int(label_id)
                self.label2id[label_name] = label_id
                self.id2label[label_id] = label_name

    def convert_to_record(self, idx: int):
        """convert the dataset sample to training record the CSV Dataset

        Args:
            idx (int) : the request index in all the dataset

        Returns:
            dict: all the meta_info fields of the sample, plus
                  'feat': the selected waveform segment, or the output of the
                          feature function when feat_type is not raw.
                  'label' is replaced by its integer id when the label map is loaded.
        """
        sample = self.data[idx]

        # copy every dataclass field of the sample into a plain dict
        record = {
            field.name: getattr(sample, field.name)
            for field in fields(sample)
        }

        waveform, sr = load_audio(record['wav'])

        # random select a chunk audio samples from the audio
        if self.config and self.config.random_chunk:
            num_wav_samples = waveform.shape[0]
            num_chunk_samples = int(self.config.chunk_duration * sr)
            # random.randint raises ValueError on an empty range, which happens
            # when the audio is not longer than the chunk; in that case start
            # from the beginning and let the slice below clamp to the audio end
            max_start = num_wav_samples - num_chunk_samples - 1
            start = random.randint(0, max_start) if max_start >= 0 else 0
            stop = start + num_chunk_samples
        else:
            start = record['start']
            stop = record['stop']

        # we only return the waveform as feat
        waveform = waveform[start:stop]

        # all available feature type is in feat_funcs
        assert self.feat_type in feat_funcs.keys(), \
            f"Unknown feat_type: {self.feat_type}, it must be one in {list(feat_funcs.keys())}"
        feat_func = feat_funcs[self.feat_type]
        # a None feature function means the raw waveform is the feature
        feat = feat_func(
            waveform, sr=sr, **self.feat_config) if feat_func else waveform

        record['feat'] = feat
        if self.label2id:
            # map the string label to its integer id
            record['label'] = self.label2id[record['label']]

        return record

    def __getitem__(self, idx):
        """Return the specific index sample

        Args:
            idx (int) : the request index in all the dataset

        Returns:
            dict: the record built by convert_to_record
        """
        return self.convert_to_record(idx)

    def __len__(self):
        """Return the dataset length

        Returns:
            int: the length num of the dataset
        """
        return len(self.data)
add voxceleb and rirs noise dataset 3 years ago			`# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.`
			`#`
			`# Licensed under the Apache License, Version 2.0 (the "License");`
			`# you may not use this file except in compliance with the License.`
			`# You may obtain a copy of the License at`
			`#`
			`# http://www.apache.org/licenses/LICENSE-2.0`
			`#`
			`# Unless required by applicable law or agreed to in writing, software`
			`# distributed under the License is distributed on an "AS IS" BASIS,`
			`# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`# See the License for the specific language governing permissions and`
			`# limitations under the License.`
add vector csv dataset format, test=doc 3 years ago			`from dataclasses import dataclass`
			`from dataclasses import fields`
format 3 years ago
add voxceleb and rirs noise dataset 3 years ago			`from paddle.io import Dataset`
format code 3 years ago
Refactor paddleaudio to paddlespeech.audio 2 years ago			`from paddlespeech.audio import load as load_audio`
			`from paddlespeech.audio.compliance.librosa import melspectrogram`
add vector csv dataset format, test=doc 3 years ago			`from paddlespeech.s2t.utils.log import Log`
			`logger = Log(__name__).getlog()`

			`# the audio meta info in the vector CSVDataset`
			`# utt_id: the utterance segment name`
			`# duration: utterance segment time`
			`# wav: utterance file path`
			`# start: start point in the original wav file`
			`# stop: stop point in the original wav file`
change the vector csv.spk_id to csv.label, test=doc 3 years ago			`# label: the utterance segment's label id`
add vector csv dataset format, test=doc 3 years ago
add voxceleb and rirs noise dataset 3 years ago
add vector csv dataset format, test=doc 3 years ago			`@dataclass`
			`class meta_info:`
add some annotations, test=doc 3 years ago			`"""the audio meta info in the vector CSVDataset`

			`Args:`
			`utt_id (str): the utterance segment name`
			`duration (float): utterance segment time`
			`wav (str): utterance file path`
			`start (int): start point in the original wav file`
			`stop (int): stop point in the original wav file`
			`lab_id (str): the utterance segment's label id`
			`"""`
add vector csv dataset format, test=doc 3 years ago			`utt_id: str`
			`duration: float`
			`wav: str`
			`start: int`
			`stop: int`
change the vector csv.spk_id to csv.label, test=doc 3 years ago			`label: str`
add voxceleb and rirs noise dataset 3 years ago

test.py update the CSVDataset, test=doc 3 years ago			`# csv dataset support feature type`
			`# raw: return the pcm data sample point`
			`# melspectrogram: fbank feature`
			`feat_funcs = {`
			`'raw': None,`
			`'melspectrogram': melspectrogram,`
			`}`


add vector csv dataset format, test=doc 3 years ago			`class CSVDataset(Dataset):`
test.py update the CSVDataset, test=doc 3 years ago			`def __init__(self,`
			`csv_path,`
			`label2id_path=None,`
			`config=None,`
			`random_chunk=True,`
			`feat_type: str="raw",`
wrap the embedding mean and std norm, test=doc 3 years ago			`n_train_snts: int=-1,`
test.py update the CSVDataset, test=doc 3 years ago			`**kwargs):`
add some annotations, test=doc 3 years ago			`"""Implement the CSV Dataset`

			`Args:`
			`csv_path (str): csv dataset file path`
change the vector csv.spk_id to csv.label, test=doc 3 years ago			`label2id_path (str): the utterance label to integer id map file path`
add some annotations, test=doc 3 years ago			`config (CfgNode): yaml config`
test.py update the CSVDataset, test=doc 3 years ago			`feat_type (str): dataset feature type. if it is raw, it return pcm data.`
wrap the embedding mean and std norm, test=doc 3 years ago			`n_train_snts (int): select the n_train_snts sample from the dataset.`
			`if n_train_snts = -1, dataset will load all the sample.`
			`Default value is -1.`
test.py update the CSVDataset, test=doc 3 years ago			`kwargs : feature type args`
add some annotations, test=doc 3 years ago			`"""`
add voxceleb and rirs noise dataset 3 years ago			`super().__init__()`
			`self.csv_path = csv_path`
change the vector csv.spk_id to csv.label, test=doc 3 years ago			`self.label2id_path = label2id_path`
add voxceleb and rirs noise dataset 3 years ago			`self.config = config`
test.py update the CSVDataset, test=doc 3 years ago			`self.random_chunk = random_chunk`
			`self.feat_type = feat_type`
wrap the embedding mean and std norm, test=doc 3 years ago			`self.n_train_snts = n_train_snts`
test.py update the CSVDataset, test=doc 3 years ago			`self.feat_config = kwargs`
change the vector csv.spk_id to csv.label, test=doc 3 years ago			`self.id2label = {}`
			`self.label2id = {}`
add voxceleb and rirs noise dataset 3 years ago			`self.data = self.load_data_csv()`
add some annotations, test=doc 3 years ago			`self.load_speaker_to_label()`
add voxceleb and rirs noise dataset 3 years ago
			`def load_data_csv(self):`
add some annotations, test=doc 3 years ago			`"""Load the csv dataset content and store them in the data property`
			`the csv dataset's format has six fields,`
			`that is audio_id or utt_id, audio duration, segment start point, segment stop point`
			`and utterance label.`
change the vector csv.spk_id to csv.label, test=doc 3 years ago			`Note in training period, the utterance label must has a map to integer id in label2id_path`
wrap the embedding mean and std norm, test=doc 3 years ago
			`Returns:`
			`list: the csv data with meta_info type`
add some annotations, test=doc 3 years ago			`"""`
add voxceleb and rirs noise dataset 3 years ago			`data = []`
add vector csv dataset format, test=doc 3 years ago
add voxceleb and rirs noise dataset 3 years ago			`with open(self.csv_path, 'r') as rf:`
			`for line in rf.readlines()[1:]:`
			`audio_id, duration, wav, start, stop, spk_id = line.strip(`
			`).split(',')`
			`data.append(`
add vector csv dataset format, test=doc 3 years ago			`meta_info(audio_id,`
			`float(duration), wav,`
			`int(start), int(stop), spk_id))`
wrap the embedding mean and std norm, test=doc 3 years ago			`if self.n_train_snts > 0:`
			`sample_num = min(self.n_train_snts, len(data))`
			`data = data[0:sample_num]`

add voxceleb and rirs noise dataset 3 years ago			`return data`

			`def load_speaker_to_label(self):`
add some annotations, test=doc 3 years ago			`"""Load the utterance label map content.`
			`In vector domain, we call the utterance label as speaker label.`
			`The speaker label is real speaker label in speaker verification domain,`
			`and in language identification is language label.`
			`"""`
change the vector csv.spk_id to csv.label, test=doc 3 years ago			`if not self.label2id_path:`
add vector csv dataset format, test=doc 3 years ago			`logger.warning("No speaker id to label file")`
			`return`
change the vector csv.spk_id to csv.label, test=doc 3 years ago
			`with open(self.label2id_path, 'r') as f:`
add voxceleb and rirs noise dataset 3 years ago			`for line in f.readlines():`
change the vector csv.spk_id to csv.label, test=doc 3 years ago			`label_name, label_id = line.strip().split(' ')`
			`self.label2id[label_name] = int(label_id)`
			`self.id2label[int(label_id)] = label_name`
add voxceleb and rirs noise dataset 3 years ago
			`def convert_to_record(self, idx: int):`
add some annotations, test=doc 3 years ago			`"""convert the dataset sample to training record the CSV Dataset`

			`Args:`
			`idx (int) : the request index in all the dataset`
			`"""`
add voxceleb and rirs noise dataset 3 years ago			`sample = self.data[idx]`

			`record = {}`
			# To show all fields in a namedtuple: `type(sample)._fields`
add vector csv dataset format, test=doc 3 years ago			`for field in fields(sample):`
			`record[field.name] = getattr(sample, field.name)`
add voxceleb and rirs noise dataset 3 years ago
			`waveform, sr = load_audio(record['wav'])`

			`# random select a chunk audio samples from the audio`
add vector csv dataset format, test=doc 3 years ago			`if self.config and self.config.random_chunk:`
add voxceleb and rirs noise dataset 3 years ago			`num_wav_samples = waveform.shape[0]`
			`num_chunk_samples = int(self.config.chunk_duration * sr)`
			`start = random.randint(0, num_wav_samples - num_chunk_samples - 1)`
			`stop = start + num_chunk_samples`
			`else:`
			`start = record['start']`
			`stop = record['stop']`

			`# we only return the waveform as feat`
			`waveform = waveform[start:stop]`
test.py update the CSVDataset, test=doc 3 years ago
			`# all availabel feature type is in feat_funcs`
			`assert self.feat_type in feat_funcs.keys(), \`
			`f"Unknown feat_type: {self.feat_type}, it must be one in {list(feat_funcs.keys())}"`
			`feat_func = feat_funcs[self.feat_type]`
			`feat = feat_func(`
			`waveform, sr=sr, **self.feat_config) if feat_func else waveform`

			`record.update({'feat': feat})`
change the vector csv.spk_id to csv.label, test=doc 3 years ago			`if self.label2id:`
			`record.update({'label': self.label2id[record['label']]})`
add voxceleb and rirs noise dataset 3 years ago
			`return record`

			`def __getitem__(self, idx):`
add some annotations, test=doc 3 years ago			`"""Return the specific index sample`

			`Args:`
			`idx (int) : the request index in all the dataset`
			`"""`
add voxceleb and rirs noise dataset 3 years ago			`return self.convert_to_record(idx)`

			`def __len__(self):`
add some annotations, test=doc 3 years ago			`"""Return the dataset length`
wrap the embedding mean and std norm, test=doc 3 years ago
			`Returns:`
			`int: the length num of the dataset`
add some annotations, test=doc 3 years ago			`"""`
add voxceleb and rirs noise dataset 3 years ago			`return len(self.data)`