diff --git a/examples/voxceleb/sv0/conf/ecapa_tdnn.yaml b/examples/voxceleb/sv0/conf/ecapa_tdnn.yaml index 78c34cd9..6117b354 100644 --- a/examples/voxceleb/sv0/conf/ecapa_tdnn.yaml +++ b/examples/voxceleb/sv0/conf/ecapa_tdnn.yaml @@ -4,9 +4,9 @@ # we should explicitly specify the wav path of vox2 audio data converted from m4a vox2_base_path: augment: True -batch_size: 16 +batch_size: 32 num_workers: 2 -num_speakers: 7205 # 1211 vox1, 5994 vox2, 7205 vox1+2, test speakers: 41 +num_speakers: 1211 # 1211 vox1, 5994 vox2, 7205 vox1+2, test speakers: 41 shuffle: True skip_prep: False split_ratio: 0.9 diff --git a/examples/voxceleb/sv0/local/data.sh b/examples/voxceleb/sv0/local/data.sh index fd50cd40..3f41b9c8 100755 --- a/examples/voxceleb/sv0/local/data.sh +++ b/examples/voxceleb/sv0/local/data.sh @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -stage=7 +stage=0 stop_stage=100 . ${MAIN_ROOT}/utils/parse_options.sh || exit -1; @@ -32,7 +32,7 @@ mkdir -p ${dir} # Generally the `MAIN_ROOT` refers to the root of PaddleSpeech, # which is defined in the path.sh -# And we will download the +# And we will download the voxceleb data and rirs noise to ${MAIN_ROOT}/dataset TARGET_DIR=${MAIN_ROOT}/dataset mkdir -p ${TARGET_DIR} @@ -98,7 +98,7 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then # generate the vox csv file # Currently, our training system use csv file for dataset echo "convert the json format to csv format to be compatible with training process" - python3 local/make_csv_dataset_from_json.py\ + python3 local/make_vox_csv_dataset_from_json.py\ --train "${dir}/vox1/manifest.dev" \ --test "${dir}/vox1/manifest.test" \ --target_dir "${dir}/vox/" \ diff --git a/examples/voxceleb/sv0/local/make_rirs_noise_csv_dataset_from_json.py b/examples/voxceleb/sv0/local/make_rirs_noise_csv_dataset_from_json.py index 513b53bf..26015aed 100644 --- a/examples/voxceleb/sv0/local/make_rirs_noise_csv_dataset_from_json.py +++ b/examples/voxceleb/sv0/local/make_rirs_noise_csv_dataset_from_json.py @@ -20,31 +20,29 @@ import csv import os from typing import List -import paddle import tqdm from yacs.config import CfgNode -from paddlespeech.s2t.utils.log import Log -from paddlespeech.vector.training.seeding import seed_everything -logger = Log(__name__).getlog() from paddleaudio import load as load_audio -from paddleaudio import save as save_wav - +from paddlespeech.s2t.utils.log import Log +from paddlespeech.vector.utils.utils import get_chunks -def get_chunks(seg_dur, audio_id, audio_duration): - num_chunks = int(audio_duration / seg_dur) # all in milliseconds +logger = Log(__name__).getlog() - chunk_lst = [ - audio_id + "_" + str(i * seg_dur) + "_" + str(i * seg_dur + seg_dur) - for i in range(num_chunks) - ] - return chunk_lst +def get_chunks_list(wav_file: str, + split_chunks: bool, + base_path: str, + chunk_duration: float=3.0) -> List[List[str]]: + """Get the single audio file info -def get_audio_info(wav_file: str, - split_chunks: bool, - base_path: str, - chunk_duration: float=3.0) -> List[List[str]]: + Args: + wav_file (list): the wav audio file and get this audio segment info list + split_chunks (bool): audio split flag + base_path (str): the audio base path + chunk_duration (float): the chunk duration. + if set the split_chunks, we split the audio into multi-chunks segment. + """ waveform, sr = load_audio(wav_file) audio_id = wav_file.split("/rir_noise/")[-1].split(".")[0] audio_duration = waveform.shape[0] / sr @@ -57,13 +55,16 @@ def get_audio_info(wav_file: str, s, e = chunk.split("_")[-2:] # Timestamps of start and end start_sample = int(float(s) * sr) end_sample = int(float(e) * sr) - new_wav_file = os.path.join(base_path, - audio_id + f'_chunk_{idx+1:02}.wav') - save_wav(waveform[start_sample:end_sample], sr, new_wav_file) - # id, duration, new_wav - ret.append([chunk, chunk_duration, new_wav_file]) + + # currently, all vector csv data format use one representation + # id, duration, wav, start, stop, spk_id + ret.append([ + chunk, audio_duration, wav_file, start_sample, end_sample, + "noise" + ]) else: # Keep whole audio. - ret.append([audio_id, audio_duration, wav_file]) + ret.append( + [audio_id, audio_duration, wav_file, 0, waveform.shape[0], "noise"]) return ret @@ -71,12 +72,20 @@ def generate_csv(wav_files, output_file: str, base_path: str, split_chunks: bool=True): - print(f'Generating csv: {output_file}') - header = ["id", "duration", "wav"] + """Prepare the csv file according the wav files + + Args: + wav_files (list): all the audio list to prepare the csv file + output_file (str): the output csv file + config (CfgNode): yaml configuration content + split_chunks (bool): audio split flag + """ + logger.info(f'Generating csv: {output_file}') + header = ["utt_id", "duration", "wav", "start", "stop", "lab_id"] csv_lines = [] for item in tqdm.tqdm(wav_files): csv_lines.extend( - get_audio_info( + get_chunks_list( item, base_path=base_path, split_chunks=split_chunks)) if not os.path.exists(os.path.dirname(output_file)): @@ -91,11 +100,12 @@ def generate_csv(wav_files, def prepare_data(args, config): - # stage0: set the cpu device, - # all data prepare process will be done in cpu mode - paddle.device.set_device("cpu") - # set the random seed, it is a must for multiprocess training - seed_everything(config.seed) + """Convert the jsonline format to csv format + + Args: + args (argparse.Namespace): scripts args + config (CfgNode): yaml configuration content + """ # if external config set the skip_prep flat, we will do nothing if config.skip_prep: return @@ -119,6 +129,7 @@ def prepare_data(args, config): noise_files.append(os.path.join(base_path, noise_file)) csv_path = os.path.join(args.data_dir, 'csv') + logger.info(f"csv path: {csv_path}") generate_csv( rir_files, os.path.join(csv_path, 'rir.csv'), base_path=base_path) generate_csv( diff --git a/examples/voxceleb/sv0/local/make_vox_csv_dataset_from_json.py b/examples/voxceleb/sv0/local/make_vox_csv_dataset_from_json.py index 6873596c..576a3c8b 100644 --- a/examples/voxceleb/sv0/local/make_vox_csv_dataset_from_json.py +++ b/examples/voxceleb/sv0/local/make_vox_csv_dataset_from_json.py @@ -21,51 +21,34 @@ import json import os import random -import paddle import tqdm from yacs.config import CfgNode from paddleaudio import load as load_audio from paddlespeech.s2t.utils.log import Log -from paddlespeech.vector.training.seeding import seed_everything -logger = Log(__name__).getlog() - - -def get_chunks(seg_dur, audio_id, audio_duration): - """Get all chunk segments from a utterance +from paddlespeech.vector.utils.utils import get_chunks - Args: - seg_dur (float): segment chunk duration - audio_id (str): utterance name - audio_duration (float): utterance duration - - Returns: - List: all the chunk segments - """ - num_chunks = int(audio_duration / seg_dur) # all in milliseconds - chunk_lst = [ - audio_id + "_" + str(i * seg_dur) + "_" + str(i * seg_dur + seg_dur) - for i in range(num_chunks) - ] - return chunk_lst +logger = Log(__name__).getlog() def prepare_csv(wav_files, output_file, config, split_chunks=True): """Prepare the csv file according the wav files Args: - dataset_list (list): all the dataset to get the test utterances - verification_file (str): voxceleb1 trial file + wav_files (list): all the audio list to prepare the csv file + output_file (str): the output csv file + config (CfgNode): yaml configuration content + split_chunks (bool): audio split flag """ if not os.path.exists(os.path.dirname(output_file)): os.makedirs(os.path.dirname(output_file)) csv_lines = [] - header = ["id", "duration", "wav", "start", "stop", "spk_id"] + header = ["utt_id", "duration", "wav", "start", "stop", "lab_id"] # voxceleb meta info for each training utterance segment # we extract a segment from a utterance to train # and the segment' period is between start and stop time point in the original wav file # each field in the meta means as follows: - # id: the utterance segment name + # utt_id: the utterance segment name # duration: utterance segment time # wav: utterance file path # start: start point in the original wav file @@ -194,11 +177,9 @@ def prepare_data(args, config): args (argparse.Namespace): scripts args config (CfgNode): yaml configuration content """ - # stage0: set the cpu device, - # all data prepare process will be done in cpu mode - paddle.device.set_device("cpu") - # set the random seed, it is a must for multiprocess training - seed_everything(config.seed) + # stage0: set the random seed + random.seed(config.seed) + # if external config set the skip_prep flat, we will do nothing if config.skip_prep: return diff --git a/paddlespeech/vector/exps/ecapa_tdnn/train.py b/paddlespeech/vector/exps/ecapa_tdnn/train.py index 9f48a4c4..d6919d23 100644 --- a/paddlespeech/vector/exps/ecapa_tdnn/train.py +++ b/paddlespeech/vector/exps/ecapa_tdnn/train.py @@ -29,7 +29,7 @@ from paddlespeech.vector.io.augment import waveform_augment from paddlespeech.vector.io.batch import batch_pad_right from paddlespeech.vector.io.batch import feature_normalize from paddlespeech.vector.io.batch import waveform_collate_fn -from paddlespeech.vector.io.dataset import VoxCelebDataset +from paddlespeech.vector.io.dataset import CSVDataset from paddlespeech.vector.models.ecapa_tdnn import EcapaTdnn from paddlespeech.vector.modules.loss import AdditiveAngularMargin from paddlespeech.vector.modules.loss import LogSoftmaxWrapper @@ -55,11 +55,11 @@ def main(args, config): # stage2: data prepare, such vox1 and vox2 data, and augment noise data and pipline # note: some cmd must do in rank==0, so wo will refactor the data prepare code - train_dataset = VoxCelebDataset( + train_dataset = CSVDataset( csv_path=os.path.join(args.data_dir, "vox/csv/train.csv"), spk_id2label_path=os.path.join(args.data_dir, "vox/meta/spk_id2label.txt")) - dev_dataset = VoxCelebDataset( + dev_dataset = CSVDataset( csv_path=os.path.join(args.data_dir, "vox/csv/dev.csv"), spk_id2label_path=os.path.join(args.data_dir, "vox/meta/spk_id2label.txt")) @@ -74,7 +74,7 @@ def main(args, config): # stage4: build the speaker verification train instance with backbone model model = SpeakerIdetification( - backbone=ecapa_tdnn, num_class=VoxCeleb.num_speakers) + backbone=ecapa_tdnn, num_class=config.num_speakers) # stage5: build the optimizer, we now only construct the AdamW optimizer # 140000 is single gpu steps @@ -148,6 +148,7 @@ def main(args, config): train_reader_cost = 0.0 train_feat_cost = 0.0 train_run_cost = 0.0 + train_misce_cost = 0.0 reader_start = time.time() for batch_idx, batch in enumerate(train_loader): @@ -203,12 +204,14 @@ def main(args, config): train_run_cost += time.time() - train_start # stage 9-8: Calculate average loss per batch - avg_loss += loss.numpy()[0] + train_misce_start = time.time() + avg_loss = loss.item() # stage 9-9: Calculate metrics, which is one-best accuracy preds = paddle.argmax(logits, axis=1) num_corrects += (preds == labels).numpy().sum() num_samples += feats.shape[0] + timer.count() # step plus one in timer # stage 9-10: print the log information only on 0-rank per log-freq batchs @@ -227,6 +230,7 @@ def main(args, config): train_feat_cost / config.log_interval) print_msg += ' avg_train_cost: {:.5f} sec,'.format( train_run_cost / config.log_interval) + print_msg += ' lr={:.4E} step/sec={:.2f} | ETA {}'.format( lr, timer.timing, timer.eta) logger.info(print_msg) diff --git a/paddlespeech/vector/io/augment.py b/paddlespeech/vector/io/augment.py index 8b7ccc11..0aa89c6a 100644 --- a/paddlespeech/vector/io/augment.py +++ b/paddlespeech/vector/io/augment.py @@ -14,6 +14,7 @@ # this is modified from SpeechBrain # https://github.com/speechbrain/speechbrain/blob/085be635c07f16d42cd1295045bc46c407f1e15b/speechbrain/lobes/augment.py import math +import os from typing import List import numpy as np @@ -22,13 +23,12 @@ import paddle.nn as nn import paddle.nn.functional as F from paddlespeech.s2t.utils.log import Log -from paddlespeech.vector.io.dataset import RIRSNoiseDataset +from paddlespeech.vector.io.dataset import CSVDataset from paddlespeech.vector.io.signal_processing import compute_amplitude from paddlespeech.vector.io.signal_processing import convolve1d from paddlespeech.vector.io.signal_processing import dB_to_amplitude from paddlespeech.vector.io.signal_processing import notch_filter from paddlespeech.vector.io.signal_processing import reverberate -# from paddleaudio.datasets.rirs_noises import OpenRIRNoise logger = Log(__name__).getlog() @@ -510,7 +510,7 @@ class AddNoise(nn.Layer): assert w >= 0, f'Target length {target_length} is less than origin length {x.shape[0]}' return np.pad(x, [0, w], mode=mode, **kwargs) - ids = [item['id'] for item in batch] + ids = [item['utt_id'] for item in batch] lengths = np.asarray([item['feat'].shape[0] for item in batch]) waveforms = list( map(lambda x: pad(x, max(max_length, lengths.max().item())), @@ -590,7 +590,7 @@ class AddReverb(nn.Layer): assert w >= 0, f'Target length {target_length} is less than origin length {x.shape[0]}' return np.pad(x, [0, w], mode=mode, **kwargs) - ids = [item['id'] for item in batch] + ids = [item['utt_id'] for item in batch] lengths = np.asarray([item['feat'].shape[0] for item in batch]) waveforms = list( map(lambda x: pad(x, lengths.max().item()), @@ -840,10 +840,10 @@ def build_augment_pipeline(target_dir=None) -> List[paddle.nn.Layer]: List[paddle.nn.Layer]: all augment process """ logger.info("start to build the augment pipeline") - noise_dataset = RIRSNoiseDataset(csv_path=os.path.join( - target_dir, "rir_noise/csv/noise.csv")) - rir_dataset = OpenRIRNoise(csv_path=os.path.join(target_dir, - "rir_noise/csv/rir.csv")) + noise_dataset = CSVDataset(csv_path=os.path.join(target_dir, + "rir_noise/csv/noise.csv")) + rir_dataset = CSVDataset(csv_path=os.path.join(target_dir, + "rir_noise/csv/rir.csv")) wavedrop = TimeDomainSpecAugment( sample_rate=16000, diff --git a/paddlespeech/vector/io/dataset.py b/paddlespeech/vector/io/dataset.py index c8a4299e..ea2106cd 100644 --- a/paddlespeech/vector/io/dataset.py +++ b/paddlespeech/vector/io/dataset.py @@ -11,18 +11,38 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import collections - +from dataclasses import dataclass +from dataclasses import fields from paddle.io import Dataset from paddleaudio import load as load_audio +from paddlespeech.s2t.utils.log import Log +logger = Log(__name__).getlog() + +# the audio meta info in the vector CSVDataset +# utt_id: the utterance segment name +# duration: utterance segment time +# wav: utterance file path +# start: start point in the original wav file +# stop: stop point in the original wav file +# lab_id: the utterance segment's label id + +@dataclass +class meta_info: + utt_id: str + duration: float + wav: str + start: int + stop: int + lab_id: str -class VoxCelebDataset(Dataset): - meta_info = collections.namedtuple( - 'META_INFO', ('id', 'duration', 'wav', 'start', 'stop', 'spk_id')) - def __init__(self, csv_path, spk_id2label_path, config): +class CSVDataset(Dataset): + # meta_info = collections.namedtuple( + # 'META_INFO', ('id', 'duration', 'wav', 'start', 'stop', 'spk_id')) + + def __init__(self, csv_path, spk_id2label_path=None, config=None): super().__init__() self.csv_path = csv_path self.spk_id2label_path = spk_id2label_path @@ -32,34 +52,41 @@ class VoxCelebDataset(Dataset): def load_data_csv(self): data = [] + with open(self.csv_path, 'r') as rf: for line in rf.readlines()[1:]: audio_id, duration, wav, start, stop, spk_id = line.strip( ).split(',') data.append( - self.meta_info(audio_id, - float(duration), wav, - int(start), int(stop), spk_id)) + meta_info(audio_id, + float(duration), wav, + int(start), int(stop), spk_id)) return data def load_speaker_to_label(self): + if not self.spk_id2label_path: + logger.warning("No speaker id to label file") + return + spk_id2label = {} with open(self.spk_id2label_path, 'r') as f: for line in f.readlines(): spk_id, label = line.strip().split(' ') - self.spk_id2label[spk_id] = int(label) + spk_id2label[spk_id] = int(label) + + return spk_id2label def convert_to_record(self, idx: int): sample = self.data[idx] record = {} # To show all fields in a namedtuple: `type(sample)._fields` - for field in type(sample)._fields: - record[field] = getattr(sample, field) + for field in fields(sample): + record[field.name] = getattr(sample, field.name) waveform, sr = load_audio(record['wav']) # random select a chunk audio samples from the audio - if self.config.random_chunk: + if self.config and self.config.random_chunk: num_wav_samples = waveform.shape[0] num_chunk_samples = int(self.config.chunk_duration * sr) start = random.randint(0, num_wav_samples - num_chunk_samples - 1) @@ -71,46 +98,9 @@ class VoxCelebDataset(Dataset): # we only return the waveform as feat waveform = waveform[start:stop] record.update({'feat': waveform}) - record.update({'label': self.spk_id2label[record['spk_id']]}) - - return record - - def __getitem__(self, idx): - return self.convert_to_record(idx) - - def __len__(self): - return len(self.data) - - -class RIRSNoiseDataset(Dataset): - meta_info = collections.namedtuple('META_INFO', ('id', 'duration', 'wav')) - - def __init__(self, csv_path): - super().__init__() - self.csv_path = csv_path - self.data = self.load_csv_data() + if self.spk_id2label: + record.update({'label': self.spk_id2label[record['lab_id']]}) - def load_csv_data(self): - data = [] - with open(self.csv_path, 'r') as rf: - for line in rf.readlines()[1:]: - audio_id, duration, wav = line.strip().split(',') - data.append(self.meta_info(audio_id, float(duration), wav)) - - random.shuffle(data) - return data - - def convert_to_record(self, idx: int): - sample = self.data[idx] - - record = {} - # To show all fields in a namedtuple: `type(sample)._fields` - for field in type(sample)._fields: - record[field] = getattr(sample, field) - - waveform, sr = load_audio(record['wav']) - - record.update({'feat': waveform}) return record def __getitem__(self, idx): diff --git a/paddlespeech/vector/utils/utils.py b/paddlespeech/vector/utils/utils.py new file mode 100644 index 00000000..892b19c7 --- /dev/null +++ b/paddlespeech/vector/utils/utils.py @@ -0,0 +1,32 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +def get_chunks(seg_dur, audio_id, audio_duration): + """Get all chunk segments from a utterance + + Args: + seg_dur (float): segment chunk duration + audio_id (str): utterance name + audio_duration (float): utterance duration + + Returns: + List: all the chunk segments + """ + num_chunks = int(audio_duration / seg_dur) # all in milliseconds + chunk_lst = [ + audio_id + "_" + str(i * seg_dur) + "_" + str(i * seg_dur + seg_dur) + for i in range(num_chunks) + ] + return chunk_lst