diff --git a/examples/voxceleb/sv0/local/data.sh b/examples/voxceleb/sv0/local/data.sh index da44d431b..d6010ec66 100755 --- a/examples/voxceleb/sv0/local/data.sh +++ b/examples/voxceleb/sv0/local/data.sh @@ -85,7 +85,7 @@ fi if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then # generate the vox2 manifest file from wav file - # we will generate the manifest.vox2 in ${dir}/vox2 directory + # we will generate the ${dir}/vox2/manifest.vox2 # because we use all the vox2 dataset to train, so collect all the vox2 data in one file echo "start generate the vox2 manifest files" python3 ${TARGET_DIR}/voxceleb/voxceleb2.py \ diff --git a/paddlespeech/vector/exps/ecapa_tdnn/train.py b/paddlespeech/vector/exps/ecapa_tdnn/train.py index 7ff6cb695..c1590c8f3 100644 --- a/paddlespeech/vector/exps/ecapa_tdnn/train.py +++ b/paddlespeech/vector/exps/ecapa_tdnn/train.py @@ -37,7 +37,6 @@ from paddlespeech.vector.modules.sid_model import SpeakerIdetification from paddlespeech.vector.training.scheduler import CyclicLRScheduler from paddlespeech.vector.training.seeding import seed_everything from paddlespeech.vector.utils.time import Timer -# from paddleaudio.datasets.voxceleb import VoxCeleb logger = Log(__name__).getlog() diff --git a/paddlespeech/vector/io/batch.py b/paddlespeech/vector/io/batch.py index b85563e7a..5049d1946 100644 --- a/paddlespeech/vector/io/batch.py +++ b/paddlespeech/vector/io/batch.py @@ -17,6 +17,17 @@ import paddle def waveform_collate_fn(batch): + """Collate the waveforms into a batch + + Args: + batch (list): the waveform list from the dataloader + each item of data includes several fields + feat: the utterance waveform data + label: the utterance label encoding data + + Returns: + dict: the batch data for the dataloader + """ waveforms = np.stack([item['feat'] for item in batch]) labels = np.stack([item['label'] for item in batch]) @@ -27,6 +38,18 @@ def feature_normalize(feats: paddle.Tensor, mean_norm: bool=True, std_norm: bool=True, convert_to_numpy: 
bool=False): + """Normalize the features of a single utterance + + Args: + feats (paddle.Tensor): the original utterance features, such as fbank or mfcc + mean_norm (bool, optional): mean norm flag. Defaults to True. + std_norm (bool, optional): std norm flag. Defaults to True. + convert_to_numpy (bool, optional): convert the paddle.Tensor to numpy + and do feature norm with numpy. Defaults to False. + + Returns: + paddle.Tensor: the normalized feats + """ # Features normalization if needed # numpy.mean is a little with paddle.mean about 1e-6 if convert_to_numpy: @@ -60,6 +83,16 @@ def pad_right_2d(x, target_length, axis=-1, mode='constant', **kwargs): def batch_feature_normalize(batch, mean_norm: bool=True, std_norm: bool=True): + """Normalize the features of a batch of utterances + + Args: + batch (list): the batch of features from the dataloader + mean_norm (bool, optional): mean normalization flag. Defaults to True. + std_norm (bool, optional): std normalization flag. Defaults to True. + + Returns: + dict: the normalized batch features + """ ids = [item['utt_id'] for item in batch] lengths = np.asarray([item['feat'].shape[1] for item in batch]) feats = list(