You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
PaddleSpeech/deepspeech/io/sampler.py

257 lines
10 KiB

# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import random
import tarfile
import logging
import numpy as np
from collections import namedtuple
from functools import partial
import paddle
from paddle.io import BatchSampler
from paddle.io import DistributedBatchSampler
from paddle import distributed as dist
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Names exported by `from <module> import *`: the two samplers defined below.
__all__ = [
"SortagradDistributedBatchSampler",
"SortagradBatchSampler",
]
def _batch_shuffle(indices, batch_size, epoch, clipped=False):
"""Put similarly-sized instances into minibatches for better efficiency
and make a batch-wise shuffle.
1. Sort the audio clips by duration.
2. Generate a random number `k`, k in [0, batch_size).
3. Randomly shift `k` instances in order to create different batches
for different epochs. Create minibatches.
4. Shuffle the minibatches.
:param indices: indexes. List of int.
:type indices: list
:param batch_size: Batch size. This size is also used for generate
a random number for batch shuffle.
:type batch_size: int
:param clipped: Whether to clip the heading (small shift) and trailing
(incomplete batch) instances.
:type clipped: bool
:return: Batch shuffled mainifest.
:rtype: list
"""
rng = np.random.RandomState(epoch)
shift_len = rng.randint(0, batch_size - 1)
batch_indices = list(zip(* [iter(indices[shift_len:])] * batch_size))
rng.shuffle(batch_indices)
batch_indices = [item for batch in batch_indices for item in batch]
assert (clipped == False)
if not clipped:
res_len = len(indices) - shift_len - len(batch_indices)
# when res_len is 0, will return whole list, len(List[-0:]) = len(List[:])
if res_len != 0:
batch_indices.extend(indices[-res_len:])
batch_indices.extend(indices[0:shift_len])
assert len(indices) == len(
batch_indices
), f"_batch_shuffle: {len(indices)} : {len(batch_indices)} : {res_len} - {shift_len}"
return batch_indices
class SortagradDistributedBatchSampler(DistributedBatchSampler):
    """Distributed batch sampler with an optional SortaGrad first epoch."""

    def __init__(self,
                 dataset,
                 batch_size,
                 num_replicas=None,
                 rank=None,
                 shuffle=False,
                 drop_last=False,
                 sortagrad=False,
                 shuffle_method="batch_shuffle"):
        """Sortagrad Sampler for multi gpus.

        Args:
            dataset (paddle.io.Dataset): dataset to draw indices from.
            batch_size (int): batch size for one gpu.
            num_replicas (int, optional): world size or number of gpus. Defaults to None.
            rank (int, optional): rank id. Defaults to None.
            shuffle (bool, optional): whether to shuffle. Defaults to False.
            drop_last (bool, optional): whether to drop a last batch that is smaller
                than the batch size. Defaults to False.
            sortagrad (bool, optional): if True (and `shuffle` is on), epoch 0 keeps
                the dataset order and later epochs shuffle as usual. Defaults to False.
            shuffle_method (str, optional): "instance_shuffle" or "batch_shuffle".
                Defaults to "batch_shuffle".
        """
        super().__init__(dataset, batch_size, num_replicas, rank, shuffle,
                         drop_last)
        self._shuffle_method = shuffle_method
        self._sortagrad = sortagrad

    def __iter__(self):
        """Yield this rank's batches (lists of dataset indices) for `self.epoch`.

        `self.epoch` is only read here, never modified.
        """
        indices = list(range(len(self.dataset)))
        # Repeat leading indices to reach `total_size`; the assert verifies it.
        indices += indices[:(self.total_size - len(indices))]
        assert len(indices) == self.total_size

        # Keep dataset order (sortagrad epoch) or shuffle the indices.
        if self.shuffle:
            if self._sortagrad and self.epoch == 0:
                logger.info(
                    f'rank: {dist.get_rank()} dataset sortagrad! epoch {self.epoch}'
                )
            else:
                logger.info(
                    f'rank: {dist.get_rank()} dataset shuffle! epoch {self.epoch}'
                )
                if self._shuffle_method == "instance_shuffle":
                    np.random.RandomState(self.epoch).shuffle(indices)
                elif self._shuffle_method == "batch_shuffle":
                    # Shuffle in groups of `batch_size * nranks` so the batches
                    # that all ranks process in the same step come from one
                    # contiguous run of indices. Otherwise example lengths can
                    # differ widely across ranks (e.g. rank0 max length 20,
                    # rank3 max length 1000), which destabilises the loss and
                    # can produce nan or inf gradients.
                    indices = _batch_shuffle(
                        indices,
                        self.batch_size * self.nranks,
                        self.epoch,
                        clipped=False)
                else:
                    raise ValueError("Unknown shuffle method %s." %
                                     self._shuffle_method)
        assert len(
            indices
        ) == self.total_size, f"batch shuffle examples error: {len(indices)} : {self.total_size}"

        if self.nranks > 1:
            # From every global batch take the `batch_size` slice belonging to
            # this rank; the incomplete last global batch is split evenly.
            global_batch_size = self.batch_size * self.nranks
            last_batch_size = self.total_size % global_batch_size
            assert last_batch_size % self.nranks == 0
            last_local_batch_size = last_batch_size // self.nranks
            full_end = len(indices) - last_batch_size

            rank_indices = []
            for start in range(self.local_rank * self.batch_size, full_end,
                               global_batch_size):
                rank_indices.extend(indices[start:start + self.batch_size])

            remainder = indices[full_end:]
            offset = self.local_rank * last_local_batch_size
            rank_indices.extend(
                remainder[offset:offset + last_local_batch_size])
            indices = rank_indices

        assert len(indices) == self.num_samples

        batch_indices = []
        for sample_idx in indices:
            batch_indices.append(sample_idx)
            if len(batch_indices) != self.batch_size:
                continue
            logger.info(
                f"rank: {dist.get_rank()} batch index: {batch_indices} ")
            yield batch_indices
            batch_indices = []
        # Emit the short trailing batch unless the caller asked to drop it.
        if batch_indices and not self.drop_last:
            yield batch_indices

    def __len__(self):
        """Return the number of batches this rank yields per epoch."""
        if self.drop_last:
            # Floor: an incomplete trailing batch is not counted.
            return self.num_samples // self.batch_size
        # Ceiling division: an incomplete trailing batch counts as one batch.
        return (self.num_samples + self.batch_size - 1) // self.batch_size
class SortagradBatchSampler(BatchSampler):
    """Single-process batch sampler with an optional SortaGrad first epoch."""

    def __init__(self,
                 dataset,
                 batch_size,
                 shuffle=False,
                 drop_last=False,
                 sortagrad=False,
                 shuffle_method="batch_shuffle"):
        """Sortagrad Sampler for one gpu.

        Args:
            dataset (paddle.io.Dataset): dataset to draw indices from.
            batch_size (int): batch size for one gpu.
            shuffle (bool, optional): whether to shuffle. Defaults to False.
            drop_last (bool, optional): whether to drop a last batch that is smaller
                than the batch size. Defaults to False.
            sortagrad (bool, optional): if True (and `shuffle` is on), epoch 0 keeps
                the dataset order and later epochs shuffle as usual. Defaults to False.
            shuffle_method (str, optional): "instance_shuffle" or "batch_shuffle".
                Defaults to "batch_shuffle".
        """
        # The base-class initializer is not called; every attribute used by
        # this class is set up here.
        self.dataset = dataset

        assert isinstance(batch_size, int) and batch_size > 0, \
            "batch_size should be a positive integer"
        self.batch_size = batch_size

        assert isinstance(shuffle, bool), \
            "shuffle should be a boolean value"
        self.shuffle = shuffle

        assert isinstance(drop_last, bool), \
            "drop_last should be a boolean number"
        self.drop_last = drop_last

        self._shuffle_method = shuffle_method
        self._sortagrad = sortagrad

        # Epoch counter: seeds the shuffle and is advanced by `__iter__`.
        self.epoch = 0
        # With a single process every sample is used exactly once per epoch.
        self.num_samples = len(self.dataset)
        self.total_size = self.num_samples

    def __iter__(self):
        """Yield batches (lists of dataset indices) for the current epoch.

        `self.epoch` is incremented once the generator has been exhausted.
        """
        indices = list(range(len(self.dataset)))
        # Repeat leading indices to reach `total_size`; the assert verifies it.
        indices += indices[:(self.total_size - len(indices))]
        assert len(indices) == self.total_size

        # Keep dataset order (sortagrad epoch) or shuffle the indices.
        if self.shuffle:
            if self._sortagrad and self.epoch == 0:
                logger.info(f'dataset sortagrad! epoch {self.epoch}')
            else:
                logger.info(f'dataset shuffle! epoch {self.epoch}')
                if self._shuffle_method == "instance_shuffle":
                    np.random.RandomState(self.epoch).shuffle(indices)
                elif self._shuffle_method == "batch_shuffle":
                    indices = _batch_shuffle(
                        indices, self.batch_size, self.epoch, clipped=False)
                else:
                    raise ValueError("Unknown shuffle method %s." %
                                     self._shuffle_method)
        assert len(
            indices
        ) == self.total_size, f"batch shuffle examples error: {len(indices)} : {self.total_size}"

        assert len(indices) == self.num_samples

        batch_indices = []
        for sample_idx in indices:
            batch_indices.append(sample_idx)
            if len(batch_indices) != self.batch_size:
                continue
            logger.info(
                f"rank: {dist.get_rank()} batch index: {batch_indices} ")
            yield batch_indices
            batch_indices = []
        # Emit the short trailing batch unless the caller asked to drop it.
        if batch_indices and not self.drop_last:
            yield batch_indices

        self.epoch += 1

    def __len__(self):
        """Return the number of batches yielded per epoch."""
        if self.drop_last:
            # Floor: an incomplete trailing batch is not counted.
            return self.num_samples // self.batch_size
        # Ceiling division: an incomplete trailing batch counts as one batch.
        return (self.num_samples + self.batch_size - 1) // self.batch_size