From 00147dc060c603c4a2ac5899acd2764d47fbce76 Mon Sep 17 00:00:00 2001
From: Hui Zhang
Date: Thu, 4 Mar 2021 06:26:26 +0000
Subject: [PATCH] fix instability loss and grad nan or inf for librispeech training

---
 deepspeech/io/sampler.py | 153 ++++++++++++++++++++--------------------
 1 file changed, 71 insertions(+), 82 deletions(-)

diff --git a/deepspeech/io/sampler.py b/deepspeech/io/sampler.py
index a08a3983d..a0cc469ff 100644
--- a/deepspeech/io/sampler.py
+++ b/deepspeech/io/sampler.py
@@ -33,6 +33,47 @@ __all__ = [
 ]
 
 
+def _batch_shuffle(indices, batch_size, epoch, clipped=False):
+    """Put similarly-sized instances into minibatches for better efficiency
+    and make a batch-wise shuffle.
+
+    1. Sort the audio clips by duration.
+    2. Generate a random number `k`, k in [0, batch_size - 1).
+    3. Randomly shift `k` instances in order to create different batches
+       for different epochs. Create minibatches.
+    4. Shuffle the minibatches.
+
+    :param indices: indexes. List of int.
+    :type indices: list
+    :param batch_size: Batch size. This size is also used to generate
+                       the random shift for batch shuffle.
+    :type batch_size: int
+    :param epoch: Epoch number, used to seed the RNG so that all ranks
+                  produce the same shuffle.
+    :type epoch: int
+    :param clipped: Whether to clip the heading (small shift) and trailing
+                    (incomplete batch) instances.
+    :type clipped: bool
+    :return: Batch-shuffled manifest.
+    :rtype: list
+    """
+    rng = np.random.RandomState(epoch)
+    shift_len = rng.randint(0, batch_size - 1)
+    batch_indices = list(zip(*[iter(indices[shift_len:])] * batch_size))
+    rng.shuffle(batch_indices)
+    batch_indices = [item for batch in batch_indices for item in batch]
+    assert not clipped
+    if not clipped:
+        res_len = len(indices) - shift_len - len(batch_indices)
+        # guard res_len != 0: indices[-0:] is the whole list, not an empty slice
+        if res_len != 0:
+            batch_indices.extend(indices[-res_len:])
+        batch_indices.extend(indices[0:shift_len])
+    assert len(indices) == len(
+        batch_indices
+    ), f"_batch_shuffle: {len(indices)} : {len(batch_indices)} : {res_len} - {shift_len}"
+    return batch_indices
+
+
 class SortagradDistributedBatchSampler(DistributedBatchSampler):
     def __init__(self,
                  dataset,
@@ -43,49 +82,23 @@ class SortagradDistributedBatchSampler(DistributedBatchSampler):
                  drop_last=False,
                  sortagrad=False,
                  shuffle_method="batch_shuffle"):
+        """Sortagrad sampler for multiple gpus.
+
+        Args:
+            dataset (paddle.io.Dataset): dataset to sample from.
+            batch_size (int): batch size for one gpu.
+            num_replicas (int, optional): world size, i.e. the number of gpus. Defaults to None.
+            rank (int, optional): rank id. Defaults to None.
+            shuffle (bool, optional): whether to shuffle the dataset. Defaults to False.
+            drop_last (bool, optional): whether to drop the last batch when it is smaller than the batch size. Defaults to False.
+            sortagrad (bool, optional): if True, sort by utterance duration in the first epoch, then shuffle as usual. Defaults to False.
+            shuffle_method (str, optional): shuffle method, "instance_shuffle" or "batch_shuffle". Defaults to "batch_shuffle".
+        """
         super().__init__(dataset, batch_size, num_replicas, rank, shuffle,
                          drop_last)
         self._sortagrad = sortagrad
         self._shuffle_method = shuffle_method
 
-    def _batch_shuffle(self, indices, batch_size, clipped=False):
-        """Put similarly-sized instances into minibatches for better efficiency
-        and make a batch-wise shuffle.
-
-        1. Sort the audio clips by duration.
-        2. Generate a random number `k`, k in [0, batch_size).
-        3. Randomly shift `k` instances in order to create different batches
-           for different epochs. Create minibatches.
-        4. Shuffle the minibatches.
-
-        :param indices: indexes. List of int.
-        :type indices: list
-        :param batch_size: Batch size. This size is also used for generate
-                           a random number for batch shuffle.
-        :type batch_size: int
-        :param clipped: Whether to clip the heading (small shift) and trailing
-                        (incomplete batch) instances.
-        :type clipped: bool
-        :return: Batch shuffled mainifest.
-        :rtype: list
-        """
-        rng = np.random.RandomState(self.epoch)
-        shift_len = rng.randint(0, batch_size - 1)
-        batch_indices = list(zip(* [iter(indices[shift_len:])] * batch_size))
-        rng.shuffle(batch_indices)
-        batch_indices = [item for batch in batch_indices for item in batch]
-        assert (clipped == False)
-        if not clipped:
-            res_len = len(indices) - shift_len - len(batch_indices)
-            # when res_len is 0, will return whole list, len(List[-0:]) = len(List[:])
-            if res_len != 0:
-                batch_indices.extend(indices[-res_len:])
-            batch_indices.extend(indices[0:shift_len])
-        assert len(indices) == len(
-            batch_indices
-        ), f"_batch_shuffle: {len(indices)} : {len(batch_indices)} : {res_len} - {shift_len}"
-        return batch_indices
-
     def __iter__(self):
         num_samples = len(self.dataset)
         indices = np.arange(num_samples).tolist()
@@ -103,8 +116,11 @@ class SortagradDistributedBatchSampler(DistributedBatchSampler):
                 f'rank: {dist.get_rank()} dataset shuffle! epoch {self.epoch}'
             )
             if self._shuffle_method == "batch_shuffle":
-                indices = self._batch_shuffle(
-                    indices, self.batch_size, clipped=False)
+                # Shuffle in blocks of `batch_size * nranks`; otherwise the loss is
+                # unstable and gradients go nan or inf, since example lengths would
+                # differ wildly across ranks in the same step,
+                # e.g. rank0 max length 20 while rank3 max length 1000.
+                indices = _batch_shuffle(
+                    indices, self.batch_size * self.nranks, self.epoch, clipped=False)
             elif self._shuffle_method == "instance_shuffle":
                 np.random.RandomState(self.epoch).shuffle(indices)
             else:
@@ -114,7 +130,7 @@ class SortagradDistributedBatchSampler(DistributedBatchSampler):
             indices
         ) == self.total_size, f"batch shuffle examples error: {len(indices)} : {self.total_size}"
 
-        # subsample
+        # slice `self.batch_size` examples for each rank by rank id
         def _get_indices_by_batch_size(indices):
             subsampled_indices = []
             last_batch_size = self.total_size % (self.batch_size * self.nranks)
@@ -163,6 +179,16 @@ class SortagradBatchSampler(BatchSampler):
                  drop_last=False,
                  sortagrad=False,
                  shuffle_method="batch_shuffle"):
+        """Sortagrad sampler for one gpu.
+
+        Args:
+            dataset (paddle.io.Dataset): dataset to sample from.
+            batch_size (int): batch size for one gpu.
+            shuffle (bool, optional): whether to shuffle the dataset. Defaults to False.
+            drop_last (bool, optional): whether to drop the last batch when it is smaller than the batch size. Defaults to False.
+            sortagrad (bool, optional): if True, sort by utterance duration in the first epoch, then shuffle as usual. Defaults to False.
+            shuffle_method (str, optional): shuffle method, "instance_shuffle" or "batch_shuffle". Defaults to "batch_shuffle".
+        """
         self.dataset = dataset
 
         assert isinstance(batch_size, int) and batch_size > 0, \
@@ -181,45 +207,6 @@ class SortagradBatchSampler(BatchSampler):
         self._sortagrad = sortagrad
         self._shuffle_method = shuffle_method
 
-    def _batch_shuffle(self, indices, batch_size, clipped=False):
-        """Put similarly-sized instances into minibatches for better efficiency
-        and make a batch-wise shuffle.
-
-        1. Sort the audio clips by duration.
-        2. Generate a random number `k`, k in [0, batch_size).
-        3. Randomly shift `k` instances in order to create different batches
-           for different epochs. Create minibatches.
-        4. Shuffle the minibatches.
-
-        :param indices: indexes. List of int.
-        :type indices: list
-        :param batch_size: Batch size. This size is also used for generate
-                           a random number for batch shuffle.
-        :type batch_size: int
-        :param clipped: Whether to clip the heading (small shift) and trailing
-                        (incomplete batch) instances.
-        :type clipped: bool
-        :return: Batch shuffled mainifest.
-        :rtype: list
-        """
-        rng = np.random.RandomState(self.epoch)
-        # must shift at leat by one
-        shift_len = rng.randint(0, batch_size - 1)
-        batch_indices = list(zip(* [iter(indices[shift_len:])] * batch_size))
-        rng.shuffle(batch_indices)
-        batch_indices = [item for batch in batch_indices for item in batch]
-        assert (clipped == False)
-        if not clipped:
-            res_len = len(indices) - shift_len - len(batch_indices)
-            # when res_len is 0, will return whole list, len(List[-0:]) = len(List[:])
-            if res_len != 0:
-                batch_indices.extend(indices[-res_len:])
-            batch_indices.extend(indices[0:shift_len])
-        assert len(indices) == len(
-            batch_indices
-        ), f"_batch_shuffle: {len(indices)} : {len(batch_indices)} : {res_len} - {shift_len}"
-        return batch_indices
-
     def __iter__(self):
         num_samples = len(self.dataset)
         indices = np.arange(num_samples).tolist()
@@ -233,7 +220,7 @@ class SortagradBatchSampler(BatchSampler):
         else:
             logger.info(f'dataset shuffle! epoch {self.epoch}')
             if self._shuffle_method == "batch_shuffle":
-                indices = self._batch_shuffle(
-                    indices, self.batch_size, clipped=False)
+                indices = _batch_shuffle(
+                    indices, self.batch_size, self.epoch, clipped=False)
             elif self._shuffle_method == "instance_shuffle":
                 np.random.RandomState(self.epoch).shuffle(indices)
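
For reference, below is a minimal, self-contained sketch of the batch-shuffle idea this patch hoists into a module-level helper. The values for `indices`, `batch_size`, `nranks`, and `epoch` are made up for illustration; only the shuffle logic mirrors the patched `_batch_shuffle`.

import numpy as np

def batch_shuffle(indices, batch_size, epoch):
    # Seed from the epoch so every rank computes the identical order.
    rng = np.random.RandomState(epoch)
    shift_len = rng.randint(0, batch_size - 1)
    # Group the shifted tail into batch_size-sized tuples (remainder dropped),
    # shuffle whole batches, then flatten back into one index list.
    batches = list(zip(*[iter(indices[shift_len:])] * batch_size))
    rng.shuffle(batches)
    shuffled = [i for batch in batches for i in batch]
    # Re-attach the incomplete trailing batch and the shifted head so that
    # every index is still emitted exactly once.
    res_len = len(indices) - shift_len - len(shuffled)
    if res_len != 0:
        shuffled.extend(indices[-res_len:])
    shuffled.extend(indices[:shift_len])
    return shuffled

indices = list(range(20))     # 20 clips already sorted by duration
batch_size, nranks = 4, 2     # illustrative values, not the trainer config
print(batch_shuffle(indices, batch_size * nranks, epoch=0))

Because each shuffled block of `batch_size * nranks` indices is later split across ranks, every rank sees utterances of similar duration in a given step, so no rank's loss is dominated by a much longer batch; that is why the distributed sampler now passes `self.batch_size * self.nranks` instead of `self.batch_size`.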