Fix unstable loss and NaN/Inf gradients in LibriSpeech training

5 years ago · 00147dc060
parent 9a86f0672b
commit 00147dc060
1 changed files with 68 additions and 81 deletions
--- a/deepspeech/io/sampler.py
+++ b/deepspeech/io/sampler.py
@ -33,22 +33,7 @@ __all__ = [
 ]
-class SortagradDistributedBatchSampler(DistributedBatchSampler):
+def _batch_shuffle(indices, batch_size, clipped=False):
    def __init__(self,
                 dataset,
                 batch_size,
                 num_replicas=None,
                 rank=None,
                 shuffle=False,
                 drop_last=False,
                 sortagrad=False,
                 shuffle_method="batch_shuffle"):
        super().__init__(dataset, batch_size, num_replicas, rank, shuffle,
                         drop_last)
        self._sortagrad = sortagrad
        self._shuffle_method = shuffle_method
    def _batch_shuffle(self, indices, batch_size, clipped=False):
    """Put similarly-sized instances into minibatches for better efficiency
    and make a batch-wise shuffle.
@ -86,6 +71,34 @@ class SortagradDistributedBatchSampler(DistributedBatchSampler):
        ), f"_batch_shuffle: {len(indices)} : {len(batch_indices)} : {res_len} - {shift_len}"
    return batch_indices
 class SortagradDistributedBatchSampler(DistributedBatchSampler):
    def __init__(
            self,
            dataset,
            batch_size,
            num_replicas=None,
            rank=None,
            shuffle=False,
            drop_last=False,
            sortagrad=False,
            shuffle_method="batch_shuffle"):
        """Build the multi-GPU sortagrad sampler.

        The first six arguments are handed unchanged to the parent sampler's
        constructor; the last two are stored on the instance.

        Args:
            dataset (paddle.io.Dataset): dataset forwarded to the parent sampler.
            batch_size (int): batch size for a single gpu.
            num_replicas (int, optional): world size, i.e. the number of gpus. Defaults to None.
            rank (int, optional): rank id of this process. Defaults to None.
            shuffle (bool, optional): whether to shuffle the examples. Defaults to False.
            drop_last (bool, optional): whether to drop the last batch when it is smaller than
                the batch size. Defaults to False.
            sortagrad (bool, optional): if True, apply sortagrad in the first epoch and shuffle
                as usual afterwards. Defaults to False.
            shuffle_method (str, optional): either "instance_shuffle" or "batch_shuffle".
                Defaults to "batch_shuffle".
        """
        # Positional order matters: it mirrors the parent constructor's signature.
        parent_args = (dataset, batch_size, num_replicas, rank, shuffle,
                       drop_last)
        super().__init__(*parent_args)

        # Sampler-specific options, kept private to this class.
        self._shuffle_method = shuffle_method
        self._sortagrad = sortagrad
    def __iter__(self):
        num_samples = len(self.dataset)
        indices = np.arange(num_samples).tolist()
@ -103,8 +116,11 @@ class SortagradDistributedBatchSampler(DistributedBatchSampler):
                    f'rank: {dist.get_rank()} dataset shuffle! epoch {self.epoch}'
                )
                if self._shuffle_method == "batch_shuffle":
-                    indices = self._batch_shuffle(
+                    # use `batch_size * nranks` here, otherwise the loss becomes unstable and the grad turns nan or inf,
-                        indices, self.batch_size, clipped=False)
+                    # since example lengths that differ between batches make the loss unstable across ranks,
                    # e.g. rank0 maxlength 20, rank3 maxlength 1000
                    indices = _batch_shuffle(
                        indices, self.batch_size * self.nranks, clipped=False)
                elif self._shuffle_method == "instance_shuffle":
                    np.random.RandomState(self.epoch).shuffle(indices)
                else:
@ -114,7 +130,7 @@ class SortagradDistributedBatchSampler(DistributedBatchSampler):
            indices
        ) == self.total_size, f"batch shuffle examples error: {len(indices)} : {self.total_size}"
-        # subsample
+        # slice `self.batch_size` examples by rank id
        def _get_indices_by_batch_size(indices):
            subsampled_indices = []
            last_batch_size = self.total_size % (self.batch_size * self.nranks)
@ -163,6 +179,16 @@ class SortagradBatchSampler(BatchSampler):
                 drop_last=False,
                 sortagrad=False,
                 shuffle_method="batch_shuffle"):
        """Sortagrad Sampler for one gpu.
        Args:
            dataset (paddle.io.Dataset): dataset whose example indices are sampled.
            batch_size (int): batch size for one gpu
            shuffle (bool, optional): whether to shuffle the examples. Defaults to False.
            drop_last (bool, optional): whether to drop the last batch when it is smaller than the batch size. Defaults to False.
            sortagrad (bool, optional): if True, apply sortagrad in the first epoch, then shuffle as usual. Defaults to False.
            shuffle_method (str, optional): shuffle method, "instance_shuffle" or "batch_shuffle". Defaults to "batch_shuffle".
        """
        self.dataset = dataset
        assert isinstance(batch_size, int) and batch_size > 0, \
@ -181,45 +207,6 @@ class SortagradBatchSampler(BatchSampler):
        self._sortagrad = sortagrad
        self._shuffle_method = shuffle_method
    def _batch_shuffle(self, indices, batch_size, clipped=False):
        """Put similarly-sized instances into minibatches for better efficiency
        and make a batch-wise shuffle.
        1. Sort the audio clips by duration.
        2. Generate a random number `k`, k in [0, batch_size).
        3. Randomly shift `k` instances in order to create different batches
           for different epochs. Create minibatches.
        4. Shuffle the minibatches.
        :param indices: indexes. List of int.
        :type indices: list
        :param batch_size: Batch size. This size is also used for generate
                           a random number for batch shuffle.
        :type batch_size: int
        :param clipped: Whether to clip the heading (small shift) and trailing
                        (incomplete batch) instances.
        :type clipped: bool
        :return: Batch shuffled mainifest.
        :rtype: list
        """
        rng = np.random.RandomState(self.epoch)
        # must shift at leat by one
        shift_len = rng.randint(0, batch_size - 1)
        batch_indices = list(zip(* [iter(indices[shift_len:])] * batch_size))
        rng.shuffle(batch_indices)
        batch_indices = [item for batch in batch_indices for item in batch]
        assert (clipped == False)
        if not clipped:
            res_len = len(indices) - shift_len - len(batch_indices)
            # when res_len is 0, will return whole list, len(List[-0:]) = len(List[:])
            if res_len != 0:
                batch_indices.extend(indices[-res_len:])
            batch_indices.extend(indices[0:shift_len])
            assert len(indices) == len(
                batch_indices
            ), f"_batch_shuffle: {len(indices)} : {len(batch_indices)} : {res_len} - {shift_len}"
        return batch_indices
    def __iter__(self):
        num_samples = len(self.dataset)
        indices = np.arange(num_samples).tolist()
@ -233,7 +220,7 @@ class SortagradBatchSampler(BatchSampler):
            else:
                logger.info(f'dataset shuffle! epoch {self.epoch}')
                if self._shuffle_method == "batch_shuffle":
-                    indices = self._batch_shuffle(
+                    indices = _batch_shuffle(
                        indices, self.batch_size, clipped=False)
                elif self._shuffle_method == "instance_shuffle":
                    np.random.RandomState(self.epoch).shuffle(indices)