fix unstable loss and nan or inf grad for librispeech training

pull/538/head
Hui Zhang 5 years ago
parent 9a86f0672b
commit 00147dc060

@@ -33,22 +33,7 @@ __all__ = [
]
class SortagradDistributedBatchSampler(DistributedBatchSampler):
    def __init__(self,
                 dataset,
                 batch_size,
                 num_replicas=None,
                 rank=None,
                 shuffle=False,
                 drop_last=False,
                 sortagrad=False,
                 shuffle_method="batch_shuffle"):
        super().__init__(dataset, batch_size, num_replicas, rank, shuffle,
                         drop_last)
        self._sortagrad = sortagrad
        self._shuffle_method = shuffle_method

    def _batch_shuffle(self, indices, batch_size, clipped=False):

def _batch_shuffle(indices, batch_size, clipped=False):
    """Put similarly-sized instances into minibatches for better efficiency
    and make a batch-wise shuffle.
@@ -86,6 +71,34 @@ class SortagradDistributedBatchSampler(DistributedBatchSampler):
), f"_batch_shuffle: {len(indices)} : {len(batch_indices)} : {res_len} - {shift_len}"
return batch_indices
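For orientation, the module-level `_batch_shuffle` boils down to the following minimal sketch (assuming `indices` arrive pre-sorted by utterance duration; the epoch-seeded RNG and the assertion are simplified, and `batch_size` is assumed >= 2 so the shift range is valid):

import numpy as np

def batch_shuffle_sketch(indices, batch_size, seed=0):
    rng = np.random.RandomState(seed)
    # random head shift so batch boundaries differ between epochs
    shift_len = rng.randint(0, batch_size - 1)
    # group shifted indices into full batches of similar-length neighbors
    batches = list(zip(*[iter(indices[shift_len:])] * batch_size))
    # shuffle whole batches, not single instances, to keep lengths similar
    rng.shuffle(batches)
    out = [i for batch in batches for i in batch]
    # re-append the clipped tail and head so no example is lost
    res_len = len(indices) - shift_len - len(out)
    if res_len != 0:
        out.extend(indices[-res_len:])
    out.extend(indices[:shift_len])
    return out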
class SortagradDistributedBatchSampler(DistributedBatchSampler):
    def __init__(self,
                 dataset,
                 batch_size,
                 num_replicas=None,
                 rank=None,
                 shuffle=False,
                 drop_last=False,
                 sortagrad=False,
                 shuffle_method="batch_shuffle"):
        """Sortagrad sampler for multiple GPUs.

        Args:
            dataset (paddle.io.Dataset): dataset to sample from.
            batch_size (int): batch size for one GPU.
            num_replicas (int, optional): world size, i.e. the number of GPUs. Defaults to None.
            rank (int, optional): rank id. Defaults to None.
            shuffle (bool, optional): whether to shuffle the dataset. Defaults to False.
            drop_last (bool, optional): whether to drop the last batch when it is smaller than the batch size. Defaults to False.
            sortagrad (bool, optional): if True, sort instances in the first epoch, then shuffle as usual in later epochs. Defaults to False.
            shuffle_method (str, optional): shuffle method, "instance_shuffle" or "batch_shuffle". Defaults to "batch_shuffle".
        """
        super().__init__(dataset, batch_size, num_replicas, rank, shuffle,
                         drop_last)
        self._sortagrad = sortagrad
        self._shuffle_method = shuffle_method

    def __iter__(self):
        num_samples = len(self.dataset)
        indices = np.arange(num_samples).tolist()
@@ -103,8 +116,11 @@ class SortagradDistributedBatchSampler(DistributedBatchSampler):
                f'rank: {dist.get_rank()} dataset shuffle! epoch {self.epoch}'
            )
            if self._shuffle_method == "batch_shuffle":
                indices = self._batch_shuffle(
                    indices, self.batch_size, clipped=False)
                # use `batch_size * nranks`, otherwise the loss becomes unstable
                # and grads go nan or inf, since differing max example lengths
                # between batches make the loss diverge across ranks,
                # e.g. rank0 max length 20, rank3 max length 1000
                indices = _batch_shuffle(
                    indices, self.batch_size * self.nranks, clipped=False)
            elif self._shuffle_method == "instance_shuffle":
                np.random.RandomState(self.epoch).shuffle(indices)
            else:
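A toy illustration of the failure mode this hunk fixes (all numbers hypothetical): with duration-sorted data, shuffling in groups of `batch_size * nranks` guarantees every rank's slice for a given step comes from the same group of similar-length utterances, while shuffling in groups of only `batch_size` lets one rank draw a short batch and another a very long one:

import numpy as np

# hypothetical utterance lengths, already sorted by duration
lengths = sorted(np.random.RandomState(0).randint(1, 1000, size=16))
batch_size, nranks = 2, 4
group = batch_size * nranks  # 8

# shuffle whole groups of size batch_size * nranks
groups = [lengths[i:i + group] for i in range(0, len(lengths), group)]
np.random.RandomState(1).shuffle(groups)

# within one step, each rank reads a slice of the same group, so the
# per-rank max lengths stay comparable and the loss scale matches
step = groups[0]
for rank in range(nranks):
    shard = step[rank * batch_size:(rank + 1) * batch_size]
    print(f"rank{rank} max length: {max(shard)}")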
@@ -114,7 +130,7 @@ class SortagradDistributedBatchSampler(DistributedBatchSampler):
            indices
        ) == self.total_size, f"batch shuffle examples error: {len(indices)} : {self.total_size}"

        # subsample
        # slice `self.batch_size` examples by rank id
        def _get_indices_by_batch_size(indices):
            subsampled_indices = []
            last_batch_size = self.total_size % (self.batch_size * self.nranks)
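For reference, the rank-wise slicing that `_get_indices_by_batch_size` performs can be sketched standalone like this (ignoring the `last_batch_size` remainder handled above; `rank` and `nranks` stand in for the sampler's attributes):

def slice_by_rank(indices, batch_size, nranks, rank):
    # each rank takes `batch_size` consecutive indices, then skips the
    # `batch_size * (nranks - 1)` indices belonging to the other ranks
    stride = batch_size * nranks
    out = []
    for start in range(rank * batch_size, len(indices), stride):
        out.extend(indices[start:start + batch_size])
    return out

# 16 indices, batch_size 2, 4 ranks: rank1 gets [2, 3, 10, 11]
assert slice_by_rank(list(range(16)), 2, 4, 1) == [2, 3, 10, 11]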
@@ -163,6 +179,16 @@ class SortagradBatchSampler(BatchSampler):
                 drop_last=False,
                 sortagrad=False,
                 shuffle_method="batch_shuffle"):
        """Sortagrad sampler for a single GPU.

        Args:
            dataset (paddle.io.Dataset): dataset to sample from.
            batch_size (int): batch size for one GPU.
            shuffle (bool, optional): whether to shuffle the dataset. Defaults to False.
            drop_last (bool, optional): whether to drop the last batch when it is smaller than the batch size. Defaults to False.
            sortagrad (bool, optional): if True, sort instances in the first epoch, then shuffle as usual in later epochs. Defaults to False.
            shuffle_method (str, optional): shuffle method, "instance_shuffle" or "batch_shuffle". Defaults to "batch_shuffle".
        """
        self.dataset = dataset

        assert isinstance(batch_size, int) and batch_size > 0, \
@@ -181,45 +207,6 @@ class SortagradBatchSampler(BatchSampler):
        self._sortagrad = sortagrad
        self._shuffle_method = shuffle_method

    def _batch_shuffle(self, indices, batch_size, clipped=False):
        """Put similarly-sized instances into minibatches for better efficiency
        and make a batch-wise shuffle.

        1. Sort the audio clips by duration.
        2. Generate a random number `k`, k in [0, batch_size).
        3. Randomly shift `k` instances in order to create different batches
           for different epochs. Create minibatches.
        4. Shuffle the minibatches.

        :param indices: indexes. List of int.
        :type indices: list
        :param batch_size: Batch size. This size is also used to generate
                           a random number for batch shuffle.
        :type batch_size: int
        :param clipped: Whether to clip the heading (small shift) and trailing
                        (incomplete batch) instances.
        :type clipped: bool
        :return: Batch shuffled manifest.
        :rtype: list
        """
        rng = np.random.RandomState(self.epoch)
        # must shift at least by one
        shift_len = rng.randint(0, batch_size - 1)
        batch_indices = list(zip(*[iter(indices[shift_len:])] * batch_size))
        rng.shuffle(batch_indices)
        batch_indices = [item for batch in batch_indices for item in batch]
        assert (clipped == False)
        if not clipped:
            res_len = len(indices) - shift_len - len(batch_indices)
            # when res_len is 0, indices[-res_len:] returns the whole list,
            # since len(List[-0:]) == len(List[:])
            if res_len != 0:
                batch_indices.extend(indices[-res_len:])
            batch_indices.extend(indices[0:shift_len])
            assert len(indices) == len(
                batch_indices
            ), f"_batch_shuffle: {len(indices)} : {len(batch_indices)} : {res_len} - {shift_len}"
        return batch_indices

    def __iter__(self):
        num_samples = len(self.dataset)
        indices = np.arange(num_samples).tolist()
@@ -233,7 +220,7 @@ class SortagradBatchSampler(BatchSampler):
        else:
            logger.info(f'dataset shuffle! epoch {self.epoch}')
            if self._shuffle_method == "batch_shuffle":
                indices = self._batch_shuffle(
                indices = _batch_shuffle(
                    indices, self.batch_size, clipped=False)
            elif self._shuffle_method == "instance_shuffle":
                np.random.RandomState(self.epoch).shuffle(indices)
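Putting it together, one plausible way to use the fixed sampler in a training script (a sketch: `train_dataset`, `collate_fn`, and `num_epochs` are placeholders; `paddle.io.DataLoader`'s `batch_sampler` argument and the sampler's `set_epoch` come from Paddle's standard sampler API):

from paddle.io import DataLoader

batch_sampler = SortagradDistributedBatchSampler(
    train_dataset,           # placeholder manifest-backed dataset
    batch_size=32,
    shuffle=True,
    sortagrad=True,          # sort by duration in epoch 0 only
    shuffle_method="batch_shuffle")

train_loader = DataLoader(
    train_dataset,
    batch_sampler=batch_sampler,
    collate_fn=collate_fn,   # placeholder collate function
    num_workers=2)

for epoch in range(num_epochs):
    batch_sampler.set_epoch(epoch)  # reseeds the epoch-based shuffles
    for batch in train_loader:
        ...                         # forward/backward as usual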
