From 00147dc060c603c4a2ac5899acd2764d47fbce76 Mon Sep 17 00:00:00 2001
From: Hui Zhang
Date: Thu, 4 Mar 2021 06:26:26 +0000
Subject: [PATCH] fix instability loss and grad nan or inf for librispeech training

---
 deepspeech/io/sampler.py | 153 ++++++++++++++++++++--------------------
 1 file changed, 71 insertions(+), 82 deletions(-)

diff --git a/deepspeech/io/sampler.py b/deepspeech/io/sampler.py
index a08a3983d..a0cc469ff 100644
--- a/deepspeech/io/sampler.py
+++ b/deepspeech/io/sampler.py
@@ -33,6 +33,47 @@ __all__ = [
 ]
 
 
+def _batch_shuffle(indices, batch_size, epoch, clipped=False):
+    """Put similarly-sized instances into minibatches for better efficiency
+    and make a batch-wise shuffle.
+
+    1. Sort the audio clips by duration.
+    2. Generate a random number `k`, k in [0, batch_size - 1).
+    3. Randomly shift `k` instances in order to create different batches
+       for different epochs. Create minibatches.
+    4. Shuffle the minibatches.
+
+    :param indices: indexes. List of int.
+    :type indices: list
+    :param batch_size: Batch size. This size is also used to generate
+                       the random shift for batch shuffle.
+    :type batch_size: int
+    :param epoch: Epoch number, used to seed the RNG so that all ranks
+                  produce the same shuffle.
+    :type epoch: int
+    :param clipped: Whether to clip the heading (small shift) and trailing
+                    (incomplete batch) instances.
+    :type clipped: bool
+    :return: Batch-shuffled manifest.
+    :rtype: list
+    """
+    rng = np.random.RandomState(epoch)
+    shift_len = rng.randint(0, batch_size - 1)
+    batch_indices = list(zip(*[iter(indices[shift_len:])] * batch_size))
+    rng.shuffle(batch_indices)
+    batch_indices = [item for batch in batch_indices for item in batch]
+    assert not clipped
+    if not clipped:
+        res_len = len(indices) - shift_len - len(batch_indices)
+        # guard res_len != 0: indices[-0:] is the whole list, not an empty slice
+        if res_len != 0:
+            batch_indices.extend(indices[-res_len:])
+        batch_indices.extend(indices[0:shift_len])
+    assert len(indices) == len(
+        batch_indices
+    ), f"_batch_shuffle: {len(indices)} : {len(batch_indices)} : {res_len} - {shift_len}"
+    return batch_indices
+
+
 class SortagradDistributedBatchSampler(DistributedBatchSampler):
     def __init__(self,
                  dataset,
@@ -43,49 +82,23 @@ class SortagradDistributedBatchSampler(DistributedBatchSampler):
                  drop_last=False,
                  sortagrad=False,
                  shuffle_method="batch_shuffle"):
+        """Sortagrad sampler for multiple gpus.
+
+        Args:
+            dataset (paddle.io.Dataset): dataset to sample from.
+            batch_size (int): batch size for one gpu.
+            num_replicas (int, optional): world size, i.e. the number of gpus. Defaults to None.
+            rank (int, optional): rank id. Defaults to None.
+            shuffle (bool, optional): whether to shuffle the dataset. Defaults to False.
+            drop_last (bool, optional): whether to drop the last batch when it is smaller than the batch size. Defaults to False.
+            sortagrad (bool, optional): if True, sort by utterance duration in the first epoch, then shuffle as usual. Defaults to False.
+            shuffle_method (str, optional): shuffle method, "instance_shuffle" or "batch_shuffle". Defaults to "batch_shuffle".
+        """
         super().__init__(dataset, batch_size, num_replicas, rank, shuffle,
                          drop_last)
         self._sortagrad = sortagrad
         self._shuffle_method = shuffle_method
 
-    def _batch_shuffle(self, indices, batch_size, clipped=False):
-        """Put similarly-sized instances into minibatches for better efficiency
-        and make a batch-wise shuffle.
-
-        1. Sort the audio clips by duration.
-        2. Generate a random number `k`, k in [0, batch_size).
-        3. Randomly shift `k` instances in order to create different batches
-           for different epochs. Create minibatches.
-        4. Shuffle the minibatches.
-
-        :param indices: indexes. List of int.
-        :type indices: list
-        :param batch_size: Batch size. This size is also used for generate
-                           a random number for batch shuffle.
-        :type batch_size: int
-        :param clipped: Whether to clip the heading (small shift) and trailing
-                        (incomplete batch) instances.
-        :type clipped: bool
-        :return: Batch shuffled mainifest.
-        :rtype: list
-        """
-        rng = np.random.RandomState(self.epoch)
-        shift_len = rng.randint(0, batch_size - 1)
-        batch_indices = list(zip(* [iter(indices[shift_len:])] * batch_size))
-        rng.shuffle(batch_indices)
-        batch_indices = [item for batch in batch_indices for item in batch]
-        assert (clipped == False)
-        if not clipped:
-            res_len = len(indices) - shift_len - len(batch_indices)
-            # when res_len is 0, will return whole list, len(List[-0:]) = len(List[:])
-            if res_len != 0:
-                batch_indices.extend(indices[-res_len:])
-            batch_indices.extend(indices[0:shift_len])
-        assert len(indices) == len(
-            batch_indices
-        ), f"_batch_shuffle: {len(indices)} : {len(batch_indices)} : {res_len} - {shift_len}"
-        return batch_indices
-
     def __iter__(self):
         num_samples = len(self.dataset)
         indices = np.arange(num_samples).tolist()
@@ -103,8 +116,11 @@ class SortagradDistributedBatchSampler(DistributedBatchSampler):
                 f'rank: {dist.get_rank()} dataset shuffle! epoch {self.epoch}'
             )
             if self._shuffle_method == "batch_shuffle":
-                indices = self._batch_shuffle(
-                    indices, self.batch_size, clipped=False)
+                # Shuffle in blocks of `batch_size * nranks`; otherwise the loss is
+                # unstable and gradients go nan or inf, since example lengths would
+                # differ wildly across ranks in the same step,
+                # e.g. rank0 max length 20 while rank3 max length 1000.
+                indices = _batch_shuffle(
+                    indices, self.batch_size * self.nranks, self.epoch, clipped=False)
             elif self._shuffle_method == "instance_shuffle":
                 np.random.RandomState(self.epoch).shuffle(indices)
             else:
@@ -114,7 +130,7 @@ class SortagradDistributedBatchSampler(DistributedBatchSampler):
             indices
         ) == self.total_size, f"batch shuffle examples error: {len(indices)} : {self.total_size}"
 
-        # subsample
+        # slice `self.batch_size` examples for each rank by rank id
         def _get_indices_by_batch_size(indices):
             subsampled_indices = []
             last_batch_size = self.total_size % (self.batch_size * self.nranks)
@@ -163,6 +179,16 @@ class SortagradBatchSampler(BatchSampler):
                  drop_last=False,
                  sortagrad=False,
                  shuffle_method="batch_shuffle"):
+        """Sortagrad sampler for one gpu.
+
+        Args:
+            dataset (paddle.io.Dataset): dataset to sample from.
+            batch_size (int): batch size for one gpu.
+            shuffle (bool, optional): whether to shuffle the dataset. Defaults to False.
+            drop_last (bool, optional): whether to drop the last batch when it is smaller than the batch size. Defaults to False.
+            sortagrad (bool, optional): if True, sort by utterance duration in the first epoch, then shuffle as usual. Defaults to False.
+            shuffle_method (str, optional): shuffle method, "instance_shuffle" or "batch_shuffle". Defaults to "batch_shuffle".
+        """
         self.dataset = dataset
 
         assert isinstance(batch_size, int) and batch_size > 0, \
@@ -181,45 +207,6 @@ class SortagradBatchSampler(BatchSampler):
         self._sortagrad = sortagrad
         self._shuffle_method = shuffle_method
 
-    def _batch_shuffle(self, indices, batch_size, clipped=False):
-        """Put similarly-sized instances into minibatches for better efficiency
-        and make a batch-wise shuffle.
-
-        1. Sort the audio clips by duration.
-        2. Generate a random number `k`, k in [0, batch_size).
-        3. Randomly shift `k` instances in order to create different batches
-           for different epochs. Create minibatches.
-        4. Shuffle the minibatches.
-
-        :param indices: indexes. List of int.
-        :type indices: list
-        :param batch_size: Batch size. This size is also used for generate
-                           a random number for batch shuffle.
-        :type batch_size: int
-        :param clipped: Whether to clip the heading (small shift) and trailing
-                        (incomplete batch) instances.
-        :type clipped: bool
-        :return: Batch shuffled mainifest.
-        :rtype: list
-        """
-        rng = np.random.RandomState(self.epoch)
-        # must shift at leat by one
-        shift_len = rng.randint(0, batch_size - 1)
-        batch_indices = list(zip(* [iter(indices[shift_len:])] * batch_size))
-        rng.shuffle(batch_indices)
-        batch_indices = [item for batch in batch_indices for item in batch]
-        assert (clipped == False)
-        if not clipped:
-            res_len = len(indices) - shift_len - len(batch_indices)
-            # when res_len is 0, will return whole list, len(List[-0:]) = len(List[:])
-            if res_len != 0:
-                batch_indices.extend(indices[-res_len:])
-            batch_indices.extend(indices[0:shift_len])
-        assert len(indices) == len(
-            batch_indices
-        ), f"_batch_shuffle: {len(indices)} : {len(batch_indices)} : {res_len} - {shift_len}"
-        return batch_indices
-
     def __iter__(self):
         num_samples = len(self.dataset)
         indices = np.arange(num_samples).tolist()
@@ -233,7 +220,7 @@ class SortagradBatchSampler(BatchSampler):
         else:
             logger.info(f'dataset shuffle! epoch {self.epoch}')
             if self._shuffle_method == "batch_shuffle":
-                indices = self._batch_shuffle(
-                    indices, self.batch_size, clipped=False)
+                indices = _batch_shuffle(
+                    indices, self.batch_size, self.epoch, clipped=False)
             elif self._shuffle_method == "instance_shuffle":
                 np.random.RandomState(self.epoch).shuffle(indices)
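
For reference, below is a minimal, self-contained sketch of the batch-shuffle idea this patch hoists into a module-level helper. The values for `indices`, `batch_size`, `nranks`, and `epoch` are made up for illustration; only the shuffle logic mirrors the patched `_batch_shuffle`.

import numpy as np

def batch_shuffle(indices, batch_size, epoch):
    # Seed from the epoch so every rank computes the identical order.
    rng = np.random.RandomState(epoch)
    shift_len = rng.randint(0, batch_size - 1)
    # Group the shifted tail into batch_size-sized tuples (remainder dropped),
    # shuffle whole batches, then flatten back into one index list.
    batches = list(zip(*[iter(indices[shift_len:])] * batch_size))
    rng.shuffle(batches)
    shuffled = [i for batch in batches for i in batch]
    # Re-attach the incomplete trailing batch and the shifted head so that
    # every index is still emitted exactly once.
    res_len = len(indices) - shift_len - len(shuffled)
    if res_len != 0:
        shuffled.extend(indices[-res_len:])
    shuffled.extend(indices[:shift_len])
    return shuffled

indices = list(range(20))     # 20 clips already sorted by duration
batch_size, nranks = 4, 2     # illustrative values, not the trainer config
print(batch_shuffle(indices, batch_size * nranks, epoch=0))

Because each shuffled block of `batch_size * nranks` indices is later split across ranks, every rank sees utterances of similar duration in a given step, so no rank's loss is dominated by a much longer batch; that is why the distributed sampler now passes `self.batch_size * self.nranks` instead of `self.batch_size`.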