fix unstable loss and nan or inf grad for librispeech training

pull/538/head
Hui Zhang 5 years ago
parent 9a86f0672b
commit 00147dc060

@@ -33,22 +33,7 @@ __all__ = [
]
class SortagradDistributedBatchSampler(DistributedBatchSampler):
    def __init__(self,
                 dataset,
                 batch_size,
                 num_replicas=None,
                 rank=None,
                 shuffle=False,
                 drop_last=False,
                 sortagrad=False,
                 shuffle_method="batch_shuffle"):
        super().__init__(dataset, batch_size, num_replicas, rank, shuffle,
                         drop_last)
        self._sortagrad = sortagrad
        self._shuffle_method = shuffle_method

    def _batch_shuffle(self, indices, batch_size, clipped=False):

def _batch_shuffle(indices, batch_size, clipped=False):
    """Put similarly-sized instances into minibatches for better efficiency
    and make a batch-wise shuffle.
@@ -86,6 +71,34 @@ class SortagradDistributedBatchSampler(DistributedBatchSampler):
), f"_batch_shuffle: {len(indices)} : {len(batch_indices)} : {res_len} - {shift_len}"
return batch_indices
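For orientation, the module-level `_batch_shuffle` boils down to the following minimal sketch (assuming `indices` arrive pre-sorted by utterance duration; the epoch-seeded RNG and the assertion are simplified, and `batch_size` is assumed >= 2 so the shift range is valid):

import numpy as np

def batch_shuffle_sketch(indices, batch_size, seed=0):
    rng = np.random.RandomState(seed)
    # random head shift so batch boundaries differ between epochs
    shift_len = rng.randint(0, batch_size - 1)
    # group shifted indices into full batches of similar-length neighbors
    batches = list(zip(*[iter(indices[shift_len:])] * batch_size))
    # shuffle whole batches, not single instances, to keep lengths similar
    rng.shuffle(batches)
    out = [i for batch in batches for i in batch]
    # re-append the clipped tail and head so no example is lost
    res_len = len(indices) - shift_len - len(out)
    if res_len != 0:
        out.extend(indices[-res_len:])
    out.extend(indices[:shift_len])
    return out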
class SortagradDistributedBatchSampler(DistributedBatchSampler):
    def __init__(self,
                 dataset,
                 batch_size,
                 num_replicas=None,
                 rank=None,
                 shuffle=False,
                 drop_last=False,
                 sortagrad=False,
                 shuffle_method="batch_shuffle"):
        """Sortagrad sampler for multiple GPUs.

        Args:
            dataset (paddle.io.Dataset): dataset to sample from.
            batch_size (int): batch size for one GPU.
            num_replicas (int, optional): world size, i.e. the number of GPUs. Defaults to None.
            rank (int, optional): rank id. Defaults to None.
            shuffle (bool, optional): whether to shuffle the dataset. Defaults to False.
            drop_last (bool, optional): whether to drop the last batch when it is smaller than the batch size. Defaults to False.
            sortagrad (bool, optional): if True, sort instances in the first epoch, then shuffle as usual in later epochs. Defaults to False.
            shuffle_method (str, optional): shuffle method, "instance_shuffle" or "batch_shuffle". Defaults to "batch_shuffle".
        """
        super().__init__(dataset, batch_size, num_replicas, rank, shuffle,
                         drop_last)
        self._sortagrad = sortagrad
        self._shuffle_method = shuffle_method

    def __iter__(self):
        num_samples = len(self.dataset)
        indices = np.arange(num_samples).tolist()
@@ -103,8 +116,11 @@ class SortagradDistributedBatchSampler(DistributedBatchSampler):
                f'rank: {dist.get_rank()} dataset shuffle! epoch {self.epoch}'
            )
            if self._shuffle_method == "batch_shuffle":
                indices = self._batch_shuffle(
                    indices, self.batch_size, clipped=False)
                # use `batch_size * nranks`, otherwise the loss becomes unstable
                # and grads go nan or inf, since differing max example lengths
                # between batches make the loss diverge across ranks,
                # e.g. rank0 max length 20, rank3 max length 1000
                indices = _batch_shuffle(
                    indices, self.batch_size * self.nranks, clipped=False)
            elif self._shuffle_method == "instance_shuffle":
                np.random.RandomState(self.epoch).shuffle(indices)
            else:
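A toy illustration of the failure mode this hunk fixes (all numbers hypothetical): with duration-sorted data, shuffling in groups of `batch_size * nranks` guarantees every rank's slice for a given step comes from the same group of similar-length utterances, while shuffling in groups of only `batch_size` lets one rank draw a short batch and another a very long one:

import numpy as np

# hypothetical utterance lengths, already sorted by duration
lengths = sorted(np.random.RandomState(0).randint(1, 1000, size=16))
batch_size, nranks = 2, 4
group = batch_size * nranks  # 8

# shuffle whole groups of size batch_size * nranks
groups = [lengths[i:i + group] for i in range(0, len(lengths), group)]
np.random.RandomState(1).shuffle(groups)

# within one step, each rank reads a slice of the same group, so the
# per-rank max lengths stay comparable and the loss scale matches
step = groups[0]
for rank in range(nranks):
    shard = step[rank * batch_size:(rank + 1) * batch_size]
    print(f"rank{rank} max length: {max(shard)}")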
@@ -114,7 +130,7 @@ class SortagradDistributedBatchSampler(DistributedBatchSampler):
            indices
        ) == self.total_size, f"batch shuffle examples error: {len(indices)} : {self.total_size}"

        # subsample
        # slice `self.batch_size` examples by rank id
        def _get_indices_by_batch_size(indices):
            subsampled_indices = []
            last_batch_size = self.total_size % (self.batch_size * self.nranks)
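For reference, the rank-wise slicing that `_get_indices_by_batch_size` performs can be sketched standalone like this (ignoring the `last_batch_size` remainder handled above; `rank` and `nranks` stand in for the sampler's attributes):

def slice_by_rank(indices, batch_size, nranks, rank):
    # each rank takes `batch_size` consecutive indices, then skips the
    # `batch_size * (nranks - 1)` indices belonging to the other ranks
    stride = batch_size * nranks
    out = []
    for start in range(rank * batch_size, len(indices), stride):
        out.extend(indices[start:start + batch_size])
    return out

# 16 indices, batch_size 2, 4 ranks: rank1 gets [2, 3, 10, 11]
assert slice_by_rank(list(range(16)), 2, 4, 1) == [2, 3, 10, 11]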
@@ -163,6 +179,16 @@ class SortagradBatchSampler(BatchSampler):
                 drop_last=False,
                 sortagrad=False,
                 shuffle_method="batch_shuffle"):
        """Sortagrad sampler for a single GPU.

        Args:
            dataset (paddle.io.Dataset): dataset to sample from.
            batch_size (int): batch size for one GPU.
            shuffle (bool, optional): whether to shuffle the dataset. Defaults to False.
            drop_last (bool, optional): whether to drop the last batch when it is smaller than the batch size. Defaults to False.
            sortagrad (bool, optional): if True, sort instances in the first epoch, then shuffle as usual in later epochs. Defaults to False.
            shuffle_method (str, optional): shuffle method, "instance_shuffle" or "batch_shuffle". Defaults to "batch_shuffle".
        """
        self.dataset = dataset

        assert isinstance(batch_size, int) and batch_size > 0, \
@@ -181,45 +207,6 @@ class SortagradBatchSampler(BatchSampler):
        self._sortagrad = sortagrad
        self._shuffle_method = shuffle_method

    def _batch_shuffle(self, indices, batch_size, clipped=False):
        """Put similarly-sized instances into minibatches for better efficiency
        and make a batch-wise shuffle.

        1. Sort the audio clips by duration.
        2. Generate a random number `k`, k in [0, batch_size).
        3. Randomly shift `k` instances in order to create different batches
           for different epochs. Create minibatches.
        4. Shuffle the minibatches.

        :param indices: indexes. List of int.
        :type indices: list
        :param batch_size: Batch size. This size is also used to generate
                           a random number for batch shuffle.
        :type batch_size: int
        :param clipped: Whether to clip the heading (small shift) and trailing
                        (incomplete batch) instances.
        :type clipped: bool
        :return: Batch shuffled manifest.
        :rtype: list
        """
        rng = np.random.RandomState(self.epoch)
        # must shift at least by one
        shift_len = rng.randint(0, batch_size - 1)
        batch_indices = list(zip(*[iter(indices[shift_len:])] * batch_size))
        rng.shuffle(batch_indices)
        batch_indices = [item for batch in batch_indices for item in batch]
        assert (clipped == False)
        if not clipped:
            res_len = len(indices) - shift_len - len(batch_indices)
            # when res_len is 0, indices[-res_len:] returns the whole list,
            # since len(List[-0:]) == len(List[:])
            if res_len != 0:
                batch_indices.extend(indices[-res_len:])
            batch_indices.extend(indices[0:shift_len])
            assert len(indices) == len(
                batch_indices
            ), f"_batch_shuffle: {len(indices)} : {len(batch_indices)} : {res_len} - {shift_len}"
        return batch_indices

    def __iter__(self):
        num_samples = len(self.dataset)
        indices = np.arange(num_samples).tolist()
@@ -233,7 +220,7 @@ class SortagradBatchSampler(BatchSampler):
        else:
            logger.info(f'dataset shuffle! epoch {self.epoch}')
            if self._shuffle_method == "batch_shuffle":
                indices = self._batch_shuffle(
                indices = _batch_shuffle(
                    indices, self.batch_size, clipped=False)
            elif self._shuffle_method == "instance_shuffle":
                np.random.RandomState(self.epoch).shuffle(indices)
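Putting it together, one plausible way to use the fixed sampler in a training script (a sketch: `train_dataset`, `collate_fn`, and `num_epochs` are placeholders; `paddle.io.DataLoader`'s `batch_sampler` argument and the sampler's `set_epoch` come from Paddle's standard sampler API):

from paddle.io import DataLoader

batch_sampler = SortagradDistributedBatchSampler(
    train_dataset,           # placeholder manifest-backed dataset
    batch_size=32,
    shuffle=True,
    sortagrad=True,          # sort by duration in epoch 0 only
    shuffle_method="batch_shuffle")

train_loader = DataLoader(
    train_dataset,
    batch_sampler=batch_sampler,
    collate_fn=collate_fn,   # placeholder collate function
    num_workers=2)

for epoch in range(num_epochs):
    batch_sampler.set_epoch(epoch)  # reseeds the epoch-based shuffles
    for batch in train_loader:
        ...                         # forward/backward as usual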
