diff --git a/deepspeech/exps/deepspeech2/model.py b/deepspeech/exps/deepspeech2/model.py
index c72e3d0a6..5efe4b48d 100644
--- a/deepspeech/exps/deepspeech2/model.py
+++ b/deepspeech/exps/deepspeech2/model.py
@@ -73,28 +73,37 @@ class DeepSpeech2Trainer(Trainer):
         logger.info(f"Valid Total Examples: {len(self.valid_loader.dataset)}")
         self.model.eval()
         valid_losses = defaultdict(list)
+        num_seen_utts = 1
+        total_loss = 0.0
         for i, batch in enumerate(self.valid_loader):
             loss = self.model(*batch)
-
+            if paddle.isfinite(loss):
+                num_utts = batch[0].shape[0]
+                num_seen_utts += num_utts
+                total_loss += float(loss) * num_utts
             valid_losses['val_loss'].append(float(loss))

-        # write visual log
-        valid_losses = {k: np.mean(v) for k, v in valid_losses.items()}
+            if (i + 1) % self.config.training.log_interval == 0:
+                valid_losses['val_history_loss'] = total_loss / num_seen_utts

-        # logging
-        msg = f"Valid: Rank: {dist.get_rank()}, "
-        msg += "epoch: {}, ".format(self.epoch)
-        msg += "step: {}, ".format(self.iteration)
-        msg += ', '.join('{}: {:>.6f}'.format(k, v)
-                         for k, v in valid_losses.items())
-        logger.info(msg)
+                # write visual log
+                valid_losses = {k: np.mean(v) for k, v in valid_losses.items()}

-        if self.visualizer:
-            for k, v in valid_losses.items():
-                self.visualizer.add_scalar("valid/{}".format(k), v,
-                                           self.iteration)
+                # logging
+                msg = f"Valid: Rank: {dist.get_rank()}, "
+                msg += "epoch: {}, ".format(self.epoch)
+                msg += "step: {}, ".format(self.iteration)
+                msg += "batch : {}/{}, ".format(i + 1, len(self.valid_loader))
+                msg += ', '.join('{}: {:>.6f}'.format(k, v)
+                                 for k, v in valid_losses.items())
+                logger.info(msg)
+
+                if self.visualizer:
+                    for k, v in valid_losses.items():
+                        self.visualizer.add_scalar("valid/{}".format(k), v,
+                                                   self.iteration)

-        return valid_losses
+        return total_loss, num_seen_utts

     def setup_model(self):
         config = self.config
diff --git a/deepspeech/exps/u2/model.py b/deepspeech/exps/u2/model.py
index 4ae3a603e..6b951da42 100644
--- a/deepspeech/exps/u2/model.py
+++ b/deepspeech/exps/u2/model.py
@@ -81,11 +81,11 @@ class U2Trainer(Trainer):
         loss.backward()
         layer_tools.print_grads(self.model, print_func=None)

-        losses_np = {
-            'train_loss': float(loss) * train_conf.accum_grad,
-            'train_att_loss': float(attention_loss),
-            'train_ctc_loss': float(ctc_loss),
-        }
+        losses_np = {'loss': float(loss) * train_conf.accum_grad}
+        if attention_loss:
+            losses_np['att_loss'] = float(attention_loss)
+        if ctc_loss:
+            losses_np['ctc_loss'] = float(ctc_loss)

         if (batch_index + 1) % train_conf.accum_grad == 0:
             if dist.get_rank() == 0 and self.visualizer:
@@ -135,6 +135,8 @@ class U2Trainer(Trainer):
                     msg = "Train: Rank: {}, ".format(dist.get_rank())
                     msg += "epoch: {}, ".format(self.epoch)
                     msg += "step: {}, ".format(self.iteration)
+                    msg += "batch : {}/{}, ".format(batch_index + 1,
+                                                    len(self.train_loader))
                     msg += "lr: {:>.8f}, ".format(self.lr_scheduler())
                     msg += "dataloader time: {:>.3f}s, ".format(dataload_time)
                     self.train_batch(batch_index, batch, msg)
@@ -143,8 +145,9 @@
                 logger.error(e)
                 raise e

-            valid_losses = self.valid()
-            self.save(tag=self.epoch, infos=valid_losses)
+            total_loss, num_seen_utts = self.valid()
+            self.save(
+                tag=self.epoch, infos={'val_loss': total_loss / num_seen_utts})
             self.new_epoch()

     @mp_tools.rank_zero_only
@@ -153,29 +156,42 @@
         self.model.eval()
         logger.info(f"Valid Total Examples: {len(self.valid_loader.dataset)}")
         valid_losses = defaultdict(list)
+        num_seen_utts = 1
+        total_loss = 0.0
         for i, batch in enumerate(self.valid_loader):
-            total_loss, attention_loss, ctc_loss = self.model(*batch)
-
-            valid_losses['val_loss'].append(float(total_loss))
-            valid_losses['val_att_loss'].append(float(attention_loss))
-            valid_losses['val_ctc_loss'].append(float(ctc_loss))
-
-        # write visual log
-        valid_losses = {k: np.mean(v) for k, v in valid_losses.items()}
-
-        # logging
-        msg = f"Valid: Rank: {dist.get_rank()}, "
-        msg += "epoch: {}, ".format(self.epoch)
-        msg += "step: {}, ".format(self.iteration)
-        msg += ', '.join('{}: {:>.6f}'.format(k, v)
-                         for k, v in valid_losses.items())
-        logger.info(msg)
-
-        if self.visualizer:
-            valid_losses_v = valid_losses.copy()
-            valid_losses_v.update({"lr": self.lr_scheduler()})
-            self.visualizer.add_scalars('epoch', valid_losses_v, self.epoch)
-        return valid_losses
+            loss, attention_loss, ctc_loss = self.model(*batch)
+            if paddle.isfinite(loss):
+                num_utts = batch[0].shape[0]
+                num_seen_utts += num_utts
+                total_loss += float(loss) * num_utts
+                valid_losses = {'val_loss': float(loss)}
+                if attention_loss:
+                    valid_losses['val_att_loss'] = float(attention_loss)
+                if ctc_loss:
+                    valid_losses['val_ctc_loss'] = float(ctc_loss)
+
+            if (i + 1) % self.config.training.log_interval == 0:
+                valid_losses['val_history_loss'] = total_loss / num_seen_utts
+
+                # write visual log
+                valid_losses = {k: np.mean(v) for k, v in valid_losses.items()}
+
+                # logging
+                msg = f"Valid: Rank: {dist.get_rank()}, "
+                msg += "epoch: {}, ".format(self.epoch)
+                msg += "step: {}, ".format(self.iteration)
+                msg += "batch : {}/{}, ".format(i + 1, len(self.valid_loader))
+                msg += ', '.join('{}: {:>.6f}'.format(k, v)
+                                 for k, v in valid_losses.items())
+                logger.info(msg)
+
+                if self.visualizer:
+                    valid_losses_v = valid_losses.copy()
+                    valid_losses_v.update({"lr": self.lr_scheduler()})
+                    self.visualizer.add_scalars('epoch', valid_losses_v,
+                                                self.epoch)
+
+        return total_loss, num_seen_utts

     def setup_dataloader(self):
         config = self.config.clone()
diff --git a/deepspeech/training/gradclip.py b/deepspeech/training/gradclip.py
index 348308fdb..6c106f340 100644
--- a/deepspeech/training/gradclip.py
+++ b/deepspeech/training/gradclip.py
@@ -56,6 +56,7 @@ class ClipGradByGlobalNormWithLog(paddle.nn.ClipGradByGlobalNorm):
         global_norm_var = layers.sqrt(global_norm_var)
         # debug log
         logger.debug(f"Grad Global Norm: {float(global_norm_var)}!!!!")
+
         max_global_norm = layers.fill_constant(
             shape=[1], dtype=global_norm_var.dtype, value=self.clip_norm)
         clip_var = layers.elementwise_div(
diff --git a/deepspeech/training/scheduler.py b/deepspeech/training/scheduler.py
index 07195c298..d36130284 100644
--- a/deepspeech/training/scheduler.py
+++ b/deepspeech/training/scheduler.py
@@ -53,5 +53,14 @@ class WarmupLR(LRScheduler):
         return self.base_lr * self.warmup_steps**0.5 * min(
             step_num**-0.5, step_num * self.warmup_steps**-1.5)

-    def set_step(self, step: int):
-        self.step(step)
+    def set_step(self, step: int=None):
+        '''
+        It will update the learning rate in optimizer according to current ``epoch`` .
+        The new learning rate will take effect on next ``optimizer.step`` .
+
+        Args:
+            step (int, None): specify current epoch. Default: None. Auto-increment from last_epoch=-1.
+        Returns:
+            None
+        '''
+        self.step(epoch=step)
diff --git a/deepspeech/training/trainer.py b/deepspeech/training/trainer.py
index ff5910d16..aded34624 100644
--- a/deepspeech/training/trainer.py
+++ b/deepspeech/training/trainer.py
@@ -157,6 +157,8 @@ class Trainer():
             self.epoch = infos["epoch"]
             scratch = False
         else:
+            self.iteration = 0
+            self.epoch = 0
             scratch = True
         return scratch

@@ -189,6 +191,8 @@ class Trainer():
                     msg = "Train: Rank: {}, ".format(dist.get_rank())
                     msg += "epoch: {}, ".format(self.epoch)
                     msg += "step: {}, ".format(self.iteration)
+                    msg += "batch : {}/{}, ".format(batch_index + 1,
+                                                    len(self.train_loader))
                     msg += "lr: {:>.8f}, ".format(self.lr_scheduler())
                     msg += "dataloader time: {:>.3f}s, ".format(dataload_time)
                     self.train_batch(batch_index, batch, msg)
@@ -197,8 +201,8 @@
                 logger.error(e)
                 raise e

-            valid_losses = self.valid()
-            self.save(infos=valid_losses)
+            total_loss, num_seen_utts = self.valid()
+            self.save(infos={'val_loss': total_loss / num_seen_utts})
             self.lr_scheduler.step()
             self.new_epoch()