diff --git a/deepspeech/exps/u2/model.py b/deepspeech/exps/u2/model.py
index 6b951da42..142491f86 100644
--- a/deepspeech/exps/u2/model.py
+++ b/deepspeech/exps/u2/model.py
@@ -88,10 +88,6 @@ class U2Trainer(Trainer):
             losses_np['ctc_loss'] = float(ctc_loss)
 
         if (batch_index + 1) % train_conf.accum_grad == 0:
-            if dist.get_rank() == 0 and self.visualizer:
-                losses_np_v = losses_np.copy()
-                losses_np_v.update({"lr": self.lr_scheduler()})
-                self.visualizer.add_scalars("step", losses_np_v, self.iteration)
             self.optimizer.step()
             self.optimizer.clear_grad()
             self.lr_scheduler.step()
@@ -107,6 +103,12 @@ class U2Trainer(Trainer):
                              for k, v in losses_np.items())
             logger.info(msg)
 
+        if dist.get_rank() == 0 and self.visualizer:
+            losses_np_v = losses_np.copy()
+            losses_np_v.update({"lr": self.lr_scheduler()})
+            self.visualizer.add_scalars("step", losses_np_v,
+                                        self.iteration - 1)
+
     def train(self):
         """The training process control by step."""
         # !!!IMPORTANT!!!
diff --git a/deepspeech/utils/checkpoint.py b/deepspeech/utils/checkpoint.py
index 882b65241..8ede6b8fd 100644
--- a/deepspeech/utils/checkpoint.py
+++ b/deepspeech/utils/checkpoint.py
@@ -46,8 +46,8 @@ def _load_latest_checkpoint(checkpoint_dir: str) -> int:
     return iteration
 
 
-def _save_checkpoint(checkpoint_dir: str, iteration: int):
-    """Save the iteration number of the latest model to be checkpointed.
+def _save_record(checkpoint_dir: str, iteration: int):
+    """Save the iteration number of the latest model to the checkpoint record.
     Args:
         checkpoint_dir (str): the directory where checkpoint is saved.
         iteration (int): the latest iteration number.
@@ -149,4 +149,4 @@ def save_parameters(checkpoint_dir: str,
         fout.write(data)
 
     if isinstance(tag_or_iteration, int):
-        _save_checkpoint(checkpoint_dir, tag_or_iteration)
+        _save_record(checkpoint_dir, tag_or_iteration)
diff --git a/deepspeech/utils/layer_tools.py b/deepspeech/utils/layer_tools.py
index 1e8e55ed1..c05982c14 100644
--- a/deepspeech/utils/layer_tools.py
+++ b/deepspeech/utils/layer_tools.py
@@ -21,6 +21,8 @@ __all__ = [
 
 
 def summary(layer: nn.Layer, print_func=print):
+    if print_func is None:
+        return
     num_params = num_elements = 0
     for name, param in layer.state_dict().items():
         if print_func:
@@ -32,15 +34,6 @@ def summary(layer: nn.Layer, print_func=print):
         print_func(f"Total parameters: {num_params}, {num_elements} elements.")
 
 
-def gradient_norm(layer: nn.Layer):
-    grad_norm_dict = {}
-    for name, param in layer.state_dict().items():
-        if param.trainable:
-            grad = param.gradient()  # return numpy.ndarray
-            grad_norm_dict[name] = np.linalg.norm(grad) / grad.size
-    return grad_norm_dict
-
-
 def print_grads(model, print_func=print):
     if print_func is None:
         return
@@ -64,6 +57,15 @@ def print_params(model, print_func=print):
     print_func(f"Total parameters: {num_params}, {total} elements.")
 
 
+def gradient_norm(layer: nn.Layer):
+    grad_norm_dict = {}
+    for name, param in layer.state_dict().items():
+        if param.trainable:
+            grad = param.gradient()  # return numpy.ndarray
+            grad_norm_dict[name] = np.linalg.norm(grad) / grad.size
+    return grad_norm_dict
+
+
 def recursively_remove_weight_norm(layer: nn.Layer):
     for layer in layer.sublayers():
         try: