From 0e91d26ae3808497072c36d71d424a4db40cf4f8 Mon Sep 17 00:00:00 2001
From: Hui Zhang
Date: Fri, 17 Sep 2021 06:26:08 +0000
Subject: [PATCH] fix log; add report to trainer

---
 deepspeech/exps/deepspeech2/model.py | 22 ++++++++-------
 deepspeech/exps/u2/model.py          | 42 ++++++++++++++++++----------
 deepspeech/training/trainer.py       | 35 +++++++++++++++++------
 examples/aishell/s1/local/train.sh   |  6 ++--
 examples/tiny/s1/local/train.sh      |  4 +--
 5 files changed, 70 insertions(+), 39 deletions(-)

diff --git a/deepspeech/exps/deepspeech2/model.py b/deepspeech/exps/deepspeech2/model.py
index 128c4c82..8272d72e 100644
--- a/deepspeech/exps/deepspeech2/model.py
+++ b/deepspeech/exps/deepspeech2/model.py
@@ -36,6 +36,7 @@ from deepspeech.models.ds2_online import DeepSpeech2InferModelOnline
 from deepspeech.models.ds2_online import DeepSpeech2ModelOnline
 from deepspeech.training.gradclip import ClipGradByGlobalNormWithLog
 from deepspeech.training.trainer import Trainer
+from deepspeech.training.reporter import report
 from deepspeech.utils import error_rate
 from deepspeech.utils import layer_tools
 from deepspeech.utils import mp_tools
@@ -67,7 +68,9 @@ class DeepSpeech2Trainer(Trainer):
         super().__init__(config, args)
 
     def train_batch(self, batch_index, batch_data, msg):
-        train_conf = self.config.training
+        batch_size = self.config.collator.batch_size
+        accum_grad = self.config.training.accum_grad
+
         start = time.time()
 
         # forward
@@ -78,7 +81,7 @@ class DeepSpeech2Trainer(Trainer):
         }
 
         # loss backward
-        if (batch_index + 1) % train_conf.accum_grad != 0:
+        if (batch_index + 1) % accum_grad != 0:
             # Disable gradient synchronizations across DDP processes.
             # Within this context, gradients will be accumulated on module
             # variables, which will later be synchronized.
@@ -93,20 +96,19 @@ class DeepSpeech2Trainer(Trainer):
             layer_tools.print_grads(self.model, print_func=None)
 
         # optimizer step
-        if (batch_index + 1) % train_conf.accum_grad == 0:
+        if (batch_index + 1) % accum_grad == 0:
             self.optimizer.step()
             self.optimizer.clear_grad()
             self.iteration += 1
 
         iteration_time = time.time() - start
 
-        msg += "batch cost: {:>.3f}s, ".format(iteration_time)
-        msg += "batch size: {}, ".format(self.config.collator.batch_size)
-        msg += "accum: {}, ".format(train_conf.accum_grad)
-        msg += ', '.join('{}: {:>.6f}'.format(k, v)
-                         for k, v in losses_np.items())
-        logger.info(msg)
-
+        for k, v in losses_np.items():
+            report(k, v)
+        report("batch_size", batch_size)
+        report("accum", accum_grad)
+        report("step_cost", iteration_time)
+
         if dist.get_rank() == 0 and self.visualizer:
             for k, v in losses_np.items():
                 # `step -1` since we update `step` after optimizer.step().
diff --git a/deepspeech/exps/u2/model.py b/deepspeech/exps/u2/model.py
index 89d443e0..68b001ca 100644
--- a/deepspeech/exps/u2/model.py
+++ b/deepspeech/exps/u2/model.py
@@ -17,6 +17,7 @@ import os
 import sys
 import time
 from collections import defaultdict
+from collections import OrderedDict
 from contextlib import nullcontext
 from pathlib import Path
 from typing import Optional
@@ -36,6 +37,8 @@ from deepspeech.training.optimizer import OptimizerFactory
 from deepspeech.training.scheduler import LRSchedulerFactory
 from deepspeech.training.timer import Timer
 from deepspeech.training.trainer import Trainer
+from deepspeech.training.reporter import report
+from deepspeech.training.reporter import ObsScope
 from deepspeech.utils import ctc_utils
 from deepspeech.utils import error_rate
 from deepspeech.utils import layer_tools
@@ -121,12 +124,11 @@ class U2Trainer(Trainer):
         iteration_time = time.time() - start
 
         if (batch_index + 1) % train_conf.log_interval == 0:
-            msg += "train time: {:>.3f}s, ".format(iteration_time)
-            msg += "batch size: {}, ".format(self.config.collator.batch_size)
-            msg += "accum: {}, ".format(train_conf.accum_grad)
-            msg += ', '.join('{}: {:>.6f}'.format(k, v)
-                             for k, v in losses_np.items())
-            logger.info(msg)
+            for k, v in losses_np.items():
+                report(k, v)
+            report("batch_size", self.config.collator.batch_size)
+            report("accum", train_conf.accum_grad)
+            report("step_cost", iteration_time)
 
             if dist.get_rank() == 0 and self.visualizer:
                 losses_np_v = losses_np.copy()
@@ -199,15 +201,25 @@ class U2Trainer(Trainer):
                     data_start_time = time.time()
                     for batch_index, batch in enumerate(self.train_loader):
                         dataload_time = time.time() - data_start_time
-                        msg = "Train: Rank: {}, ".format(dist.get_rank())
-                        msg += "epoch: {}, ".format(self.epoch)
-                        msg += "step: {}, ".format(self.iteration)
-                        msg += "batch : {}/{}, ".format(batch_index + 1,
-                                                        len(self.train_loader))
-                        msg += "lr: {:>.8f}, ".format(self.lr_scheduler())
-                        msg += "data time: {:>.3f}s, ".format(dataload_time)
-                        self.train_batch(batch_index, batch, msg)
-                        self.after_train_batch()
+                        msg = "Train:"
+                        observation = OrderedDict()
+                        with ObsScope(observation):
+                            report("Rank", dist.get_rank())
+                            report("epoch", self.epoch)
+                            report('step', self.iteration)
+                            report('step/total', (batch_index + 1) / len(self.train_loader))
+                            report("lr", self.lr_scheduler())
+                            self.train_batch(batch_index, batch, msg)
+                            self.after_train_batch()
+                            report('reader_cost', dataload_time)
+                        observation['batch_cost'] = observation['reader_cost'] + observation['step_cost']
+                        observation['samples'] = observation['batch_size']
+                        observation['ips[sent./sec]'] = observation['batch_size'] / observation['batch_cost']
+                        for k, v in observation.items():
+                            msg += f" {k}: "
+                            msg += f"{v:>.8f}" if isinstance(v, float) else f"{v}"
+                            msg += ","
+                        logger.info(msg)
                         data_start_time = time.time()
             except Exception as e:
                 logger.error(e)
diff --git a/deepspeech/training/trainer.py b/deepspeech/training/trainer.py
index f5e5f12a..18578b42 100644
--- a/deepspeech/training/trainer.py
+++ b/deepspeech/training/trainer.py
@@ -14,12 +14,15 @@
 import sys
 import time
 from pathlib import Path
+from collections import OrderedDict
 
 import paddle
 from paddle import distributed as dist
 from tensorboardX import SummaryWriter
 
 from deepspeech.training.timer import Timer
+from deepspeech.training.reporter import report
+from deepspeech.training.reporter import ObsScope
 from deepspeech.utils import mp_tools
 from deepspeech.utils import profiler
 from deepspeech.utils.checkpoint import Checkpoint
@@ -27,6 +30,7 @@ from deepspeech.utils.log import Log
 from deepspeech.utils.utility import seed_all
 from deepspeech.utils.utility import UpdateConfig
 
+
 __all__ = ["Trainer"]
 
 logger = Log(__name__).getlog()
@@ -98,6 +102,9 @@ class Trainer():
         self.checkpoint_dir = None
         self.iteration = 0
         self.epoch = 0
+        self.rank = dist.get_rank()
+
+        logger.info(f"Rank: {self.rank}/{dist.get_world_size()}")
 
         if args.seed:
             seed_all(args.seed)
@@ -223,15 +230,25 @@ class Trainer():
                     data_start_time = time.time()
                     for batch_index, batch in enumerate(self.train_loader):
                         dataload_time = time.time() - data_start_time
-                        msg = "Train: Rank: {}, ".format(dist.get_rank())
-                        msg += "epoch: {}, ".format(self.epoch)
-                        msg += "step: {}, ".format(self.iteration)
-                        msg += "batch : {}/{}, ".format(batch_index + 1,
-                                                        len(self.train_loader))
-                        msg += "lr: {:>.8f}, ".format(self.lr_scheduler())
-                        msg += "data time: {:>.3f}s, ".format(dataload_time)
-                        self.train_batch(batch_index, batch, msg)
-                        self.after_train_batch()
+                        msg = "Train:"
+                        observation = OrderedDict()
+                        with ObsScope(observation):
+                            report("Rank", dist.get_rank())
+                            report("epoch", self.epoch)
+                            report('step', self.iteration)
+                            report('step/total', (batch_index + 1) / len(self.train_loader))
+                            report("lr", self.lr_scheduler())
+                            self.train_batch(batch_index, batch, msg)
+                            self.after_train_batch()
+                            report('reader_cost', dataload_time)
+                        observation['batch_cost'] = observation['reader_cost'] + observation['step_cost']
+                        observation['samples'] = observation['batch_size']
+                        observation['ips[sent./sec]'] = observation['batch_size'] / observation['batch_cost']
+                        for k, v in observation.items():
+                            msg += f" {k}: "
+                            msg += f"{v:>.8f}" if isinstance(v, float) else f"{v}"
+                            msg += ","
+                        logger.info(msg)
                         data_start_time = time.time()
             except Exception as e:
                 logger.error(e)
diff --git a/examples/aishell/s1/local/train.sh b/examples/aishell/s1/local/train.sh
index 5b9c45f5..1a341de7 100755
--- a/examples/aishell/s1/local/train.sh
+++ b/examples/aishell/s1/local/train.sh
@@ -1,8 +1,8 @@
 #!/bin/bash
 
 profiler_options=
-benchmark_batch_size=
-benchmark_max_step=
+benchmark_batch_size=0
+benchmark_max_step=0
 
 # seed may break model convergence
 seed=0
@@ -52,4 +52,4 @@ if [ $? -ne 0 ]; then
     exit 1
 fi
 
-exit 0
\ No newline at end of file
+exit 0
diff --git a/examples/tiny/s1/local/train.sh b/examples/tiny/s1/local/train.sh
index 56ceab41..5097d4d0 100755
--- a/examples/tiny/s1/local/train.sh
+++ b/examples/tiny/s1/local/train.sh
@@ -1,8 +1,8 @@
 #!/bin/bash
 
 profiler_options=
-benchmark_batch_size=
-benchmark_max_step=
+benchmark_batch_size=0
+benchmark_max_step=0
 
 # seed may break model convergence
 seed=0
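
Note on the reporter API: the hunks above import report and ObsScope from deepspeech.training.reporter but the patch does not show that module. The sketch below is only an illustration, inferred from the call sites in this patch, of how such a module could behave (it is not the actual file contents): ObsScope installs a dict as the active observation scope, and report(name, value) records into whatever scope is active.

# Illustrative sketch only -- assumed behavior of deepspeech.training.reporter,
# inferred from how the patch uses it; not the real module source.
import contextlib
from collections import OrderedDict

OBSERVATIONS = None  # dict currently collecting reported values, if any


@contextlib.contextmanager
def ObsScope(observations):
    """Route report() calls issued inside this context into `observations`."""
    global OBSERVATIONS
    old = OBSERVATIONS
    OBSERVATIONS = observations
    try:
        yield observations
    finally:
        OBSERVATIONS = old


def report(name, value):
    """Record a named value into the active observation dict, if one is set."""
    if OBSERVATIONS is not None:
        OBSERVATIONS[name] = value


if __name__ == "__main__":
    # Mirrors the trainer loop above: collect metrics, then format one log line.
    observation = OrderedDict()
    with ObsScope(observation):
        report("epoch", 0)
        report("loss", 1.234)
    print(observation)  # OrderedDict([('epoch', 0), ('loss', 1.234)])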