fix log; add report to trainer

pull/837/head
Hui Zhang 3 years ago
parent 6de20de3f8
commit 0e91d26ae3

@@ -36,6 +36,7 @@ from deepspeech.models.ds2_online import DeepSpeech2InferModelOnline
 from deepspeech.models.ds2_online import DeepSpeech2ModelOnline
 from deepspeech.training.gradclip import ClipGradByGlobalNormWithLog
 from deepspeech.training.trainer import Trainer
+from deepspeech.training.reporter import report
 from deepspeech.utils import error_rate
 from deepspeech.utils import layer_tools
 from deepspeech.utils import mp_tools
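
The report() helper (and the ObsScope context manager used in the later hunks) lives in deepspeech/training/reporter.py, which this diff does not include. A minimal sketch of the interface these calls appear to assume, kept deliberately simple and not taken from the repository:

# Sketch only: reporter.py is not shown in this diff, so this is an assumed
# minimal implementation of the report()/ObsScope interface used below.
import contextlib

OBSERVATIONS = None  # the dict currently collecting reported values, or None


@contextlib.contextmanager
def ObsScope(observations):
    """Route report() calls into `observations` for the duration of the block."""
    global OBSERVATIONS
    old = OBSERVATIONS
    OBSERVATIONS = observations
    try:
        yield observations
    finally:
        OBSERVATIONS = old


def report(name, value):
    """Record a named value into the active observation dict, if any."""
    if OBSERVATIONS is not None:
        OBSERVATIONS[name] = value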
@@ -67,7 +68,9 @@ class DeepSpeech2Trainer(Trainer):
         super().__init__(config, args)

     def train_batch(self, batch_index, batch_data, msg):
-        train_conf = self.config.training
+        batch_size = self.config.collator.batch_size
+        accum_grad = self.config.training.accum_grad
         start = time.time()

         # forward
@@ -78,7 +81,7 @@ class DeepSpeech2Trainer(Trainer):
         }

         # loss backward
-        if (batch_index + 1) % train_conf.accum_grad != 0:
+        if (batch_index + 1) % accum_grad != 0:
             # Disable gradient synchronizations across DDP processes.
             # Within this context, gradients will be accumulated on module
             # variables, which will later be synchronized.
@@ -93,20 +96,19 @@ class DeepSpeech2Trainer(Trainer):
         layer_tools.print_grads(self.model, print_func=None)

         # optimizer step
-        if (batch_index + 1) % train_conf.accum_grad == 0:
+        if (batch_index + 1) % accum_grad == 0:
             self.optimizer.step()
             self.optimizer.clear_grad()
             self.iteration += 1

         iteration_time = time.time() - start

-        msg += "batch cost: {:>.3f}s, ".format(iteration_time)
-        msg += "batch size: {}, ".format(self.config.collator.batch_size)
-        msg += "accum: {}, ".format(train_conf.accum_grad)
-        msg += ', '.join('{}: {:>.6f}'.format(k, v)
-                         for k, v in losses_np.items())
-        logger.info(msg)
+        for k, v in losses_np.items():
+            report(k, v)
+        report("batch_size", batch_size)
+        report("accum", accum_grad)
+        report("step_cost", iteration_time)

         if dist.get_rank() == 0 and self.visualizer:
             for k, v in losses_np.items():
                 # `step -1` since we update `step` after optimizer.step().
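
With this change train_batch no longer formats or logs its own message; it only reports raw values, which are collected by the ObsScope block added to the training loop later in this diff. A hedged usage sketch of how those values surface (keys mirror the report() calls above, values are made up):

# Illustration only.
from collections import OrderedDict

observation = OrderedDict()
with ObsScope(observation):
    report("train_loss", 12.345678)   # one entry per key in losses_np
    report("batch_size", 64)
    report("accum", 2)
    report("step_cost", 0.42)         # seconds spent in this train_batch call

# observation now holds every reported value, in insertion order:
# OrderedDict([('train_loss', 12.345678), ('batch_size', 64),
#              ('accum', 2), ('step_cost', 0.42)])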

@@ -17,6 +17,7 @@ import os
 import sys
 import time
 from collections import defaultdict
+from collections import OrderedDict
 from contextlib import nullcontext
 from pathlib import Path
 from typing import Optional
@@ -36,6 +37,8 @@ from deepspeech.training.optimizer import OptimizerFactory
 from deepspeech.training.scheduler import LRSchedulerFactory
 from deepspeech.training.timer import Timer
 from deepspeech.training.trainer import Trainer
+from deepspeech.training.reporter import report
+from deepspeech.training.reporter import ObsScope
 from deepspeech.utils import ctc_utils
 from deepspeech.utils import error_rate
 from deepspeech.utils import layer_tools
@@ -121,12 +124,11 @@ class U2Trainer(Trainer):
         iteration_time = time.time() - start

         if (batch_index + 1) % train_conf.log_interval == 0:
-            msg += "train time: {:>.3f}s, ".format(iteration_time)
-            msg += "batch size: {}, ".format(self.config.collator.batch_size)
-            msg += "accum: {}, ".format(train_conf.accum_grad)
-            msg += ', '.join('{}: {:>.6f}'.format(k, v)
-                             for k, v in losses_np.items())
-            logger.info(msg)
+            for k, v in losses_np.items():
+                report(k, v)
+            report("batch_size", self.config.collator.batch_size)
+            report("accum", train_conf.accum_grad)
+            report("step_cost", iteration_time)

         if dist.get_rank() == 0 and self.visualizer:
             losses_np_v = losses_np.copy()
@@ -199,15 +201,25 @@ class U2Trainer(Trainer):
                     data_start_time = time.time()
                     for batch_index, batch in enumerate(self.train_loader):
                         dataload_time = time.time() - data_start_time
-                        msg = "Train: Rank: {}, ".format(dist.get_rank())
-                        msg += "epoch: {}, ".format(self.epoch)
-                        msg += "step: {}, ".format(self.iteration)
-                        msg += "batch : {}/{}, ".format(batch_index + 1,
-                                                        len(self.train_loader))
-                        msg += "lr: {:>.8f}, ".format(self.lr_scheduler())
-                        msg += "data time: {:>.3f}s, ".format(dataload_time)
-                        self.train_batch(batch_index, batch, msg)
-                        self.after_train_batch()
+                        msg = "Train:"
+                        observation = OrderedDict()
+                        with ObsScope(observation):
+                            report("Rank", dist.get_rank())
+                            report("epoch", self.epoch)
+                            report('step', self.iteration)
+                            report('step/total', (batch_index + 1) / len(self.train_loader))
+                            report("lr", self.lr_scheduler())
+                            self.train_batch(batch_index, batch, msg)
+                            self.after_train_batch()
+                            report('reader_cost', dataload_time)
+                        observation['batch_cost'] = observation['reader_cost'] + observation['step_cost']
+                        observation['samples'] = observation['batch_size']
+                        observation['ips[sent./sec]'] = observation['batch_size'] / observation['batch_cost']
+                        for k, v in observation.items():
+                            msg += f" {k}: "
+                            msg += f"{v:>.8f}" if isinstance(v, float) else f"{v}"
+                            msg += ","
+                        logger.info(msg)
                         data_start_time = time.time()
             except Exception as e:
                 logger.error(e)
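
The f-string loop above replaces the old hand-built message: floats are rendered with eight decimal places, everything else verbatim. A small standalone example of the resulting format (values are illustrative only):

from collections import OrderedDict

observation = OrderedDict(
    [("Rank", 0), ("epoch", 3), ("lr", 0.000125), ("batch_cost", 0.532)])

msg = "Train:"
for k, v in observation.items():
    msg += f" {k}: "
    msg += f"{v:>.8f}" if isinstance(v, float) else f"{v}"
    msg += ","

print(msg)
# Train: Rank: 0, epoch: 3, lr: 0.00012500, batch_cost: 0.53200000,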

@@ -14,12 +14,15 @@
 import sys
 import time
 from pathlib import Path
+from collections import OrderedDict

 import paddle
 from paddle import distributed as dist
 from tensorboardX import SummaryWriter

 from deepspeech.training.timer import Timer
+from deepspeech.training.reporter import report
+from deepspeech.training.reporter import ObsScope
 from deepspeech.utils import mp_tools
 from deepspeech.utils import profiler
 from deepspeech.utils.checkpoint import Checkpoint
@@ -27,6 +30,7 @@ from deepspeech.utils.log import Log
 from deepspeech.utils.utility import seed_all
 from deepspeech.utils.utility import UpdateConfig

 __all__ = ["Trainer"]

 logger = Log(__name__).getlog()
@@ -98,6 +102,9 @@ class Trainer():
         self.checkpoint_dir = None
         self.iteration = 0
         self.epoch = 0
+        self.rank = dist.get_rank()
+
+        logger.info(f"Rank: {self.rank}/{dist.get_world_size()}")

         if args.seed:
             seed_all(args.seed)
@@ -223,15 +230,25 @@ class Trainer():
                     data_start_time = time.time()
                     for batch_index, batch in enumerate(self.train_loader):
                         dataload_time = time.time() - data_start_time
-                        msg = "Train: Rank: {}, ".format(dist.get_rank())
-                        msg += "epoch: {}, ".format(self.epoch)
-                        msg += "step: {}, ".format(self.iteration)
-                        msg += "batch : {}/{}, ".format(batch_index + 1,
-                                                        len(self.train_loader))
-                        msg += "lr: {:>.8f}, ".format(self.lr_scheduler())
-                        msg += "data time: {:>.3f}s, ".format(dataload_time)
-                        self.train_batch(batch_index, batch, msg)
-                        self.after_train_batch()
+                        msg = "Train:"
+                        observation = OrderedDict()
+                        with ObsScope(observation):
+                            report("Rank", dist.get_rank())
+                            report("epoch", self.epoch)
+                            report('step', self.iteration)
+                            report('step/total', (batch_index + 1) / len(self.train_loader))
+                            report("lr", self.lr_scheduler())
+                            self.train_batch(batch_index, batch, msg)
+                            self.after_train_batch()
+                            report('reader_cost', dataload_time)
+                        observation['batch_cost'] = observation['reader_cost'] + observation['step_cost']
+                        observation['samples'] = observation['batch_size']
+                        observation['ips[sent./sec]'] = observation['batch_size'] / observation['batch_cost']
+                        for k, v in observation.items():
+                            msg += f" {k}: "
+                            msg += f"{v:>.8f}" if isinstance(v, float) else f"{v}"
+                            msg += ","
+                        logger.info(msg)
                         data_start_time = time.time()
             except Exception as e:
                 logger.error(e)
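
The derived entries combine the reported timings: batch_cost is reader_cost plus step_cost, samples echoes batch_size, and ips (sentences per second) is batch_size divided by batch_cost. With illustrative numbers:

# Illustrative values only.
reader_cost = 0.08    # seconds loading the batch (reported as 'reader_cost')
step_cost = 0.42      # seconds in train_batch (reported as 'step_cost')
batch_size = 64       # sentences in the batch (reported as 'batch_size')

batch_cost = reader_cost + step_cost   # 0.50 s per batch
samples = batch_size                   # 64 samples processed
ips = batch_size / batch_cost          # 128.0 sentences/sec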

@@ -1,8 +1,8 @@
 #!/bin/bash

 profiler_options=
-benchmark_batch_size=
-benchmark_max_step=
+benchmark_batch_size=0
+benchmark_max_step=0

 # seed may break model convergence
 seed=0
@@ -52,4 +52,4 @@ if [ $? -ne 0 ]; then
     exit 1
 fi

 exit 0

@@ -1,8 +1,8 @@
 #!/bin/bash

 profiler_options=
-benchmark_batch_size=
-benchmark_max_step=
+benchmark_batch_size=0
+benchmark_max_step=0

 # seed may break model convergence
 seed=0
