chaner style updater

5 years ago · cfdca210ff
parent 787fa9d91f
commit cfdca210ff
15 changed files with 888 additions and 0 deletions
--- a/deepspeech/training/extensions/init.py
+++ b/deepspeech/training/extensions/init.py
@ -0,0 +1,28 @@
+
+from typing import Callable
+
+from .extension import Extension
+
+def make_extension(trigger: Callable=None,
+                   default_name: str=None,
+                   priority: int=None,
+                   finalizer: Callable=None,
+                   initializer: Callable=None,
+                   on_error: Callable=None):
+    """Make an Extension-like object by injecting required attributes to it.
+    """
+    if trigger is None:
+        trigger = Extension.trigger
+    if priority is None:
+        priority = Extension.priority
+
+    def decorator(ext):
+        ext.trigger = trigger
+        ext.default_name = default_name or ext.__name__
+        ext.priority = priority
+        ext.finalize = finalizer
+        ext.on_error = on_error
+        ext.initialize = initializer
+        return ext
+
+    return decorator
--- a/deepspeech/training/extensions/evaluator.py
+++ b/deepspeech/training/extensions/evaluator.py
@ -0,0 +1,58 @@
+from typing import Dict
+
+import paddle
+from paddle.io import DataLoader
+from paddle.nn import Layer
+
+import extension
+from ..reporter import DictSummary
+from ..reporter import report
+from ..reporter import scope
+
+
+class StandardEvaluator(extension.Extension):
+
+    trigger = (1, 'epoch')
+    default_name = 'validation'
+    priority = extension.PRIORITY_WRITER
+
+    name = None
+
+    def __init__(self, model: Layer, dataloader: DataLoader):
+        # it is designed to hold multiple models
+        models = {"main": model}
+        self.models: Dict[str, Layer] = models
+        self.model = model
+
+        # dataloaders
+        self.dataloader = dataloader
+
+    def evaluate_core(self, batch):
+        # compute
+        self.model(batch)  # you may report here
+
+    def evaluate(self):
+        # switch to eval mode
+        for model in self.models.values():
+            model.eval()
+
+        # to average evaluation metrics
+        summary = DictSummary()
+        for batch in self.dataloader:
+            observation = {}
+            with scope(observation):
+                # main evaluation computation here.
+                with paddle.no_grad():
+                    self.evaluate_core(batch)
+            summary.add(observation)
+        summary = summary.compute_mean()
+        return summary
+
+    def __call__(self, trainer=None):
+        # evaluate and report the averaged metric to current observation
+        # if it is used to extend a trainer, the metrics is reported to
+        # to observation of the trainer
+        # or otherwise, you can use your own observation
+        summary = self.evaluate()
+        for k, v in summary.items():
+            report(k, v)
--- a/deepspeech/training/extensions/extension.py
+++ b/deepspeech/training/extensions/extension.py
@ -0,0 +1,41 @@
+from typing import Callable
+
+PRIORITY_WRITER = 300
+PRIORITY_EDITOR = 200
+PRIORITY_READER = 100
+
+
+class Extension():
+    """Extension to customize the behavior of Trainer."""
+    trigger = (1, 'iteration')
+    priority = PRIORITY_READER
+    name = None
+
+    @property
+    def default_name(self):
+        """Default name of the extension, class name by default."""
+        return type(self).__name__
+
+    def __call__(self, trainer):
+        """Main action of the extention. After each update, it is executed
+        when the trigger fires."""
+        raise NotImplementedError(
+            'Extension implementation must override __call__.')
+
+    def initialize(self, trainer):
+        """Action that is executed once to get the corect trainer state.
+        It is called before training normally, but if the trainer restores
+        states with an Snapshot extension, this method should also be called.
+        """
+        pass
+
+    def on_error(self, trainer, exc, tb):
+        """Handles the error raised during training before finalization.
+        """
+        pass
+
+    def finalize(self, trainer):
+        """Action that is executed when training is done.
+        For example, visualizers would need to be closed.
+        """
+        pass
--- a/deepspeech/training/extensions/snapshot.py
+++ b/deepspeech/training/extensions/snapshot.py
@ -0,0 +1,102 @@
+import os
+from datetime import datetime
+from pathlib import Path
+from typing import Any
+from typing import Dict
+from typing import List
+
+import jsonlines
+
+from deepspeech.training.updaters.trainer import Trainer
+from deepspeech.training.extensions import extension
+from deepspeech.utils.mp_tools import rank_zero_only
+
+from deepspeech.utils.log import Log
+
+logger = Log(__name__).getlog()
+
+
+def load_records(records_fp):
+    """Load record files (json lines.)"""
+    with jsonlines.open(records_fp, 'r') as reader:
+        records = list(reader)
+    return records
+
+
+class Snapshot(extension.Extension):
+    """An extension to make snapshot of the updater object inside
+    the trainer. It is done by calling the updater's `save` method.
+    An Updater save its state_dict by default, which contains the
+    updater state, (i.e. epoch and iteration) and all the model
+    parameters and optimizer states. If the updater inside the trainer
+    subclasses StandardUpdater, everything is good to go.
+    Parameters
+    ----------
+    checkpoint_dir : Union[str, Path]
+        The directory to save checkpoints into.
+    """
+
+    trigger = (1, 'epoch')
+    priority = -100
+    default_name = "snapshot"
+
+    def __init__(self, max_size: int=5, snapshot_on_error: bool=False):
+        self.records: List[Dict[str, Any]] = []
+        self.max_size = max_size
+        self._snapshot_on_error = snapshot_on_error
+        self._save_all = (max_size == -1)
+        self.checkpoint_dir = None
+
+    def initialize(self, trainer: Trainer):
+        """Setting up this extention."""
+        self.checkpoint_dir = trainer.out / "checkpoints"
+
+        # load existing records
+        record_path: Path = self.checkpoint_dir / "records.jsonl"
+        if record_path.exists():
+            logger.debug("Loading from an existing checkpoint dir")
+            self.records = load_records(record_path)
+            trainer.updater.load(self.records[-1]['path'])
+
+    def on_error(self, trainer, exc, tb):
+        if self._snapshot_on_error:
+            self.save_checkpoint_and_update(trainer)
+
+    def __call__(self, trainer: Trainer):
+        self.save_checkpoint_and_update(trainer)
+
+    def full(self):
+        """Whether the number of snapshots it keeps track of is greater
+        than the max_size."""
+        return (not self._save_all) and len(self.records) > self.max_size
+
+    @rank_zero_only
+    def save_checkpoint_and_update(self, trainer: Trainer):
+        """Saving new snapshot and remove the oldest snapshot if needed."""
+        iteration = trainer.updater.state.iteration
+        epoch = trainer.updater.state.epoch
+        num = epoch if self.trigger[1] is 'epoch' else iteration
+        path = self.checkpoint_dir / f"{num}.pdz"
+
+        # add the new one
+        trainer.updater.save(path)
+        record = {
+            "time": str(datetime.now()),
+            'path': str(path.resolve()),  # use absolute path
+            'iteration': iteration,
+            'epoch': epoch,
+        }
+        self.records.append(record)
+
+        # remove the earist
+        if self.full():
+            eariest_record = self.records[0]
+            os.remove(eariest_record["path"])
+            self.records.pop(0)
+
+        # update the record file
+        record_path = self.checkpoint_dir / "records.jsonl"
+        with jsonlines.open(record_path, 'w') as writer:
+            for record in self.records:
+                # jsonlines.open may return a Writer or a Reader
+                writer.write(record)  # pylint: disable=no-member
--- a/deepspeech/training/extensions/visualizer.py
+++ b/deepspeech/training/extensions/visualizer.py
@ -0,0 +1,24 @@
+from deepspeech.training.extensions import extension
+from deepspeech.training.updaters.trainer import Trainer
+
+
+class VisualDL(extension.Extension):
+    """A wrapper of visualdl log writer. It assumes that the metrics to be visualized
+    are all scalars which are recorded into the `.observation` dictionary of the
+    trainer object. The dictionary is created for each step, thus the visualdl log
+    writer uses the iteration from the updater's `iteration` as the global step to
+    add records.
+    """
+    trigger = (1, 'iteration')
+    default_name = 'visualdl'
+    priority = extension.PRIORITY_READER
+
+    def __init__(self, writer):
+        self.writer = writer
+
+    def __call__(self, trainer: Trainer):
+        for k, v in trainer.observation.items():
+            self.writer.add_scalar(k, v, step=trainer.updater.state.iteration)
+
+    def finalize(self, trainer):
+        self.writer.close()
--- a/deepspeech/training/reporter.py
+++ b/deepspeech/training/reporter.py
@ -0,0 +1,131 @@
+import contextlib
+import math
+from collections import defaultdict
+
+OBSERVATIONS = None
+
+
+@contextlib.contextmanager
+def scope(observations):
+    # make `observation` the target to report to.
+    # it is basically a dictionary that stores temporary observations
+    global OBSERVATIONS
+    old = OBSERVATIONS
+    OBSERVATIONS = observations
+
+    try:
+        yield
+    finally:
+        OBSERVATIONS = old
+
+
+def get_observations():
+    global OBSERVATIONS
+    return OBSERVATIONS
+
+
+def report(name, value):
+    # a simple function to report named value
+    # you can use it everywhere, it will get the default target and writ to it
+    # you can think of it as std.out
+    observations = get_observations()
+    if observations is None:
+        return
+    else:
+        observations[name] = value
+
+
+class Summary():
+    """Online summarization of a sequence of scalars.
+    Summary computes the statistics of given scalars online.
+    """
+
+    def __init__(self):
+        self._x = 0.0
+        self._x2 = 0.0
+        self._n = 0
+
+    def add(self, value, weight=1):
+        """Adds a scalar value.
+        Args:
+            value: Scalar value to accumulate. It is either a NumPy scalar or
+                a zero-dimensional array (on CPU or GPU).
+            weight: An optional weight for the value. It is a NumPy scalar or
+                a zero-dimensional array (on CPU or GPU).
+                Default is 1 (integer).
+        """
+        self._x += weight * value
+        self._x2 += weight * value * value
+        self._n += weight
+
+    def compute_mean(self):
+        """Computes the mean."""
+        x, n = self._x, self._n
+        return x / n
+
+    def make_statistics(self):
+        """Computes and returns the mean and standard deviation values.
+        Returns:
+            tuple: Mean and standard deviation values.
+        """
+        x, n = self._x, self._n
+        mean = x / n
+        var = self._x2 / n - mean * mean
+        std = math.sqrt(var)
+        return mean, std
+
+
+class DictSummary():
+    """Online summarization of a sequence of dictionaries.
+    ``DictSummary`` computes the statistics of a given set of scalars online.
+    It only computes the statistics for scalar values and variables of scalar
+    values in the dictionaries.
+    """
+
+    def __init__(self):
+        self._summaries = defaultdict(Summary)
+
+    def add(self, d):
+        """Adds a dictionary of scalars.
+        Args:
+            d (dict): Dictionary of scalars to accumulate. Only elements of
+               scalars, zero-dimensional arrays, and variables of
+               zero-dimensional arrays are accumulated. When the value
+               is a tuple, the second element is interpreted as a weight.
+        """
+        summaries = self._summaries
+        for k, v in d.items():
+            w = 1
+            if isinstance(v, tuple):
+                v = v[0]
+                w = v[1]
+            summaries[k].add(v, weight=w)
+
+    def compute_mean(self):
+        """Creates a dictionary of mean values.
+        It returns a single dictionary that holds a mean value for each entry
+        added to the summary.
+        Returns:
+            dict: Dictionary of mean values.
+        """
+        return {
+            name: summary.compute_mean()
+            for name, summary in self._summaries.items()
+        }
+
+    def make_statistics(self):
+        """Creates a dictionary of statistics.
+        It returns a single dictionary that holds mean and standard deviation
+        values for every entry added to the summary. For an entry of name
+        ``'key'``, these values are added to the dictionary by names ``'key'``
+        and ``'key.std'``, respectively.
+        Returns:
+            dict: Dictionary of statistics of all entries.
+        """
+        stats = {}
+        for name, summary in self._summaries.items():
+            mean, std = summary.make_statistics()
+            stats[name] = mean
+            stats[name + '.std'] = std
+
+        return stats
--- a/deepspeech/training/triggers/init.py
+++ b/deepspeech/training/triggers/init.py
@ -0,0 +1,13 @@
+from .interval_trigger import IntervalTrigger
+
+def never_fail_trigger(trainer):
+    return False
+
+def get_trigger(trigger):
+    if trigger is None:
+        return never_fail_trigger
+    if callable(trigger):
+        return trigger
+    else:
+        trigger = IntervalTrigger(*trigger)
+        return trigger
--- a/deepspeech/training/triggers/interval_trigger.py
+++ b/deepspeech/training/triggers/interval_trigger.py
@ -0,0 +1,24 @@
+
+class IntervalTrigger():
+    """A Predicate to do something every N cycle."""
+
+    def __init__(self, period: int, unit: str):
+        if unit not in ("iteration", "epoch"):
+            raise ValueError("unit should be 'iteration' or 'epoch'")
+        if period <= 0:
+            raise ValueError("period should be a positive integer.")
+        self.period = period
+        self.unit = unit
+        self.last_index = None
+
+    def __call__(self, trainer):
+        if self.last_index is None:
+            last_index = getattr(trainer.updater.state, self.unit)
+            self.last_index = last_index
+
+        last_index = self.last_index
+        index = getattr(trainer.updater.state, self.unit)
+        fire = index // self.period != last_index // self.period
+
+        self.last_index = index
+        return fire
--- a/deepspeech/training/triggers/limit_trigger.py
+++ b/deepspeech/training/triggers/limit_trigger.py
@ -0,0 +1,17 @@
+
+class LimitTrigger():
+    """A Predicate to decide whether to stop."""
+
+    def __init__(self, limit: int, unit: str):
+        if unit not in ("iteration", "epoch"):
+            raise ValueError("unit should be 'iteration' or 'epoch'")
+        if limit <= 0:
+            raise ValueError("limit should be a positive integer.")
+        self.limit = limit
+        self.unit = unit
+
+    def __call__(self, trainer):
+        state = trainer.updater.state
+        index = getattr(state, self.unit)
+        fire = index >= self.limit
+        return fire
--- a/deepspeech/training/triggers/time_trigger.py
+++ b/deepspeech/training/triggers/time_trigger.py
@ -0,0 +1,17 @@
+class TimeTrigger():
+    """Trigger based on a fixed time interval.
+    This trigger accepts iterations with a given interval time.
+    Args:
+        period (float): Interval time. It is given in seconds.
+    """
+
+    def __init__(self, period):
+        self._period = period
+        self._next_time = self._period
+
+    def __call__(self, trainer):
+        if self._next_time < trainer.elapsed_time:
+            self._next_time += self._period
+            return True
+        else:
+            return False
--- a/deepspeech/training/updaters/init.py
+++ b/deepspeech/training/updaters/init.py
--- a/deepspeech/training/updaters/standard_updater.py
+++ b/deepspeech/training/updaters/standard_updater.py
@ -0,0 +1,179 @@
+from typing import Dict
+from typing import Optional
+
+from paddle import Tensor
+from paddle.io import DataLoader
+from paddle.io import DistributedBatchSampler
+from paddle.nn import Layer
+from paddle.optimizer import Optimizer
+from timer import timer
+
+from deepspeech.training.reporter import report
+from deepspeech.training.updaters.updater import UpdaterBase
+from deepspeech.training.updaters.updater import UpdaterState
+
+from deepspeech.utils.log import Log
+
+__all__ = ["StandardUpdater"]
+
+logger = Log(__name__).getlog()
+
+class StandardUpdater(UpdaterBase):
+    """An example of over-simplification. Things may not be that simple, but
+    you can subclass it to fit your need.
+    """
+
+    def __init__(self,
+                 model: Layer,
+                 optimizer: Optimizer,
+                 dataloader: DataLoader,
+                 init_state: Optional[UpdaterState]=None):
+        # it is designed to hold multiple models
+        models = {"main": model}
+        self.models: Dict[str, Layer] = models
+        self.model = model
+
+        # it is designed to hold multiple optimizers
+        optimizers = {"main": optimizer}
+        self.optimizer = optimizer
+        self.optimizers: Dict[str, Optimizer] = optimizers
+
+        # dataloaders
+        self.dataloader = dataloader
+
+        # init state
+        if init_state is None:
+            self.state = UpdaterState()
+        else:
+            self.state = init_state
+
+        self.train_iterator = iter(dataloader)
+
+    def update(self):
+        # We increase the iteration index after updating and before extension.
+        # Here are the reasons.
+
+        # 0. Snapshotting(as well as other extensions, like visualizer) is
+        #    executed after a step of updating;
+        # 1. We decide to increase the iteration index after updating and
+        #    before any all extension is executed. 
+        # 3. We do not increase the iteration after extension because we
+        #    prefer a consistent resume behavior, when load from a
+        #    `snapshot_iter_100.pdz` then the next step to train is `101`,
+        #    naturally. But if iteration is increased increased after
+        #    extension(including snapshot), then, a `snapshot_iter_99` is
+        #    loaded. You would need a extra increasing of the iteration idex
+        #    before training to avoid another iteration `99`, which has been
+        #    done before snapshotting.
+        # 4. Thus iteration index represrnts "currently how mant epochs has
+        #    been done."
+        # NOTE: use report to capture the correctly value. If you want to
+        # report the learning rate used for a step, you must report it before
+        # the learning rate scheduler's step() has been called. In paddle's
+        # convention, we do not use an extension to change the learning rate.
+        # so if you want to report it, do it in the updater.
+
+        # Then here comes the next question. When is the proper time to
+        # increase the epoch index? Since all extensions are executed after
+        # updating, it is the time that after updating is the proper time to
+        # increase epoch index.
+        # 1. If we increase the epoch index before updating, then an extension
+        #    based ot epoch would miss the correct timing. It could only be
+        #    triggerd after an extra updating.
+        # 2. Theoretically, when an epoch is done, the epoch index should be
+        #    increased. So it would be increase after updating.
+        # 3. Thus, eppoch index represents "currently how many epochs has been
+        #    done." So it starts from 0.
+
+        # switch to training mode
+        for model in self.models.values():
+            model.train()
+
+        # training for a step is implemented here
+        batch = self.read_batch()
+        self.update_core(batch)
+
+        self.state.iteration += 1
+        if self.updates_per_epoch is not None:
+            if self.state.iteration % self.updates_per_epoch == 0:
+                self.state.epoch += 1
+
+    def update_core(self, batch):
+        """A simple case for a training step. Basic assumptions are:
+        Single model;
+        Single optimizer;
+        A batch from the dataloader is just the input of the model;
+        The model return a single loss, or a dict containing serval losses.
+        Parameters updates at every batch, no gradient accumulation.
+        """
+        loss = self.model(*batch)
+
+        if isinstance(loss, Tensor):
+            loss_dict = {"main": loss}
+        else:
+            # Dict[str, Tensor]
+            loss_dict = loss
+            if "main" not in loss_dict:
+                main_loss = 0
+                for loss_item in loss.values():
+                    main_loss += loss_item
+                loss_dict["main"] = main_loss
+
+        for name, loss_item in loss_dict.items():
+            report(name, float(loss_item))
+
+        self.optimizer.clear_gradient()
+        loss_dict["main"].backward()
+        self.optimizer.update()
+
+    @property
+    def updates_per_epoch(self):
+        """Number of updater per epoch, determined by the length of the
+        dataloader."""
+        length_of_dataloader = None
+        try:
+            length_of_dataloader = len(self.dataloader)
+        except TypeError:
+            logger.debug("This dataloader has no __len__.")
+        finally:
+            return length_of_dataloader
+
+    def new_epoch(self):
+        """Start a new epoch."""
+        # NOTE: all batch sampler for distributed training should
+        # subclass DistributedBatchSampler and implement `set_epoch` method
+        if hasattr(self.dataloader, "batch_sampler")
+            batch_sampler = self.dataloader.batch_sampler
+            if isinstance(batch_sampler, DistributedBatchSampler):
+                batch_sampler.set_epoch(self.state.epoch)
+        self.train_iterator = iter(self.dataloader)
+
+    def read_batch(self):
+        """Read a batch from the data loader, auto renew when data is exhausted."""
+        with timer() as t:
+            try:
+                batch = next(self.train_iterator)
+            except StopIteration:
+                self.new_epoch()
+                batch = next(self.train_iterator)
+            logger.debug(
+                f"Read a batch takes {t.elapse}s.")  # replace it with logger
+        return batch
+
+    def state_dict(self):
+        """State dict of a Updater, model, optimizer and updater state are included."""
+        state_dict = super().state_dict()
+        for name, model in self.models.items():
+            state_dict[f"{name}_params"] = model.state_dict()
+        for name, optim in self.optimizers.items():
+            state_dict[f"{name}_optimizer"] = optim.state_dict()
+        return state_dict
+
+    def set_state_dict(self, state_dict):
+        """Set state dict for a Updater. Parameters of models, states for
+        optimizers and UpdaterState are restored."""
+        for name, model in self.models.items():
+            model.set_state_dict(state_dict[f"{name}_params"])
+        for name, optim in self.optimizers.items():
+            optim.set_state_dict(state_dict[f"{name}_optimizer"])
+        super().set_state_dict(state_dict)
--- a/deepspeech/training/updaters/trainer.py
+++ b/deepspeech/training/updaters/trainer.py
@ -0,0 +1,171 @@
+import sys
+import traceback
+from collections import OrderedDict
+from pathlib import Path
+from typing import Callable
+from typing import List
+from typing import Union
+
+import six
+import tqdm
+
+from deepspeech.training.extensions.extension import Extension
+from deepspeech.training.extensions.extension import PRIORITY_READER
+from deepspeech.training.reporter import scope
+from deepspeech.training.triggers import get_trigger
+from deepspeech.training.triggers.limit_trigger import LimitTrigger
+from deepspeech.training.updaters.updater import UpdaterBase
+
+
+class _ExtensionEntry():
+    def __init__(self, extension, trigger, priority):
+        self.extension = extension
+        self.trigger = trigger
+        self.priority = priority
+
+
+class Trainer():
+    def __init__(self,
+                 updater: UpdaterBase,
+                 stop_trigger: Callable=None,
+                 out: Union[str, Path]='result',
+                 extensions: List[Extension]=None):
+        self.updater = updater
+        self.extensions = OrderedDict()
+        self.stop_trigger = LimitTrigger(*stop_trigger)
+        self.out = Path(out)
+        self.observation = None
+
+        self._done = False
+        if extensions:
+            for ext in extensions:
+                self.extend(ext)
+
+    @property
+    def is_before_training(self):
+        return self.updater.state.iteration == 0
+
+    def extend(self, extension, name=None, trigger=None, priority=None):
+        # get name for the extension
+        # argument \
+        # -> extention's name \
+        # -> default_name (class name, when it is an object) \
+        # -> function name when it is a function \
+        # -> error
+
+        if name is None:
+            name = getattr(extension, 'name', None)
+            if name is None:
+                name = getattr(extension, 'default_name', None)
+                if name is None:
+                    name = getattr(extension, '__name__', None)
+                    if name is None:
+                        raise ValueError("Name is not given for the extension.")
+        if name == 'training':
+            raise ValueError("training is a reserved name.")
+
+        if trigger is None:
+            trigger = getattr(extension, 'trigger', (1, 'iteration'))
+        trigger = get_trigger(trigger)
+
+        if priority is None:
+            priority = getattr(extension, 'priority', PRIORITY_READER)
+
+        # add suffix to avoid nameing conflict
+        ordinal = 0
+        modified_name = name
+        while modified_name in self.extensions:
+            ordinal += 1
+            modified_name = f"{name}_{ordinal}"
+        extension.name = modified_name
+
+        self.extensions[modified_name] = _ExtensionEntry(extension, trigger,
+                                                         priority)
+
+    def get_extension(self, name):
+        """get extension by name."""
+        extensions = self.extensions
+        if name in extensions:
+            return extensions[name].extension
+        else:
+            raise ValueError(f'extension {name} not found')
+
+    def run(self):
+        if self._done:
+            raise RuntimeError("Training is already done!.")
+
+        self.out.mkdir(parents=True, exist_ok=True)
+
+        # sort extensions by priorities once
+        extension_order = sorted(
+            self.extensions.keys(),
+            key=lambda name: self.extensions[name].priority,
+            reverse=True)
+        extensions = [(name, self.extensions[name]) for name in extension_order]
+
+        # initializing all extensions
+        for name, entry in extensions:
+            if hasattr(entry.extension, "initialize"):
+                entry.extension.initialize(self)
+
+        update = self.updater.update  # training step
+        stop_trigger = self.stop_trigger
+
+        # display only one progress bar
+        max_iteration = None
+        if isinstance(stop_trigger, LimitTrigger):
+            if stop_trigger.unit == 'epoch':
+                max_epoch = self.stop_trigger.limit
+                updates_per_epoch = getattr(self.updater, "updates_per_epoch",
+                                            None)
+                max_iteration = max_epoch * updates_per_epoch if updates_per_epoch else None
+            else:
+                max_iteration = self.stop_trigger.limit
+
+        p = tqdm.tqdm(initial=self.updater.state.iteration, total=max_iteration)
+
+        try:
+            while not stop_trigger(self):
+                self.observation = {}
+                # set observation as the report target
+                # you can use report freely in Updater.update()
+
+                # updating parameters and state
+                with scope(self.observation):
+                    update()
+                    p.update()
+
+                    # execute extension when necessary
+                    for name, entry in extensions:
+                        if entry.trigger(self):
+                            entry.extension(self)
+
+                # print("###", self.observation)
+        except Exception as e:
+            f = sys.stderr
+            f.write(f"Exception in main training loop: {e}\n")
+            f.write("Traceback (most recent call last):\n")
+            traceback.print_tb(sys.exc_info()[2])
+            f.write(
+                "Trainer extensions will try to handle the extension. Then all extensions will finalize."
+            )
+
+            # capture the exception in the mian training loop
+            exc_info = sys.exc_info()
+
+            # try to handle it
+            for name, entry in extensions:
+                if hasattr(entry.extension, "on_error"):
+                    try:
+                        entry.extension.on_error(self, e, sys.exc_info()[2])
+                    except Exception as ee:
+                        f.write(f"Exception in error handler: {ee}\n")
+                        f.write('Traceback (most recent call last):\n')
+                        traceback.print_tb(sys.exc_info()[2])
+
+            # raise exception in main training loop
+            six.reraise(*exc_info)
+        finally:
+            for name, entry in extensions:
+                if hasattr(entry.extension, "finalize"):
+                    entry.extension.finalize(self)
--- a/deepspeech/training/updaters/updater.py
+++ b/deepspeech/training/updaters/updater.py
@ -0,0 +1,82 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from dataclasses import dataclass
+import paddle
+
+from deepspeech.utils.log import Log
+
+__all__ = ["UpdaterBase", "UpdaterState"]
+
+logger = Log(__name__).getlog()
+
+
+@dataclass
+class UpdaterState:
+    iteration: int = 0
+    epoch: int = 0
+
+
+class UpdaterBase():
+    """An updater is the abstraction of how a model is trained given the
+    dataloader and the optimizer.
+    The `update_core` method is a step in the training loop with only necessary
+    operations (get a batch, forward and backward, update the parameters).
+    Other stuffs are made extensions. Visualization, saving, loading and
+    periodical validation and evaluation are not considered here.
+    But even in such simplist case, things are not that simple. There is an
+    attempt to standardize this process and requires only the model and
+    dataset and do all the stuffs automatically. But this may hurt flexibility.
+    If we assume a batch yield from the dataloader is just the input to the
+    model, we will find that some model requires more arguments, or just some
+    keyword arguments. But this prevents us from over-simplifying it.
+    From another perspective, the batch may includes not just the input, but
+    also the target. But the model's forward method may just need the input.
+    We can pass a dict or a super-long tuple to the model and let it pick what
+    it really needs. But this is an abuse of lazy interface.
+    After all, we care about how a model is trained. But just how the model is
+    used for inference. We want to control how a model is trained. We just
+    don't want to be messed up with other auxiliary code.
+    So the best practice is to define a model and define a updater for it.
+    """
+
+    def __init__(self, init_state=None):
+        if init_state is None:
+            self.state = UpdaterState()
+        else:
+            self.state = init_state
+
+    def update(self, batch):
+        raise NotImplementedError(
+            "Implement your own `update` method for training a step.")
+
+    def state_dict(self):
+        state_dict = {
+            "epoch": self.state.epoch,
+            "iteration": self.state.iteration,
+        }
+        return state_dict
+
+    def set_state_dict(self, state_dict):
+        self.state.epoch = state_dict["epoch"]
+        self.state.iteration = state_dict["iteration"]
+
+    def save(self, path):
+        logger.debug(f"Saving to {path}.")
+        archive = self.state_dict()
+        paddle.save(archive, str(path))
+
+    def load(self, path):
+        logger.debug(f"Loading from {path}.")
+        archive = paddle.load(str(path))
+        self.set_state_dict(archive)
--- a/requirements.txt
+++ b/requirements.txt
@ -15,3 +15,4 @@ tensorboardX
 textgrid
 typeguard
 yacs
+jsonlines