PaddleSpeech/deepspeech/utils/checkpoint.py

# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import time
import logging
import numpy as np

import paddle
from paddle import distributed as dist
from paddle.nn import Layer
from paddle.optimizer import Optimizer

from deepspeech.utils import mp_tools

logger = logging.getLogger(__name__)

__all__ = ["load_parameters", "save_parameters"]


def _load_latest_checkpoint(checkpoint_dir: str) -> int:
    """Get the iteration number corresponding to the latest saved checkpoint.

    The record file ``<checkpoint_dir>/checkpoint`` holds one line per saved
    checkpoint in the form ``model_checkpoint_path:step-<iteration>``; the
    last non-blank line is taken as the latest checkpoint.

    Args:
        checkpoint_dir (str): the directory where checkpoint is saved.
    Returns:
        int: the latest iteration number, or 0 when the record file is
            missing or contains no records.
    Raises:
        ValueError: if the last record does not end in an integer.
    """
    checkpoint_record = os.path.join(checkpoint_dir, "checkpoint")
    if not os.path.isfile(checkpoint_record):
        return 0

    # Skip blank lines so an empty file or a trailing newline cannot crash
    # the lookup (readlines()[-1] would raise IndexError / int("") ValueError).
    with open(checkpoint_record, "rt") as handle:
        records = [line.strip() for line in handle if line.strip()]
    if not records:
        return 0

    # Fetch the latest checkpoint index: "<key>:step-<iteration>".
    step = records[-1].split(":")[-1]
    return int(step.split("-")[-1])


def _save_checkpoint(checkpoint_dir: str, iteration: int):
    """Record ``iteration`` as the most recently checkpointed step.

    One line of the form ``model_checkpoint_path:step-<iteration>`` is
    appended to the file ``<checkpoint_dir>/checkpoint``; earlier lines are
    left in place.

    Args:
        checkpoint_dir (str): the directory where checkpoint is saved.
        iteration (int): the latest iteration number.
    Returns:
        None
    """
    record_file = os.path.join(checkpoint_dir, "checkpoint")
    # Append mode: the file accumulates one record per saved checkpoint.
    with open(record_file, "a+") as out:
        entry = "model_checkpoint_path:step-{}\n".format(iteration)
        out.write(entry)


def load_parameters(model,
                    optimizer=None,
                    checkpoint_dir=None,
                    checkpoint_path=None):
    """Restore model (and optionally optimizer) state from a checkpoint.

    The checkpoint is addressed by a path prefix: parameters are read from
    ``<prefix>.pdparams`` and, when an optimizer is passed and the file
    exists, optimizer state is read from ``<prefix>.pdopt``.

    Args:
        model (Layer): model to load parameters.
        optimizer (Optimizer, optional): optimizer to load states if needed.
            Defaults to None.
        checkpoint_dir (str, optional): the directory where checkpoint is
            saved; the latest recorded step in it is loaded.
        checkpoint_path (str, optional): checkpoint path prefix ending in
            ``-<iteration>``. If specified, it is used and 'checkpoint_dir'
            is ignored. Defaults to None.
    Returns:
        iteration (int): number of iterations that the loaded checkpoint has
            been trained. 0 is returned, and nothing is loaded, when
            'checkpoint_dir' has no recorded checkpoint.
    Raises:
        ValueError: if neither 'checkpoint_dir' nor 'checkpoint_path' is given.
    """
    if checkpoint_path is None and checkpoint_dir is None:
        raise ValueError(
            "At least one of 'checkpoint_dir' and 'checkpoint_path' should be specified!"
        )

    if checkpoint_path is not None:
        # An explicit prefix wins; its trailing "-<n>" is the iteration.
        prefix_name = os.path.basename(checkpoint_path)
        iteration = int(prefix_name.split("-")[-1])
    else:
        iteration = _load_latest_checkpoint(checkpoint_dir)
        if iteration == 0:
            # Nothing has been checkpointed yet: start from scratch.
            return iteration
        step_name = "step-{}".format(iteration)
        checkpoint_path = os.path.join(checkpoint_dir, step_name)

    rank = dist.get_rank()

    params_path = checkpoint_path + ".pdparams"
    model.set_state_dict(paddle.load(params_path))
    model_msg = "[checkpoint] Rank {}: loaded model from {}".format(
        rank, params_path)
    logger.info(model_msg)

    optimizer_path = checkpoint_path + ".pdopt"
    # Optimizer state is optional: skipped silently when the file is absent.
    if optimizer and os.path.isfile(optimizer_path):
        optimizer.set_state_dict(paddle.load(optimizer_path))
        opt_msg = "[checkpoint] Rank {}: loaded optimizer state from {}".format(
            rank, optimizer_path)
        logger.info(opt_msg)

    return iteration


@mp_tools.rank_zero_only
def save_parameters(checkpoint_dir, iteration, model, optimizer=None):
    """Checkpoint the latest trained model parameters.

    Writes ``<checkpoint_dir>/step-<iteration>.pdparams`` and, when an
    optimizer is given, ``<checkpoint_dir>/step-<iteration>.pdopt``, then
    appends the step to the checkpoint record file.

    NOTE(review): the ``rank_zero_only`` decorator presumably limits the save
    to the rank-0 process -- confirm against ``mp_tools``.

    Args:
        checkpoint_dir (str): the directory where checkpoint is saved.
        iteration (int): the latest iteration number.
        model (Layer): model to be checkpointed.
        optimizer (Optimizer, optional): optimizer to be checkpointed.
            Defaults to None.
    Returns:
        None
    """
    checkpoint_path = os.path.join(checkpoint_dir, "step-{}".format(iteration))

    params_path = checkpoint_path + ".pdparams"
    paddle.save(model.state_dict(), params_path)
    logger.info("[checkpoint] Saved model to {}".format(params_path))

    if optimizer:
        optimizer_path = checkpoint_path + ".pdopt"
        paddle.save(optimizer.state_dict(), optimizer_path)
        logger.info(
            "[checkpoint] Saved optimizer state to {}".format(optimizer_path))

    # Update the record last, so it is only written once the saves above
    # have returned without raising.
    _save_checkpoint(checkpoint_dir, iteration)
Support paddle 2.x (#538) * 2.x model * model test pass * fix data * fix soundfile with flac support * one thread dataloader test pass * export feasture size add trainer and utils add setup model and dataloader update travis using Bionic dist * add venv; test under venv * fix unittest; train and valid * add train and config * add config and train script * fix ctc cuda memcopy error * fix imports * fix train valid log * fix dataset batch shuffle shift start from 1 fix rank_zero_only decreator error close tensorboard when train over add decoding config and code * test process can run * test with decoding * test and infer with decoding * fix infer * fix ctc loss lr schedule sortagrad logger * aishell egs * refactor train add aishell egs * fix dataset batch shuffle and add batch sampler log print model parameter * fix model and ctc * sequence_mask make all inputs zeros, which cause grad be zero, this is a bug of LessThanOp add grad clip by global norm add model train test notebook * ctc loss remove run prefix using ord value as text id * using unk when training compute_loss need text ids ord id using in test mode, which compute wer/cer * fix tester * add lr_deacy refactor code * fix tools * fix ci add tune fix gru model bugs add dataset and model test * fix decoding * refactor repo fix decoding * fix musan and rir dataset * refactor io, loss, conv, rnn, gradclip, model, utils * fix ci and import * refactor model add export jit model * add deploy bin and test it * rm uselss egs * add layer tools * refactor socket server new model from pretrain * remve useless * fix instability loss and grad nan or inf for librispeech training * fix sampler * fix libri train.sh * fix doc * add license on cpp * fix doc * fix libri script * fix install * clip 5 wer 7.39, clip 400 wer 7.54, 1.8 clip 400 baseline 7.49 4 years ago			`# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.`
			`#`
			`# Licensed under the Apache License, Version 2.0 (the "License");`
			`# you may not use this file except in compliance with the License.`
			`# You may obtain a copy of the License at`
			`#`
			`# http://www.apache.org/licenses/LICENSE-2.0`
			`#`
			`# Unless required by applicable law or agreed to in writing, software`
			`# distributed under the License is distributed on an "AS IS" BASIS,`
			`# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`# See the License for the specific language governing permissions and`
			`# limitations under the License.`

			`import os`
			`import time`
			`import logging`
			`import numpy as np`

			`import paddle`
			`from paddle import distributed as dist`
			`from paddle.nn import Layer`
			`from paddle.optimizer import Optimizer`

			`from deepspeech.utils import mp_tools`

			`logger = logging.getLogger(__name__)`

			`__all__ = ["load_parameters", "save_parameters"]`


			`def _load_latest_checkpoint(checkpoint_dir: str) -> int:`
			`"""Get the iteration number corresponding to the latest saved checkpoint.`
			`Args:`
			`checkpoint_dir (str): the directory where checkpoint is saved.`
			`Returns:`
			`int: the latest iteration number.`
			`"""`
			`checkpoint_record = os.path.join(checkpoint_dir, "checkpoint")`
			`if (not os.path.isfile(checkpoint_record)):`
			`return 0`

			`# Fetch the latest checkpoint index.`
			`with open(checkpoint_record, "rt") as handle:`
			`latest_checkpoint = handle.readlines()[-1].strip()`
			`step = latest_checkpoint.split(":")[-1]`
			`iteration = int(step.split("-")[-1])`

			`return iteration`


			`def _save_checkpoint(checkpoint_dir: str, iteration: int):`
			`"""Save the iteration number of the latest model to be checkpointed.`
			`Args:`
			`checkpoint_dir (str): the directory where checkpoint is saved.`
			`iteration (int): the latest iteration number.`
			`Returns:`
			`None`
			`"""`
			`checkpoint_record = os.path.join(checkpoint_dir, "checkpoint")`
			`# Update the latest checkpoint index.`
			`with open(checkpoint_record, "a+") as handle:`
			`handle.write("model_checkpoint_path:step-{}\n".format(iteration))`


			`def load_parameters(model,`
			`optimizer=None,`
			`checkpoint_dir=None,`
			`checkpoint_path=None):`
			`"""Load a specific model checkpoint from disk.`
			`Args:`
			`model (Layer): model to load parameters.`
			`optimizer (Optimizer, optional): optimizer to load states if needed.`
			`Defaults to None.`
			`checkpoint_dir (str, optional): the directory where checkpoint is saved.`
			`checkpoint_path (str, optional): if specified, load the checkpoint`
			`stored in the checkpoint_path and the argument 'checkpoint_dir' will`
			`be ignored. Defaults to None.`
			`Returns:`
			`iteration (int): number of iterations that the loaded checkpoint has`
			`been trained.`
			`"""`
			`if checkpoint_path is not None:`
			`iteration = int(os.path.basename(checkpoint_path).split("-")[-1])`
			`elif checkpoint_dir is not None:`
			`iteration = _load_latest_checkpoint(checkpoint_dir)`
			`if iteration == 0:`
			`return iteration`
			`checkpoint_path = os.path.join(checkpoint_dir,`
			`"step-{}".format(iteration))`
			`else:`
			`raise ValueError(`
			`"At least one of 'checkpoint_dir' and 'checkpoint_path' should be specified!"`
			`)`

			`rank = dist.get_rank()`

			`params_path = checkpoint_path + ".pdparams"`
			`model_dict = paddle.load(params_path)`
			`model.set_state_dict(model_dict)`
			`logger.info(`
			`"[checkpoint] Rank {}: loaded model from {}".format(rank, params_path))`

			`optimizer_path = checkpoint_path + ".pdopt"`
			`if optimizer and os.path.isfile(optimizer_path):`
			`optimizer_dict = paddle.load(optimizer_path)`
			`optimizer.set_state_dict(optimizer_dict)`
			`logger.info("[checkpoint] Rank {}: loaded optimizer state from {}".`
			`format(rank, optimizer_path))`

			`return iteration`


			`@mp_tools.rank_zero_only`
			`def save_parameters(checkpoint_dir, iteration, model, optimizer=None):`
			`"""Checkpoint the latest trained model parameters.`
			`Args:`
			`checkpoint_dir (str): the directory where checkpoint is saved.`
			`iteration (int): the latest iteration number.`
			`model (Layer): model to be checkpointed.`
			`optimizer (Optimizer, optional): optimizer to be checkpointed.`
			`Defaults to None.`
			`Returns:`
			`None`
			`"""`
			`checkpoint_path = os.path.join(checkpoint_dir, "step-{}".format(iteration))`

			`model_dict = model.state_dict()`
			`params_path = checkpoint_path + ".pdparams"`
			`paddle.save(model_dict, params_path)`
			`logger.info("[checkpoint] Saved model to {}".format(params_path))`

			`if optimizer:`
			`opt_dict = optimizer.state_dict()`
			`optimizer_path = checkpoint_path + ".pdopt"`
			`paddle.save(opt_dict, optimizer_path)`
			`logger.info(`
			`"[checkpoint] Saved optimzier state to {}".format(optimizer_path))`

			`_save_checkpoint(checkpoint_dir, iteration)`