diff --git a/deepspeech/exps/u2/bin/train.py b/deepspeech/exps/u2/bin/train.py index fef615ce..b664401a 100644 --- a/deepspeech/exps/u2/bin/train.py +++ b/deepspeech/exps/u2/bin/train.py @@ -21,6 +21,7 @@ from deepspeech.exps.u2.config import get_cfg_defaults from deepspeech.exps.u2.model import U2Trainer as Trainer from deepspeech.training.cli import default_argument_parser from deepspeech.utils.utility import print_arguments + # from deepspeech.exps.u2.trainer import U2Trainer as Trainer diff --git a/deepspeech/exps/u2/model.py b/deepspeech/exps/u2/model.py index 2b6e2433..67b666ed 100644 --- a/deepspeech/exps/u2/model.py +++ b/deepspeech/exps/u2/model.py @@ -204,6 +204,7 @@ class U2Trainer(Trainer): msg += "lr: {:>.8f}, ".format(self.lr_scheduler()) msg += "data time: {:>.3f}s, ".format(dataload_time) self.train_batch(batch_index, batch, msg) + self.after_train_batch() data_start_time = time.time() except Exception as e: logger.error(e) diff --git a/deepspeech/exps/u2_kaldi/model.py b/deepspeech/exps/u2_kaldi/model.py index 095dfe34..3d15e025 100644 --- a/deepspeech/exps/u2_kaldi/model.py +++ b/deepspeech/exps/u2_kaldi/model.py @@ -205,6 +205,7 @@ class U2Trainer(Trainer): msg += "lr: {:>.8f}, ".format(self.lr_scheduler()) msg += "data time: {:>.3f}s, ".format(dataload_time) self.train_batch(batch_index, batch, msg) + self.after_train_batch() data_start_time = time.time() except Exception as e: logger.error(e) diff --git a/deepspeech/exps/u2_st/model.py b/deepspeech/exps/u2_st/model.py index 8dca1654..91a81503 100644 --- a/deepspeech/exps/u2_st/model.py +++ b/deepspeech/exps/u2_st/model.py @@ -222,6 +222,7 @@ class U2STTrainer(Trainer): msg += "lr: {:>.8f}, ".format(self.lr_scheduler()) msg += "data time: {:>.3f}s, ".format(dataload_time) self.train_batch(batch_index, batch, msg) + self.after_train_batch() data_start_time = time.time() except Exception as e: logger.error(e) diff --git a/deepspeech/training/cli.py b/deepspeech/training/cli.py index 
7f4bb804..1477bdfe 100644 --- a/deepspeech/training/cli.py +++ b/deepspeech/training/cli.py @@ -63,8 +63,13 @@ def default_argument_parser(): parser.add_argument("--opts", type=str, default=[], nargs='+', help="options to overwrite --config file and the default config, passing in KEY VALUE pairs") + # random seed parser.add_argument("--seed", type=int, default=None, help="seed to use for paddle, np and random. None or 0 for random, else set seed.") + + # profiler + parser.add_argument('--profiler_options', type=str, default=None, + help='The option of profiler, which should be in format \"key1=value1;key2=value2;key3=value3\".') # yapd: enable return parser diff --git a/deepspeech/training/trainer.py b/deepspeech/training/trainer.py index 7959b41b..bdb68310 100644 --- a/deepspeech/training/trainer.py +++ b/deepspeech/training/trainer.py @@ -20,6 +20,7 @@ from tensorboardX import SummaryWriter from deepspeech.training.timer import Timer from deepspeech.utils import mp_tools +from deepspeech.utils import profiler from deepspeech.utils.checkpoint import Checkpoint from deepspeech.utils.log import Log from deepspeech.utils.utility import seed_all @@ -183,6 +184,9 @@ class Trainer(): if isinstance(batch_sampler, paddle.io.DistributedBatchSampler): batch_sampler.set_epoch(self.epoch) + def after_train_batch(self): + profiler.add_profiler_step(self.args.profiler_options) + def train(self): """The training process control by epoch.""" from_scratch = self.resume_or_scratch() @@ -209,6 +213,7 @@ class Trainer(): msg += "lr: {:>.8f}, ".format(self.lr_scheduler()) msg += "data time: {:>.3f}s, ".format(dataload_time) self.train_batch(batch_index, batch, msg) + self.after_train_batch() data_start_time = time.time() except Exception as e: logger.error(e) diff --git a/deepspeech/utils/profiler.py b/deepspeech/utils/profiler.py new file mode 100644 index 00000000..5b8389be --- /dev/null +++ b/deepspeech/utils/profiler.py @@ -0,0 +1,116 @@ +# copyright (c) 2021 PaddlePaddle Authors. 
All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import sys + +import paddle + +from deepspeech.utils.log import Log + +logger = Log(__name__).getlog() + +# A global variable to record the number of calling times for profiler +# functions. It is used to specify the tracing range of training steps. +_profiler_step_id = 0 + +# A global variable to avoid parsing from string every time. +_profiler_options = None + + +class ProfilerOptions(object): + ''' + Use a string to initialize a ProfilerOptions. + The string should be in the format: "key1=value1;key2=value2;key3=value3". + For example: + "profile_path=model.profile" + "batch_range=[50, 60]; profile_path=model.profile" + "batch_range=[50, 60]; tracer_option=OpDetail; profile_path=model.profile" + ProfilerOptions supports the following key-value pairs: + batch_range - an integer list, e.g. [100, 110]. + state - a string, the optional values are 'CPU', 'GPU' or 'All'. + sorted_key - a string, the optional values are 'calls', 'total', + 'max', 'min' or 'ave'. + tracer_option - a string, the optional values are 'Default', 'OpDetail', + 'AllOpDetail'. + profile_path - a string, the path to save the serialized profile data, + which can be used to generate a timeline. + exit_on_finished - a boolean. 
+ ''' + + def __init__(self, options_str): + assert isinstance(options_str, str) + + self._options = { + 'batch_range': [10, 20], + 'state': 'All', + 'sorted_key': 'total', + 'tracer_option': 'Default', + 'profile_path': '/tmp/profile', + 'exit_on_finished': True + } + self._parse_from_string(options_str) + + def _parse_from_string(self, options_str): + for kv in options_str.replace(' ', '').split(';'): + key, value = kv.split('=') + if key == 'batch_range': + value_list = value.replace('[', '').replace(']', '').split(',') + value_list = list(map(int, value_list)) + if len(value_list) >= 2 and value_list[0] >= 0 and value_list[ + 1] > value_list[0]: + self._options[key] = value_list + elif key == 'exit_on_finished': + self._options[key] = value.lower() in ("yes", "true", "t", "1") + elif key in [ + 'state', 'sorted_key', 'tracer_option', 'profile_path' + ]: + self._options[key] = value + + def __getitem__(self, name): + if self._options.get(name, None) is None: + raise ValueError( + "ProfilerOptions does not have an option named %s." % name) + return self._options[name] + + +def add_profiler_step(options_str=None): + ''' + Enable the operator-level timing using PaddlePaddle's profiler. + The profiler uses an independent variable to count the profiler steps. + One call of this function is treated as a profiler step. + + Args: + options_str - a string to initialize the ProfilerOptions. + Default is None, and the profiler is disabled. 
+ ''' + if options_str is None: + return + + global _profiler_step_id + global _profiler_options + + if _profiler_options is None: + _profiler_options = ProfilerOptions(options_str) + logger.info(f"{options_str}") + logger.info(f"{_profiler_options._options}") + + if _profiler_step_id == _profiler_options['batch_range'][0]: + paddle.utils.profiler.start_profiler(_profiler_options['state'], + _profiler_options['tracer_option']) + elif _profiler_step_id == _profiler_options['batch_range'][1]: + paddle.utils.profiler.stop_profiler(_profiler_options['sorted_key'], + _profiler_options['profile_path']) + if _profiler_options['exit_on_finished']: + sys.exit(0) + + _profiler_step_id += 1 diff --git a/examples/aishell/s1/local/train.sh b/examples/aishell/s1/local/train.sh index f905b766..e065ad6a 100755 --- a/examples/aishell/s1/local/train.sh +++ b/examples/aishell/s1/local/train.sh @@ -1,38 +1,45 @@ #!/bin/bash -if [ $# != 2 ];then - echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name" - exit -1 -fi + +profiler_options= + +# seed may break model convergence +seed=0 + +source ${MAIN_ROOT}/utils/parse_options.sh || exit 1; ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." -config_path=$1 -ckpt_name=$2 - device=gpu if [ ${ngpu} == 0 ];then device=cpu fi -echo "using ${device}..." - -mkdir -p exp -# seed may break model convergence -seed=0 -if [ ${seed} != 0 ]; then +if [ ${seed} != 0 ]; then export FLAGS_cudnn_deterministic=True + echo "using seed $seed & FLAGS_cudnn_deterministic=True ..." 
fi +if [ $# != 2 ];then + echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name" + exit -1 +fi + +config_path=$1 +ckpt_name=$2 + +mkdir -p exp + python3 -u ${BIN_DIR}/train.py \ --device ${device} \ --nproc ${ngpu} \ --config ${config_path} \ --output exp/${ckpt_name} \ +--profiler_options ${profiler_options} \ --seed ${seed} -if [ ${seed} != 0 ]; then +if [ ${seed} != 0 ]; then unset FLAGS_cudnn_deterministic fi @@ -41,4 +48,4 @@ if [ $? -ne 0 ]; then exit 1 fi -exit 0 +exit 0 \ No newline at end of file diff --git a/examples/librispeech/s0/conf/deepspeech2.yaml b/examples/librispeech/s0/conf/deepspeech2.yaml index d5b1ed91..3f1a376f 100644 --- a/examples/librispeech/s0/conf/deepspeech2.yaml +++ b/examples/librispeech/s0/conf/deepspeech2.yaml @@ -11,7 +11,7 @@ data: max_output_input_ratio: .inf collator: - batch_size: 15 + batch_size: 20 mean_std_filepath: data/mean_std.json unit_type: char vocab_filepath: data/vocab.txt @@ -45,7 +45,7 @@ model: training: n_epoch: 50 - accum_grad: 4 + accum_grad: 1 lr: 1e-3 lr_decay: 0.83 weight_decay: 1e-06 diff --git a/examples/tiny/s0/local/train.sh b/examples/tiny/s0/local/train.sh index ea29b7fc..a657ce34 100755 --- a/examples/tiny/s0/local/train.sh +++ b/examples/tiny/s0/local/train.sh @@ -1,36 +1,44 @@ #!/bin/bash -if [ $# != 3 ];then - echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name model_type" - exit -1 -fi +profiler_options= + +# seed may break model convergence +seed=0 + +source ${MAIN_ROOT}/utils/parse_options.sh || exit 1; ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." -config_path=$1 -ckpt_name=$2 -model_type=$3 - device=gpu if [ ${ngpu} == 0 ];then device=cpu fi -mkdir -p exp - -# seed may break model convergence -seed=0 if [ ${seed} != 0 ]; then export FLAGS_cudnn_deterministic=True + echo "using seed $seed & FLAGS_cudnn_deterministic=True ..." 
+fi + + +if [ $# != 3 ];then + echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name model_type" + exit -1 fi +config_path=$1 +ckpt_name=$2 +model_type=$3 + +mkdir -p exp + python3 -u ${BIN_DIR}/train.py \ --device ${device} \ --nproc ${ngpu} \ --config ${config_path} \ --output exp/${ckpt_name} \ --model_type ${model_type} \ +--profiler_options ${profiler_options} \ --seed ${seed} if [ ${seed} != 0 ]; then