add benchmark flags, and logic

pull/837/head
Hui Zhang 3 years ago
parent dc2cdbf3fb
commit cda6ca8323

@ -100,7 +100,8 @@ class U2Trainer(Trainer):
# Disable gradient synchronizations across DDP processes. # Disable gradient synchronizations across DDP processes.
# Within this context, gradients will be accumulated on module # Within this context, gradients will be accumulated on module
# variables, which will later be synchronized. # variables, which will later be synchronized.
context = self.model.no_sync # When using cpu w/o DDP, model does not have `no_sync`
context = self.model.no_sync if self.parallel else nullcontext
else: else:
# Used for single gpu training and DDP gradient synchronization # Used for single gpu training and DDP gradient synchronization
# processes. # processes.

@ -44,32 +44,24 @@ def default_argument_parser():
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
# yapf: disable # yapf: disable
# data and output train_group = parser.add_argument_group(title='Train Options', description=None)
parser.add_argument("--config", metavar="FILE", help="path of the config file to overwrite to default config with.") train_group.add_argument("--seed", type=int, default=None,
parser.add_argument("--dump-config", metavar="FILE", help="dump config to yaml file.")
parser.add_argument("--output", metavar="OUTPUT_DIR", help="path to save checkpoint and logs.")
# load from saved checkpoint
parser.add_argument("--checkpoint_path", type=str, help="path of the checkpoint to load")
# running
parser.add_argument("--device", type=str, default='gpu', choices=["cpu", "gpu"],
help="device type to use, cpu and gpu are supported.")
parser.add_argument("--nprocs", type=int, default=1, help="number of parallel processes to use.")
# overwrite extra config and default config
# parser.add_argument("--opts", nargs=argparse.REMAINDER,
# help="options to overwrite --config file and the default config, passing in KEY VALUE pairs")
parser.add_argument("--opts", type=str, default=[], nargs='+',
help="options to overwrite --config file and the default config, passing in KEY VALUE pairs")
# random seed
parser.add_argument("--seed", type=int, default=None,
help="seed to use for paddle, np and random. None or 0 for random, else set seed.") help="seed to use for paddle, np and random. None or 0 for random, else set seed.")
train_group.add_argument("--device", type=str, default='gpu', choices=["cpu", "gpu"],
# profiler help="device cpu and gpu are supported.")
parser.add_argument('--profiler_options', type=str, default=None, train_group.add_argument("--nprocs", type=int, default=1, help="number of parallel processes. 0 for cpu.")
train_group.add_argument("--config", metavar="CONFIG_FILE", help="config file.")
train_group.add_argument("--output", metavar="CKPT_DIR", help="path to save checkpoint.")
train_group.add_argument("--checkpoint_path", type=str, help="path to load checkpoint")
train_group.add_argument("--opts", type=str, default=[], nargs='+',
help="overwrite --config file, passing in LIST[KEY VALUE] pairs")
train_group.add_argument("--dump-config", metavar="FILE", help="dump config to `this` file.")
bech_group = parser.add_argument_group(title='Benchmark Options', description=None)
bech_group.add_argument('--profiler-options', type=str, default=None,
help='The option of profiler, which should be in format \"key1=value1;key2=value2;key3=value3\".') help='The option of profiler, which should be in format \"key1=value1;key2=value2;key3=value3\".')
bech_group.add_argument('--benchmark-batch-size', type=int, default=None, help='batch size for benchmark.')
bech_group.add_argument('--benchmark-max-step', type=int, default=None, help='max iteration for benchmark.')
# yapf: enable # yapf: enable
return parser return parser

@ -11,6 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import sys
import time import time
from pathlib import Path from pathlib import Path
@ -24,6 +25,7 @@ from deepspeech.utils import profiler
from deepspeech.utils.checkpoint import Checkpoint from deepspeech.utils.checkpoint import Checkpoint
from deepspeech.utils.log import Log from deepspeech.utils.log import Log
from deepspeech.utils.utility import seed_all from deepspeech.utils.utility import seed_all
from deepspeech.utils.utility import UpdateConfig
__all__ = ["Trainer"] __all__ = ["Trainer"]
@ -101,6 +103,12 @@ class Trainer():
seed_all(args.seed) seed_all(args.seed)
logger.info(f"Set seed {args.seed}") logger.info(f"Set seed {args.seed}")
if self.args.benchmark_batch_size:
with UpdateConfig(self.config):
self.config.collator.batch_size = self.args.benchmark_batch_size
logger.info(
f"Benchmark reset batch-size: {self.args.benchmark_batch_size}")
def setup(self): def setup(self):
"""Setup the experiment. """Setup the experiment.
""" """
@ -188,6 +196,12 @@ class Trainer():
if self.args.profiler_options: if self.args.profiler_options:
profiler.add_profiler_step(self.args.profiler_options) profiler.add_profiler_step(self.args.profiler_options)
if self.args.benchmark_max_step and self.iteration > self.args.benchmark_max_step:
logger.info(
f"Reach benchmark-max-step: {self.args.benchmark_max_step}")
sys.exit(
f"Reach benchmark-max-step: {self.args.benchmark_max_step}")
def train(self): def train(self):
"""The training process control by epoch.""" """The training process control by epoch."""
from_scratch = self.resume_or_scratch() from_scratch = self.resume_or_scratch()

@ -16,15 +16,27 @@ import distutils.util
import math import math
import os import os
import random import random
from contextlib import contextmanager
from typing import List from typing import List
import numpy as np import numpy as np
import paddle import paddle
__all__ = ["seed_all", 'print_arguments', 'add_arguments', "log_add"] __all__ = [
"UpdateConfig", "seed_all", 'print_arguments', 'add_arguments', "log_add"
]
@contextmanager
def UpdateConfig(config):
    """Temporarily unfreeze a yacs config so it can be modified.

    Calls ``config.defrost()`` on entry and ``config.freeze()`` on exit.

    Args:
        config: Object exposing ``defrost()`` and ``freeze()`` methods
            (a yacs config node, per the original docstring).

    Yields:
        None. Mutate ``config`` directly inside the ``with`` body.
    """
    config.defrost()
    try:
        yield
    finally:
        # Re-freeze even when the body raises, so a failed update cannot
        # leave the config mutable for the rest of the run.
        config.freeze()
def seed_all(seed: int=210329): def seed_all(seed: int=210329):
"""freeze random generator seed."""
np.random.seed(seed) np.random.seed(seed)
random.seed(seed) random.seed(seed)
paddle.seed(seed) paddle.seed(seed)

@ -1,7 +1,8 @@
#!/bin/bash #!/bin/bash
profiler_options= profiler_options=
benchmark_batch_size=
benchmark_max_step=
# seed may break model convergence # seed may break model convergence
seed=0 seed=0
@ -32,12 +33,15 @@ ckpt_name=$2
mkdir -p exp mkdir -p exp
python3 -u ${BIN_DIR}/train.py \ python3 -u ${BIN_DIR}/train.py \
--seed ${seed} \
--device ${device} \ --device ${device} \
--nproc ${ngpu} \ --nproc ${ngpu} \
--config ${config_path} \ --config ${config_path} \
--output exp/${ckpt_name} \ --output exp/${ckpt_name} \
--profiler_options ${profiler_options} \ --profiler-options "${profiler_options}" \
--seed ${seed} --benchmark-batch-size ${benchmark_batch_size} \
--benchmark-max-step ${benchmark_max_step}
if [ ${seed} != 0 ]; then if [ ${seed} != 0 ]; then
unset FLAGS_cudnn_deterministic unset FLAGS_cudnn_deterministic

@ -1,37 +1,49 @@
#!/bin/bash #!/bin/bash
if [ $# != 2 ];then profiler_options=
echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name" benchmark_batch_size=
exit -1 benchmark_max_step=
fi
# seed may break model convergence
seed=0
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..." echo "using $ngpu gpus..."
config_path=$1
ckpt_name=$2
device=gpu device=gpu
if [ ${ngpu} == 0 ];then if [ ${ngpu} == 0 ];then
device=cpu device=cpu
fi fi
mkdir -p exp if [ ${seed} != 0 ]; then
# seed may break model convergence
seed=0
if [ ${seed} != 0 ]; then
export FLAGS_cudnn_deterministic=True export FLAGS_cudnn_deterministic=True
echo "using seed $seed & FLAGS_cudnn_deterministic=True ..."
fi
if [ $# != 2 ];then
echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name"
exit -1
fi fi
config_path=$1
ckpt_name=$2
mkdir -p exp
python3 -u ${BIN_DIR}/train.py \ python3 -u ${BIN_DIR}/train.py \
--seed ${seed} \
--device ${device} \ --device ${device} \
--nproc ${ngpu} \ --nproc ${ngpu} \
--config ${config_path} \ --config ${config_path} \
--output exp/${ckpt_name} \ --output exp/${ckpt_name} \
--seed ${seed} --profiler-options "${profiler_options}" \
--benchmark-batch-size ${benchmark_batch_size} \
--benchmark-max-step ${benchmark_max_step}
if [ ${seed} != 0 ]; then if [ ${seed} != 0 ]; then
unset FLAGS_cudnn_deterministic unset FLAGS_cudnn_deterministic
fi fi

@ -1,41 +1,46 @@
#!/bin/bash #!/bin/bash
CUR_DIR=${PWD}
ROOT_DIR=../../ ROOT_DIR=../../
# 提供可稳定复现性能的脚本默认在标准docker环境内py37执行 # 提供可稳定复现性能的脚本默认在标准docker环境内py37执行
# collect env info # collect env info
bash ${ROOT_DIR}/utils/pd_env_collect.sh bash ${ROOT_DIR}/utils/pd_env_collect.sh
cat pd_env.txt #cat pd_env.txt
# 执行目录:需说明
pushd ${ROOT_DIR}/examples/aishell/s1
# 1 安装该模型需要的依赖 (如需开启优化策略请注明) # 1 安装该模型需要的依赖 (如需开启优化策略请注明)
pushd ${ROOT_DIR}/tools; make; popd #pushd ${ROOT_DIR}/tools; make; popd
source ${ROOT_DIR}/tools/venv/bin/activate #source ${ROOT_DIR}/tools/venv/bin/activate
pushd ${ROOT_DIR}; bash setup.sh; popd #pushd ${ROOT_DIR}; bash setup.sh; popd
# 2 拷贝该模型需要数据、预训练模型 # 2 拷贝该模型需要数据、预训练模型
# 执行目录:需说明
#pushd ${ROOT_DIR}/examples/aishell/s1
pushd ${ROOT_DIR}/examples/tiny/s1
mkdir -p exp/log mkdir -p exp/log
loca/data.sh &> exp/log/data.log . path.sh
#bash local/data.sh &> exp/log/data.log
# 3 批量运行如不方便批量12需放到单个模型中 # 3 批量运行如不方便批量12需放到单个模型中
model_mode_list=(conformer) model_mode_list=(conformer transformer)
fp_item_list=(fp32) fp_item_list=(fp32)
bs_item=(32 64 96) bs_item_list=(32 64 96)
for model_mode in ${model_mode_list[@]}; do for model_mode in ${model_mode_list[@]}; do
for fp_item in ${fp_item_list[@]}; do for fp_item in ${fp_item_list[@]}; do
for bs_item in ${bs_list[@]} for bs_item in ${bs_item_list[@]}
do do
echo "index is speed, 1gpus, begin, ${model_name}" echo "index is speed, 1gpus, begin, ${model_name}"
run_mode=sp run_mode=sp
CUDA_VISIBLE_DEVICES=0 bash run_benchmark.sh ${run_mode} ${bs_item} ${fp_item} 500 ${model_mode} # (5min) CUDA_VISIBLE_DEVICES=0 bash ${CUR_DIR}/run_benchmark.sh ${run_mode} ${bs_item} ${fp_item} 500 ${model_mode} # (5min)
sleep 60 sleep 60
echo "index is speed, 8gpus, run_mode is multi_process, begin, ${model_name}" echo "index is speed, 8gpus, run_mode is multi_process, begin, ${model_name}"
run_mode=mp run_mode=mp
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash run_benchmark.sh ${run_mode} ${bs_item} ${fp_item} 500 ${model_mode} CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash ${CUR_DIR}/run_benchmark.sh ${run_mode} ${bs_item} ${fp_item} 500 ${model_mode}
sleep 60 sleep 60
done done
done done

@ -23,19 +23,19 @@ function _train(){
echo "Train on ${num_gpu_devices} GPUs" echo "Train on ${num_gpu_devices} GPUs"
echo "current CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES, gpus=$num_gpu_devices, batch_size=$batch_size" echo "current CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES, gpus=$num_gpu_devices, batch_size=$batch_size"
train_cmd="--model_name=${model_name} train_cmd="--benchmark-batch-size ${batch_size}
--batch_size=${batch_size} --benchmark-max-step ${max_iter}
--fp=${fp_item} \ conf/${model_name}.yaml ${model_name}"
--max_iter=${max_iter} "
case ${run_mode} in case ${run_mode} in
sp) train_cmd="python -u tools/train.py "${train_cmd}" ;; sp) train_cmd="bash local/train.sh "${train_cmd}"" ;;
mp) mp)
train_cmd="python -m paddle.distributed.launch --log_dir=./mylog --gpus=$CUDA_VISIBLE_DEVICES tools/train.py "${train_cmd}" train_cmd="bash local/train.sh "${train_cmd}"" ;;
log_parse_file="mylog/workerlog.0" ;;
*) echo "choose run_mode(sp or mp)"; exit 1; *) echo "choose run_mode(sp or mp)"; exit 1;
esac esac
# 以下不用修改
timeout 15m ${train_cmd} > ${log_file} 2>&1 # 以下不用修改
CUDA_VISIBLE_DEVICES=${device} timeout 15m ${train_cmd} > ${log_file} 2>&1
if [ $? -ne 0 ];then if [ $? -ne 0 ];then
echo -e "${model_name}, FAIL" echo -e "${model_name}, FAIL"
export job_fail_flag=1 export job_fail_flag=1
@ -43,7 +43,8 @@ function _train(){
echo -e "${model_name}, SUCCESS" echo -e "${model_name}, SUCCESS"
export job_fail_flag=0 export job_fail_flag=0
fi fi
kill -9 `ps -ef|grep 'python'|awk '{print $2}'`
trap 'for pid in $(jobs -pr); do kill -KILL $pid; done' INT QUIT TERM
if [ $run_mode = "mp" -a -d mylog ]; then if [ $run_mode = "mp" -a -d mylog ]; then
rm ${log_file} rm ${log_file}

Loading…
Cancel
Save