Merge pull request #831 from PaddlePaddle/bench

Benchmark
Hui Zhang 3 years ago committed by GitHub
commit dc2cdbf3fb

@@ -182,9 +182,10 @@ class U2Trainer(Trainer):
         from_scratch = self.resume_or_scratch()
         if from_scratch:
             # save init model, i.e. 0 epoch
-            self.save(tag='init')
-        self.lr_scheduler.step(self.iteration)
+            self.save(tag='init', infos=None)
+        # lr will restore from optimizer ckpt
+        # self.lr_scheduler.step(self.iteration)
         if self.parallel and hasattr(self.train_loader, 'batch_sampler'):
             self.train_loader.batch_sampler.set_epoch(self.epoch)

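The `lr_scheduler.step(...)` calls above (and in the base `Trainer` further down) are commented out because, as the new comment says, the learning rate is restored from the optimizer checkpoint on resume. Below is a minimal sketch of that behaviour using Paddle's public optimizer and LR-scheduler API; it is an illustration only, not this repository's checkpoint code, and `init.pdopt` is a made-up path:

```
import paddle

# Toy model/optimizer; ExponentialDecay stands in for the trainer's LR scheduler.
model = paddle.nn.Linear(4, 4)
scheduler = paddle.optimizer.lr.ExponentialDecay(learning_rate=1e-5, gamma=0.8)
optimizer = paddle.optimizer.Adam(learning_rate=scheduler,
                                  parameters=model.parameters())

# Saving: with an LRScheduler as learning_rate, the optimizer state dict
# carries the scheduler state along with it (Paddle 2.x behaviour).
paddle.save(optimizer.state_dict(), "init.pdopt")   # hypothetical path

# Resuming: restoring the optimizer also brings the scheduler back to where it
# was, so an extra lr_scheduler.step(iteration) would advance it twice.
optimizer.set_state_dict(paddle.load("init.pdopt"))
print(optimizer.get_lr())
```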
@@ -101,7 +101,7 @@ class Snapshot(extension.Extension):
         iteration = trainer.updater.state.iteration
         epoch = trainer.updater.state.epoch
         num = epoch if self.trigger[1] == 'epoch' else iteration
-        path = self.checkpoint_dir / f"{num}.pdz"
+        path = self.checkpoint_dir / f"{num}.np"
         # add the new one
         trainer.updater.save(path)

@@ -185,7 +185,8 @@ class Trainer():
             batch_sampler.set_epoch(self.epoch)

     def after_train_batch(self):
-        profiler.add_profiler_step(self.args.profiler_options)
+        if self.args.profiler_options:
+            profiler.add_profiler_step(self.args.profiler_options)

     def train(self):
         """The training process control by epoch."""
@@ -193,7 +194,9 @@ class Trainer():
         if from_scratch:
             # save init model, i.e. 0 epoch
             self.save(tag='init', infos=None)
-        self.lr_scheduler.step(self.epoch)
+        # lr will restore from optimizer ckpt
+        # self.lr_scheduler.step(self.epoch)
         if self.parallel and hasattr(self.train_loader, "batch_sampler"):
             self.train_loader.batch_sampler.set_epoch(self.epoch)

@@ -61,6 +61,9 @@ class ProfilerOptions(object):
             self._parse_from_string(options_str)

     def _parse_from_string(self, options_str):
+        if not options_str:
+            return
+
         for kv in options_str.replace(' ', '').split(';'):
             key, value = kv.split('=')
             if key == 'batch_range':

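For context, `profiler_options` is a semicolon-separated `key=value` string, and the early return added above lets an empty `--profiler_options ""` pass through without tripping the `split('=')` unpack. The following standalone sketch shows that parsing pattern; it is not Paddle's actual `ProfilerOptions` class, and the `profile_path` key is only an example:

```
def parse_options(options_str):
    """Parse 'k1=v1;k2=v2' option strings, tolerating an empty string."""
    options = {}
    if not options_str:        # the guard introduced in the diff above
        return options
    for kv in options_str.replace(' ', '').split(';'):
        key, value = kv.split('=')
        if key == 'batch_range':
            options[key] = [int(v) for v in value.strip('[]').split(',')]
        else:
            options[key] = value
    return options

print(parse_options(''))                                             # {}
print(parse_options('batch_range=[10, 20];profile_path=./profile'))
# {'batch_range': [10, 20], 'profile_path': './profile'}
```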
@@ -48,7 +48,7 @@ training:
   n_epoch: 10
   accum_grad: 1
   lr: 1e-5
-  lr_decay: 1.0
+  lr_decay: 0.8
   weight_decay: 1e-06
   global_grad_clip: 5.0
   log_interval: 1

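To get a sense of the `lr_decay: 1.0 -> 0.8` change: assuming `lr_decay` acts as a per-epoch exponential decay factor (how the trainer applies it depends on its scheduler), the learning rate over the 10 configured epochs would fall off as in this back-of-the-envelope check (not trainer output):

```
lr, lr_decay, n_epoch = 1e-5, 0.8, 10
for epoch in range(n_epoch):
    # with lr_decay = 1.0 the rate stayed at 1e-5; with 0.8 it shrinks each epoch
    print(f"epoch {epoch}: lr = {lr * lr_decay ** epoch:.2e}")
# epoch 0: lr = 1.00e-05  ...  epoch 9: lr = 1.34e-06
```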
@@ -38,7 +38,7 @@ python3 -u ${BIN_DIR}/train.py \
 --config ${config_path} \
 --output exp/${ckpt_name} \
 --model_type ${model_type} \
---profiler_options ${profiler_options} \
+--profiler_options "${profiler_options}" \
 --seed ${seed}

 if [ ${seed} != 0 ]; then

@@ -0,0 +1,2 @@
old-pd_env.txt
pd_env.txt

@@ -0,0 +1,11 @@
# Benchmark Test
## Data
* Aishell
## Docker
```
registry.baidubce.com/paddlepaddle/paddle 2.1.1-gpu-cuda10.2-cudnn7 59d5ec1de486
```

@@ -0,0 +1,44 @@
#!/bin/bash

ROOT_DIR=../../

# Script for stable, reproducible performance numbers; by default it runs with py37 inside the standard docker image.

# collect env info
bash ${ROOT_DIR}/utils/pd_env_collect.sh
cat pd_env.txt

# Working directory (must be stated explicitly): ${ROOT_DIR}/examples/aishell/s1
pushd ${ROOT_DIR}/examples/aishell/s1

# 1. Install the dependencies this model needs (note it here if any optimization strategy is enabled).
pushd ${ROOT_DIR}/tools; make; popd
source ${ROOT_DIR}/tools/venv/bin/activate
pushd ${ROOT_DIR}; bash setup.sh; popd

# 2. Fetch the data and pretrained models this model needs.
mkdir -p exp/log
local/data.sh &> exp/log/data.log

# 3. Run the whole matrix in batch (if batch runs are inconvenient, fold steps 1 and 2 into each single-model run).
model_mode_list=(conformer)
fp_item_list=(fp32)
bs_list=(32 64 96)

for model_mode in ${model_mode_list[@]}; do
    for fp_item in ${fp_item_list[@]}; do
        for bs_item in ${bs_list[@]}; do
            echo "index is speed, 1gpus, begin, ${model_mode}"
            run_mode=sp
            CUDA_VISIBLE_DEVICES=0 bash run_benchmark.sh ${run_mode} ${bs_item} ${fp_item} 500 ${model_mode}     # (5min)
            sleep 60

            echo "index is speed, 8gpus, run_mode is multi_process, begin, ${model_mode}"
            run_mode=mp
            CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash run_benchmark.sh ${run_mode} ${bs_item} ${fp_item} 500 ${model_mode}
            sleep 60
        done
    done
done

popd    # aishell/s1

@@ -0,0 +1,56 @@
#!/bin/bash

set -xe

# Example: CUDA_VISIBLE_DEVICES=0 bash run_benchmark.sh ${run_mode} ${bs_item} ${fp_item} 500 ${model_mode}

# Parameters
function _set_params(){
    run_mode=${1:-"sp"}                     # single card: sp | multi card: mp
    batch_size=${2:-"64"}
    fp_item=${3:-"fp32"}                    # fp32 | fp16
    max_iter=${4:-"500"}                    # optional; change the code if an early stop is needed
    model_name=${5:-"model_name"}
    run_log_path=${TRAIN_LOG_DIR:-$(pwd)}   # TRAIN_LOG_DIR is set later by QA

    # No need to modify anything below.
    device=${CUDA_VISIBLE_DEVICES//,/ }
    arr=(${device})
    num_gpu_devices=${#arr[*]}
    log_file=${run_log_path}/${model_name}_${run_mode}_bs${batch_size}_${fp_item}_${num_gpu_devices}
}

function _train(){
    echo "Train on ${num_gpu_devices} GPUs"
    echo "current CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES, gpus=$num_gpu_devices, batch_size=$batch_size"

    train_cmd="--model_name=${model_name} \
               --batch_size=${batch_size} \
               --fp=${fp_item} \
               --max_iter=${max_iter}"

    case ${run_mode} in
    sp) train_cmd="python -u tools/train.py ${train_cmd}" ;;
    mp)
        train_cmd="python -m paddle.distributed.launch --log_dir=./mylog --gpus=$CUDA_VISIBLE_DEVICES tools/train.py ${train_cmd}"
        log_parse_file="mylog/workerlog.0" ;;
    *) echo "choose run_mode(sp or mp)"; exit 1 ;;
    esac

    # No need to modify anything below.
    timeout 15m ${train_cmd} > ${log_file} 2>&1
    if [ $? -ne 0 ];then
        echo -e "${model_name}, FAIL"
        export job_fail_flag=1
    else
        echo -e "${model_name}, SUCCESS"
        export job_fail_flag=0
    fi
    kill -9 `ps -ef|grep 'python'|awk '{print $2}'`

    if [ $run_mode = "mp" -a -d mylog ]; then
        rm ${log_file}
        cp mylog/workerlog.0 ${log_file}
    fi
}

_set_params $@
_train

@@ -0,0 +1,167 @@
#!/usr/bin/env bash
unset GREP_OPTIONS
set -u # Check for undefined variables
die() {
# Print a message and exit with code 1.
#
# Usage: die <error_message>
# e.g., die "Something bad happened."
echo $@
exit 1
}
echo "Collecting system information..."
OUTPUT_FILE=pd_env.txt
python_bin_path=$(which python || which python3 || die "Cannot find Python binary")
{
echo
echo '== check python ==================================================='
} >> ${OUTPUT_FILE}
cat <<EOF > /tmp/check_python.py
import platform
print("""python version: %s
python branch: %s
python build version: %s
python compiler version: %s
python implementation: %s
""" % (
platform.python_version(),
platform.python_branch(),
platform.python_build(),
platform.python_compiler(),
platform.python_implementation(),
))
EOF
${python_bin_path} /tmp/check_python.py 2>&1 >> ${OUTPUT_FILE}
{
echo
echo '== check os platform ==============================================='
} >> ${OUTPUT_FILE}
cat <<EOF > /tmp/check_os.py
import platform
print("""os: %s
os kernel version: %s
os release version: %s
os platform: %s
linux distribution: %s
linux os distribution: %s
mac version: %s
uname: %s
architecture: %s
machine: %s
""" % (
platform.system(),
platform.version(),
platform.release(),
platform.platform(),
platform.linux_distribution(),
platform.dist(),
platform.mac_ver(),
platform.uname(),
platform.architecture(),
platform.machine(),
))
EOF
${python_bin_path} /tmp/check_os.py 2>&1 >> ${OUTPUT_FILE}
{
echo
echo '== are we in docker ============================================='
num=`cat /proc/1/cgroup | grep docker | wc -l`;
if [ $num -ge 1 ]; then
echo "Yes"
else
echo "No"
fi
echo
echo '== compiler ====================================================='
c++ --version 2>&1
echo
echo '== check pips ==================================================='
pip list 2>&1 | grep "proto\|numpy\|paddlepaddle"
echo
echo '== check for virtualenv ========================================='
${python_bin_path} -c "import sys;print(hasattr(sys, \"real_prefix\"))"
echo
echo '== paddlepaddle import ============================================'
} >> ${OUTPUT_FILE}
cat <<EOF > /tmp/check_pd.py
import paddle as pd;
pd.set_device('cpu')
print("pd.version.full_version = %s" % pd.version.full_version)
print("pd.version.commit = %s" % pd.version.commit)
print("pd.__version__ = %s" % pd.__version__)
print("Sanity check: %r" % pd.zeros([1,2,3])[:1])
EOF
${python_bin_path} /tmp/check_pd.py 2>&1 >> ${OUTPUT_FILE}
LD_DEBUG=libs ${python_bin_path} -c "import paddle" 2>>${OUTPUT_FILE} > /tmp/loadedlibs
{
grep libcudnn.so /tmp/loadedlibs
echo
echo '== env =========================================================='
if [ -z ${LD_LIBRARY_PATH+x} ]; then
echo "LD_LIBRARY_PATH is unset";
else
echo LD_LIBRARY_PATH ${LD_LIBRARY_PATH} ;
fi
if [ -z ${DYLD_LIBRARY_PATH+x} ]; then
echo "DYLD_LIBRARY_PATH is unset";
else
echo DYLD_LIBRARY_PATH ${DYLD_LIBRARY_PATH} ;
fi
echo
echo '== nvidia-smi ==================================================='
nvidia-smi 2>&1
echo
echo '== cuda libs ==================================================='
} >> ${OUTPUT_FILE}
find /usr/local -type f -name 'libcudart*' 2>/dev/null | grep cuda | grep -v "\\.cache" >> ${OUTPUT_FILE}
find /usr/local -type f -name 'libcudnn*' 2>/dev/null | grep cuda | grep -v "\\.cache" >> ${OUTPUT_FILE}
{
echo
echo '== paddlepaddle installed from info =================='
pip show paddlepaddle-gpu
echo
echo '== python version =============================================='
echo '(major, minor, micro, releaselevel, serial)'
python -c 'import sys; print(sys.version_info[:])'
echo
echo '== bazel version ==============================================='
bazel version
echo '== cmake version ==============================================='
cmake --version
} >> ${OUTPUT_FILE}
# Remove any words with google.
mv $OUTPUT_FILE old-$OUTPUT_FILE
grep -v -i google old-${OUTPUT_FILE} > $OUTPUT_FILE
echo "Wrote environment to ${OUTPUT_FILE}. You can review the contents of that file."
echo "and use it to populate the fields in the github issue template."
echo
echo "cat ${OUTPUT_FILE}"
echo