From 438e1bd34fbf5e1c1181559b4f19301657d6b4c7 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Wed, 15 Sep 2021 11:03:18 +0000 Subject: [PATCH 1/6] add benmark scripts --- tests/benchmark/.gitignore | 2 + tests/benchmark/README.md | 12 +++ tests/benchmark/run_all.sh | 33 ++++++ tests/benchmark/run_benchmark.sh | 54 ++++++++++ utils/pd_env_collect.sh | 167 +++++++++++++++++++++++++++++++ 5 files changed, 268 insertions(+) create mode 100644 tests/benchmark/.gitignore create mode 100644 tests/benchmark/README.md create mode 100644 tests/benchmark/run_all.sh create mode 100644 tests/benchmark/run_benchmark.sh create mode 100644 utils/pd_env_collect.sh diff --git a/tests/benchmark/.gitignore b/tests/benchmark/.gitignore new file mode 100644 index 000000000..7d166b066 --- /dev/null +++ b/tests/benchmark/.gitignore @@ -0,0 +1,2 @@ +old-pd_env.txt +pd_env.txt diff --git a/tests/benchmark/README.md b/tests/benchmark/README.md new file mode 100644 index 000000000..8ec43f89e --- /dev/null +++ b/tests/benchmark/README.md @@ -0,0 +1,12 @@ +# Benchmark Test + +## Data + +* Aishell + +## Docker + +``` +registry.baidubce.com/paddlepaddle/paddle 2.1.1-gpu-cuda10.2-cudnn7 59d5ec1de486 +``` + diff --git a/tests/benchmark/run_all.sh b/tests/benchmark/run_all.sh new file mode 100644 index 000000000..7564174b4 --- /dev/null +++ b/tests/benchmark/run_all.sh @@ -0,0 +1,33 @@ +#!/bin/bash + +# collect env info +bash ../../utils/pd_env_collect.sh + + + +# 提供可稳定复现性能的脚本,默认在标准docker环境内py37执行: paddlepaddle/paddle:latest-gpu-cuda10.1-cudnn7 paddle=2.1.2 py=37 +# 执行目录:需说明 +cd ** +# 1 安装该模型需要的依赖 (如需开启优化策略请注明) +pip install ... +# 2 拷贝该模型需要数据、预训练模型 +# 3 批量运行(如不方便批量,1,2需放到单个模型中) + +model_mode_list=(MobileNetv1 MobileNetv2) +fp_item_list=(fp32 fp16) +bs_item=(32 64 96) +for model_mode in ${model_mode_list[@]}; do + for fp_item in ${fp_item_list[@]}; do + for bs_item in ${bs_list[@]} + do + echo "index is speed, 1gpus, begin, ${model_name}" + run_mode=sp + CUDA_VISIBLE_DEVICES=0 bash run_benchmark.sh ${run_mode} ${bs_item} ${fp_item} 500 ${model_mode} # (5min) + sleep 60 + echo "index is speed, 8gpus, run_mode is multi_process, begin, ${model_name}" + run_mode=mp + CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash run_benchmark.sh ${run_mode} ${bs_item} ${fp_item} 500 ${model_mode} + sleep 60 + done + done +done diff --git a/tests/benchmark/run_benchmark.sh b/tests/benchmark/run_benchmark.sh new file mode 100644 index 000000000..2b9cf70fd --- /dev/null +++ b/tests/benchmark/run_benchmark.sh @@ -0,0 +1,54 @@ +#!/bin/bash + +set -xe +# 运行示例:CUDA_VISIBLE_DEVICES=0 bash run_benchmark.sh ${run_mode} ${bs_item} ${fp_item} 500 ${model_mode} +# 参数说明 +function _set_params(){ + run_mode=${1:-"sp"} # 单卡sp|多卡mp + batch_size=${2:-"64"} + fp_item=${3:-"fp32"} # fp32|fp16 + max_iter=${4:-"500"} # 可选,如果需要修改代码提前中断 + model_name=${5:-"model_name"} + run_log_path=${TRAIN_LOG_DIR:-$(pwd)} # TRAIN_LOG_DIR 后续QA设置该参数 + +# 以下不用修改 + device=${CUDA_VISIBLE_DEVICES//,/ } + arr=(${device}) + num_gpu_devices=${#arr[*]} + log_file=${run_log_path}/${model_name}_${run_mode}_bs${batch_size}_${fp_item}_${num_gpu_devices} +} +function _train(){ + echo "Train on ${num_gpu_devices} GPUs" + echo "current CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES, gpus=$num_gpu_devices, batch_size=$batch_size" + + train_cmd="--model_name=${model_name} + --batch_size=${batch_size} + --fp=${fp_item} \ + --max_iter=${max_iter} " + case ${run_mode} in + sp) train_cmd="python -u tools/train.py "${train_cmd}" ;; + mp) + train_cmd="python -m paddle.distributed.launch --log_dir=./mylog --gpus=$CUDA_VISIBLE_DEVICES tools/train.py "${train_cmd}" + log_parse_file="mylog/workerlog.0" ;; + *) echo "choose run_mode(sp or mp)"; exit 1; + esac +# 以下不用修改 + timeout 15m ${train_cmd} > ${log_file} 2>&1 + if [ $? -ne 0 ];then + echo -e "${model_name}, FAIL" + export job_fail_flag=1 + else + echo -e "${model_name}, SUCCESS" + export job_fail_flag=0 + fi + kill -9 `ps -ef|grep 'python'|awk '{print $2}'` + + if [ $run_mode = "mp" -a -d mylog ]; then + rm ${log_file} + cp mylog/workerlog.0 ${log_file} + fi +} + +_set_params $@ +_train + diff --git a/utils/pd_env_collect.sh b/utils/pd_env_collect.sh new file mode 100644 index 000000000..64ff8886c --- /dev/null +++ b/utils/pd_env_collect.sh @@ -0,0 +1,167 @@ +#!/usr/bin/env bash + +unset GREP_OPTIONS + +set -u # Check for undefined variables + +die() { + # Print a message and exit with code 1. + # + # Usage: die + # e.g., die "Something bad happened." + + echo $@ + exit 1 +} + +echo "Collecting system information..." + +OUTPUT_FILE=pd_env.txt +python_bin_path=$(which python || which python3 || die "Cannot find Python binary") + +{ +echo +echo '== check python ===================================================' +} >> ${OUTPUT_FILE} + +cat < /tmp/check_python.py +import platform +print("""python version: %s +python branch: %s +python build version: %s +python compiler version: %s +python implementation: %s +""" % ( +platform.python_version(), +platform.python_branch(), +platform.python_build(), +platform.python_compiler(), +platform.python_implementation(), +)) +EOF +${python_bin_path} /tmp/check_python.py 2>&1 >> ${OUTPUT_FILE} + +{ +echo +echo '== check os platform ===============================================' +} >> ${OUTPUT_FILE} + +cat < /tmp/check_os.py +import platform +print("""os: %s +os kernel version: %s +os release version: %s +os platform: %s +linux distribution: %s +linux os distribution: %s +mac version: %s +uname: %s +architecture: %s +machine: %s +""" % ( +platform.system(), +platform.version(), +platform.release(), +platform.platform(), +platform.linux_distribution(), +platform.dist(), +platform.mac_ver(), +platform.uname(), +platform.architecture(), +platform.machine(), +)) +EOF +${python_bin_path} /tmp/check_os.py 2>&1 >> ${OUTPUT_FILE} + +{ + echo + echo '== are we in docker =============================================' + num=`cat /proc/1/cgroup | grep docker | wc -l`; + if [ $num -ge 1 ]; then + echo "Yes" + else + echo "No" + fi + + echo + echo '== compiler =====================================================' + c++ --version 2>&1 + + echo + echo '== check pips ===================================================' + pip list 2>&1 | grep "proto\|numpy\|paddlepaddle" + + + echo + echo '== check for virtualenv =========================================' + ${python_bin_path} -c "import sys;print(hasattr(sys, \"real_prefix\"))" + + echo + echo '== paddlepaddle import ============================================' +} >> ${OUTPUT_FILE} + +cat < /tmp/check_pd.py +import paddle as pd; +pd.set_device('cpu') +print("pd.version.full_version = %s" % pd.version.full_version) +print("pd.version.commit = %s" % pd.version.commit) +print("pd.__version__ = %s" % pd.__version__) +print("Sanity check: %r" % pd.zeros([1,2,3])[:1]) +EOF +${python_bin_path} /tmp/check_pd.py 2>&1 >> ${OUTPUT_FILE} + +LD_DEBUG=libs ${python_bin_path} -c "import paddle" 2>>${OUTPUT_FILE} > /tmp/loadedlibs + +{ + grep libcudnn.so /tmp/loadedlibs + echo + echo '== env ==========================================================' + if [ -z ${LD_LIBRARY_PATH+x} ]; then + echo "LD_LIBRARY_PATH is unset"; + else + echo LD_LIBRARY_PATH ${LD_LIBRARY_PATH} ; + fi + if [ -z ${DYLD_LIBRARY_PATH+x} ]; then + echo "DYLD_LIBRARY_PATH is unset"; + else + echo DYLD_LIBRARY_PATH ${DYLD_LIBRARY_PATH} ; + fi + + + echo + echo '== nvidia-smi ===================================================' + nvidia-smi 2>&1 + + echo + echo '== cuda libs ===================================================' +} >> ${OUTPUT_FILE} + +find /usr/local -type f -name 'libcudart*' 2>/dev/null | grep cuda | grep -v "\\.cache" >> ${OUTPUT_FILE} +find /usr/local -type f -name 'libudnn*' 2>/dev/null | grep cuda | grep -v "\\.cache" >> ${OUTPUT_FILE} + +{ + echo + echo '== paddlepaddle installed from info ==================' + pip show paddlepaddle-gpu + + echo + echo '== python version ==============================================' + echo '(major, minor, micro, releaselevel, serial)' + python -c 'import sys; print(sys.version_info[:])' + + echo + echo '== bazel version ===============================================' + bazel version + echo '== cmake version ===============================================' + cmake --version +} >> ${OUTPUT_FILE} + +# Remove any words with google. +mv $OUTPUT_FILE old-$OUTPUT_FILE +grep -v -i google old-${OUTPUT_FILE} > $OUTPUT_FILE + +echo "Wrote environment to ${OUTPUT_FILE}. You can review the contents of that file." +echo "and use it to populate the fields in the github issue template." +echo +echo "cat ${OUTPUT_FILE}" +echo From 7907319288d58a784d8cee981f6c2e59c609aeab Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Wed, 15 Sep 2021 11:26:07 +0000 Subject: [PATCH 2/6] fix profiler --- deepspeech/training/trainer.py | 3 ++- deepspeech/utils/profiler.py | 3 +++ examples/tiny/s0/conf/deepspeech2.yaml | 2 +- examples/tiny/s0/local/train.sh | 2 +- 4 files changed, 7 insertions(+), 3 deletions(-) diff --git a/deepspeech/training/trainer.py b/deepspeech/training/trainer.py index bdb68310a..b31ddcad6 100644 --- a/deepspeech/training/trainer.py +++ b/deepspeech/training/trainer.py @@ -185,7 +185,8 @@ class Trainer(): batch_sampler.set_epoch(self.epoch) def after_train_batch(self): - profiler.add_profiler_step(self.args.profiler_options) + if self.args.profiler_options: + profiler.add_profiler_step(self.args.profiler_options) def train(self): """The training process control by epoch.""" diff --git a/deepspeech/utils/profiler.py b/deepspeech/utils/profiler.py index 5b8389be8..357840a62 100644 --- a/deepspeech/utils/profiler.py +++ b/deepspeech/utils/profiler.py @@ -61,6 +61,9 @@ class ProfilerOptions(object): self._parse_from_string(options_str) def _parse_from_string(self, options_str): + if not options_str: + return + for kv in options_str.replace(' ', '').split(';'): key, value = kv.split('=') if key == 'batch_range': diff --git a/examples/tiny/s0/conf/deepspeech2.yaml b/examples/tiny/s0/conf/deepspeech2.yaml index 64598b4be..408996557 100644 --- a/examples/tiny/s0/conf/deepspeech2.yaml +++ b/examples/tiny/s0/conf/deepspeech2.yaml @@ -48,7 +48,7 @@ training: n_epoch: 10 accum_grad: 1 lr: 1e-5 - lr_decay: 1.0 + lr_decay: 0.8 weight_decay: 1e-06 global_grad_clip: 5.0 log_interval: 1 diff --git a/examples/tiny/s0/local/train.sh b/examples/tiny/s0/local/train.sh index a657ce345..f96508b4f 100755 --- a/examples/tiny/s0/local/train.sh +++ b/examples/tiny/s0/local/train.sh @@ -38,7 +38,7 @@ python3 -u ${BIN_DIR}/train.py \ --config ${config_path} \ --output exp/${ckpt_name} \ --model_type ${model_type} \ ---profiler_options ${profiler_options} \ +--profiler_options "${profiler_options}" \ --seed ${seed} if [ ${seed} != 0 ]; then From 3a5258f6a00ba0c660bd92d241e3b24cf0554520 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Wed, 15 Sep 2021 11:36:35 +0000 Subject: [PATCH 3/6] lr and opt param will restore from ckpt, so we do not set lr manully --- deepspeech/exps/u2/model.py | 5 +++-- deepspeech/training/trainer.py | 4 +++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/deepspeech/exps/u2/model.py b/deepspeech/exps/u2/model.py index 67b666ed0..1328a1cb7 100644 --- a/deepspeech/exps/u2/model.py +++ b/deepspeech/exps/u2/model.py @@ -182,9 +182,10 @@ class U2Trainer(Trainer): from_scratch = self.resume_or_scratch() if from_scratch: # save init model, i.e. 0 epoch - self.save(tag='init') + self.save(tag='init', infos=None) - self.lr_scheduler.step(self.iteration) + # lr will resotre from optimizer ckpt + # self.lr_scheduler.step(self.iteration) if self.parallel and hasattr(self.train_loader, 'batch_sampler'): self.train_loader.batch_sampler.set_epoch(self.epoch) diff --git a/deepspeech/training/trainer.py b/deepspeech/training/trainer.py index b31ddcad6..6587f1290 100644 --- a/deepspeech/training/trainer.py +++ b/deepspeech/training/trainer.py @@ -194,7 +194,9 @@ class Trainer(): if from_scratch: # save init model, i.e. 0 epoch self.save(tag='init', infos=None) - self.lr_scheduler.step(self.epoch) + + # lr will resotre from optimizer ckpt + # self.lr_scheduler.step(self.epoch) if self.parallel and hasattr(self.train_loader, "batch_sampler"): self.train_loader.batch_sampler.set_epoch(self.epoch) From 16b9c33deb10c3f895286d7c7abe64d96f618e39 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Wed, 15 Sep 2021 11:42:07 +0000 Subject: [PATCH 4/6] format --- deepspeech/utils/profiler.py | 2 +- tests/benchmark/README.md | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/deepspeech/utils/profiler.py b/deepspeech/utils/profiler.py index 357840a62..83b003cad 100644 --- a/deepspeech/utils/profiler.py +++ b/deepspeech/utils/profiler.py @@ -63,7 +63,7 @@ class ProfilerOptions(object): def _parse_from_string(self, options_str): if not options_str: return - + for kv in options_str.replace(' ', '').split(';'): key, value = kv.split('=') if key == 'batch_range': diff --git a/tests/benchmark/README.md b/tests/benchmark/README.md index 8ec43f89e..d21999ab3 100644 --- a/tests/benchmark/README.md +++ b/tests/benchmark/README.md @@ -4,9 +4,8 @@ * Aishell -## Docker +## Docker ``` registry.baidubce.com/paddlepaddle/paddle 2.1.1-gpu-cuda10.2-cudnn7 59d5ec1de486 ``` - From 0f3e5a3872defc3e7197e01f8ae7e760b22c00bf Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Thu, 16 Sep 2021 05:59:16 +0000 Subject: [PATCH 5/6] run_all with aishell/s1 --- tests/benchmark/run_all.sh | 27 +++++++++++++++++++-------- tests/benchmark/run_benchmark.sh | 2 ++ 2 files changed, 21 insertions(+), 8 deletions(-) diff --git a/tests/benchmark/run_all.sh b/tests/benchmark/run_all.sh index 7564174b4..7aa11d0f2 100644 --- a/tests/benchmark/run_all.sh +++ b/tests/benchmark/run_all.sh @@ -1,20 +1,29 @@ #!/bin/bash -# collect env info -bash ../../utils/pd_env_collect.sh - +ROOT_DIR=../../ +# 提供可稳定复现性能的脚本,默认在标准docker环境内py37执行: +# collect env info +bash ${ROOT_DIR}/utils/pd_env_collect.sh +cat pd_env.txt -# 提供可稳定复现性能的脚本,默认在标准docker环境内py37执行: paddlepaddle/paddle:latest-gpu-cuda10.1-cudnn7 paddle=2.1.2 py=37 # 执行目录:需说明 -cd ** +pushd ${ROOT_DIR}/examples/aishell/s1 + # 1 安装该模型需要的依赖 (如需开启优化策略请注明) -pip install ... +pushd ${ROOT_DIR}/tools; make; popd +source ${ROOT_DIR}/tools/venv/bin/activate +pushd ${ROOT_DIR}; bash setup.sh; popd + + # 2 拷贝该模型需要数据、预训练模型 +mkdir -p exp/log +loca/data.sh &> exp/log/data.log + # 3 批量运行(如不方便批量,1,2需放到单个模型中) -model_mode_list=(MobileNetv1 MobileNetv2) -fp_item_list=(fp32 fp16) +model_mode_list=(conformer) +fp_item_list=(fp32) bs_item=(32 64 96) for model_mode in ${model_mode_list[@]}; do for fp_item in ${fp_item_list[@]}; do @@ -31,3 +40,5 @@ for model_mode in ${model_mode_list[@]}; do done done done + +popd # aishell/s1 diff --git a/tests/benchmark/run_benchmark.sh b/tests/benchmark/run_benchmark.sh index 2b9cf70fd..625d36160 100644 --- a/tests/benchmark/run_benchmark.sh +++ b/tests/benchmark/run_benchmark.sh @@ -1,6 +1,7 @@ #!/bin/bash set -xe + # 运行示例:CUDA_VISIBLE_DEVICES=0 bash run_benchmark.sh ${run_mode} ${bs_item} ${fp_item} 500 ${model_mode} # 参数说明 function _set_params(){ @@ -17,6 +18,7 @@ function _set_params(){ num_gpu_devices=${#arr[*]} log_file=${run_log_path}/${model_name}_${run_mode}_bs${batch_size}_${fp_item}_${num_gpu_devices} } + function _train(){ echo "Train on ${num_gpu_devices} GPUs" echo "current CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES, gpus=$num_gpu_devices, batch_size=$batch_size" From a997b5a61cd3573bb920b3c8b4a880aeee28432f Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Thu, 16 Sep 2021 06:00:35 +0000 Subject: [PATCH 6/6] rename ckpt suffix to np --- deepspeech/training/extensions/snapshot.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deepspeech/training/extensions/snapshot.py b/deepspeech/training/extensions/snapshot.py index 1d3fe70cb..e81eb97fc 100644 --- a/deepspeech/training/extensions/snapshot.py +++ b/deepspeech/training/extensions/snapshot.py @@ -101,7 +101,7 @@ class Snapshot(extension.Extension): iteration = trainer.updater.state.iteration epoch = trainer.updater.state.epoch num = epoch if self.trigger[1] == 'epoch' else iteration - path = self.checkpoint_dir / f"{num}.pdz" + path = self.checkpoint_dir / f"{num}.np" # add the new one trainer.updater.save(path)