diff --git a/deepspeech/exps/u2/model.py b/deepspeech/exps/u2/model.py index 67b666ed..1328a1cb 100644 --- a/deepspeech/exps/u2/model.py +++ b/deepspeech/exps/u2/model.py @@ -182,9 +182,10 @@ class U2Trainer(Trainer): from_scratch = self.resume_or_scratch() if from_scratch: # save init model, i.e. 0 epoch - self.save(tag='init') + self.save(tag='init', infos=None) - self.lr_scheduler.step(self.iteration) + # lr will resotre from optimizer ckpt + # self.lr_scheduler.step(self.iteration) if self.parallel and hasattr(self.train_loader, 'batch_sampler'): self.train_loader.batch_sampler.set_epoch(self.epoch) diff --git a/deepspeech/training/extensions/snapshot.py b/deepspeech/training/extensions/snapshot.py index 1d3fe70c..e81eb97f 100644 --- a/deepspeech/training/extensions/snapshot.py +++ b/deepspeech/training/extensions/snapshot.py @@ -101,7 +101,7 @@ class Snapshot(extension.Extension): iteration = trainer.updater.state.iteration epoch = trainer.updater.state.epoch num = epoch if self.trigger[1] == 'epoch' else iteration - path = self.checkpoint_dir / f"{num}.pdz" + path = self.checkpoint_dir / f"{num}.np" # add the new one trainer.updater.save(path) diff --git a/deepspeech/training/trainer.py b/deepspeech/training/trainer.py index bdb68310..6587f129 100644 --- a/deepspeech/training/trainer.py +++ b/deepspeech/training/trainer.py @@ -185,7 +185,8 @@ class Trainer(): batch_sampler.set_epoch(self.epoch) def after_train_batch(self): - profiler.add_profiler_step(self.args.profiler_options) + if self.args.profiler_options: + profiler.add_profiler_step(self.args.profiler_options) def train(self): """The training process control by epoch.""" @@ -193,7 +194,9 @@ class Trainer(): if from_scratch: # save init model, i.e. 
0 epoch self.save(tag='init', infos=None) - self.lr_scheduler.step(self.epoch) + + # lr will resotre from optimizer ckpt + # self.lr_scheduler.step(self.epoch) if self.parallel and hasattr(self.train_loader, "batch_sampler"): self.train_loader.batch_sampler.set_epoch(self.epoch) diff --git a/deepspeech/utils/profiler.py b/deepspeech/utils/profiler.py index 5b8389be..83b003ca 100644 --- a/deepspeech/utils/profiler.py +++ b/deepspeech/utils/profiler.py @@ -61,6 +61,9 @@ class ProfilerOptions(object): self._parse_from_string(options_str) def _parse_from_string(self, options_str): + if not options_str: + return + for kv in options_str.replace(' ', '').split(';'): key, value = kv.split('=') if key == 'batch_range': diff --git a/examples/tiny/s0/conf/deepspeech2.yaml b/examples/tiny/s0/conf/deepspeech2.yaml index 64598b4b..40899655 100644 --- a/examples/tiny/s0/conf/deepspeech2.yaml +++ b/examples/tiny/s0/conf/deepspeech2.yaml @@ -48,7 +48,7 @@ training: n_epoch: 10 accum_grad: 1 lr: 1e-5 - lr_decay: 1.0 + lr_decay: 0.8 weight_decay: 1e-06 global_grad_clip: 5.0 log_interval: 1 diff --git a/examples/tiny/s0/local/train.sh b/examples/tiny/s0/local/train.sh index a657ce34..f96508b4 100755 --- a/examples/tiny/s0/local/train.sh +++ b/examples/tiny/s0/local/train.sh @@ -38,7 +38,7 @@ python3 -u ${BIN_DIR}/train.py \ --config ${config_path} \ --output exp/${ckpt_name} \ --model_type ${model_type} \ ---profiler_options ${profiler_options} \ +--profiler_options "${profiler_options}" \ --seed ${seed} if [ ${seed} != 0 ]; then diff --git a/tests/benchmark/.gitignore b/tests/benchmark/.gitignore new file mode 100644 index 00000000..7d166b06 --- /dev/null +++ b/tests/benchmark/.gitignore @@ -0,0 +1,2 @@ +old-pd_env.txt +pd_env.txt diff --git a/tests/benchmark/README.md b/tests/benchmark/README.md new file mode 100644 index 00000000..d21999ab --- /dev/null +++ b/tests/benchmark/README.md @@ -0,0 +1,11 @@ +# Benchmark Test + +## Data + +* Aishell + +## Docker + +``` 
+registry.baidubce.com/paddlepaddle/paddle 2.1.1-gpu-cuda10.2-cudnn7 59d5ec1de486
+```
diff --git a/tests/benchmark/run_all.sh b/tests/benchmark/run_all.sh
new file mode 100644
index 00000000..7aa11d0f
--- /dev/null
+++ b/tests/benchmark/run_all.sh
@@ -0,0 +1,50 @@
+#!/bin/bash
+# Run the conformer speed benchmark on Aishell for each configured batch size,
+# first on a single GPU (sp), then on 8 GPUs (mp).
+# Provides a script that reproduces performance stably; by default it runs with
+# py37 inside the standard docker environment.
+
+# Resolve absolute paths once: the script changes directory below, so paths
+# relative to the starting directory would stop resolving.
+CUR_DIR=$(cd "$(dirname "$0")" && pwd)
+ROOT_DIR=$(cd "${CUR_DIR}/../.." && pwd)
+
+# collect env info
+bash ${ROOT_DIR}/utils/pd_env_collect.sh
+cat pd_env.txt
+
+# execution directory (needs to be documented)
+pushd ${ROOT_DIR}/examples/aishell/s1
+
+# 1. install the dependencies the model needs (note here if an optimization strategy must be enabled)
+pushd ${ROOT_DIR}/tools; make; popd
+source ${ROOT_DIR}/tools/venv/bin/activate
+pushd ${ROOT_DIR}; bash setup.sh; popd
+
+
+# 2. copy the data and pretrained models the model needs
+mkdir -p exp/log
+local/data.sh &> exp/log/data.log
+
+# 3. batch run (if batching is inconvenient, steps 1 and 2 must be moved into each single model)
+
+model_mode_list=(conformer)
+fp_item_list=(fp32)
+bs_list=(32 64 96)
+for model_mode in ${model_mode_list[@]}; do
+  for fp_item in ${fp_item_list[@]}; do
+    for bs_item in ${bs_list[@]}
+    do
+      echo "index is speed, 1gpus, begin, ${model_mode}"
+      run_mode=sp
+      CUDA_VISIBLE_DEVICES=0 bash ${CUR_DIR}/run_benchmark.sh ${run_mode} ${bs_item} ${fp_item} 500 ${model_mode} # (5min)
+      sleep 60
+      echo "index is speed, 8gpus, run_mode is multi_process, begin, ${model_mode}"
+      run_mode=mp
+      CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash ${CUR_DIR}/run_benchmark.sh ${run_mode} ${bs_item} ${fp_item} 500 ${model_mode}
+      sleep 60
+    done
+  done
+done
+
+popd # aishell/s1
diff --git a/tests/benchmark/run_benchmark.sh b/tests/benchmark/run_benchmark.sh
new file mode 100644
index 00000000..625d3616
--- /dev/null
+++ b/tests/benchmark/run_benchmark.sh
@@ -0,0 +1,63 @@
+#!/bin/bash
+
+set -xe
+
+# Usage example: CUDA_VISIBLE_DEVICES=0 bash run_benchmark.sh ${run_mode} ${bs_item} ${fp_item} 500 ${model_mode}
+
+# Parse the positional arguments into globals and derive the GPU count and
+# the log file name from them.
+function _set_params(){
+    run_mode=${1:-"sp"}            # single GPU: sp | multiple GPUs: mp
+    batch_size=${2:-"64"}
+    fp_item=${3:-"fp32"}           # fp32|fp16
+    max_iter=${4:-"500"}           # optional; for when the code must be changed to stop early
+    model_name=${5:-"model_name"}
+    run_log_path=${TRAIN_LOG_DIR:-$(pwd)}  # TRAIN_LOG_DIR will be set by QA later
+
+    # no need to modify below
+    device=${CUDA_VISIBLE_DEVICES//,/ }
+    arr=(${device})
+    num_gpu_devices=${#arr[*]}
+    log_file=${run_log_path}/${model_name}_${run_mode}_bs${batch_size}_${fp_item}_${num_gpu_devices}
+}
+
+# Build the training command for the selected run mode, run it under a
+# 15 minute timeout and leave its output in ${log_file}.
+function _train(){
+    echo "Train on ${num_gpu_devices} GPUs"
+    echo "current CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES, gpus=$num_gpu_devices, batch_size=$batch_size"
+
+    train_cmd="--model_name=${model_name}
+               --batch_size=${batch_size}
+               --fp=${fp_item} \
+               --max_iter=${max_iter} "
+    case ${run_mode} in
+    sp) train_cmd="python -u tools/train.py ${train_cmd}" ;;
+    mp)
+        train_cmd="python -m paddle.distributed.launch --log_dir=./mylog --gpus=$CUDA_VISIBLE_DEVICES tools/train.py ${train_cmd}"
+        log_parse_file="mylog/workerlog.0" ;;
+    *) echo "choose run_mode(sp or mp)"; exit 1;
+    esac
+    # no need to modify below
+    # Run the command as the `if` condition: under `set -e` a bare failing
+    # command would abort the script before the FAIL branch could report it.
+    if timeout 15m ${train_cmd} > ${log_file} 2>&1; then
+        echo -e "${model_name}, SUCCESS"
+        export job_fail_flag=0
+    else
+        echo -e "${model_name}, FAIL"
+        export job_fail_flag=1
+    fi
+    # Best-effort cleanup of leftover python processes; `|| true` keeps `set -e`
+    # from aborting when a listed pid (e.g. the grep itself) has already exited.
+    kill -9 `ps -ef|grep 'python'|awk '{print $2}'` || true
+
+    if [ $run_mode = "mp" -a -d mylog ]; then
+        rm ${log_file}
+        cp mylog/workerlog.0 ${log_file}
+    fi
+}
+
+_set_params "$@"
+_train
+
diff --git a/utils/pd_env_collect.sh b/utils/pd_env_collect.sh
new file mode 100644
index 00000000..64ff8886
--- /dev/null
+++ b/utils/pd_env_collect.sh
@@ -0,0 +1,167 @@
+#!/usr/bin/env bash
+
+unset GREP_OPTIONS
+
+set -u  # Check for undefined variables
+
+die() {
+  # Print a message and exit with code 1.
+  #
+  # Usage: die <error_message>
+  #   e.g., die "Something bad happened."
+
+  echo $@
+  exit 1
+}
+
+echo "Collecting system information..."
+
+OUTPUT_FILE=pd_env.txt
+python_bin_path=$(which python || which python3) || die "Cannot find Python binary"
+
+{
+echo
+echo '== check python ==================================================='
+} >> ${OUTPUT_FILE}
+# Each probe script is written to /tmp and run; its stdout and stderr are both appended to the report.
+cat <<EOF > /tmp/check_python.py
+import platform
+print("""python version: %s
+python branch: %s
+python build version: %s
+python compiler version: %s
+python implementation: %s
+""" % (
+platform.python_version(),
+platform.python_branch(),
+platform.python_build(),
+platform.python_compiler(),
+platform.python_implementation(),
+))
+EOF
+${python_bin_path} /tmp/check_python.py >> ${OUTPUT_FILE} 2>&1
+
+{
+echo
+echo '== check os platform ==============================================='
+} >> ${OUTPUT_FILE}
+# platform.linux_distribution()/dist() were removed in Python 3.8, so the probe falls back to "n/a".
+cat <<EOF > /tmp/check_os.py
+import platform
+print("""os: %s
+os kernel version: %s
+os release version: %s
+os platform: %s
+linux distribution: %s
+linux os distribution: %s
+mac version: %s
+uname: %s
+architecture: %s
+machine: %s
+""" % (
+platform.system(),
+platform.version(),
+platform.release(),
+platform.platform(),
+getattr(platform, "linux_distribution", lambda: "n/a")(),
+getattr(platform, "dist", lambda: "n/a")(),
+platform.mac_ver(),
+platform.uname(),
+platform.architecture(),
+platform.machine(),
+))
+EOF
+${python_bin_path} /tmp/check_os.py >> ${OUTPUT_FILE} 2>&1
+
+{
+  echo
+  echo '== are we in docker ============================================='
+  num=`cat /proc/1/cgroup | grep docker | wc -l`;
+  if [ $num -ge 1 ]; then
+    echo "Yes"
+  else
+    echo "No"
+  fi
+
+  echo
+  echo '== compiler ====================================================='
+  c++ --version 2>&1
+
+  echo
+  echo '== check pips ==================================================='
+  pip list 2>&1 | grep "proto\|numpy\|paddlepaddle"
+
+
+  echo
+  echo '== check for virtualenv ========================================='
+  ${python_bin_path} -c "import sys;print(hasattr(sys, \"real_prefix\"))"
+
+  echo
+  echo '== paddlepaddle import ============================================'
+} >> ${OUTPUT_FILE}
+
+cat <<EOF > /tmp/check_pd.py
+import paddle as pd;
+pd.set_device('cpu')
+print("pd.version.full_version = %s" % pd.version.full_version)
+print("pd.version.commit = %s" % pd.version.commit)
+print("pd.__version__ = %s" % pd.__version__)
+print("Sanity check: %r" % pd.zeros([1,2,3])[:1])
+EOF
+${python_bin_path} /tmp/check_pd.py >> ${OUTPUT_FILE} 2>&1
+
+LD_DEBUG=libs ${python_bin_path} -c "import paddle" 2>>${OUTPUT_FILE} > /tmp/loadedlibs
+
+{
+  grep libcudnn.so /tmp/loadedlibs
+  echo
+  echo '== env =========================================================='
+  if [ -z ${LD_LIBRARY_PATH+x} ]; then
+    echo "LD_LIBRARY_PATH is unset";
+  else
+    echo LD_LIBRARY_PATH ${LD_LIBRARY_PATH} ;
+  fi
+  if [ -z ${DYLD_LIBRARY_PATH+x} ]; then
+    echo "DYLD_LIBRARY_PATH is unset";
+  else
+    echo DYLD_LIBRARY_PATH ${DYLD_LIBRARY_PATH} ;
+  fi
+
+
+  echo
+  echo '== nvidia-smi ==================================================='
+  nvidia-smi 2>&1
+
+  echo
+  echo '== cuda libs ==================================================='
+} >> ${OUTPUT_FILE}
+
+find /usr/local -type f -name 'libcudart*' 2>/dev/null | grep cuda | grep -v "\\.cache" >> ${OUTPUT_FILE}
+find /usr/local -type f -name 'libcudnn*' 2>/dev/null | grep cuda | grep -v "\\.cache" >> ${OUTPUT_FILE}
+
+{
+  echo
+  echo '== paddlepaddle installed from info =================='
+  pip show paddlepaddle-gpu
+
+  echo
+  echo '== python version =============================================='
+  echo '(major, minor, micro, releaselevel, serial)'
+  ${python_bin_path} -c 'import sys; print(sys.version_info[:])'
+
+  echo
+  echo '== bazel version ==============================================='
+  bazel version
+  echo '== cmake version ==============================================='
+  cmake --version
+} >> ${OUTPUT_FILE}
+
+# Drop every line that mentions google (case-insensitive); the unfiltered copy is kept as old-${OUTPUT_FILE}.
+mv $OUTPUT_FILE old-$OUTPUT_FILE
+grep -v -i google old-${OUTPUT_FILE} > $OUTPUT_FILE
+
+echo "Wrote environment to ${OUTPUT_FILE}. You can review the contents of that file."
+echo "and use it to populate the fields in the github issue template."
+echo
+echo "cat ${OUTPUT_FILE}"
+echo