Merge pull request #831 from PaddlePaddle/bench

Benchmark
Hui Zhang 3 years ago committed by GitHub
commit dc2cdbf3fb

@@ -182,9 +182,10 @@ class U2Trainer(Trainer):
         from_scratch = self.resume_or_scratch()
         if from_scratch:
             # save init model, i.e. 0 epoch
-            self.save(tag='init')
-        self.lr_scheduler.step(self.iteration)
+            self.save(tag='init', infos=None)
+        # lr will restore from optimizer ckpt
+        # self.lr_scheduler.step(self.iteration)
         if self.parallel and hasattr(self.train_loader, 'batch_sampler'):
             self.train_loader.batch_sampler.set_epoch(self.epoch)

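The `lr_scheduler.step(...)` calls above (and in the base `Trainer` further down) are commented out because, as the new comment says, the learning rate is restored from the optimizer checkpoint on resume. Below is a minimal sketch of that behaviour using Paddle's public optimizer and LR-scheduler API; it is an illustration only, not this repository's checkpoint code, and `init.pdopt` is a made-up path:

```
import paddle

# Toy model/optimizer; ExponentialDecay stands in for the trainer's LR scheduler.
model = paddle.nn.Linear(4, 4)
scheduler = paddle.optimizer.lr.ExponentialDecay(learning_rate=1e-5, gamma=0.8)
optimizer = paddle.optimizer.Adam(learning_rate=scheduler,
                                  parameters=model.parameters())

# Saving: with an LRScheduler as learning_rate, the optimizer state dict
# carries the scheduler state along with it (Paddle 2.x behaviour).
paddle.save(optimizer.state_dict(), "init.pdopt")   # hypothetical path

# Resuming: restoring the optimizer also brings the scheduler back to where it
# was, so an extra lr_scheduler.step(iteration) would advance it twice.
optimizer.set_state_dict(paddle.load("init.pdopt"))
print(optimizer.get_lr())
```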
@@ -101,7 +101,7 @@ class Snapshot(extension.Extension):
         iteration = trainer.updater.state.iteration
         epoch = trainer.updater.state.epoch
         num = epoch if self.trigger[1] == 'epoch' else iteration
-        path = self.checkpoint_dir / f"{num}.pdz"
+        path = self.checkpoint_dir / f"{num}.np"
         # add the new one
         trainer.updater.save(path)

@@ -185,7 +185,8 @@ class Trainer():
             batch_sampler.set_epoch(self.epoch)

     def after_train_batch(self):
-        profiler.add_profiler_step(self.args.profiler_options)
+        if self.args.profiler_options:
+            profiler.add_profiler_step(self.args.profiler_options)

     def train(self):
         """The training process control by epoch."""
@@ -193,7 +194,9 @@ class Trainer():
         if from_scratch:
             # save init model, i.e. 0 epoch
             self.save(tag='init', infos=None)
-        self.lr_scheduler.step(self.epoch)
+        # lr will restore from optimizer ckpt
+        # self.lr_scheduler.step(self.epoch)
         if self.parallel and hasattr(self.train_loader, "batch_sampler"):
             self.train_loader.batch_sampler.set_epoch(self.epoch)

@@ -61,6 +61,9 @@ class ProfilerOptions(object):
             self._parse_from_string(options_str)

     def _parse_from_string(self, options_str):
+        if not options_str:
+            return
+
         for kv in options_str.replace(' ', '').split(';'):
             key, value = kv.split('=')
             if key == 'batch_range':

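For context, `profiler_options` is a semicolon-separated `key=value` string, and the early return added above lets an empty `--profiler_options ""` pass through without tripping the `split('=')` unpack. The following standalone sketch shows that parsing pattern; it is not Paddle's actual `ProfilerOptions` class, and the `profile_path` key is only an example:

```
def parse_options(options_str):
    """Parse 'k1=v1;k2=v2' option strings, tolerating an empty string."""
    options = {}
    if not options_str:        # the guard introduced in the diff above
        return options
    for kv in options_str.replace(' ', '').split(';'):
        key, value = kv.split('=')
        if key == 'batch_range':
            options[key] = [int(v) for v in value.strip('[]').split(',')]
        else:
            options[key] = value
    return options

print(parse_options(''))                                             # {}
print(parse_options('batch_range=[10, 20];profile_path=./profile'))
# {'batch_range': [10, 20], 'profile_path': './profile'}
```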
@@ -48,7 +48,7 @@ training:
   n_epoch: 10
   accum_grad: 1
   lr: 1e-5
-  lr_decay: 1.0
+  lr_decay: 0.8
   weight_decay: 1e-06
   global_grad_clip: 5.0
   log_interval: 1

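To get a sense of the `lr_decay: 1.0 -> 0.8` change: assuming `lr_decay` acts as a per-epoch exponential decay factor (how the trainer applies it depends on its scheduler), the learning rate over the 10 configured epochs would fall off as in this back-of-the-envelope check (not trainer output):

```
lr, lr_decay, n_epoch = 1e-5, 0.8, 10
for epoch in range(n_epoch):
    # with lr_decay = 1.0 the rate stayed at 1e-5; with 0.8 it shrinks each epoch
    print(f"epoch {epoch}: lr = {lr * lr_decay ** epoch:.2e}")
# epoch 0: lr = 1.00e-05  ...  epoch 9: lr = 1.34e-06
```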
@@ -38,7 +38,7 @@ python3 -u ${BIN_DIR}/train.py \
 --config ${config_path} \
 --output exp/${ckpt_name} \
 --model_type ${model_type} \
---profiler_options ${profiler_options} \
+--profiler_options "${profiler_options}" \
 --seed ${seed}

 if [ ${seed} != 0 ]; then

@@ -0,0 +1,2 @@
old-pd_env.txt
pd_env.txt

@@ -0,0 +1,11 @@
# Benchmark Test
## Data
* Aishell
## Docker
```
registry.baidubce.com/paddlepaddle/paddle 2.1.1-gpu-cuda10.2-cudnn7 59d5ec1de486
```

@@ -0,0 +1,44 @@
#!/bin/bash

ROOT_DIR=../../

# Script for stable, reproducible performance numbers; by default it runs with py37 inside the standard docker image.

# collect env info
bash ${ROOT_DIR}/utils/pd_env_collect.sh
cat pd_env.txt

# Working directory (must be stated explicitly): ${ROOT_DIR}/examples/aishell/s1
pushd ${ROOT_DIR}/examples/aishell/s1

# 1. Install the dependencies this model needs (note it here if any optimization strategy is enabled).
pushd ${ROOT_DIR}/tools; make; popd
source ${ROOT_DIR}/tools/venv/bin/activate
pushd ${ROOT_DIR}; bash setup.sh; popd

# 2. Fetch the data and pretrained models this model needs.
mkdir -p exp/log
local/data.sh &> exp/log/data.log

# 3. Run the whole matrix in batch (if batch runs are inconvenient, fold steps 1 and 2 into each single-model run).
model_mode_list=(conformer)
fp_item_list=(fp32)
bs_list=(32 64 96)

for model_mode in ${model_mode_list[@]}; do
    for fp_item in ${fp_item_list[@]}; do
        for bs_item in ${bs_list[@]}; do
            echo "index is speed, 1gpus, begin, ${model_mode}"
            run_mode=sp
            CUDA_VISIBLE_DEVICES=0 bash run_benchmark.sh ${run_mode} ${bs_item} ${fp_item} 500 ${model_mode}     # (5min)
            sleep 60

            echo "index is speed, 8gpus, run_mode is multi_process, begin, ${model_mode}"
            run_mode=mp
            CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash run_benchmark.sh ${run_mode} ${bs_item} ${fp_item} 500 ${model_mode}
            sleep 60
        done
    done
done

popd    # aishell/s1

@@ -0,0 +1,56 @@
#!/bin/bash

set -xe

# Example: CUDA_VISIBLE_DEVICES=0 bash run_benchmark.sh ${run_mode} ${bs_item} ${fp_item} 500 ${model_mode}

# Parameters
function _set_params(){
    run_mode=${1:-"sp"}                     # single card: sp | multi card: mp
    batch_size=${2:-"64"}
    fp_item=${3:-"fp32"}                    # fp32 | fp16
    max_iter=${4:-"500"}                    # optional; change the code if an early stop is needed
    model_name=${5:-"model_name"}
    run_log_path=${TRAIN_LOG_DIR:-$(pwd)}   # TRAIN_LOG_DIR is set later by QA

    # No need to modify anything below.
    device=${CUDA_VISIBLE_DEVICES//,/ }
    arr=(${device})
    num_gpu_devices=${#arr[*]}
    log_file=${run_log_path}/${model_name}_${run_mode}_bs${batch_size}_${fp_item}_${num_gpu_devices}
}

function _train(){
    echo "Train on ${num_gpu_devices} GPUs"
    echo "current CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES, gpus=$num_gpu_devices, batch_size=$batch_size"

    train_cmd="--model_name=${model_name} \
               --batch_size=${batch_size} \
               --fp=${fp_item} \
               --max_iter=${max_iter}"

    case ${run_mode} in
    sp) train_cmd="python -u tools/train.py ${train_cmd}" ;;
    mp)
        train_cmd="python -m paddle.distributed.launch --log_dir=./mylog --gpus=$CUDA_VISIBLE_DEVICES tools/train.py ${train_cmd}"
        log_parse_file="mylog/workerlog.0" ;;
    *) echo "choose run_mode(sp or mp)"; exit 1 ;;
    esac

    # No need to modify anything below.
    timeout 15m ${train_cmd} > ${log_file} 2>&1
    if [ $? -ne 0 ];then
        echo -e "${model_name}, FAIL"
        export job_fail_flag=1
    else
        echo -e "${model_name}, SUCCESS"
        export job_fail_flag=0
    fi
    kill -9 `ps -ef|grep 'python'|awk '{print $2}'`

    if [ $run_mode = "mp" -a -d mylog ]; then
        rm ${log_file}
        cp mylog/workerlog.0 ${log_file}
    fi
}

_set_params $@
_train

@@ -0,0 +1,167 @@
#!/usr/bin/env bash
unset GREP_OPTIONS
set -u # Check for undefined variables
die() {
# Print a message and exit with code 1.
#
# Usage: die <error_message>
# e.g., die "Something bad happened."
echo $@
exit 1
}
echo "Collecting system information..."
OUTPUT_FILE=pd_env.txt
python_bin_path=$(which python || which python3 || die "Cannot find Python binary")
{
echo
echo '== check python ==================================================='
} >> ${OUTPUT_FILE}
cat <<EOF > /tmp/check_python.py
import platform
print("""python version: %s
python branch: %s
python build version: %s
python compiler version: %s
python implementation: %s
""" % (
platform.python_version(),
platform.python_branch(),
platform.python_build(),
platform.python_compiler(),
platform.python_implementation(),
))
EOF
${python_bin_path} /tmp/check_python.py 2>&1 >> ${OUTPUT_FILE}
{
echo
echo '== check os platform ==============================================='
} >> ${OUTPUT_FILE}
cat <<EOF > /tmp/check_os.py
import platform
print("""os: %s
os kernel version: %s
os release version: %s
os platform: %s
linux distribution: %s
linux os distribution: %s
mac version: %s
uname: %s
architecture: %s
machine: %s
""" % (
platform.system(),
platform.version(),
platform.release(),
platform.platform(),
platform.linux_distribution(),
platform.dist(),
platform.mac_ver(),
platform.uname(),
platform.architecture(),
platform.machine(),
))
EOF
${python_bin_path} /tmp/check_os.py 2>&1 >> ${OUTPUT_FILE}
{
echo
echo '== are we in docker ============================================='
num=`cat /proc/1/cgroup | grep docker | wc -l`;
if [ $num -ge 1 ]; then
echo "Yes"
else
echo "No"
fi
echo
echo '== compiler ====================================================='
c++ --version 2>&1
echo
echo '== check pips ==================================================='
pip list 2>&1 | grep "proto\|numpy\|paddlepaddle"
echo
echo '== check for virtualenv ========================================='
${python_bin_path} -c "import sys;print(hasattr(sys, \"real_prefix\"))"
echo
echo '== paddlepaddle import ============================================'
} >> ${OUTPUT_FILE}
cat <<EOF > /tmp/check_pd.py
import paddle as pd;
pd.set_device('cpu')
print("pd.version.full_version = %s" % pd.version.full_version)
print("pd.version.commit = %s" % pd.version.commit)
print("pd.__version__ = %s" % pd.__version__)
print("Sanity check: %r" % pd.zeros([1,2,3])[:1])
EOF
${python_bin_path} /tmp/check_pd.py 2>&1 >> ${OUTPUT_FILE}
LD_DEBUG=libs ${python_bin_path} -c "import paddle" 2>>${OUTPUT_FILE} > /tmp/loadedlibs
{
grep libcudnn.so /tmp/loadedlibs
echo
echo '== env =========================================================='
if [ -z ${LD_LIBRARY_PATH+x} ]; then
echo "LD_LIBRARY_PATH is unset";
else
echo LD_LIBRARY_PATH ${LD_LIBRARY_PATH} ;
fi
if [ -z ${DYLD_LIBRARY_PATH+x} ]; then
echo "DYLD_LIBRARY_PATH is unset";
else
echo DYLD_LIBRARY_PATH ${DYLD_LIBRARY_PATH} ;
fi
echo
echo '== nvidia-smi ==================================================='
nvidia-smi 2>&1
echo
echo '== cuda libs ==================================================='
} >> ${OUTPUT_FILE}
find /usr/local -type f -name 'libcudart*' 2>/dev/null | grep cuda | grep -v "\\.cache" >> ${OUTPUT_FILE}
find /usr/local -type f -name 'libcudnn*' 2>/dev/null | grep cuda | grep -v "\\.cache" >> ${OUTPUT_FILE}
{
echo
echo '== paddlepaddle installed from info =================='
pip show paddlepaddle-gpu
echo
echo '== python version =============================================='
echo '(major, minor, micro, releaselevel, serial)'
python -c 'import sys; print(sys.version_info[:])'
echo
echo '== bazel version ==============================================='
bazel version
echo '== cmake version ==============================================='
cmake --version
} >> ${OUTPUT_FILE}
# Remove any words with google.
mv $OUTPUT_FILE old-$OUTPUT_FILE
grep -v -i google old-${OUTPUT_FILE} > $OUTPUT_FILE
echo "Wrote environment to ${OUTPUT_FILE}. You can review the contents of that file."
echo "and use it to populate the fields in the github issue template."
echo
echo "cat ${OUTPUT_FILE}"
echo