diff --git a/deepspeech/exps/u2/model.py b/deepspeech/exps/u2/model.py
index 67b666ed..1328a1cb 100644
--- a/deepspeech/exps/u2/model.py
+++ b/deepspeech/exps/u2/model.py
@@ -182,9 +182,10 @@ class U2Trainer(Trainer):
         from_scratch = self.resume_or_scratch()
         if from_scratch:
             # save init model, i.e. 0 epoch
-            self.save(tag='init')
+            self.save(tag='init', infos=None)
 
-        self.lr_scheduler.step(self.iteration)
+        # lr will be restored from the optimizer ckpt
+        # self.lr_scheduler.step(self.iteration)
         if self.parallel and hasattr(self.train_loader, 'batch_sampler'):
             self.train_loader.batch_sampler.set_epoch(self.epoch)
 
diff --git a/deepspeech/training/extensions/snapshot.py b/deepspeech/training/extensions/snapshot.py
index 1d3fe70c..e81eb97f 100644
--- a/deepspeech/training/extensions/snapshot.py
+++ b/deepspeech/training/extensions/snapshot.py
@@ -101,7 +101,7 @@ class Snapshot(extension.Extension):
         iteration = trainer.updater.state.iteration
         epoch = trainer.updater.state.epoch
         num = epoch if self.trigger[1] == 'epoch' else iteration
-        path = self.checkpoint_dir / f"{num}.pdz"
+        path = self.checkpoint_dir / f"{num}.np"
 
         # add the new one
         trainer.updater.save(path)
diff --git a/deepspeech/training/trainer.py b/deepspeech/training/trainer.py
index bdb68310..6587f129 100644
--- a/deepspeech/training/trainer.py
+++ b/deepspeech/training/trainer.py
@@ -185,7 +185,8 @@ class Trainer():
                 batch_sampler.set_epoch(self.epoch)
 
     def after_train_batch(self):
-        profiler.add_profiler_step(self.args.profiler_options)
+        if self.args.profiler_options:
+            profiler.add_profiler_step(self.args.profiler_options)
 
     def train(self):
         """The training process control by epoch."""
@@ -193,7 +194,9 @@ class Trainer():
         if from_scratch:
             # save init model, i.e. 0 epoch
             self.save(tag='init', infos=None)
-        self.lr_scheduler.step(self.epoch)
+
+        # lr will be restored from the optimizer ckpt
+        # self.lr_scheduler.step(self.epoch)
         if self.parallel and hasattr(self.train_loader, "batch_sampler"):
             self.train_loader.batch_sampler.set_epoch(self.epoch)
 
diff --git a/deepspeech/utils/profiler.py b/deepspeech/utils/profiler.py
index 5b8389be..83b003ca 100644
--- a/deepspeech/utils/profiler.py
+++ b/deepspeech/utils/profiler.py
@@ -61,6 +61,9 @@ class ProfilerOptions(object):
         self._parse_from_string(options_str)
 
     def _parse_from_string(self, options_str):
+        if not options_str:
+            return
+
         for kv in options_str.replace(' ', '').split(';'):
             key, value = kv.split('=')
             if key == 'batch_range':
diff --git a/examples/tiny/s0/conf/deepspeech2.yaml b/examples/tiny/s0/conf/deepspeech2.yaml
index 64598b4b..40899655 100644
--- a/examples/tiny/s0/conf/deepspeech2.yaml
+++ b/examples/tiny/s0/conf/deepspeech2.yaml
@@ -48,7 +48,7 @@ training:
   n_epoch: 10
   accum_grad: 1
   lr: 1e-5 
-  lr_decay: 1.0 
+  lr_decay: 0.8 
   weight_decay: 1e-06
   global_grad_clip: 5.0
   log_interval: 1
diff --git a/examples/tiny/s0/local/train.sh b/examples/tiny/s0/local/train.sh
index a657ce34..f96508b4 100755
--- a/examples/tiny/s0/local/train.sh
+++ b/examples/tiny/s0/local/train.sh
@@ -38,7 +38,7 @@ python3 -u ${BIN_DIR}/train.py \
 --config ${config_path} \
 --output exp/${ckpt_name} \
 --model_type ${model_type} \
---profiler_options ${profiler_options} \
+--profiler_options "${profiler_options}" \
 --seed ${seed}
 
 if [ ${seed} != 0  ]; then
diff --git a/tests/benchmark/.gitignore b/tests/benchmark/.gitignore
new file mode 100644
index 00000000..7d166b06
--- /dev/null
+++ b/tests/benchmark/.gitignore
@@ -0,0 +1,2 @@
+old-pd_env.txt
+pd_env.txt
diff --git a/tests/benchmark/README.md b/tests/benchmark/README.md
new file mode 100644
index 00000000..d21999ab
--- /dev/null
+++ b/tests/benchmark/README.md
@@ -0,0 +1,11 @@
+# Benchmark Test
+
+## Data
+
+* Aishell
+
+## Docker
+
+```
+registry.baidubce.com/paddlepaddle/paddle   2.1.1-gpu-cuda10.2-cudnn7   59d5ec1de486  
+```
diff --git a/tests/benchmark/run_all.sh b/tests/benchmark/run_all.sh
new file mode 100644
index 00000000..7aa11d0f
--- /dev/null
+++ b/tests/benchmark/run_all.sh
@@ -0,0 +1,44 @@
+#!/bin/bash
+
+ROOT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)  # absolute, so it stays valid after pushd
+
+# Provide a script that reproduces performance stably; by default it runs with py37 inside the standard docker environment:
+# collect env info
+bash ${ROOT_DIR}/utils/pd_env_collect.sh
+cat pd_env.txt
+
+# Working directory: must be stated
+pushd ${ROOT_DIR}/examples/aishell/s1
+
+# 1 Install the dependencies the model needs (state any optimization strategy that must be enabled)
+pushd ${ROOT_DIR}/tools; make; popd
+source ${ROOT_DIR}/tools/venv/bin/activate
+pushd ${ROOT_DIR}; bash setup.sh; popd
+
+
+# 2 Copy the data and pretrained models the model needs
+mkdir -p exp/log
+local/data.sh &> exp/log/data.log
+
+# 3 Batch run (if batching is inconvenient, put steps 1 and 2 into each single model)
+
+model_mode_list=(conformer)
+fp_item_list=(fp32)
+bs_list=(32 64 96)
+for model_mode in ${model_mode_list[@]}; do
+      for fp_item in ${fp_item_list[@]}; do
+          for bs_item in ${bs_list[@]}
+            do
+            echo "index is speed, 1gpus, begin, ${model_mode}"
+            run_mode=sp
+            CUDA_VISIBLE_DEVICES=0 bash ${ROOT_DIR}/tests/benchmark/run_benchmark.sh ${run_mode} ${bs_item} ${fp_item} 500 ${model_mode}     #  (5min)
+            sleep 60
+            echo "index is speed, 8gpus, run_mode is multi_process, begin, ${model_mode}"
+            run_mode=mp
+            CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash ${ROOT_DIR}/tests/benchmark/run_benchmark.sh ${run_mode} ${bs_item} ${fp_item} 500 ${model_mode}
+            sleep 60
+            done
+      done
+done
+
+popd # aishell/s1
diff --git a/tests/benchmark/run_benchmark.sh b/tests/benchmark/run_benchmark.sh
new file mode 100644
index 00000000..625d3616
--- /dev/null
+++ b/tests/benchmark/run_benchmark.sh
@@ -0,0 +1,56 @@
+#!/bin/bash
+
+set -xe
+
+# Usage example: CUDA_VISIBLE_DEVICES=0 bash run_benchmark.sh ${run_mode} ${bs_item} ${fp_item} 500 ${model_mode}
+# Parameter description
+function _set_params(){
+    # Read the positional arguments into globals used by _train; the value after ":-" is the default.
+    run_mode="${1:-sp}"          # single GPU: sp | multi GPU: mp
+    batch_size="${2:-64}"
+    fp_item="${3:-fp32}"        # fp32|fp16
+    max_iter="${4:-500}"       # optional; for when the code must be changed to stop early
+    model_name="${5:-model_name}"
+    run_log_path="${TRAIN_LOG_DIR:-$(pwd)}"  # TRAIN_LOG_DIR will be set by QA later
+    # No need to modify below: count the GPU ids listed in CUDA_VISIBLE_DEVICES.
+    device=${CUDA_VISIBLE_DEVICES//,/ }
+    arr=(${device})
+    num_gpu_devices=${#arr[@]}
+    log_file="${run_log_path}/${model_name}_${run_mode}_bs${batch_size}_${fp_item}_${num_gpu_devices}"
+}
+
+function _train(){
+    # Run one training job under a 15-minute timeout and export job_fail_flag (0 on success, 1 on failure).
+    echo "Train on ${num_gpu_devices} GPUs"
+    echo "current CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES, gpus=$num_gpu_devices, batch_size=$batch_size"
+    train_cmd="--model_name=${model_name}
+               --batch_size=${batch_size}
+               --fp=${fp_item} \
+               --max_iter=${max_iter} "
+    case ${run_mode} in
+    sp) train_cmd="python -u tools/train.py ${train_cmd}" ;;
+    mp)
+        train_cmd="python -m paddle.distributed.launch --log_dir=./mylog --gpus=$CUDA_VISIBLE_DEVICES tools/train.py ${train_cmd}"
+        log_parse_file="mylog/workerlog.0" ;;
+    *) echo "choose run_mode(sp or mp)"; exit 1;
+    esac
+    # No need to modify below. The command is tested by `if` so that a failure is reported instead of aborting under `set -e`.
+    if timeout 15m ${train_cmd} > ${log_file} 2>&1; then
+        echo -e "${model_name}, SUCCESS"
+        export job_fail_flag=0
+    else
+        echo -e "${model_name}, FAIL"
+        export job_fail_flag=1
+    fi
+    # NOTE(review): this kills every python process returned by ps; `|| true` keeps `set -e` from aborting when a PID has already exited.
+    kill -9 `ps -ef|grep 'python'|awk '{print $2}'` || true
+
+    if [ "${run_mode}" = "mp" ] && [ -d mylog ]; then
+        rm ${log_file}
+        cp mylog/workerlog.0 ${log_file}
+    fi
+}
+
+_set_params "$@"  # quoted so each argument is passed through unsplit
+_train
+
diff --git a/utils/pd_env_collect.sh b/utils/pd_env_collect.sh
new file mode 100644
index 00000000..64ff8886
--- /dev/null
+++ b/utils/pd_env_collect.sh
@@ -0,0 +1,167 @@
+#!/usr/bin/env bash
+
+unset GREP_OPTIONS
+
+set -u  # Check for undefined variables
+
+die() {
+  # Print a message to stderr and exit with code 1.
+  #
+  # Usage: die <error_message>
+  #   e.g., die "Something bad happened."
+
+  echo "$@" >&2
+  exit 1
+}
+
+echo "Collecting system information..."
+
+OUTPUT_FILE=pd_env.txt
+python_bin_path=$(which python || which python3) || die "Cannot find Python binary"  # die must run outside $(...) to stop the script
+
+{
+echo
+echo '== check python ==================================================='
+} >> ${OUTPUT_FILE}
+# Write a helper script that prints interpreter details, then append its stdout and stderr to the report.
+cat <<EOF > /tmp/check_python.py
+import platform
+print("""python version: %s
+python branch: %s
+python build version: %s
+python compiler version: %s
+python implementation: %s
+""" % (
+platform.python_version(),
+platform.python_branch(),
+platform.python_build(),
+platform.python_compiler(),
+platform.python_implementation(),
+))
+EOF
+${python_bin_path} /tmp/check_python.py >> ${OUTPUT_FILE} 2>&1  # stdout is redirected first so stderr follows it into the file
+
+{
+echo
+echo '== check os platform ==============================================='
+} >> ${OUTPUT_FILE}
+# Write a helper script that prints OS details, then append its stdout and stderr to the report.
+cat <<EOF > /tmp/check_os.py
+import platform
+print("""os: %s
+os kernel version: %s
+os release version: %s
+os platform: %s
+linux distribution: %s
+linux os distribution: %s
+mac version: %s
+uname: %s
+architecture: %s
+machine: %s
+""" % (
+platform.system(),
+platform.version(),
+platform.release(),
+platform.platform(),
+getattr(platform, "linux_distribution", lambda: "n/a")(),  # removed in Python 3.8
+getattr(platform, "dist", lambda: "n/a")(),  # removed in Python 3.8
+platform.mac_ver(),
+platform.uname(),
+platform.architecture(),
+platform.machine(),
+))
+EOF
+${python_bin_path} /tmp/check_os.py >> ${OUTPUT_FILE} 2>&1  # stdout is redirected first so stderr follows it into the file
+
+{
+  echo
+  echo '== are we in docker ============================================='
+  num=`cat /proc/1/cgroup | grep docker | wc -l`;
+  if [ $num -ge 1 ]; then
+    echo "Yes"
+  else
+    echo "No"
+  fi
+
+  echo
+  echo '== compiler ====================================================='
+  c++ --version 2>&1
+
+  echo
+  echo '== check pips ==================================================='
+  pip list 2>&1 | grep "proto\|numpy\|paddlepaddle"
+
+  echo
+  echo '== check for virtualenv ========================================='
+  # real_prefix is set by virtualenv; a "python -m venv" environment is detected by prefix != base_prefix.
+  ${python_bin_path} -c "import sys;print(hasattr(sys, \"real_prefix\") or sys.prefix != getattr(sys, \"base_prefix\", sys.prefix))"
+
+  echo
+  echo '== paddlepaddle import ============================================'
+} >> ${OUTPUT_FILE}
+
+cat <<EOF > /tmp/check_pd.py
+import paddle as pd;
+pd.set_device('cpu')
+print("pd.version.full_version = %s" % pd.version.full_version)
+print("pd.version.commit = %s" % pd.version.commit)
+print("pd.__version__ = %s" % pd.__version__)
+print("Sanity check: %r" % pd.zeros([1,2,3])[:1])
+EOF
+${python_bin_path} /tmp/check_pd.py >> ${OUTPUT_FILE} 2>&1  # stdout is redirected first so stderr follows it into the file
+# NOTE(review): the dynamic loader normally writes LD_DEBUG output to stderr, so /tmp/loadedlibs may stay empty - confirm.
+LD_DEBUG=libs ${python_bin_path} -c "import paddle"  2>>${OUTPUT_FILE} > /tmp/loadedlibs
+
+{
+  grep libcudnn.so /tmp/loadedlibs
+  echo
+  echo '== env =========================================================='
+  # ${VAR+x} expands to "x" only when VAR is set, even if it is empty.
+  if [ -n "${LD_LIBRARY_PATH+x}" ]; then
+    echo LD_LIBRARY_PATH ${LD_LIBRARY_PATH}
+  else
+    echo "LD_LIBRARY_PATH is unset"
+  fi
+  if [ -n "${DYLD_LIBRARY_PATH+x}" ]; then
+    echo DYLD_LIBRARY_PATH ${DYLD_LIBRARY_PATH}
+  else
+    echo "DYLD_LIBRARY_PATH is unset"
+  fi
+
+  echo
+  echo '== nvidia-smi ==================================================='
+  nvidia-smi 2>&1
+
+  echo
+  echo '== cuda libs  ==================================================='
+} >> ${OUTPUT_FILE}
+
+find /usr/local -type f -name 'libcudart*'  2>/dev/null | grep cuda |  grep -v "\\.cache" >> ${OUTPUT_FILE}
+find /usr/local -type f -name 'libcudnn*'  2>/dev/null | grep cuda |  grep -v "\\.cache" >> ${OUTPUT_FILE}
+
+{
+  echo
+  echo '== paddlepaddle installed from info =================='
+  pip show paddlepaddle-gpu
+
+  echo
+  echo '== python version  =============================================='
+  echo '(major, minor, micro, releaselevel, serial)'
+  ${python_bin_path} -c 'import sys; print(sys.version_info[:])'  # same interpreter as the checks above; bare "python" may not exist
+
+  echo
+  echo '== bazel version  ==============================================='
+  bazel version
+  echo '== cmake version  ==============================================='
+  cmake --version
+} >> ${OUTPUT_FILE}
+
+# Drop every line that mentions "google" (case-insensitive); the unfiltered copy is kept as old-${OUTPUT_FILE}.
+mv "${OUTPUT_FILE}" "old-${OUTPUT_FILE}"
+grep -i -v google "old-${OUTPUT_FILE}" > "${OUTPUT_FILE}"
+
+printf '%s\n' \
+  "Wrote environment to ${OUTPUT_FILE}. You can review the contents of that file." \
+  "and use it to populate the fields in the github issue template." \
+  "" \
+  "cat ${OUTPUT_FILE}" ""