From cda6ca8323935038efc51e911253cb12b24c923a Mon Sep 17 00:00:00 2001
From: Hui Zhang <zhtclz@foxmail.com>
Date: Thu, 16 Sep 2021 12:16:13 +0000
Subject: [PATCH] add benchmark flags, and logic

---
 deepspeech/exps/u2/model.py        |  3 ++-
 deepspeech/training/cli.py         | 40 ++++++++++++------------------
 deepspeech/training/trainer.py     | 14 +++++++++++
 deepspeech/utils/utility.py        | 14 ++++++++++-
 examples/aishell/s1/local/train.sh | 10 +++++---
 examples/tiny/s1/local/train.sh    | 40 +++++++++++++++++++-----------
 tests/benchmark/run_all.sh         | 29 +++++++++++++---------
 tests/benchmark/run_benchmark.sh   | 21 ++++++++--------
 8 files changed, 106 insertions(+), 65 deletions(-)
 mode change 100644 => 100755 tests/benchmark/run_all.sh
 mode change 100644 => 100755 tests/benchmark/run_benchmark.sh

diff --git a/deepspeech/exps/u2/model.py b/deepspeech/exps/u2/model.py
index 1328a1cb..0d17d9fd 100644
--- a/deepspeech/exps/u2/model.py
+++ b/deepspeech/exps/u2/model.py
@@ -100,7 +100,8 @@ class U2Trainer(Trainer):
             # Disable gradient synchronizations across DDP processes.
             # Within this context, gradients will be accumulated on module
             # variables, which will later be synchronized.
-            context = self.model.no_sync
+            # When using cpu w/o DDP, model does not have `no_sync`
+            context = self.model.no_sync if self.parallel else nullcontext
         else:
             # Used for single gpu training and DDP gradient synchronization
             # processes.
diff --git a/deepspeech/training/cli.py b/deepspeech/training/cli.py
index 1477bdfe..d8719b3a 100644
--- a/deepspeech/training/cli.py
+++ b/deepspeech/training/cli.py
@@ -44,32 +44,24 @@ def default_argument_parser():
     parser = argparse.ArgumentParser()
 
     # yapf: disable
-    # data and output
-    parser.add_argument("--config", metavar="FILE", help="path of the config file to overwrite to default config with.")
-    parser.add_argument("--dump-config", metavar="FILE", help="dump config to yaml file.")
-    parser.add_argument("--output", metavar="OUTPUT_DIR", help="path to save checkpoint and logs.")
-
-    # load from saved checkpoint
-    parser.add_argument("--checkpoint_path", type=str, help="path of the checkpoint to load")
-
-    # running
-    parser.add_argument("--device", type=str, default='gpu', choices=["cpu", "gpu"],
-                        help="device type to use, cpu and gpu are supported.")
-    parser.add_argument("--nprocs", type=int, default=1, help="number of parallel processes to use.")
-
-    # overwrite extra config and default config
-    # parser.add_argument("--opts", nargs=argparse.REMAINDER,
-    # help="options to overwrite --config file and the default config, passing in KEY VALUE pairs")
-    parser.add_argument("--opts", type=str, default=[], nargs='+',
-                        help="options to overwrite --config file and the default config, passing in KEY VALUE pairs")
-
-    # random seed
-    parser.add_argument("--seed", type=int, default=None,
+    train_group = parser.add_argument_group(title='Train Options', description=None)
+    train_group.add_argument("--seed", type=int, default=None,
                         help="seed to use for paddle, np and random. None or 0 for random, else set seed.")
-
-    # profiler
-    parser.add_argument('--profiler_options', type=str, default=None,
+    train_group.add_argument("--device", type=str, default='gpu', choices=["cpu", "gpu"],
+        help="device cpu and gpu are supported.")
+    train_group.add_argument("--nprocs", type=int, default=1, help="number of parallel processes. 0 for cpu.")
+    train_group.add_argument("--config", metavar="CONFIG_FILE", help="config file.")
+    train_group.add_argument("--output", metavar="CKPT_DIR", help="path to save checkpoint.")
+    train_group.add_argument("--checkpoint_path", type=str, help="path to load checkpoint")
+    train_group.add_argument("--opts", type=str, default=[], nargs='+',
+                        help="overwrite --config file, passing in LIST[KEY VALUE] pairs")
+    train_group.add_argument("--dump-config", metavar="FILE", help="dump config to `this` file.")
+
+    bench_group = parser.add_argument_group(title='Benchmark Options', description=None)
+    bench_group.add_argument('--profiler-options', '--profiler_options', type=str, default=None,
         help='The option of profiler, which should be in format \"key1=value1;key2=value2;key3=value3\".')
+    bench_group.add_argument('--benchmark-batch-size', type=int, default=None, help='batch size for benchmark.')
+    bench_group.add_argument('--benchmark-max-step', type=int, default=None, help='max iteration for benchmark.')
     # yapd: enable
 
     return parser
diff --git a/deepspeech/training/trainer.py b/deepspeech/training/trainer.py
index 6587f129..9549a4dd 100644
--- a/deepspeech/training/trainer.py
+++ b/deepspeech/training/trainer.py
@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import sys
 import time
 from pathlib import Path
 
@@ -24,6 +25,7 @@ from deepspeech.utils import profiler
 from deepspeech.utils.checkpoint import Checkpoint
 from deepspeech.utils.log import Log
 from deepspeech.utils.utility import seed_all
+from deepspeech.utils.utility import UpdateConfig
 
 __all__ = ["Trainer"]
 
@@ -101,6 +103,12 @@ class Trainer():
             seed_all(args.seed)
             logger.info(f"Set seed {args.seed}")
 
+        if self.args.benchmark_batch_size:
+            with UpdateConfig(self.config):
+                self.config.collator.batch_size = self.args.benchmark_batch_size
+            logger.info(
+                f"Benchmark reset batch-size: {self.args.benchmark_batch_size}")
+
     def setup(self):
         """Setup the experiment.
         """
@@ -188,6 +196,12 @@ class Trainer():
         if self.args.profiler_options:
             profiler.add_profiler_step(self.args.profiler_options)
 
+        if self.args.benchmark_max_step and self.iteration > self.args.benchmark_max_step:
+            logger.info(
+                f"Reach benchmark-max-step: {self.args.benchmark_max_step}")
+            # The step limit is the expected end of a benchmark run: exit 0.
+            sys.exit(0)
+
     def train(self):
         """The training process control by epoch."""
         from_scratch = self.resume_or_scratch()
diff --git a/deepspeech/utils/utility.py b/deepspeech/utils/utility.py
index e18fc1f7..6f84c41b 100644
--- a/deepspeech/utils/utility.py
+++ b/deepspeech/utils/utility.py
@@ -16,15 +16,33 @@ import distutils.util
 import math
 import os
 import random
+from contextlib import contextmanager
 from typing import List
 
 import numpy as np
 import paddle
 
-__all__ = ["seed_all", 'print_arguments', 'add_arguments', "log_add"]
+__all__ = [
+    "UpdateConfig", "seed_all", 'print_arguments', 'add_arguments', "log_add"
+]
+
+
+@contextmanager
+def UpdateConfig(config):
+    """Defrost a yacs config for the `with` body, then re-freeze it.
+
+    Freezing happens in `finally`, so an exception raised in the body
+    does not leave the config mutable.
+    """
+    config.defrost()
+    try:
+        yield
+    finally:
+        config.freeze()
 
 
 def seed_all(seed: int=210329):
+    """freeze random generator seed."""
     np.random.seed(seed)
     random.seed(seed)
     paddle.seed(seed)
diff --git a/examples/aishell/s1/local/train.sh b/examples/aishell/s1/local/train.sh
index e065ad6a..5b9c45f5 100755
--- a/examples/aishell/s1/local/train.sh
+++ b/examples/aishell/s1/local/train.sh
@@ -1,7 +1,8 @@
 #!/bin/bash
 
-
 profiler_options=
+benchmark_batch_size=
+benchmark_max_step=
 
 # seed may break model convergence
 seed=0
@@ -32,12 +33,15 @@ ckpt_name=$2
 mkdir -p exp
 
 python3 -u ${BIN_DIR}/train.py \
+--seed ${seed} \
 --device ${device} \
 --nproc ${ngpu} \
 --config ${config_path} \
 --output exp/${ckpt_name} \
---profiler_options ${profiler_options} \
---seed ${seed}
+--profiler-options "${profiler_options}" \
+${benchmark_batch_size:+--benchmark-batch-size ${benchmark_batch_size}} \
+${benchmark_max_step:+--benchmark-max-step ${benchmark_max_step}}
+
 
 if [ ${seed} != 0  ]; then
     unset FLAGS_cudnn_deterministic
diff --git a/examples/tiny/s1/local/train.sh b/examples/tiny/s1/local/train.sh
index 374608fd..56ceab41 100755
--- a/examples/tiny/s1/local/train.sh
+++ b/examples/tiny/s1/local/train.sh
@@ -1,37 +1,49 @@
 #!/bin/bash
 
-if [ $# != 2 ];then
-    echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name"
-    exit -1
-fi
+profiler_options=
+benchmark_batch_size=
+benchmark_max_step=
+
+# seed may break model convergence
+seed=0
+
+source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
 
 ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
 echo "using $ngpu gpus..."
 
-config_path=$1
-ckpt_name=$2
-
 device=gpu
 if [ ${ngpu} == 0 ];then
     device=cpu
 fi
 
-mkdir -p exp
-
-# seed may break model convergence
-seed=0
-if [ ${seed} != 0 ]; then
+if [ ${seed} != 0  ]; then
     export FLAGS_cudnn_deterministic=True
+    echo "using seed $seed & FLAGS_cudnn_deterministic=True ..."
+fi
+
+if [ $# != 2 ];then
+    echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name"
+    exit -1
 fi
 
+config_path=$1
+ckpt_name=$2
+
+mkdir -p exp
+
 python3 -u ${BIN_DIR}/train.py \
+--seed ${seed} \
 --device ${device} \
 --nproc ${ngpu} \
 --config ${config_path} \
 --output exp/${ckpt_name} \
---seed ${seed}
+--profiler-options "${profiler_options}" \
+${benchmark_batch_size:+--benchmark-batch-size ${benchmark_batch_size}} \
+${benchmark_max_step:+--benchmark-max-step ${benchmark_max_step}}
+
 
-if [ ${seed} != 0 ]; then
+if [ ${seed} != 0  ]; then
     unset FLAGS_cudnn_deterministic
 fi
 
diff --git a/tests/benchmark/run_all.sh b/tests/benchmark/run_all.sh
old mode 100644
new mode 100755
index 7aa11d0f..6f707cdc
--- a/tests/benchmark/run_all.sh
+++ b/tests/benchmark/run_all.sh
@@ -1,41 +1,46 @@
 #!/bin/bash
 
+CUR_DIR=${PWD}
 ROOT_DIR=../../
 
 # 提供可稳定复现性能的脚本，默认在标准docker环境内py37执行：
 # collect env info
 bash ${ROOT_DIR}/utils/pd_env_collect.sh
-cat pd_env.txt
+#cat pd_env.txt
 
-# 执行目录：需说明
-pushd ${ROOT_DIR}/examples/aishell/s1
 
 # 1 安装该模型需要的依赖 (如需开启优化策略请注明)
-pushd ${ROOT_DIR}/tools; make; popd
-source ${ROOT_DIR}/tools/venv/bin/activate
-pushd ${ROOT_DIR}; bash setup.sh; popd
+#pushd ${ROOT_DIR}/tools; make; popd
+#source ${ROOT_DIR}/tools/venv/bin/activate
+#pushd ${ROOT_DIR}; bash setup.sh; popd
 
 
 # 2 拷贝该模型需要数据、预训练模型
+
+# 执行目录：需说明
+#pushd ${ROOT_DIR}/examples/aishell/s1
+pushd ${ROOT_DIR}/examples/tiny/s1
+
 mkdir -p exp/log
-loca/data.sh &> exp/log/data.log
+. path.sh
+#bash local/data.sh &> exp/log/data.log
 
 # 3 批量运行（如不方便批量，1，2需放到单个模型中）
 
-model_mode_list=(conformer)
+model_mode_list=(conformer transformer)
 fp_item_list=(fp32)
-bs_item=(32 64 96)
+bs_item_list=(32 64 96)
 for model_mode in ${model_mode_list[@]}; do
       for fp_item in ${fp_item_list[@]}; do
-          for bs_item in ${bs_list[@]}
+          for bs_item in ${bs_item_list[@]}
             do
             echo "index is speed, 1gpus, begin, ${model_name}"
             run_mode=sp
-            CUDA_VISIBLE_DEVICES=0 bash run_benchmark.sh ${run_mode} ${bs_item} ${fp_item} 500 ${model_mode}     #  (5min)
+            CUDA_VISIBLE_DEVICES=0 bash ${CUR_DIR}/run_benchmark.sh ${run_mode} ${bs_item} ${fp_item} 500 ${model_mode}     #  (5min)
             sleep 60
             echo "index is speed, 8gpus, run_mode is multi_process, begin, ${model_name}"
             run_mode=mp
-            CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash run_benchmark.sh ${run_mode} ${bs_item} ${fp_item} 500 ${model_mode}
+            CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash ${CUR_DIR}/run_benchmark.sh ${run_mode} ${bs_item} ${fp_item} 500 ${model_mode}
             sleep 60
             done
       done
diff --git a/tests/benchmark/run_benchmark.sh b/tests/benchmark/run_benchmark.sh
old mode 100644
new mode 100755
index 625d3616..eb111793
--- a/tests/benchmark/run_benchmark.sh
+++ b/tests/benchmark/run_benchmark.sh
@@ -23,19 +23,19 @@ function _train(){
     echo "Train on ${num_gpu_devices} GPUs"
     echo "current CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES, gpus=$num_gpu_devices, batch_size=$batch_size"
 
-    train_cmd="--model_name=${model_name}
-               --batch_size=${batch_size}
-               --fp=${fp_item} \
-               --max_iter=${max_iter} "
+    train_cmd="--benchmark-batch-size ${batch_size}
+               --benchmark-max-step ${max_iter}
+               conf/${model_name}.yaml ${model_name}"
+
     case ${run_mode} in
-    sp) train_cmd="python -u tools/train.py "${train_cmd}" ;;
+    sp) train_cmd="bash local/train.sh "${train_cmd}"" ;;
     mp)
-        train_cmd="python -m paddle.distributed.launch --log_dir=./mylog --gpus=$CUDA_VISIBLE_DEVICES tools/train.py "${train_cmd}"
-        log_parse_file="mylog/workerlog.0" ;;
+        train_cmd="bash local/train.sh "${train_cmd}"" ;;
     *) echo "choose run_mode(sp or mp)"; exit 1;
     esac
-# 以下不用修改
-    timeout 15m ${train_cmd} > ${log_file} 2>&1
+
+    # 以下不用修改
+    CUDA_VISIBLE_DEVICES=${device} timeout 15m ${train_cmd} > ${log_file} 2>&1
     if [ $? -ne 0 ];then
         echo -e "${model_name}, FAIL"
         export job_fail_flag=1
@@ -43,7 +43,8 @@ function _train(){
         echo -e "${model_name}, SUCCESS"
         export job_fail_flag=0
     fi
-    kill -9 `ps -ef|grep 'python'|awk '{print $2}'`
+
+    trap 'for pid in $(jobs -pr); do kill -KILL $pid; done' INT QUIT TERM
 
     if [ $run_mode = "mp" -a -d mylog ]; then
         rm ${log_file}