diff --git a/deepspeech/exps/u2/bin/train.py b/deepspeech/exps/u2/bin/train.py
index fef615ce..b664401a 100644
--- a/deepspeech/exps/u2/bin/train.py
+++ b/deepspeech/exps/u2/bin/train.py
@@ -21,6 +21,7 @@ from deepspeech.exps.u2.config import get_cfg_defaults
 from deepspeech.exps.u2.model import U2Trainer as Trainer
 from deepspeech.training.cli import default_argument_parser
 from deepspeech.utils.utility import print_arguments
+
 # from deepspeech.exps.u2.trainer import U2Trainer as Trainer
 
 
diff --git a/deepspeech/exps/u2/model.py b/deepspeech/exps/u2/model.py
index 2b6e2433..67b666ed 100644
--- a/deepspeech/exps/u2/model.py
+++ b/deepspeech/exps/u2/model.py
@@ -204,6 +204,7 @@ class U2Trainer(Trainer):
                         msg += "lr: {:>.8f}, ".format(self.lr_scheduler())
                         msg += "data time: {:>.3f}s, ".format(dataload_time)
                         self.train_batch(batch_index, batch, msg)
+                        self.after_train_batch()
                         data_start_time = time.time()
                 except Exception as e:
                     logger.error(e)
diff --git a/deepspeech/exps/u2_kaldi/model.py b/deepspeech/exps/u2_kaldi/model.py
index 095dfe34..3d15e025 100644
--- a/deepspeech/exps/u2_kaldi/model.py
+++ b/deepspeech/exps/u2_kaldi/model.py
@@ -205,6 +205,7 @@ class U2Trainer(Trainer):
                         msg += "lr: {:>.8f}, ".format(self.lr_scheduler())
                         msg += "data time: {:>.3f}s, ".format(dataload_time)
                         self.train_batch(batch_index, batch, msg)
+                        self.after_train_batch()
                         data_start_time = time.time()
                 except Exception as e:
                     logger.error(e)
diff --git a/deepspeech/exps/u2_st/model.py b/deepspeech/exps/u2_st/model.py
index 8dca1654..91a81503 100644
--- a/deepspeech/exps/u2_st/model.py
+++ b/deepspeech/exps/u2_st/model.py
@@ -222,6 +222,7 @@ class U2STTrainer(Trainer):
                         msg += "lr: {:>.8f}, ".format(self.lr_scheduler())
                         msg += "data time: {:>.3f}s, ".format(dataload_time)
                         self.train_batch(batch_index, batch, msg)
+                        self.after_train_batch()
                         data_start_time = time.time()
                 except Exception as e:
                     logger.error(e)
diff --git a/deepspeech/training/cli.py b/deepspeech/training/cli.py
index 7f4bb804..1477bdfe 100644
--- a/deepspeech/training/cli.py
+++ b/deepspeech/training/cli.py
@@ -63,8 +63,13 @@ def default_argument_parser():
     parser.add_argument("--opts", type=str, default=[], nargs='+',
                         help="options to overwrite --config file and the default config, passing in KEY VALUE pairs")
 
+    # random seed
     parser.add_argument("--seed", type=int, default=None,
                         help="seed to use for paddle, np and random. None or 0 for random, else set seed.")
+
+    # profiler
+    parser.add_argument('--profiler_options', type=str, default=None,
+        help='The option of profiler, which should be in format \"key1=value1;key2=value2;key3=value3\".')
     # yapd: enable
 
     return parser
diff --git a/deepspeech/training/trainer.py b/deepspeech/training/trainer.py
index 7959b41b..bdb68310 100644
--- a/deepspeech/training/trainer.py
+++ b/deepspeech/training/trainer.py
@@ -20,6 +20,7 @@ from tensorboardX import SummaryWriter
 
 from deepspeech.training.timer import Timer
 from deepspeech.utils import mp_tools
+from deepspeech.utils import profiler
 from deepspeech.utils.checkpoint import Checkpoint
 from deepspeech.utils.log import Log
 from deepspeech.utils.utility import seed_all
@@ -183,6 +184,9 @@ class Trainer():
             if isinstance(batch_sampler, paddle.io.DistributedBatchSampler):
                 batch_sampler.set_epoch(self.epoch)
 
+    def after_train_batch(self):  # hook invoked right after train_batch()
+        profiler.add_profiler_step(self.args.profiler_options)  # no-op if None
+
     def train(self):
         """The training process control by epoch."""
         from_scratch = self.resume_or_scratch()
@@ -209,6 +213,7 @@ class Trainer():
                         msg += "lr: {:>.8f}, ".format(self.lr_scheduler())
                         msg += "data time: {:>.3f}s, ".format(dataload_time)
                         self.train_batch(batch_index, batch, msg)
+                        self.after_train_batch()
                         data_start_time = time.time()
                 except Exception as e:
                     logger.error(e)
diff --git a/deepspeech/utils/profiler.py b/deepspeech/utils/profiler.py
new file mode 100644
index 00000000..5b8389be
--- /dev/null
+++ b/deepspeech/utils/profiler.py
@@ -0,0 +1,116 @@
+# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import sys
+
+import paddle
+
+from deepspeech.utils.log import Log
+
+logger = Log(__name__).getlog()
+
+# A global variable to record the number of calling times for profiler
+# functions. It is used to specify the tracing range of training steps.
+_profiler_step_id = 0
+
+# A global variable to avoid parsing from string every time.
+_profiler_options = None
+
+
+class ProfilerOptions(object):
+    '''
+    Profiler settings parsed from a string such as "key1=value1;key2=value2".
+    For example:
+      "profile_path=model.profile"
+      "batch_range=[50, 60]; profile_path=model.profile"
+      "batch_range=[50, 60]; tracer_option=OpDetail; profile_path=model.profile"
+    Empty segments are skipped, unknown keys are ignored and a segment without
+    '=' raises ValueError. ProfilerOptions supports following key-value pair:
+      batch_range      - an integer list [start, stop] with 0 <= start < stop,
+                         e.g. [100, 110]; an invalid range keeps the default.
+      state            - a string, the optional values are 'CPU', 'GPU' or 'All'.
+      sorted_key       - a string, the optional values are 'calls', 'total',
+                         'max', 'min' or 'ave'.
+      tracer_option    - a string, the optional values are 'Default', 'OpDetail',
+                         'AllOpDetail'.
+      profile_path     - a string, the path to save the serialized profile data,
+                         which can be used to generate a timeline.
+      exit_on_finished - a boolean; "yes", "true", "t" or "1" mean True.
+    '''
+
+    def __init__(self, options_str):
+        assert isinstance(options_str, str)
+        self._options = {
+            'batch_range': [10, 20],
+            'state': 'All',
+            'sorted_key': 'total',
+            'tracer_option': 'Default',
+            'profile_path': '/tmp/profile',
+            'exit_on_finished': True
+        }
+        self._parse_from_string(options_str)
+
+    def _parse_from_string(self, options_str):
+        # Trim tokens instead of deleting every space; split on the first '='.
+        for kv in filter(None, map(str.strip, options_str.split(';'))):
+            key, sep, value = map(str.strip, kv.partition('='))
+            if not sep:
+                raise ValueError("Invalid profiler option '%s'." % kv)
+            if key == 'batch_range':
+                value_list = value.replace('[', '').replace(']', '').split(',')
+                value_list = list(map(int, value_list))
+                if len(value_list) >= 2 and 0 <= value_list[0] < value_list[1]:
+                    self._options[key] = value_list
+            elif key == 'exit_on_finished':
+                self._options[key] = value.lower() in ("yes", "true", "t", "1")
+            elif key in ('state', 'sorted_key', 'tracer_option', 'profile_path'):
+                self._options[key] = value
+
+    def __getitem__(self, name):
+        if name not in self._options:
+            raise ValueError(
+                "ProfilerOptions does not have an option named %s." % name)
+        return self._options[name]
+
+
+def add_profiler_step(options_str=None):
+    '''
+    Enable the operator-level timing using PaddlePaddle's profiler.
+    The profiler uses an independent variable to count the profiler steps.
+    One call of this function is treated as a profiler step.
+
+    Args:
+      options_str - a string to initialize the ProfilerOptions. Default is
+                    None; None or an empty string disables the profiler.
+    '''
+    if not options_str:
+        return
+
+    global _profiler_step_id, _profiler_options
+
+    # Parse the option string only once; later calls reuse the cached result.
+    if _profiler_options is None:
+        _profiler_options = ProfilerOptions(options_str)
+        logger.info(f"{options_str}")
+        logger.info(f"{_profiler_options._options}")
+
+    if _profiler_step_id == _profiler_options['batch_range'][0]:
+        paddle.utils.profiler.start_profiler(_profiler_options['state'],
+                                             _profiler_options['tracer_option'])
+    elif _profiler_step_id == _profiler_options['batch_range'][1]:
+        paddle.utils.profiler.stop_profiler(_profiler_options['sorted_key'],
+                                            _profiler_options['profile_path'])
+        if _profiler_options['exit_on_finished']:
+            sys.exit(0)
+
+    _profiler_step_id += 1
diff --git a/examples/aishell/s1/local/train.sh b/examples/aishell/s1/local/train.sh
index f905b766..e065ad6a 100755
--- a/examples/aishell/s1/local/train.sh
+++ b/examples/aishell/s1/local/train.sh
@@ -1,38 +1,45 @@
 #!/bin/bash
 
-if [ $# != 2 ];then
-    echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name"
-    exit -1
-fi
+
+profiler_options=
+
+# seed may break model convergence
+seed=0
+
+source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
 
 ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
 echo "using $ngpu gpus..."
 
-config_path=$1
-ckpt_name=$2
-
 device=gpu
 if [ ${ngpu} == 0 ];then
     device=cpu
 fi
-echo "using ${device}..."
-
-mkdir -p exp
 
-# seed may break model convergence
-seed=0
-if [ ${seed} != 0 ]; then
+if [ ${seed} != 0  ]; then
     export FLAGS_cudnn_deterministic=True
+    echo "using seed $seed & FLAGS_cudnn_deterministic=True ..."
 fi
 
+if [ $# != 2 ];then
+    echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name"
+    exit -1
+fi
+
+config_path=$1
+ckpt_name=$2
+
+mkdir -p exp
+# Pass --profiler_options only when set: a bare flag makes argparse abort.
 python3 -u ${BIN_DIR}/train.py \
 --device ${device} \
 --nproc ${ngpu} \
 --config ${config_path} \
 --output exp/${ckpt_name} \
+${profiler_options:+--profiler_options "${profiler_options}"} \
 --seed ${seed}
 
-if [ ${seed} != 0 ]; then
+if [ ${seed} != 0  ]; then
     unset FLAGS_cudnn_deterministic
 fi
 
@@ -41,4 +48,4 @@ if [ $? -ne 0 ]; then
     exit 1
 fi
 
-exit 0
+exit 0
\ No newline at end of file
diff --git a/examples/librispeech/s0/conf/deepspeech2.yaml b/examples/librispeech/s0/conf/deepspeech2.yaml
index d5b1ed91..3f1a376f 100644
--- a/examples/librispeech/s0/conf/deepspeech2.yaml
+++ b/examples/librispeech/s0/conf/deepspeech2.yaml
@@ -11,7 +11,7 @@ data:
   max_output_input_ratio: .inf
 
 collator:
-  batch_size: 15
+  batch_size: 20
   mean_std_filepath: data/mean_std.json
   unit_type: char
   vocab_filepath: data/vocab.txt 
@@ -45,7 +45,7 @@ model:
 
 training:
   n_epoch: 50
-  accum_grad: 4
+  accum_grad: 1
   lr: 1e-3
   lr_decay: 0.83
   weight_decay: 1e-06
diff --git a/examples/tiny/s0/local/train.sh b/examples/tiny/s0/local/train.sh
index ea29b7fc..a657ce34 100755
--- a/examples/tiny/s0/local/train.sh
+++ b/examples/tiny/s0/local/train.sh
@@ -1,36 +1,44 @@
 #!/bin/bash
 
-if [ $# != 3 ];then
-    echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name model_type"
-    exit -1
-fi
+profiler_options=
+
+# seed may break model convergence
+seed=0
+
+source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
 
 ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
 echo "using $ngpu gpus..."
 
-config_path=$1
-ckpt_name=$2
-model_type=$3
-
 device=gpu
 if [ ${ngpu} == 0 ];then
     device=cpu
 fi
 
-mkdir -p exp
-
-# seed may break model convergence
-seed=0
 if [ ${seed} != 0  ]; then
     export FLAGS_cudnn_deterministic=True
+    echo "using seed $seed & FLAGS_cudnn_deterministic=True ..."
+fi
+
+
+if [ $# != 3 ];then
+    echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name model_type"
+    exit -1
 fi
 
+config_path=$1
+ckpt_name=$2
+model_type=$3
+
+mkdir -p exp
+# Pass --profiler_options only when set: a bare flag makes argparse abort.
 python3 -u ${BIN_DIR}/train.py \
 --device ${device} \
 --nproc ${ngpu} \
 --config ${config_path} \
 --output exp/${ckpt_name} \
 --model_type ${model_type} \
+${profiler_options:+--profiler_options "${profiler_options}"} \
 --seed ${seed}
 
 if [ ${seed} != 0  ]; then