diff --git a/deepspeech/training/gradclip.py b/deepspeech/training/gradclip.py
index f46814eb..6393197a 100644
--- a/deepspeech/training/gradclip.py
+++ b/deepspeech/training/gradclip.py
@@ -48,9 +48,8 @@ class ClipGradByGlobalNormWithLog(paddle.nn.ClipGradByGlobalNorm):
             sum_square_list.append(sum_square)
 
             # debug log
-            if i < 10:
-                logger.debug(
-                    f"Grad Before Clip: {p.name}: {float(sum_square.sqrt()) }")
+            logger.debug(
+                f"Grad Before Clip: {p.name}: {float(sum_square.sqrt()) }")
 
         # all parameters have been filterd out
         if len(sum_square_list) == 0:
@@ -77,9 +76,8 @@ class ClipGradByGlobalNormWithLog(paddle.nn.ClipGradByGlobalNorm):
             params_and_grads.append((p, new_grad))
 
             # debug log
-            if i < 10:
-                logger.debug(
-                    f"Grad After Clip: {p.name}: {float(new_grad.square().sum().sqrt())}"
-                )
+            logger.debug(
+                f"Grad After Clip: {p.name}: {float(new_grad.square().sum().sqrt())}"
+            )
 
         return params_and_grads
diff --git a/deepspeech/training/timer.py b/deepspeech/training/timer.py
index 2076737b..2ca9d638 100644
--- a/deepspeech/training/timer.py
+++ b/deepspeech/training/timer.py
@@ -27,7 +27,7 @@ class Timer():
             do some thing
     """
 
-    def __init__(self, message):
+    def __init__(self, message=None):
         self.message = message
 
     def duration(self) -> str:
@@ -40,7 +40,8 @@ class Timer():
         return self
 
     def __exit__(self, type, value, traceback):
-        logger.info(self.message.format(self.duration()))
+        if self.message:
+            logger.info(self.message.format(self.duration()))
 
     def __call__(self) -> float:
         return time.time() - self.start
diff --git a/deepspeech/training/trainer.py b/deepspeech/training/trainer.py
index 521297d7..25c002df 100644
--- a/deepspeech/training/trainer.py
+++ b/deepspeech/training/trainer.py
@@ -185,46 +185,47 @@ class Trainer():
 
     def train(self):
         """The training process control by epoch."""
-        with Timer("Load/Init Model: {}"):
-            from_scratch = self.resume_or_scratch()
-            if from_scratch:
-                # save init model, i.e. 0 epoch
-                self.save(tag='init', infos=None)
-            self.lr_scheduler.step(self.epoch)
-            if self.parallel and hasattr(self.train_loader, "batch_sampler"):
-                self.train_loader.batch_sampler.set_epoch(self.epoch)
+        from_scratch = self.resume_or_scratch()
+        if from_scratch:
+            # save init model, i.e. 0 epoch
+            self.save(tag='init', infos=None)
+        self.lr_scheduler.step(self.epoch)
+        if self.parallel and hasattr(self.train_loader, "batch_sampler"):
+            self.train_loader.batch_sampler.set_epoch(self.epoch)
 
         logger.info(f"Train Total Examples: {len(self.train_loader.dataset)}")
         while self.epoch < self.config.training.n_epoch:
-            self.model.train()
-            try:
-                data_start_time = time.time()
-                for batch_index, batch in enumerate(self.train_loader):
-                    dataload_time = time.time() - data_start_time
-                    msg = "Train: Rank: {}, ".format(dist.get_rank())
-                    msg += "epoch: {}, ".format(self.epoch)
-                    msg += "step: {}, ".format(self.iteration)
-                    msg += "batch : {}/{}, ".format(batch_index + 1,
-                                                    len(self.train_loader))
-                    msg += "lr: {:>.8f}, ".format(self.lr_scheduler())
-                    msg += "data time: {:>.3f}s, ".format(dataload_time)
-                    self.train_batch(batch_index, batch, msg)
+            with Timer("Epoch-Train Time Cost: {}"):
+                self.model.train()
+                try:
                     data_start_time = time.time()
-            except Exception as e:
-                logger.error(e)
-                raise e
-
-            total_loss, num_seen_utts = self.valid()
-            if dist.get_world_size() > 1:
-                num_seen_utts = paddle.to_tensor(num_seen_utts)
-                # the default operator in all_reduce function is sum.
-                dist.all_reduce(num_seen_utts)
-                total_loss = paddle.to_tensor(total_loss)
-                dist.all_reduce(total_loss)
-                cv_loss = total_loss / num_seen_utts
-                cv_loss = float(cv_loss)
-            else:
-                cv_loss = total_loss / num_seen_utts
+                    for batch_index, batch in enumerate(self.train_loader):
+                        dataload_time = time.time() - data_start_time
+                        msg = "Train: Rank: {}, ".format(dist.get_rank())
+                        msg += "epoch: {}, ".format(self.epoch)
+                        msg += "step: {}, ".format(self.iteration)
+                        msg += "batch : {}/{}, ".format(batch_index + 1,
+                                                        len(self.train_loader))
+                        msg += "lr: {:>.8f}, ".format(self.lr_scheduler())
+                        msg += "data time: {:>.3f}s, ".format(dataload_time)
+                        self.train_batch(batch_index, batch, msg)
+                        data_start_time = time.time()
+                except Exception as e:
+                    logger.error(e)
+                    raise e
+
+            with Timer("Eval Time Cost: {}"):
+                total_loss, num_seen_utts = self.valid()
+                if dist.get_world_size() > 1:
+                    num_seen_utts = paddle.to_tensor(num_seen_utts)
+                    # the default operator in all_reduce function is sum.
+                    dist.all_reduce(num_seen_utts)
+                    total_loss = paddle.to_tensor(total_loss)
+                    dist.all_reduce(total_loss)
+                    cv_loss = total_loss / num_seen_utts
+                    cv_loss = float(cv_loss)
+                else:
+                    cv_loss = total_loss / num_seen_utts
 
             logger.info(
                 'Epoch {} Val info val_loss {}'.format(self.epoch, cv_loss))
diff --git a/doc/src/deepspeech_architecture.md b/doc/src/deepspeech_architecture.md
index dfa60790..c4c102ba 100644
--- a/doc/src/deepspeech_architecture.md
+++ b/doc/src/deepspeech_architecture.md
@@ -1,8 +1,8 @@
 # Deepspeech2
 ## Streaming
 
-The implemented arcitecure of Deepspeech2 online model is based on [Deepspeech2 model](https://arxiv.org/pdf/1512.02595.pdf) with some changes. 
-The model is mainly composed of 2D convolution subsampling layer and stacked single direction rnn layers. 
+The implemented architecture of the Deepspeech2 online model is based on the [Deepspeech2 model](https://arxiv.org/pdf/1512.02595.pdf) with some changes.
+The model is mainly composed of a 2D convolution subsampling layer and stacked single-direction RNN layers.
 
 To illustrate the model implementation clearly, 3 parts are described in detail.  
 - Data Preparation
@@ -11,10 +11,10 @@ To illustrate the model implementation clearly, 3 parts are described in detail.
 
 In addition, the training process and the testing process are also introduced.
 
-The arcitecture of the model is shown in Fig.1. 
+The architecture of the model is shown in Fig.1.
 
 <p align="center">
-<img src="../images/ds2onlineModel.png" width=800> 
+<img src="../images/ds2onlineModel.png" width=800>
 <br/>Fig.1 The Arcitecture of deepspeech2 online model
 </p>
 
@@ -28,17 +28,17 @@ For English data, the vocabulary dictionary is composed of 26 English characters
      --unit_type="char" \
      --count_threshold=0 \
      --vocab_path="data/vocab.txt" \
-     --manifest_paths "data/manifest.train.raw" "data/manifest.dev.raw" 
- 
+     --manifest_paths "data/manifest.train.raw" "data/manifest.dev.raw"
+
 # vocabulary for aishell dataset (Mandarin)
 vi examples/aishell/s0/data/vocab.txt
- 
+
 # vocabulary for librispeech dataset (English)
 vi examples/librispeech/s0/data/vocab.txt
 ```
 
 #### CMVN
-For CMVN, a subset or the full of traininig set is chosed and be used to compute the feature mean and std. 
+For CMVN, a subset or the whole of the training set is chosen and used to compute the feature mean and std.
 ```
  # The code to compute the feature mean and std
 cd examples/aishell/s0
@@ -52,16 +52,16 @@ python3 ../../../utils/compute_mean_std.py \
      --use_dB_normalization=True \
      --num_samples=2000 \
      --num_workers=10 \
-     --output_path="data/mean_std.json" 
+     --output_path="data/mean_std.json"
 
 ```
- 
+
 #### Feature Extraction
  For feature extraction, three methods are implemented, which are linear (FFT without using filter bank), fbank and mfcc.
  Currently, the released deepspeech2 online model use the linear feature extraction method.
  ```
  The code for feature extraction
- vi deepspeech/frontend/featurizer/audio_featurizer.py 
+ vi deepspeech/frontend/featurizer/audio_featurizer.py
  ```
 
 ### Encoder
@@ -70,7 +70,7 @@ The code of Encoder is in:
 ```
 vi deepspeech/models/ds2_online/deepspeech2.py
 ```
- 
+
 ### Decoder
 To got the character possibilities of each frame, the feature represention of each frame output from the backbone are input into a projection layer which is implemented as a dense layer to do projection. The output dim of the projection layer is same with the vocabulary size. After projection layer, the softmax function is used to make frame-level feature representation be the possibilities of characters. While making model inference, the character possibilities of each frame are input into the CTC decoder to get the final speech recognition results.
 The code of Encoder is in:
@@ -78,7 +78,7 @@ The code of Encoder is in:
 vi deepspeech/models/ds2_online/deepspeech2.py
 vi deepspeech/modules/ctc.py
 ```
- 
+
 ## Training Process
 Using the command below, you can train the deepspeech2 online model.
 ```
@@ -120,7 +120,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
 fi
 ```
 By using the command above, the training process can be started. There are 5 stages in run.sh, and the first 3 stages are used for training process. The stage 0 is used for data preparation, in which the dataset will be downloaded, and the manifest files of the datasets, vocabulary dictionary and CMVN file will be generated in "./data/". The stage 1 is used for training the model, the log files and model checkpoint is saved in "exp/deepspeech2_online/". The stage 2 is used to generated final model for predicting by averaging the top-k model parameters based on validation loss.  
- 
+
 ## Testing Process
 Using the command below, you can test the deepspeech2 online model.
  ```
@@ -129,7 +129,7 @@ Using the command below, you can test the deepspeech2 online model.
 The detail commands are:
 ```
 conf_path=conf/deepspeech2_online.yaml
-avg_num=1 
+avg_num=1
 model_type=online
 avg_ckpt=avg_${avg_num}
 
@@ -150,29 +150,29 @@ fi
  ```
 After the training process, we use stage 3,4,5 for testing process. The stage 3 is for testing the model generated in the stage 2 and provided the CER index of the test set. The stage 4 is for transforming the model from dynamic graph to static graph by using "paddle.jit" library. The stage 5 is for testing the model in static graph.
 
- 
+
 ## Non-Streaming
 The deepspeech2 offline model is similarity to the deepspeech2 online model. The main difference between them is the offline model use the bi-directional rnn layers while the online model use the single direction rnn layers and the fc layer is not used.
 
 The arcitecture of the model is shown in Fig.2.
 <p align="center">
-<img src="../images/ds2offlineModel.png" width=800> 
+<img src="../images/ds2offlineModel.png" width=800>
 <br/>Fig.2 The Arcitecture of deepspeech2 offline model
 </p>
 
 
- 
+
 For data preparation, decoder, the deepspeech2 offline model is same with the deepspeech2 online model.
 
  The code of encoder and decoder for deepspeech2 offline model is in:
 ```
 vi deepspeech/models/ds2/deepspeech2.py
 ```
- 
+
 The training process and testing process of deepspeech2 offline model is very similary to deepspeech2 online model.
 Only some changes should be noticed.
 
-For training and testing, the "model_type" and the "conf_path" must be set. 
+For training and testing, the "model_type" and the "conf_path" must be set.
  ```
 # Training offline
 cd examples/aishell/s0
@@ -184,4 +184,4 @@ cd examples/aishell/s0
 bash run.sh --stage 3 --stop_stage 5 --model_type offline --conf_path conf/deepspeech2.yaml
 ```
 
- 
+