diff --git a/README.md b/README.md
index a2de1783..424dc485 100644
--- a/README.md
+++ b/README.md
@@ -11,7 +11,7 @@

 ## Features

- See [feature list](doc/src/feature_list.md) for more information.
+ See [feature list](doc/src/feature_list.md) for more information.

 ## Setup
diff --git a/deepspeech/exps/deepspeech2/model.py b/deepspeech/exps/deepspeech2/model.py
index 04137419..8e8a1824 100644
--- a/deepspeech/exps/deepspeech2/model.py
+++ b/deepspeech/exps/deepspeech2/model.py
@@ -272,8 +272,8 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
             infer_model,
             input_spec=[
                 paddle.static.InputSpec(
-                    shape=[None, feat_dim, None],
-                    dtype='float32'),  # audio, [B,D,T]
+                    shape=[None, None, feat_dim],
+                    dtype='float32'),  # audio, [B,T,D]
                 paddle.static.InputSpec(shape=[None],
                                         dtype='int64'),  # audio_length, [B]
             ])
diff --git a/deepspeech/frontend/normalizer.py b/deepspeech/frontend/normalizer.py
index 6b224080..287b51e5 100644
--- a/deepspeech/frontend/normalizer.py
+++ b/deepspeech/frontend/normalizer.py
@@ -179,7 +179,8 @@ class FeatureNormalizer(object):
             wav_number += batch_size

             if wav_number % 1000 == 0:
-                logger.info(f'process {wav_number} wavs,{all_number} frames.')
+                logger.info(
+                    f'processed {wav_number} wavs, {all_number} frames.')

         self.cmvn_info = {
             'mean_stat': list(all_mean_stat.tolist()),
diff --git a/deepspeech/modules/conv.py b/deepspeech/modules/conv.py
index f0f0d746..111f5d3b 100644
--- a/deepspeech/modules/conv.py
+++ b/deepspeech/modules/conv.py
@@ -15,7 +15,7 @@ from paddle import nn
 from paddle.nn import functional as F

 from deepspeech.modules.activation import brelu
-from deepspeech.modules.mask import sequence_mask
+from deepspeech.modules.mask import make_non_pad_mask
 from deepspeech.utils.log import Log

 logger = Log(__name__).getlog()
@@ -111,8 +111,10 @@ class ConvBn(nn.Layer):
         ) // self.stride[1] + 1

         # reset padding part to 0
-        masks = sequence_mask(x_len)  #[B, T]
+        masks = make_non_pad_mask(x_len)  #[B, T]
         masks = masks.unsqueeze(1).unsqueeze(1)  # [B, 1, 1, T]
+        # TODO(Hui Zhang): bool multiply is not supported yet; cast mask to x's dtype
+        masks = masks.type_as(x)
         x = x.multiply(masks)
         return x, x_len
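Note on the masking changes (here in conv.py, and in mask.py/rnn.py below): the deleted `sequence_mask` was buggy — its broadcast comparison used `>` where `<` was intended — and the new call sites cast the boolean mask before multiplying because elementwise multiply on bool tensors is not supported. A minimal sketch of the intended semantics, using only public Paddle APIs (illustrative, not the repo's exact code):

```python
import paddle

def non_pad_mask_sketch(lengths: paddle.Tensor) -> paddle.Tensor:
    """Sketch of make_non_pad_mask: True marks real frames, False marks padding."""
    max_len = int(lengths.max())
    row_vector = paddle.arange(max_len, dtype='int64')  # [T]
    # broadcast [T] against [B, 1]; `<` is the correct direction
    return row_vector < lengths.unsqueeze(-1)           # [B, T], bool

x_len = paddle.to_tensor([2, 4], dtype='int64')
masks = non_pad_mask_sketch(x_len)
# [[True, True, False, False],
#  [True, True, True,  True]]

x = paddle.rand([2, 4, 8])  # hypothetical [B, T, D] features
# bool * float is the unsupported multiply; cast first, as the hunks do
x = x.multiply(masks.unsqueeze(-1).astype(x.dtype))  # zero out padding frames
```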
diff --git a/deepspeech/modules/mask.py b/deepspeech/modules/mask.py
index 65a8ba31..74d4e30a 100644
--- a/deepspeech/modules/mask.py
+++ b/deepspeech/modules/mask.py
@@ -18,40 +18,12 @@ from deepspeech.utils.log import Log
 logger = Log(__name__).getlog()

 __all__ = [
-    'sequence_mask', "make_pad_mask", "make_non_pad_mask", "subsequent_mask",
+    "make_pad_mask", "make_non_pad_mask", "subsequent_mask",
     "subsequent_chunk_mask", "add_optional_chunk_mask", "mask_finished_scores",
     "mask_finished_preds"
 ]


-def sequence_mask(x_len, max_len=None, dtype='float32'):
-    """batch sequence mask.
-
-    Args:
-        x_len ([paddle.Tensor]): xs lenght, [B]
-        max_len ([type], optional): max sequence length. Defaults to None.
-        dtype (str, optional): mask data type. Defaults to 'float32'.
-
-    Returns:
-        paddle.Tensor: [B, Tmax]
-
-    Examples:
-        >>> sequence_mask([2, 4])
-        [[1., 1., 0., 0.],
-         [1., 1., 1., 1.]]
-    """
-    # (TODO: Hui Zhang): jit not support Tenosr.dim() and Tensor.ndim
-    # assert x_len.dim() == 1, (x_len.dim(), x_len)
-    max_len = max_len or x_len.max()
-    x_len = paddle.unsqueeze(x_len, -1)
-    row_vector = paddle.arange(max_len)
-    # TODO(Hui Zhang): fix this bug
-    #mask = row_vector < x_len
-    mask = row_vector > x_len  # a bug, broadcast 的时候出错了
-    mask = paddle.cast(mask, dtype)
-    return mask
-
-
 def make_pad_mask(lengths: paddle.Tensor) -> paddle.Tensor:
     """Make mask tensor containing indices of padded part.

     See description of make_non_pad_mask.
@@ -66,7 +38,8 @@ def make_pad_mask(lengths: paddle.Tensor) -> paddle.Tensor:
         [0, 0, 0, 1, 1],
         [0, 0, 1, 1, 1]]
     """
-    assert lengths.dim() == 1
+    # TODO(Hui Zhang): jit does not support Tensor.dim() or Tensor.ndim
+    # assert lengths.dim() == 1
     batch_size = int(lengths.shape[0])
     max_len = int(lengths.max())
     seq_range = paddle.arange(0, max_len, dtype=paddle.int64)
diff --git a/deepspeech/modules/rnn.py b/deepspeech/modules/rnn.py
index cef731e3..29bd2883 100644
--- a/deepspeech/modules/rnn.py
+++ b/deepspeech/modules/rnn.py
@@ -19,7 +19,7 @@ from paddle.nn import functional as F
 from paddle.nn import initializer as I

 from deepspeech.modules.activation import brelu
-from deepspeech.modules.mask import sequence_mask
+from deepspeech.modules.mask import make_non_pad_mask
 from deepspeech.utils.log import Log

 logger = Log(__name__).getlog()
@@ -306,7 +306,9 @@ class RNNStack(nn.Layer):
         """
         for i, rnn in enumerate(self.rnn_stacks):
             x, x_len = rnn(x, x_len)
-            masks = sequence_mask(x_len)  #[B, T]
+            masks = make_non_pad_mask(x_len)  #[B, T]
             masks = masks.unsqueeze(-1)  # [B, T, 1]
+            # TODO(Hui Zhang): bool multiply is not supported yet; cast mask to x's dtype
+            masks = masks.type_as(x)
             x = x.multiply(masks)
         return x, x_len
diff --git a/doc/src/alignment.md b/doc/src/alignment.md
index fa63894f..9d3231c8 100644
--- a/doc/src/alignment.md
+++ b/doc/src/alignment.md
@@ -18,4 +18,3 @@

 * [ctc alignment](https://mp.weixin.qq.com/s/4aGehNN7PpIvCh03qTT5oA)
 * [时间戳和N-Best](https://mp.weixin.qq.com/s?__biz=MzU2NjUwMTgxOQ==&mid=2247483956&idx=1&sn=80ce595238d84155d50f08c0d52267d3&chksm=fcaacae0cbdd43f62b1da60c8e8671a9e0bb2aeee94f58751839b03a1c45b9a3889b96705080&scene=21#wechat_redirect)
-
diff --git a/doc/src/asr_text_backend.md b/doc/src/asr_text_backend.md
index 879e56f8..c3c9896c 100644
--- a/doc/src/asr_text_backend.md
+++ b/doc/src/asr_text_backend.md
@@ -98,4 +98,4 @@

 ## Text Filter

-* 敏感词(黄暴、涉政、违法违禁等)
\ No newline at end of file
+* 敏感词(黄暴、涉政、违法违禁等)
diff --git a/doc/src/benchmark.md b/doc/src/benchmark.md
index f3af2555..9c1c86fd 100644
--- a/doc/src/benchmark.md
+++ b/doc/src/benchmark.md
@@ -14,4 +14,3 @@ We compare the training time with 1, 2, 4, 8 Tesla V100 GPUs (with a subset of L
 | 8 | 6.95 X |

 `utils/profile.sh` provides such a demo profiling tool; you can change it as needed.
- diff --git a/doc/src/chinese_syllable.md b/doc/src/chinese_syllable.md index b7fd9322..3ada44f4 100644 --- a/doc/src/chinese_syllable.md +++ b/doc/src/chinese_syllable.md @@ -67,4 +67,4 @@ * https://github.com/KuangDD/phkit * https://github.com/mozillazg/python-pinyin * https://github.com/Kyubyong/g2pC -* https://github.com/kakaobrain/g2pM \ No newline at end of file +* https://github.com/kakaobrain/g2pM diff --git a/doc/src/dataset.md b/doc/src/dataset.md index d70d0e0d..aaa80551 100644 --- a/doc/src/dataset.md +++ b/doc/src/dataset.md @@ -18,4 +18,4 @@ ### ASR Noise -* [asr-noises](https://github.com/speechio/asr-noises) \ No newline at end of file +* [asr-noises](https://github.com/speechio/asr-noises) diff --git a/doc/src/decoding.md b/doc/src/decoding.md index ade06c4c..347a4098 100644 --- a/doc/src/decoding.md +++ b/doc/src/decoding.md @@ -3,4 +3,3 @@ ## Reference * [时间戳和N-Best](https://mp.weixin.qq.com/s?__biz=MzU2NjUwMTgxOQ==&mid=2247483956&idx=1&sn=80ce595238d84155d50f08c0d52267d3&chksm=fcaacae0cbdd43f62b1da60c8e8671a9e0bb2aeee94f58751839b03a1c45b9a3889b96705080&scene=21#wechat_redirect) - diff --git a/doc/src/feature_list.md b/doc/src/feature_list.md index 57641d5e..573669fa 100644 --- a/doc/src/feature_list.md +++ b/doc/src/feature_list.md @@ -58,4 +58,4 @@ ### Grapheme To Phoneme * syallable -* phoneme \ No newline at end of file +* phoneme diff --git a/doc/src/ngram_lm.md b/doc/src/ngram_lm.md index 07aa5411..119a3b21 100644 --- a/doc/src/ngram_lm.md +++ b/doc/src/ngram_lm.md @@ -83,4 +83,4 @@ Please notice that the released language models only contain Chinese simplified ``` build/bin/build_binary ./result/people2014corpus_words.arps ./result/people2014corpus_words.klm - ``` \ No newline at end of file + ``` diff --git a/doc/src/praat_textgrid.md b/doc/src/praat_textgrid.md index c25c760a..06c4f879 100644 --- a/doc/src/praat_textgrid.md +++ b/doc/src/praat_textgrid.md @@ -76,7 +76,7 @@ pip3 install textgrid tg.read('file.TextGrid') # 'file.TextGrid' 是文件名 ``` - tg.tiers属性: + tg.tiers属性: 会把文件中的所有item打印出来, print(tg.tiers) 的结果如下: ```text @@ -86,7 +86,7 @@ pip3 install textgrid Interval(1361.89250, 1362.01250, R), Interval(1362.01250, 1362.13250, AY1), Interval(1362.13250, 1362.16250, T), - + ... ] ) @@ -113,7 +113,7 @@ pip3 install textgrid Interval 可以理解为时长 ``` - + 2. textgrid库中的对象 **IntervalTier** 对象: @@ -148,7 +148,7 @@ pip3 install textgrid strict -- > 返回bool值, 表示是否严格TextGrid格式 ``` - ​ + ​ **PointTier** 对象: 方法 @@ -174,7 +174,7 @@ pip3 install textgrid name 返回name ``` - + **Point** 对象: 支持比较大小, 支持加减运算 @@ -185,7 +185,7 @@ pip3 install textgrid time: ``` - ​ + ​ **Interval** 对象: 支持比较大小, 支持加减运算 @@ -250,10 +250,9 @@ pip3 install textgrid grids: --> 返回读取的grids的列表 ``` - + ## Reference * https://zh.wikipedia.org/wiki/Praat%E8%AF%AD%E9%9F%B3%E5%AD%A6%E8%BD%AF%E4%BB%B6 * https://blog.csdn.net/duxin_csdn/article/details/88966295 - diff --git a/doc/src/tools.md b/doc/src/tools.md index 4ec09f6a..5fcca923 100644 --- a/doc/src/tools.md +++ b/doc/src/tools.md @@ -1,4 +1,3 @@ # Useful Tools * [正则可视化和常用正则表达式](https://wangwl.net/static/projects/visualRegex/#) - diff --git a/doc/src/tts_text_front_end.md b/doc/src/tts_text_front_end.md index fe0f7e24..b13ab615 100644 --- a/doc/src/tts_text_front_end.md +++ b/doc/src/tts_text_front_end.md @@ -23,7 +23,7 @@ Therefore, procedures like stemming and lemmatization are not useful for Chinese ### Tokenization -**Tokenizing breaks up text data into shorter pre-set strings**, which help build context and meaning for the machine learning model. 
+**Tokenizing breaks up text data into shorter pre-set strings**, which help build context and meaning for the machine learning model.

 These “tags” label the part of speech. There are 24 part of speech tags and 4 proper name category labels in the `**jieba**` package’s existing dictionary.

@@ -31,7 +31,7 @@

 ### Stop Words

-In NLP, **stop words are “meaningless” words** that make the data too noisy or ambiguous.
+In NLP, **stop words are “meaningless” words** that make the data too noisy or ambiguous.

 Instead of manually removing them, you could import the `**stopwordsiso**` package for a full list of Chinese stop words. More information can be found [here](https://pypi.org/project/stopwordsiso/). And with this, we can easily create code to filter out any stop words in large text data.

@@ -209,4 +209,4 @@ TN: 基于规则的方法

 ## Reference
 * [Text Front End](https://slyne.github.io/%E5%85%AC%E5%BC%80%E8%AF%BE/2020/10/03/TTS1/)
 * [Chinese Natural Language (Pre)processing: An Introduction](https://towardsdatascience.com/chinese-natural-language-pre-processing-an-introduction-995d16c2705f)
-* [Beginner’s Guide to Sentiment Analysis for Simplified Chinese using SnowNLP](https://towardsdatascience.com/beginners-guide-to-sentiment-analysis-for-simplified-chinese-using-snownlp-ce88a8407efb)
\ No newline at end of file
+* [Beginner’s Guide to Sentiment Analysis for Simplified Chinese using SnowNLP](https://towardsdatascience.com/beginners-guide-to-sentiment-analysis-for-simplified-chinese-using-snownlp-ce88a8407efb)
diff --git a/doc/src/vad.md b/doc/src/vad.md
index 56fe9587..e73e9cf7 100644
--- a/doc/src/vad.md
+++ b/doc/src/vad.md
@@ -29,4 +29,3 @@

 * [Endpoint 检测](https://mp.weixin.qq.com/s?__biz=MzU2NjUwMTgxOQ==&mid=2247484024&idx=1&sn=12da2ee76347de4a18856274ba6ba61f&chksm=fcaacaaccbdd43ba6b3e996bbf1e2ac6d5f1b449dfd80fcaccfbbe0a240fa1668b931dbf4bd5&scene=21#wechat_redirect)
 * Kaldi: *https://github.com/kaldi-asr/kaldi/blob/6260b27d146e466c7e1e5c60858e8da9fd9c78ae/src/online2/online-endpoint.h#L132-L150*
 * End-to-End Automatic Speech Recognition Integrated with CTC-Based Voice Activity Detection: *https://arxiv.org/pdf/2002.00551.pdf*
-
diff --git a/examples/aishell/s1/conf/conformer.yaml b/examples/aishell/s1/conf/conformer.yaml
index 10c3a282..36d56723 100644
--- a/examples/aishell/s1/conf/conformer.yaml
+++ b/examples/aishell/s1/conf/conformer.yaml
@@ -24,7 +24,7 @@ data:
     n_fft: None
     stride_ms: 10.0
     window_ms: 25.0
-    use_dB_normalization: False
+    use_dB_normalization: True
     target_dB: -20
     random_seed: 0
     keep_transcription_text: False
@@ -76,7 +76,7 @@ model:
 training:
   n_epoch: 240
   accum_grad: 2
-  global_grad_clip: 5.0
+  global_grad_clip: 3.0
   optim: adam
   optim_conf:
     lr: 0.002
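Note on the two conformer.yaml flips above: `use_dB_normalization: True` with `target_dB: -20` asks the audio frontend to gain-normalize each utterance toward a target decibel level before feature extraction, and `global_grad_clip: 3.0` tightens the global-norm gradient clip. A rough sketch of RMS-based dB normalization, assuming that is what the flag controls (names are illustrative, not the repo's implementation):

```python
import numpy as np

def normalize_db_sketch(samples: np.ndarray, target_db: float = -20.0) -> np.ndarray:
    """Scale audio so its RMS level lands roughly at target_db."""
    rms_db = 10.0 * np.log10(np.mean(samples**2) + 1e-20)  # current RMS level in dB
    gain_db = target_db - rms_db                           # gain needed to reach target
    return samples * 10.0**(gain_db / 20.0)                # dB gain -> amplitude factor
```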
diff --git a/examples/cc-cedict/local/parser.py b/examples/cc-cedict/local/parser.py
index d6acb834..e1e10b3d 100644
--- a/examples/cc-cedict/local/parser.py
+++ b/examples/cc-cedict/local/parser.py
@@ -1,18 +1,24 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 # https://github.com/rubber-duck-dragon/rubber-duck-dragon.github.io/blob/master/cc-cedict_parser/parser.py
-
 #A parser for the CC-Cedict. Convert the Chinese-English dictionary into a list of python dictionaries with "traditional","simplified", "pinyin", and "english" keys.
-
 #Make sure that the cedict_ts.u8 file is in the same folder as this file, and that the name matches the file name on line 13.
-
 #Before starting, open the CEDICT text file and delete the copyright information at the top. Otherwise the program will try to parse it and you will get an error message.
-
 #Characters that are commonly used as surnames have two entries in CC-CEDICT. This program will remove the surname entry if there is another entry for the character. If you want to include the surnames, simply delete lines 59 and 60.
-
 #This code was written by Franki Allegra in February 2020.
-
-
-import sys
 import json
+import sys

 # usage: bin ccedict dump.json

@@ -50,9 +56,10 @@ with open(sys.argv[1], 'rt') as file:
             list_of_dicts.append(parsed)

     def remove_surnames():
-        for x in range(len(list_of_dicts)-1, -1, -1):
+        for x in range(len(list_of_dicts) - 1, -1, -1):
             if "surname " in list_of_dicts[x]['english']:
-                if list_of_dicts[x]['traditional'] == list_of_dicts[x+1]['traditional']:
+                if list_of_dicts[x]['traditional'] == list_of_dicts[x + 1][
+                        'traditional']:
                     list_of_dicts.pop(x)

     def main():
@@ -60,13 +67,12 @@ with open(sys.argv[1], 'rt') as file:
         #make each line into a dictionary
         print("Parsing dictionary . . .")
         for line in dict_lines:
-                parse_line(line)
+            parse_line(line)

         #remove entries for surnames from the data (optional):
         print("Removing Surnames . . .")
         remove_surnames()
-
         print("Saving to database (this may take a few minutes) . .
.") with open(sys.argv[2], 'wt') as fout: for one_dict in list_of_dicts: @@ -74,5 +80,6 @@ with open(sys.argv[1], 'rt') as file: fout.write(json_str + "\n") print('Done!') + list_of_dicts = [] parsed_dict = main() diff --git a/requirements.txt b/requirements.txt index a6facb6c..57a951bb 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,6 @@ coverage pre-commit +pybind11 resampy==0.2.2 scipy==1.2.1 sentencepiece @@ -7,7 +8,6 @@ snakeviz SoundFile==0.9.0.post1 sox tensorboardX +textgrid typeguard yacs -pybind11 -textgrid diff --git a/tests/mask_test.py b/tests/mask_test.py index cd37a899..f44aca8f 100644 --- a/tests/mask_test.py +++ b/tests/mask_test.py @@ -18,7 +18,6 @@ import paddle from deepspeech.modules.mask import make_non_pad_mask from deepspeech.modules.mask import make_pad_mask -from deepspeech.modules.mask import sequence_mask class TestU2Model(unittest.TestCase): @@ -36,16 +35,10 @@ class TestU2Model(unittest.TestCase): [False, False, True, True, True], ]) - def test_sequence_mask(self): - res = sequence_mask(self.lengths, dtype='bool') - self.assertSequenceEqual(res.numpy().tolist(), self.masks.tolist()) - def test_make_non_pad_mask(self): res = make_non_pad_mask(self.lengths) - res1 = sequence_mask(self.lengths, dtype='bool') res2 = make_pad_mask(self.lengths).logical_not() self.assertSequenceEqual(res.numpy().tolist(), self.masks.tolist()) - self.assertSequenceEqual(res.numpy().tolist(), res1.numpy().tolist()) self.assertSequenceEqual(res.numpy().tolist(), res2.numpy().tolist()) def test_make_pad_mask(self): diff --git a/third_party/phkit/README.md b/third_party/phkit/README.md index e8f0745c..002425ba 100644 --- a/third_party/phkit/README.md +++ b/third_party/phkit/README.md @@ -39,7 +39,7 @@ A B C D E F G H I J K L M N O P Q R S T U V W X Y Z 标点: ! ? . , ; : " # ( ) -注:!=!!|?=??|.=.。|,=,,、|;=;;|:=::|"="“|#=#   |(=(([[{{【<《|)=))]]}}】>》 +注:!=!!|?=??|.=.。|,=,,、|;=;;|:=::|"="“|#=#   |(=(([[{{【<《|)=))]]}}】>》 预留: w y 0 6 7 8 9