diff --git a/.notebook/jit_infer.ipynb b/.notebook/jit_infer.ipynb
index 6c48bb407..397c59603 100644
--- a/.notebook/jit_infer.ipynb
+++ b/.notebook/jit_infer.ipynb
@@ -37,26 +37,26 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "2021-03-24 06:30:47,727 - WARNING - register user softmax to paddle, remove this when fixed!\n",
-      "2021-03-24 06:30:47,728 - WARNING - register user sigmoid to paddle, remove this when fixed!\n",
-      "2021-03-24 06:30:47,729 - WARNING - register user relu to paddle, remove this when fixed!\n",
-      "2021-03-24 06:30:47,729 - WARNING - override cat of paddle if exists or register, remove this when fixed!\n",
-      "2021-03-24 06:30:47,730 - WARNING - override eq of paddle.Tensor if exists or register, remove this when fixed!\n",
-      "2021-03-24 06:30:47,731 - WARNING - override contiguous of paddle.Tensor if exists or register, remove this when fixed!\n",
-      "2021-03-24 06:30:47,731 - WARNING - override size of paddle.Tensor (`to_static` do not process `size` property, maybe some `paddle` api dependent on it), remove this when fixed!\n",
-      "2021-03-24 06:30:47,732 - WARNING - register user view to paddle.Tensor, remove this when fixed!\n",
-      "2021-03-24 06:30:47,732 - WARNING - register user view_as to paddle.Tensor, remove this when fixed!\n",
-      "2021-03-24 06:30:47,733 - WARNING - register user masked_fill to paddle.Tensor, remove this when fixed!\n",
-      "2021-03-24 06:30:47,733 - WARNING - register user masked_fill_ to paddle.Tensor, remove this when fixed!\n",
-      "2021-03-24 06:30:47,734 - WARNING - register user fill_ to paddle.Tensor, remove this when fixed!\n",
-      "2021-03-24 06:30:47,734 - WARNING - register user repeat to paddle.Tensor, remove this when fixed!\n",
-      "2021-03-24 06:30:47,735 - WARNING - register user softmax to paddle.Tensor, remove this when fixed!\n",
-      "2021-03-24 06:30:47,735 - WARNING - register user sigmoid to paddle.Tensor, remove this when fixed!\n",
-      "2021-03-24 06:30:47,736 - WARNING - register user relu to paddle.Tensor, remove this when fixed!\n",
-      "2021-03-24 06:30:47,736 - WARNING - register user glu to paddle.nn.functional, remove this when fixed!\n",
-      "2021-03-24 06:30:47,737 - WARNING - override ctc_loss of paddle.nn.functional if exists, remove this when fixed!\n",
-      "2021-03-24 06:30:47,737 - WARNING - register user GLU to paddle.nn, remove this when fixed!\n",
-      "2021-03-24 06:30:47,738 - WARNING - register user ConstantPad2d to paddle.nn, remove this when fixed!\n",
+      "2021-03-26 02:55:23,873 - WARNING - register user softmax to paddle, remove this when fixed!\n",
+      "2021-03-26 02:55:23,875 - WARNING - register user sigmoid to paddle, remove this when fixed!\n",
+      "2021-03-26 02:55:23,875 - WARNING - register user relu to paddle, remove this when fixed!\n",
+      "2021-03-26 02:55:23,876 - WARNING - override cat of paddle if exists or register, remove this when fixed!\n",
+      "2021-03-26 02:55:23,876 - WARNING - override eq of paddle.Tensor if exists or register, remove this when fixed!\n",
+      "2021-03-26 02:55:23,877 - WARNING - override contiguous of paddle.Tensor if exists or register, remove this when fixed!\n",
+      "2021-03-26 02:55:23,877 - WARNING - override size of paddle.Tensor (`to_static` do not process `size` property, maybe some `paddle` api dependent on it), remove this when fixed!\n",
+      "2021-03-26 02:55:23,878 - WARNING - register user view to paddle.Tensor, remove this when fixed!\n",
+      "2021-03-26 02:55:23,878 - WARNING - register user view_as to paddle.Tensor, remove this when fixed!\n",
+      "2021-03-26 02:55:23,879 - WARNING - register user masked_fill to paddle.Tensor, remove this when fixed!\n",
+      "2021-03-26 02:55:23,880 - WARNING - register user masked_fill_ to paddle.Tensor, remove this when fixed!\n",
+      "2021-03-26 02:55:23,880 - WARNING - register user fill_ to paddle.Tensor, remove this when fixed!\n",
+      "2021-03-26 02:55:23,881 - WARNING - register user repeat to paddle.Tensor, remove this when fixed!\n",
+      "2021-03-26 02:55:23,881 - WARNING - register user softmax to paddle.Tensor, remove this when fixed!\n",
+      "2021-03-26 02:55:23,882 - WARNING - register user sigmoid to paddle.Tensor, remove this when fixed!\n",
+      "2021-03-26 02:55:23,882 - WARNING - register user relu to paddle.Tensor, remove this when fixed!\n",
+      "2021-03-26 02:55:23,883 - WARNING - register user glu to paddle.nn.functional, remove this when fixed!\n",
+      "2021-03-26 02:55:23,883 - WARNING - override ctc_loss of paddle.nn.functional if exists, remove this when fixed!\n",
+      "2021-03-26 02:55:23,884 - WARNING - register user GLU to paddle.nn, remove this when fixed!\n",
+      "2021-03-26 02:55:23,884 - WARNING - register user ConstantPad2d to paddle.nn, remove this when fixed!\n",
       "/home/ssd5/zhanghui/DeepSpeech2.x/tools/venv-dev/lib/python3.7/site-packages/scipy/fftpack/__init__.py:103: DeprecationWarning: The module numpy.dual is deprecated.  Instead of using dual, use the functions directly from numpy or scipy.\n",
       "  from numpy.dual import register_func\n",
       "/home/ssd5/zhanghui/DeepSpeech2.x/tools/venv-dev/lib/python3.7/site-packages/scipy/special/orthogonal.py:81: DeprecationWarning: `np.int` is a deprecated alias for the builtin `int`. To silence this warning, use `int` by itself. Doing this will not modify any behavior and is safe. When replacing `np.int`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.\n",
@@ -102,10 +102,10 @@
      "output_type": "stream",
      "text": [
       "0.0.0\n",
-      "607856a949ed7356237ed8148947f7fd2b0f4631\n",
-      "ON\n",
-      "ON\n",
-      "commit: 607856a949ed7356237ed8148947f7fd2b0f4631\n",
+      "e7f28d6c0db54eb9c9a810612300b526687e56a6\n",
+      "OFF\n",
+      "OFF\n",
+      "commit: e7f28d6c0db54eb9c9a810612300b526687e56a6\n",
       "None\n",
       "0\n"
      ]
@@ -226,7 +226,7 @@
       "output: None\n",
       "params_file: examples/aishell/jit.model.pdiparams\n",
       "speech_save_dir: demo_cache\n",
-      "use_gpu: True\n",
+      "use_gpu: False\n",
       "warmup_manifest: examples/aishell/data/manifest.test\n",
       "------------------------------------------------\n"
      ]
@@ -266,7 +266,7 @@
     "    help=\n",
     "    \"Model dir, If you load a non-combined model, specify the directory of the model.\"\n",
     ")\n",
-    "add_arg(\"--use_gpu\",type=bool,default=True, help=\"Whether use gpu.\")\n",
+    "add_arg(\"--use_gpu\",type=bool,default=False, help=\"Whether use gpu.\")\n",
     "\n",
     "\n",
     "args = parser.parse_args(\n",
@@ -321,7 +321,7 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "2021-03-24 06:31:20,943 - INFO - [checkpoint] Rank 0: loaded model from examples/aishell/ckpt-loss2e-3-0.83-5/checkpoints/step-11725.pdparams\n"
+      "2021-03-26 02:55:57,930 - INFO - [checkpoint] Rank 0: loaded model from examples/aishell/ckpt-loss2e-3-0.83-5/checkpoints/step-11725.pdparams\n"
      ]
     },
     {
@@ -407,7 +407,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 22,
    "metadata": {},
    "outputs": [
     {
@@ -418,7 +418,7 @@
       "examples/aishell/jit.model.pdmodel\n",
       "examples/aishell/jit.model.pdiparams\n",
       "0\n",
-      "True\n"
+      "False\n"
      ]
     }
    ],
@@ -428,7 +428,8 @@
     "from paddle.inference import PrecisionType\n",
     "from paddle.inference import create_predictor\n",
     "\n",
-    "args.use_gpu=True\n",
+    "args.use_gpu=False\n",
+    "paddle.set_device('cpu')\n",
     "\n",
     "def init_predictor(args):\n",
     "    if args.model_dir is not None:\n",
@@ -438,8 +439,8 @@
     "\n",
     "    if args.use_gpu:\n",
     "        config.enable_use_gpu(memory_pool_init_size_mb=1000, device_id=0)\n",
-    "        config.enable_tensorrt_engine(precision_mode=PrecisionType.Float32,\n",
-    "                              use_calib_mode=True) # 开启TensorRT预测，精度为fp32，开启int8离线量化\n",
+    "#         config.enable_tensorrt_engine(precision_mode=PrecisionType.Float32,\n",
+    "#                               use_calib_mode=True) # 开启TensorRT预测，精度为fp32，开启int8离线量化\n",
     "    else:\n",
     "        # If not specific mkldnn, you can set the blas thread.\n",
     "        # The thread num should not be greater than the number of cores in the CPU.\n",
@@ -447,7 +448,7 @@
     "        config.enable_mkldnn()\n",
     "        \n",
     "    config.enable_memory_optim()\n",
-    "    config.switch_ir_optim(False)\n",
+    "    config.switch_ir_optim(True)\n",
     "    \n",
     "    print(config.model_dir())\n",
     "    print(config.prog_file())\n",
@@ -534,7 +535,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 23,
    "metadata": {},
    "outputs": [
     {
@@ -546,49 +547,35 @@
       "input: 0 audio\n",
       "input: 1 audio_len\n",
       "output: 0 tmp_75\n",
-      "jit: [[[8.91791242e-12 4.45650548e-12 3.67574104e-09 ... 8.91772593e-12\n",
-      "   8.91578738e-12 4.64319072e-08]\n",
-      "  [1.55952011e-15 2.62797088e-14 4.50428670e-12 ... 1.55946061e-15\n",
-      "   1.55893121e-15 9.99992609e-01]\n",
-      "  [1.24638590e-17 7.61802427e-16 2.93266930e-14 ... 1.24633842e-17\n",
-      "   1.24587735e-17 1.00000000e+00]\n",
+      "jit: [[[8.91786298e-12 4.45648032e-12 3.67572750e-09 ... 8.91767563e-12\n",
+      "   8.91573707e-12 4.64317296e-08]\n",
+      "  [1.55950222e-15 2.62794089e-14 4.50423509e-12 ... 1.55944271e-15\n",
+      "   1.55891342e-15 9.99992609e-01]\n",
+      "  [1.24638127e-17 7.61802427e-16 2.93265812e-14 ... 1.24633371e-17\n",
+      "   1.24587264e-17 1.00000000e+00]\n",
       "  ...\n",
-      "  [4.37491543e-15 2.43678580e-12 1.98772032e-12 ... 4.37483242e-15\n",
-      "   4.37358093e-15 1.00000000e+00]\n",
-      "  [3.89338410e-13 1.66756747e-11 1.42901749e-11 ... 3.89333233e-13\n",
-      "   3.89255983e-13 1.00000000e+00]\n",
-      "  [1.00350561e-10 2.56295180e-10 2.91178692e-10 ... 1.00348452e-10\n",
-      "   1.00334671e-10 9.99998808e-01]]] <class 'numpy.ndarray'>\n",
+      "  [4.37488240e-15 2.43676260e-12 1.98770514e-12 ... 4.37479896e-15\n",
+      "   4.37354747e-15 1.00000000e+00]\n",
+      "  [3.89334696e-13 1.66754856e-11 1.42900388e-11 ... 3.89329492e-13\n",
+      "   3.89252270e-13 1.00000000e+00]\n",
+      "  [1.00349985e-10 2.56293708e-10 2.91177582e-10 ... 1.00347876e-10\n",
+      "   1.00334095e-10 9.99998808e-01]]] <class 'numpy.ndarray'>\n",
       "[1, 161, 522]\n",
-      "[1]\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/home/ssd5/zhanghui/DeepSpeech2.x/tools/venv-dev/lib/python3.7/site-packages/paddlepaddle_gpu-0.0.0-py3.7-linux-x86_64.egg/paddle/fluid/layers/utils.py:77: DeprecationWarning: Using or importing the ABCs from 'collections' instead of from 'collections.abc' is deprecated, and in 3.8 it will stop working\n",
-      "  return (isinstance(seq, collections.Sequence) and\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "paddle: [[[8.91791242e-12 4.45650548e-12 3.67574104e-09 ... 8.91772593e-12\n",
-      "   8.91578738e-12 4.64319072e-08]\n",
-      "  [1.55952011e-15 2.62797088e-14 4.50428670e-12 ... 1.55946061e-15\n",
-      "   1.55893121e-15 9.99992609e-01]\n",
-      "  [1.24638590e-17 7.61802427e-16 2.93266930e-14 ... 1.24633842e-17\n",
+      "[1]\n",
+      "paddle: [[[8.91789680e-12 4.45649724e-12 3.67574149e-09 ... 8.91770945e-12\n",
+      "   8.91577090e-12 4.64319072e-08]\n",
+      "  [1.55950222e-15 2.62794089e-14 4.50423509e-12 ... 1.55944271e-15\n",
+      "   1.55891342e-15 9.99992609e-01]\n",
+      "  [1.24638599e-17 7.61805339e-16 2.93267472e-14 ... 1.24633842e-17\n",
       "   1.24587735e-17 1.00000000e+00]\n",
       "  ...\n",
-      "  [4.37491543e-15 2.43678580e-12 1.98772032e-12 ... 4.37483242e-15\n",
-      "   4.37358093e-15 1.00000000e+00]\n",
-      "  [3.89338410e-13 1.66756747e-11 1.42901749e-11 ... 3.89333233e-13\n",
-      "   3.89255983e-13 1.00000000e+00]\n",
-      "  [1.00350561e-10 2.56295180e-10 2.91178692e-10 ... 1.00348452e-10\n",
-      "   1.00334671e-10 9.99998808e-01]]]\n",
-      "True\n"
+      "  [4.37488240e-15 2.43676737e-12 1.98770514e-12 ... 4.37479896e-15\n",
+      "   4.37354747e-15 1.00000000e+00]\n",
+      "  [3.89336187e-13 1.66755481e-11 1.42900925e-11 ... 3.89330983e-13\n",
+      "   3.89253761e-13 1.00000000e+00]\n",
+      "  [1.00349985e-10 2.56293708e-10 2.91177582e-10 ... 1.00347876e-10\n",
+      "   1.00334095e-10 9.99998808e-01]]]\n",
+      "False\n"
      ]
     }
    ],
@@ -607,7 +594,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 21,
    "metadata": {},
    "outputs": [
     {
@@ -618,19 +605,19 @@
       "input: 0 audio\n",
       "input: 1 audio_len\n",
       "output: 0 tmp_75\n",
-      "jit: [[[8.91791242e-12 4.45650548e-12 3.67574104e-09 ... 8.91772593e-12\n",
-      "   8.91578738e-12 4.64319072e-08]\n",
-      "  [1.55952011e-15 2.62797088e-14 4.50428670e-12 ... 1.55946061e-15\n",
-      "   1.55893121e-15 9.99992609e-01]\n",
-      "  [1.24638590e-17 7.61802427e-16 2.93266930e-14 ... 1.24633842e-17\n",
+      "jit: [[[8.91789680e-12 4.45649724e-12 3.67574149e-09 ... 8.91770945e-12\n",
+      "   8.91577090e-12 4.64319072e-08]\n",
+      "  [1.55950222e-15 2.62794089e-14 4.50423509e-12 ... 1.55944271e-15\n",
+      "   1.55891342e-15 9.99992609e-01]\n",
+      "  [1.24638599e-17 7.61805339e-16 2.93267472e-14 ... 1.24633842e-17\n",
       "   1.24587735e-17 1.00000000e+00]\n",
       "  ...\n",
-      "  [4.37491543e-15 2.43678580e-12 1.98772032e-12 ... 4.37483242e-15\n",
-      "   4.37358093e-15 1.00000000e+00]\n",
-      "  [3.89338410e-13 1.66756747e-11 1.42901749e-11 ... 3.89333233e-13\n",
-      "   3.89255983e-13 1.00000000e+00]\n",
-      "  [1.00350561e-10 2.56295180e-10 2.91178692e-10 ... 1.00348452e-10\n",
-      "   1.00334671e-10 9.99998808e-01]]]\n"
+      "  [4.37488240e-15 2.43676737e-12 1.98770514e-12 ... 4.37479896e-15\n",
+      "   4.37354747e-15 1.00000000e+00]\n",
+      "  [3.89336187e-13 1.66755481e-11 1.42900925e-11 ... 3.89330983e-13\n",
+      "   3.89253761e-13 1.00000000e+00]\n",
+      "  [1.00349985e-10 2.56293708e-10 2.91177582e-10 ... 1.00347876e-10\n",
+      "   1.00334095e-10 9.99998808e-01]]]\n"
      ]
     }
    ],
diff --git a/.notebook/train_test.ipynb b/.notebook/train_test.ipynb
index bedad6e11..b2e454395 100644
--- a/.notebook/train_test.ipynb
+++ b/.notebook/train_test.ipynb
@@ -454,7 +454,7 @@
     "            act='brelu')\n",
     "\n",
     "        out_channel = 32\n",
-    "        self.conv_stack = nn.LayerList([\n",
+    "        self.conv_stack = nn.Sequential([\n",
     "            ConvBn(\n",
     "                num_channels_in=32,\n",
     "                num_channels_out=out_channel,\n",
@@ -1884,4 +1884,4 @@
  },
  "nbformat": 4,
  "nbformat_minor": 5
-}
+}
\ No newline at end of file
diff --git a/README.md b/README.md
index 98d523890..8757ef631 100644
--- a/README.md
+++ b/README.md
@@ -59,3 +59,6 @@ You are welcome to submit questions and bug reports in [Github Issues](https://g
 ## License
 
 DeepSpeech is provided under the [Apache-2.0 License](./LICENSE).
+
+## Acknowledgement
+We depend on many open source repos. See [References](docs/reference.md) for more information.
diff --git a/README_cn.md b/README_cn.md
index 713e16ebd..e90489689 100644
--- a/README_cn.md
+++ b/README_cn.md
@@ -56,3 +56,6 @@ source tools/venv/bin/activate
 ## License
 
 DeepSpeech遵循[Apache-2.0开源协议](./LICENSE)。
+
+## 感谢
+开发中参考一些优秀的仓库，详情参见 [References](docs/reference.md)。
diff --git a/deepspeech/__init__.py b/deepspeech/__init__.py
index 563746f41..3e96c40f1 100644
--- a/deepspeech/__init__.py
+++ b/deepspeech/__init__.py
@@ -266,8 +266,17 @@ logger.warn(
 )
 F.ctc_loss = ctc_loss
 
-
 ########### hcak paddle.nn #############
+if not hasattr(paddle.nn, 'Module'):
+    logger.warn("register user Module to paddle.nn, remove this when fixed!")
+    setattr(paddle.nn, 'Module', paddle.nn.Layer)
+
+if not hasattr(paddle.nn, 'ModuleList'):
+    logger.warn(
+        "register user ModuleList to paddle.nn, remove this when fixed!")
+    setattr(paddle.nn, 'ModuleList', paddle.nn.LayerList)
+
+
 class GLU(nn.Layer):
     """Gated Linear Units (GLU) Layer"""
 
diff --git a/deepspeech/modules/activation.py b/deepspeech/modules/activation.py
index 7769a7855..60be811e0 100644
--- a/deepspeech/modules/activation.py
+++ b/deepspeech/modules/activation.py
@@ -143,7 +143,8 @@ def get_activation(act):
         "relu": paddle.nn.ReLU,
         "selu": paddle.nn.SELU,
         "swish": paddle.nn.Swish,
-        "gelu": paddle.nn.GELU
+        "gelu": paddle.nn.GELU,
+        "brelu": brelu,
     }
 
     return activation_funcs[act]()
diff --git a/deepspeech/modules/embedding.py b/deepspeech/modules/embedding.py
index be2103292..efefd75ac 100644
--- a/deepspeech/modules/embedding.py
+++ b/deepspeech/modules/embedding.py
@@ -51,7 +51,7 @@ class PositionalEncoding(nn.Layer):
         self.pe = paddle.zeros(self.max_len, self.d_model)  #[T,D]
 
         position = paddle.arange(
-            0, self.max_len, dtype=paddle.float32).unsqueeze(1)
+            0, self.max_len, dtype=paddle.float32).unsqueeze(1)  #[T, 1]
         div_term = paddle.exp(
             paddle.arange(0, self.d_model, 2, dtype=paddle.float32) *
             -(math.log(10000.0) / self.d_model))
@@ -68,7 +68,7 @@ class PositionalEncoding(nn.Layer):
             offset (int): position offset
         Returns:
             paddle.Tensor: Encoded tensor. Its shape is (batch, time, ...)
-            paddle.Tensor: for compatibility to RelPositionalEncoding
+            paddle.Tensor: for compatibility to RelPositionalEncoding, (batch=1, time, ...)
         """
         T = paddle.shape(x)[1]
         assert offset + T < self.max_len
diff --git a/deepspeech/modules/encoder.py b/deepspeech/modules/encoder.py
index 0a20fe299..9a4017fec 100644
--- a/deepspeech/modules/encoder.py
+++ b/deepspeech/modules/encoder.py
@@ -59,16 +59,16 @@ class BaseEncoder(nn.Layer):
             concat_after: bool=False,
             static_chunk_size: int=0,
             use_dynamic_chunk: bool=False,
-            global_cmvn: torch.nn.Module=None,
+            global_cmvn: paddle.nn.Layer=None,
             use_dynamic_left_chunk: bool=False, ):
         """
         Args:
-            input_size (int): input dim
-            output_size (int): dimension of attention
+            input_size (int): input dim, d_feature
+            output_size (int): dimension of attention, d_model
             attention_heads (int): the number of heads of multi head attention
             linear_units (int): the hidden units number of position-wise feed
                 forward
-            num_blocks (int): the number of decoder blocks
+            num_blocks (int): the number of encoder blocks
             dropout_rate (float): dropout rate
             attention_dropout_rate (float): dropout rate in attention
             positional_dropout_rate (float): dropout rate after adding
@@ -89,7 +89,7 @@ class BaseEncoder(nn.Layer):
             use_dynamic_chunk (bool): whether use dynamic chunk size for
                 training or not, You can only use fixed chunk(chunk_size > 0)
                 or dyanmic chunk size(use_dynamic_chunk = True)
-            global_cmvn (Optional[torch.nn.Module]): Optional GlobalCMVN module
+            global_cmvn (Optional[paddle.nn.Layer]): Optional GlobalCMVN layer
             use_dynamic_left_chunk (bool): whether use dynamic left chunk in
                 dynamic chunk training
         """
@@ -117,13 +117,14 @@ class BaseEncoder(nn.Layer):
 
         self.global_cmvn = global_cmvn
         self.embed = subsampling_class(
-            input_size,
-            output_size,
-            dropout_rate,
-            pos_enc_class(output_size, positional_dropout_rate), )
+            idim=input_size,
+            odim=output_size,
+            dropout_rate=dropout_rate,
+            pos_enc_class=pos_enc_class(
+                d_model=output_size, dropout_rate=positional_dropout_rate), )
 
         self.normalize_before = normalize_before
-        self.after_norm = torch.nn.LayerNorm(output_size, eps=1e-12)
+        self.after_norm = nn.LayerNorm(output_size, epsilon=1e-12)
         self.static_chunk_size = static_chunk_size
         self.use_dynamic_chunk = use_dynamic_chunk
         self.use_dynamic_left_chunk = use_dynamic_left_chunk
@@ -133,11 +134,11 @@ class BaseEncoder(nn.Layer):
 
     def forward(
             self,
-            xs: torch.Tensor,
-            xs_lens: torch.Tensor,
+            xs: paddle.Tensor,
+            xs_lens: paddle.Tensor,
             decoding_chunk_size: int=0,
             num_decoding_left_chunks: int=-1,
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
+    ) -> Tuple[paddle.Tensor, paddle.Tensor]:
         """Embed positions in tensor.
         Args:
             xs: padded input tensor (B, L, D)
@@ -147,16 +148,16 @@ class BaseEncoder(nn.Layer):
                 <0: for decoding, use full chunk.
                 >0: for decoding, use fixed chunk size as set.
             num_decoding_left_chunks: number of left chunks, this is for decoding,
-            the chunk size is decoding_chunk_size.
+                the chunk size is decoding_chunk_size.
                 >=0: use num_decoding_left_chunks
                 <0: use all left chunks
         Returns:
             encoder output tensor, lens and mask
         """
-        masks = ~make_pad_mask(xs_lens).unsqueeze(1)  # (B, 1, L)
+        masks = make_non_pad_mask(xs_lens).unsqueeze(1)  # (B, 1, L)
         if self.global_cmvn is not None:
             xs = self.global_cmvn(xs)
-        xs, pos_emb, masks = self.embed(xs, masks)
+        xs, pos_emb, masks = self.embed(xs, masks, offset=0)
         mask_pad = ~masks
         chunk_masks = add_optional_chunk_mask(
             xs, masks, self.use_dynamic_chunk, self.use_dynamic_left_chunk,
@@ -173,48 +174,52 @@ class BaseEncoder(nn.Layer):
 
     def forward_chunk(
             self,
-            xs: torch.Tensor,
+            xs: paddle.Tensor,
             offset: int,
             required_cache_size: int,
-            subsampling_cache: Optional[torch.Tensor]=None,
-            elayers_output_cache: Optional[List[torch.Tensor]]=None,
-            conformer_cnn_cache: Optional[List[torch.Tensor]]=None,
-    ) -> Tuple[torch.Tensor, torch.Tensor, List[torch.Tensor], List[
-            torch.Tensor]]:
+            subsampling_cache: Optional[paddle.Tensor]=None,
+            elayers_output_cache: Optional[List[paddle.Tensor]]=None,
+            conformer_cnn_cache: Optional[List[paddle.Tensor]]=None,
+    ) -> Tuple[paddle.Tensor, paddle.Tensor, List[paddle.Tensor], List[
+            paddle.Tensor]]:
         """ Forward just one chunk
         Args:
-            xs (torch.Tensor): chunk input
+            xs (paddle.Tensor): chunk input, [B=1, T, D]
             offset (int): current offset in encoder output time stamp
             required_cache_size (int): cache size required for next chunk
                 compuation
                 >=0: actual cache size
                 <0: means all history cache is required
-            subsampling_cache (Optional[torch.Tensor]): subsampling cache
-            elayers_output_cache (Optional[List[torch.Tensor]]):
+            subsampling_cache (Optional[paddle.Tensor]): subsampling cache
+            elayers_output_cache (Optional[List[paddle.Tensor]]):
                 transformer/conformer encoder layers output cache
-            conformer_cnn_cache (Optional[List[torch.Tensor]]): conformer
+            conformer_cnn_cache (Optional[List[paddle.Tensor]]): conformer
                 cnn cache
         Returns:
-            torch.Tensor: output of current input xs
-            torch.Tensor: subsampling cache required for next chunk computation
-            List[torch.Tensor]: encoder layers output cache required for next
+            paddle.Tensor: output of current input xs
+            paddle.Tensor: subsampling cache required for next chunk computation
+            List[paddle.Tensor]: encoder layers output cache required for next
                 chunk computation
-            List[torch.Tensor]: conformer cnn cache
+            List[paddle.Tensor]: conformer cnn cache
         """
-        assert xs.size(0) == 1
+        assert xs.size(0) == 1  # batch size must be one
         # tmp_masks is just for interface compatibility
-        tmp_masks = torch.ones(
-            1, xs.size(1), device=xs.device, dtype=torch.bool)
-        tmp_masks = tmp_masks.unsqueeze(1)
+        tmp_masks = paddle.ones([1, xs.size(1)], dtype=paddle.bool)
+        tmp_masks = tmp_masks.unsqueeze(1)  #[B=1, C=1, T]
+
         if self.global_cmvn is not None:
             xs = self.global_cmvn(xs)
-        xs, pos_emb, _ = self.embed(xs, tmp_masks, offset)
+
+        xs, pos_emb, _ = self.embed(
+            xs, tmp_masks, offset=offset)  #xs=(B, T, D), pos_emb=(B=1, T, D)
         if subsampling_cache is not None:
-            cache_size = subsampling_cache.size(1)
-            xs = torch.cat((subsampling_cache, xs), dim=1)
+            cache_size = subsampling_cache.size(1)  #T
+            xs = paddle.cat((subsampling_cache, xs), dim=1)
         else:
             cache_size = 0
-        pos_emb = self.embed.position_encoding(offset - cache_size, xs.size(1))
+        pos_emb = self.embed.position_encoding(
+            offset=offset - cache_size, size=xs.size(1))
+
         if required_cache_size < 0:
             next_cache_start = 0
         elif required_cache_size == 0:
@@ -222,20 +227,17 @@ class BaseEncoder(nn.Layer):
         else:
             next_cache_start = xs.size(1) - required_cache_size
         r_subsampling_cache = xs[:, next_cache_start:, :]
+
         # Real mask for transformer/conformer layers
-        masks = torch.ones(1, xs.size(1), device=xs.device, dtype=torch.bool)
-        masks = masks.unsqueeze(1)
+        masks = paddle.ones([1, xs.size(1)], dtype=paddle.bool)
+        masks = masks.unsqueeze(1)  #[B=1, C=1, T]
         r_elayers_output_cache = []
         r_conformer_cnn_cache = []
         for i, layer in enumerate(self.encoders):
-            if elayers_output_cache is None:
-                attn_cache = None
-            else:
-                attn_cache = elayers_output_cache[i]
-            if conformer_cnn_cache is None:
-                cnn_cache = None
-            else:
-                cnn_cache = conformer_cnn_cache[i]
+            attn_cache = None if elayers_output_cache is None else elayers_output_cache[
+                i]
+            cnn_cache = None if conformer_cnn_cache is None else conformer_cnn_cache[
+                i]
             xs, _, new_cnn_cache = layer(
                 xs,
                 masks,
@@ -252,10 +254,10 @@ class BaseEncoder(nn.Layer):
 
     def forward_chunk_by_chunk(
             self,
-            xs: torch.Tensor,
+            xs: paddle.Tensor,
             decoding_chunk_size: int,
             num_decoding_left_chunks: int=-1,
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
+    ) -> Tuple[paddle.Tensor, paddle.Tensor]:
         """ Forward input chunk by chunk with chunk_size like a streaming
             fashion
         Here we should pay special attention to computation cache in the
@@ -277,24 +279,27 @@ class BaseEncoder(nn.Layer):
                layers in subsampling, we need to rewrite it to make it work
                with cache, which is not prefered.
         Args:
-            xs (torch.Tensor): (1, max_len, dim)
-            chunk_size (int): decoding chunk size
+            xs (paddle.Tensor): (1, max_len, dim)
+            decoding_chunk_size (int): decoding chunk size.
+            num_decoding_left_chunks (int): decoding with num left chunks.
         """
         assert decoding_chunk_size > 0
         # The model is trained by static or dynamic chunk
         assert self.static_chunk_size > 0 or self.use_dynamic_chunk
+
+        # feature stride and window for `subsampling` module
         subsampling = self.embed.subsampling_rate
         context = self.embed.right_context + 1  # Add current frame
         stride = subsampling * decoding_chunk_size
         decoding_window = (decoding_chunk_size - 1) * subsampling + context
+
         num_frames = xs.size(1)
-        subsampling_cache: Optional[torch.Tensor] = None
-        elayers_output_cache: Optional[List[torch.Tensor]] = None
-        conformer_cnn_cache: Optional[List[torch.Tensor]] = None
+        required_cache_size = decoding_chunk_size * num_decoding_left_chunks
+        subsampling_cache: Optional[paddle.Tensor] = None
+        elayers_output_cache: Optional[List[paddle.Tensor]] = None
+        conformer_cnn_cache: Optional[List[paddle.Tensor]] = None
         outputs = []
         offset = 0
-        required_cache_size = decoding_chunk_size * num_decoding_left_chunks
-
         # Feed forward overlap input step by step
         for cur in range(0, num_frames - context + 1, stride):
             end = min(cur + decoding_window, num_frames)
@@ -305,8 +310,9 @@ class BaseEncoder(nn.Layer):
                  elayers_output_cache, conformer_cnn_cache)
             outputs.append(y)
             offset += y.size(1)
-        ys = torch.cat(outputs, 1)
-        masks = torch.ones(1, ys.size(1), device=ys.device, dtype=torch.bool)
+        ys = paddle.cat(outputs, 1)
+        # fake mask, just for jit script and compatibility with `forward` api
+        masks = paddle.ones([1, ys.size(1)], dtype=paddle.bool)
         masks = masks.unsqueeze(1)
         return ys, masks
 
@@ -330,7 +336,7 @@ class TransformerEncoder(BaseEncoder):
             concat_after: bool=False,
             static_chunk_size: int=0,
             use_dynamic_chunk: bool=False,
-            global_cmvn: torch.nn.Module=None,
+            global_cmvn: nn.Layer=None,
             use_dynamic_left_chunk: bool=False, ):
         """ Construct TransformerEncoder
         See Encoder for the meaning of each parameter.
@@ -342,14 +348,16 @@ class TransformerEncoder(BaseEncoder):
                          pos_enc_layer_type, normalize_before, concat_after,
                          static_chunk_size, use_dynamic_chunk, global_cmvn,
                          use_dynamic_left_chunk)
-        self.encoders = torch.nn.ModuleList([
+        self.encoders = nn.ModuleList([
             TransformerEncoderLayer(
-                output_size,
-                MultiHeadedAttention(attention_heads, output_size,
-                                     attention_dropout_rate),
-                PositionwiseFeedForward(output_size, linear_units,
-                                        dropout_rate), dropout_rate,
-                normalize_before, concat_after) for _ in range(num_blocks)
+                size=output_size,
+                self_attn=MultiHeadedAttention(attention_heads, output_size,
+                                               attention_dropout_rate),
+                feed_forward=PositionwiseFeedForward(output_size, linear_units,
+                                                     dropout_rate),
+                dropout_rate=dropout_rate,
+                normalize_before=normalize_before,
+                concat_after=concat_after) for _ in range(num_blocks)
         ])
 
 
@@ -396,6 +404,7 @@ class ConformerEncoder(BaseEncoder):
             use_cnn_module (bool): Whether to use convolution module.
             cnn_module_kernel (int): Kernel size of convolution module.
             causal (bool): whether to use causal convolution or not.
+            cnn_module_norm (str): cnn conv norm type, Optional['batch_norm','layer_norm']
         """
         assert check_argument_types()
         super().__init__(input_size, output_size, attention_heads, linear_units,
@@ -409,26 +418,26 @@ class ConformerEncoder(BaseEncoder):
         # self-attention module definition
         encoder_selfattn_layer = RelPositionMultiHeadedAttention
         encoder_selfattn_layer_args = (attention_heads, output_size,
-                                       attention_dropout_rate, )
+                                       attention_dropout_rate)
         # feed-forward module definition
         positionwise_layer = PositionwiseFeedForward
         positionwise_layer_args = (output_size, linear_units, dropout_rate,
-                                   activation, )
+                                   activation)
         # convolution module definition
         convolution_layer = ConvolutionModule
         convolution_layer_args = (output_size, cnn_module_kernel, activation,
                                   cnn_module_norm, causal)
 
-        self.encoders = torch.nn.ModuleList([
+        self.encoders = nn.ModuleList([
             ConformerEncoderLayer(
-                output_size,
-                encoder_selfattn_layer(*encoder_selfattn_layer_args),
-                positionwise_layer(*positionwise_layer_args),
-                positionwise_layer(*positionwise_layer_args)
-                if macaron_style else None,
-                convolution_layer(*convolution_layer_args)
+                size=output_size,
+                self_attn=encoder_selfattn_layer(*encoder_selfattn_layer_args),
+                feed_forward=positionwise_layer(*positionwise_layer_args),
+                feed_forward_macaron=positionwise_layer(
+                    *positionwise_layer_args) if macaron_style else None,
+                conv_module=convolution_layer(*convolution_layer_args)
                 if use_cnn_module else None,
-                dropout_rate,
-                normalize_before,
-                concat_after, ) for _ in range(num_blocks)
+                dropout_rate=dropout_rate,
+                normalize_before=normalize_before,
+                concat_after=concat_after) for _ in range(num_blocks)
         ])
diff --git a/deepspeech/modules/encoder_layer.py b/deepspeech/modules/encoder_layer.py
index 734caae6c..2828f0053 100644
--- a/deepspeech/modules/encoder_layer.py
+++ b/deepspeech/modules/encoder_layer.py
@@ -72,6 +72,7 @@ class TransformerEncoderLayer(nn.Layer):
             x: paddle.Tensor,
             mask: paddle.Tensor,
             pos_emb: paddle.Tensor,
+            mask_pad: Optional[paddle.Tensor]=None,
             output_cache: Optional[paddle.Tensor]=None,
             cnn_cache: Optional[paddle.Tensor]=None,
     ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]:
@@ -81,6 +82,8 @@ class TransformerEncoderLayer(nn.Layer):
             mask (paddle.Tensor): Mask tensor for the input (#batch, time).
             pos_emb (paddle.Tensor): just for interface compatibility
                 to ConformerEncoderLayer
+            mask_pad (paddle.Tensor): not used in transformer layer,
+                just for unified api with conformer.
             output_cache (paddle.Tensor): Cache tensor of the output
                 (#batch, time2, size), time2 < time in x.
             cnn_cache (paddle.Tensor): not used here, it's for interface
@@ -88,6 +91,7 @@ class TransformerEncoderLayer(nn.Layer):
         Returns:
             paddle.Tensor: Output tensor (#batch, time, size).
             paddle.Tensor: Mask tensor (#batch, time).
+            paddle.Tensor: Fake cnn cache tensor for api compatibility with Conformer (#batch, channels, time').
         """
         residual = x
         if self.normalize_before:
@@ -202,12 +206,13 @@ class ConformerEncoderLayer(nn.Layer):
             pos_emb (paddle.Tensor): positional encoding, must not be None
                 for ConformerEncoderLayer.
             mask_pad (paddle.Tensor): batch padding mask used for conv module, (B, 1, T).
-            output_cache (paddle.Tensor): Cache tensor of the output
+            output_cache (paddle.Tensor): Cache tensor of the encoder output
                 (#batch, time2, size), time2 < time in x.
             cnn_cache (paddle.Tensor): Convolution cache in conformer layer
         Returns:
             paddle.Tensor: Output tensor (#batch, time, size).
             paddle.Tensor: Mask tensor (#batch, time).
+            paddle.Tensor: New cnn cache tensor (#batch, channels, time').
         """
         # whether to use macaron style FFN
         if self.feed_forward_macaron is not None:
diff --git a/deepspeech/modules/mask.py b/deepspeech/modules/mask.py
index 5749c353c..007b18e0a 100644
--- a/deepspeech/modules/mask.py
+++ b/deepspeech/modules/mask.py
@@ -56,13 +56,13 @@ def subsequent_mask(
     This mask is used only in decoder which works in an auto-regressive mode.
     This means the current step could only do attention with its left steps.
     In encoder, fully attention is used when streaming is not necessary and
-    the sequence is not long. In this  case, no attention mask is needed.
+    the sequence is not long. In this case, no attention mask is needed.
     When streaming is need, chunk-based attention is used in encoder. See
     subsequent_chunk_mask for the chunk-based attention mask.
     Args:
         size (int): size of mask
     Returns:
-        paddle.Tensor: mask
+        paddle.Tensor: mask, [size, size]
     Examples:
         >>> subsequent_mask(3)
         [[1, 0, 0],
@@ -86,7 +86,7 @@ def subsequent_chunk_mask(
             <0: use full chunk
             >=0: use num_left_chunks
     Returns:
-        paddle.Tensor: mask
+        paddle.Tensor: mask, [size, size]
     Examples:
         >>> subsequent_chunk_mask(4, 2)
         [[1, 1, 0, 0],
@@ -99,8 +99,8 @@ def subsequent_chunk_mask(
         if num_left_chunks < 0:
             start = 0
         else:
-            start = max((i // chunk_size - num_left_chunks) * chunk_size, 0)
-        ending = min((i // chunk_size + 1) * chunk_size, size)
+            start = max(0, (i // chunk_size - num_left_chunks) * chunk_size)
+        ending = min(size, (i // chunk_size + 1) * chunk_size)
         ret[i, start:ending] = True
     return ret
 
diff --git a/deepspeech/modules/rnn.py b/deepspeech/modules/rnn.py
index 3cb8c7d05..a6da4a11c 100644
--- a/deepspeech/modules/rnn.py
+++ b/deepspeech/modules/rnn.py
@@ -41,7 +41,7 @@ class RNNCell(nn.RNNCellBase):
     """
 
     def __init__(self,
-                 hidden_size,
+                 hidden_size: int,
                  activation="tanh",
                  weight_ih_attr=None,
                  weight_hh_attr=None,
@@ -108,8 +108,8 @@ class GRUCell(nn.RNNCellBase):
     """
 
     def __init__(self,
-                 input_size,
-                 hidden_size,
+                 input_size: int,
+                 hidden_size: int,
                  weight_ih_attr=None,
                  weight_hh_attr=None,
                  bias_ih_attr=None,
@@ -132,7 +132,6 @@ class GRUCell(nn.RNNCellBase):
         self.input_size = input_size
         self._gate_activation = F.sigmoid
         self._activation = paddle.tanh
-        #self._activation = F.relu
 
     def forward(self, inputs, states=None):
         if states is None:
@@ -171,8 +170,6 @@ class BiRNNWithBN(nn.Layer):
     """Bidirectonal simple rnn layer with sequence-wise batch normalization.
     The batch normalization is only performed on input-state weights.
 
-    :param name: Name of the layer parameters.
-    :type name: string
     :param size: Dimension of RNN cells.
     :type size: int
     :param share_weights: Whether to share input-hidden weights between
@@ -182,7 +179,7 @@ class BiRNNWithBN(nn.Layer):
     :rtype: Variable
     """
 
-    def __init__(self, i_size, h_size, share_weights):
+    def __init__(self, i_size: int, h_size: int, share_weights: bool):
         super().__init__()
         self.share_weights = share_weights
         if self.share_weights:
@@ -208,7 +205,7 @@ class BiRNNWithBN(nn.Layer):
         self.bw_rnn = nn.RNN(
             self.fw_cell, is_reverse=True, time_major=False)  #[B, T, D]
 
-    def forward(self, x, x_len):
+    def forward(self, x: paddle.Tensor, x_len: paddle.Tensor):
         # x, shape [B, T, D]
         fw_x = self.fw_bn(self.fw_fc(x))
         bw_x = self.bw_bn(self.bw_fc(x))
@@ -234,7 +231,7 @@ class BiGRUWithBN(nn.Layer):
     :rtype: Variable
     """
 
-    def __init__(self, i_size, h_size, act):
+    def __init__(self, i_size: int, h_size: int):
         super().__init__()
         hidden_size = h_size * 3
 
@@ -281,23 +278,29 @@ class RNNStack(nn.Layer):
     :rtype: Variable
     """
 
-    def __init__(self, i_size, h_size, num_stacks, use_gru, share_rnn_weights):
+    def __init__(self,
+                 i_size: int,
+                 h_size: int,
+                 num_stacks: int,
+                 use_gru: bool,
+                 share_rnn_weights: bool):
         super().__init__()
-        self.rnn_stacks = nn.LayerList()
+        rnn_stacks = []
         for i in range(num_stacks):
             if use_gru:
                 #default:GRU using tanh
-                self.rnn_stacks.append(
-                    BiGRUWithBN(i_size=i_size, h_size=h_size, act="relu"))
+                rnn_stacks.append(BiGRUWithBN(i_size=i_size, h_size=h_size))
             else:
-                self.rnn_stacks.append(
+                rnn_stacks.append(
                     BiRNNWithBN(
                         i_size=i_size,
                         h_size=h_size,
                         share_weights=share_rnn_weights))
             i_size = h_size * 2
 
-    def forward(self, x, x_len):
+        self.rnn_stacks = nn.LayerList(rnn_stacks)
+
+    def forward(self, x: paddle.Tensor, x_len: paddle.Tensor):
         """
         x: shape [B, T, D]
         x_len: shpae [B]
diff --git a/deepspeech/modules/subsampling.py b/deepspeech/modules/subsampling.py
index a01374d71..a0b80b844 100644
--- a/deepspeech/modules/subsampling.py
+++ b/deepspeech/modules/subsampling.py
@@ -32,10 +32,12 @@ __all__ = [
 
 
 class BaseSubsampling(nn.Layer):
-    def __init__(self, pos_enc_class: PositionalEncoding):
+    def __init__(self, pos_enc_class: nn.Layer=PositionalEncoding):
         super().__init__()
         self.pos_enc = pos_enc_class
+        # window size = (1 + right_context) + (chunk_size -1) * subsampling_rate
         self.right_context = 0
+        # stride = chunk_size * subsampling_rate
         self.subsampling_rate = 1
 
     def position_encoding(self, offset: int, size: int) -> paddle.Tensor:
@@ -49,7 +51,7 @@ class LinearNoSubsampling(BaseSubsampling):
                  idim: int,
                  odim: int,
                  dropout_rate: float,
-                 pos_enc_class: PositionalEncoding):
+                 pos_enc_class: nn.Layer=PositionalEncoding):
         """Construct an linear object.
         Args:
             idim (int): Input dimension.
@@ -71,6 +73,7 @@ class LinearNoSubsampling(BaseSubsampling):
         Args:
             x (paddle.Tensor): Input tensor (#batch, time, idim).
             x_mask (paddle.Tensor): Input mask (#batch, 1, time).
+            offset (int): position encoding offset.
         Returns:
             paddle.Tensor: linear input tensor (#batch, time', odim),
                 where time' = time .
@@ -90,7 +93,7 @@ class Conv2dSubsampling4(BaseSubsampling):
                  idim: int,
                  odim: int,
                  dropout_rate: float,
-                 pos_enc_class: PositionalEncoding):
+                 pos_enc_class: nn.Layer=PositionalEncoding):
         """Construct an Conv2dSubsampling4 object.
         
         Args:
@@ -117,6 +120,7 @@ class Conv2dSubsampling4(BaseSubsampling):
         Args:
             x (paddle.Tensor): Input tensor (#batch, time, idim).
             x_mask (paddle.Tensor): Input mask (#batch, 1, time).
+            offset (int): position encoding offset.
         Returns:
             paddle.Tensor: Subsampled tensor (#batch, time', odim),
                 where time' = time // 4.
@@ -139,7 +143,7 @@ class Conv2dSubsampling6(BaseSubsampling):
                  idim: int,
                  odim: int,
                  dropout_rate: float,
-                 pos_enc_class: PositionalEncoding):
+                 pos_enc_class: nn.Layer=PositionalEncoding):
         """Construct an Conv2dSubsampling6 object.
         
         Args:
@@ -169,6 +173,7 @@ class Conv2dSubsampling6(BaseSubsampling):
         Args:
             x (paddle.Tensor): Input tensor (#batch, time, idim).
             x_mask (paddle.Tensor): Input mask (#batch, 1, time).
+            offset (int): position encoding offset.
         Returns:
             paddle.Tensor: Subsampled tensor (#batch, time', odim),
                 where time' = time // 6.
@@ -191,7 +196,7 @@ class Conv2dSubsampling8(BaseSubsampling):
                  idim: int,
                  odim: int,
                  dropout_rate: float,
-                 pos_enc_class: PositionalEncoding):
+                 pos_enc_class: nn.Layer=PositionalEncoding):
         """Construct an Conv2dSubsampling8 object.
         
         Args:
@@ -221,6 +226,7 @@ class Conv2dSubsampling8(BaseSubsampling):
         Args:
             x (paddle.Tensor): Input tensor (#batch, time, idim).
             x_mask (paddle.Tensor): Input mask (#batch, 1, time).
+            offset (int): position encoding offset.
         Returns:
             paddle.Tensor: Subsampled tensor (#batch, time', odim),
                 where time' = time // 8.
diff --git a/docs/install.md b/docs/install.md
index 71396590f..bd4d5a432 100644
--- a/docs/install.md
+++ b/docs/install.md
@@ -43,7 +43,7 @@ bash setup.sh
 source tools/venv/bin/activate
 ```
 
-## Running in Docker Container
+## Running in Docker Container (optional)
 
 Docker is an open source tool to build, ship, and run distributed applications in an isolated environment. A Docker image for this project has been provided in [hub.docker.com](https://hub.docker.com) with all the dependencies installed, including the pre-built PaddlePaddle, CTC decoders, and other necessary Python and third-party packages. This Docker image requires the support of NVIDIA GPU, so please make sure its availiability and the [nvidia-docker](https://github.com/NVIDIA/nvidia-docker) has been installed.
 
diff --git a/docs/reference.md b/docs/reference.md
new file mode 100644
index 000000000..69ff6ab88
--- /dev/null
+++ b/docs/reference.md
@@ -0,0 +1,3 @@
+# Reference
+
+* [wenet](https://github.com/mobvoi/wenet)