add encoder

pull/578/head
Hui Zhang 5 years ago
parent 426d370413
commit b6d729a675

@ -37,26 +37,26 @@
"name": "stderr",
"output_type": "stream",
"text": [
"2021-03-24 06:30:47,727 - WARNING - register user softmax to paddle, remove this when fixed!\n",
"2021-03-24 06:30:47,728 - WARNING - register user sigmoid to paddle, remove this when fixed!\n",
"2021-03-24 06:30:47,729 - WARNING - register user relu to paddle, remove this when fixed!\n",
"2021-03-24 06:30:47,729 - WARNING - override cat of paddle if exists or register, remove this when fixed!\n",
"2021-03-24 06:30:47,730 - WARNING - override eq of paddle.Tensor if exists or register, remove this when fixed!\n",
"2021-03-24 06:30:47,731 - WARNING - override contiguous of paddle.Tensor if exists or register, remove this when fixed!\n",
"2021-03-24 06:30:47,731 - WARNING - override size of paddle.Tensor (`to_static` do not process `size` property, maybe some `paddle` api dependent on it), remove this when fixed!\n",
"2021-03-24 06:30:47,732 - WARNING - register user view to paddle.Tensor, remove this when fixed!\n",
"2021-03-24 06:30:47,732 - WARNING - register user view_as to paddle.Tensor, remove this when fixed!\n",
"2021-03-24 06:30:47,733 - WARNING - register user masked_fill to paddle.Tensor, remove this when fixed!\n",
"2021-03-24 06:30:47,733 - WARNING - register user masked_fill_ to paddle.Tensor, remove this when fixed!\n",
"2021-03-24 06:30:47,734 - WARNING - register user fill_ to paddle.Tensor, remove this when fixed!\n",
"2021-03-24 06:30:47,734 - WARNING - register user repeat to paddle.Tensor, remove this when fixed!\n",
"2021-03-24 06:30:47,735 - WARNING - register user softmax to paddle.Tensor, remove this when fixed!\n",
"2021-03-24 06:30:47,735 - WARNING - register user sigmoid to paddle.Tensor, remove this when fixed!\n",
"2021-03-24 06:30:47,736 - WARNING - register user relu to paddle.Tensor, remove this when fixed!\n",
"2021-03-24 06:30:47,736 - WARNING - register user glu to paddle.nn.functional, remove this when fixed!\n",
"2021-03-24 06:30:47,737 - WARNING - override ctc_loss of paddle.nn.functional if exists, remove this when fixed!\n",
"2021-03-24 06:30:47,737 - WARNING - register user GLU to paddle.nn, remove this when fixed!\n",
"2021-03-24 06:30:47,738 - WARNING - register user ConstantPad2d to paddle.nn, remove this when fixed!\n",
"2021-03-26 02:55:23,873 - WARNING - register user softmax to paddle, remove this when fixed!\n",
"2021-03-26 02:55:23,875 - WARNING - register user sigmoid to paddle, remove this when fixed!\n",
"2021-03-26 02:55:23,875 - WARNING - register user relu to paddle, remove this when fixed!\n",
"2021-03-26 02:55:23,876 - WARNING - override cat of paddle if exists or register, remove this when fixed!\n",
"2021-03-26 02:55:23,876 - WARNING - override eq of paddle.Tensor if exists or register, remove this when fixed!\n",
"2021-03-26 02:55:23,877 - WARNING - override contiguous of paddle.Tensor if exists or register, remove this when fixed!\n",
"2021-03-26 02:55:23,877 - WARNING - override size of paddle.Tensor (`to_static` do not process `size` property, maybe some `paddle` api dependent on it), remove this when fixed!\n",
"2021-03-26 02:55:23,878 - WARNING - register user view to paddle.Tensor, remove this when fixed!\n",
"2021-03-26 02:55:23,878 - WARNING - register user view_as to paddle.Tensor, remove this when fixed!\n",
"2021-03-26 02:55:23,879 - WARNING - register user masked_fill to paddle.Tensor, remove this when fixed!\n",
"2021-03-26 02:55:23,880 - WARNING - register user masked_fill_ to paddle.Tensor, remove this when fixed!\n",
"2021-03-26 02:55:23,880 - WARNING - register user fill_ to paddle.Tensor, remove this when fixed!\n",
"2021-03-26 02:55:23,881 - WARNING - register user repeat to paddle.Tensor, remove this when fixed!\n",
"2021-03-26 02:55:23,881 - WARNING - register user softmax to paddle.Tensor, remove this when fixed!\n",
"2021-03-26 02:55:23,882 - WARNING - register user sigmoid to paddle.Tensor, remove this when fixed!\n",
"2021-03-26 02:55:23,882 - WARNING - register user relu to paddle.Tensor, remove this when fixed!\n",
"2021-03-26 02:55:23,883 - WARNING - register user glu to paddle.nn.functional, remove this when fixed!\n",
"2021-03-26 02:55:23,883 - WARNING - override ctc_loss of paddle.nn.functional if exists, remove this when fixed!\n",
"2021-03-26 02:55:23,884 - WARNING - register user GLU to paddle.nn, remove this when fixed!\n",
"2021-03-26 02:55:23,884 - WARNING - register user ConstantPad2d to paddle.nn, remove this when fixed!\n",
"/home/ssd5/zhanghui/DeepSpeech2.x/tools/venv-dev/lib/python3.7/site-packages/scipy/fftpack/__init__.py:103: DeprecationWarning: The module numpy.dual is deprecated. Instead of using dual, use the functions directly from numpy or scipy.\n",
" from numpy.dual import register_func\n",
"/home/ssd5/zhanghui/DeepSpeech2.x/tools/venv-dev/lib/python3.7/site-packages/scipy/special/orthogonal.py:81: DeprecationWarning: `np.int` is a deprecated alias for the builtin `int`. To silence this warning, use `int` by itself. Doing this will not modify any behavior and is safe. When replacing `np.int`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.\n",
@ -102,10 +102,10 @@
"output_type": "stream",
"text": [
"0.0.0\n",
"607856a949ed7356237ed8148947f7fd2b0f4631\n",
"ON\n",
"ON\n",
"commit: 607856a949ed7356237ed8148947f7fd2b0f4631\n",
"e7f28d6c0db54eb9c9a810612300b526687e56a6\n",
"OFF\n",
"OFF\n",
"commit: e7f28d6c0db54eb9c9a810612300b526687e56a6\n",
"None\n",
"0\n"
]
@ -226,7 +226,7 @@
"output: None\n",
"params_file: examples/aishell/jit.model.pdiparams\n",
"speech_save_dir: demo_cache\n",
"use_gpu: True\n",
"use_gpu: False\n",
"warmup_manifest: examples/aishell/data/manifest.test\n",
"------------------------------------------------\n"
]
@ -266,7 +266,7 @@
" help=\n",
" \"Model dir, If you load a non-combined model, specify the directory of the model.\"\n",
")\n",
"add_arg(\"--use_gpu\",type=bool,default=True, help=\"Whether use gpu.\")\n",
"add_arg(\"--use_gpu\",type=bool,default=False, help=\"Whether use gpu.\")\n",
"\n",
"\n",
"args = parser.parse_args(\n",
@ -321,7 +321,7 @@
"name": "stderr",
"output_type": "stream",
"text": [
"2021-03-24 06:31:20,943 - INFO - [checkpoint] Rank 0: loaded model from examples/aishell/ckpt-loss2e-3-0.83-5/checkpoints/step-11725.pdparams\n"
"2021-03-26 02:55:57,930 - INFO - [checkpoint] Rank 0: loaded model from examples/aishell/ckpt-loss2e-3-0.83-5/checkpoints/step-11725.pdparams\n"
]
},
{
@ -407,7 +407,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 22,
"metadata": {},
"outputs": [
{
@ -418,7 +418,7 @@
"examples/aishell/jit.model.pdmodel\n",
"examples/aishell/jit.model.pdiparams\n",
"0\n",
"True\n"
"False\n"
]
}
],
@ -428,7 +428,8 @@
"from paddle.inference import PrecisionType\n",
"from paddle.inference import create_predictor\n",
"\n",
"args.use_gpu=True\n",
"args.use_gpu=False\n",
"paddle.set_device('cpu')\n",
"\n",
"def init_predictor(args):\n",
" if args.model_dir is not None:\n",
@ -438,8 +439,8 @@
"\n",
" if args.use_gpu:\n",
" config.enable_use_gpu(memory_pool_init_size_mb=1000, device_id=0)\n",
" config.enable_tensorrt_engine(precision_mode=PrecisionType.Float32,\n",
" use_calib_mode=True) # 开启TensorRT预测精度为fp32开启int8离线量化\n",
"# config.enable_tensorrt_engine(precision_mode=PrecisionType.Float32,\n",
"# use_calib_mode=True) # 开启TensorRT预测精度为fp32开启int8离线量化\n",
" else:\n",
" # If not specific mkldnn, you can set the blas thread.\n",
" # The thread num should not be greater than the number of cores in the CPU.\n",
@ -447,7 +448,7 @@
" config.enable_mkldnn()\n",
" \n",
" config.enable_memory_optim()\n",
" config.switch_ir_optim(False)\n",
" config.switch_ir_optim(True)\n",
" \n",
" print(config.model_dir())\n",
" print(config.prog_file())\n",
@ -534,7 +535,7 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 23,
"metadata": {},
"outputs": [
{
@ -546,49 +547,35 @@
"input: 0 audio\n",
"input: 1 audio_len\n",
"output: 0 tmp_75\n",
"jit: [[[8.91791242e-12 4.45650548e-12 3.67574104e-09 ... 8.91772593e-12\n",
" 8.91578738e-12 4.64319072e-08]\n",
" [1.55952011e-15 2.62797088e-14 4.50428670e-12 ... 1.55946061e-15\n",
" 1.55893121e-15 9.99992609e-01]\n",
" [1.24638590e-17 7.61802427e-16 2.93266930e-14 ... 1.24633842e-17\n",
" 1.24587735e-17 1.00000000e+00]\n",
"jit: [[[8.91786298e-12 4.45648032e-12 3.67572750e-09 ... 8.91767563e-12\n",
" 8.91573707e-12 4.64317296e-08]\n",
" [1.55950222e-15 2.62794089e-14 4.50423509e-12 ... 1.55944271e-15\n",
" 1.55891342e-15 9.99992609e-01]\n",
" [1.24638127e-17 7.61802427e-16 2.93265812e-14 ... 1.24633371e-17\n",
" 1.24587264e-17 1.00000000e+00]\n",
" ...\n",
" [4.37491543e-15 2.43678580e-12 1.98772032e-12 ... 4.37483242e-15\n",
" 4.37358093e-15 1.00000000e+00]\n",
" [3.89338410e-13 1.66756747e-11 1.42901749e-11 ... 3.89333233e-13\n",
" 3.89255983e-13 1.00000000e+00]\n",
" [1.00350561e-10 2.56295180e-10 2.91178692e-10 ... 1.00348452e-10\n",
" 1.00334671e-10 9.99998808e-01]]] <class 'numpy.ndarray'>\n",
" [4.37488240e-15 2.43676260e-12 1.98770514e-12 ... 4.37479896e-15\n",
" 4.37354747e-15 1.00000000e+00]\n",
" [3.89334696e-13 1.66754856e-11 1.42900388e-11 ... 3.89329492e-13\n",
" 3.89252270e-13 1.00000000e+00]\n",
" [1.00349985e-10 2.56293708e-10 2.91177582e-10 ... 1.00347876e-10\n",
" 1.00334095e-10 9.99998808e-01]]] <class 'numpy.ndarray'>\n",
"[1, 161, 522]\n",
"[1]\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/ssd5/zhanghui/DeepSpeech2.x/tools/venv-dev/lib/python3.7/site-packages/paddlepaddle_gpu-0.0.0-py3.7-linux-x86_64.egg/paddle/fluid/layers/utils.py:77: DeprecationWarning: Using or importing the ABCs from 'collections' instead of from 'collections.abc' is deprecated, and in 3.8 it will stop working\n",
" return (isinstance(seq, collections.Sequence) and\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"paddle: [[[8.91791242e-12 4.45650548e-12 3.67574104e-09 ... 8.91772593e-12\n",
" 8.91578738e-12 4.64319072e-08]\n",
" [1.55952011e-15 2.62797088e-14 4.50428670e-12 ... 1.55946061e-15\n",
" 1.55893121e-15 9.99992609e-01]\n",
" [1.24638590e-17 7.61802427e-16 2.93266930e-14 ... 1.24633842e-17\n",
"[1]\n",
"paddle: [[[8.91789680e-12 4.45649724e-12 3.67574149e-09 ... 8.91770945e-12\n",
" 8.91577090e-12 4.64319072e-08]\n",
" [1.55950222e-15 2.62794089e-14 4.50423509e-12 ... 1.55944271e-15\n",
" 1.55891342e-15 9.99992609e-01]\n",
" [1.24638599e-17 7.61805339e-16 2.93267472e-14 ... 1.24633842e-17\n",
" 1.24587735e-17 1.00000000e+00]\n",
" ...\n",
" [4.37491543e-15 2.43678580e-12 1.98772032e-12 ... 4.37483242e-15\n",
" 4.37358093e-15 1.00000000e+00]\n",
" [3.89338410e-13 1.66756747e-11 1.42901749e-11 ... 3.89333233e-13\n",
" 3.89255983e-13 1.00000000e+00]\n",
" [1.00350561e-10 2.56295180e-10 2.91178692e-10 ... 1.00348452e-10\n",
" 1.00334671e-10 9.99998808e-01]]]\n",
"True\n"
" [4.37488240e-15 2.43676737e-12 1.98770514e-12 ... 4.37479896e-15\n",
" 4.37354747e-15 1.00000000e+00]\n",
" [3.89336187e-13 1.66755481e-11 1.42900925e-11 ... 3.89330983e-13\n",
" 3.89253761e-13 1.00000000e+00]\n",
" [1.00349985e-10 2.56293708e-10 2.91177582e-10 ... 1.00347876e-10\n",
" 1.00334095e-10 9.99998808e-01]]]\n",
"False\n"
]
}
],
@ -607,7 +594,7 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 21,
"metadata": {},
"outputs": [
{
@ -618,19 +605,19 @@
"input: 0 audio\n",
"input: 1 audio_len\n",
"output: 0 tmp_75\n",
"jit: [[[8.91791242e-12 4.45650548e-12 3.67574104e-09 ... 8.91772593e-12\n",
" 8.91578738e-12 4.64319072e-08]\n",
" [1.55952011e-15 2.62797088e-14 4.50428670e-12 ... 1.55946061e-15\n",
" 1.55893121e-15 9.99992609e-01]\n",
" [1.24638590e-17 7.61802427e-16 2.93266930e-14 ... 1.24633842e-17\n",
"jit: [[[8.91789680e-12 4.45649724e-12 3.67574149e-09 ... 8.91770945e-12\n",
" 8.91577090e-12 4.64319072e-08]\n",
" [1.55950222e-15 2.62794089e-14 4.50423509e-12 ... 1.55944271e-15\n",
" 1.55891342e-15 9.99992609e-01]\n",
" [1.24638599e-17 7.61805339e-16 2.93267472e-14 ... 1.24633842e-17\n",
" 1.24587735e-17 1.00000000e+00]\n",
" ...\n",
" [4.37491543e-15 2.43678580e-12 1.98772032e-12 ... 4.37483242e-15\n",
" 4.37358093e-15 1.00000000e+00]\n",
" [3.89338410e-13 1.66756747e-11 1.42901749e-11 ... 3.89333233e-13\n",
" 3.89255983e-13 1.00000000e+00]\n",
" [1.00350561e-10 2.56295180e-10 2.91178692e-10 ... 1.00348452e-10\n",
" 1.00334671e-10 9.99998808e-01]]]\n"
" [4.37488240e-15 2.43676737e-12 1.98770514e-12 ... 4.37479896e-15\n",
" 4.37354747e-15 1.00000000e+00]\n",
" [3.89336187e-13 1.66755481e-11 1.42900925e-11 ... 3.89330983e-13\n",
" 3.89253761e-13 1.00000000e+00]\n",
" [1.00349985e-10 2.56293708e-10 2.91177582e-10 ... 1.00347876e-10\n",
" 1.00334095e-10 9.99998808e-01]]]\n"
]
}
],

@ -454,7 +454,7 @@
" act='brelu')\n",
"\n",
" out_channel = 32\n",
" self.conv_stack = nn.LayerList([\n",
" self.conv_stack = nn.Sequential([\n",
" ConvBn(\n",
" num_channels_in=32,\n",
" num_channels_out=out_channel,\n",

@ -59,3 +59,6 @@ You are welcome to submit questions and bug reports in [Github Issues](https://g
## License
DeepSpeech is provided under the [Apache-2.0 License](./LICENSE).
## Acknowledgement
We depend on many open-source repositories. See [References](docs/reference.md) for more information.

@ -56,3 +56,6 @@ source tools/venv/bin/activate
## License
DeepSpeech is provided under the [Apache-2.0 License](./LICENSE).
## Acknowledgement
We referenced a number of excellent open-source repositories during development. See [References](docs/reference.md) for details.

@ -266,8 +266,17 @@ logger.warn(
)
F.ctc_loss = ctc_loss
########### hack paddle.nn #############
if not hasattr(paddle.nn, 'Module'):
logger.warn("register user Module to paddle.nn, remove this when fixed!")
setattr(paddle.nn, 'Module', paddle.nn.Layer)
if not hasattr(paddle.nn, 'ModuleList'):
logger.warn(
"register user ModuleList to paddle.nn, remove this when fixed!")
setattr(paddle.nn, 'ModuleList', paddle.nn.LayerList)
class GLU(nn.Layer):
"""Gated Linear Units (GLU) Layer"""

@ -143,7 +143,8 @@ def get_activation(act):
"relu": paddle.nn.ReLU,
"selu": paddle.nn.SELU,
"swish": paddle.nn.Swish,
"gelu": paddle.nn.GELU
"gelu": paddle.nn.GELU,
"brelu": brelu,
}
return activation_funcs[act]()
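
Since `get_activation` instantiates its table entry via `activation_funcs[act]()`, the `brelu` entry must be callable with no arguments. A hedged sketch of a bounded-ReLU layer that would fit this table (the class shape and the clip bound t=24, taken from the DeepSpeech2 paper, are assumptions):

```python
import paddle
import paddle.nn as nn

class BReLU(nn.Layer):
    """Bounded ReLU: clip(x, 0, t)."""
    def __init__(self, t: float = 24.0):
        super().__init__()
        self.t = t

    def forward(self, x: paddle.Tensor) -> paddle.Tensor:
        return paddle.clip(x, min=0.0, max=self.t)
```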

@ -51,7 +51,7 @@ class PositionalEncoding(nn.Layer):
self.pe = paddle.zeros([self.max_len, self.d_model]) #[T,D]
position = paddle.arange(
0, self.max_len, dtype=paddle.float32).unsqueeze(1)
0, self.max_len, dtype=paddle.float32).unsqueeze(1) #[T, 1]
div_term = paddle.exp(
paddle.arange(0, self.d_model, 2, dtype=paddle.float32) *
-(math.log(10000.0) / self.d_model))
@ -68,7 +68,7 @@ class PositionalEncoding(nn.Layer):
offset (int): position offset
Returns:
paddle.Tensor: Encoded tensor. Its shape is (batch, time, ...)
paddle.Tensor: for compatibility to RelPositionalEncoding
paddle.Tensor: for compatibility to RelPositionalEncoding, (batch=1, time, ...)
"""
T = paddle.shape(x)[1]
assert offset + T < self.max_len
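
Putting the hunks above together, a small standalone sketch of the sinusoidal table this class builds (shapes annotated; the interleaving is written with stack/reshape for clarity, which may differ from the repo's slice-assignment code):

```python
import math
import paddle

d_model, max_len = 8, 100
position = paddle.arange(0, max_len, dtype=paddle.float32).unsqueeze(1)  # [T, 1]
div_term = paddle.exp(
    paddle.arange(0, d_model, 2, dtype=paddle.float32) *
    -(math.log(10000.0) / d_model))                                      # [D/2]
angles = position * div_term                                             # [T, D/2]
# even dims get sin, odd dims get cos; stacking then flattening interleaves them
pe = paddle.stack([paddle.sin(angles), paddle.cos(angles)], axis=-1)     # [T, D/2, 2]
pe = pe.reshape([max_len, d_model])                                      # [T, D]
```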

@ -59,16 +59,16 @@ class BaseEncoder(nn.Layer):
concat_after: bool=False,
static_chunk_size: int=0,
use_dynamic_chunk: bool=False,
global_cmvn: torch.nn.Module=None,
global_cmvn: paddle.nn.Layer=None,
use_dynamic_left_chunk: bool=False, ):
"""
Args:
input_size (int): input dim
output_size (int): dimension of attention
input_size (int): input dim, d_feature
output_size (int): dimension of attention, d_model
attention_heads (int): the number of heads of multi head attention
linear_units (int): the number of hidden units in the position-wise feed
forward layer
num_blocks (int): the number of decoder blocks
num_blocks (int): the number of encoder blocks
dropout_rate (float): dropout rate
attention_dropout_rate (float): dropout rate in attention
positional_dropout_rate (float): dropout rate after adding
@ -89,7 +89,7 @@ class BaseEncoder(nn.Layer):
use_dynamic_chunk (bool): whether to use dynamic chunk size for
training or not. You can only use a fixed chunk (chunk_size > 0)
or a dynamic chunk size (use_dynamic_chunk = True)
global_cmvn (Optional[torch.nn.Module]): Optional GlobalCMVN module
global_cmvn (Optional[paddle.nn.Layer]): Optional GlobalCMVN layer
use_dynamic_left_chunk (bool): whether use dynamic left chunk in
dynamic chunk training
"""
@ -117,13 +117,14 @@ class BaseEncoder(nn.Layer):
self.global_cmvn = global_cmvn
self.embed = subsampling_class(
input_size,
output_size,
dropout_rate,
pos_enc_class(output_size, positional_dropout_rate), )
idim=input_size,
odim=output_size,
dropout_rate=dropout_rate,
pos_enc_class=pos_enc_class(
d_model=output_size, dropout_rate=positional_dropout_rate), )
self.normalize_before = normalize_before
self.after_norm = torch.nn.LayerNorm(output_size, eps=1e-12)
self.after_norm = nn.LayerNorm(output_size, epsilon=1e-12)
self.static_chunk_size = static_chunk_size
self.use_dynamic_chunk = use_dynamic_chunk
self.use_dynamic_left_chunk = use_dynamic_left_chunk
@ -133,11 +134,11 @@ class BaseEncoder(nn.Layer):
def forward(
self,
xs: torch.Tensor,
xs_lens: torch.Tensor,
xs: paddle.Tensor,
xs_lens: paddle.Tensor,
decoding_chunk_size: int=0,
num_decoding_left_chunks: int=-1,
) -> Tuple[torch.Tensor, torch.Tensor]:
) -> Tuple[paddle.Tensor, paddle.Tensor]:
"""Embed positions in tensor.
Args:
xs: padded input tensor (B, L, D)
@ -153,10 +154,10 @@ class BaseEncoder(nn.Layer):
Returns:
encoder output tensor, lens and mask
"""
masks = ~make_pad_mask(xs_lens).unsqueeze(1) # (B, 1, L)
masks = make_non_pad_mask(xs_lens).unsqueeze(1) # (B, 1, L)
if self.global_cmvn is not None:
xs = self.global_cmvn(xs)
xs, pos_emb, masks = self.embed(xs, masks)
xs, pos_emb, masks = self.embed(xs, masks, offset=0)
mask_pad = ~masks
chunk_masks = add_optional_chunk_mask(
xs, masks, self.use_dynamic_chunk, self.use_dynamic_left_chunk,
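
For reference, `make_non_pad_mask` used above is the positive counterpart of the `~make_pad_mask(...)` idiom it replaces; a minimal sketch of its semantics (this is the standard length-to-mask construction, not necessarily the repo's exact implementation):

```python
import paddle

def make_non_pad_mask(lengths: paddle.Tensor) -> paddle.Tensor:
    """lengths: [B] int tensor -> bool mask [B, T_max], True on valid frames."""
    batch_size = lengths.shape[0]
    max_len = int(lengths.max())
    seq_range = paddle.arange(0, max_len, dtype=paddle.int64)         # [T]
    seq_range = seq_range.unsqueeze(0).expand([batch_size, max_len])  # [B, T]
    return seq_range < lengths.unsqueeze(-1)                          # [B, T] bool
```
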
@ -173,48 +174,52 @@ class BaseEncoder(nn.Layer):
def forward_chunk(
self,
xs: torch.Tensor,
xs: paddle.Tensor,
offset: int,
required_cache_size: int,
subsampling_cache: Optional[torch.Tensor]=None,
elayers_output_cache: Optional[List[torch.Tensor]]=None,
conformer_cnn_cache: Optional[List[torch.Tensor]]=None,
) -> Tuple[torch.Tensor, torch.Tensor, List[torch.Tensor], List[
torch.Tensor]]:
subsampling_cache: Optional[paddle.Tensor]=None,
elayers_output_cache: Optional[List[paddle.Tensor]]=None,
conformer_cnn_cache: Optional[List[paddle.Tensor]]=None,
) -> Tuple[paddle.Tensor, paddle.Tensor, List[paddle.Tensor], List[
paddle.Tensor]]:
""" Forward just one chunk
Args:
xs (torch.Tensor): chunk input
xs (paddle.Tensor): chunk input, [B=1, T, D]
offset (int): current offset in encoder output time stamp
required_cache_size (int): cache size required for next chunk
computation
>=0: actual cache size
<0: means all history cache is required
subsampling_cache (Optional[torch.Tensor]): subsampling cache
elayers_output_cache (Optional[List[torch.Tensor]]):
subsampling_cache (Optional[paddle.Tensor]): subsampling cache
elayers_output_cache (Optional[List[paddle.Tensor]]):
transformer/conformer encoder layers output cache
conformer_cnn_cache (Optional[List[torch.Tensor]]): conformer
conformer_cnn_cache (Optional[List[paddle.Tensor]]): conformer
cnn cache
Returns:
torch.Tensor: output of current input xs
torch.Tensor: subsampling cache required for next chunk computation
List[torch.Tensor]: encoder layers output cache required for next
paddle.Tensor: output of current input xs
paddle.Tensor: subsampling cache required for next chunk computation
List[paddle.Tensor]: encoder layers output cache required for next
chunk computation
List[torch.Tensor]: conformer cnn cache
List[paddle.Tensor]: conformer cnn cache
"""
assert xs.size(0) == 1
assert xs.size(0) == 1 # batch size must be one
# tmp_masks is just for interface compatibility
tmp_masks = torch.ones(
1, xs.size(1), device=xs.device, dtype=torch.bool)
tmp_masks = tmp_masks.unsqueeze(1)
tmp_masks = paddle.ones([1, xs.size(1)], dtype=paddle.bool)
tmp_masks = tmp_masks.unsqueeze(1) #[B=1, C=1, T]
if self.global_cmvn is not None:
xs = self.global_cmvn(xs)
xs, pos_emb, _ = self.embed(xs, tmp_masks, offset)
xs, pos_emb, _ = self.embed(
xs, tmp_masks, offset=offset) #xs=(B, T, D), pos_emb=(B=1, T, D)
if subsampling_cache is not None:
cache_size = subsampling_cache.size(1)
xs = torch.cat((subsampling_cache, xs), dim=1)
cache_size = subsampling_cache.size(1) #T
xs = paddle.cat((subsampling_cache, xs), dim=1)
else:
cache_size = 0
pos_emb = self.embed.position_encoding(offset - cache_size, xs.size(1))
pos_emb = self.embed.position_encoding(
offset=offset - cache_size, size=xs.size(1))
if required_cache_size < 0:
next_cache_start = 0
elif required_cache_size == 0:
@ -222,20 +227,17 @@ class BaseEncoder(nn.Layer):
else:
next_cache_start = xs.size(1) - required_cache_size
r_subsampling_cache = xs[:, next_cache_start:, :]
# Real mask for transformer/conformer layers
masks = torch.ones(1, xs.size(1), device=xs.device, dtype=torch.bool)
masks = masks.unsqueeze(1)
masks = paddle.ones([1, xs.size(1)], dtype=paddle.bool)
masks = masks.unsqueeze(1) #[B=1, C=1, T]
r_elayers_output_cache = []
r_conformer_cnn_cache = []
for i, layer in enumerate(self.encoders):
if elayers_output_cache is None:
attn_cache = None
else:
attn_cache = elayers_output_cache[i]
if conformer_cnn_cache is None:
cnn_cache = None
else:
cnn_cache = conformer_cnn_cache[i]
attn_cache = None if elayers_output_cache is None else elayers_output_cache[
i]
cnn_cache = None if conformer_cnn_cache is None else conformer_cnn_cache[
i]
xs, _, new_cnn_cache = layer(
xs,
masks,
@ -252,10 +254,10 @@ class BaseEncoder(nn.Layer):
def forward_chunk_by_chunk(
self,
xs: torch.Tensor,
xs: paddle.Tensor,
decoding_chunk_size: int,
num_decoding_left_chunks: int=-1,
) -> Tuple[torch.Tensor, torch.Tensor]:
) -> Tuple[paddle.Tensor, paddle.Tensor]:
""" Forward input chunk by chunk with chunk_size like a streaming
fashion
Here we should pay special attention to computation cache in the
@ -277,24 +279,27 @@ class BaseEncoder(nn.Layer):
layers in subsampling, we need to rewrite it to make it work
with cache, which is not preferred.
Args:
xs (torch.Tensor): (1, max_len, dim)
chunk_size (int): decoding chunk size
xs (paddle.Tensor): (1, max_len, dim)
chunk_size (int): decoding chunk size.
num_left_chunks (int): decoding with num left chunks.
"""
assert decoding_chunk_size > 0
# The model is trained by static or dynamic chunk
assert self.static_chunk_size > 0 or self.use_dynamic_chunk
# feature stride and window for `subsampling` module
subsampling = self.embed.subsampling_rate
context = self.embed.right_context + 1 # Add current frame
stride = subsampling * decoding_chunk_size
decoding_window = (decoding_chunk_size - 1) * subsampling + context
num_frames = xs.size(1)
subsampling_cache: Optional[torch.Tensor] = None
elayers_output_cache: Optional[List[torch.Tensor]] = None
conformer_cnn_cache: Optional[List[torch.Tensor]] = None
required_cache_size = decoding_chunk_size * num_decoding_left_chunks
subsampling_cache: Optional[paddle.Tensor] = None
elayers_output_cache: Optional[List[paddle.Tensor]] = None
conformer_cnn_cache: Optional[List[paddle.Tensor]] = None
outputs = []
offset = 0
required_cache_size = decoding_chunk_size * num_decoding_left_chunks
# Feed forward overlap input step by step
for cur in range(0, num_frames - context + 1, stride):
end = min(cur + decoding_window, num_frames)
@ -305,8 +310,9 @@ class BaseEncoder(nn.Layer):
elayers_output_cache, conformer_cnn_cache)
outputs.append(y)
offset += y.size(1)
ys = torch.cat(outputs, 1)
masks = torch.ones(1, ys.size(1), device=ys.device, dtype=torch.bool)
ys = paddle.cat(outputs, 1)
# fake mask, just for jit script and compatibility with `forward` api
masks = paddle.ones([1, ys.size(1)], dtype=paddle.bool)
masks = masks.unsqueeze(1)
return ys, masks
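
A worked example of the window arithmetic driving this loop, assuming the 4x conv subsampling front-end (`subsampling_rate=4`, `right_context=6` are the wenet-style `Conv2dSubsampling4` constants, an assumption here):

```python
subsampling, right_context = 4, 6
decoding_chunk_size = 16
context = right_context + 1                    # 7 input frames feed one output step
stride = subsampling * decoding_chunk_size     # 64: hop between chunk starts
decoding_window = (decoding_chunk_size - 1) * subsampling + context  # 67 frames per chunk
# each call to forward_chunk sees 67 input frames and advances the input by 64
```
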
@ -330,7 +336,7 @@ class TransformerEncoder(BaseEncoder):
concat_after: bool=False,
static_chunk_size: int=0,
use_dynamic_chunk: bool=False,
global_cmvn: torch.nn.Module=None,
global_cmvn: nn.Layer=None,
use_dynamic_left_chunk: bool=False, ):
""" Construct TransformerEncoder
See Encoder for the meaning of each parameter.
@ -342,14 +348,16 @@ class TransformerEncoder(BaseEncoder):
pos_enc_layer_type, normalize_before, concat_after,
static_chunk_size, use_dynamic_chunk, global_cmvn,
use_dynamic_left_chunk)
self.encoders = torch.nn.ModuleList([
self.encoders = nn.ModuleList([
TransformerEncoderLayer(
output_size,
MultiHeadedAttention(attention_heads, output_size,
size=output_size,
self_attn=MultiHeadedAttention(attention_heads, output_size,
attention_dropout_rate),
PositionwiseFeedForward(output_size, linear_units,
dropout_rate), dropout_rate,
normalize_before, concat_after) for _ in range(num_blocks)
feed_forward=PositionwiseFeedForward(output_size, linear_units,
dropout_rate),
dropout_rate=dropout_rate,
normalize_before=normalize_before,
concat_after=concat_after) for _ in range(num_blocks)
])
@ -396,6 +404,7 @@ class ConformerEncoder(BaseEncoder):
use_cnn_module (bool): Whether to use convolution module.
cnn_module_kernel (int): Kernel size of convolution module.
causal (bool): whether to use causal convolution or not.
cnn_module_norm (str): norm type of the conv module, one of ['batch_norm', 'layer_norm']
"""
assert check_argument_types()
super().__init__(input_size, output_size, attention_heads, linear_units,
@ -409,26 +418,26 @@ class ConformerEncoder(BaseEncoder):
# self-attention module definition
encoder_selfattn_layer = RelPositionMultiHeadedAttention
encoder_selfattn_layer_args = (attention_heads, output_size,
attention_dropout_rate, )
attention_dropout_rate)
# feed-forward module definition
positionwise_layer = PositionwiseFeedForward
positionwise_layer_args = (output_size, linear_units, dropout_rate,
activation, )
activation)
# convolution module definition
convolution_layer = ConvolutionModule
convolution_layer_args = (output_size, cnn_module_kernel, activation,
cnn_module_norm, causal)
self.encoders = torch.nn.ModuleList([
self.encoders = nn.ModuleList([
ConformerEncoderLayer(
output_size,
encoder_selfattn_layer(*encoder_selfattn_layer_args),
positionwise_layer(*positionwise_layer_args),
positionwise_layer(*positionwise_layer_args)
if macaron_style else None,
convolution_layer(*convolution_layer_args)
size=output_size,
self_attn=encoder_selfattn_layer(*encoder_selfattn_layer_args),
feed_forward=positionwise_layer(*positionwise_layer_args),
feed_forward_macaron=positionwise_layer(
*positionwise_layer_args) if macaron_style else None,
conv_module=convolution_layer(*convolution_layer_args)
if use_cnn_module else None,
dropout_rate,
normalize_before,
concat_after, ) for _ in range(num_blocks)
dropout_rate=dropout_rate,
normalize_before=normalize_before,
concat_after=concat_after) for _ in range(num_blocks)
])

@ -72,6 +72,7 @@ class TransformerEncoderLayer(nn.Layer):
x: paddle.Tensor,
mask: paddle.Tensor,
pos_emb: paddle.Tensor,
mask_pad: Optional[paddle.Tensor]=None,
output_cache: Optional[paddle.Tensor]=None,
cnn_cache: Optional[paddle.Tensor]=None,
) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]:
@ -81,6 +82,8 @@ class TransformerEncoderLayer(nn.Layer):
mask (paddle.Tensor): Mask tensor for the input (#batch, time).
pos_emb (paddle.Tensor): just for interface compatibility
to ConformerEncoderLayer
mask_pad (paddle.Tensor): not used in the transformer layer;
kept for a unified api with the conformer layer.
output_cache (paddle.Tensor): Cache tensor of the output
(#batch, time2, size), time2 < time in x.
cnn_cache (paddle.Tensor): not used here, it's for interface
@ -88,6 +91,7 @@ class TransformerEncoderLayer(nn.Layer):
Returns:
paddle.Tensor: Output tensor (#batch, time, size).
paddle.Tensor: Mask tensor (#batch, time).
paddle.Tensor: Fake cnn cache tensor for api compatibility with Conformer (#batch, channels, time').
"""
residual = x
if self.normalize_before:
@ -202,12 +206,13 @@ class ConformerEncoderLayer(nn.Layer):
pos_emb (paddle.Tensor): positional encoding, must not be None
for ConformerEncoderLayer.
mask_pad (paddle.Tensor): batch padding mask used for conv module, (B, 1, T).
output_cache (paddle.Tensor): Cache tensor of the output
output_cache (paddle.Tensor): Cache tensor of the encoder output
(#batch, time2, size), time2 < time in x.
cnn_cache (paddle.Tensor): Convolution cache in conformer layer
Returns:
paddle.Tensor: Output tensor (#batch, time, size).
paddle.Tensor: Mask tensor (#batch, time).
paddle.Tensor: New cnn cache tensor (#batch, channels, time').
"""
# whether to use macaron style FFN
if self.feed_forward_macaron is not None:

@ -62,7 +62,7 @@ def subsequent_mask(
Args:
size (int): size of mask
Returns:
paddle.Tensor: mask
paddle.Tensor: mask, [size, size]
Examples:
>>> subsequent_mask(3)
[[1, 0, 0],
@ -86,7 +86,7 @@ def subsequent_chunk_mask(
<0: use full chunk
>=0: use num_left_chunks
Returns:
paddle.Tensor: mask
paddle.Tensor: mask, [size, size]
Examples:
>>> subsequent_chunk_mask(4, 2)
[[1, 1, 0, 0],
@ -99,8 +99,8 @@ def subsequent_chunk_mask(
if num_left_chunks < 0:
start = 0
else:
start = max((i // chunk_size - num_left_chunks) * chunk_size, 0)
ending = min((i // chunk_size + 1) * chunk_size, size)
start = max(0, (i // chunk_size - num_left_chunks) * chunk_size)
ending = min(size, (i // chunk_size + 1) * chunk_size)
ret[i, start:ending] = True
return ret

@ -41,7 +41,7 @@ class RNNCell(nn.RNNCellBase):
"""
def __init__(self,
hidden_size,
hidden_size: int,
activation="tanh",
weight_ih_attr=None,
weight_hh_attr=None,
@ -108,8 +108,8 @@ class GRUCell(nn.RNNCellBase):
"""
def __init__(self,
input_size,
hidden_size,
input_size: int,
hidden_size: int,
weight_ih_attr=None,
weight_hh_attr=None,
bias_ih_attr=None,
@ -132,7 +132,6 @@ class GRUCell(nn.RNNCellBase):
self.input_size = input_size
self._gate_activation = F.sigmoid
self._activation = paddle.tanh
#self._activation = F.relu
def forward(self, inputs, states=None):
if states is None:
@ -171,8 +170,6 @@ class BiRNNWithBN(nn.Layer):
"""Bidirectonal simple rnn layer with sequence-wise batch normalization.
The batch normalization is only performed on input-state weights.
:param name: Name of the layer parameters.
:type name: string
:param size: Dimension of RNN cells.
:type size: int
:param share_weights: Whether to share input-hidden weights between
@ -182,7 +179,7 @@ class BiRNNWithBN(nn.Layer):
:rtype: Variable
"""
def __init__(self, i_size, h_size, share_weights):
def __init__(self, i_size: int, h_size: int, share_weights: bool):
super().__init__()
self.share_weights = share_weights
if self.share_weights:
@ -208,7 +205,7 @@ class BiRNNWithBN(nn.Layer):
self.bw_rnn = nn.RNN(
self.fw_cell, is_reverse=True, time_major=False) #[B, T, D]
def forward(self, x, x_len):
def forward(self, x: paddle.Tensor, x_len: paddle.Tensor):
# x, shape [B, T, D]
fw_x = self.fw_bn(self.fw_fc(x))
bw_x = self.bw_bn(self.bw_fc(x))
@ -234,7 +231,7 @@ class BiGRUWithBN(nn.Layer):
:rtype: Variable
"""
def __init__(self, i_size, h_size, act):
def __init__(self, i_size: int, h_size: int):
super().__init__()
hidden_size = h_size * 3
@ -281,23 +278,29 @@ class RNNStack(nn.Layer):
:rtype: Variable
"""
def __init__(self, i_size, h_size, num_stacks, use_gru, share_rnn_weights):
def __init__(self,
i_size: int,
h_size: int,
num_stacks: int,
use_gru: bool,
share_rnn_weights: bool):
super().__init__()
self.rnn_stacks = nn.LayerList()
rnn_stacks = []
for i in range(num_stacks):
if use_gru:
#default:GRU using tanh
self.rnn_stacks.append(
BiGRUWithBN(i_size=i_size, h_size=h_size, act="relu"))
rnn_stacks.append(BiGRUWithBN(i_size=i_size, h_size=h_size))
else:
self.rnn_stacks.append(
rnn_stacks.append(
BiRNNWithBN(
i_size=i_size,
h_size=h_size,
share_weights=share_rnn_weights))
i_size = h_size * 2
def forward(self, x, x_len):
self.rnn_stacks = nn.Sequential(*rnn_stacks)
def forward(self, x: paddle.Tensor, x_len: paddle.Tensor):
"""
x: shape [B, T, D]
x_len: shape [B]

@ -32,10 +32,12 @@ __all__ = [
class BaseSubsampling(nn.Layer):
def __init__(self, pos_enc_class: PositionalEncoding):
def __init__(self, pos_enc_class: nn.Layer=PositionalEncoding):
super().__init__()
self.pos_enc = pos_enc_class
# window size = (1 + right_context) + (chunk_size -1) * subsampling_rate
self.right_context = 0
# stride = chunk_size * subsampling_rate
self.subsampling_rate = 1
def position_encoding(self, offset: int, size: int) -> paddle.Tensor:
@ -49,7 +51,7 @@ class LinearNoSubsampling(BaseSubsampling):
idim: int,
odim: int,
dropout_rate: float,
pos_enc_class: PositionalEncoding):
pos_enc_class: nn.Layer=PositionalEncoding):
"""Construct an linear object.
Args:
idim (int): Input dimension.
@ -71,6 +73,7 @@ class LinearNoSubsampling(BaseSubsampling):
Args:
x (paddle.Tensor): Input tensor (#batch, time, idim).
x_mask (paddle.Tensor): Input mask (#batch, 1, time).
offset (int): position encoding offset.
Returns:
paddle.Tensor: linear input tensor (#batch, time', odim),
where time' = time.
@ -90,7 +93,7 @@ class Conv2dSubsampling4(BaseSubsampling):
idim: int,
odim: int,
dropout_rate: float,
pos_enc_class: PositionalEncoding):
pos_enc_class: nn.Layer=PositionalEncoding):
"""Construct an Conv2dSubsampling4 object.
Args:
@ -117,6 +120,7 @@ class Conv2dSubsampling4(BaseSubsampling):
Args:
x (paddle.Tensor): Input tensor (#batch, time, idim).
x_mask (paddle.Tensor): Input mask (#batch, 1, time).
offset (int): position encoding offset.
Returns:
paddle.Tensor: Subsampled tensor (#batch, time', odim),
where time' = time // 4.
@ -139,7 +143,7 @@ class Conv2dSubsampling6(BaseSubsampling):
idim: int,
odim: int,
dropout_rate: float,
pos_enc_class: PositionalEncoding):
pos_enc_class: nn.Layer=PositionalEncoding):
"""Construct an Conv2dSubsampling6 object.
Args:
@ -169,6 +173,7 @@ class Conv2dSubsampling6(BaseSubsampling):
Args:
x (paddle.Tensor): Input tensor (#batch, time, idim).
x_mask (paddle.Tensor): Input mask (#batch, 1, time).
offset (int): position encoding offset.
Returns:
paddle.Tensor: Subsampled tensor (#batch, time', odim),
where time' = time // 6.
@ -191,7 +196,7 @@ class Conv2dSubsampling8(BaseSubsampling):
idim: int,
odim: int,
dropout_rate: float,
pos_enc_class: PositionalEncoding):
pos_enc_class: nn.Layer=PositionalEncoding):
"""Construct an Conv2dSubsampling8 object.
Args:
@ -221,6 +226,7 @@ class Conv2dSubsampling8(BaseSubsampling):
Args:
x (paddle.Tensor): Input tensor (#batch, time, idim).
x_mask (paddle.Tensor): Input mask (#batch, 1, time).
offset (int): position encoding offset.
Returns:
paddle.Tensor: Subsampled tensor (#batch, time', odim),
where time' = time // 8.

@ -43,7 +43,7 @@ bash setup.sh
source tools/venv/bin/activate
```
## Running in Docker Container
## Running in Docker Container (optional)
Docker is an open source tool to build, ship, and run distributed applications in an isolated environment. A Docker image for this project has been provided in [hub.docker.com](https://hub.docker.com) with all the dependencies installed, including the pre-built PaddlePaddle, CTC decoders, and other necessary Python and third-party packages. This Docker image requires NVIDIA GPU support, so please make sure it is available and that [nvidia-docker](https://github.com/NVIDIA/nvidia-docker) has been installed.

@ -0,0 +1,3 @@
# Reference
* [wenet](https://github.com/mobvoi/wenet)