add test; attention

pull/556/head
Hui Zhang 5 years ago
parent bc6da7a123
commit 16fa4245ec

@@ -0,0 +1,290 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "breeding-haven",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"/home/ssd5/zhanghui/DeepSpeech2.x\n"
]
},
{
"data": {
"text/plain": [
"'/home/ssd5/zhanghui/DeepSpeech2.x'"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%cd ..\n",
"%pwd"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "appropriate-theta",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"LICENSE deepspeech examples\t\t requirements.txt tools\r\n",
"README.md docs\t libsndfile-1.0.28\t setup.sh\t utils\r\n",
"README_cn.md env.sh\t libsndfile-1.0.28.tar.gz tests\r\n"
]
}
],
"source": [
"!ls"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "entire-bloom",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/ssd5/zhanghui/DeepSpeech2.x/tools/venv/lib/python3.7/site-packages/paddle/fluid/layers/utils.py:26: DeprecationWarning: `np.int` is a deprecated alias for the builtin `int`. To silence this warning, use `int` by itself. Doing this will not modify any behavior and is safe. When replacing `np.int`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.\n",
"Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
" def convert_to_list(value, n, name, dtype=np.int):\n",
"WARNING:root:override cat of paddle.Tensor if exists or register, remove this when fixed!\n",
"WARNING:root:register user masked_fill to paddle.Tensor, remove this when fixed!\n",
"WARNING:root:register user masked_fill_ to paddle.Tensor, remove this when fixed!\n",
"WARNING:root:register user repeat to paddle.Tensor, remove this when fixed!\n",
"WARNING:root:register user glu to paddle.nn.functional, remove this when fixed!\n",
"WARNING:root:register user GLU to paddle.nn, remove this when fixed!\n",
"WARNING:root:register user ConstantPad2d to paddle.nn, remove this when fixed!\n",
"WARNING:root:override ctc_loss of paddle.nn.functional if exists, remove this when fixed!\n"
]
}
],
"source": [
"from deepspeech.modules import loss"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "governmental-aircraft",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/ssd5/zhanghui/DeepSpeech2.x/tools/venv/lib/python3.7/site-packages/ipykernel/ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.\n",
" and should_run_async(code)\n"
]
}
],
"source": [
"import paddle"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "proprietary-disaster",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<function deepspeech.modules.repeat(xs: paddle.VarBase, *size: Any) -> paddle.VarBase>"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"paddle.Tensor.repeat"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "first-diagram",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<property at 0x7fb515eeeb88>"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"paddle.Tensor.size"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "intelligent-david",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<function paddle.tensor.manipulation.concat(x, axis=0, name=None)>"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"paddle.Tensor.cat"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "bronze-tenant",
"metadata": {},
"outputs": [],
"source": [
"a = paddle.to_tensor([12,32, 10, 12, 123,32 ,4])"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "balanced-bearing",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"7"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"a.size"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "extreme-republic",
"metadata": {},
"outputs": [],
"source": [
"def size(xs: paddle.Tensor, *args: int) -> paddle.Tensor:\n",
" nargs = len(args)\n",
" assert (nargs <= 1)\n",
" s = paddle.shape(xs)\n",
" if nargs == 1:\n",
" return s[args[0]]\n",
" else:\n",
" return s\n",
"\n",
"# logger.warn(\n",
"# \"override size of paddle.Tensor if exists or register, remove this when fixed!\"\n",
"# )\n",
"paddle.Tensor.size = size"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "gross-addiction",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Tensor(shape=[1], dtype=int32, place=CPUPlace, stop_gradient=True,\n",
" [7])"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"a.size(0)\n",
"a.size()"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "adverse-dining",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Tensor(shape=[1], dtype=int32, place=CPUPlace, stop_gradient=True,\n",
" [7])"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"a.size()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "popular-potato",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.0"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

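As a standalone reference, the override this notebook exercises can be reproduced outside Jupyter. A minimal sketch, assuming only an installed `paddle` (the `size` helper mirrors the notebook cell above):

```python
# Minimal sketch of the notebook's experiment: make paddle.Tensor.size
# behave like torch's Tensor.size(dim) instead of an element count.
import paddle


def size(xs: paddle.Tensor, *args: int) -> paddle.Tensor:
    """Return the full shape, or the extent of one dimension."""
    nargs = len(args)
    assert nargs <= 1
    s = paddle.shape(xs)
    if nargs == 1:
        return s[args[0]]
    return s


# Note: this shadows paddle's built-in `size` property, as the notebook does.
paddle.Tensor.size = size

a = paddle.to_tensor([12, 32, 10, 12, 123, 32, 4])
print(a.size(0))  # tensor holding 7, the length of dim 0
print(a.size())   # tensor holding the full shape, [7]
```
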
@@ -0,0 +1,616 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"/home/ssd5/zhanghui/DeepSpeech2.x\n"
]
},
{
"data": {
"text/plain": [
"'/home/ssd5/zhanghui/DeepSpeech2.x'"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%cd ..\n",
"%pwd"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2021-03-17 11:09:34,972 - WARNING - override cat of paddle.Tensor if exists or register, remove this when fixed!\n",
"2021-03-17 11:09:34,973 - WARNING - override size of paddle.Tensor if exists or register (`to_static` do not process `size` property, maybe some `paddle` api dependent on it), remove this when fixed!\n",
"2021-03-17 11:09:34,974 - WARNING - register user masked_fill to paddle.Tensor, remove this when fixed!\n",
"2021-03-17 11:09:34,975 - WARNING - register user masked_fill_ to paddle.Tensor, remove this when fixed!\n",
"2021-03-17 11:09:34,975 - WARNING - register user repeat to paddle.Tensor, remove this when fixed!\n",
"2021-03-17 11:09:34,976 - WARNING - register user glu to paddle.nn.functional, remove this when fixed!\n",
"2021-03-17 11:09:34,976 - WARNING - register user GLU to paddle.nn, remove this when fixed!\n",
"2021-03-17 11:09:34,977 - WARNING - register user ConstantPad2d to paddle.nn, remove this when fixed!\n",
"2021-03-17 11:09:34,977 - WARNING - override ctc_loss of paddle.nn.functional if exists, remove this when fixed!\n",
"/home/ssd5/zhanghui/DeepSpeech2.x/tools/venv-dev/lib/python3.7/site-packages/scipy/fftpack/__init__.py:103: DeprecationWarning: The module numpy.dual is deprecated. Instead of using dual, use the functions directly from numpy or scipy.\n",
" from numpy.dual import register_func\n",
"/home/ssd5/zhanghui/DeepSpeech2.x/tools/venv-dev/lib/python3.7/site-packages/scipy/special/orthogonal.py:81: DeprecationWarning: `np.int` is a deprecated alias for the builtin `int`. To silence this warning, use `int` by itself. Doing this will not modify any behavior and is safe. When replacing `np.int`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.\n",
"Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
" from numpy import (exp, inf, pi, sqrt, floor, sin, cos, around, int,\n"
]
}
],
"source": [
"import os\n",
"import time\n",
"import argparse\n",
"import functools\n",
"import paddle\n",
"import numpy as np\n",
"\n",
"from deepspeech.utils.socket_server import warm_up_test\n",
"from deepspeech.utils.socket_server import AsrTCPServer\n",
"from deepspeech.utils.socket_server import AsrRequestHandler\n",
"\n",
"from deepspeech.training.cli import default_argument_parser\n",
"from deepspeech.exps.deepspeech2.config import get_cfg_defaults\n",
"\n",
"from deepspeech.frontend.utility import read_manifest\n",
"from deepspeech.utils.utility import add_arguments, print_arguments\n",
"\n",
"from deepspeech.models.deepspeech2 import DeepSpeech2Model\n",
"from deepspeech.models.deepspeech2 import DeepSpeech2InferModel\n",
"from deepspeech.io.dataset import ManifestDataset\n",
"\n",
"\n",
"from paddle.inference import Config\n",
"from paddle.inference import create_predictor\n",
"\n",
"from deepspeech.frontend.utility import read_manifest"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.0.0\n",
"<module 'paddle.version' from '/home/ssd5/zhanghui/DeepSpeech2.x/tools/venv-dev/lib/python3.7/site-packages/paddlepaddle_gpu-0.0.0-py3.7-linux-x86_64.egg/paddle/version.py'>\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/ssd5/zhanghui/DeepSpeech2.x/tools/venv-dev/lib/python3.7/site-packages/ipykernel/ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.\n",
" and should_run_async(code)\n"
]
}
],
"source": [
"print(paddle.__version__)\n",
"print(paddle.version)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"data:\n",
" augmentation_config: conf/augmentation.config\n",
" batch_size: 64\n",
" dev_manifest: data/manifest.dev\n",
" keep_transcription_text: False\n",
" max_duration: 27.0\n",
" max_freq: None\n",
" mean_std_filepath: examples/aishell/data/mean_std.npz\n",
" min_duration: 0.0\n",
" n_fft: None\n",
" num_workers: 0\n",
" random_seed: 0\n",
" shuffle_method: batch_shuffle\n",
" sortagrad: True\n",
" specgram_type: linear\n",
" stride_ms: 10.0\n",
" target_dB: -20\n",
" target_sample_rate: 16000\n",
" test_manifest: examples/aishell/data/manifest.test\n",
" train_manifest: data/manifest.train\n",
" use_dB_normalization: True\n",
" vocab_filepath: examples/aishell/data/vocab.txt\n",
" window_ms: 20.0\n",
"decoding:\n",
" alpha: 2.6\n",
" batch_size: 128\n",
" beam_size: 300\n",
" beta: 5.0\n",
" cutoff_prob: 0.99\n",
" cutoff_top_n: 40\n",
" decoding_method: ctc_beam_search\n",
" error_rate_type: cer\n",
" lang_model_path: data/lm/zh_giga.no_cna_cmn.prune01244.klm\n",
" num_proc_bsearch: 10\n",
"model:\n",
" num_conv_layers: 2\n",
" num_rnn_layers: 3\n",
" rnn_layer_size: 1024\n",
" share_rnn_weights: False\n",
" use_gru: True\n",
"training:\n",
" global_grad_clip: 5.0\n",
" lr: 0.0005\n",
" lr_decay: 0.83\n",
" n_epoch: 30\n",
" weight_decay: 1e-06\n",
"----------- Configuration Arguments -----------\n",
"checkpoint_path: examples/aishell/ckpt/checkpoints/step-1876\n",
"config: examples/aishell/conf/deepspeech2.yaml\n",
"device: gpu\n",
"dump_config: None\n",
"export_path: None\n",
"host_ip: localhost\n",
"host_port: 8086\n",
"model_dir: None\n",
"model_file: examples/aishell/jit.model.pdmodel\n",
"nprocs: 1\n",
"opts: ['data.test_manifest', 'examples/aishell/data/manifest.test', 'data.mean_std_filepath', 'examples/aishell/data/mean_std.npz', 'data.vocab_filepath', 'examples/aishell/data/vocab.txt']\n",
"output: None\n",
"params_file: examples/aishell/jit.model.pdiparams\n",
"speech_save_dir: demo_cache\n",
"use_gpu: True\n",
"warmup_manifest: examples/aishell/data/manifest.test\n",
"------------------------------------------------\n"
]
}
],
"source": [
"parser = default_argument_parser()\n",
"add_arg = functools.partial(add_arguments, argparser=parser)\n",
"add_arg('host_ip', str,\n",
" 'localhost',\n",
" \"Server's IP address.\")\n",
"add_arg('host_port', int, 8086, \"Server's IP port.\")\n",
"add_arg('speech_save_dir', str,\n",
" 'demo_cache',\n",
" \"Directory to save demo audios.\")\n",
"add_arg('warmup_manifest', str, \"examples/aishell/data/manifest.test\", \"Filepath of manifest to warm up.\")\n",
"add_arg(\n",
" \"--model_file\",\n",
" type=str,\n",
" default=\"examples/aishell/jit.model.pdmodel\",\n",
" help=\"Model filename, Specify this when your model is a combined model.\"\n",
")\n",
"add_arg(\n",
" \"--params_file\",\n",
" type=str,\n",
" default=\"examples/aishell/jit.model.pdiparams\",\n",
" help=\n",
" \"Parameter filename, Specify this when your model is a combined model.\"\n",
")\n",
"add_arg(\n",
" \"--model_dir\",\n",
" type=str,\n",
" default=None,\n",
" help=\n",
" \"Model dir, If you load a non-combined model, specify the directory of the model.\"\n",
")\n",
"add_arg(\"--use_gpu\",type=bool,default=True, help=\"Whether use gpu.\")\n",
"args = parser.parse_args(\"--checkpoint_path examples/aishell/ckpt/checkpoints/step-1876 --config examples/aishell/conf/deepspeech2.yaml --opts data.test_manifest examples/aishell/data/manifest.test data.mean_std_filepath examples/aishell/data/mean_std.npz data.vocab_filepath examples/aishell/data/vocab.txt\".split())\n",
"\n",
"\n",
"config = get_cfg_defaults()\n",
"if args.config:\n",
" config.merge_from_file(args.config)\n",
"if args.opts:\n",
" config.merge_from_list(args.opts)\n",
"config.freeze()\n",
"print(config)\n",
"\n",
"args.warmup_manifest = config.data.test_manifest\n",
"print_arguments(args)\n",
"\n",
"if args.dump_config:\n",
" with open(args.dump_config, 'w') as f:\n",
" print(config, file=f)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"dataset = ManifestDataset(\n",
" config.data.test_manifest,\n",
" config.data.vocab_filepath,\n",
" config.data.mean_std_filepath,\n",
" augmentation_config=\"{}\",\n",
" max_duration=config.data.max_duration,\n",
" min_duration=config.data.min_duration,\n",
" stride_ms=config.data.stride_ms,\n",
" window_ms=config.data.window_ms,\n",
" n_fft=config.data.n_fft,\n",
" max_freq=config.data.max_freq,\n",
" target_sample_rate=config.data.target_sample_rate,\n",
" specgram_type=config.data.specgram_type,\n",
" use_dB_normalization=config.data.use_dB_normalization,\n",
" target_dB=config.data.target_dB,\n",
" random_seed=config.data.random_seed,\n",
" keep_transcription_text=True)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/ssd5/zhanghui/DeepSpeech2.x/tools/venv-dev/lib/python3.7/site-packages/paddlepaddle_gpu-0.0.0-py3.7-linux-x86_64.egg/paddle/fluid/dygraph/layers.py:1303: UserWarning: Skip loading for decoder.ctc_lo.weight. decoder.ctc_lo.weight is not found in the provided dict.\n",
" warnings.warn((\"Skip loading for {}. \".format(key) + str(err)))\n",
"/home/ssd5/zhanghui/DeepSpeech2.x/tools/venv-dev/lib/python3.7/site-packages/paddlepaddle_gpu-0.0.0-py3.7-linux-x86_64.egg/paddle/fluid/dygraph/layers.py:1303: UserWarning: Skip loading for decoder.ctc_lo.bias. decoder.ctc_lo.bias is not found in the provided dict.\n",
" warnings.warn((\"Skip loading for {}. \".format(key) + str(err)))\n",
"2021-03-17 11:10:00,017 - INFO - [checkpoint] Rank 0: loaded model from examples/aishell/ckpt/checkpoints/step-1876.pdparams\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"layer summary:\n",
"encoder.conv.conv_in.conv.weight|[32, 1, 41, 11]|14432\n",
"encoder.conv.conv_in.bn.weight|[32]|32\n",
"encoder.conv.conv_in.bn.bias|[32]|32\n",
"encoder.conv.conv_in.bn._mean|[32]|32\n",
"encoder.conv.conv_in.bn._variance|[32]|32\n",
"encoder.conv.conv_stack.0.conv.weight|[32, 32, 21, 11]|236544\n",
"encoder.conv.conv_stack.0.bn.weight|[32]|32\n",
"encoder.conv.conv_stack.0.bn.bias|[32]|32\n",
"encoder.conv.conv_stack.0.bn._mean|[32]|32\n",
"encoder.conv.conv_stack.0.bn._variance|[32]|32\n",
"encoder.rnn.rnn_stacks.0.fw_fc.weight|[1312, 3072]|4030464\n",
"encoder.rnn.rnn_stacks.0.fw_bn.weight|[3072]|3072\n",
"encoder.rnn.rnn_stacks.0.fw_bn.bias|[3072]|3072\n",
"encoder.rnn.rnn_stacks.0.fw_bn._mean|[3072]|3072\n",
"encoder.rnn.rnn_stacks.0.fw_bn._variance|[3072]|3072\n",
"encoder.rnn.rnn_stacks.0.bw_fc.weight|[1312, 3072]|4030464\n",
"encoder.rnn.rnn_stacks.0.bw_bn.weight|[3072]|3072\n",
"encoder.rnn.rnn_stacks.0.bw_bn.bias|[3072]|3072\n",
"encoder.rnn.rnn_stacks.0.bw_bn._mean|[3072]|3072\n",
"encoder.rnn.rnn_stacks.0.bw_bn._variance|[3072]|3072\n",
"encoder.rnn.rnn_stacks.0.fw_cell.weight_hh|[3072, 1024]|3145728\n",
"encoder.rnn.rnn_stacks.0.fw_cell.bias_hh|[3072]|3072\n",
"encoder.rnn.rnn_stacks.0.bw_cell.weight_hh|[3072, 1024]|3145728\n",
"encoder.rnn.rnn_stacks.0.bw_cell.bias_hh|[3072]|3072\n",
"encoder.rnn.rnn_stacks.0.fw_rnn.cell.weight_hh|[3072, 1024]|3145728\n",
"encoder.rnn.rnn_stacks.0.fw_rnn.cell.bias_hh|[3072]|3072\n",
"encoder.rnn.rnn_stacks.0.bw_rnn.cell.weight_hh|[3072, 1024]|3145728\n",
"encoder.rnn.rnn_stacks.0.bw_rnn.cell.bias_hh|[3072]|3072\n",
"encoder.rnn.rnn_stacks.1.fw_fc.weight|[2048, 3072]|6291456\n",
"encoder.rnn.rnn_stacks.1.fw_bn.weight|[3072]|3072\n",
"encoder.rnn.rnn_stacks.1.fw_bn.bias|[3072]|3072\n",
"encoder.rnn.rnn_stacks.1.fw_bn._mean|[3072]|3072\n",
"encoder.rnn.rnn_stacks.1.fw_bn._variance|[3072]|3072\n",
"encoder.rnn.rnn_stacks.1.bw_fc.weight|[2048, 3072]|6291456\n",
"encoder.rnn.rnn_stacks.1.bw_bn.weight|[3072]|3072\n",
"encoder.rnn.rnn_stacks.1.bw_bn.bias|[3072]|3072\n",
"encoder.rnn.rnn_stacks.1.bw_bn._mean|[3072]|3072\n",
"encoder.rnn.rnn_stacks.1.bw_bn._variance|[3072]|3072\n",
"encoder.rnn.rnn_stacks.1.fw_cell.weight_hh|[3072, 1024]|3145728\n",
"encoder.rnn.rnn_stacks.1.fw_cell.bias_hh|[3072]|3072\n",
"encoder.rnn.rnn_stacks.1.bw_cell.weight_hh|[3072, 1024]|3145728\n",
"encoder.rnn.rnn_stacks.1.bw_cell.bias_hh|[3072]|3072\n",
"encoder.rnn.rnn_stacks.1.fw_rnn.cell.weight_hh|[3072, 1024]|3145728\n",
"encoder.rnn.rnn_stacks.1.fw_rnn.cell.bias_hh|[3072]|3072\n",
"encoder.rnn.rnn_stacks.1.bw_rnn.cell.weight_hh|[3072, 1024]|3145728\n",
"encoder.rnn.rnn_stacks.1.bw_rnn.cell.bias_hh|[3072]|3072\n",
"encoder.rnn.rnn_stacks.2.fw_fc.weight|[2048, 3072]|6291456\n",
"encoder.rnn.rnn_stacks.2.fw_bn.weight|[3072]|3072\n",
"encoder.rnn.rnn_stacks.2.fw_bn.bias|[3072]|3072\n",
"encoder.rnn.rnn_stacks.2.fw_bn._mean|[3072]|3072\n",
"encoder.rnn.rnn_stacks.2.fw_bn._variance|[3072]|3072\n",
"encoder.rnn.rnn_stacks.2.bw_fc.weight|[2048, 3072]|6291456\n",
"encoder.rnn.rnn_stacks.2.bw_bn.weight|[3072]|3072\n",
"encoder.rnn.rnn_stacks.2.bw_bn.bias|[3072]|3072\n",
"encoder.rnn.rnn_stacks.2.bw_bn._mean|[3072]|3072\n",
"encoder.rnn.rnn_stacks.2.bw_bn._variance|[3072]|3072\n",
"encoder.rnn.rnn_stacks.2.fw_cell.weight_hh|[3072, 1024]|3145728\n",
"encoder.rnn.rnn_stacks.2.fw_cell.bias_hh|[3072]|3072\n",
"encoder.rnn.rnn_stacks.2.bw_cell.weight_hh|[3072, 1024]|3145728\n",
"encoder.rnn.rnn_stacks.2.bw_cell.bias_hh|[3072]|3072\n",
"encoder.rnn.rnn_stacks.2.fw_rnn.cell.weight_hh|[3072, 1024]|3145728\n",
"encoder.rnn.rnn_stacks.2.fw_rnn.cell.bias_hh|[3072]|3072\n",
"encoder.rnn.rnn_stacks.2.bw_rnn.cell.weight_hh|[3072, 1024]|3145728\n",
"encoder.rnn.rnn_stacks.2.bw_rnn.cell.bias_hh|[3072]|3072\n",
"decoder.ctc_lo.weight|[2048, 4300]|8806400\n",
"decoder.ctc_lo.bias|[4300]|4300\n",
"layer has 66 parameters, 80148012 elements.\n"
]
}
],
"source": [
"model = DeepSpeech2InferModel.from_pretrained(dataset, config,\n",
" args.checkpoint_path)\n",
"model.eval()"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"def init_predictor(args):\n",
" if args.model_dir is not None:\n",
" config = Config(args.model_dir)\n",
" else:\n",
" config = Config(args.model_file, args.params_file)\n",
"\n",
" config.enable_memory_optim()\n",
" if args.use_gpu:\n",
" config.enable_use_gpu(memory_pool_init_size_mb=1000, device_id=0)\n",
" else:\n",
" # If not specific mkldnn, you can set the blas thread.\n",
" # The thread num should not be greater than the number of cores in the CPU.\n",
" config.set_cpu_math_library_num_threads(4)\n",
" #config.enable_mkldnn()\n",
" \n",
" config.switch_ir_optim(False)\n",
"\n",
" predictor = create_predictor(config)\n",
" return predictor\n",
"\n",
"def run(predictor, audio, audio_len):\n",
" # copy img data to input tensor\n",
" input_names = predictor.get_input_names()\n",
" for i, name in enumerate(input_names):\n",
" print(\"input:\", i, name)\n",
" \n",
" audio_tensor = predictor.get_input_handle('audio')\n",
" audio_tensor.reshape(audio.shape)\n",
" audio_tensor.copy_from_cpu(audio.copy())\n",
" \n",
" audiolen_tensor = predictor.get_input_handle('audio_len')\n",
" audiolen_tensor.reshape(audio_len.shape)\n",
" audiolen_tensor.copy_from_cpu(audio_len.copy())\n",
"\n",
" output_names = predictor.get_output_names()\n",
" for i, name in enumerate(output_names):\n",
" print(\"output:\", i, name)\n",
"\n",
" # do the inference\n",
" predictor.run()\n",
"\n",
" results = []\n",
" # get out data from output tensor\n",
" output_names = predictor.get_output_names()\n",
" for i, name in enumerate(output_names):\n",
" output_tensor = predictor.get_output_handle(name)\n",
" output_data = output_tensor.copy_to_cpu()\n",
" results.append(output_data)\n",
"\n",
" return results\n",
"\n",
"\n",
"predictor = init_predictor(args)\n",
"\n",
"def file_to_transcript(filename):\n",
" print(filename)\n",
" feature = dataset.process_utterance(filename, \"\")\n",
" audio = np.array([feature[0]]).astype('float32') #[1, D, T]\n",
" audio_len = feature[0].shape[1]\n",
" audio_len = np.array([audio_len]).astype('int64') # [1]\n",
" \n",
" \n",
" i_probs = run(predictor, audio, audio_len)\n",
" print('jit:', i_probs[0], type(i_probs[0]))\n",
" \n",
" audio = paddle.to_tensor(audio)\n",
" audio_len = paddle.to_tensor(audio_len)\n",
" print(audio.shape)\n",
" print(audio_len.shape)\n",
" \n",
" #eouts, eouts_len = model.encoder(audio, audio_len)\n",
" #probs = model.decoder.probs(eouts)\n",
" probs = model.forward(audio, audio_len)\n",
" print('paddle:', probs.numpy())\n",
" \n",
" flag = np.allclose(i_probs[0], probs.numpy())\n",
" print(flag)\n",
" \n",
" return probs\n",
"\n",
"# result_transcript = model.decode(\n",
"# audio,\n",
"# audio_len,\n",
"# vocab_list=dataset.vocab_list,\n",
"# decoding_method=config.decoding.decoding_method,\n",
"# lang_model_path=config.decoding.lang_model_path,\n",
"# beam_alpha=config.decoding.alpha,\n",
"# beam_beta=config.decoding.beta,\n",
"# beam_size=config.decoding.beam_size,\n",
"# cutoff_prob=config.decoding.cutoff_prob,\n",
"# cutoff_top_n=config.decoding.cutoff_top_n,\n",
"# num_processes=config.decoding.num_proc_bsearch)\n",
"# return result_transcript[0]"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Warm-up Test Case %d: %s 0 /home/ssd5/zhanghui/DeepSpeech2.x/examples/aishell/../..//examples/dataset/aishell/data_aishell/wav/test/S0764/BAC009S0764W0121.wav\n",
"/home/ssd5/zhanghui/DeepSpeech2.x/examples/aishell/../..//examples/dataset/aishell/data_aishell/wav/test/S0764/BAC009S0764W0121.wav\n",
"input: 0 audio\n",
"input: 1 audio_len\n",
"output: 0 tmp_75\n",
"jit: [[[1.40282078e-04 3.31296207e-04 5.57157793e-04 ... 1.07916087e-04\n",
" 8.73636964e-05 1.96113906e-04]\n",
" [1.38032061e-04 2.70526099e-04 4.53807996e-04 ... 1.02293277e-04\n",
" 8.40202629e-05 1.90612729e-04]\n",
" [1.38912103e-04 2.45687814e-04 3.99624696e-04 ... 9.70420660e-05\n",
" 7.88255784e-05 1.80084753e-04]\n",
" ...\n",
" [3.28999187e-04 2.59723864e-04 3.03535169e-04 ... 2.82066030e-04\n",
" 1.11002744e-04 1.27009131e-04]\n",
" [2.91427423e-04 2.20203598e-04 2.85082555e-04 ... 3.27318383e-04\n",
" 1.09202861e-04 1.17112293e-04]\n",
" [3.63971514e-04 1.47859042e-04 2.24457763e-04 ... 3.63016297e-04\n",
" 1.34765272e-04 1.61947115e-04]]] <class 'numpy.ndarray'>\n",
"[1, 161, 419]\n",
"[1]\n",
"paddle: [[[3.4913886e-04 2.5836096e-04 4.2449642e-04 ... 7.2210147e-05\n",
" 7.1211573e-05 2.0057644e-04]\n",
" [3.8406707e-04 2.4088801e-04 5.0910388e-04 ... 6.1701416e-05\n",
" 6.7852285e-05 2.3967208e-04]\n",
" [4.1069370e-04 2.5478008e-04 6.7985675e-04 ... 5.8369777e-05\n",
" 6.2065104e-05 2.5938542e-04]\n",
" ...\n",
" [6.6656910e-04 3.1835871e-04 7.5929717e-04 ... 1.1990797e-04\n",
" 3.7087579e-05 3.4520373e-04]\n",
" [4.7881933e-04 2.7979453e-04 6.7949941e-04 ... 1.2511105e-04\n",
" 4.5631223e-05 3.7984925e-04]\n",
" [2.8661705e-04 2.9201157e-04 4.5970027e-04 ... 1.4581002e-04\n",
" 7.8281126e-05 3.8263199e-04]]]\n",
"False\n"
]
}
],
"source": [
"manifest = read_manifest(args.warmup_manifest)\n",
"\n",
"for idx, sample in enumerate(manifest[:1]):\n",
" print(\"Warm-up Test Case %d: %s\", idx, sample['audio_filepath'])\n",
" start_time = time.time()\n",
" transcript = file_to_transcript(sample['audio_filepath'])\n",
" finish_time = time.time()\n",
"# print(\"Response Time: %f, Transcript: %s\" %\n",
"# (finish_time - start_time, transcript))\n",
" break"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(1, 161, 419) (1,)\n",
"input: 0 audio\n",
"input: 1 audio_len\n",
"output: 0 tmp_75\n",
"jit: [[[1.40282078e-04 3.31296207e-04 5.57157793e-04 ... 1.07916087e-04\n",
" 8.73636964e-05 1.96113906e-04]\n",
" [1.38032061e-04 2.70526099e-04 4.53807996e-04 ... 1.02293277e-04\n",
" 8.40202629e-05 1.90612729e-04]\n",
" [1.38912103e-04 2.45687814e-04 3.99624696e-04 ... 9.70420660e-05\n",
" 7.88255784e-05 1.80084753e-04]\n",
" ...\n",
" [3.28999187e-04 2.59723864e-04 3.03535169e-04 ... 2.82066030e-04\n",
" 1.11002744e-04 1.27009131e-04]\n",
" [2.91427423e-04 2.20203598e-04 2.85082555e-04 ... 3.27318383e-04\n",
" 1.09202861e-04 1.17112293e-04]\n",
" [3.63971514e-04 1.47859042e-04 2.24457763e-04 ... 3.63016297e-04\n",
" 1.34765272e-04 1.61947115e-04]]]\n"
]
}
],
"source": [
"def test(filename):\n",
" feature = dataset.process_utterance(filename, \"\")\n",
" audio = np.array([feature[0]]).astype('float32') #[1, D, T]\n",
" audio_len = feature[0].shape[1]\n",
" audio_len = np.array([audio_len]).astype('int64') # [1]\n",
" \n",
" print(audio.shape, audio_len.shape)\n",
"\n",
" i_probs = run(predictor, audio, audio_len)\n",
" print('jit:', i_probs[0])\n",
" return i_probs\n",
" \n",
"probs = test(sample['audio_filepath'])"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/ssd5/zhanghui/DeepSpeech2.x/tools/venv-dev/lib/python3.7/site-packages/paddlepaddle_gpu-0.0.0-py3.7-linux-x86_64.egg/paddle/tensor/creation.py:143: DeprecationWarning: `np.object` is a deprecated alias for the builtin `object`. To silence this warning, use `object` by itself. Doing this will not modify any behavior and is safe. \n",
"Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
" if data.dtype == np.object:\n"
]
}
],
"source": [
"a = paddle.to_tensor([1,3,4])\n",
"a.numpy?"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.0"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

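The notebook above ends with `np.allclose` returning `False` for the jit vs. dygraph probabilities. When chasing such mismatches it is more informative to report the size of the deviation than a bare boolean; a small helper sketch (the names `jit_probs` and `dygraph_probs` are hypothetical stand-ins for `i_probs[0]` and `probs.numpy()` from the cells above):

```python
import numpy as np


def compare_outputs(jit_probs: np.ndarray, dygraph_probs: np.ndarray,
                    rtol: float = 1e-5, atol: float = 1e-8) -> bool:
    """Print the magnitude of the mismatch before the pass/fail verdict."""
    diff = np.abs(jit_probs - dygraph_probs)
    print("max abs diff: ", diff.max())
    print("mean abs diff:", diff.mean())
    return np.allclose(jit_probs, dygraph_probs, rtol=rtol, atol=atol)
```
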
@@ -43,20 +43,53 @@ if not hasattr(paddle.Tensor, 'cat'):
    paddle.Tensor.cat = paddle.Tensor.concat


def eq(xs: paddle.Tensor, ys: Union[paddle.Tensor, float]) -> paddle.Tensor:
    return xs.equal(paddle.to_tensor(ys, dtype=xs.dtype, place=xs.place))


if not hasattr(paddle.Tensor, 'eq'):
    logger.warn(
        "override eq of paddle.Tensor if exists or register, remove this when fixed!"
    )
    paddle.Tensor.eq = eq


def contiguous(xs: paddle.Tensor) -> paddle.Tensor:
    return xs


if not hasattr(paddle.Tensor, 'contiguous'):
    logger.warn(
        "override contiguous of paddle.Tensor if exists or register, remove this when fixed!"
    )
    paddle.Tensor.contiguous = contiguous
def size(xs: paddle.Tensor, *args: int) -> paddle.Tensor:
    nargs = len(args)
    assert (nargs <= 1)
    s = paddle.shape(xs)
    if nargs == 1:
        return s[args[0]]
    else:
        return s


# `to_static` does not process the `size` property, and some `paddle` APIs
# may depend on it, so override it unconditionally.
logger.warn(
    "override size of paddle.Tensor "
    "(`to_static` do not process `size` property, maybe some `paddle` api dependent on it), remove this when fixed!"
)
paddle.Tensor.size = size
def view(xs: paddle.Tensor, *args: int) -> paddle.Tensor:
    return xs.reshape(args)


if not hasattr(paddle.Tensor, 'view'):
    logger.warn("register user view to paddle.Tensor, remove this when fixed!")
    paddle.Tensor.view = view
def masked_fill(xs: paddle.Tensor,
@@ -185,6 +218,14 @@ if not hasattr(paddle.nn, 'ConstantPad2d'):
"register user ConstantPad2d to paddle.nn, remove this when fixed!")
setattr(paddle.nn, 'ConstantPad2d', ConstantPad2d)
if not hasattr(paddle, 'softmax'):
logger.warn("register user softmax to paddle, remove this when fixed!")
setattr(paddle, 'softmax', paddle.nn.functional.softmax)
if not hasattr(paddle, 'sigmoid'):
logger.warn("register user softmax to paddle, remove this when fixed!")
setattr(paddle, 'sigmoid', paddle.nn.functional.sigmoid)
# hack loss
def ctc_loss(logits,

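The softmax and sigmoid registrations above repeat a guard-warn-setattr pattern used throughout this file. As a sketch (not part of this commit), the pattern could be factored into one helper; the logger setup mirrors the module's own:

```python
import logging

logger = logging.getLogger(__name__)


def register_if_missing(obj, name: str, attr) -> None:
    """Attach `attr` to `obj` under `name` only when it is absent."""
    if not hasattr(obj, name):
        logger.warning("register user %s to %s, remove this when fixed!",
                       name, getattr(obj, '__name__', obj))
        setattr(obj, name, attr)


# The two registrations above would then read:
# register_if_missing(paddle, 'softmax', paddle.nn.functional.softmax)
# register_if_missing(paddle, 'sigmoid', paddle.nn.functional.sigmoid)
```
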
@@ -0,0 +1,227 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Multi-Head Attention layer definition."""
import math
import logging
from typing import Optional, Tuple
import paddle
from paddle import nn
from paddle.nn import functional as F
from paddle.nn import initializer as I
logger = logging.getLogger(__name__)
__all__ = ["MultiHeadedAttention", "RelPositionMultiHeadedAttention"]
class MultiHeadedAttention(nn.Layer):
    """Multi-Head Attention layer."""

    def __init__(self, n_head: int, n_feat: int, dropout_rate: float):
        """Construct a MultiHeadedAttention object.
        Args:
            n_head (int): The number of heads.
            n_feat (int): The number of features.
            dropout_rate (float): Dropout rate.
        """
        super().__init__()
        assert n_feat % n_head == 0
        # We assume d_v always equals d_k
        self.d_k = n_feat // n_head
        self.h = n_head
        self.linear_q = nn.Linear(n_feat, n_feat)
        self.linear_k = nn.Linear(n_feat, n_feat)
        self.linear_v = nn.Linear(n_feat, n_feat)
        self.linear_out = nn.Linear(n_feat, n_feat)
        self.dropout = nn.Dropout(p=dropout_rate)
    def forward_qkv(self,
                    query: paddle.Tensor,
                    key: paddle.Tensor,
                    value: paddle.Tensor
                    ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]:
        """Transform query, key and value.
        Args:
            query (paddle.Tensor): Query tensor (#batch, time1, size).
            key (paddle.Tensor): Key tensor (#batch, time2, size).
            value (paddle.Tensor): Value tensor (#batch, time2, size).
        Returns:
            paddle.Tensor: Transformed query tensor, size
                (#batch, n_head, time1, d_k).
            paddle.Tensor: Transformed key tensor, size
                (#batch, n_head, time2, d_k).
            paddle.Tensor: Transformed value tensor, size
                (#batch, n_head, time2, d_k).
        """
        n_batch = query.size(0)
        q = self.linear_q(query).view(n_batch, -1, self.h, self.d_k)
        k = self.linear_k(key).view(n_batch, -1, self.h, self.d_k)
        v = self.linear_v(value).view(n_batch, -1, self.h, self.d_k)
        q = q.transpose([0, 2, 1, 3])  # (batch, head, time1, d_k)
        k = k.transpose([0, 2, 1, 3])  # (batch, head, time2, d_k)
        v = v.transpose([0, 2, 1, 3])  # (batch, head, time2, d_k)
        return q, k, v
    def forward_attention(self,
                          value: paddle.Tensor,
                          scores: paddle.Tensor,
                          mask: Optional[paddle.Tensor]) -> paddle.Tensor:
        """Compute attention context vector.
        Args:
            value (paddle.Tensor): Transformed value, size
                (#batch, n_head, time2, d_k).
            scores (paddle.Tensor): Attention score, size
                (#batch, n_head, time1, time2).
            mask (paddle.Tensor): Mask, size (#batch, 1, time2) or
                (#batch, time1, time2).
        Returns:
            paddle.Tensor: Transformed value (#batch, time1, d_model)
                weighted by the attention score (#batch, time1, time2).
        """
        n_batch = value.size(0)
        if mask is not None:
            mask = mask.unsqueeze(1).eq(0)  # (batch, 1, *, time2)
            scores = scores.masked_fill(mask, -float('inf'))
            attn = paddle.softmax(scores, axis=-1).masked_fill(
                mask, 0.0)  # (batch, head, time1, time2)
        else:
            attn = paddle.softmax(
                scores, axis=-1)  # (batch, head, time1, time2)
        p_attn = self.dropout(attn)
        x = paddle.matmul(p_attn, value)  # (batch, head, time1, d_k)
        x = x.transpose([0, 2, 1, 3]).contiguous().view(
            n_batch, -1, self.h * self.d_k)  # (batch, time1, d_model)
        return self.linear_out(x)  # (batch, time1, d_model)
    def forward(self,
                query: paddle.Tensor,
                key: paddle.Tensor,
                value: paddle.Tensor,
                mask: Optional[paddle.Tensor]) -> paddle.Tensor:
        """Compute scaled dot product attention.
        Args:
            query (paddle.Tensor): Query tensor (#batch, time1, size).
            key (paddle.Tensor): Key tensor (#batch, time2, size).
            value (paddle.Tensor): Value tensor (#batch, time2, size).
            mask (paddle.Tensor): Mask tensor (#batch, 1, time2) or
                (#batch, time1, time2).
        Returns:
            paddle.Tensor: Output tensor (#batch, time1, d_model).
        """
        q, k, v = self.forward_qkv(query, key, value)
        scores = paddle.matmul(q, k.transpose(
            [0, 1, 3, 2])) / math.sqrt(self.d_k)
        return self.forward_attention(v, scores, mask)
class RelPositionMultiHeadedAttention(MultiHeadedAttention):
    """Multi-Head Attention layer with relative position encoding."""

    def __init__(self, n_head, n_feat, dropout_rate):
        """Construct a RelPositionMultiHeadedAttention object.
        Paper: https://arxiv.org/abs/1901.02860
        Args:
            n_head (int): The number of heads.
            n_feat (int): The number of features.
            dropout_rate (float): Dropout rate.
        """
        super().__init__(n_head, n_feat, dropout_rate)
        # linear transformation for positional encoding
        self.linear_pos = nn.Linear(n_feat, n_feat, bias_attr=False)
        # these two learnable biases are used in matrix c and matrix d
        # as described in https://arxiv.org/abs/1901.02860 Section 3.3
        # (paddle has no nn.Parameter/torch initializers; use
        # create_parameter with an Xavier-uniform initializer instead)
        self.pos_bias_u = self.create_parameter(
            [self.h, self.d_k], default_initializer=I.XavierUniform())
        self.pos_bias_v = self.create_parameter(
            [self.h, self.d_k], default_initializer=I.XavierUniform())
    def rel_shift(self, x, zero_triu: bool=False):
        """Compute relative positional encoding.
        Args:
            x (paddle.Tensor): Input tensor (batch, time, size).
            zero_triu (bool): If true, return the lower triangular part of
                the matrix.
        Returns:
            paddle.Tensor: Output tensor.
        """
        zero_pad = paddle.zeros(
            [x.shape[0], x.shape[1], x.shape[2], 1], dtype=x.dtype)
        x_padded = paddle.concat([zero_pad, x], axis=-1)
        x_padded = x_padded.view(x.shape[0], x.shape[1], x.shape[3] + 1,
                                 x.shape[2])
        x = x_padded[:, :, 1:].view(*x.shape)
        if zero_triu:
            ones = paddle.ones((x.shape[2], x.shape[3]))
            x = x * paddle.tril(ones, x.shape[3] - x.shape[2])[None, None, :, :]
        return x
    def forward(self,
                query: paddle.Tensor,
                key: paddle.Tensor,
                value: paddle.Tensor,
                pos_emb: paddle.Tensor,
                mask: Optional[paddle.Tensor]):
        """Compute 'Scaled Dot Product Attention' with rel. positional encoding.
        Args:
            query (paddle.Tensor): Query tensor (#batch, time1, size).
            key (paddle.Tensor): Key tensor (#batch, time2, size).
            value (paddle.Tensor): Value tensor (#batch, time2, size).
            pos_emb (paddle.Tensor): Positional embedding tensor
                (#batch, time2, size).
            mask (paddle.Tensor): Mask tensor (#batch, 1, time2) or
                (#batch, time1, time2).
        Returns:
            paddle.Tensor: Output tensor (#batch, time1, d_model).
        """
        q, k, v = self.forward_qkv(query, key, value)
        q = q.transpose([0, 2, 1, 3])  # (batch, time1, head, d_k)

        n_batch_pos = pos_emb.size(0)
        p = self.linear_pos(pos_emb).view(n_batch_pos, -1, self.h, self.d_k)
        p = p.transpose([0, 2, 1, 3])  # (batch, head, time1, d_k)

        # (batch, head, time1, d_k)
        q_with_bias_u = (q + self.pos_bias_u).transpose([0, 2, 1, 3])
        # (batch, head, time1, d_k)
        q_with_bias_v = (q + self.pos_bias_v).transpose([0, 2, 1, 3])

        # compute attention score
        # first compute matrix a and matrix c
        # as described in https://arxiv.org/abs/1901.02860 Section 3.3
        # (batch, head, time1, time2)
        matrix_ac = paddle.matmul(q_with_bias_u, k.transpose([0, 1, 3, 2]))
        # compute matrix b and matrix d
        # (batch, head, time1, time2)
        matrix_bd = paddle.matmul(q_with_bias_v, p.transpose([0, 1, 3, 2]))
        # Remove rel_shift since it is useless in speech recognition,
        # and it requires special attention for streaming.
        # matrix_bd = self.rel_shift(matrix_bd)

        scores = (matrix_ac + matrix_bd) / math.sqrt(
            self.d_k)  # (batch, head, time1, time2)
        return self.forward_attention(v, scores, mask)
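
For a quick smoke test of the class above, a hypothetical usage sketch (not part of this commit); it assumes `MultiHeadedAttention` is in scope and that the `paddle.Tensor` patches from the tensor utilities (`size`, `view`, `contiguous`, `masked_fill`, plus `paddle.softmax`) have already been applied:

```python
import paddle

attn = MultiHeadedAttention(n_head=4, n_feat=256, dropout_rate=0.1)
q = paddle.randn([2, 10, 256])  # (#batch, time1, size)
k = paddle.randn([2, 12, 256])  # (#batch, time2, size)
v = paddle.randn([2, 12, 256])  # (#batch, time2, size)
out = attn(q, k, v, None)       # mask=None takes the unmasked branch
print(out.shape)                # [2, 10, 256]
```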

@@ -5,3 +5,4 @@ python_speech_features
tensorboardX
yacs
typeguard
pre-commit
