Merge branch 'develop' of https://github.com/PaddlePaddle/DeepSpeech into fix_bug

4 years ago · f4e59293bf
parent 1d7072731d 5ed56b3f7e
commit f4e59293bf
274 changed files with 7396 additions and 24655 deletions
--- a/.gitignore
+++ b/.gitignore
@ -18,5 +18,7 @@ tools/sox-14.4.2
 tools/soxbindings
 tools/montreal-forced-aligner/
 tools/Montreal-Forced-Aligner/
+tools/sctk
+tools/sctk-20159b5/

 *output/
--- a/.notebook/Linear_test.ipynb
+++ b/.notebook/Linear_test.ipynb
@ -1,605 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "id": "academic-surname",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import paddle\n",
-    "from paddle import nn"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "id": "fundamental-treasure",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/workspace/DeepSpeech-2.x/tools/venv-dev/lib/python3.7/site-packages/ipykernel/ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.\n",
-      "  and should_run_async(code)\n"
-     ]
-    }
-   ],
-   "source": [
-    "L = nn.Linear(256, 2048)\n",
-    "L2 = nn.Linear(2048, 256)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "id": "consolidated-elephant",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import numpy as np\n",
-    "import torch\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 4,
-   "id": "moderate-noise",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "float64\n",
-      "Tensor(shape=[2, 51, 256], dtype=float32, place=CUDAPlace(0), stop_gradient=True,\n",
-      "       [[[-1.54171216, -2.61531472, -1.79881978, ..., -0.31395876,  0.56513089, -0.44516513],\n",
-      "         [-0.79492962,  1.91157901,  0.66567147, ...,  0.54825783, -1.01471853, -0.84924090],\n",
-      "         [-1.22556651, -0.36225814,  0.65063190, ...,  0.65726501,  0.05563191,  0.09009409],\n",
-      "         ...,\n",
-      "         [ 0.38615900, -0.77905393,  0.99732304, ..., -1.38463700, -3.32365036, -1.31089687],\n",
-      "         [ 0.05579993,  0.06885809, -1.66662002, ..., -0.23346378, -3.29372883,  1.30561364],\n",
-      "         [ 1.90676069,  1.95093191, -0.28849599, ..., -0.06860496,  0.95347673,  1.00475824]],\n",
-      "\n",
-      "        [[-0.91453546,  0.55298805, -1.06146812, ..., -0.86378336,  1.00454640,  1.26062179],\n",
-      "         [ 0.10223761,  0.81301165,  2.36865163, ...,  0.16821407,  0.29240361,  1.05408621],\n",
-      "         [-1.33196676,  1.94433689,  0.01934209, ...,  0.48036841,  0.51585966,  1.22893548],\n",
-      "         ...,\n",
-      "         [-0.19558455, -0.47075930,  0.90796155, ..., -1.28598249, -0.24321797,  0.17734711],\n",
-      "         [ 0.89819717, -1.39516675,  0.17138045, ...,  2.39761519,  1.76364994, -0.52177650],\n",
-      "         [ 0.94122332, -0.18581429,  1.36099780, ...,  0.67647684, -0.04699665,  1.51205540]]])\n",
-      "tensor([[[-1.5417, -2.6153, -1.7988,  ..., -0.3140,  0.5651, -0.4452],\n",
-      "         [-0.7949,  1.9116,  0.6657,  ...,  0.5483, -1.0147, -0.8492],\n",
-      "         [-1.2256, -0.3623,  0.6506,  ...,  0.6573,  0.0556,  0.0901],\n",
-      "         ...,\n",
-      "         [ 0.3862, -0.7791,  0.9973,  ..., -1.3846, -3.3237, -1.3109],\n",
-      "         [ 0.0558,  0.0689, -1.6666,  ..., -0.2335, -3.2937,  1.3056],\n",
-      "         [ 1.9068,  1.9509, -0.2885,  ..., -0.0686,  0.9535,  1.0048]],\n",
-      "\n",
-      "        [[-0.9145,  0.5530, -1.0615,  ..., -0.8638,  1.0045,  1.2606],\n",
-      "         [ 0.1022,  0.8130,  2.3687,  ...,  0.1682,  0.2924,  1.0541],\n",
-      "         [-1.3320,  1.9443,  0.0193,  ...,  0.4804,  0.5159,  1.2289],\n",
-      "         ...,\n",
-      "         [-0.1956, -0.4708,  0.9080,  ..., -1.2860, -0.2432,  0.1773],\n",
-      "         [ 0.8982, -1.3952,  0.1714,  ...,  2.3976,  1.7636, -0.5218],\n",
-      "         [ 0.9412, -0.1858,  1.3610,  ...,  0.6765, -0.0470,  1.5121]]])\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/workspace/DeepSpeech-2.x/tools/venv-dev/lib/python3.7/site-packages/ipykernel/ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.\n",
-      "  and should_run_async(code)\n"
-     ]
-    }
-   ],
-   "source": [
-    "x = np.random.randn(2, 51, 256)\n",
-    "print(x.dtype)\n",
-    "px = paddle.to_tensor(x, dtype='float32')\n",
-    "tx = torch.tensor(x, dtype=torch.float32)\n",
-    "print(px)\n",
-    "print(tx)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "cooked-progressive",
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 5,
-   "id": "mechanical-prisoner",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "data = np.load('enc_0_ff_out.npz', allow_pickle=True)\n",
-    "t_norm_ff = data['norm_ff']\n",
-    "t_ff_out = data['ff_out']\n",
-    "t_ff_l_x = data['ff_l_x']\n",
-    "t_ff_l_a_x = data['ff_l_a_x']\n",
-    "t_ff_l_a_l_x = data['ff_l_a_l_x']\n",
-    "t_ps = data['ps']"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "indie-marriage",
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 6,
-   "id": "assured-zambia",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "True\n",
-      "True\n",
-      "True\n",
-      "True\n"
-     ]
-    }
-   ],
-   "source": [
-    "L.set_state_dict({'weight': t_ps[0].T, 'bias': t_ps[1]})\n",
-    "L2.set_state_dict({'weight': t_ps[2].T, 'bias': t_ps[3]})\n",
-    "\n",
-    "ps = []\n",
-    "for n, p in L.named_parameters():\n",
-    "   ps.append(p)\n",
-    "\n",
-    "for n, p in L2.state_dict().items():\n",
-    "    ps.append(p)\n",
-    "    \n",
-    "for p, tp in zip(ps, t_ps):\n",
-    "    print(np.allclose(p.numpy(), tp.T))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "committed-jacob",
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "extreme-traffic",
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "optimum-milwaukee",
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 7,
-   "id": "viral-indian",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "True\n",
-      "True\n",
-      "True\n",
-      "True\n"
-     ]
-    }
-   ],
-   "source": [
-    "# data = np.load('enc_0_ff_out.npz', allow_pickle=True)\n",
-    "# t_norm_ff = data['norm_ff']\n",
-    "# t_ff_out = data['ff_out']\n",
-    "# t_ff_l_x = data['ff_l_x']\n",
-    "# t_ff_l_a_x = data['ff_l_a_x']\n",
-    "# t_ff_l_a_l_x = data['ff_l_a_l_x']\n",
-    "# t_ps = data['ps']\n",
-    "TL = torch.nn.Linear(256, 2048)\n",
-    "TL2 = torch.nn.Linear(2048, 256)\n",
-    "TL.load_state_dict({'weight': torch.tensor(t_ps[0]), 'bias': torch.tensor(t_ps[1])})\n",
-    "TL2.load_state_dict({'weight': torch.tensor(t_ps[2]), 'bias': torch.tensor(t_ps[3])})\n",
-    "\n",
-    "# for n, p in TL.named_parameters():\n",
-    "#    print(n, p)\n",
-    "# for n, p in TL2.named_parameters():\n",
-    "#    print(n, p)\n",
-    "\n",
-    "ps = []\n",
-    "for n, p in TL.state_dict().items():\n",
-    "    ps.append(p.data.numpy())\n",
-    "    \n",
-    "for n, p in TL2.state_dict().items():\n",
-    "    ps.append(p.data.numpy())\n",
-    "    \n",
-    "for p, tp in zip(ps, t_ps):\n",
-    "    print(np.allclose(p, tp))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 8,
-   "id": "skilled-vietnamese",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "[[[ 0.67277956  0.08313607 -0.62761104 ... -0.17480263  0.42718208\n",
-      "   -0.5787626 ]\n",
-      "  [ 0.91516656  0.5393416   1.7159258  ...  0.06144593  0.06486575\n",
-      "   -0.03350811]\n",
-      "  [ 0.438351    0.6227843   0.24096036 ...  1.0912522  -0.90929437\n",
-      "   -1.012989  ]\n",
-      "  ...\n",
-      "  [ 0.68631977  0.14240924  0.10763275 ... -0.11513516  0.48065388\n",
-      "    0.04070369]\n",
-      "  [-0.9525228   0.23197874  0.31264272 ...  0.5312439   0.18773697\n",
-      "   -0.8450228 ]\n",
-      "  [ 0.42024016 -0.04561988  0.54541194 ... -0.41933843 -0.00436018\n",
-      "   -0.06663495]]\n",
-      "\n",
-      " [[-0.11638781 -0.33566502 -0.20887226 ...  0.17423287 -0.9195841\n",
-      "   -0.8161046 ]\n",
-      "  [-0.3469874   0.88269687 -0.11887559 ... -0.15566081  0.16357468\n",
-      "   -0.20766167]\n",
-      "  [-0.3847657   0.3984318  -0.06963477 ... -0.00360622  1.2360432\n",
-      "   -0.26811332]\n",
-      "  ...\n",
-      "  [ 0.08230796 -0.46158582  0.54582864 ...  0.15747628 -0.44790155\n",
-      "    0.06020184]\n",
-      "  [-0.8095085   0.43163058 -0.42837143 ...  0.8627463   0.90656304\n",
-      "    0.15847842]\n",
-      "  [-1.485811   -0.18216592 -0.8882585  ...  0.32596245  0.7822631\n",
-      "   -0.6460344 ]]]\n",
-      "[[[ 0.67278004  0.08313602 -0.6276114  ... -0.17480245  0.42718196\n",
-      "   -0.5787625 ]\n",
-      "  [ 0.91516703  0.5393413   1.7159253  ...  0.06144581  0.06486579\n",
-      "   -0.03350812]\n",
-      "  [ 0.43835106  0.62278455  0.24096027 ...  1.0912521  -0.9092943\n",
-      "   -1.0129892 ]\n",
-      "  ...\n",
-      "  [ 0.6863195   0.14240888  0.10763284 ... -0.11513527  0.48065376\n",
-      "    0.04070365]\n",
-      "  [-0.9525231   0.23197863  0.31264275 ...  0.53124386  0.18773702\n",
-      "   -0.84502304]\n",
-      "  [ 0.42024007 -0.04561983  0.545412   ... -0.41933888 -0.00436005\n",
-      "   -0.066635  ]]\n",
-      "\n",
-      " [[-0.11638767 -0.33566508 -0.20887226 ...  0.17423296 -0.9195838\n",
-      "   -0.8161046 ]\n",
-      "  [-0.34698725  0.88269705 -0.11887549 ... -0.15566081  0.16357464\n",
-      "   -0.20766166]\n",
-      "  [-0.3847657   0.3984319  -0.06963488 ... -0.00360619  1.2360426\n",
-      "   -0.26811326]\n",
-      "  ...\n",
-      "  [ 0.08230786 -0.4615857   0.5458287  ...  0.15747619 -0.44790167\n",
-      "    0.06020182]\n",
-      "  [-0.8095083   0.4316307  -0.42837155 ...  0.862746    0.9065631\n",
-      "    0.15847899]\n",
-      "  [-1.485811   -0.18216613 -0.8882584  ...  0.32596254  0.7822631\n",
-      "   -0.6460344 ]]]\n",
-      "True\n",
-      "False\n"
-     ]
-    }
-   ],
-   "source": [
-    "y = L(px)\n",
-    "print(y.numpy())\n",
-    "\n",
-    "ty = TL(tx)\n",
-    "print(ty.data.numpy())\n",
-    "print(np.allclose(px.numpy(), tx.detach().numpy()))\n",
-    "print(np.allclose(y.numpy(), ty.detach().numpy()))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "incorrect-allah",
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "prostate-cameroon",
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 9,
-   "id": "governmental-surge",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "[[ 0.04476918  0.554463   -0.3027508  ... -0.49600336  0.3751858\n",
-      "   0.8254095 ]\n",
-      " [ 0.95594174 -0.29528382 -1.2899452  ...  0.43718258  0.05584608\n",
-      "  -0.06974669]]\n",
-      "[[ 0.04476918  0.5544631  -0.3027507  ... -0.49600336  0.37518573\n",
-      "   0.8254096 ]\n",
-      " [ 0.95594174 -0.29528376 -1.2899454  ...  0.4371827   0.05584623\n",
-      "  -0.0697467 ]]\n",
-      "True\n",
-      "False\n",
-      "True\n"
-     ]
-    }
-   ],
-   "source": [
-    "x = np.random.randn(2, 256)\n",
-    "px = paddle.to_tensor(x, dtype='float32')\n",
-    "tx = torch.tensor(x, dtype=torch.float32)\n",
-    "y = L(px)\n",
-    "print(y.numpy())\n",
-    "ty = TL(tx)\n",
-    "print(ty.data.numpy())\n",
-    "print(np.allclose(px.numpy(), tx.detach().numpy()))\n",
-    "print(np.allclose(y.numpy(), ty.detach().numpy()))\n",
-    "print(np.allclose(y.numpy(), ty.detach().numpy(), atol=1e-5))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "confidential-jacket",
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 10,
-   "id": "improved-civilization",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "5e7e7c9fde8350084abf1898cf52651cfc84b17a\n"
-     ]
-    }
-   ],
-   "source": [
-    "print(paddle.version.commit)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 11,
-   "id": "d1e2d3b4",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "['__builtins__',\n",
-       " '__cached__',\n",
-       " '__doc__',\n",
-       " '__file__',\n",
-       " '__loader__',\n",
-       " '__name__',\n",
-       " '__package__',\n",
-       " '__spec__',\n",
-       " 'commit',\n",
-       " 'full_version',\n",
-       " 'istaged',\n",
-       " 'major',\n",
-       " 'minor',\n",
-       " 'mkl',\n",
-       " 'patch',\n",
-       " 'rc',\n",
-       " 'show',\n",
-       " 'with_mkl']"
-      ]
-     },
-     "execution_count": 11,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "dir(paddle.version)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 12,
-   "id": "c880c719",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "2.1.0\n"
-     ]
-    }
-   ],
-   "source": [
-    "print(paddle.version.full_version)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 13,
-   "id": "f26977bf",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "commit: 5e7e7c9fde8350084abf1898cf52651cfc84b17a\n",
-      "None\n"
-     ]
-    }
-   ],
-   "source": [
-    "print(paddle.version.show())"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 14,
-   "id": "04ad47f6",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "1.6.0\n"
-     ]
-    }
-   ],
-   "source": [
-    "print(torch.__version__)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 15,
-   "id": "e1e03830",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "['__builtins__',\n",
-       " '__cached__',\n",
-       " '__doc__',\n",
-       " '__file__',\n",
-       " '__loader__',\n",
-       " '__name__',\n",
-       " '__package__',\n",
-       " '__spec__',\n",
-       " '__version__',\n",
-       " 'cuda',\n",
-       " 'debug',\n",
-       " 'git_version',\n",
-       " 'hip']"
-      ]
-     },
-     "execution_count": 15,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "dir(torch.version)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 19,
-   "id": "4ad0389b",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "'b31f58de6fa8bbda5353b3c77d9be4914399724d'"
-      ]
-     },
-     "execution_count": 19,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "torch.version.git_version"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 21,
-   "id": "7870ea10",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "'10.2'"
-      ]
-     },
-     "execution_count": 21,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "torch.version.cuda"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "db8ee5a7",
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "6321ec2a",
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.7.0"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
--- a/.notebook/WarmupLR.ipynb
+++ b/.notebook/WarmupLR.ipynb
--- a/.notebook/audio_feature.ipynb
+++ b/.notebook/audio_feature.ipynb
--- a/.notebook/compute_cmvn_loader_test.ipynb
+++ b/.notebook/compute_cmvn_loader_test.ipynb
--- a/.notebook/dataloader.ipynb
+++ b/.notebook/dataloader.ipynb
@ -1,389 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "id": "emerging-meter",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/paddle/fluid/layers/utils.py:26: DeprecationWarning: `np.int` is a deprecated alias for the builtin `int`. To silence this warning, use `int` by itself. Doing this will not modify any behavior and is safe. When replacing `np.int`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.\n",
-      "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
-      "  def convert_to_list(value, n, name, dtype=np.int):\n",
-      "/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/scipy/fftpack/__init__.py:103: DeprecationWarning: The module numpy.dual is deprecated.  Instead of using dual, use the functions directly from numpy or scipy.\n",
-      "  from numpy.dual import register_func\n",
-      "/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/scipy/special/orthogonal.py:81: DeprecationWarning: `np.int` is a deprecated alias for the builtin `int`. To silence this warning, use `int` by itself. Doing this will not modify any behavior and is safe. When replacing `np.int`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.\n",
-      "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
-      "  from numpy import (exp, inf, pi, sqrt, floor, sin, cos, around, int,\n",
-      "/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/numba/core/types/__init__.py:108: DeprecationWarning: `np.long` is a deprecated alias for `np.compat.long`. To silence this warning, use `np.compat.long` by itself. In the likely event your code does not need to work on Python 2 you can use the builtin `int` for which `np.compat.long` is itself an alias. Doing this will not modify any behaviour and is safe. When replacing `np.long`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.\n",
-      "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
-      "  long_ = _make_signed(np.long)\n",
-      "/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/numba/core/types/__init__.py:109: DeprecationWarning: `np.long` is a deprecated alias for `np.compat.long`. To silence this warning, use `np.compat.long` by itself. In the likely event your code does not need to work on Python 2 you can use the builtin `int` for which `np.compat.long` is itself an alias. Doing this will not modify any behaviour and is safe. When replacing `np.long`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.\n",
-      "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
-      "  ulong = _make_unsigned(np.long)\n"
-     ]
-    }
-   ],
-   "source": [
-    "import math\n",
-    "import random\n",
-    "import tarfile\n",
-    "import logging\n",
-    "import numpy as np\n",
-    "from collections import namedtuple\n",
-    "from functools import partial\n",
-    "\n",
-    "import paddle\n",
-    "from paddle.io import Dataset\n",
-    "from paddle.io import DataLoader\n",
-    "from paddle.io import BatchSampler\n",
-    "from paddle.io import DistributedBatchSampler\n",
-    "from paddle import distributed as dist\n",
-    "\n",
-    "from data_utils.utility import read_manifest\n",
-    "from data_utils.augmentor.augmentation import AugmentationPipeline\n",
-    "from data_utils.featurizer.speech_featurizer import SpeechFeaturizer\n",
-    "from data_utils.speech import SpeechSegment\n",
-    "from data_utils.normalizer import FeatureNormalizer\n",
-    "\n",
-    "\n",
-    "from data_utils.dataset import (\n",
-    "    DeepSpeech2Dataset,\n",
-    "    DeepSpeech2DistributedBatchSampler,\n",
-    "    DeepSpeech2BatchSampler,\n",
-    "    SpeechCollator,\n",
-    ")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 20,
-   "id": "excessive-american",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def create_dataloader(manifest_path,\t\n",
-    "                      vocab_filepath,\t\n",
-    "                      mean_std_filepath,\t\n",
-    "                      augmentation_config='{}',\t\n",
-    "                      max_duration=float('inf'),\t\n",
-    "                      min_duration=0.0,\t\n",
-    "                      stride_ms=10.0,\t\n",
-    "                      window_ms=20.0,\t\n",
-    "                      max_freq=None,\t\n",
-    "                      specgram_type='linear',\t\n",
-    "                      use_dB_normalization=True,\t\n",
-    "                      random_seed=0,\t\n",
-    "                      keep_transcription_text=False,\t\n",
-    "                      is_training=False,\t\n",
-    "                      batch_size=1,\t\n",
-    "                      num_workers=0,\t\n",
-    "                      sortagrad=False,\t\n",
-    "                      shuffle_method=None,\t\n",
-    "                      dist=False):\t\n",
-    "\n",
-    "    dataset = DeepSpeech2Dataset(\t\n",
-    "        manifest_path,\t\n",
-    "        vocab_filepath,\t\n",
-    "        mean_std_filepath,\t\n",
-    "        augmentation_config=augmentation_config,\t\n",
-    "        max_duration=max_duration,\t\n",
-    "        min_duration=min_duration,\t\n",
-    "        stride_ms=stride_ms,\t\n",
-    "        window_ms=window_ms,\t\n",
-    "        max_freq=max_freq,\t\n",
-    "        specgram_type=specgram_type,\t\n",
-    "        use_dB_normalization=use_dB_normalization,\t\n",
-    "        random_seed=random_seed,\t\n",
-    "        keep_transcription_text=keep_transcription_text)\t\n",
-    "\n",
-    "    if dist:\t\n",
-    "        batch_sampler = DeepSpeech2DistributedBatchSampler(\t\n",
-    "            dataset,\t\n",
-    "            batch_size,\t\n",
-    "            num_replicas=None,\t\n",
-    "            rank=None,\t\n",
-    "            shuffle=is_training,\t\n",
-    "            drop_last=is_training,\t\n",
-    "            sortagrad=is_training,\t\n",
-    "            shuffle_method=shuffle_method)\t\n",
-    "    else:\t\n",
-    "        batch_sampler = DeepSpeech2BatchSampler(\t\n",
-    "            dataset,\t\n",
-    "            shuffle=is_training,\t\n",
-    "            batch_size=batch_size,\t\n",
-    "            drop_last=is_training,\t\n",
-    "            sortagrad=is_training,\t\n",
-    "            shuffle_method=shuffle_method)\t\n",
-    "\n",
-    "    def padding_batch(batch, padding_to=-1, flatten=False, is_training=True):\t\n",
-    "        \"\"\"\t\n",
-    "        Padding audio features with zeros to make them have the same shape (or\t\n",
-    "        a user-defined shape) within one bach.\t\n",
-    "\n",
-    "        If ``padding_to`` is -1, the maximun shape in the batch will be used\t\n",
-    "        as the target shape for padding. Otherwise, `padding_to` will be the\t\n",
-    "        target shape (only refers to the second axis).\t\n",
-    "\n",
-    "        If `flatten` is True, features will be flatten to 1darray.\t\n",
-    "        \"\"\"\t\n",
-    "        new_batch = []\t\n",
-    "        # get target shape\t\n",
-    "        max_length = max([audio.shape[1] for audio, text in batch])\t\n",
-    "        if padding_to != -1:\t\n",
-    "            if padding_to < max_length:\t\n",
-    "                raise ValueError(\"If padding_to is not -1, it should be larger \"\t\n",
-    "                                 \"than any instance's shape in the batch\")\t\n",
-    "            max_length = padding_to\t\n",
-    "        max_text_length = max([len(text) for audio, text in batch])\t\n",
-    "        # padding\t\n",
-    "        padded_audios = []\t\n",
-    "        audio_lens = []\t\n",
-    "        texts, text_lens = [], []\t\n",
-    "        for audio, text in batch:\t\n",
-    "            padded_audio = np.zeros([audio.shape[0], max_length])\t\n",
-    "            padded_audio[:, :audio.shape[1]] = audio\t\n",
-    "            if flatten:\t\n",
-    "                padded_audio = padded_audio.flatten()\t\n",
-    "            padded_audios.append(padded_audio)\t\n",
-    "            audio_lens.append(audio.shape[1])\t\n",
-    "\n",
-    "            padded_text = np.zeros([max_text_length])\n",
-    "            if is_training:\n",
-    "                padded_text[:len(text)] = text\t# ids\n",
-    "            else:\n",
-    "                padded_text[:len(text)] = [ord(t) for t in text] # string\n",
-    "            \n",
-    "            texts.append(padded_text)\t\n",
-    "            text_lens.append(len(text))\t\n",
-    "\n",
-    "        padded_audios = np.array(padded_audios).astype('float32')\t\n",
-    "        audio_lens = np.array(audio_lens).astype('int64')\t\n",
-    "        texts = np.array(texts).astype('int32')\t\n",
-    "        text_lens = np.array(text_lens).astype('int64')\t\n",
-    "        return padded_audios, texts, audio_lens, text_lens\t\n",
-    "\n",
-    "    loader = DataLoader(\t\n",
-    "        dataset,\t\n",
-    "        batch_sampler=batch_sampler,\t\n",
-    "        collate_fn=partial(padding_batch, is_training=is_training),\t\n",
-    "        num_workers=num_workers)\t\n",
-    "    return loader"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 21,
-   "id": "naval-brave",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "{'num_samples': 5, 'beam_size': 500, 'num_proc_bsearch': 8, 'num_conv_layers': 2, 'num_rnn_layers': 3, 'rnn_layer_size': 2048, 'alpha': 2.5, 'beta': 0.3, 'cutoff_prob': 1.0, 'cutoff_top_n': 40, 'use_gru': False, 'use_gpu': True, 'share_rnn_weights': True, 'infer_manifest': 'examples/aishell/data/manifest.dev', 'mean_std_path': 'examples/aishell/data/mean_std.npz', 'vocab_path': 'examples/aishell/data/vocab.txt', 'lang_model_path': 'models/lm/common_crawl_00.prune01111.trie.klm', 'model_path': 'examples/aishell/checkpoints/step_final', 'decoding_method': 'ctc_beam_search', 'error_rate_type': 'wer', 'specgram_type': 'linear'}\n"
-     ]
-    }
-   ],
-   "source": [
-    "import sys\n",
-    "import argparse\n",
-    "import functools\n",
-    "from utils.utility import add_arguments, print_arguments\n",
-    "parser = argparse.ArgumentParser(description=__doc__)\n",
-    "add_arg = functools.partial(add_arguments, argparser=parser)\n",
-    "# yapf: disable\n",
-    "add_arg('num_samples',      int,    5,     \"# of samples to infer.\")\n",
-    "add_arg('beam_size',        int,    500,    \"Beam search width.\")\n",
-    "add_arg('num_proc_bsearch', int,    8,      \"# of CPUs for beam search.\")\n",
-    "add_arg('num_conv_layers',  int,    2,      \"# of convolution layers.\")\n",
-    "add_arg('num_rnn_layers',   int,    3,      \"# of recurrent layers.\")\n",
-    "add_arg('rnn_layer_size',   int,    2048,   \"# of recurrent cells per layer.\")\n",
-    "add_arg('alpha',            float,  2.5,    \"Coef of LM for beam search.\")\n",
-    "add_arg('beta',             float,  0.3,    \"Coef of WC for beam search.\")\n",
-    "add_arg('cutoff_prob',      float,  1.0,    \"Cutoff probability for pruning.\")\n",
-    "add_arg('cutoff_top_n',     int,    40,     \"Cutoff number for pruning.\")\n",
-    "add_arg('use_gru',          bool,   False,  \"Use GRUs instead of simple RNNs.\")\n",
-    "add_arg('use_gpu',          bool,   True,   \"Use GPU or not.\")\n",
-    "add_arg('share_rnn_weights',bool,   True,   \"Share input-hidden weights across \"\n",
-    "                                            \"bi-directional RNNs. Not for GRU.\")\n",
-    "add_arg('infer_manifest',   str,\n",
-    "        'examples/aishell/data/manifest.dev',\n",
-    "        \"Filepath of manifest to infer.\")\n",
-    "add_arg('mean_std_path',    str,\n",
-    "        'examples/aishell/data/mean_std.npz',\n",
-    "        \"Filepath of normalizer's mean & std.\")\n",
-    "add_arg('vocab_path',       str,\n",
-    "        'examples/aishell/data/vocab.txt',\n",
-    "        \"Filepath of vocabulary.\")\n",
-    "add_arg('lang_model_path',  str,\n",
-    "        'models/lm/common_crawl_00.prune01111.trie.klm',\n",
-    "        \"Filepath for language model.\")\n",
-    "add_arg('model_path',       str,\n",
-    "        'examples/aishell/checkpoints/step_final',\n",
-    "        \"If None, the training starts from scratch, \"\n",
-    "        \"otherwise, it resumes from the pre-trained model.\")\n",
-    "add_arg('decoding_method',  str,\n",
-    "        'ctc_beam_search',\n",
-    "        \"Decoding method. Options: ctc_beam_search, ctc_greedy\",\n",
-    "        choices = ['ctc_beam_search', 'ctc_greedy'])\n",
-    "add_arg('error_rate_type',  str,\n",
-    "        'wer',\n",
-    "        \"Error rate type for evaluation.\",\n",
-    "        choices=['wer', 'cer'])\n",
-    "add_arg('specgram_type',    str,\n",
-    "        'linear',\n",
-    "        \"Audio feature type. Options: linear, mfcc.\",\n",
-    "        choices=['linear', 'mfcc'])\n",
-    "# yapf: disable\n",
-    "args = parser.parse_args([])\n",
-    "print(vars(args))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 22,
-   "id": "bearing-physics",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "batch_reader = create_dataloader(\n",
-    "            manifest_path=args.infer_manifest,\n",
-    "            vocab_filepath=args.vocab_path,\n",
-    "            mean_std_filepath=args.mean_std_path,\n",
-    "            augmentation_config='{}',\n",
-    "            #max_duration=float('inf'),\n",
-    "            max_duration=27.0,\n",
-    "            min_duration=0.0,\n",
-    "            stride_ms=10.0,\n",
-    "            window_ms=20.0,\n",
-    "            max_freq=None,\n",
-    "            specgram_type=args.specgram_type,\n",
-    "            use_dB_normalization=True,\n",
-    "            random_seed=0,\n",
-    "            keep_transcription_text=True,\n",
-    "            is_training=False,\n",
-    "            batch_size=args.num_samples,\n",
-    "            sortagrad=True,\n",
-    "            shuffle_method=None,\n",
-    "            dist=False)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 30,
-   "id": "classified-melissa",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "test Tensor(shape=[5, 6], dtype=int32, place=CUDAPinnedPlace, stop_gradient=True,\n",
-      "       [[22823, 26102, 20195, 37324, 0    , 0    ],\n",
-      "        [22238, 26469, 23601, 22909, 0    , 0    ],\n",
-      "        [20108, 26376, 22235, 26085, 0    , 0    ],\n",
-      "        [36824, 35201, 20445, 25345, 32654, 24863],\n",
-      "        [29042, 27748, 21463, 23456, 0    , 0    ]])\n",
-      "test raw 大时代里\n",
-      "test raw 煲汤受宠\n",
-      "audio len Tensor(shape=[5], dtype=int64, place=CUDAPinnedPlace, stop_gradient=True,\n",
-      "       [163, 167, 180, 186, 186])\n",
-      "test len Tensor(shape=[5], dtype=int64, place=CUDAPlace(0), stop_gradient=True,\n",
-      "       [4, 4, 4, 6, 4])\n",
-      "audio Tensor(shape=[5, 161, 186], dtype=float32, place=CUDAPinnedPlace, stop_gradient=True,\n",
-      "       [[[ 1.11669052,  0.79015088,  0.93658292, ...,  0.        ,  0.        ,  0.        ],\n",
-      "         [ 0.83549136,  0.72643483,  0.83578080, ...,  0.        ,  0.        ,  0.        ],\n",
-      "         [-0.89155018, -0.18894747, -0.53357804, ...,  0.        ,  0.        ,  0.        ],\n",
-      "         ...,\n",
-      "         [ 0.33386710, -0.81240511,  0.12869737, ...,  0.        ,  0.        ,  0.        ],\n",
-      "         [-0.17537928,  0.58380985,  0.70696265, ...,  0.        ,  0.        ,  0.        ],\n",
-      "         [-0.84175998,  1.22041416,  0.07929770, ...,  0.        ,  0.        ,  0.        ]],\n",
-      "\n",
-      "        [[-0.35964420,  0.77392709,  0.71409988, ...,  0.        ,  0.        ,  0.        ],\n",
-      "         [-0.15990183,  0.42962283,  0.06222462, ...,  0.        ,  0.        ,  0.        ],\n",
-      "         [-0.31166190, -0.74864638, -0.52836996, ...,  0.        ,  0.        ,  0.        ],\n",
-      "         ...,\n",
-      "         [-0.27546275,  0.32889456,  0.12410031, ...,  0.        ,  0.        ,  0.        ],\n",
-      "         [ 0.16264282,  0.49418071, -0.15960945, ...,  0.        ,  0.        ,  0.        ],\n",
-      "         [ 0.12476666,  0.00516864,  1.16021466, ...,  0.        ,  0.        ,  0.        ]],\n",
-      "\n",
-      "        [[ 0.90202141,  1.48541915,  0.92062062, ...,  0.        ,  0.        ,  0.        ],\n",
-      "         [ 0.82661545,  1.37171340,  0.86746097, ...,  0.        ,  0.        ,  0.        ],\n",
-      "         [-0.62287915, -0.48645937,  0.35041964, ...,  0.        ,  0.        ,  0.        ],\n",
-      "         ...,\n",
-      "         [ 0.07376949,  0.07138316,  0.76355994, ...,  0.        ,  0.        ,  0.        ],\n",
-      "         [-0.32306790,  0.43247896,  1.27311838, ...,  0.        ,  0.        ,  0.        ],\n",
-      "         [-0.97667056,  0.60747612,  0.79181534, ...,  0.        ,  0.        ,  0.        ]],\n",
-      "\n",
-      "        [[ 0.72022128,  0.95428467,  0.92766261, ...,  0.29105374, -0.45564806, -0.62151009],\n",
-      "         [ 0.42083180,  0.49279949,  0.82724041, ..., -0.17333922, -1.45363355, -0.61673522],\n",
-      "         [-0.76116520, -0.84750438, -0.09512503, ..., -1.01497340, -1.42781055, -0.80859023],\n",
-      "         ...,\n",
-      "         [-0.23009977,  1.06155431,  1.09065628, ...,  0.25581080,  0.53794998, -1.22650719],\n",
-      "         [-1.37693381,  0.30778193,  0.17152318, ...,  0.51650339,  0.25580606,  0.83097816],\n",
-      "         [-1.62180591,  1.30567718,  1.09928656, ..., -0.77590007,  1.27712476,  0.53189957]],\n",
-      "\n",
-      "        [[ 1.03205252, -0.51535392,  0.21077573, ...,  0.76618457,  1.27425683,  1.52250278],\n",
-      "         [ 0.82059991,  0.43990925,  0.13090958, ...,  0.86662549,  1.01687658,  1.48495352],\n",
-      "         [-0.75489789, -0.01997089, -0.65174174, ...,  0.09061214, -0.55211234, -0.01614586],\n",
-      "         ...,\n",
-      "         [ 0.50985396,  1.84555030,  0.79185146, ...,  1.13666189,  1.19898069,  1.98158395],\n",
-      "         [ 1.98721015,  2.52385354,  1.11714780, ...,  0.19416514,  1.11329341,  0.64460152],\n",
-      "         [ 2.69512844,  1.90993905,  0.50245082, ..., -0.50902629,  0.03333465, -1.24584770]]])\n"
-     ]
-    }
-   ],
-   "source": [
-    "for idx, (audio, audio_len, text, text_len) in enumerate(batch_reader()):\n",
-    "    print('test', text)\n",
-    "    print(\"test raw\", ''.join( chr(i) for i in text[0][:int(text_len[0])] ))\n",
-    "    print(\"test raw\", ''.join( chr(i) for i in text[-1][:int(text_len[-1])] ))\n",
-    "    print('audio len', audio_len)\n",
-    "    print('test len', text_len)\n",
-    "    print('audio', audio)\n",
-    "    break"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "unexpected-skating",
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "minus-modern",
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.7.0"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
--- a/.notebook/dataloader_with_tokens_tokenids.ipynb
+++ b/.notebook/dataloader_with_tokens_tokenids.ipynb
--- a/.notebook/espnet_dataloader.ipynb
+++ b/.notebook/espnet_dataloader.ipynb
--- a/.notebook/hack_api_test.ipynb
+++ b/.notebook/hack_api_test.ipynb
@ -1,290 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "id": "breeding-haven",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "/home/ssd5/zhanghui/DeepSpeech2.x\n"
-     ]
-    },
-    {
-     "data": {
-      "text/plain": [
-       "'/home/ssd5/zhanghui/DeepSpeech2.x'"
-      ]
-     },
-     "execution_count": 1,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "%cd ..\n",
-    "%pwd"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "id": "appropriate-theta",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "LICENSE       deepspeech  examples\t\t    requirements.txt  tools\r\n",
-      "README.md     docs\t  libsndfile-1.0.28\t    setup.sh\t      utils\r\n",
-      "README_cn.md  env.sh\t  libsndfile-1.0.28.tar.gz  tests\r\n"
-     ]
-    }
-   ],
-   "source": [
-    "!ls"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "id": "entire-bloom",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/home/ssd5/zhanghui/DeepSpeech2.x/tools/venv/lib/python3.7/site-packages/paddle/fluid/layers/utils.py:26: DeprecationWarning: `np.int` is a deprecated alias for the builtin `int`. To silence this warning, use `int` by itself. Doing this will not modify any behavior and is safe. When replacing `np.int`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.\n",
-      "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
-      "  def convert_to_list(value, n, name, dtype=np.int):\n",
-      "WARNING:root:override cat of paddle.Tensor if exists or register, remove this when fixed!\n",
-      "WARNING:root:register user masked_fill to paddle.Tensor, remove this when fixed!\n",
-      "WARNING:root:register user masked_fill_ to paddle.Tensor, remove this when fixed!\n",
-      "WARNING:root:register user repeat to paddle.Tensor, remove this when fixed!\n",
-      "WARNING:root:register user glu to paddle.nn.functional, remove this when fixed!\n",
-      "WARNING:root:register user GLU to paddle.nn, remove this when fixed!\n",
-      "WARNING:root:register user ConstantPad2d to paddle.nn, remove this when fixed!\n",
-      "WARNING:root:override ctc_loss of paddle.nn.functional if exists, remove this when fixed!\n"
-     ]
-    }
-   ],
-   "source": [
-    "from deepspeech.modules import loss"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 4,
-   "id": "governmental-aircraft",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/home/ssd5/zhanghui/DeepSpeech2.x/tools/venv/lib/python3.7/site-packages/ipykernel/ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.\n",
-      "  and should_run_async(code)\n"
-     ]
-    }
-   ],
-   "source": [
-    "import paddle"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 5,
-   "id": "proprietary-disaster",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "<function deepspeech.modules.repeat(xs: paddle.VarBase, *size: Any) -> paddle.VarBase>"
-      ]
-     },
-     "execution_count": 5,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "paddle.Tensor.repeat"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 6,
-   "id": "first-diagram",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "<property at 0x7fb515eeeb88>"
-      ]
-     },
-     "execution_count": 6,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "paddle.Tensor.size"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 7,
-   "id": "intelligent-david",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "<function paddle.tensor.manipulation.concat(x, axis=0, name=None)>"
-      ]
-     },
-     "execution_count": 7,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "paddle.Tensor.cat"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 12,
-   "id": "bronze-tenant",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "a = paddle.to_tensor([12,32, 10, 12, 123,32 ,4])"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 13,
-   "id": "balanced-bearing",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "7"
-      ]
-     },
-     "execution_count": 13,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "a.size"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 20,
-   "id": "extreme-republic",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def size(xs: paddle.Tensor, *args: int) -> paddle.Tensor:\n",
-    "    nargs = len(args)\n",
-    "    assert (nargs <= 1)\n",
-    "    s = paddle.shape(xs)\n",
-    "    if nargs == 1:\n",
-    "        return s[args[0]]\n",
-    "    else:\n",
-    "        return s\n",
-    "\n",
-    "# logger.warn(\n",
-    "#     \"override size of paddle.Tensor if exists or register, remove this when fixed!\"\n",
-    "# )\n",
-    "paddle.Tensor.size = size"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 21,
-   "id": "gross-addiction",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "Tensor(shape=[1], dtype=int32, place=CPUPlace, stop_gradient=True,\n",
-       "       [7])"
-      ]
-     },
-     "execution_count": 21,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "a.size(0)\n",
-    "a.size()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 22,
-   "id": "adverse-dining",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "Tensor(shape=[1], dtype=int32, place=CPUPlace, stop_gradient=True,\n",
-       "       [7])"
-      ]
-     },
-     "execution_count": 22,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "a.size()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "popular-potato",
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.7.0"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
--- a/.notebook/jit_infer.ipynb
+++ b/.notebook/jit_infer.ipynb
@ -1,672 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "/home/ssd5/zhanghui/DeepSpeech2.x\n"
-     ]
-    },
-    {
-     "data": {
-      "text/plain": [
-       "'/home/ssd5/zhanghui/DeepSpeech2.x'"
-      ]
-     },
-     "execution_count": 1,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "%cd ..\n",
-    "%pwd"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "2021-03-26 02:55:23,873 - WARNING - register user softmax to paddle, remove this when fixed!\n",
-      "2021-03-26 02:55:23,875 - WARNING - register user sigmoid to paddle, remove this when fixed!\n",
-      "2021-03-26 02:55:23,875 - WARNING - register user relu to paddle, remove this when fixed!\n",
-      "2021-03-26 02:55:23,876 - WARNING - override cat of paddle if exists or register, remove this when fixed!\n",
-      "2021-03-26 02:55:23,876 - WARNING - override eq of paddle.Tensor if exists or register, remove this when fixed!\n",
-      "2021-03-26 02:55:23,877 - WARNING - override contiguous of paddle.Tensor if exists or register, remove this when fixed!\n",
-      "2021-03-26 02:55:23,877 - WARNING - override size of paddle.Tensor (`to_static` do not process `size` property, maybe some `paddle` api dependent on it), remove this when fixed!\n",
-      "2021-03-26 02:55:23,878 - WARNING - register user view to paddle.Tensor, remove this when fixed!\n",
-      "2021-03-26 02:55:23,878 - WARNING - register user view_as to paddle.Tensor, remove this when fixed!\n",
-      "2021-03-26 02:55:23,879 - WARNING - register user masked_fill to paddle.Tensor, remove this when fixed!\n",
-      "2021-03-26 02:55:23,880 - WARNING - register user masked_fill_ to paddle.Tensor, remove this when fixed!\n",
-      "2021-03-26 02:55:23,880 - WARNING - register user fill_ to paddle.Tensor, remove this when fixed!\n",
-      "2021-03-26 02:55:23,881 - WARNING - register user repeat to paddle.Tensor, remove this when fixed!\n",
-      "2021-03-26 02:55:23,881 - WARNING - register user softmax to paddle.Tensor, remove this when fixed!\n",
-      "2021-03-26 02:55:23,882 - WARNING - register user sigmoid to paddle.Tensor, remove this when fixed!\n",
-      "2021-03-26 02:55:23,882 - WARNING - register user relu to paddle.Tensor, remove this when fixed!\n",
-      "2021-03-26 02:55:23,883 - WARNING - register user glu to paddle.nn.functional, remove this when fixed!\n",
-      "2021-03-26 02:55:23,883 - WARNING - override ctc_loss of paddle.nn.functional if exists, remove this when fixed!\n",
-      "2021-03-26 02:55:23,884 - WARNING - register user GLU to paddle.nn, remove this when fixed!\n",
-      "2021-03-26 02:55:23,884 - WARNING - register user ConstantPad2d to paddle.nn, remove this when fixed!\n",
-      "/home/ssd5/zhanghui/DeepSpeech2.x/tools/venv-dev/lib/python3.7/site-packages/scipy/fftpack/__init__.py:103: DeprecationWarning: The module numpy.dual is deprecated.  Instead of using dual, use the functions directly from numpy or scipy.\n",
-      "  from numpy.dual import register_func\n",
-      "/home/ssd5/zhanghui/DeepSpeech2.x/tools/venv-dev/lib/python3.7/site-packages/scipy/special/orthogonal.py:81: DeprecationWarning: `np.int` is a deprecated alias for the builtin `int`. To silence this warning, use `int` by itself. Doing this will not modify any behavior and is safe. When replacing `np.int`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.\n",
-      "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
-      "  from numpy import (exp, inf, pi, sqrt, floor, sin, cos, around, int,\n"
-     ]
-    }
-   ],
-   "source": [
-    "import os\n",
-    "import time\n",
-    "import argparse\n",
-    "import functools\n",
-    "import paddle\n",
-    "import numpy as np\n",
-    "\n",
-    "from deepspeech.utils.socket_server import warm_up_test\n",
-    "from deepspeech.utils.socket_server import AsrTCPServer\n",
-    "from deepspeech.utils.socket_server import AsrRequestHandler\n",
-    "\n",
-    "from deepspeech.training.cli import default_argument_parser\n",
-    "from deepspeech.exps.deepspeech2.config import get_cfg_defaults\n",
-    "\n",
-    "from deepspeech.frontend.utility import read_manifest\n",
-    "from deepspeech.utils.utility import add_arguments, print_arguments\n",
-    "\n",
-    "from deepspeech.models.ds2 import DeepSpeech2Model\n",
-    "from deepspeech.models.ds2 import DeepSpeech2InferModel\n",
-    "from deepspeech.io.dataset import ManifestDataset\n",
-    "\n",
-    "\n",
-    "\n",
-    "from deepspeech.frontend.utility import read_manifest"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "0.0.0\n",
-      "e7f28d6c0db54eb9c9a810612300b526687e56a6\n",
-      "OFF\n",
-      "OFF\n",
-      "commit: e7f28d6c0db54eb9c9a810612300b526687e56a6\n",
-      "None\n",
-      "0\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/home/ssd5/zhanghui/DeepSpeech2.x/tools/venv-dev/lib/python3.7/site-packages/ipykernel/ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.\n",
-      "  and should_run_async(code)\n"
-     ]
-    },
-    {
-     "data": {
-      "text/plain": [
-       "['__builtins__',\n",
-       " '__cached__',\n",
-       " '__doc__',\n",
-       " '__file__',\n",
-       " '__loader__',\n",
-       " '__name__',\n",
-       " '__package__',\n",
-       " '__spec__',\n",
-       " 'commit',\n",
-       " 'full_version',\n",
-       " 'istaged',\n",
-       " 'major',\n",
-       " 'minor',\n",
-       " 'mkl',\n",
-       " 'patch',\n",
-       " 'rc',\n",
-       " 'show',\n",
-       " 'with_mkl']"
-      ]
-     },
-     "execution_count": 3,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "print(paddle.__version__)\n",
-    "print(paddle.version.commit)\n",
-    "print(paddle.version.with_mkl)\n",
-    "print(paddle.version.mkl())\n",
-    "print(paddle.version.show())\n",
-    "print(paddle.version.patch)\n",
-    "dir(paddle.version)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 4,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "data:\n",
-      "  augmentation_config: conf/augmentation.config\n",
-      "  batch_size: 64\n",
-      "  dev_manifest: data/manifest.dev\n",
-      "  keep_transcription_text: False\n",
-      "  max_duration: 27.0\n",
-      "  max_freq: None\n",
-      "  mean_std_filepath: examples/aishell/data/mean_std.npz\n",
-      "  min_duration: 0.0\n",
-      "  n_fft: None\n",
-      "  num_workers: 0\n",
-      "  random_seed: 0\n",
-      "  shuffle_method: batch_shuffle\n",
-      "  sortagrad: True\n",
-      "  specgram_type: linear\n",
-      "  stride_ms: 10.0\n",
-      "  target_dB: -20\n",
-      "  target_sample_rate: 16000\n",
-      "  test_manifest: examples/aishell/data/manifest.test\n",
-      "  train_manifest: data/manifest.train\n",
-      "  use_dB_normalization: True\n",
-      "  vocab_filepath: examples/aishell/data/vocab.txt\n",
-      "  window_ms: 20.0\n",
-      "decoding:\n",
-      "  alpha: 2.6\n",
-      "  batch_size: 128\n",
-      "  beam_size: 300\n",
-      "  beta: 5.0\n",
-      "  cutoff_prob: 0.99\n",
-      "  cutoff_top_n: 40\n",
-      "  decoding_method: ctc_beam_search\n",
-      "  error_rate_type: cer\n",
-      "  lang_model_path: data/lm/zh_giga.no_cna_cmn.prune01244.klm\n",
-      "  num_proc_bsearch: 10\n",
-      "model:\n",
-      "  num_conv_layers: 2\n",
-      "  num_rnn_layers: 3\n",
-      "  rnn_layer_size: 1024\n",
-      "  share_rnn_weights: False\n",
-      "  use_gru: True\n",
-      "training:\n",
-      "  global_grad_clip: 5.0\n",
-      "  lr: 0.0005\n",
-      "  lr_decay: 0.83\n",
-      "  n_epoch: 30\n",
-      "  weight_decay: 1e-06\n",
-      "-----------  Configuration Arguments -----------\n",
-      "checkpoint_path: examples/aishell/ckpt-loss2e-3-0.83-5/checkpoints/step-11725\n",
-      "config: examples/aishell/conf/deepspeech2.yaml\n",
-      "device: gpu\n",
-      "dump_config: None\n",
-      "export_path: None\n",
-      "host_ip: localhost\n",
-      "host_port: 8086\n",
-      "model_dir: None\n",
-      "model_file: examples/aishell/jit.model.pdmodel\n",
-      "nprocs: 1\n",
-      "opts: ['data.test_manifest', 'examples/aishell/data/manifest.test', 'data.mean_std_filepath', 'examples/aishell/data/mean_std.npz', 'data.vocab_filepath', 'examples/aishell/data/vocab.txt']\n",
-      "output: None\n",
-      "params_file: examples/aishell/jit.model.pdiparams\n",
-      "speech_save_dir: demo_cache\n",
-      "use_gpu: False\n",
-      "warmup_manifest: examples/aishell/data/manifest.test\n",
-      "------------------------------------------------\n"
-     ]
-    }
-   ],
-   "source": [
-    "parser = default_argument_parser()\n",
-    "add_arg = functools.partial(add_arguments, argparser=parser)\n",
-    "add_arg('host_ip',          str,\n",
-    "        'localhost',\n",
-    "        \"Server's IP address.\")\n",
-    "add_arg('host_port',        int,    8086,    \"Server's IP port.\")\n",
-    "add_arg('speech_save_dir',  str,\n",
-    "        'demo_cache',\n",
-    "        \"Directory to save demo audios.\")\n",
-    "add_arg('warmup_manifest',  \n",
-    "        str, \n",
-    "        \"examples/aishell/data/manifest.test\", \n",
-    "        \"Filepath of manifest to warm up.\")\n",
-    "add_arg(\n",
-    "    \"--model_file\",\n",
-    "    type=str,\n",
-    "    default=\"examples/aishell/jit.model.pdmodel\",\n",
-    "    help=\"Model filename, Specify this when your model is a combined model.\"\n",
-    ")\n",
-    "add_arg(\n",
-    "    \"--params_file\",\n",
-    "    type=str,\n",
-    "    default=\"examples/aishell/jit.model.pdiparams\",\n",
-    "    help=\n",
-    "    \"Parameter filename, Specify this when your model is a combined model.\"\n",
-    ")\n",
-    "add_arg(\n",
-    "    \"--model_dir\",\n",
-    "    type=str,\n",
-    "    default=None,\n",
-    "    help=\n",
-    "    \"Model dir, If you load a non-combined model, specify the directory of the model.\"\n",
-    ")\n",
-    "add_arg(\"--use_gpu\",type=bool,default=False, help=\"Whether use gpu.\")\n",
-    "\n",
-    "\n",
-    "args = parser.parse_args(\n",
-    "    \"--checkpoint_path examples/aishell/ckpt-loss2e-3-0.83-5/checkpoints/step-11725 --config examples/aishell/conf/deepspeech2.yaml --opts data.test_manifest examples/aishell/data/manifest.test data.mean_std_filepath examples/aishell/data/mean_std.npz  data.vocab_filepath examples/aishell/data/vocab.txt\".split()\n",
-    ")\n",
-    "\n",
-    "\n",
-    "config = get_cfg_defaults()\n",
-    "if args.config:\n",
-    "    config.merge_from_file(args.config)\n",
-    "if args.opts:\n",
-    "    config.merge_from_list(args.opts)\n",
-    "config.freeze()\n",
-    "print(config)\n",
-    "\n",
-    "args.warmup_manifest = config.data.test_manifest\n",
-    "\n",
-    "print_arguments(args)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 5,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "dataset = ManifestDataset(\n",
-    "        config.data.test_manifest,\n",
-    "        config.data.unit_type,\n",
-    "        config.data.vocab_filepath,\n",
-    "        config.data.mean_std_filepath,\n",
-    "        augmentation_config=\"{}\",\n",
-    "        max_duration=config.data.max_duration,\n",
-    "        min_duration=config.data.min_duration,\n",
-    "        stride_ms=config.data.stride_ms,\n",
-    "        window_ms=config.data.window_ms,\n",
-    "        n_fft=config.data.n_fft,\n",
-    "        max_freq=config.data.max_freq,\n",
-    "        target_sample_rate=config.data.target_sample_rate,\n",
-    "        specgram_type=config.data.specgram_type,\n",
-    "        feat_dim=config.data.feat_dim,\n",
-    "        delta_delta=config.data.delat_delta,\n",
-    "        use_dB_normalization=config.data.use_dB_normalization,\n",
-    "        target_dB=config.data.target_dB,\n",
-    "        random_seed=config.data.random_seed,\n",
-    "        keep_transcription_text=True)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 6,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "2021-03-26 02:55:57,930 - INFO - [checkpoint] Rank 0: loaded model from examples/aishell/ckpt-loss2e-3-0.83-5/checkpoints/step-11725.pdparams\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "layer summary:\n",
-      "encoder.conv.conv_in.conv.weight|[32, 1, 41, 11]|14432\n",
-      "encoder.conv.conv_in.bn.weight|[32]|32\n",
-      "encoder.conv.conv_in.bn.bias|[32]|32\n",
-      "encoder.conv.conv_in.bn._mean|[32]|32\n",
-      "encoder.conv.conv_in.bn._variance|[32]|32\n",
-      "encoder.conv.conv_stack.0.conv.weight|[32, 32, 21, 11]|236544\n",
-      "encoder.conv.conv_stack.0.bn.weight|[32]|32\n",
-      "encoder.conv.conv_stack.0.bn.bias|[32]|32\n",
-      "encoder.conv.conv_stack.0.bn._mean|[32]|32\n",
-      "encoder.conv.conv_stack.0.bn._variance|[32]|32\n",
-      "encoder.rnn.rnn_stacks.0.fw_fc.weight|[1312, 3072]|4030464\n",
-      "encoder.rnn.rnn_stacks.0.fw_bn.weight|[3072]|3072\n",
-      "encoder.rnn.rnn_stacks.0.fw_bn.bias|[3072]|3072\n",
-      "encoder.rnn.rnn_stacks.0.fw_bn._mean|[3072]|3072\n",
-      "encoder.rnn.rnn_stacks.0.fw_bn._variance|[3072]|3072\n",
-      "encoder.rnn.rnn_stacks.0.bw_fc.weight|[1312, 3072]|4030464\n",
-      "encoder.rnn.rnn_stacks.0.bw_bn.weight|[3072]|3072\n",
-      "encoder.rnn.rnn_stacks.0.bw_bn.bias|[3072]|3072\n",
-      "encoder.rnn.rnn_stacks.0.bw_bn._mean|[3072]|3072\n",
-      "encoder.rnn.rnn_stacks.0.bw_bn._variance|[3072]|3072\n",
-      "encoder.rnn.rnn_stacks.0.fw_cell.weight_hh|[3072, 1024]|3145728\n",
-      "encoder.rnn.rnn_stacks.0.fw_cell.bias_hh|[3072]|3072\n",
-      "encoder.rnn.rnn_stacks.0.bw_cell.weight_hh|[3072, 1024]|3145728\n",
-      "encoder.rnn.rnn_stacks.0.bw_cell.bias_hh|[3072]|3072\n",
-      "encoder.rnn.rnn_stacks.0.fw_rnn.cell.weight_hh|[3072, 1024]|3145728\n",
-      "encoder.rnn.rnn_stacks.0.fw_rnn.cell.bias_hh|[3072]|3072\n",
-      "encoder.rnn.rnn_stacks.0.bw_rnn.cell.weight_hh|[3072, 1024]|3145728\n",
-      "encoder.rnn.rnn_stacks.0.bw_rnn.cell.bias_hh|[3072]|3072\n",
-      "encoder.rnn.rnn_stacks.1.fw_fc.weight|[2048, 3072]|6291456\n",
-      "encoder.rnn.rnn_stacks.1.fw_bn.weight|[3072]|3072\n",
-      "encoder.rnn.rnn_stacks.1.fw_bn.bias|[3072]|3072\n",
-      "encoder.rnn.rnn_stacks.1.fw_bn._mean|[3072]|3072\n",
-      "encoder.rnn.rnn_stacks.1.fw_bn._variance|[3072]|3072\n",
-      "encoder.rnn.rnn_stacks.1.bw_fc.weight|[2048, 3072]|6291456\n",
-      "encoder.rnn.rnn_stacks.1.bw_bn.weight|[3072]|3072\n",
-      "encoder.rnn.rnn_stacks.1.bw_bn.bias|[3072]|3072\n",
-      "encoder.rnn.rnn_stacks.1.bw_bn._mean|[3072]|3072\n",
-      "encoder.rnn.rnn_stacks.1.bw_bn._variance|[3072]|3072\n",
-      "encoder.rnn.rnn_stacks.1.fw_cell.weight_hh|[3072, 1024]|3145728\n",
-      "encoder.rnn.rnn_stacks.1.fw_cell.bias_hh|[3072]|3072\n",
-      "encoder.rnn.rnn_stacks.1.bw_cell.weight_hh|[3072, 1024]|3145728\n",
-      "encoder.rnn.rnn_stacks.1.bw_cell.bias_hh|[3072]|3072\n",
-      "encoder.rnn.rnn_stacks.1.fw_rnn.cell.weight_hh|[3072, 1024]|3145728\n",
-      "encoder.rnn.rnn_stacks.1.fw_rnn.cell.bias_hh|[3072]|3072\n",
-      "encoder.rnn.rnn_stacks.1.bw_rnn.cell.weight_hh|[3072, 1024]|3145728\n",
-      "encoder.rnn.rnn_stacks.1.bw_rnn.cell.bias_hh|[3072]|3072\n",
-      "encoder.rnn.rnn_stacks.2.fw_fc.weight|[2048, 3072]|6291456\n",
-      "encoder.rnn.rnn_stacks.2.fw_bn.weight|[3072]|3072\n",
-      "encoder.rnn.rnn_stacks.2.fw_bn.bias|[3072]|3072\n",
-      "encoder.rnn.rnn_stacks.2.fw_bn._mean|[3072]|3072\n",
-      "encoder.rnn.rnn_stacks.2.fw_bn._variance|[3072]|3072\n",
-      "encoder.rnn.rnn_stacks.2.bw_fc.weight|[2048, 3072]|6291456\n",
-      "encoder.rnn.rnn_stacks.2.bw_bn.weight|[3072]|3072\n",
-      "encoder.rnn.rnn_stacks.2.bw_bn.bias|[3072]|3072\n",
-      "encoder.rnn.rnn_stacks.2.bw_bn._mean|[3072]|3072\n",
-      "encoder.rnn.rnn_stacks.2.bw_bn._variance|[3072]|3072\n",
-      "encoder.rnn.rnn_stacks.2.fw_cell.weight_hh|[3072, 1024]|3145728\n",
-      "encoder.rnn.rnn_stacks.2.fw_cell.bias_hh|[3072]|3072\n",
-      "encoder.rnn.rnn_stacks.2.bw_cell.weight_hh|[3072, 1024]|3145728\n",
-      "encoder.rnn.rnn_stacks.2.bw_cell.bias_hh|[3072]|3072\n",
-      "encoder.rnn.rnn_stacks.2.fw_rnn.cell.weight_hh|[3072, 1024]|3145728\n",
-      "encoder.rnn.rnn_stacks.2.fw_rnn.cell.bias_hh|[3072]|3072\n",
-      "encoder.rnn.rnn_stacks.2.bw_rnn.cell.weight_hh|[3072, 1024]|3145728\n",
-      "encoder.rnn.rnn_stacks.2.bw_rnn.cell.bias_hh|[3072]|3072\n",
-      "decoder.ctc_lo.weight|[2048, 4300]|8806400\n",
-      "decoder.ctc_lo.bias|[4300]|4300\n",
-      "layer has 66 parameters, 80148012 elements.\n"
-     ]
-    }
-   ],
-   "source": [
-    "model = DeepSpeech2InferModel.from_pretrained(dataset, config,\n",
-    "                                             args.checkpoint_path)\n",
-    "model.eval()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 22,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "\n",
-      "examples/aishell/jit.model.pdmodel\n",
-      "examples/aishell/jit.model.pdiparams\n",
-      "0\n",
-      "False\n"
-     ]
-    }
-   ],
-   "source": [
-    "\n",
-    "from paddle.inference import Config\n",
-    "from paddle.inference import PrecisionType\n",
-    "from paddle.inference import create_predictor\n",
-    "\n",
-    "args.use_gpu=False\n",
-    "paddle.set_device('cpu')\n",
-    "\n",
-    "def init_predictor(args):\n",
-    "    if args.model_dir is not None:\n",
-    "        config = Config(args.model_dir)\n",
-    "    else:\n",
-    "        config = Config(args.model_file, args.params_file)\n",
-    "\n",
-    "    if args.use_gpu:\n",
-    "        config.enable_use_gpu(memory_pool_init_size_mb=1000, device_id=0)\n",
-    "#         config.enable_tensorrt_engine(precision_mode=PrecisionType.Float32,\n",
-    "#                               use_calib_mode=True) # 开启TensorRT预测，精度为fp32，开启int8离线量化\n",
-    "    else:\n",
-    "        # If not specific mkldnn, you can set the blas thread.\n",
-    "        # The thread num should not be greater than the number of cores in the CPU.\n",
-    "        config.set_cpu_math_library_num_threads(1)\n",
-    "        config.enable_mkldnn()\n",
-    "        \n",
-    "    config.enable_memory_optim()\n",
-    "    config.switch_ir_optim(True)\n",
-    "    \n",
-    "    print(config.model_dir())\n",
-    "    print(config.prog_file())\n",
-    "    print(config.params_file())\n",
-    "    print(config.gpu_device_id())\n",
-    "    print(args.use_gpu)\n",
-    "    predictor = create_predictor(config)\n",
-    "    return predictor\n",
-    "\n",
-    "def run(predictor, audio, audio_len):\n",
-    "    # copy img data to input tensor\n",
-    "    input_names = predictor.get_input_names()\n",
-    "    for i, name in enumerate(input_names):\n",
-    "        print(\"input:\", i, name)\n",
-    "        \n",
-    "    audio_tensor = predictor.get_input_handle('audio')\n",
-    "    audio_tensor.reshape(audio.shape)\n",
-    "    audio_tensor.copy_from_cpu(audio.copy())\n",
-    "    \n",
-    "    audiolen_tensor = predictor.get_input_handle('audio_len')\n",
-    "    audiolen_tensor.reshape(audio_len.shape)\n",
-    "    audiolen_tensor.copy_from_cpu(audio_len.copy())\n",
-    "\n",
-    "    output_names = predictor.get_output_names()\n",
-    "    for i, name in enumerate(output_names):\n",
-    "        print(\"output:\", i, name)\n",
-    "\n",
-    "    # do the inference\n",
-    "    predictor.run()\n",
-    "\n",
-    "    results = []\n",
-    "    # get out data from output tensor\n",
-    "    output_names = predictor.get_output_names()\n",
-    "    for i, name in enumerate(output_names):\n",
-    "        output_tensor = predictor.get_output_handle(name)\n",
-    "        output_data = output_tensor.copy_to_cpu()\n",
-    "        results.append(output_data)\n",
-    "\n",
-    "    return results\n",
-    "\n",
-    "\n",
-    "predictor = init_predictor(args)\n",
-    "\n",
-    "def file_to_transcript(filename):\n",
-    "        print(filename)\n",
-    "        feature = dataset.process_utterance(filename, \"\")\n",
-    "        audio = np.array([feature[0]]).astype('float32')  #[1, D, T]\n",
-    "        audio_len = feature[0].shape[1]\n",
-    "        audio_len = np.array([audio_len]).astype('int64')  # [1]\n",
-    "        \n",
-    "        \n",
-    "        i_probs = run(predictor, audio, audio_len)\n",
-    "        print('jit:', i_probs[0], type(i_probs[0]))\n",
-    "        \n",
-    "        audio = paddle.to_tensor(audio)\n",
-    "        audio_len = paddle.to_tensor(audio_len)\n",
-    "        print(audio.shape)\n",
-    "        print(audio_len.shape)\n",
-    "        \n",
-    "        #eouts, eouts_len = model.encoder(audio, audio_len)\n",
-    "        #probs = model.decoder.softmax(eouts)\n",
-    "        probs = model.forward(audio, audio_len)\n",
-    "        print('paddle:', probs.numpy())\n",
-    "        \n",
-    "        flag = np.allclose(i_probs[0], probs.numpy())\n",
-    "        print(flag)\n",
-    "        \n",
-    "        return probs\n",
-    "\n",
-    "#         result_transcript = model.decode(\n",
-    "#             audio,\n",
-    "#             audio_len,\n",
-    "#             vocab_list=dataset.vocab_list,\n",
-    "#             decoding_method=config.decoding.decoding_method,\n",
-    "#             lang_model_path=config.decoding.lang_model_path,\n",
-    "#             beam_alpha=config.decoding.alpha,\n",
-    "#             beam_beta=config.decoding.beta,\n",
-    "#             beam_size=config.decoding.beam_size,\n",
-    "#             cutoff_prob=config.decoding.cutoff_prob,\n",
-    "#             cutoff_top_n=config.decoding.cutoff_top_n,\n",
-    "#             num_processes=config.decoding.num_proc_bsearch)\n",
-    "#         return result_transcript[0]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 23,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Warm-up Test Case %d: %s 0 /home/ssd5/zhanghui/DeepSpeech2.x/examples/aishell/../dataset/aishell/data_aishell/wav/test/S0764/BAC009S0764W0124.wav\n",
-      "/home/ssd5/zhanghui/DeepSpeech2.x/examples/aishell/../dataset/aishell/data_aishell/wav/test/S0764/BAC009S0764W0124.wav\n",
-      "input: 0 audio\n",
-      "input: 1 audio_len\n",
-      "output: 0 tmp_75\n",
-      "jit: [[[8.91786298e-12 4.45648032e-12 3.67572750e-09 ... 8.91767563e-12\n",
-      "   8.91573707e-12 4.64317296e-08]\n",
-      "  [1.55950222e-15 2.62794089e-14 4.50423509e-12 ... 1.55944271e-15\n",
-      "   1.55891342e-15 9.99992609e-01]\n",
-      "  [1.24638127e-17 7.61802427e-16 2.93265812e-14 ... 1.24633371e-17\n",
-      "   1.24587264e-17 1.00000000e+00]\n",
-      "  ...\n",
-      "  [4.37488240e-15 2.43676260e-12 1.98770514e-12 ... 4.37479896e-15\n",
-      "   4.37354747e-15 1.00000000e+00]\n",
-      "  [3.89334696e-13 1.66754856e-11 1.42900388e-11 ... 3.89329492e-13\n",
-      "   3.89252270e-13 1.00000000e+00]\n",
-      "  [1.00349985e-10 2.56293708e-10 2.91177582e-10 ... 1.00347876e-10\n",
-      "   1.00334095e-10 9.99998808e-01]]] <class 'numpy.ndarray'>\n",
-      "[1, 161, 522]\n",
-      "[1]\n",
-      "paddle: [[[8.91789680e-12 4.45649724e-12 3.67574149e-09 ... 8.91770945e-12\n",
-      "   8.91577090e-12 4.64319072e-08]\n",
-      "  [1.55950222e-15 2.62794089e-14 4.50423509e-12 ... 1.55944271e-15\n",
-      "   1.55891342e-15 9.99992609e-01]\n",
-      "  [1.24638599e-17 7.61805339e-16 2.93267472e-14 ... 1.24633842e-17\n",
-      "   1.24587735e-17 1.00000000e+00]\n",
-      "  ...\n",
-      "  [4.37488240e-15 2.43676737e-12 1.98770514e-12 ... 4.37479896e-15\n",
-      "   4.37354747e-15 1.00000000e+00]\n",
-      "  [3.89336187e-13 1.66755481e-11 1.42900925e-11 ... 3.89330983e-13\n",
-      "   3.89253761e-13 1.00000000e+00]\n",
-      "  [1.00349985e-10 2.56293708e-10 2.91177582e-10 ... 1.00347876e-10\n",
-      "   1.00334095e-10 9.99998808e-01]]]\n",
-      "False\n"
-     ]
-    }
-   ],
-   "source": [
-    "manifest = read_manifest(args.warmup_manifest)\n",
-    "\n",
-    "for idx, sample in enumerate(manifest[:1]):\n",
-    "    print(\"Warm-up Test Case %d: %s\", idx, sample['audio_filepath'])\n",
-    "    start_time = time.time()\n",
-    "    transcript = file_to_transcript(sample['audio_filepath'])\n",
-    "    finish_time = time.time()\n",
-    "#     print(\"Response Time: %f, Transcript: %s\" %\n",
-    "#           (finish_time - start_time, transcript))\n",
-    "    break"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 21,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "(1, 161, 522) (1,)\n",
-      "input: 0 audio\n",
-      "input: 1 audio_len\n",
-      "output: 0 tmp_75\n",
-      "jit: [[[8.91789680e-12 4.45649724e-12 3.67574149e-09 ... 8.91770945e-12\n",
-      "   8.91577090e-12 4.64319072e-08]\n",
-      "  [1.55950222e-15 2.62794089e-14 4.50423509e-12 ... 1.55944271e-15\n",
-      "   1.55891342e-15 9.99992609e-01]\n",
-      "  [1.24638599e-17 7.61805339e-16 2.93267472e-14 ... 1.24633842e-17\n",
-      "   1.24587735e-17 1.00000000e+00]\n",
-      "  ...\n",
-      "  [4.37488240e-15 2.43676737e-12 1.98770514e-12 ... 4.37479896e-15\n",
-      "   4.37354747e-15 1.00000000e+00]\n",
-      "  [3.89336187e-13 1.66755481e-11 1.42900925e-11 ... 3.89330983e-13\n",
-      "   3.89253761e-13 1.00000000e+00]\n",
-      "  [1.00349985e-10 2.56293708e-10 2.91177582e-10 ... 1.00347876e-10\n",
-      "   1.00334095e-10 9.99998808e-01]]]\n"
-     ]
-    }
-   ],
-   "source": [
-    "def test(filename):\n",
-    "    feature = dataset.process_utterance(filename, \"\")\n",
-    "    audio = np.array([feature[0]]).astype('float32')  #[1, D, T]\n",
-    "    audio_len = feature[0].shape[1]\n",
-    "    audio_len = np.array([audio_len]).astype('int64')  # [1]\n",
-    "    \n",
-    "    print(audio.shape, audio_len.shape)\n",
-    "\n",
-    "    i_probs = run(predictor, audio, audio_len)\n",
-    "    print('jit:', i_probs[0])\n",
-    "    return i_probs\n",
-    "    \n",
-    "probs = test(sample['audio_filepath'])"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.7.0"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}
--- a/.notebook/layer_norm_test.ipynb
+++ b/.notebook/layer_norm_test.ipynb
@ -1,229 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": 32,
-   "id": "academic-surname",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import paddle\n",
-    "from paddle import nn"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 33,
-   "id": "fundamental-treasure",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Parameter containing:\n",
-      "Tensor(shape=[256], dtype=float32, place=CUDAPlace(0), stop_gradient=False,\n",
-      "       [1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])\n",
-      "Parameter containing:\n",
-      "Tensor(shape=[256], dtype=float32, place=CUDAPlace(0), stop_gradient=False,\n",
-      "       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])\n"
-     ]
-    }
-   ],
-   "source": [
-    "L = nn.LayerNorm(256, epsilon=1e-12)\n",
-    "for p in L.parameters():\n",
-    "    print(p)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 34,
-   "id": "consolidated-elephant",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import numpy as np\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 46,
-   "id": "moderate-noise",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "float64\n"
-     ]
-    }
-   ],
-   "source": [
-    "x = np.random.randn(2, 51, 256)\n",
-    "print(x.dtype)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 47,
-   "id": "cooked-progressive",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "y = L(paddle.to_tensor(x, dtype='float32'))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 48,
-   "id": "optimum-milwaukee",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import torch"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 49,
-   "id": "viral-indian",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Parameter containing:\n",
-      "tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n",
-      "        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n",
-      "        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n",
-      "        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n",
-      "        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n",
-      "        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n",
-      "        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n",
-      "        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n",
-      "        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n",
-      "        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n",
-      "        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n",
-      "        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n",
-      "        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n",
-      "        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n",
-      "        1., 1., 1., 1.], requires_grad=True)\n",
-      "Parameter containing:\n",
-      "tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
-      "        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
-      "        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
-      "        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
-      "        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
-      "        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
-      "        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
-      "        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
-      "        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
-      "        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
-      "        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],\n",
-      "       requires_grad=True)\n"
-     ]
-    }
-   ],
-   "source": [
-    "TL = torch.nn.LayerNorm(256, eps=1e-12)\n",
-    "for p in TL.parameters():\n",
-    "    print(p)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 50,
-   "id": "skilled-vietnamese",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "ty = TL(torch.tensor(x, dtype=torch.float32))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 51,
-   "id": "incorrect-allah",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "False"
-      ]
-     },
-     "execution_count": 51,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "np.allclose(y.numpy(), ty.detach().numpy())"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "prostate-cameroon",
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 52,
-   "id": "governmental-surge",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "True"
-      ]
-     },
-     "execution_count": 52,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "x = np.random.randn(2, 256)\n",
-    "y = L(paddle.to_tensor(x, dtype='float32'))\n",
-    "ty = TL(torch.tensor(x, dtype=torch.float32))\n",
-    "np.allclose(y.numpy(), ty.detach().numpy())"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "confidential-jacket",
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.7.0"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
--- a/.notebook/mask_and_masked_fill_test.ipynb
+++ b/.notebook/mask_and_masked_fill_test.ipynb
@ -1,449 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "id": "primary-organic",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import torch"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 38,
-   "id": "stopped-semester",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def mask_finished_scores(score: torch.Tensor,\n",
-    "                         flag: torch.Tensor) -> torch.Tensor:\n",
-    "    \"\"\"\n",
-    "    If a sequence is finished, we only allow one alive branch. This function\n",
-    "    aims to give one branch a zero score and the rest -inf score.\n",
-    "    Args:\n",
-    "        score (torch.Tensor): A real value array with shape\n",
-    "            (batch_size * beam_size, beam_size).\n",
-    "        flag (torch.Tensor): A bool array with shape\n",
-    "            (batch_size * beam_size, 1).\n",
-    "    Returns:\n",
-    "        torch.Tensor: (batch_size * beam_size, beam_size).\n",
-    "    \"\"\"\n",
-    "    beam_size = score.size(-1)\n",
-    "    zero_mask = torch.zeros_like(flag, dtype=torch.bool)\n",
-    "    if beam_size > 1:\n",
-    "        unfinished = torch.cat((zero_mask, flag.repeat([1, beam_size - 1])),\n",
-    "                               dim=1)\n",
-    "        finished = torch.cat((flag, zero_mask.repeat([1, beam_size - 1])),\n",
-    "                             dim=1)\n",
-    "    else:\n",
-    "        unfinished = zero_mask\n",
-    "        finished = flag\n",
-    "    print(unfinished)\n",
-    "    print(finished)\n",
-    "    score.masked_fill_(unfinished, -float('inf'))\n",
-    "    score.masked_fill_(finished, 0)\n",
-    "    return score"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 58,
-   "id": "agreed-portuguese",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "tensor([[ True],\n",
-      "        [False]])\n",
-      "tensor([[-0.8841,  0.7381, -0.9986],\n",
-      "        [ 0.2675, -0.7971,  0.3798]])\n",
-      "tensor([[ True,  True],\n",
-      "        [False, False]])\n"
-     ]
-    }
-   ],
-   "source": [
-    "score = torch.randn((2, 3))\n",
-    "flag = torch.ones((2, 1), dtype=torch.bool)\n",
-    "flag[1] = False\n",
-    "print(flag)\n",
-    "print(score)\n",
-    "print(flag.repeat([1, 2]))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 59,
-   "id": "clean-aspect",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "tensor([[False,  True,  True],\n",
-      "        [False, False, False]])\n",
-      "tensor([[ True, False, False],\n",
-      "        [False, False, False]])\n",
-      "tensor([[ 0.0000,    -inf,    -inf],\n",
-      "        [ 0.2675, -0.7971,  0.3798]])\n",
-      "tensor([[ 0.0000,    -inf,    -inf],\n",
-      "        [ 0.2675, -0.7971,  0.3798]])\n"
-     ]
-    }
-   ],
-   "source": [
-    "r  = mask_finished_scores(score, flag)\n",
-    "print(r)\n",
-    "print(score)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 55,
-   "id": "thrown-airline",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Tensor(shape=[2, 1], dtype=bool, place=CUDAPlace(0), stop_gradient=True,\n",
-      "       [[True ],\n",
-      "        [False]])\n",
-      "Tensor(shape=[2, 3], dtype=float32, place=CUDAPlace(0), stop_gradient=True,\n",
-      "       [[ 2.05994511,  1.87704289,  0.01988174],\n",
-      "        [-0.40165186,  0.77547729, -0.64469045]])\n",
-      "Tensor(shape=[2, 2], dtype=bool, place=CUDAPlace(0), stop_gradient=True,\n",
-      "       [[True , True ],\n",
-      "        [False, False]])\n"
-     ]
-    }
-   ],
-   "source": [
-    "import paddle\n",
-    "\n",
-    "score = paddle.randn((2, 3))\n",
-    "flag = paddle.ones((2, 1), dtype='bool')\n",
-    "flag[1] = False\n",
-    "print(flag)\n",
-    "print(score)\n",
-    "print(flag.tile([1, 2]))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 56,
-   "id": "internal-patent",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Tensor(shape=[2, 3], dtype=bool, place=CUDAPlace(0), stop_gradient=True,\n",
-      "       [[False, True , True ],\n",
-      "        [False, False, False]])\n",
-      "Tensor(shape=[2, 3], dtype=bool, place=CUDAPlace(0), stop_gradient=True,\n",
-      "       [[True , False, False],\n",
-      "        [False, False, False]])\n",
-      "x Tensor(shape=[2, 3], dtype=float32, place=CUDAPlace(0), stop_gradient=True,\n",
-      "       [[ 2.05994511,  1.87704289,  0.01988174],\n",
-      "        [-0.40165186,  0.77547729, -0.64469045]])\n",
-      "2 Tensor(shape=[2, 3], dtype=float32, place=CUDAPlace(0), stop_gradient=True,\n",
-      "       [[ 2.05994511,  1.87704289,  0.01988174],\n",
-      "        [-0.40165186,  0.77547729, -0.64469045]])\n",
-      "3 Tensor(shape=[2, 3], dtype=float32, place=CUDAPlace(0), stop_gradient=True,\n",
-      "       [[ 2.05994511, -inf.      , -inf.      ],\n",
-      "        [-0.40165186,  0.77547729, -0.64469045]])\n",
-      "x Tensor(shape=[2, 3], dtype=float32, place=CUDAPlace(0), stop_gradient=True,\n",
-      "       [[ 2.05994511, -inf.      , -inf.      ],\n",
-      "        [-0.40165186,  0.77547729, -0.64469045]])\n",
-      "2 Tensor(shape=[2, 3], dtype=float32, place=CUDAPlace(0), stop_gradient=True,\n",
-      "       [[ 2.05994511, -inf.      , -inf.      ],\n",
-      "        [-0.40165186,  0.77547729, -0.64469045]])\n",
-      "3 Tensor(shape=[2, 3], dtype=float32, place=CUDAPlace(0), stop_gradient=True,\n",
-      "       [[ 0.        , -inf.      , -inf.      ],\n",
-      "        [-0.40165186,  0.77547729, -0.64469045]])\n",
-      "Tensor(shape=[2, 3], dtype=float32, place=CUDAPlace(0), stop_gradient=True,\n",
-      "       [[ 0.        , -inf.      , -inf.      ],\n",
-      "        [-0.40165186,  0.77547729, -0.64469045]])\n"
-     ]
-    }
-   ],
-   "source": [
-    "paddle.bool = 'bool'\n",
-    "\n",
-    "def masked_fill(xs:paddle.Tensor, mask:paddle.Tensor, value:float):\n",
-    "    print(xs)\n",
-    "    trues = paddle.ones_like(xs) * value\n",
-    "    assert xs.shape == mask.shape\n",
-    "    xs = paddle.where(mask, trues, xs)\n",
-    "    return xs\n",
-    "\n",
-    "def masked_fill_(xs:paddle.Tensor, mask:paddle.Tensor, value:float):\n",
-    "    print('x', xs)\n",
-    "    trues = paddle.ones_like(xs) * value\n",
-    "    assert xs.shape == mask.shape\n",
-    "    ret = paddle.where(mask, trues, xs)\n",
-    "    print('2', xs)\n",
-    "    paddle.assign(ret, output=xs)\n",
-    "    print('3', xs)\n",
-    "\n",
-    "paddle.Tensor.masked_fill = masked_fill\n",
-    "paddle.Tensor.masked_fill_ = masked_fill_\n",
-    "\n",
-    "def mask_finished_scores_pd(score: paddle.Tensor,\n",
-    "                         flag: paddle.Tensor) -> paddle.Tensor:\n",
-    "    \"\"\"\n",
-    "    If a sequence is finished, we only allow one alive branch. This function\n",
-    "    aims to give one branch a zero score and the rest -inf score.\n",
-    "    Args:\n",
-    "        score (torch.Tensor): A real value array with shape\n",
-    "            (batch_size * beam_size, beam_size).\n",
-    "        flag (torch.Tensor): A bool array with shape\n",
-    "            (batch_size * beam_size, 1).\n",
-    "    Returns:\n",
-    "        torch.Tensor: (batch_size * beam_size, beam_size).\n",
-    "    \"\"\"\n",
-    "    beam_size = score.shape[-1]\n",
-    "    zero_mask = paddle.zeros_like(flag, dtype=paddle.bool)\n",
-    "    if beam_size > 1:\n",
-    "        unfinished = paddle.concat((zero_mask, flag.tile([1, beam_size - 1])),\n",
-    "                               axis=1)\n",
-    "        finished = paddle.concat((flag, zero_mask.tile([1, beam_size - 1])),\n",
-    "                             axis=1)\n",
-    "    else:\n",
-    "        unfinished = zero_mask\n",
-    "        finished = flag\n",
-    "    print(unfinished)\n",
-    "    print(finished)\n",
-    "    \n",
-    "    #score.masked_fill_(unfinished, -float('inf'))\n",
-    "    #score.masked_fill_(finished, 0)\n",
-    "#     infs = paddle.ones_like(score) * -float('inf')\n",
-    "#     score = paddle.where(unfinished, infs, score)\n",
-    "#     score = paddle.where(finished, paddle.zeros_like(score), score)\n",
-    "\n",
-    "#     score = score.masked_fill(unfinished, -float('inf'))\n",
-    "#     score = score.masked_fill(finished, 0)\n",
-    "    score.masked_fill_(unfinished, -float('inf'))\n",
-    "    score.masked_fill_(finished, 0)\n",
-    "    return score\n",
-    "\n",
-    "r  = mask_finished_scores_pd(score, flag)\n",
-    "print(r)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 57,
-   "id": "vocal-prime",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "<bound method PyCapsule.value of Tensor(shape=[2, 3], dtype=float32, place=CUDAPlace(0), stop_gradient=True,\n",
-       "       [[ 0.        , -inf.      , -inf.      ],\n",
-       "        [-0.40165186,  0.77547729, -0.64469045]])>"
-      ]
-     },
-     "execution_count": 57,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "score.value"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 71,
-   "id": "bacterial-adolescent",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from typing import Union, Any"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 72,
-   "id": "absent-fiber",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def repeat(xs : paddle.Tensor, *size: Any):\n",
-    "    print(size)\n",
-    "    return paddle.tile(xs, size)\n",
-    "paddle.Tensor.repeat = repeat"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 73,
-   "id": "material-harbor",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "(1, 2)\n",
-      "Tensor(shape=[2, 2], dtype=bool, place=CUDAPlace(0), stop_gradient=True,\n",
-      "       [[True , True ],\n",
-      "        [False, False]])\n"
-     ]
-    }
-   ],
-   "source": [
-    "flag = paddle.ones((2, 1), dtype='bool')\n",
-    "flag[1] = False\n",
-    "print(flag.repeat(1, 2))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 84,
-   "id": "acute-brighton",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "(Tensor(shape=[1], dtype=int64, place=CUDAPlace(0), stop_gradient=True,\n",
-      "       [1]), 2)\n",
-      "Tensor(shape=[2, 2], dtype=bool, place=CUDAPlace(0), stop_gradient=True,\n",
-      "       [[True , True ],\n",
-      "        [False, False]])\n"
-     ]
-    }
-   ],
-   "source": [
-    "flag = paddle.ones((2, 1), dtype='bool')\n",
-    "flag[1] = False\n",
-    "print(flag.repeat(paddle.to_tensor(1), 2))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 85,
-   "id": "european-rugby",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def size(xs, *args: int):\n",
-    "    nargs = len(args)\n",
-    "    s = paddle.shape(xs)\n",
-    "    assert(nargs <= 1)\n",
-    "    if nargs == 1:\n",
-    "        return s[args[0]]\n",
-    "    else:\n",
-    "        return s\n",
-    "paddle.Tensor.size = size"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 86,
-   "id": "moral-special",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "Tensor(shape=[2], dtype=int32, place=CPUPlace, stop_gradient=True,\n",
-       "       [2, 1])"
-      ]
-     },
-     "execution_count": 86,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "flag.size()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 87,
-   "id": "ahead-coach",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "Tensor(shape=[1], dtype=int32, place=CPUPlace, stop_gradient=True,\n",
-       "       [1])"
-      ]
-     },
-     "execution_count": 87,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "flag.size(1)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 88,
-   "id": "incomplete-fitness",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "Tensor(shape=[1], dtype=int32, place=CPUPlace, stop_gradient=True,\n",
-       "       [2])"
-      ]
-     },
-     "execution_count": 88,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "flag.size(0)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "upset-connectivity",
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.7.0"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
--- a/.notebook/position_embeding_check.ipynb
+++ b/.notebook/position_embeding_check.ipynb
@ -1,231 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "id": "designing-borough",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/ipykernel/ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.\n",
-      "  and should_run_async(code)\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "[[ 0.0000000e+00  0.0000000e+00  0.0000000e+00 ...  0.0000000e+00\n",
-      "   0.0000000e+00  0.0000000e+00]\n",
-      " [ 8.4147096e-01  8.0196178e-01  7.6172036e-01 ...  1.2409373e-04\n",
-      "   1.1547816e-04  1.0746076e-04]\n",
-      " [ 9.0929741e-01  9.5814437e-01  9.8704624e-01 ...  2.4818745e-04\n",
-      "   2.3095631e-04  2.1492151e-04]\n",
-      " ...\n",
-      " [ 3.7960774e-01  7.4510968e-01  7.3418564e-01 ...  1.2036801e-02\n",
-      "   1.1201146e-02  1.0423505e-02]\n",
-      " [-5.7338190e-01 -8.9752287e-02 -4.1488394e-02 ...  1.2160885e-02\n",
-      "   1.1316618e-02  1.0530960e-02]\n",
-      " [-9.9920684e-01 -8.5234123e-01 -7.8794664e-01 ...  1.2284970e-02\n",
-      "   1.1432089e-02  1.0638415e-02]]\n",
-      "True\n",
-      "True\n"
-     ]
-    }
-   ],
-   "source": [
-    "import torch\n",
-    "import math\n",
-    "import numpy as np\n",
-    "\n",
-    "max_len=100\n",
-    "d_model=256\n",
-    "\n",
-    "pe = torch.zeros(max_len, d_model)\n",
-    "position = torch.arange(0, max_len,\n",
-    "                        dtype=torch.float32).unsqueeze(1)\n",
-    "toruch_position = position\n",
-    "div_term = torch.exp(\n",
-    "    torch.arange(0, d_model, 2, dtype=torch.float32) *\n",
-    "    -(math.log(10000.0) / d_model))\n",
-    "tourch_div_term = div_term.cpu().detach().numpy()\n",
-    "\n",
-    "\n",
-    "\n",
-    "torhc_sin = torch.sin(position * div_term)\n",
-    "torhc_cos = torch.cos(position * div_term)\n",
-    "print(torhc_sin.cpu().detach().numpy())\n",
-    "np_sin = np.sin((position * div_term).cpu().detach().numpy())\n",
-    "np_cos = np.cos((position * div_term).cpu().detach().numpy())\n",
-    "print(np.allclose(np_sin, torhc_sin.cpu().detach().numpy()))\n",
-    "print(np.allclose(np_cos, torhc_cos.cpu().detach().numpy()))\n",
-    "pe[:, 0::2] = torhc_sin\n",
-    "pe[:, 1::2] = torhc_cos\n",
-    "tourch_pe = pe.cpu().detach().numpy()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 5,
-   "id": "swiss-referral",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "True\n",
-      "True\n",
-      "False\n",
-      "False\n",
-      "False\n",
-      "False\n",
-      "[[ 1.          1.          1.         ...  1.          1.\n",
-      "   1.        ]\n",
-      " [ 0.5403023   0.59737533  0.6479059  ...  1.          1.\n",
-      "   1.        ]\n",
-      " [-0.41614684 -0.28628543 -0.1604359  ...  0.99999994  1.\n",
-      "   1.        ]\n",
-      " ...\n",
-      " [-0.92514753 -0.66694194 -0.67894876 ...  0.9999276   0.99993724\n",
-      "   0.9999457 ]\n",
-      " [-0.81928825 -0.9959641  -0.999139   ...  0.99992603  0.999936\n",
-      "   0.99994457]\n",
-      " [ 0.03982088 -0.52298605 -0.6157435  ...  0.99992454  0.9999347\n",
-      "   0.99994344]]\n",
-      "----\n",
-      "[[ 1.          1.          1.         ...  1.          1.\n",
-      "   1.        ]\n",
-      " [ 0.54030234  0.59737533  0.6479059  ...  1.          1.\n",
-      "   1.        ]\n",
-      " [-0.41614684 -0.28628543 -0.1604359  ...  1.          1.\n",
-      "   1.        ]\n",
-      " ...\n",
-      " [-0.92514753 -0.66694194 -0.67894876 ...  0.9999276   0.9999373\n",
-      "   0.9999457 ]\n",
-      " [-0.81928825 -0.9959641  -0.999139   ...  0.99992603  0.999936\n",
-      "   0.99994457]\n",
-      " [ 0.03982088 -0.5229861  -0.6157435  ...  0.99992454  0.9999347\n",
-      "   0.99994344]]\n",
-      ")))))))\n",
-      "[[ 0.0000000e+00  0.0000000e+00  0.0000000e+00 ...  0.0000000e+00\n",
-      "   0.0000000e+00  0.0000000e+00]\n",
-      " [ 8.4147096e-01  8.0196178e-01  7.6172036e-01 ...  1.2409373e-04\n",
-      "   1.1547816e-04  1.0746076e-04]\n",
-      " [ 9.0929741e-01  9.5814437e-01  9.8704624e-01 ...  2.4818745e-04\n",
-      "   2.3095631e-04  2.1492151e-04]\n",
-      " ...\n",
-      " [ 3.7960774e-01  7.4510968e-01  7.3418564e-01 ...  1.2036801e-02\n",
-      "   1.1201146e-02  1.0423505e-02]\n",
-      " [-5.7338190e-01 -8.9752287e-02 -4.1488394e-02 ...  1.2160885e-02\n",
-      "   1.1316618e-02  1.0530960e-02]\n",
-      " [-9.9920684e-01 -8.5234123e-01 -7.8794664e-01 ...  1.2284970e-02\n",
-      "   1.1432089e-02  1.0638415e-02]]\n",
-      "----\n",
-      "[[ 0.0000000e+00  0.0000000e+00  0.0000000e+00 ...  0.0000000e+00\n",
-      "   0.0000000e+00  0.0000000e+00]\n",
-      " [ 8.4147096e-01  8.0196178e-01  7.6172036e-01 ...  1.2409373e-04\n",
-      "   1.1547816e-04  1.0746076e-04]\n",
-      " [ 9.0929741e-01  9.5814437e-01  9.8704624e-01 ...  2.4818745e-04\n",
-      "   2.3095631e-04  2.1492151e-04]\n",
-      " ...\n",
-      " [ 3.7960774e-01  7.4510968e-01  7.3418564e-01 ...  1.2036801e-02\n",
-      "   1.1201146e-02  1.0423505e-02]\n",
-      " [-5.7338190e-01 -8.9752287e-02 -4.1488394e-02 ...  1.2160885e-02\n",
-      "   1.1316618e-02  1.0530960e-02]\n",
-      " [-9.9920684e-01 -8.5234123e-01 -7.8794664e-01 ...  1.2284970e-02\n",
-      "   1.1432089e-02  1.0638415e-02]]\n"
-     ]
-    }
-   ],
-   "source": [
-    "import paddle\n",
-    "paddle.set_device('cpu')\n",
-    "ppe = paddle.zeros((max_len, d_model), dtype='float32')\n",
-    "position = paddle.arange(0, max_len,\n",
-    "                        dtype='float32').unsqueeze(1)\n",
-    "print(np.allclose(position.numpy(), toruch_position))\n",
-    "div_term = paddle.exp(\n",
-    "    paddle.arange(0, d_model, 2, dtype='float32') *\n",
-    "    -(math.log(10000.0) / d_model))\n",
-    "print(np.allclose(div_term.numpy(), tourch_div_term))\n",
-    "\n",
-    "\n",
-    "\n",
-    "p_sin = paddle.sin(position * div_term)\n",
-    "p_cos = paddle.cos(position * div_term)\n",
-    "print(np.allclose(np_sin, p_sin.numpy(), rtol=1.e-6, atol=0))\n",
-    "print(np.allclose(np_cos, p_cos.numpy(), rtol=1.e-6, atol=0))\n",
-    "ppe[:, 0::2] = p_sin\n",
-    "ppe[:, 1::2] = p_cos\n",
-    "print(np.allclose(p_sin.numpy(), torhc_sin.cpu().detach().numpy()))\n",
-    "print(np.allclose(p_cos.numpy(), torhc_cos.cpu().detach().numpy()))\n",
-    "print(p_cos.numpy())\n",
-    "print(\"----\")\n",
-    "print(torhc_cos.cpu().detach().numpy())\n",
-    "print(\")))))))\")\n",
-    "print(p_sin.numpy())\n",
-    "print(\"----\")\n",
-    "print(torhc_sin.cpu().detach().numpy())"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 4,
-   "id": "integrated-boards",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "False\n"
-     ]
-    }
-   ],
-   "source": [
-    "print(np.allclose(ppe.numpy(), pe.numpy()))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "flying-reserve",
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "revised-divide",
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.7.0"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
--- a/.notebook/python_test.ipynb
+++ b/.notebook/python_test.ipynb
--- a/.notebook/train_test.ipynb
+++ b/.notebook/train_test.ipynb
--- a/.notebook/u2_confermer_model_wenet.ipynb
+++ b/.notebook/u2_confermer_model_wenet.ipynb
--- a/.notebook/u2_tansformer_model_espnet.ipynb
+++ b/.notebook/u2_tansformer_model_espnet.ipynb
--- a/.notebook/wenet_model.ipynb
+++ b/.notebook/wenet_model.ipynb
--- a/README.md
+++ b/README.md
@ -1,5 +1,3 @@
-[中文版](README_cn.md)
-
 # PaddlePaddle Speech to Any toolkit

 ![License](https://img.shields.io/badge/license-Apache%202-red.svg)
@ -11,31 +9,29 @@

 ## Features

- See [feature list](doc/src/feature_list.md) for more information.
+ See [feature list](docs/src/feature_list.md) for more information.

 ## Setup

 All tested under:  
 * Ubuntu 16.04
 * python>=3.7
-* paddlepaddle>=2.1.2
+* paddlepaddle>=2.2.0rc

-Please see [install](doc/src/install.md).
+Please see [install](docs/src/install.md).

 ## Getting Started

-Please see [Getting Started](doc/src/getting_started.md) and [tiny egs](examples/tiny/s0/README.md).
+Please see [Getting Started](docs/src/getting_started.md) and [tiny egs](examples/tiny/s0/README.md).


 ## More Information  

-* [Data Prepration](doc/src/data_preparation.md)  
-* [Data Augmentation](doc/src/augmentation.md)  
-* [Ngram LM](doc/src/ngram_lm.md)  
-* [Server Demo](doc/src/server.md)  
-* [Benchmark](doc/src/benchmark.md)  
-* [Relased Model](doc/src/released_model.md)  
-* [FAQ](doc/src/faq.md)  
+* [Data Prepration](docs/src/data_preparation.md)  
+* [Data Augmentation](docs/src/augmentation.md)  
+* [Ngram LM](docs/src/ngram_lm.md)  
+* [Benchmark](docs/src/benchmark.md)  
+* [Relased Model](docs/src/released_model.md)  


 ## Questions and Help
@ -45,8 +41,8 @@ You are welcome to submit questions in [Github Discussions](https://github.com/P

 ## License

-DeepASR is provided under the [Apache-2.0 License](./LICENSE).
+DeepSpeech is provided under the [Apache-2.0 License](./LICENSE).

 ## Acknowledgement

-We depends on many open source repos. See [References](doc/src/reference.md) for more information.
+We depends on many open source repos. See [References](docs/src/reference.md) for more information.
--- a/README_cn.md
+++ b/README_cn.md
@ -1,51 +0,0 @@
-[English](README.md)
-
-# PaddlePaddle Speech to Any toolkit
-
-![License](https://img.shields.io/badge/license-Apache%202-red.svg)
-![python version](https://img.shields.io/badge/python-3.7+-orange.svg)
-![support os](https://img.shields.io/badge/os-linux-yellow.svg)
-
-*DeepSpeech*是一个采用[PaddlePaddle](https://github.com/PaddlePaddle/Paddle)平台的端到端自动语音识别引擎的开源项目，
-我们的愿景是为语音识别在工业应用和学术研究上，提供易于使用、高效、小型化和可扩展的工具，包括训练，推理，以及  部署。
-
-## 特性
-
- 参看 [特性列表](doc/src/feature_list.md)。
-
-
-## 安装
-
-在以下环境测试验证过：  
-
-* Ubuntu 16.04
-* python>=3.7
-* paddlepaddle>=2.1.2
-
-参看 [安装](doc/src/install.md)。
-
-## 开始
-
-请查看 [开始](doc/src/getting_started.md) 和 [tiny egs](examples/tiny/s0/README.md)。
-
-## 更多信息
-
-* [数据处理](doc/src/data_preparation.md)  
-* [数据增强](doc/src/augmentation.md)  
-* [语言模型](doc/src/ngram_lm.md)  
-* [服务部署](doc/src/server.md)  
-* [Benchmark](doc/src/benchmark.md)  
-* [Relased Model](doc/src/released_model.md)  
-* [FAQ](doc/src/faq.md)  
-
-## 问题和帮助
-
-欢迎您在[Github讨论](https://github.com/PaddlePaddle/DeepSpeech/discussions)提交问题，[Github问题](https://github.com/PaddlePaddle/models/issues)中反馈bug。也欢迎您为这个项目做出贡献。
-
-## License
-
-DeepASR 遵循[Apache-2.0开源协议](./LICENSE)。
-
-## 感谢
-
-开发中参考一些优秀的仓库，详情参见 [References](doc/src/reference.md)。
--- a/deepspeech/init.py
+++ b/deepspeech/init.py
@ -80,23 +80,23 @@ def convert_dtype_to_string(tensor_dtype):


 if not hasattr(paddle, 'softmax'):
-    logger.warn("register user softmax to paddle, remove this when fixed!")
+    logger.debug("register user softmax to paddle, remove this when fixed!")
    setattr(paddle, 'softmax', paddle.nn.functional.softmax)

 if not hasattr(paddle, 'log_softmax'):
-    logger.warn("register user log_softmax to paddle, remove this when fixed!")
+    logger.debug("register user log_softmax to paddle, remove this when fixed!")
    setattr(paddle, 'log_softmax', paddle.nn.functional.log_softmax)

 if not hasattr(paddle, 'sigmoid'):
-    logger.warn("register user sigmoid to paddle, remove this when fixed!")
+    logger.debug("register user sigmoid to paddle, remove this when fixed!")
    setattr(paddle, 'sigmoid', paddle.nn.functional.sigmoid)

 if not hasattr(paddle, 'log_sigmoid'):
-    logger.warn("register user log_sigmoid to paddle, remove this when fixed!")
+    logger.debug("register user log_sigmoid to paddle, remove this when fixed!")
    setattr(paddle, 'log_sigmoid', paddle.nn.functional.log_sigmoid)

 if not hasattr(paddle, 'relu'):
-    logger.warn("register user relu to paddle, remove this when fixed!")
+    logger.debug("register user relu to paddle, remove this when fixed!")
    setattr(paddle, 'relu', paddle.nn.functional.relu)


@ -105,7 +105,7 @@ def cat(xs, dim=0):


 if not hasattr(paddle, 'cat'):
-    logger.warn(
+    logger.debug(
        "override cat of paddle if exists or register, remove this when fixed!")
    paddle.cat = cat

@ -116,7 +116,7 @@ def item(x: paddle.Tensor):


 if not hasattr(paddle.Tensor, 'item'):
-    logger.warn(
+    logger.debug(
        "override item of paddle.Tensor if exists or register, remove this when fixed!"
    )
    paddle.Tensor.item = item
@ -127,13 +127,13 @@ def func_long(x: paddle.Tensor):


 if not hasattr(paddle.Tensor, 'long'):
-    logger.warn(
+    logger.debug(
        "override long of paddle.Tensor if exists or register, remove this when fixed!"
    )
    paddle.Tensor.long = func_long

 if not hasattr(paddle.Tensor, 'numel'):
-    logger.warn(
+    logger.debug(
        "override numel of paddle.Tensor if exists or register, remove this when fixed!"
    )
    paddle.Tensor.numel = paddle.numel
@ -147,7 +147,7 @@ def new_full(x: paddle.Tensor,


 if not hasattr(paddle.Tensor, 'new_full'):
-    logger.warn(
+    logger.debug(
        "override new_full of paddle.Tensor if exists or register, remove this when fixed!"
    )
    paddle.Tensor.new_full = new_full
@ -162,13 +162,13 @@ def eq(xs: paddle.Tensor, ys: Union[paddle.Tensor, float]) -> paddle.Tensor:


 if not hasattr(paddle.Tensor, 'eq'):
-    logger.warn(
+    logger.debug(
        "override eq of paddle.Tensor if exists or register, remove this when fixed!"
    )
    paddle.Tensor.eq = eq

 if not hasattr(paddle, 'eq'):
-    logger.warn(
+    logger.debug(
        "override eq of paddle if exists or register, remove this when fixed!")
    paddle.eq = eq

@ -178,7 +178,7 @@ def contiguous(xs: paddle.Tensor) -> paddle.Tensor:


 if not hasattr(paddle.Tensor, 'contiguous'):
-    logger.warn(
+    logger.debug(
        "override contiguous of paddle.Tensor if exists or register, remove this when fixed!"
    )
    paddle.Tensor.contiguous = contiguous
@ -195,7 +195,7 @@ def size(xs: paddle.Tensor, *args: int) -> paddle.Tensor:


 #`to_static` do not process `size` property, maybe some `paddle` api dependent on it.
-logger.warn(
+logger.debug(
    "override size of paddle.Tensor "
    "(`to_static` do not process `size` property, maybe some `paddle` api dependent on it), remove this when fixed!"
 )
@ -207,7 +207,7 @@ def view(xs: paddle.Tensor, *args: int) -> paddle.Tensor:


 if not hasattr(paddle.Tensor, 'view'):
-    logger.warn("register user view to paddle.Tensor, remove this when fixed!")
+    logger.debug("register user view to paddle.Tensor, remove this when fixed!")
    paddle.Tensor.view = view


@ -216,7 +216,7 @@ def view_as(xs: paddle.Tensor, ys: paddle.Tensor) -> paddle.Tensor:


 if not hasattr(paddle.Tensor, 'view_as'):
-    logger.warn(
+    logger.debug(
        "register user view_as to paddle.Tensor, remove this when fixed!")
    paddle.Tensor.view_as = view_as

@ -242,7 +242,7 @@ def masked_fill(xs: paddle.Tensor,


 if not hasattr(paddle.Tensor, 'masked_fill'):
-    logger.warn(
+    logger.debug(
        "register user masked_fill to paddle.Tensor, remove this when fixed!")
    paddle.Tensor.masked_fill = masked_fill

@ -260,7 +260,7 @@ def masked_fill_(xs: paddle.Tensor,


 if not hasattr(paddle.Tensor, 'masked_fill_'):
-    logger.warn(
+    logger.debug(
        "register user masked_fill_ to paddle.Tensor, remove this when fixed!")
    paddle.Tensor.masked_fill_ = masked_fill_

@ -272,7 +272,8 @@ def fill_(xs: paddle.Tensor, value: Union[float, int]) -> paddle.Tensor:


 if not hasattr(paddle.Tensor, 'fill_'):
-    logger.warn("register user fill_ to paddle.Tensor, remove this when fixed!")
+    logger.debug(
+        "register user fill_ to paddle.Tensor, remove this when fixed!")
    paddle.Tensor.fill_ = fill_


@ -281,22 +282,22 @@ def repeat(xs: paddle.Tensor, *size: Any) -> paddle.Tensor:


 if not hasattr(paddle.Tensor, 'repeat'):
-    logger.warn(
+    logger.debug(
        "register user repeat to paddle.Tensor, remove this when fixed!")
    paddle.Tensor.repeat = repeat

 if not hasattr(paddle.Tensor, 'softmax'):
-    logger.warn(
+    logger.debug(
        "register user softmax to paddle.Tensor, remove this when fixed!")
    setattr(paddle.Tensor, 'softmax', paddle.nn.functional.softmax)

 if not hasattr(paddle.Tensor, 'sigmoid'):
-    logger.warn(
+    logger.debug(
        "register user sigmoid to paddle.Tensor, remove this when fixed!")
    setattr(paddle.Tensor, 'sigmoid', paddle.nn.functional.sigmoid)

 if not hasattr(paddle.Tensor, 'relu'):
-    logger.warn("register user relu to paddle.Tensor, remove this when fixed!")
+    logger.debug("register user relu to paddle.Tensor, remove this when fixed!")
    setattr(paddle.Tensor, 'relu', paddle.nn.functional.relu)


@ -305,7 +306,7 @@ def type_as(x: paddle.Tensor, other: paddle.Tensor) -> paddle.Tensor:


 if not hasattr(paddle.Tensor, 'type_as'):
-    logger.warn(
+    logger.debug(
        "register user type_as to paddle.Tensor, remove this when fixed!")
    setattr(paddle.Tensor, 'type_as', type_as)

@ -321,7 +322,7 @@ def to(x: paddle.Tensor, *args, **kwargs) -> paddle.Tensor:


 if not hasattr(paddle.Tensor, 'to'):
-    logger.warn("register user to to paddle.Tensor, remove this when fixed!")
+    logger.debug("register user to to paddle.Tensor, remove this when fixed!")
    setattr(paddle.Tensor, 'to', to)


@ -330,7 +331,8 @@ def func_float(x: paddle.Tensor) -> paddle.Tensor:


 if not hasattr(paddle.Tensor, 'float'):
-    logger.warn("register user float to paddle.Tensor, remove this when fixed!")
+    logger.debug(
+        "register user float to paddle.Tensor, remove this when fixed!")
    setattr(paddle.Tensor, 'float', func_float)


@ -339,7 +341,7 @@ def func_int(x: paddle.Tensor) -> paddle.Tensor:


 if not hasattr(paddle.Tensor, 'int'):
-    logger.warn("register user int to paddle.Tensor, remove this when fixed!")
+    logger.debug("register user int to paddle.Tensor, remove this when fixed!")
    setattr(paddle.Tensor, 'int', func_int)


@ -348,23 +350,6 @@ def tolist(x: paddle.Tensor) -> List[Any]:


 if not hasattr(paddle.Tensor, 'tolist'):
-    logger.warn(
+    logger.debug(
        "register user tolist to paddle.Tensor, remove this when fixed!")
    setattr(paddle.Tensor, 'tolist', tolist)
-
-
-########### hcak paddle.nn #############
-class GLU(nn.Layer):
-    """Gated Linear Units (GLU) Layer"""
-
-    def __init__(self, dim: int=-1):
-        super().__init__()
-        self.dim = dim
-
-    def forward(self, xs):
-        return F.glu(xs, axis=self.dim)
-
-
-if not hasattr(paddle.nn, 'GLU'):
-    logger.warn("register user GLU to paddle.nn, remove this when fixed!")
-    setattr(paddle.nn, 'GLU', GLU)
--- a/deepspeech/decoders/swig/ctc_beam_search_decoder.cpp
+++ b/deepspeech/decoders/swig/ctc_beam_search_decoder.cpp
@ -35,7 +35,8 @@ std::vector<std::pair<double, std::string>> ctc_beam_search_decoder(
    size_t beam_size,
    double cutoff_prob,
    size_t cutoff_top_n,
-    Scorer *ext_scorer) {
+    Scorer *ext_scorer,
+    size_t blank_id) {
    // dimension check
    size_t num_time_steps = probs_seq.size();
    for (size_t i = 0; i < num_time_steps; ++i) {
@ -48,7 +49,7 @@ std::vector<std::pair<double, std::string>> ctc_beam_search_decoder(

    // assign blank id
    // size_t blank_id = vocabulary.size();
-    size_t blank_id = 0;
+    // size_t blank_id = 0;

    // assign space id
    auto it = std::find(vocabulary.begin(), vocabulary.end(), " ");
@ -57,7 +58,6 @@ std::vector<std::pair<double, std::string>> ctc_beam_search_decoder(
    if ((size_t)space_id >= vocabulary.size()) {
        space_id = -2;
    }
-
    // init prefixes' root
    PathTrie root;
    root.score = root.log_prob_b_prev = 0.0;
@ -218,7 +218,8 @@ ctc_beam_search_decoder_batch(
    size_t num_processes,
    double cutoff_prob,
    size_t cutoff_top_n,
-    Scorer *ext_scorer) {
+    Scorer *ext_scorer,
+    size_t blank_id) {
    VALID_CHECK_GT(num_processes, 0, "num_processes must be nonnegative!");
    // thread pool
    ThreadPool pool(num_processes);
@ -234,7 +235,8 @@ ctc_beam_search_decoder_batch(
                                      beam_size,
                                      cutoff_prob,
                                      cutoff_top_n,
-                                      ext_scorer));
+                                      ext_scorer,
+                                      blank_id));
    }

    // get decoding results
--- a/deepspeech/decoders/swig/ctc_beam_search_decoder.h
+++ b/deepspeech/decoders/swig/ctc_beam_search_decoder.h
@ -43,7 +43,8 @@ std::vector<std::pair<double, std::string>> ctc_beam_search_decoder(
    size_t beam_size,
    double cutoff_prob = 1.0,
    size_t cutoff_top_n = 40,
-    Scorer *ext_scorer = nullptr);
+    Scorer *ext_scorer = nullptr,
+    size_t blank_id = 0);

 /* CTC Beam Search Decoder for batch data

@ -70,6 +71,7 @@ ctc_beam_search_decoder_batch(
    size_t num_processes,
    double cutoff_prob = 1.0,
    size_t cutoff_top_n = 40,
-    Scorer *ext_scorer = nullptr);
+    Scorer *ext_scorer = nullptr,
+    size_t blank_id = 0);

 #endif  // CTC_BEAM_SEARCH_DECODER_H_
--- a/deepspeech/decoders/swig/ctc_greedy_decoder.cpp
+++ b/deepspeech/decoders/swig/ctc_greedy_decoder.cpp
@ -17,17 +17,18 @@

 std::string ctc_greedy_decoder(
    const std::vector<std::vector<double>> &probs_seq,
-    const std::vector<std::string> &vocabulary) {
+    const std::vector<std::string> &vocabulary,
+    size_t blank_id) {
    // dimension check
    size_t num_time_steps = probs_seq.size();
    for (size_t i = 0; i < num_time_steps; ++i) {
        VALID_CHECK_EQ(probs_seq[i].size(),
-                       vocabulary.size() + 1,
+                       vocabulary.size(),
                       "The shape of probs_seq does not match with "
                       "the shape of the vocabulary");
    }

-    size_t blank_id = vocabulary.size();
+    // size_t blank_id = vocabulary.size();

    std::vector<size_t> max_idx_vec(num_time_steps, 0);
    std::vector<size_t> idx_vec;
--- a/deepspeech/decoders/swig/ctc_greedy_decoder.h
+++ b/deepspeech/decoders/swig/ctc_greedy_decoder.h
@ -29,6 +29,7 @@
 */
 std::string ctc_greedy_decoder(
    const std::vector<std::vector<double>>& probs_seq,
-    const std::vector<std::string>& vocabulary);
+    const std::vector<std::string>& vocabulary,
+    size_t blank_id);

 #endif  // CTC_GREEDY_DECODER_H
--- a/deepspeech/decoders/swig/setup.py
+++ b/deepspeech/decoders/swig/setup.py
@ -85,9 +85,8 @@ FILES += glob.glob('openfst-1.6.3/src/lib/*.cc')

 # yapf: disable
 FILES = [
-    fn for fn in FILES
-    if not (fn.endswith('main.cc') or fn.endswith('test.cc') or fn.endswith(
-        'unittest.cc'))
+    fn for fn in FILES if not (fn.endswith('main.cc') or fn.endswith('test.cc')
+                               or fn.endswith('unittest.cc'))
 ]
 # yapf: enable

--- a/deepspeech/decoders/swig_wrapper.py
+++ b/deepspeech/decoders/swig_wrapper.py
@ -32,7 +32,7 @@ class Scorer(swig_decoders.Scorer):
        swig_decoders.Scorer.__init__(self, alpha, beta, model_path, vocabulary)


-def ctc_greedy_decoder(probs_seq, vocabulary):
+def ctc_greedy_decoder(probs_seq, vocabulary, blank_id):
    """Wrapper for ctc best path decoder in swig.

    :param probs_seq: 2-D list of probability distributions over each time
@ -44,7 +44,8 @@ def ctc_greedy_decoder(probs_seq, vocabulary):
    :return: Decoding result string.
    :rtype: str
    """
-    result = swig_decoders.ctc_greedy_decoder(probs_seq.tolist(), vocabulary)
+    result = swig_decoders.ctc_greedy_decoder(probs_seq.tolist(), vocabulary,
+                                              blank_id)
    return result


@ -53,7 +54,8 @@ def ctc_beam_search_decoder(probs_seq,
                            beam_size,
                            cutoff_prob=1.0,
                            cutoff_top_n=40,
-                            ext_scoring_func=None):
+                            ext_scoring_func=None,
+                            blank_id=0):
    """Wrapper for the CTC Beam Search Decoder.

    :param probs_seq: 2-D list of probability distributions over each time
@ -81,7 +83,7 @@ def ctc_beam_search_decoder(probs_seq,
    """
    beam_results = swig_decoders.ctc_beam_search_decoder(
        probs_seq.tolist(), vocabulary, beam_size, cutoff_prob, cutoff_top_n,
-        ext_scoring_func)
+        ext_scoring_func, blank_id)
    beam_results = [(res[0], res[1].decode('utf-8')) for res in beam_results]
    return beam_results

@ -92,7 +94,8 @@ def ctc_beam_search_decoder_batch(probs_split,
                                  num_processes,
                                  cutoff_prob=1.0,
                                  cutoff_top_n=40,
-                                  ext_scoring_func=None):
+                                  ext_scoring_func=None,
+                                  blank_id=0):
    """Wrapper for the batched CTC beam search decoder.

    :param probs_seq: 3-D list with each element as an instance of 2-D list
@ -125,7 +128,7 @@ def ctc_beam_search_decoder_batch(probs_split,

    batch_beam_results = swig_decoders.ctc_beam_search_decoder_batch(
        probs_split, vocabulary, beam_size, num_processes, cutoff_prob,
-        cutoff_top_n, ext_scoring_func)
+        cutoff_top_n, ext_scoring_func, blank_id)
    batch_beam_results = [[(res[0], res[1]) for res in beam_results]
                          for beam_results in batch_beam_results]
    return batch_beam_results
--- a/deepspeech/exps/deepspeech2/bin/train.py
+++ b/deepspeech/exps/deepspeech2/bin/train.py
@ -27,7 +27,7 @@ def main_sp(config, args):


 def main(config, args):
-    if args.device == "gpu" and args.nprocs > 1:
+    if args.nprocs > 0:
        dist.spawn(main_sp, args=(config, args), nprocs=args.nprocs)
    else:
        main_sp(config, args)
--- a/deepspeech/exps/deepspeech2/bin/tune.py
+++ b/deepspeech/exps/deepspeech2/bin/tune.py
@ -1,191 +0,0 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Beam search parameters tuning for DeepSpeech2 model."""
-import functools
-import sys
-
-import numpy as np
-from paddle.io import DataLoader
-
-from deepspeech.exps.deepspeech2.config import get_cfg_defaults
-from deepspeech.io.collator import SpeechCollator
-from deepspeech.io.dataset import ManifestDataset
-from deepspeech.models.ds2 import DeepSpeech2Model
-from deepspeech.training.cli import default_argument_parser
-from deepspeech.utils import error_rate
-from deepspeech.utils.utility import add_arguments
-from deepspeech.utils.utility import print_arguments
-
-
-def tune(config, args):
-    """Tune parameters alpha and beta incrementally."""
-    if not args.num_alphas >= 0:
-        raise ValueError("num_alphas must be non-negative!")
-    if not args.num_betas >= 0:
-        raise ValueError("num_betas must be non-negative!")
-    config.defrost()
-    config.data.manfiest = config.data.dev_manifest
-    config.data.augmentation_config = ""
-    config.data.keep_transcription_text = True
-    dev_dataset = ManifestDataset.from_config(config)
-
-    valid_loader = DataLoader(
-        dev_dataset,
-        batch_size=config.data.batch_size,
-        shuffle=False,
-        drop_last=False,
-        collate_fn=SpeechCollator(keep_transcription_text=True))
-
-    model = DeepSpeech2Model.from_pretrained(valid_loader, config,
-                                             args.checkpoint_path)
-    model.eval()
-
-    # decoders only accept string encoded in utf-8
-    vocab_list = valid_loader.dataset.vocab_list
-    errors_func = error_rate.char_errors if config.decoding.error_rate_type == 'cer' else error_rate.word_errors
-
-    # create grid for search
-    cand_alphas = np.linspace(args.alpha_from, args.alpha_to, args.num_alphas)
-    cand_betas = np.linspace(args.beta_from, args.beta_to, args.num_betas)
-    params_grid = [(alpha, beta) for alpha in cand_alphas
-                   for beta in cand_betas]
-
-    err_sum = [0.0 for i in range(len(params_grid))]
-    err_ave = [0.0 for i in range(len(params_grid))]
-
-    num_ins, len_refs, cur_batch = 0, 0, 0
-    # initialize external scorer
-    model.decoder.init_decode(args.alpha_from, args.beta_from,
-                              config.decoding.lang_model_path, vocab_list,
-                              config.decoding.decoding_method)
-    ## incremental tuning parameters over multiple batches
-    print("start tuning ...")
-    for infer_data in valid_loader():
-        if (args.num_batches >= 0) and (cur_batch >= args.num_batches):
-            break
-
-        def ordid2token(texts, texts_len):
-            """ ord() id to chr() chr """
-            trans = []
-            for text, n in zip(texts, texts_len):
-                n = n.numpy().item()
-                ids = text[:n]
-                trans.append(''.join([chr(i) for i in ids]))
-            return trans
-
-        audio, audio_len, text, text_len = infer_data
-        target_transcripts = ordid2token(text, text_len)
-        num_ins += audio.shape[0]
-
-        # model infer
-        eouts, eouts_len = model.encoder(audio, audio_len)
-        probs = model.decoder.softmax(eouts)
-
-        # grid search
-        for index, (alpha, beta) in enumerate(params_grid):
-            print(f"tuneing: alpha={alpha} beta={beta}")
-            result_transcripts = model.decoder.decode_probs(
-                probs.numpy(), eouts_len, vocab_list,
-                config.decoding.decoding_method,
-                config.decoding.lang_model_path, alpha, beta,
-                config.decoding.beam_size, config.decoding.cutoff_prob,
-                config.decoding.cutoff_top_n, config.decoding.num_proc_bsearch)
-
-            for target, result in zip(target_transcripts, result_transcripts):
-                errors, len_ref = errors_func(target, result)
-                err_sum[index] += errors
-
-                # accumulate the length of references of every batchπ
-                # in the first iteration
-                if args.alpha_from == alpha and args.beta_from == beta:
-                    len_refs += len_ref
-
-            err_ave[index] = err_sum[index] / len_refs
-            if index % 2 == 0:
-                sys.stdout.write('.')
-                sys.stdout.flush()
-            print("tuneing: one grid done!")
-
-        # output on-line tuning result at the end of current batch
-        err_ave_min = min(err_ave)
-        min_index = err_ave.index(err_ave_min)
-        print("\nBatch %d [%d/?], current opt (alpha, beta) = (%s, %s), "
-              " min [%s] = %f" %
-              (cur_batch, num_ins, "%.3f" % params_grid[min_index][0],
-               "%.3f" % params_grid[min_index][1],
-               config.decoding.error_rate_type, err_ave_min))
-        cur_batch += 1
-
-    # output WER/CER at every (alpha, beta)
-    print("\nFinal %s:\n" % config.decoding.error_rate_type)
-    for index in range(len(params_grid)):
-        print("(alpha, beta) = (%s, %s), [%s] = %f" %
-              ("%.3f" % params_grid[index][0], "%.3f" % params_grid[index][1],
-               config.decoding.error_rate_type, err_ave[index]))
-
-    err_ave_min = min(err_ave)
-    min_index = err_ave.index(err_ave_min)
-    print("\nFinish tuning on %d batches, final opt (alpha, beta) = (%s, %s)" %
-          (cur_batch, "%.3f" % params_grid[min_index][0],
-           "%.3f" % params_grid[min_index][1]))
-
-    print("finish tuning")
-
-
-def main(config, args):
-    tune(config, args)
-
-
-if __name__ == "__main__":
-    parser = default_argument_parser()
-    add_arg = functools.partial(add_arguments, argparser=parser)
-    add_arg('num_batches', int, -1, "# of batches tuning on. "
-            "Default -1, on whole dev set.")
-    add_arg('num_alphas', int, 45, "# of alpha candidates for tuning.")
-    add_arg('num_betas', int, 8, "# of beta candidates for tuning.")
-    add_arg('alpha_from', float, 1.0, "Where alpha starts tuning from.")
-    add_arg('alpha_to', float, 3.2, "Where alpha ends tuning with.")
-    add_arg('beta_from', float, 0.1, "Where beta starts tuning from.")
-    add_arg('beta_to', float, 0.45, "Where beta ends tuning with.")
-
-    add_arg('batch_size', int, 256, "# of samples per batch.")
-    add_arg('beam_size', int, 500, "Beam search width.")
-    add_arg('num_proc_bsearch', int, 8, "# of CPUs for beam search.")
-    add_arg('cutoff_prob', float, 1.0, "Cutoff probability for pruning.")
-    add_arg('cutoff_top_n', int, 40, "Cutoff number for pruning.")
-
-    args = parser.parse_args()
-    print_arguments(args, globals())
-
-    # https://yaml.org/type/float.html
-    config = get_cfg_defaults()
-    if args.config:
-        config.merge_from_file(args.config)
-    if args.opts:
-        config.merge_from_list(args.opts)
-
-    config.data.batch_size = args.batch_size
-    config.decoding.beam_size = args.beam_size
-    config.decoding.num_proc_bsearch = args.num_proc_bsearch
-    config.decoding.cutoff_prob = args.cutoff_prob
-    config.decoding.cutoff_top_n = args.cutoff_top_n
-
-    config.freeze()
-    print(config)
-
-    if args.dump_config:
-        with open(args.dump_config, 'w') as f:
-            print(config, file=f)
-
-    main(config, args)
--- a/deepspeech/exps/deepspeech2/model.py
+++ b/deepspeech/exps/deepspeech2/model.py
@ -15,9 +15,11 @@
 import os
 import time
 from collections import defaultdict
+from contextlib import nullcontext
 from pathlib import Path
 from typing import Optional

+import jsonlines
 import numpy as np
 import paddle
 from paddle import distributed as dist
@ -34,12 +36,14 @@ from deepspeech.models.ds2 import DeepSpeech2Model
 from deepspeech.models.ds2_online import DeepSpeech2InferModelOnline
 from deepspeech.models.ds2_online import DeepSpeech2ModelOnline
 from deepspeech.training.gradclip import ClipGradByGlobalNormWithLog
+from deepspeech.training.reporter import report
 from deepspeech.training.trainer import Trainer
 from deepspeech.utils import error_rate
 from deepspeech.utils import layer_tools
 from deepspeech.utils import mp_tools
 from deepspeech.utils.log import Autolog
 from deepspeech.utils.log import Log
+from deepspeech.utils.utility import UpdateConfig

 logger = Log(__name__).getlog()

@ -65,29 +69,52 @@ class DeepSpeech2Trainer(Trainer):
        super().__init__(config, args)

    def train_batch(self, batch_index, batch_data, msg):
+        batch_size = self.config.collator.batch_size
+        accum_grad = self.config.training.accum_grad
+
        start = time.time()
+
+        # forward
        utt, audio, audio_len, text, text_len = batch_data
        loss = self.model(audio, audio_len, text, text_len)
-        loss.backward()
-        layer_tools.print_grads(self.model, print_func=None)
-        self.optimizer.step()
-        self.optimizer.clear_grad()
-        iteration_time = time.time() - start
-
        losses_np = {
            'train_loss': float(loss),
        }
-        msg += "train time: {:>.3f}s, ".format(iteration_time)
-        msg += "batch size: {}, ".format(self.config.collator.batch_size)
-        msg += ', '.join('{}: {:>.6f}'.format(k, v)
-                         for k, v in losses_np.items())
-        logger.info(msg)
+
+        # loss backward
+        if (batch_index + 1) % accum_grad != 0:
+            # Disable gradient synchronizations across DDP processes.
+            # Within this context, gradients will be accumulated on module
+            # variables, which will later be synchronized.
+            context = self.model.no_sync
+        else:
+            # Used for single gpu training and DDP gradient synchronization
+            # processes.
+            context = nullcontext
+
+        with context():
+            loss.backward()
+            layer_tools.print_grads(self.model, print_func=None)
+
+        # optimizer step
+        if (batch_index + 1) % accum_grad == 0:
+            self.optimizer.step()
+            self.optimizer.clear_grad()
+            self.iteration += 1
+
+        iteration_time = time.time() - start
+
+        for k, v in losses_np.items():
+            report(k, v)
+        report("batch_size", batch_size)
+        report("accum", accum_grad)
+        report("step_cost", iteration_time)

        if dist.get_rank() == 0 and self.visualizer:
            for k, v in losses_np.items():
+                # `step -1` since we update `step` after optimizer.step().
                self.visualizer.add_scalar("train/{}".format(k), v,
-                                           self.iteration)
-        self.iteration += 1
+                                           self.iteration - 1)

    @paddle.no_grad()
    def valid(self):
@ -124,10 +151,9 @@ class DeepSpeech2Trainer(Trainer):

    def setup_model(self):
        config = self.config.clone()
-        config.defrost()
-        config.model.feat_size = self.train_loader.collate_fn.feature_size
-        config.model.dict_size = self.train_loader.collate_fn.vocab_size
-        config.freeze()
+        with UpdateConfig(config):
+            config.model.feat_size = self.train_loader.collate_fn.feature_size
+            config.model.dict_size = self.train_loader.collate_fn.vocab_size

        if self.args.model_type == 'offline':
            model = DeepSpeech2Model.from_config(config.model)
@ -280,9 +306,10 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
            len_refs += len_ref
            num_ins += 1
            if fout:
-                fout.write(utt + " " + result + "\n")
-            logger.info("\nTarget Transcription: %s\nOutput Transcription: %s" %
-                        (target, result))
+                fout.write({"utt": utt, "ref": target, "hyp": result})
+            logger.info(f"Utt: {utt}")
+            logger.info(f"Ref: {target}")
+            logger.info(f"Hyp: {result}")
            logger.info("Current error rate [%s] = %f" %
                        (cfg.error_rate_type, error_rate_func(target, result)))

@ -325,7 +352,7 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
        cfg = self.config
        error_rate_type = None
        errors_sum, len_refs, num_ins = 0.0, 0, 0
-        with open(self.args.result_file, 'w') as fout:
+        with jsonlines.open(self.args.result_file, 'w') as fout:
            for i, batch in enumerate(self.test_loader):
                utts, audio, audio_len, texts, texts_len = batch
                metrics = self.compute_metrics(utts, audio, audio_len, texts,
@ -378,7 +405,7 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
    def setup(self):
        """Setup the experiment.
        """
-        paddle.set_device(self.args.device)
+        paddle.set_device('gpu' if self.args.nprocs > 0 else 'cpu')

        self.setup_output_dir()
        self.setup_checkpointer()
@ -610,7 +637,7 @@ class DeepSpeech2ExportTester(DeepSpeech2Tester):
    def setup(self):
        """Setup the experiment.
        """
-        paddle.set_device(self.args.device)
+        paddle.set_device('gpu' if self.args.nprocs > 0 else 'cpu')

        self.setup_output_dir()

--- a/deepspeech/exps/u2/bin/train.py
+++ b/deepspeech/exps/u2/bin/train.py
@ -22,6 +22,8 @@ from deepspeech.exps.u2.model import U2Trainer as Trainer
 from deepspeech.training.cli import default_argument_parser
 from deepspeech.utils.utility import print_arguments

+# from deepspeech.exps.u2.trainer import U2Trainer as Trainer
+

 def main_sp(config, args):
    exp = Trainer(config, args)
@ -30,7 +32,7 @@ def main_sp(config, args):


 def main(config, args):
-    if args.device == "gpu" and args.nprocs > 1:
+    if args.nprocs > 0:
        dist.spawn(main_sp, args=(config, args), nprocs=args.nprocs)
    else:
        main_sp(config, args)
--- a/deepspeech/exps/u2/model.py
+++ b/deepspeech/exps/u2/model.py
@ -17,9 +17,12 @@ import os
 import sys
 import time
 from collections import defaultdict
+from collections import OrderedDict
+from contextlib import nullcontext
 from pathlib import Path
 from typing import Optional

+import jsonlines
 import numpy as np
 import paddle
 from paddle import distributed as dist
@ -32,7 +35,10 @@ from deepspeech.io.sampler import SortagradBatchSampler
 from deepspeech.io.sampler import SortagradDistributedBatchSampler
 from deepspeech.models.u2 import U2Model
 from deepspeech.training.optimizer import OptimizerFactory
+from deepspeech.training.reporter import ObsScope
+from deepspeech.training.reporter import report
 from deepspeech.training.scheduler import LRSchedulerFactory
+from deepspeech.training.timer import Timer
 from deepspeech.training.trainer import Trainer
 from deepspeech.utils import ctc_utils
 from deepspeech.utils import error_rate
@ -41,6 +47,7 @@ from deepspeech.utils import mp_tools
 from deepspeech.utils import text_grid
 from deepspeech.utils import utility
 from deepspeech.utils.log import Log
+from deepspeech.utils.utility import UpdateConfig

 logger = Log(__name__).getlog()

@ -79,21 +86,36 @@ class U2Trainer(Trainer):
    def train_batch(self, batch_index, batch_data, msg):
        train_conf = self.config.training
        start = time.time()
-        utt, audio, audio_len, text, text_len = batch_data

+        # forward
+        utt, audio, audio_len, text, text_len = batch_data
        loss, attention_loss, ctc_loss = self.model(audio, audio_len, text,
                                                    text_len)
+
        # loss div by `batch_size * accum_grad`
        loss /= train_conf.accum_grad
-        loss.backward()
-        layer_tools.print_grads(self.model, print_func=None)
-
        losses_np = {'loss': float(loss) * train_conf.accum_grad}
        if attention_loss:
            losses_np['att_loss'] = float(attention_loss)
        if ctc_loss:
            losses_np['ctc_loss'] = float(ctc_loss)

+        # loss backward
+        if (batch_index + 1) % train_conf.accum_grad != 0:
+            # Disable gradient synchronizations across DDP processes.
+            # Within this context, gradients will be accumulated on module
+            # variables, which will later be synchronized.
+            # When using cpu w/o DDP, model does not have `no_sync`
+            context = self.model.no_sync if self.parallel else nullcontext
+        else:
+            # Used for single gpu training and DDP gradient synchronization
+            # processes.
+            context = nullcontext
+        with context():
+            loss.backward()
+            layer_tools.print_grads(self.model, print_func=None)
+
+        # optimizer step
        if (batch_index + 1) % train_conf.accum_grad == 0:
            self.optimizer.step()
            self.optimizer.clear_grad()
@ -102,14 +124,13 @@ class U2Trainer(Trainer):

        iteration_time = time.time() - start

-        if (batch_index + 1) % train_conf.log_interval == 0:
-            msg += "train time: {:>.3f}s, ".format(iteration_time)
-            msg += "batch size: {}, ".format(self.config.collator.batch_size)
-            msg += "accum: {}, ".format(train_conf.accum_grad)
-            msg += ', '.join('{}: {:>.6f}'.format(k, v)
-                             for k, v in losses_np.items())
-            logger.info(msg)
+        for k, v in losses_np.items():
+            report(k, v)
+        report("batch_size", self.config.collator.batch_size)
+        report("accum", train_conf.accum_grad)
+        report("step_cost", iteration_time)

+        if (batch_index + 1) % train_conf.accum_grad == 0:
            if dist.get_rank() == 0 and self.visualizer:
                losses_np_v = losses_np.copy()
                losses_np_v.update({"lr": self.lr_scheduler()})
@ -163,46 +184,58 @@ class U2Trainer(Trainer):
        # script_model_path = str(self.checkpoint_dir / 'init')
        # paddle.jit.save(script_model, script_model_path)

-        from_scratch = self.resume_or_scratch()
-        if from_scratch:
-            # save init model, i.e. 0 epoch
-            self.save(tag='init')
-
-        self.lr_scheduler.step(self.iteration)
-        if self.parallel:
-            self.train_loader.batch_sampler.set_epoch(self.epoch)
+        self.before_train()

        logger.info(f"Train Total Examples: {len(self.train_loader.dataset)}")
        while self.epoch < self.config.training.n_epoch:
-            self.model.train()
-            try:
-                data_start_time = time.time()
-                for batch_index, batch in enumerate(self.train_loader):
-                    dataload_time = time.time() - data_start_time
-                    msg = "Train: Rank: {}, ".format(dist.get_rank())
-                    msg += "epoch: {}, ".format(self.epoch)
-                    msg += "step: {}, ".format(self.iteration)
-                    msg += "batch : {}/{}, ".format(batch_index + 1,
-                                                    len(self.train_loader))
-                    msg += "lr: {:>.8f}, ".format(self.lr_scheduler())
-                    msg += "data time: {:>.3f}s, ".format(dataload_time)
-                    self.train_batch(batch_index, batch, msg)
+            with Timer("Epoch-Train Time Cost: {}"):
+                self.model.train()
+                try:
                    data_start_time = time.time()
-            except Exception as e:
-                logger.error(e)
-                raise e
-
-            total_loss, num_seen_utts = self.valid()
-            if dist.get_world_size() > 1:
-                num_seen_utts = paddle.to_tensor(num_seen_utts)
-                # the default operator in all_reduce function is sum.
-                dist.all_reduce(num_seen_utts)
-                total_loss = paddle.to_tensor(total_loss)
-                dist.all_reduce(total_loss)
-                cv_loss = total_loss / num_seen_utts
-                cv_loss = float(cv_loss)
-            else:
-                cv_loss = total_loss / num_seen_utts
+                    for batch_index, batch in enumerate(self.train_loader):
+                        dataload_time = time.time() - data_start_time
+                        msg = "Train:"
+                        observation = OrderedDict()
+                        with ObsScope(observation):
+                            report("Rank", dist.get_rank())
+                            report("epoch", self.epoch)
+                            report('step', self.iteration)
+                            report("lr", self.lr_scheduler())
+                            self.train_batch(batch_index, batch, msg)
+                            self.after_train_batch()
+                            report('iter', batch_index + 1)
+                            report('total', len(self.train_loader))
+                            report('reader_cost', dataload_time)
+                        observation['batch_cost'] = observation[
+                            'reader_cost'] + observation['step_cost']
+                        observation['samples'] = observation['batch_size']
+                        observation['ips[sent./sec]'] = observation[
+                            'batch_size'] / observation['batch_cost']
+                        for k, v in observation.items():
+                            msg += f" {k}: "
+                            msg += f"{v:>.8f}" if isinstance(v,
+                                                             float) else f"{v}"
+                            msg += ","
+                        if (batch_index + 1
+                            ) % self.config.training.log_interval == 0:
+                            logger.info(msg)
+                        data_start_time = time.time()
+                except Exception as e:
+                    logger.error(e)
+                    raise e
+
+            with Timer("Eval Time Cost: {}"):
+                total_loss, num_seen_utts = self.valid()
+                if dist.get_world_size() > 1:
+                    num_seen_utts = paddle.to_tensor(num_seen_utts)
+                    # the default operator in all_reduce function is sum.
+                    dist.all_reduce(num_seen_utts)
+                    total_loss = paddle.to_tensor(total_loss)
+                    dist.all_reduce(total_loss)
+                    cv_loss = total_loss / num_seen_utts
+                    cv_loss = float(cv_loss)
+                else:
+                    cv_loss = total_loss / num_seen_utts

            logger.info(
                'Epoch {} Val info val_loss {}'.format(self.epoch, cv_loss))
@ -294,10 +327,11 @@ class U2Trainer(Trainer):
    def setup_model(self):
        config = self.config
        model_conf = config.model
-        model_conf.defrost()
-        model_conf.input_dim = self.train_loader.collate_fn.feature_size
-        model_conf.output_dim = self.train_loader.collate_fn.vocab_size
-        model_conf.freeze()
+
+        with UpdateConfig(model_conf):
+            model_conf.input_dim = self.train_loader.collate_fn.feature_size
+            model_conf.output_dim = self.train_loader.collate_fn.vocab_size
+
        model = U2Model.from_config(model_conf)

        if self.parallel:
@ -433,9 +467,10 @@ class U2Tester(U2Trainer):
            len_refs += len_ref
            num_ins += 1
            if fout:
-                fout.write(utt + " " + result + "\n")
-            logger.info("\nTarget Transcription: %s\nOutput Transcription: %s" %
-                        (target, result))
+                fout.write({"utt": utt, "ref": target, "hyp": result})
+            logger.info(f"Utt: {utt}")
+            logger.info(f"Ref: {target}")
+            logger.info(f"Hyp: {result}")
            logger.info("One example error rate [%s] = %f" %
                        (cfg.error_rate_type, error_rate_func(target, result)))

@ -460,7 +495,7 @@ class U2Tester(U2Trainer):
        errors_sum, len_refs, num_ins = 0.0, 0, 0
        num_frames = 0.0
        num_time = 0.0
-        with open(self.args.result_file, 'w') as fout:
+        with jsonlines.open(self.args.result_file, 'w') as fout:
            for i, batch in enumerate(self.test_loader):
                metrics = self.compute_metrics(*batch, fout=fout)
                num_frames += metrics['num_frames']
@ -540,7 +575,7 @@ class U2Tester(U2Trainer):
                # 1. Encoder
                encoder_out, encoder_mask = self.model._forward_encoder(
                    feat, feats_length)  # (B, maxlen, encoder_dim)
-                maxlen = encoder_out.size(1)
+                maxlen = encoder_out.shape[1]
                ctc_probs = self.model.ctc.log_softmax(
                    encoder_out)  # (1, maxlen, vocab_size)

@ -548,26 +583,25 @@ class U2Tester(U2Trainer):
                ctc_probs = ctc_probs.squeeze(0)
                target = target.squeeze(0)
                alignment = ctc_utils.forced_align(ctc_probs, target)
-                logger.info("align ids", key[0], alignment)
+                logger.info(f"align ids: {key[0]} {alignment}")
                fout.write('{} {}\n'.format(key[0], alignment))

                # 3. gen praat
                # segment alignment
                align_segs = text_grid.segment_alignment(alignment)
-                logger.info("align tokens", key[0], align_segs)
+                logger.info(f"align tokens: {key[0]}, {align_segs}")
                # IntervalTier, List["start end token\n"]
                subsample = utility.get_subsample(self.config)
                tierformat = text_grid.align_to_tierformat(
                    align_segs, subsample, token_dict)
                # write tier
-                align_output_path = os.path.join(
-                    os.path.dirname(self.args.result_file), "align")
-                tier_path = os.path.join(align_output_path, key[0] + ".tier")
-                with open(tier_path, 'w') as f:
+                align_output_path = Path(self.args.result_file).parent / "align"
+                align_output_path.mkdir(parents=True, exist_ok=True)
+                tier_path = align_output_path / (key[0] + ".tier")
+                with tier_path.open('w') as f:
                    f.writelines(tierformat)
                # write textgrid
-                textgrid_path = os.path.join(align_output_path,
-                                             key[0] + ".TextGrid")
+                textgrid_path = align_output_path / (key[0] + ".TextGrid")
                second_per_frame = 1. / (1000. /
                                         stride_ms)  # 25ms window, 10ms stride
                second_per_example = (
@ -575,7 +609,7 @@ class U2Tester(U2Trainer):
                text_grid.generate_textgrid(
                    maxtime=second_per_example,
                    intervals=tierformat,
-                    output=textgrid_path)
+                    output=str(textgrid_path))

    def run_align(self):
        self.resume_or_scratch()
@ -621,7 +655,7 @@ class U2Tester(U2Trainer):
    def setup(self):
        """Setup the experiment.
        """
-        paddle.set_device(self.args.device)
+        paddle.set_device('gpu' if self.args.nprocs > 0 else 'cpu')

        self.setup_output_dir()
        self.setup_checkpointer()
--- a/deepspeech/exps/u2/trainer.py
+++ b/deepspeech/exps/u2/trainer.py
@ -0,0 +1,220 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains U2 model."""
+import paddle
+from paddle import distributed as dist
+from paddle.io import DataLoader
+
+from deepspeech.io.collator import SpeechCollator
+from deepspeech.io.dataset import ManifestDataset
+from deepspeech.io.sampler import SortagradBatchSampler
+from deepspeech.io.sampler import SortagradDistributedBatchSampler
+from deepspeech.models.u2 import U2Evaluator
+from deepspeech.models.u2 import U2Model
+from deepspeech.models.u2 import U2Updater
+from deepspeech.training.extensions.snapshot import Snapshot
+from deepspeech.training.extensions.visualizer import VisualDL
+from deepspeech.training.optimizer import OptimizerFactory
+from deepspeech.training.scheduler import LRSchedulerFactory
+from deepspeech.training.timer import Timer
+from deepspeech.training.trainer import Trainer
+from deepspeech.training.updaters.trainer import Trainer as NewTrainer
+from deepspeech.utils import layer_tools
+from deepspeech.utils.log import Log
+from deepspeech.utils.utility import UpdateConfig
+
+logger = Log(__name__).getlog()
+
+
+class U2Trainer(Trainer):
+    def __init__(self, config, args):
+        super().__init__(config, args)
+
+    def setup_dataloader(self):
+        config = self.config.clone()
+        config.defrost()
+        config.collator.keep_transcription_text = False
+
+        # train/valid dataset, return token ids
+        config.data.manifest = config.data.train_manifest
+        train_dataset = ManifestDataset.from_config(config)
+
+        config.data.manifest = config.data.dev_manifest
+        dev_dataset = ManifestDataset.from_config(config)
+
+        collate_fn_train = SpeechCollator.from_config(config)
+
+        config.collator.augmentation_config = ""
+        collate_fn_dev = SpeechCollator.from_config(config)
+
+        if self.parallel:
+            batch_sampler = SortagradDistributedBatchSampler(
+                train_dataset,
+                batch_size=config.collator.batch_size,
+                num_replicas=None,
+                rank=None,
+                shuffle=True,
+                drop_last=True,
+                sortagrad=config.collator.sortagrad,
+                shuffle_method=config.collator.shuffle_method)
+        else:
+            batch_sampler = SortagradBatchSampler(
+                train_dataset,
+                shuffle=True,
+                batch_size=config.collator.batch_size,
+                drop_last=True,
+                sortagrad=config.collator.sortagrad,
+                shuffle_method=config.collator.shuffle_method)
+        self.train_loader = DataLoader(
+            train_dataset,
+            batch_sampler=batch_sampler,
+            collate_fn=collate_fn_train,
+            num_workers=config.collator.num_workers, )
+        self.valid_loader = DataLoader(
+            dev_dataset,
+            batch_size=config.collator.batch_size,
+            shuffle=False,
+            drop_last=False,
+            collate_fn=collate_fn_dev)
+
+        # test dataset, return raw text
+        config.data.manifest = config.data.test_manifest
+        # filter test examples, will cause less examples, but no mismatch with training
+        # and can use large batch size , save training time, so filter test egs now.
+        config.data.min_input_len = 0.0  # second
+        config.data.max_input_len = float('inf')  # second
+        config.data.min_output_len = 0.0  # tokens
+        config.data.max_output_len = float('inf')  # tokens
+        config.data.min_output_input_ratio = 0.00
+        config.data.max_output_input_ratio = float('inf')
+
+        test_dataset = ManifestDataset.from_config(config)
+        # return text ord id
+        config.collator.keep_transcription_text = True
+        config.collator.augmentation_config = ""
+        self.test_loader = DataLoader(
+            test_dataset,
+            batch_size=config.decoding.batch_size,
+            shuffle=False,
+            drop_last=False,
+            collate_fn=SpeechCollator.from_config(config))
+        # return text token id
+        config.collator.keep_transcription_text = False
+        self.align_loader = DataLoader(
+            test_dataset,
+            batch_size=config.decoding.batch_size,
+            shuffle=False,
+            drop_last=False,
+            collate_fn=SpeechCollator.from_config(config))
+        logger.info("Setup train/valid/test/align Dataloader!")
+
+    def setup_model(self):
+        config = self.config
+        model_conf = config.model
+        with UpdateConfig(model_conf):
+            model_conf.input_dim = self.train_loader.collate_fn.feature_size
+            model_conf.output_dim = self.train_loader.collate_fn.vocab_size
+
+        model = U2Model.from_config(model_conf)
+
+        if self.parallel:
+            model = paddle.DataParallel(model)
+
+        model.train()
+        logger.info(f"{model}")
+        layer_tools.print_params(model, logger.info)
+
+        train_config = config.training
+        optim_type = train_config.optim
+        optim_conf = train_config.optim_conf
+        scheduler_type = train_config.scheduler
+        scheduler_conf = train_config.scheduler_conf
+
+        scheduler_args = {
+            "learning_rate": optim_conf.lr,
+            "verbose": False,
+            "warmup_steps": scheduler_conf.warmup_steps,
+            "gamma": scheduler_conf.lr_decay,
+            "d_model": model_conf.encoder_conf.output_size,
+        }
+        lr_scheduler = LRSchedulerFactory.from_args(scheduler_type,
+                                                    scheduler_args)
+
+        def optimizer_args(
+                config,
+                parameters,
+                lr_scheduler=None, ):
+            train_config = config.training
+            optim_type = train_config.optim
+            optim_conf = train_config.optim_conf
+            scheduler_type = train_config.scheduler
+            scheduler_conf = train_config.scheduler_conf
+            return {
+                "grad_clip": train_config.global_grad_clip,
+                "weight_decay": optim_conf.weight_decay,
+                "learning_rate": lr_scheduler
+                if lr_scheduler else optim_conf.lr,
+                "parameters": parameters,
+                "epsilon": 1e-9 if optim_type == 'noam' else None,
+                "beta1": 0.9 if optim_type == 'noam' else None,
+                "beat2": 0.98 if optim_type == 'noam' else None,
+            }
+
+        optimzer_args = optimizer_args(config, model.parameters(), lr_scheduler)
+        optimizer = OptimizerFactory.from_args(optim_type, optimzer_args)
+
+        self.model = model
+        self.optimizer = optimizer
+        self.lr_scheduler = lr_scheduler
+        logger.info("Setup model/optimizer/lr_scheduler!")
+
+    def setup_updater(self):
+        output_dir = self.output_dir
+        config = self.config.training
+
+        updater = U2Updater(
+            model=self.model,
+            optimizer=self.optimizer,
+            scheduler=self.lr_scheduler,
+            dataloader=self.train_loader,
+            output_dir=output_dir,
+            accum_grad=config.accum_grad)
+
+        trainer = NewTrainer(updater, (config.n_epoch, 'epoch'), output_dir)
+
+        evaluator = U2Evaluator(self.model, self.valid_loader)
+
+        trainer.extend(evaluator, trigger=(1, "epoch"))
+
+        if dist.get_rank() == 0:
+            trainer.extend(VisualDL(output_dir), trigger=(1, "iteration"))
+            num_snapshots = config.checkpoint.kbest_n
+            trainer.extend(
+                Snapshot(
+                    mode='kbest',
+                    max_size=num_snapshots,
+                    indicator='VALID/LOSS',
+                    less_better=True),
+                trigger=(1, 'epoch'))
+        # print(trainer.extensions)
+        # trainer.run()
+        self.trainer = trainer
+
+    def run(self):
+        """The routine of the experiment after setup. This method is intended
+        to be used by the user.
+        """
+        self.setup_updater()
+        with Timer("Training Done: {}"):
+            self.trainer.run()
--- a/deepspeech/exps/u2_kaldi/bin/train.py
+++ b/deepspeech/exps/u2_kaldi/bin/train.py
@ -36,7 +36,7 @@ def main_sp(config, args):


 def main(config, args):
-    if args.device == "gpu" and args.nprocs > 1:
+    if args.nprocs > 0:
        dist.spawn(main_sp, args=(config, args), nprocs=args.nprocs)
    else:
        main_sp(config, args)
--- a/deepspeech/exps/u2_kaldi/model.py
+++ b/deepspeech/exps/u2_kaldi/model.py
@ -17,9 +17,11 @@ import os
 import sys
 import time
 from collections import defaultdict
+from contextlib import nullcontext
 from pathlib import Path
 from typing import Optional

+import jsonlines
 import numpy as np
 import paddle
 from paddle import distributed as dist
@ -31,6 +33,7 @@ from deepspeech.io.dataloader import BatchDataLoader
 from deepspeech.models.u2 import U2Model
 from deepspeech.training.optimizer import OptimizerFactory
 from deepspeech.training.scheduler import LRSchedulerFactory
+from deepspeech.training.timer import Timer
 from deepspeech.training.trainer import Trainer
 from deepspeech.utils import ctc_utils
 from deepspeech.utils import error_rate
@ -39,6 +42,7 @@ from deepspeech.utils import mp_tools
 from deepspeech.utils import text_grid
 from deepspeech.utils import utility
 from deepspeech.utils.log import Log
+from deepspeech.utils.utility import UpdateConfig

 logger = Log(__name__).getlog()

@ -83,20 +87,34 @@ class U2Trainer(Trainer):
        train_conf = self.config.training
        start = time.time()

+        # forward
        utt, audio, audio_len, text, text_len = batch_data
        loss, attention_loss, ctc_loss = self.model(audio, audio_len, text,
                                                    text_len)
+
        # loss div by `batch_size * accum_grad`
        loss /= train_conf.accum_grad
-        loss.backward()
-        layer_tools.print_grads(self.model, print_func=None)
-
        losses_np = {'loss': float(loss) * train_conf.accum_grad}
        if attention_loss:
            losses_np['att_loss'] = float(attention_loss)
        if ctc_loss:
            losses_np['ctc_loss'] = float(ctc_loss)

+        # loss backward
+        if (batch_index + 1) % train_conf.accum_grad != 0:
+            # Disable gradient synchronizations across DDP processes.
+            # Within this context, gradients will be accumulated on module
+            # variables, which will later be synchronized.
+            context = self.model.no_sync
+        else:
+            # Used for single gpu training and DDP gradient synchronization
+            # processes.
+            context = nullcontext
+        with context():
+            loss.backward()
+            layer_tools.print_grads(self.model, print_func=None)
+
+        # optimizer step
        if (batch_index + 1) % train_conf.accum_grad == 0:
            self.optimizer.step()
            self.optimizer.clear_grad()
@ -167,43 +185,42 @@ class U2Trainer(Trainer):
        # script_model_path = str(self.checkpoint_dir / 'init')
        # paddle.jit.save(script_model, script_model_path)

-        from_scratch = self.resume_or_scratch()
-        if from_scratch:
-            # save init model, i.e. 0 epoch
-            self.save(tag='init')
-        self.lr_scheduler.step(self.iteration)
+        self.before_train()

        logger.info(f"Train Total Examples: {len(self.train_loader.dataset)}")
        while self.epoch < self.config.training.n_epoch:
-            self.model.train()
-            try:
-                data_start_time = time.time()
-                for batch_index, batch in enumerate(self.train_loader):
-                    dataload_time = time.time() - data_start_time
-                    msg = "Train: Rank: {}, ".format(dist.get_rank())
-                    msg += "epoch: {}, ".format(self.epoch)
-                    msg += "step: {}, ".format(self.iteration)
-                    msg += "batch : {}/{}, ".format(batch_index + 1,
-                                                    len(self.train_loader))
-                    msg += "lr: {:>.8f}, ".format(self.lr_scheduler())
-                    msg += "data time: {:>.3f}s, ".format(dataload_time)
-                    self.train_batch(batch_index, batch, msg)
+            with Timer("Epoch-Train Time Cost: {}"):
+                self.model.train()
+                try:
                    data_start_time = time.time()
-            except Exception as e:
-                logger.error(e)
-                raise e
-
-            total_loss, num_seen_utts = self.valid()
-            if dist.get_world_size() > 1:
-                num_seen_utts = paddle.to_tensor(num_seen_utts)
-                # the default operator in all_reduce function is sum.
-                dist.all_reduce(num_seen_utts)
-                total_loss = paddle.to_tensor(total_loss)
-                dist.all_reduce(total_loss)
-                cv_loss = total_loss / num_seen_utts
-                cv_loss = float(cv_loss)
-            else:
-                cv_loss = total_loss / num_seen_utts
+                    for batch_index, batch in enumerate(self.train_loader):
+                        dataload_time = time.time() - data_start_time
+                        msg = "Train: Rank: {}, ".format(dist.get_rank())
+                        msg += "epoch: {}, ".format(self.epoch)
+                        msg += "step: {}, ".format(self.iteration)
+                        msg += "batch : {}/{}, ".format(batch_index + 1,
+                                                        len(self.train_loader))
+                        msg += "lr: {:>.8f}, ".format(self.lr_scheduler())
+                        msg += "data time: {:>.3f}s, ".format(dataload_time)
+                        self.train_batch(batch_index, batch, msg)
+                        self.after_train_batch()
+                        data_start_time = time.time()
+                except Exception as e:
+                    logger.error(e)
+                    raise e
+
+            with Timer("Eval Time Cost: {}"):
+                total_loss, num_seen_utts = self.valid()
+                if dist.get_world_size() > 1:
+                    num_seen_utts = paddle.to_tensor(num_seen_utts)
+                    # the default operator in all_reduce function is sum.
+                    dist.all_reduce(num_seen_utts)
+                    total_loss = paddle.to_tensor(total_loss)
+                    dist.all_reduce(total_loss)
+                    cv_loss = total_loss / num_seen_utts
+                    cv_loss = float(cv_loss)
+                else:
+                    cv_loss = total_loss / num_seen_utts

            logger.info(
                'Epoch {} Val info val_loss {}'.format(self.epoch, cv_loss))
@ -300,10 +317,10 @@ class U2Trainer(Trainer):

        # model
        model_conf = config.model
-        model_conf.defrost()
-        model_conf.input_dim = self.train_loader.feat_dim
-        model_conf.output_dim = self.train_loader.vocab_size
-        model_conf.freeze()
+        with UpdateConfig(model_conf):
+            model_conf.input_dim = self.train_loader.feat_dim
+            model_conf.output_dim = self.train_loader.vocab_size
+
        model = U2Model.from_config(model_conf)
        if self.parallel:
            model = paddle.DataParallel(model)
@ -429,9 +446,10 @@ class U2Tester(U2Trainer):
            len_refs += len_ref
            num_ins += 1
            if fout:
-                fout.write(utt + " " + result + "\n")
-            logger.info("\nTarget Transcription: %s\nOutput Transcription: %s" %
-                        (target, result))
+                fout.write({"utt": utt, "ref": target, "hyp": result})
+            logger.info(f"Utt: {utt}")
+            logger.info(f"Ref: {target}")
+            logger.info(f"Hyp: {result}")
            logger.info("One example error rate [%s] = %f" %
                        (cfg.error_rate_type, error_rate_func(target, result)))

@ -456,7 +474,7 @@ class U2Tester(U2Trainer):
        errors_sum, len_refs, num_ins = 0.0, 0, 0
        num_frames = 0.0
        num_time = 0.0
-        with open(self.args.result_file, 'w') as fout:
+        with jsonlines.open(self.args.result_file, 'w') as fout:
            for i, batch in enumerate(self.test_loader):
                metrics = self.compute_metrics(*batch, fout=fout)
                num_frames += metrics['num_frames']
@ -526,9 +544,8 @@ class U2Tester(U2Trainer):
        self.model.eval()
        logger.info(f"Align Total Examples: {len(self.align_loader.dataset)}")

-        stride_ms = self.config.collater.stride_ms
-        token_dict = self.args.char_list
-
+        stride_ms = self.align_loader.collate_fn.stride_ms
+        token_dict = self.align_loader.collate_fn.vocab_list
        with open(self.args.result_file, 'w') as fout:
            # one example in batch
            for i, batch in enumerate(self.align_loader):
@ -537,7 +554,7 @@ class U2Tester(U2Trainer):
                # 1. Encoder
                encoder_out, encoder_mask = self.model._forward_encoder(
                    feat, feats_length)  # (B, maxlen, encoder_dim)
-                maxlen = encoder_out.size(1)
+                maxlen = encoder_out.shape[1]
                ctc_probs = self.model.ctc.log_softmax(
                    encoder_out)  # (1, maxlen, vocab_size)

@ -545,26 +562,25 @@ class U2Tester(U2Trainer):
                ctc_probs = ctc_probs.squeeze(0)
                target = target.squeeze(0)
                alignment = ctc_utils.forced_align(ctc_probs, target)
-                logger.info("align ids", key[0], alignment)
+                logger.info(f"align ids: {key[0]} {alignment}")
                fout.write('{} {}\n'.format(key[0], alignment))

                # 3. gen praat
                # segment alignment
                align_segs = text_grid.segment_alignment(alignment)
-                logger.info("align tokens", key[0], align_segs)
+                logger.info(f"align tokens: {key[0]}, {align_segs}")
                # IntervalTier, List["start end token\n"]
                subsample = utility.get_subsample(self.config)
                tierformat = text_grid.align_to_tierformat(
                    align_segs, subsample, token_dict)
                # write tier
-                align_output_path = os.path.join(
-                    os.path.dirname(self.args.result_file), "align")
-                tier_path = os.path.join(align_output_path, key[0] + ".tier")
-                with open(tier_path, 'w') as f:
+                align_output_path = Path(self.args.result_file).parent / "align"
+                align_output_path.mkdir(parents=True, exist_ok=True)
+                tier_path = align_output_path / (key[0] + ".tier")
+                with tier_path.open('w') as f:
                    f.writelines(tierformat)
                # write textgrid
-                textgrid_path = os.path.join(align_output_path,
-                                             key[0] + ".TextGrid")
+                textgrid_path = align_output_path / (key[0] + ".TextGrid")
                second_per_frame = 1. / (1000. /
                                         stride_ms)  # 25ms window, 10ms stride
                second_per_example = (
@ -572,7 +588,7 @@ class U2Tester(U2Trainer):
                text_grid.generate_textgrid(
                    maxtime=second_per_example,
                    intervals=tierformat,
-                    output=textgrid_path)
+                    output=str(textgrid_path))

    def run_align(self):
        self.resume_or_scratch()
@ -623,7 +639,7 @@ class U2Tester(U2Trainer):
    def setup(self):
        """Setup the experiment.
        """
-        paddle.set_device(self.args.device)
+        paddle.set_device('gpu' if self.args.nprocs > 0 else 'cpu')

        self.setup_output_dir()
        self.setup_checkpointer()
--- a/deepspeech/exps/u2_st/bin/train.py
+++ b/deepspeech/exps/u2_st/bin/train.py
@ -30,7 +30,7 @@ def main_sp(config, args):


 def main(config, args):
-    if args.device == "gpu" and args.nprocs > 1:
+    if args.nprocs > 0:
        dist.spawn(main_sp, args=(config, args), nprocs=args.nprocs)
    else:
        main_sp(config, args)
--- a/deepspeech/exps/u2_st/model.py
+++ b/deepspeech/exps/u2_st/model.py
@ -17,9 +17,11 @@ import os
 import sys
 import time
 from collections import defaultdict
+from contextlib import nullcontext
 from pathlib import Path
 from typing import Optional

+import jsonlines
 import numpy as np
 import paddle
 from paddle import distributed as dist
@ -37,6 +39,7 @@ from deepspeech.io.sampler import SortagradDistributedBatchSampler
 from deepspeech.models.u2_st import U2STModel
 from deepspeech.training.gradclip import ClipGradByGlobalNormWithLog
 from deepspeech.training.scheduler import WarmupLR
+from deepspeech.training.timer import Timer
 from deepspeech.training.trainer import Trainer
 from deepspeech.utils import bleu_score
 from deepspeech.utils import ctc_utils
@ -45,6 +48,7 @@ from deepspeech.utils import mp_tools
 from deepspeech.utils import text_grid
 from deepspeech.utils import utility
 from deepspeech.utils.log import Log
+from deepspeech.utils.utility import UpdateConfig

 logger = Log(__name__).getlog()

@ -83,6 +87,7 @@ class U2STTrainer(Trainer):
    def train_batch(self, batch_index, batch_data, msg):
        train_conf = self.config.training
        start = time.time()
+        # forward
        utt, audio, audio_len, text, text_len = batch_data
        if isinstance(text, list) and isinstance(text_len, list):
            # joint training with ASR. Two decoding texts [translation, transcription]
@ -94,18 +99,30 @@ class U2STTrainer(Trainer):
        else:
            loss, st_loss, attention_loss, ctc_loss = self.model(
                audio, audio_len, text, text_len)
+
        # loss div by `batch_size * accum_grad`
        loss /= train_conf.accum_grad
-        loss.backward()
-        layer_tools.print_grads(self.model, print_func=None)
-
        losses_np = {'loss': float(loss) * train_conf.accum_grad}
-        losses_np['st_loss'] = float(st_loss)
        if attention_loss:
            losses_np['att_loss'] = float(attention_loss)
        if ctc_loss:
            losses_np['ctc_loss'] = float(ctc_loss)

+        # loss backward
+        if (batch_index + 1) % train_conf.accum_grad != 0:
+            # Disable gradient synchronizations across DDP processes.
+            # Within this context, gradients will be accumulated on module
+            # variables, which will later be synchronized.
+            context = self.model.no_sync
+        else:
+            # Used for single gpu training and DDP gradient synchronization
+            # processes.
+            context = nullcontext
+        with context():
+            loss.backward()
+            layer_tools.print_grads(self.model, print_func=None)
+
+        # optimizer step
        if (batch_index + 1) % train_conf.accum_grad == 0:
            self.optimizer.step()
            self.optimizer.clear_grad()
@ -182,46 +199,42 @@ class U2STTrainer(Trainer):
        # script_model_path = str(self.checkpoint_dir / 'init')
        # paddle.jit.save(script_model, script_model_path)

-        from_scratch = self.resume_or_scratch()
-        if from_scratch:
-            # save init model, i.e. 0 epoch
-            self.save(tag='init')
-
-        self.lr_scheduler.step(self.iteration)
-        if self.parallel:
-            self.train_loader.batch_sampler.set_epoch(self.epoch)
+        self.before_train()

        logger.info(f"Train Total Examples: {len(self.train_loader.dataset)}")
        while self.epoch < self.config.training.n_epoch:
-            self.model.train()
-            try:
-                data_start_time = time.time()
-                for batch_index, batch in enumerate(self.train_loader):
-                    dataload_time = time.time() - data_start_time
-                    msg = "Train: Rank: {}, ".format(dist.get_rank())
-                    msg += "epoch: {}, ".format(self.epoch)
-                    msg += "step: {}, ".format(self.iteration)
-                    msg += "batch : {}/{}, ".format(batch_index + 1,
-                                                    len(self.train_loader))
-                    msg += "lr: {:>.8f}, ".format(self.lr_scheduler())
-                    msg += "data time: {:>.3f}s, ".format(dataload_time)
-                    self.train_batch(batch_index, batch, msg)
+            with Timer("Epoch-Train Time Cost: {}"):
+                self.model.train()
+                try:
                    data_start_time = time.time()
-            except Exception as e:
-                logger.error(e)
-                raise e
-
-            total_loss, num_seen_utts = self.valid()
-            if dist.get_world_size() > 1:
-                num_seen_utts = paddle.to_tensor(num_seen_utts)
-                # the default operator in all_reduce function is sum.
-                dist.all_reduce(num_seen_utts)
-                total_loss = paddle.to_tensor(total_loss)
-                dist.all_reduce(total_loss)
-                cv_loss = total_loss / num_seen_utts
-                cv_loss = float(cv_loss)
-            else:
-                cv_loss = total_loss / num_seen_utts
+                    for batch_index, batch in enumerate(self.train_loader):
+                        dataload_time = time.time() - data_start_time
+                        msg = "Train: Rank: {}, ".format(dist.get_rank())
+                        msg += "epoch: {}, ".format(self.epoch)
+                        msg += "step: {}, ".format(self.iteration)
+                        msg += "batch : {}/{}, ".format(batch_index + 1,
+                                                        len(self.train_loader))
+                        msg += "lr: {:>.8f}, ".format(self.lr_scheduler())
+                        msg += "data time: {:>.3f}s, ".format(dataload_time)
+                        self.train_batch(batch_index, batch, msg)
+                        self.after_train_batch()
+                        data_start_time = time.time()
+                except Exception as e:
+                    logger.error(e)
+                    raise e
+
+            with Timer("Eval Time Cost: {}"):
+                total_loss, num_seen_utts = self.valid()
+                if dist.get_world_size() > 1:
+                    num_seen_utts = paddle.to_tensor(num_seen_utts)
+                    # the default operator in all_reduce function is sum.
+                    dist.all_reduce(num_seen_utts)
+                    total_loss = paddle.to_tensor(total_loss)
+                    dist.all_reduce(total_loss)
+                    cv_loss = total_loss / num_seen_utts
+                    cv_loss = float(cv_loss)
+                else:
+                    cv_loss = total_loss / num_seen_utts

            logger.info(
                'Epoch {} Val info val_loss {}'.format(self.epoch, cv_loss))
@ -327,10 +340,10 @@ class U2STTrainer(Trainer):
    def setup_model(self):
        config = self.config
        model_conf = config.model
-        model_conf.defrost()
-        model_conf.input_dim = self.train_loader.collate_fn.feature_size
-        model_conf.output_dim = self.train_loader.collate_fn.vocab_size
-        model_conf.freeze()
+        with UpdateConfig(model_conf):
+            model_conf.input_dim = self.train_loader.collate_fn.feature_size
+            model_conf.output_dim = self.train_loader.collate_fn.vocab_size
+
        model = U2STModel.from_config(model_conf)

        if self.parallel:
@ -467,8 +480,10 @@ class U2STTester(U2STTrainer):
            len_refs += len(target.split())
            num_ins += 1
            if fout:
-                fout.write(utt + " " + result + "\n")
-            logger.info("\nReference: %s\nHypothesis: %s" % (target, result))
+                fout.write({"utt": utt, "ref": target, "hyp": result})
+            logger.info(f"Utt: {utt}")
+            logger.info(f"Ref: {target}")
+            logger.info(f"Hyp: {result}")
            logger.info("One example BLEU = %s" %
                        (bleu_func([result], [[target]]).prec_str))

@ -496,7 +511,7 @@ class U2STTester(U2STTrainer):
        len_refs, num_ins = 0, 0
        num_frames = 0.0
        num_time = 0.0
-        with open(self.args.result_file, 'w') as fout:
+        with jsonlines.open(self.args.result_file, 'w') as fout:
            for i, batch in enumerate(self.test_loader):
                metrics = self.compute_translation_metrics(
                    *batch, bleu_func=bleu_func, fout=fout)
@ -569,7 +584,7 @@ class U2STTester(U2STTrainer):
                # 1. Encoder
                encoder_out, encoder_mask = self.model._forward_encoder(
                    feat, feats_length)  # (B, maxlen, encoder_dim)
-                maxlen = encoder_out.size(1)
+                maxlen = encoder_out.shape[1]
                ctc_probs = self.model.ctc.log_softmax(
                    encoder_out)  # (1, maxlen, vocab_size)

@ -577,26 +592,25 @@ class U2STTester(U2STTrainer):
                ctc_probs = ctc_probs.squeeze(0)
                target = target.squeeze(0)
                alignment = ctc_utils.forced_align(ctc_probs, target)
-                logger.info("align ids", key[0], alignment)
+                logger.info(f"align ids: {key[0]} {alignment}")
                fout.write('{} {}\n'.format(key[0], alignment))

                # 3. gen praat
                # segment alignment
                align_segs = text_grid.segment_alignment(alignment)
-                logger.info("align tokens", key[0], align_segs)
+                logger.info(f"align tokens: {key[0]}, {align_segs}")
                # IntervalTier, List["start end token\n"]
                subsample = utility.get_subsample(self.config)
                tierformat = text_grid.align_to_tierformat(
                    align_segs, subsample, token_dict)
                # write tier
-                align_output_path = os.path.join(
-                    os.path.dirname(self.args.result_file), "align")
-                tier_path = os.path.join(align_output_path, key[0] + ".tier")
-                with open(tier_path, 'w') as f:
+                align_output_path = Path(self.args.result_file).parent / "align"
+                align_output_path.mkdir(parents=True, exist_ok=True)
+                tier_path = align_output_path / (key[0] + ".tier")
+                with tier_path.open('w') as f:
                    f.writelines(tierformat)
                # write textgrid
-                textgrid_path = os.path.join(align_output_path,
-                                             key[0] + ".TextGrid")
+                textgrid_path = align_output_path / (key[0] + ".TextGrid")
                second_per_frame = 1. / (1000. /
                                         stride_ms)  # 25ms window, 10ms stride
                second_per_example = (
@ -604,7 +618,7 @@ class U2STTester(U2STTrainer):
                text_grid.generate_textgrid(
                    maxtime=second_per_example,
                    intervals=tierformat,
-                    output=textgrid_path)
+                    output=str(textgrid_path))

    def run_align(self):
        self.resume_or_scratch()
@ -650,7 +664,7 @@ class U2STTester(U2STTrainer):
    def setup(self):
        """Setup the experiment.
        """
-        paddle.set_device(self.args.device)
+        paddle.set_device('gpu' if self.args.nprocs > 0 else 'cpu')

        self.setup_output_dir()
        self.setup_checkpointer()
--- a/deepspeech/frontend/featurizer/text_featurizer.py
+++ b/deepspeech/frontend/featurizer/text_featurizer.py
@ -76,7 +76,7 @@ class TextFeaturizer():

        Args:
            text (str): Text.
-        
+
        Returns:
            List[int]: List of token indices.
        """
@ -89,7 +89,7 @@ class TextFeaturizer():

    def defeaturize(self, idxs):
        """Convert a list of token indices to text string,
-        ignore index after eos_id. 
+        ignore index after eos_id.

        Args:
            idxs (List[int]): List of token indices.
@ -196,7 +196,12 @@ class TextFeaturizer():
            [(idx, token) for (idx, token) in enumerate(vocab_list)])
        token2id = dict(
            [(token, idx) for (idx, token) in enumerate(vocab_list)])
-
-        unk_id = vocab_list.index(UNK)
-        eos_id = vocab_list.index(EOS)
+        if UNK in vocab_list:
+            unk_id = vocab_list.index(UNK)
+        else:
+            unk_id = -1
+        if EOS in vocab_list:
+            eos_id = vocab_list.index(EOS)
+        else:
+            eos_id = -1
        return token2id, id2token, vocab_list, unk_id, eos_id
--- a/deepspeech/frontend/normalizer.py
+++ b/deepspeech/frontend/normalizer.py
@ -130,7 +130,8 @@ class FeatureNormalizer(object):

    def _read_mean_std_from_file(self, filepath, eps=1e-20):
        """Load mean and std from file."""
-        mean, istd = load_cmvn(filepath, filetype='json')
+        filetype = filepath.split(".")[-1]
+        mean, istd = load_cmvn(filepath, filetype=filetype)
        self._mean = np.expand_dims(mean, axis=0)
        self._istd = np.expand_dims(istd, axis=0)

--- a/deepspeech/frontend/utility.py
+++ b/deepspeech/frontend/utility.py
@ -12,13 +12,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Contains data helper functions."""
-import codecs
 import json
 import math
 from typing import List
 from typing import Optional
 from typing import Text

+import jsonlines
 import numpy as np

 from deepspeech.utils.log import Log
@ -69,19 +69,19 @@ def read_manifest(

    Args:
        manifest_path ([type]): Manifest file to load and parse.
-        max_input_len ([type], optional): maximum output seq length, 
-            in seconds for raw wav, in frame numbers for feature data. 
+        max_input_len ([type], optional): maximum output seq length,
+            in seconds for raw wav, in frame numbers for feature data.
            Defaults to float('inf').
-        min_input_len (float, optional): minimum input seq length, 
-            in seconds for raw wav, in frame numbers for feature data. 
+        min_input_len (float, optional): minimum input seq length,
+            in seconds for raw wav, in frame numbers for feature data.
            Defaults to 0.0.
-        max_output_len (float, optional): maximum input seq length, 
+        max_output_len (float, optional): maximum input seq length,
            in modeling units. Defaults to 500.0.
-        min_output_len (float, optional): minimum input seq length, 
+        min_output_len (float, optional): minimum input seq length,
            in modeling units. Defaults to 0.0.
-        max_output_input_ratio (float, optional): 
+        max_output_input_ratio (float, optional):
            maximum output seq length/output seq length ratio. Defaults to 10.0.
-        min_output_input_ratio (float, optional): 
+        min_output_input_ratio (float, optional):
            minimum output seq length/output seq length ratio. Defaults to 0.05.

    Raises:
@ -92,26 +92,22 @@ def read_manifest(
    """

    manifest = []
-    for json_line in codecs.open(manifest_path, 'r', 'utf-8'):
-        try:
-            json_data = json.loads(json_line)
-        except Exception as e:
-            raise IOError("Error reading manifest: %s" % str(e))
-
-        feat_len = json_data["feat_shape"][
-            0] if 'feat_shape' in json_data else 1.0
-        token_len = json_data["token_shape"][
-            0] if 'token_shape' in json_data else 1.0
-        conditions = [
-            feat_len >= min_input_len,
-            feat_len <= max_input_len,
-            token_len >= min_output_len,
-            token_len <= max_output_len,
-            token_len / feat_len >= min_output_input_ratio,
-            token_len / feat_len <= max_output_input_ratio,
-        ]
-        if all(conditions):
-            manifest.append(json_data)
+    with jsonlines.open(manifest_path, 'r') as reader:
+        for json_data in reader:
+            feat_len = json_data["feat_shape"][
+                0] if 'feat_shape' in json_data else 1.0
+            token_len = json_data["token_shape"][
+                0] if 'token_shape' in json_data else 1.0
+            conditions = [
+                feat_len >= min_input_len,
+                feat_len <= max_input_len,
+                token_len >= min_output_len,
+                token_len <= max_output_len,
+                token_len / feat_len >= min_output_input_ratio,
+                token_len / feat_len <= max_output_input_ratio,
+            ]
+            if all(conditions):
+                manifest.append(json_data)
    return manifest


@ -131,7 +127,7 @@ def rms_to_dbfs(rms: float):
    """Root Mean Square to dBFS.
    https://fireattack.wordpress.com/2017/02/06/replaygain-loudness-normalization-and-applications/
    Audio is mix of sine wave, so 1 amp sine wave's Full scale is 0.7071, equal to -3.0103dB.
-   
+
    dB = dBFS + 3.0103
    dBFS = db - 3.0103
    e.g. 0 dB = -3.0103 dBFS
@ -146,26 +142,26 @@ def rms_to_dbfs(rms: float):


 def max_dbfs(sample_data: np.ndarray):
-    """Peak dBFS based on the maximum energy sample. 
+    """Peak dBFS based on the maximum energy sample.

    Args:
        sample_data ([np.ndarray]): float array, [-1, 1].

    Returns:
-        float: dBFS 
+        float: dBFS
    """
    # Peak dBFS based on the maximum energy sample. Will prevent overdrive if used for normalization.
    return rms_to_dbfs(max(abs(np.min(sample_data)), abs(np.max(sample_data))))


 def mean_dbfs(sample_data):
-    """Peak dBFS based on the RMS energy. 
+    """Peak dBFS based on the RMS energy.

    Args:
        sample_data ([np.ndarray]): float array, [-1, 1].

    Returns:
-        float: dBFS 
+        float: dBFS
    """
    return rms_to_dbfs(
        math.sqrt(np.mean(np.square(sample_data, dtype=np.float64))))
@ -185,7 +181,7 @@ def gain_db_to_ratio(gain_db: float):

 def normalize_audio(sample_data: np.ndarray, dbfs: float=-3.0103):
    """Nomalize audio to dBFS.
-    
+
    Args:
        sample_data (np.ndarray): input wave samples, [-1, 1].
        dbfs (float, optional): target dBFS. Defaults to -3.0103.
@ -284,6 +280,13 @@ def load_cmvn(cmvn_file: str, filetype: str):
        cmvn = _load_json_cmvn(cmvn_file)
    elif filetype == "kaldi":
        cmvn = _load_kaldi_cmvn(cmvn_file)
+    elif filetype == "npz":
+        eps = 1e-14
+        npzfile = np.load(cmvn_file)
+        mean = np.squeeze(npzfile["mean"])
+        std = np.squeeze(npzfile["std"])
+        istd = 1 / (std + eps)
+        cmvn = [mean, istd]
    else:
        raise ValueError(f"cmvn file type no support: {filetype}")
    return cmvn[0], cmvn[1]
--- a/deepspeech/io/collator.py
+++ b/deepspeech/io/collator.py
@ -292,10 +292,6 @@ class SpeechCollator():
        olens = np.array(text_lens).astype(np.int64)
        return utts, xs_pad, ilens, ys_pad, olens

-    @property
-    def manifest(self):
-        return self._manifest
-
    @property
    def vocab_size(self):
        return self._speech_featurizer.vocab_size
--- a/deepspeech/io/dataloader.py
+++ b/deepspeech/io/dataloader.py
@ -44,7 +44,7 @@ def feat_dim_and_vocab_size(data_json: List[Dict[Text, Any]],


 def batch_collate(x):
-    """de-tuple.
+    """de-minibatch, since user compose batch.

    Args:
        x (List[Tuple]): [(utts, xs, ilens, ys, olens)]
--- a/deepspeech/io/dataset.py
+++ b/deepspeech/io/dataset.py
@ -76,19 +76,19 @@ class ManifestDataset(Dataset):

        Args:
            manifest_path (str): manifest josn file path
-            max_input_len ([type], optional): maximum output seq length, 
+            max_input_len ([type], optional): maximum output seq length,
                in seconds for raw wav, in frame numbers for feature data. Defaults to float('inf').
-            min_input_len (float, optional): minimum input seq length, 
+            min_input_len (float, optional): minimum input seq length,
                in seconds for raw wav, in frame numbers for feature data. Defaults to 0.0.
-            max_output_len (float, optional): maximum input seq length, 
+            max_output_len (float, optional): maximum input seq length,
                in modeling units. Defaults to 500.0.
-            min_output_len (float, optional): minimum input seq length, 
+            min_output_len (float, optional): minimum input seq length,
                in modeling units. Defaults to 0.0.
-            max_output_input_ratio (float, optional): maximum output seq length/output seq length ratio. 
+            max_output_input_ratio (float, optional): maximum output seq length/output seq length ratio.
                Defaults to 10.0.
            min_output_input_ratio (float, optional): minimum output seq length/output seq length ratio.
                Defaults to 0.05.
-        
+
        """
        super().__init__()

@ -147,3 +147,131 @@ class TransformDataset(Dataset):
    def __getitem__(self, idx):
        """[] operator."""
        return self.converter([self.reader(self.data[idx], return_uttid=True)])
+
+
+class AudioDataset(Dataset):
+    def __init__(self,
+                 data_file,
+                 max_length=10240,
+                 min_length=0,
+                 token_max_length=200,
+                 token_min_length=1,
+                 batch_type='static',
+                 batch_size=1,
+                 max_frames_in_batch=0,
+                 sort=True,
+                 raw_wav=True,
+                 stride_ms=10):
+        """Dataset for loading audio data.
+        Attributes::
+            data_file: input data file
+                Plain text data file, each line contains following 7 fields,
+                which is split by '\t':
+                    utt:utt1
+                    feat:tmp/data/file1.wav or feat:tmp/data/fbank.ark:30
+                    feat_shape: 4.95(in seconds) or feat_shape:495,80(495 is in frames)
+                    text:i love you
+                    token: i <space> l o v e <space> y o u
+                    tokenid: int id of this token
+                    token_shape: M,N    # M is the number of token, N is vocab size
+            max_length: drop utterance which is greater than max_length(10ms), unit 10ms.
+            min_length: drop utterance which is less than min_length(10ms), unit 10ms.
+            token_max_length: drop utterance which is greater than token_max_length,
+                especially when use char unit for english modeling
+            token_min_length: drop utterance which is less than token_max_length
+            batch_type: static or dynamic, see max_frames_in_batch(dynamic)
+            batch_size: number of utterances in a batch,
+               it's for static batch size.
+            max_frames_in_batch: max feature frames in a batch,
+               when batch_type is dynamic, it's for dynamic batch size.
+               Then batch_size is ignored, we will keep filling the
+               batch until the total frames in batch up to max_frames_in_batch.
+            sort: whether to sort all data, so the utterance with the same
+               length could be filled in a same batch.
+            raw_wav: use raw wave or extracted featute.
+                if raw wave is used, dynamic waveform-level augmentation could be used
+                and the feature is extracted by torchaudio.
+                if extracted featute(e.g. by kaldi) is used, only feature-level
+                augmentation such as specaug could be used.
+        """
+        assert batch_type in ['static', 'dynamic']
+        # read manifest
+        data = read_manifest(data_file)
+        if sort:
+            data = sorted(data, key=lambda x: x["feat_shape"][0])
+        if raw_wav:
+            assert data[0]['feat'].split(':')[0].splitext()[-1] not in ('.ark',
+                                                                        '.scp')
+            data = map(lambda x: (float(x['feat_shape'][0]) * 1000 / stride_ms))
+
+        self.input_dim = data[0]['feat_shape'][1]
+        self.output_dim = data[0]['token_shape'][1]
+
+        # with open(data_file, 'r') as f:
+        #     for line in f:
+        #         arr = line.strip().split('\t')
+        #         if len(arr) != 7:
+        #             continue
+        #         key = arr[0].split(':')[1]
+        #         tokenid = arr[5].split(':')[1]
+        #         output_dim = int(arr[6].split(':')[1].split(',')[1])
+        #         if raw_wav:
+        #             wav_path = ':'.join(arr[1].split(':')[1:])
+        #             duration = int(float(arr[2].split(':')[1]) * 1000 / 10)
+        #             data.append((key, wav_path, duration, tokenid))
+        #         else:
+        #             feat_ark = ':'.join(arr[1].split(':')[1:])
+        #             feat_info = arr[2].split(':')[1].split(',')
+        #             feat_dim = int(feat_info[1].strip())
+        #             num_frames = int(feat_info[0].strip())
+        #             data.append((key, feat_ark, num_frames, tokenid))
+        #             self.input_dim = feat_dim
+        #         self.output_dim = output_dim
+
+        valid_data = []
+        for i in range(len(data)):
+            length = data[i]['feat_shape'][0]
+            token_length = data[i]['token_shape'][0]
+            # remove too lang or too short utt for both input and output
+            # to prevent from out of memory
+            if length > max_length or length < min_length:
+                # logging.warn('ignore utterance {} feature {}'.format(
+                #     data[i][0], length))
+                pass
+            elif token_length > token_max_length or token_length < token_min_length:
+                pass
+            else:
+                valid_data.append(data[i])
+        data = valid_data
+
+        self.minibatch = []
+        num_data = len(data)
+        # Dynamic batch size
+        if batch_type == 'dynamic':
+            assert (max_frames_in_batch > 0)
+            self.minibatch.append([])
+            num_frames_in_batch = 0
+            for i in range(num_data):
+                length = data[i]['feat_shape'][0]
+                num_frames_in_batch += length
+                if num_frames_in_batch > max_frames_in_batch:
+                    self.minibatch.append([])
+                    num_frames_in_batch = length
+                self.minibatch[-1].append(data[i])
+        # Static batch size
+        else:
+            cur = 0
+            while cur < num_data:
+                end = min(cur + batch_size, num_data)
+                item = []
+                for i in range(cur, end):
+                    item.append(data[i])
+                self.minibatch.append(item)
+                cur = end
+
+    def __len__(self):
+        return len(self.minibatch)
+
+    def __getitem__(self, idx):
+        instance = self.minibatch[idx]
+        return instance["utt"], instance["feat"], instance["text"]
--- a/deepspeech/models/ds2/conv.py
+++ b/deepspeech/models/ds2/conv.py
@ -106,11 +106,9 @@ class ConvBn(nn.Layer):
        # reset padding part to 0
        masks = make_non_pad_mask(x_len)  #[B, T]
        masks = masks.unsqueeze(1).unsqueeze(1)  # [B, 1, 1, T]
-        # TODO(Hui Zhang): not support bool multiply
-        # masks = masks.type_as(x)
-        masks = masks.astype(x.dtype)
-        x = x.multiply(masks)
-
+        # https://github.com/PaddlePaddle/Paddle/pull/29265
+        # rhs will type promote to lhs
+        x = x * masks
        return x, x_len


--- a/deepspeech/models/ds2/deepspeech2.py
+++ b/deepspeech/models/ds2/deepspeech2.py
@ -128,8 +128,8 @@ class DeepSpeech2Model(nn.Layer):
                num_rnn_layers=3,  #Number of stacking RNN layers.
                rnn_layer_size=1024,  #RNN layer size (number of RNN cells).
                use_gru=True,  #Use gru if set True. Use simple rnn if set False.
-                share_rnn_weights=True  #Whether to share input-hidden weights between forward and backward directional RNNs.Notice that for GRU, weight sharing is not supported.
-            ))
+                share_rnn_weights=True,  #Whether to share input-hidden weights between forward and backward directional RNNs.Notice that for GRU, weight sharing is not supported.
+                ctc_grad_norm_type='instance', ))
        if config is not None:
            config.merge_from_other_cfg(default)
        return default
@ -141,7 +141,9 @@ class DeepSpeech2Model(nn.Layer):
                 num_rnn_layers=3,
                 rnn_size=1024,
                 use_gru=False,
-                 share_rnn_weights=True):
+                 share_rnn_weights=True,
+                 blank_id=0,
+                 ctc_grad_norm_type='instance'):
        super().__init__()
        self.encoder = CRNNEncoder(
            feat_size=feat_size,
@ -156,10 +158,11 @@ class DeepSpeech2Model(nn.Layer):
        self.decoder = CTCDecoder(
            odim=dict_size,  # <blank> is in  vocab
            enc_n_units=self.encoder.output_size,
-            blank_id=0,  # first token is <blank>
+            blank_id=blank_id,
            dropout_rate=0.0,
            reduction=True,  # sum
-            batch_average=True)  # sum / batch_size
+            batch_average=True,  # sum / batch_size
+            grad_norm_type=ctc_grad_norm_type)

    def forward(self, audio, audio_len, text, text_len):
        """Compute Model loss
@ -221,7 +224,8 @@ class DeepSpeech2Model(nn.Layer):
                    num_rnn_layers=config.model.num_rnn_layers,
                    rnn_size=config.model.rnn_layer_size,
                    use_gru=config.model.use_gru,
-                    share_rnn_weights=config.model.share_rnn_weights)
+                    share_rnn_weights=config.model.share_rnn_weights,
+                    blank_id=config.model.blank_id)
        infos = Checkpoint().load_parameters(
            model, checkpoint_path=checkpoint_path)
        logger.info(f"checkpoint info: {infos}")
@ -246,7 +250,8 @@ class DeepSpeech2Model(nn.Layer):
                    num_rnn_layers=config.num_rnn_layers,
                    rnn_size=config.rnn_layer_size,
                    use_gru=config.use_gru,
-                    share_rnn_weights=config.share_rnn_weights)
+                    share_rnn_weights=config.share_rnn_weights,
+                    blank_id=config.blank_id)
        return model


@ -258,7 +263,8 @@ class DeepSpeech2InferModel(DeepSpeech2Model):
                 num_rnn_layers=3,
                 rnn_size=1024,
                 use_gru=False,
-                 share_rnn_weights=True):
+                 share_rnn_weights=True,
+                 blank_id=0):
        super().__init__(
            feat_size=feat_size,
            dict_size=dict_size,
@ -266,7 +272,8 @@ class DeepSpeech2InferModel(DeepSpeech2Model):
            num_rnn_layers=num_rnn_layers,
            rnn_size=rnn_size,
            use_gru=use_gru,
-            share_rnn_weights=share_rnn_weights)
+            share_rnn_weights=share_rnn_weights,
+            blank_id=blank_id)

    def forward(self, audio, audio_len):
        """export model function
--- a/deepspeech/models/ds2/rnn.py
+++ b/deepspeech/models/ds2/rnn.py
@ -308,7 +308,8 @@ class RNNStack(nn.Layer):
            x, x_len = rnn(x, x_len)
            masks = make_non_pad_mask(x_len)  #[B, T]
            masks = masks.unsqueeze(-1)  # [B, T, 1]
-            # TODO(Hui Zhang): not support bool multiply
-            masks = masks.astype(x.dtype)
-            x = x.multiply(masks)
+            # https://github.com/PaddlePaddle/Paddle/pull/29265
+            # rhs will type promote to lhs
+            x = x * masks
+
        return x, x_len
--- a/deepspeech/models/ds2_online/deepspeech2.py
+++ b/deepspeech/models/ds2_online/deepspeech2.py
@ -254,6 +254,7 @@ class DeepSpeech2ModelOnline(nn.Layer):
                num_fc_layers=2,
                fc_layers_size_list=[512, 256],
                use_gru=True,  #Use gru if set True. Use simple rnn if set False.
+                blank_id=0,  # index of blank in vocob.txt
            ))
        if config is not None:
            config.merge_from_other_cfg(default)
@ -268,7 +269,8 @@ class DeepSpeech2ModelOnline(nn.Layer):
                 rnn_direction='forward',
                 num_fc_layers=2,
                 fc_layers_size_list=[512, 256],
-                 use_gru=False):
+                 use_gru=False,
+                 blank_id=0):
        super().__init__()
        self.encoder = CRNNEncoder(
            feat_size=feat_size,
@ -284,10 +286,11 @@ class DeepSpeech2ModelOnline(nn.Layer):
        self.decoder = CTCDecoder(
            odim=dict_size,  # <blank> is in  vocab
            enc_n_units=self.encoder.output_size,
-            blank_id=0,  # first token is <blank>
+            blank_id=blank_id,
            dropout_rate=0.0,
            reduction=True,  # sum
-            batch_average=True)  # sum / batch_size
+            batch_average=True,  # sum / batch_size
+            grad_norm_type='instance')

    def forward(self, audio, audio_len, text, text_len):
        """Compute Model loss
@ -353,7 +356,8 @@ class DeepSpeech2ModelOnline(nn.Layer):
                    rnn_direction=config.model.rnn_direction,
                    num_fc_layers=config.model.num_fc_layers,
                    fc_layers_size_list=config.model.fc_layers_size_list,
-                    use_gru=config.model.use_gru)
+                    use_gru=config.model.use_gru,
+                    blank_id=config.model.blank_id)
        infos = Checkpoint().load_parameters(
            model, checkpoint_path=checkpoint_path)
        logger.info(f"checkpoint info: {infos}")
@ -380,7 +384,8 @@ class DeepSpeech2ModelOnline(nn.Layer):
                    rnn_direction=config.rnn_direction,
                    num_fc_layers=config.num_fc_layers,
                    fc_layers_size_list=config.fc_layers_size_list,
-                    use_gru=config.use_gru)
+                    use_gru=config.use_gru,
+                    blank_id=config.blank_id)
        return model


@ -394,7 +399,8 @@ class DeepSpeech2InferModelOnline(DeepSpeech2ModelOnline):
                 rnn_direction='forward',
                 num_fc_layers=2,
                 fc_layers_size_list=[512, 256],
-                 use_gru=False):
+                 use_gru=False,
+                 blank_id=0):
        super().__init__(
            feat_size=feat_size,
            dict_size=dict_size,
@ -404,7 +410,8 @@ class DeepSpeech2InferModelOnline(DeepSpeech2ModelOnline):
            rnn_direction=rnn_direction,
            num_fc_layers=num_fc_layers,
            fc_layers_size_list=fc_layers_size_list,
-            use_gru=use_gru)
+            use_gru=use_gru,
+            blank_id=blank_id)

    def forward(self, audio_chunk, audio_chunk_lens, chunk_state_h_box,
                chunk_state_c_box):
--- a/deepspeech/models/u2/init.py
+++ b/deepspeech/models/u2/init.py
@ -0,0 +1,19 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from .u2 import U2InferModel
+from .u2 import U2Model
+from .updater import U2Evaluator
+from .updater import U2Updater
+
+__all__ = ["U2Model", "U2InferModel", "U2Evaluator", "U2Updater"]
--- a/deepspeech/models/u2/u2.py
+++ b/deepspeech/models/u2/u2.py
@ -48,6 +48,7 @@ from deepspeech.utils.tensor_utils import add_sos_eos
 from deepspeech.utils.tensor_utils import pad_sequence
 from deepspeech.utils.tensor_utils import th_accuracy
 from deepspeech.utils.utility import log_add
+from deepspeech.utils.utility import UpdateConfig

 __all__ = ["U2Model", "U2InferModel"]

@ -115,7 +116,8 @@ class U2BaseModel(nn.Layer):
                 ctc_weight: float=0.5,
                 ignore_id: int=IGNORE_ID,
                 lsm_weight: float=0.0,
-                 length_normalized_loss: bool=False):
+                 length_normalized_loss: bool=False,
+                 **kwargs):
        assert 0.0 <= ctc_weight <= 1.0, ctc_weight

        super().__init__()
@ -162,10 +164,7 @@ class U2BaseModel(nn.Layer):
        encoder_out, encoder_mask = self.encoder(speech, speech_lengths)
        encoder_time = time.time() - start
        #logger.debug(f"encoder time: {encoder_time}")
-        #TODO(Hui Zhang): sum not support bool type
-        #encoder_out_lens = encoder_mask.squeeze(1).sum(1)  #[B, 1, T] -> [B]
-        encoder_out_lens = encoder_mask.squeeze(1).cast(paddle.int64).sum(
-            1)  #[B, 1, T] -> [B]
+        encoder_out_lens = encoder_mask.squeeze(1).sum(1)  #[B, 1, T] -> [B]

        # 2a. Attention-decoder branch
        loss_att = None
@ -299,8 +298,8 @@ class U2BaseModel(nn.Layer):
            speech, speech_lengths, decoding_chunk_size,
            num_decoding_left_chunks,
            simulate_streaming)  # (B, maxlen, encoder_dim)
-        maxlen = encoder_out.size(1)
-        encoder_dim = encoder_out.size(2)
+        maxlen = encoder_out.shape[1]
+        encoder_dim = encoder_out.shape[2]
        running_size = batch_size * beam_size
        encoder_out = encoder_out.unsqueeze(1).repeat(1, beam_size, 1, 1).view(
            running_size, maxlen, encoder_dim)  # (B*N, maxlen, encoder_dim)
@ -320,8 +319,7 @@ class U2BaseModel(nn.Layer):
        # 2. Decoder forward step by step
        for i in range(1, maxlen + 1):
            # Stop if all batch and all beam produce eos
-            # TODO(Hui Zhang): if end_flag.sum() == running_size:
-            if end_flag.cast(paddle.int64).sum() == running_size:
+            if end_flag.sum() == running_size:
                break

            # 2.1 Forward decoder step
@ -406,10 +404,8 @@ class U2BaseModel(nn.Layer):
        encoder_out, encoder_mask = self._forward_encoder(
            speech, speech_lengths, decoding_chunk_size,
            num_decoding_left_chunks, simulate_streaming)
-        maxlen = encoder_out.size(1)
-        # (TODO Hui Zhang): bool no support reduce_sum
-        # encoder_out_lens = encoder_mask.squeeze(1).sum(1)
-        encoder_out_lens = encoder_mask.squeeze(1).astype(paddle.int).sum(1)
+        maxlen = encoder_out.shape[1]
+        encoder_out_lens = encoder_mask.squeeze(1).sum(1)
        ctc_probs = self.ctc.log_softmax(encoder_out)  # (B, maxlen, vocab_size)

        topk_prob, topk_index = ctc_probs.topk(1, axis=2)  # (B, maxlen, 1)
@ -459,7 +455,7 @@ class U2BaseModel(nn.Layer):
            speech, speech_lengths, decoding_chunk_size,
            num_decoding_left_chunks,
            simulate_streaming)  # (B, maxlen, encoder_dim)
-        maxlen = encoder_out.size(1)
+        maxlen = encoder_out.shape[1]
        ctc_probs = self.ctc.log_softmax(encoder_out)  # (1, maxlen, vocab_size)
        ctc_probs = ctc_probs.squeeze(0)

@ -587,7 +583,7 @@ class U2BaseModel(nn.Layer):

        encoder_out = encoder_out.repeat(beam_size, 1, 1)
        encoder_mask = paddle.ones(
-            (beam_size, 1, encoder_out.size(1)), dtype=paddle.bool)
+            (beam_size, 1, encoder_out.shape[1]), dtype=paddle.bool)
        decoder_out, _ = self.decoder(
            encoder_out, encoder_mask, hyps_pad,
            hyps_lens)  # (beam_size, max_hyps_len, vocab_size)
@ -667,9 +663,7 @@ class U2BaseModel(nn.Layer):
            xs, offset, required_cache_size, subsampling_cache,
            elayers_output_cache, conformer_cnn_cache)

-    # @jit.to_static([
-    #         paddle.static.InputSpec(shape=[1, None, feat_dim],dtype='float32'),  # audio feat, [B,T,D]
-    #     ])
+    # @jit.to_static
    def ctc_activation(self, xs: paddle.Tensor) -> paddle.Tensor:
        """ Export interface for c++ call, apply linear transform and log
            softmax before ctc
@ -696,13 +690,13 @@ class U2BaseModel(nn.Layer):
        Returns:
            paddle.Tensor: decoder output, (B, L)
        """
-        assert encoder_out.size(0) == 1
-        num_hyps = hyps.size(0)
-        assert hyps_lens.size(0) == num_hyps
+        assert encoder_out.shape[0] == 1
+        num_hyps = hyps.shape[0]
+        assert hyps_lens.shape[0] == num_hyps
        encoder_out = encoder_out.repeat(num_hyps, 1, 1)
        # (B, 1, T)
        encoder_mask = paddle.ones(
-            [num_hyps, 1, encoder_out.size(1)], dtype=paddle.bool)
+            [num_hyps, 1, encoder_out.shape[1]], dtype=paddle.bool)
        # (num_hyps, max_hyps_len, vocab_size)
        decoder_out, _ = self.decoder(encoder_out, encoder_mask, hyps,
                                      hyps_lens)
@ -757,7 +751,7 @@ class U2BaseModel(nn.Layer):
        Returns:
            List[List[int]]: transcripts.
        """
-        batch_size = feats.size(0)
+        batch_size = feats.shape[0]
        if decoding_method in ['ctc_prefix_beam_search',
                               'attention_rescoring'] and batch_size > 1:
            logger.fatal(
@ -785,7 +779,7 @@ class U2BaseModel(nn.Layer):
        # result in List[int], change it to List[List[int]] for compatible
        # with other batch decoding mode
        elif decoding_method == 'ctc_prefix_beam_search':
-            assert feats.size(0) == 1
+            assert feats.shape[0] == 1
            hyp = self.ctc_prefix_beam_search(
                feats,
                feats_lengths,
@ -795,7 +789,7 @@ class U2BaseModel(nn.Layer):
                simulate_streaming=simulate_streaming)
            hyps = [hyp]
        elif decoding_method == 'attention_rescoring':
-            assert feats.size(0) == 1
+            assert feats.shape[0] == 1
            hyp = self.attention_rescoring(
                feats,
                feats_lengths,
@ -836,6 +830,7 @@ class U2Model(U2BaseModel):
        Returns:
            int, nn.Layer, nn.Layer, nn.Layer: vocab size, encoder, decoder, ctc
        """
+        # cmvn
        if configs['cmvn_file'] is not None:
            mean, istd = load_cmvn(configs['cmvn_file'],
                                   configs['cmvn_file_type'])
@ -845,11 +840,13 @@ class U2Model(U2BaseModel):
        else:
            global_cmvn = None

+        # input & output dim
        input_dim = configs['input_dim']
        vocab_size = configs['output_dim']
        assert input_dim != 0, input_dim
        assert vocab_size != 0, vocab_size

+        # encoder
        encoder_type = configs.get('encoder', 'transformer')
        logger.info(f"U2 Encoder type: {encoder_type}")
        if encoder_type == 'transformer':
@ -861,16 +858,21 @@ class U2Model(U2BaseModel):
        else:
            raise ValueError(f"not support encoder type:{encoder_type}")

+        # decoder
        decoder = TransformerDecoder(vocab_size,
                                     encoder.output_size(),
                                     **configs['decoder_conf'])
+
+        # ctc decoder and ctc loss
+        model_conf = configs['model_conf']
        ctc = CTCDecoder(
            odim=vocab_size,
            enc_n_units=encoder.output_size(),
            blank_id=0,
-            dropout_rate=0.0,
+            dropout_rate=model_conf['ctc_dropoutrate'],
            reduction=True,  # sum
-            batch_average=True)  # sum / batch_size
+            batch_average=True,  # sum / batch_size
+            grad_norm_type=model_conf['ctc_grad_norm_type'])

        return vocab_size, encoder, decoder, ctc

@ -902,10 +904,10 @@ class U2Model(U2BaseModel):
        Returns:
            DeepSpeech2Model: The model built from pretrained result.
        """
-        config.defrost()
-        config.input_dim = dataloader.collate_fn.feature_size
-        config.output_dim = dataloader.collate_fn.vocab_size
-        config.freeze()
+        with UpdateConfig(config):
+            config.input_dim = dataloader.collate_fn.feature_size
+            config.output_dim = dataloader.collate_fn.vocab_size
+
        model = cls.from_config(config)

        if checkpoint_path:
--- a/deepspeech/models/u2/updater.py
+++ b/deepspeech/models/u2/updater.py
@ -0,0 +1,149 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from contextlib import nullcontext
+
+import paddle
+from paddle import distributed as dist
+
+from deepspeech.training.extensions.evaluator import StandardEvaluator
+from deepspeech.training.reporter import report
+from deepspeech.training.timer import Timer
+from deepspeech.training.updaters.standard_updater import StandardUpdater
+from deepspeech.utils import layer_tools
+from deepspeech.utils.log import Log
+
+logger = Log(__name__).getlog()
+
+
+class U2Evaluator(StandardEvaluator):
+    def __init__(self, model, dataloader):
+        super().__init__(model, dataloader)
+        self.msg = ""
+        self.num_seen_utts = 0
+        self.total_loss = 0.0
+
+    def evaluate_core(self, batch):
+        self.msg = "Valid: Rank: {}, ".format(dist.get_rank())
+        losses_dict = {}
+
+        loss, attention_loss, ctc_loss = self.model(*batch[1:])
+        if paddle.isfinite(loss):
+            num_utts = batch[1].shape[0]
+            self.num_seen_utts += num_utts
+            self.total_loss += float(loss) * num_utts
+
+            losses_dict['loss'] = float(loss)
+            if attention_loss:
+                losses_dict['att_loss'] = float(attention_loss)
+            if ctc_loss:
+                losses_dict['ctc_loss'] = float(ctc_loss)
+
+            for k, v in losses_dict.items():
+                report("eval/" + k, v)
+
+        self.msg += ', '.join('{}: {:>.6f}'.format(k, v)
+                              for k, v in losses_dict.items())
+        logger.info(self.msg)
+        return self.total_loss, self.num_seen_utts
+
+
+class U2Updater(StandardUpdater):
+    def __init__(self,
+                 model,
+                 optimizer,
+                 scheduler,
+                 dataloader,
+                 init_state=None,
+                 accum_grad=1,
+                 **kwargs):
+        super().__init__(
+            model, optimizer, scheduler, dataloader, init_state=init_state)
+        self.accum_grad = accum_grad
+        self.forward_count = 0
+        self.msg = ""
+
+    def update_core(self, batch):
+        """One Step
+
+        Args:
+            batch (List[Object]): utts, xs, xlens, ys, ylens
+        """
+        losses_dict = {}
+        self.msg = "Rank: {}, ".format(dist.get_rank())
+
+        # forward
+        batch_size = batch[1].shape[0]
+        loss, attention_loss, ctc_loss = self.model(*batch[1:])
+        # loss div by `batch_size * accum_grad`
+        loss /= self.accum_grad
+
+        # loss backward
+        if (self.forward_count + 1) != self.accum_grad:
+            # Disable gradient synchronizations across DDP processes.
+            # Within this context, gradients will be accumulated on module
+            # variables, which will later be synchronized.
+            context = self.model.no_sync
+        else:
+            # Used for single gpu training and DDP gradient synchronization
+            # processes.
+            context = nullcontext
+
+        with context():
+            loss.backward()
+            layer_tools.print_grads(self.model, print_func=None)
+
+        # loss info
+        losses_dict['loss'] = float(loss) * self.accum_grad
+        if attention_loss:
+            losses_dict['att_loss'] = float(attention_loss)
+        if ctc_loss:
+            losses_dict['ctc_loss'] = float(ctc_loss)
+        # report loss
+        for k, v in losses_dict.items():
+            report("train/" + k, v)
+        # loss msg
+        self.msg += "batch size: {}, ".format(batch_size)
+        self.msg += "accum: {}, ".format(self.accum_grad)
+        self.msg += ', '.join('{}: {:>.6f}'.format(k, v)
+                              for k, v in losses_dict.items())
+
+        # Truncate the graph
+        loss.detach()
+
+        # update parameters
+        self.forward_count += 1
+        if self.forward_count != self.accum_grad:
+            return
+        self.forward_count = 0
+
+        self.optimizer.step()
+        self.optimizer.clear_grad()
+        self.scheduler.step()
+
+    def update(self):
+        # model is default in train mode
+
+        # training for a step is implemented here
+        with Timer("data time cost:{}"):
+            batch = self.read_batch()
+        with Timer("step time cost:{}"):
+            self.update_core(batch)
+
+        # #iterations with accum_grad > 1
+        # Ref.: https://github.com/espnet/espnet/issues/777
+        if self.forward_count == 0:
+            self.state.iteration += 1
+        if self.updates_per_epoch is not None:
+            if self.state.iteration % self.updates_per_epoch == 0:
+                self.state.epoch += 1
--- a/deepspeech/models/u2_st.py
+++ b/deepspeech/models/u2_st.py
@ -42,6 +42,7 @@ from deepspeech.utils import layer_tools
 from deepspeech.utils.log import Log
 from deepspeech.utils.tensor_utils import add_sos_eos
 from deepspeech.utils.tensor_utils import th_accuracy
+from deepspeech.utils.utility import UpdateConfig

 __all__ = ["U2STModel", "U2STInferModel"]

@ -163,10 +164,7 @@ class U2STBaseModel(nn.Layer):
        encoder_out, encoder_mask = self.encoder(speech, speech_lengths)
        encoder_time = time.time() - start
        #logger.debug(f"encoder time: {encoder_time}")
-        #TODO(Hui Zhang): sum not support bool type
-        #encoder_out_lens = encoder_mask.squeeze(1).sum(1)  #[B, 1, T] -> [B]
-        encoder_out_lens = encoder_mask.squeeze(1).cast(paddle.int64).sum(
-            1)  #[B, 1, T] -> [B]
+        encoder_out_lens = encoder_mask.squeeze(1).sum(1)  #[B, 1, T] -> [B]

        # 2a. ST-decoder branch
        start = time.time()
@ -342,8 +340,8 @@ class U2STBaseModel(nn.Layer):
            speech, speech_lengths, decoding_chunk_size,
            num_decoding_left_chunks,
            simulate_streaming)  # (B, maxlen, encoder_dim)
-        maxlen = encoder_out.size(1)
-        encoder_dim = encoder_out.size(2)
+        maxlen = encoder_out.shape[1]
+        encoder_dim = encoder_out.shape[2]
        running_size = batch_size * beam_size
        encoder_out = encoder_out.unsqueeze(1).repeat(1, beam_size, 1, 1).view(
            running_size, maxlen, encoder_dim)  # (B*N, maxlen, encoder_dim)
@ -363,8 +361,7 @@ class U2STBaseModel(nn.Layer):
        # 2. Decoder forward step by step
        for i in range(1, maxlen + 1):
            # Stop if all batch and all beam produce eos
-            # TODO(Hui Zhang): if end_flag.sum() == running_size:
-            if end_flag.cast(paddle.int64).sum() == running_size:
+            if end_flag.sum() == running_size:
                break

            # 2.1 Forward decoder step
@ -417,26 +414,26 @@ class U2STBaseModel(nn.Layer):
        best_hyps = best_hyps[:, 1:]
        return best_hyps

-    @jit.to_static
+    # @jit.to_static
    def subsampling_rate(self) -> int:
        """ Export interface for c++ call, return subsampling_rate of the
            model
        """
        return self.encoder.embed.subsampling_rate

-    @jit.to_static
+    # @jit.to_static
    def right_context(self) -> int:
        """ Export interface for c++ call, return right_context of the model
        """
        return self.encoder.embed.right_context

-    @jit.to_static
+    # @jit.to_static
    def sos_symbol(self) -> int:
        """ Export interface for c++ call, return sos symbol id of the model
        """
        return self.sos

-    @jit.to_static
+    # @jit.to_static
    def eos_symbol(self) -> int:
        """ Export interface for c++ call, return eos symbol id of the model
        """
@ -472,7 +469,7 @@ class U2STBaseModel(nn.Layer):
            xs, offset, required_cache_size, subsampling_cache,
            elayers_output_cache, conformer_cnn_cache)

-    @jit.to_static
+    # @jit.to_static
    def ctc_activation(self, xs: paddle.Tensor) -> paddle.Tensor:
        """ Export interface for c++ call, apply linear transform and log
            softmax before ctc
@ -499,13 +496,13 @@ class U2STBaseModel(nn.Layer):
        Returns:
            paddle.Tensor: decoder output, (B, L)
        """
-        assert encoder_out.size(0) == 1
-        num_hyps = hyps.size(0)
-        assert hyps_lens.size(0) == num_hyps
+        assert encoder_out.shape[0] == 1
+        num_hyps = hyps.shape[0]
+        assert hyps_lens.shape[0] == num_hyps
        encoder_out = encoder_out.repeat(num_hyps, 1, 1)
        # (B, 1, T)
        encoder_mask = paddle.ones(
-            [num_hyps, 1, encoder_out.size(1)], dtype=paddle.bool)
+            [num_hyps, 1, encoder_out.shape[1]], dtype=paddle.bool)
        # (num_hyps, max_hyps_len, vocab_size)
        decoder_out, _ = self.decoder(encoder_out, encoder_mask, hyps,
                                      hyps_lens)
@ -560,7 +557,7 @@ class U2STBaseModel(nn.Layer):
        Returns:
            List[List[int]]: transcripts.
        """
-        batch_size = feats.size(0)
+        batch_size = feats.shape[0]

        if decoding_method == 'fullsentence':
            hyps = self.translate(
@ -647,13 +644,16 @@ class U2STModel(U2STBaseModel):
            decoder = TransformerDecoder(vocab_size,
                                         encoder.output_size(),
                                         **configs['decoder_conf'])
+            # ctc decoder and ctc loss
+            model_conf = configs['model_conf']
            ctc = CTCDecoder(
                odim=vocab_size,
                enc_n_units=encoder.output_size(),
                blank_id=0,
-                dropout_rate=0.0,
+                dropout_rate=model_conf['ctc_dropout_rate'],
                reduction=True,  # sum
-                batch_average=True)  # sum / batch_size
+                batch_average=True,  # sum / batch_size
+                grad_norm_type=model_conf['ctc_grad_norm_type'])

            return vocab_size, encoder, (st_decoder, decoder, ctc)
        else:
@ -687,10 +687,10 @@ class U2STModel(U2STBaseModel):
        Returns:
            DeepSpeech2Model: The model built from pretrained result.
        """
-        config.defrost()
-        config.input_dim = dataloader.collate_fn.feature_size
-        config.output_dim = dataloader.collate_fn.vocab_size
-        config.freeze()
+        with UpdateConfig(config):
+            config.input_dim = dataloader.collate_fn.feature_size
+            config.output_dim = dataloader.collate_fn.vocab_size
+
        model = cls.from_config(config)

        if checkpoint_path:
--- a/deepspeech/modules/activation.py
+++ b/deepspeech/modules/activation.py
@ -15,12 +15,13 @@ from collections import OrderedDict

 import paddle
 from paddle import nn
+from paddle.nn import functional as F

 from deepspeech.utils.log import Log

 logger = Log(__name__).getlog()

-__all__ = ["get_activation", "brelu", "LinearGLUBlock", "ConvGLUBlock"]
+__all__ = ["get_activation", "brelu", "LinearGLUBlock", "ConvGLUBlock", "GLU"]


 def brelu(x, t_min=0.0, t_max=24.0, name=None):
@ -30,6 +31,17 @@ def brelu(x, t_min=0.0, t_max=24.0, name=None):
    return x.maximum(t_min).minimum(t_max)


+class GLU(nn.Layer):
+    """Gated Linear Units (GLU) Layer"""
+
+    def __init__(self, dim: int=-1):
+        super().__init__()
+        self.dim = dim
+
+    def forward(self, xs):
+        return F.glu(xs, axis=self.dim)
+
+
 class LinearGLUBlock(nn.Layer):
    """A linear Gated Linear Units (GLU) block."""

@ -133,13 +145,18 @@ def get_activation(act):
    """Return activation function."""
    # Lazy load to avoid unused import
    activation_funcs = {
+        "hardshrink": paddle.nn.Hardshrink,
+        "hardswish": paddle.nn.Hardswish,
        "hardtanh": paddle.nn.Hardtanh,
        "tanh": paddle.nn.Tanh,
        "relu": paddle.nn.ReLU,
+        "relu6": paddle.nn.ReLU6,
+        "leakyrelu": paddle.nn.LeakyReLU,
        "selu": paddle.nn.SELU,
        "swish": paddle.nn.Swish,
        "gelu": paddle.nn.GELU,
-        "brelu": brelu,
+        "glu": GLU,
+        "elu": paddle.nn.ELU,
    }

    return activation_funcs[act]()
--- a/deepspeech/modules/attention.py
+++ b/deepspeech/modules/attention.py
@ -70,7 +70,7 @@ class MultiHeadedAttention(nn.Layer):
            paddle.Tensor: Transformed value tensor, size
                (#batch, n_head, time2, d_k).
        """
-        n_batch = query.size(0)
+        n_batch = query.shape[0]
        q = self.linear_q(query).view(n_batch, -1, self.h, self.d_k)
        k = self.linear_k(key).view(n_batch, -1, self.h, self.d_k)
        v = self.linear_v(value).view(n_batch, -1, self.h, self.d_k)
@ -96,7 +96,7 @@ class MultiHeadedAttention(nn.Layer):
            paddle.Tensor: Transformed value weighted 
                by the attention score, (#batch, time1, d_model).
        """
-        n_batch = value.size(0)
+        n_batch = value.shape[0]
        if mask is not None:
            mask = mask.unsqueeze(1).eq(0)  # (batch, 1, *, time2)
            scores = scores.masked_fill(mask, -float('inf'))
@ -109,8 +109,8 @@ class MultiHeadedAttention(nn.Layer):

        p_attn = self.dropout(attn)
        x = paddle.matmul(p_attn, value)  # (batch, head, time1, d_k)
-        x = x.transpose([0, 2, 1, 3]).contiguous().view(
-            n_batch, -1, self.h * self.d_k)  # (batch, time1, d_model)
+        x = x.transpose([0, 2, 1, 3]).view(n_batch, -1, self.h *
+                                           self.d_k)  # (batch, time1, d_model)

        return self.linear_out(x)  # (batch, time1, d_model)

@ -172,15 +172,16 @@ class RelPositionMultiHeadedAttention(MultiHeadedAttention):
            paddle.Tensor: Output tensor. (batch, head, time1, time1)
        """
        zero_pad = paddle.zeros(
-            (x.size(0), x.size(1), x.size(2), 1), dtype=x.dtype)
+            (x.shape[0], x.shape[1], x.shape[2], 1), dtype=x.dtype)
        x_padded = paddle.cat([zero_pad, x], dim=-1)

-        x_padded = x_padded.view(x.size(0), x.size(1), x.size(3) + 1, x.size(2))
+        x_padded = x_padded.view(x.shape[0], x.shape[1], x.shape[3] + 1,
+                                 x.shape[2])
        x = x_padded[:, :, 1:].view_as(x)  # [B, H, T1, T1]

        if zero_triu:
-            ones = paddle.ones((x.size(2), x.size(3)))
-            x = x * paddle.tril(ones, x.size(3) - x.size(2))[None, None, :, :]
+            ones = paddle.ones((x.shape[2], x.shape[3]))
+            x = x * paddle.tril(ones, x.shape[3] - x.shape[2])[None, None, :, :]

        return x

@ -205,7 +206,7 @@ class RelPositionMultiHeadedAttention(MultiHeadedAttention):
        q, k, v = self.forward_qkv(query, key, value)
        q = q.transpose([0, 2, 1, 3])  # (batch, time1, head, d_k)

-        n_batch_pos = pos_emb.size(0)
+        n_batch_pos = pos_emb.shape[0]
        p = self.linear_pos(pos_emb).view(n_batch_pos, -1, self.h, self.d_k)
        p = p.transpose([0, 2, 1, 3])  # (batch, head, time1, d_k)

--- a/deepspeech/modules/conv.py
+++ b/deepspeech/modules/conv.py
@ -113,11 +113,9 @@ class ConvBn(nn.Layer):
        # reset padding part to 0
        masks = make_non_pad_mask(x_len)  #[B, T]
        masks = masks.unsqueeze(1).unsqueeze(1)  # [B, 1, 1, T]
-        # TODO(Hui Zhang): not support bool multiply
-        # masks = masks.type_as(x)
-        masks = masks.astype(x.dtype)
-        x = x.multiply(masks)
-
+        # https://github.com/PaddlePaddle/Paddle/pull/29265
+        # rhs will type promote to lhs
+        x = x * masks
        return x, x_len


--- a/deepspeech/modules/ctc.py
+++ b/deepspeech/modules/ctc.py
@ -16,15 +16,19 @@ from paddle import nn
 from paddle.nn import functional as F
 from typeguard import check_argument_types

-from deepspeech.decoders.swig_wrapper import ctc_beam_search_decoder_batch
-from deepspeech.decoders.swig_wrapper import ctc_greedy_decoder
-from deepspeech.decoders.swig_wrapper import Scorer
 from deepspeech.modules.loss import CTCLoss
 from deepspeech.utils import ctc_utils
 from deepspeech.utils.log import Log

 logger = Log(__name__).getlog()

+try:
+    from deepspeech.decoders.swig_wrapper import ctc_beam_search_decoder_batch  # noqa: F401
+    from deepspeech.decoders.swig_wrapper import ctc_greedy_decoder  # noqa: F401
+    from deepspeech.decoders.swig_wrapper import Scorer  # noqa: F401
+except Exception as e:
+    logger.info("ctcdecoder not installed!")
+
 __all__ = ['CTCDecoder']


@ -35,7 +39,8 @@ class CTCDecoder(nn.Layer):
                 blank_id=0,
                 dropout_rate: float=0.0,
                 reduction: bool=True,
-                 batch_average: bool=True):
+                 batch_average: bool=True,
+                 grad_norm_type: str="instance"):
        """CTC decoder

        Args:
@ -44,6 +49,7 @@ class CTCDecoder(nn.Layer):
            dropout_rate (float): dropout rate (0.0 ~ 1.0)
            reduction (bool): reduce the CTC loss into a scalar, True for 'sum' or 'none'
            batch_average (bool): do batch dim wise average.
+            grad_norm_type (str): one of 'instance', 'batchsize', 'frame', None.
        """
        assert check_argument_types()
        super().__init__()
@ -56,7 +62,8 @@ class CTCDecoder(nn.Layer):
        self.criterion = CTCLoss(
            blank=self.blank_id,
            reduction=reduction_type,
-            batch_average=batch_average)
+            batch_average=batch_average,
+            grad_norm_type=grad_norm_type)

        # CTCDecoder LM Score handle
        self._ext_scorer = None
@ -132,7 +139,7 @@ class CTCDecoder(nn.Layer):
        results = []
        for i, probs in enumerate(probs_split):
            output_transcription = ctc_greedy_decoder(
-                probs_seq=probs, vocabulary=vocab_list)
+                probs_seq=probs, vocabulary=vocab_list, blank_id=self.blank_id)
            results.append(output_transcription)
        return results

@ -212,13 +219,15 @@ class CTCDecoder(nn.Layer):
            num_processes=num_processes,
            ext_scoring_func=self._ext_scorer,
            cutoff_prob=cutoff_prob,
-            cutoff_top_n=cutoff_top_n)
+            cutoff_top_n=cutoff_top_n,
+            blank_id=self.blank_id)

        results = [result[0][1] for result in beam_search_results]
        return results

    def init_decode(self, beam_alpha, beam_beta, lang_model_path, vocab_list,
                    decoding_method):
+
        if decoding_method == "ctc_beam_search":
            self._init_ext_scorer(beam_alpha, beam_beta, lang_model_path,
                                  vocab_list)
@ -229,7 +238,7 @@ class CTCDecoder(nn.Layer):
        """ctc decoding with probs.

        Args:
-            probs (Tenosr): activation after softmax 
+            probs (Tenosr): activation after softmax
            logits_lens (Tenosr): audio output lens
            vocab_list ([type]): [description]
            decoding_method ([type]): [description]
--- a/deepspeech/modules/decoder.py
+++ b/deepspeech/modules/decoder.py
@ -122,11 +122,9 @@ class TransformerDecoder(nn.Layer):
        # tgt_mask: (B, 1, L)
        tgt_mask = (make_non_pad_mask(ys_in_lens).unsqueeze(1))
        # m: (1, L, L)
-        m = subsequent_mask(tgt_mask.size(-1)).unsqueeze(0)
+        m = subsequent_mask(tgt_mask.shape[-1]).unsqueeze(0)
        # tgt_mask: (B, L, L)
-        # TODO(Hui Zhang): not support & for tensor
-        # tgt_mask = tgt_mask & m
-        tgt_mask = tgt_mask.logical_and(m)
+        tgt_mask = tgt_mask & m

        x, _ = self.embed(tgt)
        for layer in self.decoders:
@ -137,9 +135,7 @@ class TransformerDecoder(nn.Layer):
        if self.use_output_layer:
            x = self.output_layer(x)

-        # TODO(Hui Zhang): reduce_sum not support bool type
-        # olens = tgt_mask.sum(1)
-        olens = tgt_mask.astype(paddle.int).sum(1)
+        olens = tgt_mask.sum(1)
        return x, olens

    def forward_one_step(
--- a/deepspeech/modules/embedding.py
+++ b/deepspeech/modules/embedding.py
@ -68,7 +68,7 @@ class PositionalEncoding(nn.Layer):
            paddle.Tensor: for compatibility to RelPositionalEncoding, (batch=1, time, ...)
        """
        T = x.shape[1]
-        assert offset + x.size(1) < self.max_len
+        assert offset + x.shape[1] < self.max_len
        #TODO(Hui Zhang): using T = x.size(1), __getitem__ not support Tensor
        pos_emb = self.pe[:, offset:offset + T]
        x = x * self.xscale + pos_emb
@ -114,7 +114,7 @@ class RelPositionalEncoding(PositionalEncoding):
            paddle.Tensor: Encoded tensor (batch, time, `*`).
            paddle.Tensor: Positional embedding tensor (1, time, `*`).
        """
-        assert offset + x.size(1) < self.max_len
+        assert offset + x.shape[1] < self.max_len
        x = x * self.xscale
        #TODO(Hui Zhang): using x.size(1), __getitem__ not support Tensor
        pos_emb = self.pe[:, offset:offset + x.shape[1]]
--- a/deepspeech/modules/encoder.py
+++ b/deepspeech/modules/encoder.py
@ -159,11 +159,10 @@ class BaseEncoder(nn.Layer):
        if self.global_cmvn is not None:
            xs = self.global_cmvn(xs)
        #TODO(Hui Zhang): self.embed(xs, masks, offset=0), stride_slice not support bool tensor
-        xs, pos_emb, masks = self.embed(xs, masks.type_as(xs), offset=0)
+        xs, pos_emb, masks = self.embed(xs, masks.astype(xs.dtype), offset=0)
        #TODO(Hui Zhang): remove mask.astype, stride_slice not support bool tensor
        masks = masks.astype(paddle.bool)
-        #TODO(Hui Zhang): mask_pad = ~masks
-        mask_pad = masks.logical_not()
+        mask_pad = ~masks
        chunk_masks = add_optional_chunk_mask(
            xs, masks, self.use_dynamic_chunk, self.use_dynamic_left_chunk,
            decoding_chunk_size, self.static_chunk_size,
@ -207,11 +206,11 @@ class BaseEncoder(nn.Layer):
                chunk computation
            List[paddle.Tensor]: conformer cnn cache
        """
-        assert xs.size(0) == 1  # batch size must be one
+        assert xs.shape[0] == 1  # batch size must be one
        # tmp_masks is just for interface compatibility
        # TODO(Hui Zhang): stride_slice not support bool tensor
        # tmp_masks = paddle.ones([1, xs.size(1)], dtype=paddle.bool)
-        tmp_masks = paddle.ones([1, xs.size(1)], dtype=paddle.int32)
+        tmp_masks = paddle.ones([1, xs.shape[1]], dtype=paddle.int32)
        tmp_masks = tmp_masks.unsqueeze(1)  #[B=1, C=1, T]

        if self.global_cmvn is not None:
@ -221,25 +220,25 @@ class BaseEncoder(nn.Layer):
            xs, tmp_masks, offset=offset)  #xs=(B, T, D), pos_emb=(B=1, T, D)

        if subsampling_cache is not None:
-            cache_size = subsampling_cache.size(1)  #T
+            cache_size = subsampling_cache.shape[1]  #T
            xs = paddle.cat((subsampling_cache, xs), dim=1)
        else:
            cache_size = 0

        # only used when using `RelPositionMultiHeadedAttention`
        pos_emb = self.embed.position_encoding(
-            offset=offset - cache_size, size=xs.size(1))
+            offset=offset - cache_size, size=xs.shape[1])

        if required_cache_size < 0:
            next_cache_start = 0
        elif required_cache_size == 0:
-            next_cache_start = xs.size(1)
+            next_cache_start = xs.shape[1]
        else:
-            next_cache_start = xs.size(1) - required_cache_size
+            next_cache_start = xs.shape[1] - required_cache_size
        r_subsampling_cache = xs[:, next_cache_start:, :]

        # Real mask for transformer/conformer layers
-        masks = paddle.ones([1, xs.size(1)], dtype=paddle.bool)
+        masks = paddle.ones([1, xs.shape[1]], dtype=paddle.bool)
        masks = masks.unsqueeze(1)  #[B=1, L'=1, T]
        r_elayers_output_cache = []
        r_conformer_cnn_cache = []
@ -303,7 +302,7 @@ class BaseEncoder(nn.Layer):
        stride = subsampling * decoding_chunk_size
        decoding_window = (decoding_chunk_size - 1) * subsampling + context

-        num_frames = xs.size(1)
+        num_frames = xs.shape[1]
        required_cache_size = decoding_chunk_size * num_decoding_left_chunks
        subsampling_cache: Optional[paddle.Tensor] = None
        elayers_output_cache: Optional[List[paddle.Tensor]] = None
@ -319,10 +318,10 @@ class BaseEncoder(nn.Layer):
                 chunk_xs, offset, required_cache_size, subsampling_cache,
                 elayers_output_cache, conformer_cnn_cache)
            outputs.append(y)
-            offset += y.size(1)
+            offset += y.shape[1]
        ys = paddle.cat(outputs, 1)
        # fake mask, just for jit script and compatibility with `forward` api
-        masks = paddle.ones([1, ys.size(1)], dtype=paddle.bool)
+        masks = paddle.ones([1, ys.shape[1]], dtype=paddle.bool)
        masks = masks.unsqueeze(1)
        return ys, masks

--- a/deepspeech/modules/loss.py
+++ b/deepspeech/modules/loss.py
@ -23,11 +23,32 @@ __all__ = ['CTCLoss', "LabelSmoothingLoss"]


 class CTCLoss(nn.Layer):
-    def __init__(self, blank=0, reduction='sum', batch_average=False):
+    def __init__(self,
+                 blank=0,
+                 reduction='sum',
+                 batch_average=False,
+                 grad_norm_type=None):
        super().__init__()
        # last token id as blank id
        self.loss = nn.CTCLoss(blank=blank, reduction=reduction)
        self.batch_average = batch_average
+        logger.info(
+            f"CTCLoss Loss reduction: {reduction}, div-bs: {batch_average}")
+
+        # instance for norm_by_times
+        # batch for norm_by_batchsize
+        # frame for norm_by_total_logits_len
+        assert grad_norm_type in ('instance', 'batch', 'frame', None)
+        self.norm_by_times = False
+        self.norm_by_batchsize = False
+        self.norm_by_total_logits_len = False
+        logger.info(f"CTCLoss Grad Norm Type: {grad_norm_type}")
+        if grad_norm_type == 'instance':
+            self.norm_by_times = True
+        if grad_norm_type == 'batch':
+            self.norm_by_batchsize = True
+        if grad_norm_type == 'frame':
+            self.norm_by_total_logits_len = True

    def forward(self, logits, ys_pad, hlens, ys_lens):
        """Compute CTC loss.
@ -46,10 +67,15 @@ class CTCLoss(nn.Layer):
        # warp-ctc need activation with shape [T, B, V + 1]
        # logits: (B, L, D) -> (L, B, D)
        logits = logits.transpose([1, 0, 2])
-        # (TODO:Hui Zhang) ctc loss does not support int64 labels
        ys_pad = ys_pad.astype(paddle.int32)
        loss = self.loss(
-            logits, ys_pad, hlens, ys_lens, norm_by_times=self.batch_average)
+            logits,
+            ys_pad,
+            hlens,
+            ys_lens,
+            norm_by_times=self.norm_by_times,
+            norm_by_batchsize=self.norm_by_batchsize,
+            norm_by_total_logits_len=self.norm_by_total_logits_len)
        if self.batch_average:
            # Batch-size average
            loss = loss / B
@ -124,9 +150,9 @@ class LabelSmoothingLoss(nn.Layer):
        # use zeros_like instead of torch.no_grad() for true_dist,
        # since no_grad() can not be exported by JIT
        true_dist = paddle.full_like(x, self.smoothing / (self.size - 1))
-        ignore = target == self.padding_idx  # (B,)
+        ignore = (target == self.padding_idx)  # (B,)

-        # target = target * (1 - ignore)  # avoid -1 index
+        #TODO(Hui Zhang): target = target * (1 - ignore)  # avoid -1 index
        target = target.masked_fill(ignore, 0)  # avoid -1 index
        # true_dist.scatter_(1, target.unsqueeze(1), self.confidence)
        target_mask = F.one_hot(target, self.size)
@ -135,10 +161,8 @@ class LabelSmoothingLoss(nn.Layer):

        kl = self.criterion(F.log_softmax(x, axis=1), true_dist)

-        #TODO(Hui Zhang): sum not support bool type
-        #total = len(target) - int(ignore.sum())
-        total = len(target) - int(ignore.type_as(target).sum())
+        total = len(target) - int(ignore.sum())
        denom = total if self.normalize_length else B
-        #numer = (kl * (1 - ignore)).sum()
+        #TODO(Hui Zhang): numer = (kl * (1 - ignore)).sum()
        numer = kl.masked_fill(ignore.unsqueeze(1), 0).sum()
        return numer / denom
--- a/deepspeech/modules/mask.py
+++ b/deepspeech/modules/mask.py
@ -69,8 +69,7 @@ def make_non_pad_mask(lengths: paddle.Tensor) -> paddle.Tensor:
                 [1, 1, 1, 0, 0],
                 [1, 1, 0, 0, 0]]
    """
-    #TODO(Hui Zhang): return ~make_pad_mask(lengths), not support ~
-    return make_pad_mask(lengths).logical_not()
+    return ~make_pad_mask(lengths)


 def subsequent_mask(size: int) -> paddle.Tensor:
@ -92,12 +91,7 @@ def subsequent_mask(size: int) -> paddle.Tensor:
         [1, 1, 1]]
    """
    ret = paddle.ones([size, size], dtype=paddle.bool)
-    #TODO(Hui Zhang): tril not support bool
-    #return paddle.tril(ret)
-    ret = ret.astype(paddle.float)
-    ret = paddle.tril(ret)
-    ret = ret.astype(paddle.bool)
-    return ret
+    return paddle.tril(ret)


 def subsequent_chunk_mask(
@ -186,15 +180,13 @@ def add_optional_chunk_mask(xs: paddle.Tensor,
        chunk_masks = subsequent_chunk_mask(xs.shape[1], chunk_size,
                                            num_left_chunks)  # (L, L)
        chunk_masks = chunk_masks.unsqueeze(0)  # (1, L, L)
-        # chunk_masks = masks & chunk_masks  # (B, L, L)
-        chunk_masks = masks.logical_and(chunk_masks)  # (B, L, L)
+        chunk_masks = masks & chunk_masks  # (B, L, L)
    elif static_chunk_size > 0:
        num_left_chunks = num_decoding_left_chunks
        chunk_masks = subsequent_chunk_mask(xs.shape[1], static_chunk_size,
                                            num_left_chunks)  # (L, L)
        chunk_masks = chunk_masks.unsqueeze(0)  # (1, L, L)
-        # chunk_masks = masks & chunk_masks  # (B, L, L)
-        chunk_masks = masks.logical_and(chunk_masks)  # (B, L, L)
+        chunk_masks = masks & chunk_masks  # (B, L, L)
    else:
        chunk_masks = masks
    return chunk_masks
--- a/deepspeech/modules/rnn.py
+++ b/deepspeech/modules/rnn.py
@ -308,7 +308,7 @@ class RNNStack(nn.Layer):
            x, x_len = rnn(x, x_len)
            masks = make_non_pad_mask(x_len)  #[B, T]
            masks = masks.unsqueeze(-1)  # [B, T, 1]
-            # TODO(Hui Zhang): not support bool multiply
-            masks = masks.astype(x.dtype)
-            x = x.multiply(masks)
+            # https://github.com/PaddlePaddle/Paddle/pull/29265
+            # rhs will type promote to lhs
+            x = x * masks
        return x, x_len
--- a/deepspeech/training/cli.py
+++ b/deepspeech/training/cli.py
@ -14,6 +14,20 @@
 import argparse


+class ExtendAction(argparse.Action):
+    """
+    [Since Python 3.8, the "extend" is available directly in stdlib]
+    (https://docs.python.org/3.8/library/argparse.html#action).
+    If you only have to support 3.8+ then defining it yourself is no longer required. 
+    Usage of stdlib "extend" action is exactly the same way as this answer originally described:
+    """
+
+    def __call__(self, parser, namespace, values, option_string=None):
+        items = getattr(namespace, self.dest) or []
+        items.extend(values)
+        setattr(namespace, self.dest, items)
+
+
 def default_argument_parser():
    r"""A simple yet genral argument parser for experiments with parakeet.

@ -30,7 +44,7 @@ def default_argument_parser():

    The ``--checkpoint_path`` specifies the checkpoint to load from.

-    The ``--device`` and ``--nprocs`` specifies how to run the training.
+    The ``--nprocs`` specifies how to run the training.


    See Also
@ -42,29 +56,53 @@ def default_argument_parser():
        the parser
    """
    parser = argparse.ArgumentParser()
+    parser.register('action', 'extend', ExtendAction)

-    # yapf: disable
-    # data and output
-    parser.add_argument("--config", metavar="FILE", help="path of the config file to overwrite to default config with.")
-    parser.add_argument("--dump-config", metavar="FILE", help="dump config to yaml file.")
-    parser.add_argument("--output", metavar="OUTPUT_DIR", help="path to save checkpoint and logs.")
-
-    # load from saved checkpoint
-    parser.add_argument("--checkpoint_path", type=str, help="path of the checkpoint to load")
-
-    # running
-    parser.add_argument("--device", type=str, default='gpu', choices=["cpu", "gpu"],
-                        help="device type to use, cpu and gpu are supported.")
-    parser.add_argument("--nprocs", type=int, default=1, help="number of parallel processes to use.")
-
-    # overwrite extra config and default config
-    # parser.add_argument("--opts", nargs=argparse.REMAINDER,
-    # help="options to overwrite --config file and the default config, passing in KEY VALUE pairs")
-    parser.add_argument("--opts", type=str, default=[], nargs='+',
-                        help="options to overwrite --config file and the default config, passing in KEY VALUE pairs")
+    train_group = parser.add_argument_group(
+        title='Train Options', description=None)
+    train_group.add_argument(
+        "--seed",
+        type=int,
+        default=None,
+        help="seed to use for paddle, np and random. None or 0 for random, else set seed."
+    )
+    train_group.add_argument(
+        "--nprocs",
+        type=int,
+        default=1,
+        help="number of parallel processes. 0 for cpu.")
+    train_group.add_argument(
+        "--config", metavar="CONFIG_FILE", help="config file.")
+    train_group.add_argument(
+        "--output", metavar="CKPT_DIR", help="path to save checkpoint.")
+    train_group.add_argument(
+        "--checkpoint_path", type=str, help="path to load checkpoint")
+    train_group.add_argument(
+        "--opts",
+        action='extend',
+        nargs=2,
+        metavar=('key', 'val'),
+        help="overwrite --config field, passing (KEY VALUE) pairs")
+    train_group.add_argument(
+        "--dump-config", metavar="FILE", help="dump config to `this` file.")

-    parser.add_argument("--seed", type=int, default=None,
-                        help="seed to use for paddle, np and random. None or 0 for random, else set seed.")
-    # yapd: enable
+    profile_group = parser.add_argument_group(
+        title='Benchmark Options', description=None)
+    profile_group.add_argument(
+        '--profiler-options',
+        type=str,
+        default=None,
+        help='The option of profiler, which should be in format \"key1=value1;key2=value2;key3=value3\".'
+    )
+    profile_group.add_argument(
+        '--benchmark-batch-size',
+        type=int,
+        default=None,
+        help='batch size for benchmark.')
+    profile_group.add_argument(
+        '--benchmark-max-step',
+        type=int,
+        default=None,
+        help='max iteration for benchmark.')

    return parser
--- a/deepspeech/training/extensions/evaluator.py
+++ b/deepspeech/training/extensions/evaluator.py
@ -13,14 +13,18 @@
 # limitations under the License.
 from typing import Dict

-import extension
 import paddle
+from paddle import distributed as dist
 from paddle.io import DataLoader
 from paddle.nn import Layer

+from . import extension
 from ..reporter import DictSummary
+from ..reporter import ObsScope
 from ..reporter import report
-from ..reporter import scope
+from ..timer import Timer
+from deepspeech.utils.log import Log
+logger = Log(__name__).getlog()


 class StandardEvaluator(extension.Extension):
@ -43,6 +47,27 @@ class StandardEvaluator(extension.Extension):
    def evaluate_core(self, batch):
        # compute
        self.model(batch)  # you may report here
+        return
+
+    def evaluate_sync(self, data):
+        # dist sync `evaluate_core` outputs
+        if data is None:
+            return
+
+        numerator, denominator = data
+        if dist.get_world_size() > 1:
+            numerator = paddle.to_tensor(numerator)
+            denominator = paddle.to_tensor(denominator)
+            # the default operator in all_reduce function is sum.
+            dist.all_reduce(numerator)
+            dist.all_reduce(denominator)
+            value = numerator / denominator
+            value = float(value)
+        else:
+            value = numerator / denominator
+        # used for `snapshort` to do kbest save.
+        report("VALID/LOSS", value)
+        logger.info(f"Valid: all-reduce loss {value}")

    def evaluate(self):
        # switch to eval mode
@ -53,12 +78,16 @@ class StandardEvaluator(extension.Extension):
        summary = DictSummary()
        for batch in self.dataloader:
            observation = {}
-            with scope(observation):
+            with ObsScope(observation):
                # main evaluation computation here.
                with paddle.no_grad():
-                    self.evaluate_core(batch)
+                    self.evaluate_sync(self.evaluate_core(batch))
            summary.add(observation)
        summary = summary.compute_mean()
+
+        # switch to train mode
+        for model in self.models.values():
+            model.train()
        return summary

    def __call__(self, trainer=None):
@ -66,6 +95,7 @@ class StandardEvaluator(extension.Extension):
        # if it is used to extend a trainer, the metrics is reported to
        # to observation of the trainer
        # or otherwise, you can use your own observation
-        summary = self.evaluate()
+        with Timer("Eval Time Cost: {}"):
+            summary = self.evaluate()
        for k, v in summary.items():
            report(k, v)
--- a/deepspeech/training/extensions/snapshot.py
+++ b/deepspeech/training/extensions/snapshot.py
@ -20,8 +20,9 @@ from typing import List

 import jsonlines

-from deepspeech.training.extensions import extension
-from deepspeech.training.updaters.trainer import Trainer
+from . import extension
+from ..reporter import get_observations
+from ..updaters.trainer import Trainer
 from deepspeech.utils.log import Log
 from deepspeech.utils.mp_tools import rank_zero_only

@ -52,8 +53,19 @@ class Snapshot(extension.Extension):
    priority = -100
    default_name = "snapshot"

-    def __init__(self, max_size: int=5, snapshot_on_error: bool=False):
+    def __init__(self,
+                 mode='latest',
+                 max_size: int=5,
+                 indicator=None,
+                 less_better=True,
+                 snapshot_on_error: bool=False):
        self.records: List[Dict[str, Any]] = []
+        assert mode in ('latest', 'kbest'), mode
+        if mode == 'kbest':
+            assert indicator is not None
+        self.mode = mode
+        self.indicator = indicator
+        self.less_is_better = less_better
        self.max_size = max_size
        self._snapshot_on_error = snapshot_on_error
        self._save_all = (max_size == -1)
@ -66,16 +78,17 @@ class Snapshot(extension.Extension):
        # load existing records
        record_path: Path = self.checkpoint_dir / "records.jsonl"
        if record_path.exists():
-            logger.debug("Loading from an existing checkpoint dir")
            self.records = load_records(record_path)
-            trainer.updater.load(self.records[-1]['path'])
+            ckpt_path = self.records[-1]['path']
+            logger.info(f"Loading from an existing checkpoint {ckpt_path}")
+            trainer.updater.load(ckpt_path)

    def on_error(self, trainer, exc, tb):
        if self._snapshot_on_error:
-            self.save_checkpoint_and_update(trainer)
+            self.save_checkpoint_and_update(trainer, 'latest')

    def __call__(self, trainer: Trainer):
-        self.save_checkpoint_and_update(trainer)
+        self.save_checkpoint_and_update(trainer, self.mode)

    def full(self):
        """Whether the number of snapshots it keeps track of is greater
@ -83,12 +96,12 @@ class Snapshot(extension.Extension):
        return (not self._save_all) and len(self.records) > self.max_size

    @rank_zero_only
-    def save_checkpoint_and_update(self, trainer: Trainer):
+    def save_checkpoint_and_update(self, trainer: Trainer, mode: str):
        """Saving new snapshot and remove the oldest snapshot if needed."""
        iteration = trainer.updater.state.iteration
        epoch = trainer.updater.state.epoch
        num = epoch if self.trigger[1] == 'epoch' else iteration
-        path = self.checkpoint_dir / f"{num}.pdz"
+        path = self.checkpoint_dir / f"{num}.np"

        # add the new one
        trainer.updater.save(path)
@ -97,11 +110,17 @@ class Snapshot(extension.Extension):
            'path': str(path.resolve()),  # use absolute path
            'iteration': iteration,
            'epoch': epoch,
+            'indicator': get_observations()[self.indicator]
        }
        self.records.append(record)

        # remove the earist
        if self.full():
+            if mode == 'kbest':
+                self.records = sorted(
+                    self.records,
+                    key=lambda record: record['indicator'],
+                    reverse=not self.less_is_better)
            eariest_record = self.records[0]
            os.remove(eariest_record["path"])
            self.records.pop(0)
--- a/deepspeech/training/extensions/visualizer.py
+++ b/deepspeech/training/extensions/visualizer.py
@ -11,8 +11,10 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from deepspeech.training.extensions import extension
-from deepspeech.training.updaters.trainer import Trainer
+from visualdl import LogWriter
+
+from . import extension
+from ..updaters.trainer import Trainer


 class VisualDL(extension.Extension):
@ -26,8 +28,8 @@ class VisualDL(extension.Extension):
    default_name = 'visualdl'
    priority = extension.PRIORITY_READER

-    def __init__(self, writer):
-        self.writer = writer
+    def __init__(self, output_dir):
+        self.writer = LogWriter(str(output_dir))

    def __call__(self, trainer: Trainer):
        for k, v in trainer.observation.items():
--- a/deepspeech/training/gradclip.py
+++ b/deepspeech/training/gradclip.py
@ -47,7 +47,7 @@ class ClipGradByGlobalNormWithLog(paddle.nn.ClipGradByGlobalNorm):
            sum_square = layers.reduce_sum(square)
            sum_square_list.append(sum_square)

-            # debug log
+            # debug log, not dump all since slow down train process
            if i < 10:
                logger.debug(
                    f"Grad Before Clip: {p.name}: {float(sum_square.sqrt()) }")
@ -76,7 +76,7 @@ class ClipGradByGlobalNormWithLog(paddle.nn.ClipGradByGlobalNorm):
            new_grad = layers.elementwise_mul(x=g, y=clip_var)
            params_and_grads.append((p, new_grad))

-            # debug log
+            # debug log, not dump all since slow down train process
            if i < 10:
                logger.debug(
                    f"Grad After Clip: {p.name}: {float(new_grad.square().sum().sqrt())}"
--- a/deepspeech/training/reporter.py
+++ b/deepspeech/training/reporter.py
@ -19,7 +19,7 @@ OBSERVATIONS = None


@contextlib.contextmanager
-def scope(observations):
+def ObsScope(observations):
    # make `observation` the target to report to.
    # it is basically a dictionary that stores temporary observations
    global OBSERVATIONS
--- a/deepspeech/training/timer.py
+++ b/deepspeech/training/timer.py
@ -0,0 +1,50 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import datetime
+import time
+
+from deepspeech.utils.log import Log
+
+__all__ = ["Timer"]
+
+logger = Log(__name__).getlog()
+
+
+class Timer():
+    """To be used like this: 
+        with Timer("Message") as value:
+            do some thing
+    """
+
+    def __init__(self, message=None):
+        self.message = message
+
+    def duration(self) -> str:
+        elapsed_time = time.time() - self.start
+        time_str = str(datetime.timedelta(seconds=elapsed_time))
+        return time_str
+
+    def __enter__(self):
+        self.start = time.time()
+        return self
+
+    def __exit__(self, type, value, traceback):
+        if self.message:
+            logger.info(self.message.format(self.duration()))
+
+    def __call__(self) -> float:
+        return time.time() - self.start
+
+    def __str__(self):
+        return self.duration()
--- a/deepspeech/training/trainer.py
+++ b/deepspeech/training/trainer.py
@ -11,17 +11,24 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import sys
 import time
+from collections import OrderedDict
 from pathlib import Path

 import paddle
 from paddle import distributed as dist
 from tensorboardX import SummaryWriter

+from deepspeech.training.reporter import ObsScope
+from deepspeech.training.reporter import report
+from deepspeech.training.timer import Timer
 from deepspeech.utils import mp_tools
+from deepspeech.utils import profiler
 from deepspeech.utils.checkpoint import Checkpoint
 from deepspeech.utils.log import Log
 from deepspeech.utils.utility import seed_all
+from deepspeech.utils.utility import UpdateConfig

 __all__ = ["Trainer"]

@ -79,7 +86,7 @@ class Trainer():
    >>>     config.merge_from_list(args.opts)
    >>> config.freeze()
    >>>
-    >>> if args.nprocs > 1 and args.device == "gpu":
+    >>> if args.nprocs > 0:
    >>>     dist.spawn(main_sp, args=(config, args), nprocs=args.nprocs)
    >>> else:
    >>>     main_sp(config, args)
@ -94,15 +101,25 @@ class Trainer():
        self.checkpoint_dir = None
        self.iteration = 0
        self.epoch = 0
+        self.rank = dist.get_rank()
+
+        logger.info(f"Rank: {self.rank}/{dist.get_world_size()}")

        if args.seed:
            seed_all(args.seed)
            logger.info(f"Set seed {args.seed}")

+        if self.args.benchmark_batch_size:
+            with UpdateConfig(self.config):
+                self.config.collator.batch_size = self.args.benchmark_batch_size
+                self.config.training.log_interval = 1
+            logger.info(
+                f"Benchmark reset batch-size: {self.args.benchmark_batch_size}")
+
    def setup(self):
        """Setup the experiment.
        """
-        paddle.set_device(self.args.device)
+        paddle.set_device('gpu' if self.args.nprocs > 0 else 'cpu')
        if self.parallel:
            self.init_parallel()

@ -122,7 +139,7 @@ class Trainer():
        """A flag indicating whether the experiment should run with
        multiprocessing.
        """
-        return self.args.device == "gpu" and self.args.nprocs > 1
+        return self.args.nprocs > 0

    def init_parallel(self):
        """Init environment for multiprocess training.
@ -162,67 +179,108 @@ class Trainer():
            checkpoint_dir=self.checkpoint_dir,
            checkpoint_path=self.args.checkpoint_path)
        if infos:
-            # restore from ckpt
+            # just restore ckpt
+            # lr will resotre from optimizer ckpt
            self.iteration = infos["step"]
            self.epoch = infos["epoch"]
            scratch = False
+            logger.info(
+                f"Restore ckpt: epoch {self.epoch }, step {self.iteration}!")
        else:
            self.iteration = 0
            self.epoch = 0
            scratch = True
-
+            logger.info("Init from scratch!")
        return scratch

-    def new_epoch(self):
-        """Reset the train loader seed and increment `epoch`.
-        """
-        self.epoch += 1
-        if self.parallel and hasattr(self.train_loader, "batch_sampler"):
+    def maybe_batch_sampler_step(self):
+        """ batch_sampler seed by epoch """
+        if hasattr(self.train_loader, "batch_sampler"):
            batch_sampler = self.train_loader.batch_sampler
            if isinstance(batch_sampler, paddle.io.DistributedBatchSampler):
                batch_sampler.set_epoch(self.epoch)

-    def train(self):
-        """The training process control by epoch."""
+    def before_train(self):
        from_scratch = self.resume_or_scratch()
        if from_scratch:
-            # save init model, i.e. 0 epoch
+            # scratch: save init model, i.e. 0 epoch
            self.save(tag='init', infos=None)
-        self.lr_scheduler.step(self.epoch)
-        if self.parallel and hasattr(self.train_loader, "batch_sampler"):
-            self.train_loader.batch_sampler.set_epoch(self.epoch)
+        else:
+            # resume: train next_epoch and next_iteration
+            self.epoch += 1
+            self.iteration += 1
+            logger.info(
+                f"Resume train: epoch {self.epoch }, step {self.iteration}!")
+
+        self.maybe_batch_sampler_step()
+
+    def new_epoch(self):
+        """Reset the train loader seed and increment `epoch`.
+        """
+        # `iteration` increased by train step
+        self.epoch += 1
+        self.maybe_batch_sampler_step()
+
+    def after_train_batch(self):
+        if self.args.benchmark_max_step and self.iteration > self.args.benchmark_max_step:
+            profiler.add_profiler_step(self.args.profiler_options)
+            logger.info(
+                f"Reach benchmark-max-step: {self.args.benchmark_max_step}")
+            sys.exit(
+                f"Reach benchmark-max-step: {self.args.benchmark_max_step}")
+
+    def train(self):
+        """The training process control by epoch."""
+        self.before_train()

        logger.info(f"Train Total Examples: {len(self.train_loader.dataset)}")
        while self.epoch < self.config.training.n_epoch:
-            self.model.train()
-            try:
-                data_start_time = time.time()
-                for batch_index, batch in enumerate(self.train_loader):
-                    dataload_time = time.time() - data_start_time
-                    msg = "Train: Rank: {}, ".format(dist.get_rank())
-                    msg += "epoch: {}, ".format(self.epoch)
-                    msg += "step: {}, ".format(self.iteration)
-                    msg += "batch : {}/{}, ".format(batch_index + 1,
-                                                    len(self.train_loader))
-                    msg += "lr: {:>.8f}, ".format(self.lr_scheduler())
-                    msg += "data time: {:>.3f}s, ".format(dataload_time)
-                    self.train_batch(batch_index, batch, msg)
+            with Timer("Epoch-Train Time Cost: {}"):
+                self.model.train()
+                try:
                    data_start_time = time.time()
-            except Exception as e:
-                logger.error(e)
-                raise e
-
-            total_loss, num_seen_utts = self.valid()
-            if dist.get_world_size() > 1:
-                num_seen_utts = paddle.to_tensor(num_seen_utts)
-                # the default operator in all_reduce function is sum.
-                dist.all_reduce(num_seen_utts)
-                total_loss = paddle.to_tensor(total_loss)
-                dist.all_reduce(total_loss)
-                cv_loss = total_loss / num_seen_utts
-                cv_loss = float(cv_loss)
-            else:
-                cv_loss = total_loss / num_seen_utts
+                    for batch_index, batch in enumerate(self.train_loader):
+                        dataload_time = time.time() - data_start_time
+                        msg = "Train:"
+                        observation = OrderedDict()
+                        with ObsScope(observation):
+                            report("Rank", dist.get_rank())
+                            report("epoch", self.epoch)
+                            report('step', self.iteration)
+                            report("lr", self.lr_scheduler())
+                            self.train_batch(batch_index, batch, msg)
+                            self.after_train_batch()
+                            report('iter', batch_index + 1)
+                            report('total', len(self.train_loader))
+                            report('reader_cost', dataload_time)
+                        observation['batch_cost'] = observation[
+                            'reader_cost'] + observation['step_cost']
+                        observation['samples'] = observation['batch_size']
+                        observation['ips[sent./sec]'] = observation[
+                            'batch_size'] / observation['batch_cost']
+                        for k, v in observation.items():
+                            msg += f" {k}: "
+                            msg += f"{v:>.8f}" if isinstance(v,
+                                                             float) else f"{v}"
+                            msg += ","
+                        logger.info(msg)
+                        data_start_time = time.time()
+                except Exception as e:
+                    logger.error(e)
+                    raise e
+
+            with Timer("Eval Time Cost: {}"):
+                total_loss, num_seen_utts = self.valid()
+                if dist.get_world_size() > 1:
+                    num_seen_utts = paddle.to_tensor(num_seen_utts)
+                    # the default operator in all_reduce function is sum.
+                    dist.all_reduce(num_seen_utts)
+                    total_loss = paddle.to_tensor(total_loss)
+                    dist.all_reduce(total_loss)
+                    cv_loss = total_loss / num_seen_utts
+                    cv_loss = float(cv_loss)
+                else:
+                    cv_loss = total_loss / num_seen_utts

            logger.info(
                'Epoch {} Val info val_loss {}'.format(self.epoch, cv_loss))
@ -231,6 +289,7 @@ class Trainer():
                    'epoch', {'cv_loss': cv_loss,
                              'lr': self.lr_scheduler()}, self.epoch)

+            # after epoch
            self.save(tag=self.epoch, infos={'val_loss': cv_loss})
            # step lr every epoch
            self.lr_scheduler.step()
@ -240,14 +299,13 @@ class Trainer():
        """The routine of the experiment after setup. This method is intended
        to be used by the user.
        """
-        try:
-            self.train()
-        except KeyboardInterrupt:
-            self.save()
-            exit(-1)
-        finally:
-            self.destory()
-        logger.info("Training Done.")
+        with Timer("Training Done: {}"):
+            try:
+                self.train()
+            except KeyboardInterrupt:
+                exit(-1)
+            finally:
+                self.destory()

    def setup_output_dir(self):
        """Create a directory used for output.
--- a/deepspeech/training/updaters/standard_updater.py
+++ b/deepspeech/training/updaters/standard_updater.py
@ -14,12 +14,12 @@
 from typing import Dict
 from typing import Optional

-from paddle import Tensor
+import paddle
 from paddle.io import DataLoader
 from paddle.io import DistributedBatchSampler
 from paddle.nn import Layer
 from paddle.optimizer import Optimizer
-from timer import timer
+from paddle.optimizer.lr import LRScheduler

 from deepspeech.training.reporter import report
 from deepspeech.training.updaters.updater import UpdaterBase
@ -39,8 +39,10 @@ class StandardUpdater(UpdaterBase):
    def __init__(self,
                 model: Layer,
                 optimizer: Optimizer,
+                 scheduler: LRScheduler,
                 dataloader: DataLoader,
                 init_state: Optional[UpdaterState]=None):
+        super().__init__(init_state)
        # it is designed to hold multiple models
        models = {"main": model}
        self.models: Dict[str, Layer] = models
@ -51,15 +53,14 @@ class StandardUpdater(UpdaterBase):
        self.optimizer = optimizer
        self.optimizers: Dict[str, Optimizer] = optimizers

+        # it is designed to hold multiple scheduler
+        schedulers = {"main": scheduler}
+        self.scheduler = scheduler
+        self.schedulers: Dict[str, LRScheduler] = schedulers
+
        # dataloaders
        self.dataloader = dataloader

-        # init state
-        if init_state is None:
-            self.state = UpdaterState()
-        else:
-            self.state = init_state
-
        self.train_iterator = iter(dataloader)

    def update(self):
@ -103,8 +104,10 @@ class StandardUpdater(UpdaterBase):
            model.train()

        # training for a step is implemented here
-        batch = self.read_batch()
-        self.update_core(batch)
+        with Timier("data time cost:{}"):
+            batch = self.read_batch()
+        with Timier("step time cost:{}"):
+            self.update_core(batch)

        self.state.iteration += 1
        if self.updates_per_epoch is not None:
@ -115,13 +118,14 @@ class StandardUpdater(UpdaterBase):
        """A simple case for a training step. Basic assumptions are:
        Single model;
        Single optimizer;
+        Single scheduler, and update learning rate each step;
        A batch from the dataloader is just the input of the model;
        The model return a single loss, or a dict containing serval losses.
        Parameters updates at every batch, no gradient accumulation.
        """
        loss = self.model(*batch)

-        if isinstance(loss, Tensor):
+        if isinstance(loss, paddle.Tensor):
            loss_dict = {"main": loss}
        else:
            # Dict[str, Tensor]
@ -135,14 +139,15 @@ class StandardUpdater(UpdaterBase):
        for name, loss_item in loss_dict.items():
            report(name, float(loss_item))

-        self.optimizer.clear_gradient()
+        self.optimizer.clear_grad()
        loss_dict["main"].backward()
-        self.optimizer.update()
+        self.optimizer.step()
+        self.scheduler.step()

    @property
    def updates_per_epoch(self):
-        """Number of updater per epoch, determined by the length of the
-        dataloader."""
+        """Number of steps per epoch, 
+        determined by the length of the dataloader."""
        length_of_dataloader = None
        try:
            length_of_dataloader = len(self.dataloader)
@ -163,18 +168,16 @@ class StandardUpdater(UpdaterBase):

    def read_batch(self):
        """Read a batch from the data loader, auto renew when data is exhausted."""
-        with timer() as t:
-            try:
-                batch = next(self.train_iterator)
-            except StopIteration:
-                self.new_epoch()
-                batch = next(self.train_iterator)
-            logger.debug(
-                f"Read a batch takes {t.elapse}s.")  # replace it with logger
+        try:
+            batch = next(self.train_iterator)
+        except StopIteration:
+            self.new_epoch()
+            batch = next(self.train_iterator)
        return batch

    def state_dict(self):
-        """State dict of a Updater, model, optimizer and updater state are included."""
+        """State dict of a Updater, model, optimizers/schedulers 
+        and updater state are included."""
        state_dict = super().state_dict()
        for name, model in self.models.items():
            state_dict[f"{name}_params"] = model.state_dict()
@ -184,7 +187,7 @@ class StandardUpdater(UpdaterBase):

    def set_state_dict(self, state_dict):
        """Set state dict for a Updater. Parameters of models, states for
-        optimizers and UpdaterState are restored."""
+        optimizers/schedulers and UpdaterState are restored."""
        for name, model in self.models.items():
            model.set_state_dict(state_dict[f"{name}_params"])
        for name, optim in self.optimizers.items():
--- a/deepspeech/training/updaters/trainer.py
+++ b/deepspeech/training/updaters/trainer.py
@ -24,7 +24,7 @@ import tqdm

 from deepspeech.training.extensions.extension import Extension
 from deepspeech.training.extensions.extension import PRIORITY_READER
-from deepspeech.training.reporter import scope
+from deepspeech.training.reporter import ObsScope
 from deepspeech.training.triggers import get_trigger
 from deepspeech.training.triggers.limit_trigger import LimitTrigger
 from deepspeech.training.updaters.updater import UpdaterBase
@ -140,11 +140,11 @@ class Trainer():
        try:
            while not stop_trigger(self):
                self.observation = {}
-                # set observation as the report target
-                # you can use report freely in Updater.update()
+                # set observation as the `report` target
+                # you can use `report` freely in Updater.update()

                # updating parameters and state
-                with scope(self.observation):
+                with ObsScope(self.observation):
                    update()
                    p.update()

--- a/deepspeech/training/updaters/updater.py
+++ b/deepspeech/training/updaters/updater.py
@ -52,6 +52,7 @@ class UpdaterBase():
    """

    def __init__(self, init_state=None):
+        # init state
        if init_state is None:
            self.state = UpdaterState()
        else:
--- a/deepspeech/utils/checkpoint.py
+++ b/deepspeech/utils/checkpoint.py
@ -114,13 +114,13 @@ class Checkpoint():
        params_path = checkpoint_path + ".pdparams"
        model_dict = paddle.load(params_path)
        model.set_state_dict(model_dict)
-        logger.info("Rank {}: loaded model from {}".format(rank, params_path))
+        logger.info("Rank {}: Restore model from {}".format(rank, params_path))

        optimizer_path = checkpoint_path + ".pdopt"
        if optimizer and os.path.isfile(optimizer_path):
            optimizer_dict = paddle.load(optimizer_path)
            optimizer.set_state_dict(optimizer_dict)
-            logger.info("Rank {}: loaded optimizer state from {}".format(
+            logger.info("Rank {}: Restore optimizer state from {}".format(
                rank, optimizer_path))

        info_path = re.sub('.pdparams$', '.json', params_path)
--- a/deepspeech/utils/ctc_utils.py
+++ b/deepspeech/utils/ctc_utils.py
@ -84,19 +84,19 @@ def forced_align(ctc_probs: paddle.Tensor, y: paddle.Tensor,
    y_insert_blank = insert_blank(y, blank_id)  #(2L+1)

    log_alpha = paddle.zeros(
-        (ctc_probs.size(0), len(y_insert_blank)))  #(T, 2L+1)
+        (ctc_probs.shape[0], len(y_insert_blank)))  #(T, 2L+1)
    log_alpha = log_alpha - float('inf')  # log of zero
-    # TODO(Hui Zhang): zeros not support paddle.int16
+
+    # self.__setitem_varbase__(item, value) When assign a value to a paddle.Tensor, the data type of the paddle.Tensor not support int16
    state_path = (paddle.zeros(
-        (ctc_probs.size(0), len(y_insert_blank)), dtype=paddle.int32) - 1
+        (ctc_probs.shape[0], len(y_insert_blank)), dtype=paddle.int32) - 1
                  )  # state path, Tuple((T, 2L+1))

    # init start state
-    # TODO(Hui Zhang): VarBase.__getitem__() not support np.int64
-    log_alpha[0, 0] = ctc_probs[0][int(y_insert_blank[0])]  # State-b, Sb
-    log_alpha[0, 1] = ctc_probs[0][int(y_insert_blank[1])]  # State-nb, Snb
+    log_alpha[0, 0] = ctc_probs[0][y_insert_blank[0]]  # State-b, Sb
+    log_alpha[0, 1] = ctc_probs[0][y_insert_blank[1]]  # State-nb, Snb

-    for t in range(1, ctc_probs.size(0)):  # T
+    for t in range(1, ctc_probs.shape[0]):  # T
        for s in range(len(y_insert_blank)):  # 2L+1
            if y_insert_blank[s] == blank_id or s < 2 or y_insert_blank[
                    s] == y_insert_blank[s - 2]:
@ -110,13 +110,11 @@ def forced_align(ctc_probs: paddle.Tensor, y: paddle.Tensor,
                    log_alpha[t - 1, s - 2],
                ])
                prev_state = [s, s - 1, s - 2]
-            # TODO(Hui Zhang): VarBase.__getitem__() not support np.int64
-            log_alpha[t, s] = paddle.max(candidates) + ctc_probs[t][int(
-                y_insert_blank[s])]
+            log_alpha[t, s] = paddle.max(candidates) + ctc_probs[t][
+                y_insert_blank[s]]
            state_path[t, s] = prev_state[paddle.argmax(candidates)]
-
-    # TODO(Hui Zhang): zeros not support paddle.int16
-    state_seq = -1 * paddle.ones((ctc_probs.size(0), 1), dtype=paddle.int32)
+    # self.__setitem_varbase__(item, value) When assign a value to a paddle.Tensor, the data type of the paddle.Tensor not support int16
+    state_seq = -1 * paddle.ones((ctc_probs.shape[0], 1), dtype=paddle.int32)

    candidates = paddle.to_tensor([
        log_alpha[-1, len(y_insert_blank) - 1],  # Sb
@ -124,11 +122,11 @@ def forced_align(ctc_probs: paddle.Tensor, y: paddle.Tensor,
    ])
    prev_state = [len(y_insert_blank) - 1, len(y_insert_blank) - 2]
    state_seq[-1] = prev_state[paddle.argmax(candidates)]
-    for t in range(ctc_probs.size(0) - 2, -1, -1):
+    for t in range(ctc_probs.shape[0] - 2, -1, -1):
        state_seq[t] = state_path[t + 1, state_seq[t + 1, 0]]

    output_alignment = []
-    for t in range(0, ctc_probs.size(0)):
+    for t in range(0, ctc_probs.shape[0]):
        output_alignment.append(y_insert_blank[state_seq[t, 0]])

    return output_alignment
--- a/deepspeech/utils/log.py
+++ b/deepspeech/utils/log.py
@ -12,19 +12,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import getpass
-import logging
 import os
 import socket
 import sys

+from loguru import logger
 from paddle import inference

-FORMAT_STR = '[%(levelname)s %(asctime)s %(filename)s:%(lineno)d] %(message)s'
-DATE_FMT_STR = '%Y/%m/%d %H:%M:%S'
-
-logging.basicConfig(
-    level=logging.DEBUG, format=FORMAT_STR, datefmt=DATE_FMT_STR)
-

 def find_log_dir(log_dir=None):
    """Returns the most suitable directory to put log files into.
@ -98,59 +92,28 @@ def find_log_dir_and_names(program_name=None, log_dir=None):


 class Log():
-
-    log_name = None
-
-    def __init__(self, logger=None):
-        self.logger = logging.getLogger(logger)
-        self.logger.setLevel(logging.DEBUG)
-
-        file_dir = os.getcwd() + '/log'
-        if not os.path.exists(file_dir):
-            os.mkdir(file_dir)
-        self.log_dir = file_dir
-
-        actual_log_dir, file_prefix, symlink_prefix = find_log_dir_and_names(
-            program_name=None, log_dir=self.log_dir)
-
-        basename = '%s.DEBUG.%d' % (file_prefix, os.getpid())
-        filename = os.path.join(actual_log_dir, basename)
-        if Log.log_name is None:
-            Log.log_name = filename
-
-        # Create a symlink to the log file with a canonical name.
-        symlink = os.path.join(actual_log_dir, symlink_prefix + '.DEBUG')
-        try:
-            if os.path.islink(symlink):
-                os.unlink(symlink)
-            os.symlink(os.path.basename(Log.log_name), symlink)
-        except EnvironmentError:
-            # If it fails, we're sad but it's no error.  Commonly, this
-            # fails because the symlink was created by another user and so
-            # we can't modify it
-            pass
-
-        if not self.logger.hasHandlers():
-            formatter = logging.Formatter(fmt=FORMAT_STR, datefmt=DATE_FMT_STR)
-            fh = logging.FileHandler(Log.log_name)
-            fh.setLevel(logging.DEBUG)
-            fh.setFormatter(formatter)
-            self.logger.addHandler(fh)
-
-            ch = logging.StreamHandler()
-            ch.setLevel(logging.INFO)
-            ch.setFormatter(formatter)
-            self.logger.addHandler(ch)
-
-        # stop propagate for propagating may print
-        # log multiple times
-        self.logger.propagate = False
+    """Default Logger for all."""
+    logger.remove()
+    logger.add(
+        sys.stdout,
+        level='INFO',
+        enqueue=True,
+        filter=lambda record: record['level'].no >= 20)
+    _, file_prefix, _ = find_log_dir_and_names()
+    sink_prefix = os.path.join("exp/log", file_prefix)
+    sink_path = sink_prefix[:-3] + "{time}.log"
+    logger.add(sink_path, level='DEBUG', enqueue=True, rotation="500 MB")
+
+    def __init__(self, name=None):
+        pass

    def getlog(self):
-        return self.logger
+        return logger


 class Autolog:
+    """Just used by fullchain project"""
+
    def __init__(self,
                 batch_size,
                 model_name="DeepSpeech",
--- a/deepspeech/utils/profiler.py
+++ b/deepspeech/utils/profiler.py
@ -0,0 +1,119 @@
+# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import sys
+
+import paddle
+
+from deepspeech.utils.log import Log
+
+logger = Log(__name__).getlog()
+
+# A global variable to record the number of calling times for profiler
+# functions. It is used to specify the tracing range of training steps.
+_profiler_step_id = 0
+
+# A global variable to avoid parsing from string every time.
+_profiler_options = None
+
+
+class ProfilerOptions(object):
+    '''
+    Use a string to initialize a ProfilerOptions.
+    The string should be in the format: "key1=value1;key2=value;key3=value3".
+    For example:
+      "profile_path=model.profile"
+      "batch_range=[50, 60]; profile_path=model.profile"
+      "batch_range=[50, 60]; tracer_option=OpDetail; profile_path=model.profile"
+    ProfilerOptions supports following key-value pair:
+      batch_range      - a integer list, e.g. [100, 110].
+      state            - a string, the optional values are 'CPU', 'GPU' or 'All'. 
+      sorted_key       - a string, the optional values are 'calls', 'total',
+                         'max', 'min' or 'ave.
+      tracer_option    - a string, the optional values are 'Default', 'OpDetail',
+                         'AllOpDetail'.
+      profile_path     - a string, the path to save the serialized profile data,
+                         which can be used to generate a timeline.
+      exit_on_finished - a boolean.
+    '''
+
+    def __init__(self, options_str):
+        assert isinstance(options_str, str)
+
+        self._options = {
+            'batch_range': [10, 20],
+            'state': 'All',
+            'sorted_key': 'total',
+            'tracer_option': 'Default',
+            'profile_path': '/tmp/profile',
+            'exit_on_finished': True
+        }
+        self._parse_from_string(options_str)
+
+    def _parse_from_string(self, options_str):
+        if not options_str:
+            return
+
+        for kv in options_str.replace(' ', '').split(';'):
+            key, value = kv.split('=')
+            if key == 'batch_range':
+                value_list = value.replace('[', '').replace(']', '').split(',')
+                value_list = list(map(int, value_list))
+                if len(value_list) >= 2 and value_list[0] >= 0 and value_list[
+                        1] > value_list[0]:
+                    self._options[key] = value_list
+            elif key == 'exit_on_finished':
+                self._options[key] = value.lower() in ("yes", "true", "t", "1")
+            elif key in [
+                    'state', 'sorted_key', 'tracer_option', 'profile_path'
+            ]:
+                self._options[key] = value
+
+    def __getitem__(self, name):
+        if self._options.get(name, None) is None:
+            raise ValueError(
+                "ProfilerOptions does not have an option named %s." % name)
+        return self._options[name]
+
+
+def add_profiler_step(options_str=None):
+    '''
+    Enable the operator-level timing using PaddlePaddle's profiler.
+    The profiler uses a independent variable to count the profiler steps.
+    One call of this function is treated as a profiler step.
+    
+    Args:
+      profiler_options - a string to initialize the ProfilerOptions.
+                         Default is None, and the profiler is disabled.
+    '''
+    if options_str is None:
+        return
+
+    global _profiler_step_id
+    global _profiler_options
+
+    if _profiler_options is None:
+        _profiler_options = ProfilerOptions(options_str)
+        logger.info(f"Profiler: {options_str}")
+        logger.info(f"Profiler: {_profiler_options._options}")
+
+    if _profiler_step_id == _profiler_options['batch_range'][0]:
+        paddle.utils.profiler.start_profiler(_profiler_options['state'],
+                                             _profiler_options['tracer_option'])
+    elif _profiler_step_id == _profiler_options['batch_range'][1]:
+        paddle.utils.profiler.stop_profiler(_profiler_options['sorted_key'],
+                                            _profiler_options['profile_path'])
+        if _profiler_options['exit_on_finished']:
+            sys.exit(0)
+
+    _profiler_step_id += 1
--- a/deepspeech/utils/tensor_utils.py
+++ b/deepspeech/utils/tensor_utils.py
@ -83,7 +83,7 @@ def pad_sequence(sequences: List[paddle.Tensor],
    # (TODO Hui Zhang): slice not supprot `end==start`
    # trailing_dims = max_size[1:]
    trailing_dims = max_size[1:] if max_size.ndim >= 2 else ()
-    max_len = max([s.size(0) for s in sequences])
+    max_len = max([s.shape[0] for s in sequences])
    if batch_first:
        out_dims = (len(sequences), max_len) + trailing_dims
    else:
@ -91,12 +91,22 @@ def pad_sequence(sequences: List[paddle.Tensor],

    out_tensor = sequences[0].new_full(out_dims, padding_value)
    for i, tensor in enumerate(sequences):
-        length = tensor.size(0)
+        length = tensor.shape[0]
        # use index notation to prevent duplicate references to the tensor
        if batch_first:
-            out_tensor[i, :length, ...] = tensor
+            # TODO (Hui Zhang): set_value op not supprot `end==start`
+            # out_tensor[i, :length, ...] = tensor
+            if length != 0:
+                out_tensor[i, :length, ...] = tensor
+            else:
+                out_tensor[i, length, ...] = tensor
        else:
-            out_tensor[:length, i, ...] = tensor
+            # TODO (Hui Zhang): set_value op not supprot `end==start`
+            # out_tensor[:length, i, ...] = tensor
+            if length != 0:
+                out_tensor[:length, i, ...] = tensor
+            else:
+                out_tensor[length, i, ...] = tensor

    return out_tensor

@ -139,7 +149,7 @@ def add_sos_eos(ys_pad: paddle.Tensor, sos: int, eos: int,
    #ys_in = [paddle.cat([_sos, y], dim=0) for y in ys]
    #ys_out = [paddle.cat([y, _eos], dim=0) for y in ys]
    #return pad_sequence(ys_in, padding_value=eos), pad_sequence(ys_out, padding_value=ignore_id)
-    B = ys_pad.size(0)
+    B = ys_pad.shape[0]
    _sos = paddle.ones([B, 1], dtype=ys_pad.dtype) * sos
    _eos = paddle.ones([B, 1], dtype=ys_pad.dtype) * eos
    ys_in = paddle.cat([_sos, ys_pad], dim=1)
@ -165,16 +175,10 @@ def th_accuracy(pad_outputs: paddle.Tensor,
    Returns:
        float: Accuracy value (0.0 - 1.0).
    """
-    pad_pred = pad_outputs.view(
-        pad_targets.size(0), pad_targets.size(1), pad_outputs.size(1)).argmax(2)
+    pad_pred = pad_outputs.view(pad_targets.shape[0], pad_targets.shape[1],
+                                pad_outputs.shape[1]).argmax(2)
    mask = pad_targets != ignore_label
-    #TODO(Hui Zhang): sum not support bool type
-    # numerator = paddle.sum(
-    #     pad_pred.masked_select(mask) == pad_targets.masked_select(mask))
-    numerator = (
+    numerator = paddle.sum(
        pad_pred.masked_select(mask) == pad_targets.masked_select(mask))
-    numerator = paddle.sum(numerator.type_as(pad_targets))
-    #TODO(Hui Zhang): sum not support bool type
-    # denominator = paddle.sum(mask)
-    denominator = paddle.sum(mask.type_as(pad_targets))
+    denominator = paddle.sum(mask)
    return float(numerator) / float(denominator)
--- a/deepspeech/utils/utility.py
+++ b/deepspeech/utils/utility.py
@ -16,15 +16,27 @@ import distutils.util
 import math
 import os
 import random
+from contextlib import contextmanager
 from typing import List

 import numpy as np
 import paddle

-__all__ = ["seed_all", 'print_arguments', 'add_arguments', "log_add"]
+__all__ = [
+    "UpdateConfig", "seed_all", 'print_arguments', 'add_arguments', "log_add"
+]
+
+
+@contextmanager
+def UpdateConfig(config):
+    """Update yacs config"""
+    config.defrost()
+    yield
+    config.freeze()


 def seed_all(seed: int=210329):
+    """freeze random generator seed."""
    np.random.seed(seed)
    random.seed(seed)
    paddle.seed(seed)
--- a/doc/images/multi_gpu_speedup.png
+++ b/doc/images/multi_gpu_speedup.png
--- a/doc/images/tuning_error_surface.png
+++ b/doc/images/tuning_error_surface.png
--- a/doc/src/benchmark.md
+++ b/doc/src/benchmark.md
@ -1,16 +0,0 @@
-# Benchmarks
-
-## Acceleration with Multi-GPUs
-
-We compare the training time with 1, 2, 4, 8 Tesla V100 GPUs (with a subset of LibriSpeech samples whose audio durations are between 6.0 and 7.0 seconds).  And it shows that a **near-linear** acceleration with multiple GPUs has been achieved. In the following figure, the time (in seconds) cost for training is printed on the blue bars.
-
-<img src="../images/multi_gpu_speedup.png" width=450>
-
-| # of GPU  | Acceleration Rate |
-| --------  | --------------:   |
-| 1         | 1.00 X |
-| 2         | 1.98 X |
-| 4         | 3.73 X |
-| 8         | 6.95 X |
-
-`utils/profile.sh` provides such a demo profiling tool, you can change it as need.
--- a/doc/src/faq.md
+++ b/doc/src/faq.md
@ -1,37 +0,0 @@
-# FAQ
-
-1. 音频变速快慢到达什么晨读会影响识别率？
-
-   变速会提升识别效果，一般用0.9， 1.0， 1.1 的变速。
-
-2. 音量大小到什么程度会影响识别率？
-
-   一般训练会固定音量到一个范围内，波动过大会影响训练，估计在10dB ~ 20dB吧。
-
-3. 语音模型训练数据的最小时长要求时多少？
-
-   Aishell-1大约178h的数据，数据越多越好。
-
-4. 那些噪声或背景生会影响识别率？  
-
-   主要是人生干扰和低信噪比会影响识别率。
-
-5. 单条语音数据的长度限制是多少？  
-
-   一般训练的语音长度会限制在1s~6s之间，和训练配置有关。
-
-6. 背景声在识别前是否需要分离出来，或做降噪处理？  
-
-   需要分离的，需要结合具体场景考虑。
-
-7. 模型是否带有VAD人生激活识别能力？  
-
-   VAD是单独的模型或模块，模型不包含此能力。
-
-8. 是否支持长语音识别？  
-
-   一般过VAD后识别。
-
-9. Mandarin LM Large语言模型需要的硬件配置时怎样的？  
-
-   内存能放得下LM即可。
--- a/doc/src/reference.md
+++ b/doc/src/reference.md
@ -1,3 +0,0 @@
-# Reference
-
-* [wenet](https://github.com/mobvoi/wenet)
--- a/doc/src/released_model.md
+++ b/doc/src/released_model.md
@ -1,9 +0,0 @@
-# Released Models
-
-## Language Model Released
-
-Language Model | Training Data | Token-based | Size | Descriptions
-:-------------:| :------------:| :-----: | -----: | :-----------------
-[English LM](https://deepspeech.bj.bcebos.com/en_lm/common_crawl_00.prune01111.trie.klm) |  [CommonCrawl(en.00)](http://web-language-models.s3-website-us-east-1.amazonaws.com/ngrams/en/deduped/en.00.deduped.xz) | Word-based | 8.3 GB | Pruned with 0 1 1 1 1; <br/> About 1.85 billion n-grams; <br/> 'trie'  binary with '-a 22 -q 8 -b 8'
-[Mandarin LM Small](https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm) | Baidu Internal Corpus | Char-based | 2.8 GB | Pruned with 0 1 2 4 4; <br/> About 0.13 billion n-grams; <br/> 'probing' binary with default settings
-[Mandarin LM Large](https://deepspeech.bj.bcebos.com/zh_lm/zhidao_giga.klm) | Baidu Internal Corpus | Char-based | 70.4 GB | No Pruning; <br/> About 3.7 billion n-grams; <br/> 'probing' binary with default settings
--- a/doc/src/server.md
+++ b/doc/src/server.md
@ -1,34 +0,0 @@
-
-# Trying Live Demo with Your Own Voice
-
-Until now, an ASR model is trained and tested qualitatively (`infer`) and quantitatively (`test`) with existing audio files. But it is not yet tested with your own speech. We build up a real-time demo ASR engine with the trained model, enabling you to test and play around with the demo, with your own voice.
-
-First, change your directory to `examples/aishell` and `source path.sh`.
-
-To start the demo's server, please run this in one console:
-
-```bash
-CUDA_VISIBLE_DEVICES=0 bash local/server.sh
-```
-
-For the machine (might not be the same machine) to run the demo's client, please do the following installation before moving on.
-
-For example, on MAC OS X:
-
-```bash
-brew install portaudio
-pip install pyaudio
-pip install keyboard
-```
-
-Then to start the client, please run this in another console:
-
-```bash
-CUDA_VISIBLE_DEVICES=0 bash local/client.sh
-```
-
-Now, in the client console, press the `whitespace` key, hold, and start speaking. Until finishing your utterance, release the key to let the speech-to-text results shown in the console. To quit the client, just press `ESC` key.
-
-Notice that `deepspeech/exps/deepspeech2/deploy/client.py` must be run on a machine with a microphone device, while `deepspeech/exps/deepspeech2/deploy/server.py` could be run on one without any audio recording hardware, e.g. any remote server machine. Just be careful to set the `host_ip` and `host_port` argument with the actual accessible IP address and port, if the server and client are running with two separate machines. Nothing should be done if they are running on one single machine.
-
-Please also refer to `examples/aishell/local/server.sh`, which will first download a pre-trained Chinese model (trained with AISHELL1) and then start the demo server with the model. With running `examples/aishell/local/client.sh`, you can speak Chinese to test it. If you would like to try some other models, just update `--checkpoint_path` argument in the script.  
--- a/docs/images/ds2offlineModel.png
+++ b/docs/images/ds2offlineModel.png
--- a/docs/images/ds2onlineModel.png
+++ b/docs/images/ds2onlineModel.png
--- a/docs/src/augmentation.md
+++ b/docs/src/augmentation.md
--- a/docs/src/data_preparation.md
+++ b/docs/src/data_preparation.md
--- a/docs/src/deepspeech_architecture.md
+++ b/docs/src/deepspeech_architecture.md
@ -0,0 +1,190 @@
+# Deepspeech2
+## Streaming
+
+The implemented arcitecure of Deepspeech2 online model is based on [Deepspeech2 model](https://arxiv.org/pdf/1512.02595.pdf) with some changes.
+The model is mainly composed of 2D convolution subsampling layer and stacked single direction rnn layers.
+
+To illustrate the model implementation clearly, 3 parts are described in detail.  
+- Data Preparation
+- Encoder
+- Decoder
+
+In addition, the training process and the testing process are also introduced.
+
+The arcitecture of the model is shown in Fig.1.
+
+<p align="center">
+<img src="../images/ds2onlineModel.png" width=800>
+<br/>Fig.1 The Arcitecture of deepspeech2 online model
+</p>
+
+### Data Preparation
+#### Vocabulary
+For English data, the vocabulary dictionary is composed of 26 English characters with " ' ", space, \<blank\> and \<eos\>. The \<blank\> represents the blank label in CTC, the \<unk\> represents the unknown character and the \<eos\> represents the start and the end characters. For mandarin, the vocabulary dictionary is composed of chinese characters statisticed from the training set and three additional characters are added. The added characters are \<blank\>, \<unk\> and \<eos\>.  For both English and mandarin data, we set the default indexs that \<blank\>=0, \<unk\>=1 and \<eos\>= last index.
+```
+ # The code to build vocabulary
+ cd examples/aishell/s0
+ python3 ../../../utils/build_vocab.py \
+     --unit_type="char" \
+     --count_threshold=0 \
+     --vocab_path="data/vocab.txt" \
+     --manifest_paths "data/manifest.train.raw" "data/manifest.dev.raw"
+
+# vocabulary for aishell dataset (Mandarin)
+vi examples/aishell/s0/data/vocab.txt
+
+# vocabulary for librispeech dataset (English)
+vi examples/librispeech/s0/data/vocab.txt
+```
+
+#### CMVN
+For CMVN, a subset or the full of traininig set is chosed and be used to compute the feature mean and std.
+```
+ # The code to compute the feature mean and std
+cd examples/aishell/s0
+python3 ../../../utils/compute_mean_std.py \
+     --manifest_path="data/manifest.train.raw" \
+     --specgram_type="linear" \
+     --delta_delta=false \
+     --stride_ms=10.0 \
+     --window_ms=20.0 \
+     --sample_rate=16000 \
+     --use_dB_normalization=True \
+     --num_samples=2000 \
+     --num_workers=10 \
+     --output_path="data/mean_std.json"
+
+```
+
+#### Feature Extraction
+ For feature extraction, three methods are implemented, which are linear (FFT without using filter bank), fbank and mfcc.
+ Currently, the released deepspeech2 online model use the linear feature extraction method.
+ ```
+ The code for feature extraction
+ vi deepspeech/frontend/featurizer/audio_featurizer.py
+ ```
+
+### Encoder
+The encoder is composed of two 2D convolution subsampling layers and a number of stacked single direction rnn layers. The 2D convolution subsampling layers extract feature representation from the raw audio feature and reduce the length of audio feature at the same time. After passing through the convolution subsampling layers, then the feature representation are input into the stacked rnn layers. For the stacked rnn layers, LSTM cell and GRU cell are provided to use. Adding one fully connected (fc) layer after the stacked rnn layers is optional. If the number of stacked rnn layers is less than 5, adding one fc layer after stacked rnn layers is recommand.
+
+The code of Encoder is in:
+```
+vi deepspeech/models/ds2_online/deepspeech2.py
+```
+
+### Decoder
+To got the character possibilities of each frame, the feature representation of each frame output from the encoder are input into a projection layer which is implemented as a dense layer to do feature projection. The output dim of the projection layer is same with the vocabulary size. After projection layer, the softmax function is used to transform the frame-level feature representation be the possibilities of characters. While making model inference, the character possibilities of each frame are input into the CTC decoder to get the final speech recognition results.
+
+The code of the decoder is in:
+```
+# The code of constructing the decoder in model
+vi deepspeech/models/ds2_online/deepspeech2.py
+# The code of CTC Decoder
+vi deepspeech/modules/ctc.py
+```
+
+## Training Process
+Using the command below, you can train the deepspeech2 online model.
+```
+ cd examples/aishell/s0
+ bash run.sh --stage 0 --stop_stage 2 --model_type online --conf_path conf/deepspeech2_online.yaml
+```
+The detail commands are:
+```  
+# The code for training in run.sh
+set -e
+source path.sh
+
+gpus=2,3,5,7
+stage=0
+stop_stage=5
+conf_path=conf/deepspeech2_online.yaml     # conf/deepspeech2.yaml | conf/deepspeech2_online.yaml
+avg_num=1
+model_type=online    # online | offline
+
+source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
+
+avg_ckpt=avg_${avg_num}
+ckpt=$(basename ${conf_path} | awk -F'.' '{print $1}')
+echo "checkpoint name ${ckpt}"
+
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    # prepare data
+    bash ./local/data.sh || exit -1
+fi
+
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    # train model, all `ckpt` under `exp` dir
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path}  ${ckpt} ${model_type}
+fi
+
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+    # avg n best model
+    avg.sh exp/${ckpt}/checkpoints ${avg_num}
+fi
+```
+
+By using the command above, the training process can be started. There are 5 stages in "run.sh", and the first 3 stages are used for training process. The stage 0 is used for data preparation, in which the dataset will be downloaded, and the manifest files of the datasets, vocabulary dictionary and CMVN file will be generated in "./data/". The stage 1 is used for training the model, the log files and model checkpoint is saved in "exp/deepspeech2_online/". The stage 2 is used to generated final model for predicting by averaging the top-k model parameters based on validation loss.  
+
+## Testing Process
+Using the command below, you can test the deepspeech2 online model.
+ ```
+ bash run.sh --stage 3 --stop_stage 5 --model_type online --conf_path conf/deepspeech2_online.yaml
+```
+The detail commands are:
+```
+conf_path=conf/deepspeech2_online.yaml
+avg_num=1
+model_type=online
+avg_ckpt=avg_${avg_num}
+
+ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+    # test ckpt avg_n
+    CUDA_VISIBLE_DEVICES=2 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${model_type}|| exit -1
+fi
+
+if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
+    # export ckpt avg_n
+    CUDA_VISIBLE_DEVICES=5 ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit ${model_type}
+fi
+
+if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
+    # test export ckpt avg_n
+    CUDA_VISIBLE_DEVICES=0 ./local/test_export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt}.jit ${model_type}|| exit -1
+fi
+ ```
+After the training process, we use stage 3,4,5 for testing process. The stage 3 is for testing the model generated in the stage 2 and provided the CER index of the test set. The stage 4 is for transforming the model from dynamic graph to static graph by using "paddle.jit" library. The stage 5 is for testing the model in static graph.
+
+
+## Non-Streaming
+The deepspeech2 offline model is similarity to the deepspeech2 online model. The main difference between them is the offline model use the stacked bi-directional rnn layers while the online model use the single direction rnn layers and the fc layer is not used. For the stacked bi-directional rnn layers in the offline model, the rnn cell and gru cell are provided to use.
+
+The arcitecture of the model is shown in Fig.2.
+<p align="center">
+<img src="../images/ds2offlineModel.png" width=800>
+<br/>Fig.2 The Arcitecture of deepspeech2 offline model
+</p>
+
+
+
+For data preparation and decoder, the deepspeech2 offline model is same with the deepspeech2 online model.
+
+The code of encoder and decoder for deepspeech2 offline model is in:
+```
+vi deepspeech/models/ds2/deepspeech2.py
+```
+
+The training process and testing process of deepspeech2 offline model is very similary to deepspeech2 online model.
+Only some changes should be noticed.
+
+For training and testing, the "model_type" and the "conf_path" must be set.
+ ```
+# Training offline
+cd examples/aishell/s0
+bash run.sh --stage 0 --stop_stage 2 --model_type offline --conf_path conf/deepspeech2.yaml
+```
+```
+# Testing offline
+cd examples/aishell/s0
+bash run.sh --stage 3 --stop_stage 5 --model_type offline --conf_path conf/deepspeech2.yaml
+```
--- a/docs/src/feature_list.md
+++ b/docs/src/feature_list.md
@ -1,13 +1,20 @@
 # Features

+### Dataset
+* Aishell
+* Librispeech
+* THCHS30
+* TIMIT
+
 ### Speech Recognition

-* Offline
+* Non-Streaming
  * [Baidu's DeepSpeech2](http://proceedings.mlr.press/v48/amodei16.pdf)
  * [Transformer](https://arxiv.org/abs/1706.03762)
  * [Conformer](https://arxiv.org/abs/2005.08100)

-* Online
+* Streaming
+  * [Baidu's DeepSpeech2](http://proceedings.mlr.press/v48/amodei16.pdf)
  * [U2](https://arxiv.org/pdf/2012.05481.pdf)

 ### Language Model
@ -22,6 +29,15 @@
 * beam search
 * attention rescore

+### Deployment
+
+* Paddle Inference
+
+### Aligment  
+
+* MFA  
+* CTC Aligment  
+
 ### Speech Frontend

 * Audio
--- a/docs/src/getting_started.md
+++ b/docs/src/getting_started.md
--- a/docs/src/install.md
+++ b/docs/src/install.md
@ -4,15 +4,16 @@ To avoid the trouble of environment setup, [running in Docker container](#runnin

 ## Prerequisites
 - Python >= 3.7
- PaddlePaddle 2.0.0 or later (please refer to the [Installation Guide](https://www.paddlepaddle.org.cn/documentation/docs/en/beginners_guide/index_en.html))
+- PaddlePaddle latest version (please refer to the [Installation Guide](https://www.paddlepaddle.org.cn/documentation/docs/en/beginners_guide/index_en.html))

-## Setup
+## Setup (Important)

 - Make sure these libraries or tools installed: `pkg-config`, `flac`, `ogg`, `vorbis`, `boost`, `sox, and `swig`, e.g. installing them via `apt-get`:

 ```bash
 sudo apt-get install -y sox pkg-config libflac-dev libogg-dev libvorbis-dev libboost-dev swig python3-dev
 ```
+The version of `swig` should >= 3.0

 or, installing them via `yum`:

--- a/docs/src/ngram_lm.md
+++ b/docs/src/ngram_lm.md
@ -35,52 +35,3 @@ Different from the English language model, Mandarin language model is character-
  * A whitespace character between two tokens is inserted.

 Please notice that the released language models only contain Chinese simplified characters. After preprocessing done we can begin to train the language model. The key training arguments for small LM is '-o 5 --prune 0 1 2 4 4' and '-o 5' for large LM. Please refer above section for the meaning of each argument. We also convert the arpa file to binary file using default settings.
-
-
-
-## [KenLM](http://kheafield.com/code/kenlm/)
-
-统计语言模型工具有比较多的选择，目前使用比较好的有srilm及kenlm，其中kenlm比srilm晚出来，训练速度也更快，而且支持单机大数据的训练。现在介绍一下kenlm的使用方法。
-
-1. 工具包的下载地址：http://kheafield.com/code/kenlm.tar.gz
-
-2. 使用。该工具在linux环境下使用方便。 先确保linux环境已经按照1.36.0的Boost和zlib
-
-   ```
-   boost:
-   yum install boost
-   yum install boost-devel
-
-   zlib:
-   yum install zlib
-   yum install zlib-devel
-   ```
-
-   然后gcc版本需要是4.8.2及以上。
-
-   ```
-   wget -O - https://kheafield.com/code/kenlm.tar.gz |tar xz
-   mkdir kenlm/build
-   cd kenlm/build
-   cmake ..
-   make -j2
-   ```
-
-3. 训练。使用如下命令进行训练：
-
-   ```
-   build/bin/lmplz -o 3 --verbose_header --text people2014corpus_words.txt --arpa result/people2014corpus_words.arps
-   ```
-
-   其中，
-   1）people2014corpus_words.txt文件必须是分词以后的文件。
-
-   训练语料<人民日报2014版熟语料>，包括： 1）标准人工切词及词性数据people2014.tar.gz， 2）未切词文本数据people2014_words.txt， 3）kenlm训练字粒度语言模型文件及其二进制文件people2014corpus_chars.arps/klm， 4）kenlm词粒度语言模型文件及其二进制文件people2014corpus_words.arps/klm。
-
-   2）-o后面的5表示的是5-gram,一般取到3即可，但可以结合自己实际情况判断。
-
-4. 压缩。压缩模型为二进制，方便模型快速加载：
-
-   ```
-   build/bin/build_binary ./result/people2014corpus_words.arps ./result/people2014corpus_words.klm
-   ```
--- a/docs/src/reference.md
+++ b/docs/src/reference.md
@ -0,0 +1,8 @@
+# Reference
+
+We refer these repos to build `model` and `engine`:
+
+* [delta](https://github.com/Delta-ML/delta.git)
+* [espnet](https://github.com/espnet/espnet.git)
+* [kaldi](https://github.com/kaldi-asr/kaldi.git)
+* [wenet](https://github.com/mobvoi/wenet)
--- a/docs/src/released_model.md
+++ b/docs/src/released_model.md
@ -0,0 +1,28 @@
+# Released Models
+
+## Acoustic Model Released in paddle 2.X
+Acoustic Model | Training Data | Token-based | Size | Descriptions | CER or WER | Hours of speech
+:-------------:| :------------:| :-----: | -----: | :----------------- | :---------- | :---------
+[Ds2 Online Aishell Model](https://deepspeech.bj.bcebos.com/release2.1/aishell/s0/aishell.s0.ds_online.5rnn.debug.tar.gz) | Aishell Dataset | Char-based | 345 MB  | 2 Conv + 5 LSTM layers with only forward direction | 0.0824 | 151 h
+[Ds2 Offline Aishell Model](https://deepspeech.bj.bcebos.com/release2.1/aishell/s0/aishell.s0.ds2.offline.cer6p65.release.tar.gz)| Aishell Dataset | Char-based | 306 MB | 2 Conv + 3 bidirectional GRU layers| 0.065 | 151 h
+[Conformer Online Aishell Model](https://deepspeech.bj.bcebos.com/release2.1/aishell/s1/aishell.chunk.release.tar.gz) | Aishell Dataset | Char-based | 283 MB  | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention + CTC | 0.0594 | 151 h
+[Conformer Offline Aishell Model](https://deepspeech.bj.bcebos.com/release2.1/aishell/s1/aishell.release.tar.gz) | Aishell Dataset | Char-based | 284 MB  | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention | 0.0547 | 151 h
+[Conformer Librispeech Model](https://deepspeech.bj.bcebos.com/release2.1/librispeech/s1/conformer.release.tar.gz) | Librispeech Dataset | Word-based | 287 MB  | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention | 0.0325 | 960 h
+[Transformer Librispeech Model](https://deepspeech.bj.bcebos.com/release2.1/librispeech/s1/transformer.release.tar.gz) | Librispeech Dataset | Word-based | 195 MB  | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention | 0.0544 | 960 h
+
+## Acoustic Model Transformed from paddle 1.8
+Acoustic Model | Training Data | Token-based | Size | Descriptions | CER or WER | Hours of speech
+:-------------:| :------------:| :-----: | -----: | :----------------- | :---------- | :---------
+[Ds2 Offline Aishell model](https://deepspeech.bj.bcebos.com/mandarin_models/aishell_model_v1.8_to_v2.x.tar.gz)|Aishell Dataset| Char-based| 234 MB| 2 Conv + 3 bidirectional GRU layers| 0.0804 | 151 h|
+[Ds2 Offline Librispeech model](https://deepspeech.bj.bcebos.com/eng_models/librispeech_v1.8_to_v2.x.tar.gz)|Librispeech Dataset| Word-based| 307 MB| 2 Conv + 3 bidirectional sharing weight RNN layers | 0.0685| 960 h|
+[Ds2 Offline Baidu en8k model](https://deepspeech.bj.bcebos.com/eng_models/baidu_en8k_v1.8_to_v2.x.tar.gz)|Baidu Internal English Dataset| Word-based| 273 MB| 2 Conv + 3 bidirectional GRU layers | 0.0541 | 8628 h|
+
+
+
+## Language Model Released
+
+Language Model | Training Data | Token-based | Size | Descriptions
+:-------------:| :------------:| :-----: | -----: | :-----------------
+[English LM](https://deepspeech.bj.bcebos.com/en_lm/common_crawl_00.prune01111.trie.klm) |  [CommonCrawl(en.00)](http://web-language-models.s3-website-us-east-1.amazonaws.com/ngrams/en/deduped/en.00.deduped.xz) | Word-based | 8.3 GB | Pruned with 0 1 1 1 1; <br/> About 1.85 billion n-grams; <br/> 'trie'  binary with '-a 22 -q 8 -b 8'
+[Mandarin LM Small](https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm) | Baidu Internal Corpus | Char-based | 2.8 GB | Pruned with 0 1 2 4 4; <br/> About 0.13 billion n-grams; <br/> 'probing' binary with default settings
+[Mandarin LM Large](https://deepspeech.bj.bcebos.com/zh_lm/zhidao_giga.klm) | Baidu Internal Corpus | Char-based | 70.4 GB | No Pruning; <br/> About 3.7 billion n-grams; <br/> 'probing' binary with default settings
--- a/examples/1xt2x/.gitignore
+++ b/examples/1xt2x/.gitignore
@ -0,0 +1 @@
+tmp
--- a/examples/1xt2x/README.md
+++ b/examples/1xt2x/README.md
@ -0,0 +1,11 @@
+# 1xt2x
+
+Convert Deepspeech 1.8 released model to 2.x.
+
+## Model
+* Deepspeech2x
+
+## Exp
+* baidu_en8k
+* aishell
+* librispeech
--- a/examples/1xt2x/aishell/.gitignore
+++ b/examples/1xt2x/aishell/.gitignore
@ -0,0 +1,4 @@
+exp
+data
+*log
+tmp
--- a/examples/1xt2x/aishell/conf/augmentation.json
+++ b/examples/1xt2x/aishell/conf/augmentation.json
@ -0,0 +1 @@
+[]
--- a/Show More
+++ b/Show More