Merge branch 'develop' of https://github.com/PaddlePaddle/DeepSpeech into fix_bug

pull/853/head
huangyuxin 4 years ago
commit f4e59293bf

.gitignore

@@ -18,5 +18,7 @@ tools/sox-14.4.2
tools/soxbindings
tools/montreal-forced-aligner/
tools/Montreal-Forced-Aligner/
tools/sctk
tools/sctk-20159b5/
*output/

@@ -1,605 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "academic-surname",
"metadata": {},
"outputs": [],
"source": [
"import paddle\n",
"from paddle import nn"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "fundamental-treasure",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/workspace/DeepSpeech-2.x/tools/venv-dev/lib/python3.7/site-packages/ipykernel/ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.\n",
" and should_run_async(code)\n"
]
}
],
"source": [
"L = nn.Linear(256, 2048)\n",
"L2 = nn.Linear(2048, 256)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "consolidated-elephant",
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import torch\n"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "moderate-noise",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"float64\n",
"Tensor(shape=[2, 51, 256], dtype=float32, place=CUDAPlace(0), stop_gradient=True,\n",
" [[[-1.54171216, -2.61531472, -1.79881978, ..., -0.31395876, 0.56513089, -0.44516513],\n",
" [-0.79492962, 1.91157901, 0.66567147, ..., 0.54825783, -1.01471853, -0.84924090],\n",
" [-1.22556651, -0.36225814, 0.65063190, ..., 0.65726501, 0.05563191, 0.09009409],\n",
" ...,\n",
" [ 0.38615900, -0.77905393, 0.99732304, ..., -1.38463700, -3.32365036, -1.31089687],\n",
" [ 0.05579993, 0.06885809, -1.66662002, ..., -0.23346378, -3.29372883, 1.30561364],\n",
" [ 1.90676069, 1.95093191, -0.28849599, ..., -0.06860496, 0.95347673, 1.00475824]],\n",
"\n",
" [[-0.91453546, 0.55298805, -1.06146812, ..., -0.86378336, 1.00454640, 1.26062179],\n",
" [ 0.10223761, 0.81301165, 2.36865163, ..., 0.16821407, 0.29240361, 1.05408621],\n",
" [-1.33196676, 1.94433689, 0.01934209, ..., 0.48036841, 0.51585966, 1.22893548],\n",
" ...,\n",
" [-0.19558455, -0.47075930, 0.90796155, ..., -1.28598249, -0.24321797, 0.17734711],\n",
" [ 0.89819717, -1.39516675, 0.17138045, ..., 2.39761519, 1.76364994, -0.52177650],\n",
" [ 0.94122332, -0.18581429, 1.36099780, ..., 0.67647684, -0.04699665, 1.51205540]]])\n",
"tensor([[[-1.5417, -2.6153, -1.7988, ..., -0.3140, 0.5651, -0.4452],\n",
" [-0.7949, 1.9116, 0.6657, ..., 0.5483, -1.0147, -0.8492],\n",
" [-1.2256, -0.3623, 0.6506, ..., 0.6573, 0.0556, 0.0901],\n",
" ...,\n",
" [ 0.3862, -0.7791, 0.9973, ..., -1.3846, -3.3237, -1.3109],\n",
" [ 0.0558, 0.0689, -1.6666, ..., -0.2335, -3.2937, 1.3056],\n",
" [ 1.9068, 1.9509, -0.2885, ..., -0.0686, 0.9535, 1.0048]],\n",
"\n",
" [[-0.9145, 0.5530, -1.0615, ..., -0.8638, 1.0045, 1.2606],\n",
" [ 0.1022, 0.8130, 2.3687, ..., 0.1682, 0.2924, 1.0541],\n",
" [-1.3320, 1.9443, 0.0193, ..., 0.4804, 0.5159, 1.2289],\n",
" ...,\n",
" [-0.1956, -0.4708, 0.9080, ..., -1.2860, -0.2432, 0.1773],\n",
" [ 0.8982, -1.3952, 0.1714, ..., 2.3976, 1.7636, -0.5218],\n",
" [ 0.9412, -0.1858, 1.3610, ..., 0.6765, -0.0470, 1.5121]]])\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/workspace/DeepSpeech-2.x/tools/venv-dev/lib/python3.7/site-packages/ipykernel/ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.\n",
" and should_run_async(code)\n"
]
}
],
"source": [
"x = np.random.randn(2, 51, 256)\n",
"print(x.dtype)\n",
"px = paddle.to_tensor(x, dtype='float32')\n",
"tx = torch.tensor(x, dtype=torch.float32)\n",
"print(px)\n",
"print(tx)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "cooked-progressive",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 5,
"id": "mechanical-prisoner",
"metadata": {},
"outputs": [],
"source": [
"data = np.load('enc_0_ff_out.npz', allow_pickle=True)\n",
"t_norm_ff = data['norm_ff']\n",
"t_ff_out = data['ff_out']\n",
"t_ff_l_x = data['ff_l_x']\n",
"t_ff_l_a_x = data['ff_l_a_x']\n",
"t_ff_l_a_l_x = data['ff_l_a_l_x']\n",
"t_ps = data['ps']"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "indie-marriage",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 6,
"id": "assured-zambia",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"True\n",
"True\n",
"True\n",
"True\n"
]
}
],
"source": [
"L.set_state_dict({'weight': t_ps[0].T, 'bias': t_ps[1]})\n",
"L2.set_state_dict({'weight': t_ps[2].T, 'bias': t_ps[3]})\n",
"\n",
"ps = []\n",
"for n, p in L.named_parameters():\n",
" ps.append(p)\n",
"\n",
"for n, p in L2.state_dict().items():\n",
" ps.append(p)\n",
" \n",
"for p, tp in zip(ps, t_ps):\n",
" print(np.allclose(p.numpy(), tp.T))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "committed-jacob",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "extreme-traffic",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "optimum-milwaukee",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 7,
"id": "viral-indian",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"True\n",
"True\n",
"True\n",
"True\n"
]
}
],
"source": [
"# data = np.load('enc_0_ff_out.npz', allow_pickle=True)\n",
"# t_norm_ff = data['norm_ff']\n",
"# t_ff_out = data['ff_out']\n",
"# t_ff_l_x = data['ff_l_x']\n",
"# t_ff_l_a_x = data['ff_l_a_x']\n",
"# t_ff_l_a_l_x = data['ff_l_a_l_x']\n",
"# t_ps = data['ps']\n",
"TL = torch.nn.Linear(256, 2048)\n",
"TL2 = torch.nn.Linear(2048, 256)\n",
"TL.load_state_dict({'weight': torch.tensor(t_ps[0]), 'bias': torch.tensor(t_ps[1])})\n",
"TL2.load_state_dict({'weight': torch.tensor(t_ps[2]), 'bias': torch.tensor(t_ps[3])})\n",
"\n",
"# for n, p in TL.named_parameters():\n",
"# print(n, p)\n",
"# for n, p in TL2.named_parameters():\n",
"# print(n, p)\n",
"\n",
"ps = []\n",
"for n, p in TL.state_dict().items():\n",
" ps.append(p.data.numpy())\n",
" \n",
"for n, p in TL2.state_dict().items():\n",
" ps.append(p.data.numpy())\n",
" \n",
"for p, tp in zip(ps, t_ps):\n",
" print(np.allclose(p, tp))"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "skilled-vietnamese",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[[[ 0.67277956 0.08313607 -0.62761104 ... -0.17480263 0.42718208\n",
" -0.5787626 ]\n",
" [ 0.91516656 0.5393416 1.7159258 ... 0.06144593 0.06486575\n",
" -0.03350811]\n",
" [ 0.438351 0.6227843 0.24096036 ... 1.0912522 -0.90929437\n",
" -1.012989 ]\n",
" ...\n",
" [ 0.68631977 0.14240924 0.10763275 ... -0.11513516 0.48065388\n",
" 0.04070369]\n",
" [-0.9525228 0.23197874 0.31264272 ... 0.5312439 0.18773697\n",
" -0.8450228 ]\n",
" [ 0.42024016 -0.04561988 0.54541194 ... -0.41933843 -0.00436018\n",
" -0.06663495]]\n",
"\n",
" [[-0.11638781 -0.33566502 -0.20887226 ... 0.17423287 -0.9195841\n",
" -0.8161046 ]\n",
" [-0.3469874 0.88269687 -0.11887559 ... -0.15566081 0.16357468\n",
" -0.20766167]\n",
" [-0.3847657 0.3984318 -0.06963477 ... -0.00360622 1.2360432\n",
" -0.26811332]\n",
" ...\n",
" [ 0.08230796 -0.46158582 0.54582864 ... 0.15747628 -0.44790155\n",
" 0.06020184]\n",
" [-0.8095085 0.43163058 -0.42837143 ... 0.8627463 0.90656304\n",
" 0.15847842]\n",
" [-1.485811 -0.18216592 -0.8882585 ... 0.32596245 0.7822631\n",
" -0.6460344 ]]]\n",
"[[[ 0.67278004 0.08313602 -0.6276114 ... -0.17480245 0.42718196\n",
" -0.5787625 ]\n",
" [ 0.91516703 0.5393413 1.7159253 ... 0.06144581 0.06486579\n",
" -0.03350812]\n",
" [ 0.43835106 0.62278455 0.24096027 ... 1.0912521 -0.9092943\n",
" -1.0129892 ]\n",
" ...\n",
" [ 0.6863195 0.14240888 0.10763284 ... -0.11513527 0.48065376\n",
" 0.04070365]\n",
" [-0.9525231 0.23197863 0.31264275 ... 0.53124386 0.18773702\n",
" -0.84502304]\n",
" [ 0.42024007 -0.04561983 0.545412 ... -0.41933888 -0.00436005\n",
" -0.066635 ]]\n",
"\n",
" [[-0.11638767 -0.33566508 -0.20887226 ... 0.17423296 -0.9195838\n",
" -0.8161046 ]\n",
" [-0.34698725 0.88269705 -0.11887549 ... -0.15566081 0.16357464\n",
" -0.20766166]\n",
" [-0.3847657 0.3984319 -0.06963488 ... -0.00360619 1.2360426\n",
" -0.26811326]\n",
" ...\n",
" [ 0.08230786 -0.4615857 0.5458287 ... 0.15747619 -0.44790167\n",
" 0.06020182]\n",
" [-0.8095083 0.4316307 -0.42837155 ... 0.862746 0.9065631\n",
" 0.15847899]\n",
" [-1.485811 -0.18216613 -0.8882584 ... 0.32596254 0.7822631\n",
" -0.6460344 ]]]\n",
"True\n",
"False\n"
]
}
],
"source": [
"y = L(px)\n",
"print(y.numpy())\n",
"\n",
"ty = TL(tx)\n",
"print(ty.data.numpy())\n",
"print(np.allclose(px.numpy(), tx.detach().numpy()))\n",
"print(np.allclose(y.numpy(), ty.detach().numpy()))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "incorrect-allah",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "prostate-cameroon",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 9,
"id": "governmental-surge",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[[ 0.04476918 0.554463 -0.3027508 ... -0.49600336 0.3751858\n",
" 0.8254095 ]\n",
" [ 0.95594174 -0.29528382 -1.2899452 ... 0.43718258 0.05584608\n",
" -0.06974669]]\n",
"[[ 0.04476918 0.5544631 -0.3027507 ... -0.49600336 0.37518573\n",
" 0.8254096 ]\n",
" [ 0.95594174 -0.29528376 -1.2899454 ... 0.4371827 0.05584623\n",
" -0.0697467 ]]\n",
"True\n",
"False\n",
"True\n"
]
}
],
"source": [
"x = np.random.randn(2, 256)\n",
"px = paddle.to_tensor(x, dtype='float32')\n",
"tx = torch.tensor(x, dtype=torch.float32)\n",
"y = L(px)\n",
"print(y.numpy())\n",
"ty = TL(tx)\n",
"print(ty.data.numpy())\n",
"print(np.allclose(px.numpy(), tx.detach().numpy()))\n",
"print(np.allclose(y.numpy(), ty.detach().numpy()))\n",
"print(np.allclose(y.numpy(), ty.detach().numpy(), atol=1e-5))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "confidential-jacket",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 10,
"id": "improved-civilization",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"5e7e7c9fde8350084abf1898cf52651cfc84b17a\n"
]
}
],
"source": [
"print(paddle.version.commit)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "d1e2d3b4",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['__builtins__',\n",
" '__cached__',\n",
" '__doc__',\n",
" '__file__',\n",
" '__loader__',\n",
" '__name__',\n",
" '__package__',\n",
" '__spec__',\n",
" 'commit',\n",
" 'full_version',\n",
" 'istaged',\n",
" 'major',\n",
" 'minor',\n",
" 'mkl',\n",
" 'patch',\n",
" 'rc',\n",
" 'show',\n",
" 'with_mkl']"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dir(paddle.version)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "c880c719",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2.1.0\n"
]
}
],
"source": [
"print(paddle.version.full_version)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "f26977bf",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"commit: 5e7e7c9fde8350084abf1898cf52651cfc84b17a\n",
"None\n"
]
}
],
"source": [
"print(paddle.version.show())"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "04ad47f6",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1.6.0\n"
]
}
],
"source": [
"print(torch.__version__)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "e1e03830",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['__builtins__',\n",
" '__cached__',\n",
" '__doc__',\n",
" '__file__',\n",
" '__loader__',\n",
" '__name__',\n",
" '__package__',\n",
" '__spec__',\n",
" '__version__',\n",
" 'cuda',\n",
" 'debug',\n",
" 'git_version',\n",
" 'hip']"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dir(torch.version)"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "4ad0389b",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'b31f58de6fa8bbda5353b3c77d9be4914399724d'"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"torch.version.git_version"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "7870ea10",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'10.2'"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"torch.version.cuda"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "db8ee5a7",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "6321ec2a",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.0"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large

File diff suppressed because one or more lines are too long

@@ -1,389 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "emerging-meter",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/paddle/fluid/layers/utils.py:26: DeprecationWarning: `np.int` is a deprecated alias for the builtin `int`. To silence this warning, use `int` by itself. Doing this will not modify any behavior and is safe. When replacing `np.int`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.\n",
"Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
" def convert_to_list(value, n, name, dtype=np.int):\n",
"/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/scipy/fftpack/__init__.py:103: DeprecationWarning: The module numpy.dual is deprecated. Instead of using dual, use the functions directly from numpy or scipy.\n",
" from numpy.dual import register_func\n",
"/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/scipy/special/orthogonal.py:81: DeprecationWarning: `np.int` is a deprecated alias for the builtin `int`. To silence this warning, use `int` by itself. Doing this will not modify any behavior and is safe. When replacing `np.int`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.\n",
"Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
" from numpy import (exp, inf, pi, sqrt, floor, sin, cos, around, int,\n",
"/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/numba/core/types/__init__.py:108: DeprecationWarning: `np.long` is a deprecated alias for `np.compat.long`. To silence this warning, use `np.compat.long` by itself. In the likely event your code does not need to work on Python 2 you can use the builtin `int` for which `np.compat.long` is itself an alias. Doing this will not modify any behaviour and is safe. When replacing `np.long`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.\n",
"Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
" long_ = _make_signed(np.long)\n",
"/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/numba/core/types/__init__.py:109: DeprecationWarning: `np.long` is a deprecated alias for `np.compat.long`. To silence this warning, use `np.compat.long` by itself. In the likely event your code does not need to work on Python 2 you can use the builtin `int` for which `np.compat.long` is itself an alias. Doing this will not modify any behaviour and is safe. When replacing `np.long`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.\n",
"Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
" ulong = _make_unsigned(np.long)\n"
]
}
],
"source": [
"import math\n",
"import random\n",
"import tarfile\n",
"import logging\n",
"import numpy as np\n",
"from collections import namedtuple\n",
"from functools import partial\n",
"\n",
"import paddle\n",
"from paddle.io import Dataset\n",
"from paddle.io import DataLoader\n",
"from paddle.io import BatchSampler\n",
"from paddle.io import DistributedBatchSampler\n",
"from paddle import distributed as dist\n",
"\n",
"from data_utils.utility import read_manifest\n",
"from data_utils.augmentor.augmentation import AugmentationPipeline\n",
"from data_utils.featurizer.speech_featurizer import SpeechFeaturizer\n",
"from data_utils.speech import SpeechSegment\n",
"from data_utils.normalizer import FeatureNormalizer\n",
"\n",
"\n",
"from data_utils.dataset import (\n",
" DeepSpeech2Dataset,\n",
" DeepSpeech2DistributedBatchSampler,\n",
" DeepSpeech2BatchSampler,\n",
" SpeechCollator,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "excessive-american",
"metadata": {},
"outputs": [],
"source": [
"def create_dataloader(manifest_path,\t\n",
" vocab_filepath,\t\n",
" mean_std_filepath,\t\n",
" augmentation_config='{}',\t\n",
" max_duration=float('inf'),\t\n",
" min_duration=0.0,\t\n",
" stride_ms=10.0,\t\n",
" window_ms=20.0,\t\n",
" max_freq=None,\t\n",
" specgram_type='linear',\t\n",
" use_dB_normalization=True,\t\n",
" random_seed=0,\t\n",
" keep_transcription_text=False,\t\n",
" is_training=False,\t\n",
" batch_size=1,\t\n",
" num_workers=0,\t\n",
" sortagrad=False,\t\n",
" shuffle_method=None,\t\n",
" dist=False):\t\n",
"\n",
" dataset = DeepSpeech2Dataset(\t\n",
" manifest_path,\t\n",
" vocab_filepath,\t\n",
" mean_std_filepath,\t\n",
" augmentation_config=augmentation_config,\t\n",
" max_duration=max_duration,\t\n",
" min_duration=min_duration,\t\n",
" stride_ms=stride_ms,\t\n",
" window_ms=window_ms,\t\n",
" max_freq=max_freq,\t\n",
" specgram_type=specgram_type,\t\n",
" use_dB_normalization=use_dB_normalization,\t\n",
" random_seed=random_seed,\t\n",
" keep_transcription_text=keep_transcription_text)\t\n",
"\n",
" if dist:\t\n",
" batch_sampler = DeepSpeech2DistributedBatchSampler(\t\n",
" dataset,\t\n",
" batch_size,\t\n",
" num_replicas=None,\t\n",
" rank=None,\t\n",
" shuffle=is_training,\t\n",
" drop_last=is_training,\t\n",
" sortagrad=is_training,\t\n",
" shuffle_method=shuffle_method)\t\n",
" else:\t\n",
" batch_sampler = DeepSpeech2BatchSampler(\t\n",
" dataset,\t\n",
" shuffle=is_training,\t\n",
" batch_size=batch_size,\t\n",
" drop_last=is_training,\t\n",
" sortagrad=is_training,\t\n",
" shuffle_method=shuffle_method)\t\n",
"\n",
" def padding_batch(batch, padding_to=-1, flatten=False, is_training=True):\t\n",
" \"\"\"\t\n",
" Padding audio features with zeros to make them have the same shape (or\t\n",
" a user-defined shape) within one bach.\t\n",
"\n",
" If ``padding_to`` is -1, the maximun shape in the batch will be used\t\n",
" as the target shape for padding. Otherwise, `padding_to` will be the\t\n",
" target shape (only refers to the second axis).\t\n",
"\n",
" If `flatten` is True, features will be flatten to 1darray.\t\n",
" \"\"\"\t\n",
" new_batch = []\t\n",
" # get target shape\t\n",
" max_length = max([audio.shape[1] for audio, text in batch])\t\n",
" if padding_to != -1:\t\n",
" if padding_to < max_length:\t\n",
" raise ValueError(\"If padding_to is not -1, it should be larger \"\t\n",
" \"than any instance's shape in the batch\")\t\n",
" max_length = padding_to\t\n",
" max_text_length = max([len(text) for audio, text in batch])\t\n",
" # padding\t\n",
" padded_audios = []\t\n",
" audio_lens = []\t\n",
" texts, text_lens = [], []\t\n",
" for audio, text in batch:\t\n",
" padded_audio = np.zeros([audio.shape[0], max_length])\t\n",
" padded_audio[:, :audio.shape[1]] = audio\t\n",
" if flatten:\t\n",
" padded_audio = padded_audio.flatten()\t\n",
" padded_audios.append(padded_audio)\t\n",
" audio_lens.append(audio.shape[1])\t\n",
"\n",
" padded_text = np.zeros([max_text_length])\n",
" if is_training:\n",
" padded_text[:len(text)] = text\t# ids\n",
" else:\n",
" padded_text[:len(text)] = [ord(t) for t in text] # string\n",
" \n",
" texts.append(padded_text)\t\n",
" text_lens.append(len(text))\t\n",
"\n",
" padded_audios = np.array(padded_audios).astype('float32')\t\n",
" audio_lens = np.array(audio_lens).astype('int64')\t\n",
" texts = np.array(texts).astype('int32')\t\n",
" text_lens = np.array(text_lens).astype('int64')\t\n",
" return padded_audios, texts, audio_lens, text_lens\t\n",
"\n",
" loader = DataLoader(\t\n",
" dataset,\t\n",
" batch_sampler=batch_sampler,\t\n",
" collate_fn=partial(padding_batch, is_training=is_training),\t\n",
" num_workers=num_workers)\t\n",
" return loader"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "naval-brave",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'num_samples': 5, 'beam_size': 500, 'num_proc_bsearch': 8, 'num_conv_layers': 2, 'num_rnn_layers': 3, 'rnn_layer_size': 2048, 'alpha': 2.5, 'beta': 0.3, 'cutoff_prob': 1.0, 'cutoff_top_n': 40, 'use_gru': False, 'use_gpu': True, 'share_rnn_weights': True, 'infer_manifest': 'examples/aishell/data/manifest.dev', 'mean_std_path': 'examples/aishell/data/mean_std.npz', 'vocab_path': 'examples/aishell/data/vocab.txt', 'lang_model_path': 'models/lm/common_crawl_00.prune01111.trie.klm', 'model_path': 'examples/aishell/checkpoints/step_final', 'decoding_method': 'ctc_beam_search', 'error_rate_type': 'wer', 'specgram_type': 'linear'}\n"
]
}
],
"source": [
"import sys\n",
"import argparse\n",
"import functools\n",
"from utils.utility import add_arguments, print_arguments\n",
"parser = argparse.ArgumentParser(description=__doc__)\n",
"add_arg = functools.partial(add_arguments, argparser=parser)\n",
"# yapf: disable\n",
"add_arg('num_samples', int, 5, \"# of samples to infer.\")\n",
"add_arg('beam_size', int, 500, \"Beam search width.\")\n",
"add_arg('num_proc_bsearch', int, 8, \"# of CPUs for beam search.\")\n",
"add_arg('num_conv_layers', int, 2, \"# of convolution layers.\")\n",
"add_arg('num_rnn_layers', int, 3, \"# of recurrent layers.\")\n",
"add_arg('rnn_layer_size', int, 2048, \"# of recurrent cells per layer.\")\n",
"add_arg('alpha', float, 2.5, \"Coef of LM for beam search.\")\n",
"add_arg('beta', float, 0.3, \"Coef of WC for beam search.\")\n",
"add_arg('cutoff_prob', float, 1.0, \"Cutoff probability for pruning.\")\n",
"add_arg('cutoff_top_n', int, 40, \"Cutoff number for pruning.\")\n",
"add_arg('use_gru', bool, False, \"Use GRUs instead of simple RNNs.\")\n",
"add_arg('use_gpu', bool, True, \"Use GPU or not.\")\n",
"add_arg('share_rnn_weights',bool, True, \"Share input-hidden weights across \"\n",
" \"bi-directional RNNs. Not for GRU.\")\n",
"add_arg('infer_manifest', str,\n",
" 'examples/aishell/data/manifest.dev',\n",
" \"Filepath of manifest to infer.\")\n",
"add_arg('mean_std_path', str,\n",
" 'examples/aishell/data/mean_std.npz',\n",
" \"Filepath of normalizer's mean & std.\")\n",
"add_arg('vocab_path', str,\n",
" 'examples/aishell/data/vocab.txt',\n",
" \"Filepath of vocabulary.\")\n",
"add_arg('lang_model_path', str,\n",
" 'models/lm/common_crawl_00.prune01111.trie.klm',\n",
" \"Filepath for language model.\")\n",
"add_arg('model_path', str,\n",
" 'examples/aishell/checkpoints/step_final',\n",
" \"If None, the training starts from scratch, \"\n",
" \"otherwise, it resumes from the pre-trained model.\")\n",
"add_arg('decoding_method', str,\n",
" 'ctc_beam_search',\n",
" \"Decoding method. Options: ctc_beam_search, ctc_greedy\",\n",
" choices = ['ctc_beam_search', 'ctc_greedy'])\n",
"add_arg('error_rate_type', str,\n",
" 'wer',\n",
" \"Error rate type for evaluation.\",\n",
" choices=['wer', 'cer'])\n",
"add_arg('specgram_type', str,\n",
" 'linear',\n",
" \"Audio feature type. Options: linear, mfcc.\",\n",
" choices=['linear', 'mfcc'])\n",
"# yapf: disable\n",
"args = parser.parse_args([])\n",
"print(vars(args))"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "bearing-physics",
"metadata": {},
"outputs": [],
"source": [
"batch_reader = create_dataloader(\n",
" manifest_path=args.infer_manifest,\n",
" vocab_filepath=args.vocab_path,\n",
" mean_std_filepath=args.mean_std_path,\n",
" augmentation_config='{}',\n",
" #max_duration=float('inf'),\n",
" max_duration=27.0,\n",
" min_duration=0.0,\n",
" stride_ms=10.0,\n",
" window_ms=20.0,\n",
" max_freq=None,\n",
" specgram_type=args.specgram_type,\n",
" use_dB_normalization=True,\n",
" random_seed=0,\n",
" keep_transcription_text=True,\n",
" is_training=False,\n",
" batch_size=args.num_samples,\n",
" sortagrad=True,\n",
" shuffle_method=None,\n",
" dist=False)"
]
},
{
"cell_type": "code",
"execution_count": 30,
"id": "classified-melissa",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"test Tensor(shape=[5, 6], dtype=int32, place=CUDAPinnedPlace, stop_gradient=True,\n",
" [[22823, 26102, 20195, 37324, 0 , 0 ],\n",
" [22238, 26469, 23601, 22909, 0 , 0 ],\n",
" [20108, 26376, 22235, 26085, 0 , 0 ],\n",
" [36824, 35201, 20445, 25345, 32654, 24863],\n",
" [29042, 27748, 21463, 23456, 0 , 0 ]])\n",
"test raw 大时代里\n",
"test raw 煲汤受宠\n",
"audio len Tensor(shape=[5], dtype=int64, place=CUDAPinnedPlace, stop_gradient=True,\n",
" [163, 167, 180, 186, 186])\n",
"test len Tensor(shape=[5], dtype=int64, place=CUDAPlace(0), stop_gradient=True,\n",
" [4, 4, 4, 6, 4])\n",
"audio Tensor(shape=[5, 161, 186], dtype=float32, place=CUDAPinnedPlace, stop_gradient=True,\n",
" [[[ 1.11669052, 0.79015088, 0.93658292, ..., 0. , 0. , 0. ],\n",
" [ 0.83549136, 0.72643483, 0.83578080, ..., 0. , 0. , 0. ],\n",
" [-0.89155018, -0.18894747, -0.53357804, ..., 0. , 0. , 0. ],\n",
" ...,\n",
" [ 0.33386710, -0.81240511, 0.12869737, ..., 0. , 0. , 0. ],\n",
" [-0.17537928, 0.58380985, 0.70696265, ..., 0. , 0. , 0. ],\n",
" [-0.84175998, 1.22041416, 0.07929770, ..., 0. , 0. , 0. ]],\n",
"\n",
" [[-0.35964420, 0.77392709, 0.71409988, ..., 0. , 0. , 0. ],\n",
" [-0.15990183, 0.42962283, 0.06222462, ..., 0. , 0. , 0. ],\n",
" [-0.31166190, -0.74864638, -0.52836996, ..., 0. , 0. , 0. ],\n",
" ...,\n",
" [-0.27546275, 0.32889456, 0.12410031, ..., 0. , 0. , 0. ],\n",
" [ 0.16264282, 0.49418071, -0.15960945, ..., 0. , 0. , 0. ],\n",
" [ 0.12476666, 0.00516864, 1.16021466, ..., 0. , 0. , 0. ]],\n",
"\n",
" [[ 0.90202141, 1.48541915, 0.92062062, ..., 0. , 0. , 0. ],\n",
" [ 0.82661545, 1.37171340, 0.86746097, ..., 0. , 0. , 0. ],\n",
" [-0.62287915, -0.48645937, 0.35041964, ..., 0. , 0. , 0. ],\n",
" ...,\n",
" [ 0.07376949, 0.07138316, 0.76355994, ..., 0. , 0. , 0. ],\n",
" [-0.32306790, 0.43247896, 1.27311838, ..., 0. , 0. , 0. ],\n",
" [-0.97667056, 0.60747612, 0.79181534, ..., 0. , 0. , 0. ]],\n",
"\n",
" [[ 0.72022128, 0.95428467, 0.92766261, ..., 0.29105374, -0.45564806, -0.62151009],\n",
" [ 0.42083180, 0.49279949, 0.82724041, ..., -0.17333922, -1.45363355, -0.61673522],\n",
" [-0.76116520, -0.84750438, -0.09512503, ..., -1.01497340, -1.42781055, -0.80859023],\n",
" ...,\n",
" [-0.23009977, 1.06155431, 1.09065628, ..., 0.25581080, 0.53794998, -1.22650719],\n",
" [-1.37693381, 0.30778193, 0.17152318, ..., 0.51650339, 0.25580606, 0.83097816],\n",
" [-1.62180591, 1.30567718, 1.09928656, ..., -0.77590007, 1.27712476, 0.53189957]],\n",
"\n",
" [[ 1.03205252, -0.51535392, 0.21077573, ..., 0.76618457, 1.27425683, 1.52250278],\n",
" [ 0.82059991, 0.43990925, 0.13090958, ..., 0.86662549, 1.01687658, 1.48495352],\n",
" [-0.75489789, -0.01997089, -0.65174174, ..., 0.09061214, -0.55211234, -0.01614586],\n",
" ...,\n",
" [ 0.50985396, 1.84555030, 0.79185146, ..., 1.13666189, 1.19898069, 1.98158395],\n",
" [ 1.98721015, 2.52385354, 1.11714780, ..., 0.19416514, 1.11329341, 0.64460152],\n",
" [ 2.69512844, 1.90993905, 0.50245082, ..., -0.50902629, 0.03333465, -1.24584770]]])\n"
]
}
],
"source": [
"for idx, (audio, audio_len, text, text_len) in enumerate(batch_reader()):\n",
" print('test', text)\n",
" print(\"test raw\", ''.join( chr(i) for i in text[0][:int(text_len[0])] ))\n",
" print(\"test raw\", ''.join( chr(i) for i in text[-1][:int(text_len[-1])] ))\n",
" print('audio len', audio_len)\n",
" print('test len', text_len)\n",
" print('audio', audio)\n",
" break"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "unexpected-skating",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "minus-modern",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.0"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

File diff suppressed because it is too large

File diff suppressed because it is too large

@@ -1,290 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "breeding-haven",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"/home/ssd5/zhanghui/DeepSpeech2.x\n"
]
},
{
"data": {
"text/plain": [
"'/home/ssd5/zhanghui/DeepSpeech2.x'"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%cd ..\n",
"%pwd"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "appropriate-theta",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"LICENSE deepspeech examples\t\t requirements.txt tools\r\n",
"README.md docs\t libsndfile-1.0.28\t setup.sh\t utils\r\n",
"README_cn.md env.sh\t libsndfile-1.0.28.tar.gz tests\r\n"
]
}
],
"source": [
"!ls"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "entire-bloom",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/ssd5/zhanghui/DeepSpeech2.x/tools/venv/lib/python3.7/site-packages/paddle/fluid/layers/utils.py:26: DeprecationWarning: `np.int` is a deprecated alias for the builtin `int`. To silence this warning, use `int` by itself. Doing this will not modify any behavior and is safe. When replacing `np.int`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.\n",
"Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
" def convert_to_list(value, n, name, dtype=np.int):\n",
"WARNING:root:override cat of paddle.Tensor if exists or register, remove this when fixed!\n",
"WARNING:root:register user masked_fill to paddle.Tensor, remove this when fixed!\n",
"WARNING:root:register user masked_fill_ to paddle.Tensor, remove this when fixed!\n",
"WARNING:root:register user repeat to paddle.Tensor, remove this when fixed!\n",
"WARNING:root:register user glu to paddle.nn.functional, remove this when fixed!\n",
"WARNING:root:register user GLU to paddle.nn, remove this when fixed!\n",
"WARNING:root:register user ConstantPad2d to paddle.nn, remove this when fixed!\n",
"WARNING:root:override ctc_loss of paddle.nn.functional if exists, remove this when fixed!\n"
]
}
],
"source": [
"from deepspeech.modules import loss"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "governmental-aircraft",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/ssd5/zhanghui/DeepSpeech2.x/tools/venv/lib/python3.7/site-packages/ipykernel/ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.\n",
" and should_run_async(code)\n"
]
}
],
"source": [
"import paddle"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "proprietary-disaster",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<function deepspeech.modules.repeat(xs: paddle.VarBase, *size: Any) -> paddle.VarBase>"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"paddle.Tensor.repeat"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "first-diagram",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<property at 0x7fb515eeeb88>"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"paddle.Tensor.size"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "intelligent-david",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<function paddle.tensor.manipulation.concat(x, axis=0, name=None)>"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"paddle.Tensor.cat"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "bronze-tenant",
"metadata": {},
"outputs": [],
"source": [
"a = paddle.to_tensor([12,32, 10, 12, 123,32 ,4])"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "balanced-bearing",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"7"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"a.size"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "extreme-republic",
"metadata": {},
"outputs": [],
"source": [
"def size(xs: paddle.Tensor, *args: int) -> paddle.Tensor:\n",
" nargs = len(args)\n",
" assert (nargs <= 1)\n",
" s = paddle.shape(xs)\n",
" if nargs == 1:\n",
" return s[args[0]]\n",
" else:\n",
" return s\n",
"\n",
"# logger.warn(\n",
"# \"override size of paddle.Tensor if exists or register, remove this when fixed!\"\n",
"# )\n",
"paddle.Tensor.size = size"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "gross-addiction",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Tensor(shape=[1], dtype=int32, place=CPUPlace, stop_gradient=True,\n",
" [7])"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"a.size(0)\n",
"a.size()"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "adverse-dining",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Tensor(shape=[1], dtype=int32, place=CPUPlace, stop_gradient=True,\n",
" [7])"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"a.size()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "popular-potato",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.0"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

@@ -1,672 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"/home/ssd5/zhanghui/DeepSpeech2.x\n"
]
},
{
"data": {
"text/plain": [
"'/home/ssd5/zhanghui/DeepSpeech2.x'"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%cd ..\n",
"%pwd"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2021-03-26 02:55:23,873 - WARNING - register user softmax to paddle, remove this when fixed!\n",
"2021-03-26 02:55:23,875 - WARNING - register user sigmoid to paddle, remove this when fixed!\n",
"2021-03-26 02:55:23,875 - WARNING - register user relu to paddle, remove this when fixed!\n",
"2021-03-26 02:55:23,876 - WARNING - override cat of paddle if exists or register, remove this when fixed!\n",
"2021-03-26 02:55:23,876 - WARNING - override eq of paddle.Tensor if exists or register, remove this when fixed!\n",
"2021-03-26 02:55:23,877 - WARNING - override contiguous of paddle.Tensor if exists or register, remove this when fixed!\n",
"2021-03-26 02:55:23,877 - WARNING - override size of paddle.Tensor (`to_static` do not process `size` property, maybe some `paddle` api dependent on it), remove this when fixed!\n",
"2021-03-26 02:55:23,878 - WARNING - register user view to paddle.Tensor, remove this when fixed!\n",
"2021-03-26 02:55:23,878 - WARNING - register user view_as to paddle.Tensor, remove this when fixed!\n",
"2021-03-26 02:55:23,879 - WARNING - register user masked_fill to paddle.Tensor, remove this when fixed!\n",
"2021-03-26 02:55:23,880 - WARNING - register user masked_fill_ to paddle.Tensor, remove this when fixed!\n",
"2021-03-26 02:55:23,880 - WARNING - register user fill_ to paddle.Tensor, remove this when fixed!\n",
"2021-03-26 02:55:23,881 - WARNING - register user repeat to paddle.Tensor, remove this when fixed!\n",
"2021-03-26 02:55:23,881 - WARNING - register user softmax to paddle.Tensor, remove this when fixed!\n",
"2021-03-26 02:55:23,882 - WARNING - register user sigmoid to paddle.Tensor, remove this when fixed!\n",
"2021-03-26 02:55:23,882 - WARNING - register user relu to paddle.Tensor, remove this when fixed!\n",
"2021-03-26 02:55:23,883 - WARNING - register user glu to paddle.nn.functional, remove this when fixed!\n",
"2021-03-26 02:55:23,883 - WARNING - override ctc_loss of paddle.nn.functional if exists, remove this when fixed!\n",
"2021-03-26 02:55:23,884 - WARNING - register user GLU to paddle.nn, remove this when fixed!\n",
"2021-03-26 02:55:23,884 - WARNING - register user ConstantPad2d to paddle.nn, remove this when fixed!\n",
"/home/ssd5/zhanghui/DeepSpeech2.x/tools/venv-dev/lib/python3.7/site-packages/scipy/fftpack/__init__.py:103: DeprecationWarning: The module numpy.dual is deprecated. Instead of using dual, use the functions directly from numpy or scipy.\n",
" from numpy.dual import register_func\n",
"/home/ssd5/zhanghui/DeepSpeech2.x/tools/venv-dev/lib/python3.7/site-packages/scipy/special/orthogonal.py:81: DeprecationWarning: `np.int` is a deprecated alias for the builtin `int`. To silence this warning, use `int` by itself. Doing this will not modify any behavior and is safe. When replacing `np.int`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.\n",
"Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
" from numpy import (exp, inf, pi, sqrt, floor, sin, cos, around, int,\n"
]
}
],
"source": [
"import os\n",
"import time\n",
"import argparse\n",
"import functools\n",
"import paddle\n",
"import numpy as np\n",
"\n",
"from deepspeech.utils.socket_server import warm_up_test\n",
"from deepspeech.utils.socket_server import AsrTCPServer\n",
"from deepspeech.utils.socket_server import AsrRequestHandler\n",
"\n",
"from deepspeech.training.cli import default_argument_parser\n",
"from deepspeech.exps.deepspeech2.config import get_cfg_defaults\n",
"\n",
"from deepspeech.frontend.utility import read_manifest\n",
"from deepspeech.utils.utility import add_arguments, print_arguments\n",
"\n",
"from deepspeech.models.ds2 import DeepSpeech2Model\n",
"from deepspeech.models.ds2 import DeepSpeech2InferModel\n",
"from deepspeech.io.dataset import ManifestDataset\n",
"\n",
"\n",
"\n",
"from deepspeech.frontend.utility import read_manifest"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.0.0\n",
"e7f28d6c0db54eb9c9a810612300b526687e56a6\n",
"OFF\n",
"OFF\n",
"commit: e7f28d6c0db54eb9c9a810612300b526687e56a6\n",
"None\n",
"0\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/ssd5/zhanghui/DeepSpeech2.x/tools/venv-dev/lib/python3.7/site-packages/ipykernel/ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.\n",
" and should_run_async(code)\n"
]
},
{
"data": {
"text/plain": [
"['__builtins__',\n",
" '__cached__',\n",
" '__doc__',\n",
" '__file__',\n",
" '__loader__',\n",
" '__name__',\n",
" '__package__',\n",
" '__spec__',\n",
" 'commit',\n",
" 'full_version',\n",
" 'istaged',\n",
" 'major',\n",
" 'minor',\n",
" 'mkl',\n",
" 'patch',\n",
" 'rc',\n",
" 'show',\n",
" 'with_mkl']"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"print(paddle.__version__)\n",
"print(paddle.version.commit)\n",
"print(paddle.version.with_mkl)\n",
"print(paddle.version.mkl())\n",
"print(paddle.version.show())\n",
"print(paddle.version.patch)\n",
"dir(paddle.version)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"data:\n",
" augmentation_config: conf/augmentation.config\n",
" batch_size: 64\n",
" dev_manifest: data/manifest.dev\n",
" keep_transcription_text: False\n",
" max_duration: 27.0\n",
" max_freq: None\n",
" mean_std_filepath: examples/aishell/data/mean_std.npz\n",
" min_duration: 0.0\n",
" n_fft: None\n",
" num_workers: 0\n",
" random_seed: 0\n",
" shuffle_method: batch_shuffle\n",
" sortagrad: True\n",
" specgram_type: linear\n",
" stride_ms: 10.0\n",
" target_dB: -20\n",
" target_sample_rate: 16000\n",
" test_manifest: examples/aishell/data/manifest.test\n",
" train_manifest: data/manifest.train\n",
" use_dB_normalization: True\n",
" vocab_filepath: examples/aishell/data/vocab.txt\n",
" window_ms: 20.0\n",
"decoding:\n",
" alpha: 2.6\n",
" batch_size: 128\n",
" beam_size: 300\n",
" beta: 5.0\n",
" cutoff_prob: 0.99\n",
" cutoff_top_n: 40\n",
" decoding_method: ctc_beam_search\n",
" error_rate_type: cer\n",
" lang_model_path: data/lm/zh_giga.no_cna_cmn.prune01244.klm\n",
" num_proc_bsearch: 10\n",
"model:\n",
" num_conv_layers: 2\n",
" num_rnn_layers: 3\n",
" rnn_layer_size: 1024\n",
" share_rnn_weights: False\n",
" use_gru: True\n",
"training:\n",
" global_grad_clip: 5.0\n",
" lr: 0.0005\n",
" lr_decay: 0.83\n",
" n_epoch: 30\n",
" weight_decay: 1e-06\n",
"----------- Configuration Arguments -----------\n",
"checkpoint_path: examples/aishell/ckpt-loss2e-3-0.83-5/checkpoints/step-11725\n",
"config: examples/aishell/conf/deepspeech2.yaml\n",
"device: gpu\n",
"dump_config: None\n",
"export_path: None\n",
"host_ip: localhost\n",
"host_port: 8086\n",
"model_dir: None\n",
"model_file: examples/aishell/jit.model.pdmodel\n",
"nprocs: 1\n",
"opts: ['data.test_manifest', 'examples/aishell/data/manifest.test', 'data.mean_std_filepath', 'examples/aishell/data/mean_std.npz', 'data.vocab_filepath', 'examples/aishell/data/vocab.txt']\n",
"output: None\n",
"params_file: examples/aishell/jit.model.pdiparams\n",
"speech_save_dir: demo_cache\n",
"use_gpu: False\n",
"warmup_manifest: examples/aishell/data/manifest.test\n",
"------------------------------------------------\n"
]
}
],
"source": [
"parser = default_argument_parser()\n",
"add_arg = functools.partial(add_arguments, argparser=parser)\n",
"add_arg('host_ip', str,\n",
" 'localhost',\n",
" \"Server's IP address.\")\n",
"add_arg('host_port', int, 8086, \"Server's IP port.\")\n",
"add_arg('speech_save_dir', str,\n",
" 'demo_cache',\n",
" \"Directory to save demo audios.\")\n",
"add_arg('warmup_manifest', \n",
" str, \n",
" \"examples/aishell/data/manifest.test\", \n",
" \"Filepath of manifest to warm up.\")\n",
"add_arg(\n",
" \"--model_file\",\n",
" type=str,\n",
" default=\"examples/aishell/jit.model.pdmodel\",\n",
" help=\"Model filename, Specify this when your model is a combined model.\"\n",
")\n",
"add_arg(\n",
" \"--params_file\",\n",
" type=str,\n",
" default=\"examples/aishell/jit.model.pdiparams\",\n",
" help=\n",
" \"Parameter filename, Specify this when your model is a combined model.\"\n",
")\n",
"add_arg(\n",
" \"--model_dir\",\n",
" type=str,\n",
" default=None,\n",
" help=\n",
" \"Model dir, If you load a non-combined model, specify the directory of the model.\"\n",
")\n",
"add_arg(\"--use_gpu\",type=bool,default=False, help=\"Whether use gpu.\")\n",
"\n",
"\n",
"args = parser.parse_args(\n",
" \"--checkpoint_path examples/aishell/ckpt-loss2e-3-0.83-5/checkpoints/step-11725 --config examples/aishell/conf/deepspeech2.yaml --opts data.test_manifest examples/aishell/data/manifest.test data.mean_std_filepath examples/aishell/data/mean_std.npz data.vocab_filepath examples/aishell/data/vocab.txt\".split()\n",
")\n",
"\n",
"\n",
"config = get_cfg_defaults()\n",
"if args.config:\n",
" config.merge_from_file(args.config)\n",
"if args.opts:\n",
" config.merge_from_list(args.opts)\n",
"config.freeze()\n",
"print(config)\n",
"\n",
"args.warmup_manifest = config.data.test_manifest\n",
"\n",
"print_arguments(args)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"dataset = ManifestDataset(\n",
" config.data.test_manifest,\n",
" config.data.unit_type,\n",
" config.data.vocab_filepath,\n",
" config.data.mean_std_filepath,\n",
" augmentation_config=\"{}\",\n",
" max_duration=config.data.max_duration,\n",
" min_duration=config.data.min_duration,\n",
" stride_ms=config.data.stride_ms,\n",
" window_ms=config.data.window_ms,\n",
" n_fft=config.data.n_fft,\n",
" max_freq=config.data.max_freq,\n",
" target_sample_rate=config.data.target_sample_rate,\n",
" specgram_type=config.data.specgram_type,\n",
" feat_dim=config.data.feat_dim,\n",
" delta_delta=config.data.delat_delta,\n",
" use_dB_normalization=config.data.use_dB_normalization,\n",
" target_dB=config.data.target_dB,\n",
" random_seed=config.data.random_seed,\n",
" keep_transcription_text=True)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2021-03-26 02:55:57,930 - INFO - [checkpoint] Rank 0: loaded model from examples/aishell/ckpt-loss2e-3-0.83-5/checkpoints/step-11725.pdparams\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"layer summary:\n",
"encoder.conv.conv_in.conv.weight|[32, 1, 41, 11]|14432\n",
"encoder.conv.conv_in.bn.weight|[32]|32\n",
"encoder.conv.conv_in.bn.bias|[32]|32\n",
"encoder.conv.conv_in.bn._mean|[32]|32\n",
"encoder.conv.conv_in.bn._variance|[32]|32\n",
"encoder.conv.conv_stack.0.conv.weight|[32, 32, 21, 11]|236544\n",
"encoder.conv.conv_stack.0.bn.weight|[32]|32\n",
"encoder.conv.conv_stack.0.bn.bias|[32]|32\n",
"encoder.conv.conv_stack.0.bn._mean|[32]|32\n",
"encoder.conv.conv_stack.0.bn._variance|[32]|32\n",
"encoder.rnn.rnn_stacks.0.fw_fc.weight|[1312, 3072]|4030464\n",
"encoder.rnn.rnn_stacks.0.fw_bn.weight|[3072]|3072\n",
"encoder.rnn.rnn_stacks.0.fw_bn.bias|[3072]|3072\n",
"encoder.rnn.rnn_stacks.0.fw_bn._mean|[3072]|3072\n",
"encoder.rnn.rnn_stacks.0.fw_bn._variance|[3072]|3072\n",
"encoder.rnn.rnn_stacks.0.bw_fc.weight|[1312, 3072]|4030464\n",
"encoder.rnn.rnn_stacks.0.bw_bn.weight|[3072]|3072\n",
"encoder.rnn.rnn_stacks.0.bw_bn.bias|[3072]|3072\n",
"encoder.rnn.rnn_stacks.0.bw_bn._mean|[3072]|3072\n",
"encoder.rnn.rnn_stacks.0.bw_bn._variance|[3072]|3072\n",
"encoder.rnn.rnn_stacks.0.fw_cell.weight_hh|[3072, 1024]|3145728\n",
"encoder.rnn.rnn_stacks.0.fw_cell.bias_hh|[3072]|3072\n",
"encoder.rnn.rnn_stacks.0.bw_cell.weight_hh|[3072, 1024]|3145728\n",
"encoder.rnn.rnn_stacks.0.bw_cell.bias_hh|[3072]|3072\n",
"encoder.rnn.rnn_stacks.0.fw_rnn.cell.weight_hh|[3072, 1024]|3145728\n",
"encoder.rnn.rnn_stacks.0.fw_rnn.cell.bias_hh|[3072]|3072\n",
"encoder.rnn.rnn_stacks.0.bw_rnn.cell.weight_hh|[3072, 1024]|3145728\n",
"encoder.rnn.rnn_stacks.0.bw_rnn.cell.bias_hh|[3072]|3072\n",
"encoder.rnn.rnn_stacks.1.fw_fc.weight|[2048, 3072]|6291456\n",
"encoder.rnn.rnn_stacks.1.fw_bn.weight|[3072]|3072\n",
"encoder.rnn.rnn_stacks.1.fw_bn.bias|[3072]|3072\n",
"encoder.rnn.rnn_stacks.1.fw_bn._mean|[3072]|3072\n",
"encoder.rnn.rnn_stacks.1.fw_bn._variance|[3072]|3072\n",
"encoder.rnn.rnn_stacks.1.bw_fc.weight|[2048, 3072]|6291456\n",
"encoder.rnn.rnn_stacks.1.bw_bn.weight|[3072]|3072\n",
"encoder.rnn.rnn_stacks.1.bw_bn.bias|[3072]|3072\n",
"encoder.rnn.rnn_stacks.1.bw_bn._mean|[3072]|3072\n",
"encoder.rnn.rnn_stacks.1.bw_bn._variance|[3072]|3072\n",
"encoder.rnn.rnn_stacks.1.fw_cell.weight_hh|[3072, 1024]|3145728\n",
"encoder.rnn.rnn_stacks.1.fw_cell.bias_hh|[3072]|3072\n",
"encoder.rnn.rnn_stacks.1.bw_cell.weight_hh|[3072, 1024]|3145728\n",
"encoder.rnn.rnn_stacks.1.bw_cell.bias_hh|[3072]|3072\n",
"encoder.rnn.rnn_stacks.1.fw_rnn.cell.weight_hh|[3072, 1024]|3145728\n",
"encoder.rnn.rnn_stacks.1.fw_rnn.cell.bias_hh|[3072]|3072\n",
"encoder.rnn.rnn_stacks.1.bw_rnn.cell.weight_hh|[3072, 1024]|3145728\n",
"encoder.rnn.rnn_stacks.1.bw_rnn.cell.bias_hh|[3072]|3072\n",
"encoder.rnn.rnn_stacks.2.fw_fc.weight|[2048, 3072]|6291456\n",
"encoder.rnn.rnn_stacks.2.fw_bn.weight|[3072]|3072\n",
"encoder.rnn.rnn_stacks.2.fw_bn.bias|[3072]|3072\n",
"encoder.rnn.rnn_stacks.2.fw_bn._mean|[3072]|3072\n",
"encoder.rnn.rnn_stacks.2.fw_bn._variance|[3072]|3072\n",
"encoder.rnn.rnn_stacks.2.bw_fc.weight|[2048, 3072]|6291456\n",
"encoder.rnn.rnn_stacks.2.bw_bn.weight|[3072]|3072\n",
"encoder.rnn.rnn_stacks.2.bw_bn.bias|[3072]|3072\n",
"encoder.rnn.rnn_stacks.2.bw_bn._mean|[3072]|3072\n",
"encoder.rnn.rnn_stacks.2.bw_bn._variance|[3072]|3072\n",
"encoder.rnn.rnn_stacks.2.fw_cell.weight_hh|[3072, 1024]|3145728\n",
"encoder.rnn.rnn_stacks.2.fw_cell.bias_hh|[3072]|3072\n",
"encoder.rnn.rnn_stacks.2.bw_cell.weight_hh|[3072, 1024]|3145728\n",
"encoder.rnn.rnn_stacks.2.bw_cell.bias_hh|[3072]|3072\n",
"encoder.rnn.rnn_stacks.2.fw_rnn.cell.weight_hh|[3072, 1024]|3145728\n",
"encoder.rnn.rnn_stacks.2.fw_rnn.cell.bias_hh|[3072]|3072\n",
"encoder.rnn.rnn_stacks.2.bw_rnn.cell.weight_hh|[3072, 1024]|3145728\n",
"encoder.rnn.rnn_stacks.2.bw_rnn.cell.bias_hh|[3072]|3072\n",
"decoder.ctc_lo.weight|[2048, 4300]|8806400\n",
"decoder.ctc_lo.bias|[4300]|4300\n",
"layer has 66 parameters, 80148012 elements.\n"
]
}
],
"source": [
"model = DeepSpeech2InferModel.from_pretrained(dataset, config,\n",
" args.checkpoint_path)\n",
"model.eval()"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"examples/aishell/jit.model.pdmodel\n",
"examples/aishell/jit.model.pdiparams\n",
"0\n",
"False\n"
]
}
],
"source": [
"\n",
"from paddle.inference import Config\n",
"from paddle.inference import PrecisionType\n",
"from paddle.inference import create_predictor\n",
"\n",
"args.use_gpu=False\n",
"paddle.set_device('cpu')\n",
"\n",
"def init_predictor(args):\n",
" if args.model_dir is not None:\n",
" config = Config(args.model_dir)\n",
" else:\n",
" config = Config(args.model_file, args.params_file)\n",
"\n",
" if args.use_gpu:\n",
" config.enable_use_gpu(memory_pool_init_size_mb=1000, device_id=0)\n",
"# config.enable_tensorrt_engine(precision_mode=PrecisionType.Float32,\n",
"# use_calib_mode=True) # 开启TensorRT预测精度为fp32开启int8离线量化\n",
" else:\n",
" # If not specific mkldnn, you can set the blas thread.\n",
" # The thread num should not be greater than the number of cores in the CPU.\n",
" config.set_cpu_math_library_num_threads(1)\n",
" config.enable_mkldnn()\n",
" \n",
" config.enable_memory_optim()\n",
" config.switch_ir_optim(True)\n",
" \n",
" print(config.model_dir())\n",
" print(config.prog_file())\n",
" print(config.params_file())\n",
" print(config.gpu_device_id())\n",
" print(args.use_gpu)\n",
" predictor = create_predictor(config)\n",
" return predictor\n",
"\n",
"def run(predictor, audio, audio_len):\n",
" # copy img data to input tensor\n",
" input_names = predictor.get_input_names()\n",
" for i, name in enumerate(input_names):\n",
" print(\"input:\", i, name)\n",
" \n",
" audio_tensor = predictor.get_input_handle('audio')\n",
" audio_tensor.reshape(audio.shape)\n",
" audio_tensor.copy_from_cpu(audio.copy())\n",
" \n",
" audiolen_tensor = predictor.get_input_handle('audio_len')\n",
" audiolen_tensor.reshape(audio_len.shape)\n",
" audiolen_tensor.copy_from_cpu(audio_len.copy())\n",
"\n",
" output_names = predictor.get_output_names()\n",
" for i, name in enumerate(output_names):\n",
" print(\"output:\", i, name)\n",
"\n",
" # do the inference\n",
" predictor.run()\n",
"\n",
" results = []\n",
" # get out data from output tensor\n",
" output_names = predictor.get_output_names()\n",
" for i, name in enumerate(output_names):\n",
" output_tensor = predictor.get_output_handle(name)\n",
" output_data = output_tensor.copy_to_cpu()\n",
" results.append(output_data)\n",
"\n",
" return results\n",
"\n",
"\n",
"predictor = init_predictor(args)\n",
"\n",
"def file_to_transcript(filename):\n",
" print(filename)\n",
" feature = dataset.process_utterance(filename, \"\")\n",
" audio = np.array([feature[0]]).astype('float32') #[1, D, T]\n",
" audio_len = feature[0].shape[1]\n",
" audio_len = np.array([audio_len]).astype('int64') # [1]\n",
" \n",
" \n",
" i_probs = run(predictor, audio, audio_len)\n",
" print('jit:', i_probs[0], type(i_probs[0]))\n",
" \n",
" audio = paddle.to_tensor(audio)\n",
" audio_len = paddle.to_tensor(audio_len)\n",
" print(audio.shape)\n",
" print(audio_len.shape)\n",
" \n",
" #eouts, eouts_len = model.encoder(audio, audio_len)\n",
" #probs = model.decoder.softmax(eouts)\n",
" probs = model.forward(audio, audio_len)\n",
" print('paddle:', probs.numpy())\n",
" \n",
" flag = np.allclose(i_probs[0], probs.numpy())\n",
" print(flag)\n",
" \n",
" return probs\n",
"\n",
"# result_transcript = model.decode(\n",
"# audio,\n",
"# audio_len,\n",
"# vocab_list=dataset.vocab_list,\n",
"# decoding_method=config.decoding.decoding_method,\n",
"# lang_model_path=config.decoding.lang_model_path,\n",
"# beam_alpha=config.decoding.alpha,\n",
"# beam_beta=config.decoding.beta,\n",
"# beam_size=config.decoding.beam_size,\n",
"# cutoff_prob=config.decoding.cutoff_prob,\n",
"# cutoff_top_n=config.decoding.cutoff_top_n,\n",
"# num_processes=config.decoding.num_proc_bsearch)\n",
"# return result_transcript[0]"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Warm-up Test Case %d: %s 0 /home/ssd5/zhanghui/DeepSpeech2.x/examples/aishell/../dataset/aishell/data_aishell/wav/test/S0764/BAC009S0764W0124.wav\n",
"/home/ssd5/zhanghui/DeepSpeech2.x/examples/aishell/../dataset/aishell/data_aishell/wav/test/S0764/BAC009S0764W0124.wav\n",
"input: 0 audio\n",
"input: 1 audio_len\n",
"output: 0 tmp_75\n",
"jit: [[[8.91786298e-12 4.45648032e-12 3.67572750e-09 ... 8.91767563e-12\n",
" 8.91573707e-12 4.64317296e-08]\n",
" [1.55950222e-15 2.62794089e-14 4.50423509e-12 ... 1.55944271e-15\n",
" 1.55891342e-15 9.99992609e-01]\n",
" [1.24638127e-17 7.61802427e-16 2.93265812e-14 ... 1.24633371e-17\n",
" 1.24587264e-17 1.00000000e+00]\n",
" ...\n",
" [4.37488240e-15 2.43676260e-12 1.98770514e-12 ... 4.37479896e-15\n",
" 4.37354747e-15 1.00000000e+00]\n",
" [3.89334696e-13 1.66754856e-11 1.42900388e-11 ... 3.89329492e-13\n",
" 3.89252270e-13 1.00000000e+00]\n",
" [1.00349985e-10 2.56293708e-10 2.91177582e-10 ... 1.00347876e-10\n",
" 1.00334095e-10 9.99998808e-01]]] <class 'numpy.ndarray'>\n",
"[1, 161, 522]\n",
"[1]\n",
"paddle: [[[8.91789680e-12 4.45649724e-12 3.67574149e-09 ... 8.91770945e-12\n",
" 8.91577090e-12 4.64319072e-08]\n",
" [1.55950222e-15 2.62794089e-14 4.50423509e-12 ... 1.55944271e-15\n",
" 1.55891342e-15 9.99992609e-01]\n",
" [1.24638599e-17 7.61805339e-16 2.93267472e-14 ... 1.24633842e-17\n",
" 1.24587735e-17 1.00000000e+00]\n",
" ...\n",
" [4.37488240e-15 2.43676737e-12 1.98770514e-12 ... 4.37479896e-15\n",
" 4.37354747e-15 1.00000000e+00]\n",
" [3.89336187e-13 1.66755481e-11 1.42900925e-11 ... 3.89330983e-13\n",
" 3.89253761e-13 1.00000000e+00]\n",
" [1.00349985e-10 2.56293708e-10 2.91177582e-10 ... 1.00347876e-10\n",
" 1.00334095e-10 9.99998808e-01]]]\n",
"False\n"
]
}
],
"source": [
"manifest = read_manifest(args.warmup_manifest)\n",
"\n",
"for idx, sample in enumerate(manifest[:1]):\n",
" print(\"Warm-up Test Case %d: %s\", idx, sample['audio_filepath'])\n",
" start_time = time.time()\n",
" transcript = file_to_transcript(sample['audio_filepath'])\n",
" finish_time = time.time()\n",
"# print(\"Response Time: %f, Transcript: %s\" %\n",
"# (finish_time - start_time, transcript))\n",
" break"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(1, 161, 522) (1,)\n",
"input: 0 audio\n",
"input: 1 audio_len\n",
"output: 0 tmp_75\n",
"jit: [[[8.91789680e-12 4.45649724e-12 3.67574149e-09 ... 8.91770945e-12\n",
" 8.91577090e-12 4.64319072e-08]\n",
" [1.55950222e-15 2.62794089e-14 4.50423509e-12 ... 1.55944271e-15\n",
" 1.55891342e-15 9.99992609e-01]\n",
" [1.24638599e-17 7.61805339e-16 2.93267472e-14 ... 1.24633842e-17\n",
" 1.24587735e-17 1.00000000e+00]\n",
" ...\n",
" [4.37488240e-15 2.43676737e-12 1.98770514e-12 ... 4.37479896e-15\n",
" 4.37354747e-15 1.00000000e+00]\n",
" [3.89336187e-13 1.66755481e-11 1.42900925e-11 ... 3.89330983e-13\n",
" 3.89253761e-13 1.00000000e+00]\n",
" [1.00349985e-10 2.56293708e-10 2.91177582e-10 ... 1.00347876e-10\n",
" 1.00334095e-10 9.99998808e-01]]]\n"
]
}
],
"source": [
"def test(filename):\n",
" feature = dataset.process_utterance(filename, \"\")\n",
" audio = np.array([feature[0]]).astype('float32') #[1, D, T]\n",
" audio_len = feature[0].shape[1]\n",
" audio_len = np.array([audio_len]).astype('int64') # [1]\n",
" \n",
" print(audio.shape, audio_len.shape)\n",
"\n",
" i_probs = run(predictor, audio, audio_len)\n",
" print('jit:', i_probs[0])\n",
" return i_probs\n",
" \n",
"probs = test(sample['audio_filepath'])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.0"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
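For orientation, the inference cells above reduce to the following minimal Paddle Inference flow, assuming the standard paddle.inference API; the model and parameter file names here are placeholders, not paths from this repo.

import numpy as np
from paddle.inference import Config, create_predictor

def init_predictor(model_file="model.pdmodel", params_file="model.pdiparams"):
    # build an inference config and bind it to a predictor
    config = Config(model_file, params_file)
    config.enable_use_gpu(100, 0)  # memory pool size in MB, GPU id
    return create_predictor(config)

def run(predictor, audio, audio_len):
    # copy host inputs into the predictor's input handles, in declared order
    for name, data in zip(predictor.get_input_names(), [audio, audio_len]):
        handle = predictor.get_input_handle(name)
        handle.reshape(data.shape)
        handle.copy_from_cpu(data)
    predictor.run()
    # pull every output tensor back to host memory
    return [predictor.get_output_handle(n).copy_to_cpu()
            for n in predictor.get_output_names()]

predictor = init_predictor()
audio = np.zeros((1, 161, 522), dtype="float32")  # [batch, feat_dim, n_frames]
audio_len = np.array([522], dtype="int64")        # [batch]
probs = run(predictor, audio, audio_len)[0]

copy_from_cpu and copy_to_cpu move numpy buffers between host memory and the predictor's device tensors, which is why each input handle is reshaped before the copy.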

@ -1,229 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 32,
"id": "academic-surname",
"metadata": {},
"outputs": [],
"source": [
"import paddle\n",
"from paddle import nn"
]
},
{
"cell_type": "code",
"execution_count": 33,
"id": "fundamental-treasure",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Parameter containing:\n",
"Tensor(shape=[256], dtype=float32, place=CUDAPlace(0), stop_gradient=False,\n",
" [1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])\n",
"Parameter containing:\n",
"Tensor(shape=[256], dtype=float32, place=CUDAPlace(0), stop_gradient=False,\n",
" [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])\n"
]
}
],
"source": [
"L = nn.LayerNorm(256, epsilon=1e-12)\n",
"for p in L.parameters():\n",
" print(p)"
]
},
{
"cell_type": "code",
"execution_count": 34,
"id": "consolidated-elephant",
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n"
]
},
{
"cell_type": "code",
"execution_count": 46,
"id": "moderate-noise",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"float64\n"
]
}
],
"source": [
"x = np.random.randn(2, 51, 256)\n",
"print(x.dtype)"
]
},
{
"cell_type": "code",
"execution_count": 47,
"id": "cooked-progressive",
"metadata": {},
"outputs": [],
"source": [
"y = L(paddle.to_tensor(x, dtype='float32'))"
]
},
{
"cell_type": "code",
"execution_count": 48,
"id": "optimum-milwaukee",
"metadata": {},
"outputs": [],
"source": [
"import torch"
]
},
{
"cell_type": "code",
"execution_count": 49,
"id": "viral-indian",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Parameter containing:\n",
"tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n",
" 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n",
" 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n",
" 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n",
" 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n",
" 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n",
" 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n",
" 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n",
" 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n",
" 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n",
" 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n",
" 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n",
" 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n",
" 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n",
" 1., 1., 1., 1.], requires_grad=True)\n",
"Parameter containing:\n",
"tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
" 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
" 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
" 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
" 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
" 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
" 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
" 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
" 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
" 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
" 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],\n",
" requires_grad=True)\n"
]
}
],
"source": [
"TL = torch.nn.LayerNorm(256, eps=1e-12)\n",
"for p in TL.parameters():\n",
" print(p)"
]
},
{
"cell_type": "code",
"execution_count": 50,
"id": "skilled-vietnamese",
"metadata": {},
"outputs": [],
"source": [
"ty = TL(torch.tensor(x, dtype=torch.float32))"
]
},
{
"cell_type": "code",
"execution_count": 51,
"id": "incorrect-allah",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"False"
]
},
"execution_count": 51,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np.allclose(y.numpy(), ty.detach().numpy())"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "prostate-cameroon",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 52,
"id": "governmental-surge",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 52,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"x = np.random.randn(2, 256)\n",
"y = L(paddle.to_tensor(x, dtype='float32'))\n",
"ty = TL(torch.tensor(x, dtype=torch.float32))\n",
"np.allclose(y.numpy(), ty.detach().numpy())"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "confidential-jacket",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.0"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
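When an exact np.allclose check like the one above fails, a tolerance-aware comparison is a common way to distinguish a genuine mismatch from float32 accumulation noise; the helper below is generic and makes no claim about which case applies here.

import numpy as np

def report_diff(a, b, rtol=1e-6, atol=1e-6):
    # print the largest elementwise gap and a tolerance-aware comparison
    a, b = np.asarray(a), np.asarray(b)
    print("max abs diff:", np.max(np.abs(a - b)))
    print("allclose(rtol=%g, atol=%g):" % (rtol, atol),
          np.allclose(a, b, rtol=rtol, atol=atol))

# e.g. report_diff(y.numpy(), ty.detach().numpy())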

@ -1,449 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "primary-organic",
"metadata": {},
"outputs": [],
"source": [
"import torch"
]
},
{
"cell_type": "code",
"execution_count": 38,
"id": "stopped-semester",
"metadata": {},
"outputs": [],
"source": [
"def mask_finished_scores(score: torch.Tensor,\n",
" flag: torch.Tensor) -> torch.Tensor:\n",
" \"\"\"\n",
" If a sequence is finished, we only allow one alive branch. This function\n",
" aims to give one branch a zero score and the rest -inf score.\n",
" Args:\n",
" score (torch.Tensor): A real value array with shape\n",
" (batch_size * beam_size, beam_size).\n",
" flag (torch.Tensor): A bool array with shape\n",
" (batch_size * beam_size, 1).\n",
" Returns:\n",
" torch.Tensor: (batch_size * beam_size, beam_size).\n",
" \"\"\"\n",
" beam_size = score.size(-1)\n",
" zero_mask = torch.zeros_like(flag, dtype=torch.bool)\n",
" if beam_size > 1:\n",
" unfinished = torch.cat((zero_mask, flag.repeat([1, beam_size - 1])),\n",
" dim=1)\n",
" finished = torch.cat((flag, zero_mask.repeat([1, beam_size - 1])),\n",
" dim=1)\n",
" else:\n",
" unfinished = zero_mask\n",
" finished = flag\n",
" print(unfinished)\n",
" print(finished)\n",
" score.masked_fill_(unfinished, -float('inf'))\n",
" score.masked_fill_(finished, 0)\n",
" return score"
]
},
{
"cell_type": "code",
"execution_count": 58,
"id": "agreed-portuguese",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"tensor([[ True],\n",
" [False]])\n",
"tensor([[-0.8841, 0.7381, -0.9986],\n",
" [ 0.2675, -0.7971, 0.3798]])\n",
"tensor([[ True, True],\n",
" [False, False]])\n"
]
}
],
"source": [
"score = torch.randn((2, 3))\n",
"flag = torch.ones((2, 1), dtype=torch.bool)\n",
"flag[1] = False\n",
"print(flag)\n",
"print(score)\n",
"print(flag.repeat([1, 2]))"
]
},
{
"cell_type": "code",
"execution_count": 59,
"id": "clean-aspect",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"tensor([[False, True, True],\n",
" [False, False, False]])\n",
"tensor([[ True, False, False],\n",
" [False, False, False]])\n",
"tensor([[ 0.0000, -inf, -inf],\n",
" [ 0.2675, -0.7971, 0.3798]])\n",
"tensor([[ 0.0000, -inf, -inf],\n",
" [ 0.2675, -0.7971, 0.3798]])\n"
]
}
],
"source": [
"r = mask_finished_scores(score, flag)\n",
"print(r)\n",
"print(score)"
]
},
{
"cell_type": "code",
"execution_count": 55,
"id": "thrown-airline",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Tensor(shape=[2, 1], dtype=bool, place=CUDAPlace(0), stop_gradient=True,\n",
" [[True ],\n",
" [False]])\n",
"Tensor(shape=[2, 3], dtype=float32, place=CUDAPlace(0), stop_gradient=True,\n",
" [[ 2.05994511, 1.87704289, 0.01988174],\n",
" [-0.40165186, 0.77547729, -0.64469045]])\n",
"Tensor(shape=[2, 2], dtype=bool, place=CUDAPlace(0), stop_gradient=True,\n",
" [[True , True ],\n",
" [False, False]])\n"
]
}
],
"source": [
"import paddle\n",
"\n",
"score = paddle.randn((2, 3))\n",
"flag = paddle.ones((2, 1), dtype='bool')\n",
"flag[1] = False\n",
"print(flag)\n",
"print(score)\n",
"print(flag.tile([1, 2]))"
]
},
{
"cell_type": "code",
"execution_count": 56,
"id": "internal-patent",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Tensor(shape=[2, 3], dtype=bool, place=CUDAPlace(0), stop_gradient=True,\n",
" [[False, True , True ],\n",
" [False, False, False]])\n",
"Tensor(shape=[2, 3], dtype=bool, place=CUDAPlace(0), stop_gradient=True,\n",
" [[True , False, False],\n",
" [False, False, False]])\n",
"x Tensor(shape=[2, 3], dtype=float32, place=CUDAPlace(0), stop_gradient=True,\n",
" [[ 2.05994511, 1.87704289, 0.01988174],\n",
" [-0.40165186, 0.77547729, -0.64469045]])\n",
"2 Tensor(shape=[2, 3], dtype=float32, place=CUDAPlace(0), stop_gradient=True,\n",
" [[ 2.05994511, 1.87704289, 0.01988174],\n",
" [-0.40165186, 0.77547729, -0.64469045]])\n",
"3 Tensor(shape=[2, 3], dtype=float32, place=CUDAPlace(0), stop_gradient=True,\n",
" [[ 2.05994511, -inf. , -inf. ],\n",
" [-0.40165186, 0.77547729, -0.64469045]])\n",
"x Tensor(shape=[2, 3], dtype=float32, place=CUDAPlace(0), stop_gradient=True,\n",
" [[ 2.05994511, -inf. , -inf. ],\n",
" [-0.40165186, 0.77547729, -0.64469045]])\n",
"2 Tensor(shape=[2, 3], dtype=float32, place=CUDAPlace(0), stop_gradient=True,\n",
" [[ 2.05994511, -inf. , -inf. ],\n",
" [-0.40165186, 0.77547729, -0.64469045]])\n",
"3 Tensor(shape=[2, 3], dtype=float32, place=CUDAPlace(0), stop_gradient=True,\n",
" [[ 0. , -inf. , -inf. ],\n",
" [-0.40165186, 0.77547729, -0.64469045]])\n",
"Tensor(shape=[2, 3], dtype=float32, place=CUDAPlace(0), stop_gradient=True,\n",
" [[ 0. , -inf. , -inf. ],\n",
" [-0.40165186, 0.77547729, -0.64469045]])\n"
]
}
],
"source": [
"paddle.bool = 'bool'\n",
"\n",
"def masked_fill(xs:paddle.Tensor, mask:paddle.Tensor, value:float):\n",
" print(xs)\n",
" trues = paddle.ones_like(xs) * value\n",
" assert xs.shape == mask.shape\n",
" xs = paddle.where(mask, trues, xs)\n",
" return xs\n",
"\n",
"def masked_fill_(xs:paddle.Tensor, mask:paddle.Tensor, value:float):\n",
" print('x', xs)\n",
" trues = paddle.ones_like(xs) * value\n",
" assert xs.shape == mask.shape\n",
" ret = paddle.where(mask, trues, xs)\n",
" print('2', xs)\n",
" paddle.assign(ret, output=xs)\n",
" print('3', xs)\n",
"\n",
"paddle.Tensor.masked_fill = masked_fill\n",
"paddle.Tensor.masked_fill_ = masked_fill_\n",
"\n",
"def mask_finished_scores_pd(score: paddle.Tensor,\n",
" flag: paddle.Tensor) -> paddle.Tensor:\n",
" \"\"\"\n",
" If a sequence is finished, we only allow one alive branch. This function\n",
" aims to give one branch a zero score and the rest -inf score.\n",
" Args:\n",
" score (torch.Tensor): A real value array with shape\n",
" (batch_size * beam_size, beam_size).\n",
" flag (torch.Tensor): A bool array with shape\n",
" (batch_size * beam_size, 1).\n",
" Returns:\n",
" torch.Tensor: (batch_size * beam_size, beam_size).\n",
" \"\"\"\n",
" beam_size = score.shape[-1]\n",
" zero_mask = paddle.zeros_like(flag, dtype=paddle.bool)\n",
" if beam_size > 1:\n",
" unfinished = paddle.concat((zero_mask, flag.tile([1, beam_size - 1])),\n",
" axis=1)\n",
" finished = paddle.concat((flag, zero_mask.tile([1, beam_size - 1])),\n",
" axis=1)\n",
" else:\n",
" unfinished = zero_mask\n",
" finished = flag\n",
" print(unfinished)\n",
" print(finished)\n",
" \n",
" #score.masked_fill_(unfinished, -float('inf'))\n",
" #score.masked_fill_(finished, 0)\n",
"# infs = paddle.ones_like(score) * -float('inf')\n",
"# score = paddle.where(unfinished, infs, score)\n",
"# score = paddle.where(finished, paddle.zeros_like(score), score)\n",
"\n",
"# score = score.masked_fill(unfinished, -float('inf'))\n",
"# score = score.masked_fill(finished, 0)\n",
" score.masked_fill_(unfinished, -float('inf'))\n",
" score.masked_fill_(finished, 0)\n",
" return score\n",
"\n",
"r = mask_finished_scores_pd(score, flag)\n",
"print(r)"
]
},
{
"cell_type": "code",
"execution_count": 57,
"id": "vocal-prime",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<bound method PyCapsule.value of Tensor(shape=[2, 3], dtype=float32, place=CUDAPlace(0), stop_gradient=True,\n",
" [[ 0. , -inf. , -inf. ],\n",
" [-0.40165186, 0.77547729, -0.64469045]])>"
]
},
"execution_count": 57,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"score.value"
]
},
{
"cell_type": "code",
"execution_count": 71,
"id": "bacterial-adolescent",
"metadata": {},
"outputs": [],
"source": [
"from typing import Union, Any"
]
},
{
"cell_type": "code",
"execution_count": 72,
"id": "absent-fiber",
"metadata": {},
"outputs": [],
"source": [
"def repeat(xs : paddle.Tensor, *size: Any):\n",
" print(size)\n",
" return paddle.tile(xs, size)\n",
"paddle.Tensor.repeat = repeat"
]
},
{
"cell_type": "code",
"execution_count": 73,
"id": "material-harbor",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(1, 2)\n",
"Tensor(shape=[2, 2], dtype=bool, place=CUDAPlace(0), stop_gradient=True,\n",
" [[True , True ],\n",
" [False, False]])\n"
]
}
],
"source": [
"flag = paddle.ones((2, 1), dtype='bool')\n",
"flag[1] = False\n",
"print(flag.repeat(1, 2))"
]
},
{
"cell_type": "code",
"execution_count": 84,
"id": "acute-brighton",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(Tensor(shape=[1], dtype=int64, place=CUDAPlace(0), stop_gradient=True,\n",
" [1]), 2)\n",
"Tensor(shape=[2, 2], dtype=bool, place=CUDAPlace(0), stop_gradient=True,\n",
" [[True , True ],\n",
" [False, False]])\n"
]
}
],
"source": [
"flag = paddle.ones((2, 1), dtype='bool')\n",
"flag[1] = False\n",
"print(flag.repeat(paddle.to_tensor(1), 2))"
]
},
{
"cell_type": "code",
"execution_count": 85,
"id": "european-rugby",
"metadata": {},
"outputs": [],
"source": [
"def size(xs, *args: int):\n",
" nargs = len(args)\n",
" s = paddle.shape(xs)\n",
" assert(nargs <= 1)\n",
" if nargs == 1:\n",
" return s[args[0]]\n",
" else:\n",
" return s\n",
"paddle.Tensor.size = size"
]
},
{
"cell_type": "code",
"execution_count": 86,
"id": "moral-special",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Tensor(shape=[2], dtype=int32, place=CPUPlace, stop_gradient=True,\n",
" [2, 1])"
]
},
"execution_count": 86,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"flag.size()"
]
},
{
"cell_type": "code",
"execution_count": 87,
"id": "ahead-coach",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Tensor(shape=[1], dtype=int32, place=CPUPlace, stop_gradient=True,\n",
" [1])"
]
},
"execution_count": 87,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"flag.size(1)"
]
},
{
"cell_type": "code",
"execution_count": 88,
"id": "incomplete-fitness",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Tensor(shape=[1], dtype=int32, place=CPUPlace, stop_gradient=True,\n",
" [2])"
]
},
"execution_count": 88,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"flag.size(0)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "upset-connectivity",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.0"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

@ -1,231 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"id": "designing-borough",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/ipykernel/ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.\n",
" and should_run_async(code)\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[[ 0.0000000e+00 0.0000000e+00 0.0000000e+00 ... 0.0000000e+00\n",
" 0.0000000e+00 0.0000000e+00]\n",
" [ 8.4147096e-01 8.0196178e-01 7.6172036e-01 ... 1.2409373e-04\n",
" 1.1547816e-04 1.0746076e-04]\n",
" [ 9.0929741e-01 9.5814437e-01 9.8704624e-01 ... 2.4818745e-04\n",
" 2.3095631e-04 2.1492151e-04]\n",
" ...\n",
" [ 3.7960774e-01 7.4510968e-01 7.3418564e-01 ... 1.2036801e-02\n",
" 1.1201146e-02 1.0423505e-02]\n",
" [-5.7338190e-01 -8.9752287e-02 -4.1488394e-02 ... 1.2160885e-02\n",
" 1.1316618e-02 1.0530960e-02]\n",
" [-9.9920684e-01 -8.5234123e-01 -7.8794664e-01 ... 1.2284970e-02\n",
" 1.1432089e-02 1.0638415e-02]]\n",
"True\n",
"True\n"
]
}
],
"source": [
"import torch\n",
"import math\n",
"import numpy as np\n",
"\n",
"max_len=100\n",
"d_model=256\n",
"\n",
"pe = torch.zeros(max_len, d_model)\n",
"position = torch.arange(0, max_len,\n",
" dtype=torch.float32).unsqueeze(1)\n",
"toruch_position = position\n",
"div_term = torch.exp(\n",
" torch.arange(0, d_model, 2, dtype=torch.float32) *\n",
" -(math.log(10000.0) / d_model))\n",
"tourch_div_term = div_term.cpu().detach().numpy()\n",
"\n",
"\n",
"\n",
"torhc_sin = torch.sin(position * div_term)\n",
"torhc_cos = torch.cos(position * div_term)\n",
"print(torhc_sin.cpu().detach().numpy())\n",
"np_sin = np.sin((position * div_term).cpu().detach().numpy())\n",
"np_cos = np.cos((position * div_term).cpu().detach().numpy())\n",
"print(np.allclose(np_sin, torhc_sin.cpu().detach().numpy()))\n",
"print(np.allclose(np_cos, torhc_cos.cpu().detach().numpy()))\n",
"pe[:, 0::2] = torhc_sin\n",
"pe[:, 1::2] = torhc_cos\n",
"tourch_pe = pe.cpu().detach().numpy()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "swiss-referral",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"True\n",
"True\n",
"False\n",
"False\n",
"False\n",
"False\n",
"[[ 1. 1. 1. ... 1. 1.\n",
" 1. ]\n",
" [ 0.5403023 0.59737533 0.6479059 ... 1. 1.\n",
" 1. ]\n",
" [-0.41614684 -0.28628543 -0.1604359 ... 0.99999994 1.\n",
" 1. ]\n",
" ...\n",
" [-0.92514753 -0.66694194 -0.67894876 ... 0.9999276 0.99993724\n",
" 0.9999457 ]\n",
" [-0.81928825 -0.9959641 -0.999139 ... 0.99992603 0.999936\n",
" 0.99994457]\n",
" [ 0.03982088 -0.52298605 -0.6157435 ... 0.99992454 0.9999347\n",
" 0.99994344]]\n",
"----\n",
"[[ 1. 1. 1. ... 1. 1.\n",
" 1. ]\n",
" [ 0.54030234 0.59737533 0.6479059 ... 1. 1.\n",
" 1. ]\n",
" [-0.41614684 -0.28628543 -0.1604359 ... 1. 1.\n",
" 1. ]\n",
" ...\n",
" [-0.92514753 -0.66694194 -0.67894876 ... 0.9999276 0.9999373\n",
" 0.9999457 ]\n",
" [-0.81928825 -0.9959641 -0.999139 ... 0.99992603 0.999936\n",
" 0.99994457]\n",
" [ 0.03982088 -0.5229861 -0.6157435 ... 0.99992454 0.9999347\n",
" 0.99994344]]\n",
")))))))\n",
"[[ 0.0000000e+00 0.0000000e+00 0.0000000e+00 ... 0.0000000e+00\n",
" 0.0000000e+00 0.0000000e+00]\n",
" [ 8.4147096e-01 8.0196178e-01 7.6172036e-01 ... 1.2409373e-04\n",
" 1.1547816e-04 1.0746076e-04]\n",
" [ 9.0929741e-01 9.5814437e-01 9.8704624e-01 ... 2.4818745e-04\n",
" 2.3095631e-04 2.1492151e-04]\n",
" ...\n",
" [ 3.7960774e-01 7.4510968e-01 7.3418564e-01 ... 1.2036801e-02\n",
" 1.1201146e-02 1.0423505e-02]\n",
" [-5.7338190e-01 -8.9752287e-02 -4.1488394e-02 ... 1.2160885e-02\n",
" 1.1316618e-02 1.0530960e-02]\n",
" [-9.9920684e-01 -8.5234123e-01 -7.8794664e-01 ... 1.2284970e-02\n",
" 1.1432089e-02 1.0638415e-02]]\n",
"----\n",
"[[ 0.0000000e+00 0.0000000e+00 0.0000000e+00 ... 0.0000000e+00\n",
" 0.0000000e+00 0.0000000e+00]\n",
" [ 8.4147096e-01 8.0196178e-01 7.6172036e-01 ... 1.2409373e-04\n",
" 1.1547816e-04 1.0746076e-04]\n",
" [ 9.0929741e-01 9.5814437e-01 9.8704624e-01 ... 2.4818745e-04\n",
" 2.3095631e-04 2.1492151e-04]\n",
" ...\n",
" [ 3.7960774e-01 7.4510968e-01 7.3418564e-01 ... 1.2036801e-02\n",
" 1.1201146e-02 1.0423505e-02]\n",
" [-5.7338190e-01 -8.9752287e-02 -4.1488394e-02 ... 1.2160885e-02\n",
" 1.1316618e-02 1.0530960e-02]\n",
" [-9.9920684e-01 -8.5234123e-01 -7.8794664e-01 ... 1.2284970e-02\n",
" 1.1432089e-02 1.0638415e-02]]\n"
]
}
],
"source": [
"import paddle\n",
"paddle.set_device('cpu')\n",
"ppe = paddle.zeros((max_len, d_model), dtype='float32')\n",
"position = paddle.arange(0, max_len,\n",
" dtype='float32').unsqueeze(1)\n",
"print(np.allclose(position.numpy(), toruch_position))\n",
"div_term = paddle.exp(\n",
" paddle.arange(0, d_model, 2, dtype='float32') *\n",
" -(math.log(10000.0) / d_model))\n",
"print(np.allclose(div_term.numpy(), tourch_div_term))\n",
"\n",
"\n",
"\n",
"p_sin = paddle.sin(position * div_term)\n",
"p_cos = paddle.cos(position * div_term)\n",
"print(np.allclose(np_sin, p_sin.numpy(), rtol=1.e-6, atol=0))\n",
"print(np.allclose(np_cos, p_cos.numpy(), rtol=1.e-6, atol=0))\n",
"ppe[:, 0::2] = p_sin\n",
"ppe[:, 1::2] = p_cos\n",
"print(np.allclose(p_sin.numpy(), torhc_sin.cpu().detach().numpy()))\n",
"print(np.allclose(p_cos.numpy(), torhc_cos.cpu().detach().numpy()))\n",
"print(p_cos.numpy())\n",
"print(\"----\")\n",
"print(torhc_cos.cpu().detach().numpy())\n",
"print(\")))))))\")\n",
"print(p_sin.numpy())\n",
"print(\"----\")\n",
"print(torhc_sin.cpu().detach().numpy())"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "integrated-boards",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"False\n"
]
}
],
"source": [
"print(np.allclose(ppe.numpy(), pe.numpy()))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "flying-reserve",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "revised-divide",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.0"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large

@ -1,5 +1,3 @@
[Chinese version](README_cn.md)
# PaddlePaddle Speech to Any toolkit # PaddlePaddle Speech to Any toolkit
![License](https://img.shields.io/badge/license-Apache%202-red.svg) ![License](https://img.shields.io/badge/license-Apache%202-red.svg)
@ -11,31 +9,29 @@
## Features ## Features
See [feature list](doc/src/feature_list.md) for more information. See [feature list](docs/src/feature_list.md) for more information.
## Setup ## Setup
All tested under: All tested under:
* Ubuntu 16.04 * Ubuntu 16.04
* python>=3.7 * python>=3.7
* paddlepaddle>=2.1.2 * paddlepaddle>=2.2.0rc
Please see [install](doc/src/install.md). Please see [install](docs/src/install.md).
## Getting Started ## Getting Started
Please see [Getting Started](doc/src/getting_started.md) and [tiny egs](examples/tiny/s0/README.md). Please see [Getting Started](docs/src/getting_started.md) and [tiny egs](examples/tiny/s0/README.md).
## More Information ## More Information
* [Data Preparation](doc/src/data_preparation.md) * [Data Preparation](docs/src/data_preparation.md)
* [Data Augmentation](doc/src/augmentation.md) * [Data Augmentation](docs/src/augmentation.md)
* [Ngram LM](doc/src/ngram_lm.md) * [Ngram LM](docs/src/ngram_lm.md)
* [Server Demo](doc/src/server.md) * [Benchmark](docs/src/benchmark.md)
* [Benchmark](doc/src/benchmark.md) * [Released Model](docs/src/released_model.md)
* [Released Model](doc/src/released_model.md)
* [FAQ](doc/src/faq.md)
## Questions and Help ## Questions and Help
@ -45,8 +41,8 @@ You are welcome to submit questions in [Github Discussions](https://github.com/P
## License ## License
DeepASR is provided under the [Apache-2.0 License](./LICENSE). DeepSpeech is provided under the [Apache-2.0 License](./LICENSE).
## Acknowledgement ## Acknowledgement
We depend on many open source repos. See [References](doc/src/reference.md) for more information. We depend on many open source repos. See [References](docs/src/reference.md) for more information.

@ -1,51 +0,0 @@
[English](README.md)
# PaddlePaddle Speech to Any toolkit
![License](https://img.shields.io/badge/license-Apache%202-red.svg)
![python version](https://img.shields.io/badge/python-3.7+-orange.svg)
![support os](https://img.shields.io/badge/os-linux-yellow.svg)
*DeepSpeech* is an open-source project for an end-to-end automatic speech recognition engine built on the [PaddlePaddle](https://github.com/PaddlePaddle/Paddle) platform.
Our vision is to provide easy-to-use, efficient, compact, and scalable tools for speech recognition in both industrial applications and academic research, covering training, inference, and deployment.
## Features
See the [feature list](doc/src/feature_list.md).
## Setup
All tested under:
* Ubuntu 16.04
* python>=3.7
* paddlepaddle>=2.1.2
See [install](doc/src/install.md).
## Getting Started
Please see [Getting Started](doc/src/getting_started.md) and [tiny egs](examples/tiny/s0/README.md).
## More Information
* [Data Preparation](doc/src/data_preparation.md)
* [Data Augmentation](doc/src/augmentation.md)
* [Ngram LM](doc/src/ngram_lm.md)
* [Server Demo](doc/src/server.md)
* [Benchmark](doc/src/benchmark.md)
* [Released Model](doc/src/released_model.md)
* [FAQ](doc/src/faq.md)
## Questions and Help
You are welcome to submit questions in [Github Discussions](https://github.com/PaddlePaddle/DeepSpeech/discussions) and report bugs in [Github Issues](https://github.com/PaddlePaddle/models/issues). Contributions to this project are also welcome.
## License
DeepASR is provided under the [Apache-2.0 License](./LICENSE).
## Acknowledgement
Development drew on a number of excellent open source repos; see [References](doc/src/reference.md) for details.

@ -80,23 +80,23 @@ def convert_dtype_to_string(tensor_dtype):
if not hasattr(paddle, 'softmax'): if not hasattr(paddle, 'softmax'):
logger.warn("register user softmax to paddle, remove this when fixed!") logger.debug("register user softmax to paddle, remove this when fixed!")
setattr(paddle, 'softmax', paddle.nn.functional.softmax) setattr(paddle, 'softmax', paddle.nn.functional.softmax)
if not hasattr(paddle, 'log_softmax'): if not hasattr(paddle, 'log_softmax'):
logger.warn("register user log_softmax to paddle, remove this when fixed!") logger.debug("register user log_softmax to paddle, remove this when fixed!")
setattr(paddle, 'log_softmax', paddle.nn.functional.log_softmax) setattr(paddle, 'log_softmax', paddle.nn.functional.log_softmax)
if not hasattr(paddle, 'sigmoid'): if not hasattr(paddle, 'sigmoid'):
logger.warn("register user sigmoid to paddle, remove this when fixed!") logger.debug("register user sigmoid to paddle, remove this when fixed!")
setattr(paddle, 'sigmoid', paddle.nn.functional.sigmoid) setattr(paddle, 'sigmoid', paddle.nn.functional.sigmoid)
if not hasattr(paddle, 'log_sigmoid'): if not hasattr(paddle, 'log_sigmoid'):
logger.warn("register user log_sigmoid to paddle, remove this when fixed!") logger.debug("register user log_sigmoid to paddle, remove this when fixed!")
setattr(paddle, 'log_sigmoid', paddle.nn.functional.log_sigmoid) setattr(paddle, 'log_sigmoid', paddle.nn.functional.log_sigmoid)
if not hasattr(paddle, 'relu'): if not hasattr(paddle, 'relu'):
logger.warn("register user relu to paddle, remove this when fixed!") logger.debug("register user relu to paddle, remove this when fixed!")
setattr(paddle, 'relu', paddle.nn.functional.relu) setattr(paddle, 'relu', paddle.nn.functional.relu)
@ -105,7 +105,7 @@ def cat(xs, dim=0):
if not hasattr(paddle, 'cat'): if not hasattr(paddle, 'cat'):
logger.warn( logger.debug(
"override cat of paddle if exists or register, remove this when fixed!") "override cat of paddle if exists or register, remove this when fixed!")
paddle.cat = cat paddle.cat = cat
@ -116,7 +116,7 @@ def item(x: paddle.Tensor):
if not hasattr(paddle.Tensor, 'item'): if not hasattr(paddle.Tensor, 'item'):
logger.warn( logger.debug(
"override item of paddle.Tensor if exists or register, remove this when fixed!" "override item of paddle.Tensor if exists or register, remove this when fixed!"
) )
paddle.Tensor.item = item paddle.Tensor.item = item
@ -127,13 +127,13 @@ def func_long(x: paddle.Tensor):
if not hasattr(paddle.Tensor, 'long'): if not hasattr(paddle.Tensor, 'long'):
logger.warn( logger.debug(
"override long of paddle.Tensor if exists or register, remove this when fixed!" "override long of paddle.Tensor if exists or register, remove this when fixed!"
) )
paddle.Tensor.long = func_long paddle.Tensor.long = func_long
if not hasattr(paddle.Tensor, 'numel'): if not hasattr(paddle.Tensor, 'numel'):
logger.warn( logger.debug(
"override numel of paddle.Tensor if exists or register, remove this when fixed!" "override numel of paddle.Tensor if exists or register, remove this when fixed!"
) )
paddle.Tensor.numel = paddle.numel paddle.Tensor.numel = paddle.numel
@ -147,7 +147,7 @@ def new_full(x: paddle.Tensor,
if not hasattr(paddle.Tensor, 'new_full'): if not hasattr(paddle.Tensor, 'new_full'):
logger.warn( logger.debug(
"override new_full of paddle.Tensor if exists or register, remove this when fixed!" "override new_full of paddle.Tensor if exists or register, remove this when fixed!"
) )
paddle.Tensor.new_full = new_full paddle.Tensor.new_full = new_full
@ -162,13 +162,13 @@ def eq(xs: paddle.Tensor, ys: Union[paddle.Tensor, float]) -> paddle.Tensor:
if not hasattr(paddle.Tensor, 'eq'): if not hasattr(paddle.Tensor, 'eq'):
logger.warn( logger.debug(
"override eq of paddle.Tensor if exists or register, remove this when fixed!" "override eq of paddle.Tensor if exists or register, remove this when fixed!"
) )
paddle.Tensor.eq = eq paddle.Tensor.eq = eq
if not hasattr(paddle, 'eq'): if not hasattr(paddle, 'eq'):
logger.warn( logger.debug(
"override eq of paddle if exists or register, remove this when fixed!") "override eq of paddle if exists or register, remove this when fixed!")
paddle.eq = eq paddle.eq = eq
@ -178,7 +178,7 @@ def contiguous(xs: paddle.Tensor) -> paddle.Tensor:
if not hasattr(paddle.Tensor, 'contiguous'): if not hasattr(paddle.Tensor, 'contiguous'):
logger.warn( logger.debug(
"override contiguous of paddle.Tensor if exists or register, remove this when fixed!" "override contiguous of paddle.Tensor if exists or register, remove this when fixed!"
) )
paddle.Tensor.contiguous = contiguous paddle.Tensor.contiguous = contiguous
@ -195,7 +195,7 @@ def size(xs: paddle.Tensor, *args: int) -> paddle.Tensor:
#`to_static` do not process `size` property, maybe some `paddle` api dependent on it. #`to_static` do not process `size` property, maybe some `paddle` api dependent on it.
logger.warn( logger.debug(
"override size of paddle.Tensor " "override size of paddle.Tensor "
"(`to_static` do not process `size` property, maybe some `paddle` api dependent on it), remove this when fixed!" "(`to_static` do not process `size` property, maybe some `paddle` api dependent on it), remove this when fixed!"
) )
@ -207,7 +207,7 @@ def view(xs: paddle.Tensor, *args: int) -> paddle.Tensor:
if not hasattr(paddle.Tensor, 'view'): if not hasattr(paddle.Tensor, 'view'):
logger.warn("register user view to paddle.Tensor, remove this when fixed!") logger.debug("register user view to paddle.Tensor, remove this when fixed!")
paddle.Tensor.view = view paddle.Tensor.view = view
@ -216,7 +216,7 @@ def view_as(xs: paddle.Tensor, ys: paddle.Tensor) -> paddle.Tensor:
if not hasattr(paddle.Tensor, 'view_as'): if not hasattr(paddle.Tensor, 'view_as'):
logger.warn( logger.debug(
"register user view_as to paddle.Tensor, remove this when fixed!") "register user view_as to paddle.Tensor, remove this when fixed!")
paddle.Tensor.view_as = view_as paddle.Tensor.view_as = view_as
@ -242,7 +242,7 @@ def masked_fill(xs: paddle.Tensor,
if not hasattr(paddle.Tensor, 'masked_fill'): if not hasattr(paddle.Tensor, 'masked_fill'):
logger.warn( logger.debug(
"register user masked_fill to paddle.Tensor, remove this when fixed!") "register user masked_fill to paddle.Tensor, remove this when fixed!")
paddle.Tensor.masked_fill = masked_fill paddle.Tensor.masked_fill = masked_fill
@ -260,7 +260,7 @@ def masked_fill_(xs: paddle.Tensor,
if not hasattr(paddle.Tensor, 'masked_fill_'): if not hasattr(paddle.Tensor, 'masked_fill_'):
logger.warn( logger.debug(
"register user masked_fill_ to paddle.Tensor, remove this when fixed!") "register user masked_fill_ to paddle.Tensor, remove this when fixed!")
paddle.Tensor.masked_fill_ = masked_fill_ paddle.Tensor.masked_fill_ = masked_fill_
@ -272,7 +272,8 @@ def fill_(xs: paddle.Tensor, value: Union[float, int]) -> paddle.Tensor:
if not hasattr(paddle.Tensor, 'fill_'): if not hasattr(paddle.Tensor, 'fill_'):
logger.warn("register user fill_ to paddle.Tensor, remove this when fixed!") logger.debug(
"register user fill_ to paddle.Tensor, remove this when fixed!")
paddle.Tensor.fill_ = fill_ paddle.Tensor.fill_ = fill_
@ -281,22 +282,22 @@ def repeat(xs: paddle.Tensor, *size: Any) -> paddle.Tensor:
if not hasattr(paddle.Tensor, 'repeat'): if not hasattr(paddle.Tensor, 'repeat'):
logger.warn( logger.debug(
"register user repeat to paddle.Tensor, remove this when fixed!") "register user repeat to paddle.Tensor, remove this when fixed!")
paddle.Tensor.repeat = repeat paddle.Tensor.repeat = repeat
if not hasattr(paddle.Tensor, 'softmax'): if not hasattr(paddle.Tensor, 'softmax'):
logger.warn( logger.debug(
"register user softmax to paddle.Tensor, remove this when fixed!") "register user softmax to paddle.Tensor, remove this when fixed!")
setattr(paddle.Tensor, 'softmax', paddle.nn.functional.softmax) setattr(paddle.Tensor, 'softmax', paddle.nn.functional.softmax)
if not hasattr(paddle.Tensor, 'sigmoid'): if not hasattr(paddle.Tensor, 'sigmoid'):
logger.warn( logger.debug(
"register user sigmoid to paddle.Tensor, remove this when fixed!") "register user sigmoid to paddle.Tensor, remove this when fixed!")
setattr(paddle.Tensor, 'sigmoid', paddle.nn.functional.sigmoid) setattr(paddle.Tensor, 'sigmoid', paddle.nn.functional.sigmoid)
if not hasattr(paddle.Tensor, 'relu'): if not hasattr(paddle.Tensor, 'relu'):
logger.warn("register user relu to paddle.Tensor, remove this when fixed!") logger.debug("register user relu to paddle.Tensor, remove this when fixed!")
setattr(paddle.Tensor, 'relu', paddle.nn.functional.relu) setattr(paddle.Tensor, 'relu', paddle.nn.functional.relu)
@ -305,7 +306,7 @@ def type_as(x: paddle.Tensor, other: paddle.Tensor) -> paddle.Tensor:
if not hasattr(paddle.Tensor, 'type_as'): if not hasattr(paddle.Tensor, 'type_as'):
logger.warn( logger.debug(
"register user type_as to paddle.Tensor, remove this when fixed!") "register user type_as to paddle.Tensor, remove this when fixed!")
setattr(paddle.Tensor, 'type_as', type_as) setattr(paddle.Tensor, 'type_as', type_as)
@ -321,7 +322,7 @@ def to(x: paddle.Tensor, *args, **kwargs) -> paddle.Tensor:
if not hasattr(paddle.Tensor, 'to'): if not hasattr(paddle.Tensor, 'to'):
logger.warn("register user to to paddle.Tensor, remove this when fixed!") logger.debug("register user to to paddle.Tensor, remove this when fixed!")
setattr(paddle.Tensor, 'to', to) setattr(paddle.Tensor, 'to', to)
@ -330,7 +331,8 @@ def func_float(x: paddle.Tensor) -> paddle.Tensor:
if not hasattr(paddle.Tensor, 'float'): if not hasattr(paddle.Tensor, 'float'):
logger.warn("register user float to paddle.Tensor, remove this when fixed!") logger.debug(
"register user float to paddle.Tensor, remove this when fixed!")
setattr(paddle.Tensor, 'float', func_float) setattr(paddle.Tensor, 'float', func_float)
@ -339,7 +341,7 @@ def func_int(x: paddle.Tensor) -> paddle.Tensor:
if not hasattr(paddle.Tensor, 'int'): if not hasattr(paddle.Tensor, 'int'):
logger.warn("register user int to paddle.Tensor, remove this when fixed!") logger.debug("register user int to paddle.Tensor, remove this when fixed!")
setattr(paddle.Tensor, 'int', func_int) setattr(paddle.Tensor, 'int', func_int)
@ -348,23 +350,6 @@ def tolist(x: paddle.Tensor) -> List[Any]:
if not hasattr(paddle.Tensor, 'tolist'): if not hasattr(paddle.Tensor, 'tolist'):
logger.warn( logger.debug(
"register user tolist to paddle.Tensor, remove this when fixed!") "register user tolist to paddle.Tensor, remove this when fixed!")
setattr(paddle.Tensor, 'tolist', tolist) setattr(paddle.Tensor, 'tolist', tolist)
########### hack paddle.nn #############
class GLU(nn.Layer):
"""Gated Linear Units (GLU) Layer"""
def __init__(self, dim: int=-1):
super().__init__()
self.dim = dim
def forward(self, xs):
return F.glu(xs, axis=self.dim)
if not hasattr(paddle.nn, 'GLU'):
logger.warn("register user GLU to paddle.nn, remove this when fixed!")
setattr(paddle.nn, 'GLU', GLU)
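As a usage sketch, the GLU layer registered above follows the usual gated linear unit definition: split the input in two along the chosen axis and gate one half with the sigmoid of the other. The shapes below are arbitrary, and paddle.nn.functional.glu is assumed to implement exactly this split-and-gate behavior.

import paddle
import paddle.nn.functional as F

x = paddle.randn([4, 8])            # the gated axis must have even length
glu_out = F.glu(x, axis=-1)         # shape [4, 4]

a, b = paddle.split(x, 2, axis=-1)  # same computation written out by hand
manual = a * F.sigmoid(b)
print(paddle.allclose(glu_out, manual))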

@ -35,7 +35,8 @@ std::vector<std::pair<double, std::string>> ctc_beam_search_decoder(
size_t beam_size, size_t beam_size,
double cutoff_prob, double cutoff_prob,
size_t cutoff_top_n, size_t cutoff_top_n,
Scorer *ext_scorer) { Scorer *ext_scorer,
size_t blank_id) {
// dimension check // dimension check
size_t num_time_steps = probs_seq.size(); size_t num_time_steps = probs_seq.size();
for (size_t i = 0; i < num_time_steps; ++i) { for (size_t i = 0; i < num_time_steps; ++i) {
@ -48,7 +49,7 @@ std::vector<std::pair<double, std::string>> ctc_beam_search_decoder(
// assign blank id // assign blank id
// size_t blank_id = vocabulary.size(); // size_t blank_id = vocabulary.size();
size_t blank_id = 0; // size_t blank_id = 0;
// assign space id // assign space id
auto it = std::find(vocabulary.begin(), vocabulary.end(), " "); auto it = std::find(vocabulary.begin(), vocabulary.end(), " ");
@ -57,7 +58,6 @@ std::vector<std::pair<double, std::string>> ctc_beam_search_decoder(
if ((size_t)space_id >= vocabulary.size()) { if ((size_t)space_id >= vocabulary.size()) {
space_id = -2; space_id = -2;
} }
// init prefixes' root // init prefixes' root
PathTrie root; PathTrie root;
root.score = root.log_prob_b_prev = 0.0; root.score = root.log_prob_b_prev = 0.0;
@ -218,7 +218,8 @@ ctc_beam_search_decoder_batch(
size_t num_processes, size_t num_processes,
double cutoff_prob, double cutoff_prob,
size_t cutoff_top_n, size_t cutoff_top_n,
Scorer *ext_scorer) { Scorer *ext_scorer,
size_t blank_id) {
VALID_CHECK_GT(num_processes, 0, "num_processes must be nonnegative!"); VALID_CHECK_GT(num_processes, 0, "num_processes must be nonnegative!");
// thread pool // thread pool
ThreadPool pool(num_processes); ThreadPool pool(num_processes);
@ -234,7 +235,8 @@ ctc_beam_search_decoder_batch(
beam_size, beam_size,
cutoff_prob, cutoff_prob,
cutoff_top_n, cutoff_top_n,
ext_scorer)); ext_scorer,
blank_id));
} }
// get decoding results // get decoding results

@ -43,7 +43,8 @@ std::vector<std::pair<double, std::string>> ctc_beam_search_decoder(
size_t beam_size, size_t beam_size,
double cutoff_prob = 1.0, double cutoff_prob = 1.0,
size_t cutoff_top_n = 40, size_t cutoff_top_n = 40,
Scorer *ext_scorer = nullptr); Scorer *ext_scorer = nullptr,
size_t blank_id = 0);
/* CTC Beam Search Decoder for batch data /* CTC Beam Search Decoder for batch data
@ -70,6 +71,7 @@ ctc_beam_search_decoder_batch(
size_t num_processes, size_t num_processes,
double cutoff_prob = 1.0, double cutoff_prob = 1.0,
size_t cutoff_top_n = 40, size_t cutoff_top_n = 40,
Scorer *ext_scorer = nullptr); Scorer *ext_scorer = nullptr,
size_t blank_id = 0);
#endif // CTC_BEAM_SEARCH_DECODER_H_ #endif // CTC_BEAM_SEARCH_DECODER_H_

@ -17,17 +17,18 @@
std::string ctc_greedy_decoder( std::string ctc_greedy_decoder(
const std::vector<std::vector<double>> &probs_seq, const std::vector<std::vector<double>> &probs_seq,
const std::vector<std::string> &vocabulary) { const std::vector<std::string> &vocabulary,
size_t blank_id) {
// dimension check // dimension check
size_t num_time_steps = probs_seq.size(); size_t num_time_steps = probs_seq.size();
for (size_t i = 0; i < num_time_steps; ++i) { for (size_t i = 0; i < num_time_steps; ++i) {
VALID_CHECK_EQ(probs_seq[i].size(), VALID_CHECK_EQ(probs_seq[i].size(),
vocabulary.size() + 1, vocabulary.size(),
"The shape of probs_seq does not match with " "The shape of probs_seq does not match with "
"the shape of the vocabulary"); "the shape of the vocabulary");
} }
size_t blank_id = vocabulary.size(); // size_t blank_id = vocabulary.size();
std::vector<size_t> max_idx_vec(num_time_steps, 0); std::vector<size_t> max_idx_vec(num_time_steps, 0);
std::vector<size_t> idx_vec; std::vector<size_t> idx_vec;

@ -29,6 +29,7 @@
*/ */
std::string ctc_greedy_decoder( std::string ctc_greedy_decoder(
const std::vector<std::vector<double>>& probs_seq, const std::vector<std::vector<double>>& probs_seq,
const std::vector<std::string>& vocabulary); const std::vector<std::string>& vocabulary,
size_t blank_id);
#endif // CTC_GREEDY_DECODER_H #endif // CTC_GREEDY_DECODER_H
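The role of the new blank_id argument is easiest to see in a small NumPy sketch of greedy (best-path) CTC decoding: take the argmax at every frame, collapse consecutive repeats, and drop the blank index. This is a didactic illustration, not the C++ code being patched.

import numpy as np

def ctc_greedy_decode(probs_seq, vocabulary, blank_id=0):
    """Best-path decode: argmax per frame, collapse repeats, drop the blank."""
    best_path = np.argmax(probs_seq, axis=1)
    out, prev = [], None
    for idx in best_path:
        if idx != prev and idx != blank_id:
            out.append(vocabulary[idx])
        prev = idx
    return "".join(out)

vocab = ["<blank>", "a", "b", " "]
frames = np.array([[0.1, 0.7, 0.1, 0.1],    # -> a
                   [0.1, 0.7, 0.1, 0.1],    # -> a (repeat, collapsed)
                   [0.8, 0.1, 0.05, 0.05],  # -> blank (dropped)
                   [0.1, 0.1, 0.7, 0.1]])   # -> b
print(ctc_greedy_decode(frames, vocab, blank_id=0))  # "ab"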

@ -85,9 +85,8 @@ FILES += glob.glob('openfst-1.6.3/src/lib/*.cc')
# yapf: disable # yapf: disable
FILES = [ FILES = [
fn for fn in FILES fn for fn in FILES if not (fn.endswith('main.cc') or fn.endswith('test.cc')
if not (fn.endswith('main.cc') or fn.endswith('test.cc') or fn.endswith( or fn.endswith('unittest.cc'))
'unittest.cc'))
] ]
# yapf: enable # yapf: enable

@ -32,7 +32,7 @@ class Scorer(swig_decoders.Scorer):
swig_decoders.Scorer.__init__(self, alpha, beta, model_path, vocabulary) swig_decoders.Scorer.__init__(self, alpha, beta, model_path, vocabulary)
def ctc_greedy_decoder(probs_seq, vocabulary): def ctc_greedy_decoder(probs_seq, vocabulary, blank_id):
"""Wrapper for ctc best path decoder in swig. """Wrapper for ctc best path decoder in swig.
:param probs_seq: 2-D list of probability distributions over each time :param probs_seq: 2-D list of probability distributions over each time
@ -44,7 +44,8 @@ def ctc_greedy_decoder(probs_seq, vocabulary):
:return: Decoding result string. :return: Decoding result string.
:rtype: str :rtype: str
""" """
result = swig_decoders.ctc_greedy_decoder(probs_seq.tolist(), vocabulary) result = swig_decoders.ctc_greedy_decoder(probs_seq.tolist(), vocabulary,
blank_id)
return result return result
@ -53,7 +54,8 @@ def ctc_beam_search_decoder(probs_seq,
beam_size, beam_size,
cutoff_prob=1.0, cutoff_prob=1.0,
cutoff_top_n=40, cutoff_top_n=40,
ext_scoring_func=None): ext_scoring_func=None,
blank_id=0):
"""Wrapper for the CTC Beam Search Decoder. """Wrapper for the CTC Beam Search Decoder.
:param probs_seq: 2-D list of probability distributions over each time :param probs_seq: 2-D list of probability distributions over each time
@ -81,7 +83,7 @@ def ctc_beam_search_decoder(probs_seq,
""" """
beam_results = swig_decoders.ctc_beam_search_decoder( beam_results = swig_decoders.ctc_beam_search_decoder(
probs_seq.tolist(), vocabulary, beam_size, cutoff_prob, cutoff_top_n, probs_seq.tolist(), vocabulary, beam_size, cutoff_prob, cutoff_top_n,
ext_scoring_func) ext_scoring_func, blank_id)
beam_results = [(res[0], res[1].decode('utf-8')) for res in beam_results] beam_results = [(res[0], res[1].decode('utf-8')) for res in beam_results]
return beam_results return beam_results
@ -92,7 +94,8 @@ def ctc_beam_search_decoder_batch(probs_split,
num_processes, num_processes,
cutoff_prob=1.0, cutoff_prob=1.0,
cutoff_top_n=40, cutoff_top_n=40,
ext_scoring_func=None): ext_scoring_func=None,
blank_id=0):
"""Wrapper for the batched CTC beam search decoder. """Wrapper for the batched CTC beam search decoder.
:param probs_seq: 3-D list with each element as an instance of 2-D list :param probs_seq: 3-D list with each element as an instance of 2-D list
@ -125,7 +128,7 @@ def ctc_beam_search_decoder_batch(probs_split,
batch_beam_results = swig_decoders.ctc_beam_search_decoder_batch( batch_beam_results = swig_decoders.ctc_beam_search_decoder_batch(
probs_split, vocabulary, beam_size, num_processes, cutoff_prob, probs_split, vocabulary, beam_size, num_processes, cutoff_prob,
cutoff_top_n, ext_scoring_func) cutoff_top_n, ext_scoring_func, blank_id)
batch_beam_results = [[(res[0], res[1]) for res in beam_results] batch_beam_results = [[(res[0], res[1]) for res in beam_results]
for beam_results in batch_beam_results] for beam_results in batch_beam_results]
return batch_beam_results return batch_beam_results
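A hypothetical call site for the updated wrappers might look as follows; the vocabulary and probabilities are invented, the import path is only a guess, and a built swig_decoders extension is assumed.

import numpy as np
# assumes the built swig extension and these wrappers are importable, e.g.
# from deepspeech.decoders.swig_wrapper import ctc_greedy_decoder, ctc_beam_search_decoder

vocab = ["<blank>", "a", "b", " "]
probs = np.random.rand(50, len(vocab))
probs /= probs.sum(axis=1, keepdims=True)   # one distribution per frame

# greedy decoding with an explicit blank index
text = ctc_greedy_decoder(probs, vocab, blank_id=0)

# beam search decoding, same blank index, optional external scorer
beams = ctc_beam_search_decoder(probs, vocab, beam_size=10,
                                cutoff_prob=1.0, cutoff_top_n=40,
                                ext_scoring_func=None, blank_id=0)
print(text, beams[0])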

@ -27,7 +27,7 @@ def main_sp(config, args):
def main(config, args): def main(config, args):
if args.device == "gpu" and args.nprocs > 1: if args.nprocs > 0:
dist.spawn(main_sp, args=(config, args), nprocs=args.nprocs) dist.spawn(main_sp, args=(config, args), nprocs=args.nprocs)
else: else:
main_sp(config, args) main_sp(config, args)

@ -1,191 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Beam search parameters tuning for DeepSpeech2 model."""
import functools
import sys
import numpy as np
from paddle.io import DataLoader
from deepspeech.exps.deepspeech2.config import get_cfg_defaults
from deepspeech.io.collator import SpeechCollator
from deepspeech.io.dataset import ManifestDataset
from deepspeech.models.ds2 import DeepSpeech2Model
from deepspeech.training.cli import default_argument_parser
from deepspeech.utils import error_rate
from deepspeech.utils.utility import add_arguments
from deepspeech.utils.utility import print_arguments
def tune(config, args):
"""Tune parameters alpha and beta incrementally."""
if not args.num_alphas >= 0:
raise ValueError("num_alphas must be non-negative!")
if not args.num_betas >= 0:
raise ValueError("num_betas must be non-negative!")
config.defrost()
config.data.manifest = config.data.dev_manifest
config.data.augmentation_config = ""
config.data.keep_transcription_text = True
dev_dataset = ManifestDataset.from_config(config)
valid_loader = DataLoader(
dev_dataset,
batch_size=config.data.batch_size,
shuffle=False,
drop_last=False,
collate_fn=SpeechCollator(keep_transcription_text=True))
model = DeepSpeech2Model.from_pretrained(valid_loader, config,
args.checkpoint_path)
model.eval()
# decoders only accept string encoded in utf-8
vocab_list = valid_loader.dataset.vocab_list
errors_func = error_rate.char_errors if config.decoding.error_rate_type == 'cer' else error_rate.word_errors
# create grid for search
cand_alphas = np.linspace(args.alpha_from, args.alpha_to, args.num_alphas)
cand_betas = np.linspace(args.beta_from, args.beta_to, args.num_betas)
params_grid = [(alpha, beta) for alpha in cand_alphas
for beta in cand_betas]
err_sum = [0.0 for i in range(len(params_grid))]
err_ave = [0.0 for i in range(len(params_grid))]
num_ins, len_refs, cur_batch = 0, 0, 0
# initialize external scorer
model.decoder.init_decode(args.alpha_from, args.beta_from,
config.decoding.lang_model_path, vocab_list,
config.decoding.decoding_method)
## incrementally tune parameters over multiple batches
print("start tuning ...")
for infer_data in valid_loader():
if (args.num_batches >= 0) and (cur_batch >= args.num_batches):
break
def ordid2token(texts, texts_len):
""" ord() id to chr() chr """
trans = []
for text, n in zip(texts, texts_len):
n = n.numpy().item()
ids = text[:n]
trans.append(''.join([chr(i) for i in ids]))
return trans
audio, audio_len, text, text_len = infer_data
target_transcripts = ordid2token(text, text_len)
num_ins += audio.shape[0]
# model infer
eouts, eouts_len = model.encoder(audio, audio_len)
probs = model.decoder.softmax(eouts)
# grid search
for index, (alpha, beta) in enumerate(params_grid):
print(f"tuneing: alpha={alpha} beta={beta}")
result_transcripts = model.decoder.decode_probs(
probs.numpy(), eouts_len, vocab_list,
config.decoding.decoding_method,
config.decoding.lang_model_path, alpha, beta,
config.decoding.beam_size, config.decoding.cutoff_prob,
config.decoding.cutoff_top_n, config.decoding.num_proc_bsearch)
for target, result in zip(target_transcripts, result_transcripts):
errors, len_ref = errors_func(target, result)
err_sum[index] += errors
# accumulate the length of references of every batch
# in the first iteration
if args.alpha_from == alpha and args.beta_from == beta:
len_refs += len_ref
err_ave[index] = err_sum[index] / len_refs
if index % 2 == 0:
sys.stdout.write('.')
sys.stdout.flush()
print("tuneing: one grid done!")
# output on-line tuning result at the end of current batch
err_ave_min = min(err_ave)
min_index = err_ave.index(err_ave_min)
print("\nBatch %d [%d/?], current opt (alpha, beta) = (%s, %s), "
" min [%s] = %f" %
(cur_batch, num_ins, "%.3f" % params_grid[min_index][0],
"%.3f" % params_grid[min_index][1],
config.decoding.error_rate_type, err_ave_min))
cur_batch += 1
# output WER/CER at every (alpha, beta)
print("\nFinal %s:\n" % config.decoding.error_rate_type)
for index in range(len(params_grid)):
print("(alpha, beta) = (%s, %s), [%s] = %f" %
("%.3f" % params_grid[index][0], "%.3f" % params_grid[index][1],
config.decoding.error_rate_type, err_ave[index]))
err_ave_min = min(err_ave)
min_index = err_ave.index(err_ave_min)
print("\nFinish tuning on %d batches, final opt (alpha, beta) = (%s, %s)" %
(cur_batch, "%.3f" % params_grid[min_index][0],
"%.3f" % params_grid[min_index][1]))
print("finish tuning")
def main(config, args):
tune(config, args)
if __name__ == "__main__":
parser = default_argument_parser()
add_arg = functools.partial(add_arguments, argparser=parser)
add_arg('num_batches', int, -1, "# of batches tuning on. "
"Default -1, on whole dev set.")
add_arg('num_alphas', int, 45, "# of alpha candidates for tuning.")
add_arg('num_betas', int, 8, "# of beta candidates for tuning.")
add_arg('alpha_from', float, 1.0, "Where alpha starts tuning from.")
add_arg('alpha_to', float, 3.2, "Where alpha ends tuning with.")
add_arg('beta_from', float, 0.1, "Where beta starts tuning from.")
add_arg('beta_to', float, 0.45, "Where beta ends tuning with.")
add_arg('batch_size', int, 256, "# of samples per batch.")
add_arg('beam_size', int, 500, "Beam search width.")
add_arg('num_proc_bsearch', int, 8, "# of CPUs for beam search.")
add_arg('cutoff_prob', float, 1.0, "Cutoff probability for pruning.")
add_arg('cutoff_top_n', int, 40, "Cutoff number for pruning.")
args = parser.parse_args()
print_arguments(args, globals())
# https://yaml.org/type/float.html
config = get_cfg_defaults()
if args.config:
config.merge_from_file(args.config)
if args.opts:
config.merge_from_list(args.opts)
config.data.batch_size = args.batch_size
config.decoding.beam_size = args.beam_size
config.decoding.num_proc_bsearch = args.num_proc_bsearch
config.decoding.cutoff_prob = args.cutoff_prob
config.decoding.cutoff_top_n = args.cutoff_top_n
config.freeze()
print(config)
if args.dump_config:
with open(args.dump_config, 'w') as f:
print(config, file=f)
main(config, args)

@ -15,9 +15,11 @@
import os import os
import time import time
from collections import defaultdict from collections import defaultdict
from contextlib import nullcontext
from pathlib import Path from pathlib import Path
from typing import Optional from typing import Optional
import jsonlines
import numpy as np import numpy as np
import paddle import paddle
from paddle import distributed as dist from paddle import distributed as dist
@ -34,12 +36,14 @@ from deepspeech.models.ds2 import DeepSpeech2Model
from deepspeech.models.ds2_online import DeepSpeech2InferModelOnline from deepspeech.models.ds2_online import DeepSpeech2InferModelOnline
from deepspeech.models.ds2_online import DeepSpeech2ModelOnline from deepspeech.models.ds2_online import DeepSpeech2ModelOnline
from deepspeech.training.gradclip import ClipGradByGlobalNormWithLog from deepspeech.training.gradclip import ClipGradByGlobalNormWithLog
from deepspeech.training.reporter import report
from deepspeech.training.trainer import Trainer from deepspeech.training.trainer import Trainer
from deepspeech.utils import error_rate from deepspeech.utils import error_rate
from deepspeech.utils import layer_tools from deepspeech.utils import layer_tools
from deepspeech.utils import mp_tools from deepspeech.utils import mp_tools
from deepspeech.utils.log import Autolog from deepspeech.utils.log import Autolog
from deepspeech.utils.log import Log from deepspeech.utils.log import Log
from deepspeech.utils.utility import UpdateConfig
logger = Log(__name__).getlog() logger = Log(__name__).getlog()
@ -65,29 +69,52 @@ class DeepSpeech2Trainer(Trainer):
super().__init__(config, args) super().__init__(config, args)
def train_batch(self, batch_index, batch_data, msg): def train_batch(self, batch_index, batch_data, msg):
batch_size = self.config.collator.batch_size
accum_grad = self.config.training.accum_grad
start = time.time() start = time.time()
# forward
utt, audio, audio_len, text, text_len = batch_data utt, audio, audio_len, text, text_len = batch_data
loss = self.model(audio, audio_len, text, text_len) loss = self.model(audio, audio_len, text, text_len)
losses_np = {
'train_loss': float(loss),
}
# loss backward
if (batch_index + 1) % accum_grad != 0:
# Disable gradient synchronizations across DDP processes.
# Within this context, gradients will be accumulated on module
# variables, which will later be synchronized.
context = self.model.no_sync
else:
# Used for single gpu training and DDP gradient synchronization
# processes.
context = nullcontext
with context():
loss.backward() loss.backward()
layer_tools.print_grads(self.model, print_func=None) layer_tools.print_grads(self.model, print_func=None)
# optimizer step
if (batch_index + 1) % accum_grad == 0:
self.optimizer.step() self.optimizer.step()
self.optimizer.clear_grad() self.optimizer.clear_grad()
self.iteration += 1
iteration_time = time.time() - start iteration_time = time.time() - start
losses_np = { for k, v in losses_np.items():
'train_loss': float(loss), report(k, v)
} report("batch_size", batch_size)
msg += "train time: {:>.3f}s, ".format(iteration_time) report("accum", accum_grad)
msg += "batch size: {}, ".format(self.config.collator.batch_size) report("step_cost", iteration_time)
msg += ', '.join('{}: {:>.6f}'.format(k, v)
for k, v in losses_np.items())
logger.info(msg)
if dist.get_rank() == 0 and self.visualizer: if dist.get_rank() == 0 and self.visualizer:
for k, v in losses_np.items(): for k, v in losses_np.items():
# `step -1` since we update `step` after optimizer.step().
self.visualizer.add_scalar("train/{}".format(k), v, self.visualizer.add_scalar("train/{}".format(k), v,
self.iteration) self.iteration - 1)
self.iteration += 1
@paddle.no_grad() @paddle.no_grad()
def valid(self): def valid(self):
@ -124,10 +151,9 @@ class DeepSpeech2Trainer(Trainer):
def setup_model(self): def setup_model(self):
config = self.config.clone() config = self.config.clone()
config.defrost() with UpdateConfig(config):
config.model.feat_size = self.train_loader.collate_fn.feature_size config.model.feat_size = self.train_loader.collate_fn.feature_size
config.model.dict_size = self.train_loader.collate_fn.vocab_size config.model.dict_size = self.train_loader.collate_fn.vocab_size
config.freeze()
if self.args.model_type == 'offline': if self.args.model_type == 'offline':
model = DeepSpeech2Model.from_config(config.model) model = DeepSpeech2Model.from_config(config.model)
@ -280,9 +306,10 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
len_refs += len_ref len_refs += len_ref
num_ins += 1 num_ins += 1
if fout: if fout:
fout.write(utt + " " + result + "\n") fout.write({"utt": utt, "ref": target, "hyp": result})
logger.info("\nTarget Transcription: %s\nOutput Transcription: %s" % logger.info(f"Utt: {utt}")
(target, result)) logger.info(f"Ref: {target}")
logger.info(f"Hyp: {result}")
logger.info("Current error rate [%s] = %f" % logger.info("Current error rate [%s] = %f" %
(cfg.error_rate_type, error_rate_func(target, result))) (cfg.error_rate_type, error_rate_func(target, result)))
@ -325,7 +352,7 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
cfg = self.config cfg = self.config
error_rate_type = None error_rate_type = None
errors_sum, len_refs, num_ins = 0.0, 0, 0 errors_sum, len_refs, num_ins = 0.0, 0, 0
with open(self.args.result_file, 'w') as fout: with jsonlines.open(self.args.result_file, 'w') as fout:
for i, batch in enumerate(self.test_loader): for i, batch in enumerate(self.test_loader):
utts, audio, audio_len, texts, texts_len = batch utts, audio, audio_len, texts, texts_len = batch
metrics = self.compute_metrics(utts, audio, audio_len, texts, metrics = self.compute_metrics(utts, audio, audio_len, texts,
@ -378,7 +405,7 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
def setup(self): def setup(self):
"""Setup the experiment. """Setup the experiment.
""" """
paddle.set_device(self.args.device) paddle.set_device('gpu' if self.args.nprocs > 0 else 'cpu')
self.setup_output_dir() self.setup_output_dir()
self.setup_checkpointer() self.setup_checkpointer()
@ -610,7 +637,7 @@ class DeepSpeech2ExportTester(DeepSpeech2Tester):
def setup(self): def setup(self):
"""Setup the experiment. """Setup the experiment.
""" """
paddle.set_device(self.args.device) paddle.set_device('gpu' if self.args.nprocs > 0 else 'cpu')
self.setup_output_dir() self.setup_output_dir()
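The gradient-accumulation pattern added to train_batch above (scale the loss by accum_grad, skip the DDP all-reduce via model.no_sync on non-update steps, step the optimizer every accum_grad batches) can be exercised in isolation. A toy sketch with a placeholder model, data, and loss, not the trainer's own:

from contextlib import nullcontext
import paddle

# toy model and optimizer; in the trainer these come from setup_model()
model = paddle.nn.Linear(8, 1)
optimizer = paddle.optimizer.SGD(learning_rate=1e-3,
                                 parameters=model.parameters())
accum_grad = 4

def train_step(batch_index, x, y, is_data_parallel=False):
    loss = paddle.nn.functional.mse_loss(model(x), y)
    loss = loss / accum_grad  # scale so accumulated grads match a full batch
    if is_data_parallel and (batch_index + 1) % accum_grad != 0:
        # skip DDP gradient all-reduce while still accumulating
        context = model.no_sync
    else:
        context = nullcontext
    with context():
        loss.backward()
    if (batch_index + 1) % accum_grad == 0:
        optimizer.step()
        optimizer.clear_grad()

for i in range(8):
    train_step(i, paddle.randn([2, 8]), paddle.randn([2, 1]))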

@ -22,6 +22,8 @@ from deepspeech.exps.u2.model import U2Trainer as Trainer
from deepspeech.training.cli import default_argument_parser from deepspeech.training.cli import default_argument_parser
from deepspeech.utils.utility import print_arguments from deepspeech.utils.utility import print_arguments
# from deepspeech.exps.u2.trainer import U2Trainer as Trainer
def main_sp(config, args): def main_sp(config, args):
exp = Trainer(config, args) exp = Trainer(config, args)
@ -30,7 +32,7 @@ def main_sp(config, args):
def main(config, args): def main(config, args):
if args.device == "gpu" and args.nprocs > 1: if args.nprocs > 0:
dist.spawn(main_sp, args=(config, args), nprocs=args.nprocs) dist.spawn(main_sp, args=(config, args), nprocs=args.nprocs)
else: else:
main_sp(config, args) main_sp(config, args)
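Several entry points above now derive the device from --nprocs instead of an explicit --device flag. The dispatch they share is roughly the following sketch, with launch and main_sp standing in for the scripts' own functions:

import paddle
from paddle import distributed as dist

def launch(config, args, main_sp):
    """Dispatch to single-process or multi-GPU training based on --nprocs."""
    paddle.set_device('gpu' if args.nprocs > 0 else 'cpu')
    if args.nprocs > 0:
        # one worker process per GPU
        dist.spawn(main_sp, args=(config, args), nprocs=args.nprocs)
    else:
        main_sp(config, args)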

@ -17,9 +17,12 @@ import os
import sys import sys
import time import time
from collections import defaultdict from collections import defaultdict
from collections import OrderedDict
from contextlib import nullcontext
from pathlib import Path from pathlib import Path
from typing import Optional from typing import Optional
import jsonlines
import numpy as np import numpy as np
import paddle import paddle
from paddle import distributed as dist from paddle import distributed as dist
@ -32,7 +35,10 @@ from deepspeech.io.sampler import SortagradBatchSampler
from deepspeech.io.sampler import SortagradDistributedBatchSampler from deepspeech.io.sampler import SortagradDistributedBatchSampler
from deepspeech.models.u2 import U2Model from deepspeech.models.u2 import U2Model
from deepspeech.training.optimizer import OptimizerFactory from deepspeech.training.optimizer import OptimizerFactory
from deepspeech.training.reporter import ObsScope
from deepspeech.training.reporter import report
from deepspeech.training.scheduler import LRSchedulerFactory from deepspeech.training.scheduler import LRSchedulerFactory
from deepspeech.training.timer import Timer
from deepspeech.training.trainer import Trainer from deepspeech.training.trainer import Trainer
from deepspeech.utils import ctc_utils from deepspeech.utils import ctc_utils
from deepspeech.utils import error_rate from deepspeech.utils import error_rate
@ -41,6 +47,7 @@ from deepspeech.utils import mp_tools
from deepspeech.utils import text_grid from deepspeech.utils import text_grid
from deepspeech.utils import utility from deepspeech.utils import utility
from deepspeech.utils.log import Log from deepspeech.utils.log import Log
from deepspeech.utils.utility import UpdateConfig
logger = Log(__name__).getlog() logger = Log(__name__).getlog()
@ -79,21 +86,36 @@ class U2Trainer(Trainer):
def train_batch(self, batch_index, batch_data, msg): def train_batch(self, batch_index, batch_data, msg):
train_conf = self.config.training train_conf = self.config.training
start = time.time() start = time.time()
utt, audio, audio_len, text, text_len = batch_data
# forward
utt, audio, audio_len, text, text_len = batch_data
loss, attention_loss, ctc_loss = self.model(audio, audio_len, text, loss, attention_loss, ctc_loss = self.model(audio, audio_len, text,
text_len) text_len)
# loss div by `batch_size * accum_grad` # loss div by `batch_size * accum_grad`
loss /= train_conf.accum_grad loss /= train_conf.accum_grad
loss.backward()
layer_tools.print_grads(self.model, print_func=None)
losses_np = {'loss': float(loss) * train_conf.accum_grad} losses_np = {'loss': float(loss) * train_conf.accum_grad}
if attention_loss: if attention_loss:
losses_np['att_loss'] = float(attention_loss) losses_np['att_loss'] = float(attention_loss)
if ctc_loss: if ctc_loss:
losses_np['ctc_loss'] = float(ctc_loss) losses_np['ctc_loss'] = float(ctc_loss)
# loss backward
if (batch_index + 1) % train_conf.accum_grad != 0:
# Disable gradient synchronizations across DDP processes.
# Within this context, gradients will be accumulated on module
# variables, which will later be synchronized.
# When using cpu w/o DDP, model does not have `no_sync`
context = self.model.no_sync if self.parallel else nullcontext
else:
# Used for single gpu training and DDP gradient synchronization
# processes.
context = nullcontext
with context():
loss.backward()
layer_tools.print_grads(self.model, print_func=None)
# optimizer step
if (batch_index + 1) % train_conf.accum_grad == 0: if (batch_index + 1) % train_conf.accum_grad == 0:
self.optimizer.step() self.optimizer.step()
self.optimizer.clear_grad() self.optimizer.clear_grad()
@ -102,14 +124,13 @@ class U2Trainer(Trainer):
iteration_time = time.time() - start iteration_time = time.time() - start
if (batch_index + 1) % train_conf.log_interval == 0: for k, v in losses_np.items():
msg += "train time: {:>.3f}s, ".format(iteration_time) report(k, v)
msg += "batch size: {}, ".format(self.config.collator.batch_size) report("batch_size", self.config.collator.batch_size)
msg += "accum: {}, ".format(train_conf.accum_grad) report("accum", train_conf.accum_grad)
msg += ', '.join('{}: {:>.6f}'.format(k, v) report("step_cost", iteration_time)
for k, v in losses_np.items())
logger.info(msg)
if (batch_index + 1) % train_conf.accum_grad == 0:
if dist.get_rank() == 0 and self.visualizer: if dist.get_rank() == 0 and self.visualizer:
losses_np_v = losses_np.copy() losses_np_v = losses_np.copy()
losses_np_v.update({"lr": self.lr_scheduler()}) losses_np_v.update({"lr": self.lr_scheduler()})
@ -163,35 +184,47 @@ class U2Trainer(Trainer):
# script_model_path = str(self.checkpoint_dir / 'init') # script_model_path = str(self.checkpoint_dir / 'init')
# paddle.jit.save(script_model, script_model_path) # paddle.jit.save(script_model, script_model_path)
from_scratch = self.resume_or_scratch() self.before_train()
if from_scratch:
# save init model, i.e. 0 epoch
self.save(tag='init')
self.lr_scheduler.step(self.iteration)
if self.parallel:
self.train_loader.batch_sampler.set_epoch(self.epoch)
logger.info(f"Train Total Examples: {len(self.train_loader.dataset)}") logger.info(f"Train Total Examples: {len(self.train_loader.dataset)}")
while self.epoch < self.config.training.n_epoch: while self.epoch < self.config.training.n_epoch:
with Timer("Epoch-Train Time Cost: {}"):
self.model.train() self.model.train()
try: try:
data_start_time = time.time() data_start_time = time.time()
for batch_index, batch in enumerate(self.train_loader): for batch_index, batch in enumerate(self.train_loader):
dataload_time = time.time() - data_start_time dataload_time = time.time() - data_start_time
msg = "Train: Rank: {}, ".format(dist.get_rank()) msg = "Train:"
msg += "epoch: {}, ".format(self.epoch) observation = OrderedDict()
msg += "step: {}, ".format(self.iteration) with ObsScope(observation):
msg += "batch : {}/{}, ".format(batch_index + 1, report("Rank", dist.get_rank())
len(self.train_loader)) report("epoch", self.epoch)
msg += "lr: {:>.8f}, ".format(self.lr_scheduler()) report('step', self.iteration)
msg += "data time: {:>.3f}s, ".format(dataload_time) report("lr", self.lr_scheduler())
self.train_batch(batch_index, batch, msg) self.train_batch(batch_index, batch, msg)
self.after_train_batch()
report('iter', batch_index + 1)
report('total', len(self.train_loader))
report('reader_cost', dataload_time)
observation['batch_cost'] = observation[
'reader_cost'] + observation['step_cost']
observation['samples'] = observation['batch_size']
observation['ips[sent./sec]'] = observation[
'batch_size'] / observation['batch_cost']
for k, v in observation.items():
msg += f" {k}: "
msg += f"{v:>.8f}" if isinstance(v,
float) else f"{v}"
msg += ","
if (batch_index + 1
) % self.config.training.log_interval == 0:
logger.info(msg)
data_start_time = time.time() data_start_time = time.time()
except Exception as e: except Exception as e:
logger.error(e) logger.error(e)
raise e raise e
with Timer("Eval Time Cost: {}"):
total_loss, num_seen_utts = self.valid() total_loss, num_seen_utts = self.valid()
if dist.get_world_size() > 1: if dist.get_world_size() > 1:
num_seen_utts = paddle.to_tensor(num_seen_utts) num_seen_utts = paddle.to_tensor(num_seen_utts)
@ -294,10 +327,11 @@ class U2Trainer(Trainer):
def setup_model(self): def setup_model(self):
config = self.config config = self.config
model_conf = config.model model_conf = config.model
model_conf.defrost()
with UpdateConfig(model_conf):
model_conf.input_dim = self.train_loader.collate_fn.feature_size model_conf.input_dim = self.train_loader.collate_fn.feature_size
model_conf.output_dim = self.train_loader.collate_fn.vocab_size model_conf.output_dim = self.train_loader.collate_fn.vocab_size
model_conf.freeze()
model = U2Model.from_config(model_conf) model = U2Model.from_config(model_conf)
if self.parallel: if self.parallel:
@ -433,9 +467,10 @@ class U2Tester(U2Trainer):
len_refs += len_ref len_refs += len_ref
num_ins += 1 num_ins += 1
if fout: if fout:
fout.write(utt + " " + result + "\n") fout.write({"utt": utt, "ref": target, "hyp": result})
logger.info("\nTarget Transcription: %s\nOutput Transcription: %s" % logger.info(f"Utt: {utt}")
(target, result)) logger.info(f"Ref: {target}")
logger.info(f"Hyp: {result}")
logger.info("One example error rate [%s] = %f" % logger.info("One example error rate [%s] = %f" %
(cfg.error_rate_type, error_rate_func(target, result))) (cfg.error_rate_type, error_rate_func(target, result)))
@ -460,7 +495,7 @@ class U2Tester(U2Trainer):
errors_sum, len_refs, num_ins = 0.0, 0, 0 errors_sum, len_refs, num_ins = 0.0, 0, 0
num_frames = 0.0 num_frames = 0.0
num_time = 0.0 num_time = 0.0
with open(self.args.result_file, 'w') as fout: with jsonlines.open(self.args.result_file, 'w') as fout:
for i, batch in enumerate(self.test_loader): for i, batch in enumerate(self.test_loader):
metrics = self.compute_metrics(*batch, fout=fout) metrics = self.compute_metrics(*batch, fout=fout)
num_frames += metrics['num_frames'] num_frames += metrics['num_frames']
@ -540,7 +575,7 @@ class U2Tester(U2Trainer):
# 1. Encoder # 1. Encoder
encoder_out, encoder_mask = self.model._forward_encoder( encoder_out, encoder_mask = self.model._forward_encoder(
feat, feats_length) # (B, maxlen, encoder_dim) feat, feats_length) # (B, maxlen, encoder_dim)
maxlen = encoder_out.size(1) maxlen = encoder_out.shape[1]
ctc_probs = self.model.ctc.log_softmax( ctc_probs = self.model.ctc.log_softmax(
encoder_out) # (1, maxlen, vocab_size) encoder_out) # (1, maxlen, vocab_size)
@ -548,26 +583,25 @@ class U2Tester(U2Trainer):
ctc_probs = ctc_probs.squeeze(0) ctc_probs = ctc_probs.squeeze(0)
target = target.squeeze(0) target = target.squeeze(0)
alignment = ctc_utils.forced_align(ctc_probs, target) alignment = ctc_utils.forced_align(ctc_probs, target)
logger.info("align ids", key[0], alignment) logger.info(f"align ids: {key[0]} {alignment}")
fout.write('{} {}\n'.format(key[0], alignment)) fout.write('{} {}\n'.format(key[0], alignment))
# 3. gen praat # 3. gen praat
# segment alignment # segment alignment
align_segs = text_grid.segment_alignment(alignment) align_segs = text_grid.segment_alignment(alignment)
logger.info("align tokens", key[0], align_segs) logger.info(f"align tokens: {key[0]}, {align_segs}")
# IntervalTier, List["start end token\n"] # IntervalTier, List["start end token\n"]
subsample = utility.get_subsample(self.config) subsample = utility.get_subsample(self.config)
tierformat = text_grid.align_to_tierformat( tierformat = text_grid.align_to_tierformat(
align_segs, subsample, token_dict) align_segs, subsample, token_dict)
# write tier # write tier
align_output_path = os.path.join( align_output_path = Path(self.args.result_file).parent / "align"
os.path.dirname(self.args.result_file), "align") align_output_path.mkdir(parents=True, exist_ok=True)
tier_path = os.path.join(align_output_path, key[0] + ".tier") tier_path = align_output_path / (key[0] + ".tier")
with open(tier_path, 'w') as f: with tier_path.open('w') as f:
f.writelines(tierformat) f.writelines(tierformat)
# write textgrid # write textgrid
textgrid_path = os.path.join(align_output_path, textgrid_path = align_output_path / (key[0] + ".TextGrid")
key[0] + ".TextGrid")
second_per_frame = 1. / (1000. / second_per_frame = 1. / (1000. /
stride_ms) # 25ms window, 10ms stride stride_ms) # 25ms window, 10ms stride
second_per_example = ( second_per_example = (
@ -575,7 +609,7 @@ class U2Tester(U2Trainer):
text_grid.generate_textgrid( text_grid.generate_textgrid(
maxtime=second_per_example, maxtime=second_per_example,
intervals=tierformat, intervals=tierformat,
output=textgrid_path) output=str(textgrid_path))
def run_align(self): def run_align(self):
self.resume_or_scratch() self.resume_or_scratch()
@ -621,7 +655,7 @@ class U2Tester(U2Trainer):
def setup(self): def setup(self):
"""Setup the experiment. """Setup the experiment.
""" """
paddle.set_device(self.args.device) paddle.set_device('gpu' if self.args.nprocs > 0 else 'cpu')
self.setup_output_dir() self.setup_output_dir()
self.setup_checkpointer() self.setup_checkpointer()
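ObsScope and report from deepspeech.training.reporter are used in the training loop above as a scoped key/value collector. A minimal stand-in implementation, written only to show the behavior the loop relies on (the real module may differ):

from collections import OrderedDict
from contextlib import contextmanager

_observations = []  # stack of active observation dicts

@contextmanager
def ObsScope(observation: dict):
    _observations.append(observation)
    try:
        yield observation
    finally:
        _observations.pop()

def report(key, value):
    # write into the innermost active scope, if any
    if _observations:
        _observations[-1][key] = value

# usage mirroring the loop above
obs = OrderedDict()
with ObsScope(obs):
    report("epoch", 0)
    report("lr", 2e-3)
print(obs)  # OrderedDict([('epoch', 0), ('lr', 0.002)])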

@ -0,0 +1,220 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains U2 model."""
import paddle
from paddle import distributed as dist
from paddle.io import DataLoader
from deepspeech.io.collator import SpeechCollator
from deepspeech.io.dataset import ManifestDataset
from deepspeech.io.sampler import SortagradBatchSampler
from deepspeech.io.sampler import SortagradDistributedBatchSampler
from deepspeech.models.u2 import U2Evaluator
from deepspeech.models.u2 import U2Model
from deepspeech.models.u2 import U2Updater
from deepspeech.training.extensions.snapshot import Snapshot
from deepspeech.training.extensions.visualizer import VisualDL
from deepspeech.training.optimizer import OptimizerFactory
from deepspeech.training.scheduler import LRSchedulerFactory
from deepspeech.training.timer import Timer
from deepspeech.training.trainer import Trainer
from deepspeech.training.updaters.trainer import Trainer as NewTrainer
from deepspeech.utils import layer_tools
from deepspeech.utils.log import Log
from deepspeech.utils.utility import UpdateConfig
logger = Log(__name__).getlog()
class U2Trainer(Trainer):
def __init__(self, config, args):
super().__init__(config, args)
def setup_dataloader(self):
config = self.config.clone()
config.defrost()
config.collator.keep_transcription_text = False
# train/valid dataset, return token ids
config.data.manifest = config.data.train_manifest
train_dataset = ManifestDataset.from_config(config)
config.data.manifest = config.data.dev_manifest
dev_dataset = ManifestDataset.from_config(config)
collate_fn_train = SpeechCollator.from_config(config)
config.collator.augmentation_config = ""
collate_fn_dev = SpeechCollator.from_config(config)
if self.parallel:
batch_sampler = SortagradDistributedBatchSampler(
train_dataset,
batch_size=config.collator.batch_size,
num_replicas=None,
rank=None,
shuffle=True,
drop_last=True,
sortagrad=config.collator.sortagrad,
shuffle_method=config.collator.shuffle_method)
else:
batch_sampler = SortagradBatchSampler(
train_dataset,
shuffle=True,
batch_size=config.collator.batch_size,
drop_last=True,
sortagrad=config.collator.sortagrad,
shuffle_method=config.collator.shuffle_method)
self.train_loader = DataLoader(
train_dataset,
batch_sampler=batch_sampler,
collate_fn=collate_fn_train,
num_workers=config.collator.num_workers, )
self.valid_loader = DataLoader(
dev_dataset,
batch_size=config.collator.batch_size,
shuffle=False,
drop_last=False,
collate_fn=collate_fn_dev)
# test dataset, return raw text
config.data.manifest = config.data.test_manifest
# filter test examples, will cause less examples, but no mismatch with training
# and can use large batch size , save training time, so filter test egs now.
config.data.min_input_len = 0.0 # second
config.data.max_input_len = float('inf') # second
config.data.min_output_len = 0.0 # tokens
config.data.max_output_len = float('inf') # tokens
config.data.min_output_input_ratio = 0.00
config.data.max_output_input_ratio = float('inf')
test_dataset = ManifestDataset.from_config(config)
# return text ord id
config.collator.keep_transcription_text = True
config.collator.augmentation_config = ""
self.test_loader = DataLoader(
test_dataset,
batch_size=config.decoding.batch_size,
shuffle=False,
drop_last=False,
collate_fn=SpeechCollator.from_config(config))
# return text token id
config.collator.keep_transcription_text = False
self.align_loader = DataLoader(
test_dataset,
batch_size=config.decoding.batch_size,
shuffle=False,
drop_last=False,
collate_fn=SpeechCollator.from_config(config))
logger.info("Setup train/valid/test/align Dataloader!")
def setup_model(self):
config = self.config
model_conf = config.model
with UpdateConfig(model_conf):
model_conf.input_dim = self.train_loader.collate_fn.feature_size
model_conf.output_dim = self.train_loader.collate_fn.vocab_size
model = U2Model.from_config(model_conf)
if self.parallel:
model = paddle.DataParallel(model)
model.train()
logger.info(f"{model}")
layer_tools.print_params(model, logger.info)
train_config = config.training
optim_type = train_config.optim
optim_conf = train_config.optim_conf
scheduler_type = train_config.scheduler
scheduler_conf = train_config.scheduler_conf
scheduler_args = {
"learning_rate": optim_conf.lr,
"verbose": False,
"warmup_steps": scheduler_conf.warmup_steps,
"gamma": scheduler_conf.lr_decay,
"d_model": model_conf.encoder_conf.output_size,
}
lr_scheduler = LRSchedulerFactory.from_args(scheduler_type,
scheduler_args)
def optimizer_args(
config,
parameters,
lr_scheduler=None, ):
train_config = config.training
optim_type = train_config.optim
optim_conf = train_config.optim_conf
scheduler_type = train_config.scheduler
scheduler_conf = train_config.scheduler_conf
return {
"grad_clip": train_config.global_grad_clip,
"weight_decay": optim_conf.weight_decay,
"learning_rate": lr_scheduler
if lr_scheduler else optim_conf.lr,
"parameters": parameters,
"epsilon": 1e-9 if optim_type == 'noam' else None,
"beta1": 0.9 if optim_type == 'noam' else None,
"beat2": 0.98 if optim_type == 'noam' else None,
}
optimzer_args = optimizer_args(config, model.parameters(), lr_scheduler)
optimizer = OptimizerFactory.from_args(optim_type, optimzer_args)
self.model = model
self.optimizer = optimizer
self.lr_scheduler = lr_scheduler
logger.info("Setup model/optimizer/lr_scheduler!")
def setup_updater(self):
output_dir = self.output_dir
config = self.config.training
updater = U2Updater(
model=self.model,
optimizer=self.optimizer,
scheduler=self.lr_scheduler,
dataloader=self.train_loader,
output_dir=output_dir,
accum_grad=config.accum_grad)
trainer = NewTrainer(updater, (config.n_epoch, 'epoch'), output_dir)
evaluator = U2Evaluator(self.model, self.valid_loader)
trainer.extend(evaluator, trigger=(1, "epoch"))
if dist.get_rank() == 0:
trainer.extend(VisualDL(output_dir), trigger=(1, "iteration"))
num_snapshots = config.checkpoint.kbest_n
trainer.extend(
Snapshot(
mode='kbest',
max_size=num_snapshots,
indicator='VALID/LOSS',
less_better=True),
trigger=(1, 'epoch'))
# print(trainer.extensions)
# trainer.run()
self.trainer = trainer
def run(self):
"""The routine of the experiment after setup. This method is intended
to be used by the user.
"""
self.setup_updater()
with Timer("Training Done: {}"):
self.trainer.run()
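setup_model above passes warmup_steps, lr_decay, and d_model to a 'noam' scheduler. Assuming it follows the standard Transformer (Noam) schedule, the learning-rate curve looks like this (noam_lr and base_lr are illustrative names, not the project's API):

def noam_lr(step, d_model=256, warmup_steps=25000, base_lr=1.0):
    """Linear warmup followed by inverse-square-root decay."""
    step = max(step, 1)
    return base_lr * d_model ** -0.5 * min(step ** -0.5,
                                           step * warmup_steps ** -1.5)

for s in (1, 1000, 25000, 100000):
    print(s, noam_lr(s))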

@ -36,7 +36,7 @@ def main_sp(config, args):
def main(config, args): def main(config, args):
if args.device == "gpu" and args.nprocs > 1: if args.nprocs > 0:
dist.spawn(main_sp, args=(config, args), nprocs=args.nprocs) dist.spawn(main_sp, args=(config, args), nprocs=args.nprocs)
else: else:
main_sp(config, args) main_sp(config, args)

@ -17,9 +17,11 @@ import os
import sys import sys
import time import time
from collections import defaultdict from collections import defaultdict
from contextlib import nullcontext
from pathlib import Path from pathlib import Path
from typing import Optional from typing import Optional
import jsonlines
import numpy as np import numpy as np
import paddle import paddle
from paddle import distributed as dist from paddle import distributed as dist
@ -31,6 +33,7 @@ from deepspeech.io.dataloader import BatchDataLoader
from deepspeech.models.u2 import U2Model from deepspeech.models.u2 import U2Model
from deepspeech.training.optimizer import OptimizerFactory from deepspeech.training.optimizer import OptimizerFactory
from deepspeech.training.scheduler import LRSchedulerFactory from deepspeech.training.scheduler import LRSchedulerFactory
from deepspeech.training.timer import Timer
from deepspeech.training.trainer import Trainer from deepspeech.training.trainer import Trainer
from deepspeech.utils import ctc_utils from deepspeech.utils import ctc_utils
from deepspeech.utils import error_rate from deepspeech.utils import error_rate
@ -39,6 +42,7 @@ from deepspeech.utils import mp_tools
from deepspeech.utils import text_grid from deepspeech.utils import text_grid
from deepspeech.utils import utility from deepspeech.utils import utility
from deepspeech.utils.log import Log from deepspeech.utils.log import Log
from deepspeech.utils.utility import UpdateConfig
logger = Log(__name__).getlog() logger = Log(__name__).getlog()
@ -83,20 +87,34 @@ class U2Trainer(Trainer):
train_conf = self.config.training train_conf = self.config.training
start = time.time() start = time.time()
# forward
utt, audio, audio_len, text, text_len = batch_data utt, audio, audio_len, text, text_len = batch_data
loss, attention_loss, ctc_loss = self.model(audio, audio_len, text, loss, attention_loss, ctc_loss = self.model(audio, audio_len, text,
text_len) text_len)
# loss div by `batch_size * accum_grad` # loss div by `batch_size * accum_grad`
loss /= train_conf.accum_grad loss /= train_conf.accum_grad
loss.backward()
layer_tools.print_grads(self.model, print_func=None)
losses_np = {'loss': float(loss) * train_conf.accum_grad} losses_np = {'loss': float(loss) * train_conf.accum_grad}
if attention_loss: if attention_loss:
losses_np['att_loss'] = float(attention_loss) losses_np['att_loss'] = float(attention_loss)
if ctc_loss: if ctc_loss:
losses_np['ctc_loss'] = float(ctc_loss) losses_np['ctc_loss'] = float(ctc_loss)
# loss backward
if (batch_index + 1) % train_conf.accum_grad != 0:
# Disable gradient synchronizations across DDP processes.
# Within this context, gradients will be accumulated on module
# variables, which will later be synchronized.
context = self.model.no_sync
else:
# Used for single gpu training and DDP gradient synchronization
# processes.
context = nullcontext
with context():
loss.backward()
layer_tools.print_grads(self.model, print_func=None)
# optimizer step
if (batch_index + 1) % train_conf.accum_grad == 0: if (batch_index + 1) % train_conf.accum_grad == 0:
self.optimizer.step() self.optimizer.step()
self.optimizer.clear_grad() self.optimizer.clear_grad()
@ -167,14 +185,11 @@ class U2Trainer(Trainer):
# script_model_path = str(self.checkpoint_dir / 'init') # script_model_path = str(self.checkpoint_dir / 'init')
# paddle.jit.save(script_model, script_model_path) # paddle.jit.save(script_model, script_model_path)
from_scratch = self.resume_or_scratch() self.before_train()
if from_scratch:
# save init model, i.e. 0 epoch
self.save(tag='init')
self.lr_scheduler.step(self.iteration)
logger.info(f"Train Total Examples: {len(self.train_loader.dataset)}") logger.info(f"Train Total Examples: {len(self.train_loader.dataset)}")
while self.epoch < self.config.training.n_epoch: while self.epoch < self.config.training.n_epoch:
with Timer("Epoch-Train Time Cost: {}"):
self.model.train() self.model.train()
try: try:
data_start_time = time.time() data_start_time = time.time()
@ -188,11 +203,13 @@ class U2Trainer(Trainer):
msg += "lr: {:>.8f}, ".format(self.lr_scheduler()) msg += "lr: {:>.8f}, ".format(self.lr_scheduler())
msg += "data time: {:>.3f}s, ".format(dataload_time) msg += "data time: {:>.3f}s, ".format(dataload_time)
self.train_batch(batch_index, batch, msg) self.train_batch(batch_index, batch, msg)
self.after_train_batch()
data_start_time = time.time() data_start_time = time.time()
except Exception as e: except Exception as e:
logger.error(e) logger.error(e)
raise e raise e
with Timer("Eval Time Cost: {}"):
total_loss, num_seen_utts = self.valid() total_loss, num_seen_utts = self.valid()
if dist.get_world_size() > 1: if dist.get_world_size() > 1:
num_seen_utts = paddle.to_tensor(num_seen_utts) num_seen_utts = paddle.to_tensor(num_seen_utts)
@ -300,10 +317,10 @@ class U2Trainer(Trainer):
# model # model
model_conf = config.model model_conf = config.model
model_conf.defrost() with UpdateConfig(model_conf):
model_conf.input_dim = self.train_loader.feat_dim model_conf.input_dim = self.train_loader.feat_dim
model_conf.output_dim = self.train_loader.vocab_size model_conf.output_dim = self.train_loader.vocab_size
model_conf.freeze()
model = U2Model.from_config(model_conf) model = U2Model.from_config(model_conf)
if self.parallel: if self.parallel:
model = paddle.DataParallel(model) model = paddle.DataParallel(model)
@ -429,9 +446,10 @@ class U2Tester(U2Trainer):
len_refs += len_ref len_refs += len_ref
num_ins += 1 num_ins += 1
if fout: if fout:
fout.write(utt + " " + result + "\n") fout.write({"utt": utt, "ref": target, "hyp": result})
logger.info("\nTarget Transcription: %s\nOutput Transcription: %s" % logger.info(f"Utt: {utt}")
(target, result)) logger.info(f"Ref: {target}")
logger.info(f"Hyp: {result}")
logger.info("One example error rate [%s] = %f" % logger.info("One example error rate [%s] = %f" %
(cfg.error_rate_type, error_rate_func(target, result))) (cfg.error_rate_type, error_rate_func(target, result)))
@ -456,7 +474,7 @@ class U2Tester(U2Trainer):
errors_sum, len_refs, num_ins = 0.0, 0, 0 errors_sum, len_refs, num_ins = 0.0, 0, 0
num_frames = 0.0 num_frames = 0.0
num_time = 0.0 num_time = 0.0
with open(self.args.result_file, 'w') as fout: with jsonlines.open(self.args.result_file, 'w') as fout:
for i, batch in enumerate(self.test_loader): for i, batch in enumerate(self.test_loader):
metrics = self.compute_metrics(*batch, fout=fout) metrics = self.compute_metrics(*batch, fout=fout)
num_frames += metrics['num_frames'] num_frames += metrics['num_frames']
@ -526,9 +544,8 @@ class U2Tester(U2Trainer):
self.model.eval() self.model.eval()
logger.info(f"Align Total Examples: {len(self.align_loader.dataset)}") logger.info(f"Align Total Examples: {len(self.align_loader.dataset)}")
stride_ms = self.config.collater.stride_ms stride_ms = self.align_loader.collate_fn.stride_ms
token_dict = self.args.char_list token_dict = self.align_loader.collate_fn.vocab_list
with open(self.args.result_file, 'w') as fout: with open(self.args.result_file, 'w') as fout:
# one example in batch # one example in batch
for i, batch in enumerate(self.align_loader): for i, batch in enumerate(self.align_loader):
@ -537,7 +554,7 @@ class U2Tester(U2Trainer):
# 1. Encoder # 1. Encoder
encoder_out, encoder_mask = self.model._forward_encoder( encoder_out, encoder_mask = self.model._forward_encoder(
feat, feats_length) # (B, maxlen, encoder_dim) feat, feats_length) # (B, maxlen, encoder_dim)
maxlen = encoder_out.size(1) maxlen = encoder_out.shape[1]
ctc_probs = self.model.ctc.log_softmax( ctc_probs = self.model.ctc.log_softmax(
encoder_out) # (1, maxlen, vocab_size) encoder_out) # (1, maxlen, vocab_size)
@ -545,26 +562,25 @@ class U2Tester(U2Trainer):
ctc_probs = ctc_probs.squeeze(0) ctc_probs = ctc_probs.squeeze(0)
target = target.squeeze(0) target = target.squeeze(0)
alignment = ctc_utils.forced_align(ctc_probs, target) alignment = ctc_utils.forced_align(ctc_probs, target)
logger.info("align ids", key[0], alignment) logger.info(f"align ids: {key[0]} {alignment}")
fout.write('{} {}\n'.format(key[0], alignment)) fout.write('{} {}\n'.format(key[0], alignment))
# 3. gen praat # 3. gen praat
# segment alignment # segment alignment
align_segs = text_grid.segment_alignment(alignment) align_segs = text_grid.segment_alignment(alignment)
logger.info("align tokens", key[0], align_segs) logger.info(f"align tokens: {key[0]}, {align_segs}")
# IntervalTier, List["start end token\n"] # IntervalTier, List["start end token\n"]
subsample = utility.get_subsample(self.config) subsample = utility.get_subsample(self.config)
tierformat = text_grid.align_to_tierformat( tierformat = text_grid.align_to_tierformat(
align_segs, subsample, token_dict) align_segs, subsample, token_dict)
# write tier # write tier
align_output_path = os.path.join( align_output_path = Path(self.args.result_file).parent / "align"
os.path.dirname(self.args.result_file), "align") align_output_path.mkdir(parents=True, exist_ok=True)
tier_path = os.path.join(align_output_path, key[0] + ".tier") tier_path = align_output_path / (key[0] + ".tier")
with open(tier_path, 'w') as f: with tier_path.open('w') as f:
f.writelines(tierformat) f.writelines(tierformat)
# write textgrid # write textgrid
textgrid_path = os.path.join(align_output_path, textgrid_path = align_output_path / (key[0] + ".TextGrid")
key[0] + ".TextGrid")
second_per_frame = 1. / (1000. / second_per_frame = 1. / (1000. /
stride_ms) # 25ms window, 10ms stride stride_ms) # 25ms window, 10ms stride
second_per_example = ( second_per_example = (
@ -572,7 +588,7 @@ class U2Tester(U2Trainer):
text_grid.generate_textgrid( text_grid.generate_textgrid(
maxtime=second_per_example, maxtime=second_per_example,
intervals=tierformat, intervals=tierformat,
output=textgrid_path) output=str(textgrid_path))
def run_align(self): def run_align(self):
self.resume_or_scratch() self.resume_or_scratch()
@ -623,7 +639,7 @@ class U2Tester(U2Trainer):
def setup(self): def setup(self):
"""Setup the experiment. """Setup the experiment.
""" """
paddle.set_device(self.args.device) paddle.set_device('gpu' if self.args.nprocs > 0 else 'cpu')
self.setup_output_dir() self.setup_output_dir()
self.setup_checkpointer() self.setup_checkpointer()
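UpdateConfig replaces the explicit defrost()/freeze() pairs in the hunks above. Judging from what it replaces, it is presumably a context manager along these lines; this is a sketch, not the project's actual implementation:

from contextlib import contextmanager
from yacs.config import CfgNode

@contextmanager
def UpdateConfig(config: CfgNode):
    """Temporarily unfreeze a yacs CfgNode so fields can be edited."""
    config.defrost()
    try:
        yield config
    finally:
        config.freeze()

cfg = CfgNode({'model': {'input_dim': -1}})
cfg.freeze()
with UpdateConfig(cfg):
    cfg.model.input_dim = 80
print(cfg.model.input_dim)  # 80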

@ -30,7 +30,7 @@ def main_sp(config, args):
def main(config, args): def main(config, args):
if args.device == "gpu" and args.nprocs > 1: if args.nprocs > 0:
dist.spawn(main_sp, args=(config, args), nprocs=args.nprocs) dist.spawn(main_sp, args=(config, args), nprocs=args.nprocs)
else: else:
main_sp(config, args) main_sp(config, args)

@ -17,9 +17,11 @@ import os
import sys import sys
import time import time
from collections import defaultdict from collections import defaultdict
from contextlib import nullcontext
from pathlib import Path from pathlib import Path
from typing import Optional from typing import Optional
import jsonlines
import numpy as np import numpy as np
import paddle import paddle
from paddle import distributed as dist from paddle import distributed as dist
@ -37,6 +39,7 @@ from deepspeech.io.sampler import SortagradDistributedBatchSampler
from deepspeech.models.u2_st import U2STModel from deepspeech.models.u2_st import U2STModel
from deepspeech.training.gradclip import ClipGradByGlobalNormWithLog from deepspeech.training.gradclip import ClipGradByGlobalNormWithLog
from deepspeech.training.scheduler import WarmupLR from deepspeech.training.scheduler import WarmupLR
from deepspeech.training.timer import Timer
from deepspeech.training.trainer import Trainer from deepspeech.training.trainer import Trainer
from deepspeech.utils import bleu_score from deepspeech.utils import bleu_score
from deepspeech.utils import ctc_utils from deepspeech.utils import ctc_utils
@ -45,6 +48,7 @@ from deepspeech.utils import mp_tools
from deepspeech.utils import text_grid from deepspeech.utils import text_grid
from deepspeech.utils import utility from deepspeech.utils import utility
from deepspeech.utils.log import Log from deepspeech.utils.log import Log
from deepspeech.utils.utility import UpdateConfig
logger = Log(__name__).getlog() logger = Log(__name__).getlog()
@ -83,6 +87,7 @@ class U2STTrainer(Trainer):
def train_batch(self, batch_index, batch_data, msg): def train_batch(self, batch_index, batch_data, msg):
train_conf = self.config.training train_conf = self.config.training
start = time.time() start = time.time()
# forward
utt, audio, audio_len, text, text_len = batch_data utt, audio, audio_len, text, text_len = batch_data
if isinstance(text, list) and isinstance(text_len, list): if isinstance(text, list) and isinstance(text_len, list):
# joint training with ASR. Two decoding texts [translation, transcription] # joint training with ASR. Two decoding texts [translation, transcription]
@ -94,18 +99,30 @@ class U2STTrainer(Trainer):
else: else:
loss, st_loss, attention_loss, ctc_loss = self.model( loss, st_loss, attention_loss, ctc_loss = self.model(
audio, audio_len, text, text_len) audio, audio_len, text, text_len)
# loss div by `batch_size * accum_grad` # loss div by `batch_size * accum_grad`
loss /= train_conf.accum_grad loss /= train_conf.accum_grad
loss.backward()
layer_tools.print_grads(self.model, print_func=None)
losses_np = {'loss': float(loss) * train_conf.accum_grad} losses_np = {'loss': float(loss) * train_conf.accum_grad}
losses_np['st_loss'] = float(st_loss)
if attention_loss: if attention_loss:
losses_np['att_loss'] = float(attention_loss) losses_np['att_loss'] = float(attention_loss)
if ctc_loss: if ctc_loss:
losses_np['ctc_loss'] = float(ctc_loss) losses_np['ctc_loss'] = float(ctc_loss)
# loss backward
if (batch_index + 1) % train_conf.accum_grad != 0:
# Disable gradient synchronizations across DDP processes.
# Within this context, gradients will be accumulated on module
# variables, which will later be synchronized.
context = self.model.no_sync
else:
# Used for single gpu training and DDP gradient synchronization
# processes.
context = nullcontext
with context():
loss.backward()
layer_tools.print_grads(self.model, print_func=None)
# optimizer step
if (batch_index + 1) % train_conf.accum_grad == 0: if (batch_index + 1) % train_conf.accum_grad == 0:
self.optimizer.step() self.optimizer.step()
self.optimizer.clear_grad() self.optimizer.clear_grad()
@ -182,17 +199,11 @@ class U2STTrainer(Trainer):
# script_model_path = str(self.checkpoint_dir / 'init') # script_model_path = str(self.checkpoint_dir / 'init')
# paddle.jit.save(script_model, script_model_path) # paddle.jit.save(script_model, script_model_path)
from_scratch = self.resume_or_scratch() self.before_train()
if from_scratch:
# save init model, i.e. 0 epoch
self.save(tag='init')
self.lr_scheduler.step(self.iteration)
if self.parallel:
self.train_loader.batch_sampler.set_epoch(self.epoch)
logger.info(f"Train Total Examples: {len(self.train_loader.dataset)}") logger.info(f"Train Total Examples: {len(self.train_loader.dataset)}")
while self.epoch < self.config.training.n_epoch: while self.epoch < self.config.training.n_epoch:
with Timer("Epoch-Train Time Cost: {}"):
self.model.train() self.model.train()
try: try:
data_start_time = time.time() data_start_time = time.time()
@ -206,11 +217,13 @@ class U2STTrainer(Trainer):
msg += "lr: {:>.8f}, ".format(self.lr_scheduler()) msg += "lr: {:>.8f}, ".format(self.lr_scheduler())
msg += "data time: {:>.3f}s, ".format(dataload_time) msg += "data time: {:>.3f}s, ".format(dataload_time)
self.train_batch(batch_index, batch, msg) self.train_batch(batch_index, batch, msg)
self.after_train_batch()
data_start_time = time.time() data_start_time = time.time()
except Exception as e: except Exception as e:
logger.error(e) logger.error(e)
raise e raise e
with Timer("Eval Time Cost: {}"):
total_loss, num_seen_utts = self.valid() total_loss, num_seen_utts = self.valid()
if dist.get_world_size() > 1: if dist.get_world_size() > 1:
num_seen_utts = paddle.to_tensor(num_seen_utts) num_seen_utts = paddle.to_tensor(num_seen_utts)
@ -327,10 +340,10 @@ class U2STTrainer(Trainer):
def setup_model(self): def setup_model(self):
config = self.config config = self.config
model_conf = config.model model_conf = config.model
model_conf.defrost() with UpdateConfig(model_conf):
model_conf.input_dim = self.train_loader.collate_fn.feature_size model_conf.input_dim = self.train_loader.collate_fn.feature_size
model_conf.output_dim = self.train_loader.collate_fn.vocab_size model_conf.output_dim = self.train_loader.collate_fn.vocab_size
model_conf.freeze()
model = U2STModel.from_config(model_conf) model = U2STModel.from_config(model_conf)
if self.parallel: if self.parallel:
@ -467,8 +480,10 @@ class U2STTester(U2STTrainer):
len_refs += len(target.split()) len_refs += len(target.split())
num_ins += 1 num_ins += 1
if fout: if fout:
fout.write(utt + " " + result + "\n") fout.write({"utt": utt, "ref": target, "hyp": result})
logger.info("\nReference: %s\nHypothesis: %s" % (target, result)) logger.info(f"Utt: {utt}")
logger.info(f"Ref: {target}")
logger.info(f"Hyp: {result}")
logger.info("One example BLEU = %s" % logger.info("One example BLEU = %s" %
(bleu_func([result], [[target]]).prec_str)) (bleu_func([result], [[target]]).prec_str))
@ -496,7 +511,7 @@ class U2STTester(U2STTrainer):
len_refs, num_ins = 0, 0 len_refs, num_ins = 0, 0
num_frames = 0.0 num_frames = 0.0
num_time = 0.0 num_time = 0.0
with open(self.args.result_file, 'w') as fout: with jsonlines.open(self.args.result_file, 'w') as fout:
for i, batch in enumerate(self.test_loader): for i, batch in enumerate(self.test_loader):
metrics = self.compute_translation_metrics( metrics = self.compute_translation_metrics(
*batch, bleu_func=bleu_func, fout=fout) *batch, bleu_func=bleu_func, fout=fout)
@ -569,7 +584,7 @@ class U2STTester(U2STTrainer):
# 1. Encoder # 1. Encoder
encoder_out, encoder_mask = self.model._forward_encoder( encoder_out, encoder_mask = self.model._forward_encoder(
feat, feats_length) # (B, maxlen, encoder_dim) feat, feats_length) # (B, maxlen, encoder_dim)
maxlen = encoder_out.size(1) maxlen = encoder_out.shape[1]
ctc_probs = self.model.ctc.log_softmax( ctc_probs = self.model.ctc.log_softmax(
encoder_out) # (1, maxlen, vocab_size) encoder_out) # (1, maxlen, vocab_size)
@ -577,26 +592,25 @@ class U2STTester(U2STTrainer):
ctc_probs = ctc_probs.squeeze(0) ctc_probs = ctc_probs.squeeze(0)
target = target.squeeze(0) target = target.squeeze(0)
alignment = ctc_utils.forced_align(ctc_probs, target) alignment = ctc_utils.forced_align(ctc_probs, target)
logger.info("align ids", key[0], alignment) logger.info(f"align ids: {key[0]} {alignment}")
fout.write('{} {}\n'.format(key[0], alignment)) fout.write('{} {}\n'.format(key[0], alignment))
# 3. gen praat # 3. gen praat
# segment alignment # segment alignment
align_segs = text_grid.segment_alignment(alignment) align_segs = text_grid.segment_alignment(alignment)
logger.info("align tokens", key[0], align_segs) logger.info(f"align tokens: {key[0]}, {align_segs}")
# IntervalTier, List["start end token\n"] # IntervalTier, List["start end token\n"]
subsample = utility.get_subsample(self.config) subsample = utility.get_subsample(self.config)
tierformat = text_grid.align_to_tierformat( tierformat = text_grid.align_to_tierformat(
align_segs, subsample, token_dict) align_segs, subsample, token_dict)
# write tier # write tier
align_output_path = os.path.join( align_output_path = Path(self.args.result_file).parent / "align"
os.path.dirname(self.args.result_file), "align") align_output_path.mkdir(parents=True, exist_ok=True)
tier_path = os.path.join(align_output_path, key[0] + ".tier") tier_path = align_output_path / (key[0] + ".tier")
with open(tier_path, 'w') as f: with tier_path.open('w') as f:
f.writelines(tierformat) f.writelines(tierformat)
# write textgrid # write textgrid
textgrid_path = os.path.join(align_output_path, textgrid_path = align_output_path / (key[0] + ".TextGrid")
key[0] + ".TextGrid")
second_per_frame = 1. / (1000. / second_per_frame = 1. / (1000. /
stride_ms) # 25ms window, 10ms stride stride_ms) # 25ms window, 10ms stride
second_per_example = ( second_per_example = (
@ -604,7 +618,7 @@ class U2STTester(U2STTrainer):
text_grid.generate_textgrid( text_grid.generate_textgrid(
maxtime=second_per_example, maxtime=second_per_example,
intervals=tierformat, intervals=tierformat,
output=textgrid_path) output=str(textgrid_path))
def run_align(self): def run_align(self):
self.resume_or_scratch() self.resume_or_scratch()
@ -650,7 +664,7 @@ class U2STTester(U2STTrainer):
def setup(self): def setup(self):
"""Setup the experiment. """Setup the experiment.
""" """
paddle.set_device(self.args.device) paddle.set_device('gpu' if self.args.nprocs > 0 else 'cpu')
self.setup_output_dir() self.setup_output_dir()
self.setup_checkpointer() self.setup_checkpointer()
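With the switch to jsonlines, every line of the result file is now a JSON object with utt/ref/hyp fields, so it can be inspected or re-scored after the fact. The path below is a placeholder for whatever --result_file was set to:

import jsonlines

with jsonlines.open('exp/default/test.rsl') as reader:
    for rec in reader:
        print(rec["utt"], "|", rec["ref"], "->", rec["hyp"])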

@ -196,7 +196,12 @@ class TextFeaturizer():
[(idx, token) for (idx, token) in enumerate(vocab_list)]) [(idx, token) for (idx, token) in enumerate(vocab_list)])
token2id = dict( token2id = dict(
[(token, idx) for (idx, token) in enumerate(vocab_list)]) [(token, idx) for (idx, token) in enumerate(vocab_list)])
if UNK in vocab_list:
unk_id = vocab_list.index(UNK) unk_id = vocab_list.index(UNK)
else:
unk_id = -1
if EOS in vocab_list:
eos_id = vocab_list.index(EOS) eos_id = vocab_list.index(EOS)
else:
eos_id = -1
return token2id, id2token, vocab_list, unk_id, eos_id return token2id, id2token, vocab_list, unk_id, eos_id
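The new UNK/EOS handling above falls back to -1 when a symbol is missing from the vocabulary. A small self-contained example with a toy vocab; the UNK/EOS constants here are placeholders for the featurizer's own:

UNK = '<unk>'
EOS = '</s>'

vocab_list = ['<blank>', '<unk>', 'a', 'b', 'c']
token2id = {token: idx for idx, token in enumerate(vocab_list)}
id2token = dict(enumerate(vocab_list))
unk_id = vocab_list.index(UNK) if UNK in vocab_list else -1
eos_id = vocab_list.index(EOS) if EOS in vocab_list else -1
print(unk_id, eos_id)  # 1 -1  (this toy vocab has <unk> but no </s>)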

@ -130,7 +130,8 @@ class FeatureNormalizer(object):
def _read_mean_std_from_file(self, filepath, eps=1e-20): def _read_mean_std_from_file(self, filepath, eps=1e-20):
"""Load mean and std from file.""" """Load mean and std from file."""
mean, istd = load_cmvn(filepath, filetype='json') filetype = filepath.split(".")[-1]
mean, istd = load_cmvn(filepath, filetype=filetype)
self._mean = np.expand_dims(mean, axis=0) self._mean = np.expand_dims(mean, axis=0)
self._istd = np.expand_dims(istd, axis=0) self._istd = np.expand_dims(istd, axis=0)

@ -12,13 +12,13 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
"""Contains data helper functions.""" """Contains data helper functions."""
import codecs
import json import json
import math import math
from typing import List from typing import List
from typing import Optional from typing import Optional
from typing import Text from typing import Text
import jsonlines
import numpy as np import numpy as np
from deepspeech.utils.log import Log from deepspeech.utils.log import Log
@ -92,12 +92,8 @@ def read_manifest(
""" """
manifest = [] manifest = []
for json_line in codecs.open(manifest_path, 'r', 'utf-8'): with jsonlines.open(manifest_path, 'r') as reader:
try: for json_data in reader:
json_data = json.loads(json_line)
except Exception as e:
raise IOError("Error reading manifest: %s" % str(e))
feat_len = json_data["feat_shape"][ feat_len = json_data["feat_shape"][
0] if 'feat_shape' in json_data else 1.0 0] if 'feat_shape' in json_data else 1.0
token_len = json_data["token_shape"][ token_len = json_data["token_shape"][
@ -284,6 +280,13 @@ def load_cmvn(cmvn_file: str, filetype: str):
cmvn = _load_json_cmvn(cmvn_file) cmvn = _load_json_cmvn(cmvn_file)
elif filetype == "kaldi": elif filetype == "kaldi":
cmvn = _load_kaldi_cmvn(cmvn_file) cmvn = _load_kaldi_cmvn(cmvn_file)
elif filetype == "npz":
eps = 1e-14
npzfile = np.load(cmvn_file)
mean = np.squeeze(npzfile["mean"])
std = np.squeeze(npzfile["std"])
istd = 1 / (std + eps)
cmvn = [mean, istd]
else: else:
raise ValueError(f"cmvn file type no support: {filetype}") raise ValueError(f"cmvn file type no support: {filetype}")
return cmvn[0], cmvn[1] return cmvn[0], cmvn[1]
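The new 'npz' branch of load_cmvn only requires an archive with 'mean' and 'std' arrays. A sketch of producing and reading back a compatible file, with random statistics standing in for real ones computed over a manifest:

import numpy as np

mean = np.random.rand(1, 80).astype('float32')
std = np.random.rand(1, 80).astype('float32') + 0.5
np.savez('mean_std.npz', mean=mean, std=std)

npzfile = np.load('mean_std.npz')
istd = 1.0 / (np.squeeze(npzfile['std']) + 1e-14)
print(np.squeeze(npzfile['mean']).shape, istd.shape)  # (80,) (80,)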

@ -292,10 +292,6 @@ class SpeechCollator():
olens = np.array(text_lens).astype(np.int64) olens = np.array(text_lens).astype(np.int64)
return utts, xs_pad, ilens, ys_pad, olens return utts, xs_pad, ilens, ys_pad, olens
@property
def manifest(self):
return self._manifest
@property @property
def vocab_size(self): def vocab_size(self):
return self._speech_featurizer.vocab_size return self._speech_featurizer.vocab_size

@ -44,7 +44,7 @@ def feat_dim_and_vocab_size(data_json: List[Dict[Text, Any]],
def batch_collate(x): def batch_collate(x):
"""de-tuple. """de-minibatch, since user compose batch.
Args: Args:
x (List[Tuple]): [(utts, xs, ilens, ys, olens)] x (List[Tuple]): [(utts, xs, ilens, ys, olens)]
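batch_collate's body is not shown here; given that the dataset composes whole minibatches itself, it presumably just unwraps the one-element list produced by a batch_size=1 DataLoader. A guessed sketch, not the repository's actual definition:

def batch_collate(x):
    """The dataset item is already a full minibatch; a batch_size=1
    DataLoader wraps it in a one-element list, so unwrap it."""
    return x[0]

print(batch_collate([("utt1", "feat1", "text1")]))  # ('utt1', 'feat1', 'text1')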

@ -147,3 +147,131 @@ class TransformDataset(Dataset):
def __getitem__(self, idx): def __getitem__(self, idx):
"""[] operator.""" """[] operator."""
return self.converter([self.reader(self.data[idx], return_uttid=True)]) return self.converter([self.reader(self.data[idx], return_uttid=True)])
class AudioDataset(Dataset):
def __init__(self,
data_file,
max_length=10240,
min_length=0,
token_max_length=200,
token_min_length=1,
batch_type='static',
batch_size=1,
max_frames_in_batch=0,
sort=True,
raw_wav=True,
stride_ms=10):
"""Dataset for loading audio data.
Attributes:
data_file: input data file
Plain text data file, each line contains following 7 fields,
which is split by '\t':
utt:utt1
feat:tmp/data/file1.wav or feat:tmp/data/fbank.ark:30
feat_shape: 4.95(in seconds) or feat_shape:495,80(495 is in frames)
text:i love you
token: i <space> l o v e <space> y o u
tokenid: int id of this token
token_shape: M,N # M is the number of token, N is vocab size
max_length: drop utterances longer than max_length, in units of 10ms frames.
min_length: drop utterances shorter than min_length, in units of 10ms frames.
token_max_length: drop utterances whose token sequence is longer than
token_max_length, especially when using char units for English modeling.
token_min_length: drop utterances whose token sequence is shorter than token_min_length.
batch_type: 'static' or 'dynamic'; see max_frames_in_batch (dynamic).
batch_size: number of utterances in a batch; used for static batching.
max_frames_in_batch: maximum number of feature frames in a batch;
used for dynamic batching when batch_type is 'dynamic'.
batch_size is then ignored and the batch keeps filling
until its total frame count reaches max_frames_in_batch.
sort: whether to sort all data so that utterances of similar
length end up in the same batch.
raw_wav: use raw waveforms or pre-extracted features.
With raw waveforms, dynamic waveform-level augmentation can be used
and features are extracted on the fly.
With pre-extracted features (e.g. by kaldi), only feature-level
augmentation such as SpecAug can be used.
"""
assert batch_type in ['static', 'dynamic']
# read manifest
data = read_manifest(data_file)
if sort:
data = sorted(data, key=lambda x: x["feat_shape"][0])
if raw_wav:
# raw-wav manifests must not point at pre-extracted kaldi ark/scp features
feat_path = data[0]['feat'].split(':')[0]
assert not feat_path.endswith(('.ark', '.scp'))
# convert duration (seconds) into a frame count, one frame per stride_ms
for x in data:
    x['feat_shape'][0] = float(x['feat_shape'][0]) * 1000 / stride_ms
self.input_dim = data[0]['feat_shape'][1]
self.output_dim = data[0]['token_shape'][1]
# with open(data_file, 'r') as f:
# for line in f:
# arr = line.strip().split('\t')
# if len(arr) != 7:
# continue
# key = arr[0].split(':')[1]
# tokenid = arr[5].split(':')[1]
# output_dim = int(arr[6].split(':')[1].split(',')[1])
# if raw_wav:
# wav_path = ':'.join(arr[1].split(':')[1:])
# duration = int(float(arr[2].split(':')[1]) * 1000 / 10)
# data.append((key, wav_path, duration, tokenid))
# else:
# feat_ark = ':'.join(arr[1].split(':')[1:])
# feat_info = arr[2].split(':')[1].split(',')
# feat_dim = int(feat_info[1].strip())
# num_frames = int(feat_info[0].strip())
# data.append((key, feat_ark, num_frames, tokenid))
# self.input_dim = feat_dim
# self.output_dim = output_dim
valid_data = []
for i in range(len(data)):
length = data[i]['feat_shape'][0]
token_length = data[i]['token_shape'][0]
# remove utterances that are too long or too short on either input or output
# to prevent from out of memory
if length > max_length or length < min_length:
# logging.warn('ignore utterance {} feature {}'.format(
# data[i][0], length))
pass
elif token_length > token_max_length or token_length < token_min_length:
pass
else:
valid_data.append(data[i])
data = valid_data
self.minibatch = []
num_data = len(data)
# Dynamic batch size
if batch_type == 'dynamic':
assert (max_frames_in_batch > 0)
self.minibatch.append([])
num_frames_in_batch = 0
for i in range(num_data):
length = data[i]['feat_shape'][0]
num_frames_in_batch += length
if num_frames_in_batch > max_frames_in_batch:
self.minibatch.append([])
num_frames_in_batch = length
self.minibatch[-1].append(data[i])
# Static batch size
else:
cur = 0
while cur < num_data:
end = min(cur + batch_size, num_data)
item = []
for i in range(cur, end):
item.append(data[i])
self.minibatch.append(item)
cur = end
def __len__(self):
return len(self.minibatch)
def __getitem__(self, idx):
# each item is a pre-composed minibatch (a list of manifest dicts)
instance = self.minibatch[idx]
return ([x["utt"] for x in instance], [x["feat"] for x in instance],
        [x["text"] for x in instance])
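The dynamic-batching branch above keeps appending utterances until the running frame count would exceed max_frames_in_batch, then starts a new minibatch. The same grouping logic in a standalone, runnable form (dynamic_minibatches is an illustrative helper, not part of the dataset):

def dynamic_minibatches(lengths, max_frames_in_batch):
    minibatch, batches, num_frames = [], [], 0
    for length in lengths:
        num_frames += length
        if minibatch and num_frames > max_frames_in_batch:
            # current batch is full: flush it and start a new one
            batches.append(minibatch)
            minibatch, num_frames = [], length
        minibatch.append(length)
    if minibatch:
        batches.append(minibatch)
    return batches

print(dynamic_minibatches([100, 200, 300, 400, 500], max_frames_in_batch=700))
# [[100, 200, 300], [400], [500]]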

@ -106,11 +106,9 @@ class ConvBn(nn.Layer):
# reset padding part to 0 # reset padding part to 0
masks = make_non_pad_mask(x_len) #[B, T] masks = make_non_pad_mask(x_len) #[B, T]
masks = masks.unsqueeze(1).unsqueeze(1) # [B, 1, 1, T] masks = masks.unsqueeze(1).unsqueeze(1) # [B, 1, 1, T]
# TODO(Hui Zhang): not support bool multiply # https://github.com/PaddlePaddle/Paddle/pull/29265
# masks = masks.type_as(x) # rhs will type promote to lhs
masks = masks.astype(x.dtype) x = x * masks
x = x.multiply(masks)
return x, x_len return x, x_len
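The masking change above multiplies activations by a boolean non-pad mask directly and lets type promotion handle the cast. A numpy stand-in illustrating what such a mask looks like and how the multiply zeroes padded frames (the project's own make_non_pad_mask operates on paddle tensors):

import numpy as np

def make_non_pad_mask(lengths, max_len):
    """True for real frames, False for padding: shape [B, T]."""
    steps = np.arange(max_len)                    # [T]
    return steps[None, :] < np.asarray(lengths)[:, None]

x = np.ones((2, 4, 3))                            # [B, T, D] batch with padding
masks = make_non_pad_mask([4, 2], max_len=4)      # [B, T]
x = x * masks[:, :, None]                         # bool mask promotes to float
print(x[1])                                       # frames 2..3 of the short utterance are zeroed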

@@ -128,8 +128,8 @@ class DeepSpeech2Model(nn.Layer):
                 num_rnn_layers=3,  #Number of stacking RNN layers.
                 rnn_layer_size=1024,  #RNN layer size (number of RNN cells).
                 use_gru=True,  #Use gru if set True. Use simple rnn if set False.
-                share_rnn_weights=True  #Whether to share input-hidden weights between forward and backward directional RNNs.Notice that for GRU, weight sharing is not supported.
-            ))
+                share_rnn_weights=True,  #Whether to share input-hidden weights between forward and backward directional RNNs.Notice that for GRU, weight sharing is not supported.
+                ctc_grad_norm_type='instance', ))
         if config is not None:
             config.merge_from_other_cfg(default)
         return default
@@ -141,7 +141,9 @@ class DeepSpeech2Model(nn.Layer):
                 num_rnn_layers=3,
                 rnn_size=1024,
                 use_gru=False,
-                share_rnn_weights=True):
+                share_rnn_weights=True,
+                blank_id=0,
+                ctc_grad_norm_type='instance'):
         super().__init__()
         self.encoder = CRNNEncoder(
             feat_size=feat_size,
@@ -156,10 +158,11 @@ class DeepSpeech2Model(nn.Layer):
         self.decoder = CTCDecoder(
             odim=dict_size,  # <blank> is in vocab
             enc_n_units=self.encoder.output_size,
-            blank_id=0,  # first token is <blank>
+            blank_id=blank_id,
             dropout_rate=0.0,
             reduction=True,  # sum
-            batch_average=True)  # sum / batch_size
+            batch_average=True,  # sum / batch_size
+            grad_norm_type=ctc_grad_norm_type)

     def forward(self, audio, audio_len, text, text_len):
         """Compute Model loss
@@ -221,7 +224,8 @@ class DeepSpeech2Model(nn.Layer):
             num_rnn_layers=config.model.num_rnn_layers,
             rnn_size=config.model.rnn_layer_size,
             use_gru=config.model.use_gru,
-            share_rnn_weights=config.model.share_rnn_weights)
+            share_rnn_weights=config.model.share_rnn_weights,
+            blank_id=config.model.blank_id)
         infos = Checkpoint().load_parameters(
             model, checkpoint_path=checkpoint_path)
         logger.info(f"checkpoint info: {infos}")
@@ -246,7 +250,8 @@ class DeepSpeech2Model(nn.Layer):
             num_rnn_layers=config.num_rnn_layers,
             rnn_size=config.rnn_layer_size,
             use_gru=config.use_gru,
-            share_rnn_weights=config.share_rnn_weights)
+            share_rnn_weights=config.share_rnn_weights,
+            blank_id=config.blank_id)
         return model
@@ -258,7 +263,8 @@ class DeepSpeech2InferModel(DeepSpeech2Model):
                 num_rnn_layers=3,
                 rnn_size=1024,
                 use_gru=False,
-                share_rnn_weights=True):
+                share_rnn_weights=True,
+                blank_id=0):
         super().__init__(
             feat_size=feat_size,
             dict_size=dict_size,
@@ -266,7 +272,8 @@ class DeepSpeech2InferModel(DeepSpeech2Model):
             num_rnn_layers=num_rnn_layers,
             rnn_size=rnn_size,
             use_gru=use_gru,
-            share_rnn_weights=share_rnn_weights)
+            share_rnn_weights=share_rnn_weights,
+            blank_id=blank_id)

     def forward(self, audio, audio_len):
         """export model function

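For reference, a hedged construction sketch showing where the two new arguments end up; feat_size and dict_size values are placeholders, and omitted arguments keep the defaults from the hunks above.

model = DeepSpeech2Model(
    feat_size=161,                   # placeholder feature dim
    dict_size=4233,                  # placeholder vocab size
    num_rnn_layers=3,
    rnn_size=1024,
    use_gru=False,
    share_rnn_weights=True,
    blank_id=0,                      # forwarded to CTCDecoder(blank_id=...)
    ctc_grad_norm_type='instance')   # forwarded to CTCDecoder(grad_norm_type=...)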
@@ -308,7 +308,8 @@ class RNNStack(nn.Layer):
             x, x_len = rnn(x, x_len)
             masks = make_non_pad_mask(x_len)  #[B, T]
             masks = masks.unsqueeze(-1)  # [B, T, 1]
-            # TODO(Hui Zhang): not support bool multiply
-            masks = masks.astype(x.dtype)
-            x = x.multiply(masks)
+            # https://github.com/PaddlePaddle/Paddle/pull/29265
+            # rhs will type promote to lhs
+            x = x * masks
         return x, x_len

@@ -254,6 +254,7 @@ class DeepSpeech2ModelOnline(nn.Layer):
                 num_fc_layers=2,
                 fc_layers_size_list=[512, 256],
                 use_gru=True,  #Use gru if set True. Use simple rnn if set False.
+                blank_id=0,  # index of blank in vocab.txt
             ))
         if config is not None:
             config.merge_from_other_cfg(default)
@@ -268,7 +269,8 @@ class DeepSpeech2ModelOnline(nn.Layer):
                 rnn_direction='forward',
                 num_fc_layers=2,
                 fc_layers_size_list=[512, 256],
-                use_gru=False):
+                use_gru=False,
+                blank_id=0):
         super().__init__()
         self.encoder = CRNNEncoder(
             feat_size=feat_size,
@@ -284,10 +286,11 @@ class DeepSpeech2ModelOnline(nn.Layer):
         self.decoder = CTCDecoder(
             odim=dict_size,  # <blank> is in vocab
             enc_n_units=self.encoder.output_size,
-            blank_id=0,  # first token is <blank>
+            blank_id=blank_id,
             dropout_rate=0.0,
             reduction=True,  # sum
-            batch_average=True)  # sum / batch_size
+            batch_average=True,  # sum / batch_size
+            grad_norm_type='instance')

     def forward(self, audio, audio_len, text, text_len):
         """Compute Model loss
@@ -353,7 +356,8 @@ class DeepSpeech2ModelOnline(nn.Layer):
             rnn_direction=config.model.rnn_direction,
             num_fc_layers=config.model.num_fc_layers,
             fc_layers_size_list=config.model.fc_layers_size_list,
-            use_gru=config.model.use_gru)
+            use_gru=config.model.use_gru,
+            blank_id=config.model.blank_id)
         infos = Checkpoint().load_parameters(
             model, checkpoint_path=checkpoint_path)
         logger.info(f"checkpoint info: {infos}")
@@ -380,7 +384,8 @@ class DeepSpeech2ModelOnline(nn.Layer):
             rnn_direction=config.rnn_direction,
             num_fc_layers=config.num_fc_layers,
             fc_layers_size_list=config.fc_layers_size_list,
-            use_gru=config.use_gru)
+            use_gru=config.use_gru,
+            blank_id=config.blank_id)
         return model
@@ -394,7 +399,8 @@ class DeepSpeech2InferModelOnline(DeepSpeech2ModelOnline):
                 rnn_direction='forward',
                 num_fc_layers=2,
                 fc_layers_size_list=[512, 256],
-                use_gru=False):
+                use_gru=False,
+                blank_id=0):
         super().__init__(
             feat_size=feat_size,
             dict_size=dict_size,
@@ -404,7 +410,8 @@ class DeepSpeech2InferModelOnline(DeepSpeech2ModelOnline):
             rnn_direction=rnn_direction,
             num_fc_layers=num_fc_layers,
             fc_layers_size_list=fc_layers_size_list,
-            use_gru=use_gru)
+            use_gru=use_gru,
+            blank_id=blank_id)

     def forward(self, audio_chunk, audio_chunk_lens, chunk_state_h_box,
                 chunk_state_c_box):

@ -0,0 +1,19 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .u2 import U2InferModel
from .u2 import U2Model
from .updater import U2Evaluator
from .updater import U2Updater
__all__ = ["U2Model", "U2InferModel", "U2Evaluator", "U2Updater"]

@ -48,6 +48,7 @@ from deepspeech.utils.tensor_utils import add_sos_eos
from deepspeech.utils.tensor_utils import pad_sequence from deepspeech.utils.tensor_utils import pad_sequence
from deepspeech.utils.tensor_utils import th_accuracy from deepspeech.utils.tensor_utils import th_accuracy
from deepspeech.utils.utility import log_add from deepspeech.utils.utility import log_add
from deepspeech.utils.utility import UpdateConfig
__all__ = ["U2Model", "U2InferModel"] __all__ = ["U2Model", "U2InferModel"]
@ -115,7 +116,8 @@ class U2BaseModel(nn.Layer):
ctc_weight: float=0.5, ctc_weight: float=0.5,
ignore_id: int=IGNORE_ID, ignore_id: int=IGNORE_ID,
lsm_weight: float=0.0, lsm_weight: float=0.0,
length_normalized_loss: bool=False): length_normalized_loss: bool=False,
**kwargs):
assert 0.0 <= ctc_weight <= 1.0, ctc_weight assert 0.0 <= ctc_weight <= 1.0, ctc_weight
super().__init__() super().__init__()
@@ -162,10 +164,7 @@ class U2BaseModel(nn.Layer):
         encoder_out, encoder_mask = self.encoder(speech, speech_lengths)
         encoder_time = time.time() - start
         #logger.debug(f"encoder time: {encoder_time}")
-        #TODO(Hui Zhang): sum not support bool type
-        #encoder_out_lens = encoder_mask.squeeze(1).sum(1)  #[B, 1, T] -> [B]
-        encoder_out_lens = encoder_mask.squeeze(1).cast(paddle.int64).sum(
-            1)  #[B, 1, T] -> [B]
+        encoder_out_lens = encoder_mask.squeeze(1).sum(1)  #[B, 1, T] -> [B]

         # 2a. Attention-decoder branch
         loss_att = None
@ -299,8 +298,8 @@ class U2BaseModel(nn.Layer):
speech, speech_lengths, decoding_chunk_size, speech, speech_lengths, decoding_chunk_size,
num_decoding_left_chunks, num_decoding_left_chunks,
simulate_streaming) # (B, maxlen, encoder_dim) simulate_streaming) # (B, maxlen, encoder_dim)
maxlen = encoder_out.size(1) maxlen = encoder_out.shape[1]
encoder_dim = encoder_out.size(2) encoder_dim = encoder_out.shape[2]
running_size = batch_size * beam_size running_size = batch_size * beam_size
encoder_out = encoder_out.unsqueeze(1).repeat(1, beam_size, 1, 1).view( encoder_out = encoder_out.unsqueeze(1).repeat(1, beam_size, 1, 1).view(
running_size, maxlen, encoder_dim) # (B*N, maxlen, encoder_dim) running_size, maxlen, encoder_dim) # (B*N, maxlen, encoder_dim)
@ -320,8 +319,7 @@ class U2BaseModel(nn.Layer):
# 2. Decoder forward step by step # 2. Decoder forward step by step
for i in range(1, maxlen + 1): for i in range(1, maxlen + 1):
# Stop if all batch and all beam produce eos # Stop if all batch and all beam produce eos
# TODO(Hui Zhang): if end_flag.sum() == running_size: if end_flag.sum() == running_size:
if end_flag.cast(paddle.int64).sum() == running_size:
break break
# 2.1 Forward decoder step # 2.1 Forward decoder step
@ -406,10 +404,8 @@ class U2BaseModel(nn.Layer):
encoder_out, encoder_mask = self._forward_encoder( encoder_out, encoder_mask = self._forward_encoder(
speech, speech_lengths, decoding_chunk_size, speech, speech_lengths, decoding_chunk_size,
num_decoding_left_chunks, simulate_streaming) num_decoding_left_chunks, simulate_streaming)
maxlen = encoder_out.size(1) maxlen = encoder_out.shape[1]
# (TODO Hui Zhang): bool no support reduce_sum encoder_out_lens = encoder_mask.squeeze(1).sum(1)
# encoder_out_lens = encoder_mask.squeeze(1).sum(1)
encoder_out_lens = encoder_mask.squeeze(1).astype(paddle.int).sum(1)
ctc_probs = self.ctc.log_softmax(encoder_out) # (B, maxlen, vocab_size) ctc_probs = self.ctc.log_softmax(encoder_out) # (B, maxlen, vocab_size)
topk_prob, topk_index = ctc_probs.topk(1, axis=2) # (B, maxlen, 1) topk_prob, topk_index = ctc_probs.topk(1, axis=2) # (B, maxlen, 1)
@ -459,7 +455,7 @@ class U2BaseModel(nn.Layer):
speech, speech_lengths, decoding_chunk_size, speech, speech_lengths, decoding_chunk_size,
num_decoding_left_chunks, num_decoding_left_chunks,
simulate_streaming) # (B, maxlen, encoder_dim) simulate_streaming) # (B, maxlen, encoder_dim)
maxlen = encoder_out.size(1) maxlen = encoder_out.shape[1]
ctc_probs = self.ctc.log_softmax(encoder_out) # (1, maxlen, vocab_size) ctc_probs = self.ctc.log_softmax(encoder_out) # (1, maxlen, vocab_size)
ctc_probs = ctc_probs.squeeze(0) ctc_probs = ctc_probs.squeeze(0)
@ -587,7 +583,7 @@ class U2BaseModel(nn.Layer):
encoder_out = encoder_out.repeat(beam_size, 1, 1) encoder_out = encoder_out.repeat(beam_size, 1, 1)
encoder_mask = paddle.ones( encoder_mask = paddle.ones(
(beam_size, 1, encoder_out.size(1)), dtype=paddle.bool) (beam_size, 1, encoder_out.shape[1]), dtype=paddle.bool)
decoder_out, _ = self.decoder( decoder_out, _ = self.decoder(
encoder_out, encoder_mask, hyps_pad, encoder_out, encoder_mask, hyps_pad,
hyps_lens) # (beam_size, max_hyps_len, vocab_size) hyps_lens) # (beam_size, max_hyps_len, vocab_size)
@ -667,9 +663,7 @@ class U2BaseModel(nn.Layer):
xs, offset, required_cache_size, subsampling_cache, xs, offset, required_cache_size, subsampling_cache,
elayers_output_cache, conformer_cnn_cache) elayers_output_cache, conformer_cnn_cache)
# @jit.to_static([ # @jit.to_static
# paddle.static.InputSpec(shape=[1, None, feat_dim],dtype='float32'), # audio feat, [B,T,D]
# ])
def ctc_activation(self, xs: paddle.Tensor) -> paddle.Tensor: def ctc_activation(self, xs: paddle.Tensor) -> paddle.Tensor:
""" Export interface for c++ call, apply linear transform and log """ Export interface for c++ call, apply linear transform and log
softmax before ctc softmax before ctc
@ -696,13 +690,13 @@ class U2BaseModel(nn.Layer):
Returns: Returns:
paddle.Tensor: decoder output, (B, L) paddle.Tensor: decoder output, (B, L)
""" """
assert encoder_out.size(0) == 1 assert encoder_out.shape[0] == 1
num_hyps = hyps.size(0) num_hyps = hyps.shape[0]
assert hyps_lens.size(0) == num_hyps assert hyps_lens.shape[0] == num_hyps
encoder_out = encoder_out.repeat(num_hyps, 1, 1) encoder_out = encoder_out.repeat(num_hyps, 1, 1)
# (B, 1, T) # (B, 1, T)
encoder_mask = paddle.ones( encoder_mask = paddle.ones(
[num_hyps, 1, encoder_out.size(1)], dtype=paddle.bool) [num_hyps, 1, encoder_out.shape[1]], dtype=paddle.bool)
# (num_hyps, max_hyps_len, vocab_size) # (num_hyps, max_hyps_len, vocab_size)
decoder_out, _ = self.decoder(encoder_out, encoder_mask, hyps, decoder_out, _ = self.decoder(encoder_out, encoder_mask, hyps,
hyps_lens) hyps_lens)
@ -757,7 +751,7 @@ class U2BaseModel(nn.Layer):
Returns: Returns:
List[List[int]]: transcripts. List[List[int]]: transcripts.
""" """
batch_size = feats.size(0) batch_size = feats.shape[0]
if decoding_method in ['ctc_prefix_beam_search', if decoding_method in ['ctc_prefix_beam_search',
'attention_rescoring'] and batch_size > 1: 'attention_rescoring'] and batch_size > 1:
logger.fatal( logger.fatal(
@ -785,7 +779,7 @@ class U2BaseModel(nn.Layer):
# result in List[int], change it to List[List[int]] for compatible # result in List[int], change it to List[List[int]] for compatible
# with other batch decoding mode # with other batch decoding mode
elif decoding_method == 'ctc_prefix_beam_search': elif decoding_method == 'ctc_prefix_beam_search':
assert feats.size(0) == 1 assert feats.shape[0] == 1
hyp = self.ctc_prefix_beam_search( hyp = self.ctc_prefix_beam_search(
feats, feats,
feats_lengths, feats_lengths,
@ -795,7 +789,7 @@ class U2BaseModel(nn.Layer):
simulate_streaming=simulate_streaming) simulate_streaming=simulate_streaming)
hyps = [hyp] hyps = [hyp]
elif decoding_method == 'attention_rescoring': elif decoding_method == 'attention_rescoring':
assert feats.size(0) == 1 assert feats.shape[0] == 1
hyp = self.attention_rescoring( hyp = self.attention_rescoring(
feats, feats,
feats_lengths, feats_lengths,
@ -836,6 +830,7 @@ class U2Model(U2BaseModel):
Returns: Returns:
int, nn.Layer, nn.Layer, nn.Layer: vocab size, encoder, decoder, ctc int, nn.Layer, nn.Layer, nn.Layer: vocab size, encoder, decoder, ctc
""" """
# cmvn
if configs['cmvn_file'] is not None: if configs['cmvn_file'] is not None:
mean, istd = load_cmvn(configs['cmvn_file'], mean, istd = load_cmvn(configs['cmvn_file'],
configs['cmvn_file_type']) configs['cmvn_file_type'])
@ -845,11 +840,13 @@ class U2Model(U2BaseModel):
else: else:
global_cmvn = None global_cmvn = None
# input & output dim
input_dim = configs['input_dim'] input_dim = configs['input_dim']
vocab_size = configs['output_dim'] vocab_size = configs['output_dim']
assert input_dim != 0, input_dim assert input_dim != 0, input_dim
assert vocab_size != 0, vocab_size assert vocab_size != 0, vocab_size
# encoder
encoder_type = configs.get('encoder', 'transformer') encoder_type = configs.get('encoder', 'transformer')
logger.info(f"U2 Encoder type: {encoder_type}") logger.info(f"U2 Encoder type: {encoder_type}")
if encoder_type == 'transformer': if encoder_type == 'transformer':
@@ -861,16 +858,21 @@ class U2Model(U2BaseModel):
         else:
             raise ValueError(f"not support encoder type:{encoder_type}")

+        # decoder
         decoder = TransformerDecoder(vocab_size,
                                      encoder.output_size(),
                                      **configs['decoder_conf'])
+
+        # ctc decoder and ctc loss
+        model_conf = configs['model_conf']
         ctc = CTCDecoder(
             odim=vocab_size,
             enc_n_units=encoder.output_size(),
             blank_id=0,
-            dropout_rate=0.0,
+            dropout_rate=model_conf['ctc_dropoutrate'],
             reduction=True,  # sum
-            batch_average=True)  # sum / batch_size
+            batch_average=True,  # sum / batch_size
+            grad_norm_type=model_conf['ctc_grad_norm_type'])

         return vocab_size, encoder, decoder, ctc
@@ -902,10 +904,10 @@ class U2Model(U2BaseModel):
         Returns:
             DeepSpeech2Model: The model built from pretrained result.
         """
-        config.defrost()
-        config.input_dim = dataloader.collate_fn.feature_size
-        config.output_dim = dataloader.collate_fn.vocab_size
-        config.freeze()
+        with UpdateConfig(config):
+            config.input_dim = dataloader.collate_fn.feature_size
+            config.output_dim = dataloader.collate_fn.vocab_size

         model = cls.from_config(config)

         if checkpoint_path:

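The `with UpdateConfig(config):` blocks above replace the explicit defrost()/freeze() pairs. The helper itself is not shown in this patch; the following is only a plausible sketch of such a context manager, not the repo's implementation.

from contextlib import contextmanager

# Hypothetical sketch: the real UpdateConfig lives in deepspeech/utils/utility.py
# and is not included in this diff.
@contextmanager
def UpdateConfig(config):
    """Temporarily unfreeze a yacs CfgNode so fields can be edited."""
    config.defrost()
    try:
        yield config
    finally:
        config.freeze()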
@ -0,0 +1,149 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from contextlib import nullcontext
import paddle
from paddle import distributed as dist
from deepspeech.training.extensions.evaluator import StandardEvaluator
from deepspeech.training.reporter import report
from deepspeech.training.timer import Timer
from deepspeech.training.updaters.standard_updater import StandardUpdater
from deepspeech.utils import layer_tools
from deepspeech.utils.log import Log
logger = Log(__name__).getlog()
class U2Evaluator(StandardEvaluator):
def __init__(self, model, dataloader):
super().__init__(model, dataloader)
self.msg = ""
self.num_seen_utts = 0
self.total_loss = 0.0
def evaluate_core(self, batch):
self.msg = "Valid: Rank: {}, ".format(dist.get_rank())
losses_dict = {}
loss, attention_loss, ctc_loss = self.model(*batch[1:])
if paddle.isfinite(loss):
num_utts = batch[1].shape[0]
self.num_seen_utts += num_utts
self.total_loss += float(loss) * num_utts
losses_dict['loss'] = float(loss)
if attention_loss:
losses_dict['att_loss'] = float(attention_loss)
if ctc_loss:
losses_dict['ctc_loss'] = float(ctc_loss)
for k, v in losses_dict.items():
report("eval/" + k, v)
self.msg += ', '.join('{}: {:>.6f}'.format(k, v)
for k, v in losses_dict.items())
logger.info(self.msg)
return self.total_loss, self.num_seen_utts
class U2Updater(StandardUpdater):
def __init__(self,
model,
optimizer,
scheduler,
dataloader,
init_state=None,
accum_grad=1,
**kwargs):
super().__init__(
model, optimizer, scheduler, dataloader, init_state=init_state)
self.accum_grad = accum_grad
self.forward_count = 0
self.msg = ""
def update_core(self, batch):
"""One Step
Args:
batch (List[Object]): utts, xs, xlens, ys, ylens
"""
losses_dict = {}
self.msg = "Rank: {}, ".format(dist.get_rank())
# forward
batch_size = batch[1].shape[0]
loss, attention_loss, ctc_loss = self.model(*batch[1:])
# loss div by `batch_size * accum_grad`
loss /= self.accum_grad
# loss backward
if (self.forward_count + 1) != self.accum_grad:
# Disable gradient synchronizations across DDP processes.
# Within this context, gradients will be accumulated on module
# variables, which will later be synchronized.
context = self.model.no_sync
else:
# Used for single gpu training and DDP gradient synchronization
# processes.
context = nullcontext
with context():
loss.backward()
layer_tools.print_grads(self.model, print_func=None)
# loss info
losses_dict['loss'] = float(loss) * self.accum_grad
if attention_loss:
losses_dict['att_loss'] = float(attention_loss)
if ctc_loss:
losses_dict['ctc_loss'] = float(ctc_loss)
# report loss
for k, v in losses_dict.items():
report("train/" + k, v)
# loss msg
self.msg += "batch size: {}, ".format(batch_size)
self.msg += "accum: {}, ".format(self.accum_grad)
self.msg += ', '.join('{}: {:>.6f}'.format(k, v)
for k, v in losses_dict.items())
# Truncate the graph
loss.detach()
# update parameters
self.forward_count += 1
if self.forward_count != self.accum_grad:
return
self.forward_count = 0
self.optimizer.step()
self.optimizer.clear_grad()
self.scheduler.step()
def update(self):
# model is default in train mode
# training for a step is implemented here
with Timer("data time cost:{}"):
batch = self.read_batch()
with Timer("step time cost:{}"):
self.update_core(batch)
# #iterations with accum_grad > 1
# Ref.: https://github.com/espnet/espnet/issues/777
if self.forward_count == 0:
self.state.iteration += 1
if self.updates_per_epoch is not None:
if self.state.iteration % self.updates_per_epoch == 0:
self.state.epoch += 1

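A toy illustration (not from the patch) of the accum_grad bookkeeping in U2Updater.update_core above: the optimizer only steps every accum_grad forward passes, and the loss is pre-divided so the accumulated gradients match one large batch.

accum_grad, forward_count = 4, 0
for step in range(1, 9):
    # loss / accum_grad is backpropagated here; gradients keep accumulating
    forward_count += 1
    if forward_count == accum_grad:
        forward_count = 0
        print(f"micro-batch {step}: optimizer.step() + clear_grad()")
# the optimizer steps only after micro-batches 4 and 8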
@ -42,6 +42,7 @@ from deepspeech.utils import layer_tools
from deepspeech.utils.log import Log from deepspeech.utils.log import Log
from deepspeech.utils.tensor_utils import add_sos_eos from deepspeech.utils.tensor_utils import add_sos_eos
from deepspeech.utils.tensor_utils import th_accuracy from deepspeech.utils.tensor_utils import th_accuracy
from deepspeech.utils.utility import UpdateConfig
__all__ = ["U2STModel", "U2STInferModel"] __all__ = ["U2STModel", "U2STInferModel"]
@ -163,10 +164,7 @@ class U2STBaseModel(nn.Layer):
encoder_out, encoder_mask = self.encoder(speech, speech_lengths) encoder_out, encoder_mask = self.encoder(speech, speech_lengths)
encoder_time = time.time() - start encoder_time = time.time() - start
#logger.debug(f"encoder time: {encoder_time}") #logger.debug(f"encoder time: {encoder_time}")
#TODO(Hui Zhang): sum not support bool type encoder_out_lens = encoder_mask.squeeze(1).sum(1) #[B, 1, T] -> [B]
#encoder_out_lens = encoder_mask.squeeze(1).sum(1) #[B, 1, T] -> [B]
encoder_out_lens = encoder_mask.squeeze(1).cast(paddle.int64).sum(
1) #[B, 1, T] -> [B]
# 2a. ST-decoder branch # 2a. ST-decoder branch
start = time.time() start = time.time()
@ -342,8 +340,8 @@ class U2STBaseModel(nn.Layer):
speech, speech_lengths, decoding_chunk_size, speech, speech_lengths, decoding_chunk_size,
num_decoding_left_chunks, num_decoding_left_chunks,
simulate_streaming) # (B, maxlen, encoder_dim) simulate_streaming) # (B, maxlen, encoder_dim)
maxlen = encoder_out.size(1) maxlen = encoder_out.shape[1]
encoder_dim = encoder_out.size(2) encoder_dim = encoder_out.shape[2]
running_size = batch_size * beam_size running_size = batch_size * beam_size
encoder_out = encoder_out.unsqueeze(1).repeat(1, beam_size, 1, 1).view( encoder_out = encoder_out.unsqueeze(1).repeat(1, beam_size, 1, 1).view(
running_size, maxlen, encoder_dim) # (B*N, maxlen, encoder_dim) running_size, maxlen, encoder_dim) # (B*N, maxlen, encoder_dim)
@ -363,8 +361,7 @@ class U2STBaseModel(nn.Layer):
# 2. Decoder forward step by step # 2. Decoder forward step by step
for i in range(1, maxlen + 1): for i in range(1, maxlen + 1):
# Stop if all batch and all beam produce eos # Stop if all batch and all beam produce eos
# TODO(Hui Zhang): if end_flag.sum() == running_size: if end_flag.sum() == running_size:
if end_flag.cast(paddle.int64).sum() == running_size:
break break
# 2.1 Forward decoder step # 2.1 Forward decoder step
@@ -417,26 +414,26 @@ class U2STBaseModel(nn.Layer):
         best_hyps = best_hyps[:, 1:]
         return best_hyps

-    @jit.to_static
+    # @jit.to_static
     def subsampling_rate(self) -> int:
         """ Export interface for c++ call, return subsampling_rate of the
         model
         """
         return self.encoder.embed.subsampling_rate

-    @jit.to_static
+    # @jit.to_static
     def right_context(self) -> int:
         """ Export interface for c++ call, return right_context of the model
         """
         return self.encoder.embed.right_context

-    @jit.to_static
+    # @jit.to_static
     def sos_symbol(self) -> int:
         """ Export interface for c++ call, return sos symbol id of the model
         """
         return self.sos

-    @jit.to_static
+    # @jit.to_static
     def eos_symbol(self) -> int:
         """ Export interface for c++ call, return eos symbol id of the model
         """
@ -472,7 +469,7 @@ class U2STBaseModel(nn.Layer):
xs, offset, required_cache_size, subsampling_cache, xs, offset, required_cache_size, subsampling_cache,
elayers_output_cache, conformer_cnn_cache) elayers_output_cache, conformer_cnn_cache)
@jit.to_static # @jit.to_static
def ctc_activation(self, xs: paddle.Tensor) -> paddle.Tensor: def ctc_activation(self, xs: paddle.Tensor) -> paddle.Tensor:
""" Export interface for c++ call, apply linear transform and log """ Export interface for c++ call, apply linear transform and log
softmax before ctc softmax before ctc
@ -499,13 +496,13 @@ class U2STBaseModel(nn.Layer):
Returns: Returns:
paddle.Tensor: decoder output, (B, L) paddle.Tensor: decoder output, (B, L)
""" """
assert encoder_out.size(0) == 1 assert encoder_out.shape[0] == 1
num_hyps = hyps.size(0) num_hyps = hyps.shape[0]
assert hyps_lens.size(0) == num_hyps assert hyps_lens.shape[0] == num_hyps
encoder_out = encoder_out.repeat(num_hyps, 1, 1) encoder_out = encoder_out.repeat(num_hyps, 1, 1)
# (B, 1, T) # (B, 1, T)
encoder_mask = paddle.ones( encoder_mask = paddle.ones(
[num_hyps, 1, encoder_out.size(1)], dtype=paddle.bool) [num_hyps, 1, encoder_out.shape[1]], dtype=paddle.bool)
# (num_hyps, max_hyps_len, vocab_size) # (num_hyps, max_hyps_len, vocab_size)
decoder_out, _ = self.decoder(encoder_out, encoder_mask, hyps, decoder_out, _ = self.decoder(encoder_out, encoder_mask, hyps,
hyps_lens) hyps_lens)
@ -560,7 +557,7 @@ class U2STBaseModel(nn.Layer):
Returns: Returns:
List[List[int]]: transcripts. List[List[int]]: transcripts.
""" """
batch_size = feats.size(0) batch_size = feats.shape[0]
if decoding_method == 'fullsentence': if decoding_method == 'fullsentence':
hyps = self.translate( hyps = self.translate(
@@ -647,13 +644,16 @@ class U2STModel(U2STBaseModel):
         decoder = TransformerDecoder(vocab_size,
                                      encoder.output_size(),
                                      **configs['decoder_conf'])
+
+        # ctc decoder and ctc loss
+        model_conf = configs['model_conf']
         ctc = CTCDecoder(
             odim=vocab_size,
             enc_n_units=encoder.output_size(),
             blank_id=0,
-            dropout_rate=0.0,
+            dropout_rate=model_conf['ctc_dropout_rate'],
             reduction=True,  # sum
-            batch_average=True)  # sum / batch_size
+            batch_average=True,  # sum / batch_size
+            grad_norm_type=model_conf['ctc_grad_norm_type'])

         return vocab_size, encoder, (st_decoder, decoder, ctc)
     else:
@@ -687,10 +687,10 @@ class U2STModel(U2STBaseModel):
         Returns:
             DeepSpeech2Model: The model built from pretrained result.
         """
-        config.defrost()
-        config.input_dim = dataloader.collate_fn.feature_size
-        config.output_dim = dataloader.collate_fn.vocab_size
-        config.freeze()
+        with UpdateConfig(config):
+            config.input_dim = dataloader.collate_fn.feature_size
+            config.output_dim = dataloader.collate_fn.vocab_size

         model = cls.from_config(config)

         if checkpoint_path:

@@ -15,12 +15,13 @@ from collections import OrderedDict
 import paddle
 from paddle import nn
+from paddle.nn import functional as F

 from deepspeech.utils.log import Log

 logger = Log(__name__).getlog()

-__all__ = ["get_activation", "brelu", "LinearGLUBlock", "ConvGLUBlock"]
+__all__ = ["get_activation", "brelu", "LinearGLUBlock", "ConvGLUBlock", "GLU"]


 def brelu(x, t_min=0.0, t_max=24.0, name=None):
@@ -30,6 +31,17 @@ def brelu(x, t_min=0.0, t_max=24.0, name=None):
     return x.maximum(t_min).minimum(t_max)


+class GLU(nn.Layer):
+    """Gated Linear Units (GLU) Layer"""
+
+    def __init__(self, dim: int=-1):
+        super().__init__()
+        self.dim = dim
+
+    def forward(self, xs):
+        return F.glu(xs, axis=self.dim)
+
+
 class LinearGLUBlock(nn.Layer):
     """A linear Gated Linear Units (GLU) block."""
@@ -133,13 +145,18 @@ def get_activation(act):
     """Return activation function."""
     # Lazy load to avoid unused import
     activation_funcs = {
+        "hardshrink": paddle.nn.Hardshrink,
+        "hardswish": paddle.nn.Hardswish,
         "hardtanh": paddle.nn.Hardtanh,
         "tanh": paddle.nn.Tanh,
         "relu": paddle.nn.ReLU,
+        "relu6": paddle.nn.ReLU6,
+        "leakyrelu": paddle.nn.LeakyReLU,
         "selu": paddle.nn.SELU,
         "swish": paddle.nn.Swish,
         "gelu": paddle.nn.GELU,
-        "brelu": brelu,
+        "glu": GLU,
+        "elu": paddle.nn.ELU,
     }

     return activation_funcs[act]()

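A small usage sketch of the extended activation registry above, assuming get_activation from this module is importable; the shapes are arbitrary. Note that "glu" halves the chosen axis, unlike the purely elementwise entries.

import paddle

act = get_activation("glu")                  # GLU(dim=-1) from the table above
print(act(paddle.randn([4, 10])).shape)      # [4, 5] -- F.glu splits the last axis
print(get_activation("relu6")(paddle.randn([4, 10])).shape)  # [4, 10], elementwise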
@ -70,7 +70,7 @@ class MultiHeadedAttention(nn.Layer):
paddle.Tensor: Transformed value tensor, size paddle.Tensor: Transformed value tensor, size
(#batch, n_head, time2, d_k). (#batch, n_head, time2, d_k).
""" """
n_batch = query.size(0) n_batch = query.shape[0]
q = self.linear_q(query).view(n_batch, -1, self.h, self.d_k) q = self.linear_q(query).view(n_batch, -1, self.h, self.d_k)
k = self.linear_k(key).view(n_batch, -1, self.h, self.d_k) k = self.linear_k(key).view(n_batch, -1, self.h, self.d_k)
v = self.linear_v(value).view(n_batch, -1, self.h, self.d_k) v = self.linear_v(value).view(n_batch, -1, self.h, self.d_k)
@ -96,7 +96,7 @@ class MultiHeadedAttention(nn.Layer):
paddle.Tensor: Transformed value weighted paddle.Tensor: Transformed value weighted
by the attention score, (#batch, time1, d_model). by the attention score, (#batch, time1, d_model).
""" """
n_batch = value.size(0) n_batch = value.shape[0]
if mask is not None: if mask is not None:
mask = mask.unsqueeze(1).eq(0) # (batch, 1, *, time2) mask = mask.unsqueeze(1).eq(0) # (batch, 1, *, time2)
scores = scores.masked_fill(mask, -float('inf')) scores = scores.masked_fill(mask, -float('inf'))
@ -109,8 +109,8 @@ class MultiHeadedAttention(nn.Layer):
p_attn = self.dropout(attn) p_attn = self.dropout(attn)
x = paddle.matmul(p_attn, value) # (batch, head, time1, d_k) x = paddle.matmul(p_attn, value) # (batch, head, time1, d_k)
x = x.transpose([0, 2, 1, 3]).contiguous().view( x = x.transpose([0, 2, 1, 3]).view(n_batch, -1, self.h *
n_batch, -1, self.h * self.d_k) # (batch, time1, d_model) self.d_k) # (batch, time1, d_model)
return self.linear_out(x) # (batch, time1, d_model) return self.linear_out(x) # (batch, time1, d_model)
@ -172,15 +172,16 @@ class RelPositionMultiHeadedAttention(MultiHeadedAttention):
paddle.Tensor: Output tensor. (batch, head, time1, time1) paddle.Tensor: Output tensor. (batch, head, time1, time1)
""" """
zero_pad = paddle.zeros( zero_pad = paddle.zeros(
(x.size(0), x.size(1), x.size(2), 1), dtype=x.dtype) (x.shape[0], x.shape[1], x.shape[2], 1), dtype=x.dtype)
x_padded = paddle.cat([zero_pad, x], dim=-1) x_padded = paddle.cat([zero_pad, x], dim=-1)
x_padded = x_padded.view(x.size(0), x.size(1), x.size(3) + 1, x.size(2)) x_padded = x_padded.view(x.shape[0], x.shape[1], x.shape[3] + 1,
x.shape[2])
x = x_padded[:, :, 1:].view_as(x) # [B, H, T1, T1] x = x_padded[:, :, 1:].view_as(x) # [B, H, T1, T1]
if zero_triu: if zero_triu:
ones = paddle.ones((x.size(2), x.size(3))) ones = paddle.ones((x.shape[2], x.shape[3]))
x = x * paddle.tril(ones, x.size(3) - x.size(2))[None, None, :, :] x = x * paddle.tril(ones, x.shape[3] - x.shape[2])[None, None, :, :]
return x return x
@ -205,7 +206,7 @@ class RelPositionMultiHeadedAttention(MultiHeadedAttention):
q, k, v = self.forward_qkv(query, key, value) q, k, v = self.forward_qkv(query, key, value)
q = q.transpose([0, 2, 1, 3]) # (batch, time1, head, d_k) q = q.transpose([0, 2, 1, 3]) # (batch, time1, head, d_k)
n_batch_pos = pos_emb.size(0) n_batch_pos = pos_emb.shape[0]
p = self.linear_pos(pos_emb).view(n_batch_pos, -1, self.h, self.d_k) p = self.linear_pos(pos_emb).view(n_batch_pos, -1, self.h, self.d_k)
p = p.transpose([0, 2, 1, 3]) # (batch, head, time1, d_k) p = p.transpose([0, 2, 1, 3]) # (batch, head, time1, d_k)

@@ -113,11 +113,9 @@ class ConvBn(nn.Layer):
         # reset padding part to 0
         masks = make_non_pad_mask(x_len)  #[B, T]
         masks = masks.unsqueeze(1).unsqueeze(1)  # [B, 1, 1, T]
-        # TODO(Hui Zhang): not support bool multiply
-        # masks = masks.type_as(x)
-        masks = masks.astype(x.dtype)
-        x = x.multiply(masks)
+        # https://github.com/PaddlePaddle/Paddle/pull/29265
+        # rhs will type promote to lhs
+        x = x * masks
         return x, x_len

@@ -16,15 +16,19 @@ from paddle import nn
 from paddle.nn import functional as F
 from typeguard import check_argument_types

-from deepspeech.decoders.swig_wrapper import ctc_beam_search_decoder_batch
-from deepspeech.decoders.swig_wrapper import ctc_greedy_decoder
-from deepspeech.decoders.swig_wrapper import Scorer
 from deepspeech.modules.loss import CTCLoss
 from deepspeech.utils import ctc_utils
 from deepspeech.utils.log import Log

 logger = Log(__name__).getlog()

+try:
+    from deepspeech.decoders.swig_wrapper import ctc_beam_search_decoder_batch  # noqa: F401
+    from deepspeech.decoders.swig_wrapper import ctc_greedy_decoder  # noqa: F401
+    from deepspeech.decoders.swig_wrapper import Scorer  # noqa: F401
+except Exception as e:
+    logger.info("ctcdecoder not installed!")
+
 __all__ = ['CTCDecoder']
@@ -35,7 +39,8 @@ class CTCDecoder(nn.Layer):
                  blank_id=0,
                  dropout_rate: float=0.0,
                  reduction: bool=True,
-                 batch_average: bool=True):
+                 batch_average: bool=True,
+                 grad_norm_type: str="instance"):
         """CTC decoder

         Args:
@@ -44,6 +49,7 @@ class CTCDecoder(nn.Layer):
             dropout_rate (float): dropout rate (0.0 ~ 1.0)
             reduction (bool): reduce the CTC loss into a scalar, True for 'sum' or 'none'
             batch_average (bool): do batch dim wise average.
+            grad_norm_type (str): one of 'instance', 'batchsize', 'frame', None.
         """
         assert check_argument_types()
         super().__init__()
@@ -56,7 +62,8 @@ class CTCDecoder(nn.Layer):
         self.criterion = CTCLoss(
             blank=self.blank_id,
             reduction=reduction_type,
-            batch_average=batch_average)
+            batch_average=batch_average,
+            grad_norm_type=grad_norm_type)

         # CTCDecoder LM Score handle
         self._ext_scorer = None
@@ -132,7 +139,7 @@ class CTCDecoder(nn.Layer):
         results = []
         for i, probs in enumerate(probs_split):
             output_transcription = ctc_greedy_decoder(
-                probs_seq=probs, vocabulary=vocab_list)
+                probs_seq=probs, vocabulary=vocab_list, blank_id=self.blank_id)
             results.append(output_transcription)
         return results
@@ -212,13 +219,15 @@ class CTCDecoder(nn.Layer):
             num_processes=num_processes,
             ext_scoring_func=self._ext_scorer,
             cutoff_prob=cutoff_prob,
-            cutoff_top_n=cutoff_top_n)
+            cutoff_top_n=cutoff_top_n,
+            blank_id=self.blank_id)
         results = [result[0][1] for result in beam_search_results]
         return results

     def init_decode(self, beam_alpha, beam_beta, lang_model_path, vocab_list,
                     decoding_method):
         if decoding_method == "ctc_beam_search":
             self._init_ext_scorer(beam_alpha, beam_beta, lang_model_path,
                                   vocab_list)

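The blank_id now threaded into both decoder calls determines which symbol is dropped when CTC paths are collapsed. A toy, pure-Python illustration of that rule (the real decoding is done by the swig_wrapper functions above):

def greedy_collapse(best_path, blank_id=0):
    # drop repeats, then drop blanks -- the standard CTC best-path rule
    out, prev = [], None
    for tok in best_path:
        if tok != prev and tok != blank_id:
            out.append(tok)
        prev = tok
    return out

print(greedy_collapse([0, 3, 3, 0, 5, 5, 0, 3], blank_id=0))  # [3, 5, 3]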
@@ -122,11 +122,9 @@ class TransformerDecoder(nn.Layer):
         # tgt_mask: (B, 1, L)
         tgt_mask = (make_non_pad_mask(ys_in_lens).unsqueeze(1))
         # m: (1, L, L)
-        m = subsequent_mask(tgt_mask.size(-1)).unsqueeze(0)
+        m = subsequent_mask(tgt_mask.shape[-1]).unsqueeze(0)
         # tgt_mask: (B, L, L)
-        # TODO(Hui Zhang): not support & for tensor
-        # tgt_mask = tgt_mask & m
-        tgt_mask = tgt_mask.logical_and(m)
+        tgt_mask = tgt_mask & m

         x, _ = self.embed(tgt)
         for layer in self.decoders:
@@ -137,9 +135,7 @@ class TransformerDecoder(nn.Layer):
         if self.use_output_layer:
             x = self.output_layer(x)

-        # TODO(Hui Zhang): reduce_sum not support bool type
-        # olens = tgt_mask.sum(1)
-        olens = tgt_mask.astype(paddle.int).sum(1)
+        olens = tgt_mask.sum(1)
         return x, olens
def forward_one_step( def forward_one_step(

@ -68,7 +68,7 @@ class PositionalEncoding(nn.Layer):
paddle.Tensor: for compatibility to RelPositionalEncoding, (batch=1, time, ...) paddle.Tensor: for compatibility to RelPositionalEncoding, (batch=1, time, ...)
""" """
T = x.shape[1] T = x.shape[1]
assert offset + x.size(1) < self.max_len assert offset + x.shape[1] < self.max_len
#TODO(Hui Zhang): using T = x.size(1), __getitem__ not support Tensor #TODO(Hui Zhang): using T = x.size(1), __getitem__ not support Tensor
pos_emb = self.pe[:, offset:offset + T] pos_emb = self.pe[:, offset:offset + T]
x = x * self.xscale + pos_emb x = x * self.xscale + pos_emb
@ -114,7 +114,7 @@ class RelPositionalEncoding(PositionalEncoding):
paddle.Tensor: Encoded tensor (batch, time, `*`). paddle.Tensor: Encoded tensor (batch, time, `*`).
paddle.Tensor: Positional embedding tensor (1, time, `*`). paddle.Tensor: Positional embedding tensor (1, time, `*`).
""" """
assert offset + x.size(1) < self.max_len assert offset + x.shape[1] < self.max_len
x = x * self.xscale x = x * self.xscale
#TODO(Hui Zhang): using x.size(1), __getitem__ not support Tensor #TODO(Hui Zhang): using x.size(1), __getitem__ not support Tensor
pos_emb = self.pe[:, offset:offset + x.shape[1]] pos_emb = self.pe[:, offset:offset + x.shape[1]]

@@ -159,11 +159,10 @@ class BaseEncoder(nn.Layer):
         if self.global_cmvn is not None:
             xs = self.global_cmvn(xs)
         #TODO(Hui Zhang): self.embed(xs, masks, offset=0), stride_slice not support bool tensor
-        xs, pos_emb, masks = self.embed(xs, masks.type_as(xs), offset=0)
+        xs, pos_emb, masks = self.embed(xs, masks.astype(xs.dtype), offset=0)
         #TODO(Hui Zhang): remove mask.astype, stride_slice not support bool tensor
         masks = masks.astype(paddle.bool)
-        #TODO(Hui Zhang): mask_pad = ~masks
-        mask_pad = masks.logical_not()
+        mask_pad = ~masks
         chunk_masks = add_optional_chunk_mask(
             xs, masks, self.use_dynamic_chunk, self.use_dynamic_left_chunk,
             decoding_chunk_size, self.static_chunk_size,
@ -207,11 +206,11 @@ class BaseEncoder(nn.Layer):
chunk computation chunk computation
List[paddle.Tensor]: conformer cnn cache List[paddle.Tensor]: conformer cnn cache
""" """
assert xs.size(0) == 1 # batch size must be one assert xs.shape[0] == 1 # batch size must be one
# tmp_masks is just for interface compatibility # tmp_masks is just for interface compatibility
# TODO(Hui Zhang): stride_slice not support bool tensor # TODO(Hui Zhang): stride_slice not support bool tensor
# tmp_masks = paddle.ones([1, xs.size(1)], dtype=paddle.bool) # tmp_masks = paddle.ones([1, xs.size(1)], dtype=paddle.bool)
tmp_masks = paddle.ones([1, xs.size(1)], dtype=paddle.int32) tmp_masks = paddle.ones([1, xs.shape[1]], dtype=paddle.int32)
tmp_masks = tmp_masks.unsqueeze(1) #[B=1, C=1, T] tmp_masks = tmp_masks.unsqueeze(1) #[B=1, C=1, T]
if self.global_cmvn is not None: if self.global_cmvn is not None:
@ -221,25 +220,25 @@ class BaseEncoder(nn.Layer):
xs, tmp_masks, offset=offset) #xs=(B, T, D), pos_emb=(B=1, T, D) xs, tmp_masks, offset=offset) #xs=(B, T, D), pos_emb=(B=1, T, D)
if subsampling_cache is not None: if subsampling_cache is not None:
cache_size = subsampling_cache.size(1) #T cache_size = subsampling_cache.shape[1] #T
xs = paddle.cat((subsampling_cache, xs), dim=1) xs = paddle.cat((subsampling_cache, xs), dim=1)
else: else:
cache_size = 0 cache_size = 0
# only used when using `RelPositionMultiHeadedAttention` # only used when using `RelPositionMultiHeadedAttention`
pos_emb = self.embed.position_encoding( pos_emb = self.embed.position_encoding(
offset=offset - cache_size, size=xs.size(1)) offset=offset - cache_size, size=xs.shape[1])
if required_cache_size < 0: if required_cache_size < 0:
next_cache_start = 0 next_cache_start = 0
elif required_cache_size == 0: elif required_cache_size == 0:
next_cache_start = xs.size(1) next_cache_start = xs.shape[1]
else: else:
next_cache_start = xs.size(1) - required_cache_size next_cache_start = xs.shape[1] - required_cache_size
r_subsampling_cache = xs[:, next_cache_start:, :] r_subsampling_cache = xs[:, next_cache_start:, :]
# Real mask for transformer/conformer layers # Real mask for transformer/conformer layers
masks = paddle.ones([1, xs.size(1)], dtype=paddle.bool) masks = paddle.ones([1, xs.shape[1]], dtype=paddle.bool)
masks = masks.unsqueeze(1) #[B=1, L'=1, T] masks = masks.unsqueeze(1) #[B=1, L'=1, T]
r_elayers_output_cache = [] r_elayers_output_cache = []
r_conformer_cnn_cache = [] r_conformer_cnn_cache = []
@ -303,7 +302,7 @@ class BaseEncoder(nn.Layer):
stride = subsampling * decoding_chunk_size stride = subsampling * decoding_chunk_size
decoding_window = (decoding_chunk_size - 1) * subsampling + context decoding_window = (decoding_chunk_size - 1) * subsampling + context
num_frames = xs.size(1) num_frames = xs.shape[1]
required_cache_size = decoding_chunk_size * num_decoding_left_chunks required_cache_size = decoding_chunk_size * num_decoding_left_chunks
subsampling_cache: Optional[paddle.Tensor] = None subsampling_cache: Optional[paddle.Tensor] = None
elayers_output_cache: Optional[List[paddle.Tensor]] = None elayers_output_cache: Optional[List[paddle.Tensor]] = None
@ -319,10 +318,10 @@ class BaseEncoder(nn.Layer):
chunk_xs, offset, required_cache_size, subsampling_cache, chunk_xs, offset, required_cache_size, subsampling_cache,
elayers_output_cache, conformer_cnn_cache) elayers_output_cache, conformer_cnn_cache)
outputs.append(y) outputs.append(y)
offset += y.size(1) offset += y.shape[1]
ys = paddle.cat(outputs, 1) ys = paddle.cat(outputs, 1)
# fake mask, just for jit script and compatibility with `forward` api # fake mask, just for jit script and compatibility with `forward` api
masks = paddle.ones([1, ys.size(1)], dtype=paddle.bool) masks = paddle.ones([1, ys.shape[1]], dtype=paddle.bool)
masks = masks.unsqueeze(1) masks = masks.unsqueeze(1)
return ys, masks return ys, masks

@@ -23,11 +23,32 @@ __all__ = ['CTCLoss', "LabelSmoothingLoss"]

 class CTCLoss(nn.Layer):
-    def __init__(self, blank=0, reduction='sum', batch_average=False):
+    def __init__(self,
+                 blank=0,
+                 reduction='sum',
+                 batch_average=False,
+                 grad_norm_type=None):
         super().__init__()
         # last token id as blank id
         self.loss = nn.CTCLoss(blank=blank, reduction=reduction)
         self.batch_average = batch_average
+
+        logger.info(
+            f"CTCLoss Loss reduction: {reduction}, div-bs: {batch_average}")
+
+        # instance for norm_by_times
+        # batch for norm_by_batchsize
+        # frame for norm_by_total_logits_len
+        assert grad_norm_type in ('instance', 'batch', 'frame', None)
+        self.norm_by_times = False
+        self.norm_by_batchsize = False
+        self.norm_by_total_logits_len = False
+        logger.info(f"CTCLoss Grad Norm Type: {grad_norm_type}")
+        if grad_norm_type == 'instance':
+            self.norm_by_times = True
+        if grad_norm_type == 'batch':
+            self.norm_by_batchsize = True
+        if grad_norm_type == 'frame':
+            self.norm_by_total_logits_len = True

     def forward(self, logits, ys_pad, hlens, ys_lens):
         """Compute CTC loss.
@@ -46,10 +67,15 @@ class CTCLoss(nn.Layer):
         # warp-ctc need activation with shape [T, B, V + 1]
         # logits: (B, L, D) -> (L, B, D)
         logits = logits.transpose([1, 0, 2])
         # (TODO:Hui Zhang) ctc loss does not support int64 labels
         ys_pad = ys_pad.astype(paddle.int32)
         loss = self.loss(
-            logits, ys_pad, hlens, ys_lens, norm_by_times=self.batch_average)
+            logits,
+            ys_pad,
+            hlens,
+            ys_lens,
+            norm_by_times=self.norm_by_times,
+            norm_by_batchsize=self.norm_by_batchsize,
+            norm_by_total_logits_len=self.norm_by_total_logits_len)
         if self.batch_average:
             # Batch-size average
             loss = loss / B
@@ -124,9 +150,9 @@ class LabelSmoothingLoss(nn.Layer):
         # use zeros_like instead of torch.no_grad() for true_dist,
         # since no_grad() can not be exported by JIT
         true_dist = paddle.full_like(x, self.smoothing / (self.size - 1))
-        ignore = target == self.padding_idx  # (B,)
-        # target = target * (1 - ignore)  # avoid -1 index
+        ignore = (target == self.padding_idx)  # (B,)
+        #TODO(Hui Zhang): target = target * (1 - ignore)  # avoid -1 index
         target = target.masked_fill(ignore, 0)  # avoid -1 index
         # true_dist.scatter_(1, target.unsqueeze(1), self.confidence)
         target_mask = F.one_hot(target, self.size)
@@ -135,10 +161,8 @@ class LabelSmoothingLoss(nn.Layer):
         kl = self.criterion(F.log_softmax(x, axis=1), true_dist)
-        #TODO(Hui Zhang): sum not support bool type
-        #total = len(target) - int(ignore.sum())
-        total = len(target) - int(ignore.type_as(target).sum())
+        total = len(target) - int(ignore.sum())
         denom = total if self.normalize_length else B
-        #numer = (kl * (1 - ignore)).sum()
+        #TODO(Hui Zhang): numer = (kl * (1 - ignore)).sum()
         numer = kl.masked_fill(ignore.unsqueeze(1), 0).sum()
         return numer / denom

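A hedged construction sketch for the extended CTCLoss above; 'instance' maps to norm_by_times=True, which is what the DeepSpeech2 configs now pass down as ctc_grad_norm_type. The tensor arguments are omitted; the shapes in the comments are the ones forward() expects.

ctc_loss = CTCLoss(
    blank=0,
    reduction='sum',
    batch_average=True,
    grad_norm_type='instance')   # -> norm_by_times=True in forward()
# loss = ctc_loss(logits, ys_pad, hlens, ys_lens)
#   logits: [B, L, V+1], ys_pad: [B, U], hlens: [B], ys_lens: [B]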
@@ -69,8 +69,7 @@ def make_non_pad_mask(lengths: paddle.Tensor) -> paddle.Tensor:
                  [1, 1, 1, 0, 0],
                  [1, 1, 0, 0, 0]]
     """
-    #TODO(Hui Zhang): return ~make_pad_mask(lengths), not support ~
-    return make_pad_mask(lengths).logical_not()
+    return ~make_pad_mask(lengths)


 def subsequent_mask(size: int) -> paddle.Tensor:
@@ -92,12 +91,7 @@ def subsequent_mask(size: int) -> paddle.Tensor:
                 [1, 1, 1]]
     """
     ret = paddle.ones([size, size], dtype=paddle.bool)
-    #TODO(Hui Zhang): tril not support bool
-    #return paddle.tril(ret)
-    ret = ret.astype(paddle.float)
-    ret = paddle.tril(ret)
-    ret = ret.astype(paddle.bool)
-    return ret
+    return paddle.tril(ret)


 def subsequent_chunk_mask(
@@ -186,15 +180,13 @@ def add_optional_chunk_mask(xs: paddle.Tensor,
         chunk_masks = subsequent_chunk_mask(xs.shape[1], chunk_size,
                                             num_left_chunks)  # (L, L)
         chunk_masks = chunk_masks.unsqueeze(0)  # (1, L, L)
-        # chunk_masks = masks & chunk_masks  # (B, L, L)
-        chunk_masks = masks.logical_and(chunk_masks)  # (B, L, L)
+        chunk_masks = masks & chunk_masks  # (B, L, L)
     elif static_chunk_size > 0:
         num_left_chunks = num_decoding_left_chunks
         chunk_masks = subsequent_chunk_mask(xs.shape[1], static_chunk_size,
                                             num_left_chunks)  # (L, L)
         chunk_masks = chunk_masks.unsqueeze(0)  # (1, L, L)
-        # chunk_masks = masks & chunk_masks  # (B, L, L)
-        chunk_masks = masks.logical_and(chunk_masks)  # (B, L, L)
+        chunk_masks = masks & chunk_masks  # (B, L, L)
     else:
         chunk_masks = masks
     return chunk_masks

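To make the mask plumbing concrete, a small sketch with illustrative lengths; it assumes the Paddle behaviour these hunks rely on (bool tril, ~ and & on bool tensors).

import paddle

lengths, L = paddle.to_tensor([3, 2]), 3
non_pad = paddle.arange(L).unsqueeze(0) < lengths.unsqueeze(1)  # like make_non_pad_mask
causal = paddle.tril(paddle.ones([L, L], dtype=paddle.bool))    # like subsequent_mask(3)
chunk_masks = non_pad.unsqueeze(1) & causal.unsqueeze(0)        # as in add_optional_chunk_mask
print(chunk_masks.astype('int32').numpy())
# batch 0 keeps the full causal mask; batch 1 masks out its padded third column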
@@ -308,7 +308,7 @@ class RNNStack(nn.Layer):
             x, x_len = rnn(x, x_len)
             masks = make_non_pad_mask(x_len)  #[B, T]
             masks = masks.unsqueeze(-1)  # [B, T, 1]
-            # TODO(Hui Zhang): not support bool multiply
-            masks = masks.astype(x.dtype)
-            x = x.multiply(masks)
+            # https://github.com/PaddlePaddle/Paddle/pull/29265
+            # rhs will type promote to lhs
+            x = x * masks
         return x, x_len

@@ -14,6 +14,20 @@
 import argparse


+class ExtendAction(argparse.Action):
+    """
+    [Since Python 3.8, the "extend" action is available directly in stdlib]
+    (https://docs.python.org/3.8/library/argparse.html#action).
+    If you only have to support 3.8+, defining it yourself is no longer
+    required; using the stdlib "extend" action works exactly the same way.
+    """
+
+    def __call__(self, parser, namespace, values, option_string=None):
+        items = getattr(namespace, self.dest) or []
+        items.extend(values)
+        setattr(namespace, self.dest, items)
+
+
 def default_argument_parser():
     r"""A simple yet general argument parser for experiments with parakeet.
@@ -30,7 +44,7 @@ def default_argument_parser():
     The ``--checkpoint_path`` specifies the checkpoint to load from.

-    The ``--device`` and ``--nprocs`` specifies how to run the training.
+    The ``--nprocs`` specifies how to run the training.

     See Also
@@ -42,29 +56,53 @@ def default_argument_parser():
         the parser
     """
     parser = argparse.ArgumentParser()
+    parser.register('action', 'extend', ExtendAction)

-    # yapf: disable
-    # data and output
-    parser.add_argument("--config", metavar="FILE", help="path of the config file to overwrite to default config with.")
-    parser.add_argument("--dump-config", metavar="FILE", help="dump config to yaml file.")
-    parser.add_argument("--output", metavar="OUTPUT_DIR", help="path to save checkpoint and logs.")
-
-    # load from saved checkpoint
-    parser.add_argument("--checkpoint_path", type=str, help="path of the checkpoint to load")
-
-    # running
-    parser.add_argument("--device", type=str, default='gpu', choices=["cpu", "gpu"],
-                        help="device type to use, cpu and gpu are supported.")
-    parser.add_argument("--nprocs", type=int, default=1, help="number of parallel processes to use.")
-
-    # overwrite extra config and default config
-    # parser.add_argument("--opts", nargs=argparse.REMAINDER,
-    #     help="options to overwrite --config file and the default config, passing in KEY VALUE pairs")
-    parser.add_argument("--opts", type=str, default=[], nargs='+',
-                        help="options to overwrite --config file and the default config, passing in KEY VALUE pairs")
-
-    parser.add_argument("--seed", type=int, default=None,
-                        help="seed to use for paddle, np and random. None or 0 for random, else set seed.")
-    # yapd: enable
+    train_group = parser.add_argument_group(
+        title='Train Options', description=None)
+    train_group.add_argument(
+        "--seed",
+        type=int,
+        default=None,
+        help="seed to use for paddle, np and random. None or 0 for random, else set seed."
+    )
+    train_group.add_argument(
+        "--nprocs",
+        type=int,
+        default=1,
+        help="number of parallel processes. 0 for cpu.")
+    train_group.add_argument(
+        "--config", metavar="CONFIG_FILE", help="config file.")
+    train_group.add_argument(
+        "--output", metavar="CKPT_DIR", help="path to save checkpoint.")
+    train_group.add_argument(
+        "--checkpoint_path", type=str, help="path to load checkpoint")
+    train_group.add_argument(
+        "--opts",
+        action='extend',
+        nargs=2,
+        metavar=('key', 'val'),
+        help="overwrite --config field, passing (KEY VALUE) pairs")
+    train_group.add_argument(
+        "--dump-config", metavar="FILE", help="dump config to `this` file.")
+
+    profile_group = parser.add_argument_group(
+        title='Benchmark Options', description=None)
+    profile_group.add_argument(
+        '--profiler-options',
+        type=str,
+        default=None,
+        help='The option of profiler, which should be in format \"key1=value1;key2=value2;key3=value3\".'
+    )
+    profile_group.add_argument(
+        '--benchmark-batch-size',
+        type=int,
+        default=None,
+        help='batch size for benchmark.')
+    profile_group.add_argument(
+        '--benchmark-max-step',
+        type=int,
+        default=None,
+        help='max iteration for benchmark.')

     return parser

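A usage sketch of the rewritten parser, assuming default_argument_parser from this module is imported; the config path and option keys are placeholders. Each --opts consumes exactly one KEY VALUE pair, and the 'extend' action accumulates them.

parser = default_argument_parser()
args = parser.parse_args([
    "--config", "conf/deepspeech2.yaml",
    "--nprocs", "1",
    "--opts", "training.n_epoch", "10",
    "--opts", "data.batch_size", "32",
])
print(args.nprocs)  # 1
print(args.opts)    # ['training.n_epoch', '10', 'data.batch_size', '32']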
@@ -13,14 +13,18 @@
 # limitations under the License.
 from typing import Dict

-import extension
 import paddle
+from paddle import distributed as dist
 from paddle.io import DataLoader
 from paddle.nn import Layer

+from . import extension
 from ..reporter import DictSummary
+from ..reporter import ObsScope
 from ..reporter import report
-from ..reporter import scope
+from ..timer import Timer
+from deepspeech.utils.log import Log

+logger = Log(__name__).getlog()


 class StandardEvaluator(extension.Extension):
@@ -43,6 +47,27 @@ class StandardEvaluator(extension.Extension):
     def evaluate_core(self, batch):
         # compute
         self.model(batch)  # you may report here
+        return
+
+    def evaluate_sync(self, data):
+        # dist sync `evaluate_core` outputs
+        if data is None:
+            return
+
+        numerator, denominator = data
+        if dist.get_world_size() > 1:
+            numerator = paddle.to_tensor(numerator)
+            denominator = paddle.to_tensor(denominator)
+            # the default operator in all_reduce function is sum.
+            dist.all_reduce(numerator)
+            dist.all_reduce(denominator)
+            value = numerator / denominator
+            value = float(value)
+        else:
+            value = numerator / denominator
+
+        # used for `snapshot` to do kbest save.
+        report("VALID/LOSS", value)
+        logger.info(f"Valid: all-reduce loss {value}")

     def evaluate(self):
         # switch to eval mode
@@ -53,12 +78,16 @@ class StandardEvaluator(extension.Extension):
         summary = DictSummary()
         for batch in self.dataloader:
             observation = {}
-            with scope(observation):
+            with ObsScope(observation):
                 # main evaluation computation here.
                 with paddle.no_grad():
-                    self.evaluate_core(batch)
+                    self.evaluate_sync(self.evaluate_core(batch))
             summary.add(observation)
         summary = summary.compute_mean()
+
+        # switch to train mode
+        for model in self.models.values():
+            model.train()
         return summary

     def __call__(self, trainer=None):
@@ -66,6 +95,7 @@ class StandardEvaluator(extension.Extension):
         # if it is used to extend a trainer, the metrics is reported to
         # to observation of the trainer
         # or otherwise, you can use your own observation
-        summary = self.evaluate()
+        with Timer("Eval Time Cost: {}"):
+            summary = self.evaluate()
         for k, v in summary.items():
             report(k, v)

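Why evaluate_sync all-reduces the numerator and denominator separately rather than averaging per-rank means: a tiny worked example with made-up numbers.

rank_stats = [(9.0, 3), (10.0, 5)]            # (sum of loss, #utts) per rank
num = sum(l for l, _ in rank_stats)           # 19.0 -- all_reduce of total_loss
den = sum(n for _, n in rank_stats)           # 8    -- all_reduce of num_seen_utts
print(num / den)                              # 2.375, the true utterance-level mean
print(sum(l / n for l, n in rank_stats) / 2)  # 2.5, biased mean-of-means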
@@ -20,8 +20,9 @@ from typing import List

 import jsonlines

-from deepspeech.training.extensions import extension
-from deepspeech.training.updaters.trainer import Trainer
+from . import extension
+from ..reporter import get_observations
+from ..updaters.trainer import Trainer
 from deepspeech.utils.log import Log
 from deepspeech.utils.mp_tools import rank_zero_only
@@ -52,8 +53,19 @@ class Snapshot(extension.Extension):
     priority = -100
     default_name = "snapshot"

-    def __init__(self, max_size: int=5, snapshot_on_error: bool=False):
+    def __init__(self,
+                 mode='latest',
+                 max_size: int=5,
+                 indicator=None,
+                 less_better=True,
+                 snapshot_on_error: bool=False):
         self.records: List[Dict[str, Any]] = []
+        assert mode in ('latest', 'kbest'), mode
+        if mode == 'kbest':
+            assert indicator is not None
+        self.mode = mode
+        self.indicator = indicator
+        self.less_is_better = less_better
         self.max_size = max_size
         self._snapshot_on_error = snapshot_on_error
         self._save_all = (max_size == -1)
@@ -66,16 +78,17 @@ class Snapshot(extension.Extension):
         # load existing records
         record_path: Path = self.checkpoint_dir / "records.jsonl"
         if record_path.exists():
-            logger.debug("Loading from an existing checkpoint dir")
             self.records = load_records(record_path)
-            trainer.updater.load(self.records[-1]['path'])
+            ckpt_path = self.records[-1]['path']
+            logger.info(f"Loading from an existing checkpoint {ckpt_path}")
+            trainer.updater.load(ckpt_path)

     def on_error(self, trainer, exc, tb):
         if self._snapshot_on_error:
-            self.save_checkpoint_and_update(trainer)
+            self.save_checkpoint_and_update(trainer, 'latest')

     def __call__(self, trainer: Trainer):
-        self.save_checkpoint_and_update(trainer)
+        self.save_checkpoint_and_update(trainer, self.mode)

     def full(self):
         """Whether the number of snapshots it keeps track of is greater
@@ -83,12 +96,12 @@ class Snapshot(extension.Extension):
         return (not self._save_all) and len(self.records) > self.max_size

     @rank_zero_only
-    def save_checkpoint_and_update(self, trainer: Trainer):
+    def save_checkpoint_and_update(self, trainer: Trainer, mode: str):
         """Saving new snapshot and remove the oldest snapshot if needed."""
         iteration = trainer.updater.state.iteration
         epoch = trainer.updater.state.epoch
         num = epoch if self.trigger[1] == 'epoch' else iteration
-        path = self.checkpoint_dir / f"{num}.pdz"
+        path = self.checkpoint_dir / f"{num}.np"

         # add the new one
         trainer.updater.save(path)
@@ -97,11 +110,17 @@ class Snapshot(extension.Extension):
             'path': str(path.resolve()),  # use absolute path
             'iteration': iteration,
             'epoch': epoch,
+            'indicator': get_observations()[self.indicator]
         }
         self.records.append(record)
# remove the earliest # remove the earliest
if self.full(): if self.full():
if mode == 'kbest':
self.records = sorted(
self.records,
key=lambda record: record['indicator'],
reverse=not self.less_is_better)
eariest_record = self.records[0] eariest_record = self.records[0]
os.remove(eariest_record["path"]) os.remove(eariest_record["path"])
self.records.pop(0) self.records.pop(0)

@ -11,8 +11,10 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
from deepspeech.training.extensions import extension from visualdl import LogWriter
from deepspeech.training.updaters.trainer import Trainer
from . import extension
from ..updaters.trainer import Trainer
class VisualDL(extension.Extension): class VisualDL(extension.Extension):
@ -26,8 +28,8 @@ class VisualDL(extension.Extension):
default_name = 'visualdl' default_name = 'visualdl'
priority = extension.PRIORITY_READER priority = extension.PRIORITY_READER
def __init__(self, writer): def __init__(self, output_dir):
self.writer = writer self.writer = LogWriter(str(output_dir))
def __call__(self, trainer: Trainer): def __call__(self, trainer: Trainer):
for k, v in trainer.observation.items(): for k, v in trainer.observation.items():

@ -47,7 +47,7 @@ class ClipGradByGlobalNormWithLog(paddle.nn.ClipGradByGlobalNorm):
sum_square = layers.reduce_sum(square) sum_square = layers.reduce_sum(square)
sum_square_list.append(sum_square) sum_square_list.append(sum_square)
# debug log # debug log, do not dump all since it slows down the training process
if i < 10: if i < 10:
logger.debug( logger.debug(
f"Grad Before Clip: {p.name}: {float(sum_square.sqrt()) }") f"Grad Before Clip: {p.name}: {float(sum_square.sqrt()) }")
@ -76,7 +76,7 @@ class ClipGradByGlobalNormWithLog(paddle.nn.ClipGradByGlobalNorm):
new_grad = layers.elementwise_mul(x=g, y=clip_var) new_grad = layers.elementwise_mul(x=g, y=clip_var)
params_and_grads.append((p, new_grad)) params_and_grads.append((p, new_grad))
# debug log # debug log, do not dump all since it slows down the training process
if i < 10: if i < 10:
logger.debug( logger.debug(
f"Grad After Clip: {p.name}: {float(new_grad.square().sum().sqrt())}" f"Grad After Clip: {p.name}: {float(new_grad.square().sum().sqrt())}"

@ -19,7 +19,7 @@ OBSERVATIONS = None
@contextlib.contextmanager @contextlib.contextmanager
def scope(observations): def ObsScope(observations):
# make `observation` the target to report to. # make `observation` the target to report to.
# it is basically a dictionary that stores temporary observations # it is basically a dictionary that stores temporary observations
global OBSERVATIONS global OBSERVATIONS

@ -0,0 +1,50 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import datetime
import time
from deepspeech.utils.log import Log
__all__ = ["Timer"]
logger = Log(__name__).getlog()
class Timer():
"""To be used like this:
with Timer("Message") as value:
do something
"""
def __init__(self, message=None):
self.message = message
def duration(self) -> str:
elapsed_time = time.time() - self.start
time_str = str(datetime.timedelta(seconds=elapsed_time))
return time_str
def __enter__(self):
self.start = time.time()
return self
def __exit__(self, type, value, traceback):
if self.message:
logger.info(self.message.format(self.duration()))
def __call__(self) -> float:
return time.time() - self.start
def __str__(self):
return self.duration()

@ -11,17 +11,24 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import sys
import time import time
from collections import OrderedDict
from pathlib import Path from pathlib import Path
import paddle import paddle
from paddle import distributed as dist from paddle import distributed as dist
from tensorboardX import SummaryWriter from tensorboardX import SummaryWriter
from deepspeech.training.reporter import ObsScope
from deepspeech.training.reporter import report
from deepspeech.training.timer import Timer
from deepspeech.utils import mp_tools from deepspeech.utils import mp_tools
from deepspeech.utils import profiler
from deepspeech.utils.checkpoint import Checkpoint from deepspeech.utils.checkpoint import Checkpoint
from deepspeech.utils.log import Log from deepspeech.utils.log import Log
from deepspeech.utils.utility import seed_all from deepspeech.utils.utility import seed_all
from deepspeech.utils.utility import UpdateConfig
__all__ = ["Trainer"] __all__ = ["Trainer"]
@ -79,7 +86,7 @@ class Trainer():
>>> config.merge_from_list(args.opts) >>> config.merge_from_list(args.opts)
>>> config.freeze() >>> config.freeze()
>>> >>>
>>> if args.nprocs > 1 and args.device == "gpu": >>> if args.nprocs > 0:
>>> dist.spawn(main_sp, args=(config, args), nprocs=args.nprocs) >>> dist.spawn(main_sp, args=(config, args), nprocs=args.nprocs)
>>> else: >>> else:
>>> main_sp(config, args) >>> main_sp(config, args)
@ -94,15 +101,25 @@ class Trainer():
self.checkpoint_dir = None self.checkpoint_dir = None
self.iteration = 0 self.iteration = 0
self.epoch = 0 self.epoch = 0
self.rank = dist.get_rank()
logger.info(f"Rank: {self.rank}/{dist.get_world_size()}")
if args.seed: if args.seed:
seed_all(args.seed) seed_all(args.seed)
logger.info(f"Set seed {args.seed}") logger.info(f"Set seed {args.seed}")
if self.args.benchmark_batch_size:
with UpdateConfig(self.config):
self.config.collator.batch_size = self.args.benchmark_batch_size
self.config.training.log_interval = 1
logger.info(
f"Benchmark reset batch-size: {self.args.benchmark_batch_size}")
def setup(self): def setup(self):
"""Setup the experiment. """Setup the experiment.
""" """
paddle.set_device(self.args.device) paddle.set_device('gpu' if self.args.nprocs > 0 else 'cpu')
if self.parallel: if self.parallel:
self.init_parallel() self.init_parallel()
@ -122,7 +139,7 @@ class Trainer():
"""A flag indicating whether the experiment should run with """A flag indicating whether the experiment should run with
multiprocessing. multiprocessing.
""" """
return self.args.device == "gpu" and self.args.nprocs > 1 return self.args.nprocs > 0
def init_parallel(self): def init_parallel(self):
"""Init environment for multiprocess training. """Init environment for multiprocess training.
@ -162,56 +179,97 @@ class Trainer():
checkpoint_dir=self.checkpoint_dir, checkpoint_dir=self.checkpoint_dir,
checkpoint_path=self.args.checkpoint_path) checkpoint_path=self.args.checkpoint_path)
if infos: if infos:
# restore from ckpt # just restore ckpt
# lr will restore from optimizer ckpt
self.iteration = infos["step"] self.iteration = infos["step"]
self.epoch = infos["epoch"] self.epoch = infos["epoch"]
scratch = False scratch = False
logger.info(
f"Restore ckpt: epoch {self.epoch }, step {self.iteration}!")
else: else:
self.iteration = 0 self.iteration = 0
self.epoch = 0 self.epoch = 0
scratch = True scratch = True
logger.info("Init from scratch!")
return scratch return scratch
def new_epoch(self): def maybe_batch_sampler_step(self):
"""Reset the train loader seed and increment `epoch`. """ batch_sampler seed by epoch """
""" if hasattr(self.train_loader, "batch_sampler"):
self.epoch += 1
if self.parallel and hasattr(self.train_loader, "batch_sampler"):
batch_sampler = self.train_loader.batch_sampler batch_sampler = self.train_loader.batch_sampler
if isinstance(batch_sampler, paddle.io.DistributedBatchSampler): if isinstance(batch_sampler, paddle.io.DistributedBatchSampler):
batch_sampler.set_epoch(self.epoch) batch_sampler.set_epoch(self.epoch)
def train(self): def before_train(self):
"""The training process control by epoch."""
from_scratch = self.resume_or_scratch() from_scratch = self.resume_or_scratch()
if from_scratch: if from_scratch:
# save init model, i.e. 0 epoch # scratch: save init model, i.e. 0 epoch
self.save(tag='init', infos=None) self.save(tag='init', infos=None)
self.lr_scheduler.step(self.epoch) else:
if self.parallel and hasattr(self.train_loader, "batch_sampler"): # resume: train next_epoch and next_iteration
self.train_loader.batch_sampler.set_epoch(self.epoch) self.epoch += 1
self.iteration += 1
logger.info(
f"Resume train: epoch {self.epoch }, step {self.iteration}!")
self.maybe_batch_sampler_step()
def new_epoch(self):
"""Reset the train loader seed and increment `epoch`.
"""
# `iteration` increased by train step
self.epoch += 1
self.maybe_batch_sampler_step()
def after_train_batch(self):
if self.args.benchmark_max_step and self.iteration > self.args.benchmark_max_step:
profiler.add_profiler_step(self.args.profiler_options)
logger.info(
f"Reach benchmark-max-step: {self.args.benchmark_max_step}")
sys.exit(
f"Reach benchmark-max-step: {self.args.benchmark_max_step}")
def train(self):
"""The training process control by epoch."""
self.before_train()
logger.info(f"Train Total Examples: {len(self.train_loader.dataset)}") logger.info(f"Train Total Examples: {len(self.train_loader.dataset)}")
while self.epoch < self.config.training.n_epoch: while self.epoch < self.config.training.n_epoch:
with Timer("Epoch-Train Time Cost: {}"):
self.model.train() self.model.train()
try: try:
data_start_time = time.time() data_start_time = time.time()
for batch_index, batch in enumerate(self.train_loader): for batch_index, batch in enumerate(self.train_loader):
dataload_time = time.time() - data_start_time dataload_time = time.time() - data_start_time
msg = "Train: Rank: {}, ".format(dist.get_rank()) msg = "Train:"
msg += "epoch: {}, ".format(self.epoch) observation = OrderedDict()
msg += "step: {}, ".format(self.iteration) with ObsScope(observation):
msg += "batch : {}/{}, ".format(batch_index + 1, report("Rank", dist.get_rank())
len(self.train_loader)) report("epoch", self.epoch)
msg += "lr: {:>.8f}, ".format(self.lr_scheduler()) report('step', self.iteration)
msg += "data time: {:>.3f}s, ".format(dataload_time) report("lr", self.lr_scheduler())
self.train_batch(batch_index, batch, msg) self.train_batch(batch_index, batch, msg)
self.after_train_batch()
report('iter', batch_index + 1)
report('total', len(self.train_loader))
report('reader_cost', dataload_time)
observation['batch_cost'] = observation[
'reader_cost'] + observation['step_cost']
observation['samples'] = observation['batch_size']
observation['ips[sent./sec]'] = observation[
'batch_size'] / observation['batch_cost']
for k, v in observation.items():
msg += f" {k}: "
msg += f"{v:>.8f}" if isinstance(v,
float) else f"{v}"
msg += ","
logger.info(msg)
data_start_time = time.time() data_start_time = time.time()
except Exception as e: except Exception as e:
logger.error(e) logger.error(e)
raise e raise e
with Timer("Eval Time Cost: {}"):
total_loss, num_seen_utts = self.valid() total_loss, num_seen_utts = self.valid()
if dist.get_world_size() > 1: if dist.get_world_size() > 1:
num_seen_utts = paddle.to_tensor(num_seen_utts) num_seen_utts = paddle.to_tensor(num_seen_utts)
@ -231,6 +289,7 @@ class Trainer():
'epoch', {'cv_loss': cv_loss, 'epoch', {'cv_loss': cv_loss,
'lr': self.lr_scheduler()}, self.epoch) 'lr': self.lr_scheduler()}, self.epoch)
# after epoch
self.save(tag=self.epoch, infos={'val_loss': cv_loss}) self.save(tag=self.epoch, infos={'val_loss': cv_loss})
# step lr every epoch # step lr every epoch
self.lr_scheduler.step() self.lr_scheduler.step()
@ -240,14 +299,13 @@ class Trainer():
"""The routine of the experiment after setup. This method is intended """The routine of the experiment after setup. This method is intended
to be used by the user. to be used by the user.
""" """
with Timer("Training Done: {}"):
try: try:
self.train() self.train()
except KeyboardInterrupt: except KeyboardInterrupt:
self.save()
exit(-1) exit(-1)
finally: finally:
self.destory() self.destory()
logger.info("Training Done.")
def setup_output_dir(self): def setup_output_dir(self):
"""Create a directory used for output. """Create a directory used for output.

@ -14,12 +14,12 @@
from typing import Dict from typing import Dict
from typing import Optional from typing import Optional
from paddle import Tensor import paddle
from paddle.io import DataLoader from paddle.io import DataLoader
from paddle.io import DistributedBatchSampler from paddle.io import DistributedBatchSampler
from paddle.nn import Layer from paddle.nn import Layer
from paddle.optimizer import Optimizer from paddle.optimizer import Optimizer
from timer import timer from paddle.optimizer.lr import LRScheduler
from deepspeech.training.reporter import report from deepspeech.training.reporter import report
from deepspeech.training.updaters.updater import UpdaterBase from deepspeech.training.updaters.updater import UpdaterBase
@ -39,8 +39,10 @@ class StandardUpdater(UpdaterBase):
def __init__(self, def __init__(self,
model: Layer, model: Layer,
optimizer: Optimizer, optimizer: Optimizer,
scheduler: LRScheduler,
dataloader: DataLoader, dataloader: DataLoader,
init_state: Optional[UpdaterState]=None): init_state: Optional[UpdaterState]=None):
super().__init__(init_state)
# it is designed to hold multiple models # it is designed to hold multiple models
models = {"main": model} models = {"main": model}
self.models: Dict[str, Layer] = models self.models: Dict[str, Layer] = models
@ -51,15 +53,14 @@ class StandardUpdater(UpdaterBase):
self.optimizer = optimizer self.optimizer = optimizer
self.optimizers: Dict[str, Optimizer] = optimizers self.optimizers: Dict[str, Optimizer] = optimizers
# it is designed to hold multiple scheduler
schedulers = {"main": scheduler}
self.scheduler = scheduler
self.schedulers: Dict[str, LRScheduler] = schedulers
# dataloaders # dataloaders
self.dataloader = dataloader self.dataloader = dataloader
# init state
if init_state is None:
self.state = UpdaterState()
else:
self.state = init_state
self.train_iterator = iter(dataloader) self.train_iterator = iter(dataloader)
def update(self): def update(self):
@ -103,7 +104,9 @@ class StandardUpdater(UpdaterBase):
model.train() model.train()
# training for a step is implemented here # training for a step is implemented here
with Timer("data time cost:{}"):
batch = self.read_batch() batch = self.read_batch()
with Timer("step time cost:{}"):
self.update_core(batch) self.update_core(batch)
self.state.iteration += 1 self.state.iteration += 1
@ -115,13 +118,14 @@ class StandardUpdater(UpdaterBase):
"""A simple case for a training step. Basic assumptions are: """A simple case for a training step. Basic assumptions are:
Single model; Single model;
Single optimizer; Single optimizer;
Single scheduler, and update learning rate each step;
A batch from the dataloader is just the input of the model; A batch from the dataloader is just the input of the model;
The model returns a single loss, or a dict containing several losses. The model returns a single loss, or a dict containing several losses.
Parameters updates at every batch, no gradient accumulation. Parameters updates at every batch, no gradient accumulation.
""" """
loss = self.model(*batch) loss = self.model(*batch)
if isinstance(loss, Tensor): if isinstance(loss, paddle.Tensor):
loss_dict = {"main": loss} loss_dict = {"main": loss}
else: else:
# Dict[str, Tensor] # Dict[str, Tensor]
@ -135,14 +139,15 @@ class StandardUpdater(UpdaterBase):
for name, loss_item in loss_dict.items(): for name, loss_item in loss_dict.items():
report(name, float(loss_item)) report(name, float(loss_item))
self.optimizer.clear_gradient() self.optimizer.clear_grad()
loss_dict["main"].backward() loss_dict["main"].backward()
self.optimizer.update() self.optimizer.step()
self.scheduler.step()
@property @property
def updates_per_epoch(self): def updates_per_epoch(self):
"""Number of updater per epoch, determined by the length of the """Number of steps per epoch,
dataloader.""" determined by the length of the dataloader."""
length_of_dataloader = None length_of_dataloader = None
try: try:
length_of_dataloader = len(self.dataloader) length_of_dataloader = len(self.dataloader)
@ -163,18 +168,16 @@ class StandardUpdater(UpdaterBase):
def read_batch(self): def read_batch(self):
"""Read a batch from the data loader, auto renew when data is exhausted.""" """Read a batch from the data loader, auto renew when data is exhausted."""
with timer() as t:
try: try:
batch = next(self.train_iterator) batch = next(self.train_iterator)
except StopIteration: except StopIteration:
self.new_epoch() self.new_epoch()
batch = next(self.train_iterator) batch = next(self.train_iterator)
logger.debug(
f"Read a batch takes {t.elapse}s.") # replace it with logger
return batch return batch
def state_dict(self): def state_dict(self):
"""State dict of a Updater, model, optimizer and updater state are included.""" """State dict of a Updater, model, optimizers/schedulers
and updater state are included."""
state_dict = super().state_dict() state_dict = super().state_dict()
for name, model in self.models.items(): for name, model in self.models.items():
state_dict[f"{name}_params"] = model.state_dict() state_dict[f"{name}_params"] = model.state_dict()
@ -184,7 +187,7 @@ class StandardUpdater(UpdaterBase):
def set_state_dict(self, state_dict): def set_state_dict(self, state_dict):
"""Set state dict for a Updater. Parameters of models, states for """Set state dict for a Updater. Parameters of models, states for
optimizers and UpdaterState are restored.""" optimizers/schedulers and UpdaterState are restored."""
for name, model in self.models.items(): for name, model in self.models.items():
model.set_state_dict(state_dict[f"{name}_params"]) model.set_state_dict(state_dict[f"{name}_params"])
for name, optim in self.optimizers.items(): for name, optim in self.optimizers.items():

@ -24,7 +24,7 @@ import tqdm
from deepspeech.training.extensions.extension import Extension from deepspeech.training.extensions.extension import Extension
from deepspeech.training.extensions.extension import PRIORITY_READER from deepspeech.training.extensions.extension import PRIORITY_READER
from deepspeech.training.reporter import scope from deepspeech.training.reporter import ObsScope
from deepspeech.training.triggers import get_trigger from deepspeech.training.triggers import get_trigger
from deepspeech.training.triggers.limit_trigger import LimitTrigger from deepspeech.training.triggers.limit_trigger import LimitTrigger
from deepspeech.training.updaters.updater import UpdaterBase from deepspeech.training.updaters.updater import UpdaterBase
@ -140,11 +140,11 @@ class Trainer():
try: try:
while not stop_trigger(self): while not stop_trigger(self):
self.observation = {} self.observation = {}
# set observation as the report target # set observation as the `report` target
# you can use report freely in Updater.update() # you can use `report` freely in Updater.update()
# updating parameters and state # updating parameters and state
with scope(self.observation): with ObsScope(self.observation):
update() update()
p.update() p.update()

@ -52,6 +52,7 @@ class UpdaterBase():
""" """
def __init__(self, init_state=None): def __init__(self, init_state=None):
# init state
if init_state is None: if init_state is None:
self.state = UpdaterState() self.state = UpdaterState()
else: else:

@ -114,13 +114,13 @@ class Checkpoint():
params_path = checkpoint_path + ".pdparams" params_path = checkpoint_path + ".pdparams"
model_dict = paddle.load(params_path) model_dict = paddle.load(params_path)
model.set_state_dict(model_dict) model.set_state_dict(model_dict)
logger.info("Rank {}: loaded model from {}".format(rank, params_path)) logger.info("Rank {}: Restore model from {}".format(rank, params_path))
optimizer_path = checkpoint_path + ".pdopt" optimizer_path = checkpoint_path + ".pdopt"
if optimizer and os.path.isfile(optimizer_path): if optimizer and os.path.isfile(optimizer_path):
optimizer_dict = paddle.load(optimizer_path) optimizer_dict = paddle.load(optimizer_path)
optimizer.set_state_dict(optimizer_dict) optimizer.set_state_dict(optimizer_dict)
logger.info("Rank {}: loaded optimizer state from {}".format( logger.info("Rank {}: Restore optimizer state from {}".format(
rank, optimizer_path)) rank, optimizer_path))
info_path = re.sub('.pdparams$', '.json', params_path) info_path = re.sub('.pdparams$', '.json', params_path)

@ -84,19 +84,19 @@ def forced_align(ctc_probs: paddle.Tensor, y: paddle.Tensor,
y_insert_blank = insert_blank(y, blank_id) #(2L+1) y_insert_blank = insert_blank(y, blank_id) #(2L+1)
log_alpha = paddle.zeros( log_alpha = paddle.zeros(
(ctc_probs.size(0), len(y_insert_blank))) #(T, 2L+1) (ctc_probs.shape[0], len(y_insert_blank))) #(T, 2L+1)
log_alpha = log_alpha - float('inf') # log of zero log_alpha = log_alpha - float('inf') # log of zero
# TODO(Hui Zhang): zeros not support paddle.int16
# self.__setitem_varbase__(item, value) When assign a value to a paddle.Tensor, the data type of the paddle.Tensor not support int16
state_path = (paddle.zeros( state_path = (paddle.zeros(
(ctc_probs.size(0), len(y_insert_blank)), dtype=paddle.int32) - 1 (ctc_probs.shape[0], len(y_insert_blank)), dtype=paddle.int32) - 1
) # state path, Tuple((T, 2L+1)) ) # state path, Tuple((T, 2L+1))
# init start state # init start state
# TODO(Hui Zhang): VarBase.__getitem__() not support np.int64 log_alpha[0, 0] = ctc_probs[0][y_insert_blank[0]] # State-b, Sb
log_alpha[0, 0] = ctc_probs[0][int(y_insert_blank[0])] # State-b, Sb log_alpha[0, 1] = ctc_probs[0][y_insert_blank[1]] # State-nb, Snb
log_alpha[0, 1] = ctc_probs[0][int(y_insert_blank[1])] # State-nb, Snb
for t in range(1, ctc_probs.size(0)): # T for t in range(1, ctc_probs.shape[0]): # T
for s in range(len(y_insert_blank)): # 2L+1 for s in range(len(y_insert_blank)): # 2L+1
if y_insert_blank[s] == blank_id or s < 2 or y_insert_blank[ if y_insert_blank[s] == blank_id or s < 2 or y_insert_blank[
s] == y_insert_blank[s - 2]: s] == y_insert_blank[s - 2]:
@ -110,13 +110,11 @@ def forced_align(ctc_probs: paddle.Tensor, y: paddle.Tensor,
log_alpha[t - 1, s - 2], log_alpha[t - 1, s - 2],
]) ])
prev_state = [s, s - 1, s - 2] prev_state = [s, s - 1, s - 2]
# TODO(Hui Zhang): VarBase.__getitem__() not support np.int64 log_alpha[t, s] = paddle.max(candidates) + ctc_probs[t][
log_alpha[t, s] = paddle.max(candidates) + ctc_probs[t][int( y_insert_blank[s]]
y_insert_blank[s])]
state_path[t, s] = prev_state[paddle.argmax(candidates)] state_path[t, s] = prev_state[paddle.argmax(candidates)]
# self.__setitem_varbase__(item, value) When assign a value to a paddle.Tensor, the data type of the paddle.Tensor not support int16
# TODO(Hui Zhang): zeros not support paddle.int16 state_seq = -1 * paddle.ones((ctc_probs.shape[0], 1), dtype=paddle.int32)
state_seq = -1 * paddle.ones((ctc_probs.size(0), 1), dtype=paddle.int32)
candidates = paddle.to_tensor([ candidates = paddle.to_tensor([
log_alpha[-1, len(y_insert_blank) - 1], # Sb log_alpha[-1, len(y_insert_blank) - 1], # Sb
@ -124,11 +122,11 @@ def forced_align(ctc_probs: paddle.Tensor, y: paddle.Tensor,
]) ])
prev_state = [len(y_insert_blank) - 1, len(y_insert_blank) - 2] prev_state = [len(y_insert_blank) - 1, len(y_insert_blank) - 2]
state_seq[-1] = prev_state[paddle.argmax(candidates)] state_seq[-1] = prev_state[paddle.argmax(candidates)]
for t in range(ctc_probs.size(0) - 2, -1, -1): for t in range(ctc_probs.shape[0] - 2, -1, -1):
state_seq[t] = state_path[t + 1, state_seq[t + 1, 0]] state_seq[t] = state_path[t + 1, state_seq[t + 1, 0]]
output_alignment = [] output_alignment = []
for t in range(0, ctc_probs.size(0)): for t in range(0, ctc_probs.shape[0]):
output_alignment.append(y_insert_blank[state_seq[t, 0]]) output_alignment.append(y_insert_blank[state_seq[t, 0]])
return output_alignment return output_alignment

@ -12,19 +12,13 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import getpass import getpass
import logging
import os import os
import socket import socket
import sys import sys
from loguru import logger
from paddle import inference from paddle import inference
FORMAT_STR = '[%(levelname)s %(asctime)s %(filename)s:%(lineno)d] %(message)s'
DATE_FMT_STR = '%Y/%m/%d %H:%M:%S'
logging.basicConfig(
level=logging.DEBUG, format=FORMAT_STR, datefmt=DATE_FMT_STR)
def find_log_dir(log_dir=None): def find_log_dir(log_dir=None):
"""Returns the most suitable directory to put log files into. """Returns the most suitable directory to put log files into.
@ -98,59 +92,28 @@ def find_log_dir_and_names(program_name=None, log_dir=None):
class Log(): class Log():
"""Default Logger for all."""
log_name = None logger.remove()
logger.add(
def __init__(self, logger=None): sys.stdout,
self.logger = logging.getLogger(logger) level='INFO',
self.logger.setLevel(logging.DEBUG) enqueue=True,
filter=lambda record: record['level'].no >= 20)
file_dir = os.getcwd() + '/log' _, file_prefix, _ = find_log_dir_and_names()
if not os.path.exists(file_dir): sink_prefix = os.path.join("exp/log", file_prefix)
os.mkdir(file_dir) sink_path = sink_prefix[:-3] + "{time}.log"
self.log_dir = file_dir logger.add(sink_path, level='DEBUG', enqueue=True, rotation="500 MB")
actual_log_dir, file_prefix, symlink_prefix = find_log_dir_and_names( def __init__(self, name=None):
program_name=None, log_dir=self.log_dir)
basename = '%s.DEBUG.%d' % (file_prefix, os.getpid())
filename = os.path.join(actual_log_dir, basename)
if Log.log_name is None:
Log.log_name = filename
# Create a symlink to the log file with a canonical name.
symlink = os.path.join(actual_log_dir, symlink_prefix + '.DEBUG')
try:
if os.path.islink(symlink):
os.unlink(symlink)
os.symlink(os.path.basename(Log.log_name), symlink)
except EnvironmentError:
# If it fails, we're sad but it's no error. Commonly, this
# fails because the symlink was created by another user and so
# we can't modify it
pass pass
if not self.logger.hasHandlers():
formatter = logging.Formatter(fmt=FORMAT_STR, datefmt=DATE_FMT_STR)
fh = logging.FileHandler(Log.log_name)
fh.setLevel(logging.DEBUG)
fh.setFormatter(formatter)
self.logger.addHandler(fh)
ch = logging.StreamHandler()
ch.setLevel(logging.INFO)
ch.setFormatter(formatter)
self.logger.addHandler(ch)
# stop propagate for propagating may print
# log multiple times
self.logger.propagate = False
def getlog(self): def getlog(self):
return self.logger return logger
class Autolog: class Autolog:
"""Just used by fullchain project"""
def __init__(self, def __init__(self,
batch_size, batch_size,
model_name="DeepSpeech", model_name="DeepSpeech",

@ -0,0 +1,119 @@
# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import paddle
from deepspeech.utils.log import Log
logger = Log(__name__).getlog()
# A global variable to record the number of calling times for profiler
# functions. It is used to specify the tracing range of training steps.
_profiler_step_id = 0
# A global variable to avoid parsing from string every time.
_profiler_options = None
class ProfilerOptions(object):
'''
Use a string to initialize a ProfilerOptions.
The string should be in the format: "key1=value1;key2=value;key3=value3".
For example:
"profile_path=model.profile"
"batch_range=[50, 60]; profile_path=model.profile"
"batch_range=[50, 60]; tracer_option=OpDetail; profile_path=model.profile"
ProfilerOptions supports following key-value pair:
batch_range - an integer list, e.g. [100, 110].
state - a string, the optional values are 'CPU', 'GPU' or 'All'.
sorted_key - a string, the optional values are 'calls', 'total',
'max', 'min' or 'ave'.
tracer_option - a string, the optional values are 'Default', 'OpDetail',
'AllOpDetail'.
profile_path - a string, the path to save the serialized profile data,
which can be used to generate a timeline.
exit_on_finished - a boolean.
'''
def __init__(self, options_str):
assert isinstance(options_str, str)
self._options = {
'batch_range': [10, 20],
'state': 'All',
'sorted_key': 'total',
'tracer_option': 'Default',
'profile_path': '/tmp/profile',
'exit_on_finished': True
}
self._parse_from_string(options_str)
def _parse_from_string(self, options_str):
if not options_str:
return
for kv in options_str.replace(' ', '').split(';'):
key, value = kv.split('=')
if key == 'batch_range':
value_list = value.replace('[', '').replace(']', '').split(',')
value_list = list(map(int, value_list))
if len(value_list) >= 2 and value_list[0] >= 0 and value_list[
1] > value_list[0]:
self._options[key] = value_list
elif key == 'exit_on_finished':
self._options[key] = value.lower() in ("yes", "true", "t", "1")
elif key in [
'state', 'sorted_key', 'tracer_option', 'profile_path'
]:
self._options[key] = value
def __getitem__(self, name):
if self._options.get(name, None) is None:
raise ValueError(
"ProfilerOptions does not have an option named %s." % name)
return self._options[name]
def add_profiler_step(options_str=None):
'''
Enable the operator-level timing using PaddlePaddle's profiler.
The profiler uses an independent variable to count the profiler steps.
One call of this function is treated as a profiler step.
Args:
options_str - a string used to initialize the ProfilerOptions.
Default is None, and the profiler is disabled.
'''
if options_str is None:
return
global _profiler_step_id
global _profiler_options
if _profiler_options is None:
_profiler_options = ProfilerOptions(options_str)
logger.info(f"Profiler: {options_str}")
logger.info(f"Profiler: {_profiler_options._options}")
if _profiler_step_id == _profiler_options['batch_range'][0]:
paddle.utils.profiler.start_profiler(_profiler_options['state'],
_profiler_options['tracer_option'])
elif _profiler_step_id == _profiler_options['batch_range'][1]:
paddle.utils.profiler.stop_profiler(_profiler_options['sorted_key'],
_profiler_options['profile_path'])
if _profiler_options['exit_on_finished']:
sys.exit(0)
_profiler_step_id += 1
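# Example usage (illustrative only, not part of this module; `loader` and
# `train_one_step` are hypothetical names): call it once per training step, e.g.
#
#     for step, batch in enumerate(loader):
#         train_one_step(batch)
#         add_profiler_step("batch_range=[10, 20]; profile_path=/tmp/profile")
#
# Profiling starts when the internal step counter reaches 10, stops at 20, dumps
# the result to /tmp/profile and, since exit_on_finished defaults to True, exits.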

@ -83,7 +83,7 @@ def pad_sequence(sequences: List[paddle.Tensor],
# (TODO Hui Zhang): slice not supprot `end==start` # (TODO Hui Zhang): slice not supprot `end==start`
# trailing_dims = max_size[1:] # trailing_dims = max_size[1:]
trailing_dims = max_size[1:] if max_size.ndim >= 2 else () trailing_dims = max_size[1:] if max_size.ndim >= 2 else ()
max_len = max([s.size(0) for s in sequences]) max_len = max([s.shape[0] for s in sequences])
if batch_first: if batch_first:
out_dims = (len(sequences), max_len) + trailing_dims out_dims = (len(sequences), max_len) + trailing_dims
else: else:
@ -91,12 +91,22 @@ def pad_sequence(sequences: List[paddle.Tensor],
out_tensor = sequences[0].new_full(out_dims, padding_value) out_tensor = sequences[0].new_full(out_dims, padding_value)
for i, tensor in enumerate(sequences): for i, tensor in enumerate(sequences):
length = tensor.size(0) length = tensor.shape[0]
# use index notation to prevent duplicate references to the tensor # use index notation to prevent duplicate references to the tensor
if batch_first: if batch_first:
# TODO (Hui Zhang): set_value op does not support `end==start`
# out_tensor[i, :length, ...] = tensor
if length != 0:
out_tensor[i, :length, ...] = tensor out_tensor[i, :length, ...] = tensor
else: else:
out_tensor[i, length, ...] = tensor
else:
# TODO (Hui Zhang): set_value op does not support `end==start`
# out_tensor[:length, i, ...] = tensor
if length != 0:
out_tensor[:length, i, ...] = tensor out_tensor[:length, i, ...] = tensor
else:
out_tensor[length, i, ...] = tensor
return out_tensor return out_tensor
@ -139,7 +149,7 @@ def add_sos_eos(ys_pad: paddle.Tensor, sos: int, eos: int,
#ys_in = [paddle.cat([_sos, y], dim=0) for y in ys] #ys_in = [paddle.cat([_sos, y], dim=0) for y in ys]
#ys_out = [paddle.cat([y, _eos], dim=0) for y in ys] #ys_out = [paddle.cat([y, _eos], dim=0) for y in ys]
#return pad_sequence(ys_in, padding_value=eos), pad_sequence(ys_out, padding_value=ignore_id) #return pad_sequence(ys_in, padding_value=eos), pad_sequence(ys_out, padding_value=ignore_id)
B = ys_pad.size(0) B = ys_pad.shape[0]
_sos = paddle.ones([B, 1], dtype=ys_pad.dtype) * sos _sos = paddle.ones([B, 1], dtype=ys_pad.dtype) * sos
_eos = paddle.ones([B, 1], dtype=ys_pad.dtype) * eos _eos = paddle.ones([B, 1], dtype=ys_pad.dtype) * eos
ys_in = paddle.cat([_sos, ys_pad], dim=1) ys_in = paddle.cat([_sos, ys_pad], dim=1)
@ -165,16 +175,10 @@ def th_accuracy(pad_outputs: paddle.Tensor,
Returns: Returns:
float: Accuracy value (0.0 - 1.0). float: Accuracy value (0.0 - 1.0).
""" """
pad_pred = pad_outputs.view( pad_pred = pad_outputs.view(pad_targets.shape[0], pad_targets.shape[1],
pad_targets.size(0), pad_targets.size(1), pad_outputs.size(1)).argmax(2) pad_outputs.shape[1]).argmax(2)
mask = pad_targets != ignore_label mask = pad_targets != ignore_label
#TODO(Hui Zhang): sum not support bool type numerator = paddle.sum(
# numerator = paddle.sum(
# pad_pred.masked_select(mask) == pad_targets.masked_select(mask))
numerator = (
pad_pred.masked_select(mask) == pad_targets.masked_select(mask)) pad_pred.masked_select(mask) == pad_targets.masked_select(mask))
numerator = paddle.sum(numerator.type_as(pad_targets)) denominator = paddle.sum(mask)
#TODO(Hui Zhang): sum not support bool type
# denominator = paddle.sum(mask)
denominator = paddle.sum(mask.type_as(pad_targets))
return float(numerator) / float(denominator) return float(numerator) / float(denominator)

@ -16,15 +16,27 @@ import distutils.util
import math import math
import os import os
import random import random
from contextlib import contextmanager
from typing import List from typing import List
import numpy as np import numpy as np
import paddle import paddle
__all__ = ["seed_all", 'print_arguments', 'add_arguments', "log_add"] __all__ = [
"UpdateConfig", "seed_all", 'print_arguments', 'add_arguments', "log_add"
]
@contextmanager
def UpdateConfig(config):
"""Update yacs config"""
config.defrost()
yield
config.freeze()
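# Illustrative usage (mirrors how the Trainer overrides the benchmark batch size):
#
#     with UpdateConfig(config):
#         config.collator.batch_size = 32
#
# The yacs config is defrosted on entry and frozen again on exit.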
def seed_all(seed: int=210329): def seed_all(seed: int=210329):
"""freeze random generator seed."""
np.random.seed(seed) np.random.seed(seed)
random.seed(seed) random.seed(seed)
paddle.seed(seed) paddle.seed(seed)

Binary file not shown.


Binary file not shown.


@ -1,16 +0,0 @@
# Benchmarks
## Acceleration with Multi-GPUs
We compare the training time with 1, 2, 4, 8 Tesla V100 GPUs (with a subset of LibriSpeech samples whose audio durations are between 6.0 and 7.0 seconds). And it shows that a **near-linear** acceleration with multiple GPUs has been achieved. In the following figure, the time (in seconds) cost for training is printed on the blue bars.
<img src="../images/multi_gpu_speedup.png" width=450>
| # of GPU | Acceleration Rate |
| -------- | --------------: |
| 1 | 1.00 X |
| 2 | 1.98 X |
| 4 | 3.73 X |
| 8 | 6.95 X |
`utils/profile.sh` provides such a demo profiling tool; you can modify it as needed.

@ -1,37 +0,0 @@
# FAQ
1. How much does audio speed perturbation affect the recognition rate?
Speed perturbation generally improves recognition; speed factors of 0.9, 1.0 and 1.1 are commonly used.
2. How much does volume affect the recognition rate?
Training usually normalizes the volume into a fixed range; fluctuations that are too large (roughly beyond 10 dB ~ 20 dB) will hurt training.
3. What is the minimum amount of speech data required for training?
Aishell-1 has about 178 hours of data; the more data, the better.
4. Which noises or background sounds affect the recognition rate?
Mainly interfering human voices and low signal-to-noise ratio.
5. What is the length limit for a single utterance?
Training utterances are usually limited to 1 s ~ 6 s, depending on the training configuration.
6. Do background sounds need to be separated or denoised before recognition?
Yes, separation is needed; the approach depends on the specific scenario.
7. Does the model include VAD (voice activity detection)?
VAD is a separate model or module; this model does not include it.
8. Is long-form speech recognition supported?
Usually the audio is segmented by VAD first and then recognized.
9. What hardware is required for the Mandarin LM Large language model?
Enough memory to hold the LM is sufficient.

@ -1,3 +0,0 @@
# Reference
* [wenet](https://github.com/mobvoi/wenet)

@ -1,9 +0,0 @@
# Released Models
## Language Model Released
Language Model | Training Data | Token-based | Size | Descriptions
:-------------:| :------------:| :-----: | -----: | :-----------------
[English LM](https://deepspeech.bj.bcebos.com/en_lm/common_crawl_00.prune01111.trie.klm) | [CommonCrawl(en.00)](http://web-language-models.s3-website-us-east-1.amazonaws.com/ngrams/en/deduped/en.00.deduped.xz) | Word-based | 8.3 GB | Pruned with 0 1 1 1 1; <br/> About 1.85 billion n-grams; <br/> 'trie' binary with '-a 22 -q 8 -b 8'
[Mandarin LM Small](https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm) | Baidu Internal Corpus | Char-based | 2.8 GB | Pruned with 0 1 2 4 4; <br/> About 0.13 billion n-grams; <br/> 'probing' binary with default settings
[Mandarin LM Large](https://deepspeech.bj.bcebos.com/zh_lm/zhidao_giga.klm) | Baidu Internal Corpus | Char-based | 70.4 GB | No Pruning; <br/> About 3.7 billion n-grams; <br/> 'probing' binary with default settings

@ -1,34 +0,0 @@
# Trying Live Demo with Your Own Voice
Until now, an ASR model has been trained and tested qualitatively (`infer`) and quantitatively (`test`) with existing audio files, but not yet with your own speech. We build a real-time demo ASR engine with the trained model, enabling you to test and play around with the demo using your own voice.
First, change your directory to `examples/aishell` and `source path.sh`.
To start the demo's server, please run this in one console:
```bash
CUDA_VISIBLE_DEVICES=0 bash local/server.sh
```
For the machine (might not be the same machine) to run the demo's client, please do the following installation before moving on.
For example, on macOS:
```bash
brew install portaudio
pip install pyaudio
pip install keyboard
```
Then to start the client, please run this in another console:
```bash
CUDA_VISIBLE_DEVICES=0 bash local/client.sh
```
Now, in the client console, press and hold the `whitespace` key and start speaking. When you finish your utterance, release the key, and the speech-to-text results will be shown in the console. To quit the client, just press the `ESC` key.
Notice that `deepspeech/exps/deepspeech2/deploy/client.py` must be run on a machine with a microphone device, while `deepspeech/exps/deepspeech2/deploy/server.py` could be run on one without any audio recording hardware, e.g. any remote server machine. Just be careful to set the `host_ip` and `host_port` arguments to the actual accessible IP address and port if the server and client are running on two separate machines. Nothing needs to be done if they are running on one single machine.
Please also refer to `examples/aishell/local/server.sh`, which will first download a pre-trained Chinese model (trained with AISHELL1) and then start the demo server with the model. With running `examples/aishell/local/client.sh`, you can speak Chinese to test it. If you would like to try some other models, just update `--checkpoint_path` argument in the script.  

Binary file not shown.


Binary file not shown.


@ -0,0 +1,190 @@
# Deepspeech2
## Streaming
The implemented architecture of the DeepSpeech2 online model is based on the [Deepspeech2 model](https://arxiv.org/pdf/1512.02595.pdf) with some changes.
The model is mainly composed of a 2D convolution subsampling layer and stacked single-direction RNN layers.
To illustrate the model implementation clearly, 3 parts are described in detail.
- Data Preparation
- Encoder
- Decoder
In addition, the training process and the testing process are also introduced.
The architecture of the model is shown in Fig.1.
<p align="center">
<img src="../images/ds2onlineModel.png" width=800>
<br/>Fig.1 The architecture of the DeepSpeech2 online model
</p>
### Data Preparation
#### Vocabulary
For English data, the vocabulary dictionary is composed of the 26 English letters, " ' ", space, \<blank\>, \<unk\> and \<eos\>. \<blank\> represents the blank label in CTC, \<unk\> represents the unknown character, and \<eos\> represents the start and end characters. For Mandarin, the vocabulary dictionary is composed of the Chinese characters collected from the training set, plus the same three additional characters \<blank\>, \<unk\> and \<eos\>. For both English and Mandarin data, the default indices are \<blank\>=0, \<unk\>=1 and \<eos\>=last index.
```
# The code to build vocabulary
cd examples/aishell/s0
python3 ../../../utils/build_vocab.py \
--unit_type="char" \
--count_threshold=0 \
--vocab_path="data/vocab.txt" \
--manifest_paths "data/manifest.train.raw" "data/manifest.dev.raw"
# vocabulary for aishell dataset (Mandarin)
vi examples/aishell/s0/data/vocab.txt
# vocabulary for librispeech dataset (English)
vi examples/librispeech/s0/data/vocab.txt
```
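For illustration only (the tokens below are hypothetical, not the released vocabulary), the default index convention can be expressed as a simple token-to-id mapping:
```
# hypothetical vocabulary, not the released one: <blank>=0, <unk>=1, <eos>=last index
tokens = ["<blank>", "<unk>", " ", "'", "a", "b", "c", "<eos>"]
token2id = {tok: idx for idx, tok in enumerate(tokens)}

assert token2id["<blank>"] == 0
assert token2id["<unk>"] == 1
assert token2id["<eos>"] == len(tokens) - 1
```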
#### CMVN
For CMVN, a subset of the training set (or the full set) is chosen and used to compute the feature mean and std.
```
# The code to compute the feature mean and std
cd examples/aishell/s0
python3 ../../../utils/compute_mean_std.py \
--manifest_path="data/manifest.train.raw" \
--specgram_type="linear" \
--delta_delta=false \
--stride_ms=10.0 \
--window_ms=20.0 \
--sample_rate=16000 \
--use_dB_normalization=True \
--num_samples=2000 \
--num_workers=10 \
--output_path="data/mean_std.json"
```
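For intuition only (not the project's exact featurizer code; the array below is fake data), applying CMVN simply normalizes each feature dimension with the stored mean and std:
```
import numpy as np

feat = np.random.randn(100, 161).astype("float32")  # (num_frames, feat_dim), fake data
mean = feat.mean(axis=0)   # in practice these statistics come from data/mean_std.json
std = feat.std(axis=0)

cmvn_feat = (feat - mean) / (std + 1e-20)            # zero mean, unit variance per dim
print(cmvn_feat.mean(axis=0)[:3], cmvn_feat.std(axis=0)[:3])
```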
#### Feature Extraction
For feature extraction, three methods are implemented: linear (FFT without using a filter bank), fbank and mfcc.
Currently, the released DeepSpeech2 online model uses the linear feature extraction method.
```
# The code for feature extraction
vi deepspeech/frontend/featurizer/audio_featurizer.py
```
### Encoder
The encoder is composed of two 2D convolution subsampling layers and a number of stacked single-direction RNN layers. The convolution subsampling layers extract a feature representation from the raw audio features and reduce the length of the feature sequence at the same time. The resulting representation is then fed into the stacked RNN layers, for which both LSTM and GRU cells are available. Adding one fully connected (fc) layer after the stacked RNN layers is optional; if the number of stacked RNN layers is less than 5, adding one fc layer after them is recommended.
The code of Encoder is in:
```
vi deepspeech/models/ds2_online/deepspeech2.py
```
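As a rough illustration only (not the actual code in `deepspeech2.py`; all layer sizes below are made-up placeholders), a minimal encoder with one convolution subsampling layer and stacked unidirectional GRU layers could look like this:
```
import paddle
from paddle import nn


class ToyOnlineEncoder(nn.Layer):
    """Sketch only: one conv subsampling layer + stacked unidirectional GRUs."""

    def __init__(self, feat_dim=161, hidden_size=1024, num_rnn_layers=5):
        super().__init__()
        # 2D conv over (time, freq); stride 2 halves the time axis (subsampling).
        self.conv = nn.Conv2D(1, 32, kernel_size=(3, 3), stride=(2, 2), padding=1)
        conv_out_dim = 32 * ((feat_dim + 1) // 2)
        self.rnn = nn.GRU(conv_out_dim, hidden_size,
                          num_layers=num_rnn_layers, direction="forward")
        self.fc = nn.Linear(hidden_size, hidden_size)  # optional fc layer

    def forward(self, x):
        # x: (batch, time, feat_dim)
        x = self.conv(x.unsqueeze(1))                  # (B, 32, T', F')
        b, c, t, f = x.shape
        x = x.transpose([0, 2, 1, 3]).reshape([b, t, c * f])
        x, _ = self.rnn(x)                             # (B, T', hidden_size)
        return self.fc(x)


encoder = ToyOnlineEncoder()
feats = paddle.randn([4, 100, 161])                    # (batch, time, feat_dim)
print(encoder(feats).shape)                            # [4, 50, 1024]
```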
### Decoder
To get the character probabilities of each frame, the frame-level feature representations output by the encoder are fed into a projection layer, implemented as a dense layer whose output dimension equals the vocabulary size. After the projection layer, a softmax transforms the frame-level features into per-frame character probabilities. During inference, the character probabilities of each frame are fed into the CTC decoder to get the final speech recognition results.
The code of the decoder is in:
```
# The code of constructing the decoder in model
vi deepspeech/models/ds2_online/deepspeech2.py
# The code of CTC Decoder
vi deepspeech/modules/ctc.py
```
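For intuition only, here is a minimal sketch of the projection + softmax step followed by greedy CTC decoding; the released model uses a proper CTC beam-search decoder, and `vocab_size`/`hidden_size` below are placeholder values:
```
import paddle
from paddle import nn

vocab_size = 4233      # placeholder: must equal the real vocabulary size
hidden_size = 1024     # placeholder: must equal the encoder output size

projection = nn.Linear(hidden_size, vocab_size)


def greedy_ctc_decode(encoder_out, blank_id=0):
    """encoder_out: (batch, time, hidden_size) -> list of token-id sequences."""
    probs = nn.functional.softmax(projection(encoder_out), axis=-1)
    best = paddle.argmax(probs, axis=-1).numpy()       # (batch, time)
    results = []
    for path in best:
        tokens, prev = [], blank_id
        for t in path:                                  # collapse repeats, drop blanks
            if t != blank_id and t != prev:
                tokens.append(int(t))
            prev = t
        results.append(tokens)
    return results


print(greedy_ctc_decode(paddle.randn([2, 50, hidden_size])))
```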
## Training Process
Using the command below, you can train the deepspeech2 online model.
```
cd examples/aishell/s0
bash run.sh --stage 0 --stop_stage 2 --model_type online --conf_path conf/deepspeech2_online.yaml
```
The detail commands are:
```
# The code for training in run.sh
set -e
source path.sh
gpus=2,3,5,7
stage=0
stop_stage=5
conf_path=conf/deepspeech2_online.yaml # conf/deepspeech2.yaml | conf/deepspeech2_online.yaml
avg_num=1
model_type=online # online | offline
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
avg_ckpt=avg_${avg_num}
ckpt=$(basename ${conf_path} | awk -F'.' '{print $1}')
echo "checkpoint name ${ckpt}"
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# prepare data
bash ./local/data.sh || exit -1
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# train model, all `ckpt` under `exp` dir
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt} ${model_type}
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# avg n best model
avg.sh exp/${ckpt}/checkpoints ${avg_num}
fi
```
By using the command above, the training process can be started. The first 3 stages of `run.sh` are used for training. Stage 0 prepares the data: the dataset is downloaded, and the manifest files, vocabulary dictionary and CMVN file are generated in `./data/`. Stage 1 trains the model; the log files and model checkpoints are saved in `exp/deepspeech2_online/`. Stage 2 generates the final model for prediction by averaging the top-k checkpoints selected by validation loss.
## Testing Process
Using the command below, you can test the deepspeech2 online model.
```
bash run.sh --stage 3 --stop_stage 5 --model_type online --conf_path conf/deepspeech2_online.yaml
```
The detail commands are:
```
conf_path=conf/deepspeech2_online.yaml
avg_num=1
model_type=online
avg_ckpt=avg_${avg_num}
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# test ckpt avg_n
CUDA_VISIBLE_DEVICES=2 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${model_type}|| exit -1
fi
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
# export ckpt avg_n
CUDA_VISIBLE_DEVICES=5 ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit ${model_type}
fi
if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
# test export ckpt avg_n
CUDA_VISIBLE_DEVICES=0 ./local/test_export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt}.jit ${model_type}|| exit -1
fi
```
After training, stages 3, 4 and 5 are used for testing. Stage 3 tests the model generated in stage 2 and reports the CER on the test set. Stage 4 exports the model from dynamic graph to static graph using the `paddle.jit` library. Stage 5 tests the exported static-graph model.
## Non-Streaming
The DeepSpeech2 offline model is similar to the online model. The main difference is that the offline model uses stacked bi-directional RNN layers, whereas the online model uses single-direction RNN layers and no fc layer. For the stacked bi-directional RNN layers in the offline model, both the vanilla RNN cell and the GRU cell are available.
The architecture of the model is shown in Fig.2.
<p align="center">
<img src="../images/ds2offlineModel.png" width=800>
<br/>Fig.2 The architecture of the DeepSpeech2 offline model
</p>
For data preparation and the decoder, the DeepSpeech2 offline model is the same as the online model.
The code of the encoder and decoder for the DeepSpeech2 offline model is in:
```
vi deepspeech/models/ds2/deepspeech2.py
```
The training and testing processes of the DeepSpeech2 offline model are very similar to those of the online model; only a few changes should be noticed.
For training and testing, the "model_type" and the "conf_path" must be set accordingly.
```
# Training offline
cd examples/aishell/s0
bash run.sh --stage 0 --stop_stage 2 --model_type offline --conf_path conf/deepspeech2.yaml
```
```
# Testing offline
cd examples/aishell/s0
bash run.sh --stage 3 --stop_stage 5 --model_type offline --conf_path conf/deepspeech2.yaml
```

@ -1,13 +1,20 @@
# Features # Features
### Dataset
* Aishell
* Librispeech
* THCHS30
* TIMIT
### Speech Recognition ### Speech Recognition
* Offline * Non-Streaming
* [Baidu's DeepSpeech2](http://proceedings.mlr.press/v48/amodei16.pdf) * [Baidu's DeepSpeech2](http://proceedings.mlr.press/v48/amodei16.pdf)
* [Transformer](https://arxiv.org/abs/1706.03762) * [Transformer](https://arxiv.org/abs/1706.03762)
* [Conformer](https://arxiv.org/abs/2005.08100) * [Conformer](https://arxiv.org/abs/2005.08100)
* Online * Streaming
* [Baidu's DeepSpeech2](http://proceedings.mlr.press/v48/amodei16.pdf)
* [U2](https://arxiv.org/pdf/2012.05481.pdf) * [U2](https://arxiv.org/pdf/2012.05481.pdf)
### Language Model ### Language Model
@ -22,6 +29,15 @@
* beam search * beam search
* attention rescore * attention rescore
### Deployment
* Paddle Inference
### Alignment
* MFA
* CTC Alignment
### Speech Frontend ### Speech Frontend
* Audio * Audio

@ -4,15 +4,16 @@ To avoid the trouble of environment setup, [running in Docker container](#runnin
## Prerequisites ## Prerequisites
- Python >= 3.7 - Python >= 3.7
- PaddlePaddle 2.0.0 or later (please refer to the [Installation Guide](https://www.paddlepaddle.org.cn/documentation/docs/en/beginners_guide/index_en.html)) - PaddlePaddle latest version (please refer to the [Installation Guide](https://www.paddlepaddle.org.cn/documentation/docs/en/beginners_guide/index_en.html))
## Setup ## Setup (Important)
- Make sure these libraries or tools are installed: `pkg-config`, `flac`, `ogg`, `vorbis`, `boost`, `sox`, and `swig`, e.g. installing them via `apt-get`: - Make sure these libraries or tools are installed: `pkg-config`, `flac`, `ogg`, `vorbis`, `boost`, `sox`, and `swig`, e.g. installing them via `apt-get`:
```bash ```bash
sudo apt-get install -y sox pkg-config libflac-dev libogg-dev libvorbis-dev libboost-dev swig python3-dev sudo apt-get install -y sox pkg-config libflac-dev libogg-dev libvorbis-dev libboost-dev swig python3-dev
``` ```
The version of `swig` should be >= 3.0.
or, installing them via `yum`: or, installing them via `yum`:

@ -35,52 +35,3 @@ Different from the English language model, Mandarin language model is character-
* A whitespace character between two tokens is inserted. * A whitespace character between two tokens is inserted.
Please notice that the released language models only contain Chinese simplified characters. After preprocessing done we can begin to train the language model. The key training arguments for small LM is '-o 5 --prune 0 1 2 4 4' and '-o 5' for large LM. Please refer above section for the meaning of each argument. We also convert the arpa file to binary file using default settings. Please notice that the released language models only contain Chinese simplified characters. After preprocessing done we can begin to train the language model. The key training arguments for small LM is '-o 5 --prune 0 1 2 4 4' and '-o 5' for large LM. Please refer above section for the meaning of each argument. We also convert the arpa file to binary file using default settings.
## [KenLM](http://kheafield.com/code/kenlm/)
Among statistical language model toolkits, SRILM and KenLM are currently the most widely used. KenLM appeared later than SRILM, trains faster, and supports training on large data with a single machine. Below is a brief guide to using KenLM.
1. Download: http://kheafield.com/code/kenlm.tar.gz
2. Usage. The tool is convenient to use on Linux. First make sure Boost 1.36.0 (or later) and zlib are installed:
```
boost:
yum install boost
yum install boost-devel
zlib:
yum install zlib
yum install zlib-devel
```
The gcc version needs to be 4.8.2 or later.
```
wget -O - https://kheafield.com/code/kenlm.tar.gz |tar xz
mkdir kenlm/build
cd kenlm/build
cmake ..
make -j2
```
3. Training. Train with the following command:
```
build/bin/lmplz -o 3 --verbose_header --text people2014corpus_words.txt --arpa result/people2014corpus_words.arps
```
Where:
1) people2014corpus_words.txt must be a word-segmented text file.
The training corpus is the People's Daily 2014 corpus, which includes: 1) manually segmented text with POS tags (people2014.tar.gz); 2) unsegmented plain text (people2014_words.txt); 3) a character-level KenLM language model and its binary (people2014corpus_chars.arps/klm); 4) a word-level KenLM language model and its binary (people2014corpus_words.arps/klm).
2) The number after `-o` is the n-gram order (e.g. 5 for a 5-gram model); an order of 3 is usually sufficient, but choose it according to your own situation.
4. Compression. Compress the model into a binary file so that it loads quickly:
```
build/bin/build_binary ./result/people2014corpus_words.arps ./result/people2014corpus_words.klm
```
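For completeness, a minimal query example, assuming the KenLM Python bindings are built and installed from the same source tree (the model path below matches the training commands above):
```
import kenlm  # Python bindings built from the KenLM source tree

model = kenlm.Model("result/people2014corpus_words.klm")
sentence = "今天 天气 不错"                       # word-segmented, space-separated text
print(model.score(sentence, bos=True, eos=True))  # log10 probability
print(model.perplexity(sentence))
```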

@ -0,0 +1,8 @@
# Reference
We refer these repos to build `model` and `engine`:
* [delta](https://github.com/Delta-ML/delta.git)
* [espnet](https://github.com/espnet/espnet.git)
* [kaldi](https://github.com/kaldi-asr/kaldi.git)
* [wenet](https://github.com/mobvoi/wenet)

@ -0,0 +1,28 @@
# Released Models
## Acoustic Model Released in paddle 2.X
Acoustic Model | Training Data | Token-based | Size | Descriptions | CER or WER | Hours of speech
:-------------:| :------------:| :-----: | -----: | :----------------- | :---------- | :---------
[Ds2 Online Aishell Model](https://deepspeech.bj.bcebos.com/release2.1/aishell/s0/aishell.s0.ds_online.5rnn.debug.tar.gz) | Aishell Dataset | Char-based | 345 MB | 2 Conv + 5 LSTM layers with only forward direction | 0.0824 | 151 h
[Ds2 Offline Aishell Model](https://deepspeech.bj.bcebos.com/release2.1/aishell/s0/aishell.s0.ds2.offline.cer6p65.release.tar.gz)| Aishell Dataset | Char-based | 306 MB | 2 Conv + 3 bidirectional GRU layers| 0.065 | 151 h
[Conformer Online Aishell Model](https://deepspeech.bj.bcebos.com/release2.1/aishell/s1/aishell.chunk.release.tar.gz) | Aishell Dataset | Char-based | 283 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention + CTC | 0.0594 | 151 h
[Conformer Offline Aishell Model](https://deepspeech.bj.bcebos.com/release2.1/aishell/s1/aishell.release.tar.gz) | Aishell Dataset | Char-based | 284 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention | 0.0547 | 151 h
[Conformer Librispeech Model](https://deepspeech.bj.bcebos.com/release2.1/librispeech/s1/conformer.release.tar.gz) | Librispeech Dataset | Word-based | 287 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention | 0.0325 | 960 h
[Transformer Librispeech Model](https://deepspeech.bj.bcebos.com/release2.1/librispeech/s1/transformer.release.tar.gz) | Librispeech Dataset | Word-based | 195 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention | 0.0544 | 960 h
## Acoustic Model Transformed from paddle 1.8
Acoustic Model | Training Data | Token-based | Size | Descriptions | CER or WER | Hours of speech
:-------------:| :------------:| :-----: | -----: | :----------------- | :---------- | :---------
[Ds2 Offline Aishell model](https://deepspeech.bj.bcebos.com/mandarin_models/aishell_model_v1.8_to_v2.x.tar.gz)|Aishell Dataset| Char-based| 234 MB| 2 Conv + 3 bidirectional GRU layers| 0.0804 | 151 h|
[Ds2 Offline Librispeech model](https://deepspeech.bj.bcebos.com/eng_models/librispeech_v1.8_to_v2.x.tar.gz)|Librispeech Dataset| Word-based| 307 MB| 2 Conv + 3 bidirectional sharing weight RNN layers | 0.0685| 960 h|
[Ds2 Offline Baidu en8k model](https://deepspeech.bj.bcebos.com/eng_models/baidu_en8k_v1.8_to_v2.x.tar.gz)|Baidu Internal English Dataset| Word-based| 273 MB| 2 Conv + 3 bidirectional GRU layers | 0.0541 | 8628 h|
## Language Model Released
Language Model | Training Data | Token-based | Size | Descriptions
:-------------:| :------------:| :-----: | -----: | :-----------------
[English LM](https://deepspeech.bj.bcebos.com/en_lm/common_crawl_00.prune01111.trie.klm) | [CommonCrawl(en.00)](http://web-language-models.s3-website-us-east-1.amazonaws.com/ngrams/en/deduped/en.00.deduped.xz) | Word-based | 8.3 GB | Pruned with 0 1 1 1 1; <br/> About 1.85 billion n-grams; <br/> 'trie' binary with '-a 22 -q 8 -b 8'
[Mandarin LM Small](https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm) | Baidu Internal Corpus | Char-based | 2.8 GB | Pruned with 0 1 2 4 4; <br/> About 0.13 billion n-grams; <br/> 'probing' binary with default settings
[Mandarin LM Large](https://deepspeech.bj.bcebos.com/zh_lm/zhidao_giga.klm) | Baidu Internal Corpus | Char-based | 70.4 GB | No Pruning; <br/> About 3.7 billion n-grams; <br/> 'probing' binary with default settings

@ -0,0 +1,11 @@
# 1xt2x
Convert Deepspeech 1.8 released model to 2.x.
## Model
* Deepspeech2x
## Exp
* baidu_en8k
* aishell
* librispeech

@ -0,0 +1,4 @@
exp
data
*log
tmp

Some files were not shown because too many files have changed in this diff.
