diff --git a/.notebook/espnet_dataloader.ipynb b/.notebook/espnet_dataloader.ipynb
index 5d182979..12870a8e 100644
--- a/.notebook/espnet_dataloader.ipynb
+++ b/.notebook/espnet_dataloader.ipynb
@@ -10,13 +10,13 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "/workspace/DeepSpeech-2.x\n"
+      "/workspace/zhanghui/DeepSpeech-2.x\n"
      ]
     },
     {
      "data": {
       "text/plain": [
-       "'/workspace/DeepSpeech-2.x'"
+       "'/workspace/zhanghui/DeepSpeech-2.x'"
       ]
      },
      "execution_count": 1,
@@ -31,7 +31,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 8,
    "id": "correct-window",
    "metadata": {},
    "outputs": [
@@ -45,22 +45,22 @@
     }
    ],
    "source": [
-    "!ls /workspace/DeepSpeech-2.x/examples/librispeech/s2/data/"
+    "!ls /workspace/zhanghui/DeepSpeech-2.x/examples/librispeech/s2/data/"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 9,
    "id": "exceptional-cheese",
    "metadata": {},
    "outputs": [],
    "source": [
-    "dev_data='/workspace/DeepSpeech-2.x/examples/librispeech/s2/data/manifest.dev'"
+    "dev_data='/workspace/zhanghui/DeepSpeech-2.x/examples/librispeech/s2/data/manifest.dev'"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 11,
    "id": "extraordinary-orleans",
    "metadata": {},
    "outputs": [
@@ -68,6 +68,7 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
+      "grep: warning: GREP_OPTIONS is deprecated; please use an alias or script\n",
       "register user softmax to paddle, remove this when fixed!\n",
       "register user log_softmax to paddle, remove this when fixed!\n",
       "register user sigmoid to paddle, remove this when fixed!\n",
@@ -105,26 +106,17 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 12,
    "id": "returning-lighter",
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/ipykernel/ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.\n",
-      "  and should_run_async(code)\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "dev_json = read_manifest(dev_data)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 17,
+   "execution_count": 13,
    "id": "western-founder",
    "metadata": {},
    "outputs": [
@@ -166,7 +158,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 19,
+   "execution_count": 14,
    "id": "motivated-receptor",
    "metadata": {},
    "outputs": [],
@@ -646,19 +638,10 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 40,
+   "execution_count": 15,
    "id": "acquired-hurricane",
    "metadata": {},
    "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "[INFO 2021/08/17 04:09:47 <ipython-input-19-4c01301916ec>:284] use shuffled batch.\n",
-      "[INFO 2021/08/17 04:09:47 <ipython-input-19-4c01301916ec>:286] # utts: 5542\n",
-      "[INFO 2021/08/17 04:09:47 <ipython-input-19-4c01301916ec>:467] # minibatches: 555\n"
-     ]
-    },
     {
      "name": "stdout",
      "output_type": "stream",
@@ -703,7 +686,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 50,
+   "execution_count": 16,
    "id": "warming-malpractice",
    "metadata": {},
    "outputs": [
@@ -713,16 +696,16 @@
      "text": [
       "Collecting kaldiio\n",
       "  Downloading kaldiio-2.17.2.tar.gz (24 kB)\n",
-      "Requirement already satisfied: numpy in ./tools/venv/lib/python3.7/site-packages (from kaldiio) (1.20.1)\n",
+      "Requirement already satisfied: numpy in ./tools/venv/lib/python3.7/site-packages/numpy-1.21.2-py3.7-linux-x86_64.egg (from kaldiio) (1.21.2)\n",
       "Building wheels for collected packages: kaldiio\n",
       "  Building wheel for kaldiio (setup.py) ... \u001b[?25ldone\n",
-      "\u001b[?25h  Created wheel for kaldiio: filename=kaldiio-2.17.2-py3-none-any.whl size=24469 sha256=aadc8b1a8de5c9769af065ae724fb11326691d2350145019f6e3dba69f020134\n",
+      "\u001b[?25h  Created wheel for kaldiio: filename=kaldiio-2.17.2-py3-none-any.whl size=24468 sha256=cd6e066764dcc8c24a9dfe3f7bd8acda18761a6fbcb024995729da8debdb466e\n",
       "  Stored in directory: /root/.cache/pip/wheels/04/07/e8/45641287c59bf6ce41e22259f8680b521c31e6306cb88392ac\n",
       "Successfully built kaldiio\n",
       "Installing collected packages: kaldiio\n",
       "Successfully installed kaldiio-2.17.2\n",
-      "\u001b[33mWARNING: You are using pip version 20.0.1; however, version 21.2.4 is available.\n",
-      "You should consider upgrading via the '/workspace/DeepSpeech-2.x/tools/venv/bin/python -m pip install --upgrade pip' command.\u001b[0m\n"
+      "\u001b[33mWARNING: You are using pip version 20.3.3; however, version 21.2.4 is available.\n",
+      "You should consider upgrading via the '/workspace/zhanghui/DeepSpeech-2.x/tools/venv/bin/python -m pip install --upgrade pip' command.\u001b[0m\n"
      ]
     }
    ],
@@ -740,7 +723,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 54,
+   "execution_count": 19,
    "id": "superb-methodology",
    "metadata": {},
    "outputs": [],
@@ -1046,7 +1029,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 55,
+   "execution_count": 20,
    "id": "monthly-muscle",
    "metadata": {},
    "outputs": [],
@@ -1064,70 +1047,263 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 56,
+   "execution_count": 23,
    "id": "periodic-senegal",
    "metadata": {},
+   "outputs": [],
+   "source": [
+    "res = load(dev_data[0])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 34,
+   "id": "7f0307eb",
+   "metadata": {},
    "outputs": [
     {
-     "ename": "FileNotFoundError",
-     "evalue": "[Errno 2] No such file or directory: '/workspace/zhanghui/asr/espnet/egs/librispeech/asr1/dump/dev/deltafalse/feats.12.ark'",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[0;31mFileNotFoundError\u001b[0m                         Traceback (most recent call last)",
-      "\u001b[0;32m<ipython-input-56-9f483b231463>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mres\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mload\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdev_data\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
-      "\u001b[0;32m<ipython-input-54-9deb677b23d5>\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self, batch, return_uttid)\u001b[0m\n\u001b[1;32m     94\u001b[0m                     x = self._get_from_loader(\n\u001b[1;32m     95\u001b[0m                         \u001b[0mfilepath\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0minp\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"feat\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 96\u001b[0;31m                         filetype=inp.get(\"filetype\", \"mat\"))\n\u001b[0m\u001b[1;32m     97\u001b[0m                     \u001b[0mx_feats_dict\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msetdefault\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0minp\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"name\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     98\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
-      "\u001b[0;32m<ipython-input-54-9deb677b23d5>\u001b[0m in \u001b[0;36m_get_from_loader\u001b[0;34m(self, filepath, filetype)\u001b[0m\n\u001b[1;32m    278\u001b[0m             \u001b[0;31m# load_mat can load both matrix and vector\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    279\u001b[0m             \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mkeep_all_data_on_mem\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 280\u001b[0;31m                 \u001b[0;32mreturn\u001b[0m \u001b[0mkaldiio\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mload_mat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfilepath\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    281\u001b[0m             \u001b[0;32mif\u001b[0m \u001b[0mfilepath\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_loaders\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    282\u001b[0m                 \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_loaders\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mfilepath\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mkaldiio\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mload_mat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfilepath\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
-      "\u001b[0;32m/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/kaldiio/matio.py\u001b[0m in \u001b[0;36mload_mat\u001b[0;34m(ark_name, endian, fd_dict)\u001b[0m\n\u001b[1;32m    238\u001b[0m         \u001b[0;32mreturn\u001b[0m \u001b[0m_load_mat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfd\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0moffset\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mslices\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mendian\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mendian\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    239\u001b[0m     \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 240\u001b[0;31m         \u001b[0;32mwith\u001b[0m \u001b[0mopen_like_kaldi\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mark\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"rb\"\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mfd\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    241\u001b[0m             \u001b[0;32mreturn\u001b[0m \u001b[0m_load_mat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfd\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0moffset\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mslices\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mendian\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mendian\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    242\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
-      "\u001b[0;32m/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/kaldiio/utils.py\u001b[0m in \u001b[0;36mopen_like_kaldi\u001b[0;34m(name, mode)\u001b[0m\n\u001b[1;32m    206\u001b[0m     \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    207\u001b[0m         \u001b[0mencoding\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;34m\"b\"\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mmode\u001b[0m \u001b[0;32melse\u001b[0m \u001b[0mdefault_encoding\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 208\u001b[0;31m         \u001b[0;32mreturn\u001b[0m \u001b[0mio\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmode\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mencoding\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mencoding\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    209\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    210\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
-      "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: '/workspace/zhanghui/asr/espnet/egs/librispeech/asr1/dump/dev/deltafalse/feats.12.ark'"
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "<class 'tuple'>\n",
+      "2\n",
+      "10\n",
+      "10\n",
+      "(1763, 83) float32\n",
+      "(73,) int64\n"
      ]
     }
    ],
    "source": [
-    "res = load(dev_data[0])"
+    "print(type(res))\n",
+    "print(len(res))\n",
+    "print(len(res[0]))\n",
+    "print(len(res[1]))\n",
+    "print(res[0][0].shape, res[0][0].dtype)\n",
+    "print(res[1][0].shape, res[1][0].dtype)\n",
+    "# Tuple[Tuple[np.ndarray], Tuple[np.ndarray]]\n",
+    "# 2[10, 10]\n",
+    "# feats, labels"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 73,
+   "execution_count": 36,
    "id": "humanitarian-container",
    "metadata": {},
+   "outputs": [],
+   "source": [
+    "(inputs, outputs), utts = load(dev_data[0], return_uttid=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 39,
+   "id": "heard-prize",
+   "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "ls: cannot access '/workspace/zhanghui/asr/espnet/egs/librispeech/asr1/dump/dev/deltafalse/feats.12.ark': No such file or directory\r\n"
+      "['1673-143396-0008', '1650-173552-0000', '2803-154320-0000', '6267-65525-0045', '7641-96684-0029', '5338-284437-0010', '8173-294714-0033', '5543-27761-0047', '8254-115543-0043', '6467-94831-0038'] 10\n",
+      "10\n"
      ]
     }
    ],
    "source": [
-    "!ls /workspace/zhanghui/asr/espnet/egs/librispeech/asr1/dump/dev/deltafalse/feats.12.ark"
+    "print(utts, len(utts))\n",
+    "print(len(inputs))"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 77,
-   "id": "heard-prize",
+   "execution_count": 83,
+   "id": "convinced-animation",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import paddle\n",
+    "from deepspeech.io.utility import pad_list\n",
+    "class CustomConverter():\n",
+    "    \"\"\"Custom batch converter.\n",
+    "\n",
+    "    Args:\n",
+    "        subsampling_factor (int): The subsampling factor.\n",
+    "        dtype (np.dtype): Data type the padded inputs are cast to.\n",
+    "\n",
+    "    \"\"\"\n",
+    "\n",
+    "    def __init__(self, subsampling_factor=1, dtype=np.float32):\n",
+    "        \"\"\"Construct a CustomConverter object.\"\"\"\n",
+    "        self.subsampling_factor = subsampling_factor\n",
+    "        self.ignore_id = -1\n",
+    "        self.dtype = dtype\n",
+    "\n",
+    "    def __call__(self, batch):\n",
+    "        \"\"\"Transform a batch into padded arrays and their lengths.\n",
+    "\n",
+    "        Args:\n",
+    "            batch (list): The batch to transform.\n",
+    "\n",
+    "        Returns:\n",
+    "            tuple: (utts, xs_pad, ilens, ys_pad, olens)\n",
+    "\n",
+    "        \"\"\"\n",
+    "        # batch should be located in list\n",
+    "        assert len(batch) == 1\n",
+    "        (xs, ys), utts = batch[0]\n",
+    "\n",
+    "        # perform subsampling\n",
+    "        if self.subsampling_factor > 1:\n",
+    "            xs = [x[::self.subsampling_factor, :] for x in xs]\n",
+    "\n",
+    "        # get batch of lengths of input sequences\n",
+    "        ilens = np.array([x.shape[0] for x in xs])\n",
+    "\n",
+    "        # perform padding and convert to tensor\n",
+    "        # currently only support real number\n",
+    "        if xs[0].dtype.kind == \"c\":\n",
+    "            xs_pad_real = pad_list([x.real for x in xs], 0).astype(self.dtype)\n",
+    "            xs_pad_imag = pad_list([x.imag for x in xs], 0).astype(self.dtype)\n",
+    "            # Note(kamo):\n",
+    "            # {'real': ..., 'imag': ...} will be changed to ComplexTensor in E2E.\n",
+    "            # Don't create ComplexTensor and give it E2E here\n",
+    "            # because torch.nn.DataParallel can't handle it.\n",
+    "            xs_pad = {\"real\": xs_pad_real, \"imag\": xs_pad_imag}\n",
+    "        else:\n",
+    "            xs_pad = pad_list(xs, 0).astype(self.dtype)\n",
+    "\n",
+    "        # NOTE: this is for multi-output (e.g., speech translation)\n",
+    "        ys_pad = pad_list(\n",
+    "            [np.array(y[0][:]) if isinstance(y, tuple) else y for y in ys],\n",
+    "            self.ignore_id)\n",
+    "\n",
+    "        olens = np.array([y[0].shape[0] if isinstance(y, tuple) else y.shape[0] for y in ys])\n",
+    "        return utts, xs_pad, ilens, ys_pad, olens"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 84,
+   "id": "1b6508fc",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "convert = CustomConverter()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 85,
+   "id": "25d655c0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "utts, xs, ilen, ys, olen = convert([load(dev_data[0], return_uttid=True)])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 87,
+   "id": "a28e5141",
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "ls: cannot access '/workspace/espnet/': No such file or directory\r\n"
+      "['1673-143396-0008', '1650-173552-0000', '2803-154320-0000', '6267-65525-0045', '7641-96684-0029', '5338-284437-0010', '8173-294714-0033', '5543-27761-0047', '8254-115543-0043', '6467-94831-0038']\n",
+      "(10, 1763, 83)\n",
+      "(10,)\n",
+      "[1763 1214 1146  757  751  661  625  512  426  329]\n",
+      "(10, 73)\n",
+      "[[2896  621 4502 2176  404  198 3538  391  278  407  389 3719 4577  846\n",
+      "  4501  482 1004  103  116  178 4222  624 4689  176  459   89  101 3465\n",
+      "  3204 4502 2029 1834 2298  829 3366  278 4705 4925  482 2920 3204 2481\n",
+      "   448  627 1254  404   20  202   36 2047  627 2495 4504  481  479   99\n",
+      "    18 2079 4502 1628  202  226 4512 3267  210  278  483  234  367 4502\n",
+      "  2438 3204 1141]\n",
+      " [ 742 4501 4768 4569  742 4483 2495 4502 3040 3204 4502 3961 3204 3992\n",
+      "  3089 4832 4258  621 2391 4642 3218 4502 3439  235  270  313 2385 2833\n",
+      "   742 4502 3282  332    3  280 4237 3252  830 2387   -1   -1   -1   -1\n",
+      "    -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1\n",
+      "    -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1\n",
+      "    -1   -1   -1]\n",
+      " [2099  278 4904 2302  124 4832 3158  482 2888 2495  482 2450  627 1560\n",
+      "  3158 4729  482 3514 3204 1027 3233 2391 2862  399  389 4962 2495  121\n",
+      "   221    7 2340 1216 1658   -1   -1   -1   -1   -1   -1   -1   -1   -1\n",
+      "    -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1\n",
+      "    -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1\n",
+      "    -1   -1   -1]\n",
+      " [2458 2659 1362    2  404 4975 4995  487 3079 2785 2371 3158  824 2603\n",
+      "  4832 2323  999 2603 4832 4156 4678  627 1784   -1   -1   -1   -1   -1\n",
+      "    -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1\n",
+      "    -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1\n",
+      "    -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1\n",
+      "    -1   -1   -1]\n",
+      " [2458 2340 1661  101 4723 2138 4502 4690  463  332  251 2345 4534 4502\n",
+      "  2396  444 4501 2287  389 4531 4894 1466  959  389 1658 2584 4502 3681\n",
+      "   279 3204 4502 2228 3204 4502 4690  463  332  251   -1   -1   -1   -1\n",
+      "    -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1\n",
+      "    -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1\n",
+      "    -1   -1   -1]\n",
+      " [2368 1248  208 4832 3158  482 1473 3401  999  482 4159 3838  389  478\n",
+      "  4572  404 3158 3063 1481  113 4499 4501 3204 4643    2  389 4111   -1\n",
+      "    -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1\n",
+      "    -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1\n",
+      "    -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1\n",
+      "    -1   -1   -1]\n",
+      " [2882 2932 4329 1808 4577 4350 4577  482 1636    2  389 1841 3204 3079\n",
+      "  1091  389 3204 2816 2079 4172 4986 4990   -1   -1   -1   -1   -1   -1\n",
+      "    -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1\n",
+      "    -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1\n",
+      "    -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1\n",
+      "    -1   -1   -1]\n",
+      " [4869 2598 2603 1976   96  389  478    3 4031  721 4925 2263 1259 2598\n",
+      "  4508  653 4979 4925 2741  252   72  236   -1   -1   -1   -1   -1   -1\n",
+      "    -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1\n",
+      "    -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1\n",
+      "    -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1\n",
+      "    -1   -1   -1]\n",
+      " [2458 4447 4505  713  624 3207  206 4577 4502 2404 3837 3458 2812 4936\n",
+      "    -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1\n",
+      "    -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1\n",
+      "    -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1\n",
+      "    -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1\n",
+      "    -1   -1   -1]\n",
+      " [1501 3897 2537  278 2601    2  404 2603  482 2235 3388   -1   -1   -1\n",
+      "    -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1\n",
+      "    -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1\n",
+      "    -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1\n",
+      "    -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1   -1\n",
+      "    -1   -1   -1]]\n",
+      "[73 38 33 23 38 27 22 22 14 11]\n",
+      "float32\n",
+      "int64\n",
+      "int64\n",
+      "int64\n"
      ]
     }
    ],
    "source": [
-    "!ls /workspace/espnet/"
+    "print(utts)\n",
+    "print(xs.shape)\n",
+    "print(ilen.shape)\n",
+    "print(ilen)\n",
+    "print(ys.shape)\n",
+    "print(ys)\n",
+    "print(olen)\n",
+    "print(xs.dtype)\n",
+    "print(ilen.dtype)\n",
+    "print(ys.dtype)\n",
+    "print(olen.dtype)"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "convinced-animation",
+   "id": "1d981df4",
    "metadata": {},
    "outputs": [],
    "source": []
@@ -1135,7 +1311,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": "Python 3 (ipykernel)",
    "language": "python",
    "name": "python3"
   },
diff --git a/requirements.txt b/requirements.txt
index baaa9ba9..692f3499 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -13,3 +13,4 @@ tensorboardX
 textgrid
 typeguard
 yacs
+kaldiio