diff --git a/.notebook/espnet_dataloader.ipynb b/.notebook/espnet_dataloader.ipynb index 5d182979..12870a8e 100644 --- a/.notebook/espnet_dataloader.ipynb +++ b/.notebook/espnet_dataloader.ipynb @@ -10,13 +10,13 @@ "name": "stdout", "output_type": "stream", "text": [ - "/workspace/DeepSpeech-2.x\n" + "/workspace/zhanghui/DeepSpeech-2.x\n" ] }, { "data": { "text/plain": [ - "'/workspace/DeepSpeech-2.x'" + "'/workspace/zhanghui/DeepSpeech-2.x'" ] }, "execution_count": 1, @@ -31,7 +31,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 8, "id": "correct-window", "metadata": {}, "outputs": [ @@ -45,22 +45,22 @@ } ], "source": [ - "!ls /workspace/DeepSpeech-2.x/examples/librispeech/s2/data/" + "!ls /workspace/zhanghui/DeepSpeech-2.x/examples/librispeech/s2/data/" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 9, "id": "exceptional-cheese", "metadata": {}, "outputs": [], "source": [ - "dev_data='/workspace/DeepSpeech-2.x/examples/librispeech/s2/data/manifest.dev'" + "dev_data='/workspace/zhanghui/DeepSpeech-2.x/examples/librispeech/s2/data/manifest.dev'" ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 11, "id": "extraordinary-orleans", "metadata": {}, "outputs": [ @@ -68,6 +68,7 @@ "name": "stderr", "output_type": "stream", "text": [ + "grep: warning: GREP_OPTIONS is deprecated; please use an alias or script\n", "register user softmax to paddle, remove this when fixed!\n", "register user log_softmax to paddle, remove this when fixed!\n", "register user sigmoid to paddle, remove this when fixed!\n", @@ -105,26 +106,17 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 12, "id": "returning-lighter", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/ipykernel/ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.\n", - " and should_run_async(code)\n" - ] - } - ], + "outputs": [], "source": [ "dev_json = read_manifest(dev_data)" ] }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 13, "id": "western-founder", "metadata": {}, "outputs": [ @@ -166,7 +158,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 14, "id": "motivated-receptor", "metadata": {}, "outputs": [], @@ -646,19 +638,10 @@ }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 15, "id": "acquired-hurricane", "metadata": {}, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[INFO 2021/08/17 04:09:47 :284] use shuffled batch.\n", - "[INFO 2021/08/17 04:09:47 :286] # utts: 5542\n", - "[INFO 2021/08/17 04:09:47 :467] # minibatches: 555\n" - ] - }, { "name": "stdout", "output_type": "stream", @@ -703,7 +686,7 @@ }, { "cell_type": "code", - "execution_count": 50, + "execution_count": 16, "id": "warming-malpractice", "metadata": {}, "outputs": [ @@ -713,16 +696,16 @@ "text": [ "Collecting kaldiio\n", " Downloading kaldiio-2.17.2.tar.gz (24 kB)\n", - "Requirement already satisfied: numpy in ./tools/venv/lib/python3.7/site-packages (from kaldiio) (1.20.1)\n", + "Requirement already satisfied: numpy in ./tools/venv/lib/python3.7/site-packages/numpy-1.21.2-py3.7-linux-x86_64.egg (from kaldiio) (1.21.2)\n", "Building wheels for collected packages: kaldiio\n", " Building wheel for kaldiio (setup.py) ... \u001b[?25ldone\n", - "\u001b[?25h Created wheel for kaldiio: filename=kaldiio-2.17.2-py3-none-any.whl size=24469 sha256=aadc8b1a8de5c9769af065ae724fb11326691d2350145019f6e3dba69f020134\n", + "\u001b[?25h Created wheel for kaldiio: filename=kaldiio-2.17.2-py3-none-any.whl size=24468 sha256=cd6e066764dcc8c24a9dfe3f7bd8acda18761a6fbcb024995729da8debdb466e\n", " Stored in directory: /root/.cache/pip/wheels/04/07/e8/45641287c59bf6ce41e22259f8680b521c31e6306cb88392ac\n", "Successfully built kaldiio\n", "Installing collected packages: kaldiio\n", "Successfully installed kaldiio-2.17.2\n", - "\u001b[33mWARNING: You are using pip version 20.0.1; however, version 21.2.4 is available.\n", - "You should consider upgrading via the '/workspace/DeepSpeech-2.x/tools/venv/bin/python -m pip install --upgrade pip' command.\u001b[0m\n" + "\u001b[33mWARNING: You are using pip version 20.3.3; however, version 21.2.4 is available.\n", + "You should consider upgrading via the '/workspace/zhanghui/DeepSpeech-2.x/tools/venv/bin/python -m pip install --upgrade pip' command.\u001b[0m\n" ] } ], @@ -740,7 +723,7 @@ }, { "cell_type": "code", - "execution_count": 54, + "execution_count": 19, "id": "superb-methodology", "metadata": {}, "outputs": [], @@ -1046,7 +1029,7 @@ }, { "cell_type": "code", - "execution_count": 55, + "execution_count": 20, "id": "monthly-muscle", "metadata": {}, "outputs": [], @@ -1064,70 +1047,263 @@ }, { "cell_type": "code", - "execution_count": 56, + "execution_count": 23, "id": "periodic-senegal", "metadata": {}, + "outputs": [], + "source": [ + "res = load(dev_data[0])" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "7f0307eb", + "metadata": {}, "outputs": [ { - "ename": "FileNotFoundError", - "evalue": "[Errno 2] No such file or directory: '/workspace/zhanghui/asr/espnet/egs/librispeech/asr1/dump/dev/deltafalse/feats.12.ark'", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mres\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mload\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdev_data\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[0;32m\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self, batch, return_uttid)\u001b[0m\n\u001b[1;32m 94\u001b[0m x = self._get_from_loader(\n\u001b[1;32m 95\u001b[0m \u001b[0mfilepath\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0minp\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"feat\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 96\u001b[0;31m filetype=inp.get(\"filetype\", \"mat\"))\n\u001b[0m\u001b[1;32m 97\u001b[0m \u001b[0mx_feats_dict\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msetdefault\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0minp\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"name\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 98\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m\u001b[0m in \u001b[0;36m_get_from_loader\u001b[0;34m(self, filepath, filetype)\u001b[0m\n\u001b[1;32m 278\u001b[0m \u001b[0;31m# load_mat can load both matrix and vector\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 279\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mkeep_all_data_on_mem\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 280\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mkaldiio\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mload_mat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfilepath\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 281\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mfilepath\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_loaders\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 282\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_loaders\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mfilepath\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mkaldiio\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mload_mat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfilepath\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/kaldiio/matio.py\u001b[0m in \u001b[0;36mload_mat\u001b[0;34m(ark_name, endian, fd_dict)\u001b[0m\n\u001b[1;32m 238\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0m_load_mat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfd\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0moffset\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mslices\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mendian\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mendian\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 239\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 240\u001b[0;31m \u001b[0;32mwith\u001b[0m \u001b[0mopen_like_kaldi\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mark\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"rb\"\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mfd\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 241\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0m_load_mat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfd\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0moffset\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mslices\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mendian\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mendian\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 242\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/kaldiio/utils.py\u001b[0m in \u001b[0;36mopen_like_kaldi\u001b[0;34m(name, mode)\u001b[0m\n\u001b[1;32m 206\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 207\u001b[0m \u001b[0mencoding\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;34m\"b\"\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mmode\u001b[0m \u001b[0;32melse\u001b[0m \u001b[0mdefault_encoding\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 208\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mio\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmode\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mencoding\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mencoding\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 209\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 210\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: '/workspace/zhanghui/asr/espnet/egs/librispeech/asr1/dump/dev/deltafalse/feats.12.ark'" + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "2\n", + "10\n", + "10\n", + "(1763, 83) float32\n", + "(73,) int64\n" ] } ], "source": [ - "res = load(dev_data[0])" + "print(type(res))\n", + "print(len(res))\n", + "print(len(res[0]))\n", + "print(len(res[1]))\n", + "print(res[0][0].shape, res[0][0].dtype)\n", + "print(res[1][0].shape, res[1][0].dtype)\n", + "# Tuple[Tuple[np.ndarry], Tuple[np.ndarry]]\n", + "# 2[10, 10]\n", + "# feats, labels" ] }, { "cell_type": "code", - "execution_count": 73, + "execution_count": 36, "id": "humanitarian-container", "metadata": {}, + "outputs": [], + "source": [ + "(inputs, outputs), utts = load(dev_data[0], return_uttid=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "id": "heard-prize", + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "ls: cannot access '/workspace/zhanghui/asr/espnet/egs/librispeech/asr1/dump/dev/deltafalse/feats.12.ark': No such file or directory\r\n" + "['1673-143396-0008', '1650-173552-0000', '2803-154320-0000', '6267-65525-0045', '7641-96684-0029', '5338-284437-0010', '8173-294714-0033', '5543-27761-0047', '8254-115543-0043', '6467-94831-0038'] 10\n", + "10\n" ] } ], "source": [ - "!ls /workspace/zhanghui/asr/espnet/egs/librispeech/asr1/dump/dev/deltafalse/feats.12.ark" + "print(utts, len(utts))\n", + "print(len(inputs))" ] }, { "cell_type": "code", - "execution_count": 77, - "id": "heard-prize", + "execution_count": 83, + "id": "convinced-animation", + "metadata": {}, + "outputs": [], + "source": [ + "import paddle\n", + "from deepspeech.io.utility import pad_list\n", + "class CustomConverter():\n", + " \"\"\"Custom batch converter.\n", + "\n", + " Args:\n", + " subsampling_factor (int): The subsampling factor.\n", + " dtype (paddle.dtype): Data type to convert.\n", + "\n", + " \"\"\"\n", + "\n", + " def __init__(self, subsampling_factor=1, dtype=np.float32):\n", + " \"\"\"Construct a CustomConverter object.\"\"\"\n", + " self.subsampling_factor = subsampling_factor\n", + " self.ignore_id = -1\n", + " self.dtype = dtype\n", + "\n", + " def __call__(self, batch):\n", + " \"\"\"Transform a batch and send it to a device.\n", + "\n", + " Args:\n", + " batch (list): The batch to transform.\n", + "\n", + " Returns:\n", + " tuple(paddle.Tensor, paddle.Tensor, paddle.Tensor)\n", + "\n", + " \"\"\"\n", + " # batch should be located in list\n", + " assert len(batch) == 1\n", + " (xs, ys), utts = batch[0]\n", + "\n", + " # perform subsampling\n", + " if self.subsampling_factor > 1:\n", + " xs = [x[::self.subsampling_factor, :] for x in xs]\n", + "\n", + " # get batch of lengths of input sequences\n", + " ilens = np.array([x.shape[0] for x in xs])\n", + "\n", + " # perform padding and convert to tensor\n", + " # currently only support real number\n", + " if xs[0].dtype.kind == \"c\":\n", + " xs_pad_real = pad_list([x.real for x in xs], 0).astype(self.dtype)\n", + " xs_pad_imag = pad_list([x.imag for x in xs], 0).astype(self.dtype)\n", + " # Note(kamo):\n", + " # {'real': ..., 'imag': ...} will be changed to ComplexTensor in E2E.\n", + " # Don't create ComplexTensor and give it E2E here\n", + " # because torch.nn.DataParellel can't handle it.\n", + " xs_pad = {\"real\": xs_pad_real, \"imag\": xs_pad_imag}\n", + " else:\n", + " xs_pad = pad_list(xs, 0).astype(self.dtype)\n", + "\n", + " # NOTE: this is for multi-output (e.g., speech translation)\n", + " ys_pad = pad_list(\n", + " [np.array(y[0][:]) if isinstance(y, tuple) else y for y in ys],\n", + " self.ignore_id)\n", + "\n", + " olens = np.array([y[0].shape[0] if isinstance(y, tuple) else y.shape[0] for y in ys])\n", + " return utts, xs_pad, ilens, ys_pad, olens" + ] + }, + { + "cell_type": "code", + "execution_count": 84, + "id": "1b6508fc", + "metadata": {}, + "outputs": [], + "source": [ + "convert = CustomConverter()" + ] + }, + { + "cell_type": "code", + "execution_count": 85, + "id": "25d655c0", + "metadata": {}, + "outputs": [], + "source": [ + "utts, xs, ilen, ys, olen = convert([load(dev_data[0], return_uttid=True)])" + ] + }, + { + "cell_type": "code", + "execution_count": 87, + "id": "a28e5141", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "ls: cannot access '/workspace/espnet/': No such file or directory\r\n" + "['1673-143396-0008', '1650-173552-0000', '2803-154320-0000', '6267-65525-0045', '7641-96684-0029', '5338-284437-0010', '8173-294714-0033', '5543-27761-0047', '8254-115543-0043', '6467-94831-0038']\n", + "(10, 1763, 83)\n", + "(10,)\n", + "[1763 1214 1146 757 751 661 625 512 426 329]\n", + "(10, 73)\n", + "[[2896 621 4502 2176 404 198 3538 391 278 407 389 3719 4577 846\n", + " 4501 482 1004 103 116 178 4222 624 4689 176 459 89 101 3465\n", + " 3204 4502 2029 1834 2298 829 3366 278 4705 4925 482 2920 3204 2481\n", + " 448 627 1254 404 20 202 36 2047 627 2495 4504 481 479 99\n", + " 18 2079 4502 1628 202 226 4512 3267 210 278 483 234 367 4502\n", + " 2438 3204 1141]\n", + " [ 742 4501 4768 4569 742 4483 2495 4502 3040 3204 4502 3961 3204 3992\n", + " 3089 4832 4258 621 2391 4642 3218 4502 3439 235 270 313 2385 2833\n", + " 742 4502 3282 332 3 280 4237 3252 830 2387 -1 -1 -1 -1\n", + " -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1\n", + " -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1\n", + " -1 -1 -1]\n", + " [2099 278 4904 2302 124 4832 3158 482 2888 2495 482 2450 627 1560\n", + " 3158 4729 482 3514 3204 1027 3233 2391 2862 399 389 4962 2495 121\n", + " 221 7 2340 1216 1658 -1 -1 -1 -1 -1 -1 -1 -1 -1\n", + " -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1\n", + " -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1\n", + " -1 -1 -1]\n", + " [2458 2659 1362 2 404 4975 4995 487 3079 2785 2371 3158 824 2603\n", + " 4832 2323 999 2603 4832 4156 4678 627 1784 -1 -1 -1 -1 -1\n", + " -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1\n", + " -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1\n", + " -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1\n", + " -1 -1 -1]\n", + " [2458 2340 1661 101 4723 2138 4502 4690 463 332 251 2345 4534 4502\n", + " 2396 444 4501 2287 389 4531 4894 1466 959 389 1658 2584 4502 3681\n", + " 279 3204 4502 2228 3204 4502 4690 463 332 251 -1 -1 -1 -1\n", + " -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1\n", + " -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1\n", + " -1 -1 -1]\n", + " [2368 1248 208 4832 3158 482 1473 3401 999 482 4159 3838 389 478\n", + " 4572 404 3158 3063 1481 113 4499 4501 3204 4643 2 389 4111 -1\n", + " -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1\n", + " -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1\n", + " -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1\n", + " -1 -1 -1]\n", + " [2882 2932 4329 1808 4577 4350 4577 482 1636 2 389 1841 3204 3079\n", + " 1091 389 3204 2816 2079 4172 4986 4990 -1 -1 -1 -1 -1 -1\n", + " -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1\n", + " -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1\n", + " -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1\n", + " -1 -1 -1]\n", + " [4869 2598 2603 1976 96 389 478 3 4031 721 4925 2263 1259 2598\n", + " 4508 653 4979 4925 2741 252 72 236 -1 -1 -1 -1 -1 -1\n", + " -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1\n", + " -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1\n", + " -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1\n", + " -1 -1 -1]\n", + " [2458 4447 4505 713 624 3207 206 4577 4502 2404 3837 3458 2812 4936\n", + " -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1\n", + " -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1\n", + " -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1\n", + " -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1\n", + " -1 -1 -1]\n", + " [1501 3897 2537 278 2601 2 404 2603 482 2235 3388 -1 -1 -1\n", + " -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1\n", + " -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1\n", + " -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1\n", + " -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1\n", + " -1 -1 -1]]\n", + "[73 38 33 23 38 27 22 22 14 11]\n", + "float32\n", + "int64\n", + "int64\n", + "int64\n" ] } ], "source": [ - "!ls /workspace/espnet/" + "print(utts)\n", + "print(xs.shape)\n", + "print(ilen.shape)\n", + "print(ilen)\n", + "print(ys.shape)\n", + "print(ys)\n", + "print(olen)\n", + "print(xs.dtype)\n", + "print(ilen.dtype)\n", + "print(ys.dtype)\n", + "print(olen.dtype)" ] }, { "cell_type": "code", "execution_count": null, - "id": "convinced-animation", + "id": "1d981df4", "metadata": {}, "outputs": [], "source": [] @@ -1135,7 +1311,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, diff --git a/requirements.txt b/requirements.txt index baaa9ba9..692f3499 100644 --- a/requirements.txt +++ b/requirements.txt @@ -13,3 +13,4 @@ tensorboardX textgrid typeguard yacs +kaldiio