diff --git a/.bashrc b/.bashrc new file mode 100755 index 00000000..15131969 --- /dev/null +++ b/.bashrc @@ -0,0 +1,10 @@ +# Locales + +export LC_ALL=en_US.UTF-8 +export LANG=en_US.UTF-8 +export LANGUAGE=en_US.UTF-8 + +# Aliases +alias nvs="nvidia-smi" +alias rsync="rsync --progress -raz" +alias his="history" diff --git a/.notebook/espnet_dataloader.ipynb b/.notebook/espnet_dataloader.ipynb new file mode 100644 index 00000000..5d182979 --- /dev/null +++ b/.notebook/espnet_dataloader.ipynb @@ -0,0 +1,1157 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "extensive-venice", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/workspace/DeepSpeech-2.x\n" + ] + }, + { + "data": { + "text/plain": [ + "'/workspace/DeepSpeech-2.x'" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%cd ..\n", + "%pwd" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "correct-window", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "manifest.dev\t manifest.test-clean\t manifest.train\r\n", + "manifest.dev.raw manifest.test-clean.raw manifest.train.raw\r\n" + ] + } + ], + "source": [ + "!ls /workspace/DeepSpeech-2.x/examples/librispeech/s2/data/" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "exceptional-cheese", + "metadata": {}, + "outputs": [], + "source": [ + "dev_data='/workspace/DeepSpeech-2.x/examples/librispeech/s2/data/manifest.dev'" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "extraordinary-orleans", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "register user softmax to paddle, remove this when fixed!\n", + "register user log_softmax to paddle, remove this when fixed!\n", + "register user sigmoid to paddle, remove this when fixed!\n", + "register user log_sigmoid to paddle, remove 
this when fixed!\n", + "register user relu to paddle, remove this when fixed!\n", + "override cat of paddle if exists or register, remove this when fixed!\n", + "override long of paddle.Tensor if exists or register, remove this when fixed!\n", + "override new_full of paddle.Tensor if exists or register, remove this when fixed!\n", + "override eq of paddle.Tensor if exists or register, remove this when fixed!\n", + "override eq of paddle if exists or register, remove this when fixed!\n", + "override contiguous of paddle.Tensor if exists or register, remove this when fixed!\n", + "override size of paddle.Tensor (`to_static` do not process `size` property, maybe some `paddle` api dependent on it), remove this when fixed!\n", + "register user view to paddle.Tensor, remove this when fixed!\n", + "register user view_as to paddle.Tensor, remove this when fixed!\n", + "register user masked_fill to paddle.Tensor, remove this when fixed!\n", + "register user masked_fill_ to paddle.Tensor, remove this when fixed!\n", + "register user fill_ to paddle.Tensor, remove this when fixed!\n", + "register user repeat to paddle.Tensor, remove this when fixed!\n", + "register user softmax to paddle.Tensor, remove this when fixed!\n", + "register user sigmoid to paddle.Tensor, remove this when fixed!\n", + "register user relu to paddle.Tensor, remove this when fixed!\n", + "register user type_as to paddle.Tensor, remove this when fixed!\n", + "register user to to paddle.Tensor, remove this when fixed!\n", + "register user float to paddle.Tensor, remove this when fixed!\n", + "register user int to paddle.Tensor, remove this when fixed!\n", + "register user GLU to paddle.nn, remove this when fixed!\n", + "register user ConstantPad2d to paddle.nn, remove this when fixed!\n", + "register user export to paddle.jit, remove this when fixed!\n" + ] + } + ], + "source": [ + "from deepspeech.frontend.utility import read_manifest" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": 
"returning-lighter", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/ipykernel/ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.\n", + " and should_run_async(code)\n" + ] + } + ], + "source": [ + "dev_json = read_manifest(dev_data)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "western-founder", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'input': [{'feat': '/workspace/zhanghui/asr/espnet/egs/librispeech/asr1/dump/dev/deltafalse/feats.1.ark:16',\n", + " 'name': 'input1',\n", + " 'shape': [1063, 83]}],\n", + " 'output': [{'name': 'target1',\n", + " 'shape': [41, 5002],\n", + " 'text': 'AS I APPROACHED THE CITY I HEARD BELLS RINGING AND A '\n", + " 'LITTLE LATER I FOUND THE STREETS ASTIR WITH THRONGS OF '\n", + " 'WELL DRESSED PEOPLE IN FAMILY GROUPS WENDING THEIR WAY '\n", + " 'HITHER AND THITHER',\n", + " 'token': '▁AS ▁I ▁APPROACHED ▁THE ▁CITY ▁I ▁HEARD ▁BELL S ▁RING '\n", + " 'ING ▁AND ▁A ▁LITTLE ▁LATER ▁I ▁FOUND ▁THE ▁STREETS ▁AS '\n", + " 'T IR ▁WITH ▁THRONG S ▁OF ▁WELL ▁DRESSED ▁PEOPLE ▁IN '\n", + " '▁FAMILY ▁GROUP S ▁WE ND ING ▁THEIR ▁WAY ▁HITHER ▁AND '\n", + " '▁THITHER',\n", + " 'tokenid': '713 2458 676 4502 1155 2458 2351 849 389 3831 206 627 '\n", + " '482 2812 2728 2458 2104 4502 4316 713 404 212 4925 '\n", + " '4549 389 3204 4861 1677 3339 2495 1950 2279 389 4845 '\n", + " '302 206 4504 4843 2394 627 4526'}],\n", + " 'utt': '116-288045-0000',\n", + " 'utt2spk': '116-288045'}\n", + "5542\n", + "\n" + ] + } + ], + "source": [ + "from pprint import pprint\n", + "pprint(dev_json[0])\n", + "print(len(dev_json))\n", + "print(type(dev_json))" + ] 
def batchfy_by_seq(
        sorted_data,
        batch_size,
        max_length_in,
        max_length_out,
        min_batch_size=1,
        shortest_first=False,
        ikey="input",
        iaxis=0,
        okey="output",
        oaxis=0, ):
    """Slice ``sorted_data`` into minibatches of (adaptively shrunk) size.

    The nominal width is ``batch_size``; whenever the first sample of a
    batch has an input longer than ``max_length_in`` (or an output longer
    than ``max_length_out``) the width is divided accordingly, never going
    below ``min_batch_size``.

    :param List[(str, Dict[str, Any])] sorted_data: utterances loaded from
        data.json, already sorted by length
    :param int batch_size: nominal number of sequences per minibatch
    :param int max_length_in: input-length budget used to shrink the batch
    :param int max_length_out: output-length budget used to shrink the batch
    :param int min_batch_size: minimum batch size (for multi-gpu)
    :param bool shortest_first: sort from batch with shortest samples
        to longest if true, otherwise reverse
    :param str ikey: key to access input
        (for ASR ikey="input", for TTS, MT ikey="output".)
    :param int iaxis: dimension to access input
    :param str okey: key to access output
        (for ASR, MT okey="output". for TTS okey="input".)
    :param int oaxis: dimension to access output; -1 means all axes
    :return: List[List[Tuple[str, dict]]] list of batches
    :raises ValueError: if ``batch_size`` is not positive or fewer than
        ``min_batch_size`` utterances are available
    """
    if batch_size <= 0:
        raise ValueError(f"Invalid batch_size={batch_size}")

    # refuse inputs that can never yield a full minimum-sized batch
    if len(sorted_data) < min_batch_size:
        raise ValueError(
            f"#utts({len(sorted_data)}) is less than min_batch_size({min_batch_size})."
        )

    total = len(sorted_data)
    batches = []
    cursor = 0
    while True:
        info = sorted_data[cursor][1]
        in_len = int(info[ikey][iaxis]["shape"][0])
        if oaxis >= 0:
            out_len = int(info[okey][oaxis]["shape"][0])
        else:
            # oaxis < 0: take the longest output over all output streams
            out_len = max(int(entry["shape"][0]) for entry in info[okey])

        # shrink the batch when either sequence exceeds its length budget;
        # the "1 +" keeps the divisor positive, max() keeps width >= min
        shrink = max(int(in_len / max_length_in), int(out_len / max_length_out))
        width = max(min_batch_size, int(batch_size / (1 + shrink)))

        stop = min(total, cursor + width)
        chunk = sorted_data[cursor:stop]
        if shortest_first:
            chunk.reverse()

        # pad an undersized trailing batch with random earlier utterances
        if len(chunk) < min_batch_size:
            need = min_batch_size - len(chunk) % min_batch_size
            fillers = [sorted_data[j] for j in np.random.randint(0, cursor, need)]
            if shortest_first:
                fillers.reverse()
            chunk.extend(fillers)
        batches.append(chunk)

        if stop == total:
            break
        cursor = stop

    # batches: List[List[Tuple[str, dict]]]
    return batches
def batchfy_by_bin(
        sorted_data,
        batch_bins,
        num_batches=0,
        min_batch_size=1,
        shortest_first=False,
        ikey="input",
        okey="output", ):
    """Make variably sized batch set, which maximizes

    the number of bins up to `batch_bins`.

    A "bin" is one frame x one feature dimension; each batch is grown until
    (max_output_bins + input_bins) * batch_size would exceed ``batch_bins``.

    :param List[(str, Dict[str, Any])] sorted_data: dictionary loaded from data.json
    :param int batch_bins: Maximum frames of a batch
    :param int num_batches: # number of batches to use (for debug)
    :param int min_batch_size: minimum batch size (for multi-gpu)
    :param bool shortest_first: Sort from batch with shortest samples
        to longest if true, otherwise reverse
    :param str ikey: key to access input (for ASR ikey="input", for TTS ikey="output".)
    :param str okey: key to access output (for ASR okey="output". for TTS okey="input".)
    :return: List[Tuple[str, Dict[str, List[Dict[str, Any]]]] list of batches
    :raises ValueError: if ``batch_bins`` is not positive or a single sample
        cannot fit in ``batch_bins``
    """
    if batch_bins <= 0:
        raise ValueError(f"invalid batch_bins={batch_bins}")
    length = len(sorted_data)
    # feature dimensions are read from the first utterance and assumed
    # constant across the data set -- TODO confirm with callers
    idim = int(sorted_data[0][1][ikey][0]["shape"][1])
    odim = int(sorted_data[0][1][okey][0]["shape"][1])
    logger.info("# utts: " + str(len(sorted_data)))
    minibatches = []
    start = 0
    n = 0  # batch counter; incremented below but otherwise unused
    while True:
        # Dynamic batch size depending on size of samples:
        # grow b while the projected bin count stays within batch_bins
        b = 0
        next_size = 0
        max_olen = 0
        while next_size < batch_bins and (start + b) < length:
            ilen = int(sorted_data[start + b][1][ikey][0]["shape"][0]) * idim
            olen = int(sorted_data[start + b][1][okey][0]["shape"][0]) * odim
            if olen > max_olen:
                max_olen = olen
            # projected cost if this sample were included
            next_size = (max_olen + ilen) * (b + 1)
            if next_size <= batch_bins:
                b += 1
            elif next_size == 0:
                raise ValueError(
                    f"Can't fit one sample in batch_bins ({batch_bins}): "
                    f"Please increase the value")
        end = min(length, start + max(min_batch_size, b))
        batch = sorted_data[start:end]
        if shortest_first:
            batch.reverse()
        minibatches.append(batch)
        # Check for min_batch_size and fixes the batches if needed:
        # walk backwards (negative indices) borrowing samples from the
        # preceding batch until every batch reaches min_batch_size; if the
        # very first batch is still short, it is merged into its successor.
        i = -1
        while len(minibatches[i]) < min_batch_size:
            missing = min_batch_size - len(minibatches[i])
            if -i == len(minibatches):
                minibatches[i + 1].extend(minibatches[i])
                minibatches = minibatches[1:]
                break
            else:
                minibatches[i].extend(minibatches[i - 1][:missing])
                minibatches[i - 1] = minibatches[i - 1][missing:]
                i -= 1
        if end == length:
            break
        start = end
        n += 1
    if num_batches > 0:
        minibatches = minibatches[:num_batches]
    lengths = [len(x) for x in minibatches]
    logger.info(
        str(len(minibatches)) + " batches containing from " + str(min(lengths))
        + " to " + str(max(lengths)) + " samples " + "(avg " + str(
            int(np.mean(lengths))) + " samples).")
    return minibatches
def batchfy_by_frame(
        sorted_data,
        max_frames_in,
        max_frames_out,
        max_frames_inout,
        num_batches=0,
        min_batch_size=1,
        shortest_first=False,
        ikey="input",
        okey="output", ):
    """Make variable batch set, which maximizes the number of frames to max_batch_frame.

    A batch is grown while all three (optional, 0 = disabled) frame budgets
    hold: input frames, output frames, and input+output frames.

    :param List[(str, Dict[str, Any])] sorted_data: dictionary loaded from data.json
    :param int max_frames_in: Maximum input frames of a batch
    :param int max_frames_out: Maximum output frames of a batch
    :param int max_frames_inout: Maximum input+output frames of a batch
    :param int num_batches: # number of batches to use (for debug)
    :param int min_batch_size: minimum batch size (for multi-gpu)
    :param bool shortest_first: Sort from batch with shortest samples
        to longest if true, otherwise reverse
    :param str ikey: key to access input (for ASR ikey="input", for TTS ikey="output".)
    :param str okey: key to access output (for ASR okey="output". for TTS okey="input".)
    :return: List[Tuple[str, Dict[str, List[Dict[str, Any]]]] list of batches
    :raises ValueError: if no budget is positive, or a single sample exceeds
        an enabled budget
    """
    if max_frames_in <= 0 and max_frames_out <= 0 and max_frames_inout <= 0:
        raise ValueError(
            "At least, one of `--batch-frames-in`, `--batch-frames-out` or "
            "`--batch-frames-inout` should be > 0")
    length = len(sorted_data)
    minibatches = []
    start = 0
    end = 0
    while end != length:
        # Dynamic batch size depending on size of samples
        b = 0
        max_olen = 0
        max_ilen = 0
        while (start + b) < length:
            ilen = int(sorted_data[start + b][1][ikey][0]["shape"][0])
            if ilen > max_frames_in and max_frames_in != 0:
                raise ValueError(
                    f"Can't fit one sample in --batch-frames-in ({max_frames_in}): "
                    f"Please increase the value")
            olen = int(sorted_data[start + b][1][okey][0]["shape"][0])
            if olen > max_frames_out and max_frames_out != 0:
                raise ValueError(
                    f"Can't fit one sample in --batch-frames-out ({max_frames_out}): "
                    f"Please increase the value")
            if ilen + olen > max_frames_inout and max_frames_inout != 0:
                # BUG FIX: this message previously named --batch-frames-out,
                # pointing the user at the wrong option
                raise ValueError(
                    f"Can't fit one sample in --batch-frames-inout ({max_frames_inout}): "
                    f"Please increase the value")
            max_olen = max(max_olen, olen)
            max_ilen = max(max_ilen, ilen)
            # a budget of 0 means "disabled"
            in_ok = max_ilen * (b + 1) <= max_frames_in or max_frames_in == 0
            out_ok = max_olen * (b + 1) <= max_frames_out or max_frames_out == 0
            inout_ok = (max_ilen + max_olen) * (
                b + 1) <= max_frames_inout or max_frames_inout == 0
            if in_ok and out_ok and inout_ok:
                # add more seq in the minibatch
                b += 1
            else:
                # no more seq in the minibatch
                break
        end = min(length, start + b)
        batch = sorted_data[start:end]
        if shortest_first:
            batch.reverse()
        minibatches.append(batch)
        # Check for min_batch_size and fixes the batches if needed:
        # borrow from preceding batches (negative indices), merging the
        # head batch into its successor as a last resort
        i = -1
        while len(minibatches[i]) < min_batch_size:
            missing = min_batch_size - len(minibatches[i])
            if -i == len(minibatches):
                minibatches[i + 1].extend(minibatches[i])
                minibatches = minibatches[1:]
                break
            else:
                minibatches[i].extend(minibatches[i - 1][:missing])
                minibatches[i - 1] = minibatches[i - 1][missing:]
                i -= 1
        start = end
    if num_batches > 0:
        minibatches = minibatches[:num_batches]
    lengths = [len(x) for x in minibatches]
    # CONSISTENCY FIX: added the space before "(avg" to match batchfy_by_bin
    logger.info(
        str(len(minibatches)) + " batches containing from " + str(min(lengths))
        + " to " + str(max(lengths)) + " samples " + "(avg " + str(
            int(np.mean(lengths))) + " samples).")

    return minibatches
batch.reverse()\n", + " minibatches.append(batch)\n", + " # Check for min_batch_size and fixes the batches if needed\n", + " i = -1\n", + " while len(minibatches[i]) < min_batch_size:\n", + " missing = min_batch_size - len(minibatches[i])\n", + " if -i == len(minibatches):\n", + " minibatches[i + 1].extend(minibatches[i])\n", + " minibatches = minibatches[1:]\n", + " break\n", + " else:\n", + " minibatches[i].extend(minibatches[i - 1][:missing])\n", + " minibatches[i - 1] = minibatches[i - 1][missing:]\n", + " i -= 1\n", + " start = end\n", + " if num_batches > 0:\n", + " minibatches = minibatches[:num_batches]\n", + " lengths = [len(x) for x in minibatches]\n", + " logger.info(\n", + " str(len(minibatches)) + \" batches containing from \" + str(min(lengths))\n", + " + \" to \" + str(max(lengths)) + \" samples\" + \"(avg \" + str(\n", + " int(np.mean(lengths))) + \" samples).\")\n", + "\n", + " return minibatches\n", + "\n", + "\n", + "def batchfy_shuffle(data, batch_size, min_batch_size, num_batches,\n", + " shortest_first):\n", + " import random\n", + "\n", + " logger.info(\"use shuffled batch.\")\n", + " sorted_data = random.sample(data.items(), len(data.items()))\n", + " logger.info(\"# utts: \" + str(len(sorted_data)))\n", + " # make list of minibatches\n", + " minibatches = []\n", + " start = 0\n", + " while True:\n", + " end = min(len(sorted_data), start + batch_size)\n", + " # check each batch is more than minimum batchsize\n", + " minibatch = sorted_data[start:end]\n", + " if shortest_first:\n", + " minibatch.reverse()\n", + " if len(minibatch) < min_batch_size:\n", + " mod = min_batch_size - len(minibatch) % min_batch_size\n", + " additional_minibatch = [\n", + " sorted_data[i] for i in np.random.randint(0, start, mod)\n", + " ]\n", + " if shortest_first:\n", + " additional_minibatch.reverse()\n", + " minibatch.extend(additional_minibatch)\n", + " minibatches.append(minibatch)\n", + " if end == len(sorted_data):\n", + " break\n", + " start = end\n", + 
BATCH_COUNT_CHOICES = ["auto", "seq", "bin", "frame"]
BATCH_SORT_KEY_CHOICES = ["input", "output", "shuffle"]


def make_batchset(
        data,
        batch_size=0,
        max_length_in=float("inf"),
        max_length_out=float("inf"),
        num_batches=0,
        min_batch_size=1,
        shortest_first=False,
        batch_sort_key="input",
        count="auto",
        batch_bins=0,
        batch_frames_in=0,
        batch_frames_out=0,
        batch_frames_inout=0,
        iaxis=0,
        oaxis=0, ):
    """Make batch set from json dictionary

    if utts have "category" value,

    >>> data = [{'utt': 'utt1', 'category': 'A', 'input': ...},
    ...         {'utt': 'utt2', 'category': 'B', 'input': ...},
    ...         {'utt': 'utt3', 'category': 'B', 'input': ...},
    ...         {'utt': 'utt4', 'category': 'A', 'input': ...}]
    >>> make_batchset(data, batch_size=2, ...)
    [[('utt1', ...), ('utt4', ...)], [('utt2', ...), ('utt3', ...)]]

    Note that if any utts doesn't have "category",
    perform as same as batchfy_by_{count}

    :param List[Dict[str, Any]] data: dictionary loaded from data.json
    :param int batch_size: maximum number of sequences in a minibatch.
    :param int batch_bins: maximum number of bins (frames x dim) in a minibatch.
    :param int batch_frames_in: maximum number of input frames in a minibatch.
    :param int batch_frames_out: maximum number of output frames in a minibatch.
    :param int batch_frames_inout: maximum number of input+output frames in a minibatch.
    :param str count: strategy to count maximum size of batch.
        For choices, see BATCH_COUNT_CHOICES ("auto" picks the first of
        seq/bin/frame whose controlling argument is non-zero)
    :param int max_length_in: maximum length of input to decide adaptive batch size
    :param int max_length_out: maximum length of output to decide adaptive batch size
    :param int num_batches: # number of batches to use (for debug)
    :param int min_batch_size: minimum batch size (for multi-gpu)
    :param bool shortest_first: Sort from batch with shortest samples
        to longest if true, otherwise reverse
    :param str batch_sort_key: how to sort data before creating minibatches
        ["input", "output", "shuffle"]
    :param int iaxis: dimension to access input
        (for ASR, TTS iaxis=0, for MT iaxis="1".)
    :param int oaxis: dimension to access output (for ASR, TTS, MT oaxis=0,
        reserved for future research, -1 means all axis.)
    :return: List[List[Tuple[str, dict]]] list of batches
    :raises ValueError: on an unknown `count`/`batch_sort_key`, or when
        count="auto" cannot be resolved, or shuffle is combined with a
        non-seq count
    """
    # check args
    if count not in BATCH_COUNT_CHOICES:
        raise ValueError(
            f"arg 'count' ({count}) should be one of {BATCH_COUNT_CHOICES}")
    if batch_sort_key not in BATCH_SORT_KEY_CHOICES:
        raise ValueError(f"arg 'batch_sort_key' ({batch_sort_key}) should be "
                         f"one of {BATCH_SORT_KEY_CHOICES}")

    ikey = "input"
    okey = "output"
    batch_sort_axis = 0  # index into the input/output stream list

    if count == "auto":
        # pick the first strategy whose controlling argument was set
        if batch_size != 0:
            count = "seq"
        elif batch_bins != 0:
            count = "bin"
        elif batch_frames_in != 0 or batch_frames_out != 0 or batch_frames_inout != 0:
            count = "frame"
        else:
            raise ValueError(
                f"cannot detect `count` manually set one of {BATCH_COUNT_CHOICES}"
            )
        logger.info(f"count is auto detected as {count}")

    if count != "seq" and batch_sort_key == "shuffle":
        raise ValueError(
            "batch_sort_key=shuffle is only available if batch_count=seq")

    # group utterances by their (optional) "category" value
    category2data = {}  # Dict[str, dict]
    for v in data:
        k = v['utt']
        category2data.setdefault(v.get("category"), {})[k] = v

    batches_list = []  # List[List[List[Tuple[str, dict]]]]
    for d in category2data.values():
        if batch_sort_key == "shuffle":
            batches = batchfy_shuffle(d, batch_size, min_batch_size,
                                      num_batches, shortest_first)
            batches_list.append(batches)
            continue

        # sort it by input lengths (long to short)
        # (lambda arg renamed from `data`, which shadowed the function parameter)
        sorted_data = sorted(
            d.items(),
            key=lambda item: int(item[1][batch_sort_key][batch_sort_axis]["shape"][0]),
            reverse=not shortest_first, )
        logger.info("# utts: " + str(len(sorted_data)))

        if count == "seq":
            batches = batchfy_by_seq(
                sorted_data,
                batch_size=batch_size,
                max_length_in=max_length_in,
                max_length_out=max_length_out,
                min_batch_size=min_batch_size,
                shortest_first=shortest_first,
                ikey=ikey,
                iaxis=iaxis,
                okey=okey,
                oaxis=oaxis, )
        elif count == "bin":
            batches = batchfy_by_bin(
                sorted_data,
                batch_bins=batch_bins,
                min_batch_size=min_batch_size,
                shortest_first=shortest_first,
                ikey=ikey,
                okey=okey, )
        elif count == "frame":
            batches = batchfy_by_frame(
                sorted_data,
                max_frames_in=batch_frames_in,
                max_frames_out=batch_frames_out,
                max_frames_inout=batch_frames_inout,
                min_batch_size=min_batch_size,
                shortest_first=shortest_first,
                ikey=ikey,
                okey=okey, )
        batches_list.append(batches)

    if len(batches_list) == 1:
        batches = batches_list[0]
    else:
        # Concat list. This way is faster than "sum(batch_list, [])"
        batches = list(itertools.chain(*batches_list))

    # for debugging
    if num_batches > 0:
        batches = batches[:num_batches]
    logger.info("# minibatches: " + str(len(batches)))

    # batch: List[List[Tuple[str, dict]]]
    return batches
"batch_frames_in=3000\n", + "batch_frames_out=0\n", + "batch_frames_inout=0\n", + " \n", + "dev_data = make_batchset(\n", + " dev_json,\n", + " batch_size,\n", + " maxlen_in,\n", + " maxlen_out,\n", + " minibatches, # for debug\n", + " min_batch_size=min_batch_size,\n", + " shortest_first=use_sortagrad,\n", + " batch_sort_key=\"shuffle\",\n", + " count=batch_count,\n", + " batch_bins=batch_bins,\n", + " batch_frames_in=batch_frames_in,\n", + " batch_frames_out=batch_frames_out,\n", + " batch_frames_inout=batch_frames_inout,\n", + " iaxis=0,\n", + " oaxis=0, )\n", + "print(len(dev_data))\n", + "# for i in range(len(dev_data)):\n", + "# print(len(dev_data[i]))\n" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "id": "warming-malpractice", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Collecting kaldiio\n", + " Downloading kaldiio-2.17.2.tar.gz (24 kB)\n", + "Requirement already satisfied: numpy in ./tools/venv/lib/python3.7/site-packages (from kaldiio) (1.20.1)\n", + "Building wheels for collected packages: kaldiio\n", + " Building wheel for kaldiio (setup.py) ... 
\u001b[?25ldone\n", + "\u001b[?25h Created wheel for kaldiio: filename=kaldiio-2.17.2-py3-none-any.whl size=24469 sha256=aadc8b1a8de5c9769af065ae724fb11326691d2350145019f6e3dba69f020134\n", + " Stored in directory: /root/.cache/pip/wheels/04/07/e8/45641287c59bf6ce41e22259f8680b521c31e6306cb88392ac\n", + "Successfully built kaldiio\n", + "Installing collected packages: kaldiio\n", + "Successfully installed kaldiio-2.17.2\n", + "\u001b[33mWARNING: You are using pip version 20.0.1; however, version 21.2.4 is available.\n", + "You should consider upgrading via the '/workspace/DeepSpeech-2.x/tools/venv/bin/python -m pip install --upgrade pip' command.\u001b[0m\n" + ] + } + ], + "source": [ + "!pip install kaldiio" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "equipped-subject", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 54, + "id": "superb-methodology", + "metadata": {}, + "outputs": [], + "source": [ + "from collections import OrderedDict\n", + "import kaldiio\n", + "\n", + "class LoadInputsAndTargets():\n", + " \"\"\"Create a mini-batch from a list of dicts\n", + "\n", + " >>> batch = [('utt1',\n", + " ... dict(input=[dict(feat='some.ark:123',\n", + " ... filetype='mat',\n", + " ... name='input1',\n", + " ... shape=[100, 80])],\n", + " ... output=[dict(tokenid='1 2 3 4',\n", + " ... name='target1',\n", + " ... 
shape=[4, 31])]]))\n", + " >>> l = LoadInputsAndTargets()\n", + " >>> feat, target = l(batch)\n", + "\n", + " :param: str mode: Specify the task mode, \"asr\" or \"tts\"\n", + " :param: str preprocess_conf: The path of a json file for pre-processing\n", + " :param: bool load_input: If False, not to load the input data\n", + " :param: bool load_output: If False, not to load the output data\n", + " :param: bool sort_in_input_length: Sort the mini-batch in descending order\n", + " of the input length\n", + " :param: bool use_speaker_embedding: Used for tts mode only\n", + " :param: bool use_second_target: Used for tts mode only\n", + " :param: dict preprocess_args: Set some optional arguments for preprocessing\n", + " :param: Optional[dict] preprocess_args: Used for tts mode only\n", + " \"\"\"\n", + "\n", + " def __init__(\n", + " self,\n", + " mode=\"asr\",\n", + " preprocess_conf=None,\n", + " load_input=True,\n", + " load_output=True,\n", + " sort_in_input_length=True,\n", + " preprocess_args=None,\n", + " keep_all_data_on_mem=False, ):\n", + " self._loaders = {}\n", + "\n", + " if mode not in [\"asr\"]:\n", + " raise ValueError(\"Only asr are allowed: mode={}\".format(mode))\n", + "\n", + " if preprocess_conf is not None:\n", + " self.preprocessing = AugmentationPipeline(preprocess_conf)\n", + " logging.warning(\n", + " \"[Experimental feature] Some preprocessing will be done \"\n", + " \"for the mini-batch creation using {}\".format(\n", + " self.preprocessing))\n", + " else:\n", + " # If conf doesn't exist, this function don't touch anything.\n", + " self.preprocessing = None\n", + "\n", + " self.mode = mode\n", + " self.load_output = load_output\n", + " self.load_input = load_input\n", + " self.sort_in_input_length = sort_in_input_length\n", + " if preprocess_args is None:\n", + " self.preprocess_args = {}\n", + " else:\n", + " assert isinstance(preprocess_args, dict), type(preprocess_args)\n", + " self.preprocess_args = dict(preprocess_args)\n", + "\n", + " 
self.keep_all_data_on_mem = keep_all_data_on_mem\n", + "\n", + " def __call__(self, batch, return_uttid=False):\n", + " \"\"\"Function to load inputs and targets from list of dicts\n", + "\n", + " :param List[Tuple[str, dict]] batch: list of dict which is subset of\n", + " loaded data.json\n", + " :param bool return_uttid: return utterance ID information for visualization\n", + " :return: list of input token id sequences [(L_1), (L_2), ..., (L_B)]\n", + " :return: list of input feature sequences\n", + " [(T_1, D), (T_2, D), ..., (T_B, D)]\n", + " :rtype: list of float ndarray\n", + " :return: list of target token id sequences [(L_1), (L_2), ..., (L_B)]\n", + " :rtype: list of int ndarray\n", + "\n", + " \"\"\"\n", + " x_feats_dict = OrderedDict() # OrderedDict[str, List[np.ndarray]]\n", + " y_feats_dict = OrderedDict() # OrderedDict[str, List[np.ndarray]]\n", + " uttid_list = [] # List[str]\n", + "\n", + " for uttid, info in batch:\n", + " uttid_list.append(uttid)\n", + "\n", + " if self.load_input:\n", + " # Note(kamo): This for-loop is for multiple inputs\n", + " for idx, inp in enumerate(info[\"input\"]):\n", + " # {\"input\":\n", + " # [{\"feat\": \"some/path.h5:F01_050C0101_PED_REAL\",\n", + " # \"filetype\": \"hdf5\",\n", + " # \"name\": \"input1\", ...}], ...}\n", + " x = self._get_from_loader(\n", + " filepath=inp[\"feat\"],\n", + " filetype=inp.get(\"filetype\", \"mat\"))\n", + " x_feats_dict.setdefault(inp[\"name\"], []).append(x)\n", + "\n", + " if self.load_output:\n", + " for idx, inp in enumerate(info[\"output\"]):\n", + " if \"tokenid\" in inp:\n", + " # ======= Legacy format for output =======\n", + " # {\"output\": [{\"tokenid\": \"1 2 3 4\"}])\n", + " x = np.fromiter(\n", + " map(int, inp[\"tokenid\"].split()), dtype=np.int64)\n", + " else:\n", + " # ======= New format =======\n", + " # {\"input\":\n", + " # [{\"feat\": \"some/path.h5:F01_050C0101_PED_REAL\",\n", + " # \"filetype\": \"hdf5\",\n", + " # \"name\": \"target1\", ...}], ...}\n", + " x 
= self._get_from_loader(\n", + " filepath=inp[\"feat\"],\n", + " filetype=inp.get(\"filetype\", \"mat\"))\n", + "\n", + " y_feats_dict.setdefault(inp[\"name\"], []).append(x)\n", + "\n", + " if self.mode == \"asr\":\n", + " return_batch, uttid_list = self._create_batch_asr(\n", + " x_feats_dict, y_feats_dict, uttid_list)\n", + " else:\n", + " raise NotImplementedError(self.mode)\n", + "\n", + " if self.preprocessing is not None:\n", + " # Apply pre-processing all input features\n", + " for x_name in return_batch.keys():\n", + " if x_name.startswith(\"input\"):\n", + " return_batch[x_name] = self.preprocessing(\n", + " return_batch[x_name], uttid_list,\n", + " **self.preprocess_args)\n", + "\n", + " if return_uttid:\n", + " return tuple(return_batch.values()), uttid_list\n", + "\n", + " # Doesn't return the names now.\n", + " return tuple(return_batch.values())\n", + "\n", + " def _create_batch_asr(self, x_feats_dict, y_feats_dict, uttid_list):\n", + " \"\"\"Create a OrderedDict for the mini-batch\n", + "\n", + " :param OrderedDict x_feats_dict:\n", + " e.g. {\"input1\": [ndarray, ndarray, ...],\n", + " \"input2\": [ndarray, ndarray, ...]}\n", + " :param OrderedDict y_feats_dict:\n", + " e.g. 
{\"target1\": [ndarray, ndarray, ...],\n", + " \"target2\": [ndarray, ndarray, ...]}\n", + " :param: List[str] uttid_list:\n", + " Give uttid_list to sort in the same order as the mini-batch\n", + " :return: batch, uttid_list\n", + " :rtype: Tuple[OrderedDict, List[str]]\n", + " \"\"\"\n", + " # handle single-input and multi-input (paralell) asr mode\n", + " xs = list(x_feats_dict.values())\n", + "\n", + " if self.load_output:\n", + " ys = list(y_feats_dict.values())\n", + " assert len(xs[0]) == len(ys[0]), (len(xs[0]), len(ys[0]))\n", + "\n", + " # get index of non-zero length samples\n", + " nonzero_idx = list(\n", + " filter(lambda i: len(ys[0][i]) > 0, range(len(ys[0]))))\n", + " for n in range(1, len(y_feats_dict)):\n", + " nonzero_idx = filter(lambda i: len(ys[n][i]) > 0, nonzero_idx)\n", + " else:\n", + " # Note(kamo): Be careful not to make nonzero_idx to a generator\n", + " nonzero_idx = list(range(len(xs[0])))\n", + "\n", + " if self.sort_in_input_length:\n", + " # sort in input lengths based on the first input\n", + " nonzero_sorted_idx = sorted(\n", + " nonzero_idx, key=lambda i: -len(xs[0][i]))\n", + " else:\n", + " nonzero_sorted_idx = nonzero_idx\n", + "\n", + " if len(nonzero_sorted_idx) != len(xs[0]):\n", + " logging.warning(\n", + " \"Target sequences include empty tokenid (batch {} -> {}).\".\n", + " format(len(xs[0]), len(nonzero_sorted_idx)))\n", + "\n", + " # remove zero-length samples\n", + " xs = [[x[i] for i in nonzero_sorted_idx] for x in xs]\n", + " uttid_list = [uttid_list[i] for i in nonzero_sorted_idx]\n", + "\n", + " x_names = list(x_feats_dict.keys())\n", + " if self.load_output:\n", + " ys = [[y[i] for i in nonzero_sorted_idx] for y in ys]\n", + " y_names = list(y_feats_dict.keys())\n", + "\n", + " # Keeping x_name and y_name, e.g. 
input1, for future extension\n", + " return_batch = OrderedDict([\n", + " * [(x_name, x) for x_name, x in zip(x_names, xs)],\n", + " * [(y_name, y) for y_name, y in zip(y_names, ys)],\n", + " ])\n", + " else:\n", + " return_batch = OrderedDict(\n", + " [(x_name, x) for x_name, x in zip(x_names, xs)])\n", + " return return_batch, uttid_list\n", + "\n", + " def _get_from_loader(self, filepath, filetype):\n", + " \"\"\"Return ndarray\n", + "\n", + " In order to make the fds to be opened only at the first referring,\n", + " the loader are stored in self._loaders\n", + "\n", + " >>> ndarray = loader.get_from_loader(\n", + " ... 'some/path.h5:F01_050C0101_PED_REAL', filetype='hdf5')\n", + "\n", + " :param: str filepath:\n", + " :param: str filetype:\n", + " :return:\n", + " :rtype: np.ndarray\n", + " \"\"\"\n", + " if filetype == \"hdf5\":\n", + " # e.g.\n", + " # {\"input\": [{\"feat\": \"some/path.h5:F01_050C0101_PED_REAL\",\n", + " # \"filetype\": \"hdf5\",\n", + " # -> filepath = \"some/path.h5\", key = \"F01_050C0101_PED_REAL\"\n", + " filepath, key = filepath.split(\":\", 1)\n", + "\n", + " loader = self._loaders.get(filepath)\n", + " if loader is None:\n", + " # To avoid disk access, create loader only for the first time\n", + " loader = h5py.File(filepath, \"r\")\n", + " self._loaders[filepath] = loader\n", + " return loader[key][()]\n", + " elif filetype == \"sound.hdf5\":\n", + " # e.g.\n", + " # {\"input\": [{\"feat\": \"some/path.h5:F01_050C0101_PED_REAL\",\n", + " # \"filetype\": \"sound.hdf5\",\n", + " # -> filepath = \"some/path.h5\", key = \"F01_050C0101_PED_REAL\"\n", + " filepath, key = filepath.split(\":\", 1)\n", + "\n", + " loader = self._loaders.get(filepath)\n", + " if loader is None:\n", + " # To avoid disk access, create loader only for the first time\n", + " loader = SoundHDF5File(filepath, \"r\", dtype=\"int16\")\n", + " self._loaders[filepath] = loader\n", + " array, rate = loader[key]\n", + " return array\n", + " elif filetype == 
\"sound\":\n", + " # e.g.\n", + " # {\"input\": [{\"feat\": \"some/path.wav\",\n", + " # \"filetype\": \"sound\"},\n", + " # Assume PCM16\n", + " if not self.keep_all_data_on_mem:\n", + " array, _ = soundfile.read(filepath, dtype=\"int16\")\n", + " return array\n", + " if filepath not in self._loaders:\n", + " array, _ = soundfile.read(filepath, dtype=\"int16\")\n", + " self._loaders[filepath] = array\n", + " return self._loaders[filepath]\n", + " elif filetype == \"npz\":\n", + " # e.g.\n", + " # {\"input\": [{\"feat\": \"some/path.npz:F01_050C0101_PED_REAL\",\n", + " # \"filetype\": \"npz\",\n", + " filepath, key = filepath.split(\":\", 1)\n", + "\n", + " loader = self._loaders.get(filepath)\n", + " if loader is None:\n", + " # To avoid disk access, create loader only for the first time\n", + " loader = np.load(filepath)\n", + " self._loaders[filepath] = loader\n", + " return loader[key]\n", + " elif filetype == \"npy\":\n", + " # e.g.\n", + " # {\"input\": [{\"feat\": \"some/path.npy\",\n", + " # \"filetype\": \"npy\"},\n", + " if not self.keep_all_data_on_mem:\n", + " return np.load(filepath)\n", + " if filepath not in self._loaders:\n", + " self._loaders[filepath] = np.load(filepath)\n", + " return self._loaders[filepath]\n", + " elif filetype in [\"mat\", \"vec\"]:\n", + " # e.g.\n", + " # {\"input\": [{\"feat\": \"some/path.ark:123\",\n", + " # \"filetype\": \"mat\"}]},\n", + " # In this case, \"123\" indicates the starting points of the matrix\n", + " # load_mat can load both matrix and vector\n", + " if not self.keep_all_data_on_mem:\n", + " return kaldiio.load_mat(filepath)\n", + " if filepath not in self._loaders:\n", + " self._loaders[filepath] = kaldiio.load_mat(filepath)\n", + " return self._loaders[filepath]\n", + " elif filetype == \"scp\":\n", + " # e.g.\n", + " # {\"input\": [{\"feat\": \"some/path.scp:F01_050C0101_PED_REAL\",\n", + " # \"filetype\": \"scp\",\n", + " filepath, key = filepath.split(\":\", 1)\n", + " loader = 
self._loaders.get(filepath)\n", + " if loader is None:\n", + " # To avoid disk access, create loader only for the first time\n", + " loader = kaldiio.load_scp(filepath)\n", + " self._loaders[filepath] = loader\n", + " return loader[key]\n", + " else:\n", + " raise NotImplementedError(\n", + " \"Not supported: loader_type={}\".format(filetype))\n" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "id": "monthly-muscle", + "metadata": {}, + "outputs": [], + "source": [ + "preprocess_conf=None\n", + "train_mode=True\n", + "load = LoadInputsAndTargets(\n", + " mode=\"asr\",\n", + " load_output=True,\n", + " preprocess_conf=preprocess_conf,\n", + " preprocess_args={\"train\":\n", + " train_mode}, # Switch the mode of preprocessing\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "id": "periodic-senegal", + "metadata": {}, + "outputs": [ + { + "ename": "FileNotFoundError", + "evalue": "[Errno 2] No such file or directory: '/workspace/zhanghui/asr/espnet/egs/librispeech/asr1/dump/dev/deltafalse/feats.12.ark'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mres\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mload\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdev_data\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;32m\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self, batch, return_uttid)\u001b[0m\n\u001b[1;32m 94\u001b[0m x = self._get_from_loader(\n\u001b[1;32m 95\u001b[0m 
\u001b[0mfilepath\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0minp\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"feat\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 96\u001b[0;31m filetype=inp.get(\"filetype\", \"mat\"))\n\u001b[0m\u001b[1;32m 97\u001b[0m \u001b[0mx_feats_dict\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msetdefault\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0minp\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"name\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 98\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m\u001b[0m in \u001b[0;36m_get_from_loader\u001b[0;34m(self, filepath, filetype)\u001b[0m\n\u001b[1;32m 278\u001b[0m \u001b[0;31m# load_mat can load both matrix and vector\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 279\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mkeep_all_data_on_mem\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 280\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mkaldiio\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mload_mat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfilepath\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 281\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mfilepath\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_loaders\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 282\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_loaders\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mfilepath\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mkaldiio\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mload_mat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfilepath\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/kaldiio/matio.py\u001b[0m in \u001b[0;36mload_mat\u001b[0;34m(ark_name, endian, fd_dict)\u001b[0m\n\u001b[1;32m 238\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0m_load_mat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfd\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0moffset\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mslices\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mendian\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mendian\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 239\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 240\u001b[0;31m \u001b[0;32mwith\u001b[0m \u001b[0mopen_like_kaldi\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mark\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"rb\"\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mfd\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 241\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0m_load_mat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfd\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0moffset\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mslices\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mendian\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mendian\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 242\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/kaldiio/utils.py\u001b[0m in \u001b[0;36mopen_like_kaldi\u001b[0;34m(name, mode)\u001b[0m\n\u001b[1;32m 206\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 207\u001b[0m \u001b[0mencoding\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;34m\"b\"\u001b[0m 
\u001b[0;32min\u001b[0m \u001b[0mmode\u001b[0m \u001b[0;32melse\u001b[0m \u001b[0mdefault_encoding\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 208\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mio\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmode\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mencoding\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mencoding\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 209\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 210\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: '/workspace/zhanghui/asr/espnet/egs/librispeech/asr1/dump/dev/deltafalse/feats.12.ark'" + ] + } + ], + "source": [ + "res = load(dev_data[0])" + ] + }, + { + "cell_type": "code", + "execution_count": 73, + "id": "humanitarian-container", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ls: cannot access '/workspace/zhanghui/asr/espnet/egs/librispeech/asr1/dump/dev/deltafalse/feats.12.ark': No such file or directory\r\n" + ] + } + ], + "source": [ + "!ls /workspace/zhanghui/asr/espnet/egs/librispeech/asr1/dump/dev/deltafalse/feats.12.ark" + ] + }, + { + "cell_type": "code", + "execution_count": 77, + "id": "heard-prize", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ls: cannot access '/workspace/espnet/': No such file or directory\r\n" + ] + } + ], + "source": [ + "!ls /workspace/espnet/" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "convinced-animation", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + 
"mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/deepspeech/io/batchfy.py b/deepspeech/io/batchfy.py index d237eb74..36c1ec31 100644 --- a/deepspeech/io/batchfy.py +++ b/deepspeech/io/batchfy.py @@ -347,7 +347,7 @@ def make_batchset( Note that if any utts doesn't have "category", perform as same as batchfy_by_{count} - :param Dict[str, Dict[str, Any]] data: dictionary loaded from data.json + :param List[Dict[str, Any]] data: dictionary loaded from data.json :param int batch_size: maximum number of sequences in a minibatch. :param int batch_bins: maximum number of bins (frames x dim) in a minibatch. :param int batch_frames_in: maximum number of input frames in a minibatch. @@ -374,7 +374,6 @@ def make_batchset( reserved for future research, -1 means all axis.) :return: List[List[Tuple[str, dict]]] list of batches """ - # check args if count not in BATCH_COUNT_CHOICES: raise ValueError( @@ -386,7 +385,6 @@ def make_batchset( ikey = "input" okey = "output" batch_sort_axis = 0 # index of list - if count == "auto": if batch_size != 0: count = "seq" @@ -405,7 +403,8 @@ def make_batchset( "batch_sort_key=shuffle is only available if batch_count=seq") category2data = {} # Dict[str, dict] - for k, v in data.items(): + for v in data: + k = v['utt'] category2data.setdefault(v.get("category"), {})[k] = v batches_list = [] # List[List[List[Tuple[str, dict]]]] @@ -422,6 +421,7 @@ def make_batchset( key=lambda data: int(data[1][batch_sort_key][batch_sort_axis]["shape"][0]), reverse=not shortest_first, ) logger.info("# utts: " + str(len(sorted_data))) + if count == "seq": batches = batchfy_by_seq( sorted_data, @@ -466,4 +466,4 @@ def make_batchset( logger.info("# minibatches: " + str(len(batches))) # batch: List[List[Tuple[str, dict]]] - return batches + return batches \ No newline at end of file diff --git 
a/deepspeech/io/dataset.py b/deepspeech/io/dataset.py index e2db9340..a30666b4 100644 --- a/deepspeech/io/dataset.py +++ b/deepspeech/io/dataset.py @@ -16,7 +16,7 @@ from typing import Optional from paddle.io import Dataset from yacs.config import CfgNode -from deepspeech.frontend.utility import read_manifest + from deepspeech.utils.log import Log __all__ = ["ManifestDataset", "TripletManifestDataset", "TransformDataset"]