diff --git a/.notebook/espnet_dataloader.ipynb b/.notebook/espnet_dataloader.ipynb new file mode 100644 index 000000000..1bfc13e3c --- /dev/null +++ b/.notebook/espnet_dataloader.ipynb @@ -0,0 +1,1541 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 147, + "id": "extensive-venice", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/\n" + ] + }, + { + "data": { + "text/plain": [ + "'/'" + ] + }, + "execution_count": 147, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%cd ..\n", + "%pwd" + ] + }, + { + "cell_type": "code", + "execution_count": 148, + "id": "correct-window", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "manifest.dev\t manifest.test-clean\t manifest.train\r\n", + "manifest.dev.raw manifest.test-clean.raw manifest.train.raw\r\n" + ] + } + ], + "source": [ + "!ls /workspace/zhanghui/DeepSpeech-2.x/examples/librispeech/s2/data/" + ] + }, + { + "cell_type": "code", + "execution_count": 149, + "id": "exceptional-cheese", + "metadata": {}, + "outputs": [], + "source": [ + "dev_data='/workspace/zhanghui/DeepSpeech-2.x/examples/librispeech/s2/data/manifest.dev'" + ] + }, + { + "cell_type": "code", + "execution_count": 150, + "id": "extraordinary-orleans", + "metadata": {}, + "outputs": [], + "source": [ + "from deepspeech.frontend.utility import read_manifest" + ] + }, + { + "cell_type": "code", + "execution_count": 151, + "id": "returning-lighter", + "metadata": {}, + "outputs": [], + "source": [ + "dev_json = read_manifest(dev_data)" + ] + }, + { + "cell_type": "code", + "execution_count": 152, + "id": "western-founder", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'input': [{'feat': '/workspace/zhanghui/asr/espnet/egs/librispeech/asr1/dump/dev/deltafalse/feats.1.ark:16',\n", + " 'name': 'input1',\n", + " 'shape': [1063, 83]}],\n", + " 'output': [{'name': 'target1',\n", + " 'shape': [41, 5002],\n", + " 'text': 'AS I APPROACHED THE CITY I HEARD BELLS RINGING AND A '\n", + " 'LITTLE LATER I FOUND THE STREETS ASTIR WITH THRONGS OF '\n", + " 'WELL DRESSED PEOPLE IN FAMILY GROUPS WENDING THEIR WAY '\n", + " 'HITHER AND THITHER',\n", + " 'token': '▁AS ▁I ▁APPROACHED ▁THE ▁CITY ▁I ▁HEARD ▁BELL S ▁RING '\n", + " 'ING ▁AND ▁A ▁LITTLE ▁LATER ▁I ▁FOUND ▁THE ▁STREETS ▁AS '\n", + " 'T IR ▁WITH ▁THRONG S ▁OF ▁WELL ▁DRESSED ▁PEOPLE ▁IN '\n", + " '▁FAMILY ▁GROUP S ▁WE ND ING ▁THEIR ▁WAY ▁HITHER ▁AND '\n", + " '▁THITHER',\n", + " 'tokenid': '713 2458 676 4502 1155 2458 2351 849 389 3831 206 627 '\n", + " '482 2812 2728 2458 2104 4502 4316 713 404 212 4925 '\n", + " '4549 389 3204 4861 1677 3339 2495 1950 2279 389 4845 '\n", + " '302 206 4504 4843 2394 627 4526'}],\n", + " 'utt': '116-288045-0000',\n", + " 'utt2spk': '116-288045'}\n", + "5542\n", + "\n" + ] + } + ], + "source": [ + "from pprint import pprint\n", + "pprint(dev_json[0])\n", + "print(len(dev_json))\n", + "print(type(dev_json))" + ] + }, + { + "cell_type": "code", + "execution_count": 97, + "id": "motivated-receptor", + "metadata": {}, + "outputs": [], + "source": [ + "# Copyright (c) 2021 PaddlePaddle Authors. 
All Rights Reserved.\n", + "#\n", + "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# http://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License.\n", + "import itertools\n", + "\n", + "import numpy as np\n", + "\n", + "from deepspeech.utils.log import Log\n", + "\n", + "__all__ = [\"make_batchset\"]\n", + "\n", + "logger = Log(__name__).getlog()\n", + "\n", + "\n", + "def batchfy_by_seq(\n", + " sorted_data,\n", + " batch_size,\n", + " max_length_in,\n", + " max_length_out,\n", + " min_batch_size=1,\n", + " shortest_first=False,\n", + " ikey=\"input\",\n", + " iaxis=0,\n", + " okey=\"output\",\n", + " oaxis=0, ):\n", + " \"\"\"Make batch set from json dictionary\n", + "\n", + " :param List[(str, Dict[str, Any])] sorted_data: dictionary loaded from data.json\n", + " :param int batch_size: batch size\n", + " :param int max_length_in: maximum length of input to decide adaptive batch size\n", + " :param int max_length_out: maximum length of output to decide adaptive batch size\n", + " :param int min_batch_size: mininum batch size (for multi-gpu)\n", + " :param bool shortest_first: Sort from batch with shortest samples\n", + " to longest if true, otherwise reverse\n", + " :param str ikey: key to access input\n", + " (for ASR ikey=\"input\", for TTS, MT ikey=\"output\".)\n", + " :param int iaxis: dimension to access input\n", + " (for ASR, TTS iaxis=0, for MT iaxis=\"1\".)\n", + " :param str okey: key to access output\n", + " (for ASR, MT okey=\"output\". for TTS okey=\"input\".)\n", + " :param int oaxis: dimension to access output\n", + " (for ASR, TTS, MT oaxis=0, reserved for future research, -1 means all axis.)\n", + " :return: List[List[Tuple[str, dict]]] list of batches\n", + " \"\"\"\n", + " if batch_size <= 0:\n", + " raise ValueError(f\"Invalid batch_size={batch_size}\")\n", + "\n", + " # check #utts is more than min_batch_size\n", + " if len(sorted_data) < min_batch_size:\n", + " raise ValueError(\n", + " f\"#utts({len(sorted_data)}) is less than min_batch_size({min_batch_size}).\"\n", + " )\n", + "\n", + " # make list of minibatches\n", + " minibatches = []\n", + " start = 0\n", + " while True:\n", + " _, info = sorted_data[start]\n", + " ilen = int(info[ikey][iaxis][\"shape\"][0])\n", + " olen = (int(info[okey][oaxis][\"shape\"][0]) if oaxis >= 0 else\n", + " max(map(lambda x: int(x[\"shape\"][0]), info[okey])))\n", + " factor = max(int(ilen / max_length_in), int(olen / max_length_out))\n", + " # change batchsize depending on the input and output length\n", + " # if ilen = 1000 and max_length_in = 800\n", + " # then b = batchsize / 2\n", + " # and max(min_batches, .) 
avoids batchsize = 0\n", + " bs = max(min_batch_size, int(batch_size / (1 + factor)))\n", + " end = min(len(sorted_data), start + bs)\n", + " minibatch = sorted_data[start:end]\n", + " if shortest_first:\n", + " minibatch.reverse()\n", + "\n", + " # check each batch is more than minimum batchsize\n", + " if len(minibatch) < min_batch_size:\n", + " mod = min_batch_size - len(minibatch) % min_batch_size\n", + " additional_minibatch = [\n", + " sorted_data[i] for i in np.random.randint(0, start, mod)\n", + " ]\n", + " if shortest_first:\n", + " additional_minibatch.reverse()\n", + " minibatch.extend(additional_minibatch)\n", + " minibatches.append(minibatch)\n", + "\n", + " if end == len(sorted_data):\n", + " break\n", + " start = end\n", + "\n", + " # batch: List[List[Tuple[str, dict]]]\n", + " return minibatches\n", + "\n", + "\n", + "def batchfy_by_bin(\n", + " sorted_data,\n", + " batch_bins,\n", + " num_batches=0,\n", + " min_batch_size=1,\n", + " shortest_first=False,\n", + " ikey=\"input\",\n", + " okey=\"output\", ):\n", + " \"\"\"Make variably sized batch set, which maximizes\n", + "\n", + " the number of bins up to `batch_bins`.\n", + "\n", + " :param List[(str, Dict[str, Any])] sorted_data: dictionary loaded from data.json\n", + " :param int batch_bins: Maximum frames of a batch\n", + " :param int num_batches: # number of batches to use (for debug)\n", + " :param int min_batch_size: minimum batch size (for multi-gpu)\n", + " :param int test: Return only every `test` batches\n", + " :param bool shortest_first: Sort from batch with shortest samples\n", + " to longest if true, otherwise reverse\n", + "\n", + " :param str ikey: key to access input (for ASR ikey=\"input\", for TTS ikey=\"output\".)\n", + " :param str okey: key to access output (for ASR okey=\"output\". 
for TTS okey=\"input\".)\n", + "\n", + " :return: List[Tuple[str, Dict[str, List[Dict[str, Any]]]] list of batches\n", + " \"\"\"\n", + " if batch_bins <= 0:\n", + " raise ValueError(f\"invalid batch_bins={batch_bins}\")\n", + " length = len(sorted_data)\n", + " idim = int(sorted_data[0][1][ikey][0][\"shape\"][1])\n", + " odim = int(sorted_data[0][1][okey][0][\"shape\"][1])\n", + " logger.info(\"# utts: \" + str(len(sorted_data)))\n", + " minibatches = []\n", + " start = 0\n", + " n = 0\n", + " while True:\n", + " # Dynamic batch size depending on size of samples\n", + " b = 0\n", + " next_size = 0\n", + " max_olen = 0\n", + " while next_size < batch_bins and (start + b) < length:\n", + " ilen = int(sorted_data[start + b][1][ikey][0][\"shape\"][0]) * idim\n", + " olen = int(sorted_data[start + b][1][okey][0][\"shape\"][0]) * odim\n", + " if olen > max_olen:\n", + " max_olen = olen\n", + " next_size = (max_olen + ilen) * (b + 1)\n", + " if next_size <= batch_bins:\n", + " b += 1\n", + " elif next_size == 0:\n", + " raise ValueError(\n", + " f\"Can't fit one sample in batch_bins ({batch_bins}): \"\n", + " f\"Please increase the value\")\n", + " end = min(length, start + max(min_batch_size, b))\n", + " batch = sorted_data[start:end]\n", + " if shortest_first:\n", + " batch.reverse()\n", + " minibatches.append(batch)\n", + " # Check for min_batch_size and fixes the batches if needed\n", + " i = -1\n", + " while len(minibatches[i]) < min_batch_size:\n", + " missing = min_batch_size - len(minibatches[i])\n", + " if -i == len(minibatches):\n", + " minibatches[i + 1].extend(minibatches[i])\n", + " minibatches = minibatches[1:]\n", + " break\n", + " else:\n", + " minibatches[i].extend(minibatches[i - 1][:missing])\n", + " minibatches[i - 1] = minibatches[i - 1][missing:]\n", + " i -= 1\n", + " if end == length:\n", + " break\n", + " start = end\n", + " n += 1\n", + " if num_batches > 0:\n", + " minibatches = minibatches[:num_batches]\n", + " lengths = [len(x) for x in minibatches]\n", + " logger.info(\n", + " str(len(minibatches)) + \" batches containing from \" + str(min(lengths))\n", + " + \" to \" + str(max(lengths)) + \" samples \" + \"(avg \" + str(\n", + " int(np.mean(lengths))) + \" samples).\")\n", + " return minibatches\n", + "\n", + "\n", + "def batchfy_by_frame(\n", + " sorted_data,\n", + " max_frames_in,\n", + " max_frames_out,\n", + " max_frames_inout,\n", + " num_batches=0,\n", + " min_batch_size=1,\n", + " shortest_first=False,\n", + " ikey=\"input\",\n", + " okey=\"output\", ):\n", + " \"\"\"Make variable batch set, which maximizes the number of frames to max_batch_frame.\n", + "\n", + " :param List[(str, Dict[str, Any])] sorteddata: dictionary loaded from data.json\n", + " :param int max_frames_in: Maximum input frames of a batch\n", + " :param int max_frames_out: Maximum output frames of a batch\n", + " :param int max_frames_inout: Maximum input+output frames of a batch\n", + " :param int num_batches: # number of batches to use (for debug)\n", + " :param int min_batch_size: minimum batch size (for multi-gpu)\n", + " :param int test: Return only every `test` batches\n", + " :param bool shortest_first: Sort from batch with shortest samples\n", + " to longest if true, otherwise reverse\n", + "\n", + " :param str ikey: key to access input (for ASR ikey=\"input\", for TTS ikey=\"output\".)\n", + " :param str okey: key to access output (for ASR okey=\"output\". 
for TTS okey=\"input\".)\n", + "\n", + " :return: List[Tuple[str, Dict[str, List[Dict[str, Any]]]] list of batches\n", + " \"\"\"\n", + " if max_frames_in <= 0 and max_frames_out <= 0 and max_frames_inout <= 0:\n", + " raise ValueError(\n", + " \"At least, one of `--batch-frames-in`, `--batch-frames-out` or \"\n", + " \"`--batch-frames-inout` should be > 0\")\n", + " length = len(sorted_data)\n", + " minibatches = []\n", + " start = 0\n", + " end = 0\n", + " while end != length:\n", + " # Dynamic batch size depending on size of samples\n", + " b = 0\n", + " max_olen = 0\n", + " max_ilen = 0\n", + " while (start + b) < length:\n", + " ilen = int(sorted_data[start + b][1][ikey][0][\"shape\"][0])\n", + " if ilen > max_frames_in and max_frames_in != 0:\n", + " raise ValueError(\n", + " f\"Can't fit one sample in --batch-frames-in ({max_frames_in}): \"\n", + " f\"Please increase the value\")\n", + " olen = int(sorted_data[start + b][1][okey][0][\"shape\"][0])\n", + " if olen > max_frames_out and max_frames_out != 0:\n", + " raise ValueError(\n", + " f\"Can't fit one sample in --batch-frames-out ({max_frames_out}): \"\n", + " f\"Please increase the value\")\n", + " if ilen + olen > max_frames_inout and max_frames_inout != 0:\n", + " raise ValueError(\n", + " f\"Can't fit one sample in --batch-frames-out ({max_frames_inout}): \"\n", + " f\"Please increase the value\")\n", + " max_olen = max(max_olen, olen)\n", + " max_ilen = max(max_ilen, ilen)\n", + " in_ok = max_ilen * (b + 1) <= max_frames_in or max_frames_in == 0\n", + " out_ok = max_olen * (b + 1) <= max_frames_out or max_frames_out == 0\n", + " inout_ok = (max_ilen + max_olen) * (\n", + " b + 1) <= max_frames_inout or max_frames_inout == 0\n", + " if in_ok and out_ok and inout_ok:\n", + " # add more seq in the minibatch\n", + " b += 1\n", + " else:\n", + " # no more seq in the minibatch\n", + " break\n", + " end = min(length, start + b)\n", + " batch = sorted_data[start:end]\n", + " if shortest_first:\n", + " batch.reverse()\n", + " minibatches.append(batch)\n", + " # Check for min_batch_size and fixes the batches if needed\n", + " i = -1\n", + " while len(minibatches[i]) < min_batch_size:\n", + " missing = min_batch_size - len(minibatches[i])\n", + " if -i == len(minibatches):\n", + " minibatches[i + 1].extend(minibatches[i])\n", + " minibatches = minibatches[1:]\n", + " break\n", + " else:\n", + " minibatches[i].extend(minibatches[i - 1][:missing])\n", + " minibatches[i - 1] = minibatches[i - 1][missing:]\n", + " i -= 1\n", + " start = end\n", + " if num_batches > 0:\n", + " minibatches = minibatches[:num_batches]\n", + " lengths = [len(x) for x in minibatches]\n", + " logger.info(\n", + " str(len(minibatches)) + \" batches containing from \" + str(min(lengths))\n", + " + \" to \" + str(max(lengths)) + \" samples\" + \"(avg \" + str(\n", + " int(np.mean(lengths))) + \" samples).\")\n", + "\n", + " return minibatches\n", + "\n", + "\n", + "def batchfy_shuffle(data, batch_size, min_batch_size, num_batches,\n", + " shortest_first):\n", + " import random\n", + "\n", + " logger.info(\"use shuffled batch.\")\n", + " sorted_data = random.sample(data.items(), len(data.items()))\n", + " logger.info(\"# utts: \" + str(len(sorted_data)))\n", + " # make list of minibatches\n", + " minibatches = []\n", + " start = 0\n", + " while True:\n", + " end = min(len(sorted_data), start + batch_size)\n", + " # check each batch is more than minimum batchsize\n", + " minibatch = sorted_data[start:end]\n", + " if shortest_first:\n", + " minibatch.reverse()\n", + " 
if len(minibatch) < min_batch_size:\n", + " mod = min_batch_size - len(minibatch) % min_batch_size\n", + " additional_minibatch = [\n", + " sorted_data[i] for i in np.random.randint(0, start, mod)\n", + " ]\n", + " if shortest_first:\n", + " additional_minibatch.reverse()\n", + " minibatch.extend(additional_minibatch)\n", + " minibatches.append(minibatch)\n", + " if end == len(sorted_data):\n", + " break\n", + " start = end\n", + "\n", + " # for debugging\n", + " if num_batches > 0:\n", + " minibatches = minibatches[:num_batches]\n", + " logger.info(\"# minibatches: \" + str(len(minibatches)))\n", + " return minibatches\n", + "\n", + "\n", + "BATCH_COUNT_CHOICES = [\"auto\", \"seq\", \"bin\", \"frame\"]\n", + "BATCH_SORT_KEY_CHOICES = [\"input\", \"output\", \"shuffle\"]\n", + "\n", + "\n", + "def make_batchset(\n", + " data,\n", + " batch_size=0,\n", + " max_length_in=float(\"inf\"),\n", + " max_length_out=float(\"inf\"),\n", + " num_batches=0,\n", + " min_batch_size=1,\n", + " shortest_first=False,\n", + " batch_sort_key=\"input\",\n", + " count=\"auto\",\n", + " batch_bins=0,\n", + " batch_frames_in=0,\n", + " batch_frames_out=0,\n", + " batch_frames_inout=0,\n", + " iaxis=0,\n", + " oaxis=0, ):\n", + " \"\"\"Make batch set from json dictionary\n", + "\n", + " if utts have \"category\" value,\n", + "\n", + " >>> data = {'utt1': {'category': 'A', 'input': ...},\n", + " ... 'utt2': {'category': 'B', 'input': ...},\n", + " ... 'utt3': {'category': 'B', 'input': ...},\n", + " ... 'utt4': {'category': 'A', 'input': ...}}\n", + " >>> make_batchset(data, batchsize=2, ...)\n", + " [[('utt1', ...), ('utt4', ...)], [('utt2', ...), ('utt3': ...)]]\n", + "\n", + " Note that if any utts doesn't have \"category\",\n", + " perform as same as batchfy_by_{count}\n", + "\n", + " :param List[Dict[str, Any]] data: dictionary loaded from data.json\n", + " :param int batch_size: maximum number of sequences in a minibatch.\n", + " :param int batch_bins: maximum number of bins (frames x dim) in a minibatch.\n", + " :param int batch_frames_in: maximum number of input frames in a minibatch.\n", + " :param int batch_frames_out: maximum number of output frames in a minibatch.\n", + " :param int batch_frames_out: maximum number of input+output frames in a minibatch.\n", + " :param str count: strategy to count maximum size of batch.\n", + " For choices, see espnet.asr.batchfy.BATCH_COUNT_CHOICES\n", + "\n", + " :param int max_length_in: maximum length of input to decide adaptive batch size\n", + " :param int max_length_out: maximum length of output to decide adaptive batch size\n", + " :param int num_batches: # number of batches to use (for debug)\n", + " :param int min_batch_size: minimum batch size (for multi-gpu)\n", + " :param bool shortest_first: Sort from batch with shortest samples\n", + " to longest if true, otherwise reverse\n", + " :param str batch_sort_key: how to sort data before creating minibatches\n", + " [\"input\", \"output\", \"shuffle\"]\n", + " :param bool swap_io: if True, use \"input\" as output and \"output\"\n", + " as input in `data` dict\n", + " :param bool mt: if True, use 0-axis of \"output\" as output and 1-axis of \"output\"\n", + " as input in `data` dict\n", + " :param int iaxis: dimension to access input\n", + " (for ASR, TTS iaxis=0, for MT iaxis=\"1\".)\n", + " :param int oaxis: dimension to access output (for ASR, TTS, MT oaxis=0,\n", + " reserved for future research, -1 means all axis.)\n", + " :return: List[List[Tuple[str, dict]]] list of batches\n", + " \"\"\"\n", + "\n", + " # 
check args\n", + " if count not in BATCH_COUNT_CHOICES:\n", + " raise ValueError(\n", + " f\"arg 'count' ({count}) should be one of {BATCH_COUNT_CHOICES}\")\n", + " if batch_sort_key not in BATCH_SORT_KEY_CHOICES:\n", + " raise ValueError(f\"arg 'batch_sort_key' ({batch_sort_key}) should be \"\n", + " f\"one of {BATCH_SORT_KEY_CHOICES}\")\n", + "\n", + " ikey = \"input\"\n", + " okey = \"output\"\n", + " batch_sort_axis = 0 # index of list \n", + "\n", + " if count == \"auto\":\n", + " if batch_size != 0:\n", + " count = \"seq\"\n", + " elif batch_bins != 0:\n", + " count = \"bin\"\n", + " elif batch_frames_in != 0 or batch_frames_out != 0 or batch_frames_inout != 0:\n", + " count = \"frame\"\n", + " else:\n", + " raise ValueError(\n", + " f\"cannot detect `count` manually set one of {BATCH_COUNT_CHOICES}\"\n", + " )\n", + " logger.info(f\"count is auto detected as {count}\")\n", + "\n", + " if count != \"seq\" and batch_sort_key == \"shuffle\":\n", + " raise ValueError(\n", + " \"batch_sort_key=shuffle is only available if batch_count=seq\")\n", + "\n", + " category2data = {} # Dict[str, dict]\n", + " for v in data:\n", + " k = v['utt']\n", + " category2data.setdefault(v.get(\"category\"), {})[k] = v\n", + "\n", + " batches_list = [] # List[List[List[Tuple[str, dict]]]]\n", + " for d in category2data.values():\n", + " if batch_sort_key == \"shuffle\":\n", + " batches = batchfy_shuffle(d, batch_size, min_batch_size,\n", + " num_batches, shortest_first)\n", + " batches_list.append(batches)\n", + " continue\n", + "\n", + " # sort it by input lengths (long to short)\n", + " sorted_data = sorted(\n", + " d.items(),\n", + " key=lambda data: int(data[1][batch_sort_key][batch_sort_axis][\"shape\"][0]),\n", + " reverse=not shortest_first, )\n", + " logger.info(\"# utts: \" + str(len(sorted_data)))\n", + " \n", + " if count == \"seq\":\n", + " batches = batchfy_by_seq(\n", + " sorted_data,\n", + " batch_size=batch_size,\n", + " max_length_in=max_length_in,\n", + " max_length_out=max_length_out,\n", + " min_batch_size=min_batch_size,\n", + " shortest_first=shortest_first,\n", + " ikey=ikey,\n", + " iaxis=iaxis,\n", + " okey=okey,\n", + " oaxis=oaxis, )\n", + " if count == \"bin\":\n", + " batches = batchfy_by_bin(\n", + " sorted_data,\n", + " batch_bins=batch_bins,\n", + " min_batch_size=min_batch_size,\n", + " shortest_first=shortest_first,\n", + " ikey=ikey,\n", + " okey=okey, )\n", + " if count == \"frame\":\n", + " batches = batchfy_by_frame(\n", + " sorted_data,\n", + " max_frames_in=batch_frames_in,\n", + " max_frames_out=batch_frames_out,\n", + " max_frames_inout=batch_frames_inout,\n", + " min_batch_size=min_batch_size,\n", + " shortest_first=shortest_first,\n", + " ikey=ikey,\n", + " okey=okey, )\n", + " batches_list.append(batches)\n", + "\n", + " if len(batches_list) == 1:\n", + " batches = batches_list[0]\n", + " else:\n", + " # Concat list. 
This way is faster than \"sum(batch_list, [])\"\n", + " batches = list(itertools.chain(*batches_list))\n", + "\n", + " # for debugging\n", + " if num_batches > 0:\n", + " batches = batches[:num_batches]\n", + " logger.info(\"# minibatches: \" + str(len(batches)))\n", + "\n", + " # batch: List[List[Tuple[str, dict]]]\n", + " return batches\n" + ] + }, + { + "cell_type": "code", + "execution_count": 98, + "id": "acquired-hurricane", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[INFO 2021/08/18 06:57:10 1445365138.py:284] use shuffled batch.\n", + "[INFO 2021/08/18 06:57:10 1445365138.py:286] # utts: 5542\n", + "[INFO 2021/08/18 06:57:10 1445365138.py:468] # minibatches: 555\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "555\n" + ] + } + ], + "source": [ + "batch_size=10\n", + "maxlen_in=300\n", + "maxlen_out=400\n", + "minibatches=0 # for debug\n", + "min_batch_size=2\n", + "use_sortagrad=True\n", + "batch_count='seq'\n", + "batch_bins=0\n", + "batch_frames_in=3000\n", + "batch_frames_out=0\n", + "batch_frames_inout=0\n", + " \n", + "dev_data = make_batchset(\n", + " dev_json,\n", + " batch_size,\n", + " maxlen_in,\n", + " maxlen_out,\n", + " minibatches, # for debug\n", + " min_batch_size=min_batch_size,\n", + " shortest_first=use_sortagrad,\n", + " batch_sort_key=\"shuffle\",\n", + " count=batch_count,\n", + " batch_bins=batch_bins,\n", + " batch_frames_in=batch_frames_in,\n", + " batch_frames_out=batch_frames_out,\n", + " batch_frames_inout=batch_frames_inout,\n", + " iaxis=0,\n", + " oaxis=0, )\n", + "print(len(dev_data))\n", + "# for i in range(len(dev_data)):\n", + "# print(len(dev_data[i]))\n" + ] + }, + { + "cell_type": "code", + "execution_count": 99, + "id": "warming-malpractice", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: kaldiio in ./DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages (2.17.2)\n", + "Requirement already satisfied: numpy in ./DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/numpy-1.21.2-py3.7-linux-x86_64.egg (from kaldiio) (1.21.2)\n", + "\u001b[33mWARNING: You are using pip version 20.3.3; however, version 21.2.4 is available.\n", + "You should consider upgrading via the '/workspace/zhanghui/DeepSpeech-2.x/tools/venv/bin/python -m pip install --upgrade pip' command.\u001b[0m\n" + ] + } + ], + "source": [ + "!pip install kaldiio" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "equipped-subject", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 100, + "id": "superb-methodology", + "metadata": {}, + "outputs": [], + "source": [ + "from collections import OrderedDict\n", + "import kaldiio\n", + "\n", + "class LoadInputsAndTargets():\n", + " \"\"\"Create a mini-batch from a list of dicts\n", + "\n", + " >>> batch = [('utt1',\n", + " ... dict(input=[dict(feat='some.ark:123',\n", + " ... filetype='mat',\n", + " ... name='input1',\n", + " ... shape=[100, 80])],\n", + " ... output=[dict(tokenid='1 2 3 4',\n", + " ... name='target1',\n", + " ... 
shape=[4, 31])]]))\n", + " >>> l = LoadInputsAndTargets()\n", + " >>> feat, target = l(batch)\n", + "\n", + " :param: str mode: Specify the task mode, \"asr\" or \"tts\"\n", + " :param: str preprocess_conf: The path of a json file for pre-processing\n", + " :param: bool load_input: If False, not to load the input data\n", + " :param: bool load_output: If False, not to load the output data\n", + " :param: bool sort_in_input_length: Sort the mini-batch in descending order\n", + " of the input length\n", + " :param: bool use_speaker_embedding: Used for tts mode only\n", + " :param: bool use_second_target: Used for tts mode only\n", + " :param: dict preprocess_args: Set some optional arguments for preprocessing\n", + " :param: Optional[dict] preprocess_args: Used for tts mode only\n", + " \"\"\"\n", + "\n", + " def __init__(\n", + " self,\n", + " mode=\"asr\",\n", + " preprocess_conf=None,\n", + " load_input=True,\n", + " load_output=True,\n", + " sort_in_input_length=True,\n", + " preprocess_args=None,\n", + " keep_all_data_on_mem=False, ):\n", + " self._loaders = {}\n", + "\n", + " if mode not in [\"asr\"]:\n", + " raise ValueError(\"Only asr are allowed: mode={}\".format(mode))\n", + "\n", + " if preprocess_conf is not None:\n", + " self.preprocessing = AugmentationPipeline(preprocess_conf)\n", + " logging.warning(\n", + " \"[Experimental feature] Some preprocessing will be done \"\n", + " \"for the mini-batch creation using {}\".format(\n", + " self.preprocessing))\n", + " else:\n", + " # If conf doesn't exist, this function don't touch anything.\n", + " self.preprocessing = None\n", + "\n", + " self.mode = mode\n", + " self.load_output = load_output\n", + " self.load_input = load_input\n", + " self.sort_in_input_length = sort_in_input_length\n", + " if preprocess_args is None:\n", + " self.preprocess_args = {}\n", + " else:\n", + " assert isinstance(preprocess_args, dict), type(preprocess_args)\n", + " self.preprocess_args = dict(preprocess_args)\n", + "\n", + " self.keep_all_data_on_mem = keep_all_data_on_mem\n", + "\n", + " def __call__(self, batch, return_uttid=False):\n", + " \"\"\"Function to load inputs and targets from list of dicts\n", + "\n", + " :param List[Tuple[str, dict]] batch: list of dict which is subset of\n", + " loaded data.json\n", + " :param bool return_uttid: return utterance ID information for visualization\n", + " :return: list of input token id sequences [(L_1), (L_2), ..., (L_B)]\n", + " :return: list of input feature sequences\n", + " [(T_1, D), (T_2, D), ..., (T_B, D)]\n", + " :rtype: list of float ndarray\n", + " :return: list of target token id sequences [(L_1), (L_2), ..., (L_B)]\n", + " :rtype: list of int ndarray\n", + "\n", + " \"\"\"\n", + " x_feats_dict = OrderedDict() # OrderedDict[str, List[np.ndarray]]\n", + " y_feats_dict = OrderedDict() # OrderedDict[str, List[np.ndarray]]\n", + " uttid_list = [] # List[str]\n", + "\n", + " for uttid, info in batch:\n", + " uttid_list.append(uttid)\n", + "\n", + " if self.load_input:\n", + " # Note(kamo): This for-loop is for multiple inputs\n", + " for idx, inp in enumerate(info[\"input\"]):\n", + " # {\"input\":\n", + " # [{\"feat\": \"some/path.h5:F01_050C0101_PED_REAL\",\n", + " # \"filetype\": \"hdf5\",\n", + " # \"name\": \"input1\", ...}], ...}\n", + " x = self._get_from_loader(\n", + " filepath=inp[\"feat\"],\n", + " filetype=inp.get(\"filetype\", \"mat\"))\n", + " x_feats_dict.setdefault(inp[\"name\"], []).append(x)\n", + "\n", + " if self.load_output:\n", + " for idx, inp in 
enumerate(info[\"output\"]):\n", + " if \"tokenid\" in inp:\n", + " # ======= Legacy format for output =======\n", + " # {\"output\": [{\"tokenid\": \"1 2 3 4\"}])\n", + " x = np.fromiter(\n", + " map(int, inp[\"tokenid\"].split()), dtype=np.int64)\n", + " else:\n", + " # ======= New format =======\n", + " # {\"input\":\n", + " # [{\"feat\": \"some/path.h5:F01_050C0101_PED_REAL\",\n", + " # \"filetype\": \"hdf5\",\n", + " # \"name\": \"target1\", ...}], ...}\n", + " x = self._get_from_loader(\n", + " filepath=inp[\"feat\"],\n", + " filetype=inp.get(\"filetype\", \"mat\"))\n", + "\n", + " y_feats_dict.setdefault(inp[\"name\"], []).append(x)\n", + "\n", + " if self.mode == \"asr\":\n", + " return_batch, uttid_list = self._create_batch_asr(\n", + " x_feats_dict, y_feats_dict, uttid_list)\n", + " else:\n", + " raise NotImplementedError(self.mode)\n", + "\n", + " if self.preprocessing is not None:\n", + " # Apply pre-processing all input features\n", + " for x_name in return_batch.keys():\n", + " if x_name.startswith(\"input\"):\n", + " return_batch[x_name] = self.preprocessing(\n", + " return_batch[x_name], uttid_list,\n", + " **self.preprocess_args)\n", + "\n", + " if return_uttid:\n", + " return tuple(return_batch.values()), uttid_list\n", + "\n", + " # Doesn't return the names now.\n", + " return tuple(return_batch.values())\n", + "\n", + " def _create_batch_asr(self, x_feats_dict, y_feats_dict, uttid_list):\n", + " \"\"\"Create a OrderedDict for the mini-batch\n", + "\n", + " :param OrderedDict x_feats_dict:\n", + " e.g. {\"input1\": [ndarray, ndarray, ...],\n", + " \"input2\": [ndarray, ndarray, ...]}\n", + " :param OrderedDict y_feats_dict:\n", + " e.g. {\"target1\": [ndarray, ndarray, ...],\n", + " \"target2\": [ndarray, ndarray, ...]}\n", + " :param: List[str] uttid_list:\n", + " Give uttid_list to sort in the same order as the mini-batch\n", + " :return: batch, uttid_list\n", + " :rtype: Tuple[OrderedDict, List[str]]\n", + " \"\"\"\n", + " # handle single-input and multi-input (paralell) asr mode\n", + " xs = list(x_feats_dict.values())\n", + "\n", + " if self.load_output:\n", + " ys = list(y_feats_dict.values())\n", + " assert len(xs[0]) == len(ys[0]), (len(xs[0]), len(ys[0]))\n", + "\n", + " # get index of non-zero length samples\n", + " nonzero_idx = list(\n", + " filter(lambda i: len(ys[0][i]) > 0, range(len(ys[0]))))\n", + " for n in range(1, len(y_feats_dict)):\n", + " nonzero_idx = filter(lambda i: len(ys[n][i]) > 0, nonzero_idx)\n", + " else:\n", + " # Note(kamo): Be careful not to make nonzero_idx to a generator\n", + " nonzero_idx = list(range(len(xs[0])))\n", + "\n", + " if self.sort_in_input_length:\n", + " # sort in input lengths based on the first input\n", + " nonzero_sorted_idx = sorted(\n", + " nonzero_idx, key=lambda i: -len(xs[0][i]))\n", + " else:\n", + " nonzero_sorted_idx = nonzero_idx\n", + "\n", + " if len(nonzero_sorted_idx) != len(xs[0]):\n", + " logging.warning(\n", + " \"Target sequences include empty tokenid (batch {} -> {}).\".\n", + " format(len(xs[0]), len(nonzero_sorted_idx)))\n", + "\n", + " # remove zero-length samples\n", + " xs = [[x[i] for i in nonzero_sorted_idx] for x in xs]\n", + " uttid_list = [uttid_list[i] for i in nonzero_sorted_idx]\n", + "\n", + " x_names = list(x_feats_dict.keys())\n", + " if self.load_output:\n", + " ys = [[y[i] for i in nonzero_sorted_idx] for y in ys]\n", + " y_names = list(y_feats_dict.keys())\n", + "\n", + " # Keeping x_name and y_name, e.g. 
input1, for future extension\n", + " return_batch = OrderedDict([\n", + " * [(x_name, x) for x_name, x in zip(x_names, xs)],\n", + " * [(y_name, y) for y_name, y in zip(y_names, ys)],\n", + " ])\n", + " else:\n", + " return_batch = OrderedDict(\n", + " [(x_name, x) for x_name, x in zip(x_names, xs)])\n", + " return return_batch, uttid_list\n", + "\n", + " def _get_from_loader(self, filepath, filetype):\n", + " \"\"\"Return ndarray\n", + "\n", + " In order to make the fds to be opened only at the first referring,\n", + " the loader are stored in self._loaders\n", + "\n", + " >>> ndarray = loader.get_from_loader(\n", + " ... 'some/path.h5:F01_050C0101_PED_REAL', filetype='hdf5')\n", + "\n", + " :param: str filepath:\n", + " :param: str filetype:\n", + " :return:\n", + " :rtype: np.ndarray\n", + " \"\"\"\n", + " if filetype == \"hdf5\":\n", + " # e.g.\n", + " # {\"input\": [{\"feat\": \"some/path.h5:F01_050C0101_PED_REAL\",\n", + " # \"filetype\": \"hdf5\",\n", + " # -> filepath = \"some/path.h5\", key = \"F01_050C0101_PED_REAL\"\n", + " filepath, key = filepath.split(\":\", 1)\n", + "\n", + " loader = self._loaders.get(filepath)\n", + " if loader is None:\n", + " # To avoid disk access, create loader only for the first time\n", + " loader = h5py.File(filepath, \"r\")\n", + " self._loaders[filepath] = loader\n", + " return loader[key][()]\n", + " elif filetype == \"sound.hdf5\":\n", + " # e.g.\n", + " # {\"input\": [{\"feat\": \"some/path.h5:F01_050C0101_PED_REAL\",\n", + " # \"filetype\": \"sound.hdf5\",\n", + " # -> filepath = \"some/path.h5\", key = \"F01_050C0101_PED_REAL\"\n", + " filepath, key = filepath.split(\":\", 1)\n", + "\n", + " loader = self._loaders.get(filepath)\n", + " if loader is None:\n", + " # To avoid disk access, create loader only for the first time\n", + " loader = SoundHDF5File(filepath, \"r\", dtype=\"int16\")\n", + " self._loaders[filepath] = loader\n", + " array, rate = loader[key]\n", + " return array\n", + " elif filetype == \"sound\":\n", + " # e.g.\n", + " # {\"input\": [{\"feat\": \"some/path.wav\",\n", + " # \"filetype\": \"sound\"},\n", + " # Assume PCM16\n", + " if not self.keep_all_data_on_mem:\n", + " array, _ = soundfile.read(filepath, dtype=\"int16\")\n", + " return array\n", + " if filepath not in self._loaders:\n", + " array, _ = soundfile.read(filepath, dtype=\"int16\")\n", + " self._loaders[filepath] = array\n", + " return self._loaders[filepath]\n", + " elif filetype == \"npz\":\n", + " # e.g.\n", + " # {\"input\": [{\"feat\": \"some/path.npz:F01_050C0101_PED_REAL\",\n", + " # \"filetype\": \"npz\",\n", + " filepath, key = filepath.split(\":\", 1)\n", + "\n", + " loader = self._loaders.get(filepath)\n", + " if loader is None:\n", + " # To avoid disk access, create loader only for the first time\n", + " loader = np.load(filepath)\n", + " self._loaders[filepath] = loader\n", + " return loader[key]\n", + " elif filetype == \"npy\":\n", + " # e.g.\n", + " # {\"input\": [{\"feat\": \"some/path.npy\",\n", + " # \"filetype\": \"npy\"},\n", + " if not self.keep_all_data_on_mem:\n", + " return np.load(filepath)\n", + " if filepath not in self._loaders:\n", + " self._loaders[filepath] = np.load(filepath)\n", + " return self._loaders[filepath]\n", + " elif filetype in [\"mat\", \"vec\"]:\n", + " # e.g.\n", + " # {\"input\": [{\"feat\": \"some/path.ark:123\",\n", + " # \"filetype\": \"mat\"}]},\n", + " # In this case, \"123\" indicates the starting points of the matrix\n", + " # load_mat can load both matrix and vector\n", + " if not 
self.keep_all_data_on_mem:\n", + " return kaldiio.load_mat(filepath)\n", + " if filepath not in self._loaders:\n", + " self._loaders[filepath] = kaldiio.load_mat(filepath)\n", + " return self._loaders[filepath]\n", + " elif filetype == \"scp\":\n", + " # e.g.\n", + " # {\"input\": [{\"feat\": \"some/path.scp:F01_050C0101_PED_REAL\",\n", + " # \"filetype\": \"scp\",\n", + " filepath, key = filepath.split(\":\", 1)\n", + " loader = self._loaders.get(filepath)\n", + " if loader is None:\n", + " # To avoid disk access, create loader only for the first time\n", + " loader = kaldiio.load_scp(filepath)\n", + " self._loaders[filepath] = loader\n", + " return loader[key]\n", + " else:\n", + " raise NotImplementedError(\n", + " \"Not supported: loader_type={}\".format(filetype))\n" + ] + }, + { + "cell_type": "code", + "execution_count": 101, + "id": "monthly-muscle", + "metadata": {}, + "outputs": [], + "source": [ + "preprocess_conf=None\n", + "train_mode=True\n", + "load = LoadInputsAndTargets(\n", + " mode=\"asr\",\n", + " load_output=True,\n", + " preprocess_conf=preprocess_conf,\n", + " preprocess_args={\"train\":\n", + " train_mode}, # Switch the mode of preprocessing\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 102, + "id": "periodic-senegal", + "metadata": {}, + "outputs": [], + "source": [ + "res = load(dev_data[0])" + ] + }, + { + "cell_type": "code", + "execution_count": 103, + "id": "502d3f4d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "2\n", + "10\n", + "10\n", + "(1174, 83) float32\n", + "(29,) int64\n" + ] + } + ], + "source": [ + "print(type(res))\n", + "print(len(res))\n", + "print(len(res[0]))\n", + "print(len(res[1]))\n", + "print(res[0][0].shape, res[0][0].dtype)\n", + "print(res[1][0].shape, res[1][0].dtype)\n", + "# Tuple[Tuple[np.ndarry], Tuple[np.ndarry]]\n", + "# 2[10, 10]\n", + "# feats, labels" + ] + }, + { + "cell_type": "code", + "execution_count": 104, + "id": "humanitarian-container", + "metadata": {}, + "outputs": [], + "source": [ + "(inputs, outputs), utts = load(dev_data[0], return_uttid=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 105, + "id": "heard-prize", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['4572-112383-0005', '6313-66125-0015', '251-137823-0022', '2277-149896-0030', '652-130726-0032', '5895-34615-0013', '1462-170138-0002', '777-126732-0008', '3660-172182-0021', '2277-149896-0027'] 10\n", + "10\n" + ] + } + ], + "source": [ + "print(utts, len(utts))\n", + "print(len(inputs))" + ] + }, + { + "cell_type": "code", + "execution_count": 106, + "id": "convinced-animation", + "metadata": {}, + "outputs": [], + "source": [ + "import paddle\n", + "from deepspeech.io.utility import pad_list\n", + "class CustomConverter():\n", + " \"\"\"Custom batch converter.\n", + "\n", + " Args:\n", + " subsampling_factor (int): The subsampling factor.\n", + " dtype (paddle.dtype): Data type to convert.\n", + "\n", + " \"\"\"\n", + "\n", + " def __init__(self, subsampling_factor=1, dtype=np.float32):\n", + " \"\"\"Construct a CustomConverter object.\"\"\"\n", + " self.subsampling_factor = subsampling_factor\n", + " self.ignore_id = -1\n", + " self.dtype = dtype\n", + "\n", + " def __call__(self, batch):\n", + " \"\"\"Transform a batch and send it to a device.\n", + "\n", + " Args:\n", + " batch (list): The batch to transform.\n", + "\n", + " Returns:\n", + " tuple(paddle.Tensor, paddle.Tensor, paddle.Tensor)\n", + 
"\n", + " \"\"\"\n", + " # batch should be located in list\n", + " assert len(batch) == 1\n", + " (xs, ys), utts = batch[0]\n", + "\n", + " # perform subsampling\n", + " if self.subsampling_factor > 1:\n", + " xs = [x[::self.subsampling_factor, :] for x in xs]\n", + "\n", + " # get batch of lengths of input sequences\n", + " ilens = np.array([x.shape[0] for x in xs])\n", + "\n", + " # perform padding and convert to tensor\n", + " # currently only support real number\n", + " if xs[0].dtype.kind == \"c\":\n", + " xs_pad_real = pad_list([x.real for x in xs], 0).astype(self.dtype)\n", + " xs_pad_imag = pad_list([x.imag for x in xs], 0).astype(self.dtype)\n", + " # Note(kamo):\n", + " # {'real': ..., 'imag': ...} will be changed to ComplexTensor in E2E.\n", + " # Don't create ComplexTensor and give it E2E here\n", + " # because torch.nn.DataParellel can't handle it.\n", + " xs_pad = {\"real\": xs_pad_real, \"imag\": xs_pad_imag}\n", + " else:\n", + " xs_pad = pad_list(xs, 0).astype(self.dtype)\n", + "\n", + " # NOTE: this is for multi-output (e.g., speech translation)\n", + " ys_pad = pad_list(\n", + " [np.array(y[0][:]) if isinstance(y, tuple) else y for y in ys],\n", + " self.ignore_id)\n", + "\n", + " olens = np.array([y[0].shape[0] if isinstance(y, tuple) else y.shape[0] for y in ys])\n", + " return utts, xs_pad, ilens, ys_pad, olens" + ] + }, + { + "cell_type": "code", + "execution_count": 107, + "id": "0b92ade5", + "metadata": {}, + "outputs": [], + "source": [ + "convert = CustomConverter()" + ] + }, + { + "cell_type": "code", + "execution_count": 108, + "id": "8dbd847c", + "metadata": {}, + "outputs": [], + "source": [ + "utts, xs, ilen, ys, olen = convert([load(dev_data[0], return_uttid=True)])" + ] + }, + { + "cell_type": "code", + "execution_count": 109, + "id": "31c085f4", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['4572-112383-0005', '6313-66125-0015', '251-137823-0022', '2277-149896-0030', '652-130726-0032', '5895-34615-0013', '1462-170138-0002', '777-126732-0008', '3660-172182-0021', '2277-149896-0027']\n", + "(10, 1174, 83)\n", + "(10,)\n", + "[1174 821 716 628 597 473 463 441 419 358]\n", + "(10, 32)\n", + "[[4502 2404 4223 3204 4502 587 1018 3861 2932 713 2458 2916 253 4508\n", + " 627 1395 713 4504 957 2761 209 2967 3173 3918 2598 4100 3 2816\n", + " 4990 -1 -1 -1]\n", + " [1005 451 210 278 3411 206 482 2307 573 4502 3848 4577 4273 2388\n", + " 4444 89 4919 278 1264 4501 2371 3 139 113 2603 4962 3158 3325\n", + " 4577 814 4587 1422]\n", + " [2345 4144 2291 200 713 2345 532 999 2458 3076 545 2458 4832 3038\n", + " 4499 482 2812 1260 3080 -1 -1 -1 -1 -1 -1 -1 -1 -1\n", + " -1 -1 -1 -1]\n", + " [2345 832 4577 4920 4501 2345 2298 1236 381 288 389 101 2495 4172\n", + " 4843 3233 3245 4501 2345 2298 3987 4502 3023 3353 2345 1361 1635 2603\n", + " 4723 2371 -1 -1]\n", + " [4502 4207 432 3204 4502 2396 125 935 433 2598 483 18 327 2\n", + " 389 627 4512 2340 713 482 1981 4525 4031 269 2030 1340 101 2495\n", + " 4013 4844 -1 -1]\n", + " [4502 4892 3204 1892 3780 389 482 2774 3013 89 192 2495 4502 3475\n", + " 389 66 370 343 404 -1 -1 -1 -1 -1 -1 -1 -1 -1\n", + " -1 -1 -1 -1]\n", + " [2458 2314 4577 2340 2863 1254 303 269 2 389 932 2079 4577 299\n", + " 195 3233 4508 2 89 814 3144 1091 3204 3250 2193 3414 -1 -1\n", + " -1 -1 -1 -1]\n", + " [2391 1785 443 78 39 4962 2340 829 599 4593 278 4681 202 407\n", + " 269 194 182 4577 482 4308 -1 -1 -1 -1 -1 -1 -1 -1\n", + " -1 -1 -1 -1]\n", + " [ 627 4873 2175 363 202 404 1018 4577 4502 
3412 4875 2286 107 122\n", + " 4832 2345 3896 89 2368 -1 -1 -1 -1 -1 -1 -1 -1 -1\n", + " -1 -1 -1 -1]\n", + " [ 481 174 474 599 1881 3252 2842 742 4502 2545 107 88 3204 4525\n", + " 4517 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1\n", + " -1 -1 -1 -1]]\n", + "[29 32 19 30 30 19 26 20 19 15]\n", + "float32\n", + "int64\n", + "int64\n", + "int64\n" + ] + } + ], + "source": [ + "print(utts)\n", + "print(xs.shape)\n", + "print(ilen.shape)\n", + "print(ilen)\n", + "print(ys.shape)\n", + "print(ys)\n", + "print(olen)\n", + "print(xs.dtype)\n", + "print(ilen.dtype)\n", + "print(ys.dtype)\n", + "print(olen.dtype)" + ] + }, + { + "cell_type": "code", + "execution_count": 110, + "id": "72e9ba60", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 230, + "id": "64593e5f", + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "from paddle.io import DataLoader\n", + "\n", + "from deepspeech.frontend.utility import read_manifest\n", + "from deepspeech.io.batchfy import make_batchset\n", + "from deepspeech.io.converter import CustomConverter\n", + "from deepspeech.io.dataset import TransformDataset\n", + "from deepspeech.io.reader import LoadInputsAndTargets\n", + "from deepspeech.utils.log import Log\n", + "\n", + "\n", + "logger = Log(__name__).getlog()\n", + "\n", + "\n", + "class BatchDataLoader():\n", + " def __init__(self,\n", + " json_file: str,\n", + " train_mode: bool,\n", + " sortagrad: bool=False,\n", + " batch_size: int=0,\n", + " maxlen_in: float=float('inf'),\n", + " maxlen_out: float=float('inf'),\n", + " minibatches: int=0,\n", + " mini_batch_size: int=1,\n", + " batch_count: str='auto',\n", + " batch_bins: int=0,\n", + " batch_frames_in: int=0,\n", + " batch_frames_out: int=0,\n", + " batch_frames_inout: int=0,\n", + " preprocess_conf=None,\n", + " n_iter_processes: int=1,\n", + " subsampling_factor: int=1,\n", + " num_encs: int=1):\n", + " self.json_file = json_file\n", + " self.train_mode = train_mode\n", + " self.use_sortagrad = sortagrad == -1 or sortagrad > 0\n", + " self.batch_size = batch_size\n", + " self.maxlen_in = maxlen_in\n", + " self.maxlen_out = maxlen_out\n", + " self.batch_count = batch_count\n", + " self.batch_bins = batch_bins\n", + " self.batch_frames_in = batch_frames_in\n", + " self.batch_frames_out = batch_frames_out\n", + " self.batch_frames_inout = batch_frames_inout\n", + " self.subsampling_factor = subsampling_factor\n", + " self.num_encs = num_encs\n", + " self.preprocess_conf = preprocess_conf\n", + " self.n_iter_processes = n_iter_processes\n", + "\n", + " \n", + " # read json data\n", + " self.data_json = read_manifest(json_file)\n", + "\n", + " # make minibatch list (variable length)\n", + " self.minibaches = make_batchset(\n", + " self.data_json,\n", + " batch_size,\n", + " maxlen_in,\n", + " maxlen_out,\n", + " minibatches, # for debug\n", + " min_batch_size=mini_batch_size,\n", + " shortest_first=self.use_sortagrad,\n", + " count=batch_count,\n", + " batch_bins=batch_bins,\n", + " batch_frames_in=batch_frames_in,\n", + " batch_frames_out=batch_frames_out,\n", + " batch_frames_inout=batch_frames_inout,\n", + " iaxis=0,\n", + " oaxis=0, )\n", + "\n", + " # data reader\n", + " self.reader = LoadInputsAndTargets(\n", + " mode=\"asr\",\n", + " load_output=True,\n", + " preprocess_conf=preprocess_conf,\n", + " preprocess_args={\"train\":\n", + " train_mode}, # Switch the mode of preprocessing\n", + " )\n", + "\n", + " # Setup a converter\n", + " if num_encs == 1:\n", + " self.converter = CustomConverter(\n", 
+ " subsampling_factor=subsampling_factor, dtype=np.float32)\n", + " else:\n", + " assert NotImplementedError(\"not impl CustomConverterMulEnc.\")\n", + "\n", + " # hack to make batchsize argument as 1\n", + " # actual bathsize is included in a list\n", + " # default collate function converts numpy array to pytorch tensor\n", + " # we used an empty collate function instead which returns list\n", + " self.dataset = TransformDataset(self.minibaches, \n", + " lambda data: self.converter([self.reader(data, return_uttid=True)]))\n", + " self.dataloader = DataLoader(\n", + " dataset=self.dataset,\n", + " batch_size=1,\n", + " shuffle=not use_sortagrad if train_mode else False,\n", + " collate_fn=lambda x: x[0],\n", + " num_workers=n_iter_processes, )\n", + "\n", + " def __repr__(self):\n", + " echo = f\"<{self.__class__.__module__}.{self.__class__.__name__} object at {hex(id(self))}> \"\n", + " echo += f\"train_mode: {self.train_mode}, \"\n", + " echo += f\"sortagrad: {self.use_sortagrad}, \"\n", + " echo += f\"batch_size: {self.batch_size}, \"\n", + " echo += f\"maxlen_in: {self.maxlen_in}, \"\n", + " echo += f\"maxlen_out: {self.maxlen_out}, \"\n", + " echo += f\"batch_count: {self.batch_count}, \"\n", + " echo += f\"batch_bins: {self.batch_bins}, \"\n", + " echo += f\"batch_frames_in: {self.batch_frames_in}, \"\n", + " echo += f\"batch_frames_out: {self.batch_frames_out}, \"\n", + " echo += f\"batch_frames_inout: {self.batch_frames_inout}, \"\n", + " echo += f\"subsampling_factor: {self.subsampling_factor}, \"\n", + " echo += f\"num_encs: {self.num_encs}, \"\n", + " echo += f\"num_workers: {self.n_iter_processes}, \"\n", + " echo += f\"file: {self.json_file}\"\n", + " return echo\n", + " \n", + " def __len__(self):\n", + " return len(self.dataloader)\n", + " \n", + " def __iter__(self):\n", + " return self.dataloader.__iter__()\n", + " \n", + " def __call__(self):\n", + " return self.__iter__()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 231, + "id": "fcea3fd0", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[INFO 2021/08/18 07:42:23 batchfy.py:399] count is auto detected as seq\n", + "[INFO 2021/08/18 07:42:23 batchfy.py:423] # utts: 5542\n", + "[INFO 2021/08/18 07:42:23 batchfy.py:466] # minibatches: 278\n" + ] + } + ], + "source": [ + "train = BatchDataLoader(dev_data, True, batch_size=20)" + ] + }, + { + "cell_type": "code", + "execution_count": 232, + "id": "e2a2c9a8", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "278\n", + "['__call__', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__iter__', '__le__', '__len__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', 'auto_collate_batch', 'batch_sampler', 'batch_size', 'collate_fn', 'dataset', 'dataset_kind', 'feed_list', 'from_dataset', 'from_generator', 'num_workers', 'pin_memory', 'places', 'return_list', 'timeout', 'use_buffer_reader', 'use_shared_memory', 'worker_init_fn']\n", + "<__main__.BatchDataLoader object at 0x7fdddba35470> train_mode: True, sortagrad: False, batch_size: 20, maxlen_in: inf, maxlen_out: inf, batch_count: auto, batch_bins: 0, batch_frames_in: 0, batch_frames_out: 0, batch_frames_inout: 0, subsampling_factor: 1, num_encs: 1, num_workers: 1, file: 
/workspace/zhanghui/DeepSpeech-2.x/examples/librispeech/s2/data/manifest.dev\n", + "278\n" + ] + } + ], + "source": [ + "print(len(train.dataloader))\n", + "print(dir(train.dataloader))\n", + "print(train)\n", + "print(len(train))" + ] + }, + { + "cell_type": "code", + "execution_count": 220, + "id": "a5ba7d6e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['7601-101619-0003', '1255-138279-0000', '1272-128104-0004', '6123-59150-0027', '2078-142845-0025', '7850-73752-0018', '4570-24733-0004', '2506-169427-0002', '7601-101619-0004', '3170-137482-0000', '6267-53049-0019', '4570-14911-0009', '174-168635-0018', '7601-291468-0004', '3576-138058-0022', '1919-142785-0007', '6467-62797-0007', '4153-61735-0005', '1686-142278-0003', '2506-169427-0000']\n", + "Tensor(shape=[20, 2961, 83], dtype=float32, place=CUDAPinnedPlace, stop_gradient=True,\n", + " [[[-1.99415934, -1.80315673, -1.88801885, ..., 0.86933994, -0.59853148, 0.02596200],\n", + " [-1.95346808, -1.84891188, -2.17492867, ..., 0.83640492, -0.59853148, -0.11333394],\n", + " [-2.27899861, -2.21495342, -2.58480024, ..., 0.91874266, -0.59853148, -0.31453922],\n", + " ...,\n", + " [-2.64522028, -2.35221887, -2.91269732, ..., 1.48994756, -0.16100442, 0.36646330],\n", + " [-2.40107250, -2.21495342, -2.37986445, ..., 1.44072104, -0.13220564, 0.12656468],\n", + " [-2.15692472, -1.89466715, -2.25690317, ..., 1.31273174, -0.09620714, -0.15202725]],\n", + "\n", + " [[-0.28859532, -0.29033494, -0.86576819, ..., 1.37753224, -0.30570769, 0.25806731],\n", + " [-0.20149794, -0.17814466, -0.59891301, ..., 1.35188794, -0.30570769, -0.02964944],\n", + " [-0.34947991, -0.33597648, -0.96877253, ..., 1.38394332, -0.30570769, -0.38376236],\n", + " ...,\n", + " [ 0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", + " [ 0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", + " [ 0. , 0. , 0. , ..., 0. , 0. , 0. ]],\n", + "\n", + " [[-0.44914246, -0.33902276, -0.78237975, ..., 1.38218808, 0.29214793, -0.16815147],\n", + " [-0.55490732, -0.41596055, -0.84425378, ..., 1.34530187, 0.25002354, -0.04004869],\n", + " [-0.83694696, -0.62112784, -1.07112527, ..., 1.19160914, 0.20789915, 0.37984371],\n", + " ...,\n", + " [ 0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", + " [ 0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", + " [ 0. , 0. , 0. , ..., 0. , 0. , 0. ]],\n", + "\n", + " ...,\n", + "\n", + " [[-1.24343657, -0.94188881, -1.41092563, ..., 0.96716309, 0.60345763, 0.15360183],\n", + " [-1.19466043, -0.80585432, -0.49723154, ..., 1.06735480, 0.60345763, 0.14511746],\n", + " [-0.94079566, -0.59330046, -0.40948665, ..., 0.82244170, 0.55614340, 0.28086722],\n", + " ...,\n", + " [ 0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", + " [ 0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", + " [ 0. , 0. , 0. , ..., 0. , 0. , 0. ]],\n", + "\n", + " [[ 0.21757117, 0.11361472, -0.33262897, ..., 0.76338506, -0.10711290, -0.57754958],\n", + " [-1.00205481, -0.61152041, -0.47124696, ..., 1.11897349, -0.10711290, 0.24931324],\n", + " [-1.03929281, -1.20336759, -1.16433656, ..., 0.88888687, -0.10711290, -0.04115745],\n", + " ...,\n", + " [ 0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", + " [ 0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", + " [ 0. , 0. , 0. , ..., 0. , 0. , 0. ]],\n", + "\n", + " [[-1.25289667, -1.05046368, -0.82881606, ..., 1.23991334, 0.61702502, 0.05275881],\n", + " [-1.19659519, -0.78677225, -0.80407262, ..., 1.27644968, 0.61702502, -0.35079369],\n", + " [-1.49687004, -1.01750231, -0.82881606, ..., 1.29106426, 0.65006059, 0.17958963],\n", + " ...,\n", + " [ 0. , 0. , 0. , ..., 0. , 0. 
, 0. ],\n", + " [ 0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", + " [ 0. , 0. , 0. , ..., 0. , 0. , 0. ]]])\n", + "Tensor(shape=[20], dtype=int64, place=CUDAPinnedPlace, stop_gradient=True,\n", + " [2961, 2948, 2938, 2907, 2904, 2838, 2832, 2819, 2815, 2797, 2775, 2710, 2709, 2696, 2688, 2661, 2616, 2595, 2589, 2576])\n", + "Tensor(shape=[20, 133], dtype=int64, place=CUDAPinnedPlace, stop_gradient=True,\n", + " [[3098, 1595, 389, ..., -1 , -1 , -1 ],\n", + " [2603, 4832, 482, ..., -1 , -1 , -1 ],\n", + " [2796, 303, 269, ..., -1 , -1 , -1 ],\n", + " ...,\n", + " [3218, 3673, 206, ..., -1 , -1 , -1 ],\n", + " [2371, 4832, 4031, ..., -1 , -1 , -1 ],\n", + " [2570, 2433, 4285, ..., -1 , -1 , -1 ]])\n", + "Tensor(shape=[20], dtype=int64, place=CUDAPinnedPlace, stop_gradient=True,\n", + " [80 , 83 , 102, 133, 82 , 102, 71 , 91 , 68 , 81 , 86 , 67 , 71 , 95 , 65 , 88 , 97 , 98 , 89 , 72 ])\n" + ] + } + ], + "source": [ + "for batch in train:\n", + " utts, xs, ilens, ys, olens = batch\n", + " print(utts)\n", + " print(xs)\n", + " print(ilens)\n", + " print(ys)\n", + " print(olens)\n", + " break" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3c974a1e", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/.notebook/u2_confermer_model_wenet.ipynb b/.notebook/u2_confermer_model_wenet.ipynb index 4f2c9632f..a425e16cb 100644 --- a/.notebook/u2_confermer_model_wenet.ipynb +++ b/.notebook/u2_confermer_model_wenet.ipynb @@ -3431,7 +3431,7 @@ " convolution_layer_args = (output_size, cnn_module_kernel, activation,\n", " cnn_module_norm, causal)\n", "\n", - " self.encoders = nn.ModuleList([\n", + " self.encoders = nn.LayerList([\n", " ConformerEncoderLayer(\n", " size=output_size,\n", " self_attn=encoder_selfattn_layer(*encoder_selfattn_layer_args),\n", diff --git a/deepspeech/__init__.py b/deepspeech/__init__.py index 37531657e..1316256e4 100644 --- a/deepspeech/__init__.py +++ b/deepspeech/__init__.py @@ -30,24 +30,13 @@ logger = Log(__name__).getlog() logger.warn = logger.warning ########### hcak paddle ############# -paddle.bool = 'bool' -paddle.float16 = 'float16' paddle.half = 'float16' -paddle.float32 = 'float32' paddle.float = 'float32' -paddle.float64 = 'float64' paddle.double = 'float64' -paddle.int8 = 'int8' -paddle.int16 = 'int16' paddle.short = 'int16' -paddle.int32 = 'int32' paddle.int = 'int32' -paddle.int64 = 'int64' paddle.long = 'int64' -paddle.uint8 = 'uint8' paddle.uint16 = 'uint16' -paddle.complex64 = 'complex64' -paddle.complex128 = 'complex128' paddle.cdouble = 'complex128' @@ -403,45 +392,7 @@ if not hasattr(paddle.nn.functional, 'glu'): # return x * 0.5 * (1.0 + paddle.erf(x / math.sqrt(2.0))) -# hack loss -def ctc_loss(logits, - labels, - input_lengths, - label_lengths, - blank=0, - reduction='mean', - norm_by_times=True): - #logger.info("my ctc loss with norm by times") - ## https://github.com/PaddlePaddle/Paddle/blob/f5ca2db2cc/paddle/fluid/operators/warpctc_op.h#L403 - loss_out = paddle.fluid.layers.warpctc(logits, labels, blank, norm_by_times, - input_lengths, label_lengths) - - loss_out = 
paddle.fluid.layers.squeeze(loss_out, [-1]) - assert reduction in ['mean', 'sum', 'none'] - if reduction == 'mean': - loss_out = paddle.mean(loss_out / label_lengths) - elif reduction == 'sum': - loss_out = paddle.sum(loss_out) - return loss_out - - -logger.warn( - "override ctc_loss of paddle.nn.functional if exists, remove this when fixed!" -) -F.ctc_loss = ctc_loss - ########### hcak paddle.nn ############# -if not hasattr(paddle.nn, 'Module'): - logger.warn("register user Module to paddle.nn, remove this when fixed!") - setattr(paddle.nn, 'Module', paddle.nn.Layer) - -# maybe cause assert isinstance(sublayer, core.Layer) -if not hasattr(paddle.nn, 'ModuleList'): - logger.warn( - "register user ModuleList to paddle.nn, remove this when fixed!") - setattr(paddle.nn, 'ModuleList', paddle.nn.LayerList) - - class GLU(nn.Layer): """Gated Linear Units (GLU) Layer""" diff --git a/deepspeech/exps/u2/model.py b/deepspeech/exps/u2/model.py index d661f078d..0662e38d9 100644 --- a/deepspeech/exps/u2/model.py +++ b/deepspeech/exps/u2/model.py @@ -264,12 +264,12 @@ class U2Trainer(Trainer): config.data.manifest = config.data.test_manifest # filter test examples, will cause less examples, but no mismatch with training # and can use large batch size , save training time, so filter test egs now. - # config.data.min_input_len = 0.0 # second - # config.data.max_input_len = float('inf') # second - # config.data.min_output_len = 0.0 # tokens - # config.data.max_output_len = float('inf') # tokens - # config.data.min_output_input_ratio = 0.00 - # config.data.max_output_input_ratio = float('inf') + config.data.min_input_len = 0.0 # second + config.data.max_input_len = float('inf') # second + config.data.min_output_len = 0.0 # tokens + config.data.max_output_len = float('inf') # tokens + config.data.min_output_input_ratio = 0.00 + config.data.max_output_input_ratio = float('inf') test_dataset = ManifestDataset.from_config(config) # return text ord id diff --git a/deepspeech/frontend/augmentor/augmentation.py b/deepspeech/frontend/augmentor/augmentation.py index cc0564daf..7b43988e4 100644 --- a/deepspeech/frontend/augmentor/augmentation.py +++ b/deepspeech/frontend/augmentor/augmentation.py @@ -13,18 +13,28 @@ # limitations under the License. 
"""Contains the data augmentation pipeline.""" import json +from collections.abc import Sequence +from inspect import signature import numpy as np -from deepspeech.frontend.augmentor.impulse_response import ImpulseResponseAugmentor -from deepspeech.frontend.augmentor.noise_perturb import NoisePerturbAugmentor -from deepspeech.frontend.augmentor.online_bayesian_normalization import \ - OnlineBayesianNormalizationAugmentor -from deepspeech.frontend.augmentor.resample import ResampleAugmentor -from deepspeech.frontend.augmentor.shift_perturb import ShiftPerturbAugmentor -from deepspeech.frontend.augmentor.spec_augment import SpecAugmentor -from deepspeech.frontend.augmentor.speed_perturb import SpeedPerturbAugmentor -from deepspeech.frontend.augmentor.volume_perturb import VolumePerturbAugmentor +from deepspeech.frontend.augmentor.base import AugmentorBase +from deepspeech.utils.dynamic_import import dynamic_import +from deepspeech.utils.log import Log + +__all__ = ["AugmentationPipeline"] + +logger = Log(__name__).getlog() + +import_alias = dict( + volume="deepspeech.frontend.augmentor.impulse_response:VolumePerturbAugmentor", + shift="deepspeech.frontend.augmentor.shift_perturb:ShiftPerturbAugmentor", + speed="deepspeech.frontend.augmentor.speed_perturb:SpeedPerturbAugmentor", + resample="deepspeech.frontend.augmentor.resample:ResampleAugmentor", + bayesian_normal="deepspeech.frontend.augmentor.online_bayesian_normalization:OnlineBayesianNormalizationAugmentor", + noise="deepspeech.frontend.augmentor.noise_perturb:NoisePerturbAugmentor", + impulse="deepspeech.frontend.augmentor.impulse_response:ImpulseResponseAugmentor", + specaug="deepspeech.frontend.augmentor.spec_augment:SpecAugmentor", ) class AugmentationPipeline(): @@ -78,20 +88,74 @@ class AugmentationPipeline(): augmentor to take effect. If "prob" is zero, the augmentor does not take effect. - :param augmentation_config: Augmentation configuration in json string. - :type augmentation_config: str - :param random_seed: Random seed. - :type random_seed: int - :raises ValueError: If the augmentation json config is in incorrect format". + Params: + augmentation_config(str): Augmentation configuration in json string. + random_seed(int): Random seed. + train(bool): whether is train mode. + + Raises: + ValueError: If the augmentation json config is in incorrect format". """ - def __init__(self, augmentation_config: str, random_seed=0): + def __init__(self, augmentation_config: str, random_seed: int=0): self._rng = np.random.RandomState(random_seed) self._spec_types = ('specaug') - self._augmentors, self._rates = self._parse_pipeline_from( - augmentation_config, 'audio') + + if augmentation_config is None: + self.conf = {} + else: + self.conf = json.loads(augmentation_config) + + self._augmentors, self._rates = self._parse_pipeline_from('all') + self._audio_augmentors, self._audio_rates = self._parse_pipeline_from( + 'audio') self._spec_augmentors, self._spec_rates = self._parse_pipeline_from( - augmentation_config, 'feature') + 'feature') + + def __call__(self, xs, uttid_list=None, **kwargs): + if not isinstance(xs, Sequence): + is_batch = False + xs = [xs] + else: + is_batch = True + + if isinstance(uttid_list, str): + uttid_list = [uttid_list for _ in range(len(xs))] + + if self.conf.get("mode", "sequential") == "sequential": + for idx, (func, rate) in enumerate( + zip(self._augmentors, self._rates), 0): + if self._rng.uniform(0., 1.) 
>= rate: + continue + + # Derive only the args which the func has + try: + param = signature(func).parameters + except ValueError: + # Some function, e.g. built-in function, are failed + param = {} + _kwargs = {k: v for k, v in kwargs.items() if k in param} + + try: + if uttid_list is not None and "uttid" in param: + xs = [ + func(x, u, **_kwargs) + for x, u in zip(xs, uttid_list) + ] + else: + xs = [func(x, **_kwargs) for x in xs] + except Exception: + logger.fatal("Catch a exception from {}th func: {}".format( + idx, func)) + raise + else: + raise NotImplementedError( + "Not supporting mode={}".format(self.conf["mode"])) + + if is_batch: + return xs + else: + return xs[0] def transform_audio(self, audio_segment): """Run the pre-processing pipeline for data augmentation. @@ -101,7 +165,7 @@ class AugmentationPipeline(): :param audio_segment: Audio segment to process. :type audio_segment: AudioSegmenet|SpeechSegment """ - for augmentor, rate in zip(self._augmentors, self._rates): + for augmentor, rate in zip(self._audio_augmentors, self._audio_rates): if self._rng.uniform(0., 1.) < rate: augmentor.transform_audio(audio_segment) @@ -116,52 +180,39 @@ class AugmentationPipeline(): spec_segment = augmentor.transform_feature(spec_segment) return spec_segment - def _parse_pipeline_from(self, config_json, aug_type='audio'): + def _parse_pipeline_from(self, aug_type='all'): """Parse the config json to build a augmentation pipelien.""" - assert aug_type in ('audio', 'feature'), aug_type - try: - configs = json.loads(config_json) - audio_confs = [] - feature_confs = [] - for config in configs: - if config["type"] in self._spec_types: - feature_confs.append(config) - else: - audio_confs.append(config) - - if aug_type == 'audio': - aug_confs = audio_confs - elif aug_type == 'feature': - aug_confs = feature_confs - - augmentors = [ - self._get_augmentor(config["type"], config["params"]) - for config in aug_confs - ] - rates = [config["prob"] for config in aug_confs] - - except Exception as e: - raise ValueError("Failed to parse the augmentation config json: " - "%s" % str(e)) + assert aug_type in ('audio', 'feature', 'all'), aug_type + audio_confs = [] + feature_confs = [] + all_confs = [] + for config in self.conf: + all_confs.append(config) + if config["type"] in self._spec_types: + feature_confs.append(config) + else: + audio_confs.append(config) + + if aug_type == 'audio': + aug_confs = audio_confs + elif aug_type == 'feature': + aug_confs = feature_confs + else: + aug_confs = all_confs + + augmentors = [ + self._get_augmentor(config["type"], config["params"]) + for config in aug_confs + ] + rates = [config["prob"] for config in aug_confs] return augmentors, rates def _get_augmentor(self, augmentor_type, params): """Return an augmentation model by the type name, and pass in params.""" - if augmentor_type == "volume": - return VolumePerturbAugmentor(self._rng, **params) - elif augmentor_type == "shift": - return ShiftPerturbAugmentor(self._rng, **params) - elif augmentor_type == "speed": - return SpeedPerturbAugmentor(self._rng, **params) - elif augmentor_type == "resample": - return ResampleAugmentor(self._rng, **params) - elif augmentor_type == "bayesian_normal": - return OnlineBayesianNormalizationAugmentor(self._rng, **params) - elif augmentor_type == "noise": - return NoisePerturbAugmentor(self._rng, **params) - elif augmentor_type == "impulse": - return ImpulseResponseAugmentor(self._rng, **params) - elif augmentor_type == "specaug": - return SpecAugmentor(self._rng, **params) - else: + 
class_obj = dynamic_import(augmentor_type, import_alias) + assert issubclass(class_obj, AugmentorBase) + try: + obj = class_obj(self._rng, **params) + except Exception: raise ValueError("Unknown augmentor type [%s]." % augmentor_type) + return obj diff --git a/deepspeech/frontend/augmentor/base.py b/deepspeech/frontend/augmentor/base.py index e6f5c1e9f..87cb4ef72 100644 --- a/deepspeech/frontend/augmentor/base.py +++ b/deepspeech/frontend/augmentor/base.py @@ -28,6 +28,10 @@ class AugmentorBase(): def __init__(self): pass + @abstractmethod + def __call__(self, xs): + raise NotImplementedError + @abstractmethod def transform_audio(self, audio_segment): """Adds various effects to the input audio segment. Such effects diff --git a/deepspeech/frontend/augmentor/impulse_response.py b/deepspeech/frontend/augmentor/impulse_response.py index fbd617b42..01421fc65 100644 --- a/deepspeech/frontend/augmentor/impulse_response.py +++ b/deepspeech/frontend/augmentor/impulse_response.py @@ -30,6 +30,11 @@ class ImpulseResponseAugmentor(AugmentorBase): self._rng = rng self._impulse_manifest = read_manifest(impulse_manifest_path) + def __call__(self, x, uttid=None, train=True): + if not train: + return + self.transform_audio(x) + def transform_audio(self, audio_segment): """Add impulse response effect. diff --git a/deepspeech/frontend/augmentor/noise_perturb.py b/deepspeech/frontend/augmentor/noise_perturb.py index b3c07f5c1..11f5ed105 100644 --- a/deepspeech/frontend/augmentor/noise_perturb.py +++ b/deepspeech/frontend/augmentor/noise_perturb.py @@ -36,6 +36,11 @@ class NoisePerturbAugmentor(AugmentorBase): self._rng = rng self._noise_manifest = read_manifest(manifest_path=noise_manifest_path) + def __call__(self, x, uttid=None, train=True): + if not train: + return + self.transform_audio(x) + def transform_audio(self, audio_segment): """Add background noise audio. diff --git a/deepspeech/frontend/augmentor/online_bayesian_normalization.py b/deepspeech/frontend/augmentor/online_bayesian_normalization.py index 5af3b9b03..dc32a1808 100644 --- a/deepspeech/frontend/augmentor/online_bayesian_normalization.py +++ b/deepspeech/frontend/augmentor/online_bayesian_normalization.py @@ -44,6 +44,11 @@ class OnlineBayesianNormalizationAugmentor(AugmentorBase): self._rng = rng self._startup_delay = startup_delay + def __call__(self, x, uttid=None, train=True): + if not train: + return + self.transform_audio(x) + def transform_audio(self, audio_segment): """Normalizes the input audio using the online Bayesian approach. diff --git a/deepspeech/frontend/augmentor/resample.py b/deepspeech/frontend/augmentor/resample.py index 9afce635d..a862b184e 100644 --- a/deepspeech/frontend/augmentor/resample.py +++ b/deepspeech/frontend/augmentor/resample.py @@ -31,6 +31,11 @@ class ResampleAugmentor(AugmentorBase): self._new_sample_rate = new_sample_rate self._rng = rng + def __call__(self, x, uttid=None, train=True): + if not train: + return + self.transform_audio(x) + def transform_audio(self, audio_segment): """Resamples the input audio to a target sample rate. 
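(Editorial aside: the augmentation refactor above replaces the hard-coded augmentor factory with a dynamic_import registry and adds a __call__(..., train=True) entry point to the augmentors, so the pipeline can be applied per-segment via transform_audio / transform_feature or batch-wise from the espnet-style reader. A minimal usage sketch follows; the speed-perturb parameter names are illustrative and not taken from this patch.)

import json

from deepspeech.frontend.augmentor.augmentation import AugmentationPipeline

# one dict per augmentor: "type" is resolved through import_alias by
# dynamic_import, "params" go to the constructor, "prob" gates application
conf = json.dumps([{
    "type": "speed",
    "params": {"min_speed_rate": 0.9, "max_speed_rate": 1.1, "num_rates": 3},
    "prob": 1.0,
}])
pipeline = AugmentationPipeline(augmentation_config=conf, random_seed=0)

# audio-level augmentors are applied with pipeline.transform_audio(audio_segment),
# feature-level ones (specaug) with pipeline.transform_feature(spec_segment);
# the new __call__ path applies the whole pipeline to a batch of inputs:
#   xs = pipeline(xs, uttid_list, train=True)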
diff --git a/deepspeech/frontend/augmentor/shift_perturb.py b/deepspeech/frontend/augmentor/shift_perturb.py index 9cc3fe2d0..6c78c528e 100644 --- a/deepspeech/frontend/augmentor/shift_perturb.py +++ b/deepspeech/frontend/augmentor/shift_perturb.py @@ -31,6 +31,11 @@ class ShiftPerturbAugmentor(AugmentorBase): self._max_shift_ms = max_shift_ms self._rng = rng + def __call__(self, x, uttid=None, train=True): + if not train: + return + self.transform_audio(x) + def transform_audio(self, audio_segment): """Shift audio. diff --git a/deepspeech/frontend/augmentor/spec_augment.py b/deepspeech/frontend/augmentor/spec_augment.py index 1c2e09fc7..94d23bf46 100644 --- a/deepspeech/frontend/augmentor/spec_augment.py +++ b/deepspeech/frontend/augmentor/spec_augment.py @@ -157,6 +157,11 @@ class SpecAugmentor(AugmentorBase): self._time_mask = (t_0, t_0 + t) return xs + def __call__(self, x, train=True): + if not train: + return + self.transform_audio(x) + def transform_feature(self, xs: np.ndarray): """ Args: diff --git a/deepspeech/frontend/augmentor/speed_perturb.py b/deepspeech/frontend/augmentor/speed_perturb.py index d0977c131..838c5cc29 100644 --- a/deepspeech/frontend/augmentor/speed_perturb.py +++ b/deepspeech/frontend/augmentor/speed_perturb.py @@ -79,6 +79,11 @@ class SpeedPerturbAugmentor(AugmentorBase): self._rates = np.linspace( self._min_rate, self._max_rate, self._num_rates, endpoint=True) + def __call__(self, x, uttid=None, train=True): + if not train: + return + self.transform_audio(x) + def transform_audio(self, audio_segment): """Sample a new speed rate from the given range and changes the speed of the given audio clip. diff --git a/deepspeech/frontend/augmentor/volume_perturb.py b/deepspeech/frontend/augmentor/volume_perturb.py index 0d76e7a05..ffae1693e 100644 --- a/deepspeech/frontend/augmentor/volume_perturb.py +++ b/deepspeech/frontend/augmentor/volume_perturb.py @@ -37,6 +37,11 @@ class VolumePerturbAugmentor(AugmentorBase): self._max_gain_dBFS = max_gain_dBFS self._rng = rng + def __call__(self, x, uttid=None, train=True): + if not train: + return + self.transform_audio(x) + def transform_audio(self, audio_segment): """Change audio loadness. diff --git a/deepspeech/io/__init__.py b/deepspeech/io/__init__.py index e180f18ee..185a92b8d 100644 --- a/deepspeech/io/__init__.py +++ b/deepspeech/io/__init__.py @@ -11,139 +11,3 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-import numpy as np -from paddle.io import DataLoader - -from deepspeech.io.collator import SpeechCollator -from deepspeech.io.dataset import ManifestDataset -from deepspeech.io.sampler import SortagradBatchSampler -from deepspeech.io.sampler import SortagradDistributedBatchSampler - - -def create_dataloader(manifest_path, - unit_type, - vocab_filepath, - mean_std_filepath, - spm_model_prefix, - augmentation_config='{}', - max_input_len=float('inf'), - min_input_len=0.0, - max_output_len=float('inf'), - min_output_len=0.0, - max_output_input_ratio=float('inf'), - min_output_input_ratio=0.0, - stride_ms=10.0, - window_ms=20.0, - max_freq=None, - specgram_type='linear', - feat_dim=None, - delta_delta=False, - use_dB_normalization=True, - random_seed=0, - keep_transcription_text=False, - is_training=False, - batch_size=1, - num_workers=0, - sortagrad=False, - shuffle_method=None, - dist=False): - - dataset = ManifestDataset( - manifest_path=manifest_path, - unit_type=unit_type, - vocab_filepath=vocab_filepath, - mean_std_filepath=mean_std_filepath, - spm_model_prefix=spm_model_prefix, - augmentation_config=augmentation_config, - max_input_len=max_input_len, - min_input_len=min_input_len, - max_output_len=max_output_len, - min_output_len=min_output_len, - max_output_input_ratio=max_output_input_ratio, - min_output_input_ratio=min_output_input_ratio, - stride_ms=stride_ms, - window_ms=window_ms, - max_freq=max_freq, - specgram_type=specgram_type, - feat_dim=feat_dim, - delta_delta=delta_delta, - use_dB_normalization=use_dB_normalization, - random_seed=random_seed, - keep_transcription_text=keep_transcription_text) - - if dist: - batch_sampler = SortagradDistributedBatchSampler( - dataset, - batch_size, - num_replicas=None, - rank=None, - shuffle=is_training, - drop_last=is_training, - sortagrad=is_training, - shuffle_method=shuffle_method) - else: - batch_sampler = SortagradBatchSampler( - dataset, - shuffle=is_training, - batch_size=batch_size, - drop_last=is_training, - sortagrad=is_training, - shuffle_method=shuffle_method) - - def padding_batch(batch, - padding_to=-1, - flatten=False, - keep_transcription_text=True): - """ - Padding audio features with zeros to make them have the same shape (or - a user-defined shape) within one bach. - - If ``padding_to`` is -1, the maximun shape in the batch will be used - as the target shape for padding. Otherwise, `padding_to` will be the - target shape (only refers to the second axis). - - If `flatten` is True, features will be flatten to 1darray. 
- """ - new_batch = [] - # get target shape - max_length = max([audio.shape[1] for audio, text in batch]) - if padding_to != -1: - if padding_to < max_length: - raise ValueError("If padding_to is not -1, it should be larger " - "than any instance's shape in the batch") - max_length = padding_to - max_text_length = max([len(text) for audio, text in batch]) - # padding - padded_audios = [] - audio_lens = [] - texts, text_lens = [], [] - for audio, text in batch: - padded_audio = np.zeros([audio.shape[0], max_length]) - padded_audio[:, :audio.shape[1]] = audio - if flatten: - padded_audio = padded_audio.flatten() - padded_audios.append(padded_audio) - audio_lens.append(audio.shape[1]) - - padded_text = np.zeros([max_text_length]) - if keep_transcription_text: - padded_text[:len(text)] = [ord(t) for t in text] # string - else: - padded_text[:len(text)] = text # ids - texts.append(padded_text) - text_lens.append(len(text)) - - padded_audios = np.array(padded_audios).astype('float32') - audio_lens = np.array(audio_lens).astype('int64') - texts = np.array(texts).astype('int32') - text_lens = np.array(text_lens).astype('int64') - return padded_audios, audio_lens, texts, text_lens - - # collate_fn=functools.partial(padding_batch, keep_transcription_text=keep_transcription_text), - collate_fn = SpeechCollator(keep_transcription_text=keep_transcription_text) - loader = DataLoader( - dataset, - batch_sampler=batch_sampler, - collate_fn=collate_fn, - num_workers=num_workers) - return loader diff --git a/deepspeech/io/batchfy.py b/deepspeech/io/batchfy.py new file mode 100644 index 000000000..de29d0546 --- /dev/null +++ b/deepspeech/io/batchfy.py @@ -0,0 +1,469 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import itertools + +import numpy as np + +from deepspeech.utils.log import Log + +__all__ = ["make_batchset"] + +logger = Log(__name__).getlog() + + +def batchfy_by_seq( + sorted_data, + batch_size, + max_length_in, + max_length_out, + min_batch_size=1, + shortest_first=False, + ikey="input", + iaxis=0, + okey="output", + oaxis=0, ): + """Make batch set from json dictionary + + :param List[(str, Dict[str, Any])] sorted_data: dictionary loaded from data.json + :param int batch_size: batch size + :param int max_length_in: maximum length of input to decide adaptive batch size + :param int max_length_out: maximum length of output to decide adaptive batch size + :param int min_batch_size: mininum batch size (for multi-gpu) + :param bool shortest_first: Sort from batch with shortest samples + to longest if true, otherwise reverse + :param str ikey: key to access input + (for ASR ikey="input", for TTS, MT ikey="output".) + :param int iaxis: dimension to access input + (for ASR, TTS iaxis=0, for MT iaxis="1".) + :param str okey: key to access output + (for ASR, MT okey="output". for TTS okey="input".) + :param int oaxis: dimension to access output + (for ASR, TTS, MT oaxis=0, reserved for future research, -1 means all axis.) 
+ :return: List[List[Tuple[str, dict]]] list of batches + """ + if batch_size <= 0: + raise ValueError(f"Invalid batch_size={batch_size}") + + # check #utts is more than min_batch_size + if len(sorted_data) < min_batch_size: + raise ValueError( + f"#utts({len(sorted_data)}) is less than min_batch_size({min_batch_size})." + ) + + # make list of minibatches + minibatches = [] + start = 0 + while True: + _, info = sorted_data[start] + ilen = int(info[ikey][iaxis]["shape"][0]) + olen = (int(info[okey][oaxis]["shape"][0]) if oaxis >= 0 else + max(map(lambda x: int(x["shape"][0]), info[okey]))) + factor = max(int(ilen / max_length_in), int(olen / max_length_out)) + # change batchsize depending on the input and output length + # if ilen = 1000 and max_length_in = 800 + # then b = batchsize / 2 + # and max(min_batches, .) avoids batchsize = 0 + bs = max(min_batch_size, int(batch_size / (1 + factor))) + end = min(len(sorted_data), start + bs) + minibatch = sorted_data[start:end] + if shortest_first: + minibatch.reverse() + + # check each batch is more than minimum batchsize + if len(minibatch) < min_batch_size: + mod = min_batch_size - len(minibatch) % min_batch_size + additional_minibatch = [ + sorted_data[i] for i in np.random.randint(0, start, mod) + ] + if shortest_first: + additional_minibatch.reverse() + minibatch.extend(additional_minibatch) + minibatches.append(minibatch) + + if end == len(sorted_data): + break + start = end + + # batch: List[List[Tuple[str, dict]]] + return minibatches + + +def batchfy_by_bin( + sorted_data, + batch_bins, + num_batches=0, + min_batch_size=1, + shortest_first=False, + ikey="input", + okey="output", ): + """Make variably sized batch set, which maximizes + + the number of bins up to `batch_bins`. + + :param List[(str, Dict[str, Any])] sorted_data: dictionary loaded from data.json + :param int batch_bins: Maximum frames of a batch + :param int num_batches: # number of batches to use (for debug) + :param int min_batch_size: minimum batch size (for multi-gpu) + :param int test: Return only every `test` batches + :param bool shortest_first: Sort from batch with shortest samples + to longest if true, otherwise reverse + + :param str ikey: key to access input (for ASR ikey="input", for TTS ikey="output".) + :param str okey: key to access output (for ASR okey="output". for TTS okey="input".) 
+ + :return: List[Tuple[str, Dict[str, List[Dict[str, Any]]]] list of batches + """ + if batch_bins <= 0: + raise ValueError(f"invalid batch_bins={batch_bins}") + length = len(sorted_data) + idim = int(sorted_data[0][1][ikey][0]["shape"][1]) + odim = int(sorted_data[0][1][okey][0]["shape"][1]) + logger.info("# utts: " + str(len(sorted_data))) + minibatches = [] + start = 0 + n = 0 + while True: + # Dynamic batch size depending on size of samples + b = 0 + next_size = 0 + max_olen = 0 + while next_size < batch_bins and (start + b) < length: + ilen = int(sorted_data[start + b][1][ikey][0]["shape"][0]) * idim + olen = int(sorted_data[start + b][1][okey][0]["shape"][0]) * odim + if olen > max_olen: + max_olen = olen + next_size = (max_olen + ilen) * (b + 1) + if next_size <= batch_bins: + b += 1 + elif next_size == 0: + raise ValueError( + f"Can't fit one sample in batch_bins ({batch_bins}): " + f"Please increase the value") + end = min(length, start + max(min_batch_size, b)) + batch = sorted_data[start:end] + if shortest_first: + batch.reverse() + minibatches.append(batch) + # Check for min_batch_size and fixes the batches if needed + i = -1 + while len(minibatches[i]) < min_batch_size: + missing = min_batch_size - len(minibatches[i]) + if -i == len(minibatches): + minibatches[i + 1].extend(minibatches[i]) + minibatches = minibatches[1:] + break + else: + minibatches[i].extend(minibatches[i - 1][:missing]) + minibatches[i - 1] = minibatches[i - 1][missing:] + i -= 1 + if end == length: + break + start = end + n += 1 + if num_batches > 0: + minibatches = minibatches[:num_batches] + lengths = [len(x) for x in minibatches] + logger.info( + str(len(minibatches)) + " batches containing from " + str(min(lengths)) + + " to " + str(max(lengths)) + " samples " + "(avg " + str( + int(np.mean(lengths))) + " samples).") + return minibatches + + +def batchfy_by_frame( + sorted_data, + max_frames_in, + max_frames_out, + max_frames_inout, + num_batches=0, + min_batch_size=1, + shortest_first=False, + ikey="input", + okey="output", ): + """Make variable batch set, which maximizes the number of frames to max_batch_frame. + + :param List[(str, Dict[str, Any])] sorteddata: dictionary loaded from data.json + :param int max_frames_in: Maximum input frames of a batch + :param int max_frames_out: Maximum output frames of a batch + :param int max_frames_inout: Maximum input+output frames of a batch + :param int num_batches: # number of batches to use (for debug) + :param int min_batch_size: minimum batch size (for multi-gpu) + :param int test: Return only every `test` batches + :param bool shortest_first: Sort from batch with shortest samples + to longest if true, otherwise reverse + + :param str ikey: key to access input (for ASR ikey="input", for TTS ikey="output".) + :param str okey: key to access output (for ASR okey="output". for TTS okey="input".) 
+ + :return: List[Tuple[str, Dict[str, List[Dict[str, Any]]]] list of batches + """ + if max_frames_in <= 0 and max_frames_out <= 0 and max_frames_inout <= 0: + raise ValueError( + "At least, one of `--batch-frames-in`, `--batch-frames-out` or " + "`--batch-frames-inout` should be > 0") + length = len(sorted_data) + minibatches = [] + start = 0 + end = 0 + while end != length: + # Dynamic batch size depending on size of samples + b = 0 + max_olen = 0 + max_ilen = 0 + while (start + b) < length: + ilen = int(sorted_data[start + b][1][ikey][0]["shape"][0]) + if ilen > max_frames_in and max_frames_in != 0: + raise ValueError( + f"Can't fit one sample in --batch-frames-in ({max_frames_in}): " + f"Please increase the value") + olen = int(sorted_data[start + b][1][okey][0]["shape"][0]) + if olen > max_frames_out and max_frames_out != 0: + raise ValueError( + f"Can't fit one sample in --batch-frames-out ({max_frames_out}): " + f"Please increase the value") + if ilen + olen > max_frames_inout and max_frames_inout != 0: + raise ValueError( + f"Can't fit one sample in --batch-frames-out ({max_frames_inout}): " + f"Please increase the value") + max_olen = max(max_olen, olen) + max_ilen = max(max_ilen, ilen) + in_ok = max_ilen * (b + 1) <= max_frames_in or max_frames_in == 0 + out_ok = max_olen * (b + 1) <= max_frames_out or max_frames_out == 0 + inout_ok = (max_ilen + max_olen) * ( + b + 1) <= max_frames_inout or max_frames_inout == 0 + if in_ok and out_ok and inout_ok: + # add more seq in the minibatch + b += 1 + else: + # no more seq in the minibatch + break + end = min(length, start + b) + batch = sorted_data[start:end] + if shortest_first: + batch.reverse() + minibatches.append(batch) + # Check for min_batch_size and fixes the batches if needed + i = -1 + while len(minibatches[i]) < min_batch_size: + missing = min_batch_size - len(minibatches[i]) + if -i == len(minibatches): + minibatches[i + 1].extend(minibatches[i]) + minibatches = minibatches[1:] + break + else: + minibatches[i].extend(minibatches[i - 1][:missing]) + minibatches[i - 1] = minibatches[i - 1][missing:] + i -= 1 + start = end + if num_batches > 0: + minibatches = minibatches[:num_batches] + lengths = [len(x) for x in minibatches] + logger.info( + str(len(minibatches)) + " batches containing from " + str(min(lengths)) + + " to " + str(max(lengths)) + " samples" + "(avg " + str( + int(np.mean(lengths))) + " samples).") + + return minibatches + + +def batchfy_shuffle(data, batch_size, min_batch_size, num_batches, + shortest_first): + import random + + logger.info("use shuffled batch.") + sorted_data = random.sample(data.items(), len(data.items())) + logger.info("# utts: " + str(len(sorted_data))) + # make list of minibatches + minibatches = [] + start = 0 + while True: + end = min(len(sorted_data), start + batch_size) + # check each batch is more than minimum batchsize + minibatch = sorted_data[start:end] + if shortest_first: + minibatch.reverse() + if len(minibatch) < min_batch_size: + mod = min_batch_size - len(minibatch) % min_batch_size + additional_minibatch = [ + sorted_data[i] for i in np.random.randint(0, start, mod) + ] + if shortest_first: + additional_minibatch.reverse() + minibatch.extend(additional_minibatch) + minibatches.append(minibatch) + if end == len(sorted_data): + break + start = end + + # for debugging + if num_batches > 0: + minibatches = minibatches[:num_batches] + logger.info("# minibatches: " + str(len(minibatches))) + return minibatches + + +BATCH_COUNT_CHOICES = ["auto", "seq", "bin", "frame"] 
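(Editorial aside: a toy sketch of how the adaptive "seq" strategy defined above behaves; the manifest-style entries below are invented for illustration and are not part of the patch.)

toy = [
    ("utt_long",  {"input":  [{"shape": [1200, 83]}],
                   "output": [{"shape": [30, 5002]}]}),
    ("utt_short", {"input":  [{"shape": [300, 83]}],
                   "output": [{"shape": [10, 5002]}]}),
]
# utt_long: factor = max(int(1200 / 800), int(30 / 150)) = 1, so its minibatch
# shrinks to batch_size / (1 + 1); utt_short keeps the full batch_size.
batches = batchfy_by_seq(
    toy, batch_size=2, max_length_in=800, max_length_out=150)
# count="bin" (batchfy_by_bin) instead caps frames x dims per minibatch, and
# count="frame" (batchfy_by_frame) caps raw input/output frame counts.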
+BATCH_SORT_KEY_CHOICES = ["input", "output", "shuffle"] + + +def make_batchset( + data, + batch_size=0, + max_length_in=float("inf"), + max_length_out=float("inf"), + num_batches=0, + min_batch_size=1, + shortest_first=False, + batch_sort_key="input", + count="auto", + batch_bins=0, + batch_frames_in=0, + batch_frames_out=0, + batch_frames_inout=0, + iaxis=0, + oaxis=0, ): + """Make batch set from json dictionary + + if utts have "category" value, + + >>> data = [{'category': 'A', 'input': ..., 'utt':'utt1'}, + ... {'category': 'B', 'input': ..., 'utt':'utt2'}, + ... {'category': 'B', 'input': ..., 'utt':'utt3'}, + ... {'category': 'A', 'input': ..., 'utt':'utt4'}] + >>> make_batchset(data, batchsize=2, ...) + [[('utt1', ...), ('utt4', ...)], [('utt2', ...), ('utt3': ...)]] + + Note that if any utts doesn't have "category", + perform as same as batchfy_by_{count} + + :param List[Dict[str, Any]] data: dictionary loaded from data.json + :param int batch_size: maximum number of sequences in a minibatch. + :param int batch_bins: maximum number of bins (frames x dim) in a minibatch. + :param int batch_frames_in: maximum number of input frames in a minibatch. + :param int batch_frames_out: maximum number of output frames in a minibatch. + :param int batch_frames_out: maximum number of input+output frames in a minibatch. + :param str count: strategy to count maximum size of batch. + For choices, see espnet.asr.batchfy.BATCH_COUNT_CHOICES + + :param int max_length_in: maximum length of input to decide adaptive batch size + :param int max_length_out: maximum length of output to decide adaptive batch size + :param int num_batches: # number of batches to use (for debug) + :param int min_batch_size: minimum batch size (for multi-gpu) + :param bool shortest_first: Sort from batch with shortest samples + to longest if true, otherwise reverse + :param str batch_sort_key: how to sort data before creating minibatches + ["input", "output", "shuffle"] + :param bool swap_io: if True, use "input" as output and "output" + as input in `data` dict + :param bool mt: if True, use 0-axis of "output" as output and 1-axis of "output" + as input in `data` dict + :param int iaxis: dimension to access input + (for ASR, TTS iaxis=0, for MT iaxis="1".) + :param int oaxis: dimension to access output (for ASR, TTS, MT oaxis=0, + reserved for future research, -1 means all axis.) 
+ :return: List[List[Tuple[str, dict]]] list of batches + """ + # check args + if count not in BATCH_COUNT_CHOICES: + raise ValueError( + f"arg 'count' ({count}) should be one of {BATCH_COUNT_CHOICES}") + if batch_sort_key not in BATCH_SORT_KEY_CHOICES: + raise ValueError(f"arg 'batch_sort_key' ({batch_sort_key}) should be " + f"one of {BATCH_SORT_KEY_CHOICES}") + + ikey = "input" + okey = "output" + batch_sort_axis = 0 # index of list + if count == "auto": + if batch_size != 0: + count = "seq" + elif batch_bins != 0: + count = "bin" + elif batch_frames_in != 0 or batch_frames_out != 0 or batch_frames_inout != 0: + count = "frame" + else: + raise ValueError( + f"cannot detect `count` manually set one of {BATCH_COUNT_CHOICES}" + ) + logger.info(f"count is auto detected as {count}") + + if count != "seq" and batch_sort_key == "shuffle": + raise ValueError( + "batch_sort_key=shuffle is only available if batch_count=seq") + + category2data = {} # Dict[str, dict] + for v in data: + k = v['utt'] + category2data.setdefault(v.get("category"), {})[k] = v + + batches_list = [] # List[List[List[Tuple[str, dict]]]] + for d in category2data.values(): + if batch_sort_key == "shuffle": + batches = batchfy_shuffle(d, batch_size, min_batch_size, + num_batches, shortest_first) + batches_list.append(batches) + continue + + # sort it by input lengths (long to short) + sorted_data = sorted( + d.items(), + key=lambda data: int(data[1][batch_sort_key][batch_sort_axis]["shape"][0]), + reverse=not shortest_first, ) + logger.info("# utts: " + str(len(sorted_data))) + + if count == "seq": + batches = batchfy_by_seq( + sorted_data, + batch_size=batch_size, + max_length_in=max_length_in, + max_length_out=max_length_out, + min_batch_size=min_batch_size, + shortest_first=shortest_first, + ikey=ikey, + iaxis=iaxis, + okey=okey, + oaxis=oaxis, ) + if count == "bin": + batches = batchfy_by_bin( + sorted_data, + batch_bins=batch_bins, + min_batch_size=min_batch_size, + shortest_first=shortest_first, + ikey=ikey, + okey=okey, ) + if count == "frame": + batches = batchfy_by_frame( + sorted_data, + max_frames_in=batch_frames_in, + max_frames_out=batch_frames_out, + max_frames_inout=batch_frames_inout, + min_batch_size=min_batch_size, + shortest_first=shortest_first, + ikey=ikey, + okey=okey, ) + batches_list.append(batches) + + if len(batches_list) == 1: + batches = batches_list[0] + else: + # Concat list. 
This way is faster than "sum(batch_list, [])" + batches = list(itertools.chain(*batches_list)) + + # for debugging + if num_batches > 0: + batches = batches[:num_batches] + logger.info("# minibatches: " + str(len(batches))) + + # batch: List[List[Tuple[str, dict]]] + return batches diff --git a/deepspeech/io/collator.py b/deepspeech/io/collator.py index 2ef119666..4900350e2 100644 --- a/deepspeech/io/collator.py +++ b/deepspeech/io/collator.py @@ -23,7 +23,7 @@ from deepspeech.frontend.featurizer.speech_featurizer import SpeechFeaturizer from deepspeech.frontend.normalizer import FeatureNormalizer from deepspeech.frontend.speech import SpeechSegment from deepspeech.frontend.utility import IGNORE_ID -from deepspeech.io.utility import pad_sequence +from deepspeech.io.utility import pad_list from deepspeech.utils.log import Log __all__ = ["SpeechCollator"] @@ -286,13 +286,12 @@ class SpeechCollator(): texts.append(tokens) text_lens.append(tokens.shape[0]) - padded_audios = pad_sequence( - audios, padding_value=0.0).astype(np.float32) #[B, T, D] - audio_lens = np.array(audio_lens).astype(np.int64) - padded_texts = pad_sequence( - texts, padding_value=IGNORE_ID).astype(np.int64) - text_lens = np.array(text_lens).astype(np.int64) - return utts, padded_audios, audio_lens, padded_texts, text_lens + #[B, T, D] + xs_pad = pad_list(audios, 0.0).astype(np.float32) + ilens = np.array(audio_lens).astype(np.int64) + ys_pad = pad_list(texts, IGNORE_ID).astype(np.int64) + olens = np.array(text_lens).astype(np.int64) + return utts, xs_pad, ilens, ys_pad, olens @property def manifest(self): diff --git a/deepspeech/io/converter.py b/deepspeech/io/converter.py new file mode 100644 index 000000000..a02e06acb --- /dev/null +++ b/deepspeech/io/converter.py @@ -0,0 +1,80 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import numpy as np + +from deepspeech.io.utility import pad_list +from deepspeech.utils.log import Log + +__all__ = ["CustomConverter"] + +logger = Log(__name__).getlog() + + +class CustomConverter(): + """Custom batch converter. + + Args: + subsampling_factor (int): The subsampling factor. + dtype (np.dtype): Data type to convert. + + """ + + def __init__(self, subsampling_factor=1, dtype=np.float32): + """Construct a CustomConverter object.""" + self.subsampling_factor = subsampling_factor + self.ignore_id = -1 + self.dtype = dtype + + def __call__(self, batch): + """Transform a batch and send it to a device. + + Args: + batch (list): The batch to transform. 
+ + Returns: + tuple(paddle.Tensor, paddle.Tensor, paddle.Tensor) + + """ + # batch should be located in list + assert len(batch) == 1 + (xs, ys), utts = batch[0] + + # perform subsampling + if self.subsampling_factor > 1: + xs = [x[::self.subsampling_factor, :] for x in xs] + + # get batch of lengths of input sequences + ilens = np.array([x.shape[0] for x in xs]) + + # perform padding and convert to tensor + # currently only support real number + if xs[0].dtype.kind == "c": + xs_pad_real = pad_list([x.real for x in xs], 0).astype(self.dtype) + xs_pad_imag = pad_list([x.imag for x in xs], 0).astype(self.dtype) + # Note(kamo): + # {'real': ..., 'imag': ...} will be changed to ComplexTensor in E2E. + # Don't create ComplexTensor and give it E2E here + # because torch.nn.DataParellel can't handle it. + xs_pad = {"real": xs_pad_real, "imag": xs_pad_imag} + else: + xs_pad = pad_list(xs, 0).astype(self.dtype) + + # NOTE: this is for multi-output (e.g., speech translation) + ys_pad = pad_list( + [np.array(y[0][:]) if isinstance(y, tuple) else y for y in ys], + self.ignore_id) + + olens = np.array( + [y[0].shape[0] if isinstance(y, tuple) else y.shape[0] for y in ys]) + return utts, xs_pad, ilens, ys_pad, olens diff --git a/deepspeech/io/dataloader.py b/deepspeech/io/dataloader.py new file mode 100644 index 000000000..15ab73157 --- /dev/null +++ b/deepspeech/io/dataloader.py @@ -0,0 +1,138 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
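(Editorial aside: a sketch of the call pattern the BatchDataLoader defined below is built for, matching the notebook cell at the top of this change; the manifest path and sizes are illustrative, not taken from the patch.)

dev = BatchDataLoader(
    json_file="data/manifest.dev",  # espnet-style manifest, see local/espnet_json_to_manifest.py
    train_mode=False,
    sortagrad=False,
    batch_size=20,
    maxlen_in=800,
    maxlen_out=150,
    batch_count="seq",
    preprocess_conf=None,           # or a JSON augmentation config for training
    n_iter_processes=2)
for utts, xs_pad, ilens, ys_pad, olens in dev:
    # utt ids, padded feats [B, T, D], feat lengths, padded token ids, token lengths
    break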
+from paddle.io import DataLoader + +from deepspeech.frontend.utility import read_manifest +from deepspeech.io.batchfy import make_batchset +from deepspeech.io.converter import CustomConverter +from deepspeech.io.dataset import TransformDataset +from deepspeech.io.reader import LoadInputsAndTargets +from deepspeech.utils.log import Log + +__all__ = ["BatchDataLoader"] + +logger = Log(__name__).getlog() + + +class BatchDataLoader(): + def __init__(self, + json_file: str, + train_mode: bool, + sortagrad: bool=False, + batch_size: int=0, + maxlen_in: float=float('inf'), + maxlen_out: float=float('inf'), + minibatches: int=0, + mini_batch_size: int=1, + batch_count: str='auto', + batch_bins: int=0, + batch_frames_in: int=0, + batch_frames_out: int=0, + batch_frames_inout: int=0, + preprocess_conf=None, + n_iter_processes: int=1, + subsampling_factor: int=1, + num_encs: int=1): + self.json_file = json_file + self.train_mode = train_mode + self.use_sortagrad = sortagrad == -1 or sortagrad > 0 + self.batch_size = batch_size + self.maxlen_in = maxlen_in + self.maxlen_out = maxlen_out + self.batch_count = batch_count + self.batch_bins = batch_bins + self.batch_frames_in = batch_frames_in + self.batch_frames_out = batch_frames_out + self.batch_frames_inout = batch_frames_inout + self.subsampling_factor = subsampling_factor + self.num_encs = num_encs + self.preprocess_conf = preprocess_conf + self.n_iter_processes = n_iter_processes + + # read json data + self.data_json = read_manifest(json_file) + + # make minibatch list (variable length) + self.minibaches = make_batchset( + self.data_json, + batch_size, + maxlen_in, + maxlen_out, + minibatches, # for debug + min_batch_size=mini_batch_size, + shortest_first=self.use_sortagrad, + count=batch_count, + batch_bins=batch_bins, + batch_frames_in=batch_frames_in, + batch_frames_out=batch_frames_out, + batch_frames_inout=batch_frames_inout, + iaxis=0, + oaxis=0, ) + + # data reader + self.reader = LoadInputsAndTargets( + mode="asr", + load_output=True, + preprocess_conf=preprocess_conf, + preprocess_args={"train": + train_mode}, # Switch the mode of preprocessing + ) + + # Setup a converter + if num_encs == 1: + self.converter = CustomConverter( + subsampling_factor=subsampling_factor, dtype=np.float32) + else: + assert NotImplementedError("not impl CustomConverterMulEnc.") + + # hack to make batchsize argument as 1 + # actual bathsize is included in a list + # default collate function converts numpy array to pytorch tensor + # we used an empty collate function instead which returns list + self.dataset = TransformDataset( + self.minibaches, + lambda data: self.converter([self.reader(data, return_uttid=True)])) + self.dataloader = DataLoader( + dataset=self.dataset, + batch_size=1, + shuffle=not use_sortagrad if train_mode else False, + collate_fn=lambda x: x[0], + num_workers=n_iter_processes, ) + + def __repr__(self): + echo = f"<{self.__class__.__module__}.{self.__class__.__name__} object at {hex(id(self))}> " + echo += f"train_mode: {self.train_mode}, " + echo += f"sortagrad: {self.use_sortagrad}, " + echo += f"batch_size: {self.batch_size}, " + echo += f"maxlen_in: {self.maxlen_in}, " + echo += f"maxlen_out: {self.maxlen_out}, " + echo += f"batch_count: {self.batch_count}, " + echo += f"batch_bins: {self.batch_bins}, " + echo += f"batch_frames_in: {self.batch_frames_in}, " + echo += f"batch_frames_out: {self.batch_frames_out}, " + echo += f"batch_frames_inout: {self.batch_frames_inout}, " + echo += f"subsampling_factor: {self.subsampling_factor}, " + 
echo += f"num_encs: {self.num_encs}, " + echo += f"num_workers: {self.n_iter_processes}, " + echo += f"file: {self.json_file}" + return echo + + def __len__(self): + return len(self.dataloader) + + def __iter__(self): + return self.dataloader.__iter__() + + def __call__(self): + return self.__iter__() diff --git a/deepspeech/io/dataset.py b/deepspeech/io/dataset.py index ac7be1f9e..74c08b461 100644 --- a/deepspeech/io/dataset.py +++ b/deepspeech/io/dataset.py @@ -19,7 +19,7 @@ from yacs.config import CfgNode from deepspeech.frontend.utility import read_manifest from deepspeech.utils.log import Log -__all__ = ["ManifestDataset", "TripletManifestDataset"] +__all__ = ["ManifestDataset", "TripletManifestDataset", "TransformDataset"] logger = Log(__name__).getlog() @@ -76,12 +76,18 @@ class ManifestDataset(Dataset): Args: manifest_path (str): manifest josn file path - max_input_len ([type], optional): maximum output seq length, in seconds for raw wav, in frame numbers for feature data. Defaults to float('inf'). - min_input_len (float, optional): minimum input seq length, in seconds for raw wav, in frame numbers for feature data. Defaults to 0.0. - max_output_len (float, optional): maximum input seq length, in modeling units. Defaults to 500.0. - min_output_len (float, optional): minimum input seq length, in modeling units. Defaults to 0.0. - max_output_input_ratio (float, optional): maximum output seq length/output seq length ratio. Defaults to 10.0. - min_output_input_ratio (float, optional): minimum output seq length/output seq length ratio. Defaults to 0.05. + max_input_len ([type], optional): maximum output seq length, + in seconds for raw wav, in frame numbers for feature data. Defaults to float('inf'). + min_input_len (float, optional): minimum input seq length, + in seconds for raw wav, in frame numbers for feature data. Defaults to 0.0. + max_output_len (float, optional): maximum input seq length, + in modeling units. Defaults to 500.0. + min_output_len (float, optional): minimum input seq length, + in modeling units. Defaults to 0.0. + max_output_input_ratio (float, optional): maximum output seq length/output seq length ratio. + Defaults to 10.0. + min_output_input_ratio (float, optional): minimum output seq length/output seq length ratio. + Defaults to 0.05. """ super().__init__() @@ -116,3 +122,27 @@ class TripletManifestDataset(ManifestDataset): instance = self._manifest[idx] return instance["utt"], instance["feat"], instance["text"], instance[ "text1"] + + +class TransformDataset(Dataset): + """Transform Dataset. + + Args: + data: list object from make_batchset + transfrom: transform function + + """ + + def __init__(self, data, transform): + """Init function.""" + super().__init__() + self.data = data + self.transform = transform + + def __len__(self): + """Len function.""" + return len(self.data) + + def __getitem__(self, idx): + """[] operator.""" + return self.transform(self.data[idx]) diff --git a/deepspeech/io/reader.py b/deepspeech/io/reader.py new file mode 100644 index 000000000..b6dc61b79 --- /dev/null +++ b/deepspeech/io/reader.py @@ -0,0 +1,409 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from collections import OrderedDict + +import kaldiio +import numpy as np +import soundfile + +from deepspeech.frontend.augmentor.augmentation import AugmentationPipeline +from deepspeech.utils.log import Log + +__all__ = ["LoadInputsAndTargets"] + +logger = Log(__name__).getlog() + + +class LoadInputsAndTargets(): + """Create a mini-batch from a list of dicts + + >>> batch = [('utt1', + ... dict(input=[dict(feat='some.ark:123', + ... filetype='mat', + ... name='input1', + ... shape=[100, 80])], + ... output=[dict(tokenid='1 2 3 4', + ... name='target1', + ... shape=[4, 31])]])) + >>> l = LoadInputsAndTargets() + >>> feat, target = l(batch) + + :param: str mode: Specify the task mode, "asr" or "tts" + :param: str preprocess_conf: The path of a json file for pre-processing + :param: bool load_input: If False, not to load the input data + :param: bool load_output: If False, not to load the output data + :param: bool sort_in_input_length: Sort the mini-batch in descending order + of the input length + :param: bool use_speaker_embedding: Used for tts mode only + :param: bool use_second_target: Used for tts mode only + :param: dict preprocess_args: Set some optional arguments for preprocessing + :param: Optional[dict] preprocess_args: Used for tts mode only + """ + + def __init__( + self, + mode="asr", + preprocess_conf=None, + load_input=True, + load_output=True, + sort_in_input_length=True, + preprocess_args=None, + keep_all_data_on_mem=False, ): + self._loaders = {} + + if mode not in ["asr"]: + raise ValueError("Only asr are allowed: mode={}".format(mode)) + + if preprocess_conf is not None: + self.preprocessing = AugmentationPipeline(preprocess_conf) + logging.warning( + "[Experimental feature] Some preprocessing will be done " + "for the mini-batch creation using {}".format( + self.preprocessing)) + else: + # If conf doesn't exist, this function don't touch anything. 
+ self.preprocessing = None + + self.mode = mode + self.load_output = load_output + self.load_input = load_input + self.sort_in_input_length = sort_in_input_length + if preprocess_args is None: + self.preprocess_args = {} + else: + assert isinstance(preprocess_args, dict), type(preprocess_args) + self.preprocess_args = dict(preprocess_args) + + self.keep_all_data_on_mem = keep_all_data_on_mem + + def __call__(self, batch, return_uttid=False): + """Function to load inputs and targets from list of dicts + + :param List[Tuple[str, dict]] batch: list of dict which is subset of + loaded data.json + :param bool return_uttid: return utterance ID information for visualization + :return: list of input token id sequences [(L_1), (L_2), ..., (L_B)] + :return: list of input feature sequences + [(T_1, D), (T_2, D), ..., (T_B, D)] + :rtype: list of float ndarray + :return: list of target token id sequences [(L_1), (L_2), ..., (L_B)] + :rtype: list of int ndarray + + """ + x_feats_dict = OrderedDict() # OrderedDict[str, List[np.ndarray]] + y_feats_dict = OrderedDict() # OrderedDict[str, List[np.ndarray]] + uttid_list = [] # List[str] + + for uttid, info in batch: + uttid_list.append(uttid) + + if self.load_input: + # Note(kamo): This for-loop is for multiple inputs + for idx, inp in enumerate(info["input"]): + # {"input": + # [{"feat": "some/path.h5:F01_050C0101_PED_REAL", + # "filetype": "hdf5", + # "name": "input1", ...}], ...} + x = self._get_from_loader( + filepath=inp["feat"], + filetype=inp.get("filetype", "mat")) + x_feats_dict.setdefault(inp["name"], []).append(x) + + if self.load_output: + for idx, inp in enumerate(info["output"]): + if "tokenid" in inp: + # ======= Legacy format for output ======= + # {"output": [{"tokenid": "1 2 3 4"}]) + x = np.fromiter( + map(int, inp["tokenid"].split()), dtype=np.int64) + else: + # ======= New format ======= + # {"input": + # [{"feat": "some/path.h5:F01_050C0101_PED_REAL", + # "filetype": "hdf5", + # "name": "target1", ...}], ...} + x = self._get_from_loader( + filepath=inp["feat"], + filetype=inp.get("filetype", "mat")) + + y_feats_dict.setdefault(inp["name"], []).append(x) + + if self.mode == "asr": + return_batch, uttid_list = self._create_batch_asr( + x_feats_dict, y_feats_dict, uttid_list) + else: + raise NotImplementedError(self.mode) + + if self.preprocessing is not None: + # Apply pre-processing all input features + for x_name in return_batch.keys(): + if x_name.startswith("input"): + return_batch[x_name] = self.preprocessing( + return_batch[x_name], uttid_list, + **self.preprocess_args) + + if return_uttid: + return tuple(return_batch.values()), uttid_list + + # Doesn't return the names now. + return tuple(return_batch.values()) + + def _create_batch_asr(self, x_feats_dict, y_feats_dict, uttid_list): + """Create a OrderedDict for the mini-batch + + :param OrderedDict x_feats_dict: + e.g. {"input1": [ndarray, ndarray, ...], + "input2": [ndarray, ndarray, ...]} + :param OrderedDict y_feats_dict: + e.g. 
{"target1": [ndarray, ndarray, ...], + "target2": [ndarray, ndarray, ...]} + :param: List[str] uttid_list: + Give uttid_list to sort in the same order as the mini-batch + :return: batch, uttid_list + :rtype: Tuple[OrderedDict, List[str]] + """ + # handle single-input and multi-input (paralell) asr mode + xs = list(x_feats_dict.values()) + + if self.load_output: + ys = list(y_feats_dict.values()) + assert len(xs[0]) == len(ys[0]), (len(xs[0]), len(ys[0])) + + # get index of non-zero length samples + nonzero_idx = list( + filter(lambda i: len(ys[0][i]) > 0, range(len(ys[0])))) + for n in range(1, len(y_feats_dict)): + nonzero_idx = filter(lambda i: len(ys[n][i]) > 0, nonzero_idx) + else: + # Note(kamo): Be careful not to make nonzero_idx to a generator + nonzero_idx = list(range(len(xs[0]))) + + if self.sort_in_input_length: + # sort in input lengths based on the first input + nonzero_sorted_idx = sorted( + nonzero_idx, key=lambda i: -len(xs[0][i])) + else: + nonzero_sorted_idx = nonzero_idx + + if len(nonzero_sorted_idx) != len(xs[0]): + logging.warning( + "Target sequences include empty tokenid (batch {} -> {}).". + format(len(xs[0]), len(nonzero_sorted_idx))) + + # remove zero-length samples + xs = [[x[i] for i in nonzero_sorted_idx] for x in xs] + uttid_list = [uttid_list[i] for i in nonzero_sorted_idx] + + x_names = list(x_feats_dict.keys()) + if self.load_output: + ys = [[y[i] for i in nonzero_sorted_idx] for y in ys] + y_names = list(y_feats_dict.keys()) + + # Keeping x_name and y_name, e.g. input1, for future extension + return_batch = OrderedDict([ + * [(x_name, x) for x_name, x in zip(x_names, xs)], + * [(y_name, y) for y_name, y in zip(y_names, ys)], + ]) + else: + return_batch = OrderedDict( + [(x_name, x) for x_name, x in zip(x_names, xs)]) + return return_batch, uttid_list + + def _get_from_loader(self, filepath, filetype): + """Return ndarray + + In order to make the fds to be opened only at the first referring, + the loader are stored in self._loaders + + >>> ndarray = loader.get_from_loader( + ... 'some/path.h5:F01_050C0101_PED_REAL', filetype='hdf5') + + :param: str filepath: + :param: str filetype: + :return: + :rtype: np.ndarray + """ + if filetype == "hdf5": + # e.g. + # {"input": [{"feat": "some/path.h5:F01_050C0101_PED_REAL", + # "filetype": "hdf5", + # -> filepath = "some/path.h5", key = "F01_050C0101_PED_REAL" + filepath, key = filepath.split(":", 1) + + loader = self._loaders.get(filepath) + if loader is None: + # To avoid disk access, create loader only for the first time + loader = h5py.File(filepath, "r") + self._loaders[filepath] = loader + return loader[key][()] + elif filetype == "sound.hdf5": + # e.g. + # {"input": [{"feat": "some/path.h5:F01_050C0101_PED_REAL", + # "filetype": "sound.hdf5", + # -> filepath = "some/path.h5", key = "F01_050C0101_PED_REAL" + filepath, key = filepath.split(":", 1) + + loader = self._loaders.get(filepath) + if loader is None: + # To avoid disk access, create loader only for the first time + loader = SoundHDF5File(filepath, "r", dtype="int16") + self._loaders[filepath] = loader + array, rate = loader[key] + return array + elif filetype == "sound": + # e.g. + # {"input": [{"feat": "some/path.wav", + # "filetype": "sound"}, + # Assume PCM16 + if not self.keep_all_data_on_mem: + array, _ = soundfile.read(filepath, dtype="int16") + return array + if filepath not in self._loaders: + array, _ = soundfile.read(filepath, dtype="int16") + self._loaders[filepath] = array + return self._loaders[filepath] + elif filetype == "npz": + # e.g. 
+ # {"input": [{"feat": "some/path.npz:F01_050C0101_PED_REAL", + # "filetype": "npz", + filepath, key = filepath.split(":", 1) + + loader = self._loaders.get(filepath) + if loader is None: + # To avoid disk access, create loader only for the first time + loader = np.load(filepath) + self._loaders[filepath] = loader + return loader[key] + elif filetype == "npy": + # e.g. + # {"input": [{"feat": "some/path.npy", + # "filetype": "npy"}, + if not self.keep_all_data_on_mem: + return np.load(filepath) + if filepath not in self._loaders: + self._loaders[filepath] = np.load(filepath) + return self._loaders[filepath] + elif filetype in ["mat", "vec"]: + # e.g. + # {"input": [{"feat": "some/path.ark:123", + # "filetype": "mat"}]}, + # In this case, "123" indicates the starting points of the matrix + # load_mat can load both matrix and vector + if not self.keep_all_data_on_mem: + return kaldiio.load_mat(filepath) + if filepath not in self._loaders: + self._loaders[filepath] = kaldiio.load_mat(filepath) + return self._loaders[filepath] + elif filetype == "scp": + # e.g. + # {"input": [{"feat": "some/path.scp:F01_050C0101_PED_REAL", + # "filetype": "scp", + filepath, key = filepath.split(":", 1) + loader = self._loaders.get(filepath) + if loader is None: + # To avoid disk access, create loader only for the first time + loader = kaldiio.load_scp(filepath) + self._loaders[filepath] = loader + return loader[key] + else: + raise NotImplementedError( + "Not supported: loader_type={}".format(filetype)) + + +class SoundHDF5File(): + """Collecting sound files to a HDF5 file + + >>> f = SoundHDF5File('a.flac.h5', mode='a') + >>> array = np.random.randint(0, 100, 100, dtype=np.int16) + >>> f['id'] = (array, 16000) + >>> array, rate = f['id'] + + + :param: str filepath: + :param: str mode: + :param: str format: The type used when saving wav. flac, nist, htk, etc. 
+ :param: str dtype: + + """ + + def __init__(self, + filepath, + mode="r+", + format=None, + dtype="int16", + **kwargs): + self.filepath = filepath + self.mode = mode + self.dtype = dtype + + self.file = h5py.File(filepath, mode, **kwargs) + if format is None: + # filepath = a.flac.h5 -> format = flac + second_ext = os.path.splitext(os.path.splitext(filepath)[0])[1] + format = second_ext[1:] + if format.upper() not in soundfile.available_formats(): + # If not found, flac is selected + format = "flac" + + # This format affects only saving + self.format = format + + def __repr__(self): + return ''.format( + self.filepath, self.mode, self.format, self.dtype) + + def create_dataset(self, name, shape=None, data=None, **kwds): + f = io.BytesIO() + array, rate = data + soundfile.write(f, array, rate, format=self.format) + self.file.create_dataset( + name, shape=shape, data=np.void(f.getvalue()), **kwds) + + def __setitem__(self, name, data): + self.create_dataset(name, data=data) + + def __getitem__(self, key): + data = self.file[key][()] + f = io.BytesIO(data.tobytes()) + array, rate = soundfile.read(f, dtype=self.dtype) + return array, rate + + def keys(self): + return self.file.keys() + + def values(self): + for k in self.file: + yield self[k] + + def items(self): + for k in self.file: + yield k, self[k] + + def __iter__(self): + return iter(self.file) + + def __contains__(self, item): + return item in self.file + + def __len__(self, item): + return len(self.file) + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + self.file.close() + + def close(self): + self.file.close() diff --git a/deepspeech/io/utility.py b/deepspeech/io/utility.py index 0cd37428b..99487a0af 100644 --- a/deepspeech/io/utility.py +++ b/deepspeech/io/utility.py @@ -17,11 +17,16 @@ import numpy as np from deepspeech.utils.log import Log -__all__ = ["pad_sequence"] +__all__ = ["pad_list", "pad_sequence"] logger = Log(__name__).getlog() +def pad_list(sequences: List[np.ndarray], + padding_value: float=0.0) -> np.ndarray: + return pad_sequence(sequences, True, padding_value) + + def pad_sequence(sequences: List[np.ndarray], batch_first: bool=True, padding_value: float=0.0) -> np.ndarray: diff --git a/deepspeech/models/ds2/rnn.py b/deepspeech/models/ds2/rnn.py index 01b55c4a2..0d8c9fd2c 100644 --- a/deepspeech/models/ds2/rnn.py +++ b/deepspeech/models/ds2/rnn.py @@ -297,7 +297,7 @@ class RNNStack(nn.Layer): share_weights=share_rnn_weights)) i_size = h_size * 2 - self.rnn_stacks = nn.ModuleList(rnn_stacks) + self.rnn_stacks = nn.LayerList(rnn_stacks) def forward(self, x: paddle.Tensor, x_len: paddle.Tensor): """ diff --git a/deepspeech/models/u2.py b/deepspeech/models/u2.py index f1d466a27..7ed16c9d2 100644 --- a/deepspeech/models/u2.py +++ b/deepspeech/models/u2.py @@ -54,7 +54,7 @@ __all__ = ["U2Model", "U2InferModel"] logger = Log(__name__).getlog() -class U2BaseModel(nn.Module): +class U2BaseModel(nn.Layer): """CTC-Attention hybrid Encoder-Decoder model""" @classmethod diff --git a/deepspeech/models/u2_st.py b/deepspeech/models/u2_st.py index a73f52e99..99420a89c 100644 --- a/deepspeech/models/u2_st.py +++ b/deepspeech/models/u2_st.py @@ -48,7 +48,7 @@ __all__ = ["U2STModel", "U2STInferModel"] logger = Log(__name__).getlog() -class U2STBaseModel(nn.Module): +class U2STBaseModel(nn.Layer): """CTC-Attention hybrid Encoder-Decoder model""" @classmethod diff --git a/deepspeech/modules/decoder.py b/deepspeech/modules/decoder.py index 696a6315b..87c9fa492 100644 --- 
a/deepspeech/modules/decoder.py +++ b/deepspeech/modules/decoder.py @@ -33,7 +33,7 @@ logger = Log(__name__).getlog() __all__ = ["TransformerDecoder"] -class TransformerDecoder(nn.Module): +class TransformerDecoder(nn.Layer): """Base class of Transfomer decoder module. Args: vocab_size: output dim @@ -86,7 +86,7 @@ class TransformerDecoder(nn.Module): self.use_output_layer = use_output_layer self.output_layer = nn.Linear(attention_dim, vocab_size) - self.decoders = nn.ModuleList([ + self.decoders = nn.LayerList([ DecoderLayer( size=attention_dim, self_attn=MultiHeadedAttention(attention_heads, attention_dim, diff --git a/deepspeech/modules/decoder_layer.py b/deepspeech/modules/decoder_layer.py index c6fac5412..47c42615e 100644 --- a/deepspeech/modules/decoder_layer.py +++ b/deepspeech/modules/decoder_layer.py @@ -25,15 +25,15 @@ logger = Log(__name__).getlog() __all__ = ["DecoderLayer"] -class DecoderLayer(nn.Module): +class DecoderLayer(nn.Layer): """Single decoder layer module. Args: size (int): Input dimension. - self_attn (nn.Module): Self-attention module instance. + self_attn (nn.Layer): Self-attention module instance. `MultiHeadedAttention` instance can be used as the argument. - src_attn (nn.Module): Self-attention module instance. + src_attn (nn.Layer): Self-attention module instance. `MultiHeadedAttention` instance can be used as the argument. - feed_forward (nn.Module): Feed-forward module instance. + feed_forward (nn.Layer): Feed-forward module instance. `PositionwiseFeedForward` instance can be used as the argument. dropout_rate (float): Dropout rate. normalize_before (bool): @@ -48,9 +48,9 @@ class DecoderLayer(nn.Module): def __init__( self, size: int, - self_attn: nn.Module, - src_attn: nn.Module, - feed_forward: nn.Module, + self_attn: nn.Layer, + src_attn: nn.Layer, + feed_forward: nn.Layer, dropout_rate: float, normalize_before: bool=True, concat_after: bool=False, ): diff --git a/deepspeech/modules/encoder.py b/deepspeech/modules/encoder.py index 27e0f8d78..71ec61a0e 100644 --- a/deepspeech/modules/encoder.py +++ b/deepspeech/modules/encoder.py @@ -358,7 +358,7 @@ class TransformerEncoder(BaseEncoder): pos_enc_layer_type, normalize_before, concat_after, static_chunk_size, use_dynamic_chunk, global_cmvn, use_dynamic_left_chunk) - self.encoders = nn.ModuleList([ + self.encoders = nn.LayerList([ TransformerEncoderLayer( size=output_size, self_attn=MultiHeadedAttention(attention_heads, output_size, @@ -438,7 +438,7 @@ class ConformerEncoder(BaseEncoder): convolution_layer_args = (output_size, cnn_module_kernel, activation, cnn_module_norm, causal) - self.encoders = nn.ModuleList([ + self.encoders = nn.LayerList([ ConformerEncoderLayer( size=output_size, self_attn=encoder_selfattn_layer(*encoder_selfattn_layer_args), diff --git a/deepspeech/modules/loss.py b/deepspeech/modules/loss.py index 3e441bbbc..8918ca669 100644 --- a/deepspeech/modules/loss.py +++ b/deepspeech/modules/loss.py @@ -48,7 +48,8 @@ class CTCLoss(nn.Layer): logits = logits.transpose([1, 0, 2]) # (TODO:Hui Zhang) ctc loss does not support int64 labels ys_pad = ys_pad.astype(paddle.int32) - loss = self.loss(logits, ys_pad, hlens, ys_lens) + loss = self.loss( + logits, ys_pad, hlens, ys_lens, norm_by_times=self.batch_average) if self.batch_average: # Batch-size average loss = loss / B diff --git a/deepspeech/modules/rnn.py b/deepspeech/modules/rnn.py index 01b55c4a2..0d8c9fd2c 100644 --- a/deepspeech/modules/rnn.py +++ b/deepspeech/modules/rnn.py @@ -297,7 +297,7 @@ class RNNStack(nn.Layer): 
share_weights=share_rnn_weights)) i_size = h_size * 2 - self.rnn_stacks = nn.ModuleList(rnn_stacks) + self.rnn_stacks = nn.LayerList(rnn_stacks) def forward(self, x: paddle.Tensor, x_len: paddle.Tensor): """ diff --git a/examples/librispeech/s1/README.md b/examples/librispeech/s1/README.md index daa4d175b..4cb3629de 100644 --- a/examples/librispeech/s1/README.md +++ b/examples/librispeech/s1/README.md @@ -21,7 +21,6 @@ | --- | --- | --- | --- | --- | --- | --- | --- | | conformer | 47.63 M | conf/conformer.yaml | spec_aug + shift | test-clean-all | attention | 6.35 | 0.057117 | - ## Chunk Conformer | Model | Params | Config | Augmentation| Test set | Decode method | Chunk Size & Left Chunks | Loss | WER | | --- | --- | --- | --- | --- | --- | --- | --- | --- | @@ -39,4 +38,7 @@ ### Test w/o length filter | Model | Params | Config | Augmentation| Test set | Decode method | Loss | WER | | --- | --- | --- | --- | --- | --- | --- | --- | -| transformer | 32.52 M | conf/transformer.yaml | spec_aug + shift | test-clean-all | attention | 6.98 | 0.066500 | +| transformer | 32.52 M | conf/transformer.yaml | spec_aug + shift | test-clean-all | attention | 7.63 | 0.056832 | +| transformer | 32.52 M | conf/transformer.yaml | spec_aug + shift | test-clean-all | ctc_greedy_search | 7.63 | 0.059742 | +| transformer | 32.52 M | conf/transformer.yaml | spec_aug + shift | test-clean-all | ctc_prefix_beam_search | 7.63 | 0.059057 | +| transformer | 32.52 M | conf/transformer.yaml | spec_aug + shift | test-clean-all | attention_rescoring | 7.63 | 0.047417 | diff --git a/examples/librispeech/s1/conf/transformer.yaml b/examples/librispeech/s1/conf/transformer.yaml index 8a769dca4..bc2ec6061 100644 --- a/examples/librispeech/s1/conf/transformer.yaml +++ b/examples/librispeech/s1/conf/transformer.yaml @@ -4,7 +4,7 @@ data: dev_manifest: data/manifest.dev test_manifest: data/manifest.test-clean min_input_len: 0.5 # second - max_input_len: 20.0 # second + max_input_len: 30.0 # second min_output_len: 0.0 # tokens max_output_len: 400.0 # tokens min_output_input_ratio: 0.05 diff --git a/examples/librispeech/s1/run.sh b/examples/librispeech/s1/run.sh index 2a8f2e2d1..def10ab05 100755 --- a/examples/librispeech/s1/run.sh +++ b/examples/librispeech/s1/run.sh @@ -5,7 +5,7 @@ source path.sh stage=0 stop_stage=100 conf_path=conf/transformer.yaml -avg_num=30 +avg_num=5 source ${MAIN_ROOT}/utils/parse_options.sh || exit 1; avg_ckpt=avg_${avg_num} diff --git a/examples/librispeech/s2/local/espnet_json_to_manifest.py b/examples/librispeech/s2/local/espnet_json_to_manifest.py new file mode 100755 index 000000000..acfa46681 --- /dev/null +++ b/examples/librispeech/s2/local/espnet_json_to_manifest.py @@ -0,0 +1,36 @@ +#!/usr/bin/env python +import argparse +import json + + +def main(args): + with open(args.json_file, 'r') as fin: + data_json = json.load(fin) + + # manifest format: + # {"input": [ + # {"feat": "dev/deltafalse/feats.1.ark:842920", "name": "input1", "shape": [349, 83]} + # ], + # "output": [ + # {"name": "target1", "shape": [12, 5002], "text": "NO APOLLO", "token": "▁NO ▁A PO LL O", "tokenid": "3144 482 352 269 317"} + # ], + # "utt2spk": "116-288045", + # "utt": "116-288045-0019"} + with open(args.manifest_file, 'w') as fout: + for key, value in data_json['utts'].items(): + value['utt'] = key + fout.write(json.dumps(value, ensure_ascii=False)) + fout.write("\n") + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument( + '--json-file', type=str, default=None, 
help="espnet data json file.") + parser.add_argument( + '--manifest-file', + type=str, + default='maniefst.train', + help='manifest data json line file.') + args = parser.parse_args() + main(args) diff --git a/examples/librispeech/s2/run.sh b/examples/librispeech/s2/run.sh index 2a8f2e2d1..def10ab05 100755 --- a/examples/librispeech/s2/run.sh +++ b/examples/librispeech/s2/run.sh @@ -5,7 +5,7 @@ source path.sh stage=0 stop_stage=100 conf_path=conf/transformer.yaml -avg_num=30 +avg_num=5 source ${MAIN_ROOT}/utils/parse_options.sh || exit 1; avg_ckpt=avg_${avg_num} diff --git a/requirements.txt b/requirements.txt index baaa9ba9b..af2600e0d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,6 @@ coverage gpustat +kaldiio pre-commit pybind11 resampy==0.2.2 diff --git a/speechnn/core/CMakeLists.txt b/speechnn/examples/CMakeLists.txt similarity index 100% rename from speechnn/core/CMakeLists.txt rename to speechnn/examples/CMakeLists.txt diff --git a/speechnn/core/frontend/CMakeLists.txt b/speechnn/speechnn/CMakeLists.txt similarity index 100% rename from speechnn/core/frontend/CMakeLists.txt rename to speechnn/speechnn/CMakeLists.txt diff --git a/speechnn/core/decoder/CMakeLists.txt b/speechnn/speechnn/decoder/CMakeLists.txt similarity index 100% rename from speechnn/core/decoder/CMakeLists.txt rename to speechnn/speechnn/decoder/CMakeLists.txt diff --git a/speechnn/core/frontend/audio/CMakeLists.txt b/speechnn/speechnn/frontend/CMakeLists.txt similarity index 100% rename from speechnn/core/frontend/audio/CMakeLists.txt rename to speechnn/speechnn/frontend/CMakeLists.txt diff --git a/speechnn/core/frontend/text/CMakeLists.txt b/speechnn/speechnn/frontend/audio/CMakeLists.txt similarity index 100% rename from speechnn/core/frontend/text/CMakeLists.txt rename to speechnn/speechnn/frontend/audio/CMakeLists.txt diff --git a/speechnn/core/model/CMakeLists.txt b/speechnn/speechnn/frontend/text/CMakeLists.txt similarity index 100% rename from speechnn/core/model/CMakeLists.txt rename to speechnn/speechnn/frontend/text/CMakeLists.txt diff --git a/speechnn/core/protocol/CMakeLists.txt b/speechnn/speechnn/model/CMakeLists.txt similarity index 100% rename from speechnn/core/protocol/CMakeLists.txt rename to speechnn/speechnn/model/CMakeLists.txt diff --git a/speechnn/core/utils/CMakeLists.txt b/speechnn/speechnn/nn/CMakeLists.txt similarity index 100% rename from speechnn/core/utils/CMakeLists.txt rename to speechnn/speechnn/nn/CMakeLists.txt diff --git a/speechnn/speechnn/protocol/CMakeLists.txt b/speechnn/speechnn/protocol/CMakeLists.txt new file mode 100644 index 000000000..e69de29bb diff --git a/speechnn/speechnn/utils/CMakeLists.txt b/speechnn/speechnn/utils/CMakeLists.txt new file mode 100644 index 000000000..e69de29bb