fix subsampling, label smoothing loss, remove useless

4 years ago · c607bff282
parent ffb5756787
commit c607bff282
17 changed files with 2413 additions and 1092 deletions
--- a/.notebook/compute_cmvn_loader_test.ipynb
+++ b/.notebook/compute_cmvn_loader_test.ipynb
@ -0,0 +1,409 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "purple-consequence",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "/workspace/DeepSpeech-2.x\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "'/workspace/DeepSpeech-2.x'"
+      ]
+     },
+     "execution_count": 1,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "%cd ..\n",
+    "%pwd"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "defensive-mason",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "patient-convention",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/paddle/fluid/layers/utils.py:26: DeprecationWarning: `np.int` is a deprecated alias for the builtin `int`. To silence this warning, use `int` by itself. Doing this will not modify any behavior and is safe. When replacing `np.int`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.\n",
+      "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
+      "  def convert_to_list(value, n, name, dtype=np.int):\n",
+      "[WARNING 2021/04/16 15:30:29 __init__.py:93] register user softmax to paddle, remove this when fixed!\n",
+      "2021-04-16 15:30:29,345 - WARNING - register user softmax to paddle, remove this when fixed!\n",
+      "[WARNING 2021/04/16 15:30:29 __init__.py:97] register user log_softmax to paddle, remove this when fixed!\n",
+      "2021-04-16 15:30:29,346 - WARNING - register user log_softmax to paddle, remove this when fixed!\n",
+      "[WARNING 2021/04/16 15:30:29 __init__.py:101] register user sigmoid to paddle, remove this when fixed!\n",
+      "2021-04-16 15:30:29,347 - WARNING - register user sigmoid to paddle, remove this when fixed!\n",
+      "[WARNING 2021/04/16 15:30:29 __init__.py:105] register user log_sigmoid to paddle, remove this when fixed!\n",
+      "2021-04-16 15:30:29,348 - WARNING - register user log_sigmoid to paddle, remove this when fixed!\n",
+      "[WARNING 2021/04/16 15:30:29 __init__.py:109] register user relu to paddle, remove this when fixed!\n",
+      "2021-04-16 15:30:29,349 - WARNING - register user relu to paddle, remove this when fixed!\n",
+      "[WARNING 2021/04/16 15:30:29 __init__.py:119] override cat of paddle if exists or register, remove this when fixed!\n",
+      "2021-04-16 15:30:29,349 - WARNING - override cat of paddle if exists or register, remove this when fixed!\n",
+      "[WARNING 2021/04/16 15:30:29 __init__.py:133] override item of paddle.Tensor if exists or register, remove this when fixed!\n",
+      "2021-04-16 15:30:29,350 - WARNING - override item of paddle.Tensor if exists or register, remove this when fixed!\n",
+      "[WARNING 2021/04/16 15:30:29 __init__.py:144] override long of paddle.Tensor if exists or register, remove this when fixed!\n",
+      "2021-04-16 15:30:29,351 - WARNING - override long of paddle.Tensor if exists or register, remove this when fixed!\n",
+      "[WARNING 2021/04/16 15:30:29 __init__.py:164] override new_full of paddle.Tensor if exists or register, remove this when fixed!\n",
+      "2021-04-16 15:30:29,352 - WARNING - override new_full of paddle.Tensor if exists or register, remove this when fixed!\n",
+      "[WARNING 2021/04/16 15:30:29 __init__.py:179] override eq of paddle.Tensor if exists or register, remove this when fixed!\n",
+      "2021-04-16 15:30:29,353 - WARNING - override eq of paddle.Tensor if exists or register, remove this when fixed!\n",
+      "[WARNING 2021/04/16 15:30:29 __init__.py:185] override eq of paddle if exists or register, remove this when fixed!\n",
+      "2021-04-16 15:30:29,354 - WARNING - override eq of paddle if exists or register, remove this when fixed!\n",
+      "[WARNING 2021/04/16 15:30:29 __init__.py:195] override contiguous of paddle.Tensor if exists or register, remove this when fixed!\n",
+      "2021-04-16 15:30:29,355 - WARNING - override contiguous of paddle.Tensor if exists or register, remove this when fixed!\n",
+      "[WARNING 2021/04/16 15:30:29 __init__.py:212] override size of paddle.Tensor (`to_static` do not process `size` property, maybe some `paddle` api dependent on it), remove this when fixed!\n",
+      "2021-04-16 15:30:29,356 - WARNING - override size of paddle.Tensor (`to_static` do not process `size` property, maybe some `paddle` api dependent on it), remove this when fixed!\n",
+      "[WARNING 2021/04/16 15:30:29 __init__.py:223] register user view to paddle.Tensor, remove this when fixed!\n",
+      "2021-04-16 15:30:29,357 - WARNING - register user view to paddle.Tensor, remove this when fixed!\n",
+      "[WARNING 2021/04/16 15:30:29 __init__.py:233] register user view_as to paddle.Tensor, remove this when fixed!\n",
+      "2021-04-16 15:30:29,361 - WARNING - register user view_as to paddle.Tensor, remove this when fixed!\n",
+      "[WARNING 2021/04/16 15:30:29 __init__.py:259] register user masked_fill to paddle.Tensor, remove this when fixed!\n",
+      "2021-04-16 15:30:29,362 - WARNING - register user masked_fill to paddle.Tensor, remove this when fixed!\n",
+      "[WARNING 2021/04/16 15:30:29 __init__.py:277] register user masked_fill_ to paddle.Tensor, remove this when fixed!\n",
+      "2021-04-16 15:30:29,363 - WARNING - register user masked_fill_ to paddle.Tensor, remove this when fixed!\n",
+      "[WARNING 2021/04/16 15:30:29 __init__.py:288] register user fill_ to paddle.Tensor, remove this when fixed!\n",
+      "2021-04-16 15:30:29,364 - WARNING - register user fill_ to paddle.Tensor, remove this when fixed!\n",
+      "[WARNING 2021/04/16 15:30:29 __init__.py:298] register user repeat to paddle.Tensor, remove this when fixed!\n",
+      "2021-04-16 15:30:29,365 - WARNING - register user repeat to paddle.Tensor, remove this when fixed!\n",
+      "[WARNING 2021/04/16 15:30:29 __init__.py:303] register user softmax to paddle.Tensor, remove this when fixed!\n",
+      "2021-04-16 15:30:29,366 - WARNING - register user softmax to paddle.Tensor, remove this when fixed!\n",
+      "[WARNING 2021/04/16 15:30:29 __init__.py:308] register user sigmoid to paddle.Tensor, remove this when fixed!\n",
+      "2021-04-16 15:30:29,366 - WARNING - register user sigmoid to paddle.Tensor, remove this when fixed!\n",
+      "[WARNING 2021/04/16 15:30:29 __init__.py:312] register user relu to paddle.Tensor, remove this when fixed!\n",
+      "2021-04-16 15:30:29,367 - WARNING - register user relu to paddle.Tensor, remove this when fixed!\n",
+      "[WARNING 2021/04/16 15:30:29 __init__.py:322] register user type_as to paddle.Tensor, remove this when fixed!\n",
+      "2021-04-16 15:30:29,368 - WARNING - register user type_as to paddle.Tensor, remove this when fixed!\n",
+      "[WARNING 2021/04/16 15:30:29 __init__.py:337] register user to to paddle.Tensor, remove this when fixed!\n",
+      "2021-04-16 15:30:29,369 - WARNING - register user to to paddle.Tensor, remove this when fixed!\n",
+      "[WARNING 2021/04/16 15:30:29 __init__.py:346] register user float to paddle.Tensor, remove this when fixed!\n",
+      "2021-04-16 15:30:29,370 - WARNING - register user float to paddle.Tensor, remove this when fixed!\n",
+      "[WARNING 2021/04/16 15:30:29 __init__.py:356] register user tolist to paddle.Tensor, remove this when fixed!\n",
+      "2021-04-16 15:30:29,370 - WARNING - register user tolist to paddle.Tensor, remove this when fixed!\n",
+      "[WARNING 2021/04/16 15:30:29 __init__.py:371] register user glu to paddle.nn.functional, remove this when fixed!\n",
+      "2021-04-16 15:30:29,371 - WARNING - register user glu to paddle.nn.functional, remove this when fixed!\n",
+      "[WARNING 2021/04/16 15:30:29 __init__.py:422] override ctc_loss of paddle.nn.functional if exists, remove this when fixed!\n",
+      "2021-04-16 15:30:29,372 - WARNING - override ctc_loss of paddle.nn.functional if exists, remove this when fixed!\n",
+      "[WARNING 2021/04/16 15:30:29 __init__.py:428] register user Module to paddle.nn, remove this when fixed!\n",
+      "2021-04-16 15:30:29,377 - WARNING - register user Module to paddle.nn, remove this when fixed!\n",
+      "[WARNING 2021/04/16 15:30:29 __init__.py:434] register user ModuleList to paddle.nn, remove this when fixed!\n",
+      "2021-04-16 15:30:29,378 - WARNING - register user ModuleList to paddle.nn, remove this when fixed!\n",
+      "[WARNING 2021/04/16 15:30:29 __init__.py:450] register user GLU to paddle.nn, remove this when fixed!\n",
+      "2021-04-16 15:30:29,379 - WARNING - register user GLU to paddle.nn, remove this when fixed!\n",
+      "[WARNING 2021/04/16 15:30:29 __init__.py:483] register user ConstantPad2d to paddle.nn, remove this when fixed!\n",
+      "2021-04-16 15:30:29,380 - WARNING - register user ConstantPad2d to paddle.nn, remove this when fixed!\n",
+      "[WARNING 2021/04/16 15:30:29 __init__.py:489] register user export to paddle.jit, remove this when fixed!\n",
+      "2021-04-16 15:30:29,381 - WARNING - register user export to paddle.jit, remove this when fixed!\n",
+      "/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/scipy/fftpack/__init__.py:103: DeprecationWarning: The module numpy.dual is deprecated.  Instead of using dual, use the functions directly from numpy or scipy.\n",
+      "  from numpy.dual import register_func\n",
+      "/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/scipy/special/orthogonal.py:81: DeprecationWarning: `np.int` is a deprecated alias for the builtin `int`. To silence this warning, use `int` by itself. Doing this will not modify any behavior and is safe. When replacing `np.int`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.\n",
+      "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
+      "  from numpy import (exp, inf, pi, sqrt, floor, sin, cos, around, int,\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/numba/core/types/__init__.py:108: DeprecationWarning: `np.long` is a deprecated alias for `np.compat.long`. To silence this warning, use `np.compat.long` by itself. In the likely event your code does not need to work on Python 2 you can use the builtin `int` for which `np.compat.long` is itself an alias. Doing this will not modify any behaviour and is safe. When replacing `np.long`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.\n",
+      "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
+      "  long_ = _make_signed(np.long)\n",
+      "/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/numba/core/types/__init__.py:109: DeprecationWarning: `np.long` is a deprecated alias for `np.compat.long`. To silence this warning, use `np.compat.long` by itself. In the likely event your code does not need to work on Python 2 you can use the builtin `int` for which `np.compat.long` is itself an alias. Doing this will not modify any behaviour and is safe. When replacing `np.long`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.\n",
+      "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
+      "  ulong = _make_unsigned(np.long)\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Namespace(delta_delta=False, feat_dim=13, manifest_path='examples/aishell/s1/data/manifest.train.raw', num_samples=-1, num_workers=1, output_path='data/librispeech/mean_std.npz', sample_rate=16000, specgram_type='linear', stride_ms=10.0, window_ms=20.0)\n"
+     ]
+    }
+   ],
+   "source": [
+    "import argparse\n",
+    "import functools\n",
+    "\n",
+    "from deepspeech.frontend.augmentor.augmentation import AugmentationPipeline\n",
+    "from deepspeech.frontend.featurizer.audio_featurizer import AudioFeaturizer\n",
+    "from deepspeech.frontend.normalizer import FeatureNormalizer\n",
+    "from deepspeech.utils.utility import add_arguments\n",
+    "from deepspeech.utils.utility import print_arguments\n",
+    "\n",
+    "parser = argparse.ArgumentParser(description=__doc__)\n",
+    "add_arg = functools.partial(add_arguments, argparser=parser)\n",
+    "# yapf: disable\n",
+    "add_arg('num_samples',      int,    -1,    \"# of samples to use for statistics.\")\n",
+    "add_arg('specgram_type',    str,\n",
+    "        'linear',\n",
+    "        \"Audio feature type. Options: linear, mfcc, fbank.\",\n",
+    "        choices=['linear', 'mfcc', 'fbank'])\n",
+    "add_arg('feat_dim',    int, 13, \"Audio feature dim.\")\n",
+    "add_arg('delta_delta',    bool,\n",
+    "        False,\n",
+    "        \"Audio feature with delta delta.\")\n",
+    "add_arg('stride_ms',    float, 10.0,  \"stride length in ms.\")\n",
+    "add_arg('window_ms',    float, 20.0,  \"window length in ms.\")\n",
+    "add_arg('sample_rate',    int, 16000,  \"target sample rate.\")\n",
+    "add_arg('manifest_path',    str,\n",
+    "        'examples/aishell/s1/data/manifest.train.raw',\n",
+    "        \"Filepath of manifest to compute normalizer's mean and stddev.\")\n",
+    "add_arg('num_workers',\n",
+    "                        default=1,\n",
+    "                        type=int,\n",
+    "                        help='num of subprocess workers for processing')\n",
+    "add_arg('output_path',    str,\n",
+    "        'data/librispeech/mean_std.npz',\n",
+    "        \"Filepath of write mean and stddev to (.npz).\")\n",
+    "# yapf: enable\n",
+    "args = parser.parse_args([])\n",
+    "print(args)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "enormous-currency",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/ipykernel/ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.\n",
+      "  and should_run_async(code)\n"
+     ]
+    }
+   ],
+   "source": [
+    "import random\n",
+    "\n",
+    "import numpy as np\n",
+    "import paddle\n",
+    "from paddle.io import DataLoader\n",
+    "from paddle.io import Dataset\n",
+    "\n",
+    "from deepspeech.frontend.audio import AudioSegment\n",
+    "from deepspeech.frontend.utility import load_cmvn\n",
+    "from deepspeech.frontend.utility import read_manifest\n",
+    "\n",
+    "class CollateFunc(object):\n",
+    "    ''' Collate function for AudioDataset\n",
+    "    '''\n",
+    "    def __init__(self):\n",
+    "        pass\n",
+    "       \n",
+    "    def __call__(self, batch):\n",
+    "        mean_stat = None\n",
+    "        var_stat = None\n",
+    "        number = 0\n",
+    "        for feat in batch:\n",
+    "            sums = np.sum(feat, axis=1)\n",
+    "            if mean_stat is None:\n",
+    "                mean_stat = sums\n",
+    "            else:\n",
+    "                mean_stat += sums\n",
+    "\n",
+    "            square_sums = np.sum(np.square(feat), axis=1)\n",
+    "            if var_stat is None:\n",
+    "                var_stat = square_sums\n",
+    "            else:\n",
+    "                var_stat += square_sums\n",
+    "\n",
+    "            number += feat.shape[1]\n",
+    "        return paddle.to_tensor(number), paddle.to_tensor(mean_stat), paddle.to_tensor(var_stat)\n",
+    "\n",
+    "\n",
+    "class AudioDataset(Dataset):\n",
+    "    def __init__(self, manifest_path, feature_func, num_samples=-1, rng=None):\n",
+    "        self.feature_func = feature_func\n",
+    "        self._rng = rng\n",
+    "        manifest = read_manifest(manifest_path)\n",
+    "        if num_samples == -1:\n",
+    "            sampled_manifest = manifest\n",
+    "        else:\n",
+    "            sampled_manifest = self._rng.sample(manifest, num_samples)\n",
+    "        self.items = sampled_manifest\n",
+    "\n",
+    "    def __len__(self):\n",
+    "        return len(self.items)\n",
+    "\n",
+    "    def __getitem__(self, idx):\n",
+    "        key = self.items[idx]['feat']\n",
+    "        audioseg = AudioSegment.from_file(key)\n",
+    "        feat = self.feature_func(audioseg)  #(D, T)\n",
+    "        return feat"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "armed-semester",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Exception ignored in: <function _DataLoaderIterMultiProcess.__del__ at 0x7f9c9f91c2f0>\n",
+      "Traceback (most recent call last):\n",
+      "  File \"/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/paddle/fluid/dataloader/dataloader_iter.py\", line 763, in __del__\n",
+      "    self._try_shutdown_all()\n",
+      "  File \"/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/paddle/fluid/dataloader/dataloader_iter.py\", line 590, in _try_shutdown_all\n",
+      "    w.join()\n",
+      "  File \"/usr/local/lib/python3.7/multiprocessing/process.py\", line 140, in join\n",
+      "    res = self._popen.wait(timeout)\n",
+      "  File \"/usr/local/lib/python3.7/multiprocessing/popen_fork.py\", line 48, in wait\n",
+      "    return self.poll(os.WNOHANG if timeout == 0.0 else 0)\n",
+      "  File \"/usr/local/lib/python3.7/multiprocessing/popen_fork.py\", line 28, in poll\n",
+      "    pid, sts = os.waitpid(self.pid, flag)\n",
+      "KeyboardInterrupt: \n",
+      "2021-04-16 15:44:43,413 - ERROR - DataLoader reader thread raised an exception!\n"
+     ]
+    },
+    {
+     "ename": "SystemError",
+     "evalue": "(Fatal) Blocking queue is killed because the data reader raises an exception.\n  [Hint: Expected killed_ != true, but received killed_:1 == true:1.] (at /paddle/paddle/fluid/operators/reader/blocking_queue.h:158)\n",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mSystemError\u001b[0m                               Traceback (most recent call last)",
+      "\u001b[0;32m<ipython-input-7-b5adcffc5685>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m     38\u001b[0m     \u001b[0mwav_number\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     39\u001b[0m     \u001b[0;31m#     for i, batch in enumerate(data_loader()):\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 40\u001b[0;31m     \u001b[0;32mfor\u001b[0m \u001b[0mbatch\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mdata_loader\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     41\u001b[0m         \u001b[0mnumber\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmean_stat\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvar_stat\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mbatch\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     42\u001b[0m         \u001b[0;32mif\u001b[0m \u001b[0mi\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;32m/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/paddle/fluid/dataloader/dataloader_iter.py\u001b[0m in \u001b[0;36m__next__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m    777\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    778\u001b[0m             \u001b[0;32mif\u001b[0m \u001b[0min_dygraph_mode\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 779\u001b[0;31m                 \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_reader\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread_next_var_list\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    780\u001b[0m             \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    781\u001b[0m                 \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_return_list\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;31mSystemError\u001b[0m: (Fatal) Blocking queue is killed because the data reader raises an exception.\n  [Hint: Expected killed_ != true, but received killed_:1 == true:1.] (at /paddle/paddle/fluid/operators/reader/blocking_queue.h:158)\n"
+     ]
+    }
+   ],
+   "source": [
+    "\n",
+    "augmentation_pipeline = AugmentationPipeline('{}')\n",
+    "audio_featurizer = AudioFeaturizer(\n",
+    "    specgram_type=args.specgram_type,\n",
+    "    feat_dim=args.feat_dim,\n",
+    "    delta_delta=args.delta_delta,\n",
+    "    stride_ms=args.stride_ms,\n",
+    "    window_ms=args.window_ms,\n",
+    "    n_fft=None,\n",
+    "    max_freq=None,\n",
+    "    target_sample_rate=args.sample_rate,\n",
+    "    use_dB_normalization=True,\n",
+    "    target_dB=-20)\n",
+    "\n",
+    "def augment_and_featurize(audio_segment):\n",
+    "    augmentation_pipeline.transform_audio(audio_segment)\n",
+    "    return audio_featurizer.featurize(audio_segment)\n",
+    "\n",
+    "\n",
+    "collate_func = CollateFunc()\n",
+    "\n",
+    "dataset = AudioDataset(\n",
+    "    args.manifest_path,\n",
+    "    augment_and_featurize, \n",
+    "    args.num_samples)\n",
+    "\n",
+    "batch_size = 20\n",
+    "data_loader = DataLoader(\n",
+    "    dataset,\n",
+    "    batch_size=batch_size,\n",
+    "    shuffle=False,\n",
+    "    num_workers=args.num_workers,\n",
+    "    collate_fn=collate_func)\n",
+    "\n",
+    "with paddle.no_grad():\n",
+    "    all_mean_stat = None\n",
+    "    all_var_stat = None\n",
+    "    all_number = 0\n",
+    "    wav_number = 0\n",
+    "    #     for i, batch in enumerate(data_loader()):\n",
+    "    for batch in data_loader():\n",
+    "        number, mean_stat, var_stat = batch\n",
+    "        if i == 0:\n",
+    "            all_mean_stat = mean_stat\n",
+    "            all_var_stat = var_stat\n",
+    "        else:\n",
+    "            all_mean_stat += mean_stat\n",
+    "            all_var_stat += var_stat\n",
+    "        all_number += number\n",
+    "        wav_number += batch_size\n",
+    "\n",
+    "        if wav_number % 1000 == 0:\n",
+    "            print('process {} wavs,{} frames'.format(wav_number,\n",
+    "                                                           all_number))\n",
+    "\n",
+    "cmvn_info = {\n",
+    "    'mean_stat': list(all_mean_stat.tolist()),\n",
+    "    'var_stat': list(all_var_stat.tolist()),\n",
+    "    'frame_num': all_number\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "danish-executive",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "accurate-terminal",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "dominant-abuse",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
--- a/.notebook/u2_model.ipynb
+++ b/.notebook/u2_model.ipynb
--- a/deepspeech/io/collator.py
+++ b/deepspeech/io/collator.py
@ -72,8 +72,7 @@ class SpeechCollator():
        padded_audios = pad_sequence(
            audios, padding_value=0.0).astype(np.float32)  #[B, T, D]
        audio_lens = np.array(audio_lens).astype(np.int64)
-        # (TODO:Hui Zhang) ctc loss does not support int64 labels
        padded_texts = pad_sequence(
-            texts, padding_value=IGNORE_ID).astype(np.int32)
+            texts, padding_value=IGNORE_ID).astype(np.int64)
        text_lens = np.array(text_lens).astype(np.int64)
        return padded_audios, audio_lens, padded_texts, text_lens
--- a/deepspeech/modules/loss.py
+++ b/deepspeech/modules/loss.py
@ -46,6 +46,8 @@ class CTCLoss(nn.Layer):
        # warp-ctc need activation with shape [T, B, V + 1]
        # logits: (B, L, D) -> (L, B, D)
        logits = logits.transpose([1, 0, 2])
+        # (TODO:Hui Zhang) ctc loss does not support int64 labels
+        ys_pad = ys_pad.astype(paddle.int32)
        loss = self.loss(logits, ys_pad, hlens, ys_lens)
        if self.batch_average:
            # Batch-size average
@ -123,9 +125,12 @@ class LabelSmoothingLoss(nn.Layer):
        true_dist = paddle.full_like(x, self.smoothing / (self.size - 1))
        ignore = target == self.padding_idx  # (B,)

-        #target = target * (1 - ignore)  # avoid -1 index
+        # target = target * (1 - ignore)  # avoid -1 index
        target = target.masked_fill(ignore, 0)  # avoid -1 index
-        true_dist += F.one_hot(target, self.size) * self.confidence
+        # true_dist.scatter_(1, target.unsqueeze(1), self.confidence)
+        target_mask = F.one_hot(target, self.size)
+        true_dist *= (1 - target_mask)
+        true_dist += target_mask * self.confidence

        kl = self.criterion(F.log_softmax(x, axis=1), true_dist)

--- a/deepspeech/modules/subsampling.py
+++ b/deepspeech/modules/subsampling.py
@ -104,7 +104,8 @@ class Conv2dSubsampling4(BaseSubsampling):
            nn.ReLU(),
            nn.Conv2D(odim, odim, 3, 2),
            nn.ReLU(), )
-        self.linear = nn.Linear(odim * (((idim - 1) // 2 - 1) // 2), odim)
+        self.out = nn.Sequential(
+            nn.Linear(odim * (((idim - 1) // 2 - 1) // 2), odim))
        self.subsampling_rate = 4
        # The right context for every conv layer is computed by:
        # (kernel_size - 1) / 2 * stride  * frame_rate_of_this_layer
@ -128,7 +129,7 @@ class Conv2dSubsampling4(BaseSubsampling):
        x = x.unsqueeze(1)  # (b, c=1, t, f)
        x = self.conv(x)
        b, c, t, f = paddle.shape(x)
-        x = self.linear(x.transpose([0, 1, 2, 3]).reshape([b, t, c * f]))
+        x = self.out(x.transpose([0, 2, 1, 3]).reshape([b, t, c * f]))
        x, pos_emb = self.pos_enc(x, offset)
        return x, pos_emb, x_mask[:, :, :-2:2][:, :, :-2:2]

@ -181,7 +182,7 @@ class Conv2dSubsampling6(BaseSubsampling):
        x = x.unsqueeze(1)  # (b, c, t, f)
        x = self.conv(x)
        b, c, t, f = paddle.shape(x)
-        x = self.linear(x.transpose([0, 1, 2, 3]).reshape([b, t, c * f]))
+        x = self.linear(x.transpose([0, 2, 1, 3]).reshape([b, t, c * f]))
        x, pos_emb = self.pos_enc(x, offset)
        return x, pos_emb, x_mask[:, :, :-2:2][:, :, :-4:3]

@ -233,6 +234,6 @@ class Conv2dSubsampling8(BaseSubsampling):
        """
        x = x.unsqueeze(1)  # (b, c, t, f)
        x = self.conv(x)
-        x = self.linear(x.transpose([0, 1, 2, 3]).reshape([b, t, c * f]))
+        x = self.linear(x.transpose([0, 2, 1, 3]).reshape([b, t, c * f]))
        x, pos_emb = self.pos_enc(x, offset)
        return x, pos_emb, x_mask[:, :, :-2:2][:, :, :-2:2][:, :, :-2:2]
--- a/requirements.txt
+++ b/requirements.txt
@ -1,6 +1,5 @@
 coverage
 pre-commit
-python_speech_features
 resampy==0.2.2
 scipy==1.2.1
 sentencepiece
--- a/third_party/python_kaldi_features/.gitignore
+++ b/third_party/python_kaldi_features/.gitignore
@ -0,0 +1,3 @@
+python_speech_features.egg-info/
+dist/
+build/
--- a/third_party/python_kaldi_features/build/lib/python_speech_features/init.py
+++ b/third_party/python_kaldi_features/build/lib/python_speech_features/init.py
@ -1 +0,0 @@
-from .base import *
--- a/third_party/python_kaldi_features/build/lib/python_speech_features/base.py
+++ b/third_party/python_kaldi_features/build/lib/python_speech_features/base.py
@ -1,166 +0,0 @@
-# calculate filterbank features. Provides e.g. fbank and mfcc features for use in ASR applications
-# Author: James Lyons 2012
-from __future__ import division
-import numpy
-from python_speech_features import sigproc
-from scipy.fftpack import dct
-
-def mfcc(signal,samplerate=16000,winlen=0.025,winstep=0.01,numcep=13,
-         nfilt=23,nfft=512,lowfreq=20,highfreq=None,dither=1.0,remove_dc_offset=True,preemph=0.97,
-         ceplifter=22,useEnergy=True,wintype='povey'):
-    """Compute MFCC features from an audio signal.
-
-    :param signal: the audio signal from which to compute features. Should be an N*1 array
-    :param samplerate: the samplerate of the signal we are working with.
-    :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
-    :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
-    :param numcep: the number of cepstrum to return, default 13
-    :param nfilt: the number of filters in the filterbank, default 26.
-    :param nfft: the FFT size. Default is 512.
-    :param lowfreq: lowest band edge of mel filters. In Hz, default is 0.
-    :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
-    :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
-    :param ceplifter: apply a lifter to final cepstral coefficients. 0 is no lifter. Default is 22.
-    :param appendEnergy: if this is true, the zeroth cepstral coefficient is replaced with the log of the total frame energy.
-    :param winfunc: the analysis window to apply to each frame. By default no window is applied. You can use numpy window functions here e.g. winfunc=numpy.hamming
-    :returns: A numpy array of size (NUMFRAMES by numcep) containing features. Each row holds 1 feature vector.
-    """
-    feat,energy = fbank(signal,samplerate,winlen,winstep,nfilt,nfft,lowfreq,highfreq,dither,remove_dc_offset,preemph,wintype)
-    feat = numpy.log(feat)
-    feat = dct(feat, type=2, axis=1, norm='ortho')[:,:numcep]
-    feat = lifter(feat,ceplifter)
-    if useEnergy: feat[:,0] = numpy.log(energy) # replace first cepstral coefficient with log of frame energy
-    return feat
-
-def fbank(signal,samplerate=16000,winlen=0.025,winstep=0.01,
-          nfilt=40,nfft=512,lowfreq=0,highfreq=None,dither=1.0,remove_dc_offset=True, preemph=0.97, 
-          wintype='hamming'):
-    """Compute Mel-filterbank energy features from an audio signal.
-
-    :param signal: the audio signal from which to compute features. Should be an N*1 array
-    :param samplerate: the samplerate of the signal we are working with.
-    :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
-    :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
-    :param nfilt: the number of filters in the filterbank, default 26.
-    :param nfft: the FFT size. Default is 512.
-    :param lowfreq: lowest band edge of mel filters. In Hz, default is 0.
-    :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
-    :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
-    :param winfunc: the analysis window to apply to each frame. By default no window is applied. You can use numpy window functions here e.g. winfunc=numpy.hamming
-     winfunc=lambda x:numpy.ones((x,))   
-    :returns: 2 values. The first is a numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector. The
-        second return value is the energy in each frame (total energy, unwindowed)
-    """
-    highfreq= highfreq or samplerate/2
-    frames,raw_frames = sigproc.framesig(signal, winlen*samplerate, winstep*samplerate, dither, preemph, remove_dc_offset, wintype)
-    pspec = sigproc.powspec(frames,nfft) # nearly the same until this part
-    energy = numpy.sum(raw_frames**2,1) # this stores the raw energy in each frame
-    energy = numpy.where(energy == 0,numpy.finfo(float).eps,energy) # if energy is zero, we get problems with log
-
-    fb = get_filterbanks(nfilt,nfft,samplerate,lowfreq,highfreq)
-    feat = numpy.dot(pspec,fb.T) # compute the filterbank energies
-    feat = numpy.where(feat == 0,numpy.finfo(float).eps,feat) # if feat is zero, we get problems with log
-
-    return feat,energy
-
-def logfbank(signal,samplerate=16000,winlen=0.025,winstep=0.01,
-          nfilt=40,nfft=512,lowfreq=64,highfreq=None,dither=1.0,remove_dc_offset=True,preemph=0.97,wintype='hamming'):
-    """Compute log Mel-filterbank energy features from an audio signal.
-
-    :param signal: the audio signal from which to compute features. Should be an N*1 array
-    :param samplerate: the samplerate of the signal we are working with.
-    :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
-    :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
-    :param nfilt: the number of filters in the filterbank, default 26.
-    :param nfft: the FFT size. Default is 512.
-    :param lowfreq: lowest band edge of mel filters. In Hz, default is 0.
-    :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
-    :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
-    :returns: A numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector.
-    """
-    feat,energy = fbank(signal,samplerate,winlen,winstep,nfilt,nfft,lowfreq,highfreq,dither, remove_dc_offset,preemph,wintype)
-    return numpy.log(feat)
-
-def hz2mel(hz):
-    """Convert a value in Hertz to Mels
-
-    :param hz: a value in Hz. This can also be a numpy array, conversion proceeds element-wise.
-    :returns: a value in Mels. If an array was passed in, an identical sized array is returned.
-    """
-    return 1127 * numpy.log(1+hz/700.0)
-
-
-def mel2hz(mel):
-    """Convert a value in Mels to Hertz
-
-    :param mel: a value in Mels. This can also be a numpy array, conversion proceeds element-wise.
-    :returns: a value in Hertz. If an array was passed in, an identical sized array is returned.
-    """
-    return 700 * (numpy.exp(mel/1127.0)-1)
-
-def get_filterbanks(nfilt=26,nfft=512,samplerate=16000,lowfreq=0,highfreq=None):
-    """Compute a Mel-filterbank. The filters are stored in the rows, the columns correspond
-    to fft bins. The filters are returned as an array of size nfilt * (nfft/2 + 1)
-
-    :param nfilt: the number of filters in the filterbank, default 20.
-    :param nfft: the FFT size. Default is 512.
-    :param samplerate: the samplerate of the signal we are working with. Affects mel spacing.
-    :param lowfreq: lowest band edge of mel filters, default 0 Hz
-    :param highfreq: highest band edge of mel filters, default samplerate/2
-    :returns: A numpy array of size nfilt * (nfft/2 + 1) containing filterbank. Each row holds 1 filter.
-    """
-    highfreq= highfreq or samplerate/2
-    assert highfreq <= samplerate/2, "highfreq is greater than samplerate/2"
-
-    # compute points evenly spaced in mels
-    lowmel = hz2mel(lowfreq)
-    highmel = hz2mel(highfreq)
-
-    # check kaldi/src/feat/Mel-computations.h    
-    fbank = numpy.zeros([nfilt,nfft//2+1])
-    mel_freq_delta = (highmel-lowmel)/(nfilt+1)
-    for j in range(0,nfilt):
-        leftmel = lowmel+j*mel_freq_delta
-        centermel = lowmel+(j+1)*mel_freq_delta
-        rightmel = lowmel+(j+2)*mel_freq_delta
-        for i in range(0,nfft//2):
-            mel=hz2mel(i*samplerate/nfft)
-            if mel>leftmel and mel<rightmel:
-                if mel<centermel:
-                    fbank[j,i]=(mel-leftmel)/(centermel-leftmel)
-                else:
-                    fbank[j,i]=(rightmel-mel)/(rightmel-centermel)
-    return fbank
-
-def lifter(cepstra, L=22):
-    """Apply a cepstral lifter the the matrix of cepstra. This has the effect of increasing the
-    magnitude of the high frequency DCT coeffs.
-
-    :param cepstra: the matrix of mel-cepstra, will be numframes * numcep in size.
-    :param L: the liftering coefficient to use. Default is 22. L <= 0 disables lifter.
-    """
-    if L > 0:
-        nframes,ncoeff = numpy.shape(cepstra)
-        n = numpy.arange(ncoeff)
-        lift = 1 + (L/2.)*numpy.sin(numpy.pi*n/L)
-        return lift*cepstra
-    else:
-        # values of L <= 0, do nothing
-        return cepstra
-
-def delta(feat, N):
-    """Compute delta features from a feature vector sequence.
-
-    :param feat: A numpy array of size (NUMFRAMES by number of features) containing features. Each row holds 1 feature vector.
-    :param N: For each frame, calculate delta features based on preceding and following N frames
-    :returns: A numpy array of size (NUMFRAMES by number of features) containing delta features. Each row holds 1 delta feature vector.
-    """
-    if N < 1:
-        raise ValueError('N must be an integer >= 1')
-    NUMFRAMES = len(feat)
-    denominator = 2 * sum([i**2 for i in range(1, N+1)])
-    delta_feat = numpy.empty_like(feat)
-    padded = numpy.pad(feat, ((N, N), (0, 0)), mode='edge')   # padded version of feat
-    for t in range(NUMFRAMES):
-        delta_feat[t] = numpy.dot(numpy.arange(-N, N+1), padded[t : t+2*N+1]) / denominator   # [t : t+2*N+1] == [(N+t)-N : (N+t)+N+1]
-    return delta_feat
--- a/third_party/python_kaldi_features/build/lib/python_speech_features/base_orig.py
+++ b/third_party/python_kaldi_features/build/lib/python_speech_features/base_orig.py
@ -1,190 +0,0 @@
-# calculate filterbank features. Provides e.g. fbank and mfcc features for use in ASR applications
-# Author: James Lyons 2012
-from __future__ import division
-import numpy
-from python_speech_features import sigproc
-from scipy.fftpack import dct
-
-def mfcc(signal,samplerate=16000,winlen=0.025,winstep=0.01,numcep=13,
-         nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97,ceplifter=22,appendEnergy=True,
-         winfunc=lambda x:numpy.ones((x,))):
-    """Compute MFCC features from an audio signal.
-
-    :param signal: the audio signal from which to compute features. Should be an N*1 array
-    :param samplerate: the samplerate of the signal we are working with.
-    :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
-    :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
-    :param numcep: the number of cepstrum to return, default 13
-    :param nfilt: the number of filters in the filterbank, default 26.
-    :param nfft: the FFT size. Default is 512.
-    :param lowfreq: lowest band edge of mel filters. In Hz, default is 0.
-    :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
-    :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
-    :param ceplifter: apply a lifter to final cepstral coefficients. 0 is no lifter. Default is 22.
-    :param appendEnergy: if this is true, the zeroth cepstral coefficient is replaced with the log of the total frame energy.
-    :param winfunc: the analysis window to apply to each frame. By default no window is applied. You can use numpy window functions here e.g. winfunc=numpy.hamming
-    :returns: A numpy array of size (NUMFRAMES by numcep) containing features. Each row holds 1 feature vector.
-    """
-    feat,energy = fbank(signal,samplerate,winlen,winstep,nfilt,nfft,lowfreq,highfreq,preemph,winfunc)
-    feat = numpy.log(feat)
-    feat = dct(feat, type=2, axis=1, norm='ortho')[:,:numcep]
-    feat = lifter(feat,ceplifter)
-    if appendEnergy: feat[:,0] = numpy.log(energy) # replace first cepstral coefficient with log of frame energy
-    return feat
-
-def fbank(signal,samplerate=16000,winlen=0.025,winstep=0.01,
-          nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97,
-          winfunc=lambda x:numpy.ones((x,))):
-    """Compute Mel-filterbank energy features from an audio signal.
-
-    :param signal: the audio signal from which to compute features. Should be an N*1 array
-    :param samplerate: the samplerate of the signal we are working with.
-    :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
-    :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
-    :param nfilt: the number of filters in the filterbank, default 26.
-    :param nfft: the FFT size. Default is 512.
-    :param lowfreq: lowest band edge of mel filters. In Hz, default is 0.
-    :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
-    :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
-    :param winfunc: the analysis window to apply to each frame. By default no window is applied. You can use numpy window functions here e.g. winfunc=numpy.hamming
-    :returns: 2 values. The first is a numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector. The
-        second return value is the energy in each frame (total energy, unwindowed)
-    """
-    highfreq= highfreq or samplerate/2
-    signal = sigproc.preemphasis(signal,preemph)
-    frames = sigproc.framesig(signal, winlen*samplerate, winstep*samplerate, winfunc)
-    pspec = sigproc.powspec(frames,nfft)
-    energy = numpy.sum(pspec,1) # this stores the total energy in each frame
-    energy = numpy.where(energy == 0,numpy.finfo(float).eps,energy) # if energy is zero, we get problems with log
-
-    fb = get_filterbanks(nfilt,nfft,samplerate,lowfreq,highfreq)
-    feat = numpy.dot(pspec,fb.T) # compute the filterbank energies
-    feat = numpy.where(feat == 0,numpy.finfo(float).eps,feat) # if feat is zero, we get problems with log
-
-    return feat,energy
-
-def logfbank(signal,samplerate=16000,winlen=0.025,winstep=0.01,
-          nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97):
-    """Compute log Mel-filterbank energy features from an audio signal.
-
-    :param signal: the audio signal from which to compute features. Should be an N*1 array
-    :param samplerate: the samplerate of the signal we are working with.
-    :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
-    :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
-    :param nfilt: the number of filters in the filterbank, default 26.
-    :param nfft: the FFT size. Default is 512.
-    :param lowfreq: lowest band edge of mel filters. In Hz, default is 0.
-    :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
-    :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
-    :returns: A numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector.
-    """
-    feat,energy = fbank(signal,samplerate,winlen,winstep,nfilt,nfft,lowfreq,highfreq,preemph)
-    return numpy.log(feat)
-
-def ssc(signal,samplerate=16000,winlen=0.025,winstep=0.01,
-        nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97,
-        winfunc=lambda x:numpy.ones((x,))):
-    """Compute Spectral Subband Centroid features from an audio signal.
-
-    :param signal: the audio signal from which to compute features. Should be an N*1 array
-    :param samplerate: the samplerate of the signal we are working with.
-    :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
-    :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
-    :param nfilt: the number of filters in the filterbank, default 26.
-    :param nfft: the FFT size. Default is 512.
-    :param lowfreq: lowest band edge of mel filters. In Hz, default is 0.
-    :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
-    :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
-    :param winfunc: the analysis window to apply to each frame. By default no window is applied. You can use numpy window functions here e.g. winfunc=numpy.hamming
-    :returns: A numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector.
-    """
-    highfreq= highfreq or samplerate/2
-    signal = sigproc.preemphasis(signal,preemph)
-    frames = sigproc.framesig(signal, winlen*samplerate, winstep*samplerate, winfunc)
-    pspec = sigproc.powspec(frames,nfft)
-    pspec = numpy.where(pspec == 0,numpy.finfo(float).eps,pspec) # if things are all zeros we get problems
-
-    fb = get_filterbanks(nfilt,nfft,samplerate,lowfreq,highfreq)
-    feat = numpy.dot(pspec,fb.T) # compute the filterbank energies
-    R = numpy.tile(numpy.linspace(1,samplerate/2,numpy.size(pspec,1)),(numpy.size(pspec,0),1))
-
-    return numpy.dot(pspec*R,fb.T) / feat
-
-def hz2mel(hz):
-    """Convert a value in Hertz to Mels
-
-    :param hz: a value in Hz. This can also be a numpy array, conversion proceeds element-wise.
-    :returns: a value in Mels. If an array was passed in, an identical sized array is returned.
-    """
-    return 2595 * numpy.log10(1+hz/700.)
-
-def mel2hz(mel):
-    """Convert a value in Mels to Hertz
-
-    :param mel: a value in Mels. This can also be a numpy array, conversion proceeds element-wise.
-    :returns: a value in Hertz. If an array was passed in, an identical sized array is returned.
-    """
-    return 700*(10**(mel/2595.0)-1)
-
-def get_filterbanks(nfilt=20,nfft=512,samplerate=16000,lowfreq=0,highfreq=None):
-    """Compute a Mel-filterbank. The filters are stored in the rows, the columns correspond
-    to fft bins. The filters are returned as an array of size nfilt * (nfft/2 + 1)
-
-    :param nfilt: the number of filters in the filterbank, default 20.
-    :param nfft: the FFT size. Default is 512.
-    :param samplerate: the samplerate of the signal we are working with. Affects mel spacing.
-    :param lowfreq: lowest band edge of mel filters, default 0 Hz
-    :param highfreq: highest band edge of mel filters, default samplerate/2
-    :returns: A numpy array of size nfilt * (nfft/2 + 1) containing filterbank. Each row holds 1 filter.
-    """
-    highfreq= highfreq or samplerate/2
-    assert highfreq <= samplerate/2, "highfreq is greater than samplerate/2"
-
-    # compute points evenly spaced in mels
-    lowmel = hz2mel(lowfreq)
-    highmel = hz2mel(highfreq)
-    melpoints = numpy.linspace(lowmel,highmel,nfilt+2)
-    # our points are in Hz, but we use fft bins, so we have to convert
-    #  from Hz to fft bin number
-    bin = numpy.floor((nfft+1)*mel2hz(melpoints)/samplerate)
-
-    fbank = numpy.zeros([nfilt,nfft//2+1])
-    for j in range(0,nfilt):
-        for i in range(int(bin[j]), int(bin[j+1])):
-            fbank[j,i] = (i - bin[j]) / (bin[j+1]-bin[j])
-        for i in range(int(bin[j+1]), int(bin[j+2])):
-            fbank[j,i] = (bin[j+2]-i) / (bin[j+2]-bin[j+1])
-    return fbank
-
-def lifter(cepstra, L=22):
-    """Apply a cepstral lifter the the matrix of cepstra. This has the effect of increasing the
-    magnitude of the high frequency DCT coeffs.
-
-    :param cepstra: the matrix of mel-cepstra, will be numframes * numcep in size.
-    :param L: the liftering coefficient to use. Default is 22. L <= 0 disables lifter.
-    """
-    if L > 0:
-        nframes,ncoeff = numpy.shape(cepstra)
-        n = numpy.arange(ncoeff)
-        lift = 1 + (L/2.)*numpy.sin(numpy.pi*n/L)
-        return lift*cepstra
-    else:
-        # values of L <= 0, do nothing
-        return cepstra
-
-def delta(feat, N):
-    """Compute delta features from a feature vector sequence.
-
-    :param feat: A numpy array of size (NUMFRAMES by number of features) containing features. Each row holds 1 feature vector.
-    :param N: For each frame, calculate delta features based on preceding and following N frames
-    :returns: A numpy array of size (NUMFRAMES by number of features) containing delta features. Each row holds 1 delta feature vector.
-    """
-    if N < 1:
-        raise ValueError('N must be an integer >= 1')
-    NUMFRAMES = len(feat)
-    denominator = 2 * sum([i**2 for i in range(1, N+1)])
-    delta_feat = numpy.empty_like(feat)
-    padded = numpy.pad(feat, ((N, N), (0, 0)), mode='edge')   # padded version of feat
-    for t in range(NUMFRAMES):
-        delta_feat[t] = numpy.dot(numpy.arange(-N, N+1), padded[t : t+2*N+1]) / denominator   # [t : t+2*N+1] == [(N+t)-N : (N+t)+N+1]
-    return delta_feat
--- a/third_party/python_kaldi_features/build/lib/python_speech_features/sigproc.py
+++ b/third_party/python_kaldi_features/build/lib/python_speech_features/sigproc.py
@ -1,158 +0,0 @@
-# This file includes routines for basic signal processing including framing and computing power spectra.
-# Author: James Lyons 2012
-import decimal
-
-import numpy
-import math
-import logging
-
-
-def round_half_up(number):
-    return int(decimal.Decimal(number).quantize(decimal.Decimal('1'), rounding=decimal.ROUND_HALF_UP))
-
-
-def rolling_window(a, window, step=1):
-    # http://ellisvalentiner.com/post/2017-03-21-np-strides-trick
-    shape = a.shape[:-1] + (a.shape[-1] - window + 1, window)
-    strides = a.strides + (a.strides[-1],)
-    return numpy.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)[::step]
-
-
-def framesig(sig, frame_len, frame_step, dither=1.0, preemph=0.97, remove_dc_offset=True, wintype='hamming', stride_trick=True):
-    """Frame a signal into overlapping frames.
-
-    :param sig: the audio signal to frame.
-    :param frame_len: length of each frame measured in samples.
-    :param frame_step: number of samples after the start of the previous frame that the next frame should begin.
-    :param winfunc: the analysis window to apply to each frame. By default no window is applied.
-    :param stride_trick: use stride trick to compute the rolling window and window multiplication faster
-    :returns: an array of frames. Size is NUMFRAMES by frame_len.
-    """
-    slen = len(sig)
-    frame_len = int(round_half_up(frame_len))
-    frame_step = int(round_half_up(frame_step))
-    if slen <= frame_len:
-        numframes = 1
-    else:
-        numframes = 1 + (( slen - frame_len) // frame_step)
-
-    # check kaldi/src/feat/feature-window.h
-    padsignal = sig[:(numframes-1)*frame_step+frame_len]
-    if wintype is 'povey':
-        win = numpy.empty(frame_len)
-        for i in range(frame_len):
-            win[i] = (0.5-0.5*numpy.cos(2*numpy.pi/(frame_len-1)*i))**0.85     
-    else: # the hamming window
-        win = numpy.hamming(frame_len)
-        
-    if stride_trick:
-        frames = rolling_window(padsignal, window=frame_len, step=frame_step)
-    else:
-        indices = numpy.tile(numpy.arange(0, frame_len), (numframes, 1)) + numpy.tile(
-            numpy.arange(0, numframes * frame_step, frame_step), (frame_len, 1)).T
-        indices = numpy.array(indices, dtype=numpy.int32)
-        frames = padsignal[indices]
-        win = numpy.tile(win, (numframes, 1))
-        
-    frames = frames.astype(numpy.float32)
-    raw_frames = numpy.zeros(frames.shape)
-    for frm in range(frames.shape[0]):
-        frames[frm,:] = do_dither(frames[frm,:], dither)        # dither
-        frames[frm,:] = do_remove_dc_offset(frames[frm,:])      # remove dc offset
-        raw_frames[frm,:] = frames[frm,:]
-        frames[frm,:] = do_preemphasis(frames[frm,:], preemph)    # preemphasize
-
-    return frames * win, raw_frames
-
-def deframesig(frames, siglen, frame_len, frame_step, winfunc=lambda x: numpy.ones((x,))):
-    """Does overlap-add procedure to undo the action of framesig.
-
-    :param frames: the array of frames.
-    :param siglen: the length of the desired signal, use 0 if unknown. Output will be truncated to siglen samples.
-    :param frame_len: length of each frame measured in samples.
-    :param frame_step: number of samples after the start of the previous frame that the next frame should begin.
-    :param winfunc: the analysis window to apply to each frame. By default no window is applied.
-    :returns: a 1-D signal.
-    """
-    frame_len = round_half_up(frame_len)
-    frame_step = round_half_up(frame_step)
-    numframes = numpy.shape(frames)[0]
-    assert numpy.shape(frames)[1] == frame_len, '"frames" matrix is wrong size, 2nd dim is not equal to frame_len'
-
-    indices = numpy.tile(numpy.arange(0, frame_len), (numframes, 1)) + numpy.tile(
-        numpy.arange(0, numframes * frame_step, frame_step), (frame_len, 1)).T
-    indices = numpy.array(indices, dtype=numpy.int32)
-    padlen = (numframes - 1) * frame_step + frame_len
-
-    if siglen <= 0: siglen = padlen
-
-    rec_signal = numpy.zeros((padlen,))
-    window_correction = numpy.zeros((padlen,))
-    win = winfunc(frame_len)
-
-    for i in range(0, numframes):
-        window_correction[indices[i, :]] = window_correction[
-                                               indices[i, :]] + win + 1e-15  # add a little bit so it is never zero
-        rec_signal[indices[i, :]] = rec_signal[indices[i, :]] + frames[i, :]
-
-    rec_signal = rec_signal / window_correction
-    return rec_signal[0:siglen]
-
-
-def magspec(frames, NFFT):
-    """Compute the magnitude spectrum of each frame in frames. If frames is an NxD matrix, output will be Nx(NFFT/2+1).
-
-    :param frames: the array of frames. Each row is a frame.
-    :param NFFT: the FFT length to use. If NFFT > frame_len, the frames are zero-padded.
-    :returns: If frames is an NxD matrix, output will be Nx(NFFT/2+1). Each row will be the magnitude spectrum of the corresponding frame.
-    """
-    if numpy.shape(frames)[1] > NFFT:
-        logging.warn(
-            'frame length (%d) is greater than FFT size (%d), frame will be truncated. Increase NFFT to avoid.',
-            numpy.shape(frames)[1], NFFT)
-    complex_spec = numpy.fft.rfft(frames, NFFT)
-    return numpy.absolute(complex_spec)
-
-
-def powspec(frames, NFFT):
-    """Compute the power spectrum of each frame in frames. If frames is an NxD matrix, output will be Nx(NFFT/2+1).
-
-    :param frames: the array of frames. Each row is a frame.
-    :param NFFT: the FFT length to use. If NFFT > frame_len, the frames are zero-padded.
-    :returns: If frames is an NxD matrix, output will be Nx(NFFT/2+1). Each row will be the power spectrum of the corresponding frame.
-    """
-    return numpy.square(magspec(frames, NFFT))
-
-
-def logpowspec(frames, NFFT, norm=1):
-    """Compute the log power spectrum of each frame in frames. If frames is an NxD matrix, output will be Nx(NFFT/2+1).
-
-    :param frames: the array of frames. Each row is a frame.
-    :param NFFT: the FFT length to use. If NFFT > frame_len, the frames are zero-padded.
-    :param norm: If norm=1, the log power spectrum is normalised so that the max value (across all frames) is 0.
-    :returns: If frames is an NxD matrix, output will be Nx(NFFT/2+1). Each row will be the log power spectrum of the corresponding frame.
-    """
-    ps = powspec(frames, NFFT);
-    ps[ps <= 1e-30] = 1e-30
-    lps = 10 * numpy.log10(ps)
-    if norm:
-        return lps - numpy.max(lps)
-    else:
-        return lps
-
-def do_dither(signal, dither_value=1.0):
-    signal += numpy.random.normal(size=signal.shape) * dither_value
-    return signal
-    
-def do_remove_dc_offset(signal):
-    signal -= numpy.mean(signal)
-    return signal
-
-def do_preemphasis(signal, coeff=0.97):
-    """perform preemphasis on the input signal.
-
-    :param signal: The signal to filter.
-    :param coeff: The preemphasis coefficient. 0 is no filter, default is 0.95.
-    :returns: the filtered signal.
-    """
-    return numpy.append((1-coeff)*signal[0], signal[1:] - coeff * signal[:-1])
--- a/third_party/python_kaldi_features/build/lib/python_speech_features/sigproc_orig.py
+++ b/third_party/python_kaldi_features/build/lib/python_speech_features/sigproc_orig.py
@ -1,140 +0,0 @@
-# This file includes routines for basic signal processing including framing and computing power spectra.
-# Author: James Lyons 2012
-import decimal
-
-import numpy
-import math
-import logging
-
-
-def round_half_up(number):
-    return int(decimal.Decimal(number).quantize(decimal.Decimal('1'), rounding=decimal.ROUND_HALF_UP))
-
-
-def rolling_window(a, window, step=1):
-    # http://ellisvalentiner.com/post/2017-03-21-np-strides-trick
-    shape = a.shape[:-1] + (a.shape[-1] - window + 1, window)
-    strides = a.strides + (a.strides[-1],)
-    return numpy.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)[::step]
-
-
-def framesig(sig, frame_len, frame_step, winfunc=lambda x: numpy.ones((x,)), stride_trick=True):
-    """Frame a signal into overlapping frames.
-
-    :param sig: the audio signal to frame.
-    :param frame_len: length of each frame measured in samples.
-    :param frame_step: number of samples after the start of the previous frame that the next frame should begin.
-    :param winfunc: the analysis window to apply to each frame. By default no window is applied.
-    :param stride_trick: use stride trick to compute the rolling window and window multiplication faster
-    :returns: an array of frames. Size is NUMFRAMES by frame_len.
-    """
-    slen = len(sig)
-    frame_len = int(round_half_up(frame_len))
-    frame_step = int(round_half_up(frame_step))
-    if slen <= frame_len:
-        numframes = 1
-    else:
-        numframes = 1 + int(math.ceil((1.0 * slen - frame_len) / frame_step))
-
-    padlen = int((numframes - 1) * frame_step + frame_len)
-
-    zeros = numpy.zeros((padlen - slen,))
-    padsignal = numpy.concatenate((sig, zeros))
-    if stride_trick:
-        win = winfunc(frame_len)
-        frames = rolling_window(padsignal, window=frame_len, step=frame_step)
-    else:
-        indices = numpy.tile(numpy.arange(0, frame_len), (numframes, 1)) + numpy.tile(
-            numpy.arange(0, numframes * frame_step, frame_step), (frame_len, 1)).T
-        indices = numpy.array(indices, dtype=numpy.int32)
-        frames = padsignal[indices]
-        win = numpy.tile(winfunc(frame_len), (numframes, 1))
-
-    return frames * win
-
-
-def deframesig(frames, siglen, frame_len, frame_step, winfunc=lambda x: numpy.ones((x,))):
-    """Does overlap-add procedure to undo the action of framesig.
-
-    :param frames: the array of frames.
-    :param siglen: the length of the desired signal, use 0 if unknown. Output will be truncated to siglen samples.
-    :param frame_len: length of each frame measured in samples.
-    :param frame_step: number of samples after the start of the previous frame that the next frame should begin.
-    :param winfunc: the analysis window to apply to each frame. By default no window is applied.
-    :returns: a 1-D signal.
-    """
-    frame_len = round_half_up(frame_len)
-    frame_step = round_half_up(frame_step)
-    numframes = numpy.shape(frames)[0]
-    assert numpy.shape(frames)[1] == frame_len, '"frames" matrix is wrong size, 2nd dim is not equal to frame_len'
-
-    indices = numpy.tile(numpy.arange(0, frame_len), (numframes, 1)) + numpy.tile(
-        numpy.arange(0, numframes * frame_step, frame_step), (frame_len, 1)).T
-    indices = numpy.array(indices, dtype=numpy.int32)
-    padlen = (numframes - 1) * frame_step + frame_len
-
-    if siglen <= 0: siglen = padlen
-
-    rec_signal = numpy.zeros((padlen,))
-    window_correction = numpy.zeros((padlen,))
-    win = winfunc(frame_len)
-
-    for i in range(0, numframes):
-        window_correction[indices[i, :]] = window_correction[
-                                               indices[i, :]] + win + 1e-15  # add a little bit so it is never zero
-        rec_signal[indices[i, :]] = rec_signal[indices[i, :]] + frames[i, :]
-
-    rec_signal = rec_signal / window_correction
-    return rec_signal[0:siglen]
-
-
-def magspec(frames, NFFT):
-    """Compute the magnitude spectrum of each frame in frames. If frames is an NxD matrix, output will be Nx(NFFT/2+1).
-
-    :param frames: the array of frames. Each row is a frame.
-    :param NFFT: the FFT length to use. If NFFT > frame_len, the frames are zero-padded.
-    :returns: If frames is an NxD matrix, output will be Nx(NFFT/2+1). Each row will be the magnitude spectrum of the corresponding frame.
-    """
-    if numpy.shape(frames)[1] > NFFT:
-        logging.warn(
-            'frame length (%d) is greater than FFT size (%d), frame will be truncated. Increase NFFT to avoid.',
-            numpy.shape(frames)[1], NFFT)
-    complex_spec = numpy.fft.rfft(frames, NFFT)
-    return numpy.absolute(complex_spec)
-
-
-def powspec(frames, NFFT):
-    """Compute the power spectrum of each frame in frames. If frames is an NxD matrix, output will be Nx(NFFT/2+1).
-
-    :param frames: the array of frames. Each row is a frame.
-    :param NFFT: the FFT length to use. If NFFT > frame_len, the frames are zero-padded.
-    :returns: If frames is an NxD matrix, output will be Nx(NFFT/2+1). Each row will be the power spectrum of the corresponding frame.
-    """
-    return 1.0 / NFFT * numpy.square(magspec(frames, NFFT))
-
-
-def logpowspec(frames, NFFT, norm=1):
-    """Compute the log power spectrum of each frame in frames. If frames is an NxD matrix, output will be Nx(NFFT/2+1).
-
-    :param frames: the array of frames. Each row is a frame.
-    :param NFFT: the FFT length to use. If NFFT > frame_len, the frames are zero-padded.
-    :param norm: If norm=1, the log power spectrum is normalised so that the max value (across all frames) is 0.
-    :returns: If frames is an NxD matrix, output will be Nx(NFFT/2+1). Each row will be the log power spectrum of the corresponding frame.
-    """
-    ps = powspec(frames, NFFT);
-    ps[ps <= 1e-30] = 1e-30
-    lps = 10 * numpy.log10(ps)
-    if norm:
-        return lps - numpy.max(lps)
-    else:
-        return lps
-
-
-def preemphasis(signal, coeff=0.95):
-    """perform preemphasis on the input signal.
-
-    :param signal: The signal to filter.
-    :param coeff: The preemphasis coefficient. 0 is no filter, default is 0.95.
-    :returns: the filtered signal.
-    """
-    return numpy.append(signal[0], signal[1:] - coeff * signal[:-1])
--- a/third_party/python_kaldi_features/dist/python_speech_features-0.6-py3.7.egg
+++ b/third_party/python_kaldi_features/dist/python_speech_features-0.6-py3.7.egg
--- a/third_party/python_kaldi_features/python_speech_features.egg-info/PKG-INFO
+++ b/third_party/python_kaldi_features/python_speech_features.egg-info/PKG-INFO
@ -1,10 +0,0 @@
-Metadata-Version: 1.0
-Name: python-speech-features
-Version: 0.6
-Summary: Python Speech Feature extraction
-Home-page: https://github.com/jameslyons/python_speech_features
-Author: James Lyons
-Author-email: james.lyons0@gmail.com
-License: MIT
-Description: UNKNOWN
-Platform: UNKNOWN
--- a/third_party/python_kaldi_features/python_speech_features.egg-info/SOURCES.txt
+++ b/third_party/python_kaldi_features/python_speech_features.egg-info/SOURCES.txt
@ -1,12 +0,0 @@
-README.rst
-setup.py
-python_speech_features/__init__.py
-python_speech_features/base.py
-python_speech_features/base_orig.py
-python_speech_features/sigproc.py
-python_speech_features/sigproc_orig.py
-python_speech_features.egg-info/PKG-INFO
-python_speech_features.egg-info/SOURCES.txt
-python_speech_features.egg-info/dependency_links.txt
-python_speech_features.egg-info/top_level.txt
-test/test_sigproc.py
--- a/third_party/python_kaldi_features/python_speech_features.egg-info/dependency_links.txt
+++ b/third_party/python_kaldi_features/python_speech_features.egg-info/dependency_links.txt
@ -1 +0,0 @@
-
--- a/third_party/python_kaldi_features/python_speech_features.egg-info/top_level.txt
+++ b/third_party/python_kaldi_features/python_speech_features.egg-info/top_level.txt
@ -1 +0,0 @@
-python_speech_features