parent
ffb5756787
commit
c607bff282
@ -0,0 +1,409 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "purple-consequence",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/workspace/DeepSpeech-2.x\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"'/workspace/DeepSpeech-2.x'"
|
||||
]
|
||||
},
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"%cd ..\n",
|
||||
"%pwd"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "defensive-mason",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "patient-convention",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/paddle/fluid/layers/utils.py:26: DeprecationWarning: `np.int` is a deprecated alias for the builtin `int`. To silence this warning, use `int` by itself. Doing this will not modify any behavior and is safe. When replacing `np.int`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.\n",
|
||||
"Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
|
||||
" def convert_to_list(value, n, name, dtype=np.int):\n",
|
||||
"[WARNING 2021/04/16 15:30:29 __init__.py:93] register user softmax to paddle, remove this when fixed!\n",
|
||||
"2021-04-16 15:30:29,345 - WARNING - register user softmax to paddle, remove this when fixed!\n",
|
||||
"[WARNING 2021/04/16 15:30:29 __init__.py:97] register user log_softmax to paddle, remove this when fixed!\n",
|
||||
"2021-04-16 15:30:29,346 - WARNING - register user log_softmax to paddle, remove this when fixed!\n",
|
||||
"[WARNING 2021/04/16 15:30:29 __init__.py:101] register user sigmoid to paddle, remove this when fixed!\n",
|
||||
"2021-04-16 15:30:29,347 - WARNING - register user sigmoid to paddle, remove this when fixed!\n",
|
||||
"[WARNING 2021/04/16 15:30:29 __init__.py:105] register user log_sigmoid to paddle, remove this when fixed!\n",
|
||||
"2021-04-16 15:30:29,348 - WARNING - register user log_sigmoid to paddle, remove this when fixed!\n",
|
||||
"[WARNING 2021/04/16 15:30:29 __init__.py:109] register user relu to paddle, remove this when fixed!\n",
|
||||
"2021-04-16 15:30:29,349 - WARNING - register user relu to paddle, remove this when fixed!\n",
|
||||
"[WARNING 2021/04/16 15:30:29 __init__.py:119] override cat of paddle if exists or register, remove this when fixed!\n",
|
||||
"2021-04-16 15:30:29,349 - WARNING - override cat of paddle if exists or register, remove this when fixed!\n",
|
||||
"[WARNING 2021/04/16 15:30:29 __init__.py:133] override item of paddle.Tensor if exists or register, remove this when fixed!\n",
|
||||
"2021-04-16 15:30:29,350 - WARNING - override item of paddle.Tensor if exists or register, remove this when fixed!\n",
|
||||
"[WARNING 2021/04/16 15:30:29 __init__.py:144] override long of paddle.Tensor if exists or register, remove this when fixed!\n",
|
||||
"2021-04-16 15:30:29,351 - WARNING - override long of paddle.Tensor if exists or register, remove this when fixed!\n",
|
||||
"[WARNING 2021/04/16 15:30:29 __init__.py:164] override new_full of paddle.Tensor if exists or register, remove this when fixed!\n",
|
||||
"2021-04-16 15:30:29,352 - WARNING - override new_full of paddle.Tensor if exists or register, remove this when fixed!\n",
|
||||
"[WARNING 2021/04/16 15:30:29 __init__.py:179] override eq of paddle.Tensor if exists or register, remove this when fixed!\n",
|
||||
"2021-04-16 15:30:29,353 - WARNING - override eq of paddle.Tensor if exists or register, remove this when fixed!\n",
|
||||
"[WARNING 2021/04/16 15:30:29 __init__.py:185] override eq of paddle if exists or register, remove this when fixed!\n",
|
||||
"2021-04-16 15:30:29,354 - WARNING - override eq of paddle if exists or register, remove this when fixed!\n",
|
||||
"[WARNING 2021/04/16 15:30:29 __init__.py:195] override contiguous of paddle.Tensor if exists or register, remove this when fixed!\n",
|
||||
"2021-04-16 15:30:29,355 - WARNING - override contiguous of paddle.Tensor if exists or register, remove this when fixed!\n",
|
||||
"[WARNING 2021/04/16 15:30:29 __init__.py:212] override size of paddle.Tensor (`to_static` do not process `size` property, maybe some `paddle` api dependent on it), remove this when fixed!\n",
|
||||
"2021-04-16 15:30:29,356 - WARNING - override size of paddle.Tensor (`to_static` do not process `size` property, maybe some `paddle` api dependent on it), remove this when fixed!\n",
|
||||
"[WARNING 2021/04/16 15:30:29 __init__.py:223] register user view to paddle.Tensor, remove this when fixed!\n",
|
||||
"2021-04-16 15:30:29,357 - WARNING - register user view to paddle.Tensor, remove this when fixed!\n",
|
||||
"[WARNING 2021/04/16 15:30:29 __init__.py:233] register user view_as to paddle.Tensor, remove this when fixed!\n",
|
||||
"2021-04-16 15:30:29,361 - WARNING - register user view_as to paddle.Tensor, remove this when fixed!\n",
|
||||
"[WARNING 2021/04/16 15:30:29 __init__.py:259] register user masked_fill to paddle.Tensor, remove this when fixed!\n",
|
||||
"2021-04-16 15:30:29,362 - WARNING - register user masked_fill to paddle.Tensor, remove this when fixed!\n",
|
||||
"[WARNING 2021/04/16 15:30:29 __init__.py:277] register user masked_fill_ to paddle.Tensor, remove this when fixed!\n",
|
||||
"2021-04-16 15:30:29,363 - WARNING - register user masked_fill_ to paddle.Tensor, remove this when fixed!\n",
|
||||
"[WARNING 2021/04/16 15:30:29 __init__.py:288] register user fill_ to paddle.Tensor, remove this when fixed!\n",
|
||||
"2021-04-16 15:30:29,364 - WARNING - register user fill_ to paddle.Tensor, remove this when fixed!\n",
|
||||
"[WARNING 2021/04/16 15:30:29 __init__.py:298] register user repeat to paddle.Tensor, remove this when fixed!\n",
|
||||
"2021-04-16 15:30:29,365 - WARNING - register user repeat to paddle.Tensor, remove this when fixed!\n",
|
||||
"[WARNING 2021/04/16 15:30:29 __init__.py:303] register user softmax to paddle.Tensor, remove this when fixed!\n",
|
||||
"2021-04-16 15:30:29,366 - WARNING - register user softmax to paddle.Tensor, remove this when fixed!\n",
|
||||
"[WARNING 2021/04/16 15:30:29 __init__.py:308] register user sigmoid to paddle.Tensor, remove this when fixed!\n",
|
||||
"2021-04-16 15:30:29,366 - WARNING - register user sigmoid to paddle.Tensor, remove this when fixed!\n",
|
||||
"[WARNING 2021/04/16 15:30:29 __init__.py:312] register user relu to paddle.Tensor, remove this when fixed!\n",
|
||||
"2021-04-16 15:30:29,367 - WARNING - register user relu to paddle.Tensor, remove this when fixed!\n",
|
||||
"[WARNING 2021/04/16 15:30:29 __init__.py:322] register user type_as to paddle.Tensor, remove this when fixed!\n",
|
||||
"2021-04-16 15:30:29,368 - WARNING - register user type_as to paddle.Tensor, remove this when fixed!\n",
|
||||
"[WARNING 2021/04/16 15:30:29 __init__.py:337] register user to to paddle.Tensor, remove this when fixed!\n",
|
||||
"2021-04-16 15:30:29,369 - WARNING - register user to to paddle.Tensor, remove this when fixed!\n",
|
||||
"[WARNING 2021/04/16 15:30:29 __init__.py:346] register user float to paddle.Tensor, remove this when fixed!\n",
|
||||
"2021-04-16 15:30:29,370 - WARNING - register user float to paddle.Tensor, remove this when fixed!\n",
|
||||
"[WARNING 2021/04/16 15:30:29 __init__.py:356] register user tolist to paddle.Tensor, remove this when fixed!\n",
|
||||
"2021-04-16 15:30:29,370 - WARNING - register user tolist to paddle.Tensor, remove this when fixed!\n",
|
||||
"[WARNING 2021/04/16 15:30:29 __init__.py:371] register user glu to paddle.nn.functional, remove this when fixed!\n",
|
||||
"2021-04-16 15:30:29,371 - WARNING - register user glu to paddle.nn.functional, remove this when fixed!\n",
|
||||
"[WARNING 2021/04/16 15:30:29 __init__.py:422] override ctc_loss of paddle.nn.functional if exists, remove this when fixed!\n",
|
||||
"2021-04-16 15:30:29,372 - WARNING - override ctc_loss of paddle.nn.functional if exists, remove this when fixed!\n",
|
||||
"[WARNING 2021/04/16 15:30:29 __init__.py:428] register user Module to paddle.nn, remove this when fixed!\n",
|
||||
"2021-04-16 15:30:29,377 - WARNING - register user Module to paddle.nn, remove this when fixed!\n",
|
||||
"[WARNING 2021/04/16 15:30:29 __init__.py:434] register user ModuleList to paddle.nn, remove this when fixed!\n",
|
||||
"2021-04-16 15:30:29,378 - WARNING - register user ModuleList to paddle.nn, remove this when fixed!\n",
|
||||
"[WARNING 2021/04/16 15:30:29 __init__.py:450] register user GLU to paddle.nn, remove this when fixed!\n",
|
||||
"2021-04-16 15:30:29,379 - WARNING - register user GLU to paddle.nn, remove this when fixed!\n",
|
||||
"[WARNING 2021/04/16 15:30:29 __init__.py:483] register user ConstantPad2d to paddle.nn, remove this when fixed!\n",
|
||||
"2021-04-16 15:30:29,380 - WARNING - register user ConstantPad2d to paddle.nn, remove this when fixed!\n",
|
||||
"[WARNING 2021/04/16 15:30:29 __init__.py:489] register user export to paddle.jit, remove this when fixed!\n",
|
||||
"2021-04-16 15:30:29,381 - WARNING - register user export to paddle.jit, remove this when fixed!\n",
|
||||
"/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/scipy/fftpack/__init__.py:103: DeprecationWarning: The module numpy.dual is deprecated. Instead of using dual, use the functions directly from numpy or scipy.\n",
|
||||
" from numpy.dual import register_func\n",
|
||||
"/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/scipy/special/orthogonal.py:81: DeprecationWarning: `np.int` is a deprecated alias for the builtin `int`. To silence this warning, use `int` by itself. Doing this will not modify any behavior and is safe. When replacing `np.int`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.\n",
|
||||
"Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
|
||||
" from numpy import (exp, inf, pi, sqrt, floor, sin, cos, around, int,\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/numba/core/types/__init__.py:108: DeprecationWarning: `np.long` is a deprecated alias for `np.compat.long`. To silence this warning, use `np.compat.long` by itself. In the likely event your code does not need to work on Python 2 you can use the builtin `int` for which `np.compat.long` is itself an alias. Doing this will not modify any behaviour and is safe. When replacing `np.long`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.\n",
|
||||
"Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
|
||||
" long_ = _make_signed(np.long)\n",
|
||||
"/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/numba/core/types/__init__.py:109: DeprecationWarning: `np.long` is a deprecated alias for `np.compat.long`. To silence this warning, use `np.compat.long` by itself. In the likely event your code does not need to work on Python 2 you can use the builtin `int` for which `np.compat.long` is itself an alias. Doing this will not modify any behaviour and is safe. When replacing `np.long`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.\n",
|
||||
"Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
|
||||
" ulong = _make_unsigned(np.long)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Namespace(delta_delta=False, feat_dim=13, manifest_path='examples/aishell/s1/data/manifest.train.raw', num_samples=-1, num_workers=1, output_path='data/librispeech/mean_std.npz', sample_rate=16000, specgram_type='linear', stride_ms=10.0, window_ms=20.0)\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import argparse\n",
|
||||
"import functools\n",
|
||||
"\n",
|
||||
"from deepspeech.frontend.augmentor.augmentation import AugmentationPipeline\n",
|
||||
"from deepspeech.frontend.featurizer.audio_featurizer import AudioFeaturizer\n",
|
||||
"from deepspeech.frontend.normalizer import FeatureNormalizer\n",
|
||||
"from deepspeech.utils.utility import add_arguments\n",
|
||||
"from deepspeech.utils.utility import print_arguments\n",
|
||||
"\n",
|
||||
"parser = argparse.ArgumentParser(description=__doc__)\n",
|
||||
"add_arg = functools.partial(add_arguments, argparser=parser)\n",
|
||||
"# yapf: disable\n",
|
||||
"add_arg('num_samples', int, -1, \"# of samples to use for statistics.\")\n",
|
||||
"add_arg('specgram_type', str,\n",
|
||||
" 'linear',\n",
|
||||
" \"Audio feature type. Options: linear, mfcc, fbank.\",\n",
|
||||
" choices=['linear', 'mfcc', 'fbank'])\n",
|
||||
"add_arg('feat_dim', int, 13, \"Audio feature dim.\")\n",
|
||||
"add_arg('delta_delta', bool,\n",
|
||||
" False,\n",
|
||||
" \"Audio feature with delta delta.\")\n",
|
||||
"add_arg('stride_ms', float, 10.0, \"stride length in ms.\")\n",
|
||||
"add_arg('window_ms', float, 20.0, \"window length in ms.\")\n",
|
||||
"add_arg('sample_rate', int, 16000, \"target sample rate.\")\n",
|
||||
"add_arg('manifest_path', str,\n",
|
||||
" 'examples/aishell/s1/data/manifest.train.raw',\n",
|
||||
" \"Filepath of manifest to compute normalizer's mean and stddev.\")\n",
|
||||
"add_arg('num_workers',\n",
|
||||
" default=1,\n",
|
||||
" type=int,\n",
|
||||
" help='num of subprocess workers for processing')\n",
|
||||
"add_arg('output_path', str,\n",
|
||||
" 'data/librispeech/mean_std.npz',\n",
|
||||
" \"Filepath of write mean and stddev to (.npz).\")\n",
|
||||
"# yapf: enable\n",
|
||||
"args = parser.parse_args([])\n",
|
||||
"print(args)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "enormous-currency",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/ipykernel/ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.\n",
|
||||
" and should_run_async(code)\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import random\n",
|
||||
"\n",
|
||||
"import numpy as np\n",
|
||||
"import paddle\n",
|
||||
"from paddle.io import DataLoader\n",
|
||||
"from paddle.io import Dataset\n",
|
||||
"\n",
|
||||
"from deepspeech.frontend.audio import AudioSegment\n",
|
||||
"from deepspeech.frontend.utility import load_cmvn\n",
|
||||
"from deepspeech.frontend.utility import read_manifest\n",
|
||||
"\n",
|
||||
"class CollateFunc(object):\n",
|
||||
" ''' Collate function for AudioDataset\n",
|
||||
" '''\n",
|
||||
" def __init__(self):\n",
|
||||
" pass\n",
|
||||
" \n",
|
||||
" def __call__(self, batch):\n",
|
||||
" mean_stat = None\n",
|
||||
" var_stat = None\n",
|
||||
" number = 0\n",
|
||||
" for feat in batch:\n",
|
||||
" sums = np.sum(feat, axis=1)\n",
|
||||
" if mean_stat is None:\n",
|
||||
" mean_stat = sums\n",
|
||||
" else:\n",
|
||||
" mean_stat += sums\n",
|
||||
"\n",
|
||||
" square_sums = np.sum(np.square(feat), axis=1)\n",
|
||||
" if var_stat is None:\n",
|
||||
" var_stat = square_sums\n",
|
||||
" else:\n",
|
||||
" var_stat += square_sums\n",
|
||||
"\n",
|
||||
" number += feat.shape[1]\n",
|
||||
" return paddle.to_tensor(number), paddle.to_tensor(mean_stat), paddle.to_tensor(var_stat)\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"class AudioDataset(Dataset):\n",
|
||||
" def __init__(self, manifest_path, feature_func, num_samples=-1, rng=None):\n",
|
||||
" self.feature_func = feature_func\n",
|
||||
" self._rng = rng\n",
|
||||
" manifest = read_manifest(manifest_path)\n",
|
||||
" if num_samples == -1:\n",
|
||||
" sampled_manifest = manifest\n",
|
||||
" else:\n",
|
||||
" sampled_manifest = self._rng.sample(manifest, num_samples)\n",
|
||||
" self.items = sampled_manifest\n",
|
||||
"\n",
|
||||
" def __len__(self):\n",
|
||||
" return len(self.items)\n",
|
||||
"\n",
|
||||
" def __getitem__(self, idx):\n",
|
||||
" key = self.items[idx]['feat']\n",
|
||||
" audioseg = AudioSegment.from_file(key)\n",
|
||||
" feat = self.feature_func(audioseg) #(D, T)\n",
|
||||
" return feat"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"id": "armed-semester",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Exception ignored in: <function _DataLoaderIterMultiProcess.__del__ at 0x7f9c9f91c2f0>\n",
|
||||
"Traceback (most recent call last):\n",
|
||||
" File \"/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/paddle/fluid/dataloader/dataloader_iter.py\", line 763, in __del__\n",
|
||||
" self._try_shutdown_all()\n",
|
||||
" File \"/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/paddle/fluid/dataloader/dataloader_iter.py\", line 590, in _try_shutdown_all\n",
|
||||
" w.join()\n",
|
||||
" File \"/usr/local/lib/python3.7/multiprocessing/process.py\", line 140, in join\n",
|
||||
" res = self._popen.wait(timeout)\n",
|
||||
" File \"/usr/local/lib/python3.7/multiprocessing/popen_fork.py\", line 48, in wait\n",
|
||||
" return self.poll(os.WNOHANG if timeout == 0.0 else 0)\n",
|
||||
" File \"/usr/local/lib/python3.7/multiprocessing/popen_fork.py\", line 28, in poll\n",
|
||||
" pid, sts = os.waitpid(self.pid, flag)\n",
|
||||
"KeyboardInterrupt: \n",
|
||||
"2021-04-16 15:44:43,413 - ERROR - DataLoader reader thread raised an exception!\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"ename": "SystemError",
|
||||
"evalue": "(Fatal) Blocking queue is killed because the data reader raises an exception.\n [Hint: Expected killed_ != true, but received killed_:1 == true:1.] (at /paddle/paddle/fluid/operators/reader/blocking_queue.h:158)\n",
|
||||
"output_type": "error",
|
||||
"traceback": [
|
||||
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
||||
"\u001b[0;31mSystemError\u001b[0m Traceback (most recent call last)",
|
||||
"\u001b[0;32m<ipython-input-7-b5adcffc5685>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 38\u001b[0m \u001b[0mwav_number\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 39\u001b[0m \u001b[0;31m# for i, batch in enumerate(data_loader()):\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 40\u001b[0;31m \u001b[0;32mfor\u001b[0m \u001b[0mbatch\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mdata_loader\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 41\u001b[0m \u001b[0mnumber\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmean_stat\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvar_stat\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mbatch\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 42\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mi\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
|
||||
"\u001b[0;32m/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/paddle/fluid/dataloader/dataloader_iter.py\u001b[0m in \u001b[0;36m__next__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 777\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 778\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0min_dygraph_mode\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 779\u001b[0;31m \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_reader\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread_next_var_list\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 780\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 781\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_return_list\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
|
||||
"\u001b[0;31mSystemError\u001b[0m: (Fatal) Blocking queue is killed because the data reader raises an exception.\n [Hint: Expected killed_ != true, but received killed_:1 == true:1.] (at /paddle/paddle/fluid/operators/reader/blocking_queue.h:158)\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"\n",
|
||||
"augmentation_pipeline = AugmentationPipeline('{}')\n",
|
||||
"audio_featurizer = AudioFeaturizer(\n",
|
||||
" specgram_type=args.specgram_type,\n",
|
||||
" feat_dim=args.feat_dim,\n",
|
||||
" delta_delta=args.delta_delta,\n",
|
||||
" stride_ms=args.stride_ms,\n",
|
||||
" window_ms=args.window_ms,\n",
|
||||
" n_fft=None,\n",
|
||||
" max_freq=None,\n",
|
||||
" target_sample_rate=args.sample_rate,\n",
|
||||
" use_dB_normalization=True,\n",
|
||||
" target_dB=-20)\n",
|
||||
"\n",
|
||||
"def augment_and_featurize(audio_segment):\n",
|
||||
" augmentation_pipeline.transform_audio(audio_segment)\n",
|
||||
" return audio_featurizer.featurize(audio_segment)\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"collate_func = CollateFunc()\n",
|
||||
"\n",
|
||||
"dataset = AudioDataset(\n",
|
||||
" args.manifest_path,\n",
|
||||
" augment_and_featurize, \n",
|
||||
" args.num_samples)\n",
|
||||
"\n",
|
||||
"batch_size = 20\n",
|
||||
"data_loader = DataLoader(\n",
|
||||
" dataset,\n",
|
||||
" batch_size=batch_size,\n",
|
||||
" shuffle=False,\n",
|
||||
" num_workers=args.num_workers,\n",
|
||||
" collate_fn=collate_func)\n",
|
||||
"\n",
|
||||
"with paddle.no_grad():\n",
|
||||
" all_mean_stat = None\n",
|
||||
" all_var_stat = None\n",
|
||||
" all_number = 0\n",
|
||||
" wav_number = 0\n",
|
||||
" # for i, batch in enumerate(data_loader()):\n",
|
||||
" for i, batch in enumerate(data_loader()):\n",
|
||||
" number, mean_stat, var_stat = batch\n",
|
||||
" if i == 0:\n",
|
||||
" all_mean_stat = mean_stat\n",
|
||||
" all_var_stat = var_stat\n",
|
||||
" else:\n",
|
||||
" all_mean_stat += mean_stat\n",
|
||||
" all_var_stat += var_stat\n",
|
||||
" all_number += number\n",
|
||||
" wav_number += batch_size\n",
|
||||
"\n",
|
||||
" if wav_number % 1000 == 0:\n",
|
||||
" print('process {} wavs,{} frames'.format(wav_number,\n",
|
||||
" all_number))\n",
|
||||
"\n",
|
||||
"cmvn_info = {\n",
|
||||
" 'mean_stat': list(all_mean_stat.tolist()),\n",
|
||||
" 'var_stat': list(all_var_stat.tolist()),\n",
|
||||
" 'frame_num': all_number\n",
|
||||
"}"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "danish-executive",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "accurate-terminal",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "dominant-abuse",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.7.0"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,3 @@
|
||||
python_speech_features.egg-info/
|
||||
dist/
|
||||
build/
|
@ -1 +0,0 @@
|
||||
from .base import *
|
@ -1,166 +0,0 @@
|
||||
# calculate filterbank features. Provides e.g. fbank and mfcc features for use in ASR applications
|
||||
# Author: James Lyons 2012
|
||||
from __future__ import division
|
||||
import numpy
|
||||
from python_speech_features import sigproc
|
||||
from scipy.fftpack import dct
|
||||
|
||||
def mfcc(signal,samplerate=16000,winlen=0.025,winstep=0.01,numcep=13,
         nfilt=23,nfft=512,lowfreq=20,highfreq=None,dither=1.0,remove_dc_offset=True,preemph=0.97,
         ceplifter=22,useEnergy=True,wintype='povey'):
    """Compute MFCC features from an audio signal.

    The signal is run through :func:`fbank`; the filterbank energies are then
    log-compressed, transformed with an orthonormal type-2 DCT, truncated to
    the first ``numcep`` coefficients and liftered.

    :param signal: the audio signal from which to compute features. Should be an N*1 array
    :param samplerate: the samplerate of the signal we are working with. Default is 16000.
    :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
    :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
    :param numcep: the number of cepstrum to return, default 13
    :param nfilt: the number of filters in the filterbank, default 23.
    :param nfft: the FFT size. Default is 512.
    :param lowfreq: lowest band edge of mel filters. In Hz, default is 20.
    :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
    :param dither: forwarded unchanged to :func:`fbank`. Default is 1.0.
    :param remove_dc_offset: forwarded unchanged to :func:`fbank`. Default is True.
    :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
    :param ceplifter: apply a lifter to final cepstral coefficients. 0 is no lifter. Default is 22.
    :param useEnergy: if this is true, the zeroth cepstral coefficient is replaced with the log of the
        per-frame energy returned by :func:`fbank`. Default is True.
    :param wintype: forwarded unchanged to :func:`fbank`; presumably the name of the analysis window
        (see ``sigproc.framesig`` for the accepted values). Default is 'povey'.
    :returns: A numpy array of size (NUMFRAMES by numcep) containing features. Each row holds 1 feature vector.
    """
    feat,energy = fbank(signal,samplerate,winlen,winstep,nfilt,nfft,lowfreq,highfreq,
                        dither,remove_dc_offset,preemph,wintype)
    feat = numpy.log(feat)
    # Keep only the first `numcep` DCT coefficients of every frame.
    feat = dct(feat, type=2, axis=1, norm='ortho')[:,:numcep]
    feat = lifter(feat,ceplifter)
    if useEnergy:
        # replace first cepstral coefficient with log of frame energy
        feat[:,0] = numpy.log(energy)
    return feat
|
||||
|
||||
def fbank(signal,samplerate=16000,winlen=0.025,winstep=0.01,
          nfilt=40,nfft=512,lowfreq=0,highfreq=None,dither=1.0,remove_dc_offset=True, preemph=0.97,
          wintype='hamming'):
    """Compute Mel-filterbank energy features from an audio signal.

    :param signal: the audio signal from which to compute features. Should be an N*1 array
    :param samplerate: the samplerate of the signal we are working with. Default is 16000.
    :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
    :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
    :param nfilt: the number of filters in the filterbank, default 40.
    :param nfft: the FFT size. Default is 512.
    :param lowfreq: lowest band edge of mel filters. In Hz, default is 0.
    :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
    :param dither: forwarded unchanged to ``sigproc.framesig``. Default is 1.0.
    :param remove_dc_offset: forwarded unchanged to ``sigproc.framesig``. Default is True.
    :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
    :param wintype: forwarded unchanged to ``sigproc.framesig``; presumably the name of the analysis
        window (see ``sigproc.framesig`` for the accepted values). Default is 'hamming'.
    :returns: 2 values. The first is a numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector. The
        second return value is the energy in each frame, computed from the raw frames returned by ``sigproc.framesig``.
    """
    # A falsy highfreq (None or 0) falls back to the Nyquist frequency.
    highfreq= highfreq or samplerate/2
    frames,raw_frames = sigproc.framesig(signal, winlen*samplerate, winstep*samplerate, dither, preemph, remove_dc_offset, wintype)
    pspec = sigproc.powspec(frames,nfft) # nearly the same until this part
    energy = numpy.sum(raw_frames**2,1) # this stores the raw energy in each frame
    energy = numpy.where(energy == 0,numpy.finfo(float).eps,energy) # if energy is zero, we get problems with log

    fb = get_filterbanks(nfilt,nfft,samplerate,lowfreq,highfreq)
    feat = numpy.dot(pspec,fb.T) # compute the filterbank energies
    feat = numpy.where(feat == 0,numpy.finfo(float).eps,feat) # if feat is zero, we get problems with log

    return feat,energy
|
||||
|
||||
def logfbank(signal,samplerate=16000,winlen=0.025,winstep=0.01,
             nfilt=40,nfft=512,lowfreq=64,highfreq=None,dither=1.0,remove_dc_offset=True,preemph=0.97,wintype='hamming'):
    """Compute log Mel-filterbank energy features from an audio signal.

    This is the natural logarithm of the first value returned by :func:`fbank`;
    the per-frame energy that :func:`fbank` also returns is not used.

    :param signal: the audio signal from which to compute features. Should be an N*1 array
    :param samplerate: the samplerate of the signal we are working with. Default is 16000.
    :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
    :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
    :param nfilt: the number of filters in the filterbank, default 40.
    :param nfft: the FFT size. Default is 512.
    :param lowfreq: lowest band edge of mel filters. In Hz, default is 64.
    :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
    :param dither: forwarded unchanged to :func:`fbank`. Default is 1.0.
    :param remove_dc_offset: forwarded unchanged to :func:`fbank`. Default is True.
    :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
    :param wintype: forwarded unchanged to :func:`fbank`; presumably the name of the analysis window.
        Default is 'hamming'.
    :returns: A numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector.
    """
    feat,_ = fbank(signal,samplerate,winlen,winstep,nfilt,nfft,lowfreq,highfreq,
                   dither,remove_dc_offset,preemph,wintype)
    return numpy.log(feat)
|
||||
|
||||
def hz2mel(hz):
    """Convert a value in Hertz to Mels

    Uses the formula ``mel = 1127 * ln(1 + hz / 700)``.

    :param hz: a value in Hz. This can also be a numpy array or any array-like
        (list, tuple); conversion proceeds element-wise.
    :returns: a value in Mels. If an array (or array-like) was passed in, an identical sized array is returned.
    """
    # asarray lets plain lists/tuples through the arithmetic below; scalars
    # and ndarrays behave exactly as before.
    return 1127 * numpy.log(1+numpy.asarray(hz)/700.0)
|
||||
|
||||
|
||||
def mel2hz(mel):
    """Convert a value in Mels to Hertz

    Uses the formula ``hz = 700 * (exp(mel / 1127) - 1)``.

    :param mel: a value in Mels. This can also be a numpy array or any array-like
        (list, tuple); conversion proceeds element-wise.
    :returns: a value in Hertz. If an array (or array-like) was passed in, an identical sized array is returned.
    """
    # asarray lets plain lists/tuples through the arithmetic below; scalars
    # and ndarrays behave exactly as before.
    return 700 * (numpy.exp(numpy.asarray(mel)/1127.0)-1)
|
||||
|
||||
def get_filterbanks(nfilt=26, nfft=512, samplerate=16000, lowfreq=0, highfreq=None):
    """Compute a Mel-filterbank of triangular filters evaluated in the mel domain.

    The filters are stored in the rows, the columns correspond to fft bins.
    The returned array has shape nfilt * (nfft//2 + 1); only bins
    0 .. nfft//2 - 1 are ever written, so the last column stays zero.

    :param nfilt: the number of filters in the filterbank, default 26.
    :param nfft: the FFT size. Default is 512.
    :param samplerate: the samplerate of the signal we are working with. Affects mel spacing.
    :param lowfreq: lowest band edge of mel filters, default 0 Hz
    :param highfreq: highest band edge of mel filters, default samplerate/2
    :returns: A numpy array of size nfilt * (nfft//2 + 1) containing filterbank. Each row holds 1 filter.
    """
    highfreq = highfreq or samplerate / 2
    assert highfreq <= samplerate / 2, "highfreq is greater than samplerate/2"

    # filter edges are evenly spaced in mels between the two band limits
    mel_low = hz2mel(lowfreq)
    mel_high = hz2mel(highfreq)

    # check kaldi/src/feat/Mel-computations.h
    fbank = numpy.zeros([nfilt, nfft // 2 + 1])
    mel_step = (mel_high - mel_low) / (nfilt + 1)

    # The mel value of each FFT bin does not depend on the filter, so it is
    # computed once up front.
    bin_mels = [hz2mel(i * samplerate / nfft) for i in range(0, nfft // 2)]

    for j in range(0, nfilt):
        left = mel_low + j * mel_step
        center = mel_low + (j + 1) * mel_step
        right = mel_low + (j + 2) * mel_step
        for i, mel in enumerate(bin_mels):
            # bins on or outside the triangle's feet contribute nothing
            if not (mel > left and mel < right):
                continue
            if mel < center:
                fbank[j, i] = (mel - left) / (center - left)
            else:
                fbank[j, i] = (right - mel) / (right - center)
    return fbank
|
||||
|
||||
def lifter(cepstra, L=22):
    """Apply a cepstral lifter to the matrix of cepstra.

    Each column n is scaled by ``1 + (L/2) * sin(pi * n / L)``, which has the
    effect of increasing the magnitude of the high frequency DCT coeffs.

    :param cepstra: the matrix of mel-cepstra, will be numframes * numcep in size.
    :param L: the liftering coefficient to use. Default is 22. L <= 0 disables lifter.
    :returns: the liftered cepstra, or ``cepstra`` itself when the lifter is disabled.
    """
    if not L > 0:
        # values of L <= 0, do nothing
        return cepstra

    _, num_coeffs = numpy.shape(cepstra)
    coeff_index = numpy.arange(num_coeffs)
    weights = 1 + (L / 2.) * numpy.sin(numpy.pi * coeff_index / L)
    return weights * cepstra
|
||||
|
||||
def delta(feat, N):
    """Compute delta features from a feature vector sequence.

    For every frame t the delta is the regression
    ``sum_{n=-N..N} n * feat[t+n] / (2 * sum_{i=1..N} i**2)``, where frames
    beyond either end are replaced by the nearest edge frame.

    :param feat: A numpy array of size (NUMFRAMES by number of features) containing features. Each row holds 1 feature vector.
    :param N: For each frame, calculate delta features based on preceding and following N frames
    :returns: A numpy array of size (NUMFRAMES by number of features) containing delta features. Each row holds 1 delta feature vector.
        Floating-point (or complex) input keeps its dtype; any other dtype yields float64.
    :raises ValueError: if N < 1.
    """
    if N < 1:
        raise ValueError('N must be an integer >= 1')
    feat = numpy.asarray(feat)
    num_frames = len(feat)
    denominator = 2 * sum(i ** 2 for i in range(1, N + 1))

    # The regression divides by `denominator`, so the result is fractional in
    # general. Writing it into an integer buffer (which is what copying the
    # input dtype does for integer features) would silently truncate it.
    if numpy.issubdtype(feat.dtype, numpy.inexact):
        out_dtype = feat.dtype
    else:
        out_dtype = numpy.float64
    delta_feat = numpy.empty(feat.shape, dtype=out_dtype)

    padded = numpy.pad(feat, ((N, N), (0, 0)), mode='edge')  # padded version of feat
    weights = numpy.arange(-N, N + 1)
    for t in range(num_frames):
        # [t : t+2*N+1] == [(N+t)-N : (N+t)+N+1]
        delta_feat[t] = numpy.dot(weights, padded[t:t + 2 * N + 1]) / denominator
    return delta_feat
|
@ -1,190 +0,0 @@
|
||||
# calculate filterbank features. Provides e.g. fbank and mfcc features for use in ASR applications
|
||||
# Author: James Lyons 2012
|
||||
from __future__ import division
|
||||
import numpy
|
||||
from python_speech_features import sigproc
|
||||
from scipy.fftpack import dct
|
||||
|
||||
def mfcc(signal, samplerate=16000, winlen=0.025, winstep=0.01, numcep=13,
         nfilt=26, nfft=512, lowfreq=0, highfreq=None, preemph=0.97, ceplifter=22, appendEnergy=True,
         winfunc=lambda x: numpy.ones((x,))):
    """Compute MFCC features from an audio signal.

    Pipeline: ``fbank`` energies -> natural log -> orthonormal type-2 DCT
    along the filter axis, truncated to ``numcep`` columns -> ``lifter``.

    :param signal: the audio signal from which to compute features. Should be an N*1 array
    :param samplerate: the samplerate of the signal we are working with.
    :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
    :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
    :param numcep: the number of cepstrum to return, default 13
    :param nfilt: the number of filters in the filterbank, default 26.
    :param nfft: the FFT size. Default is 512.
    :param lowfreq: lowest band edge of mel filters. In Hz, default is 0.
    :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
    :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
    :param ceplifter: apply a lifter to final cepstral coefficients. 0 is no lifter. Default is 22.
    :param appendEnergy: if this is true, the zeroth cepstral coefficient is replaced with the log of the total frame energy.
    :param winfunc: the analysis window to apply to each frame. By default no window is applied. You can use numpy window functions here e.g. winfunc=numpy.hamming
    :returns: A numpy array of size (NUMFRAMES by numcep) containing features. Each row holds 1 feature vector.
    """
    fb_energies, frame_energy = fbank(signal, samplerate, winlen, winstep, nfilt, nfft,
                                      lowfreq, highfreq, preemph, winfunc)
    log_energies = numpy.log(fb_energies)
    cepstra = dct(log_energies, type=2, axis=1, norm='ortho')[:, :numcep]
    cepstra = lifter(cepstra, ceplifter)
    if appendEnergy:
        # replace first cepstral coefficient with log of frame energy
        cepstra[:, 0] = numpy.log(frame_energy)
    return cepstra
|
||||
|
||||
def fbank(signal, samplerate=16000, winlen=0.025, winstep=0.01,
          nfilt=26, nfft=512, lowfreq=0, highfreq=None, preemph=0.97,
          winfunc=lambda x: numpy.ones((x,))):
    """Compute Mel-filterbank energy features from an audio signal.

    The signal is preemphasised, framed and turned into a power spectrum via
    ``sigproc``; the spectrum is then projected onto the filterbank returned
    by ``get_filterbanks``. Exact zeros in either output are replaced by the
    float machine epsilon so a later log stays finite.

    :param signal: the audio signal from which to compute features. Should be an N*1 array
    :param samplerate: the samplerate of the signal we are working with.
    :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
    :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
    :param nfilt: the number of filters in the filterbank, default 26.
    :param nfft: the FFT size. Default is 512.
    :param lowfreq: lowest band edge of mel filters. In Hz, default is 0.
    :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
    :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
    :param winfunc: the analysis window to apply to each frame. By default no window is applied. You can use numpy window functions here e.g. winfunc=numpy.hamming
    :returns: 2 values. The first is a numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector. The
        second return value is the energy in each frame (sum of that frame's power spectrum)
    """
    highfreq = highfreq or samplerate / 2
    tiny = numpy.finfo(float).eps

    emphasised = sigproc.preemphasis(signal, preemph)
    frames = sigproc.framesig(emphasised, winlen * samplerate, winstep * samplerate, winfunc)
    pspec = sigproc.powspec(frames, nfft)

    # total energy in each frame; if energy is zero, we get problems with log
    energy = numpy.sum(pspec, 1)
    energy = numpy.where(energy == 0, tiny, energy)

    # filterbank energies; if feat is zero, we get problems with log
    filters = get_filterbanks(nfilt, nfft, samplerate, lowfreq, highfreq)
    feat = numpy.dot(pspec, filters.T)
    feat = numpy.where(feat == 0, tiny, feat)

    return feat, energy
|
||||
|
||||
def logfbank(signal, samplerate=16000, winlen=0.025, winstep=0.01,
             nfilt=26, nfft=512, lowfreq=0, highfreq=None, preemph=0.97):
    """Compute log Mel-filterbank energy features from an audio signal.

    Thin wrapper: forwards every argument positionally to ``fbank`` and
    returns the natural log of its first return value.

    :param signal: the audio signal from which to compute features. Should be an N*1 array
    :param samplerate: the samplerate of the signal we are working with.
    :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
    :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
    :param nfilt: the number of filters in the filterbank, default 26.
    :param nfft: the FFT size. Default is 512.
    :param lowfreq: lowest band edge of mel filters. In Hz, default is 0.
    :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
    :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
    :returns: A numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector.
    """
    # fbank also returns the per-frame energy, which is not used here.
    filterbank_energies, _ = fbank(signal, samplerate, winlen, winstep, nfilt, nfft,
                                   lowfreq, highfreq, preemph)
    return numpy.log(filterbank_energies)
|
||||
|
||||
def ssc(signal, samplerate=16000, winlen=0.025, winstep=0.01,
        nfilt=26, nfft=512, lowfreq=0, highfreq=None, preemph=0.97,
        winfunc=lambda x: numpy.ones((x,))):
    """Compute Spectral Subband Centroid features from an audio signal.

    For each filter the result is the frequency-weighted filterbank energy
    divided by the plain filterbank energy, where the frequency weights run
    linearly from 1 to samplerate/2 across the power-spectrum bins.

    :param signal: the audio signal from which to compute features. Should be an N*1 array
    :param samplerate: the samplerate of the signal we are working with.
    :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
    :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
    :param nfilt: the number of filters in the filterbank, default 26.
    :param nfft: the FFT size. Default is 512.
    :param lowfreq: lowest band edge of mel filters. In Hz, default is 0.
    :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
    :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
    :param winfunc: the analysis window to apply to each frame. By default no window is applied. You can use numpy window functions here e.g. winfunc=numpy.hamming
    :returns: A numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector.
    """
    highfreq = highfreq or samplerate / 2

    emphasised = sigproc.preemphasis(signal, preemph)
    frames = sigproc.framesig(emphasised, winlen * samplerate, winstep * samplerate, winfunc)
    pspec = sigproc.powspec(frames, nfft)
    # if things are all zeros we get problems
    pspec = numpy.where(pspec == 0, numpy.finfo(float).eps, pspec)

    filters = get_filterbanks(nfilt, nfft, samplerate, lowfreq, highfreq)
    band_energy = numpy.dot(pspec, filters.T)  # compute the filterbank energies

    num_frames = numpy.size(pspec, 0)
    num_bins = numpy.size(pspec, 1)
    bin_freqs = numpy.linspace(1, samplerate / 2, num_bins)
    R = numpy.tile(bin_freqs, (num_frames, 1))

    return numpy.dot(pspec * R, filters.T) / band_energy
|
||||
|
||||
def hz2mel(hz):
    """Convert a value in Hertz to Mels.

    Uses the base-10 form ``2595 * log10(1 + hz / 700)``.

    :param hz: a value in Hz. This can also be a numpy array, conversion proceeds element-wise.
    :returns: a value in Mels. If an array was passed in, an identical sized array is returned.
    """
    scaled = hz / 700.
    return 2595 * numpy.log10(1 + scaled)
|
||||
|
||||
def mel2hz(mel):
    """Convert a value in Mels to Hertz.

    Uses ``700 * (10 ** (mel / 2595) - 1)``.

    :param mel: a value in Mels. This can also be a numpy array, conversion proceeds element-wise.
    :returns: a value in Hertz. If an array was passed in, an identical sized array is returned.
    """
    exponent = mel / 2595.0
    return 700 * (10 ** exponent - 1)
|
||||
|
||||
def get_filterbanks(nfilt=20, nfft=512, samplerate=16000, lowfreq=0, highfreq=None):
    """Compute a Mel-filterbank of triangular filters defined on FFT bin indices.

    The filters are stored in the rows, the columns correspond to fft bins.
    The filters are returned as an array of size nfilt * (nfft//2 + 1).

    :param nfilt: the number of filters in the filterbank, default 20.
    :param nfft: the FFT size. Default is 512.
    :param samplerate: the samplerate of the signal we are working with. Affects mel spacing.
    :param lowfreq: lowest band edge of mel filters, default 0 Hz
    :param highfreq: highest band edge of mel filters, default samplerate/2
    :returns: A numpy array of size nfilt * (nfft//2 + 1) containing filterbank. Each row holds 1 filter.
    """
    highfreq = highfreq or samplerate / 2
    assert highfreq <= samplerate / 2, "highfreq is greater than samplerate/2"

    # compute points evenly spaced in mels
    mel_low = hz2mel(lowfreq)
    mel_high = hz2mel(highfreq)
    melpoints = numpy.linspace(mel_low, mel_high, nfilt + 2)
    # our points are in Hz, but we use fft bins, so we have to convert
    # from Hz to fft bin number
    edges = numpy.floor((nfft + 1) * mel2hz(melpoints) / samplerate)

    fbank = numpy.zeros([nfilt, nfft // 2 + 1])
    for j in range(0, nfilt):
        lower, peak, upper = edges[j], edges[j + 1], edges[j + 2]
        # rising slope from the lower edge up to (not including) the peak
        for i in range(int(lower), int(peak)):
            fbank[j, i] = (i - lower) / (peak - lower)
        # falling slope from the peak down to (not including) the upper edge
        for i in range(int(peak), int(upper)):
            fbank[j, i] = (upper - i) / (upper - peak)
    return fbank
|
||||
|
||||
def lifter(cepstra, L=22):
    """Apply a cepstral lifter to the matrix of cepstra.

    Column n is multiplied by ``1 + (L/2) * sin(pi * n / L)``. This has the
    effect of increasing the magnitude of the high frequency DCT coeffs.

    :param cepstra: the matrix of mel-cepstra, will be numframes * numcep in size.
    :param L: the liftering coefficient to use. Default is 22. L <= 0 disables lifter.
    :returns: the liftered matrix, or the input object unchanged when L is not positive.
    """
    lifter_enabled = L > 0
    if lifter_enabled:
        ncoeff = numpy.shape(cepstra)[1] if len(numpy.shape(cepstra)) == 2 else None
        if ncoeff is None:
            # keep the original failure mode for inputs that are not 2-D
            _, ncoeff = numpy.shape(cepstra)
        positions = numpy.arange(ncoeff)
        gain = 1 + (L / 2.) * numpy.sin(numpy.pi * positions / L)
        return gain * cepstra
    # values of L <= 0, do nothing
    return cepstra
|
||||
|
||||
def delta(feat, N):
    """Compute delta features from a feature vector sequence.

    For every frame t the delta is the regression
    ``sum_{n=-N..N} n * feat[t+n] / (2 * sum_{i=1..N} i**2)``, where frames
    beyond either end are replaced by the nearest edge frame.

    :param feat: A numpy array of size (NUMFRAMES by number of features) containing features. Each row holds 1 feature vector.
    :param N: For each frame, calculate delta features based on preceding and following N frames
    :returns: A numpy array of size (NUMFRAMES by number of features) containing delta features. Each row holds 1 delta feature vector.
        Floating-point (or complex) input keeps its dtype; any other dtype yields float64.
    :raises ValueError: if N < 1.
    """
    if N < 1:
        raise ValueError('N must be an integer >= 1')
    feat = numpy.asarray(feat)
    num_frames = len(feat)
    denominator = 2 * sum(i ** 2 for i in range(1, N + 1))

    # Allocating the output with the input's dtype truncates the fractional
    # regression result whenever the features are integers, so fall back to
    # float64 for every non-inexact dtype.
    if numpy.issubdtype(feat.dtype, numpy.inexact):
        out_dtype = feat.dtype
    else:
        out_dtype = numpy.float64
    delta_feat = numpy.empty(feat.shape, dtype=out_dtype)

    padded = numpy.pad(feat, ((N, N), (0, 0)), mode='edge')  # padded version of feat
    weights = numpy.arange(-N, N + 1)
    for t in range(num_frames):
        # [t : t+2*N+1] == [(N+t)-N : (N+t)+N+1]
        delta_feat[t] = numpy.dot(weights, padded[t:t + 2 * N + 1]) / denominator
    return delta_feat
|
@ -1,158 +0,0 @@
|
||||
# This file includes routines for basic signal processing including framing and computing power spectra.
|
||||
# Author: James Lyons 2012
|
||||
import decimal
|
||||
|
||||
import numpy
|
||||
import math
|
||||
import logging
|
||||
|
||||
|
||||
def round_half_up(number):
    """Round ``number`` to the nearest integer, with exact halves going away from zero.

    The value is converted to ``decimal.Decimal`` and quantized to a whole
    number using ``ROUND_HALF_UP``.

    :param number: the value to round.
    :returns: the rounded value as an int.
    """
    whole_unit = decimal.Decimal('1')
    rounded = decimal.Decimal(number).quantize(whole_unit, rounding=decimal.ROUND_HALF_UP)
    return int(rounded)
|
||||
|
||||
|
||||
def rolling_window(a, window, step=1):
    """Return overlapping windows over the last axis of ``a`` as a strided view.

    :param a: the input numpy array.
    :param window: number of samples in each window.
    :param step: keep every ``step``-th window. Default is 1.
    :returns: an array whose last two axes are (window index, position in window).
    """
    # http://ellisvalentiner.com/post/2017-03-21-np-strides-trick
    num_windows = a.shape[-1] - window + 1
    view_shape = a.shape[:-1] + (num_windows, window)
    # moving one window or one sample forward both advance by the last-axis stride
    view_strides = a.strides + (a.strides[-1],)
    all_windows = numpy.lib.stride_tricks.as_strided(a, shape=view_shape, strides=view_strides)
    return all_windows[::step]
|
||||
|
||||
|
||||
def framesig(sig, frame_len, frame_step, dither=1.0, preemph=0.97, remove_dc_offset=True, wintype='hamming', stride_trick=True):
    """Frame a signal into overlapping frames and condition each frame.

    Trailing samples that do not fill a whole frame are dropped. Each frame is
    dithered, optionally has its mean removed, is saved as a "raw" frame, and
    is then preemphasised; the analysis window is applied to the result.

    :param sig: the audio signal to frame.
    :param frame_len: length of each frame measured in samples.
    :param frame_step: number of samples after the start of the previous frame that the next frame should begin.
    :param dither: scale passed to ``do_dither`` for every frame. Default is 1.0.
    :param preemph: coefficient passed to ``do_preemphasis`` for every frame. Default is 0.97.
    :param remove_dc_offset: if true, ``do_remove_dc_offset`` is applied to every frame. Default is True.
    :param wintype: 'povey' selects a raised-cosine window raised to the power 0.85; any other value selects ``numpy.hamming``.
    :param stride_trick: use stride trick to compute the rolling window and window multiplication faster
    :returns: 2 values. The first is the array of windowed, preemphasised frames (NUMFRAMES by frame_len, float32 frames times the window).
        The second holds the same frames before preemphasis and windowing.
    """
    slen = len(sig)
    frame_len = int(round_half_up(frame_len))
    frame_step = int(round_half_up(frame_step))
    if slen <= frame_len:
        numframes = 1
    else:
        numframes = 1 + ((slen - frame_len) // frame_step)

    # check kaldi/src/feat/feature-window.h
    # NOTE(review): when len(sig) < frame_len this slice is shorter than one
    # frame -- confirm callers never pass such a signal.
    padsignal = sig[:(numframes - 1) * frame_step + frame_len]

    # Strings must be compared by value: identity (`is`) against a literal
    # depends on interning and is not guaranteed to match.
    if wintype == 'povey':
        positions = numpy.arange(frame_len)
        win = (0.5 - 0.5 * numpy.cos(2 * numpy.pi / (frame_len - 1) * positions)) ** 0.85
    else:  # the hamming window
        win = numpy.hamming(frame_len)

    if stride_trick:
        frames = rolling_window(padsignal, window=frame_len, step=frame_step)
    else:
        indices = numpy.tile(numpy.arange(0, frame_len), (numframes, 1)) + numpy.tile(
            numpy.arange(0, numframes * frame_step, frame_step), (frame_len, 1)).T
        indices = numpy.array(indices, dtype=numpy.int32)
        frames = padsignal[indices]
        win = numpy.tile(win, (numframes, 1))

    # astype copies, so the in-place helpers below never write into `sig`
    frames = frames.astype(numpy.float32)
    raw_frames = numpy.zeros(frames.shape)
    for frm in range(frames.shape[0]):
        frames[frm, :] = do_dither(frames[frm, :], dither)  # dither
        if remove_dc_offset:
            frames[frm, :] = do_remove_dc_offset(frames[frm, :])  # remove dc offset
        raw_frames[frm, :] = frames[frm, :]
        frames[frm, :] = do_preemphasis(frames[frm, :], preemph)  # preemphasize

    return frames * win, raw_frames
|
||||
|
||||
def deframesig(frames, siglen, frame_len, frame_step, winfunc=lambda x: numpy.ones((x,))):
    """Does overlap-add procedure to undo the action of framesig.

    Frames are summed at their original sample positions and the sum is
    divided by the accumulated window values at each position.

    :param frames: the array of frames.
    :param siglen: the length of the desired signal, use 0 if unknown. Output will be truncated to siglen samples.
    :param frame_len: length of each frame measured in samples.
    :param frame_step: number of samples after the start of the previous frame that the next frame should begin.
    :param winfunc: the analysis window to apply to each frame. By default no window is applied.
    :returns: a 1-D signal.
    """
    frame_len = round_half_up(frame_len)
    frame_step = round_half_up(frame_step)
    numframes = numpy.shape(frames)[0]
    assert numpy.shape(frames)[1] == frame_len, '"frames" matrix is wrong size, 2nd dim is not equal to frame_len'

    # indices[i, k] is the sample position of element k of frame i
    offsets_in_frame = numpy.tile(numpy.arange(0, frame_len), (numframes, 1))
    frame_starts = numpy.tile(numpy.arange(0, numframes * frame_step, frame_step), (frame_len, 1)).T
    indices = numpy.array(offsets_in_frame + frame_starts, dtype=numpy.int32)
    padlen = (numframes - 1) * frame_step + frame_len

    if siglen <= 0:
        siglen = padlen

    rec_signal = numpy.zeros((padlen,))
    window_correction = numpy.zeros((padlen,))
    win = winfunc(frame_len)

    for i in range(0, numframes):
        positions = indices[i, :]
        # add a little bit so it is never zero
        window_correction[positions] = window_correction[positions] + win + 1e-15
        rec_signal[positions] = rec_signal[positions] + frames[i, :]

    rec_signal = rec_signal / window_correction
    return rec_signal[0:siglen]
|
||||
|
||||
|
||||
def magspec(frames, NFFT):
    """Compute the magnitude spectrum of each frame in frames. If frames is an NxD matrix, output will be Nx(NFFT/2+1).

    A warning is logged when the frames are longer than NFFT, because the
    real FFT then only uses the first NFFT samples of each frame.

    :param frames: the array of frames. Each row is a frame.
    :param NFFT: the FFT length to use. If NFFT > frame_len, the frames are zero-padded.
    :returns: If frames is an NxD matrix, output will be Nx(NFFT/2+1). Each row will be the magnitude spectrum of the corresponding frame.
    """
    frame_len = numpy.shape(frames)[1]
    if frame_len > NFFT:
        # logging.warn is a deprecated alias that newer Python versions no
        # longer provide; logging.warning is the supported spelling.
        logging.warning(
            'frame length (%d) is greater than FFT size (%d), frame will be truncated. Increase NFFT to avoid.',
            frame_len, NFFT)
    complex_spec = numpy.fft.rfft(frames, NFFT)
    return numpy.absolute(complex_spec)
|
||||
|
||||
|
||||
def powspec(frames, NFFT):
    """Compute the power spectrum of each frame in frames. If frames is an NxD matrix, output will be Nx(NFFT/2+1).

    The result is the squared magnitude spectrum; no scaling by NFFT is applied.

    :param frames: the array of frames. Each row is a frame.
    :param NFFT: the FFT length to use. If NFFT > frame_len, the frames are zero-padded.
    :returns: If frames is an NxD matrix, output will be Nx(NFFT/2+1). Each row will be the power spectrum of the corresponding frame.
    """
    magnitude = magspec(frames, NFFT)
    return numpy.square(magnitude)
|
||||
|
||||
|
||||
def logpowspec(frames, NFFT, norm=1):
    """Compute the log power spectrum (in dB) of each frame in frames. If frames is an NxD matrix, output will be Nx(NFFT/2+1).

    Power values at or below 1e-30 are clamped to 1e-30 before the log.

    :param frames: the array of frames. Each row is a frame.
    :param NFFT: the FFT length to use. If NFFT > frame_len, the frames are zero-padded.
    :param norm: If norm=1, the log power spectrum is normalised so that the max value (across all frames) is 0.
    :returns: If frames is an NxD matrix, output will be Nx(NFFT/2+1). Each row will be the log power spectrum of the corresponding frame.
    """
    floor = 1e-30
    power = powspec(frames, NFFT)
    power[power <= floor] = floor
    log_power = 10 * numpy.log10(power)
    if not norm:
        return log_power
    return log_power - numpy.max(log_power)
|
||||
|
||||
def do_dither(signal, dither_value=1.0):
    """Add scaled Gaussian noise to ``signal`` in place.

    :param signal: numpy array to dither; it is modified in place.
    :param dither_value: factor applied to the standard-normal noise. Default is 1.0.
    :returns: the same ``signal`` object, after the noise has been added.
    """
    noise = numpy.random.normal(size=signal.shape)
    signal += noise * dither_value
    return signal
|
||||
|
||||
def do_remove_dc_offset(signal):
    """Subtract the mean of ``signal`` from it in place.

    :param signal: numpy array to centre; it is modified in place.
    :returns: the same ``signal`` object, after its mean has been removed.
    """
    offset = numpy.mean(signal)
    signal -= offset
    return signal
|
||||
|
||||
def do_preemphasis(signal, coeff=0.97):
    """Perform preemphasis on the input signal.

    The first output sample is ``(1 - coeff) * signal[0]``; every later
    sample is ``signal[n] - coeff * signal[n - 1]``.

    :param signal: The signal to filter.
    :param coeff: The preemphasis coefficient. 0 is no filter, default is 0.97.
    :returns: the filtered signal.
    """
    first_sample = (1 - coeff) * signal[0]
    remaining = signal[1:] - coeff * signal[:-1]
    return numpy.append(first_sample, remaining)
|
@ -1,140 +0,0 @@
|
||||
# This file includes routines for basic signal processing including framing and computing power spectra.
|
||||
# Author: James Lyons 2012
|
||||
import decimal
|
||||
|
||||
import numpy
|
||||
import math
|
||||
import logging
|
||||
|
||||
|
||||
def round_half_up(number):
    """Round ``number`` to the nearest integer; exact halves round away from zero.

    Rounding is done with ``decimal.Decimal.quantize`` and ``ROUND_HALF_UP``.

    :param number: the value to round.
    :returns: the rounded value as an int.
    """
    as_decimal = decimal.Decimal(number)
    quantized = as_decimal.quantize(decimal.Decimal('1'), rounding=decimal.ROUND_HALF_UP)
    return int(quantized)
|
||||
|
||||
|
||||
def rolling_window(a, window, step=1):
    """Build a strided view of overlapping windows over the last axis of ``a``.

    :param a: the input numpy array.
    :param window: number of samples in each window.
    :param step: keep every ``step``-th window. Default is 1.
    :returns: an array whose last two axes are (window index, position in window).
    """
    # http://ellisvalentiner.com/post/2017-03-21-np-strides-trick
    last_axis_len = a.shape[-1]
    last_axis_stride = a.strides[-1]
    windows = numpy.lib.stride_tricks.as_strided(
        a,
        shape=a.shape[:-1] + (last_axis_len - window + 1, window),
        strides=a.strides + (last_axis_stride,))
    return windows[::step]
|
||||
|
||||
|
||||
def framesig(sig, frame_len, frame_step, winfunc=lambda x: numpy.ones((x,)), stride_trick=True):
    """Frame a signal into overlapping frames.

    The signal is zero-padded at the end so that the last frame is complete.

    :param sig: the audio signal to frame.
    :param frame_len: length of each frame measured in samples.
    :param frame_step: number of samples after the start of the previous frame that the next frame should begin.
    :param winfunc: the analysis window to apply to each frame. By default no window is applied.
    :param stride_trick: use stride trick to compute the rolling window and window multiplication faster
    :returns: an array of frames. Size is NUMFRAMES by frame_len.
    """
    slen = len(sig)
    frame_len = int(round_half_up(frame_len))
    frame_step = int(round_half_up(frame_step))

    numframes = 1
    if slen > frame_len:
        # one frame plus however many steps are needed to cover the remainder
        numframes = 1 + int(math.ceil((1.0 * slen - frame_len) / frame_step))

    padlen = int((numframes - 1) * frame_step + frame_len)
    padding = numpy.zeros((padlen - slen,))
    padsignal = numpy.concatenate((sig, padding))

    if stride_trick:
        win = winfunc(frame_len)
        frames = rolling_window(padsignal, window=frame_len, step=frame_step)
    else:
        offsets_in_frame = numpy.tile(numpy.arange(0, frame_len), (numframes, 1))
        frame_starts = numpy.tile(numpy.arange(0, numframes * frame_step, frame_step), (frame_len, 1)).T
        indices = numpy.array(offsets_in_frame + frame_starts, dtype=numpy.int32)
        frames = padsignal[indices]
        win = numpy.tile(winfunc(frame_len), (numframes, 1))

    return frames * win
|
||||
|
||||
|
||||
def deframesig(frames, siglen, frame_len, frame_step, winfunc=lambda x: numpy.ones((x,))):
    """Does overlap-add procedure to undo the action of framesig.

    Frames are accumulated at their original sample positions, then the sum
    is divided by the accumulated window values at each position.

    :param frames: the array of frames.
    :param siglen: the length of the desired signal, use 0 if unknown. Output will be truncated to siglen samples.
    :param frame_len: length of each frame measured in samples.
    :param frame_step: number of samples after the start of the previous frame that the next frame should begin.
    :param winfunc: the analysis window to apply to each frame. By default no window is applied.
    :returns: a 1-D signal.
    """
    frame_len = round_half_up(frame_len)
    frame_step = round_half_up(frame_step)
    numframes = numpy.shape(frames)[0]
    assert numpy.shape(frames)[1] == frame_len, '"frames" matrix is wrong size, 2nd dim is not equal to frame_len'

    # row i of `indices` lists the sample positions covered by frame i
    sample_offsets = numpy.arange(0, frame_len)
    start_positions = numpy.arange(0, numframes * frame_step, frame_step)
    indices = numpy.tile(sample_offsets, (numframes, 1)) + numpy.tile(start_positions, (frame_len, 1)).T
    indices = numpy.array(indices, dtype=numpy.int32)
    padlen = (numframes - 1) * frame_step + frame_len

    if siglen <= 0:
        siglen = padlen

    rec_signal = numpy.zeros((padlen,))
    window_correction = numpy.zeros((padlen,))
    win = winfunc(frame_len)

    for frame_no in range(0, numframes):
        row = indices[frame_no, :]
        # add a little bit so it is never zero
        window_correction[row] = window_correction[row] + win + 1e-15
        rec_signal[row] = rec_signal[row] + frames[frame_no, :]

    rec_signal = rec_signal / window_correction
    return rec_signal[0:siglen]
|
||||
|
||||
|
||||
def magspec(frames, NFFT):
    """Compute the magnitude spectrum of each frame in frames. If frames is an NxD matrix, output will be Nx(NFFT/2+1).

    A warning is logged when the frames are longer than NFFT, because the
    real FFT then only uses the first NFFT samples of each frame.

    :param frames: the array of frames. Each row is a frame.
    :param NFFT: the FFT length to use. If NFFT > frame_len, the frames are zero-padded.
    :returns: If frames is an NxD matrix, output will be Nx(NFFT/2+1). Each row will be the magnitude spectrum of the corresponding frame.
    """
    frame_len = numpy.shape(frames)[1]
    if frame_len > NFFT:
        # logging.warn is a deprecated alias that newer Python versions no
        # longer provide; logging.warning is the supported spelling.
        logging.warning(
            'frame length (%d) is greater than FFT size (%d), frame will be truncated. Increase NFFT to avoid.',
            frame_len, NFFT)
    complex_spec = numpy.fft.rfft(frames, NFFT)
    return numpy.absolute(complex_spec)
|
||||
|
||||
|
||||
def powspec(frames, NFFT):
    """Compute the power spectrum of each frame in frames. If frames is an NxD matrix, output will be Nx(NFFT/2+1).

    The result is the squared magnitude spectrum scaled by 1/NFFT.

    :param frames: the array of frames. Each row is a frame.
    :param NFFT: the FFT length to use. If NFFT > frame_len, the frames are zero-padded.
    :returns: If frames is an NxD matrix, output will be Nx(NFFT/2+1). Each row will be the power spectrum of the corresponding frame.
    """
    scale = 1.0 / NFFT
    magnitude = magspec(frames, NFFT)
    return scale * numpy.square(magnitude)
|
||||
|
||||
|
||||
def logpowspec(frames, NFFT, norm=1):
    """Compute the log power spectrum (in dB) of each frame in frames. If frames is an NxD matrix, output will be Nx(NFFT/2+1).

    Power values at or below 1e-30 are clamped to 1e-30 before the log.

    :param frames: the array of frames. Each row is a frame.
    :param NFFT: the FFT length to use. If NFFT > frame_len, the frames are zero-padded.
    :param norm: If norm=1, the log power spectrum is normalised so that the max value (across all frames) is 0.
    :returns: If frames is an NxD matrix, output will be Nx(NFFT/2+1). Each row will be the log power spectrum of the corresponding frame.
    """
    smallest_power = 1e-30
    spectrum = powspec(frames, NFFT)
    too_small = spectrum <= smallest_power
    spectrum[too_small] = smallest_power
    decibels = 10 * numpy.log10(spectrum)
    return decibels - numpy.max(decibels) if norm else decibels
|
||||
|
||||
|
||||
def preemphasis(signal, coeff=0.95):
    """Perform preemphasis on the input signal.

    The first sample is passed through unchanged; every later sample is
    ``signal[n] - coeff * signal[n - 1]``.

    :param signal: The signal to filter.
    :param coeff: The preemphasis coefficient. 0 is no filter, default is 0.95.
    :returns: the filtered signal.
    """
    first_sample = signal[0]
    differenced = signal[1:] - coeff * signal[:-1]
    return numpy.append(first_sample, differenced)
|
Binary file not shown.
@ -1,10 +0,0 @@
|
||||
Metadata-Version: 1.0
|
||||
Name: python-speech-features
|
||||
Version: 0.6
|
||||
Summary: Python Speech Feature extraction
|
||||
Home-page: https://github.com/jameslyons/python_speech_features
|
||||
Author: James Lyons
|
||||
Author-email: james.lyons0@gmail.com
|
||||
License: MIT
|
||||
Description: UNKNOWN
|
||||
Platform: UNKNOWN
|
@ -1,12 +0,0 @@
|
||||
README.rst
|
||||
setup.py
|
||||
python_speech_features/__init__.py
|
||||
python_speech_features/base.py
|
||||
python_speech_features/base_orig.py
|
||||
python_speech_features/sigproc.py
|
||||
python_speech_features/sigproc_orig.py
|
||||
python_speech_features.egg-info/PKG-INFO
|
||||
python_speech_features.egg-info/SOURCES.txt
|
||||
python_speech_features.egg-info/dependency_links.txt
|
||||
python_speech_features.egg-info/top_level.txt
|
||||
test/test_sigproc.py
|
@ -1 +0,0 @@
|
||||
|
@ -1 +0,0 @@
|
||||
python_speech_features
|
Loading…
Reference in new issue