diff --git a/.flake8 b/.flake8 index b49cbf1a9..722899439 100644 --- a/.flake8 +++ b/.flake8 @@ -12,6 +12,7 @@ exclude = .git, # python cache __pycache__, + third_party/, # Provide a comma-separate list of glob patterns to include for checks. filename = *.py @@ -46,4 +47,4 @@ select = E, W, F, - C \ No newline at end of file + C diff --git a/.notebook/dataloader_with_tokens_tokenids.ipynb b/.notebook/dataloader_with_tokens_tokenids.ipynb index 30d492eba..7d93dd009 100644 --- a/.notebook/dataloader_with_tokens_tokenids.ipynb +++ b/.notebook/dataloader_with_tokens_tokenids.ipynb @@ -83,37 +83,39 @@ "text": [ "/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/ipykernel/ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.\n", " and should_run_async(code)\n", - "WARNING:root:register user softmax to paddle, remove this when fixed!\n", - "WARNING:root:register user log_softmax to paddle, remove this when fixed!\n", - "WARNING:root:register user sigmoid to paddle, remove this when fixed!\n", - "WARNING:root:register user log_sigmoid to paddle, remove this when fixed!\n", - "WARNING:root:register user relu to paddle, remove this when fixed!\n", - "WARNING:root:override cat of paddle if exists or register, remove this when fixed!\n", - "WARNING:root:override item of paddle.Tensor if exists or register, remove this when fixed!\n", - "WARNING:root:override long of paddle.Tensor if exists or register, remove this when fixed!\n", - "WARNING:root:override new_full of paddle.Tensor if exists or register, remove this when fixed!\n", - "WARNING:root:override eq of paddle.Tensor if exists or register, remove this when fixed!\n", - "WARNING:root:override contiguous of paddle.Tensor if exists or register, remove this when fixed!\n", - 
"WARNING:root:override size of paddle.Tensor (`to_static` do not process `size` property, maybe some `paddle` api dependent on it), remove this when fixed!\n", - "WARNING:root:register user view to paddle.Tensor, remove this when fixed!\n", - "WARNING:root:register user view_as to paddle.Tensor, remove this when fixed!\n", - "WARNING:root:register user masked_fill to paddle.Tensor, remove this when fixed!\n", - "WARNING:root:register user masked_fill_ to paddle.Tensor, remove this when fixed!\n", - "WARNING:root:register user fill_ to paddle.Tensor, remove this when fixed!\n", - "WARNING:root:register user repeat to paddle.Tensor, remove this when fixed!\n", - "WARNING:root:register user softmax to paddle.Tensor, remove this when fixed!\n", - "WARNING:root:register user sigmoid to paddle.Tensor, remove this when fixed!\n", - "WARNING:root:register user relu to paddle.Tensor, remove this when fixed!\n", - "WARNING:root:register user type_as to paddle.Tensor, remove this when fixed!\n", - "WARNING:root:register user to to paddle.Tensor, remove this when fixed!\n", - "WARNING:root:register user float to paddle.Tensor, remove this when fixed!\n", - "WARNING:root:register user glu to paddle.nn.functional, remove this when fixed!\n", - "WARNING:root:override ctc_loss of paddle.nn.functional if exists, remove this when fixed!\n", - "WARNING:root:register user Module to paddle.nn, remove this when fixed!\n", - "WARNING:root:register user ModuleList to paddle.nn, remove this when fixed!\n", - "WARNING:root:register user GLU to paddle.nn, remove this when fixed!\n", - "WARNING:root:register user ConstantPad2d to paddle.nn, remove this when fixed!\n", - "WARNING:root:register user export to paddle.jit, remove this when fixed!\n" + "[WARNING 2021/04/16 06:32:09 __init__.py:93] register user softmax to paddle, remove this when fixed!\n", + "[WARNING 2021/04/16 06:32:09 __init__.py:97] register user log_softmax to paddle, remove this when fixed!\n", + "[WARNING 2021/04/16 
06:32:09 __init__.py:101] register user sigmoid to paddle, remove this when fixed!\n", + "[WARNING 2021/04/16 06:32:09 __init__.py:105] register user log_sigmoid to paddle, remove this when fixed!\n", + "[WARNING 2021/04/16 06:32:09 __init__.py:109] register user relu to paddle, remove this when fixed!\n", + "[WARNING 2021/04/16 06:32:09 __init__.py:119] override cat of paddle if exists or register, remove this when fixed!\n", + "[WARNING 2021/04/16 06:32:09 __init__.py:133] override item of paddle.Tensor if exists or register, remove this when fixed!\n", + "[WARNING 2021/04/16 06:32:09 __init__.py:144] override long of paddle.Tensor if exists or register, remove this when fixed!\n", + "[WARNING 2021/04/16 06:32:09 __init__.py:164] override new_full of paddle.Tensor if exists or register, remove this when fixed!\n", + "[WARNING 2021/04/16 06:32:09 __init__.py:179] override eq of paddle.Tensor if exists or register, remove this when fixed!\n", + "[WARNING 2021/04/16 06:32:09 __init__.py:185] override eq of paddle if exists or register, remove this when fixed!\n", + "[WARNING 2021/04/16 06:32:09 __init__.py:195] override contiguous of paddle.Tensor if exists or register, remove this when fixed!\n", + "[WARNING 2021/04/16 06:32:09 __init__.py:212] override size of paddle.Tensor (`to_static` do not process `size` property, maybe some `paddle` api dependent on it), remove this when fixed!\n", + "[WARNING 2021/04/16 06:32:09 __init__.py:223] register user view to paddle.Tensor, remove this when fixed!\n", + "[WARNING 2021/04/16 06:32:09 __init__.py:233] register user view_as to paddle.Tensor, remove this when fixed!\n", + "[WARNING 2021/04/16 06:32:09 __init__.py:259] register user masked_fill to paddle.Tensor, remove this when fixed!\n", + "[WARNING 2021/04/16 06:32:09 __init__.py:277] register user masked_fill_ to paddle.Tensor, remove this when fixed!\n", + "[WARNING 2021/04/16 06:32:09 __init__.py:288] register user fill_ to paddle.Tensor, remove this when fixed!\n", 
+ "[WARNING 2021/04/16 06:32:09 __init__.py:298] register user repeat to paddle.Tensor, remove this when fixed!\n", + "[WARNING 2021/04/16 06:32:09 __init__.py:303] register user softmax to paddle.Tensor, remove this when fixed!\n", + "[WARNING 2021/04/16 06:32:09 __init__.py:308] register user sigmoid to paddle.Tensor, remove this when fixed!\n", + "[WARNING 2021/04/16 06:32:09 __init__.py:312] register user relu to paddle.Tensor, remove this when fixed!\n", + "[WARNING 2021/04/16 06:32:09 __init__.py:322] register user type_as to paddle.Tensor, remove this when fixed!\n", + "[WARNING 2021/04/16 06:32:09 __init__.py:337] register user to to paddle.Tensor, remove this when fixed!\n", + "[WARNING 2021/04/16 06:32:09 __init__.py:346] register user float to paddle.Tensor, remove this when fixed!\n", + "[WARNING 2021/04/16 06:32:09 __init__.py:356] register user tolist to paddle.Tensor, remove this when fixed!\n", + "[WARNING 2021/04/16 06:32:09 __init__.py:371] register user glu to paddle.nn.functional, remove this when fixed!\n", + "[WARNING 2021/04/16 06:32:09 __init__.py:422] override ctc_loss of paddle.nn.functional if exists, remove this when fixed!\n", + "[WARNING 2021/04/16 06:32:09 __init__.py:428] register user Module to paddle.nn, remove this when fixed!\n", + "[WARNING 2021/04/16 06:32:09 __init__.py:434] register user ModuleList to paddle.nn, remove this when fixed!\n", + "[WARNING 2021/04/16 06:32:09 __init__.py:450] register user GLU to paddle.nn, remove this when fixed!\n", + "[WARNING 2021/04/16 06:32:09 __init__.py:483] register user ConstantPad2d to paddle.nn, remove this when fixed!\n", + "[WARNING 2021/04/16 06:32:09 __init__.py:489] register user export to paddle.jit, remove this when fixed!\n" ] }, { @@ -191,6 +193,84 @@ { "cell_type": "code", "execution_count": 4, + "id": "wired-principal", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'num_samples': 5, 'beam_size': 500, 'num_proc_bsearch': 
8, 'num_conv_layers': 2, 'num_rnn_layers': 3, 'rnn_layer_size': 2048, 'alpha': 2.5, 'beta': 0.3, 'cutoff_prob': 1.0, 'cutoff_top_n': 40, 'use_gru': False, 'use_gpu': True, 'share_rnn_weights': True, 'unit_type': 'char', 'spm_model_prefix': 'examples/aishell/s1/data/spm_bpe', 'infer_manifest': 'examples/aishell/s1/data/manifest.test', 'mean_std_path': '', 'vocab_path': 'examples/aishell/s1/data/vocab.txt', 'lang_model_path': 'models/lm/common_crawl_00.prune01111.trie.klm', 'model_path': 'examples/aishell/s1/checkpoints/step_final', 'decoding_method': 'ctc_beam_search', 'error_rate_type': 'wer', 'specgram_type': 'fbank', 'feat_dim': 80, 'delta_delta': False}\n" + ] + } + ], + "source": [ + "import sys\n", + "import argparse\n", + "import functools\n", + "from deepspeech.utils.utility import add_arguments, print_arguments\n", + "parser = argparse.ArgumentParser(description=__doc__)\n", + "add_arg = functools.partial(add_arguments, argparser=parser)\n", + "# yapf: disable\n", + "add_arg('num_samples', int, 5, \"# of samples to infer.\")\n", + "add_arg('beam_size', int, 500, \"Beam search width.\")\n", + "add_arg('num_proc_bsearch', int, 8, \"# of CPUs for beam search.\")\n", + "add_arg('num_conv_layers', int, 2, \"# of convolution layers.\")\n", + "add_arg('num_rnn_layers', int, 3, \"# of recurrent layers.\")\n", + "add_arg('rnn_layer_size', int, 2048, \"# of recurrent cells per layer.\")\n", + "add_arg('alpha', float, 2.5, \"Coef of LM for beam search.\")\n", + "add_arg('beta', float, 0.3, \"Coef of WC for beam search.\")\n", + "add_arg('cutoff_prob', float, 1.0, \"Cutoff probability for pruning.\")\n", + "add_arg('cutoff_top_n', int, 40, \"Cutoff number for pruning.\")\n", + "add_arg('use_gru', bool, False, \"Use GRUs instead of simple RNNs.\")\n", + "add_arg('use_gpu', bool, True, \"Use GPU or not.\")\n", + "add_arg('share_rnn_weights',bool, True, \"Share input-hidden weights across \"\n", + " \"bi-directional RNNs. 
Not for GRU.\")\n", + "add_arg('unit_type', str,\n", + " 'char',\n", + " \"Options: char, word, spm.\",\n", + " choices=['char', 'word', 'spm'])\n", + "add_arg('spm_model_prefix', str,\n", + " 'examples/aishell/s1/data/spm_bpe',\n", + " \"spm model prefix.\",)\n", + "add_arg('infer_manifest', str,\n", + " 'examples/aishell/s1/data/manifest.test',\n", + " \"Filepath of manifest to infer.\")\n", + "add_arg('mean_std_path', str,\n", + " '',\n", + " \"examples/aishell/s1/data/mean_std.npz, Filepath of normalizer's mean & std.\")\n", + "add_arg('vocab_path', str,\n", + " 'examples/aishell/s1/data/vocab.txt',\n", + " \"Filepath of vocabulary.\")\n", + "add_arg('lang_model_path', str,\n", + " 'models/lm/common_crawl_00.prune01111.trie.klm',\n", + " \"Filepath for language model.\")\n", + "add_arg('model_path', str,\n", + " 'examples/aishell/s1/checkpoints/step_final',\n", + " \"If None, the training starts from scratch, \"\n", + " \"otherwise, it resumes from the pre-trained model.\")\n", + "add_arg('decoding_method', str,\n", + " 'ctc_beam_search',\n", + " \"Decoding method. Options: ctc_beam_search, ctc_greedy\",\n", + " choices = ['ctc_beam_search', 'ctc_greedy'])\n", + "add_arg('error_rate_type', str,\n", + " 'wer',\n", + " \"Error rate type for evaluation.\",\n", + " choices=['wer', 'cer'])\n", + "add_arg('specgram_type', str,\n", + " 'fbank',\n", + " \"Audio feature type. 
Options: linear, mfcc.\",\n", + " choices=['linear', 'mfcc', 'fbank'])\n", + "add_arg('feat_dim', int, 80, \"mfcc or fbank feat dim.\")\n", + "add_arg('delta_delta', bool, False, \"delta delta\")\n", + "# yapf: disable\n", + "args = parser.parse_args([])\n", + "print(vars(args))" + ] + }, + { + "cell_type": "code", + "execution_count": 5, "id": "bearing-physics", "metadata": {}, "outputs": [ @@ -259,7 +339,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "id": "classified-melissa", "metadata": {}, "outputs": [ @@ -268,7 +348,31 @@ "output_type": "stream", "text": [ "/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/ipykernel/ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.\n", - " and should_run_async(code)\n", + " and should_run_async(code)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "fbank\n", + "[232 387 331 ... 249 249 262] int16\n", + "fbank\n", + "[-138 -219 -192 ... 338 324 351] int16\n", + "fbank\n", + "[ 694 1175 1022 ... 553 514 627] int16\n", + "fbank\n", + "[-39 -79 -53 ... 139 172 99] int16\n", + "fbank\n", + "[-277 -480 -425 ... 758 767 739] int16\n", + "fbank\n", + "[ 399 693 609 ... 1291 1270 1291] int16\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ "/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/paddle/fluid/dataloader/dataloader_iter.py:354: DeprecationWarning: `np.object` is a deprecated alias for the builtin `object`. To silence this warning, use `object` by itself. Doing this will not modify any behavior and is safe. 
\n", "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n", " if arr.dtype == np.object:\n" @@ -278,58 +382,106 @@ "name": "stdout", "output_type": "stream", "text": [ - "test: Tensor(shape=[5, 23], dtype=int32, place=CUDAPinnedPlace, stop_gradient=True,\n", - " [[116, 104, 101, 32, 116, 119, 101, 110, 116, 105, 101, 115, -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 ],\n", - " [119, 104, 101, 114, 101, 32, 105, 115, 32, 116, 104, 97, 116, -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 ],\n", - " [116, 101, 110, 32, 115, 101, 99, 111, 110, 100, 115, -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 ],\n", - " [104, 101, 32, 100, 111, 101, 115, 110, 39, 116, 32, 119, 111, 114, 107, 32, 97, 116, 32, 97, 108, 108, -1 ],\n", - " [119, 104, 101, 114, 101, 32, 105, 115, 32, 109, 121, 32, 98, 114, 111, 116, 104, 101, 114, 32, 110, 111, 119]])\n", - "test raw: the twenties\n", - "test raw: where is my brother now\n", + "fbank\n", + "[ -750 -1254 -1107 ... 2276 1889 2067] int16\n", + "fbank\n", + "[ -127 -199 -149 ... -5243 -5065 -5398] int16\n", + "fbank\n", + "[ 465 783 677 ... 980 903 1008] int16\n", + "fbank\n", + "[ 90 160 157 ... -2 -16 -21] int16\n", + "fbank\n", + "[ 213 345 295 ... 2483 2246 2501] int16\n", + "fbank\n", + "[ -86 -159 -131 ... 270 258 290] int16\n", + "fbank\n", + "[-1023 -1714 -1505 ... 1532 1596 1575] int16\n", + "fbank\n", + "[-366 -602 -527 ... 374 370 379] int16\n", + "fbank\n", + "[ 761 1275 1127 ... 369 413 295] int16\n", + "fbank\n", + "[382 621 550 ... 161 161 174] int16\n", + "fbank\n", + "[ -28 -91 -120 ... 28 34 11] int16\n", + "fbank\n", + "[ -5 -5 -5 ... 268 294 341] int16\n", + "fbank\n", + "[240 417 684 ... 267 262 219] int16\n", + "fbank\n", + "[131 206 194 ... 
383 320 343] int16\n", + "test: Tensor(shape=[5, 7], dtype=int32, place=CUDAPinnedPlace, stop_gradient=True,\n", + " [[31069, 21487, 29233, 30340, 20320, -1 , -1 ],\n", + " [20540, 24471, 19968, 25552, 30340, 26159, -1 ],\n", + " [36825, 20010, 31243, 24230, 26159, 32654, 30340],\n", + " [20108, 21040, 20108, -1 , -1 , -1 , -1 ],\n", + " [21435, 34892, 25919, 21270, -1 , -1 , -1 ]])\n", + "fbank\n", + "[1155 1890 1577 ... 1092 989 1130] int16\n", + "fbank\n", + "[296 358 296 ... 140 140 168] int16\n", + "fbank\n", + "[-50 -91 -63 ... 104 104 86] int16\n", + "fbank\n", + "[-37 -66 -50 ... -31 -45 -52] int16\n", + "fbank\n", + "[-401 -652 -547 ... -339 -307 -344] int16\n", + "fbank\n", + "[-21 -47 -51 ... 94 81 107] int16\n", + "fbank\n", + "[ 533 887 755 ... 3074 2853 3254] int16\n", + "fbank\n", + "[ 44 71 66 ... -628 -733 -601] int16\n", + "fbank\n", + "[ 50 86 79 ... 129 116 138] int16\n", + "fbank\n", + "[ 92 146 126 ... -208 -193 -179] int16\n", + "test raw: 祝可爱的你\n", + "test raw: 去行政化\n", "audio len: Tensor(shape=[5], dtype=int64, place=CUDAPinnedPlace, stop_gradient=True,\n", - " [163, 173, 184, 190, 203])\n", + " [184, 194, 196, 204, 207])\n", "test len: Tensor(shape=[5], dtype=int64, place=CUDAPlace(0), stop_gradient=True,\n", - " [12, 13, 11, 22, 23])\n", - "audio: Tensor(shape=[5, 203, 80], dtype=float32, place=CUDAPinnedPlace, stop_gradient=True,\n", - " [[[-51.32406616, -17.91388321, 0.00000000 , ..., -26.66350746, -27.46039391, -27.22303963],\n", - " [-15.19027233, -20.52460480, 0.00000000 , ..., -28.47811317, -26.87953568, -25.13592339],\n", - " [-22.80181694, -19.48889351, 0.00000000 , ..., -29.96320724, -25.96619034, -24.57164192],\n", + " [5, 6, 7, 3, 4])\n", + "audio: Tensor(shape=[5, 207, 80], dtype=float32, place=CUDAPinnedPlace, stop_gradient=True,\n", + " [[[12.25633812, 12.61639309, 10.36936474, ..., 13.02949619, 11.51365757, 10.59789085],\n", + " [13.32148266, 13.41071606, 11.43800735, ..., 13.69783783, 12.83939362, 11.51259613],\n", + " 
[12.62640572, 12.53621101, 10.97212505, ..., 13.33757591, 12.32293034, 10.75493717],\n", " ...,\n", - " [ 0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", - " [ 0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", - " [ 0. , 0. , 0. , ..., 0. , 0. , 0. ]],\n", + " [0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", + " [0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", + " [0. , 0. , 0. , ..., 0. , 0. , 0. ]],\n", "\n", - " [[-15.38297653, -18.95307732, 0.00000000 , ..., -15.22777271, -16.46900940, -12.32327461],\n", - " [-14.06289291, -12.69954872, 0.00000000 , ..., -15.68012810, -16.92030334, -13.49134445],\n", - " [-19.78544235, -11.63046265, 0.00000000 , ..., -14.35409069, -14.82787228, -15.72653484],\n", + " [[10.99619484, 11.35202599, 9.56922054 , ..., 9.94971657 , 9.88354111 , 9.55315971 ],\n", + " [10.44461155, 9.81688595 , 5.62538481 , ..., 10.60468388, 10.94417381, 9.42646980 ],\n", + " [10.23835754, 10.23407459, 7.99464273 , ..., 10.68097591, 9.91640091 , 10.04131031],\n", " ...,\n", - " [ 0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", - " [ 0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", - " [ 0. , 0. , 0. , ..., 0. , 0. , 0. ]],\n", + " [0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", + " [0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", + " [0. , 0. , 0. , ..., 0. , 0. , 0. ]],\n", "\n", - " [[-22.65289879, -21.11938667, 0.00000000 , ..., -31.80981827, -30.58669853, -28.68988228],\n", - " [-31.04699135, -21.68680763, 0.00000000 , ..., -29.90789604, -30.31726456, -30.99709320],\n", - " [-18.16406441, -17.50658417, 0.00000000 , ..., -29.47821617, -29.77137375, -30.45121002],\n", + " [[14.10299397, 14.50298119, 12.87738323, ..., 12.62796497, 12.69949627, 11.43171215],\n", + " [13.85035992, 13.15289116, 10.66541386, ..., 13.34364223, 13.46972179, 11.02160740],\n", + " [13.19866467, 13.23537827, 11.65760899, ..., 12.72559357, 12.42716217, 11.74562359],\n", " ...,\n", - " [ 0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", - " [ 0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", - " [ 0. , 0. , 0. , ..., 0. , 0. , 0. ]],\n", + " [0. , 0. , 0. 
, ..., 0. , 0. , 0. ],\n", + " [0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", + " [0. , 0. , 0. , ..., 0. , 0. , 0. ]],\n", "\n", - " [[-16.17608452, -15.22302818, 0.00000000 , ..., -8.82944202 , -7.88900328 , -6.10806322 ],\n", - " [-19.40717316, -12.32932186, 0.00000000 , ..., -8.05214977 , -8.03145599 , -7.35137606 ],\n", - " [-11.01850796, -13.20147514, 0.00000000 , ..., -9.65334892 , -8.96987629 , -9.13897228 ],\n", + " [[12.85668373, 12.82431412, 11.68144703, ..., 14.10119247, 15.12791920, 13.68221378],\n", + " [13.19507027, 13.40244961, 11.43618393, ..., 13.32919979, 13.68267441, 12.73429012],\n", + " [13.02173328, 12.92082500, 11.44303989, ..., 12.77793121, 13.10915661, 11.77327728],\n", " ...,\n", - " [ 0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", - " [ 0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", - " [ 0. , 0. , 0. , ..., 0. , 0. , 0. ]],\n", + " [0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", + " [0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", + " [0. , 0. , 0. , ..., 0. , 0. , 0. ]],\n", "\n", - " [[-16.55369759, -16.95514297, 0.00000000 , ..., -7.00301647 , -6.53273058 , -10.14600754],\n", - " [-19.51947975, -14.86818218, 0.00000000 , ..., -6.82891273 , -6.22576237 , -9.42883873 ],\n", - " [-15.26447582, -22.26662445, 0.00000000 , ..., -13.31693172, -11.05612659, -12.70977211],\n", + " [[12.90771198, 13.40234852, 13.01435471, ..., 13.80359459, 14.08088684, 13.17883396],\n", + " [14.06678009, 14.06943512, 12.52837276, ..., 13.66423225, 13.66300583, 13.60142994],\n", + " [12.58743191, 12.94520760, 11.75190544, ..., 14.28828907, 14.08229160, 13.02433395],\n", " ...,\n", - " [-4.81728077 , -10.65084648, 0.00000000 , ..., 3.19982862 , 8.42359638 , 7.95100546 ],\n", - " [-7.54755068 , -12.56441689, 0.00000000 , ..., 4.12789631 , 6.98472023 , 7.79936218 ],\n", - " [-8.79256725 , -11.23776722, 0.00000000 , ..., 1.31829071 , 1.30352044 , 6.80789280 ]]])\n" + " [16.20896912, 16.42283821, 14.94358730, ..., 12.91146755, 12.66766262, 11.76361752],\n", + " [13.49324894, 14.14653301, 13.16490936, 
..., 13.23435783, 13.45378494, 12.60386276],\n", + " [15.56288910, 15.92445087, 14.90794277, ..., 13.43840790, 13.41075516, 12.55605984]]])\n" ] } ], @@ -354,7 +506,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, "id": "minus-modern", "metadata": {}, "outputs": [ @@ -362,58 +514,70 @@ "name": "stdout", "output_type": "stream", "text": [ - "test: Tensor(shape=[5, 23], dtype=int32, place=CUDAPinnedPlace, stop_gradient=True,\n", - " [[87, 37, 26, 1, 87, 97, 26, 61, 87, 38, 26, 82, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1],\n", - " [97, 37, 26, 79, 26, 1, 38, 82, 1, 87, 37, 3, 87, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1],\n", - " [87, 26, 61, 1, 82, 26, 18, 64, 61, 25, 82, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1],\n", - " [37, 26, 1, 25, 64, 26, 82, 61, 2, 87, 1, 97, 64, 79, 52, 1, 3, 87, 1, 3, 53, 53, -1],\n", - " [97, 37, 26, 79, 26, 1, 38, 82, 1, 58, 102, 1, 17, 79, 64, 87, 37, 26, 79, 1, 61, 64, 97]])\n", - "test raw: W%\u001a\u0001Wa\u001a=W&\u001aR\n", - "test raw: a%\u001aO\u001a\u0001&R\u0001:f\u0001\u0011O@W%\u001aO\u0001=@a\n", + "fbank\n", + "[232 387 331 ... 249 249 262] int16\n", + "fbank\n", + "[-138 -219 -192 ... 338 324 351] int16\n", + "fbank\n", + "[ 694 1175 1022 ... 553 514 627] int16\n", + "fbank\n", + "[-39 -79 -53 ... 139 172 99] int16\n", + "fbank\n", + "[-277 -480 -425 ... 758 767 739] int16\n", + "fbank\n", + "test: Tensor(shape=[5, 7], dtype=int32, place=CUDAPinnedPlace, stop_gradient=True,\n", + " [[2695, 505, 2332, 2553, 169, -1 , -1 ],\n", + " [ 230, 1237, 2 , 1556, 2553, 1694, -1 ],\n", + " [3703, 28 , 2739, 1172, 1694, 2966, 2553],\n", + " [ 70 , 355, 70 , -1 , -1 , -1 , -1 ],\n", + " [ 477, 3363, 1621, 412, -1 , -1 , -1 ]])\n", + "[ 399 693 609 ... 
1291 1270 1291] int16\n", + "test raw: ઇǹज৹©\n", + "test raw: ǝണٕƜ\n", "test len: Tensor(shape=[5], dtype=int64, place=CUDAPlace(0), stop_gradient=True,\n", - " [12, 13, 11, 22, 23])\n", - "audio: Tensor(shape=[5, 203, 80], dtype=float32, place=CUDAPinnedPlace, stop_gradient=True,\n", - " [[[-51.32406616, -17.91388321, 0.00000000 , ..., -26.66350746, -27.46039391, -27.22303963],\n", - " [-15.19027233, -20.52460480, 0.00000000 , ..., -28.47811317, -26.87953568, -25.13592339],\n", - " [-22.80181694, -19.48889351, 0.00000000 , ..., -29.96320724, -25.96619034, -24.57164192],\n", + " [5, 6, 7, 3, 4])\n", + "audio: Tensor(shape=[5, 207, 80], dtype=float32, place=CUDAPinnedPlace, stop_gradient=True,\n", + " [[[12.25794601, 12.61855793, 10.37306023, ..., 13.12571049, 11.53678799, 10.32210350],\n", + " [13.32333183, 13.41336918, 11.44248962, ..., 13.65861225, 12.79308128, 11.31168747],\n", + " [12.62584686, 12.53506088, 10.96861362, ..., 13.32526493, 12.41560936, 10.71458912],\n", " ...,\n", - " [ 0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", - " [ 0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", - " [ 0. , 0. , 0. , ..., 0. , 0. , 0. ]],\n", + " [0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", + " [0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", + " [0. , 0. , 0. , ..., 0. , 0. , 0. ]],\n", "\n", - " [[-15.38297653, -18.95307732, 0.00000000 , ..., -15.22777271, -16.46900940, -12.32327461],\n", - " [-14.06289291, -12.69954872, 0.00000000 , ..., -15.68012810, -16.92030334, -13.49134445],\n", - " [-19.78544235, -11.63046265, 0.00000000 , ..., -14.35409069, -14.82787228, -15.72653484],\n", + " [[11.00003052, 11.35529137, 9.56384087 , ..., 10.06063652, 10.16322994, 9.43149185 ],\n", + " [10.44556236, 9.81155300 , 5.49400425 , ..., 10.84116268, 11.02734756, 9.42253590 ],\n", + " [10.23620510, 10.23321152, 7.99466419 , ..., 10.93381882, 10.28395081, 10.00841141],\n", " ...,\n", - " [ 0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", - " [ 0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", - " [ 0. , 0. , 0. , ..., 0. , 0. , 0. 
]],\n", + " [0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", + " [0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", + " [0. , 0. , 0. , ..., 0. , 0. , 0. ]],\n", "\n", - " [[-22.65289879, -21.11938667, 0.00000000 , ..., -31.80981827, -30.58669853, -28.68988228],\n", - " [-31.04699135, -21.68680763, 0.00000000 , ..., -29.90789604, -30.31726456, -30.99709320],\n", - " [-18.16406441, -17.50658417, 0.00000000 , ..., -29.47821617, -29.77137375, -30.45121002],\n", + " [[14.10379314, 14.50375748, 12.87825108, ..., 12.68065739, 12.62359715, 11.53773308],\n", + " [13.84964657, 13.15079498, 10.67198086, ..., 13.24875164, 13.45796680, 10.97363472],\n", + " [13.19808197, 13.23482990, 11.65900230, ..., 12.70375061, 12.41395664, 11.88668156],\n", " ...,\n", - " [ 0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", - " [ 0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", - " [ 0. , 0. , 0. , ..., 0. , 0. , 0. ]],\n", + " [0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", + " [0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", + " [0. , 0. , 0. , ..., 0. , 0. , 0. ]],\n", "\n", - " [[-16.17608452, -15.22302818, 0.00000000 , ..., -8.82944202 , -7.88900328 , -6.10806322 ],\n", - " [-19.40717316, -12.32932186, 0.00000000 , ..., -8.05214977 , -8.03145599 , -7.35137606 ],\n", - " [-11.01850796, -13.20147514, 0.00000000 , ..., -9.65334892 , -8.96987629 , -9.13897228 ],\n", + " [[12.85676289, 12.82410812, 11.67961884, ..., 14.12018299, 15.14850044, 13.80065727],\n", + " [13.19532776, 13.40243340, 11.43492508, ..., 13.29144669, 13.70278549, 12.67841339],\n", + " [13.02196407, 12.92111111, 11.43998623, ..., 12.71165752, 13.16518497, 11.92028046],\n", " ...,\n", - " [ 0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", - " [ 0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", - " [ 0. , 0. , 0. , ..., 0. , 0. , 0. ]],\n", + " [0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", + " [0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", + " [0. , 0. , 0. , ..., 0. , 0. , 0. 
]],\n", "\n", - " [[-16.55369759, -16.95514297, 0.00000000 , ..., -7.00301647 , -6.53273058 , -10.14600754],\n", - " [-19.51947975, -14.86818218, 0.00000000 , ..., -6.82891273 , -6.22576237 , -9.42883873 ],\n", - " [-15.26447582, -22.26662445, 0.00000000 , ..., -13.31693172, -11.05612659, -12.70977211],\n", + " [[12.90661621, 13.40162563, 13.01394463, ..., 13.84056377, 14.11240959, 13.21227264],\n", + " [14.06642914, 14.06922340, 12.52955723, ..., 13.55829811, 13.60157204, 13.50268650],\n", + " [12.58881378, 12.94780254, 11.75758171, ..., 14.29055786, 14.12165928, 13.02695847],\n", " ...,\n", - " [-4.81728077 , -10.65084648, 0.00000000 , ..., 3.19982862 , 8.42359638 , 7.95100546 ],\n", - " [-7.54755068 , -12.56441689, 0.00000000 , ..., 4.12789631 , 6.98472023 , 7.79936218 ],\n", - " [-8.79256725 , -11.23776722, 0.00000000 , ..., 1.31829071 , 1.30352044 , 6.80789280 ]]])\n", + " [16.20891571, 16.42290306, 14.94398117, ..., 12.86083794, 12.63515949, 11.67581463],\n", + " [13.49345875, 14.14656067, 13.16498375, ..., 13.28024578, 13.40956783, 12.70357513],\n", + " [15.56265163, 15.92387581, 14.90643024, ..., 13.45694065, 13.44703197, 12.81099033]]])\n", "audio len: Tensor(shape=[5], dtype=int64, place=CUDAPinnedPlace, stop_gradient=True,\n", - " [163, 173, 184, 190, 203])\n" + " [184, 194, 196, 204, 207])\n" ] } ], @@ -464,6 +628,556 @@ "metadata": {}, "outputs": [], "source": [] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "knowing-military", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'num_samples': 1, 'specgram_type': 'fbank', 'feat_dim': 80, 'delta_delta': False, 'stride_ms': 10.0, 'window_ms': 25.0, 'sample_rate': 16000, 'manifest_path': 'examples/aishell/s1/data/manifest.train', 'output_path': 'examples/aishell/s1/data/mean_std.npz'}\n" + ] + } + ], + "source": [ + "import sys\n", + "import argparse\n", + "import functools\n", + "from deepspeech.utils.utility import add_arguments, 
print_arguments\n", + "parser = argparse.ArgumentParser(description=__doc__)\n", + "add_arg = functools.partial(add_arguments, argparser=parser)\n", + "\n", + "add_arg('num_samples', int, 1, \"# of samples to for statistics.\")\n", + "add_arg('specgram_type', str, 'fbank',\n", + " \"Audio feature type. Options: linear, mfcc, fbank.\",\n", + " choices=['linear', 'mfcc', 'fbank'])\n", + "add_arg('feat_dim', int, 80, \"Audio feature dim.\")\n", + "add_arg('delta_delta', bool, False,\"Audio feature with delta delta.\")\n", + "add_arg('stride_ms', float, 10.0, \"stride length in ms.\")\n", + "add_arg('window_ms', float, 25.0, \"stride length in ms.\")\n", + "add_arg('sample_rate', int, 16000, \"target sample rate.\")\n", + "add_arg('manifest_path', str,\n", + " 'examples/aishell/s1/data/manifest.train',\n", + " \"Filepath of manifest to compute normalizer's mean and stddev.\")\n", + "add_arg('output_path', str,\n", + " 'examples/aishell/s1/data/mean_std.npz',\n", + " \"Filepath of write mean and stddev to (.npz).\")\n", + "args = parser.parse_args([])\n", + "print(vars(args))\n" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "unnecessary-province", + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "from deepspeech.frontend.augmentor.augmentation import AugmentationPipeline\n", + "from deepspeech.frontend.featurizer.audio_featurizer import AudioFeaturizer\n", + "from deepspeech.frontend.normalizer import FeatureNormalizer\n", + "from deepspeech.frontend.audio import AudioSegment\n", + "from deepspeech.frontend.utility import load_cmvn\n", + "from deepspeech.frontend.utility import read_manifest\n", + "\n", + "\n", + "\n", + "def mean(args):\n", + " augmentation_pipeline = AugmentationPipeline('{}')\n", + " audio_featurizer = AudioFeaturizer(\n", + " specgram_type=args.specgram_type,\n", + " feat_dim=args.feat_dim,\n", + " delta_delta=args.delta_delta,\n", + " stride_ms=args.stride_ms,\n", + " window_ms=args.window_ms,\n", + " n_fft=None,\n", 
+ " max_freq=None,\n", + " target_sample_rate=args.sample_rate,\n", + " use_dB_normalization=True,\n", + " target_dB=-20,\n", + " dither=0.0)\n", + "\n", + " def augment_and_featurize(audio_segment):\n", + " augmentation_pipeline.transform_audio(audio_segment)\n", + " return audio_featurizer.featurize(audio_segment)\n", + "\n", + " normalizer = FeatureNormalizer(\n", + " mean_std_filepath=None,\n", + " manifest_path=args.manifest_path,\n", + " featurize_func=augment_and_featurize,\n", + " num_samples=args.num_samples)\n", + " normalizer.write_to_file(args.output_path)\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "interested-camping", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[0.00164795 0.00274658 0.00234985 ... 0.00177002 0.00177002 0.00186157]\n", + "[54. 90. 77. ... 58. 58. 61.]\n", + "29746\n", + "fbank\n", + "[54 90 77 ... 58 58 61] int16\n", + "(184, 80) float64\n", + "[[10.61737914 10.07708936 5.32487528 ... 10.2481839 8.89699394\n", + " 7.80671114]\n", + " [11.0440077 10.3180721 6.30866128 ... 11.23730926 10.35838868\n", + " 8.83860079]\n", + " [10.26930555 9.99636567 7.3296638 ... 10.45131595 9.69295303\n", + " 7.96168491]\n", + " ...\n", + " [10.14497345 9.88674207 6.73801138 ... 10.21580627 9.00343472\n", + " 8.75616521]\n", + " [ 9.97745961 9.67949736 7.90660425 ... 10.22436653 9.59456493\n", + " 7.69287184]\n", + " [ 6.47357374 7.76335491 7.75765843 ... 9.96522077 9.6226365\n", + " 8.16007108]]\n", + "(184, 80) float64\n", + "[[10.61737914 10.07708936 5.32487528 ... 10.2481839 8.89699394\n", + " 7.80671114]\n", + " [11.0440077 10.3180721 6.30866128 ... 11.23730926 10.35838868\n", + " 8.83860079]\n", + " [10.26930555 9.99636567 7.3296638 ... 10.45131595 9.69295303\n", + " 7.96168491]\n", + " ...\n", + " [10.14497345 9.88674207 6.73801138 ... 10.21580627 9.00343472\n", + " 8.75616521]\n", + " [ 9.97745961 9.67949736 7.90660425 ... 
10.22436653 9.59456493\n", + " 7.69287184]\n", + " [ 6.47357374 7.76335491 7.75765843 ... 9.96522077 9.6226365\n", + " 8.16007108]]\n" + ] + } + ], + "source": [ + "wav='/workspace/DeepSpeech-2.x/examples/aishell/s1/../../..//examples/dataset/aishell/data_aishell/wav/test/S0916/BAC009S0916W0426.wav'\n", + "test='祝可爱的你'\n", + "audio_featurizer = AudioFeaturizer(\n", + " specgram_type=args.specgram_type,\n", + " feat_dim=args.feat_dim,\n", + " delta_delta=args.delta_delta,\n", + " stride_ms=args.stride_ms,\n", + " window_ms=args.window_ms,\n", + " n_fft=None,\n", + " max_freq=None,\n", + " target_sample_rate=args.sample_rate,\n", + " use_dB_normalization=False,\n", + " target_dB=-20,\n", + " dither=0.0)\n", + "samples = AudioSegment.from_file(wav)\n", + "print(samples._samples)\n", + "print(samples._samples * 2**15)\n", + "print(len(samples._samples))\n", + "feat = audio_featurizer.featurize(samples, False, False)\n", + "feat = feat.T\n", + "print(feat.shape, feat.dtype)\n", + "print(feat)\n", + "\n", + "from python_speech_features import logfbank\n", + "max_freq = args.sample_rate / 2\n", + "fbank_feat = logfbank(\n", + " signal=samples.to('int16'),\n", + " samplerate=args.sample_rate,\n", + " winlen=0.001 * args.window_ms,\n", + " winstep=0.001 * args.stride_ms,\n", + " nfilt=args.feat_dim,\n", + " nfft=512,\n", + " lowfreq=20,\n", + " highfreq=max_freq,\n", + " preemph=0.97,\n", + " dither=0.0,\n", + " wintype='povey')\n", + "print(fbank_feat.shape, fbank_feat.dtype)\n", + "print(fbank_feat)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "numeric-analyst", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(184, 160)\n", + "[ 8.59522397 8.43148278 8.36414052 8.45487173 8.31761643 8.04843683\n", + " 8.01683696 7.6574614 7.95521932 8.22945157 10.20138275 9.0447775\n", + " 9.14763398 9.18184349 9.03801065 9.04852307 8.67706728 8.71894271\n", + " 9.54553655 9.19535135 8.76413076 8.47828946 
8.52586143 8.49469288\n", + " 8.72461247 8.28562879 8.11581393 7.99922156 7.91023364 8.04142296\n", + " 7.89762773 7.76257636 8.32043745 8.01592886 8.34109665 8.90115454\n", + " 8.48246945 7.98658664 8.05745122 8.11384088 8.18864479 8.8091827\n", + " 11.8067711 13.25258218 14.44311795 13.90515283 14.00120623 13.99801252\n", + " 13.81595394 13.6379904 13.3574897 13.14933334 12.96518543 13.02601156\n", + " 12.70246737 12.54410834 12.15615068 11.86574681 11.67497882 10.79645481\n", + " 10.48150035 10.03758575 10.05637027 9.92891308 10.06923218 12.43382431\n", + " 12.71428321 14.33135052 13.94470959 14.29188291 14.11483993 14.03496606\n", + " 13.78167331 13.66701466 14.40308625 14.73934137 15.09569382 14.89565815\n", + " 15.10519995 14.94383582 15.03275563 15.42194679 15.29219967 15.41602274\n", + " 15.39242545 15.76836177 16.259222 16.47777231 17.03366795 17.46165793\n", + " 17.52596217 17.78844031 17.99878075 18.11446843 17.95761578 17.99900337\n", + " 17.86282737 17.7290163 17.47686504 17.43425516 17.07750485 16.64395242\n", + " 15.68217043 14.90058399 14.45645737 14.0405463 14.89549542 16.00405781\n", + " 16.27301689 16.37572895 16.31219037 16.31765447 16.44819716 16.36281089\n", + " 16.24932823 15.79302555 14.76361963 13.95761882 13.48917053 13.45543501\n", + " 13.00091327 13.13854248 13.74596395 13.86340629 14.00656109 13.77432101\n", + " 13.64267001 13.35742634 13.23042234 12.97916104 12.80694468 12.70005006\n", + " 13.2802483 13.22644525 13.14579624 13.02536594 13.36511022 11.37167205\n", + " 12.11598045 12.47619798 12.83885973 11.63880287 11.42083924 11.08747705\n", + " 11.04093403 11.11263149 10.74353319 10.58734669 10.46180738 10.34157335\n", + " 9.63131146 9.70582692 9.29059204 8.94583657 8.66065094 8.46799095\n", + " 8.25064103 8.30239167 8.19463371 8.12104567 8.02731234 8.06412715\n", + " 7.84889951 7.73090283 7.74119562 7.85444657 7.80717312 7.7129933\n", + " 7.84087442 7.77907788 7.60660865 7.55051479 7.458385 7.496416\n", + " 7.69519793 7.49086759 
7.32199493 8.01617458 7.58525375 7.06661122\n", + " 6.94653756 7.19874283 7.28515661 7.17574078]\n", + "(184,)\n", + "(184,)\n", + "[1.48370471 1.52174523 1.46984238 1.67010478 1.88757689 1.68825992\n", + " 1.74270259 1.55497318 1.29200818 1.68446481 1.88133219 1.97138928\n", + " 2.15910096 2.3149476 1.9820247 2.07694378 1.93498835 2.01493974\n", + " 2.39156824 2.02396518 1.69586449 1.63808752 1.64020228 1.43573473\n", + " 1.93092656 1.37466294 1.34704929 1.59600739 1.03960441 1.45276496\n", + " 1.59360131 1.57466343 1.89491479 1.79333746 1.32701974 1.49441767\n", + " 1.51466756 1.63497989 1.42858074 1.51135396 1.61077201 1.81066387\n", + " 1.83367783 2.3507094 2.87885378 3.26231227 2.1313117 1.98557548\n", + " 1.99105426 2.26150533 2.34298751 2.44621608 2.39201042 2.41226503\n", + " 2.5142992 3.03777565 2.81592295 2.75117863 2.78324175 2.68819666\n", + " 2.8945782 2.84464168 2.680973 2.78397395 2.47996808 1.71829563\n", + " 1.60636949 1.65992483 1.38122631 1.74831825 2.16006884 1.68076185\n", + " 1.69329487 1.44929837 1.63763312 1.80101076 2.01166253 2.03254244\n", + " 1.9583913 2.04542255 2.00859694 2.16600883 2.16095629 1.97541122\n", + " 2.13807632 2.06386436 2.2154187 2.84205688 2.54862449 2.64321545\n", + " 2.6805773 2.52300146 2.53209001 2.54682059 2.4521937 2.43155532\n", + " 2.42571275 2.23421289 2.23164529 2.23597192 2.14215121 2.10406703\n", + " 2.07962874 1.88506161 1.80092372 1.61156092 1.77426835 1.98765563\n", + " 2.0356793 1.87964187 1.779513 1.87187681 1.76463632 1.70978684\n", + " 1.76471778 1.75604749 1.62792552 1.73929352 1.6887024 1.8677704\n", + " 2.17342368 2.08166072 2.14567453 2.15936953 2.18351006 2.41010388\n", + " 2.26101752 2.25468001 2.23739715 2.15395133 2.04547813 1.92038843\n", + " 1.85491264 1.91905927 2.16709365 1.99924152 2.1850471 2.55461622\n", + " 2.72476673 1.69682926 1.73249614 2.06992695 2.1210591 1.66854454\n", + " 1.63907505 1.32203822 1.38992558 1.2436937 1.17932877 1.02963653\n", + " 1.26085036 1.16997132 1.09339504 
1.14188689 1.18675772 1.31859788\n", + " 1.21746591 1.3872131 1.26095274 1.34885761 1.46633543 1.64506975\n", + " 1.36013821 1.45574721 1.43766588 1.65119054 1.57163772 1.55082968\n", + " 1.29413316 1.38351736 1.64234673 1.57186432 1.45381083 1.71204761\n", + " 1.51828607 1.30639985 1.32928395 1.49004237 1.6057589 1.81815735\n", + " 1.67784678 1.72180861 1.60703743 1.64850255]\n" + ] + } + ], + "source": [ + "a = np.hstack([feat, feat])\n", + "print(a.shape)\n", + "m = np.mean(a, axis=1)\n", + "print(m)\n", + "print(m.shape)\n", + "std = np.std(a, axis=1)\n", + "print(std.shape)\n", + "print(std)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "nonprofit-potato", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "hispanic-ethics", + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "import torchaudio\n", + "import torchaudio.compliance.kaldi as kaldi\n", + "import torchaudio.sox_effects as sox_effects\n", + "from torch.nn.utils.rnn import pad_sequence\n", + "torchaudio.set_audio_backend(\"sox\")" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "changing-calvin", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "torch.Size([1, 29746])\n", + "tensor([[54., 90., 77., ..., 58., 58., 61.]])\n", + "(184, 80)\n", + "[[10.617376 10.077089 5.3248763 ... 10.248186 8.896992 7.8067265]\n", + " [11.044004 10.318072 6.3086634 ... 11.237308 10.358393 8.838616 ]\n", + " [10.269302 9.9963665 7.3296647 ... 10.451319 9.692951 7.9617033]\n", + " ...\n", + " [10.14497 9.886743 6.738012 ... 10.215809 9.0034275 8.756177 ]\n", + " [ 9.977456 9.679498 7.9066052 ... 10.224365 9.594568 7.6928873]\n", + " [ 6.4735703 7.7633557 7.7576594 ... 9.965221 9.622637 8.160085 ]]\n", + "-----------\n", + "[0.00164795 0.00274658 0.00234985 ... 
0.00177002 0.00177002 0.00186157]\n", + "(184, 80)\n", + "[[-10.177039 -10.717326 -15.46954 ... -10.546229 -11.897424 -12.987689]\n", + " [ -9.750411 -10.476343 -14.485752 ... -9.557108 -10.436023 -11.955799]\n", + " [-10.525113 -10.798049 -13.46475 ... -10.343097 -11.101464 -12.832712]\n", + " ...\n", + " [-10.649446 -10.907673 -14.056403 ... -10.578607 -11.790988 -12.038239]\n", + " [-10.816959 -11.114918 -12.88781 ... -10.570049 -11.199847 -13.101528]\n", + " [-14.320845 -13.03106 -13.036756 ... -10.829194 -11.171779 -12.634331]]\n", + "**************\n", + "[0.00164795 0.00274658 0.00234985 ... 0.00177002 0.00177002 0.00186157]\n", + "[54. 90. 77. ... 58. 58. 61.] float32\n", + "(184, 80)\n", + "[[10.617376 10.077089 5.3248763 ... 10.248186 8.896992 7.8067265]\n", + " [11.044004 10.318072 6.3086634 ... 11.237308 10.358393 8.838616 ]\n", + " [10.269302 9.9963665 7.3296647 ... 10.451319 9.692951 7.9617033]\n", + " ...\n", + " [10.14497 9.886743 6.738012 ... 10.215809 9.0034275 8.756177 ]\n", + " [ 9.977456 9.679498 7.9066052 ... 10.224365 9.594568 7.6928873]\n", + " [ 6.4735703 7.7633557 7.7576594 ... 9.965221 9.622637 8.160085 ]]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/ipykernel_launcher.py:1: UserWarning: torchaudio.backend.sox_backend.load_wav has been deprecated and will be removed from 0.9.0 release. 
Please use \"torchaudio.load\".\n", + " \"\"\"Entry point for launching an IPython kernel.\n" + ] + } + ], + "source": [ + "waveform, sample_rate = torchaudio.load_wav(wav)\n", + "print(waveform.shape)\n", + "print(waveform)\n", + "mat = kaldi.fbank(\n", + " waveform,\n", + " num_mel_bins=80,\n", + " frame_length=25,\n", + " frame_shift=10,\n", + " dither=0,\n", + " energy_floor=0.0,\n", + " sample_frequency=sample_rate\n", + " )\n", + "mat = mat.detach().numpy()\n", + "print(mat.shape)\n", + "print(mat)\n", + "\n", + "print('-----------')\n", + "print(samples._samples)\n", + "aud = torch.tensor(samples._samples).view(1, -1)\n", + "mat = kaldi.fbank(\n", + " aud,\n", + " num_mel_bins=80,\n", + " frame_length=25,\n", + " frame_shift=10,\n", + " dither=0,\n", + " energy_floor=0.0,\n", + " sample_frequency=sample_rate\n", + " )\n", + "mat = mat.detach().numpy()\n", + "print(mat.shape)\n", + "print(mat)\n", + "\n", + "print('**************')\n", + "print(samples._samples)\n", + "tmp = samples.to('int16').astype('float32')\n", + "print(tmp, tmp.dtype)\n", + "aud = torch.tensor(tmp).view(1, -1)\n", + "mat = kaldi.fbank(\n", + " aud,\n", + " num_mel_bins=80,\n", + " frame_length=25,\n", + " frame_shift=10,\n", + " dither=0,\n", + " energy_floor=0.0,\n", + " sample_frequency=sample_rate\n", + " )\n", + "mat = mat.detach().numpy()\n", + "print(mat.shape)\n", + "print(mat)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "buried-dependence", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "silver-printing", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "outer-space", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(29746,)\n", + "[54 90 77 ... 58 58 61]\n", + "(184, 80)\n", + "[[10.61737914 10.07708936 5.32487528 ... 
10.2481839 8.89699394\n", + " 7.80671114]\n", + " [11.0440077 10.3180721 6.30866128 ... 11.23730926 10.35838868\n", + " 8.83860079]\n", + " [10.26930555 9.99636567 7.3296638 ... 10.45131595 9.69295303\n", + " 7.96168491]\n", + " ...\n", + " [10.14497345 9.88674207 6.73801138 ... 10.21580627 9.00343472\n", + " 8.75616521]\n", + " [ 9.97745961 9.67949736 7.90660425 ... 10.22436653 9.59456493\n", + " 7.69287184]\n", + " [ 6.47357374 7.76335491 7.75765843 ... 9.96522077 9.6226365\n", + " 8.16007108]]\n", + "(184, 13)\n", + "[[ 14.73775998 -13.30393391 5.85974818 ... -3.42359739 2.82785335\n", + " 8.86862748]\n", + " [ 15.31274834 -13.33671651 4.06537223 ... 8.15970347 2.15934846\n", + " 6.78353115]\n", + " [ 13.82218765 -13.39296404 6.8304843 ... 2.55332563 8.86724453\n", + " -0.05919222]\n", + " ...\n", + " [ 13.5837844 -13.42104892 11.21222354 ... 4.81477718 1.66627505\n", + " 5.59045842]\n", + " [ 13.75757034 -13.92626662 13.06074011 ... -0.46694046 5.56214833\n", + " 12.0785146 ]\n", + " [ 11.92813809 -15.9169855 8.78372271 ... 
-1.42014277 -3.25768086\n", + " 0.88337965]]\n" + ] + } + ], + "source": [ + "from python_speech_features import mfcc\n", + "from python_speech_features import delta\n", + "from python_speech_features import logfbank\n", + "import scipy.io.wavfile as iowav\n", + "\n", + "(rate,sig) = iowav.read(wav)\n", + "print(sig.shape)\n", + "print(sig)\n", + "\n", + "# note that generally nfilt=40 is used for speech recognition\n", + "fbank_feat = logfbank(sig,nfilt=80,lowfreq=20,dither=0,wintype='povey')\n", + "print(fbank_feat.shape)\n", + "print(fbank_feat)\n", + "\n", + "# the computed fbank coefficents of english.wav with dimension [110,23]\n", + "# [ 12.2865\t12.6906\t13.1765\t15.714\t16.064\t15.7553\t16.5746\t16.9205\t16.6472\t16.1302\t16.4576\t16.7326\t16.8864\t17.7215\t18.88\t19.1377\t19.1495\t18.6683\t18.3886\t20.3506\t20.2772\t18.8248\t18.1899\n", + "# 11.9198\t13.146\t14.7215\t15.8642\t17.4288\t16.394\t16.8238\t16.1095\t16.4297\t16.6331\t16.3163\t16.5093\t17.4981\t18.3429\t19.6555\t19.6263\t19.8435\t19.0534\t19.001\t20.0287\t19.7707\t19.5852\t19.1112\n", + "# ...\n", + "# ...\n", + "# the same with that using kaldi commands: compute-fbank-feats --dither=0.0\n", + "\n", + "mfcc_feat = mfcc(sig,dither=0,useEnergy=True,wintype='povey')\n", + "print(mfcc_feat.shape)\n", + "print(mfcc_feat)\n", + "\n", + "# the computed mfcc coefficents of english.wav with dimension [110,13]\n", + "# [ 17.1337\t-23.3651\t-7.41751\t-7.73686\t-21.3682\t-8.93884\t-3.70843\t4.68346\t-16.0676\t12.782\t-7.24054\t8.25089\t10.7292\n", + "# 17.1692\t-23.3028\t-5.61872\t-4.0075\t-23.287\t-20.6101\t-5.51584\t-6.15273\t-14.4333\t8.13052\t-0.0345329\t2.06274\t-0.564298\n", + "# ...\n", + "# ...\n", + "# the same with that using kaldi commands: compute-mfcc-feats --dither=0.0" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "sporting-school", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(184, 80)\n", + "[[-10.17703627 
-10.71732606 -15.46954014 ... -10.54623152 -11.89742148\n", + " -12.98770428]\n", + " [ -9.75040771 -10.47634331 -14.48575413 ... -9.55710616 -10.43602673\n", + " -11.95581463]\n", + " [-10.52510987 -10.79804975 -13.46475161 ... -10.34309947 -11.10146239\n", + " -12.83273051]\n", + " ...\n", + " [-10.64944197 -10.90767335 -14.05640404 ... -10.57860915 -11.7909807\n", + " -12.03825021]\n", + " [-10.8169558 -11.11491806 -12.88781116 ... -10.57004889 -11.19985048\n", + " -13.10154358]\n", + " [-14.32084168 -13.03106051 -13.03675699 ... -10.82919465 -11.17177892\n", + " -12.63434434]]\n", + "(184, 13)\n", + "[[ -6.05665544 -13.30393391 5.85974818 ... -3.42359739 2.82785335\n", + " 8.86862748]\n", + " [ -5.48166707 -13.33671651 4.06537223 ... 8.15970347 2.15934846\n", + " 6.78353115]\n", + " [ -6.97222776 -13.39296404 6.8304843 ... 2.55332563 8.86724453\n", + " -0.05919222]\n", + " ...\n", + " [ -7.21063102 -13.42104892 11.21222354 ... 4.81477718 1.66627505\n", + " 5.59045842]\n", + " [ -7.03684508 -13.92626662 13.06074011 ... -0.46694046 5.56214833\n", + " 12.0785146 ]\n", + " [ -8.86627732 -15.9169855 8.78372271 ... 
-1.42014277 -3.25768086\n", + " 0.88337965]]\n" + ] + } + ], + "source": [ + "fbank_feat = logfbank(samples._samples,nfilt=80,lowfreq=20,dither=0,wintype='povey')\n", + "print(fbank_feat.shape)\n", + "print(fbank_feat)\n", + "\n", + "mfcc_feat = mfcc(samples._samples,dither=0,useEnergy=True,wintype='povey')\n", + "print(mfcc_feat.shape)\n", + "print(mfcc_feat)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "restricted-license", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "specialized-threat", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { diff --git a/.notebook/python_test.ipynb b/.notebook/python_test.ipynb index 0e6bde47f..819d4c48f 100644 --- a/.notebook/python_test.ipynb +++ b/.notebook/python_test.ipynb @@ -637,7 +637,7 @@ { "cell_type": "code", "execution_count": 59, - "id": "engaged-offense", + "id": "first-release", "metadata": {}, "outputs": [ { @@ -660,7 +660,7 @@ { "cell_type": "code", "execution_count": 35, - "id": "level-fairy", + "id": "convertible-roulette", "metadata": {}, "outputs": [ { @@ -705,7 +705,7 @@ { "cell_type": "code", "execution_count": 3, - "id": "beautiful-geometry", + "id": "cutting-fleece", "metadata": {}, "outputs": [ { @@ -728,7 +728,7 @@ { "cell_type": "code", "execution_count": 4, - "id": "african-trustee", + "id": "historical-diving", "metadata": {}, "outputs": [ { @@ -748,7 +748,7 @@ { "cell_type": "code", "execution_count": 5, - "id": "ready-wages", + "id": "similar-spice", "metadata": {}, "outputs": [], "source": [ @@ -758,7 +758,7 @@ { "cell_type": "code", "execution_count": 6, - "id": "distinguished-printer", + "id": "grand-influence", "metadata": {}, "outputs": [ { @@ -776,7 +776,7 @@ { "cell_type": "code", "execution_count": 7, - "id": "precious-limit", + "id": "wireless-hypothetical", "metadata": {}, "outputs": [ { @@ -809,7 +809,7 @@ { "cell_type": "code", "execution_count": 17, - "id": 
"chemical-convenience", + "id": "designed-fluid", "metadata": {}, "outputs": [ { @@ -839,7 +839,7 @@ { "cell_type": "code", "execution_count": 18, - "id": "round-remark", + "id": "cultural-friendship", "metadata": {}, "outputs": [ { @@ -871,7 +871,7 @@ { "cell_type": "code", "execution_count": 19, - "id": "smaller-shower", + "id": "fossil-lotus", "metadata": {}, "outputs": [ { @@ -903,7 +903,7 @@ { "cell_type": "code", "execution_count": 31, - "id": "integrated-block", + "id": "constitutional-poker", "metadata": {}, "outputs": [ { @@ -935,7 +935,7 @@ { "cell_type": "code", "execution_count": 32, - "id": "favorite-failure", + "id": "threaded-strap", "metadata": {}, "outputs": [ { @@ -966,7 +966,7 @@ { "cell_type": "code", "execution_count": 20, - "id": "boolean-saint", + "id": "infectious-welcome", "metadata": {}, "outputs": [], "source": [ @@ -977,7 +977,7 @@ { "cell_type": "code", "execution_count": 46, - "id": "senior-hospital", + "id": "musical-anatomy", "metadata": {}, "outputs": [ { @@ -997,7 +997,7 @@ { "cell_type": "code", "execution_count": 30, - "id": "consolidated-incident", + "id": "lucky-paraguay", "metadata": {}, "outputs": [], "source": [ @@ -1007,7 +1007,7 @@ { "cell_type": "code", "execution_count": 31, - "id": "pursuant-paragraph", + "id": "annual-christmas", "metadata": {}, "outputs": [], "source": [ @@ -1017,7 +1017,7 @@ { "cell_type": "code", "execution_count": 47, - "id": "mexican-apollo", + "id": "infectious-seeker", "metadata": {}, "outputs": [ { @@ -1038,7 +1038,7 @@ { "cell_type": "code", "execution_count": 1, - "id": "encouraging-integration", + "id": "pregnant-conditioning", "metadata": {}, "outputs": [], "source": [ @@ -1049,7 +1049,7 @@ { "cell_type": "code", "execution_count": 56, - "id": "trying-auckland", + "id": "logical-happiness", "metadata": {}, "outputs": [], "source": [ @@ -1059,7 +1059,7 @@ { "cell_type": "code", "execution_count": 58, - "id": "national-edward", + "id": "rocky-plastic", "metadata": {}, "outputs": [], "source": 
[ @@ -1069,7 +1069,7 @@ { "cell_type": "code", "execution_count": 60, - "id": "aerial-campaign", + "id": "focused-compensation", "metadata": {}, "outputs": [], "source": [ @@ -1079,7 +1079,7 @@ { "cell_type": "code", "execution_count": 66, - "id": "instant-violence", + "id": "centered-repository", "metadata": {}, "outputs": [], "source": [ @@ -1089,7 +1089,7 @@ { "cell_type": "code", "execution_count": 95, - "id": "medical-globe", + "id": "inner-invite", "metadata": {}, "outputs": [ { @@ -1110,7 +1110,7 @@ { "cell_type": "code", "execution_count": 81, - "id": "three-contrast", + "id": "russian-chosen", "metadata": {}, "outputs": [ { @@ -1131,7 +1131,7 @@ { "cell_type": "code", "execution_count": 11, - "id": "cross-atlas", + "id": "equal-particle", "metadata": {}, "outputs": [], "source": [ @@ -1161,7 +1161,7 @@ { "cell_type": "code", "execution_count": 12, - "id": "empirical-defense", + "id": "tracked-purse", "metadata": {}, "outputs": [], "source": [ @@ -1172,7 +1172,7 @@ { "cell_type": "code", "execution_count": 14, - "id": "rocky-listening", + "id": "steady-mileage", "metadata": {}, "outputs": [ { @@ -1201,7 +1201,7 @@ { "cell_type": "code", "execution_count": 13, - "id": "surrounded-absolute", + "id": "regulated-google", "metadata": {}, "outputs": [ { @@ -1230,7 +1230,7 @@ { "cell_type": "code", "execution_count": 15, - "id": "differential-surgery", + "id": "homeless-forge", "metadata": {}, "outputs": [ { @@ -1260,7 +1260,7 @@ { "cell_type": "code", "execution_count": 29, - "id": "durable-powell", + "id": "exciting-blocking", "metadata": {}, "outputs": [ { @@ -1290,7 +1290,7 @@ { "cell_type": "code", "execution_count": 30, - "id": "young-continuity", + "id": "through-botswana", "metadata": {}, "outputs": [ { @@ -1308,7 +1308,7 @@ { "cell_type": "code", "execution_count": 22, - "id": "geological-sarah", + "id": "cellular-violence", "metadata": {}, "outputs": [ { @@ -1343,7 +1343,7 @@ { "cell_type": "code", "execution_count": 23, - "id": "possible-angle", + "id": 
"undefined-parade", "metadata": {}, "outputs": [ { @@ -1376,7 +1376,7 @@ { "cell_type": "code", "execution_count": 33, - "id": "novel-sucking", + "id": "special-delicious", "metadata": {}, "outputs": [], "source": [ @@ -1386,7 +1386,7 @@ { "cell_type": "code", "execution_count": 34, - "id": "fixed-wallet", + "id": "seasonal-consensus", "metadata": {}, "outputs": [ { @@ -1428,7 +1428,7 @@ { "cell_type": "code", "execution_count": 35, - "id": "north-seattle", + "id": "dress-distinction", "metadata": {}, "outputs": [], "source": [ @@ -1438,7 +1438,7 @@ { "cell_type": "code", "execution_count": 38, - "id": "above-western", + "id": "rental-anthony", "metadata": {}, "outputs": [ { @@ -1471,7 +1471,7 @@ { "cell_type": "code", "execution_count": 41, - "id": "choice-diabetes", + "id": "separated-restriction", "metadata": {}, "outputs": [], "source": [ @@ -1481,7 +1481,7 @@ { "cell_type": "code", "execution_count": 3, - "id": "white-vessel", + "id": "painted-variable", "metadata": {}, "outputs": [ { @@ -1504,7 +1504,7 @@ { "cell_type": "code", "execution_count": 5, - "id": "treated-freedom", + "id": "satellite-insider", "metadata": {}, "outputs": [ { @@ -1523,7 +1523,7 @@ { "cell_type": "code", "execution_count": 7, - "id": "convinced-safety", + "id": "developed-thirty", "metadata": {}, "outputs": [ { @@ -1543,7 +1543,7 @@ { "cell_type": "code", "execution_count": 8, - "id": "blond-bunny", + "id": "official-bench", "metadata": {}, "outputs": [ { @@ -1560,10 +1560,97 @@ "print(sorted_val_scores)" ] }, + { + "cell_type": "code", + "execution_count": 10, + "id": "ranking-camera", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "b'\\x01\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x14\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x02\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x1e\\x00\\x00\\x00\\x00\\x00\\x00\\x00'\n", + "[ 1 20 2 30]\n", + "[[ 1 20]\n", + " [ 2 30]]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + 
"/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/ipykernel_launcher.py:1: DeprecationWarning: tostring() is deprecated. Use tobytes() instead.\n", + " \"\"\"Entry point for launching an IPython kernel.\n", + "/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/ipykernel_launcher.py:3: DeprecationWarning: The binary mode of fromstring is deprecated, as it behaves surprisingly on unicode inputs. Use frombuffer instead\n", + " This is separate from the ipykernel package so we can avoid doing imports until\n" + ] + } + ], + "source": [ + "a = scores.tostring()\n", + "print(a)\n", + "b = np.fromstring(a, scores.dtype)\n", + "print(b)\n", + "print(scores)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "breeding-proxy", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "numpy.int16" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "np.int16" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "coordinate-hungary", + "metadata": {}, + "outputs": [], + "source": [ + "dtype = np.dtype('int16')" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "specified-jackson", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "int16\n", + "16\n" + ] + } + ], + "source": [ + "print(dtype)\n", + "dtype is np.int16\n", + "print(np.iinfo(dtype).bits)" + ] + }, { "cell_type": "code", "execution_count": null, - "id": "utility-monroe", + "id": "activated-insight", "metadata": {}, "outputs": [], "source": [] diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 02c084bb8..9621827a0 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -3,6 +3,7 @@ hooks: - id: yapf files: \.py$ + exclude: (?=third_party).*(\.py)$ - repo: https://github.com/pre-commit/pre-commit-hooks sha: a11d9314b22d8f8c7556443875b731ef05965464 hooks: @@ -15,6 +16,7 @@ - id: 
trailing-whitespace files: \.md$ - id: requirements-txt-fixer + exclude: (?=third_party).*$ - id: check-yaml - id: check-json - id: pretty-format-json @@ -27,6 +29,7 @@ - --ignore=E501,E228,E226,E261,E266,E128,E402,W503 - --builtins=G,request - --jobs=1 + exclude: (?=third_party).*(\.py)$ - repo : https://github.com/Lucas-C/pre-commit-hooks sha: v1.0.1 hooks: @@ -51,8 +54,9 @@ entry: python .pre-commit-hooks/copyright-check.hook language: system files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto|py)$ - #exclude: (?=decoders/swig).*(\.cpp|\.h)$ + exclude: (?=third_party).*(\.cpp|\.h|\.py)$ - repo: https://github.com/asottile/reorder_python_imports rev: v2.4.0 hooks: - id: reorder-python-imports + exclude: (?=third_party).*(\.py)$ diff --git a/deepspeech/frontend/audio.py b/deepspeech/frontend/audio.py index a45f91d49..a1c7df63e 100644 --- a/deepspeech/frontend/audio.py +++ b/deepspeech/frontend/audio.py @@ -298,6 +298,18 @@ class AudioSegment(object): samples = self._convert_samples_from_float32(self._samples, dtype) return samples.tostring() + def to(self, dtype='int16'): + """Create a `dtype` audio content. + + :param dtype: Data type for export samples. Options: 'int16', 'int32', + 'float32', 'float64'. Default is 'int16'. + :type dtype: str + :return: np.ndarray containing `dtype` audio content. + :rtype: np.ndarray + """ + samples = self._convert_samples_from_float32(self._samples, dtype) + return samples + def gain_db(self, gain): """Apply gain in decibels to samples. 
diff --git a/deepspeech/frontend/augmentor/spec_augment.py b/deepspeech/frontend/augmentor/spec_augment.py index 518bbe096..1c2e09fc7 100644 --- a/deepspeech/frontend/augmentor/spec_augment.py +++ b/deepspeech/frontend/augmentor/spec_augment.py @@ -64,6 +64,7 @@ class SpecAugmentor(AugmentorBase): self.n_freq_masks = n_freq_masks self.n_time_masks = n_time_masks self.p = p + #logger.info(f"specaug: F-{F}, T-{T}, F-n-{n_freq_masks}, T-n-{n_time_masks}") # adaptive SpecAugment self.adaptive_number_ratio = adaptive_number_ratio diff --git a/deepspeech/frontend/featurizer/audio_featurizer.py b/deepspeech/frontend/featurizer/audio_featurizer.py index eddfdc8c5..44f67c5c1 100644 --- a/deepspeech/frontend/featurizer/audio_featurizer.py +++ b/deepspeech/frontend/featurizer/audio_featurizer.py @@ -56,7 +56,8 @@ class AudioFeaturizer(object): max_freq=None, target_sample_rate=16000, use_dB_normalization=True, - target_dB=-20): + target_dB=-20, + dither=1.0): self._specgram_type = specgram_type # mfcc and fbank using `feat_dim` self._feat_dim = feat_dim @@ -69,6 +70,7 @@ class AudioFeaturizer(object): self._use_dB_normalization = use_dB_normalization self._target_dB = target_dB self._fft_point = n_fft + self._dither = dither def featurize(self, audio_segment, @@ -101,8 +103,7 @@ class AudioFeaturizer(object): if self._use_dB_normalization: audio_segment.normalize(target_db=self._target_dB) # extract spectrogram - return self._compute_specgram(audio_segment.samples, - audio_segment.sample_rate) + return self._compute_specgram(audio_segment) @property def feature_size(self): @@ -125,9 +126,11 @@ class AudioFeaturizer(object): "Supported values: linear." 
% self._specgram_type) return feat_dim - def _compute_specgram(self, samples, sample_rate): + def _compute_specgram(self, audio_segment): """Extract various audio features.""" + sample_rate = audio_segment.sample_rate if self._specgram_type == 'linear': + samples = audio_segment.samples return self._compute_linear_specgram( samples, sample_rate, @@ -135,6 +138,7 @@ class AudioFeaturizer(object): window_ms=self._window_ms, max_freq=self._max_freq) elif self._specgram_type == 'mfcc': + samples = audio_segment.to('int16') return self._compute_mfcc( samples, sample_rate, @@ -142,8 +146,10 @@ class AudioFeaturizer(object): stride_ms=self._stride_ms, window_ms=self._window_ms, max_freq=self._max_freq, + dither=self._dither, delta_delta=self._delta_delta) elif self._specgram_type == 'fbank': + samples = audio_segment.to('int16') return self._compute_fbank( samples, sample_rate, @@ -151,6 +157,7 @@ class AudioFeaturizer(object): stride_ms=self._stride_ms, window_ms=self._window_ms, max_freq=self._max_freq, + dither=self._dither, delta_delta=self._delta_delta) else: raise ValueError("Unknown specgram_type %s. " @@ -233,17 +240,18 @@ class AudioFeaturizer(object): sample_rate, feat_dim=13, stride_ms=10.0, - window_ms=20.0, + window_ms=25.0, max_freq=None, + dither=1.0, delta_delta=True): """Compute mfcc from samples. Args: - samples (np.ndarray): the audio signal from which to compute features. Should be an N*1 array + samples (np.ndarray, np.int16): the audio signal from which to compute features. sample_rate (float): the sample rate of the signal we are working with, in Hz. feat_dim (int): the number of cepstrum to return, default 13. stride_ms (float, optional): stride length in ms. Defaults to 10.0. - window_ms (float, optional): window length in ms. Defaults to 20.0. + window_ms (float, optional): window length in ms. Defaults to 25.0. max_freq ([type], optional): highest band edge of mel filters. In Hz, default is samplerate/2. Defaults to None. 
delta_delta (bool, optional): Whether with delta delta. Defaults to False. @@ -270,14 +278,16 @@ class AudioFeaturizer(object): winlen=0.001 * window_ms, winstep=0.001 * stride_ms, numcep=feat_dim, - nfilt=2 * feat_dim, - nfft=None, - lowfreq=0, + nfilt=23, + nfft=512, + lowfreq=20, highfreq=max_freq, + dither=dither, + remove_dc_offset=True, preemph=0.97, ceplifter=22, - appendEnergy=True, - winfunc=lambda x: np.ones((x, ))) + useEnergy=True, + winfunc='povey') mfcc_feat = np.transpose(mfcc_feat) if delta_delta: mfcc_feat = self._concat_delta_delta(mfcc_feat) @@ -286,15 +296,16 @@ class AudioFeaturizer(object): def _compute_fbank(self, samples, sample_rate, - feat_dim=26, + feat_dim=40, stride_ms=10.0, - window_ms=20.0, + window_ms=25.0, max_freq=None, + dither=1.0, delta_delta=False): """Compute logfbank from samples. Args: - samples (np.ndarray): the audio signal from which to compute features. Should be an N*1 array + samples (np.ndarray, np.int16): the audio signal from which to compute features. Should be an N*1 array sample_rate (float): the sample rate of the signal we are working with, in Hz. feat_dim (int): the number of cepstrum to return, default 13. stride_ms (float, optional): stride length in ms. Defaults to 10.0. 
@@ -325,9 +336,13 @@ class AudioFeaturizer(object): winstep=0.001 * stride_ms, nfilt=feat_dim, nfft=512, - lowfreq=0, + lowfreq=20, highfreq=max_freq, - preemph=0.97, ) + dither=dither, + remove_dc_offset=True, + preemph=0.97, + wintype='povey') + fbank_feat = np.transpose(fbank_feat) if delta_delta: fbank_feat = self._concat_delta_delta(fbank_feat) diff --git a/deepspeech/frontend/normalizer.py b/deepspeech/frontend/normalizer.py index e9524bf38..9161c1e46 100644 --- a/deepspeech/frontend/normalizer.py +++ b/deepspeech/frontend/normalizer.py @@ -82,13 +82,16 @@ class FeatureNormalizer(object): def _read_mean_std_from_file(self, filepath, eps=1e-20): """Load mean and std from file.""" mean, std = load_cmvn(filepath, filetype='npz') - self._mean = mean - self._istd = 1.0 / std + self._mean = mean.T + self._istd = 1.0 / std.T def _compute_mean_std(self, manifest_path, featurize_func, num_samples): """Compute mean and std from randomly sampled instances.""" manifest = read_manifest(manifest_path) - sampled_manifest = self._rng.sample(manifest, num_samples) + if num_samples == -1: + sampled_manifest = manifest + else: + sampled_manifest = self._rng.sample(manifest, num_samples) features = [] for instance in sampled_manifest: features.append( diff --git a/examples/aishell/s0/local/data.sh b/examples/aishell/s0/local/data.sh index fb2700083..f98e5a854 100644 --- a/examples/aishell/s0/local/data.sh +++ b/examples/aishell/s0/local/data.sh @@ -36,10 +36,12 @@ fi # compute mean and stddev for normalizer python3 ${MAIN_ROOT}/utils/compute_mean_std.py \ --manifest_path="data/manifest.train.raw" \ ---num_samples=2000 \ --specgram_type="fbank" \ --feat_dim=80 \ --delta_delta=false \ +--stride_ms=10.0 \ +--window_ms=25.0 \ +--sample_rate=16000 \ --output_path="data/mean_std.npz" if [ $? 
-ne 0 ]; then diff --git a/examples/tiny/s1/conf/augmentation.json b/examples/tiny/s1/conf/augmentation.json index a1a759e67..1987ad424 100644 --- a/examples/tiny/s1/conf/augmentation.json +++ b/examples/tiny/s1/conf/augmentation.json @@ -1,4 +1,13 @@ [ + { + "type": "speed", + "params": { + "min_speed_rate": 0.9, + "max_speed_rate": 1.1, + "num_rates": 3 + }, + "prob": 0.0 + }, { "type": "shift", "params": { @@ -6,5 +15,20 @@ "max_shift_ms": 5 }, "prob": 1.0 + }, + { + "type": "specaug", + "params": { + "F": 10, + "T": 50, + "n_freq_masks": 2, + "n_time_masks": 2, + "p": 1.0, + "W": 80, + "adaptive_number_ratio": 0, + "adaptive_size_ratio": 0, + "max_n_time_masks": 20 + }, + "prob": 1.0 } ] diff --git a/setup.sh b/setup.sh index 881fe8078..8d82038d9 100644 --- a/setup.sh +++ b/setup.sh @@ -54,4 +54,14 @@ if [ $? != 0 ]; then exit -1 fi + +# install kaldi-comptiable feature +pushd third_party/python_kaldi_features/ +python setup.py install +if [ $? != 0 ]; then + error_msg "Please check why kaldi feature install error!" + exit -1 +fi +popd + info_msg "Install all dependencies successfully." 
diff --git a/third_party/README.md b/third_party/README.md new file mode 100644 index 000000000..836e002a8 --- /dev/null +++ b/third_party/README.md @@ -0,0 +1,4 @@ + +* [python_kaldi_features](https://github.com/ZitengWang/python_kaldi_features) +commit: fc1bd6240c2008412ab64dc25045cd872f5e126c +ref: https://zhuanlan.zhihu.com/p/55371926 diff --git a/third_party/python_kaldi_features/LICENSE b/third_party/python_kaldi_features/LICENSE new file mode 100644 index 000000000..f1ae26488 --- /dev/null +++ b/third_party/python_kaldi_features/LICENSE @@ -0,0 +1,20 @@ +The MIT License (MIT) + +Copyright (c) 2013 James Lyons + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +the Software, and to permit persons to whom the Software is furnished to do so, +subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
diff --git a/third_party/python_kaldi_features/MANIFEST b/third_party/python_kaldi_features/MANIFEST new file mode 100644 index 000000000..90d8dbcc9 --- /dev/null +++ b/third_party/python_kaldi_features/MANIFEST @@ -0,0 +1,5 @@ +# file GENERATED by distutils, do NOT edit +setup.py +python_speech_features\__init__.py +python_speech_features\base.py +python_speech_features\sigproc.py diff --git a/third_party/python_kaldi_features/README.rst b/third_party/python_kaldi_features/README.rst new file mode 100644 index 000000000..83ac9efe8 --- /dev/null +++ b/third_party/python_kaldi_features/README.rst @@ -0,0 +1,58 @@ + + +forked from ``_ + +check the readme therein for the usages + +It has been modified to produce the same results as with the compute-mfcc-feats and compute-fbank-feats (check their default parameters first) commands in Kaldi. + +------------------------------- + +The compute-mfcc-feats pipeline: + +src/featbin/Compute-mfcc-feats.cc + + Mfcc mfcc(mfcc_opts) --> src/feat/Feature-mfcc.h + + struct MfccOptions + + typedef OfflineFeatureTpl Mfcc --> src/feat/Feature-common.h + + MfccComputer() --> src/feat/Feature-mfcc.cc + + ComputeDctMatrix() --> src/matrix/Matrix-functions.cc + + ComputeLifterCoeffs() --> src/feat/Mel-computations.cc + + + for each utterance: + mfcc.ComputeFeatures() + +src/feat/Feature-common-inl.h + +    OfflineFeatureTpl::ComputeFeatures() + + Compute() + + ExtractWindow() --> src/feat/Feature-window.cc + + ProcessWindow() + + Dither, remove_dc_offset, log_energy_pre_window, Preemphasize, window + +            computer_.Compute() --> src/feat/Feature-mfcc.cc + + MfccComputer::Compute() + +                                         const MelBanks &mel_banks --> Mel-computations.cc + +                                          srfft_ +                                         +                                         ComputerPowerSpectrum() + + mel_banks.Compute() + + mel_energies_.ApplyLog() + + dct, cepstral_lifter + diff --git 
a/third_party/python_kaldi_features/build/lib/python_speech_features/__init__.py b/third_party/python_kaldi_features/build/lib/python_speech_features/__init__.py new file mode 100644 index 000000000..9b5ed21c9 --- /dev/null +++ b/third_party/python_kaldi_features/build/lib/python_speech_features/__init__.py @@ -0,0 +1 @@ +from .base import * diff --git a/third_party/python_kaldi_features/build/lib/python_speech_features/base.py b/third_party/python_kaldi_features/build/lib/python_speech_features/base.py new file mode 100644 index 000000000..592cb4f1e --- /dev/null +++ b/third_party/python_kaldi_features/build/lib/python_speech_features/base.py @@ -0,0 +1,166 @@ +# calculate filterbank features. Provides e.g. fbank and mfcc features for use in ASR applications +# Author: James Lyons 2012 +from __future__ import division +import numpy +from python_speech_features import sigproc +from scipy.fftpack import dct + +def mfcc(signal,samplerate=16000,winlen=0.025,winstep=0.01,numcep=13, + nfilt=23,nfft=512,lowfreq=20,highfreq=None,dither=1.0,remove_dc_offset=True,preemph=0.97, + ceplifter=22,useEnergy=True,wintype='povey'): + """Compute MFCC features from an audio signal. + + :param signal: the audio signal from which to compute features. Should be an N*1 array + :param samplerate: the samplerate of the signal we are working with. + :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds) + :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds) + :param numcep: the number of cepstrum to return, default 13 + :param nfilt: the number of filters in the filterbank, default 26. + :param nfft: the FFT size. Default is 512. + :param lowfreq: lowest band edge of mel filters. In Hz, default is 0. + :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2 + :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97. 
+ :param ceplifter: apply a lifter to final cepstral coefficients. 0 is no lifter. Default is 22. + :param appendEnergy: if this is true, the zeroth cepstral coefficient is replaced with the log of the total frame energy. + :param winfunc: the analysis window to apply to each frame. By default no window is applied. You can use numpy window functions here e.g. winfunc=numpy.hamming + :returns: A numpy array of size (NUMFRAMES by numcep) containing features. Each row holds 1 feature vector. + """ + feat,energy = fbank(signal,samplerate,winlen,winstep,nfilt,nfft,lowfreq,highfreq,dither,remove_dc_offset,preemph,wintype) + feat = numpy.log(feat) + feat = dct(feat, type=2, axis=1, norm='ortho')[:,:numcep] + feat = lifter(feat,ceplifter) + if useEnergy: feat[:,0] = numpy.log(energy) # replace first cepstral coefficient with log of frame energy + return feat + +def fbank(signal,samplerate=16000,winlen=0.025,winstep=0.01, + nfilt=40,nfft=512,lowfreq=0,highfreq=None,dither=1.0,remove_dc_offset=True, preemph=0.97, + wintype='hamming'): + """Compute Mel-filterbank energy features from an audio signal. + + :param signal: the audio signal from which to compute features. Should be an N*1 array + :param samplerate: the samplerate of the signal we are working with. + :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds) + :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds) + :param nfilt: the number of filters in the filterbank, default 26. + :param nfft: the FFT size. Default is 512. + :param lowfreq: lowest band edge of mel filters. In Hz, default is 0. + :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2 + :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97. + :param winfunc: the analysis window to apply to each frame. By default no window is applied. You can use numpy window functions here e.g. 
winfunc=numpy.hamming + winfunc=lambda x:numpy.ones((x,)) + :returns: 2 values. The first is a numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector. The + second return value is the energy in each frame (total energy, unwindowed) + """ + highfreq= highfreq or samplerate/2 + frames,raw_frames = sigproc.framesig(signal, winlen*samplerate, winstep*samplerate, dither, preemph, remove_dc_offset, wintype) + pspec = sigproc.powspec(frames,nfft) # nearly the same until this part + energy = numpy.sum(raw_frames**2,1) # this stores the raw energy in each frame + energy = numpy.where(energy == 0,numpy.finfo(float).eps,energy) # if energy is zero, we get problems with log + + fb = get_filterbanks(nfilt,nfft,samplerate,lowfreq,highfreq) + feat = numpy.dot(pspec,fb.T) # compute the filterbank energies + feat = numpy.where(feat == 0,numpy.finfo(float).eps,feat) # if feat is zero, we get problems with log + + return feat,energy + +def logfbank(signal,samplerate=16000,winlen=0.025,winstep=0.01, + nfilt=40,nfft=512,lowfreq=64,highfreq=None,dither=1.0,remove_dc_offset=True,preemph=0.97,wintype='hamming'): + """Compute log Mel-filterbank energy features from an audio signal. + + :param signal: the audio signal from which to compute features. Should be an N*1 array + :param samplerate: the samplerate of the signal we are working with. + :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds) + :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds) + :param nfilt: the number of filters in the filterbank, default 26. + :param nfft: the FFT size. Default is 512. + :param lowfreq: lowest band edge of mel filters. In Hz, default is 0. + :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2 + :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97. 
+ :returns: A numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector. + """ + feat,energy = fbank(signal,samplerate,winlen,winstep,nfilt,nfft,lowfreq,highfreq,dither, remove_dc_offset,preemph,wintype) + return numpy.log(feat) + +def hz2mel(hz): + """Convert a value in Hertz to Mels + + :param hz: a value in Hz. This can also be a numpy array, conversion proceeds element-wise. + :returns: a value in Mels. If an array was passed in, an identical sized array is returned. + """ + return 1127 * numpy.log(1+hz/700.0) + + +def mel2hz(mel): + """Convert a value in Mels to Hertz + + :param mel: a value in Mels. This can also be a numpy array, conversion proceeds element-wise. + :returns: a value in Hertz. If an array was passed in, an identical sized array is returned. + """ + return 700 * (numpy.exp(mel/1127.0)-1) + +def get_filterbanks(nfilt=26,nfft=512,samplerate=16000,lowfreq=0,highfreq=None): + """Compute a Mel-filterbank. The filters are stored in the rows, the columns correspond + to fft bins. The filters are returned as an array of size nfilt * (nfft/2 + 1) + + :param nfilt: the number of filters in the filterbank, default 20. + :param nfft: the FFT size. Default is 512. + :param samplerate: the samplerate of the signal we are working with. Affects mel spacing. + :param lowfreq: lowest band edge of mel filters, default 0 Hz + :param highfreq: highest band edge of mel filters, default samplerate/2 + :returns: A numpy array of size nfilt * (nfft/2 + 1) containing filterbank. Each row holds 1 filter. 
+ """ + highfreq= highfreq or samplerate/2 + assert highfreq <= samplerate/2, "highfreq is greater than samplerate/2" + + # compute points evenly spaced in mels + lowmel = hz2mel(lowfreq) + highmel = hz2mel(highfreq) + + # check kaldi/src/feat/Mel-computations.h + fbank = numpy.zeros([nfilt,nfft//2+1]) + mel_freq_delta = (highmel-lowmel)/(nfilt+1) + for j in range(0,nfilt): + leftmel = lowmel+j*mel_freq_delta + centermel = lowmel+(j+1)*mel_freq_delta + rightmel = lowmel+(j+2)*mel_freq_delta + for i in range(0,nfft//2): + mel=hz2mel(i*samplerate/nfft) + if mel>leftmel and mel 0: + nframes,ncoeff = numpy.shape(cepstra) + n = numpy.arange(ncoeff) + lift = 1 + (L/2.)*numpy.sin(numpy.pi*n/L) + return lift*cepstra + else: + # values of L <= 0, do nothing + return cepstra + +def delta(feat, N): + """Compute delta features from a feature vector sequence. + + :param feat: A numpy array of size (NUMFRAMES by number of features) containing features. Each row holds 1 feature vector. + :param N: For each frame, calculate delta features based on preceding and following N frames + :returns: A numpy array of size (NUMFRAMES by number of features) containing delta features. Each row holds 1 delta feature vector. 
+ """ + if N < 1: + raise ValueError('N must be an integer >= 1') + NUMFRAMES = len(feat) + denominator = 2 * sum([i**2 for i in range(1, N+1)]) + delta_feat = numpy.empty_like(feat) + padded = numpy.pad(feat, ((N, N), (0, 0)), mode='edge') # padded version of feat + for t in range(NUMFRAMES): + delta_feat[t] = numpy.dot(numpy.arange(-N, N+1), padded[t : t+2*N+1]) / denominator # [t : t+2*N+1] == [(N+t)-N : (N+t)+N+1] + return delta_feat diff --git a/third_party/python_kaldi_features/build/lib/python_speech_features/base_orig.py b/third_party/python_kaldi_features/build/lib/python_speech_features/base_orig.py new file mode 100644 index 000000000..3efaec190 --- /dev/null +++ b/third_party/python_kaldi_features/build/lib/python_speech_features/base_orig.py @@ -0,0 +1,190 @@ +# calculate filterbank features. Provides e.g. fbank and mfcc features for use in ASR applications +# Author: James Lyons 2012 +from __future__ import division +import numpy +from python_speech_features import sigproc +from scipy.fftpack import dct + +def mfcc(signal,samplerate=16000,winlen=0.025,winstep=0.01,numcep=13, + nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97,ceplifter=22,appendEnergy=True, + winfunc=lambda x:numpy.ones((x,))): + """Compute MFCC features from an audio signal. + + :param signal: the audio signal from which to compute features. Should be an N*1 array + :param samplerate: the samplerate of the signal we are working with. + :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds) + :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds) + :param numcep: the number of cepstrum to return, default 13 + :param nfilt: the number of filters in the filterbank, default 26. + :param nfft: the FFT size. Default is 512. + :param lowfreq: lowest band edge of mel filters. In Hz, default is 0. + :param highfreq: highest band edge of mel filters. 
In Hz, default is samplerate/2 + :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97. + :param ceplifter: apply a lifter to final cepstral coefficients. 0 is no lifter. Default is 22. + :param appendEnergy: if this is true, the zeroth cepstral coefficient is replaced with the log of the total frame energy. + :param winfunc: the analysis window to apply to each frame. By default no window is applied. You can use numpy window functions here e.g. winfunc=numpy.hamming + :returns: A numpy array of size (NUMFRAMES by numcep) containing features. Each row holds 1 feature vector. + """ + feat,energy = fbank(signal,samplerate,winlen,winstep,nfilt,nfft,lowfreq,highfreq,preemph,winfunc) + feat = numpy.log(feat) + feat = dct(feat, type=2, axis=1, norm='ortho')[:,:numcep] + feat = lifter(feat,ceplifter) + if appendEnergy: feat[:,0] = numpy.log(energy) # replace first cepstral coefficient with log of frame energy + return feat + +def fbank(signal,samplerate=16000,winlen=0.025,winstep=0.01, + nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97, + winfunc=lambda x:numpy.ones((x,))): + """Compute Mel-filterbank energy features from an audio signal. + + :param signal: the audio signal from which to compute features. Should be an N*1 array + :param samplerate: the samplerate of the signal we are working with. + :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds) + :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds) + :param nfilt: the number of filters in the filterbank, default 26. + :param nfft: the FFT size. Default is 512. + :param lowfreq: lowest band edge of mel filters. In Hz, default is 0. + :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2 + :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97. 
+ :param winfunc: the analysis window to apply to each frame. By default no window is applied. You can use numpy window functions here e.g. winfunc=numpy.hamming + :returns: 2 values. The first is a numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector. The + second return value is the energy in each frame (total energy, unwindowed) + """ + highfreq= highfreq or samplerate/2 + signal = sigproc.preemphasis(signal,preemph) + frames = sigproc.framesig(signal, winlen*samplerate, winstep*samplerate, winfunc) + pspec = sigproc.powspec(frames,nfft) + energy = numpy.sum(pspec,1) # this stores the total energy in each frame + energy = numpy.where(energy == 0,numpy.finfo(float).eps,energy) # if energy is zero, we get problems with log + + fb = get_filterbanks(nfilt,nfft,samplerate,lowfreq,highfreq) + feat = numpy.dot(pspec,fb.T) # compute the filterbank energies + feat = numpy.where(feat == 0,numpy.finfo(float).eps,feat) # if feat is zero, we get problems with log + + return feat,energy + +def logfbank(signal,samplerate=16000,winlen=0.025,winstep=0.01, + nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97): + """Compute log Mel-filterbank energy features from an audio signal. + + :param signal: the audio signal from which to compute features. Should be an N*1 array + :param samplerate: the samplerate of the signal we are working with. + :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds) + :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds) + :param nfilt: the number of filters in the filterbank, default 26. + :param nfft: the FFT size. Default is 512. + :param lowfreq: lowest band edge of mel filters. In Hz, default is 0. + :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2 + :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97. 
+ :returns: A numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector. + """ + feat,energy = fbank(signal,samplerate,winlen,winstep,nfilt,nfft,lowfreq,highfreq,preemph) + return numpy.log(feat) + +def ssc(signal,samplerate=16000,winlen=0.025,winstep=0.01, + nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97, + winfunc=lambda x:numpy.ones((x,))): + """Compute Spectral Subband Centroid features from an audio signal. + + :param signal: the audio signal from which to compute features. Should be an N*1 array + :param samplerate: the samplerate of the signal we are working with. + :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds) + :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds) + :param nfilt: the number of filters in the filterbank, default 26. + :param nfft: the FFT size. Default is 512. + :param lowfreq: lowest band edge of mel filters. In Hz, default is 0. + :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2 + :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97. + :param winfunc: the analysis window to apply to each frame. By default no window is applied. You can use numpy window functions here e.g. winfunc=numpy.hamming + :returns: A numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector. 
+ """ + highfreq= highfreq or samplerate/2 + signal = sigproc.preemphasis(signal,preemph) + frames = sigproc.framesig(signal, winlen*samplerate, winstep*samplerate, winfunc) + pspec = sigproc.powspec(frames,nfft) + pspec = numpy.where(pspec == 0,numpy.finfo(float).eps,pspec) # if things are all zeros we get problems + + fb = get_filterbanks(nfilt,nfft,samplerate,lowfreq,highfreq) + feat = numpy.dot(pspec,fb.T) # compute the filterbank energies + R = numpy.tile(numpy.linspace(1,samplerate/2,numpy.size(pspec,1)),(numpy.size(pspec,0),1)) + + return numpy.dot(pspec*R,fb.T) / feat + +def hz2mel(hz): + """Convert a value in Hertz to Mels + + :param hz: a value in Hz. This can also be a numpy array, conversion proceeds element-wise. + :returns: a value in Mels. If an array was passed in, an identical sized array is returned. + """ + return 2595 * numpy.log10(1+hz/700.) + +def mel2hz(mel): + """Convert a value in Mels to Hertz + + :param mel: a value in Mels. This can also be a numpy array, conversion proceeds element-wise. + :returns: a value in Hertz. If an array was passed in, an identical sized array is returned. + """ + return 700*(10**(mel/2595.0)-1) + +def get_filterbanks(nfilt=20,nfft=512,samplerate=16000,lowfreq=0,highfreq=None): + """Compute a Mel-filterbank. The filters are stored in the rows, the columns correspond + to fft bins. The filters are returned as an array of size nfilt * (nfft/2 + 1) + + :param nfilt: the number of filters in the filterbank, default 20. + :param nfft: the FFT size. Default is 512. + :param samplerate: the samplerate of the signal we are working with. Affects mel spacing. + :param lowfreq: lowest band edge of mel filters, default 0 Hz + :param highfreq: highest band edge of mel filters, default samplerate/2 + :returns: A numpy array of size nfilt * (nfft/2 + 1) containing filterbank. Each row holds 1 filter. 
+ """ + highfreq= highfreq or samplerate/2 + assert highfreq <= samplerate/2, "highfreq is greater than samplerate/2" + + # compute points evenly spaced in mels + lowmel = hz2mel(lowfreq) + highmel = hz2mel(highfreq) + melpoints = numpy.linspace(lowmel,highmel,nfilt+2) + # our points are in Hz, but we use fft bins, so we have to convert + # from Hz to fft bin number + bin = numpy.floor((nfft+1)*mel2hz(melpoints)/samplerate) + + fbank = numpy.zeros([nfilt,nfft//2+1]) + for j in range(0,nfilt): + for i in range(int(bin[j]), int(bin[j+1])): + fbank[j,i] = (i - bin[j]) / (bin[j+1]-bin[j]) + for i in range(int(bin[j+1]), int(bin[j+2])): + fbank[j,i] = (bin[j+2]-i) / (bin[j+2]-bin[j+1]) + return fbank + +def lifter(cepstra, L=22): + """Apply a cepstral lifter the the matrix of cepstra. This has the effect of increasing the + magnitude of the high frequency DCT coeffs. + + :param cepstra: the matrix of mel-cepstra, will be numframes * numcep in size. + :param L: the liftering coefficient to use. Default is 22. L <= 0 disables lifter. + """ + if L > 0: + nframes,ncoeff = numpy.shape(cepstra) + n = numpy.arange(ncoeff) + lift = 1 + (L/2.)*numpy.sin(numpy.pi*n/L) + return lift*cepstra + else: + # values of L <= 0, do nothing + return cepstra + +def delta(feat, N): + """Compute delta features from a feature vector sequence. + + :param feat: A numpy array of size (NUMFRAMES by number of features) containing features. Each row holds 1 feature vector. + :param N: For each frame, calculate delta features based on preceding and following N frames + :returns: A numpy array of size (NUMFRAMES by number of features) containing delta features. Each row holds 1 delta feature vector. 
+ """ + if N < 1: + raise ValueError('N must be an integer >= 1') + NUMFRAMES = len(feat) + denominator = 2 * sum([i**2 for i in range(1, N+1)]) + delta_feat = numpy.empty_like(feat) + padded = numpy.pad(feat, ((N, N), (0, 0)), mode='edge') # padded version of feat + for t in range(NUMFRAMES): + delta_feat[t] = numpy.dot(numpy.arange(-N, N+1), padded[t : t+2*N+1]) / denominator # [t : t+2*N+1] == [(N+t)-N : (N+t)+N+1] + return delta_feat diff --git a/third_party/python_kaldi_features/build/lib/python_speech_features/sigproc.py b/third_party/python_kaldi_features/build/lib/python_speech_features/sigproc.py new file mode 100644 index 000000000..b7c78a803 --- /dev/null +++ b/third_party/python_kaldi_features/build/lib/python_speech_features/sigproc.py @@ -0,0 +1,158 @@ +# This file includes routines for basic signal processing including framing and computing power spectra. +# Author: James Lyons 2012 +import decimal + +import numpy +import math +import logging + + +def round_half_up(number): + return int(decimal.Decimal(number).quantize(decimal.Decimal('1'), rounding=decimal.ROUND_HALF_UP)) + + +def rolling_window(a, window, step=1): + # http://ellisvalentiner.com/post/2017-03-21-np-strides-trick + shape = a.shape[:-1] + (a.shape[-1] - window + 1, window) + strides = a.strides + (a.strides[-1],) + return numpy.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)[::step] + + +def framesig(sig, frame_len, frame_step, dither=1.0, preemph=0.97, remove_dc_offset=True, wintype='hamming', stride_trick=True): + """Frame a signal into overlapping frames. + + :param sig: the audio signal to frame. + :param frame_len: length of each frame measured in samples. + :param frame_step: number of samples after the start of the previous frame that the next frame should begin. + :param winfunc: the analysis window to apply to each frame. By default no window is applied. 
+ :param stride_trick: use stride trick to compute the rolling window and window multiplication faster + :returns: an array of frames. Size is NUMFRAMES by frame_len. + """ + slen = len(sig) + frame_len = int(round_half_up(frame_len)) + frame_step = int(round_half_up(frame_step)) + if slen <= frame_len: + numframes = 1 + else: + numframes = 1 + (( slen - frame_len) // frame_step) + + # check kaldi/src/feat/feature-window.h + padsignal = sig[:(numframes-1)*frame_step+frame_len] + if wintype is 'povey': + win = numpy.empty(frame_len) + for i in range(frame_len): + win[i] = (0.5-0.5*numpy.cos(2*numpy.pi/(frame_len-1)*i))**0.85 + else: # the hamming window + win = numpy.hamming(frame_len) + + if stride_trick: + frames = rolling_window(padsignal, window=frame_len, step=frame_step) + else: + indices = numpy.tile(numpy.arange(0, frame_len), (numframes, 1)) + numpy.tile( + numpy.arange(0, numframes * frame_step, frame_step), (frame_len, 1)).T + indices = numpy.array(indices, dtype=numpy.int32) + frames = padsignal[indices] + win = numpy.tile(win, (numframes, 1)) + + frames = frames.astype(numpy.float32) + raw_frames = numpy.zeros(frames.shape) + for frm in range(frames.shape[0]): + frames[frm,:] = do_dither(frames[frm,:], dither) # dither + frames[frm,:] = do_remove_dc_offset(frames[frm,:]) # remove dc offset + raw_frames[frm,:] = frames[frm,:] + frames[frm,:] = do_preemphasis(frames[frm,:], preemph) # preemphasize + + return frames * win, raw_frames + +def deframesig(frames, siglen, frame_len, frame_step, winfunc=lambda x: numpy.ones((x,))): + """Does overlap-add procedure to undo the action of framesig. + + :param frames: the array of frames. + :param siglen: the length of the desired signal, use 0 if unknown. Output will be truncated to siglen samples. + :param frame_len: length of each frame measured in samples. + :param frame_step: number of samples after the start of the previous frame that the next frame should begin. 
+ :param winfunc: the analysis window to apply to each frame. By default no window is applied. + :returns: a 1-D signal. + """ + frame_len = round_half_up(frame_len) + frame_step = round_half_up(frame_step) + numframes = numpy.shape(frames)[0] + assert numpy.shape(frames)[1] == frame_len, '"frames" matrix is wrong size, 2nd dim is not equal to frame_len' + + indices = numpy.tile(numpy.arange(0, frame_len), (numframes, 1)) + numpy.tile( + numpy.arange(0, numframes * frame_step, frame_step), (frame_len, 1)).T + indices = numpy.array(indices, dtype=numpy.int32) + padlen = (numframes - 1) * frame_step + frame_len + + if siglen <= 0: siglen = padlen + + rec_signal = numpy.zeros((padlen,)) + window_correction = numpy.zeros((padlen,)) + win = winfunc(frame_len) + + for i in range(0, numframes): + window_correction[indices[i, :]] = window_correction[ + indices[i, :]] + win + 1e-15 # add a little bit so it is never zero + rec_signal[indices[i, :]] = rec_signal[indices[i, :]] + frames[i, :] + + rec_signal = rec_signal / window_correction + return rec_signal[0:siglen] + + +def magspec(frames, NFFT): + """Compute the magnitude spectrum of each frame in frames. If frames is an NxD matrix, output will be Nx(NFFT/2+1). + + :param frames: the array of frames. Each row is a frame. + :param NFFT: the FFT length to use. If NFFT > frame_len, the frames are zero-padded. + :returns: If frames is an NxD matrix, output will be Nx(NFFT/2+1). Each row will be the magnitude spectrum of the corresponding frame. + """ + if numpy.shape(frames)[1] > NFFT: + logging.warn( + 'frame length (%d) is greater than FFT size (%d), frame will be truncated. Increase NFFT to avoid.', + numpy.shape(frames)[1], NFFT) + complex_spec = numpy.fft.rfft(frames, NFFT) + return numpy.absolute(complex_spec) + + +def powspec(frames, NFFT): + """Compute the power spectrum of each frame in frames. If frames is an NxD matrix, output will be Nx(NFFT/2+1). + + :param frames: the array of frames. Each row is a frame. 
+ :param NFFT: the FFT length to use. If NFFT > frame_len, the frames are zero-padded. + :returns: If frames is an NxD matrix, output will be Nx(NFFT/2+1). Each row will be the power spectrum of the corresponding frame. + """ + return numpy.square(magspec(frames, NFFT)) + + +def logpowspec(frames, NFFT, norm=1): + """Compute the log power spectrum of each frame in frames. If frames is an NxD matrix, output will be Nx(NFFT/2+1). + + :param frames: the array of frames. Each row is a frame. + :param NFFT: the FFT length to use. If NFFT > frame_len, the frames are zero-padded. + :param norm: If norm=1, the log power spectrum is normalised so that the max value (across all frames) is 0. + :returns: If frames is an NxD matrix, output will be Nx(NFFT/2+1). Each row will be the log power spectrum of the corresponding frame. + """ + ps = powspec(frames, NFFT); + ps[ps <= 1e-30] = 1e-30 + lps = 10 * numpy.log10(ps) + if norm: + return lps - numpy.max(lps) + else: + return lps + +def do_dither(signal, dither_value=1.0): + signal += numpy.random.normal(size=signal.shape) * dither_value + return signal + +def do_remove_dc_offset(signal): + signal -= numpy.mean(signal) + return signal + +def do_preemphasis(signal, coeff=0.97): + """perform preemphasis on the input signal. + + :param signal: The signal to filter. + :param coeff: The preemphasis coefficient. 0 is no filter, default is 0.95. + :returns: the filtered signal. + """ + return numpy.append((1-coeff)*signal[0], signal[1:] - coeff * signal[:-1]) diff --git a/third_party/python_kaldi_features/build/lib/python_speech_features/sigproc_orig.py b/third_party/python_kaldi_features/build/lib/python_speech_features/sigproc_orig.py new file mode 100644 index 000000000..a786c4fb6 --- /dev/null +++ b/third_party/python_kaldi_features/build/lib/python_speech_features/sigproc_orig.py @@ -0,0 +1,140 @@ +# This file includes routines for basic signal processing including framing and computing power spectra. 
def round_half_up(number):
    """Round *number* to the nearest integer, with ties going away from zero."""
    unit = decimal.Decimal('1')
    rounded = decimal.Decimal(number).quantize(unit, rounding=decimal.ROUND_HALF_UP)
    return int(rounded)


def rolling_window(a, window, step=1):
    """Return overlapping windows of length *window* over the last axis of *a*.

    The windows are a strided view of *a* (no copy); every *step*-th window is kept.
    """
    # http://ellisvalentiner.com/post/2017-03-21-np-strides-trick
    n_windows = a.shape[-1] - window + 1
    view_shape = a.shape[:-1] + (n_windows, window)
    view_strides = a.strides + (a.strides[-1],)
    windows = numpy.lib.stride_tricks.as_strided(a, shape=view_shape, strides=view_strides)
    return windows[::step]


def framesig(sig, frame_len, frame_step, winfunc=lambda x: numpy.ones((x,)), stride_trick=True):
    """Split a signal into overlapping, windowed frames.

    The signal is zero-padded at the end so that the last frame is complete.

    :param sig: the audio signal to frame.
    :param frame_len: length of each frame measured in samples.
    :param frame_step: number of samples between the starts of consecutive frames.
    :param winfunc: called with frame_len, returns the analysis window applied to each frame.
        By default no window is applied (all ones).
    :param stride_trick: build the frames with a strided view instead of fancy indexing.
    :returns: an array of frames. Size is NUMFRAMES by frame_len.
    """
    slen = len(sig)
    frame_len = int(round_half_up(frame_len))
    frame_step = int(round_half_up(frame_step))

    # A signal no longer than one frame yields exactly one (padded) frame.
    numframes = 1
    if slen > frame_len:
        numframes += int(math.ceil((1.0 * slen - frame_len) / frame_step))

    padlen = int((numframes - 1) * frame_step + frame_len)
    padding = numpy.zeros((padlen - slen,))
    padsignal = numpy.concatenate((sig, padding))

    if stride_trick:
        win = winfunc(frame_len)
        frames = rolling_window(padsignal, window=frame_len, step=frame_step)
        return frames * win

    # Fallback: gather every frame through an explicit (numframes, frame_len) index matrix.
    within_frame = numpy.tile(numpy.arange(0, frame_len), (numframes, 1))
    frame_starts = numpy.tile(numpy.arange(0, numframes * frame_step, frame_step), (frame_len, 1)).T
    indices = numpy.array(within_frame + frame_starts, dtype=numpy.int32)
    frames = padsignal[indices]
    win = numpy.tile(winfunc(frame_len), (numframes, 1))
    return frames * win
def magspec(frames, NFFT):
    """Compute the magnitude spectrum of each frame in frames.

    If frames is an NxD matrix, output will be Nx(NFFT/2+1).

    :param frames: the array of frames. Each row is a frame.
    :param NFFT: the FFT length to use. If NFFT > frame_len, the frames are zero-padded.
    :returns: If frames is an NxD matrix, output will be Nx(NFFT/2+1). Each row will be
        the magnitude spectrum of the corresponding frame.
    """
    frame_len = numpy.shape(frames)[1]
    if frame_len > NFFT:
        # logging.warn is a deprecated alias that no longer exists in
        # Python 3.13+; logging.warning is the supported spelling.
        logging.warning(
            'frame length (%d) is greater than FFT size (%d), frame will be truncated. Increase NFFT to avoid.',
            frame_len, NFFT)
    # rfft truncates or zero-pads each row to NFFT samples before transforming.
    complex_spec = numpy.fft.rfft(frames, NFFT)
    return numpy.absolute(complex_spec)
def preemphasis(signal, coeff=0.95):
    """Apply a first-order preemphasis filter to the input signal.

    The first sample is passed through unchanged; every later sample has
    ``coeff`` times its predecessor subtracted from it.

    :param signal: The signal to filter.
    :param coeff: The preemphasis coefficient. 0 is no filter, default is 0.95.
    :returns: the filtered signal.
    """
    first_sample = signal[0]
    filtered_rest = signal[1:] - coeff * signal[:-1]
    return numpy.append(first_sample, filtered_rest)
+PAPEROPT_a4 = -D latex_paper_size=a4 +PAPEROPT_letter = -D latex_paper_size=letter +ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source + +.PHONY: help clean html dirhtml pickle json htmlhelp qthelp latex changes linkcheck doctest + +help: + @echo "Please use \`make ' where is one of" + @echo " html to make standalone HTML files" + @echo " dirhtml to make HTML files named index.html in directories" + @echo " pickle to make pickle files" + @echo " json to make JSON files" + @echo " htmlhelp to make HTML files and a HTML help project" + @echo " qthelp to make HTML files and a qthelp project" + @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" + @echo " changes to make an overview of all changed/added/deprecated items" + @echo " linkcheck to check all external links for integrity" + @echo " doctest to run all doctests embedded in the documentation (if enabled)" + +clean: + -rm -rf $(BUILDDIR)/* + +html: + $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html + @echo + @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." + +dirhtml: + $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml + @echo + @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." + +pickle: + $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle + @echo + @echo "Build finished; now you can process the pickle files." + +json: + $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json + @echo + @echo "Build finished; now you can process the JSON files." + +htmlhelp: + $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp + @echo + @echo "Build finished; now you can run HTML Help Workshop with the" \ + ".hhp project file in $(BUILDDIR)/htmlhelp." 
+ +qthelp: + $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp + @echo + @echo "Build finished; now you can run "qcollectiongenerator" with the" \ + ".qhcp project file in $(BUILDDIR)/qthelp, like this:" + @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/python_speech_features.qhcp" + @echo "To view the help file:" + @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/python_speech_features.qhc" + +latex: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo + @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." + @echo "Run \`make all-pdf' or \`make all-ps' in that directory to" \ + "run these through (pdf)latex." + +changes: + $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes + @echo + @echo "The overview file is in $(BUILDDIR)/changes." + +linkcheck: + $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck + @echo + @echo "Link check complete; look for any errors in the above output " \ + "or in $(BUILDDIR)/linkcheck/output.txt." + +doctest: + $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest + @echo "Testing of doctests in the sources finished, look at the " \ + "results in $(BUILDDIR)/doctest/output.txt." diff --git a/third_party/python_kaldi_features/docs/make.bat b/third_party/python_kaldi_features/docs/make.bat new file mode 100644 index 000000000..a20d0b0da --- /dev/null +++ b/third_party/python_kaldi_features/docs/make.bat @@ -0,0 +1,113 @@ +@ECHO OFF + +REM Command file for Sphinx documentation + +set SPHINXBUILD=sphinx-build +set BUILDDIR=build +set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% source +if NOT "%PAPER%" == "" ( + set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% +) + +if "%1" == "" goto help + +if "%1" == "help" ( + :help + echo.Please use `make ^` where ^ is one of + echo. html to make standalone HTML files + echo. dirhtml to make HTML files named index.html in directories + echo. pickle to make pickle files + echo. 
json to make JSON files + echo. htmlhelp to make HTML files and a HTML help project + echo. qthelp to make HTML files and a qthelp project + echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter + echo. changes to make an overview over all changed/added/deprecated items + echo. linkcheck to check all external links for integrity + echo. doctest to run all doctests embedded in the documentation if enabled + goto end +) + +if "%1" == "clean" ( + for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i + del /q /s %BUILDDIR%\* + goto end +) + +if "%1" == "html" ( + %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html + echo. + echo.Build finished. The HTML pages are in %BUILDDIR%/html. + goto end +) + +if "%1" == "dirhtml" ( + %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml + echo. + echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. + goto end +) + +if "%1" == "pickle" ( + %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle + echo. + echo.Build finished; now you can process the pickle files. + goto end +) + +if "%1" == "json" ( + %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json + echo. + echo.Build finished; now you can process the JSON files. + goto end +) + +if "%1" == "htmlhelp" ( + %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp + echo. + echo.Build finished; now you can run HTML Help Workshop with the ^ +.hhp project file in %BUILDDIR%/htmlhelp. + goto end +) + +if "%1" == "qthelp" ( + %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp + echo. + echo.Build finished; now you can run "qcollectiongenerator" with the ^ +.qhcp project file in %BUILDDIR%/qthelp, like this: + echo.^> qcollectiongenerator %BUILDDIR%\qthelp\python_speech_features.qhcp + echo.To view the help file: + echo.^> assistant -collectionFile %BUILDDIR%\qthelp\python_speech_features.ghc + goto end +) + +if "%1" == "latex" ( + %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex + echo. 
+ echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. + goto end +) + +if "%1" == "changes" ( + %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes + echo. + echo.The overview file is in %BUILDDIR%/changes. + goto end +) + +if "%1" == "linkcheck" ( + %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck + echo. + echo.Link check complete; look for any errors in the above output ^ +or in %BUILDDIR%/linkcheck/output.txt. + goto end +) + +if "%1" == "doctest" ( + %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest + echo. + echo.Testing of doctests in the sources finished, look at the ^ +results in %BUILDDIR%/doctest/output.txt. + goto end +) + +:end diff --git a/third_party/python_kaldi_features/docs/source/conf.py b/third_party/python_kaldi_features/docs/source/conf.py new file mode 100644 index 000000000..727fc3275 --- /dev/null +++ b/third_party/python_kaldi_features/docs/source/conf.py @@ -0,0 +1,202 @@ +# -*- coding: utf-8 -*- +# +# python_speech_features documentation build configuration file, created by +# sphinx-quickstart on Thu Oct 31 16:49:58 2013. +# +# This file is execfile()d with the current directory set to its containing dir. +# +# Note that not all possible configuration values are present in this +# autogenerated file. +# +# All configuration values have a default; values that are commented out +# serve to show the default. + +import sys, os + +import mock + +MOCK_MODULES = ['numpy', 'scipy', 'scipy.fftpack'] +for mod_name in MOCK_MODULES: + sys.modules[mod_name] = mock.Mock() + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +sys.path.insert(0,os.path.abspath('../..')) + +# -- General configuration ----------------------------------------------------- + +# Add any Sphinx extension module names here, as strings. 
They can be extensions +# coming with Sphinx (named 'sphinx.ext.*') or your custom ones. +extensions = ['sphinx.ext.autodoc'] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# The suffix of source filenames. +source_suffix = '.rst' + +# The encoding of source files. +#source_encoding = 'utf-8' + +# The master toctree document. +master_doc = 'index' + +# General information about the project. +project = u'python_speech_features' +copyright = u'2013, James Lyons' + +# The version info for the project you're documenting, acts as replacement for +# |version| and |release|, also used in various other places throughout the +# built documents. +# +# The short X.Y version. +version = '0.1.0' +# The full version, including alpha/beta/rc tags. +release = '0.1.0' + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +#language = None + +# There are two options for replacing |today|: either, you set today to some +# non-false value, then it is used: +#today = '' +# Else, today_fmt is used as the format for a strftime call. +#today_fmt = '%B %d, %Y' + +# List of documents that shouldn't be included in the build. +#unused_docs = [] + +# List of directories, relative to source directory, that shouldn't be searched +# for source files. +exclude_trees = [] + +# The reST default role (used for this markup: `text`) to use for all documents. +#default_role = None + +# If true, '()' will be appended to :func: etc. cross-reference text. +#add_function_parentheses = True + +# If true, the current module name will be prepended to all description +# unit titles (such as .. function::). +#add_module_names = True + +# If true, sectionauthor and moduleauthor directives will be shown in the +# output. They are ignored by default. +#show_authors = False + +# The name of the Pygments (syntax highlighting) style to use. 
+pygments_style = 'sphinx' + +# A list of ignored prefixes for module index sorting. +#modindex_common_prefix = [] + + +# -- Options for HTML output --------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. Major themes that come with +# Sphinx are currently 'default' and 'sphinxdoc'. +html_theme = 'default' + +# Theme options are theme-specific and customize the look and feel of a theme +# further. For a list of options available for each theme, see the +# documentation. +#html_theme_options = {} + +# Add any paths that contain custom themes here, relative to this directory. +#html_theme_path = [] + +# The name for this set of Sphinx documents. If None, it defaults to +# " v documentation". +#html_title = None + +# A shorter title for the navigation bar. Default is the same as html_title. +#html_short_title = None + +# The name of an image file (relative to this directory) to place at the top +# of the sidebar. +#html_logo = None + +# The name of an image file (within the static path) to use as favicon of the +# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 +# pixels large. +#html_favicon = None + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['_static'] + +# If not '', a 'Last updated on:' timestamp is inserted at every page bottom, +# using the given strftime format. +#html_last_updated_fmt = '%b %d, %Y' + +# If true, SmartyPants will be used to convert quotes and dashes to +# typographically correct entities. +#html_use_smartypants = True + +# Custom sidebar templates, maps document names to template names. +#html_sidebars = {} + +# Additional templates that should be rendered to pages, maps page names to +# template names. 
+#html_additional_pages = {} + +# If false, no module index is generated. +#html_use_modindex = True + +# If false, no index is generated. +#html_use_index = True + +# If true, the index is split into individual pages for each letter. +#html_split_index = False + +# If true, links to the reST sources are added to the pages. +#html_show_sourcelink = True + +# If true, an OpenSearch description file will be output, and all pages will +# contain a tag referring to it. The value of this option must be the +# base URL from which the finished HTML is served. +#html_use_opensearch = '' + +# If nonempty, this is the file name suffix for HTML files (e.g. ".xhtml"). +#html_file_suffix = '' + +# Output file base name for HTML help builder. +htmlhelp_basename = 'python_speech_featuresdoc' + + +# -- Options for LaTeX output -------------------------------------------------- + +# The paper size ('letter' or 'a4'). +#latex_paper_size = 'letter' + +# The font size ('10pt', '11pt' or '12pt'). +#latex_font_size = '10pt' + +# Grouping the document tree into LaTeX files. List of tuples +# (source start file, target name, title, author, documentclass [howto/manual]). +latex_documents = [ + ('index', 'python_speech_features.tex', u'python\\_speech\\_features Documentation', + u'James Lyons', 'manual'), +] + +# The name of an image file (relative to this directory) to place at the top of +# the title page. +#latex_logo = None + +# For "manual" documents, if this is true, then toplevel headings are parts, +# not chapters. +#latex_use_parts = False + +# Additional stuff for the LaTeX preamble. +#latex_preamble = '' + +# Documents to append as an appendix to all manuals. +#latex_appendices = [] + +# If false, no module index is generated. 
+#latex_use_modindex = True + +autodoc_member_order = 'bysource' diff --git a/third_party/python_kaldi_features/docs/source/index.rst b/third_party/python_kaldi_features/docs/source/index.rst new file mode 100644 index 000000000..93ec5a4ba --- /dev/null +++ b/third_party/python_kaldi_features/docs/source/index.rst @@ -0,0 +1,54 @@ +.. python_speech_features documentation master file, created by + sphinx-quickstart on Thu Oct 31 16:49:58 2013. + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +Welcome to python_speech_features's documentation! +================================================== + +This library provides common speech features for ASR including MFCCs and filterbank energies. +If you are not sure what MFCCs are, and would like to know more, have a look at this MFCC tutorial: +http://www.practicalcryptography.com/miscellaneous/machine-learning/guide-mel-frequency-cepstral-coefficients-mfccs/. + +You will need numpy and scipy to run these files. The code for this project is available at https://github.com/jameslyons/python_speech_features . + +Supported features: + +- :py:meth:`python_speech_features.mfcc` - Mel Frequency Cepstral Coefficients +- :py:meth:`python_speech_features.fbank` - Filterbank Energies +- :py:meth:`python_speech_features.logfbank` - Log Filterbank Energies +- :py:meth:`python_speech_features.ssc` - Spectral Subband Centroids + +To use MFCC features:: + + from python_speech_features import mfcc + from python_speech_features import logfbank + import scipy.io.wavfile as wav + + (rate,sig) = wav.read("file.wav") + mfcc_feat = mfcc(sig,rate) + fbank_feat = logfbank(sig,rate) + + print(fbank_feat[1:3,:]) + +From here you can write the features to a file etc. + +Functions provided in python_speech_features module +--------------------------------------------------- + +.. 
automodule:: python_speech_features.base + :members: + + +Functions provided in sigproc module +------------------------------------ +.. automodule:: python_speech_features.sigproc + :members: + + +Indices and tables +================== + +* :ref:`genindex` +* :ref:`search` + diff --git a/third_party/python_kaldi_features/english.wav b/third_party/python_kaldi_features/english.wav new file mode 100644 index 000000000..bb28291f6 Binary files /dev/null and b/third_party/python_kaldi_features/english.wav differ diff --git a/third_party/python_kaldi_features/example.py b/third_party/python_kaldi_features/example.py new file mode 100644 index 000000000..abbe4d021 --- /dev/null +++ b/third_party/python_kaldi_features/example.py @@ -0,0 +1,29 @@ +#!/usr/bin/env python + +from python_speech_features import mfcc +from python_speech_features import delta +from python_speech_features import logfbank +import scipy.io.wavfile as wav + +(rate,sig) = wav.read("english.wav") + +# note that generally nfilt=40 is used for speech recognition +fbank_feat = logfbank(sig,nfilt=23,lowfreq=20,dither=0,wintype='povey') + +# the computed fbank coefficents of english.wav with dimension [110,23] +# [ 12.2865 12.6906 13.1765 15.714 16.064 15.7553 16.5746 16.9205 16.6472 16.1302 16.4576 16.7326 16.8864 17.7215 18.88 19.1377 19.1495 18.6683 18.3886 20.3506 20.2772 18.8248 18.1899 +# 11.9198 13.146 14.7215 15.8642 17.4288 16.394 16.8238 16.1095 16.4297 16.6331 16.3163 16.5093 17.4981 18.3429 19.6555 19.6263 19.8435 19.0534 19.001 20.0287 19.7707 19.5852 19.1112 +# ... +# ... 
+# the same with that using kaldi commands: compute-fbank-feats --dither=0.0 + + +mfcc_feat = mfcc(sig,dither=0,useEnergy=True,wintype='povey') + +# the computed mfcc coefficents of english.wav with dimension [110,13] +# [ 17.1337 -23.3651 -7.41751 -7.73686 -21.3682 -8.93884 -3.70843 4.68346 -16.0676 12.782 -7.24054 8.25089 10.7292 +# 17.1692 -23.3028 -5.61872 -4.0075 -23.287 -20.6101 -5.51584 -6.15273 -14.4333 8.13052 -0.0345329 2.06274 -0.564298 +# ... +# ... +# the same with that using kaldi commands: compute-mfcc-feats --dither=0.0 + diff --git a/third_party/python_kaldi_features/python_speech_features.egg-info/PKG-INFO b/third_party/python_kaldi_features/python_speech_features.egg-info/PKG-INFO new file mode 100644 index 000000000..c08c0032c --- /dev/null +++ b/third_party/python_kaldi_features/python_speech_features.egg-info/PKG-INFO @@ -0,0 +1,10 @@ +Metadata-Version: 1.0 +Name: python-speech-features +Version: 0.6 +Summary: Python Speech Feature extraction +Home-page: https://github.com/jameslyons/python_speech_features +Author: James Lyons +Author-email: james.lyons0@gmail.com +License: MIT +Description: UNKNOWN +Platform: UNKNOWN diff --git a/third_party/python_kaldi_features/python_speech_features.egg-info/SOURCES.txt b/third_party/python_kaldi_features/python_speech_features.egg-info/SOURCES.txt new file mode 100644 index 000000000..492aefcae --- /dev/null +++ b/third_party/python_kaldi_features/python_speech_features.egg-info/SOURCES.txt @@ -0,0 +1,12 @@ +README.rst +setup.py +python_speech_features/__init__.py +python_speech_features/base.py +python_speech_features/base_orig.py +python_speech_features/sigproc.py +python_speech_features/sigproc_orig.py +python_speech_features.egg-info/PKG-INFO +python_speech_features.egg-info/SOURCES.txt +python_speech_features.egg-info/dependency_links.txt +python_speech_features.egg-info/top_level.txt +test/test_sigproc.py \ No newline at end of file diff --git 
def mfcc(signal,samplerate=16000,winlen=0.025,winstep=0.01,numcep=13,
         nfilt=23,nfft=512,lowfreq=20,highfreq=None,dither=1.0,remove_dc_offset=True,preemph=0.97,
         ceplifter=22,useEnergy=True,wintype='povey'):
    """Compute MFCC features from an audio signal.

    The signal is turned into Mel-filterbank energies by ``fbank``; their log is
    DCT-transformed, truncated to ``numcep`` coefficients and liftered.

    :param signal: the audio signal from which to compute features. Should be an N*1 array
    :param samplerate: the samplerate of the signal we are working with. Default is 16000.
    :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
    :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
    :param numcep: the number of cepstrum to return, default 13
    :param nfilt: the number of filters in the filterbank, default 23.
    :param nfft: the FFT size. Default is 512.
    :param lowfreq: lowest band edge of mel filters. In Hz, default is 20.
    :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
    :param dither: forwarded unchanged to ``fbank``. Default is 1.0.
        NOTE(review): presumably the dithering noise scale, with 0 disabling it - confirm in sigproc.
    :param remove_dc_offset: forwarded unchanged to ``fbank``. Default is True.
        NOTE(review): presumably toggles mean removal - confirm in sigproc.
    :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
    :param ceplifter: apply a lifter to final cepstral coefficients. 0 is no lifter. Default is 22.
    :param useEnergy: if this is true, the zeroth cepstral coefficient is replaced with the log of the frame energy returned by ``fbank``.
    :param wintype: name of the analysis window, forwarded unchanged to ``fbank``. Default is 'povey'.
    :returns: A numpy array of size (NUMFRAMES by numcep) containing features. Each row holds 1 feature vector.
    """
    fbank_energies, frame_energy = fbank(
        signal, samplerate, winlen, winstep, nfilt, nfft, lowfreq, highfreq,
        dither, remove_dc_offset, preemph, wintype)
    log_energies = numpy.log(fbank_energies)
    cepstra = dct(log_energies, type=2, axis=1, norm='ortho')
    cepstra = lifter(cepstra[:, :numcep], ceplifter)
    if not useEnergy:
        return cepstra
    # replace first cepstral coefficient with log of frame energy
    cepstra[:, 0] = numpy.log(frame_energy)
    return cepstra
def logfbank(signal,samplerate=16000,winlen=0.025,winstep=0.01,
             nfilt=40,nfft=512,lowfreq=64,highfreq=None,dither=1.0,remove_dc_offset=True,preemph=0.97,wintype='hamming'):
    """Compute log Mel-filterbank energy features from an audio signal.

    This is the natural log of the filterbank energies returned by ``fbank``;
    the per-frame energy that ``fbank`` also returns is discarded.

    :param signal: the audio signal from which to compute features. Should be an N*1 array
    :param samplerate: the samplerate of the signal we are working with. Default is 16000.
    :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
    :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
    :param nfilt: the number of filters in the filterbank, default 40.
    :param nfft: the FFT size. Default is 512.
    :param lowfreq: lowest band edge of mel filters. In Hz, default is 64.
    :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
    :param dither: forwarded unchanged to ``fbank``. Default is 1.0.
        NOTE(review): presumably the dithering noise scale, with 0 disabling it - confirm in sigproc.
    :param remove_dc_offset: forwarded unchanged to ``fbank``. Default is True.
        NOTE(review): presumably toggles mean removal - confirm in sigproc.
    :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
    :param wintype: name of the analysis window, forwarded unchanged to ``fbank``. Default is 'hamming'.
    :returns: A numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector.
    """
    fbank_energies, _frame_energy = fbank(
        signal, samplerate, winlen, winstep, nfilt, nfft, lowfreq, highfreq,
        dither, remove_dc_offset, preemph, wintype)
    return numpy.log(fbank_energies)
+ :param lowfreq: lowest band edge of mel filters, default 0 Hz + :param highfreq: highest band edge of mel filters, default samplerate/2 + :returns: A numpy array of size nfilt * (nfft/2 + 1) containing filterbank. Each row holds 1 filter. + """ + highfreq= highfreq or samplerate/2 + assert highfreq <= samplerate/2, "highfreq is greater than samplerate/2" + + # compute points evenly spaced in mels + lowmel = hz2mel(lowfreq) + highmel = hz2mel(highfreq) + + # check kaldi/src/feat/Mel-computations.h + fbank = numpy.zeros([nfilt,nfft//2+1]) + mel_freq_delta = (highmel-lowmel)/(nfilt+1) + for j in range(0,nfilt): + leftmel = lowmel+j*mel_freq_delta + centermel = lowmel+(j+1)*mel_freq_delta + rightmel = lowmel+(j+2)*mel_freq_delta + for i in range(0,nfft//2): + mel=hz2mel(i*samplerate/nfft) + if mel>leftmel and mel 0: + nframes,ncoeff = numpy.shape(cepstra) + n = numpy.arange(ncoeff) + lift = 1 + (L/2.)*numpy.sin(numpy.pi*n/L) + return lift*cepstra + else: + # values of L <= 0, do nothing + return cepstra + +def delta(feat, N): + """Compute delta features from a feature vector sequence. + + :param feat: A numpy array of size (NUMFRAMES by number of features) containing features. Each row holds 1 feature vector. + :param N: For each frame, calculate delta features based on preceding and following N frames + :returns: A numpy array of size (NUMFRAMES by number of features) containing delta features. Each row holds 1 delta feature vector. 
+ """ + if N < 1: + raise ValueError('N must be an integer >= 1') + NUMFRAMES = len(feat) + denominator = 2 * sum([i**2 for i in range(1, N+1)]) + delta_feat = numpy.empty_like(feat) + padded = numpy.pad(feat, ((N, N), (0, 0)), mode='edge') # padded version of feat + for t in range(NUMFRAMES): + delta_feat[t] = numpy.dot(numpy.arange(-N, N+1), padded[t : t+2*N+1]) / denominator # [t : t+2*N+1] == [(N+t)-N : (N+t)+N+1] + return delta_feat diff --git a/third_party/python_kaldi_features/python_speech_features/base_orig.py b/third_party/python_kaldi_features/python_speech_features/base_orig.py new file mode 100644 index 000000000..3efaec190 --- /dev/null +++ b/third_party/python_kaldi_features/python_speech_features/base_orig.py @@ -0,0 +1,190 @@ +# calculate filterbank features. Provides e.g. fbank and mfcc features for use in ASR applications +# Author: James Lyons 2012 +from __future__ import division +import numpy +from python_speech_features import sigproc +from scipy.fftpack import dct + +def mfcc(signal,samplerate=16000,winlen=0.025,winstep=0.01,numcep=13, + nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97,ceplifter=22,appendEnergy=True, + winfunc=lambda x:numpy.ones((x,))): + """Compute MFCC features from an audio signal. + + :param signal: the audio signal from which to compute features. Should be an N*1 array + :param samplerate: the samplerate of the signal we are working with. + :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds) + :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds) + :param numcep: the number of cepstrum to return, default 13 + :param nfilt: the number of filters in the filterbank, default 26. + :param nfft: the FFT size. Default is 512. + :param lowfreq: lowest band edge of mel filters. In Hz, default is 0. + :param highfreq: highest band edge of mel filters. 
In Hz, default is samplerate/2 + :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97. + :param ceplifter: apply a lifter to final cepstral coefficients. 0 is no lifter. Default is 22. + :param appendEnergy: if this is true, the zeroth cepstral coefficient is replaced with the log of the total frame energy. + :param winfunc: the analysis window to apply to each frame. By default no window is applied. You can use numpy window functions here e.g. winfunc=numpy.hamming + :returns: A numpy array of size (NUMFRAMES by numcep) containing features. Each row holds 1 feature vector. + """ + feat,energy = fbank(signal,samplerate,winlen,winstep,nfilt,nfft,lowfreq,highfreq,preemph,winfunc) + feat = numpy.log(feat) + feat = dct(feat, type=2, axis=1, norm='ortho')[:,:numcep] + feat = lifter(feat,ceplifter) + if appendEnergy: feat[:,0] = numpy.log(energy) # replace first cepstral coefficient with log of frame energy + return feat + +def fbank(signal,samplerate=16000,winlen=0.025,winstep=0.01, + nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97, + winfunc=lambda x:numpy.ones((x,))): + """Compute Mel-filterbank energy features from an audio signal. + + :param signal: the audio signal from which to compute features. Should be an N*1 array + :param samplerate: the samplerate of the signal we are working with. + :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds) + :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds) + :param nfilt: the number of filters in the filterbank, default 26. + :param nfft: the FFT size. Default is 512. + :param lowfreq: lowest band edge of mel filters. In Hz, default is 0. + :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2 + :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97. 
+ :param winfunc: the analysis window to apply to each frame. By default no window is applied. You can use numpy window functions here e.g. winfunc=numpy.hamming + :returns: 2 values. The first is a numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector. The + second return value is the energy in each frame (total energy, unwindowed) + """ + highfreq= highfreq or samplerate/2 + signal = sigproc.preemphasis(signal,preemph) + frames = sigproc.framesig(signal, winlen*samplerate, winstep*samplerate, winfunc) + pspec = sigproc.powspec(frames,nfft) + energy = numpy.sum(pspec,1) # this stores the total energy in each frame + energy = numpy.where(energy == 0,numpy.finfo(float).eps,energy) # if energy is zero, we get problems with log + + fb = get_filterbanks(nfilt,nfft,samplerate,lowfreq,highfreq) + feat = numpy.dot(pspec,fb.T) # compute the filterbank energies + feat = numpy.where(feat == 0,numpy.finfo(float).eps,feat) # if feat is zero, we get problems with log + + return feat,energy + +def logfbank(signal,samplerate=16000,winlen=0.025,winstep=0.01, + nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97): + """Compute log Mel-filterbank energy features from an audio signal. + + :param signal: the audio signal from which to compute features. Should be an N*1 array + :param samplerate: the samplerate of the signal we are working with. + :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds) + :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds) + :param nfilt: the number of filters in the filterbank, default 26. + :param nfft: the FFT size. Default is 512. + :param lowfreq: lowest band edge of mel filters. In Hz, default is 0. + :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2 + :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97. 
+ :returns: A numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector. + """ + feat,energy = fbank(signal,samplerate,winlen,winstep,nfilt,nfft,lowfreq,highfreq,preemph) + return numpy.log(feat) + +def ssc(signal,samplerate=16000,winlen=0.025,winstep=0.01, + nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97, + winfunc=lambda x:numpy.ones((x,))): + """Compute Spectral Subband Centroid features from an audio signal. + + :param signal: the audio signal from which to compute features. Should be an N*1 array + :param samplerate: the samplerate of the signal we are working with. + :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds) + :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds) + :param nfilt: the number of filters in the filterbank, default 26. + :param nfft: the FFT size. Default is 512. + :param lowfreq: lowest band edge of mel filters. In Hz, default is 0. + :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2 + :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97. + :param winfunc: the analysis window to apply to each frame. By default no window is applied. You can use numpy window functions here e.g. winfunc=numpy.hamming + :returns: A numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector. 
+ """ + highfreq= highfreq or samplerate/2 + signal = sigproc.preemphasis(signal,preemph) + frames = sigproc.framesig(signal, winlen*samplerate, winstep*samplerate, winfunc) + pspec = sigproc.powspec(frames,nfft) + pspec = numpy.where(pspec == 0,numpy.finfo(float).eps,pspec) # if things are all zeros we get problems + + fb = get_filterbanks(nfilt,nfft,samplerate,lowfreq,highfreq) + feat = numpy.dot(pspec,fb.T) # compute the filterbank energies + R = numpy.tile(numpy.linspace(1,samplerate/2,numpy.size(pspec,1)),(numpy.size(pspec,0),1)) + + return numpy.dot(pspec*R,fb.T) / feat + +def hz2mel(hz): + """Convert a value in Hertz to Mels + + :param hz: a value in Hz. This can also be a numpy array, conversion proceeds element-wise. + :returns: a value in Mels. If an array was passed in, an identical sized array is returned. + """ + return 2595 * numpy.log10(1+hz/700.) + +def mel2hz(mel): + """Convert a value in Mels to Hertz + + :param mel: a value in Mels. This can also be a numpy array, conversion proceeds element-wise. + :returns: a value in Hertz. If an array was passed in, an identical sized array is returned. + """ + return 700*(10**(mel/2595.0)-1) + +def get_filterbanks(nfilt=20,nfft=512,samplerate=16000,lowfreq=0,highfreq=None): + """Compute a Mel-filterbank. The filters are stored in the rows, the columns correspond + to fft bins. The filters are returned as an array of size nfilt * (nfft/2 + 1) + + :param nfilt: the number of filters in the filterbank, default 20. + :param nfft: the FFT size. Default is 512. + :param samplerate: the samplerate of the signal we are working with. Affects mel spacing. + :param lowfreq: lowest band edge of mel filters, default 0 Hz + :param highfreq: highest band edge of mel filters, default samplerate/2 + :returns: A numpy array of size nfilt * (nfft/2 + 1) containing filterbank. Each row holds 1 filter. 
+ """ + highfreq= highfreq or samplerate/2 + assert highfreq <= samplerate/2, "highfreq is greater than samplerate/2" + + # compute points evenly spaced in mels + lowmel = hz2mel(lowfreq) + highmel = hz2mel(highfreq) + melpoints = numpy.linspace(lowmel,highmel,nfilt+2) + # our points are in Hz, but we use fft bins, so we have to convert + # from Hz to fft bin number + bin = numpy.floor((nfft+1)*mel2hz(melpoints)/samplerate) + + fbank = numpy.zeros([nfilt,nfft//2+1]) + for j in range(0,nfilt): + for i in range(int(bin[j]), int(bin[j+1])): + fbank[j,i] = (i - bin[j]) / (bin[j+1]-bin[j]) + for i in range(int(bin[j+1]), int(bin[j+2])): + fbank[j,i] = (bin[j+2]-i) / (bin[j+2]-bin[j+1]) + return fbank + +def lifter(cepstra, L=22): + """Apply a cepstral lifter the the matrix of cepstra. This has the effect of increasing the + magnitude of the high frequency DCT coeffs. + + :param cepstra: the matrix of mel-cepstra, will be numframes * numcep in size. + :param L: the liftering coefficient to use. Default is 22. L <= 0 disables lifter. + """ + if L > 0: + nframes,ncoeff = numpy.shape(cepstra) + n = numpy.arange(ncoeff) + lift = 1 + (L/2.)*numpy.sin(numpy.pi*n/L) + return lift*cepstra + else: + # values of L <= 0, do nothing + return cepstra + +def delta(feat, N): + """Compute delta features from a feature vector sequence. + + :param feat: A numpy array of size (NUMFRAMES by number of features) containing features. Each row holds 1 feature vector. + :param N: For each frame, calculate delta features based on preceding and following N frames + :returns: A numpy array of size (NUMFRAMES by number of features) containing delta features. Each row holds 1 delta feature vector. 
+ """ + if N < 1: + raise ValueError('N must be an integer >= 1') + NUMFRAMES = len(feat) + denominator = 2 * sum([i**2 for i in range(1, N+1)]) + delta_feat = numpy.empty_like(feat) + padded = numpy.pad(feat, ((N, N), (0, 0)), mode='edge') # padded version of feat + for t in range(NUMFRAMES): + delta_feat[t] = numpy.dot(numpy.arange(-N, N+1), padded[t : t+2*N+1]) / denominator # [t : t+2*N+1] == [(N+t)-N : (N+t)+N+1] + return delta_feat diff --git a/third_party/python_kaldi_features/python_speech_features/sigproc.py b/third_party/python_kaldi_features/python_speech_features/sigproc.py new file mode 100644 index 000000000..b7c78a803 --- /dev/null +++ b/third_party/python_kaldi_features/python_speech_features/sigproc.py @@ -0,0 +1,158 @@ +# This file includes routines for basic signal processing including framing and computing power spectra. +# Author: James Lyons 2012 +import decimal + +import numpy +import math +import logging + + +def round_half_up(number): + return int(decimal.Decimal(number).quantize(decimal.Decimal('1'), rounding=decimal.ROUND_HALF_UP)) + + +def rolling_window(a, window, step=1): + # http://ellisvalentiner.com/post/2017-03-21-np-strides-trick + shape = a.shape[:-1] + (a.shape[-1] - window + 1, window) + strides = a.strides + (a.strides[-1],) + return numpy.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)[::step] + + +def framesig(sig, frame_len, frame_step, dither=1.0, preemph=0.97, remove_dc_offset=True, wintype='hamming', stride_trick=True): + """Frame a signal into overlapping frames. + + :param sig: the audio signal to frame. + :param frame_len: length of each frame measured in samples. + :param frame_step: number of samples after the start of the previous frame that the next frame should begin. + :param winfunc: the analysis window to apply to each frame. By default no window is applied. + :param stride_trick: use stride trick to compute the rolling window and window multiplication faster + :returns: an array of frames. 
Size is NUMFRAMES by frame_len. + """ + slen = len(sig) + frame_len = int(round_half_up(frame_len)) + frame_step = int(round_half_up(frame_step)) + if slen <= frame_len: + numframes = 1 + else: + numframes = 1 + (( slen - frame_len) // frame_step) + + # check kaldi/src/feat/feature-window.h + padsignal = sig[:(numframes-1)*frame_step+frame_len] + if wintype is 'povey': + win = numpy.empty(frame_len) + for i in range(frame_len): + win[i] = (0.5-0.5*numpy.cos(2*numpy.pi/(frame_len-1)*i))**0.85 + else: # the hamming window + win = numpy.hamming(frame_len) + + if stride_trick: + frames = rolling_window(padsignal, window=frame_len, step=frame_step) + else: + indices = numpy.tile(numpy.arange(0, frame_len), (numframes, 1)) + numpy.tile( + numpy.arange(0, numframes * frame_step, frame_step), (frame_len, 1)).T + indices = numpy.array(indices, dtype=numpy.int32) + frames = padsignal[indices] + win = numpy.tile(win, (numframes, 1)) + + frames = frames.astype(numpy.float32) + raw_frames = numpy.zeros(frames.shape) + for frm in range(frames.shape[0]): + frames[frm,:] = do_dither(frames[frm,:], dither) # dither + frames[frm,:] = do_remove_dc_offset(frames[frm,:]) # remove dc offset + raw_frames[frm,:] = frames[frm,:] + frames[frm,:] = do_preemphasis(frames[frm,:], preemph) # preemphasize + + return frames * win, raw_frames + +def deframesig(frames, siglen, frame_len, frame_step, winfunc=lambda x: numpy.ones((x,))): + """Does overlap-add procedure to undo the action of framesig. + + :param frames: the array of frames. + :param siglen: the length of the desired signal, use 0 if unknown. Output will be truncated to siglen samples. + :param frame_len: length of each frame measured in samples. + :param frame_step: number of samples after the start of the previous frame that the next frame should begin. + :param winfunc: the analysis window to apply to each frame. By default no window is applied. + :returns: a 1-D signal. 
+ """ + frame_len = round_half_up(frame_len) + frame_step = round_half_up(frame_step) + numframes = numpy.shape(frames)[0] + assert numpy.shape(frames)[1] == frame_len, '"frames" matrix is wrong size, 2nd dim is not equal to frame_len' + + indices = numpy.tile(numpy.arange(0, frame_len), (numframes, 1)) + numpy.tile( + numpy.arange(0, numframes * frame_step, frame_step), (frame_len, 1)).T + indices = numpy.array(indices, dtype=numpy.int32) + padlen = (numframes - 1) * frame_step + frame_len + + if siglen <= 0: siglen = padlen + + rec_signal = numpy.zeros((padlen,)) + window_correction = numpy.zeros((padlen,)) + win = winfunc(frame_len) + + for i in range(0, numframes): + window_correction[indices[i, :]] = window_correction[ + indices[i, :]] + win + 1e-15 # add a little bit so it is never zero + rec_signal[indices[i, :]] = rec_signal[indices[i, :]] + frames[i, :] + + rec_signal = rec_signal / window_correction + return rec_signal[0:siglen] + + +def magspec(frames, NFFT): + """Compute the magnitude spectrum of each frame in frames. If frames is an NxD matrix, output will be Nx(NFFT/2+1). + + :param frames: the array of frames. Each row is a frame. + :param NFFT: the FFT length to use. If NFFT > frame_len, the frames are zero-padded. + :returns: If frames is an NxD matrix, output will be Nx(NFFT/2+1). Each row will be the magnitude spectrum of the corresponding frame. + """ + if numpy.shape(frames)[1] > NFFT: + logging.warn( + 'frame length (%d) is greater than FFT size (%d), frame will be truncated. Increase NFFT to avoid.', + numpy.shape(frames)[1], NFFT) + complex_spec = numpy.fft.rfft(frames, NFFT) + return numpy.absolute(complex_spec) + + +def powspec(frames, NFFT): + """Compute the power spectrum of each frame in frames. If frames is an NxD matrix, output will be Nx(NFFT/2+1). + + :param frames: the array of frames. Each row is a frame. + :param NFFT: the FFT length to use. If NFFT > frame_len, the frames are zero-padded. 
+ :returns: If frames is an NxD matrix, output will be Nx(NFFT/2+1). Each row will be the power spectrum of the corresponding frame. + """ + return numpy.square(magspec(frames, NFFT)) + + +def logpowspec(frames, NFFT, norm=1): + """Compute the log power spectrum of each frame in frames. If frames is an NxD matrix, output will be Nx(NFFT/2+1). + + :param frames: the array of frames. Each row is a frame. + :param NFFT: the FFT length to use. If NFFT > frame_len, the frames are zero-padded. + :param norm: If norm=1, the log power spectrum is normalised so that the max value (across all frames) is 0. + :returns: If frames is an NxD matrix, output will be Nx(NFFT/2+1). Each row will be the log power spectrum of the corresponding frame. + """ + ps = powspec(frames, NFFT); + ps[ps <= 1e-30] = 1e-30 + lps = 10 * numpy.log10(ps) + if norm: + return lps - numpy.max(lps) + else: + return lps + +def do_dither(signal, dither_value=1.0): + signal += numpy.random.normal(size=signal.shape) * dither_value + return signal + +def do_remove_dc_offset(signal): + signal -= numpy.mean(signal) + return signal + +def do_preemphasis(signal, coeff=0.97): + """perform preemphasis on the input signal. + + :param signal: The signal to filter. + :param coeff: The preemphasis coefficient. 0 is no filter, default is 0.95. + :returns: the filtered signal. + """ + return numpy.append((1-coeff)*signal[0], signal[1:] - coeff * signal[:-1]) diff --git a/third_party/python_kaldi_features/python_speech_features/sigproc_orig.py b/third_party/python_kaldi_features/python_speech_features/sigproc_orig.py new file mode 100644 index 000000000..a786c4fb6 --- /dev/null +++ b/third_party/python_kaldi_features/python_speech_features/sigproc_orig.py @@ -0,0 +1,140 @@ +# This file includes routines for basic signal processing including framing and computing power spectra. 
+# Author: James Lyons 2012 +import decimal + +import numpy +import math +import logging + + +def round_half_up(number): + return int(decimal.Decimal(number).quantize(decimal.Decimal('1'), rounding=decimal.ROUND_HALF_UP)) + + +def rolling_window(a, window, step=1): + # http://ellisvalentiner.com/post/2017-03-21-np-strides-trick + shape = a.shape[:-1] + (a.shape[-1] - window + 1, window) + strides = a.strides + (a.strides[-1],) + return numpy.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)[::step] + + +def framesig(sig, frame_len, frame_step, winfunc=lambda x: numpy.ones((x,)), stride_trick=True): + """Frame a signal into overlapping frames. + + :param sig: the audio signal to frame. + :param frame_len: length of each frame measured in samples. + :param frame_step: number of samples after the start of the previous frame that the next frame should begin. + :param winfunc: the analysis window to apply to each frame. By default no window is applied. + :param stride_trick: use stride trick to compute the rolling window and window multiplication faster + :returns: an array of frames. Size is NUMFRAMES by frame_len. 
+ """ + slen = len(sig) + frame_len = int(round_half_up(frame_len)) + frame_step = int(round_half_up(frame_step)) + if slen <= frame_len: + numframes = 1 + else: + numframes = 1 + int(math.ceil((1.0 * slen - frame_len) / frame_step)) + + padlen = int((numframes - 1) * frame_step + frame_len) + + zeros = numpy.zeros((padlen - slen,)) + padsignal = numpy.concatenate((sig, zeros)) + if stride_trick: + win = winfunc(frame_len) + frames = rolling_window(padsignal, window=frame_len, step=frame_step) + else: + indices = numpy.tile(numpy.arange(0, frame_len), (numframes, 1)) + numpy.tile( + numpy.arange(0, numframes * frame_step, frame_step), (frame_len, 1)).T + indices = numpy.array(indices, dtype=numpy.int32) + frames = padsignal[indices] + win = numpy.tile(winfunc(frame_len), (numframes, 1)) + + return frames * win + + +def deframesig(frames, siglen, frame_len, frame_step, winfunc=lambda x: numpy.ones((x,))): + """Does overlap-add procedure to undo the action of framesig. + + :param frames: the array of frames. + :param siglen: the length of the desired signal, use 0 if unknown. Output will be truncated to siglen samples. + :param frame_len: length of each frame measured in samples. + :param frame_step: number of samples after the start of the previous frame that the next frame should begin. + :param winfunc: the analysis window to apply to each frame. By default no window is applied. + :returns: a 1-D signal. 
+ """ + frame_len = round_half_up(frame_len) + frame_step = round_half_up(frame_step) + numframes = numpy.shape(frames)[0] + assert numpy.shape(frames)[1] == frame_len, '"frames" matrix is wrong size, 2nd dim is not equal to frame_len' + + indices = numpy.tile(numpy.arange(0, frame_len), (numframes, 1)) + numpy.tile( + numpy.arange(0, numframes * frame_step, frame_step), (frame_len, 1)).T + indices = numpy.array(indices, dtype=numpy.int32) + padlen = (numframes - 1) * frame_step + frame_len + + if siglen <= 0: siglen = padlen + + rec_signal = numpy.zeros((padlen,)) + window_correction = numpy.zeros((padlen,)) + win = winfunc(frame_len) + + for i in range(0, numframes): + window_correction[indices[i, :]] = window_correction[ + indices[i, :]] + win + 1e-15 # add a little bit so it is never zero + rec_signal[indices[i, :]] = rec_signal[indices[i, :]] + frames[i, :] + + rec_signal = rec_signal / window_correction + return rec_signal[0:siglen] + + +def magspec(frames, NFFT): + """Compute the magnitude spectrum of each frame in frames. If frames is an NxD matrix, output will be Nx(NFFT/2+1). + + :param frames: the array of frames. Each row is a frame. + :param NFFT: the FFT length to use. If NFFT > frame_len, the frames are zero-padded. + :returns: If frames is an NxD matrix, output will be Nx(NFFT/2+1). Each row will be the magnitude spectrum of the corresponding frame. + """ + if numpy.shape(frames)[1] > NFFT: + logging.warn( + 'frame length (%d) is greater than FFT size (%d), frame will be truncated. Increase NFFT to avoid.', + numpy.shape(frames)[1], NFFT) + complex_spec = numpy.fft.rfft(frames, NFFT) + return numpy.absolute(complex_spec) + + +def powspec(frames, NFFT): + """Compute the power spectrum of each frame in frames. If frames is an NxD matrix, output will be Nx(NFFT/2+1). + + :param frames: the array of frames. Each row is a frame. + :param NFFT: the FFT length to use. If NFFT > frame_len, the frames are zero-padded. 
+ :returns: If frames is an NxD matrix, output will be Nx(NFFT/2+1). Each row will be the power spectrum of the corresponding frame. + """ + return 1.0 / NFFT * numpy.square(magspec(frames, NFFT)) + + +def logpowspec(frames, NFFT, norm=1): + """Compute the log power spectrum of each frame in frames. If frames is an NxD matrix, output will be Nx(NFFT/2+1). + + :param frames: the array of frames. Each row is a frame. + :param NFFT: the FFT length to use. If NFFT > frame_len, the frames are zero-padded. + :param norm: If norm=1, the log power spectrum is normalised so that the max value (across all frames) is 0. + :returns: If frames is an NxD matrix, output will be Nx(NFFT/2+1). Each row will be the log power spectrum of the corresponding frame. + """ + ps = powspec(frames, NFFT); + ps[ps <= 1e-30] = 1e-30 + lps = 10 * numpy.log10(ps) + if norm: + return lps - numpy.max(lps) + else: + return lps + + +def preemphasis(signal, coeff=0.95): + """perform preemphasis on the input signal. + + :param signal: The signal to filter. + :param coeff: The preemphasis coefficient. 0 is no filter, default is 0.95. + :returns: the filtered signal. 
+ """ + return numpy.append(signal[0], signal[1:] - coeff * signal[:-1]) diff --git a/third_party/python_kaldi_features/requirements.txt b/third_party/python_kaldi_features/requirements.txt new file mode 100644 index 000000000..a881eb051 --- /dev/null +++ b/third_party/python_kaldi_features/requirements.txt @@ -0,0 +1,3 @@ +mock +scipy +numpy diff --git a/third_party/python_kaldi_features/setup.py b/third_party/python_kaldi_features/setup.py new file mode 100644 index 000000000..47c777186 --- /dev/null +++ b/third_party/python_kaldi_features/setup.py @@ -0,0 +1,14 @@ +try: + from setuptools import setup #enables develop +except ImportError: + from distutils.core import setup + +setup(name='python_speech_features', + version='0.6', + description='Python Speech Feature extraction', + author='James Lyons', + author_email='james.lyons0@gmail.com', + license='MIT', + url='https://github.com/jameslyons/python_speech_features', + packages=['python_speech_features'], + ) diff --git a/third_party/python_kaldi_features/test/test_sigproc.py b/third_party/python_kaldi_features/test/test_sigproc.py new file mode 100644 index 000000000..e08a346ba --- /dev/null +++ b/third_party/python_kaldi_features/test/test_sigproc.py @@ -0,0 +1,31 @@ +from python_speech_features import sigproc +import unittest +import numpy as np +import time + + +class test_case(unittest.TestCase): + def test_frame_sig(self): + n = 10000124 + frame_len = 37 + frame_step = 13 + x = np.random.rand(n) + t0 = time.time() + y_old = sigproc.framesig(x, frame_len=frame_len, frame_step=frame_step, stride_trick=False) + t1 = time.time() + y_new = sigproc.framesig(x, frame_len=frame_len, frame_step=frame_step, stride_trick=True) + t_new = time.time() - t1 + t_old = t1 - t0 + self.assertTupleEqual(y_old.shape, y_new.shape) + np.testing.assert_array_equal(y_old, y_new) + self.assertLess(t_new, t_old) + print('new run time %3.2f < %3.2f sec' % (t_new, t_old)) + + def test_rolling(self): + x = np.arange(10) + y = 
sigproc.rolling_window(x, window=4, step=3) + y_expected = np.array([[0, 1, 2, 3], + [3, 4, 5, 6], + [6, 7, 8, 9]] + ) + y = np.testing.assert_array_equal(y, y_expected) diff --git a/utils/compute_mean_std.py b/utils/compute_mean_std.py index 948e18a68..780568f99 100644 --- a/utils/compute_mean_std.py +++ b/utils/compute_mean_std.py @@ -24,7 +24,7 @@ from deepspeech.utils.utility import print_arguments parser = argparse.ArgumentParser(description=__doc__) add_arg = functools.partial(add_arguments, argparser=parser) # yapf: disable -add_arg('num_samples', int, 2000, "# of samples to for statistics.") +add_arg('num_samples', int, -1, "# of samples to for statistics.") add_arg('specgram_type', str, 'linear', "Audio feature type. Options: linear, mfcc, fbank.",