diff --git a/.notebook/compute_cmvn_loader_test.ipynb b/.notebook/compute_cmvn_loader_test.ipynb index 916f7c414..2b0a8b75f 100644 --- a/.notebook/compute_cmvn_loader_test.ipynb +++ b/.notebook/compute_cmvn_loader_test.ipynb @@ -10,13 +10,13 @@ "name": "stdout", "output_type": "stream", "text": [ - "/workspace/DeepSpeech-2.x\n" + "/home/ssd5/zhanghui/DeepSpeech2.x\n" ] }, { "data": { "text/plain": [ - "'/workspace/DeepSpeech-2.x'" + "'/home/ssd5/zhanghui/DeepSpeech2.x'" ] }, "execution_count": 1, @@ -39,68 +39,15 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 6, "id": "patient-convention", "metadata": {}, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/paddle/fluid/layers/utils.py:26: DeprecationWarning: `np.int` is a deprecated alias for the builtin `int`. To silence this warning, use `int` by itself. Doing this will not modify any behavior and is safe. When replacing `np.int`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. 
If you wish to review your current use, check the release note link for additional information.\n", - "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n", - " def convert_to_list(value, n, name, dtype=np.int):\n", - "register user softmax to paddle, remove this when fixed!\n", - "register user log_softmax to paddle, remove this when fixed!\n", - "register user sigmoid to paddle, remove this when fixed!\n", - "register user log_sigmoid to paddle, remove this when fixed!\n", - "register user relu to paddle, remove this when fixed!\n", - "override cat of paddle if exists or register, remove this when fixed!\n", - "override item of paddle.Tensor if exists or register, remove this when fixed!\n", - "override long of paddle.Tensor if exists or register, remove this when fixed!\n", - "override new_full of paddle.Tensor if exists or register, remove this when fixed!\n", - "override eq of paddle.Tensor if exists or register, remove this when fixed!\n", - "override eq of paddle if exists or register, remove this when fixed!\n", - "override contiguous of paddle.Tensor if exists or register, remove this when fixed!\n", - "override size of paddle.Tensor (`to_static` do not process `size` property, maybe some `paddle` api dependent on it), remove this when fixed!\n", - "register user view to paddle.Tensor, remove this when fixed!\n", - "register user view_as to paddle.Tensor, remove this when fixed!\n", - "register user masked_fill to paddle.Tensor, remove this when fixed!\n", - "register user masked_fill_ to paddle.Tensor, remove this when fixed!\n", - "register user fill_ to paddle.Tensor, remove this when fixed!\n", - "register user repeat to paddle.Tensor, remove this when fixed!\n", - "register user softmax to paddle.Tensor, remove this when fixed!\n", - "register user sigmoid to paddle.Tensor, remove this when fixed!\n", - "register user relu to paddle.Tensor, remove this when fixed!\n", - "register 
user type_as to paddle.Tensor, remove this when fixed!\n", - "register user to to paddle.Tensor, remove this when fixed!\n", - "register user float to paddle.Tensor, remove this when fixed!\n", - "register user tolist to paddle.Tensor, remove this when fixed!\n", - "register user glu to paddle.nn.functional, remove this when fixed!\n", - "override ctc_loss of paddle.nn.functional if exists, remove this when fixed!\n", - "register user Module to paddle.nn, remove this when fixed!\n", - "register user ModuleList to paddle.nn, remove this when fixed!\n", - "register user GLU to paddle.nn, remove this when fixed!\n", - "register user ConstantPad2d to paddle.nn, remove this when fixed!\n", - "register user export to paddle.jit, remove this when fixed!\n", - "/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/scipy/fftpack/__init__.py:103: DeprecationWarning: The module numpy.dual is deprecated. Instead of using dual, use the functions directly from numpy or scipy.\n", - " from numpy.dual import register_func\n", - "/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/scipy/special/orthogonal.py:81: DeprecationWarning: `np.int` is a deprecated alias for the builtin `int`. To silence this warning, use `int` by itself. Doing this will not modify any behavior and is safe. When replacing `np.int`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.\n", - "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n", - " from numpy import (exp, inf, pi, sqrt, floor, sin, cos, around, int,\n", - "/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/numba/core/types/__init__.py:108: DeprecationWarning: `np.long` is a deprecated alias for `np.compat.long`. To silence this warning, use `np.compat.long` by itself. 
In the likely event your code does not need to work on Python 2 you can use the builtin `int` for which `np.compat.long` is itself an alias. Doing this will not modify any behaviour and is safe. When replacing `np.long`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.\n", - "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n", - " long_ = _make_signed(np.long)\n", - "/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/numba/core/types/__init__.py:109: DeprecationWarning: `np.long` is a deprecated alias for `np.compat.long`. To silence this warning, use `np.compat.long` by itself. In the likely event your code does not need to work on Python 2 you can use the builtin `int` for which `np.compat.long` is itself an alias. Doing this will not modify any behaviour and is safe. When replacing `np.long`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. 
If you wish to review your current use, check the release note link for additional information.\n", - "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n", - " ulong = _make_unsigned(np.long)\n" - ] - }, { "name": "stdout", "output_type": "stream", "text": [ - "Namespace(delta_delta=False, feat_dim=13, manifest_path='examples/aishell/s1/data/manifest.train.raw', num_samples=-1, num_workers=1, output_path='data/librispeech/mean_std.npz', sample_rate=16000, specgram_type='linear', stride_ms=10.0, window_ms=20.0)\n" + "Namespace(delta_delta=False, feat_dim=80, manifest_path='examples/aishell/s1/data/manifest.train.raw', num_samples=-1, num_workers=16, output_path='data/librispeech/mean_std.npz', sample_rate=16000, specgram_type='fbank', stride_ms=10.0, window_ms=25.0)\n" ] } ], @@ -119,21 +66,21 @@ "# yapf: disable\n", "add_arg('num_samples', int, -1, \"# of samples to for statistics.\")\n", "add_arg('specgram_type', str,\n", - " 'linear',\n", + " 'fbank',\n", " \"Audio feature type. 
Options: linear, mfcc, fbank.\",\n", " choices=['linear', 'mfcc', 'fbank'])\n", - "add_arg('feat_dim', int, 13, \"Audio feature dim.\")\n", + "add_arg('feat_dim', int, 80, \"Audio feature dim.\")\n", "add_arg('delta_delta', bool,\n", " False,\n", " \"Audio feature with delta delta.\")\n", "add_arg('stride_ms', float, 10.0, \"stride length in ms.\")\n", - "add_arg('window_ms', float, 20.0, \"stride length in ms.\")\n", + "add_arg('window_ms', float, 25.0, \"stride length in ms.\")\n", "add_arg('sample_rate', int, 16000, \"target sample rate.\")\n", "add_arg('manifest_path', str,\n", " 'examples/aishell/s1/data/manifest.train.raw',\n", " \"Filepath of manifest to compute normalizer's mean and stddev.\")\n", "add_arg('num_workers',\n", - " default=1,\n", + " default=16,\n", " type=int,\n", " help='num of subprocess workers for processing')\n", "add_arg('output_path', str,\n", @@ -146,7 +93,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 7, "id": "enormous-currency", "metadata": {}, "outputs": [], @@ -213,67 +160,134 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 5, "id": "armed-semester", "metadata": {}, "outputs": [ { - "name": "stderr", - "output_type": "stream", - "text": [ - "Process Process-2:\n", - "Traceback (most recent call last):\n", - " File \"/usr/local/lib/python3.7/multiprocessing/process.py\", line 297, in _bootstrap\n", - " self.run()\n", - " File \"/usr/local/lib/python3.7/multiprocessing/process.py\", line 99, in run\n", - " self._target(*self._args, **self._kwargs)\n", - " File \"/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/paddle/fluid/dataloader/dataloader_iter.py\", line 463, in _worker_loop\n", - " six.reraise(*sys.exc_info())\n", - " File \"/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/six.py\", line 703, in reraise\n", - " raise value\n", - " File 
\"/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/paddle/fluid/dataloader/dataloader_iter.py\", line 446, in _worker_loop\n", - " for s in sample:\n", - "TypeError: 'int' object is not iterable\n", - "2021-04-20 07:43:09,866 - ERROR - DataLoader reader thread raised an exception!\n" - ] - }, - { - "ename": "SystemError", - "evalue": "(Fatal) Blocking queue is killed because the data reader raises an exception.\n [Hint: Expected killed_ != true, but received killed_:1 == true:1.] (at /paddle/paddle/fluid/operators/reader/blocking_queue.h:158)\n", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mSystemError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 38\u001b[0m \u001b[0mwav_number\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 39\u001b[0m \u001b[0;31m# for i, batch in enumerate(data_loader()):\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 40\u001b[0;31m \u001b[0;32mfor\u001b[0m \u001b[0mbatch\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mdata_loader\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 41\u001b[0m \u001b[0mnumber\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmean_stat\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvar_stat\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mbatch\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 42\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mi\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/paddle/fluid/dataloader/dataloader_iter.py\u001b[0m in \u001b[0;36m__next__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 777\u001b[0m 
\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 778\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0min_dygraph_mode\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 779\u001b[0;31m \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_reader\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread_next_var_list\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 780\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 781\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_return_list\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mSystemError\u001b[0m: (Fatal) Blocking queue is killed because the data reader raises an exception.\n [Hint: Expected killed_ != true, but received killed_:1 == true:1.] (at /paddle/paddle/fluid/operators/reader/blocking_queue.h:158)\n" - ] - }, - { - "name": "stderr", + "name": "stdout", "output_type": "stream", "text": [ - "Exception in thread Thread-5:\n", - "Traceback (most recent call last):\n", - " File \"/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/paddle/fluid/dataloader/dataloader_iter.py\", line 684, in _get_data\n", - " data = self._data_queue.get(timeout=self._timeout)\n", - " File \"/usr/local/lib/python3.7/multiprocessing/queues.py\", line 105, in get\n", - " raise Empty\n", - "_queue.Empty\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/usr/local/lib/python3.7/threading.py\", line 917, in _bootstrap_inner\n", - " self.run()\n", - " File \"/usr/local/lib/python3.7/threading.py\", line 865, in run\n", - " self._target(*self._args, **self._kwargs)\n", - " File 
\"/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/paddle/fluid/dataloader/dataloader_iter.py\", line 616, in _thread_loop\n", - " batch = self._get_data()\n", - " File \"/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/paddle/fluid/dataloader/dataloader_iter.py\", line 700, in _get_data\n", - " \"pids: {}\".format(len(failed_workers), pids))\n", - "RuntimeError: DataLoader 1 workers exit unexpectedly, pids: 40561\n", - "\n" + "process 1000 wavs,450739 frames\n", + "process 2000 wavs,887447 frames\n", + "process 3000 wavs,1354148 frames\n", + "process 4000 wavs,1816494 frames\n", + "process 5000 wavs,2359211 frames\n", + "process 6000 wavs,2828455 frames\n", + "process 7000 wavs,3276186 frames\n", + "process 8000 wavs,3692234 frames\n", + "process 9000 wavs,4139360 frames\n", + "process 10000 wavs,4591528 frames\n", + "process 11000 wavs,5020114 frames\n", + "process 12000 wavs,5459523 frames\n", + "process 13000 wavs,5899534 frames\n", + "process 14000 wavs,6323242 frames\n", + "process 15000 wavs,6736597 frames\n", + "process 16000 wavs,7207686 frames\n", + "process 17000 wavs,7637800 frames\n", + "process 18000 wavs,8093004 frames\n", + "process 19000 wavs,8529518 frames\n", + "process 20000 wavs,8906022 frames\n", + "process 21000 wavs,9352652 frames\n", + "process 22000 wavs,9807495 frames\n", + "process 23000 wavs,10247938 frames\n", + "process 24000 wavs,10700011 frames\n", + "process 25000 wavs,11126134 frames\n", + "process 26000 wavs,11558061 frames\n", + "process 27000 wavs,12010359 frames\n", + "process 28000 wavs,12470938 frames\n", + "process 29000 wavs,12916013 frames\n", + "process 30000 wavs,13345816 frames\n", + "process 31000 wavs,13752365 frames\n", + "process 32000 wavs,14174801 frames\n", + "process 33000 wavs,14642170 frames\n", + "process 34000 wavs,15053557 frames\n", + "process 35000 wavs,15531890 frames\n", + "process 36000 wavs,16022711 frames\n", + "process 37000 wavs,16437688 frames\n", + "process 38000 
wavs,16859517 frames\n", + "process 39000 wavs,17307676 frames\n", + "process 40000 wavs,17796629 frames\n", + "process 41000 wavs,18264151 frames\n", + "process 42000 wavs,18711898 frames\n", + "process 43000 wavs,19159890 frames\n", + "process 44000 wavs,19576435 frames\n", + "process 45000 wavs,19992793 frames\n", + "process 46000 wavs,20464449 frames\n", + "process 47000 wavs,20886021 frames\n", + "process 48000 wavs,21317318 frames\n", + "process 49000 wavs,21738034 frames\n", + "process 50000 wavs,22171890 frames\n", + "process 51000 wavs,22622238 frames\n", + "process 52000 wavs,23100734 frames\n", + "process 53000 wavs,23526901 frames\n", + "process 54000 wavs,23969746 frames\n", + "process 55000 wavs,24418691 frames\n", + "process 56000 wavs,24862546 frames\n", + "process 57000 wavs,25336448 frames\n", + "process 58000 wavs,25778435 frames\n", + "process 59000 wavs,26216199 frames\n", + "process 60000 wavs,26694692 frames\n", + "process 61000 wavs,27148978 frames\n", + "process 62000 wavs,27617088 frames\n", + "process 63000 wavs,28064946 frames\n", + "process 64000 wavs,28519843 frames\n", + "process 65000 wavs,28989722 frames\n", + "process 66000 wavs,29470156 frames\n", + "process 67000 wavs,29952931 frames\n", + "process 68000 wavs,30360555 frames\n", + "process 69000 wavs,30797929 frames\n", + "process 70000 wavs,31218227 frames\n", + "process 71000 wavs,31663934 frames\n", + "process 72000 wavs,32107468 frames\n", + "process 73000 wavs,32541943 frames\n", + "process 74000 wavs,33010702 frames\n", + "process 75000 wavs,33448082 frames\n", + "process 76000 wavs,33886812 frames\n", + "process 77000 wavs,34338108 frames\n", + "process 78000 wavs,34761495 frames\n", + "process 79000 wavs,35199730 frames\n", + "process 80000 wavs,35669630 frames\n", + "process 81000 wavs,36122402 frames\n", + "process 82000 wavs,36604561 frames\n", + "process 83000 wavs,37085552 frames\n", + "process 84000 wavs,37517500 frames\n", + "process 85000 wavs,37987196 frames\n", 
+ "process 86000 wavs,38415721 frames\n", + "process 87000 wavs,38889467 frames\n", + "process 88000 wavs,39337809 frames\n", + "process 89000 wavs,39792342 frames\n", + "process 90000 wavs,40287946 frames\n", + "process 91000 wavs,40719461 frames\n", + "process 92000 wavs,41178919 frames\n", + "process 93000 wavs,41659635 frames\n", + "process 94000 wavs,42132985 frames\n", + "process 95000 wavs,42584564 frames\n", + "process 96000 wavs,43018598 frames\n", + "process 97000 wavs,43480662 frames\n", + "process 98000 wavs,43973670 frames\n", + "process 99000 wavs,44448190 frames\n", + "process 100000 wavs,44935034 frames\n", + "process 101000 wavs,45379812 frames\n", + "process 102000 wavs,45821207 frames\n", + "process 103000 wavs,46258420 frames\n", + "process 104000 wavs,46743733 frames\n", + "process 105000 wavs,47206922 frames\n", + "process 106000 wavs,47683041 frames\n", + "process 107000 wavs,48122809 frames\n", + "process 108000 wavs,48594623 frames\n", + "process 109000 wavs,49086358 frames\n", + "process 110000 wavs,49525568 frames\n", + "process 111000 wavs,49985820 frames\n", + "process 112000 wavs,50428262 frames\n", + "process 113000 wavs,50897957 frames\n", + "process 114000 wavs,51344589 frames\n", + "process 115000 wavs,51774621 frames\n", + "process 116000 wavs,52243372 frames\n", + "process 117000 wavs,52726025 frames\n", + "process 118000 wavs,53170026 frames\n", + "process 119000 wavs,53614141 frames\n", + "process 120000 wavs,54071271 frames\n" ] } ], @@ -317,8 +331,8 @@ " all_var_stat = None\n", " all_number = 0\n", " wav_number = 0\n", - " # for i, batch in enumerate(data_loader()):\n", - " for batch in data_loader():\n", + " for i, batch in enumerate(data_loader()):\n", + " #for batch in data_loader():\n", " number, mean_stat, var_stat = batch\n", " if i == 0:\n", " all_mean_stat = mean_stat\n", @@ -342,11 +356,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "id": "danish-executive", "metadata": {}, - 
"outputs": [], - "source": [] + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'mean_stat': [-813852467.7953382, -769025957.9140725, -809499593.411409, -774700574.014532, -750961217.5896736, -760564397.2864963, -805662399.3771614, -843490965.4231446, -850242081.9416809, -857678651.504435, -879067453.9826999, -908602072.3856701, -936850957.7187386, -957242686.489041, -968425442.0916103, -972687545.5953809, -980383731.7683417, -991533337.6343704, -1001966818.1164789, -1010334169.7486078, -1016855066.9099333, -1022176245.7021623, -1025700476.4788507, -1030678878.3195274, -1037075963.124199, -1042705719.0195516, -1047422212.6492896, -1049003537.271861, -1050314833.7453628, -1050772191.0204058, -1050010034.9948177, -1050436065.1336465, -1053327181.7978873, -1058710548.2036785, -1065950852.4966162, -1071709705.0060445, -1077682778.259181, -1083371045.272074, -1089708906.2657735, -1096312217.7865202, -1101089858.8364556, -1104965332.4332569, -1107791702.5223634, -1109431075.2374773, -1110066333.0280604, -1110382732.0722318, -1110480306.3793216, -1110203297.7110727, -1109972534.3583376, -1109378081.8792782, -1108212059.413654, -1107235713.2041805, -1106973581.9280007, -1107352339.7860134, -1108730029.862537, -1110425202.83704, -1113220669.4552443, -1115887535.4870913, -1118105356.3628063, -1120001376.8503075, -1121135822.320366, -1122265971.8751016, -1123990217.401155, -1125786729.6230593, -1127784957.2745507, -1129180108.9033566, -1132000461.6688302, -1134675829.8190608, -1137652487.5164194, -1141755948.0463965, -1145340901.5468378, -1148637682.593287, -1151755522.470022, -1154981643.2268832, -1157417488.840151, -1161240429.0989249, -1165411128.671642, -1170521097.1034513, -1176307165.5109766, -1183456865.0039694, -1190535938.6591117, -1197946309.0472982, -1203596565.037139, -1207563038.1241052, -1209707561.5829782, -1211407066.2452552, -1211884576.9201162, -1212778872.005509, -1214041413.8080075, -1215367953.1745043, -1216850831.482193, 
-1217678325.5351057, -1218854289.54188, -1219325064.8610544, -1219080344.7580786, -1218541313.657531, -1217889833.2067819, -1216552930.1654336, -1216423777.4113154, -1216575252.225508, -1217075384.9826024, -1217391577.901724, -1217838974.57273, -1218131805.6054134, -1218294889.7465532, -1218566666.1755593, -1218790537.5519717, -1218748668.9956846, -1218603191.4941735, -1218004566.4348054, -1217312410.127734, -1217207493.9522285, -1217284002.3834674, -1217644312.51745, -1218039821.6444128, -1218721811.6269798, -1219121088.9265897, -1219014460.8090584, -1218530127.6776083, -1217952335.451711, -1217316073.8666434, -1217035380.1151958, -1216636431.2964456, -1216257015.2945514, -1215658496.1208403, -1215097272.0976632, -1214669859.2064147, -1214593853.4809475, -1214599475.7838447, -1214575440.823035, -1214158828.8008435, -1213482920.2673717, -1212476577.5897374, -1211251374.2198513, -1210284855.590475, -1209302456.065669, -1209106252.6625297, -1209373211.5146718, -1209689421.7984035, -1210021342.495856, -1210650609.3592312, -1211428521.3900626, -1212616111.4257205, -1213820075.2948189, -1215320588.7144456, -1217175082.2739282, -1219703351.4585004, -1222007827.120464, -1224637375.5900724, -1228367798.912171, -1234853879.862459, -1247222219.867692, -1268562808.1616178, -1302034822.9569275, -1347823631.0776038, -1402753916.9445229, -1458826717.3262982, -1505843092.0970414, -1534278782.249077, -1543955545.8994718, -1600409154.893352], 'var_stat': [12665413908.91729, 11145088801.244318, 12567119446.035736, 11758392758.06822, 11200687982.736668, 11551903443.711124, 12880777868.435602, 14084854368.236998, 14394011058.866192, 14678818621.277662, 15346278722.626339, 16268053979.757076, 17191705347.854794, 17877540386.548733, 18251857849.077663, 18392628178.710472, 18645534548.4045, 19018598212.22902, 19366711357.782673, 19655730286.72857, 19890681996.786858, 20094163350.461906, 20227774955.225887, 20423525628.66887, 20669928826.76939, 20882313568.247944, 21062392676.270527, 
21126648821.879055, 21185210734.751118, 21209014745.520447, 21182293842.91236, 21197433134.875977, 21302147790.662144, 21504666657.651955, 21781818550.89697, 21996170165.145462, 22217169779.096275, 22431161762.176693, 22672708668.38104, 22922683961.072956, 23101137011.201683, 23249680793.556847, 23358894817.24979, 23422895267.919228, 23449479198.303394, 23464433357.671055, 23469197140.124596, 23459013479.866177, 23447935341.542686, 23422585038.052387, 23375601301.949135, 23338397991.497776, 23329682884.21905, 23348002892.39853, 23406274659.89975, 23478242518.92228, 23592891371.876236, 23703885161.772205, 23797158601.65954, 23875230355.66992, 23918333664.3946, 23968582109.371258, 24040547318.081936, 24112364295.110058, 24189973697.612144, 24242165205.640236, 24364255205.82311, 24472408850.760197, 24590211203.05312, 24763026764.005527, 24909192634.69144, 25043438176.23281, 25167141466.500504, 25297108031.48665, 25395377064.0999, 25550930772.86505, 25721404827.10336, 25931101211.156487, 26168988710.098465, 26465528802.762875, 26760033029.443783, 27075408488.605213, 27316626931.655052, 27487275073.52796, 27579518448.2332, 27652308513.875782, 27673412508.45838, 27711509210.702576, 27767312240.641487, 27827464683.295334, 27894794590.957966, 27935988489.16511, 27992337099.891083, 28019655483.58796, 28014286886.252903, 27996189233.857716, 27973078840.875465, 27920045013.68706, 27917103211.22359, 27927566165.64652, 27953525818.61368, 27973386070.140022, 27999317832.502476, 28019494120.641834, 28033010746.452637, 28051086123.896503, 28066195174.191753, 28068570977.318798, 28064890246.85437, 28042424375.860577, 28015849655.869568, 28014812222.566605, 28021039053.959835, 28039270607.169422, 28058271295.10199, 28088976520.10178, 28107824988.74732, 28105633030.784756, 28087681357.818607, 28065484299.963837, 28039555887.004284, 28028214431.52875, 28011714871.929447, 27995603790.480755, 27970125897.561134, 27946436130.511288, 27929044772.5522, 27926612443.390316, 
27926256324.387302, 27924771848.71099, 27905526922.390133, 27876268519.168198, 27832532606.552593, 27779497699.976765, 27737034351.907337, 27692129825.179924, 27684252911.371475, 27698882622.878677, 27712387157.27985, 27726474638.933037, 27752647691.051613, 27786197932.382797, 27836378752.662235, 27887415700.334576, 27949784230.702114, 28028117657.84245, 28136313097.200474, 28234098926.207996, 28345845477.25874, 28507222800.146496, 28793832339.90449, 29350765483.070816, 30328262350.231213, 31894930713.76519, 34093669067.422382, 36801959396.22739, 39638995447.49344, 42088579425.44825, 43616108982.85117, 44152063315.31461, 47464832889.5967], 'frame_num': 54129649}\n" + ] + } + ], + "source": [ + "print(cmvn_info)" + ] }, { "cell_type": "code", @@ -358,9 +382,389 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "id": "dominant-abuse", "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " \n", + "process 1000 wavs,450240 frames\n", + " \n", + "process 2000 wavs,886411 frames\n", + " \n", + "process 3000 wavs,1352580 frames\n", + " \n", + "process 4000 wavs,1814397 frames\n", + " \n", + "process 5000 wavs,2356587 frames\n", + " \n", + "process 6000 wavs,2825310 frames\n", + " \n", + "process 7000 wavs,3272506 frames\n", + " \n", + "process 8000 wavs,3688045 frames\n", + " \n", + "process 9000 wavs,4134669 frames\n", + " \n", + "process 10000 wavs,4586357 frames\n", + " \n", + "process 11000 wavs,5014429 frames\n", + " \n", + "process 12000 wavs,5453334 frames\n", + " \n", + "process 13000 wavs,5892888 frames\n", + " \n", + "process 14000 wavs,6316059 frames\n", + " \n", + "process 15000 wavs,6728870 frames\n", + " \n", + "process 16000 wavs,7199442 frames\n", + " \n", + "process 17000 wavs,7629055 frames\n", + " \n", + "process 18000 wavs,8083729 frames\n", + " \n", + "process 19000 wavs,8519732 frames\n", + " \n", + "process 20000 wavs,8895694 frames\n", + " \n", + "process 21000 wavs,9341778 
frames\n", + " \n", + "process 22000 wavs,9796126 frames\n", + " \n", + "process 23000 wavs,10236057 frames\n", + " \n", + "process 24000 wavs,10687461 frames\n", + " \n", + "process 25000 wavs,11113082 frames\n", + " \n", + "process 26000 wavs,11544482 frames\n", + " \n", + "process 27000 wavs,11996273 frames\n", + " \n", + "process 28000 wavs,12456350 frames\n", + " \n", + "process 29000 wavs,12900895 frames\n", + " \n", + "process 30000 wavs,13330353 frames\n", + " \n", + "process 31000 wavs,13736568 frames\n", + " \n", + "process 32000 wavs,14158472 frames\n", + " \n", + "process 33000 wavs,14625316 frames\n", + " \n", + "process 34000 wavs,15036206 frames\n", + " \n", + "process 35000 wavs,15514001 frames\n", + " \n", + "process 36000 wavs,16004323 frames\n", + " \n", + "process 37000 wavs,16418799 frames\n", + " \n", + "process 38000 wavs,16840100 frames\n", + " \n", + "process 39000 wavs,17287752 frames\n", + " \n", + "process 40000 wavs,17776206 frames\n", + " \n", + "process 41000 wavs,18243209 frames\n", + " \n", + "process 42000 wavs,18690449 frames\n", + " \n", + "process 43000 wavs,19137940 frames\n", + " \n", + "process 44000 wavs,19553966 frames\n", + " \n", + "process 45000 wavs,19969813 frames\n", + " \n", + "process 46000 wavs,20440963 frames\n", + " \n", + "process 47000 wavs,20862022 frames\n", + " \n", + "process 48000 wavs,21292801 frames\n", + " \n", + "process 49000 wavs,21713004 frames\n", + " \n", + "process 50000 wavs,22146346 frames\n", + " \n", + "process 51000 wavs,22596172 frames\n", + " \n", + "process 52000 wavs,23074160 frames\n", + " \n", + "process 53000 wavs,23499823 frames\n", + " \n", + "process 54000 wavs,23942151 frames\n", + " \n", + "process 55000 wavs,24390566 frames\n", + " \n", + "process 56000 wavs,24833905 frames\n", + " \n", + "process 57000 wavs,25307270 frames\n", + " \n", + "process 58000 wavs,25748720 frames\n", + " \n", + "process 59000 wavs,26185964 frames\n", + " \n", + "process 60000 wavs,26663953 frames\n", 
+ " \n", + "process 61000 wavs,27117720 frames\n", + " \n", + "process 62000 wavs,27585349 frames\n", + " \n", + "process 63000 wavs,28032693 frames\n", + " \n", + "process 64000 wavs,28487074 frames\n", + " \n", + "process 65000 wavs,28956462 frames\n", + " \n", + "process 66000 wavs,29436358 frames\n", + " \n", + "process 67000 wavs,29918569 frames\n", + " \n", + "process 68000 wavs,30325682 frames\n", + " \n", + "process 69000 wavs,30762528 frames\n", + " \n", + "process 70000 wavs,31182319 frames\n", + " \n", + "process 71000 wavs,31627526 frames\n", + " \n", + "process 72000 wavs,32070556 frames\n", + " \n", + "process 73000 wavs,32504534 frames\n", + " \n", + "process 74000 wavs,32972775 frames\n", + " \n", + "process 75000 wavs,33409637 frames\n", + " \n", + "process 76000 wavs,33847861 frames\n", + " \n", + "process 77000 wavs,34298647 frames\n", + " \n", + "process 78000 wavs,34721536 frames\n", + " \n", + "process 79000 wavs,35159236 frames\n", + " \n", + "process 80000 wavs,35628628 frames\n", + " \n", + "process 81000 wavs,36080909 frames\n", + " \n", + "process 82000 wavs,36562496 frames\n", + " \n", + "process 83000 wavs,37042976 frames\n", + " \n", + "process 84000 wavs,37474403 frames\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " \n", + "process 85000 wavs,37943596 frames\n", + " \n", + "process 86000 wavs,38371620 frames\n", + " \n", + "process 87000 wavs,38844874 frames\n", + " \n", + "process 88000 wavs,39292686 frames\n", + " \n", + "process 89000 wavs,39746715 frames\n", + " \n", + "process 90000 wavs,40241800 frames\n", + " \n", + "process 91000 wavs,40672817 frames\n", + " \n", + "process 92000 wavs,41131773 frames\n", + " \n", + "process 93000 wavs,41612001 frames\n", + " \n", + "process 94000 wavs,42084822 frames\n", + " \n", + "process 95000 wavs,42535878 frames\n", + " \n", + "process 96000 wavs,42969365 frames\n", + " \n", + "process 97000 wavs,43430890 frames\n", + " \n", + "process 98000 wavs,43923378 
frames\n", + " \n", + "process 99000 wavs,44397370 frames\n", + " \n", + "process 100000 wavs,44883695 frames\n", + " \n", + "process 101000 wavs,45327968 frames\n", + " \n", + "process 102000 wavs,45768860 frames\n", + " \n", + "process 103000 wavs,46205602 frames\n", + " \n", + "process 104000 wavs,46690407 frames\n", + " \n", + "process 105000 wavs,47153089 frames\n", + " \n", + "process 106000 wavs,47628699 frames\n", + " \n", + "process 107000 wavs,48067945 frames\n", + " \n", + "process 108000 wavs,48539256 frames\n", + " \n", + "process 109000 wavs,49030485 frames\n", + " \n", + "process 110000 wavs,49469189 frames\n", + " \n", + "process 111000 wavs,49928968 frames\n", + " \n", + "process 112000 wavs,50370921 frames\n", + " \n", + "process 113000 wavs,50840090 frames\n", + " \n", + "process 114000 wavs,51286249 frames\n", + " \n", + "process 115000 wavs,51715786 frames\n", + " \n", + "process 116000 wavs,52184017 frames\n", + " \n", + "process 117000 wavs,52666156 frames\n", + " \n", + "process 118000 wavs,53109645 frames\n", + " \n", + "process 119000 wavs,53553253 frames\n", + " \n", + "process 120000 wavs,54009877 frames\n", + "{'mean_stat': [700612678.1184504, 704246512.9321843, 720430663.1822729, 754033269.0474415, 798737761.616614, 829467218.4204571, 851246702.9426627, 862261185.2661449, 859339943.6923889, 846303730.8696194, 832995109.605447, 823196536.6029147, 832626008.2569772, 845571326.1936859, 848801373.0562981, 846503549.328017, 836774344.5500796, 823481091.0445303, 820728368.2518216, 804571348.4957463, 795306095.0083207, 811729024.2415155, 805734803.5703195, 813076782.1959459, 806620199.406499, 809655573.8886961, 804371708.9347517, 809272248.6085774, 810322689.7490631, 814294131.1973915, 816262716.0476038, 816213124.2411841, 817158473.4380915, 821414211.5629157, 827408091.5728914, 834353896.0519086, 840094990.3467333, 842613218.6554606, 842070761.1727513, 834970952.5260613, 837020570.8200948, 829592602.7833654, 830116543.8893851, 
829482316.3881509, 833397219.4597517, 839251633.3120549, 845475010.4718693, 852378426.7183967, 859563981.8633184, 866063840.5523493, 867790921.9978689, 868215100.5962687, 869683066.032885, 872467375.6674014, 873097681.1780069, 873025823.0543871, 869897292.7201596, 866386426.3869117, 863166726.7256871, 854653071.2244718, 842402803.9000899, 830838253.4144138, 830143002.3536818, 831492285.0310817, 833304371.8781006, 838896092.8621838, 843866088.9578133, 847316792.1429776, 851038022.3643295, 855931698.0149751, 859320543.9795249, 863031001.3470656, 868325062.1832993, 873626971.0115026, 878726636.924209, 884861725.972504, 886920281.5192285, 883056006.5094173, 863719240.7255149, 773378975.9476194], 'var_stat': [9237018652.657722, 9417257721.82426, 10105084297.159702, 11071318522.587782, 12422783727.426847, 13400306419.784964, 14148498843.406874, 14576436982.89939, 14529009036.494726, 14105645932.596651, 13682988821.478252, 13413013425.088106, 13764134927.293928, 14233704806.737064, 14361631309.367067, 14281358385.45644, 13939662689.213865, 13496884231.929493, 13382566162.783987, 12871350930.6626, 12576198160.876635, 13051463889.56708, 12859205935.513906, 13053861416.098743, 12830323588.550724, 12886405923.897238, 12708529922.84171, 12847306110.231739, 12880398489.53404, 13002566299.565536, 13066708060.463543, 13064231286.858614, 13088983337.353497, 13221393824.891022, 13412425607.755072, 13631485149.777075, 13807797519.156103, 13877277485.033077, 13848613909.96762, 13609176326.2529, 13649815250.130072, 13397698404.696907, 13388964704.359968, 13354326914.968012, 13469861474.898457, 13652539440.283333, 13846837321.329163, 14062143714.601675, 14292571198.61228, 14504626563.299246, 14563864749.132776, 14579720287.991764, 14626700787.353922, 14716185568.128899, 14728532777.28015, 14719101187.113443, 14607945896.239174, 14478517828.531614, 14355110561.681187, 14057430280.249746, 13634284490.879377, 13248236002.494394, 13217602306.335958, 13257856701.946049, 13323688441.072674, 
import random

import numpy as np
import paddle
from paddle.io import DataLoader
from paddle.io import Dataset

from deepspeech.frontend.audio import AudioSegment
from deepspeech.frontend.utility import load_cmvn
from deepspeech.frontend.utility import read_manifest

# NOTE(review): `AugmentationPipeline`, `AudioFeaturizer` and `args` are used
# below but are not defined in this cell; presumably an earlier cell provides
# them -- confirm before running on a fresh kernel.


# https://github.com/PaddlePaddle/Paddle/pull/31481
class CollateFunc(object):
    """Collate function for AudioDataset.

    Rather than stacking samples, it reduces one batch of manifest items to
    the partial sums needed for CMVN statistics.
    """

    def __init__(self, feature_func):
        """Store the callable that turns an AudioSegment into a feature array.

        Args:
            feature_func: callable applied to each loaded AudioSegment.
        """
        self.feature_func = feature_func

    def __call__(self, batch):
        """Accumulate per-batch sums over every item in ``batch``.

        Args:
            batch: iterable of manifest items; ``item['feat']`` is the audio
                path handed to ``AudioSegment.from_file``.

        Returns:
            tuple ``(number, mean_stat, var_stat)``: the summed size of
            axis 1 of every feature array, the sum of features along axis 1,
            and the sum of squared features along axis 1. Both stats are
            ``None`` when ``batch`` is empty.
        """
        mean_stat = None
        var_stat = None
        number = 0
        for item in batch:
            audioseg = AudioSegment.from_file(item['feat'])
            # The original author annotates the feature layout as (D, T);
            # axis 1 is therefore treated as the frame axis.
            feat = self.feature_func(audioseg)

            sums = np.sum(feat, axis=1)
            square_sums = np.sum(np.square(feat), axis=1)
            if mean_stat is None:
                # The first item seeds both accumulators together.
                mean_stat = sums
                var_stat = square_sums
            else:
                mean_stat += sums
                var_stat += square_sums

            number += feat.shape[1]
        return number, mean_stat, var_stat


class AudioDataset(Dataset):
    """Dataset over the entries of a manifest, optionally a random subset."""

    def __init__(self, manifest_path, num_samples=-1, rng=None, random_seed=0):
        """Read the manifest and optionally subsample it.

        Args:
            manifest_path: path passed to ``read_manifest``.
            num_samples: number of entries to draw without replacement;
                ``-1`` keeps the whole manifest.
            rng: optional random generator providing ``choice``; when omitted
                a ``np.random.RandomState(random_seed)`` is created.
            random_seed: seed used only when ``rng`` is not supplied.
        """
        self._rng = rng if rng is not None else np.random.RandomState(random_seed)
        manifest = read_manifest(manifest_path)
        if num_samples == -1:
            sampled_manifest = manifest
        else:
            sampled_manifest = self._rng.choice(manifest, num_samples, replace=False)
        self.items = sampled_manifest

    def __len__(self):
        """Return the number of (possibly subsampled) manifest entries."""
        return len(self.items)

    def __getitem__(self, idx):
        """Return the raw manifest entry at ``idx``; loading happens in collate."""
        return self.items[idx]


# An empty JSON object is passed as the augmentation config.
augmentation_pipeline = AugmentationPipeline('{}')
audio_featurizer = AudioFeaturizer(
    specgram_type=args.specgram_type,
    feat_dim=args.feat_dim,
    delta_delta=args.delta_delta,
    stride_ms=args.stride_ms,
    window_ms=args.window_ms,
    n_fft=None,
    max_freq=None,
    target_sample_rate=args.sample_rate,
    use_dB_normalization=True,
    target_dB=-20)


def augment_and_featurize(audio_segment):
    """Run the augmentation pipeline on the segment, then featurize it."""
    augmentation_pipeline.transform_audio(audio_segment)
    return audio_featurizer.featurize(audio_segment)


collate_func = CollateFunc(augment_and_featurize)

dataset = AudioDataset(
    args.manifest_path,
    args.num_samples)

batch_size = 20
data_loader = DataLoader(
    dataset,
    batch_size=batch_size,
    shuffle=False,
    num_workers=args.num_workers,
    collate_fn=collate_func)

with paddle.no_grad():
    all_mean_stat = None
    all_var_stat = None
    all_number = 0
    wav_number = 0
    total_wavs = len(dataset)
    for batch in data_loader:
        number, mean_stat, var_stat = batch
        if all_mean_stat is None:
            all_mean_stat = mean_stat
            all_var_stat = var_stat
        else:
            all_mean_stat += mean_stat
            all_var_stat += var_stat
        all_number += number
        # The loader does not drop the last batch, so it may hold fewer than
        # batch_size items; cap the counter at the real dataset size so the
        # progress log never reports more wavs than exist.
        wav_number = min(wav_number + batch_size, total_wavs)

        if wav_number % 1000 == 0:
            print('process {} wavs,{} frames'.format(wav_number,
                                                     all_number))

if all_mean_stat is None:
    # Nothing was iterated (empty manifest or num_samples == 0); fail with a
    # clear message instead of an AttributeError on None below.
    raise ValueError('no audio was processed; cannot compute CMVN statistics')

cmvn_info = {
    'mean_stat': list(all_mean_stat.tolist()),
    'var_stat': list(all_var_stat.tolist()),
    'frame_num': all_number
}
print(cmvn_info)
"id": "unlike-search", + "metadata": {}, "outputs": [], "source": [] }