You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
PaddleSpeech/.notebook/compute_cmvn_loader_test.ipynb

794 lines
42 KiB

E2E/Streaming Transformer/Conformer ASR (#578) * add cmvn and label smoothing loss layer * add layer for transformer * add glu and conformer conv * add torch compatiable hack, mask funcs * not hack size since it exists * add test; attention * add attention, common utils, hack paddle * add audio utils * conformer batch padding mask bug fix #223 * fix typo, python infer fix rnn mem opt name error and batchnorm1d, will be available at 2.0.2 * fix ci * fix ci * add encoder * refactor egs * add decoder * refactor ctc, add ctc align, refactor ckpt, add warmup lr scheduler, cmvn utils * refactor docs * add fix * fix readme * fix bugs, refactor collator, add pad_sequence, fix ckpt bugs * fix docstring * refactor data feed order * add u2 model * refactor cmvn, test * add utils * add u2 config * fix bugs * fix bugs * fix autograd maybe has problem when using inplace operation * refactor data, build vocab; add format data * fix text featurizer * refactor build vocab * add fbank, refactor feature of speech * refactor audio feat * refactor data preprare * refactor data * model init from config * add u2 bins * flake8 * can train * fix bugs, add coverage, add scripts * test can run * fix data * speed perturb with sox * add spec aug * fix for train * fix train logitc * fix logger * log valid loss, time dataset process * using np for speed perturb, remove some debug log of grad clip * fix logger * fix build vocab * fix logger name * using module logger as default * fix * fix install * reorder imports * fix board logger * fix logger * kaldi fbank and mfcc * fix cmvn and print prarams * fix add_eos_sos and cmvn * fix cmvn compute * fix logger and cmvn * fix subsampling, label smoothing loss, remove useless * add notebook test * fix log * fix tb logger * multi gpu valid * fix log * fix log * fix config * fix compute cmvn, need paddle 2.1 * add cmvn notebook * fix layer tools * fix compute cmvn * add rtf * fix decoding * fix layer tools * fix log, add avg script * more avg and test info * fix dataset pickle problem; using 2.1 paddle; num_workers can > 0; ckpt save in exp dir;fix setup.sh; * add vimrc * refactor tiny script, add transformer and stream conf * spm demo; librisppech scripts and confs * fix log * add librispeech scripts * refactor data pipe; fix conf; fix u2 default params * fix bugs * refactor aishell scripts * fix test * fix cmvn * fix s0 scripts * fix ds2 scripts and bugs * fix dev & test dataset filter * fix dataset filter * filter dev * fix ckpt path * filter test, since librispeech will cause OOM, but all test wer will be worse, since mismatch train with test * add comment * add syllable doc * fix ds2 configs * add doc * add pypinyin tools * fix decoder using blank_id=0 * mmseg with pybind11 * format code
4 years ago
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "purple-consequence",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"/home/ssd5/zhanghui/DeepSpeech2.x\n"
]
},
{
"data": {
"text/plain": [
"'/home/ssd5/zhanghui/DeepSpeech2.x'"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%cd ..\n",
"%pwd"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "defensive-mason",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 6,
"id": "patient-convention",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Namespace(delta_delta=False, feat_dim=80, manifest_path='examples/aishell/s1/data/manifest.train.raw', num_samples=-1, num_workers=16, output_path='data/librispeech/mean_std.npz', sample_rate=16000, specgram_type='fbank', stride_ms=10.0, window_ms=25.0)\n"
]
}
],
"source": [
"import argparse\n",
"import functools\n",
"\n",
"from deepspeech.frontend.augmentor.augmentation import AugmentationPipeline\n",
"from deepspeech.frontend.featurizer.audio_featurizer import AudioFeaturizer\n",
"from deepspeech.frontend.normalizer import FeatureNormalizer\n",
"from deepspeech.utils.utility import add_arguments\n",
"from deepspeech.utils.utility import print_arguments\n",
"\n",
"parser = argparse.ArgumentParser(description=__doc__)\n",
"add_arg = functools.partial(add_arguments, argparser=parser)\n",
"# yapf: disable\n",
"add_arg('num_samples', int, -1, \"# of samples to for statistics.\")\n",
"add_arg('specgram_type', str,\n",
" 'fbank',\n",
" \"Audio feature type. Options: linear, mfcc, fbank.\",\n",
" choices=['linear', 'mfcc', 'fbank'])\n",
"add_arg('feat_dim', int, 80, \"Audio feature dim.\")\n",
"add_arg('delta_delta', bool,\n",
" False,\n",
" \"Audio feature with delta delta.\")\n",
"add_arg('stride_ms', float, 10.0, \"stride length in ms.\")\n",
"add_arg('window_ms', float, 25.0, \"stride length in ms.\")\n",
"add_arg('sample_rate', int, 16000, \"target sample rate.\")\n",
"add_arg('manifest_path', str,\n",
" 'examples/aishell/s1/data/manifest.train.raw',\n",
" \"Filepath of manifest to compute normalizer's mean and stddev.\")\n",
"add_arg('num_workers',\n",
" default=16,\n",
" type=int,\n",
" help='num of subprocess workers for processing')\n",
"add_arg('output_path', str,\n",
" 'data/librispeech/mean_std.npz',\n",
" \"Filepath of write mean and stddev to (.npz).\")\n",
"# yapf: disable\n",
"args = parser.parse_args([])\n",
"print(args)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "enormous-currency",
"metadata": {},
"outputs": [],
"source": [
"import random\n",
"\n",
"import numpy as np\n",
"import paddle\n",
"from paddle.io import DataLoader\n",
"from paddle.io import Dataset\n",
"\n",
"from deepspeech.frontend.audio import AudioSegment\n",
"from deepspeech.frontend.utility import load_cmvn\n",
"from deepspeech.frontend.utility import read_manifest\n",
"\n",
"class CollateFunc(object):\n",
" ''' Collate function for AudioDataset\n",
" '''\n",
" def __init__(self):\n",
" pass\n",
" \n",
" def __call__(self, batch):\n",
" mean_stat = None\n",
" var_stat = None\n",
" number = 0\n",
" for feat in batch:\n",
" sums = np.sum(feat, axis=1)\n",
" if mean_stat is None:\n",
" mean_stat = sums\n",
" else:\n",
" mean_stat += sums\n",
"\n",
" square_sums = np.sum(np.square(feat), axis=1)\n",
" if var_stat is None:\n",
" var_stat = square_sums\n",
" else:\n",
" var_stat += square_sums\n",
"\n",
" number += feat.shape[1]\n",
" #return paddle.to_tensor(number), paddle.to_tensor(mean_stat), paddle.to_tensor(var_stat)\n",
" return number, mean_stat, var_stat\n",
"\n",
"\n",
"class AudioDataset(Dataset):\n",
" def __init__(self, manifest_path, feature_func, num_samples=-1, rng=None):\n",
" self.feature_func = feature_func\n",
" self._rng = rng\n",
" manifest = read_manifest(manifest_path)\n",
" if num_samples == -1:\n",
" sampled_manifest = manifest\n",
" else:\n",
" sampled_manifest = self._rng.sample(manifest, num_samples)\n",
" self.items = sampled_manifest\n",
"\n",
" def __len__(self):\n",
" return len(self.items)\n",
"\n",
" def __getitem__(self, idx):\n",
" key = self.items[idx]['feat']\n",
" audioseg = AudioSegment.from_file(key)\n",
" feat = self.feature_func(audioseg) #(D, T)\n",
" return feat"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "armed-semester",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"process 1000 wavs,450739 frames\n",
"process 2000 wavs,887447 frames\n",
"process 3000 wavs,1354148 frames\n",
"process 4000 wavs,1816494 frames\n",
"process 5000 wavs,2359211 frames\n",
"process 6000 wavs,2828455 frames\n",
"process 7000 wavs,3276186 frames\n",
"process 8000 wavs,3692234 frames\n",
"process 9000 wavs,4139360 frames\n",
"process 10000 wavs,4591528 frames\n",
"process 11000 wavs,5020114 frames\n",
"process 12000 wavs,5459523 frames\n",
"process 13000 wavs,5899534 frames\n",
"process 14000 wavs,6323242 frames\n",
"process 15000 wavs,6736597 frames\n",
"process 16000 wavs,7207686 frames\n",
"process 17000 wavs,7637800 frames\n",
"process 18000 wavs,8093004 frames\n",
"process 19000 wavs,8529518 frames\n",
"process 20000 wavs,8906022 frames\n",
"process 21000 wavs,9352652 frames\n",
"process 22000 wavs,9807495 frames\n",
"process 23000 wavs,10247938 frames\n",
"process 24000 wavs,10700011 frames\n",
"process 25000 wavs,11126134 frames\n",
"process 26000 wavs,11558061 frames\n",
"process 27000 wavs,12010359 frames\n",
"process 28000 wavs,12470938 frames\n",
"process 29000 wavs,12916013 frames\n",
"process 30000 wavs,13345816 frames\n",
"process 31000 wavs,13752365 frames\n",
"process 32000 wavs,14174801 frames\n",
"process 33000 wavs,14642170 frames\n",
"process 34000 wavs,15053557 frames\n",
"process 35000 wavs,15531890 frames\n",
"process 36000 wavs,16022711 frames\n",
"process 37000 wavs,16437688 frames\n",
"process 38000 wavs,16859517 frames\n",
"process 39000 wavs,17307676 frames\n",
"process 40000 wavs,17796629 frames\n",
"process 41000 wavs,18264151 frames\n",
"process 42000 wavs,18711898 frames\n",
"process 43000 wavs,19159890 frames\n",
"process 44000 wavs,19576435 frames\n",
"process 45000 wavs,19992793 frames\n",
"process 46000 wavs,20464449 frames\n",
"process 47000 wavs,20886021 frames\n",
"process 48000 wavs,21317318 frames\n",
"process 49000 wavs,21738034 frames\n",
"process 50000 wavs,22171890 frames\n",
"process 51000 wavs,22622238 frames\n",
"process 52000 wavs,23100734 frames\n",
"process 53000 wavs,23526901 frames\n",
"process 54000 wavs,23969746 frames\n",
"process 55000 wavs,24418691 frames\n",
"process 56000 wavs,24862546 frames\n",
"process 57000 wavs,25336448 frames\n",
"process 58000 wavs,25778435 frames\n",
"process 59000 wavs,26216199 frames\n",
"process 60000 wavs,26694692 frames\n",
"process 61000 wavs,27148978 frames\n",
"process 62000 wavs,27617088 frames\n",
"process 63000 wavs,28064946 frames\n",
"process 64000 wavs,28519843 frames\n",
"process 65000 wavs,28989722 frames\n",
"process 66000 wavs,29470156 frames\n",
"process 67000 wavs,29952931 frames\n",
"process 68000 wavs,30360555 frames\n",
"process 69000 wavs,30797929 frames\n",
"process 70000 wavs,31218227 frames\n",
"process 71000 wavs,31663934 frames\n",
"process 72000 wavs,32107468 frames\n",
"process 73000 wavs,32541943 frames\n",
"process 74000 wavs,33010702 frames\n",
"process 75000 wavs,33448082 frames\n",
"process 76000 wavs,33886812 frames\n",
"process 77000 wavs,34338108 frames\n",
"process 78000 wavs,34761495 frames\n",
"process 79000 wavs,35199730 frames\n",
"process 80000 wavs,35669630 frames\n",
"process 81000 wavs,36122402 frames\n",
"process 82000 wavs,36604561 frames\n",
"process 83000 wavs,37085552 frames\n",
"process 84000 wavs,37517500 frames\n",
"process 85000 wavs,37987196 frames\n",
"process 86000 wavs,38415721 frames\n",
"process 87000 wavs,38889467 frames\n",
"process 88000 wavs,39337809 frames\n",
"process 89000 wavs,39792342 frames\n",
"process 90000 wavs,40287946 frames\n",
"process 91000 wavs,40719461 frames\n",
"process 92000 wavs,41178919 frames\n",
"process 93000 wavs,41659635 frames\n",
"process 94000 wavs,42132985 frames\n",
"process 95000 wavs,42584564 frames\n",
"process 96000 wavs,43018598 frames\n",
"process 97000 wavs,43480662 frames\n",
"process 98000 wavs,43973670 frames\n",
"process 99000 wavs,44448190 frames\n",
"process 100000 wavs,44935034 frames\n",
"process 101000 wavs,45379812 frames\n",
"process 102000 wavs,45821207 frames\n",
"process 103000 wavs,46258420 frames\n",
"process 104000 wavs,46743733 frames\n",
"process 105000 wavs,47206922 frames\n",
"process 106000 wavs,47683041 frames\n",
"process 107000 wavs,48122809 frames\n",
"process 108000 wavs,48594623 frames\n",
"process 109000 wavs,49086358 frames\n",
"process 110000 wavs,49525568 frames\n",
"process 111000 wavs,49985820 frames\n",
"process 112000 wavs,50428262 frames\n",
"process 113000 wavs,50897957 frames\n",
"process 114000 wavs,51344589 frames\n",
"process 115000 wavs,51774621 frames\n",
"process 116000 wavs,52243372 frames\n",
"process 117000 wavs,52726025 frames\n",
"process 118000 wavs,53170026 frames\n",
"process 119000 wavs,53614141 frames\n",
"process 120000 wavs,54071271 frames\n"
]
}
],
"source": [
"\n",
"augmentation_pipeline = AugmentationPipeline('{}')\n",
"audio_featurizer = AudioFeaturizer(\n",
" specgram_type=args.specgram_type,\n",
" feat_dim=args.feat_dim,\n",
" delta_delta=args.delta_delta,\n",
" stride_ms=args.stride_ms,\n",
" window_ms=args.window_ms,\n",
" n_fft=None,\n",
" max_freq=None,\n",
" target_sample_rate=args.sample_rate,\n",
" use_dB_normalization=True,\n",
" target_dB=-20)\n",
"\n",
"def augment_and_featurize(audio_segment):\n",
" augmentation_pipeline.transform_audio(audio_segment)\n",
" return audio_featurizer.featurize(audio_segment)\n",
"\n",
"\n",
"collate_func = CollateFunc()\n",
"\n",
"dataset = AudioDataset(\n",
" args.manifest_path,\n",
" augment_and_featurize, \n",
" args.num_samples)\n",
"\n",
"batch_size = 20\n",
"data_loader = DataLoader(\n",
" dataset,\n",
" batch_size=batch_size,\n",
" shuffle=False,\n",
" num_workers=args.num_workers,\n",
" collate_fn=collate_func)\n",
"\n",
"with paddle.no_grad():\n",
" all_mean_stat = None\n",
" all_var_stat = None\n",
" all_number = 0\n",
" wav_number = 0\n",
" for i, batch in enumerate(data_loader()):\n",
" #for batch in data_loader():\n",
" number, mean_stat, var_stat = batch\n",
" if i == 0:\n",
" all_mean_stat = mean_stat\n",
" all_var_stat = var_stat\n",
" else:\n",
" all_mean_stat += mean_stat\n",
" all_var_stat += var_stat\n",
" all_number += number\n",
" wav_number += batch_size\n",
"\n",
" if wav_number % 1000 == 0:\n",
" print('process {} wavs,{} frames'.format(wav_number,\n",
" all_number))\n",
"\n",
"cmvn_info = {\n",
" 'mean_stat': list(all_mean_stat.tolist()),\n",
" 'var_stat': list(all_var_stat.tolist()),\n",
" 'frame_num': all_number\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "danish-executive",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'mean_stat': [-813852467.7953382, -769025957.9140725, -809499593.411409, -774700574.014532, -750961217.5896736, -760564397.2864963, -805662399.3771614, -843490965.4231446, -850242081.9416809, -857678651.504435, -879067453.9826999, -908602072.3856701, -936850957.7187386, -957242686.489041, -968425442.0916103, -972687545.5953809, -980383731.7683417, -991533337.6343704, -1001966818.1164789, -1010334169.7486078, -1016855066.9099333, -1022176245.7021623, -1025700476.4788507, -1030678878.3195274, -1037075963.124199, -1042705719.0195516, -1047422212.6492896, -1049003537.271861, -1050314833.7453628, -1050772191.0204058, -1050010034.9948177, -1050436065.1336465, -1053327181.7978873, -1058710548.2036785, -1065950852.4966162, -1071709705.0060445, -1077682778.259181, -1083371045.272074, -1089708906.2657735, -1096312217.7865202, -1101089858.8364556, -1104965332.4332569, -1107791702.5223634, -1109431075.2374773, -1110066333.0280604, -1110382732.0722318, -1110480306.3793216, -1110203297.7110727, -1109972534.3583376, -1109378081.8792782, -1108212059.413654, -1107235713.2041805, -1106973581.9280007, -1107352339.7860134, -1108730029.862537, -1110425202.83704, -1113220669.4552443, -1115887535.4870913, -1118105356.3628063, -1120001376.8503075, -1121135822.320366, -1122265971.8751016, -1123990217.401155, -1125786729.6230593, -1127784957.2745507, -1129180108.9033566, -1132000461.6688302, -1134675829.8190608, -1137652487.5164194, -1141755948.0463965, -1145340901.5468378, -1148637682.593287, -1151755522.470022, -1154981643.2268832, -1157417488.840151, -1161240429.0989249, -1165411128.671642, -1170521097.1034513, -1176307165.5109766, -1183456865.0039694, -1190535938.6591117, -1197946309.0472982, -1203596565.037139, -1207563038.1241052, -1209707561.5829782, -1211407066.2452552, -1211884576.9201162, -1212778872.005509, -1214041413.8080075, -1215367953.1745043, -1216850831.482193, -1217678325.5351057, -1218854289.54188, -1219325064.8610544, -1219080344.7580786, -1218541313.657531, -1217889833.2067819, -1216552930.1654336, -1216423777.4113154, -1216575252.225508, -1217075384.9826024, -1217391577.901724, -1217838974.57273, -1218131805.6054134, -1218294889.7465532, -1218566666.1755593, -1218790537.5519717, -1218748668.9956846, -1218603191.4941735, -1218004566.4348054, -1217312410.127734, -1217207493.9522285, -1217284002.3834674, -1217644312.51745, -1218039821.6444128, -1218721811.6269798, -1219121088.9265897, -1219014460.8090584, -1218530127.6776083, -1217952335.451711, -1217316073.8666434, -1217035380.1151958, -1216636431.2964456, -1216257015.2945514, -1215658496.1208403, -1215097272.0976632, -1214669859.2064147, -1214593853.4809475, -1214599475.7838447, -1214575440.823035, -1214158828.8008435, -1213482920.2673717, -1212476577.5897374, -1211251374.2198513, -1210284855.590475, -1209302456.065669, -1209106252.6625297, -1209373211.5146718, -1209689421.7984035, -1210021342.495856, -1210650609.3592312, -1211428521.3900626, -1212616111.4257205, -1213820075.2948189, -1215320588.7144456, -1217175082.2739282, -1219703351.4585004, -1222007827.120464, -1224637375.5900724, -1228367798.912171, -1234853879.862459, -1247222219.867692, -1268562808.1616178, -1302034822.9569275, -1347823631.0776038, -1402753916.9445229, -1458826717.3262982, -1505843092.0970414, -1534278782.249077, -1543955545.8994718, -1600409154.893352], 'var_stat': [12665413908.91729, 11145088801.244318, 12567119446.035736, 11758392758.06822, 11200687982.736668, 11551903443.711124, 12880777868.435602, 14084854368.236998, 14394011058.866192, 14678818621.277662, 15346278722.626339, 16268053979.757076, 17191705347.854794, 17877540386.548733, 18251857849.077663, 18392628178.710472, 18645534548.4045, 19018598212.22902, 19366711357.782673, 19655730286.72857, 19890681996.786858, 20094163350.461906, 20227774955.225887, 20423525628.66887, 20669928826.76939, 20882313568.247944, 21062392676.270527, 21126648821.879055, 21185210734.751118, 21209014745.520447, 21182293842.91236, 21197433134.875977, 21302147790.662144, 21504666657.651955, 21781818550.89697, 21996170165.145462, 22217169779.096275, 22431161
]
}
],
"source": [
"print(cmvn_info)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "accurate-terminal",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 7,
"id": "dominant-abuse",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
"process 1000 wavs,450240 frames\n",
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
"process 2000 wavs,886411 frames\n",
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
"process 3000 wavs,1352580 frames\n",
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
"process 4000 wavs,1814397 frames\n",
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
"process 5000 wavs,2356587 frames\n",
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
"process 6000 wavs,2825310 frames\n",
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
"process 7000 wavs,3272506 frames\n",
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
"process 8000 wavs,3688045 frames\n",
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
"process 9000 wavs,4134669 frames\n",
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
"process 10000 wavs,4586357 frames\n",
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
"process 11000 wavs,5014429 frames\n",
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
"process 12000 wavs,5453334 frames\n",
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
"process 13000 wavs,5892888 frames\n",
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
"process 14000 wavs,6316059 frames\n",
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
"process 15000 wavs,6728870 frames\n",
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
"process 16000 wavs,7199442 frames\n",
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
"process 17000 wavs,7629055 frames\n",
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
"process 18000 wavs,8083729 frames\n",
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
"process 19000 wavs,8519732 frames\n",
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
"process 20000 wavs,8895694 frames\n",
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
"process 21000 wavs,9341778 frames\n",
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
"process 22000 wavs,9796126 frames\n",
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
"process 23000 wavs,10236057 frames\n",
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
"process 24000 wavs,10687461 frames\n",
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
"process 25000 wavs,11113082 frames\n",
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
"process 26000 wavs,11544482 frames\n",
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
"process 27000 wavs,11996273 frames\n",
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
"process 28000 wavs,12456350 frames\n",
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
"process 29000 wavs,12900895 frames\n",
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
"process 30000 wavs,13330353 frames\n",
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
"process 31000 wavs,13736568 frames\n",
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
"process 32000 wavs,14158472 frames\n",
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
"process 33000 wavs,14625316 frames\n",
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
"process 34000 wavs,15036206 frames\n",
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
"process 35000 wavs,15514001 frames\n",
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
"process 36000 wavs,16004323 frames\n",
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
"process 37000 wavs,16418799 frames\n",
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
"process 38000 wavs,16840100 frames\n",
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
"process 39000 wavs,17287752 frames\n",
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
"process 40000 wavs,17776206 frames\n",
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
"process 41000 wavs,18243209 frames\n",
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
"process 42000 wavs,18690449 frames\n",
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
"process 43000 wavs,19137940 frames\n",
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
"process 44000 wavs,19553966 frames\n",
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
"process 45000 wavs,19969813 frames\n",
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
"process 46000 wavs,20440963 frames\n",
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
"process 47000 wavs,20862022 frames\n",
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
"process 48000 wavs,21292801 frames\n",
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
"process 49000 wavs,21713004 frames\n",
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
"process 50000 wavs,22146346 frames\n",
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
"process 51000 wavs,22596172 frames\n",
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
"process 52000 wavs,23074160 frames\n",
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
"process 53000 wavs,23499823 frames\n",
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
"process 54000 wavs,23942151 frames\n",
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
"process 55000 wavs,24390566 frames\n",
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
"process 56000 wavs,24833905 frames\n",
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
"process 57000 wavs,25307270 frames\n",
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
"process 58000 wavs,25748720 frames\n",
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
"process 59000 wavs,26185964 frames\n",
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
"process 60000 wavs,26663953 frames\n",
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
"process 61000 wavs,27117720 frames\n",
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
"process 62000 wavs,27585349 frames\n",
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
"process 63000 wavs,28032693 frames\n",
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
"process 64000 wavs,28487074 frames\n",
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
"process 65000 wavs,28956462 frames\n",
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
"process 66000 wavs,29436358 frames\n",
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
"process 67000 wavs,29918569 frames\n",
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
"process 68000 wavs,30325682 frames\n",
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
"process 69000 wavs,30762528 frames\n",
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
"process 70000 wavs,31182319 frames\n",
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
"process 71000 wavs,31627526 frames\n",
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
"process 72000 wavs,32070556 frames\n",
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
"process 73000 wavs,32504534 frames\n",
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
"process 74000 wavs,32972775 frames\n",
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
"process 75000 wavs,33409637 frames\n",
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
"process 76000 wavs,33847861 frames\n",
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
"process 77000 wavs,34298647 frames\n",
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
"process 78000 wavs,34721536 frames\n",
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
"process 79000 wavs,35159236 frames\n",
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
"process 80000 wavs,35628628 frames\n",
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
"process 81000 wavs,36080909 frames\n",
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
"process 82000 wavs,36562496 frames\n",
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
"process 83000 wavs,37042976 frames\n",
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
"process 84000 wavs,37474403 frames\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
"process 85000 wavs,37943596 frames\n",
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
"process 86000 wavs,38371620 frames\n",
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
"process 87000 wavs,38844874 frames\n",
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
"process 88000 wavs,39292686 frames\n",
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
"process 89000 wavs,39746715 frames\n",
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
"process 90000 wavs,40241800 frames\n",
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
"process 91000 wavs,40672817 frames\n",
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
"process 92000 wavs,41131773 frames\n",
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
"process 93000 wavs,41612001 frames\n",
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
"process 94000 wavs,42084822 frames\n",
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
"process 95000 wavs,42535878 frames\n",
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
"process 96000 wavs,42969365 frames\n",
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
"process 97000 wavs,43430890 frames\n",
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
"process 98000 wavs,43923378 frames\n",
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
"process 99000 wavs,44397370 frames\n",
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
"process 100000 wavs,44883695 frames\n",
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
"process 101000 wavs,45327968 frames\n",
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
"process 102000 wavs,45768860 frames\n",
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
"process 103000 wavs,46205602 frames\n",
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
"process 104000 wavs,46690407 frames\n",
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
"process 105000 wavs,47153089 frames\n",
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
"process 106000 wavs,47628699 frames\n",
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
"process 107000 wavs,48067945 frames\n",
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
"process 108000 wavs,48539256 frames\n",
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
"process 109000 wavs,49030485 frames\n",
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
"process 110000 wavs,49469189 frames\n",
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
"process 111000 wavs,49928968 frames\n",
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
"process 112000 wavs,50370921 frames\n",
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
"process 113000 wavs,50840090 frames\n",
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
"process 114000 wavs,51286249 frames\n",
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
"process 115000 wavs,51715786 frames\n",
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
"process 116000 wavs,52184017 frames\n",
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
"process 117000 wavs,52666156 frames\n",
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
"process 118000 wavs,53109645 frames\n",
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
"process 119000 wavs,53553253 frames\n",
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
"process 120000 wavs,54009877 frames\n",
"{'mean_stat': [700612678.1184504, 704246512.9321843, 720430663.1822729, 754033269.0474415, 798737761.616614, 829467218.4204571, 851246702.9426627, 862261185.2661449, 859339943.6923889, 846303730.8696194, 832995109.605447, 823196536.6029147, 832626008.2569772, 845571326.1936859, 848801373.0562981, 846503549.328017, 836774344.5500796, 823481091.0445303, 820728368.2518216, 804571348.4957463, 795306095.0083207, 811729024.2415155, 805734803.5703195, 813076782.1959459, 806620199.406499, 809655573.8886961, 804371708.9347517, 809272248.6085774, 810322689.7490631, 814294131.1973915, 816262716.0476038, 816213124.2411841, 817158473.4380915, 821414211.5629157, 827408091.5728914, 834353896.0519086, 840094990.3467333, 842613218.6554606, 842070761.1727513, 834970952.5260613, 837020570.8200948, 829592602.7833654, 830116543.8893851, 829482316.3881509, 833397219.4597517, 839251633.3120549, 845475010.4718693, 852378426.7183967, 859563981.8633184, 866063840.5523493, 867790921.9978689, 868215100.5962687, 869683066.032885, 872467375.6674014, 873097681.1780069, 873025823.0543871, 869897292.7201596, 866386426.3869117, 863166726.7256871, 854653071.2244718, 842402803.9000899, 830838253.4144138, 830143002.3536818, 831492285.0310817, 833304371.8781006, 838896092.8621838, 843866088.9578133, 847316792.1429776, 851038022.3643295, 855931698.0149751, 859320543.9795249, 863031001.3470656, 868325062.1832993, 873626971.0115026, 878726636.924209, 884861725.972504, 886920281.5192285, 883056006.5094173, 863719240.7255149, 773378975.9476194], 'var_stat': [9237018652.657722, 9417257721.82426, 10105084297.159702, 11071318522.587782, 12422783727.426847, 13400306419.784964, 14148498843.406874, 14576436982.89939, 14529009036.494726, 14105645932.596651, 13682988821.478252, 13413013425.088106, 13764134927.293928, 14233704806.737064, 14361631309.367067, 14281358385.45644, 13939662689.213865, 13496884231.929493, 13382566162.783987, 12871350930.6626, 12576198160.876635, 13051463889.56708, 12859205935.513906, 13053861416.098743, 12830323588.550724, 12886405923.897238, 12708529922.84171, 12847306110.231739, 12880398489.53404, 13002566299.565536, 13066708060.463543, 13064231286.858614, 13088983337.353497, 13221393824.891022, 13412425607.755072, 13631485149.777075, 13807797519.156103, 13877277485.033077, 13848613909.96762, 13609176326.2529, 13649815250.130072, 13397698404.696907, 13388964704.359968, 13354326914.968012, 13469861474.898457, 13652539440.283333, 13846837321.329163, 14062143714.601675, 14292571198.61228, 14504626563.299246, 14563864749.132776, 14579720287.991764, 14626700787.353922, 14716185568.128899, 14728532777.28015, 14719101187.113443, 14607945896.239174, 14478517828.531614, 14355110561.681187, 14057430280.249746, 13634284490.879377, 13248236002.494394, 13217602306.335958, 13257856701.946049, 13323688441.072674, 13515395318.023148, 13685827169.67645, 13811622609.426846, 13947347160.615082, 14115883822.884943, 14231204526.433033, 14356066668.651815, 14533604268.238445, 14708971788.69237, 14875667326.732443, 15079098318.79331, 15144888989.667963, 15002658970.504765, 14349232841.34513, 11544480117.013124], 'frame_num': 54068199}\n"
]
}
],
"source": [
"import random\n",
"\n",
"import numpy as np\n",
"import paddle\n",
"from paddle.io import DataLoader\n",
"from paddle.io import Dataset\n",
"\n",
"from deepspeech.frontend.audio import AudioSegment\n",
"from deepspeech.frontend.utility import load_cmvn\n",
"from deepspeech.frontend.utility import read_manifest\n",
"\n",
"# https://github.com/PaddlePaddle/Paddle/pull/31481\n",
"class CollateFunc(object):\n",
" ''' Collate function for AudioDataset\n",
" '''\n",
" def __init__(self, feature_func):\n",
" self.feature_func = feature_func\n",
" \n",
" def __call__(self, batch):\n",
" mean_stat = None\n",
" var_stat = None\n",
" number = 0\n",
" for item in batch:\n",
" audioseg = AudioSegment.from_file(item['feat'])\n",
" feat = self.feature_func(audioseg) #(D, T)\n",
"\n",
" sums = np.sum(feat, axis=1)\n",
" if mean_stat is None:\n",
" mean_stat = sums\n",
" else:\n",
" mean_stat += sums\n",
"\n",
" square_sums = np.sum(np.square(feat), axis=1)\n",
" if var_stat is None:\n",
" var_stat = square_sums\n",
" else:\n",
" var_stat += square_sums\n",
"\n",
" number += feat.shape[1]\n",
" return number, mean_stat, var_stat\n",
"\n",
"\n",
"class AudioDataset(Dataset):\n",
" def __init__(self, manifest_path, num_samples=-1, rng=None, random_seed=0):\n",
" self._rng = rng if rng else np.random.RandomState(random_seed)\n",
" manifest = read_manifest(manifest_path)\n",
" if num_samples == -1:\n",
" sampled_manifest = manifest\n",
" else:\n",
" sampled_manifest = self._rng.choice(manifest, num_samples, replace=False)\n",
" self.items = sampled_manifest\n",
"\n",
" def __len__(self):\n",
" return len(self.items)\n",
"\n",
" def __getitem__(self, idx):\n",
" return self.items[idx]\n",
" \n",
" \n",
"augmentation_pipeline = AugmentationPipeline('{}')\n",
"audio_featurizer = AudioFeaturizer(\n",
" specgram_type=args.specgram_type,\n",
" feat_dim=args.feat_dim,\n",
" delta_delta=args.delta_delta,\n",
" stride_ms=args.stride_ms,\n",
" window_ms=args.window_ms,\n",
" n_fft=None,\n",
" max_freq=None,\n",
" target_sample_rate=args.sample_rate,\n",
" use_dB_normalization=True,\n",
" target_dB=-20)\n",
"\n",
"def augment_and_featurize(audio_segment):\n",
" augmentation_pipeline.transform_audio(audio_segment)\n",
" return audio_featurizer.featurize(audio_segment)\n",
"\n",
"\n",
"collate_func = CollateFunc(augment_and_featurize)\n",
"\n",
"dataset = AudioDataset(\n",
" args.manifest_path,\n",
" args.num_samples)\n",
"\n",
"batch_size = 20\n",
"data_loader = DataLoader(\n",
" dataset,\n",
" batch_size=batch_size,\n",
" shuffle=False,\n",
" num_workers=args.num_workers,\n",
" collate_fn=collate_func)\n",
"\n",
"with paddle.no_grad():\n",
" all_mean_stat = None\n",
" all_var_stat = None\n",
" all_number = 0\n",
" wav_number = 0\n",
" for i, batch in enumerate(data_loader):\n",
" number, mean_stat, var_stat = batch\n",
" if i == 0:\n",
" all_mean_stat = mean_stat\n",
" all_var_stat = var_stat\n",
" else:\n",
" all_mean_stat += mean_stat\n",
" all_var_stat += var_stat\n",
" all_number += number\n",
" wav_number += batch_size\n",
"\n",
" if wav_number % 1000 == 0:\n",
" print('process {} wavs,{} frames'.format(wav_number,\n",
" all_number))\n",
"\n",
"cmvn_info = {\n",
" 'mean_stat': list(all_mean_stat.tolist()),\n",
" 'var_stat': list(all_var_stat.tolist()),\n",
" 'frame_num': all_number\n",
"}\n",
"print(cmvn_info)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "unlike-search",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.0"
}
},
"nbformat": 4,
"nbformat_minor": 5
}