You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
794 lines
42 KiB
794 lines
42 KiB
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 1,
|
|
"id": "purple-consequence",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"/home/ssd5/zhanghui/DeepSpeech2.x\n"
|
|
]
|
|
},
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"'/home/ssd5/zhanghui/DeepSpeech2.x'"
|
|
]
|
|
},
|
|
"execution_count": 1,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"%cd ..\n",
|
|
"%pwd"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "defensive-mason",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": []
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 6,
|
|
"id": "patient-convention",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Namespace(delta_delta=False, feat_dim=80, manifest_path='examples/aishell/s1/data/manifest.train.raw', num_samples=-1, num_workers=16, output_path='data/librispeech/mean_std.npz', sample_rate=16000, specgram_type='fbank', stride_ms=10.0, window_ms=25.0)\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"import argparse\n",
|
|
"import functools\n",
|
|
"\n",
|
|
"from deepspeech.frontend.augmentor.augmentation import AugmentationPipeline\n",
|
|
"from deepspeech.frontend.featurizer.audio_featurizer import AudioFeaturizer\n",
|
|
"from deepspeech.frontend.normalizer import FeatureNormalizer\n",
|
|
"from deepspeech.utils.utility import add_arguments\n",
|
|
"from deepspeech.utils.utility import print_arguments\n",
|
|
"\n",
|
|
"parser = argparse.ArgumentParser(description=__doc__)\n",
|
|
"add_arg = functools.partial(add_arguments, argparser=parser)\n",
|
|
"# yapf: disable\n",
|
|
"add_arg('num_samples', int, -1, \"# of samples to use for statistics.\")\n",
|
|
"add_arg('specgram_type', str,\n",
|
|
" 'fbank',\n",
|
|
" \"Audio feature type. Options: linear, mfcc, fbank.\",\n",
|
|
" choices=['linear', 'mfcc', 'fbank'])\n",
|
|
"add_arg('feat_dim', int, 80, \"Audio feature dim.\")\n",
|
|
"add_arg('delta_delta', bool,\n",
|
|
" False,\n",
|
|
" \"Audio feature with delta delta.\")\n",
|
|
"add_arg('stride_ms', float, 10.0, \"stride length in ms.\")\n",
|
|
"add_arg('window_ms', float, 25.0, \"window length in ms.\")\n",
|
|
"add_arg('sample_rate', int, 16000, \"target sample rate.\")\n",
|
|
"add_arg('manifest_path', str,\n",
|
|
" 'examples/aishell/s1/data/manifest.train.raw',\n",
|
|
" \"Filepath of manifest to compute normalizer's mean and stddev.\")\n",
|
|
"add_arg('num_workers',\n",
|
|
" default=16,\n",
|
|
" type=int,\n",
|
|
" help='num of subprocess workers for processing')\n",
|
|
"add_arg('output_path', str,\n",
|
|
" 'data/librispeech/mean_std.npz',\n",
|
|
" \"Filepath to write mean and stddev to (.npz).\")\n",
|
|
"# yapf: enable\n",
|
|
"args = parser.parse_args([])\n",
|
|
"print(args)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 7,
|
|
"id": "enormous-currency",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"import random\n",
|
|
"\n",
|
|
"import numpy as np\n",
|
|
"import paddle\n",
|
|
"from paddle.io import DataLoader\n",
|
|
"from paddle.io import Dataset\n",
|
|
"\n",
|
|
"from deepspeech.frontend.audio import AudioSegment\n",
|
|
"from deepspeech.frontend.utility import load_cmvn\n",
|
|
"from deepspeech.frontend.utility import read_manifest\n",
|
|
"\n",
|
|
"class CollateFunc(object):\n",
|
|
" ''' Collate function for AudioDataset\n",
|
|
" '''\n",
|
|
" def __init__(self):\n",
|
|
" pass\n",
|
|
" \n",
|
|
" def __call__(self, batch):\n",
|
|
" mean_stat = None\n",
|
|
" var_stat = None\n",
|
|
" number = 0\n",
|
|
" for feat in batch:\n",
|
|
" sums = np.sum(feat, axis=1)\n",
|
|
" if mean_stat is None:\n",
|
|
" mean_stat = sums\n",
|
|
" else:\n",
|
|
" mean_stat += sums\n",
|
|
"\n",
|
|
" square_sums = np.sum(np.square(feat), axis=1)\n",
|
|
" if var_stat is None:\n",
|
|
" var_stat = square_sums\n",
|
|
" else:\n",
|
|
" var_stat += square_sums\n",
|
|
"\n",
|
|
" number += feat.shape[1]\n",
|
|
" #return paddle.to_tensor(number), paddle.to_tensor(mean_stat), paddle.to_tensor(var_stat)\n",
|
|
" return number, mean_stat, var_stat\n",
|
|
"\n",
|
|
"\n",
|
|
"class AudioDataset(Dataset):\n",
|
|
" def __init__(self, manifest_path, feature_func, num_samples=-1, rng=None):\n",
|
|
" self.feature_func = feature_func\n",
|
|
" self._rng = rng\n",
|
|
" manifest = read_manifest(manifest_path)\n",
|
|
" if num_samples == -1:\n",
|
|
" sampled_manifest = manifest\n",
|
|
" else:\n",
|
|
" sampled_manifest = self._rng.sample(manifest, num_samples)\n",
|
|
" self.items = sampled_manifest\n",
|
|
"\n",
|
|
" def __len__(self):\n",
|
|
" return len(self.items)\n",
|
|
"\n",
|
|
" def __getitem__(self, idx):\n",
|
|
" key = self.items[idx]['feat']\n",
|
|
" audioseg = AudioSegment.from_file(key)\n",
|
|
" feat = self.feature_func(audioseg) #(D, T)\n",
|
|
" return feat"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 5,
|
|
"id": "armed-semester",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"process 1000 wavs,450739 frames\n",
|
|
"process 2000 wavs,887447 frames\n",
|
|
"process 3000 wavs,1354148 frames\n",
|
|
"process 4000 wavs,1816494 frames\n",
|
|
"process 5000 wavs,2359211 frames\n",
|
|
"process 6000 wavs,2828455 frames\n",
|
|
"process 7000 wavs,3276186 frames\n",
|
|
"process 8000 wavs,3692234 frames\n",
|
|
"process 9000 wavs,4139360 frames\n",
|
|
"process 10000 wavs,4591528 frames\n",
|
|
"process 11000 wavs,5020114 frames\n",
|
|
"process 12000 wavs,5459523 frames\n",
|
|
"process 13000 wavs,5899534 frames\n",
|
|
"process 14000 wavs,6323242 frames\n",
|
|
"process 15000 wavs,6736597 frames\n",
|
|
"process 16000 wavs,7207686 frames\n",
|
|
"process 17000 wavs,7637800 frames\n",
|
|
"process 18000 wavs,8093004 frames\n",
|
|
"process 19000 wavs,8529518 frames\n",
|
|
"process 20000 wavs,8906022 frames\n",
|
|
"process 21000 wavs,9352652 frames\n",
|
|
"process 22000 wavs,9807495 frames\n",
|
|
"process 23000 wavs,10247938 frames\n",
|
|
"process 24000 wavs,10700011 frames\n",
|
|
"process 25000 wavs,11126134 frames\n",
|
|
"process 26000 wavs,11558061 frames\n",
|
|
"process 27000 wavs,12010359 frames\n",
|
|
"process 28000 wavs,12470938 frames\n",
|
|
"process 29000 wavs,12916013 frames\n",
|
|
"process 30000 wavs,13345816 frames\n",
|
|
"process 31000 wavs,13752365 frames\n",
|
|
"process 32000 wavs,14174801 frames\n",
|
|
"process 33000 wavs,14642170 frames\n",
|
|
"process 34000 wavs,15053557 frames\n",
|
|
"process 35000 wavs,15531890 frames\n",
|
|
"process 36000 wavs,16022711 frames\n",
|
|
"process 37000 wavs,16437688 frames\n",
|
|
"process 38000 wavs,16859517 frames\n",
|
|
"process 39000 wavs,17307676 frames\n",
|
|
"process 40000 wavs,17796629 frames\n",
|
|
"process 41000 wavs,18264151 frames\n",
|
|
"process 42000 wavs,18711898 frames\n",
|
|
"process 43000 wavs,19159890 frames\n",
|
|
"process 44000 wavs,19576435 frames\n",
|
|
"process 45000 wavs,19992793 frames\n",
|
|
"process 46000 wavs,20464449 frames\n",
|
|
"process 47000 wavs,20886021 frames\n",
|
|
"process 48000 wavs,21317318 frames\n",
|
|
"process 49000 wavs,21738034 frames\n",
|
|
"process 50000 wavs,22171890 frames\n",
|
|
"process 51000 wavs,22622238 frames\n",
|
|
"process 52000 wavs,23100734 frames\n",
|
|
"process 53000 wavs,23526901 frames\n",
|
|
"process 54000 wavs,23969746 frames\n",
|
|
"process 55000 wavs,24418691 frames\n",
|
|
"process 56000 wavs,24862546 frames\n",
|
|
"process 57000 wavs,25336448 frames\n",
|
|
"process 58000 wavs,25778435 frames\n",
|
|
"process 59000 wavs,26216199 frames\n",
|
|
"process 60000 wavs,26694692 frames\n",
|
|
"process 61000 wavs,27148978 frames\n",
|
|
"process 62000 wavs,27617088 frames\n",
|
|
"process 63000 wavs,28064946 frames\n",
|
|
"process 64000 wavs,28519843 frames\n",
|
|
"process 65000 wavs,28989722 frames\n",
|
|
"process 66000 wavs,29470156 frames\n",
|
|
"process 67000 wavs,29952931 frames\n",
|
|
"process 68000 wavs,30360555 frames\n",
|
|
"process 69000 wavs,30797929 frames\n",
|
|
"process 70000 wavs,31218227 frames\n",
|
|
"process 71000 wavs,31663934 frames\n",
|
|
"process 72000 wavs,32107468 frames\n",
|
|
"process 73000 wavs,32541943 frames\n",
|
|
"process 74000 wavs,33010702 frames\n",
|
|
"process 75000 wavs,33448082 frames\n",
|
|
"process 76000 wavs,33886812 frames\n",
|
|
"process 77000 wavs,34338108 frames\n",
|
|
"process 78000 wavs,34761495 frames\n",
|
|
"process 79000 wavs,35199730 frames\n",
|
|
"process 80000 wavs,35669630 frames\n",
|
|
"process 81000 wavs,36122402 frames\n",
|
|
"process 82000 wavs,36604561 frames\n",
|
|
"process 83000 wavs,37085552 frames\n",
|
|
"process 84000 wavs,37517500 frames\n",
|
|
"process 85000 wavs,37987196 frames\n",
|
|
"process 86000 wavs,38415721 frames\n",
|
|
"process 87000 wavs,38889467 frames\n",
|
|
"process 88000 wavs,39337809 frames\n",
|
|
"process 89000 wavs,39792342 frames\n",
|
|
"process 90000 wavs,40287946 frames\n",
|
|
"process 91000 wavs,40719461 frames\n",
|
|
"process 92000 wavs,41178919 frames\n",
|
|
"process 93000 wavs,41659635 frames\n",
|
|
"process 94000 wavs,42132985 frames\n",
|
|
"process 95000 wavs,42584564 frames\n",
|
|
"process 96000 wavs,43018598 frames\n",
|
|
"process 97000 wavs,43480662 frames\n",
|
|
"process 98000 wavs,43973670 frames\n",
|
|
"process 99000 wavs,44448190 frames\n",
|
|
"process 100000 wavs,44935034 frames\n",
|
|
"process 101000 wavs,45379812 frames\n",
|
|
"process 102000 wavs,45821207 frames\n",
|
|
"process 103000 wavs,46258420 frames\n",
|
|
"process 104000 wavs,46743733 frames\n",
|
|
"process 105000 wavs,47206922 frames\n",
|
|
"process 106000 wavs,47683041 frames\n",
|
|
"process 107000 wavs,48122809 frames\n",
|
|
"process 108000 wavs,48594623 frames\n",
|
|
"process 109000 wavs,49086358 frames\n",
|
|
"process 110000 wavs,49525568 frames\n",
|
|
"process 111000 wavs,49985820 frames\n",
|
|
"process 112000 wavs,50428262 frames\n",
|
|
"process 113000 wavs,50897957 frames\n",
|
|
"process 114000 wavs,51344589 frames\n",
|
|
"process 115000 wavs,51774621 frames\n",
|
|
"process 116000 wavs,52243372 frames\n",
|
|
"process 117000 wavs,52726025 frames\n",
|
|
"process 118000 wavs,53170026 frames\n",
|
|
"process 119000 wavs,53614141 frames\n",
|
|
"process 120000 wavs,54071271 frames\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"\n",
|
|
"augmentation_pipeline = AugmentationPipeline('{}')\n",
|
|
"audio_featurizer = AudioFeaturizer(\n",
|
|
" specgram_type=args.specgram_type,\n",
|
|
" feat_dim=args.feat_dim,\n",
|
|
" delta_delta=args.delta_delta,\n",
|
|
" stride_ms=args.stride_ms,\n",
|
|
" window_ms=args.window_ms,\n",
|
|
" n_fft=None,\n",
|
|
" max_freq=None,\n",
|
|
" target_sample_rate=args.sample_rate,\n",
|
|
" use_dB_normalization=True,\n",
|
|
" target_dB=-20)\n",
|
|
"\n",
|
|
"def augment_and_featurize(audio_segment):\n",
|
|
" augmentation_pipeline.transform_audio(audio_segment)\n",
|
|
" return audio_featurizer.featurize(audio_segment)\n",
|
|
"\n",
|
|
"\n",
|
|
"collate_func = CollateFunc()\n",
|
|
"\n",
|
|
"dataset = AudioDataset(\n",
|
|
" args.manifest_path,\n",
|
|
" augment_and_featurize, \n",
|
|
" args.num_samples)\n",
|
|
"\n",
|
|
"batch_size = 20\n",
|
|
"data_loader = DataLoader(\n",
|
|
" dataset,\n",
|
|
" batch_size=batch_size,\n",
|
|
" shuffle=False,\n",
|
|
" num_workers=args.num_workers,\n",
|
|
" collate_fn=collate_func)\n",
|
|
"\n",
|
|
"with paddle.no_grad():\n",
|
|
" all_mean_stat = None\n",
|
|
" all_var_stat = None\n",
|
|
" all_number = 0\n",
|
|
" wav_number = 0\n",
|
|
" for i, batch in enumerate(data_loader()):\n",
|
|
" #for batch in data_loader():\n",
|
|
" number, mean_stat, var_stat = batch\n",
|
|
" if i == 0:\n",
|
|
" all_mean_stat = mean_stat\n",
|
|
" all_var_stat = var_stat\n",
|
|
" else:\n",
|
|
" all_mean_stat += mean_stat\n",
|
|
" all_var_stat += var_stat\n",
|
|
" all_number += number\n",
|
|
" wav_number += batch_size\n",
|
|
"\n",
|
|
" if wav_number % 1000 == 0:\n",
|
|
" print('process {} wavs,{} frames'.format(wav_number,\n",
|
|
" all_number))\n",
|
|
"\n",
|
|
"cmvn_info = {\n",
|
|
" 'mean_stat': list(all_mean_stat.tolist()),\n",
|
|
" 'var_stat': list(all_var_stat.tolist()),\n",
|
|
" 'frame_num': all_number\n",
|
|
"}"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 9,
|
|
"id": "danish-executive",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"{'mean_stat': [-813852467.7953382, -769025957.9140725, -809499593.411409, -774700574.014532, -750961217.5896736, -760564397.2864963, -805662399.3771614, -843490965.4231446, -850242081.9416809, -857678651.504435, -879067453.9826999, -908602072.3856701, -936850957.7187386, -957242686.489041, -968425442.0916103, -972687545.5953809, -980383731.7683417, -991533337.6343704, -1001966818.1164789, -1010334169.7486078, -1016855066.9099333, -1022176245.7021623, -1025700476.4788507, -1030678878.3195274, -1037075963.124199, -1042705719.0195516, -1047422212.6492896, -1049003537.271861, -1050314833.7453628, -1050772191.0204058, -1050010034.9948177, -1050436065.1336465, -1053327181.7978873, -1058710548.2036785, -1065950852.4966162, -1071709705.0060445, -1077682778.259181, -1083371045.272074, -1089708906.2657735, -1096312217.7865202, -1101089858.8364556, -1104965332.4332569, -1107791702.5223634, -1109431075.2374773, -1110066333.0280604, -1110382732.0722318, -1110480306.3793216, -1110203297.7110727, -1109972534.3583376, -1109378081.8792782, -1108212059.413654, -1107235713.2041805, -1106973581.9280007, -1107352339.7860134, -1108730029.862537, -1110425202.83704, -1113220669.4552443, -1115887535.4870913, -1118105356.3628063, -1120001376.8503075, -1121135822.320366, -1122265971.8751016, -1123990217.401155, -1125786729.6230593, -1127784957.2745507, -1129180108.9033566, -1132000461.6688302, -1134675829.8190608, -1137652487.5164194, -1141755948.0463965, -1145340901.5468378, -1148637682.593287, -1151755522.470022, -1154981643.2268832, -1157417488.840151, -1161240429.0989249, -1165411128.671642, -1170521097.1034513, -1176307165.5109766, -1183456865.0039694, -1190535938.6591117, -1197946309.0472982, -1203596565.037139, -1207563038.1241052, -1209707561.5829782, -1211407066.2452552, -1211884576.9201162, -1212778872.005509, -1214041413.8080075, -1215367953.1745043, -1216850831.482193, -1217678325.5351057, -1218854289.54188, -1219325064.8610544, -1219080344.7580786, -1218541313.657531, 
-1217889833.2067819, -1216552930.1654336, -1216423777.4113154, -1216575252.225508, -1217075384.9826024, -1217391577.901724, -1217838974.57273, -1218131805.6054134, -1218294889.7465532, -1218566666.1755593, -1218790537.5519717, -1218748668.9956846, -1218603191.4941735, -1218004566.4348054, -1217312410.127734, -1217207493.9522285, -1217284002.3834674, -1217644312.51745, -1218039821.6444128, -1218721811.6269798, -1219121088.9265897, -1219014460.8090584, -1218530127.6776083, -1217952335.451711, -1217316073.8666434, -1217035380.1151958, -1216636431.2964456, -1216257015.2945514, -1215658496.1208403, -1215097272.0976632, -1214669859.2064147, -1214593853.4809475, -1214599475.7838447, -1214575440.823035, -1214158828.8008435, -1213482920.2673717, -1212476577.5897374, -1211251374.2198513, -1210284855.590475, -1209302456.065669, -1209106252.6625297, -1209373211.5146718, -1209689421.7984035, -1210021342.495856, -1210650609.3592312, -1211428521.3900626, -1212616111.4257205, -1213820075.2948189, -1215320588.7144456, -1217175082.2739282, -1219703351.4585004, -1222007827.120464, -1224637375.5900724, -1228367798.912171, -1234853879.862459, -1247222219.867692, -1268562808.1616178, -1302034822.9569275, -1347823631.0776038, -1402753916.9445229, -1458826717.3262982, -1505843092.0970414, -1534278782.249077, -1543955545.8994718, -1600409154.893352], 'var_stat': [12665413908.91729, 11145088801.244318, 12567119446.035736, 11758392758.06822, 11200687982.736668, 11551903443.711124, 12880777868.435602, 14084854368.236998, 14394011058.866192, 14678818621.277662, 15346278722.626339, 16268053979.757076, 17191705347.854794, 17877540386.548733, 18251857849.077663, 18392628178.710472, 18645534548.4045, 19018598212.22902, 19366711357.782673, 19655730286.72857, 19890681996.786858, 20094163350.461906, 20227774955.225887, 20423525628.66887, 20669928826.76939, 20882313568.247944, 21062392676.270527, 21126648821.879055, 21185210734.751118, 21209014745.520447, 21182293842.91236, 21197433134.875977, 
21302147790.662144, 21504666657.651955, 21781818550.89697, 21996170165.145462, 22217169779.096275, 22431161762.176693, 22672708668.38104, 22922683961.072956, 23101137011.201683, 23249680793.556847, 23358894817.24979, 23422895267.919228, 23449479198.303394, 23464433357.671055, 23469197140.124596, 23459013479.866177, 23447935341.542686, 23422585038.052387, 23375601301.949135, 23338397991.497776, 23329682884.21905, 23348002892.39853, 23406274659.89975, 23478242518.92228, 23592891371.876236, 23703885161.772205, 23797158601.65954, 23875230355.66992, 23918333664.3946, 23968582109.371258, 24040547318.081936, 24112364295.110058, 24189973697.612144, 24242165205.640236, 24364255205.82311, 24472408850.760197, 24590211203.05312, 24763026764.005527, 24909192634.69144, 25043438176.23281, 25167141466.500504, 25297108031.48665, 25395377064.0999, 25550930772.86505, 25721404827.10336, 25931101211.156487, 26168988710.098465, 26465528802.762875, 26760033029.443783, 27075408488.605213, 27316626931.655052, 27487275073.52796, 27579518448.2332, 27652308513.875782, 27673412508.45838, 27711509210.702576, 27767312240.641487, 27827464683.295334, 27894794590.957966, 27935988489.16511, 27992337099.891083, 28019655483.58796, 28014286886.252903, 27996189233.857716, 27973078840.875465, 27920045013.68706, 27917103211.22359, 27927566165.64652, 27953525818.61368, 27973386070.140022, 27999317832.502476, 28019494120.641834, 28033010746.452637, 28051086123.896503, 28066195174.191753, 28068570977.318798, 28064890246.85437, 28042424375.860577, 28015849655.869568, 28014812222.566605, 28021039053.959835, 28039270607.169422, 28058271295.10199, 28088976520.10178, 28107824988.74732, 28105633030.784756, 28087681357.818607, 28065484299.963837, 28039555887.004284, 28028214431.52875, 28011714871.929447, 27995603790.480755, 27970125897.561134, 27946436130.511288, 27929044772.5522, 27926612443.390316, 27926256324.387302, 27924771848.71099, 27905526922.390133, 27876268519.168198, 27832532606.552593, 
27779497699.976765, 27737034351.907337, 27692129825.179924, 27684252911.371475, 27698882622.878677, 27712387157.27985, 27726474638.933037, 27752647691.051613, 27786197932.382797, 27836378752.662235, 27887415700.334576, 27949784230.702114, 28028117657.84245, 28136313097.200474, 28234098926.207996, 28345845477.25874, 28507222800.146496, 28793832339.90449, 29350765483.070816, 30328262350.231213, 31894930713.76519, 34093669067.422382, 36801959396.22739, 39638995447.49344, 42088579425.44825, 43616108982.85117, 44152063315.31461, 47464832889.5967], 'frame_num': 54129649}\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"print(cmvn_info)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "accurate-terminal",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": []
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 7,
|
|
"id": "dominant-abuse",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
|
|
"process 1000 wavs,450240 frames\n",
|
|
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
|
|
"process 2000 wavs,886411 frames\n",
|
|
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
|
|
"process 3000 wavs,1352580 frames\n",
|
|
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
|
|
"process 4000 wavs,1814397 frames\n",
|
|
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
|
|
"process 5000 wavs,2356587 frames\n",
|
|
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
|
|
"process 6000 wavs,2825310 frames\n",
|
|
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
|
|
"process 7000 wavs,3272506 frames\n",
|
|
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
|
|
"process 8000 wavs,3688045 frames\n",
|
|
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
|
|
"process 9000 wavs,4134669 frames\n",
|
|
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
|
|
"process 10000 wavs,4586357 frames\n",
|
|
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
|
|
"process 11000 wavs,5014429 frames\n",
|
|
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
|
|
"process 12000 wavs,5453334 frames\n",
|
|
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
|
|
"process 13000 wavs,5892888 frames\n",
|
|
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
|
|
"process 14000 wavs,6316059 frames\n",
|
|
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
|
|
"process 15000 wavs,6728870 frames\n",
|
|
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
|
|
"process 16000 wavs,7199442 frames\n",
|
|
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
|
|
"process 17000 wavs,7629055 frames\n",
|
|
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
|
|
"process 18000 wavs,8083729 frames\n",
|
|
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
|
|
"process 19000 wavs,8519732 frames\n",
|
|
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
|
|
"process 20000 wavs,8895694 frames\n",
|
|
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
|
|
"process 21000 wavs,9341778 frames\n",
|
|
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
|
|
"process 22000 wavs,9796126 frames\n",
|
|
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
|
|
"process 23000 wavs,10236057 frames\n",
|
|
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
|
|
"process 24000 wavs,10687461 frames\n",
|
|
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
|
|
"process 25000 wavs,11113082 frames\n",
|
|
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
|
|
"process 26000 wavs,11544482 frames\n",
|
|
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
|
|
"process 27000 wavs,11996273 frames\n",
|
|
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
|
|
"process 28000 wavs,12456350 frames\n",
|
|
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
|
|
"process 29000 wavs,12900895 frames\n",
|
|
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
|
|
"process 30000 wavs,13330353 frames\n",
|
|
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
|
|
"process 31000 wavs,13736568 frames\n",
|
|
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
|
|
"process 32000 wavs,14158472 frames\n",
|
|
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
|
|
"process 33000 wavs,14625316 frames\n",
|
|
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
|
|
"process 34000 wavs,15036206 frames\n",
|
|
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
|
|
"process 35000 wavs,15514001 frames\n",
|
|
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
|
|
"process 36000 wavs,16004323 frames\n",
|
|
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
|
|
"process 37000 wavs,16418799 frames\n",
|
|
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
|
|
"process 38000 wavs,16840100 frames\n",
|
|
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
|
|
"process 39000 wavs,17287752 frames\n",
|
|
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
|
|
"process 40000 wavs,17776206 frames\n",
|
|
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
|
|
"process 41000 wavs,18243209 frames\n",
|
|
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
|
|
"process 42000 wavs,18690449 frames\n",
|
|
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
|
|
"process 43000 wavs,19137940 frames\n",
|
|
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
|
|
"process 44000 wavs,19553966 frames\n",
|
|
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
|
|
"process 45000 wavs,19969813 frames\n",
|
|
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
|
|
"process 46000 wavs,20440963 frames\n",
|
|
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
|
|
"process 47000 wavs,20862022 frames\n",
|
|
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
|
|
"process 48000 wavs,21292801 frames\n",
|
|
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
|
|
"process 49000 wavs,21713004 frames\n",
|
|
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
|
|
"process 50000 wavs,22146346 frames\n",
|
|
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
|
|
"process 51000 wavs,22596172 frames\n",
|
|
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
|
|
"process 52000 wavs,23074160 frames\n",
|
|
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
|
|
"process 53000 wavs,23499823 frames\n",
|
|
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
|
|
"process 54000 wavs,23942151 frames\n",
|
|
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
|
|
"process 55000 wavs,24390566 frames\n",
|
|
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
|
|
"process 56000 wavs,24833905 frames\n",
|
|
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
|
|
"process 57000 wavs,25307270 frames\n",
|
|
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
|
|
"process 58000 wavs,25748720 frames\n",
|
|
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
|
|
"process 59000 wavs,26185964 frames\n",
|
|
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
|
|
"process 60000 wavs,26663953 frames\n",
|
|
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
|
|
"process 61000 wavs,27117720 frames\n",
|
|
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
|
|
"process 62000 wavs,27585349 frames\n",
|
|
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
|
|
"process 63000 wavs,28032693 frames\n",
|
|
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
|
|
"process 64000 wavs,28487074 frames\n",
|
|
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
|
|
"process 65000 wavs,28956462 frames\n",
|
|
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
|
|
"process 66000 wavs,29436358 frames\n",
|
|
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
|
|
"process 67000 wavs,29918569 frames\n",
|
|
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
|
|
"process 68000 wavs,30325682 frames\n",
|
|
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
|
|
"process 69000 wavs,30762528 frames\n",
|
|
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
|
|
"process 70000 wavs,31182319 frames\n",
|
|
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
|
|
"process 71000 wavs,31627526 frames\n",
|
|
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
|
|
"process 72000 wavs,32070556 frames\n",
|
|
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
|
|
"process 73000 wavs,32504534 frames\n",
|
|
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
|
|
"process 74000 wavs,32972775 frames\n",
|
|
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
|
|
"process 75000 wavs,33409637 frames\n",
|
|
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
|
|
"process 76000 wavs,33847861 frames\n",
|
|
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
|
|
"process 77000 wavs,34298647 frames\n",
|
|
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
|
|
"process 78000 wavs,34721536 frames\n",
|
|
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
|
|
"process 79000 wavs,35159236 frames\n",
|
|
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
|
|
"process 80000 wavs,35628628 frames\n",
|
|
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
|
|
"process 81000 wavs,36080909 frames\n",
|
|
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
|
|
"process 82000 wavs,36562496 frames\n",
|
|
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
|
|
"process 83000 wavs,37042976 frames\n",
|
|
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
|
|
"process 84000 wavs,37474403 frames\n"
|
|
]
|
|
},
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
|
|
"process 85000 wavs,37943596 frames\n",
|
|
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
|
|
"process 86000 wavs,38371620 frames\n",
|
|
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
|
|
"process 87000 wavs,38844874 frames\n",
|
|
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
|
|
"process 88000 wavs,39292686 frames\n",
|
|
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
|
|
"process 89000 wavs,39746715 frames\n",
|
|
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
|
|
"process 90000 wavs,40241800 frames\n",
|
|
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
|
|
"process 91000 wavs,40672817 frames\n",
|
|
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
|
|
"process 92000 wavs,41131773 frames\n",
|
|
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
|
|
"process 93000 wavs,41612001 frames\n",
|
|
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
|
|
"process 94000 wavs,42084822 frames\n",
|
|
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
|
|
"process 95000 wavs,42535878 frames\n",
|
|
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
|
|
"process 96000 wavs,42969365 frames\n",
|
|
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
|
|
"process 97000 wavs,43430890 frames\n",
|
|
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
|
|
"process 98000 wavs,43923378 frames\n",
|
|
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
|
|
"process 99000 wavs,44397370 frames\n",
|
|
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
|
|
"process 100000 wavs,44883695 frames\n",
|
|
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
|
|
"process 101000 wavs,45327968 frames\n",
|
|
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
|
|
"process 102000 wavs,45768860 frames\n",
|
|
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
|
|
"process 103000 wavs,46205602 frames\n",
|
|
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
|
|
"process 104000 wavs,46690407 frames\n",
|
|
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
|
|
"process 105000 wavs,47153089 frames\n",
|
|
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
|
|
"process 106000 wavs,47628699 frames\n",
|
|
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
|
|
"process 107000 wavs,48067945 frames\n",
|
|
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
|
|
"process 108000 wavs,48539256 frames\n",
|
|
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
|
|
"process 109000 wavs,49030485 frames\n",
|
|
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
|
|
"process 110000 wavs,49469189 frames\n",
|
|
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
|
|
"process 111000 wavs,49928968 frames\n",
|
|
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
|
|
"process 112000 wavs,50370921 frames\n",
|
|
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
|
|
"process 113000 wavs,50840090 frames\n",
|
|
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
|
|
"process 114000 wavs,51286249 frames\n",
|
|
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
|
|
"process 115000 wavs,51715786 frames\n",
|
|
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
|
|
"process 116000 wavs,52184017 frames\n",
|
|
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
|
|
"process 117000 wavs,52666156 frames\n",
|
|
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
|
|
"process 118000 wavs,53109645 frames\n",
|
|
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
|
|
"process 119000 wavs,53553253 frames\n",
|
|
"<class 'int'> <class 'paddle.VarBase'> <class 'paddle.VarBase'>\n",
|
|
"process 120000 wavs,54009877 frames\n",
|
|
"{'mean_stat': [700612678.1184504, 704246512.9321843, 720430663.1822729, 754033269.0474415, 798737761.616614, 829467218.4204571, 851246702.9426627, 862261185.2661449, 859339943.6923889, 846303730.8696194, 832995109.605447, 823196536.6029147, 832626008.2569772, 845571326.1936859, 848801373.0562981, 846503549.328017, 836774344.5500796, 823481091.0445303, 820728368.2518216, 804571348.4957463, 795306095.0083207, 811729024.2415155, 805734803.5703195, 813076782.1959459, 806620199.406499, 809655573.8886961, 804371708.9347517, 809272248.6085774, 810322689.7490631, 814294131.1973915, 816262716.0476038, 816213124.2411841, 817158473.4380915, 821414211.5629157, 827408091.5728914, 834353896.0519086, 840094990.3467333, 842613218.6554606, 842070761.1727513, 834970952.5260613, 837020570.8200948, 829592602.7833654, 830116543.8893851, 829482316.3881509, 833397219.4597517, 839251633.3120549, 845475010.4718693, 852378426.7183967, 859563981.8633184, 866063840.5523493, 867790921.9978689, 868215100.5962687, 869683066.032885, 872467375.6674014, 873097681.1780069, 873025823.0543871, 869897292.7201596, 866386426.3869117, 863166726.7256871, 854653071.2244718, 842402803.9000899, 830838253.4144138, 830143002.3536818, 831492285.0310817, 833304371.8781006, 838896092.8621838, 843866088.9578133, 847316792.1429776, 851038022.3643295, 855931698.0149751, 859320543.9795249, 863031001.3470656, 868325062.1832993, 873626971.0115026, 878726636.924209, 884861725.972504, 886920281.5192285, 883056006.5094173, 863719240.7255149, 773378975.9476194], 'var_stat': [9237018652.657722, 9417257721.82426, 10105084297.159702, 11071318522.587782, 12422783727.426847, 13400306419.784964, 14148498843.406874, 14576436982.89939, 14529009036.494726, 14105645932.596651, 13682988821.478252, 13413013425.088106, 13764134927.293928, 14233704806.737064, 14361631309.367067, 14281358385.45644, 13939662689.213865, 13496884231.929493, 13382566162.783987, 12871350930.6626, 12576198160.876635, 13051463889.56708, 12859205935.513906, 
13053861416.098743, 12830323588.550724, 12886405923.897238, 12708529922.84171, 12847306110.231739, 12880398489.53404, 13002566299.565536, 13066708060.463543, 13064231286.858614, 13088983337.353497, 13221393824.891022, 13412425607.755072, 13631485149.777075, 13807797519.156103, 13877277485.033077, 13848613909.96762, 13609176326.2529, 13649815250.130072, 13397698404.696907, 13388964704.359968, 13354326914.968012, 13469861474.898457, 13652539440.283333, 13846837321.329163, 14062143714.601675, 14292571198.61228, 14504626563.299246, 14563864749.132776, 14579720287.991764, 14626700787.353922, 14716185568.128899, 14728532777.28015, 14719101187.113443, 14607945896.239174, 14478517828.531614, 14355110561.681187, 14057430280.249746, 13634284490.879377, 13248236002.494394, 13217602306.335958, 13257856701.946049, 13323688441.072674, 13515395318.023148, 13685827169.67645, 13811622609.426846, 13947347160.615082, 14115883822.884943, 14231204526.433033, 14356066668.651815, 14533604268.238445, 14708971788.69237, 14875667326.732443, 15079098318.79331, 15144888989.667963, 15002658970.504765, 14349232841.34513, 11544480117.013124], 'frame_num': 54068199}\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"import random\n",
|
|
"\n",
|
|
"import numpy as np\n",
|
|
"import paddle\n",
|
|
"from paddle.io import DataLoader\n",
|
|
"from paddle.io import Dataset\n",
|
|
"\n",
|
|
"from deepspeech.frontend.audio import AudioSegment\n",
|
|
"from deepspeech.frontend.utility import load_cmvn\n",
|
|
"from deepspeech.frontend.utility import read_manifest\n",
|
|
"\n",
|
|
"# https://github.com/PaddlePaddle/Paddle/pull/31481\n",
|
"class CollateFunc(object):\n",
"    '''Collate callable that reduces a batch of manifest items to CMVN sums.\n",
"\n",
"    Used as the DataLoader collate_fn in this notebook.\n",
"\n",
"    Args:\n",
"        feature_func: callable applied to each loaded AudioSegment. Its result\n",
"            is summed over axis 1 and is noted as (D, T) by the original\n",
"            author, i.e. features first, frames second.\n",
"    '''\n",
"\n",
"    def __init__(self, feature_func):\n",
"        self.feature_func = feature_func\n",
"\n",
"    def __call__(self, batch):\n",
"        '''Return (frame count, per-row sum, per-row sum of squares).\n",
"\n",
"        Each item must provide the audio location under the 'feat' key.\n",
"        For an empty batch both sums stay None and the count is 0.\n",
"        '''\n",
"        frame_count = 0\n",
"        feat_sum = None\n",
"        feat_sq_sum = None\n",
"        for entry in batch:\n",
"            segment = AudioSegment.from_file(entry['feat'])\n",
"            feat = self.feature_func(segment)  # (D, T)\n",
"\n",
"            item_sum = np.sum(feat, axis=1)\n",
"            item_sq_sum = np.sum(np.square(feat), axis=1)\n",
"            if feat_sum is None:\n",
"                # First item: its arrays become the running accumulators.\n",
"                feat_sum, feat_sq_sum = item_sum, item_sq_sum\n",
"            else:\n",
"                feat_sum += item_sum\n",
"                feat_sq_sum += item_sq_sum\n",
"\n",
"            frame_count += feat.shape[1]\n",
"        return frame_count, feat_sum, feat_sq_sum\n",
|
|
"\n",
|
|
"\n",
|
"class AudioDataset(Dataset):\n",
"    '''Dataset exposing the entries of a manifest, optionally subsampled.\n",
"\n",
"    Args:\n",
"        manifest_path: path handed to read_manifest.\n",
"        num_samples: -1 keeps every entry; any other value is passed to\n",
"            rng.choice as the number of entries to draw without replacement.\n",
"        rng: random generator used for the draw; when falsy, a\n",
"            np.random.RandomState seeded with random_seed is created.\n",
"        random_seed: seed for that fallback generator.\n",
"    '''\n",
"\n",
"    def __init__(self, manifest_path, num_samples=-1, rng=None, random_seed=0):\n",
"        self._rng = rng or np.random.RandomState(random_seed)\n",
"        entries = read_manifest(manifest_path)\n",
"        keep_all = num_samples == -1\n",
"        self.items = entries if keep_all else self._rng.choice(\n",
"            entries, num_samples, replace=False)\n",
"\n",
"    def __len__(self):\n",
"        return len(self.items)\n",
"\n",
"    def __getitem__(self, idx):\n",
"        return self.items[idx]\n",
|
|
" \n",
|
|
" \n",
|
"# '{}' is presumably an empty augmentation config (no augmentors) -- TODO confirm.\n",
"augmentation_pipeline = AugmentationPipeline('{}')\n",
"\n",
"audio_featurizer = AudioFeaturizer(\n",
"    # values fixed for this notebook\n",
"    n_fft=None,\n",
"    max_freq=None,\n",
"    use_dB_normalization=True,\n",
"    target_dB=-20,\n",
"    # values taken from the parsed args\n",
"    specgram_type=args.specgram_type,\n",
"    feat_dim=args.feat_dim,\n",
"    delta_delta=args.delta_delta,\n",
"    stride_ms=args.stride_ms,\n",
"    window_ms=args.window_ms,\n",
"    target_sample_rate=args.sample_rate)\n",
|
|
"\n",
|
"def augment_and_featurize(audio_segment):\n",
"    '''Apply the augmentation pipeline to a segment, then featurize it.\n",
"\n",
"    The return value of transform_audio is not used, so the transform is\n",
"    presumably applied to audio_segment in place -- TODO confirm.\n",
"    '''\n",
"    augmentation_pipeline.transform_audio(audio_segment)\n",
"    features = audio_featurizer.featurize(audio_segment)\n",
"    return features\n",
|
|
"\n",
|
|
"\n",
|
"# Per-batch statistics are computed inside the collate callable, so the\n",
"# loader yields (frame count, sum, square sum) tuples rather than features.\n",
"dataset = AudioDataset(args.manifest_path, args.num_samples)\n",
"collate_func = CollateFunc(augment_and_featurize)\n",
"\n",
"batch_size = 20\n",
"data_loader = DataLoader(\n",
"    dataset,\n",
"    collate_fn=collate_func,\n",
"    num_workers=args.num_workers,\n",
"    shuffle=False,\n",
"    batch_size=batch_size)\n",
|
|
"\n",
|
"# Accumulate the per-batch statistics produced by the collate callable into\n",
"# dataset-wide totals.\n",
"with paddle.no_grad():\n",
"    all_mean_stat = None\n",
"    all_var_stat = None\n",
"    all_number = 0\n",
"    wav_number = 0\n",
"    total_wavs = len(dataset)\n",
"    for i, batch in enumerate(data_loader):\n",
"        number, mean_stat, var_stat = batch\n",
"        if all_mean_stat is None:\n",
"            all_mean_stat = mean_stat\n",
"            all_var_stat = var_stat\n",
"        else:\n",
"            all_mean_stat += mean_stat\n",
"            all_var_stat += var_stat\n",
"        all_number += number\n",
"        # The final batch can hold fewer than batch_size items, so clamp the\n",
"        # running count to the dataset size instead of overcounting.\n",
"        wav_number = min(wav_number + batch_size, total_wavs)\n",
"\n",
"        # Report every 1000 wavs and once more when the last batch is done.\n",
"        if wav_number % 1000 == 0 or wav_number == total_wavs:\n",
"            print('process {} wavs,{} frames'.format(wav_number,\n",
"                                                    all_number))\n",
"\n",
"# Fail with a clear message instead of an AttributeError on None below.\n",
"if all_mean_stat is None:\n",
"    raise ValueError('no audio was processed; check manifest_path and num_samples')\n",
"\n",
"cmvn_info = {\n",
"    'mean_stat': all_mean_stat.tolist(),\n",
"    'var_stat': all_var_stat.tolist(),\n",
"    'frame_num': all_number\n",
"}\n",
"print(cmvn_info)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "unlike-search",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": []
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.7.0"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 5
|
|
}
|