You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
PaddleSpeech/.notebook/dataloader_with_tokens_toke...

1205 lines
58 KiB

{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "medieval-monday",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"/workspace/DeepSpeech-2.x\n"
]
},
{
"data": {
"text/plain": [
"'/workspace/DeepSpeech-2.x'"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%cd ..\n",
"%pwd"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "emerging-meter",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/paddle/fluid/layers/utils.py:26: DeprecationWarning: `np.int` is a deprecated alias for the builtin `int`. To silence this warning, use `int` by itself. Doing this will not modify any behavior and is safe. When replacing `np.int`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.\n",
"Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
" def convert_to_list(value, n, name, dtype=np.int):\n"
]
}
],
"source": [
"import math\n",
"import random\n",
"import tarfile\n",
"import logging\n",
"import numpy as np\n",
"from collections import namedtuple\n",
"from functools import partial\n",
"\n",
"import paddle\n",
"from paddle.io import Dataset\n",
"from paddle.io import DataLoader\n",
"from paddle.io import BatchSampler\n",
"from paddle.io import DistributedBatchSampler\n",
"from paddle import distributed as dist\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "excessive-american",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 3,
"id": "naval-brave",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/ipykernel/ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.\n",
" and should_run_async(code)\n",
"[WARNING 2021/04/16 06:32:09 __init__.py:93] register user softmax to paddle, remove this when fixed!\n",
"[WARNING 2021/04/16 06:32:09 __init__.py:97] register user log_softmax to paddle, remove this when fixed!\n",
"[WARNING 2021/04/16 06:32:09 __init__.py:101] register user sigmoid to paddle, remove this when fixed!\n",
"[WARNING 2021/04/16 06:32:09 __init__.py:105] register user log_sigmoid to paddle, remove this when fixed!\n",
"[WARNING 2021/04/16 06:32:09 __init__.py:109] register user relu to paddle, remove this when fixed!\n",
"[WARNING 2021/04/16 06:32:09 __init__.py:119] override cat of paddle if exists or register, remove this when fixed!\n",
"[WARNING 2021/04/16 06:32:09 __init__.py:133] override item of paddle.Tensor if exists or register, remove this when fixed!\n",
"[WARNING 2021/04/16 06:32:09 __init__.py:144] override long of paddle.Tensor if exists or register, remove this when fixed!\n",
"[WARNING 2021/04/16 06:32:09 __init__.py:164] override new_full of paddle.Tensor if exists or register, remove this when fixed!\n",
"[WARNING 2021/04/16 06:32:09 __init__.py:179] override eq of paddle.Tensor if exists or register, remove this when fixed!\n",
"[WARNING 2021/04/16 06:32:09 __init__.py:185] override eq of paddle if exists or register, remove this when fixed!\n",
"[WARNING 2021/04/16 06:32:09 __init__.py:195] override contiguous of paddle.Tensor if exists or register, remove this when fixed!\n",
"[WARNING 2021/04/16 06:32:09 __init__.py:212] override size of paddle.Tensor (`to_static` do not process `size` property, maybe some `paddle` api dependent on it), remove this when fixed!\n",
"[WARNING 2021/04/16 06:32:09 __init__.py:223] register user view to paddle.Tensor, remove this when fixed!\n",
"[WARNING 2021/04/16 06:32:09 __init__.py:233] register user view_as to paddle.Tensor, remove this when fixed!\n",
"[WARNING 2021/04/16 06:32:09 __init__.py:259] register user masked_fill to paddle.Tensor, remove this when fixed!\n",
"[WARNING 2021/04/16 06:32:09 __init__.py:277] register user masked_fill_ to paddle.Tensor, remove this when fixed!\n",
"[WARNING 2021/04/16 06:32:09 __init__.py:288] register user fill_ to paddle.Tensor, remove this when fixed!\n",
"[WARNING 2021/04/16 06:32:09 __init__.py:298] register user repeat to paddle.Tensor, remove this when fixed!\n",
"[WARNING 2021/04/16 06:32:09 __init__.py:303] register user softmax to paddle.Tensor, remove this when fixed!\n",
"[WARNING 2021/04/16 06:32:09 __init__.py:308] register user sigmoid to paddle.Tensor, remove this when fixed!\n",
"[WARNING 2021/04/16 06:32:09 __init__.py:312] register user relu to paddle.Tensor, remove this when fixed!\n",
"[WARNING 2021/04/16 06:32:09 __init__.py:322] register user type_as to paddle.Tensor, remove this when fixed!\n",
"[WARNING 2021/04/16 06:32:09 __init__.py:337] register user to to paddle.Tensor, remove this when fixed!\n",
"[WARNING 2021/04/16 06:32:09 __init__.py:346] register user float to paddle.Tensor, remove this when fixed!\n",
"[WARNING 2021/04/16 06:32:09 __init__.py:356] register user tolist to paddle.Tensor, remove this when fixed!\n",
"[WARNING 2021/04/16 06:32:09 __init__.py:371] register user glu to paddle.nn.functional, remove this when fixed!\n",
"[WARNING 2021/04/16 06:32:09 __init__.py:422] override ctc_loss of paddle.nn.functional if exists, remove this when fixed!\n",
"[WARNING 2021/04/16 06:32:09 __init__.py:428] register user Module to paddle.nn, remove this when fixed!\n",
"[WARNING 2021/04/16 06:32:09 __init__.py:434] register user ModuleList to paddle.nn, remove this when fixed!\n",
"[WARNING 2021/04/16 06:32:09 __init__.py:450] register user GLU to paddle.nn, remove this when fixed!\n",
"[WARNING 2021/04/16 06:32:09 __init__.py:483] register user ConstantPad2d to paddle.nn, remove this when fixed!\n",
"[WARNING 2021/04/16 06:32:09 __init__.py:489] register user export to paddle.jit, remove this when fixed!\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'num_samples': 5, 'beam_size': 500, 'num_proc_bsearch': 8, 'num_conv_layers': 2, 'num_rnn_layers': 3, 'rnn_layer_size': 2048, 'alpha': 2.5, 'beta': 0.3, 'cutoff_prob': 1.0, 'cutoff_top_n': 40, 'use_gru': False, 'use_gpu': True, 'share_rnn_weights': True, 'unit_type': 'char', 'spm_model_prefix': 'examples/tiny/s1/data/spm_bpe', 'infer_manifest': 'examples/tiny/s1/data/manifest.tiny', 'mean_std_path': 'examples/tiny/s1/data/mean_std.npz', 'vocab_path': 'examples/tiny/s1/data/vocab.txt', 'lang_model_path': 'models/lm/common_crawl_00.prune01111.trie.klm', 'model_path': 'examples/tiny/s1/checkpoints/step_final', 'decoding_method': 'ctc_beam_search', 'error_rate_type': 'wer', 'specgram_type': 'fbank', 'feat_dim': 80, 'delta_delta': False}\n"
]
}
],
"source": [
"import sys\n",
"import argparse\n",
"import functools\n",
"from deepspeech.utils.utility import add_arguments, print_arguments\n",
"\n",
"# Inference options for the `tiny` example.\n",
"# `add_arg(name, type, default, help, **kwargs)` calls the project's\n",
"# `add_arguments` helper with `argparser=parser` already bound.\n",
"parser = argparse.ArgumentParser(description=__doc__)\n",
"add_arg = functools.partial(add_arguments, argparser=parser)\n",
"# yapf: disable\n",
"add_arg('num_samples', int, 5, \"# of samples to infer.\")\n",
"add_arg('beam_size', int, 500, \"Beam search width.\")\n",
"add_arg('num_proc_bsearch', int, 8, \"# of CPUs for beam search.\")\n",
"add_arg('num_conv_layers', int, 2, \"# of convolution layers.\")\n",
"add_arg('num_rnn_layers', int, 3, \"# of recurrent layers.\")\n",
"add_arg('rnn_layer_size', int, 2048, \"# of recurrent cells per layer.\")\n",
"add_arg('alpha', float, 2.5, \"Coef of LM for beam search.\")\n",
"add_arg('beta', float, 0.3, \"Coef of WC for beam search.\")\n",
"add_arg('cutoff_prob', float, 1.0, \"Cutoff probability for pruning.\")\n",
"add_arg('cutoff_top_n', int, 40, \"Cutoff number for pruning.\")\n",
"add_arg('use_gru', bool, False, \"Use GRUs instead of simple RNNs.\")\n",
"add_arg('use_gpu', bool, True, \"Use GPU or not.\")\n",
"add_arg('share_rnn_weights',bool, True, \"Share input-hidden weights across \"\n",
"        \"bi-directional RNNs. Not for GRU.\")\n",
"add_arg('unit_type', str,\n",
"        'char',\n",
"        \"Options: char, word, spm.\",\n",
"        choices=['char', 'word', 'spm'])\n",
"add_arg('spm_model_prefix', str,\n",
"        'examples/tiny/s1/data/spm_bpe',\n",
"        \"spm model prefix.\",)\n",
"add_arg('infer_manifest', str,\n",
"        'examples/tiny/s1/data/manifest.tiny',\n",
"        \"Filepath of manifest to infer.\")\n",
"add_arg('mean_std_path', str,\n",
"        'examples/tiny/s1/data/mean_std.npz',\n",
"        \"Filepath of normalizer's mean & std.\")\n",
"add_arg('vocab_path', str,\n",
"        'examples/tiny/s1/data/vocab.txt',\n",
"        \"Filepath of vocabulary.\")\n",
"add_arg('lang_model_path', str,\n",
"        'models/lm/common_crawl_00.prune01111.trie.klm',\n",
"        \"Filepath for language model.\")\n",
"add_arg('model_path', str,\n",
"        'examples/tiny/s1/checkpoints/step_final',\n",
"        \"If None, the training starts from scratch, \"\n",
"        \"otherwise, it resumes from the pre-trained model.\")\n",
"add_arg('decoding_method', str,\n",
"        'ctc_beam_search',\n",
"        \"Decoding method. Options: ctc_beam_search, ctc_greedy\",\n",
"        choices = ['ctc_beam_search', 'ctc_greedy'])\n",
"add_arg('error_rate_type', str,\n",
"        'wer',\n",
"        \"Error rate type for evaluation.\",\n",
"        choices=['wer', 'cer'])\n",
"# 'fbank' is the default value, so it must also be an accepted choice;\n",
"# otherwise passing the default explicitly would be rejected by argparse.\n",
"add_arg('specgram_type', str,\n",
"        'fbank',\n",
"        \"Audio feature type. Options: linear, mfcc, fbank.\",\n",
"        choices=['linear', 'mfcc', 'fbank'])\n",
"add_arg('feat_dim', int, 80, \"mfcc or fbank feat dim.\")\n",
"add_arg('delta_delta', bool, False, \"delta delta\")\n",
"# yapf: enable\n",
"# Parse an empty argv so the kernel's own command line is ignored and every\n",
"# option takes the default declared above.\n",
"args = parser.parse_args([])\n",
"print(vars(args))"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "wired-principal",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'num_samples': 5, 'beam_size': 500, 'num_proc_bsearch': 8, 'num_conv_layers': 2, 'num_rnn_layers': 3, 'rnn_layer_size': 2048, 'alpha': 2.5, 'beta': 0.3, 'cutoff_prob': 1.0, 'cutoff_top_n': 40, 'use_gru': False, 'use_gpu': True, 'share_rnn_weights': True, 'unit_type': 'char', 'spm_model_prefix': 'examples/aishell/s1/data/spm_bpe', 'infer_manifest': 'examples/aishell/s1/data/manifest.test', 'mean_std_path': '', 'vocab_path': 'examples/aishell/s1/data/vocab.txt', 'lang_model_path': 'models/lm/common_crawl_00.prune01111.trie.klm', 'model_path': 'examples/aishell/s1/checkpoints/step_final', 'decoding_method': 'ctc_beam_search', 'error_rate_type': 'wer', 'specgram_type': 'fbank', 'feat_dim': 80, 'delta_delta': False}\n"
]
}
],
"source": [
"import sys\n",
"import argparse\n",
"import functools\n",
"from deepspeech.utils.utility import add_arguments, print_arguments\n",
"\n",
"# Inference options for the `aishell` example; rebinds `parser` and `args`.\n",
"# `add_arg(name, type, default, help, **kwargs)` calls the project's\n",
"# `add_arguments` helper with `argparser=parser` already bound.\n",
"parser = argparse.ArgumentParser(description=__doc__)\n",
"add_arg = functools.partial(add_arguments, argparser=parser)\n",
"# yapf: disable\n",
"add_arg('num_samples', int, 5, \"# of samples to infer.\")\n",
"add_arg('beam_size', int, 500, \"Beam search width.\")\n",
"add_arg('num_proc_bsearch', int, 8, \"# of CPUs for beam search.\")\n",
"add_arg('num_conv_layers', int, 2, \"# of convolution layers.\")\n",
"add_arg('num_rnn_layers', int, 3, \"# of recurrent layers.\")\n",
"add_arg('rnn_layer_size', int, 2048, \"# of recurrent cells per layer.\")\n",
"add_arg('alpha', float, 2.5, \"Coef of LM for beam search.\")\n",
"add_arg('beta', float, 0.3, \"Coef of WC for beam search.\")\n",
"add_arg('cutoff_prob', float, 1.0, \"Cutoff probability for pruning.\")\n",
"add_arg('cutoff_top_n', int, 40, \"Cutoff number for pruning.\")\n",
"add_arg('use_gru', bool, False, \"Use GRUs instead of simple RNNs.\")\n",
"add_arg('use_gpu', bool, True, \"Use GPU or not.\")\n",
"add_arg('share_rnn_weights',bool, True, \"Share input-hidden weights across \"\n",
"        \"bi-directional RNNs. Not for GRU.\")\n",
"add_arg('unit_type', str,\n",
"        'char',\n",
"        \"Options: char, word, spm.\",\n",
"        choices=['char', 'word', 'spm'])\n",
"add_arg('spm_model_prefix', str,\n",
"        'examples/aishell/s1/data/spm_bpe',\n",
"        \"spm model prefix.\",)\n",
"add_arg('infer_manifest', str,\n",
"        'examples/aishell/s1/data/manifest.test',\n",
"        \"Filepath of manifest to infer.\")\n",
"# The default is the empty string; the usual stats path is only mentioned in\n",
"# the help text.\n",
"add_arg('mean_std_path', str,\n",
"        '',\n",
"        \"examples/aishell/s1/data/mean_std.npz, Filepath of normalizer's mean & std.\")\n",
"add_arg('vocab_path', str,\n",
"        'examples/aishell/s1/data/vocab.txt',\n",
"        \"Filepath of vocabulary.\")\n",
"add_arg('lang_model_path', str,\n",
"        'models/lm/common_crawl_00.prune01111.trie.klm',\n",
"        \"Filepath for language model.\")\n",
"add_arg('model_path', str,\n",
"        'examples/aishell/s1/checkpoints/step_final',\n",
"        \"If None, the training starts from scratch, \"\n",
"        \"otherwise, it resumes from the pre-trained model.\")\n",
"add_arg('decoding_method', str,\n",
"        'ctc_beam_search',\n",
"        \"Decoding method. Options: ctc_beam_search, ctc_greedy\",\n",
"        choices = ['ctc_beam_search', 'ctc_greedy'])\n",
"add_arg('error_rate_type', str,\n",
"        'wer',\n",
"        \"Error rate type for evaluation.\",\n",
"        choices=['wer', 'cer'])\n",
"# Help text now lists every accepted choice, including the 'fbank' default.\n",
"add_arg('specgram_type', str,\n",
"        'fbank',\n",
"        \"Audio feature type. Options: linear, mfcc, fbank.\",\n",
"        choices=['linear', 'mfcc', 'fbank'])\n",
"add_arg('feat_dim', int, 80, \"mfcc or fbank feat dim.\")\n",
"add_arg('delta_delta', bool, False, \"delta delta\")\n",
"# yapf: enable\n",
"# Parse an empty argv so the kernel's own command line is ignored and every\n",
"# option takes the default declared above.\n",
"args = parser.parse_args([])\n",
"print(vars(args))"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "bearing-physics",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/scipy/fftpack/__init__.py:103: DeprecationWarning: The module numpy.dual is deprecated. Instead of using dual, use the functions directly from numpy or scipy.\n",
" from numpy.dual import register_func\n",
"/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/scipy/special/orthogonal.py:81: DeprecationWarning: `np.int` is a deprecated alias for the builtin `int`. To silence this warning, use `int` by itself. Doing this will not modify any behavior and is safe. When replacing `np.int`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.\n",
"Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
" from numpy import (exp, inf, pi, sqrt, floor, sin, cos, around, int,\n",
"/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/numba/core/types/__init__.py:108: DeprecationWarning: `np.long` is a deprecated alias for `np.compat.long`. To silence this warning, use `np.compat.long` by itself. In the likely event your code does not need to work on Python 2 you can use the builtin `int` for which `np.compat.long` is itself an alias. Doing this will not modify any behaviour and is safe. When replacing `np.long`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.\n",
"Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
" long_ = _make_signed(np.long)\n",
"/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/numba/core/types/__init__.py:109: DeprecationWarning: `np.long` is a deprecated alias for `np.compat.long`. To silence this warning, use `np.compat.long` by itself. In the likely event your code does not need to work on Python 2 you can use the builtin `int` for which `np.compat.long` is itself an alias. Doing this will not modify any behaviour and is safe. When replacing `np.long`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.\n",
"Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
" ulong = _make_unsigned(np.long)\n"
]
}
],
"source": [
"from deepspeech.frontend.utility import read_manifest\n",
"from deepspeech.frontend.augmentor.augmentation import AugmentationPipeline\n",
"from deepspeech.frontend.featurizer.speech_featurizer import SpeechFeaturizer\n",
"from deepspeech.frontend.speech import SpeechSegment\n",
"from deepspeech.frontend.normalizer import FeatureNormalizer\n",
"\n",
"\n",
"from deepspeech.io.collator import SpeechCollator\n",
"from deepspeech.io.dataset import ManifestDataset\n",
"from deepspeech.io.sampler import (\n",
" SortagradDistributedBatchSampler,\n",
" SortagradBatchSampler,\n",
")\n",
"from deepspeech.io import create_dataloader\n",
"# Build the dataloader from the parsed `args`; keyword arguments are grouped\n",
"# by purpose, values are passed through unchanged.\n",
"batch_reader = create_dataloader(\n",
"    # --- input files and text unit ---\n",
"    manifest_path=args.infer_manifest,\n",
"    vocab_filepath=args.vocab_path,\n",
"    mean_std_filepath=args.mean_std_path,\n",
"    spm_model_prefix=args.spm_model_prefix,\n",
"    unit_type=args.unit_type,\n",
"    # --- feature settings ---\n",
"    specgram_type=args.specgram_type,\n",
"    feat_dim=args.feat_dim,\n",
"    delta_delta=args.delta_delta,\n",
"    stride_ms=10.0,\n",
"    window_ms=20.0,\n",
"    max_freq=None,\n",
"    use_dB_normalization=True,\n",
"    augmentation_config='{}',\n",
"    # --- input / output length bounds ---\n",
"    min_input_len=0.0,\n",
"    max_input_len=27.0,\n",
"    min_output_len=0.0,\n",
"    max_output_len=float('inf'),\n",
"    min_output_input_ratio=0.0,\n",
"    max_output_input_ratio=float('inf'),\n",
"    # --- batching and ordering ---\n",
"    batch_size=args.num_samples,\n",
"    num_workers=0,\n",
"    sortagrad=True,\n",
"    shuffle_method=None,\n",
"    dist=False,\n",
"    is_training=False,\n",
"    random_seed=0,\n",
"    keep_transcription_text=True)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "classified-melissa",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/ipykernel/ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.\n",
" and should_run_async(code)\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"fbank\n",
"[232 387 331 ... 249 249 262] int16\n",
"fbank\n",
"[-138 -219 -192 ... 338 324 351] int16\n",
"fbank\n",
"[ 694 1175 1022 ... 553 514 627] int16\n",
"fbank\n",
"[-39 -79 -53 ... 139 172 99] int16\n",
"fbank\n",
"[-277 -480 -425 ... 758 767 739] int16\n",
"fbank\n",
"[ 399 693 609 ... 1291 1270 1291] int16\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/paddle/fluid/dataloader/dataloader_iter.py:354: DeprecationWarning: `np.object` is a deprecated alias for the builtin `object`. To silence this warning, use `object` by itself. Doing this will not modify any behavior and is safe. \n",
"Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
" if arr.dtype == np.object:\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"fbank\n",
"[ -750 -1254 -1107 ... 2276 1889 2067] int16\n",
"fbank\n",
"[ -127 -199 -149 ... -5243 -5065 -5398] int16\n",
"fbank\n",
"[ 465 783 677 ... 980 903 1008] int16\n",
"fbank\n",
"[ 90 160 157 ... -2 -16 -21] int16\n",
"fbank\n",
"[ 213 345 295 ... 2483 2246 2501] int16\n",
"fbank\n",
"[ -86 -159 -131 ... 270 258 290] int16\n",
"fbank\n",
"[-1023 -1714 -1505 ... 1532 1596 1575] int16\n",
"fbank\n",
"[-366 -602 -527 ... 374 370 379] int16\n",
"fbank\n",
"[ 761 1275 1127 ... 369 413 295] int16\n",
"fbank\n",
"[382 621 550 ... 161 161 174] int16\n",
"fbank\n",
"[ -28 -91 -120 ... 28 34 11] int16\n",
"fbank\n",
"[ -5 -5 -5 ... 268 294 341] int16\n",
"fbank\n",
"[240 417 684 ... 267 262 219] int16\n",
"fbank\n",
"[131 206 194 ... 383 320 343] int16\n",
"test: Tensor(shape=[5, 7], dtype=int32, place=CUDAPinnedPlace, stop_gradient=True,\n",
" [[31069, 21487, 29233, 30340, 20320, -1 , -1 ],\n",
" [20540, 24471, 19968, 25552, 30340, 26159, -1 ],\n",
" [36825, 20010, 31243, 24230, 26159, 32654, 30340],\n",
" [20108, 21040, 20108, -1 , -1 , -1 , -1 ],\n",
" [21435, 34892, 25919, 21270, -1 , -1 , -1 ]])\n",
"fbank\n",
"[1155 1890 1577 ... 1092 989 1130] int16\n",
"fbank\n",
"[296 358 296 ... 140 140 168] int16\n",
"fbank\n",
"[-50 -91 -63 ... 104 104 86] int16\n",
"fbank\n",
"[-37 -66 -50 ... -31 -45 -52] int16\n",
"fbank\n",
"[-401 -652 -547 ... -339 -307 -344] int16\n",
"fbank\n",
"[-21 -47 -51 ... 94 81 107] int16\n",
"fbank\n",
"[ 533 887 755 ... 3074 2853 3254] int16\n",
"fbank\n",
"[ 44 71 66 ... -628 -733 -601] int16\n",
"fbank\n",
"[ 50 86 79 ... 129 116 138] int16\n",
"fbank\n",
"[ 92 146 126 ... -208 -193 -179] int16\n",
"test raw: 祝可爱的你\n",
"test raw: 去行政化\n",
"audio len: Tensor(shape=[5], dtype=int64, place=CUDAPinnedPlace, stop_gradient=True,\n",
" [184, 194, 196, 204, 207])\n",
"test len: Tensor(shape=[5], dtype=int64, place=CUDAPlace(0), stop_gradient=True,\n",
" [5, 6, 7, 3, 4])\n",
"audio: Tensor(shape=[5, 207, 80], dtype=float32, place=CUDAPinnedPlace, stop_gradient=True,\n",
" [[[12.25633812, 12.61639309, 10.36936474, ..., 13.02949619, 11.51365757, 10.59789085],\n",
" [13.32148266, 13.41071606, 11.43800735, ..., 13.69783783, 12.83939362, 11.51259613],\n",
" [12.62640572, 12.53621101, 10.97212505, ..., 13.33757591, 12.32293034, 10.75493717],\n",
" ...,\n",
" [0. , 0. , 0. , ..., 0. , 0. , 0. ],\n",
" [0. , 0. , 0. , ..., 0. , 0. , 0. ],\n",
" [0. , 0. , 0. , ..., 0. , 0. , 0. ]],\n",
"\n",
" [[10.99619484, 11.35202599, 9.56922054 , ..., 9.94971657 , 9.88354111 , 9.55315971 ],\n",
" [10.44461155, 9.81688595 , 5.62538481 , ..., 10.60468388, 10.94417381, 9.42646980 ],\n",
" [10.23835754, 10.23407459, 7.99464273 , ..., 10.68097591, 9.91640091 , 10.04131031],\n",
" ...,\n",
" [0. , 0. , 0. , ..., 0. , 0. , 0. ],\n",
" [0. , 0. , 0. , ..., 0. , 0. , 0. ],\n",
" [0. , 0. , 0. , ..., 0. , 0. , 0. ]],\n",
"\n",
" [[14.10299397, 14.50298119, 12.87738323, ..., 12.62796497, 12.69949627, 11.43171215],\n",
" [13.85035992, 13.15289116, 10.66541386, ..., 13.34364223, 13.46972179, 11.02160740],\n",
" [13.19866467, 13.23537827, 11.65760899, ..., 12.72559357, 12.42716217, 11.74562359],\n",
" ...,\n",
" [0. , 0. , 0. , ..., 0. , 0. , 0. ],\n",
" [0. , 0. , 0. , ..., 0. , 0. , 0. ],\n",
" [0. , 0. , 0. , ..., 0. , 0. , 0. ]],\n",
"\n",
" [[12.85668373, 12.82431412, 11.68144703, ..., 14.10119247, 15.12791920, 13.68221378],\n",
" [13.19507027, 13.40244961, 11.43618393, ..., 13.32919979, 13.68267441, 12.73429012],\n",
" [13.02173328, 12.92082500, 11.44303989, ..., 12.77793121, 13.10915661, 11.77327728],\n",
" ...,\n",
" [0. , 0. , 0. , ..., 0. , 0. , 0. ],\n",
" [0. , 0. , 0. , ..., 0. , 0. , 0. ],\n",
" [0. , 0. , 0. , ..., 0. , 0. , 0. ]],\n",
"\n",
" [[12.90771198, 13.40234852, 13.01435471, ..., 13.80359459, 14.08088684, 13.17883396],\n",
" [14.06678009, 14.06943512, 12.52837276, ..., 13.66423225, 13.66300583, 13.60142994],\n",
" [12.58743191, 12.94520760, 11.75190544, ..., 14.28828907, 14.08229160, 13.02433395],\n",
" ...,\n",
" [16.20896912, 16.42283821, 14.94358730, ..., 12.91146755, 12.66766262, 11.76361752],\n",
" [13.49324894, 14.14653301, 13.16490936, ..., 13.23435783, 13.45378494, 12.60386276],\n",
" [15.56288910, 15.92445087, 14.90794277, ..., 13.43840790, 13.41075516, 12.55605984]]])\n"
]
}
],
"source": [
"# Inspect only the first batch produced by `batch_reader`.\n",
"for idx, (audio, audio_len, text, text_len) in enumerate(batch_reader()):\n",
"    print('test:', text)\n",
"    # Decode the first and the last transcript of the batch, each cut to its\n",
"    # own length from `text_len`; ids are turned into characters with chr().\n",
"    for row in (0, -1):\n",
"        valid_ids = text[row][:int(text_len[row])]\n",
"        print(\"test raw:\", ''.join(map(chr, valid_ids)))\n",
"    print('audio len:', audio_len)\n",
"    print('test len:', text_len)\n",
"    print('audio:', audio)\n",
"    break"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "unexpected-skating",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 7,
"id": "minus-modern",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"fbank\n",
"[232 387 331 ... 249 249 262] int16\n",
"fbank\n",
"[-138 -219 -192 ... 338 324 351] int16\n",
"fbank\n",
"[ 694 1175 1022 ... 553 514 627] int16\n",
"fbank\n",
"[-39 -79 -53 ... 139 172 99] int16\n",
"fbank\n",
"[-277 -480 -425 ... 758 767 739] int16\n",
"fbank\n",
"test: Tensor(shape=[5, 7], dtype=int32, place=CUDAPinnedPlace, stop_gradient=True,\n",
" [[2695, 505, 2332, 2553, 169, -1 , -1 ],\n",
" [ 230, 1237, 2 , 1556, 2553, 1694, -1 ],\n",
" [3703, 28 , 2739, 1172, 1694, 2966, 2553],\n",
" [ 70 , 355, 70 , -1 , -1 , -1 , -1 ],\n",
" [ 477, 3363, 1621, 412, -1 , -1 , -1 ]])\n",
"[ 399 693 609 ... 1291 1270 1291] int16\n",
"test raw: ઇǹज৹©\n",
"test raw: ǝണٕƜ\n",
"test len: Tensor(shape=[5], dtype=int64, place=CUDAPlace(0), stop_gradient=True,\n",
" [5, 6, 7, 3, 4])\n",
"audio: Tensor(shape=[5, 207, 80], dtype=float32, place=CUDAPinnedPlace, stop_gradient=True,\n",
" [[[12.25794601, 12.61855793, 10.37306023, ..., 13.12571049, 11.53678799, 10.32210350],\n",
" [13.32333183, 13.41336918, 11.44248962, ..., 13.65861225, 12.79308128, 11.31168747],\n",
" [12.62584686, 12.53506088, 10.96861362, ..., 13.32526493, 12.41560936, 10.71458912],\n",
" ...,\n",
" [0. , 0. , 0. , ..., 0. , 0. , 0. ],\n",
" [0. , 0. , 0. , ..., 0. , 0. , 0. ],\n",
" [0. , 0. , 0. , ..., 0. , 0. , 0. ]],\n",
"\n",
" [[11.00003052, 11.35529137, 9.56384087 , ..., 10.06063652, 10.16322994, 9.43149185 ],\n",
" [10.44556236, 9.81155300 , 5.49400425 , ..., 10.84116268, 11.02734756, 9.42253590 ],\n",
" [10.23620510, 10.23321152, 7.99466419 , ..., 10.93381882, 10.28395081, 10.00841141],\n",
" ...,\n",
" [0. , 0. , 0. , ..., 0. , 0. , 0. ],\n",
" [0. , 0. , 0. , ..., 0. , 0. , 0. ],\n",
" [0. , 0. , 0. , ..., 0. , 0. , 0. ]],\n",
"\n",
" [[14.10379314, 14.50375748, 12.87825108, ..., 12.68065739, 12.62359715, 11.53773308],\n",
" [13.84964657, 13.15079498, 10.67198086, ..., 13.24875164, 13.45796680, 10.97363472],\n",
" [13.19808197, 13.23482990, 11.65900230, ..., 12.70375061, 12.41395664, 11.88668156],\n",
" ...,\n",
" [0. , 0. , 0. , ..., 0. , 0. , 0. ],\n",
" [0. , 0. , 0. , ..., 0. , 0. , 0. ],\n",
" [0. , 0. , 0. , ..., 0. , 0. , 0. ]],\n",
"\n",
" [[12.85676289, 12.82410812, 11.67961884, ..., 14.12018299, 15.14850044, 13.80065727],\n",
" [13.19532776, 13.40243340, 11.43492508, ..., 13.29144669, 13.70278549, 12.67841339],\n",
" [13.02196407, 12.92111111, 11.43998623, ..., 12.71165752, 13.16518497, 11.92028046],\n",
" ...,\n",
" [0. , 0. , 0. , ..., 0. , 0. , 0. ],\n",
" [0. , 0. , 0. , ..., 0. , 0. , 0. ],\n",
" [0. , 0. , 0. , ..., 0. , 0. , 0. ]],\n",
"\n",
" [[12.90661621, 13.40162563, 13.01394463, ..., 13.84056377, 14.11240959, 13.21227264],\n",
" [14.06642914, 14.06922340, 12.52955723, ..., 13.55829811, 13.60157204, 13.50268650],\n",
" [12.58881378, 12.94780254, 11.75758171, ..., 14.29055786, 14.12165928, 13.02695847],\n",
" ...,\n",
" [16.20891571, 16.42290306, 14.94398117, ..., 12.86083794, 12.63515949, 11.67581463],\n",
" [13.49345875, 14.14656067, 13.16498375, ..., 13.28024578, 13.40956783, 12.70357513],\n",
" [15.56265163, 15.92387581, 14.90643024, ..., 13.45694065, 13.44703197, 12.81099033]]])\n",
"audio len: Tensor(shape=[5], dtype=int64, place=CUDAPinnedPlace, stop_gradient=True,\n",
" [184, 194, 196, 204, 207])\n"
]
}
],
"source": [
"keep_transcription_text=False\n",
"batch_reader = create_dataloader(\n",
"    manifest_path=args.infer_manifest,\n",
"    unit_type=args.unit_type,\n",
"    vocab_filepath=args.vocab_path,\n",
"    mean_std_filepath=args.mean_std_path,\n",
"    spm_model_prefix=args.spm_model_prefix,\n",
"    augmentation_config='{}',\n",
"    max_input_len=27.0,\n",
"    min_input_len=0.0,\n",
"    max_output_len=float('inf'),\n",
"    min_output_len=0.0,\n",
"    max_output_input_ratio=float('inf'),\n",
"    min_output_input_ratio=0.0,\n",
"    stride_ms=10.0,\n",
"    window_ms=20.0,\n",
"    max_freq=None,\n",
"    specgram_type=args.specgram_type,\n",
"    feat_dim=args.feat_dim,\n",
"    delta_delta=args.delta_delta,\n",
"    use_dB_normalization=True,\n",
"    random_seed=0,\n",
"    keep_transcription_text=keep_transcription_text,\n",
"    is_training=False,\n",
"    batch_size=args.num_samples,\n",
"    num_workers=0,\n",
"    sortagrad=True,\n",
"    shuffle_method=None,\n",
"    dist=False)\n",
"\n",
"# With keep_transcription_text=False the `text` tensor holds vocabulary ids,\n",
"# so passing them to chr() prints unrelated Unicode characters. Look the ids\n",
"# up in the vocabulary file instead.\n",
"# NOTE(review): assumes line i of args.vocab_path is the token for id i --\n",
"# confirm against the text featurizer used by create_dataloader.\n",
"with open(args.vocab_path, encoding='utf-8') as vocab_file:\n",
"    id2token = [line.rstrip('\\r\\n') for line in vocab_file]\n",
"\n",
"\n",
"def decode_transcript(token_ids):\n",
"    \"\"\"Join a sequence of unpadded text ids into a printable string.\n",
"\n",
"    Uses chr() when `keep_transcription_text` is true (the previous behaviour\n",
"    of this cell) and the `id2token` table otherwise.\n",
"    \"\"\"\n",
"    if keep_transcription_text:\n",
"        return ''.join(chr(i) for i in token_ids)\n",
"    return ''.join(id2token[int(i)] for i in token_ids)\n",
"\n",
"\n",
"# Inspect only the first batch.\n",
"for idx, (audio, audio_len, text, text_len) in enumerate(batch_reader()):\n",
"    print('test:', text)\n",
"    # Slice off the padding using each row's own length before decoding.\n",
"    print(\"test raw:\", decode_transcript(text[0][:int(text_len[0])]))\n",
"    print(\"test raw:\", decode_transcript(text[-1][:int(text_len[-1])]))\n",
"    print('test len:', text_len)\n",
"    print('audio:', audio)\n",
"    print('audio len:', audio_len)\n",
"    break"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "competitive-mounting",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 8,
"id": "knowing-military",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'num_samples': 1, 'specgram_type': 'fbank', 'feat_dim': 80, 'delta_delta': False, 'stride_ms': 10.0, 'window_ms': 25.0, 'sample_rate': 16000, 'manifest_path': 'examples/aishell/s1/data/manifest.train', 'output_path': 'examples/aishell/s1/data/mean_std.npz'}\n"
]
}
],
"source": [
"import sys\n",
"import argparse\n",
"import functools\n",
"from deepspeech.utils.utility import add_arguments, print_arguments\n",
"\n",
"# Options for the mean/std computation below; rebinds `parser` and `args`,\n",
"# replacing any `args` namespace created by an earlier cell.\n",
"parser = argparse.ArgumentParser(description=__doc__)\n",
"add_arg = functools.partial(add_arguments, argparser=parser)\n",
"\n",
"add_arg('num_samples', int, 1, \"# of samples to use for statistics.\")\n",
"add_arg('specgram_type', str, 'fbank',\n",
"        \"Audio feature type. Options: linear, mfcc, fbank.\",\n",
"        choices=['linear', 'mfcc', 'fbank'])\n",
"add_arg('feat_dim', int, 80, \"Audio feature dim.\")\n",
"add_arg('delta_delta', bool, False,\"Audio feature with delta delta.\")\n",
"add_arg('stride_ms', float, 10.0, \"stride length in ms.\")\n",
"# Help text previously repeated the stride description.\n",
"add_arg('window_ms', float, 25.0, \"window length in ms.\")\n",
"add_arg('sample_rate', int, 16000, \"target sample rate.\")\n",
"add_arg('manifest_path', str,\n",
"        'examples/aishell/s1/data/manifest.train',\n",
"        \"Filepath of manifest to compute normalizer's mean and stddev.\")\n",
"add_arg('output_path', str,\n",
"        'examples/aishell/s1/data/mean_std.npz',\n",
"        \"Filepath to write mean and stddev to (.npz).\")\n",
"# Parse an empty argv so the kernel's own command line is ignored.\n",
"args = parser.parse_args([])\n",
"print(vars(args))\n"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "unnecessary-province",
"metadata": {},
"outputs": [],
"source": [
"\n",
"from deepspeech.frontend.augmentor.augmentation import AugmentationPipeline\n",
"from deepspeech.frontend.featurizer.audio_featurizer import AudioFeaturizer\n",
"from deepspeech.frontend.normalizer import FeatureNormalizer\n",
"from deepspeech.frontend.audio import AudioSegment\n",
"from deepspeech.frontend.utility import load_cmvn\n",
"from deepspeech.frontend.utility import read_manifest\n",
"\n",
"\n",
"\n",
"def mean(args):\n",
"    \"\"\"Build a FeatureNormalizer from a manifest and write it to disk.\n",
"\n",
"    Reads specgram_type, feat_dim, delta_delta, stride_ms, window_ms,\n",
"    sample_rate, manifest_path, num_samples and output_path from `args`.\n",
"    \"\"\"\n",
"    pipeline = AugmentationPipeline('{}')\n",
"    featurizer_options = dict(\n",
"        specgram_type=args.specgram_type,\n",
"        feat_dim=args.feat_dim,\n",
"        delta_delta=args.delta_delta,\n",
"        stride_ms=args.stride_ms,\n",
"        window_ms=args.window_ms,\n",
"        n_fft=None,\n",
"        max_freq=None,\n",
"        target_sample_rate=args.sample_rate,\n",
"        use_dB_normalization=True,\n",
"        target_dB=-20,\n",
"        dither=0.0)\n",
"    featurizer = AudioFeaturizer(**featurizer_options)\n",
"\n",
"    def augment_and_featurize(audio_segment):\n",
"        # The pipeline's return value is ignored; the same segment object is\n",
"        # then handed to the featurizer.\n",
"        pipeline.transform_audio(audio_segment)\n",
"        return featurizer.featurize(audio_segment)\n",
"\n",
"    # No existing stats file is supplied (mean_std_filepath=None); the\n",
"    # normalizer is given the manifest and the featurize callback instead.\n",
"    stats = FeatureNormalizer(\n",
"        mean_std_filepath=None,\n",
"        manifest_path=args.manifest_path,\n",
"        featurize_func=augment_and_featurize,\n",
"        num_samples=args.num_samples)\n",
"    stats.write_to_file(args.output_path)\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "interested-camping",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[0.00164795 0.00274658 0.00234985 ... 0.00177002 0.00177002 0.00186157]\n",
"[54. 90. 77. ... 58. 58. 61.]\n",
"29746\n",
"fbank\n",
"[54 90 77 ... 58 58 61] int16\n",
"(184, 80) float64\n",
"[[10.61737914 10.07708936 5.32487528 ... 10.2481839 8.89699394\n",
" 7.80671114]\n",
" [11.0440077 10.3180721 6.30866128 ... 11.23730926 10.35838868\n",
" 8.83860079]\n",
" [10.26930555 9.99636567 7.3296638 ... 10.45131595 9.69295303\n",
" 7.96168491]\n",
" ...\n",
" [10.14497345 9.88674207 6.73801138 ... 10.21580627 9.00343472\n",
" 8.75616521]\n",
" [ 9.97745961 9.67949736 7.90660425 ... 10.22436653 9.59456493\n",
" 7.69287184]\n",
" [ 6.47357374 7.76335491 7.75765843 ... 9.96522077 9.6226365\n",
" 8.16007108]]\n",
"(184, 80) float64\n",
"[[10.61737914 10.07708936 5.32487528 ... 10.2481839 8.89699394\n",
" 7.80671114]\n",
" [11.0440077 10.3180721 6.30866128 ... 11.23730926 10.35838868\n",
" 8.83860079]\n",
" [10.26930555 9.99636567 7.3296638 ... 10.45131595 9.69295303\n",
" 7.96168491]\n",
" ...\n",
" [10.14497345 9.88674207 6.73801138 ... 10.21580627 9.00343472\n",
" 8.75616521]\n",
" [ 9.97745961 9.67949736 7.90660425 ... 10.22436653 9.59456493\n",
" 7.69287184]\n",
" [ 6.47357374 7.76335491 7.75765843 ... 9.96522077 9.6226365\n",
" 8.16007108]]\n"
]
}
],
"source": [
"# One AISHELL test utterance (absolute path on the original machine); later cells reuse `wav`.\n",
"wav = '/workspace/DeepSpeech-2.x/examples/aishell/s1/../../..//examples/dataset/aishell/data_aishell/wav/test/S0916/BAC009S0916W0426.wav'\n",
"test = '祝可爱的你'\n",
"\n",
"# Same settings as in mean(), except that dB normalization is switched off here.\n",
"featurizer_options = dict(\n",
"    specgram_type=args.specgram_type,\n",
"    feat_dim=args.feat_dim,\n",
"    delta_delta=args.delta_delta,\n",
"    stride_ms=args.stride_ms,\n",
"    window_ms=args.window_ms,\n",
"    n_fft=None,\n",
"    max_freq=None,\n",
"    target_sample_rate=args.sample_rate,\n",
"    use_dB_normalization=False,\n",
"    target_dB=-20,\n",
"    dither=0.0)\n",
"audio_featurizer = AudioFeaturizer(**featurizer_options)\n",
"\n",
"samples = AudioSegment.from_file(wav)\n",
"float_samples = samples._samples\n",
"print(float_samples)\n",
"# Multiplying by 2**15 shows the same samples on the int16 scale.\n",
"print(float_samples * 2**15)\n",
"print(len(float_samples))\n",
"\n",
"# Transpose the featurizer output; the stored output shows (184, 80) afterwards.\n",
"feat = audio_featurizer.featurize(samples, False, False).T\n",
"print(feat.shape, feat.dtype)\n",
"print(feat)\n",
"\n",
"from python_speech_features import logfbank\n",
"\n",
"# Reference computation done directly with python_speech_features on the int16\n",
"# samples; in the stored output it prints the same matrix as `feat` above.\n",
"max_freq = args.sample_rate / 2\n",
"logfbank_options = dict(\n",
"    signal=samples.to('int16'),\n",
"    samplerate=args.sample_rate,\n",
"    winlen=0.001 * args.window_ms,\n",
"    winstep=0.001 * args.stride_ms,\n",
"    nfilt=args.feat_dim,\n",
"    nfft=512,\n",
"    lowfreq=20,\n",
"    highfreq=max_freq,\n",
"    preemph=0.97,\n",
"    dither=0.0,\n",
"    wintype='povey')\n",
"fbank_feat = logfbank(**logfbank_options)\n",
"print(fbank_feat.shape, fbank_feat.dtype)\n",
"print(fbank_feat)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "numeric-analyst",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(184, 160)\n",
"[ 8.59522397 8.43148278 8.36414052 8.45487173 8.31761643 8.04843683\n",
" 8.01683696 7.6574614 7.95521932 8.22945157 10.20138275 9.0447775\n",
" 9.14763398 9.18184349 9.03801065 9.04852307 8.67706728 8.71894271\n",
" 9.54553655 9.19535135 8.76413076 8.47828946 8.52586143 8.49469288\n",
" 8.72461247 8.28562879 8.11581393 7.99922156 7.91023364 8.04142296\n",
" 7.89762773 7.76257636 8.32043745 8.01592886 8.34109665 8.90115454\n",
" 8.48246945 7.98658664 8.05745122 8.11384088 8.18864479 8.8091827\n",
" 11.8067711 13.25258218 14.44311795 13.90515283 14.00120623 13.99801252\n",
" 13.81595394 13.6379904 13.3574897 13.14933334 12.96518543 13.02601156\n",
" 12.70246737 12.54410834 12.15615068 11.86574681 11.67497882 10.79645481\n",
" 10.48150035 10.03758575 10.05637027 9.92891308 10.06923218 12.43382431\n",
" 12.71428321 14.33135052 13.94470959 14.29188291 14.11483993 14.03496606\n",
" 13.78167331 13.66701466 14.40308625 14.73934137 15.09569382 14.89565815\n",
" 15.10519995 14.94383582 15.03275563 15.42194679 15.29219967 15.41602274\n",
" 15.39242545 15.76836177 16.259222 16.47777231 17.03366795 17.46165793\n",
" 17.52596217 17.78844031 17.99878075 18.11446843 17.95761578 17.99900337\n",
" 17.86282737 17.7290163 17.47686504 17.43425516 17.07750485 16.64395242\n",
" 15.68217043 14.90058399 14.45645737 14.0405463 14.89549542 16.00405781\n",
" 16.27301689 16.37572895 16.31219037 16.31765447 16.44819716 16.36281089\n",
" 16.24932823 15.79302555 14.76361963 13.95761882 13.48917053 13.45543501\n",
" 13.00091327 13.13854248 13.74596395 13.86340629 14.00656109 13.77432101\n",
" 13.64267001 13.35742634 13.23042234 12.97916104 12.80694468 12.70005006\n",
" 13.2802483 13.22644525 13.14579624 13.02536594 13.36511022 11.37167205\n",
" 12.11598045 12.47619798 12.83885973 11.63880287 11.42083924 11.08747705\n",
" 11.04093403 11.11263149 10.74353319 10.58734669 10.46180738 10.34157335\n",
" 9.63131146 9.70582692 9.29059204 8.94583657 8.66065094 8.46799095\n",
" 8.25064103 8.30239167 8.19463371 8.12104567 8.02731234 8.06412715\n",
" 7.84889951 7.73090283 7.74119562 7.85444657 7.80717312 7.7129933\n",
" 7.84087442 7.77907788 7.60660865 7.55051479 7.458385 7.496416\n",
" 7.69519793 7.49086759 7.32199493 8.01617458 7.58525375 7.06661122\n",
" 6.94653756 7.19874283 7.28515661 7.17574078]\n",
"(184,)\n",
"(184,)\n",
"[1.48370471 1.52174523 1.46984238 1.67010478 1.88757689 1.68825992\n",
" 1.74270259 1.55497318 1.29200818 1.68446481 1.88133219 1.97138928\n",
" 2.15910096 2.3149476 1.9820247 2.07694378 1.93498835 2.01493974\n",
" 2.39156824 2.02396518 1.69586449 1.63808752 1.64020228 1.43573473\n",
" 1.93092656 1.37466294 1.34704929 1.59600739 1.03960441 1.45276496\n",
" 1.59360131 1.57466343 1.89491479 1.79333746 1.32701974 1.49441767\n",
" 1.51466756 1.63497989 1.42858074 1.51135396 1.61077201 1.81066387\n",
" 1.83367783 2.3507094 2.87885378 3.26231227 2.1313117 1.98557548\n",
" 1.99105426 2.26150533 2.34298751 2.44621608 2.39201042 2.41226503\n",
" 2.5142992 3.03777565 2.81592295 2.75117863 2.78324175 2.68819666\n",
" 2.8945782 2.84464168 2.680973 2.78397395 2.47996808 1.71829563\n",
" 1.60636949 1.65992483 1.38122631 1.74831825 2.16006884 1.68076185\n",
" 1.69329487 1.44929837 1.63763312 1.80101076 2.01166253 2.03254244\n",
" 1.9583913 2.04542255 2.00859694 2.16600883 2.16095629 1.97541122\n",
" 2.13807632 2.06386436 2.2154187 2.84205688 2.54862449 2.64321545\n",
" 2.6805773 2.52300146 2.53209001 2.54682059 2.4521937 2.43155532\n",
" 2.42571275 2.23421289 2.23164529 2.23597192 2.14215121 2.10406703\n",
" 2.07962874 1.88506161 1.80092372 1.61156092 1.77426835 1.98765563\n",
" 2.0356793 1.87964187 1.779513 1.87187681 1.76463632 1.70978684\n",
" 1.76471778 1.75604749 1.62792552 1.73929352 1.6887024 1.8677704\n",
" 2.17342368 2.08166072 2.14567453 2.15936953 2.18351006 2.41010388\n",
" 2.26101752 2.25468001 2.23739715 2.15395133 2.04547813 1.92038843\n",
" 1.85491264 1.91905927 2.16709365 1.99924152 2.1850471 2.55461622\n",
" 2.72476673 1.69682926 1.73249614 2.06992695 2.1210591 1.66854454\n",
" 1.63907505 1.32203822 1.38992558 1.2436937 1.17932877 1.02963653\n",
" 1.26085036 1.16997132 1.09339504 1.14188689 1.18675772 1.31859788\n",
" 1.21746591 1.3872131 1.26095274 1.34885761 1.46633543 1.64506975\n",
" 1.36013821 1.45574721 1.43766588 1.65119054 1.57163772 1.55082968\n",
" 1.29413316 1.38351736 1.64234673 1.57186432 1.45381083 1.71204761\n",
" 1.51828607 1.30639985 1.32928395 1.49004237 1.6057589 1.81815735\n",
" 1.67784678 1.72180861 1.60703743 1.64850255]\n"
]
}
],
"source": [
"# Put two copies of the feature matrix side by side, then take statistics\n",
"# along axis 1 (one value per row of `feat`).\n",
"a = np.concatenate((feat, feat), axis=1)\n",
"print(a.shape)\n",
"\n",
"m = a.mean(axis=1)\n",
"print(m)\n",
"print(m.shape)\n",
"\n",
"std = a.std(axis=1)\n",
"print(std.shape)\n",
"print(std)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "nonprofit-potato",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 18,
"id": "hispanic-ethics",
"metadata": {},
"outputs": [],
"source": [
"import torch\n",
"import torchaudio\n",
"import torchaudio.compliance.kaldi as kaldi\n",
"import torchaudio.sox_effects as sox_effects\n",
"from torch.nn.utils.rnn import pad_sequence\n",
"torchaudio.set_audio_backend(\"sox\")"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "changing-calvin",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"torch.Size([1, 29746])\n",
"tensor([[54., 90., 77., ..., 58., 58., 61.]])\n",
"(184, 80)\n",
"[[10.617376 10.077089 5.3248763 ... 10.248186 8.896992 7.8067265]\n",
" [11.044004 10.318072 6.3086634 ... 11.237308 10.358393 8.838616 ]\n",
" [10.269302 9.9963665 7.3296647 ... 10.451319 9.692951 7.9617033]\n",
" ...\n",
" [10.14497 9.886743 6.738012 ... 10.215809 9.0034275 8.756177 ]\n",
" [ 9.977456 9.679498 7.9066052 ... 10.224365 9.594568 7.6928873]\n",
" [ 6.4735703 7.7633557 7.7576594 ... 9.965221 9.622637 8.160085 ]]\n",
"-----------\n",
"[0.00164795 0.00274658 0.00234985 ... 0.00177002 0.00177002 0.00186157]\n",
"(184, 80)\n",
"[[-10.177039 -10.717326 -15.46954 ... -10.546229 -11.897424 -12.987689]\n",
" [ -9.750411 -10.476343 -14.485752 ... -9.557108 -10.436023 -11.955799]\n",
" [-10.525113 -10.798049 -13.46475 ... -10.343097 -11.101464 -12.832712]\n",
" ...\n",
" [-10.649446 -10.907673 -14.056403 ... -10.578607 -11.790988 -12.038239]\n",
" [-10.816959 -11.114918 -12.88781 ... -10.570049 -11.199847 -13.101528]\n",
" [-14.320845 -13.03106 -13.036756 ... -10.829194 -11.171779 -12.634331]]\n",
"**************\n",
"[0.00164795 0.00274658 0.00234985 ... 0.00177002 0.00177002 0.00186157]\n",
"[54. 90. 77. ... 58. 58. 61.] float32\n",
"(184, 80)\n",
"[[10.617376 10.077089 5.3248763 ... 10.248186 8.896992 7.8067265]\n",
" [11.044004 10.318072 6.3086634 ... 11.237308 10.358393 8.838616 ]\n",
" [10.269302 9.9963665 7.3296647 ... 10.451319 9.692951 7.9617033]\n",
" ...\n",
" [10.14497 9.886743 6.738012 ... 10.215809 9.0034275 8.756177 ]\n",
" [ 9.977456 9.679498 7.9066052 ... 10.224365 9.594568 7.6928873]\n",
" [ 6.4735703 7.7633557 7.7576594 ... 9.965221 9.622637 8.160085 ]]\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/ipykernel_launcher.py:1: UserWarning: torchaudio.backend.sox_backend.load_wav has been deprecated and will be removed from 0.9.0 release. Please use \"torchaudio.load\".\n",
" \"\"\"Entry point for launching an IPython kernel.\n"
]
}
],
"source": [
"def show_kaldi_fbank(signal, sample_frequency):\n",
"    \"\"\"Compute torchaudio's Kaldi-compatible fbank, print it and return it.\n",
"\n",
"    Args:\n",
"        signal: tensor handed to ``kaldi.fbank``; the callers below pass a\n",
"            tensor of shape (1, num_samples).\n",
"        sample_frequency: sample rate forwarded to ``kaldi.fbank``.\n",
"\n",
"    Returns:\n",
"        The features as a numpy array, after printing their shape and values.\n",
"    \"\"\"\n",
"    feats = kaldi.fbank(\n",
"        signal,\n",
"        num_mel_bins=80,\n",
"        frame_length=25,\n",
"        frame_shift=10,\n",
"        dither=0,\n",
"        energy_floor=0.0,\n",
"        sample_frequency=sample_frequency)\n",
"    feats = feats.detach().numpy()\n",
"    print(feats.shape)\n",
"    print(feats)\n",
"    return feats\n",
"\n",
"\n",
"# NOTE(review): load_wav is deprecated (see the warning in this cell's stderr).\n",
"# It is kept because the stored output relies on the int16-scale floats it returns;\n",
"# switching to torchaudio.load may change the sample scale -- confirm before replacing.\n",
"waveform, sample_rate = torchaudio.load_wav(wav)\n",
"print(waveform.shape)\n",
"print(waveform)\n",
"# 1) Waveform exactly as loaded by torchaudio.\n",
"mat = show_kaldi_fbank(waveform, sample_rate)\n",
"\n",
"print('-----------')\n",
"# 2) Normalized float samples from AudioSegment; the stored output shows all\n",
"#    features shifted negative compared with run 1.\n",
"print(samples._samples)\n",
"aud = torch.tensor(samples._samples).view(1, -1)\n",
"mat = show_kaldi_fbank(aud, sample_rate)\n",
"\n",
"print('**************')\n",
"# 3) The same samples converted back to the int16 scale (as float32); the stored\n",
"#    output matches run 1.\n",
"print(samples._samples)\n",
"tmp = samples.to('int16').astype('float32')\n",
"print(tmp, tmp.dtype)\n",
"aud = torch.tensor(tmp).view(1, -1)\n",
"mat = show_kaldi_fbank(aud, sample_rate)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "buried-dependence",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "silver-printing",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 20,
"id": "outer-space",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(29746,)\n",
"[54 90 77 ... 58 58 61]\n",
"(184, 80)\n",
"[[10.61737914 10.07708936 5.32487528 ... 10.2481839 8.89699394\n",
" 7.80671114]\n",
" [11.0440077 10.3180721 6.30866128 ... 11.23730926 10.35838868\n",
" 8.83860079]\n",
" [10.26930555 9.99636567 7.3296638 ... 10.45131595 9.69295303\n",
" 7.96168491]\n",
" ...\n",
" [10.14497345 9.88674207 6.73801138 ... 10.21580627 9.00343472\n",
" 8.75616521]\n",
" [ 9.97745961 9.67949736 7.90660425 ... 10.22436653 9.59456493\n",
" 7.69287184]\n",
" [ 6.47357374 7.76335491 7.75765843 ... 9.96522077 9.6226365\n",
" 8.16007108]]\n",
"(184, 13)\n",
"[[ 14.73775998 -13.30393391 5.85974818 ... -3.42359739 2.82785335\n",
" 8.86862748]\n",
" [ 15.31274834 -13.33671651 4.06537223 ... 8.15970347 2.15934846\n",
" 6.78353115]\n",
" [ 13.82218765 -13.39296404 6.8304843 ... 2.55332563 8.86724453\n",
" -0.05919222]\n",
" ...\n",
" [ 13.5837844 -13.42104892 11.21222354 ... 4.81477718 1.66627505\n",
" 5.59045842]\n",
" [ 13.75757034 -13.92626662 13.06074011 ... -0.46694046 5.56214833\n",
" 12.0785146 ]\n",
" [ 11.92813809 -15.9169855 8.78372271 ... -1.42014277 -3.25768086\n",
" 0.88337965]]\n"
]
}
],
"source": [
"from python_speech_features import mfcc\n",
"from python_speech_features import delta\n",
"from python_speech_features import logfbank\n",
"import scipy.io.wavfile as iowav\n",
"\n",
"# Read the raw int16 samples with scipy (no normalization is applied here).\n",
"rate, sig = iowav.read(wav)\n",
"print(sig.shape)\n",
"print(sig)\n",
"\n",
"# Options shared by the fbank and mfcc calls below.\n",
"povey_options = dict(dither=0, wintype='povey')\n",
"\n",
"# note that generally nfilt=40 is used for speech recognition\n",
"fbank_feat = logfbank(sig, nfilt=80, lowfreq=20, **povey_options)\n",
"print(fbank_feat.shape)\n",
"print(fbank_feat)\n",
"\n",
"# Reference values (for english.wav, not the file used in this notebook):\n",
"# the computed fbank coefficients of english.wav with dimension [110,23]\n",
"# [ 12.2865\t12.6906\t13.1765\t15.714\t16.064\t15.7553\t16.5746\t16.9205\t16.6472\t16.1302\t16.4576\t16.7326\t16.8864\t17.7215\t18.88\t19.1377\t19.1495\t18.6683\t18.3886\t20.3506\t20.2772\t18.8248\t18.1899\n",
"# 11.9198\t13.146\t14.7215\t15.8642\t17.4288\t16.394\t16.8238\t16.1095\t16.4297\t16.6331\t16.3163\t16.5093\t17.4981\t18.3429\t19.6555\t19.6263\t19.8435\t19.0534\t19.001\t20.0287\t19.7707\t19.5852\t19.1112\n",
"# ...\n",
"# ...\n",
"# the same as the output of the kaldi command: compute-fbank-feats --dither=0.0\n",
"\n",
"mfcc_feat = mfcc(sig, useEnergy=True, **povey_options)\n",
"print(mfcc_feat.shape)\n",
"print(mfcc_feat)\n",
"\n",
"# Reference values (for english.wav, not the file used in this notebook):\n",
"# the computed mfcc coefficients of english.wav with dimension [110,13]\n",
"# [ 17.1337\t-23.3651\t-7.41751\t-7.73686\t-21.3682\t-8.93884\t-3.70843\t4.68346\t-16.0676\t12.782\t-7.24054\t8.25089\t10.7292\n",
"# 17.1692\t-23.3028\t-5.61872\t-4.0075\t-23.287\t-20.6101\t-5.51584\t-6.15273\t-14.4333\t8.13052\t-0.0345329\t2.06274\t-0.564298\n",
"# ...\n",
"# ...\n",
"# the same as the output of the kaldi command: compute-mfcc-feats --dither=0.0"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "sporting-school",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(184, 80)\n",
"[[-10.17703627 -10.71732606 -15.46954014 ... -10.54623152 -11.89742148\n",
" -12.98770428]\n",
" [ -9.75040771 -10.47634331 -14.48575413 ... -9.55710616 -10.43602673\n",
" -11.95581463]\n",
" [-10.52510987 -10.79804975 -13.46475161 ... -10.34309947 -11.10146239\n",
" -12.83273051]\n",
" ...\n",
" [-10.64944197 -10.90767335 -14.05640404 ... -10.57860915 -11.7909807\n",
" -12.03825021]\n",
" [-10.8169558 -11.11491806 -12.88781116 ... -10.57004889 -11.19985048\n",
" -13.10154358]\n",
" [-14.32084168 -13.03106051 -13.03675699 ... -10.82919465 -11.17177892\n",
" -12.63434434]]\n",
"(184, 13)\n",
"[[ -6.05665544 -13.30393391 5.85974818 ... -3.42359739 2.82785335\n",
" 8.86862748]\n",
" [ -5.48166707 -13.33671651 4.06537223 ... 8.15970347 2.15934846\n",
" 6.78353115]\n",
" [ -6.97222776 -13.39296404 6.8304843 ... 2.55332563 8.86724453\n",
" -0.05919222]\n",
" ...\n",
" [ -7.21063102 -13.42104892 11.21222354 ... 4.81477718 1.66627505\n",
" 5.59045842]\n",
" [ -7.03684508 -13.92626662 13.06074011 ... -0.46694046 5.56214833\n",
" 12.0785146 ]\n",
" [ -8.86627732 -15.9169855 8.78372271 ... -1.42014277 -3.25768086\n",
" 0.88337965]]\n"
]
}
],
"source": [
"# Same computations as the previous cell, but on the normalized float samples\n",
"# instead of the int16 signal.\n",
"normalized_sig = samples._samples\n",
"\n",
"# In the stored outputs every fbank value is about 20.79 lower than in the int16\n",
"# run, i.e. 2*ln(2**15), consistent with log energies of a signal scaled by 2**-15.\n",
"fbank_feat = logfbank(\n",
"    normalized_sig,\n",
"    nfilt=80,\n",
"    lowfreq=20,\n",
"    dither=0,\n",
"    wintype='povey')\n",
"print(fbank_feat.shape)\n",
"print(fbank_feat)\n",
"\n",
"# For mfcc the stored outputs differ from the int16 run only in the first column\n",
"# (by the same ~20.79); the other printed coefficients are identical.\n",
"mfcc_feat = mfcc(\n",
"    normalized_sig,\n",
"    dither=0,\n",
"    useEnergy=True,\n",
"    wintype='povey')\n",
"print(mfcc_feat.shape)\n",
"print(mfcc_feat)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "restricted-license",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "specialized-threat",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.0"
}
},
"nbformat": 4,
"nbformat_minor": 5
}