Merge branch 'develop' of https://github.com/PaddlePaddle/DeepSpeech into fix_bug

pull/853/head
huangyuxin 4 years ago
commit f4e59293bf

.gitignore

@@ -18,5 +18,7 @@ tools/sox-14.4.2
tools/soxbindings
tools/montreal-forced-aligner/
tools/Montreal-Forced-Aligner/
tools/sctk
tools/sctk-20159b5/
*output/

@@ -1,605 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "academic-surname",
"metadata": {},
"outputs": [],
"source": [
"import paddle\n",
"from paddle import nn"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "fundamental-treasure",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/workspace/DeepSpeech-2.x/tools/venv-dev/lib/python3.7/site-packages/ipykernel/ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.\n",
" and should_run_async(code)\n"
]
}
],
"source": [
"L = nn.Linear(256, 2048)\n",
"L2 = nn.Linear(2048, 256)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "consolidated-elephant",
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import torch\n"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "moderate-noise",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"float64\n",
"Tensor(shape=[2, 51, 256], dtype=float32, place=CUDAPlace(0), stop_gradient=True,\n",
" [[[-1.54171216, -2.61531472, -1.79881978, ..., -0.31395876, 0.56513089, -0.44516513],\n",
" [-0.79492962, 1.91157901, 0.66567147, ..., 0.54825783, -1.01471853, -0.84924090],\n",
" [-1.22556651, -0.36225814, 0.65063190, ..., 0.65726501, 0.05563191, 0.09009409],\n",
" ...,\n",
" [ 0.38615900, -0.77905393, 0.99732304, ..., -1.38463700, -3.32365036, -1.31089687],\n",
" [ 0.05579993, 0.06885809, -1.66662002, ..., -0.23346378, -3.29372883, 1.30561364],\n",
" [ 1.90676069, 1.95093191, -0.28849599, ..., -0.06860496, 0.95347673, 1.00475824]],\n",
"\n",
" [[-0.91453546, 0.55298805, -1.06146812, ..., -0.86378336, 1.00454640, 1.26062179],\n",
" [ 0.10223761, 0.81301165, 2.36865163, ..., 0.16821407, 0.29240361, 1.05408621],\n",
" [-1.33196676, 1.94433689, 0.01934209, ..., 0.48036841, 0.51585966, 1.22893548],\n",
" ...,\n",
" [-0.19558455, -0.47075930, 0.90796155, ..., -1.28598249, -0.24321797, 0.17734711],\n",
" [ 0.89819717, -1.39516675, 0.17138045, ..., 2.39761519, 1.76364994, -0.52177650],\n",
" [ 0.94122332, -0.18581429, 1.36099780, ..., 0.67647684, -0.04699665, 1.51205540]]])\n",
"tensor([[[-1.5417, -2.6153, -1.7988, ..., -0.3140, 0.5651, -0.4452],\n",
" [-0.7949, 1.9116, 0.6657, ..., 0.5483, -1.0147, -0.8492],\n",
" [-1.2256, -0.3623, 0.6506, ..., 0.6573, 0.0556, 0.0901],\n",
" ...,\n",
" [ 0.3862, -0.7791, 0.9973, ..., -1.3846, -3.3237, -1.3109],\n",
" [ 0.0558, 0.0689, -1.6666, ..., -0.2335, -3.2937, 1.3056],\n",
" [ 1.9068, 1.9509, -0.2885, ..., -0.0686, 0.9535, 1.0048]],\n",
"\n",
" [[-0.9145, 0.5530, -1.0615, ..., -0.8638, 1.0045, 1.2606],\n",
" [ 0.1022, 0.8130, 2.3687, ..., 0.1682, 0.2924, 1.0541],\n",
" [-1.3320, 1.9443, 0.0193, ..., 0.4804, 0.5159, 1.2289],\n",
" ...,\n",
" [-0.1956, -0.4708, 0.9080, ..., -1.2860, -0.2432, 0.1773],\n",
" [ 0.8982, -1.3952, 0.1714, ..., 2.3976, 1.7636, -0.5218],\n",
" [ 0.9412, -0.1858, 1.3610, ..., 0.6765, -0.0470, 1.5121]]])\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/workspace/DeepSpeech-2.x/tools/venv-dev/lib/python3.7/site-packages/ipykernel/ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.\n",
" and should_run_async(code)\n"
]
}
],
"source": [
"x = np.random.randn(2, 51, 256)\n",
"print(x.dtype)\n",
"px = paddle.to_tensor(x, dtype='float32')\n",
"tx = torch.tensor(x, dtype=torch.float32)\n",
"print(px)\n",
"print(tx)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "cooked-progressive",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 5,
"id": "mechanical-prisoner",
"metadata": {},
"outputs": [],
"source": [
"data = np.load('enc_0_ff_out.npz', allow_pickle=True)\n",
"t_norm_ff = data['norm_ff']\n",
"t_ff_out = data['ff_out']\n",
"t_ff_l_x = data['ff_l_x']\n",
"t_ff_l_a_x = data['ff_l_a_x']\n",
"t_ff_l_a_l_x = data['ff_l_a_l_x']\n",
"t_ps = data['ps']"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "indie-marriage",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 6,
"id": "assured-zambia",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"True\n",
"True\n",
"True\n",
"True\n"
]
}
],
"source": [
"L.set_state_dict({'weight': t_ps[0].T, 'bias': t_ps[1]})\n",
"L2.set_state_dict({'weight': t_ps[2].T, 'bias': t_ps[3]})\n",
"\n",
"ps = []\n",
"for n, p in L.named_parameters():\n",
" ps.append(p)\n",
"\n",
"for n, p in L2.state_dict().items():\n",
" ps.append(p)\n",
" \n",
"for p, tp in zip(ps, t_ps):\n",
" print(np.allclose(p.numpy(), tp.T))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "committed-jacob",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "extreme-traffic",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "optimum-milwaukee",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 7,
"id": "viral-indian",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"True\n",
"True\n",
"True\n",
"True\n"
]
}
],
"source": [
"# data = np.load('enc_0_ff_out.npz', allow_pickle=True)\n",
"# t_norm_ff = data['norm_ff']\n",
"# t_ff_out = data['ff_out']\n",
"# t_ff_l_x = data['ff_l_x']\n",
"# t_ff_l_a_x = data['ff_l_a_x']\n",
"# t_ff_l_a_l_x = data['ff_l_a_l_x']\n",
"# t_ps = data['ps']\n",
"TL = torch.nn.Linear(256, 2048)\n",
"TL2 = torch.nn.Linear(2048, 256)\n",
"TL.load_state_dict({'weight': torch.tensor(t_ps[0]), 'bias': torch.tensor(t_ps[1])})\n",
"TL2.load_state_dict({'weight': torch.tensor(t_ps[2]), 'bias': torch.tensor(t_ps[3])})\n",
"\n",
"# for n, p in TL.named_parameters():\n",
"# print(n, p)\n",
"# for n, p in TL2.named_parameters():\n",
"# print(n, p)\n",
"\n",
"ps = []\n",
"for n, p in TL.state_dict().items():\n",
" ps.append(p.data.numpy())\n",
" \n",
"for n, p in TL2.state_dict().items():\n",
" ps.append(p.data.numpy())\n",
" \n",
"for p, tp in zip(ps, t_ps):\n",
" print(np.allclose(p, tp))"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "skilled-vietnamese",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[[[ 0.67277956 0.08313607 -0.62761104 ... -0.17480263 0.42718208\n",
" -0.5787626 ]\n",
" [ 0.91516656 0.5393416 1.7159258 ... 0.06144593 0.06486575\n",
" -0.03350811]\n",
" [ 0.438351 0.6227843 0.24096036 ... 1.0912522 -0.90929437\n",
" -1.012989 ]\n",
" ...\n",
" [ 0.68631977 0.14240924 0.10763275 ... -0.11513516 0.48065388\n",
" 0.04070369]\n",
" [-0.9525228 0.23197874 0.31264272 ... 0.5312439 0.18773697\n",
" -0.8450228 ]\n",
" [ 0.42024016 -0.04561988 0.54541194 ... -0.41933843 -0.00436018\n",
" -0.06663495]]\n",
"\n",
" [[-0.11638781 -0.33566502 -0.20887226 ... 0.17423287 -0.9195841\n",
" -0.8161046 ]\n",
" [-0.3469874 0.88269687 -0.11887559 ... -0.15566081 0.16357468\n",
" -0.20766167]\n",
" [-0.3847657 0.3984318 -0.06963477 ... -0.00360622 1.2360432\n",
" -0.26811332]\n",
" ...\n",
" [ 0.08230796 -0.46158582 0.54582864 ... 0.15747628 -0.44790155\n",
" 0.06020184]\n",
" [-0.8095085 0.43163058 -0.42837143 ... 0.8627463 0.90656304\n",
" 0.15847842]\n",
" [-1.485811 -0.18216592 -0.8882585 ... 0.32596245 0.7822631\n",
" -0.6460344 ]]]\n",
"[[[ 0.67278004 0.08313602 -0.6276114 ... -0.17480245 0.42718196\n",
" -0.5787625 ]\n",
" [ 0.91516703 0.5393413 1.7159253 ... 0.06144581 0.06486579\n",
" -0.03350812]\n",
" [ 0.43835106 0.62278455 0.24096027 ... 1.0912521 -0.9092943\n",
" -1.0129892 ]\n",
" ...\n",
" [ 0.6863195 0.14240888 0.10763284 ... -0.11513527 0.48065376\n",
" 0.04070365]\n",
" [-0.9525231 0.23197863 0.31264275 ... 0.53124386 0.18773702\n",
" -0.84502304]\n",
" [ 0.42024007 -0.04561983 0.545412 ... -0.41933888 -0.00436005\n",
" -0.066635 ]]\n",
"\n",
" [[-0.11638767 -0.33566508 -0.20887226 ... 0.17423296 -0.9195838\n",
" -0.8161046 ]\n",
" [-0.34698725 0.88269705 -0.11887549 ... -0.15566081 0.16357464\n",
" -0.20766166]\n",
" [-0.3847657 0.3984319 -0.06963488 ... -0.00360619 1.2360426\n",
" -0.26811326]\n",
" ...\n",
" [ 0.08230786 -0.4615857 0.5458287 ... 0.15747619 -0.44790167\n",
" 0.06020182]\n",
" [-0.8095083 0.4316307 -0.42837155 ... 0.862746 0.9065631\n",
" 0.15847899]\n",
" [-1.485811 -0.18216613 -0.8882584 ... 0.32596254 0.7822631\n",
" -0.6460344 ]]]\n",
"True\n",
"False\n"
]
}
],
"source": [
"y = L(px)\n",
"print(y.numpy())\n",
"\n",
"ty = TL(tx)\n",
"print(ty.data.numpy())\n",
"print(np.allclose(px.numpy(), tx.detach().numpy()))\n",
"print(np.allclose(y.numpy(), ty.detach().numpy()))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "incorrect-allah",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "prostate-cameroon",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 9,
"id": "governmental-surge",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[[ 0.04476918 0.554463 -0.3027508 ... -0.49600336 0.3751858\n",
" 0.8254095 ]\n",
" [ 0.95594174 -0.29528382 -1.2899452 ... 0.43718258 0.05584608\n",
" -0.06974669]]\n",
"[[ 0.04476918 0.5544631 -0.3027507 ... -0.49600336 0.37518573\n",
" 0.8254096 ]\n",
" [ 0.95594174 -0.29528376 -1.2899454 ... 0.4371827 0.05584623\n",
" -0.0697467 ]]\n",
"True\n",
"False\n",
"True\n"
]
}
],
"source": [
"x = np.random.randn(2, 256)\n",
"px = paddle.to_tensor(x, dtype='float32')\n",
"tx = torch.tensor(x, dtype=torch.float32)\n",
"y = L(px)\n",
"print(y.numpy())\n",
"ty = TL(tx)\n",
"print(ty.data.numpy())\n",
"print(np.allclose(px.numpy(), tx.detach().numpy()))\n",
"print(np.allclose(y.numpy(), ty.detach().numpy()))\n",
"print(np.allclose(y.numpy(), ty.detach().numpy(), atol=1e-5))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "confidential-jacket",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 10,
"id": "improved-civilization",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"5e7e7c9fde8350084abf1898cf52651cfc84b17a\n"
]
}
],
"source": [
"print(paddle.version.commit)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "d1e2d3b4",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['__builtins__',\n",
" '__cached__',\n",
" '__doc__',\n",
" '__file__',\n",
" '__loader__',\n",
" '__name__',\n",
" '__package__',\n",
" '__spec__',\n",
" 'commit',\n",
" 'full_version',\n",
" 'istaged',\n",
" 'major',\n",
" 'minor',\n",
" 'mkl',\n",
" 'patch',\n",
" 'rc',\n",
" 'show',\n",
" 'with_mkl']"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dir(paddle.version)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "c880c719",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2.1.0\n"
]
}
],
"source": [
"print(paddle.version.full_version)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "f26977bf",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"commit: 5e7e7c9fde8350084abf1898cf52651cfc84b17a\n",
"None\n"
]
}
],
"source": [
"print(paddle.version.show())"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "04ad47f6",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1.6.0\n"
]
}
],
"source": [
"print(torch.__version__)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "e1e03830",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['__builtins__',\n",
" '__cached__',\n",
" '__doc__',\n",
" '__file__',\n",
" '__loader__',\n",
" '__name__',\n",
" '__package__',\n",
" '__spec__',\n",
" '__version__',\n",
" 'cuda',\n",
" 'debug',\n",
" 'git_version',\n",
" 'hip']"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dir(torch.version)"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "4ad0389b",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'b31f58de6fa8bbda5353b3c77d9be4914399724d'"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"torch.version.git_version"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "7870ea10",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'10.2'"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"torch.version.cuda"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "db8ee5a7",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "6321ec2a",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.0"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large

File diff suppressed because one or more lines are too long

@@ -1,389 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "emerging-meter",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/paddle/fluid/layers/utils.py:26: DeprecationWarning: `np.int` is a deprecated alias for the builtin `int`. To silence this warning, use `int` by itself. Doing this will not modify any behavior and is safe. When replacing `np.int`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.\n",
"Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
" def convert_to_list(value, n, name, dtype=np.int):\n",
"/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/scipy/fftpack/__init__.py:103: DeprecationWarning: The module numpy.dual is deprecated. Instead of using dual, use the functions directly from numpy or scipy.\n",
" from numpy.dual import register_func\n",
"/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/scipy/special/orthogonal.py:81: DeprecationWarning: `np.int` is a deprecated alias for the builtin `int`. To silence this warning, use `int` by itself. Doing this will not modify any behavior and is safe. When replacing `np.int`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.\n",
"Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
" from numpy import (exp, inf, pi, sqrt, floor, sin, cos, around, int,\n",
"/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/numba/core/types/__init__.py:108: DeprecationWarning: `np.long` is a deprecated alias for `np.compat.long`. To silence this warning, use `np.compat.long` by itself. In the likely event your code does not need to work on Python 2 you can use the builtin `int` for which `np.compat.long` is itself an alias. Doing this will not modify any behaviour and is safe. When replacing `np.long`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.\n",
"Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
" long_ = _make_signed(np.long)\n",
"/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/numba/core/types/__init__.py:109: DeprecationWarning: `np.long` is a deprecated alias for `np.compat.long`. To silence this warning, use `np.compat.long` by itself. In the likely event your code does not need to work on Python 2 you can use the builtin `int` for which `np.compat.long` is itself an alias. Doing this will not modify any behaviour and is safe. When replacing `np.long`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.\n",
"Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
" ulong = _make_unsigned(np.long)\n"
]
}
],
"source": [
"import math\n",
"import random\n",
"import tarfile\n",
"import logging\n",
"import numpy as np\n",
"from collections import namedtuple\n",
"from functools import partial\n",
"\n",
"import paddle\n",
"from paddle.io import Dataset\n",
"from paddle.io import DataLoader\n",
"from paddle.io import BatchSampler\n",
"from paddle.io import DistributedBatchSampler\n",
"from paddle import distributed as dist\n",
"\n",
"from data_utils.utility import read_manifest\n",
"from data_utils.augmentor.augmentation import AugmentationPipeline\n",
"from data_utils.featurizer.speech_featurizer import SpeechFeaturizer\n",
"from data_utils.speech import SpeechSegment\n",
"from data_utils.normalizer import FeatureNormalizer\n",
"\n",
"\n",
"from data_utils.dataset import (\n",
" DeepSpeech2Dataset,\n",
" DeepSpeech2DistributedBatchSampler,\n",
" DeepSpeech2BatchSampler,\n",
" SpeechCollator,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "excessive-american",
"metadata": {},
"outputs": [],
"source": [
"def create_dataloader(manifest_path,\t\n",
" vocab_filepath,\t\n",
" mean_std_filepath,\t\n",
" augmentation_config='{}',\t\n",
" max_duration=float('inf'),\t\n",
" min_duration=0.0,\t\n",
" stride_ms=10.0,\t\n",
" window_ms=20.0,\t\n",
" max_freq=None,\t\n",
" specgram_type='linear',\t\n",
" use_dB_normalization=True,\t\n",
" random_seed=0,\t\n",
" keep_transcription_text=False,\t\n",
" is_training=False,\t\n",
" batch_size=1,\t\n",
" num_workers=0,\t\n",
" sortagrad=False,\t\n",
" shuffle_method=None,\t\n",
" dist=False):\t\n",
"\n",
" dataset = DeepSpeech2Dataset(\t\n",
" manifest_path,\t\n",
" vocab_filepath,\t\n",
" mean_std_filepath,\t\n",
" augmentation_config=augmentation_config,\t\n",
" max_duration=max_duration,\t\n",
" min_duration=min_duration,\t\n",
" stride_ms=stride_ms,\t\n",
" window_ms=window_ms,\t\n",
" max_freq=max_freq,\t\n",
" specgram_type=specgram_type,\t\n",
" use_dB_normalization=use_dB_normalization,\t\n",
" random_seed=random_seed,\t\n",
" keep_transcription_text=keep_transcription_text)\t\n",
"\n",
" if dist:\t\n",
" batch_sampler = DeepSpeech2DistributedBatchSampler(\t\n",
" dataset,\t\n",
" batch_size,\t\n",
" num_replicas=None,\t\n",
" rank=None,\t\n",
" shuffle=is_training,\t\n",
" drop_last=is_training,\t\n",
" sortagrad=is_training,\t\n",
" shuffle_method=shuffle_method)\t\n",
" else:\t\n",
" batch_sampler = DeepSpeech2BatchSampler(\t\n",
" dataset,\t\n",
" shuffle=is_training,\t\n",
" batch_size=batch_size,\t\n",
" drop_last=is_training,\t\n",
" sortagrad=is_training,\t\n",
" shuffle_method=shuffle_method)\t\n",
"\n",
" def padding_batch(batch, padding_to=-1, flatten=False, is_training=True):\t\n",
" \"\"\"\t\n",
" Padding audio features with zeros to make them have the same shape (or\t\n",
" a user-defined shape) within one bach.\t\n",
"\n",
" If ``padding_to`` is -1, the maximun shape in the batch will be used\t\n",
" as the target shape for padding. Otherwise, `padding_to` will be the\t\n",
" target shape (only refers to the second axis).\t\n",
"\n",
" If `flatten` is True, features will be flatten to 1darray.\t\n",
" \"\"\"\t\n",
" new_batch = []\t\n",
" # get target shape\t\n",
" max_length = max([audio.shape[1] for audio, text in batch])\t\n",
" if padding_to != -1:\t\n",
" if padding_to < max_length:\t\n",
" raise ValueError(\"If padding_to is not -1, it should be larger \"\t\n",
" \"than any instance's shape in the batch\")\t\n",
" max_length = padding_to\t\n",
" max_text_length = max([len(text) for audio, text in batch])\t\n",
" # padding\t\n",
" padded_audios = []\t\n",
" audio_lens = []\t\n",
" texts, text_lens = [], []\t\n",
" for audio, text in batch:\t\n",
" padded_audio = np.zeros([audio.shape[0], max_length])\t\n",
" padded_audio[:, :audio.shape[1]] = audio\t\n",
" if flatten:\t\n",
" padded_audio = padded_audio.flatten()\t\n",
" padded_audios.append(padded_audio)\t\n",
" audio_lens.append(audio.shape[1])\t\n",
"\n",
" padded_text = np.zeros([max_text_length])\n",
" if is_training:\n",
" padded_text[:len(text)] = text\t# ids\n",
" else:\n",
" padded_text[:len(text)] = [ord(t) for t in text] # string\n",
" \n",
" texts.append(padded_text)\t\n",
" text_lens.append(len(text))\t\n",
"\n",
" padded_audios = np.array(padded_audios).astype('float32')\t\n",
" audio_lens = np.array(audio_lens).astype('int64')\t\n",
" texts = np.array(texts).astype('int32')\t\n",
" text_lens = np.array(text_lens).astype('int64')\t\n",
" return padded_audios, texts, audio_lens, text_lens\t\n",
"\n",
" loader = DataLoader(\t\n",
" dataset,\t\n",
" batch_sampler=batch_sampler,\t\n",
" collate_fn=partial(padding_batch, is_training=is_training),\t\n",
" num_workers=num_workers)\t\n",
" return loader"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "naval-brave",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'num_samples': 5, 'beam_size': 500, 'num_proc_bsearch': 8, 'num_conv_layers': 2, 'num_rnn_layers': 3, 'rnn_layer_size': 2048, 'alpha': 2.5, 'beta': 0.3, 'cutoff_prob': 1.0, 'cutoff_top_n': 40, 'use_gru': False, 'use_gpu': True, 'share_rnn_weights': True, 'infer_manifest': 'examples/aishell/data/manifest.dev', 'mean_std_path': 'examples/aishell/data/mean_std.npz', 'vocab_path': 'examples/aishell/data/vocab.txt', 'lang_model_path': 'models/lm/common_crawl_00.prune01111.trie.klm', 'model_path': 'examples/aishell/checkpoints/step_final', 'decoding_method': 'ctc_beam_search', 'error_rate_type': 'wer', 'specgram_type': 'linear'}\n"
]
}
],
"source": [
"import sys\n",
"import argparse\n",
"import functools\n",
"from utils.utility import add_arguments, print_arguments\n",
"parser = argparse.ArgumentParser(description=__doc__)\n",
"add_arg = functools.partial(add_arguments, argparser=parser)\n",
"# yapf: disable\n",
"add_arg('num_samples', int, 5, \"# of samples to infer.\")\n",
"add_arg('beam_size', int, 500, \"Beam search width.\")\n",
"add_arg('num_proc_bsearch', int, 8, \"# of CPUs for beam search.\")\n",
"add_arg('num_conv_layers', int, 2, \"# of convolution layers.\")\n",
"add_arg('num_rnn_layers', int, 3, \"# of recurrent layers.\")\n",
"add_arg('rnn_layer_size', int, 2048, \"# of recurrent cells per layer.\")\n",
"add_arg('alpha', float, 2.5, \"Coef of LM for beam search.\")\n",
"add_arg('beta', float, 0.3, \"Coef of WC for beam search.\")\n",
"add_arg('cutoff_prob', float, 1.0, \"Cutoff probability for pruning.\")\n",
"add_arg('cutoff_top_n', int, 40, \"Cutoff number for pruning.\")\n",
"add_arg('use_gru', bool, False, \"Use GRUs instead of simple RNNs.\")\n",
"add_arg('use_gpu', bool, True, \"Use GPU or not.\")\n",
"add_arg('share_rnn_weights',bool, True, \"Share input-hidden weights across \"\n",
" \"bi-directional RNNs. Not for GRU.\")\n",
"add_arg('infer_manifest', str,\n",
" 'examples/aishell/data/manifest.dev',\n",
" \"Filepath of manifest to infer.\")\n",
"add_arg('mean_std_path', str,\n",
" 'examples/aishell/data/mean_std.npz',\n",
" \"Filepath of normalizer's mean & std.\")\n",
"add_arg('vocab_path', str,\n",
" 'examples/aishell/data/vocab.txt',\n",
" \"Filepath of vocabulary.\")\n",
"add_arg('lang_model_path', str,\n",
" 'models/lm/common_crawl_00.prune01111.trie.klm',\n",
" \"Filepath for language model.\")\n",
"add_arg('model_path', str,\n",
" 'examples/aishell/checkpoints/step_final',\n",
" \"If None, the training starts from scratch, \"\n",
" \"otherwise, it resumes from the pre-trained model.\")\n",
"add_arg('decoding_method', str,\n",
" 'ctc_beam_search',\n",
" \"Decoding method. Options: ctc_beam_search, ctc_greedy\",\n",
" choices = ['ctc_beam_search', 'ctc_greedy'])\n",
"add_arg('error_rate_type', str,\n",
" 'wer',\n",
" \"Error rate type for evaluation.\",\n",
" choices=['wer', 'cer'])\n",
"add_arg('specgram_type', str,\n",
" 'linear',\n",
" \"Audio feature type. Options: linear, mfcc.\",\n",
" choices=['linear', 'mfcc'])\n",
"# yapf: disable\n",
"args = parser.parse_args([])\n",
"print(vars(args))"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "bearing-physics",
"metadata": {},
"outputs": [],
"source": [
"batch_reader = create_dataloader(\n",
" manifest_path=args.infer_manifest,\n",
" vocab_filepath=args.vocab_path,\n",
" mean_std_filepath=args.mean_std_path,\n",
" augmentation_config='{}',\n",
" #max_duration=float('inf'),\n",
" max_duration=27.0,\n",
" min_duration=0.0,\n",
" stride_ms=10.0,\n",
" window_ms=20.0,\n",
" max_freq=None,\n",
" specgram_type=args.specgram_type,\n",
" use_dB_normalization=True,\n",
" random_seed=0,\n",
" keep_transcription_text=True,\n",
" is_training=False,\n",
" batch_size=args.num_samples,\n",
" sortagrad=True,\n",
" shuffle_method=None,\n",
" dist=False)"
]
},
{
"cell_type": "code",
"execution_count": 30,
"id": "classified-melissa",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"test Tensor(shape=[5, 6], dtype=int32, place=CUDAPinnedPlace, stop_gradient=True,\n",
" [[22823, 26102, 20195, 37324, 0 , 0 ],\n",
" [22238, 26469, 23601, 22909, 0 , 0 ],\n",
" [20108, 26376, 22235, 26085, 0 , 0 ],\n",
" [36824, 35201, 20445, 25345, 32654, 24863],\n",
" [29042, 27748, 21463, 23456, 0 , 0 ]])\n",
"test raw 大时代里\n",
"test raw 煲汤受宠\n",
"audio len Tensor(shape=[5], dtype=int64, place=CUDAPinnedPlace, stop_gradient=True,\n",
" [163, 167, 180, 186, 186])\n",
"test len Tensor(shape=[5], dtype=int64, place=CUDAPlace(0), stop_gradient=True,\n",
" [4, 4, 4, 6, 4])\n",
"audio Tensor(shape=[5, 161, 186], dtype=float32, place=CUDAPinnedPlace, stop_gradient=True,\n",
" [[[ 1.11669052, 0.79015088, 0.93658292, ..., 0. , 0. , 0. ],\n",
" [ 0.83549136, 0.72643483, 0.83578080, ..., 0. , 0. , 0. ],\n",
" [-0.89155018, -0.18894747, -0.53357804, ..., 0. , 0. , 0. ],\n",
" ...,\n",
" [ 0.33386710, -0.81240511, 0.12869737, ..., 0. , 0. , 0. ],\n",
" [-0.17537928, 0.58380985, 0.70696265, ..., 0. , 0. , 0. ],\n",
" [-0.84175998, 1.22041416, 0.07929770, ..., 0. , 0. , 0. ]],\n",
"\n",
" [[-0.35964420, 0.77392709, 0.71409988, ..., 0. , 0. , 0. ],\n",
" [-0.15990183, 0.42962283, 0.06222462, ..., 0. , 0. , 0. ],\n",
" [-0.31166190, -0.74864638, -0.52836996, ..., 0. , 0. , 0. ],\n",
" ...,\n",
" [-0.27546275, 0.32889456, 0.12410031, ..., 0. , 0. , 0. ],\n",
" [ 0.16264282, 0.49418071, -0.15960945, ..., 0. , 0. , 0. ],\n",
" [ 0.12476666, 0.00516864, 1.16021466, ..., 0. , 0. , 0. ]],\n",
"\n",
" [[ 0.90202141, 1.48541915, 0.92062062, ..., 0. , 0. , 0. ],\n",
" [ 0.82661545, 1.37171340, 0.86746097, ..., 0. , 0. , 0. ],\n",
" [-0.62287915, -0.48645937, 0.35041964, ..., 0. , 0. , 0. ],\n",
" ...,\n",
" [ 0.07376949, 0.07138316, 0.76355994, ..., 0. , 0. , 0. ],\n",
" [-0.32306790, 0.43247896, 1.27311838, ..., 0. , 0. , 0. ],\n",
" [-0.97667056, 0.60747612, 0.79181534, ..., 0. , 0. , 0. ]],\n",
"\n",
" [[ 0.72022128, 0.95428467, 0.92766261, ..., 0.29105374, -0.45564806, -0.62151009],\n",
" [ 0.42083180, 0.49279949, 0.82724041, ..., -0.17333922, -1.45363355, -0.61673522],\n",
" [-0.76116520, -0.84750438, -0.09512503, ..., -1.01497340, -1.42781055, -0.80859023],\n",
" ...,\n",
" [-0.23009977, 1.06155431, 1.09065628, ..., 0.25581080, 0.53794998, -1.22650719],\n",
" [-1.37693381, 0.30778193, 0.17152318, ..., 0.51650339, 0.25580606, 0.83097816],\n",
" [-1.62180591, 1.30567718, 1.09928656, ..., -0.77590007, 1.27712476, 0.53189957]],\n",
"\n",
" [[ 1.03205252, -0.51535392, 0.21077573, ..., 0.76618457, 1.27425683, 1.52250278],\n",
" [ 0.82059991, 0.43990925, 0.13090958, ..., 0.86662549, 1.01687658, 1.48495352],\n",
" [-0.75489789, -0.01997089, -0.65174174, ..., 0.09061214, -0.55211234, -0.01614586],\n",
" ...,\n",
" [ 0.50985396, 1.84555030, 0.79185146, ..., 1.13666189, 1.19898069, 1.98158395],\n",
" [ 1.98721015, 2.52385354, 1.11714780, ..., 0.19416514, 1.11329341, 0.64460152],\n",
" [ 2.69512844, 1.90993905, 0.50245082, ..., -0.50902629, 0.03333465, -1.24584770]]])\n"
]
}
],
"source": [
"for idx, (audio, audio_len, text, text_len) in enumerate(batch_reader()):\n",
" print('test', text)\n",
" print(\"test raw\", ''.join( chr(i) for i in text[0][:int(text_len[0])] ))\n",
" print(\"test raw\", ''.join( chr(i) for i in text[-1][:int(text_len[-1])] ))\n",
" print('audio len', audio_len)\n",
" print('test len', text_len)\n",
" print('audio', audio)\n",
" break"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "unexpected-skating",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "minus-modern",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.0"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

File diff suppressed because it is too large

File diff suppressed because it is too large

@@ -1,290 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "breeding-haven",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"/home/ssd5/zhanghui/DeepSpeech2.x\n"
]
},
{
"data": {
"text/plain": [
"'/home/ssd5/zhanghui/DeepSpeech2.x'"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%cd ..\n",
"%pwd"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "appropriate-theta",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"LICENSE deepspeech examples\t\t requirements.txt tools\r\n",
"README.md docs\t libsndfile-1.0.28\t setup.sh\t utils\r\n",
"README_cn.md env.sh\t libsndfile-1.0.28.tar.gz tests\r\n"
]
}
],
"source": [
"!ls"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "entire-bloom",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/ssd5/zhanghui/DeepSpeech2.x/tools/venv/lib/python3.7/site-packages/paddle/fluid/layers/utils.py:26: DeprecationWarning: `np.int` is a deprecated alias for the builtin `int`. To silence this warning, use `int` by itself. Doing this will not modify any behavior and is safe. When replacing `np.int`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.\n",
"Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
" def convert_to_list(value, n, name, dtype=np.int):\n",
"WARNING:root:override cat of paddle.Tensor if exists or register, remove this when fixed!\n",
"WARNING:root:register user masked_fill to paddle.Tensor, remove this when fixed!\n",
"WARNING:root:register user masked_fill_ to paddle.Tensor, remove this when fixed!\n",
"WARNING:root:register user repeat to paddle.Tensor, remove this when fixed!\n",
"WARNING:root:register user glu to paddle.nn.functional, remove this when fixed!\n",
"WARNING:root:register user GLU to paddle.nn, remove this when fixed!\n",
"WARNING:root:register user ConstantPad2d to paddle.nn, remove this when fixed!\n",
"WARNING:root:override ctc_loss of paddle.nn.functional if exists, remove this when fixed!\n"
]
}
],
"source": [
"from deepspeech.modules import loss"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "governmental-aircraft",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/ssd5/zhanghui/DeepSpeech2.x/tools/venv/lib/python3.7/site-packages/ipykernel/ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.\n",
" and should_run_async(code)\n"
]
}
],
"source": [
"import paddle"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "proprietary-disaster",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<function deepspeech.modules.repeat(xs: paddle.VarBase, *size: Any) -> paddle.VarBase>"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"paddle.Tensor.repeat"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "first-diagram",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<property at 0x7fb515eeeb88>"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"paddle.Tensor.size"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "intelligent-david",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<function paddle.tensor.manipulation.concat(x, axis=0, name=None)>"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"paddle.Tensor.cat"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "bronze-tenant",
"metadata": {},
"outputs": [],
"source": [
"a = paddle.to_tensor([12,32, 10, 12, 123,32 ,4])"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "balanced-bearing",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"7"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"a.size"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "extreme-republic",
"metadata": {},
"outputs": [],
"source": [
"def size(xs: paddle.Tensor, *args: int) -> paddle.Tensor:\n",
" nargs = len(args)\n",
" assert (nargs <= 1)\n",
" s = paddle.shape(xs)\n",
" if nargs == 1:\n",
" return s[args[0]]\n",
" else:\n",
" return s\n",
"\n",
"# logger.warn(\n",
"# \"override size of paddle.Tensor if exists or register, remove this when fixed!\"\n",
"# )\n",
"paddle.Tensor.size = size"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "gross-addiction",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Tensor(shape=[1], dtype=int32, place=CPUPlace, stop_gradient=True,\n",
" [7])"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"a.size(0)\n",
"a.size()"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "adverse-dining",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Tensor(shape=[1], dtype=int32, place=CPUPlace, stop_gradient=True,\n",
" [7])"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"a.size()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "popular-potato",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.0"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

@@ -1,672 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"/home/ssd5/zhanghui/DeepSpeech2.x\n"
]
},
{
"data": {
"text/plain": [
"'/home/ssd5/zhanghui/DeepSpeech2.x'"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%cd ..\n",
"%pwd"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2021-03-26 02:55:23,873 - WARNING - register user softmax to paddle, remove this when fixed!\n",
"2021-03-26 02:55:23,875 - WARNING - register user sigmoid to paddle, remove this when fixed!\n",
"2021-03-26 02:55:23,875 - WARNING - register user relu to paddle, remove this when fixed!\n",
"2021-03-26 02:55:23,876 - WARNING - override cat of paddle if exists or register, remove this when fixed!\n",
"2021-03-26 02:55:23,876 - WARNING - override eq of paddle.Tensor if exists or register, remove this when fixed!\n",
"2021-03-26 02:55:23,877 - WARNING - override contiguous of paddle.Tensor if exists or register, remove this when fixed!\n",
"2021-03-26 02:55:23,877 - WARNING - override size of paddle.Tensor (`to_static` do not process `size` property, maybe some `paddle` api dependent on it), remove this when fixed!\n",
"2021-03-26 02:55:23,878 - WARNING - register user view to paddle.Tensor, remove this when fixed!\n",
"2021-03-26 02:55:23,878 - WARNING - register user view_as to paddle.Tensor, remove this when fixed!\n",
"2021-03-26 02:55:23,879 - WARNING - register user masked_fill to paddle.Tensor, remove this when fixed!\n",
"2021-03-26 02:55:23,880 - WARNING - register user masked_fill_ to paddle.Tensor, remove this when fixed!\n",
"2021-03-26 02:55:23,880 - WARNING - register user fill_ to paddle.Tensor, remove this when fixed!\n",
"2021-03-26 02:55:23,881 - WARNING - register user repeat to paddle.Tensor, remove this when fixed!\n",
"2021-03-26 02:55:23,881 - WARNING - register user softmax to paddle.Tensor, remove this when fixed!\n",
"2021-03-26 02:55:23,882 - WARNING - register user sigmoid to paddle.Tensor, remove this when fixed!\n",
"2021-03-26 02:55:23,882 - WARNING - register user relu to paddle.Tensor, remove this when fixed!\n",
"2021-03-26 02:55:23,883 - WARNING - register user glu to paddle.nn.functional, remove this when fixed!\n",
"2021-03-26 02:55:23,883 - WARNING - override ctc_loss of paddle.nn.functional if exists, remove this when fixed!\n",
"2021-03-26 02:55:23,884 - WARNING - register user GLU to paddle.nn, remove this when fixed!\n",
"2021-03-26 02:55:23,884 - WARNING - register user ConstantPad2d to paddle.nn, remove this when fixed!\n",
"/home/ssd5/zhanghui/DeepSpeech2.x/tools/venv-dev/lib/python3.7/site-packages/scipy/fftpack/__init__.py:103: DeprecationWarning: The module numpy.dual is deprecated. Instead of using dual, use the functions directly from numpy or scipy.\n",
" from numpy.dual import register_func\n",
"/home/ssd5/zhanghui/DeepSpeech2.x/tools/venv-dev/lib/python3.7/site-packages/scipy/special/orthogonal.py:81: DeprecationWarning: `np.int` is a deprecated alias for the builtin `int`. To silence this warning, use `int` by itself. Doing this will not modify any behavior and is safe. When replacing `np.int`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.\n",
"Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
" from numpy import (exp, inf, pi, sqrt, floor, sin, cos, around, int,\n"
]
}
],
"source": [
"import os\n",
"import time\n",
"import argparse\n",
"import functools\n",
"import paddle\n",
"import numpy as np\n",
"\n",
"from deepspeech.utils.socket_server import warm_up_test\n",
"from deepspeech.utils.socket_server import AsrTCPServer\n",
"from deepspeech.utils.socket_server import AsrRequestHandler\n",
"\n",
"from deepspeech.training.cli import default_argument_parser\n",
"from deepspeech.exps.deepspeech2.config import get_cfg_defaults\n",
"\n",
"from deepspeech.frontend.utility import read_manifest\n",
"from deepspeech.utils.utility import add_arguments, print_arguments\n",
"\n",
"from deepspeech.models.ds2 import DeepSpeech2Model\n",
"from deepspeech.models.ds2 import DeepSpeech2InferModel\n",
"from deepspeech.io.dataset import ManifestDataset\n",
"\n",
"\n",
"\n",
"from deepspeech.frontend.utility import read_manifest"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.0.0\n",
"e7f28d6c0db54eb9c9a810612300b526687e56a6\n",
"OFF\n",
"OFF\n",
"commit: e7f28d6c0db54eb9c9a810612300b526687e56a6\n",
"None\n",
"0\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/ssd5/zhanghui/DeepSpeech2.x/tools/venv-dev/lib/python3.7/site-packages/ipykernel/ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.\n",
" and should_run_async(code)\n"
]
},
{
"data": {
"text/plain": [
"['__builtins__',\n",
" '__cached__',\n",
" '__doc__',\n",
" '__file__',\n",
" '__loader__',\n",
" '__name__',\n",
" '__package__',\n",
" '__spec__',\n",
" 'commit',\n",
" 'full_version',\n",
" 'istaged',\n",
" 'major',\n",
" 'minor',\n",
" 'mkl',\n",
" 'patch',\n",
" 'rc',\n",
" 'show',\n",
" 'with_mkl']"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"print(paddle.__version__)\n",
"print(paddle.version.commit)\n",
"print(paddle.version.with_mkl)\n",
"print(paddle.version.mkl())\n",
"print(paddle.version.show())\n",
"print(paddle.version.patch)\n",
"dir(paddle.version)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"data:\n",
" augmentation_config: conf/augmentation.config\n",
" batch_size: 64\n",
" dev_manifest: data/manifest.dev\n",
" keep_transcription_text: False\n",
" max_duration: 27.0\n",
" max_freq: None\n",
" mean_std_filepath: examples/aishell/data/mean_std.npz\n",
" min_duration: 0.0\n",
" n_fft: None\n",
" num_workers: 0\n",
" random_seed: 0\n",
" shuffle_method: batch_shuffle\n",
" sortagrad: True\n",
" specgram_type: linear\n",
" stride_ms: 10.0\n",
" target_dB: -20\n",
" target_sample_rate: 16000\n",
" test_manifest: examples/aishell/data/manifest.test\n",
" train_manifest: data/manifest.train\n",
" use_dB_normalization: True\n",
" vocab_filepath: examples/aishell/data/vocab.txt\n",
" window_ms: 20.0\n",
"decoding:\n",
" alpha: 2.6\n",
" batch_size: 128\n",
" beam_size: 300\n",
" beta: 5.0\n",
" cutoff_prob: 0.99\n",
" cutoff_top_n: 40\n",
" decoding_method: ctc_beam_search\n",
" error_rate_type: cer\n",
" lang_model_path: data/lm/zh_giga.no_cna_cmn.prune01244.klm\n",
" num_proc_bsearch: 10\n",
"model:\n",
" num_conv_layers: 2\n",
" num_rnn_layers: 3\n",
" rnn_layer_size: 1024\n",
" share_rnn_weights: False\n",
" use_gru: True\n",
"training:\n",
" global_grad_clip: 5.0\n",
" lr: 0.0005\n",
" lr_decay: 0.83\n",
" n_epoch: 30\n",
" weight_decay: 1e-06\n",
"----------- Configuration Arguments -----------\n",
"checkpoint_path: examples/aishell/ckpt-loss2e-3-0.83-5/checkpoints/step-11725\n",
"config: examples/aishell/conf/deepspeech2.yaml\n",
"device: gpu\n",
"dump_config: None\n",
"export_path: None\n",
"host_ip: localhost\n",
"host_port: 8086\n",
"model_dir: None\n",
"model_file: examples/aishell/jit.model.pdmodel\n",
"nprocs: 1\n",
"opts: ['data.test_manifest', 'examples/aishell/data/manifest.test', 'data.mean_std_filepath', 'examples/aishell/data/mean_std.npz', 'data.vocab_filepath', 'examples/aishell/data/vocab.txt']\n",
"output: None\n",
"params_file: examples/aishell/jit.model.pdiparams\n",
"speech_save_dir: demo_cache\n",
"use_gpu: False\n",
"warmup_manifest: examples/aishell/data/manifest.test\n",
"------------------------------------------------\n"
]
}
],
"source": [
"parser = default_argument_parser()\n",
"add_arg = functools.partial(add_arguments, argparser=parser)\n",
"add_arg('host_ip', str,\n",
" 'localhost',\n",
" \"Server's IP address.\")\n",
"add_arg('host_port', int, 8086, \"Server's IP port.\")\n",
"add_arg('speech_save_dir', str,\n",
" 'demo_cache',\n",
" \"Directory to save demo audios.\")\n",
"add_arg('warmup_manifest', \n",
" str, \n",
" \"examples/aishell/data/manifest.test\", \n",
" \"Filepath of manifest to warm up.\")\n",
"add_arg(\n",
" \"--model_file\",\n",
" type=str,\n",
" default=\"examples/aishell/jit.model.pdmodel\",\n",
" help=\"Model filename, Specify this when your model is a combined model.\"\n",
")\n",
"add_arg(\n",
" \"--params_file\",\n",
" type=str,\n",
" default=\"examples/aishell/jit.model.pdiparams\",\n",
" help=\n",
" \"Parameter filename, Specify this when your model is a combined model.\"\n",
")\n",
"add_arg(\n",
" \"--model_dir\",\n",
" type=str,\n",
" default=None,\n",
" help=\n",
" \"Model dir, If you load a non-combined model, specify the directory of the model.\"\n",
")\n",
"add_arg(\"--use_gpu\",type=bool,default=False, help=\"Whether use gpu.\")\n",
"\n",
"\n",
"args = parser.parse_args(\n",
" \"--checkpoint_path examples/aishell/ckpt-loss2e-3-0.83-5/checkpoints/step-11725 --config examples/aishell/conf/deepspeech2.yaml --opts data.test_manifest examples/aishell/data/manifest.test data.mean_std_filepath examples/aishell/data/mean_std.npz data.vocab_filepath examples/aishell/data/vocab.txt\".split()\n",
")\n",
"\n",
"\n",
"config = get_cfg_defaults()\n",
"if args.config:\n",
" config.merge_from_file(args.config)\n",
"if args.opts:\n",
" config.merge_from_list(args.opts)\n",
"config.freeze()\n",
"print(config)\n",
"\n",
"args.warmup_manifest = config.data.test_manifest\n",
"\n",
"print_arguments(args)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"dataset = ManifestDataset(\n",
" config.data.test_manifest,\n",
" config.data.unit_type,\n",
" config.data.vocab_filepath,\n",
" config.data.mean_std_filepath,\n",
" augmentation_config=\"{}\",\n",
" max_duration=config.data.max_duration,\n",
" min_duration=config.data.min_duration,\n",
" stride_ms=config.data.stride_ms,\n",
" window_ms=config.data.window_ms,\n",
" n_fft=config.data.n_fft,\n",
" max_freq=config.data.max_freq,\n",
" target_sample_rate=config.data.target_sample_rate,\n",
" specgram_type=config.data.specgram_type,\n",
" feat_dim=config.data.feat_dim,\n",
" delta_delta=config.data.delat_delta,\n",
" use_dB_normalization=config.data.use_dB_normalization,\n",
" target_dB=config.data.target_dB,\n",
" random_seed=config.data.random_seed,\n",
" keep_transcription_text=True)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2021-03-26 02:55:57,930 - INFO - [checkpoint] Rank 0: loaded model from examples/aishell/ckpt-loss2e-3-0.83-5/checkpoints/step-11725.pdparams\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"layer summary:\n",
"encoder.conv.conv_in.conv.weight|[32, 1, 41, 11]|14432\n",
"encoder.conv.conv_in.bn.weight|[32]|32\n",
"encoder.conv.conv_in.bn.bias|[32]|32\n",
"encoder.conv.conv_in.bn._mean|[32]|32\n",
"encoder.conv.conv_in.bn._variance|[32]|32\n",
"encoder.conv.conv_stack.0.conv.weight|[32, 32, 21, 11]|236544\n",
"encoder.conv.conv_stack.0.bn.weight|[32]|32\n",
"encoder.conv.conv_stack.0.bn.bias|[32]|32\n",
"encoder.conv.conv_stack.0.bn._mean|[32]|32\n",
"encoder.conv.conv_stack.0.bn._variance|[32]|32\n",
"encoder.rnn.rnn_stacks.0.fw_fc.weight|[1312, 3072]|4030464\n",
"encoder.rnn.rnn_stacks.0.fw_bn.weight|[3072]|3072\n",
"encoder.rnn.rnn_stacks.0.fw_bn.bias|[3072]|3072\n",
"encoder.rnn.rnn_stacks.0.fw_bn._mean|[3072]|3072\n",
"encoder.rnn.rnn_stacks.0.fw_bn._variance|[3072]|3072\n",
"encoder.rnn.rnn_stacks.0.bw_fc.weight|[1312, 3072]|4030464\n",
"encoder.rnn.rnn_stacks.0.bw_bn.weight|[3072]|3072\n",
"encoder.rnn.rnn_stacks.0.bw_bn.bias|[3072]|3072\n",
"encoder.rnn.rnn_stacks.0.bw_bn._mean|[3072]|3072\n",
"encoder.rnn.rnn_stacks.0.bw_bn._variance|[3072]|3072\n",
"encoder.rnn.rnn_stacks.0.fw_cell.weight_hh|[3072, 1024]|3145728\n",
"encoder.rnn.rnn_stacks.0.fw_cell.bias_hh|[3072]|3072\n",
"encoder.rnn.rnn_stacks.0.bw_cell.weight_hh|[3072, 1024]|3145728\n",
"encoder.rnn.rnn_stacks.0.bw_cell.bias_hh|[3072]|3072\n",
"encoder.rnn.rnn_stacks.0.fw_rnn.cell.weight_hh|[3072, 1024]|3145728\n",
"encoder.rnn.rnn_stacks.0.fw_rnn.cell.bias_hh|[3072]|3072\n",
"encoder.rnn.rnn_stacks.0.bw_rnn.cell.weight_hh|[3072, 1024]|3145728\n",
"encoder.rnn.rnn_stacks.0.bw_rnn.cell.bias_hh|[3072]|3072\n",
"encoder.rnn.rnn_stacks.1.fw_fc.weight|[2048, 3072]|6291456\n",
"encoder.rnn.rnn_stacks.1.fw_bn.weight|[3072]|3072\n",
"encoder.rnn.rnn_stacks.1.fw_bn.bias|[3072]|3072\n",
"encoder.rnn.rnn_stacks.1.fw_bn._mean|[3072]|3072\n",
"encoder.rnn.rnn_stacks.1.fw_bn._variance|[3072]|3072\n",
"encoder.rnn.rnn_stacks.1.bw_fc.weight|[2048, 3072]|6291456\n",
"encoder.rnn.rnn_stacks.1.bw_bn.weight|[3072]|3072\n",
"encoder.rnn.rnn_stacks.1.bw_bn.bias|[3072]|3072\n",
"encoder.rnn.rnn_stacks.1.bw_bn._mean|[3072]|3072\n",
"encoder.rnn.rnn_stacks.1.bw_bn._variance|[3072]|3072\n",
"encoder.rnn.rnn_stacks.1.fw_cell.weight_hh|[3072, 1024]|3145728\n",
"encoder.rnn.rnn_stacks.1.fw_cell.bias_hh|[3072]|3072\n",
"encoder.rnn.rnn_stacks.1.bw_cell.weight_hh|[3072, 1024]|3145728\n",
"encoder.rnn.rnn_stacks.1.bw_cell.bias_hh|[3072]|3072\n",
"encoder.rnn.rnn_stacks.1.fw_rnn.cell.weight_hh|[3072, 1024]|3145728\n",
"encoder.rnn.rnn_stacks.1.fw_rnn.cell.bias_hh|[3072]|3072\n",
"encoder.rnn.rnn_stacks.1.bw_rnn.cell.weight_hh|[3072, 1024]|3145728\n",
"encoder.rnn.rnn_stacks.1.bw_rnn.cell.bias_hh|[3072]|3072\n",
"encoder.rnn.rnn_stacks.2.fw_fc.weight|[2048, 3072]|6291456\n",
"encoder.rnn.rnn_stacks.2.fw_bn.weight|[3072]|3072\n",
"encoder.rnn.rnn_stacks.2.fw_bn.bias|[3072]|3072\n",
"encoder.rnn.rnn_stacks.2.fw_bn._mean|[3072]|3072\n",
"encoder.rnn.rnn_stacks.2.fw_bn._variance|[3072]|3072\n",
"encoder.rnn.rnn_stacks.2.bw_fc.weight|[2048, 3072]|6291456\n",
"encoder.rnn.rnn_stacks.2.bw_bn.weight|[3072]|3072\n",
"encoder.rnn.rnn_stacks.2.bw_bn.bias|[3072]|3072\n",
"encoder.rnn.rnn_stacks.2.bw_bn._mean|[3072]|3072\n",
"encoder.rnn.rnn_stacks.2.bw_bn._variance|[3072]|3072\n",
"encoder.rnn.rnn_stacks.2.fw_cell.weight_hh|[3072, 1024]|3145728\n",
"encoder.rnn.rnn_stacks.2.fw_cell.bias_hh|[3072]|3072\n",
"encoder.rnn.rnn_stacks.2.bw_cell.weight_hh|[3072, 1024]|3145728\n",
"encoder.rnn.rnn_stacks.2.bw_cell.bias_hh|[3072]|3072\n",
"encoder.rnn.rnn_stacks.2.fw_rnn.cell.weight_hh|[3072, 1024]|3145728\n",
"encoder.rnn.rnn_stacks.2.fw_rnn.cell.bias_hh|[3072]|3072\n",
"encoder.rnn.rnn_stacks.2.bw_rnn.cell.weight_hh|[3072, 1024]|3145728\n",
"encoder.rnn.rnn_stacks.2.bw_rnn.cell.bias_hh|[3072]|3072\n",
"decoder.ctc_lo.weight|[2048, 4300]|8806400\n",
"decoder.ctc_lo.bias|[4300]|4300\n",
"layer has 66 parameters, 80148012 elements.\n"
]
}
],
"source": [
"model = DeepSpeech2InferModel.from_pretrained(dataset, config,\n",
" args.checkpoint_path)\n",
"model.eval()"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"examples/aishell/jit.model.pdmodel\n",
"examples/aishell/jit.model.pdiparams\n",
"0\n",
"False\n"
]
}
],
"source": [
"\n",
"from paddle.inference import Config\n",
"from paddle.inference import PrecisionType\n",
"from paddle.inference import create_predictor\n",
"\n",
"args.use_gpu=False\n",
"paddle.set_device('cpu')\n",
"\n",
"def init_predictor(args):\n",
" if args.model_dir is not None:\n",
" config = Config(args.model_dir)\n",
" else:\n",
" config = Config(args.model_file, args.params_file)\n",
"\n",
" if args.use_gpu:\n",
" config.enable_use_gpu(memory_pool_init_size_mb=1000, device_id=0)\n",
"# config.enable_tensorrt_engine(precision_mode=PrecisionType.Float32,\n",
"# use_calib_mode=True) # 开启TensorRT预测精度为fp32开启int8离线量化\n",
" else:\n",
" # If not specific mkldnn, you can set the blas thread.\n",
" # The thread num should not be greater than the number of cores in the CPU.\n",
" config.set_cpu_math_library_num_threads(1)\n",
" config.enable_mkldnn()\n",
" \n",
" config.enable_memory_optim()\n",
" config.switch_ir_optim(True)\n",
" \n",
" print(config.model_dir())\n",
" print(config.prog_file())\n",
" print(config.params_file())\n",
" print(config.gpu_device_id())\n",
" print(args.use_gpu)\n",
" predictor = create_predictor(config)\n",
" return predictor\n",
"\n",
"def run(predictor, audio, audio_len):\n",
" # copy img data to input tensor\n",
" input_names = predictor.get_input_names()\n",
" for i, name in enumerate(input_names):\n",
" print(\"input:\", i, name)\n",
" \n",
" audio_tensor = predictor.get_input_handle('audio')\n",
" audio_tensor.reshape(audio.shape)\n",
" audio_tensor.copy_from_cpu(audio.copy())\n",
" \n",
" audiolen_tensor = predictor.get_input_handle('audio_len')\n",
" audiolen_tensor.reshape(audio_len.shape)\n",
" audiolen_tensor.copy_from_cpu(audio_len.copy())\n",
"\n",
" output_names = predictor.get_output_names()\n",
" for i, name in enumerate(output_names):\n",
" print(\"output:\", i, name)\n",
"\n",
" # do the inference\n",
" predictor.run()\n",
"\n",
" results = []\n",
" # get out data from output tensor\n",
" output_names = predictor.get_output_names()\n",
" for i, name in enumerate(output_names):\n",
" output_tensor = predictor.get_output_handle(name)\n",
" output_data = output_tensor.copy_to_cpu()\n",
" results.append(output_data)\n",
"\n",
" return results\n",
"\n",
"\n",
"predictor = init_predictor(args)\n",
"\n",
"def file_to_transcript(filename):\n",
" print(filename)\n",
" feature = dataset.process_utterance(filename, \"\")\n",
" audio = np.array([feature[0]]).astype('float32') #[1, D, T]\n",
" audio_len = feature[0].shape[1]\n",
" audio_len = np.array([audio_len]).astype('int64') # [1]\n",
" \n",
" \n",
" i_probs = run(predictor, audio, audio_len)\n",
" print('jit:', i_probs[0], type(i_probs[0]))\n",
" \n",
" audio = paddle.to_tensor(audio)\n",
" audio_len = paddle.to_tensor(audio_len)\n",
" print(audio.shape)\n",
" print(audio_len.shape)\n",
" \n",
" #eouts, eouts_len = model.encoder(audio, audio_len)\n",
" #probs = model.decoder.softmax(eouts)\n",
" probs = model.forward(audio, audio_len)\n",
" print('paddle:', probs.numpy())\n",
" \n",
" flag = np.allclose(i_probs[0], probs.numpy())\n",
" print(flag)\n",
" \n",
" return probs\n",
"\n",
"# result_transcript = model.decode(\n",
"# audio,\n",
"# audio_len,\n",
"# vocab_list=dataset.vocab_list,\n",
"# decoding_method=config.decoding.decoding_method,\n",
"# lang_model_path=config.decoding.lang_model_path,\n",
"# beam_alpha=config.decoding.alpha,\n",
"# beam_beta=config.decoding.beta,\n",
"# beam_size=config.decoding.beam_size,\n",
"# cutoff_prob=config.decoding.cutoff_prob,\n",
"# cutoff_top_n=config.decoding.cutoff_top_n,\n",
"# num_processes=config.decoding.num_proc_bsearch)\n",
"# return result_transcript[0]"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Warm-up Test Case %d: %s 0 /home/ssd5/zhanghui/DeepSpeech2.x/examples/aishell/../dataset/aishell/data_aishell/wav/test/S0764/BAC009S0764W0124.wav\n",
"/home/ssd5/zhanghui/DeepSpeech2.x/examples/aishell/../dataset/aishell/data_aishell/wav/test/S0764/BAC009S0764W0124.wav\n",
"input: 0 audio\n",
"input: 1 audio_len\n",
"output: 0 tmp_75\n",
"jit: [[[8.91786298e-12 4.45648032e-12 3.67572750e-09 ... 8.91767563e-12\n",
" 8.91573707e-12 4.64317296e-08]\n",
" [1.55950222e-15 2.62794089e-14 4.50423509e-12 ... 1.55944271e-15\n",
" 1.55891342e-15 9.99992609e-01]\n",
" [1.24638127e-17 7.61802427e-16 2.93265812e-14 ... 1.24633371e-17\n",
" 1.24587264e-17 1.00000000e+00]\n",
" ...\n",
" [4.37488240e-15 2.43676260e-12 1.98770514e-12 ... 4.37479896e-15\n",
" 4.37354747e-15 1.00000000e+00]\n",
" [3.89334696e-13 1.66754856e-11 1.42900388e-11 ... 3.89329492e-13\n",
" 3.89252270e-13 1.00000000e+00]\n",
" [1.00349985e-10 2.56293708e-10 2.91177582e-10 ... 1.00347876e-10\n",
" 1.00334095e-10 9.99998808e-01]]] <class 'numpy.ndarray'>\n",
"[1, 161, 522]\n",
"[1]\n",
"paddle: [[[8.91789680e-12 4.45649724e-12 3.67574149e-09 ... 8.91770945e-12\n",
" 8.91577090e-12 4.64319072e-08]\n",
" [1.55950222e-15 2.62794089e-14 4.50423509e-12 ... 1.55944271e-15\n",
" 1.55891342e-15 9.99992609e-01]\n",
" [1.24638599e-17 7.61805339e-16 2.93267472e-14 ... 1.24633842e-17\n",
" 1.24587735e-17 1.00000000e+00]\n",
" ...\n",
" [4.37488240e-15 2.43676737e-12 1.98770514e-12 ... 4.37479896e-15\n",
" 4.37354747e-15 1.00000000e+00]\n",
" [3.89336187e-13 1.66755481e-11 1.42900925e-11 ... 3.89330983e-13\n",
" 3.89253761e-13 1.00000000e+00]\n",
" [1.00349985e-10 2.56293708e-10 2.91177582e-10 ... 1.00347876e-10\n",
" 1.00334095e-10 9.99998808e-01]]]\n",
"False\n"
]
}
],
"source": [
"manifest = read_manifest(args.warmup_manifest)\n",
"\n",
"for idx, sample in enumerate(manifest[:1]):\n",
" print(\"Warm-up Test Case %d: %s\", idx, sample['audio_filepath'])\n",
" start_time = time.time()\n",
" transcript = file_to_transcript(sample['audio_filepath'])\n",
" finish_time = time.time()\n",
"# print(\"Response Time: %f, Transcript: %s\" %\n",
"# (finish_time - start_time, transcript))\n",
" break"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(1, 161, 522) (1,)\n",
"input: 0 audio\n",
"input: 1 audio_len\n",
"output: 0 tmp_75\n",
"jit: [[[8.91789680e-12 4.45649724e-12 3.67574149e-09 ... 8.91770945e-12\n",
" 8.91577090e-12 4.64319072e-08]\n",
" [1.55950222e-15 2.62794089e-14 4.50423509e-12 ... 1.55944271e-15\n",
" 1.55891342e-15 9.99992609e-01]\n",
" [1.24638599e-17 7.61805339e-16 2.93267472e-14 ... 1.24633842e-17\n",
" 1.24587735e-17 1.00000000e+00]\n",
" ...\n",
" [4.37488240e-15 2.43676737e-12 1.98770514e-12 ... 4.37479896e-15\n",
" 4.37354747e-15 1.00000000e+00]\n",
" [3.89336187e-13 1.66755481e-11 1.42900925e-11 ... 3.89330983e-13\n",
" 3.89253761e-13 1.00000000e+00]\n",
" [1.00349985e-10 2.56293708e-10 2.91177582e-10 ... 1.00347876e-10\n",
" 1.00334095e-10 9.99998808e-01]]]\n"
]
}
],
"source": [
"def test(filename):\n",
" feature = dataset.process_utterance(filename, \"\")\n",
" audio = np.array([feature[0]]).astype('float32') #[1, D, T]\n",
" audio_len = feature[0].shape[1]\n",
" audio_len = np.array([audio_len]).astype('int64') # [1]\n",
" \n",
" print(audio.shape, audio_len.shape)\n",
"\n",
" i_probs = run(predictor, audio, audio_len)\n",
" print('jit:', i_probs[0])\n",
" return i_probs\n",
" \n",
"probs = test(sample['audio_filepath'])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.0"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
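For orientation, the inference cells above reduce to the following minimal Paddle Inference flow, assuming the standard paddle.inference API; the model and parameter file names here are placeholders, not paths from this repo.

import numpy as np
from paddle.inference import Config, create_predictor

def init_predictor(model_file="model.pdmodel", params_file="model.pdiparams"):
    # build an inference config and bind it to a predictor
    config = Config(model_file, params_file)
    config.enable_use_gpu(100, 0)  # memory pool size in MB, GPU id
    return create_predictor(config)

def run(predictor, audio, audio_len):
    # copy host inputs into the predictor's input handles, in declared order
    for name, data in zip(predictor.get_input_names(), [audio, audio_len]):
        handle = predictor.get_input_handle(name)
        handle.reshape(data.shape)
        handle.copy_from_cpu(data)
    predictor.run()
    # pull every output tensor back to host memory
    return [predictor.get_output_handle(n).copy_to_cpu()
            for n in predictor.get_output_names()]

predictor = init_predictor()
audio = np.zeros((1, 161, 522), dtype="float32")  # [batch, feat_dim, n_frames]
audio_len = np.array([522], dtype="int64")        # [batch]
probs = run(predictor, audio, audio_len)[0]

copy_from_cpu and copy_to_cpu move numpy buffers between host memory and the predictor's device tensors, which is why each input handle is reshaped before the copy.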

@ -1,229 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 32,
"id": "academic-surname",
"metadata": {},
"outputs": [],
"source": [
"import paddle\n",
"from paddle import nn"
]
},
{
"cell_type": "code",
"execution_count": 33,
"id": "fundamental-treasure",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Parameter containing:\n",
"Tensor(shape=[256], dtype=float32, place=CUDAPlace(0), stop_gradient=False,\n",
" [1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])\n",
"Parameter containing:\n",
"Tensor(shape=[256], dtype=float32, place=CUDAPlace(0), stop_gradient=False,\n",
" [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])\n"
]
}
],
"source": [
"L = nn.LayerNorm(256, epsilon=1e-12)\n",
"for p in L.parameters():\n",
" print(p)"
]
},
{
"cell_type": "code",
"execution_count": 34,
"id": "consolidated-elephant",
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n"
]
},
{
"cell_type": "code",
"execution_count": 46,
"id": "moderate-noise",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"float64\n"
]
}
],
"source": [
"x = np.random.randn(2, 51, 256)\n",
"print(x.dtype)"
]
},
{
"cell_type": "code",
"execution_count": 47,
"id": "cooked-progressive",
"metadata": {},
"outputs": [],
"source": [
"y = L(paddle.to_tensor(x, dtype='float32'))"
]
},
{
"cell_type": "code",
"execution_count": 48,
"id": "optimum-milwaukee",
"metadata": {},
"outputs": [],
"source": [
"import torch"
]
},
{
"cell_type": "code",
"execution_count": 49,
"id": "viral-indian",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Parameter containing:\n",
"tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n",
" 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n",
" 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n",
" 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n",
" 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n",
" 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n",
" 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n",
" 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n",
" 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n",
" 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n",
" 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n",
" 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n",
" 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n",
" 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n",
" 1., 1., 1., 1.], requires_grad=True)\n",
"Parameter containing:\n",
"tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
" 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
" 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
" 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
" 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
" 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
" 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
" 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
" 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
" 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
" 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],\n",
" requires_grad=True)\n"
]
}
],
"source": [
"TL = torch.nn.LayerNorm(256, eps=1e-12)\n",
"for p in TL.parameters():\n",
" print(p)"
]
},
{
"cell_type": "code",
"execution_count": 50,
"id": "skilled-vietnamese",
"metadata": {},
"outputs": [],
"source": [
"ty = TL(torch.tensor(x, dtype=torch.float32))"
]
},
{
"cell_type": "code",
"execution_count": 51,
"id": "incorrect-allah",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"False"
]
},
"execution_count": 51,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np.allclose(y.numpy(), ty.detach().numpy())"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "prostate-cameroon",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 52,
"id": "governmental-surge",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 52,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"x = np.random.randn(2, 256)\n",
"y = L(paddle.to_tensor(x, dtype='float32'))\n",
"ty = TL(torch.tensor(x, dtype=torch.float32))\n",
"np.allclose(y.numpy(), ty.detach().numpy())"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "confidential-jacket",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.0"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
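When an exact np.allclose check like the one above fails, a tolerance-aware comparison is a common way to distinguish a genuine mismatch from float32 accumulation noise; the helper below is generic and makes no claim about which case applies here.

import numpy as np

def report_diff(a, b, rtol=1e-6, atol=1e-6):
    # print the largest elementwise gap and a tolerance-aware comparison
    a, b = np.asarray(a), np.asarray(b)
    print("max abs diff:", np.max(np.abs(a - b)))
    print("allclose(rtol=%g, atol=%g):" % (rtol, atol),
          np.allclose(a, b, rtol=rtol, atol=atol))

# e.g. report_diff(y.numpy(), ty.detach().numpy())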

@ -1,449 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "primary-organic",
"metadata": {},
"outputs": [],
"source": [
"import torch"
]
},
{
"cell_type": "code",
"execution_count": 38,
"id": "stopped-semester",
"metadata": {},
"outputs": [],
"source": [
"def mask_finished_scores(score: torch.Tensor,\n",
" flag: torch.Tensor) -> torch.Tensor:\n",
" \"\"\"\n",
" If a sequence is finished, we only allow one alive branch. This function\n",
" aims to give one branch a zero score and the rest -inf score.\n",
" Args:\n",
" score (torch.Tensor): A real value array with shape\n",
" (batch_size * beam_size, beam_size).\n",
" flag (torch.Tensor): A bool array with shape\n",
" (batch_size * beam_size, 1).\n",
" Returns:\n",
" torch.Tensor: (batch_size * beam_size, beam_size).\n",
" \"\"\"\n",
" beam_size = score.size(-1)\n",
" zero_mask = torch.zeros_like(flag, dtype=torch.bool)\n",
" if beam_size > 1:\n",
" unfinished = torch.cat((zero_mask, flag.repeat([1, beam_size - 1])),\n",
" dim=1)\n",
" finished = torch.cat((flag, zero_mask.repeat([1, beam_size - 1])),\n",
" dim=1)\n",
" else:\n",
" unfinished = zero_mask\n",
" finished = flag\n",
" print(unfinished)\n",
" print(finished)\n",
" score.masked_fill_(unfinished, -float('inf'))\n",
" score.masked_fill_(finished, 0)\n",
" return score"
]
},
{
"cell_type": "code",
"execution_count": 58,
"id": "agreed-portuguese",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"tensor([[ True],\n",
" [False]])\n",
"tensor([[-0.8841, 0.7381, -0.9986],\n",
" [ 0.2675, -0.7971, 0.3798]])\n",
"tensor([[ True, True],\n",
" [False, False]])\n"
]
}
],
"source": [
"score = torch.randn((2, 3))\n",
"flag = torch.ones((2, 1), dtype=torch.bool)\n",
"flag[1] = False\n",
"print(flag)\n",
"print(score)\n",
"print(flag.repeat([1, 2]))"
]
},
{
"cell_type": "code",
"execution_count": 59,
"id": "clean-aspect",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"tensor([[False, True, True],\n",
" [False, False, False]])\n",
"tensor([[ True, False, False],\n",
" [False, False, False]])\n",
"tensor([[ 0.0000, -inf, -inf],\n",
" [ 0.2675, -0.7971, 0.3798]])\n",
"tensor([[ 0.0000, -inf, -inf],\n",
" [ 0.2675, -0.7971, 0.3798]])\n"
]
}
],
"source": [
"r = mask_finished_scores(score, flag)\n",
"print(r)\n",
"print(score)"
]
},
{
"cell_type": "code",
"execution_count": 55,
"id": "thrown-airline",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Tensor(shape=[2, 1], dtype=bool, place=CUDAPlace(0), stop_gradient=True,\n",
" [[True ],\n",
" [False]])\n",
"Tensor(shape=[2, 3], dtype=float32, place=CUDAPlace(0), stop_gradient=True,\n",
" [[ 2.05994511, 1.87704289, 0.01988174],\n",
" [-0.40165186, 0.77547729, -0.64469045]])\n",
"Tensor(shape=[2, 2], dtype=bool, place=CUDAPlace(0), stop_gradient=True,\n",
" [[True , True ],\n",
" [False, False]])\n"
]
}
],
"source": [
"import paddle\n",
"\n",
"score = paddle.randn((2, 3))\n",
"flag = paddle.ones((2, 1), dtype='bool')\n",
"flag[1] = False\n",
"print(flag)\n",
"print(score)\n",
"print(flag.tile([1, 2]))"
]
},
{
"cell_type": "code",
"execution_count": 56,
"id": "internal-patent",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Tensor(shape=[2, 3], dtype=bool, place=CUDAPlace(0), stop_gradient=True,\n",
" [[False, True , True ],\n",
" [False, False, False]])\n",
"Tensor(shape=[2, 3], dtype=bool, place=CUDAPlace(0), stop_gradient=True,\n",
" [[True , False, False],\n",
" [False, False, False]])\n",
"x Tensor(shape=[2, 3], dtype=float32, place=CUDAPlace(0), stop_gradient=True,\n",
" [[ 2.05994511, 1.87704289, 0.01988174],\n",
" [-0.40165186, 0.77547729, -0.64469045]])\n",
"2 Tensor(shape=[2, 3], dtype=float32, place=CUDAPlace(0), stop_gradient=True,\n",
" [[ 2.05994511, 1.87704289, 0.01988174],\n",
" [-0.40165186, 0.77547729, -0.64469045]])\n",
"3 Tensor(shape=[2, 3], dtype=float32, place=CUDAPlace(0), stop_gradient=True,\n",
" [[ 2.05994511, -inf. , -inf. ],\n",
" [-0.40165186, 0.77547729, -0.64469045]])\n",
"x Tensor(shape=[2, 3], dtype=float32, place=CUDAPlace(0), stop_gradient=True,\n",
" [[ 2.05994511, -inf. , -inf. ],\n",
" [-0.40165186, 0.77547729, -0.64469045]])\n",
"2 Tensor(shape=[2, 3], dtype=float32, place=CUDAPlace(0), stop_gradient=True,\n",
" [[ 2.05994511, -inf. , -inf. ],\n",
" [-0.40165186, 0.77547729, -0.64469045]])\n",
"3 Tensor(shape=[2, 3], dtype=float32, place=CUDAPlace(0), stop_gradient=True,\n",
" [[ 0. , -inf. , -inf. ],\n",
" [-0.40165186, 0.77547729, -0.64469045]])\n",
"Tensor(shape=[2, 3], dtype=float32, place=CUDAPlace(0), stop_gradient=True,\n",
" [[ 0. , -inf. , -inf. ],\n",
" [-0.40165186, 0.77547729, -0.64469045]])\n"
]
}
],
"source": [
"paddle.bool = 'bool'\n",
"\n",
"def masked_fill(xs:paddle.Tensor, mask:paddle.Tensor, value:float):\n",
" print(xs)\n",
" trues = paddle.ones_like(xs) * value\n",
" assert xs.shape == mask.shape\n",
" xs = paddle.where(mask, trues, xs)\n",
" return xs\n",
"\n",
"def masked_fill_(xs:paddle.Tensor, mask:paddle.Tensor, value:float):\n",
" print('x', xs)\n",
" trues = paddle.ones_like(xs) * value\n",
" assert xs.shape == mask.shape\n",
" ret = paddle.where(mask, trues, xs)\n",
" print('2', xs)\n",
" paddle.assign(ret, output=xs)\n",
" print('3', xs)\n",
"\n",
"paddle.Tensor.masked_fill = masked_fill\n",
"paddle.Tensor.masked_fill_ = masked_fill_\n",
"\n",
"def mask_finished_scores_pd(score: paddle.Tensor,\n",
" flag: paddle.Tensor) -> paddle.Tensor:\n",
" \"\"\"\n",
" If a sequence is finished, we only allow one alive branch. This function\n",
" aims to give one branch a zero score and the rest -inf score.\n",
" Args:\n",
" score (torch.Tensor): A real value array with shape\n",
" (batch_size * beam_size, beam_size).\n",
" flag (torch.Tensor): A bool array with shape\n",
" (batch_size * beam_size, 1).\n",
" Returns:\n",
" torch.Tensor: (batch_size * beam_size, beam_size).\n",
" \"\"\"\n",
" beam_size = score.shape[-1]\n",
" zero_mask = paddle.zeros_like(flag, dtype=paddle.bool)\n",
" if beam_size > 1:\n",
" unfinished = paddle.concat((zero_mask, flag.tile([1, beam_size - 1])),\n",
" axis=1)\n",
" finished = paddle.concat((flag, zero_mask.tile([1, beam_size - 1])),\n",
" axis=1)\n",
" else:\n",
" unfinished = zero_mask\n",
" finished = flag\n",
" print(unfinished)\n",
" print(finished)\n",
" \n",
" #score.masked_fill_(unfinished, -float('inf'))\n",
" #score.masked_fill_(finished, 0)\n",
"# infs = paddle.ones_like(score) * -float('inf')\n",
"# score = paddle.where(unfinished, infs, score)\n",
"# score = paddle.where(finished, paddle.zeros_like(score), score)\n",
"\n",
"# score = score.masked_fill(unfinished, -float('inf'))\n",
"# score = score.masked_fill(finished, 0)\n",
" score.masked_fill_(unfinished, -float('inf'))\n",
" score.masked_fill_(finished, 0)\n",
" return score\n",
"\n",
"r = mask_finished_scores_pd(score, flag)\n",
"print(r)"
]
},
{
"cell_type": "code",
"execution_count": 57,
"id": "vocal-prime",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<bound method PyCapsule.value of Tensor(shape=[2, 3], dtype=float32, place=CUDAPlace(0), stop_gradient=True,\n",
" [[ 0. , -inf. , -inf. ],\n",
" [-0.40165186, 0.77547729, -0.64469045]])>"
]
},
"execution_count": 57,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"score.value"
]
},
{
"cell_type": "code",
"execution_count": 71,
"id": "bacterial-adolescent",
"metadata": {},
"outputs": [],
"source": [
"from typing import Union, Any"
]
},
{
"cell_type": "code",
"execution_count": 72,
"id": "absent-fiber",
"metadata": {},
"outputs": [],
"source": [
"def repeat(xs : paddle.Tensor, *size: Any):\n",
" print(size)\n",
" return paddle.tile(xs, size)\n",
"paddle.Tensor.repeat = repeat"
]
},
{
"cell_type": "code",
"execution_count": 73,
"id": "material-harbor",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(1, 2)\n",
"Tensor(shape=[2, 2], dtype=bool, place=CUDAPlace(0), stop_gradient=True,\n",
" [[True , True ],\n",
" [False, False]])\n"
]
}
],
"source": [
"flag = paddle.ones((2, 1), dtype='bool')\n",
"flag[1] = False\n",
"print(flag.repeat(1, 2))"
]
},
{
"cell_type": "code",
"execution_count": 84,
"id": "acute-brighton",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(Tensor(shape=[1], dtype=int64, place=CUDAPlace(0), stop_gradient=True,\n",
" [1]), 2)\n",
"Tensor(shape=[2, 2], dtype=bool, place=CUDAPlace(0), stop_gradient=True,\n",
" [[True , True ],\n",
" [False, False]])\n"
]
}
],
"source": [
"flag = paddle.ones((2, 1), dtype='bool')\n",
"flag[1] = False\n",
"print(flag.repeat(paddle.to_tensor(1), 2))"
]
},
{
"cell_type": "code",
"execution_count": 85,
"id": "european-rugby",
"metadata": {},
"outputs": [],
"source": [
"def size(xs, *args: int):\n",
" nargs = len(args)\n",
" s = paddle.shape(xs)\n",
" assert(nargs <= 1)\n",
" if nargs == 1:\n",
" return s[args[0]]\n",
" else:\n",
" return s\n",
"paddle.Tensor.size = size"
]
},
{
"cell_type": "code",
"execution_count": 86,
"id": "moral-special",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Tensor(shape=[2], dtype=int32, place=CPUPlace, stop_gradient=True,\n",
" [2, 1])"
]
},
"execution_count": 86,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"flag.size()"
]
},
{
"cell_type": "code",
"execution_count": 87,
"id": "ahead-coach",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Tensor(shape=[1], dtype=int32, place=CPUPlace, stop_gradient=True,\n",
" [1])"
]
},
"execution_count": 87,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"flag.size(1)"
]
},
{
"cell_type": "code",
"execution_count": 88,
"id": "incomplete-fitness",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Tensor(shape=[1], dtype=int32, place=CPUPlace, stop_gradient=True,\n",
" [2])"
]
},
"execution_count": 88,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"flag.size(0)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "upset-connectivity",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.0"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

@ -1,231 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"id": "designing-borough",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/ipykernel/ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.\n",
" and should_run_async(code)\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[[ 0.0000000e+00 0.0000000e+00 0.0000000e+00 ... 0.0000000e+00\n",
" 0.0000000e+00 0.0000000e+00]\n",
" [ 8.4147096e-01 8.0196178e-01 7.6172036e-01 ... 1.2409373e-04\n",
" 1.1547816e-04 1.0746076e-04]\n",
" [ 9.0929741e-01 9.5814437e-01 9.8704624e-01 ... 2.4818745e-04\n",
" 2.3095631e-04 2.1492151e-04]\n",
" ...\n",
" [ 3.7960774e-01 7.4510968e-01 7.3418564e-01 ... 1.2036801e-02\n",
" 1.1201146e-02 1.0423505e-02]\n",
" [-5.7338190e-01 -8.9752287e-02 -4.1488394e-02 ... 1.2160885e-02\n",
" 1.1316618e-02 1.0530960e-02]\n",
" [-9.9920684e-01 -8.5234123e-01 -7.8794664e-01 ... 1.2284970e-02\n",
" 1.1432089e-02 1.0638415e-02]]\n",
"True\n",
"True\n"
]
}
],
"source": [
"import torch\n",
"import math\n",
"import numpy as np\n",
"\n",
"max_len=100\n",
"d_model=256\n",
"\n",
"pe = torch.zeros(max_len, d_model)\n",
"position = torch.arange(0, max_len,\n",
" dtype=torch.float32).unsqueeze(1)\n",
"toruch_position = position\n",
"div_term = torch.exp(\n",
" torch.arange(0, d_model, 2, dtype=torch.float32) *\n",
" -(math.log(10000.0) / d_model))\n",
"tourch_div_term = div_term.cpu().detach().numpy()\n",
"\n",
"\n",
"\n",
"torhc_sin = torch.sin(position * div_term)\n",
"torhc_cos = torch.cos(position * div_term)\n",
"print(torhc_sin.cpu().detach().numpy())\n",
"np_sin = np.sin((position * div_term).cpu().detach().numpy())\n",
"np_cos = np.cos((position * div_term).cpu().detach().numpy())\n",
"print(np.allclose(np_sin, torhc_sin.cpu().detach().numpy()))\n",
"print(np.allclose(np_cos, torhc_cos.cpu().detach().numpy()))\n",
"pe[:, 0::2] = torhc_sin\n",
"pe[:, 1::2] = torhc_cos\n",
"tourch_pe = pe.cpu().detach().numpy()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "swiss-referral",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"True\n",
"True\n",
"False\n",
"False\n",
"False\n",
"False\n",
"[[ 1. 1. 1. ... 1. 1.\n",
" 1. ]\n",
" [ 0.5403023 0.59737533 0.6479059 ... 1. 1.\n",
" 1. ]\n",
" [-0.41614684 -0.28628543 -0.1604359 ... 0.99999994 1.\n",
" 1. ]\n",
" ...\n",
" [-0.92514753 -0.66694194 -0.67894876 ... 0.9999276 0.99993724\n",
" 0.9999457 ]\n",
" [-0.81928825 -0.9959641 -0.999139 ... 0.99992603 0.999936\n",
" 0.99994457]\n",
" [ 0.03982088 -0.52298605 -0.6157435 ... 0.99992454 0.9999347\n",
" 0.99994344]]\n",
"----\n",
"[[ 1. 1. 1. ... 1. 1.\n",
" 1. ]\n",
" [ 0.54030234 0.59737533 0.6479059 ... 1. 1.\n",
" 1. ]\n",
" [-0.41614684 -0.28628543 -0.1604359 ... 1. 1.\n",
" 1. ]\n",
" ...\n",
" [-0.92514753 -0.66694194 -0.67894876 ... 0.9999276 0.9999373\n",
" 0.9999457 ]\n",
" [-0.81928825 -0.9959641 -0.999139 ... 0.99992603 0.999936\n",
" 0.99994457]\n",
" [ 0.03982088 -0.5229861 -0.6157435 ... 0.99992454 0.9999347\n",
" 0.99994344]]\n",
")))))))\n",
"[[ 0.0000000e+00 0.0000000e+00 0.0000000e+00 ... 0.0000000e+00\n",
" 0.0000000e+00 0.0000000e+00]\n",
" [ 8.4147096e-01 8.0196178e-01 7.6172036e-01 ... 1.2409373e-04\n",
" 1.1547816e-04 1.0746076e-04]\n",
" [ 9.0929741e-01 9.5814437e-01 9.8704624e-01 ... 2.4818745e-04\n",
" 2.3095631e-04 2.1492151e-04]\n",
" ...\n",
" [ 3.7960774e-01 7.4510968e-01 7.3418564e-01 ... 1.2036801e-02\n",
" 1.1201146e-02 1.0423505e-02]\n",
" [-5.7338190e-01 -8.9752287e-02 -4.1488394e-02 ... 1.2160885e-02\n",
" 1.1316618e-02 1.0530960e-02]\n",
" [-9.9920684e-01 -8.5234123e-01 -7.8794664e-01 ... 1.2284970e-02\n",
" 1.1432089e-02 1.0638415e-02]]\n",
"----\n",
"[[ 0.0000000e+00 0.0000000e+00 0.0000000e+00 ... 0.0000000e+00\n",
" 0.0000000e+00 0.0000000e+00]\n",
" [ 8.4147096e-01 8.0196178e-01 7.6172036e-01 ... 1.2409373e-04\n",
" 1.1547816e-04 1.0746076e-04]\n",
" [ 9.0929741e-01 9.5814437e-01 9.8704624e-01 ... 2.4818745e-04\n",
" 2.3095631e-04 2.1492151e-04]\n",
" ...\n",
" [ 3.7960774e-01 7.4510968e-01 7.3418564e-01 ... 1.2036801e-02\n",
" 1.1201146e-02 1.0423505e-02]\n",
" [-5.7338190e-01 -8.9752287e-02 -4.1488394e-02 ... 1.2160885e-02\n",
" 1.1316618e-02 1.0530960e-02]\n",
" [-9.9920684e-01 -8.5234123e-01 -7.8794664e-01 ... 1.2284970e-02\n",
" 1.1432089e-02 1.0638415e-02]]\n"
]
}
],
"source": [
"import paddle\n",
"paddle.set_device('cpu')\n",
"ppe = paddle.zeros((max_len, d_model), dtype='float32')\n",
"position = paddle.arange(0, max_len,\n",
" dtype='float32').unsqueeze(1)\n",
"print(np.allclose(position.numpy(), toruch_position))\n",
"div_term = paddle.exp(\n",
" paddle.arange(0, d_model, 2, dtype='float32') *\n",
" -(math.log(10000.0) / d_model))\n",
"print(np.allclose(div_term.numpy(), tourch_div_term))\n",
"\n",
"\n",
"\n",
"p_sin = paddle.sin(position * div_term)\n",
"p_cos = paddle.cos(position * div_term)\n",
"print(np.allclose(np_sin, p_sin.numpy(), rtol=1.e-6, atol=0))\n",
"print(np.allclose(np_cos, p_cos.numpy(), rtol=1.e-6, atol=0))\n",
"ppe[:, 0::2] = p_sin\n",
"ppe[:, 1::2] = p_cos\n",
"print(np.allclose(p_sin.numpy(), torhc_sin.cpu().detach().numpy()))\n",
"print(np.allclose(p_cos.numpy(), torhc_cos.cpu().detach().numpy()))\n",
"print(p_cos.numpy())\n",
"print(\"----\")\n",
"print(torhc_cos.cpu().detach().numpy())\n",
"print(\")))))))\")\n",
"print(p_sin.numpy())\n",
"print(\"----\")\n",
"print(torhc_sin.cpu().detach().numpy())"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "integrated-boards",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"False\n"
]
}
],
"source": [
"print(np.allclose(ppe.numpy(), pe.numpy()))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "flying-reserve",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "revised-divide",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.0"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large

@ -1,5 +1,3 @@
[Chinese version](README_cn.md)
# PaddlePaddle Speech to Any toolkit # PaddlePaddle Speech to Any toolkit
![License](https://img.shields.io/badge/license-Apache%202-red.svg) ![License](https://img.shields.io/badge/license-Apache%202-red.svg)
@ -11,31 +9,29 @@
## Features ## Features
See [feature list](doc/src/feature_list.md) for more information. See [feature list](docs/src/feature_list.md) for more information.
## Setup ## Setup
All tested under: All tested under:
* Ubuntu 16.04 * Ubuntu 16.04
* python>=3.7 * python>=3.7
* paddlepaddle>=2.1.2 * paddlepaddle>=2.2.0rc
Please see [install](doc/src/install.md). Please see [install](docs/src/install.md).
## Getting Started ## Getting Started
Please see [Getting Started](doc/src/getting_started.md) and [tiny egs](examples/tiny/s0/README.md). Please see [Getting Started](docs/src/getting_started.md) and [tiny egs](examples/tiny/s0/README.md).
## More Information ## More Information
* [Data Preparation](doc/src/data_preparation.md) * [Data Preparation](docs/src/data_preparation.md)
* [Data Augmentation](doc/src/augmentation.md) * [Data Augmentation](docs/src/augmentation.md)
* [Ngram LM](doc/src/ngram_lm.md) * [Ngram LM](docs/src/ngram_lm.md)
* [Server Demo](doc/src/server.md) * [Benchmark](docs/src/benchmark.md)
* [Benchmark](doc/src/benchmark.md) * [Released Model](docs/src/released_model.md)
* [Released Model](doc/src/released_model.md)
* [FAQ](doc/src/faq.md)
## Questions and Help ## Questions and Help
@ -45,8 +41,8 @@ You are welcome to submit questions in [Github Discussions](https://github.com/P
## License ## License
DeepASR is provided under the [Apache-2.0 License](./LICENSE). DeepSpeech is provided under the [Apache-2.0 License](./LICENSE).
## Acknowledgement ## Acknowledgement
We depend on many open source repos. See [References](doc/src/reference.md) for more information. We depend on many open source repos. See [References](docs/src/reference.md) for more information.

@ -1,51 +0,0 @@
[English](README.md)
# PaddlePaddle Speech to Any toolkit
![License](https://img.shields.io/badge/license-Apache%202-red.svg)
![python version](https://img.shields.io/badge/python-3.7+-orange.svg)
![support os](https://img.shields.io/badge/os-linux-yellow.svg)
*DeepSpeech* is an open-source project for an end-to-end automatic speech recognition engine built on the [PaddlePaddle](https://github.com/PaddlePaddle/Paddle) platform.
Our vision is to provide easy-to-use, efficient, compact, and scalable tools for speech recognition in both industrial applications and academic research, covering training, inference, and deployment.
## Features
See the [feature list](doc/src/feature_list.md).
## Setup
All tested under:
* Ubuntu 16.04
* python>=3.7
* paddlepaddle>=2.1.2
See [install](doc/src/install.md).
## Getting Started
Please see [Getting Started](doc/src/getting_started.md) and [tiny egs](examples/tiny/s0/README.md).
## More Information
* [Data Preparation](doc/src/data_preparation.md)
* [Data Augmentation](doc/src/augmentation.md)
* [Ngram LM](doc/src/ngram_lm.md)
* [Server Demo](doc/src/server.md)
* [Benchmark](doc/src/benchmark.md)
* [Released Model](doc/src/released_model.md)
* [FAQ](doc/src/faq.md)
## Questions and Help
You are welcome to submit questions in [Github Discussions](https://github.com/PaddlePaddle/DeepSpeech/discussions) and report bugs in [Github Issues](https://github.com/PaddlePaddle/models/issues). Contributions to this project are also welcome.
## License
DeepASR is provided under the [Apache-2.0 License](./LICENSE).
## Acknowledgement
Development drew on a number of excellent open source repos; see [References](doc/src/reference.md) for details.

@ -80,23 +80,23 @@ def convert_dtype_to_string(tensor_dtype):
if not hasattr(paddle, 'softmax'): if not hasattr(paddle, 'softmax'):
logger.warn("register user softmax to paddle, remove this when fixed!") logger.debug("register user softmax to paddle, remove this when fixed!")
setattr(paddle, 'softmax', paddle.nn.functional.softmax) setattr(paddle, 'softmax', paddle.nn.functional.softmax)
if not hasattr(paddle, 'log_softmax'): if not hasattr(paddle, 'log_softmax'):
logger.warn("register user log_softmax to paddle, remove this when fixed!") logger.debug("register user log_softmax to paddle, remove this when fixed!")
setattr(paddle, 'log_softmax', paddle.nn.functional.log_softmax) setattr(paddle, 'log_softmax', paddle.nn.functional.log_softmax)
if not hasattr(paddle, 'sigmoid'): if not hasattr(paddle, 'sigmoid'):
logger.warn("register user sigmoid to paddle, remove this when fixed!") logger.debug("register user sigmoid to paddle, remove this when fixed!")
setattr(paddle, 'sigmoid', paddle.nn.functional.sigmoid) setattr(paddle, 'sigmoid', paddle.nn.functional.sigmoid)
if not hasattr(paddle, 'log_sigmoid'): if not hasattr(paddle, 'log_sigmoid'):
logger.warn("register user log_sigmoid to paddle, remove this when fixed!") logger.debug("register user log_sigmoid to paddle, remove this when fixed!")
setattr(paddle, 'log_sigmoid', paddle.nn.functional.log_sigmoid) setattr(paddle, 'log_sigmoid', paddle.nn.functional.log_sigmoid)
if not hasattr(paddle, 'relu'): if not hasattr(paddle, 'relu'):
logger.warn("register user relu to paddle, remove this when fixed!") logger.debug("register user relu to paddle, remove this when fixed!")
setattr(paddle, 'relu', paddle.nn.functional.relu) setattr(paddle, 'relu', paddle.nn.functional.relu)
@ -105,7 +105,7 @@ def cat(xs, dim=0):
if not hasattr(paddle, 'cat'): if not hasattr(paddle, 'cat'):
logger.warn( logger.debug(
"override cat of paddle if exists or register, remove this when fixed!") "override cat of paddle if exists or register, remove this when fixed!")
paddle.cat = cat paddle.cat = cat
@ -116,7 +116,7 @@ def item(x: paddle.Tensor):
if not hasattr(paddle.Tensor, 'item'): if not hasattr(paddle.Tensor, 'item'):
logger.warn( logger.debug(
"override item of paddle.Tensor if exists or register, remove this when fixed!" "override item of paddle.Tensor if exists or register, remove this when fixed!"
) )
paddle.Tensor.item = item paddle.Tensor.item = item
@ -127,13 +127,13 @@ def func_long(x: paddle.Tensor):
if not hasattr(paddle.Tensor, 'long'): if not hasattr(paddle.Tensor, 'long'):
logger.warn( logger.debug(
"override long of paddle.Tensor if exists or register, remove this when fixed!" "override long of paddle.Tensor if exists or register, remove this when fixed!"
) )
paddle.Tensor.long = func_long paddle.Tensor.long = func_long
if not hasattr(paddle.Tensor, 'numel'): if not hasattr(paddle.Tensor, 'numel'):
logger.warn( logger.debug(
"override numel of paddle.Tensor if exists or register, remove this when fixed!" "override numel of paddle.Tensor if exists or register, remove this when fixed!"
) )
paddle.Tensor.numel = paddle.numel paddle.Tensor.numel = paddle.numel
@ -147,7 +147,7 @@ def new_full(x: paddle.Tensor,
if not hasattr(paddle.Tensor, 'new_full'): if not hasattr(paddle.Tensor, 'new_full'):
logger.warn( logger.debug(
"override new_full of paddle.Tensor if exists or register, remove this when fixed!" "override new_full of paddle.Tensor if exists or register, remove this when fixed!"
) )
paddle.Tensor.new_full = new_full paddle.Tensor.new_full = new_full
@ -162,13 +162,13 @@ def eq(xs: paddle.Tensor, ys: Union[paddle.Tensor, float]) -> paddle.Tensor:
if not hasattr(paddle.Tensor, 'eq'): if not hasattr(paddle.Tensor, 'eq'):
logger.warn( logger.debug(
"override eq of paddle.Tensor if exists or register, remove this when fixed!" "override eq of paddle.Tensor if exists or register, remove this when fixed!"
) )
paddle.Tensor.eq = eq paddle.Tensor.eq = eq
if not hasattr(paddle, 'eq'): if not hasattr(paddle, 'eq'):
logger.warn( logger.debug(
"override eq of paddle if exists or register, remove this when fixed!") "override eq of paddle if exists or register, remove this when fixed!")
paddle.eq = eq paddle.eq = eq
@ -178,7 +178,7 @@ def contiguous(xs: paddle.Tensor) -> paddle.Tensor:
if not hasattr(paddle.Tensor, 'contiguous'): if not hasattr(paddle.Tensor, 'contiguous'):
logger.warn( logger.debug(
"override contiguous of paddle.Tensor if exists or register, remove this when fixed!" "override contiguous of paddle.Tensor if exists or register, remove this when fixed!"
) )
paddle.Tensor.contiguous = contiguous paddle.Tensor.contiguous = contiguous
@ -195,7 +195,7 @@ def size(xs: paddle.Tensor, *args: int) -> paddle.Tensor:
#`to_static` do not process `size` property, maybe some `paddle` api dependent on it. #`to_static` do not process `size` property, maybe some `paddle` api dependent on it.
logger.warn( logger.debug(
"override size of paddle.Tensor " "override size of paddle.Tensor "
"(`to_static` do not process `size` property, maybe some `paddle` api dependent on it), remove this when fixed!" "(`to_static` do not process `size` property, maybe some `paddle` api dependent on it), remove this when fixed!"
) )
@ -207,7 +207,7 @@ def view(xs: paddle.Tensor, *args: int) -> paddle.Tensor:
if not hasattr(paddle.Tensor, 'view'): if not hasattr(paddle.Tensor, 'view'):
logger.warn("register user view to paddle.Tensor, remove this when fixed!") logger.debug("register user view to paddle.Tensor, remove this when fixed!")
paddle.Tensor.view = view paddle.Tensor.view = view
@ -216,7 +216,7 @@ def view_as(xs: paddle.Tensor, ys: paddle.Tensor) -> paddle.Tensor:
if not hasattr(paddle.Tensor, 'view_as'): if not hasattr(paddle.Tensor, 'view_as'):
logger.warn( logger.debug(
"register user view_as to paddle.Tensor, remove this when fixed!") "register user view_as to paddle.Tensor, remove this when fixed!")
paddle.Tensor.view_as = view_as paddle.Tensor.view_as = view_as
@ -242,7 +242,7 @@ def masked_fill(xs: paddle.Tensor,
if not hasattr(paddle.Tensor, 'masked_fill'): if not hasattr(paddle.Tensor, 'masked_fill'):
logger.warn( logger.debug(
"register user masked_fill to paddle.Tensor, remove this when fixed!") "register user masked_fill to paddle.Tensor, remove this when fixed!")
paddle.Tensor.masked_fill = masked_fill paddle.Tensor.masked_fill = masked_fill
@ -260,7 +260,7 @@ def masked_fill_(xs: paddle.Tensor,
if not hasattr(paddle.Tensor, 'masked_fill_'): if not hasattr(paddle.Tensor, 'masked_fill_'):
logger.warn( logger.debug(
"register user masked_fill_ to paddle.Tensor, remove this when fixed!") "register user masked_fill_ to paddle.Tensor, remove this when fixed!")
paddle.Tensor.masked_fill_ = masked_fill_ paddle.Tensor.masked_fill_ = masked_fill_
@ -272,7 +272,8 @@ def fill_(xs: paddle.Tensor, value: Union[float, int]) -> paddle.Tensor:
if not hasattr(paddle.Tensor, 'fill_'): if not hasattr(paddle.Tensor, 'fill_'):
logger.warn("register user fill_ to paddle.Tensor, remove this when fixed!") logger.debug(
"register user fill_ to paddle.Tensor, remove this when fixed!")
paddle.Tensor.fill_ = fill_ paddle.Tensor.fill_ = fill_
@ -281,22 +282,22 @@ def repeat(xs: paddle.Tensor, *size: Any) -> paddle.Tensor:
if not hasattr(paddle.Tensor, 'repeat'): if not hasattr(paddle.Tensor, 'repeat'):
logger.warn( logger.debug(
"register user repeat to paddle.Tensor, remove this when fixed!") "register user repeat to paddle.Tensor, remove this when fixed!")
paddle.Tensor.repeat = repeat paddle.Tensor.repeat = repeat
if not hasattr(paddle.Tensor, 'softmax'): if not hasattr(paddle.Tensor, 'softmax'):
logger.warn( logger.debug(
"register user softmax to paddle.Tensor, remove this when fixed!") "register user softmax to paddle.Tensor, remove this when fixed!")
setattr(paddle.Tensor, 'softmax', paddle.nn.functional.softmax) setattr(paddle.Tensor, 'softmax', paddle.nn.functional.softmax)
if not hasattr(paddle.Tensor, 'sigmoid'): if not hasattr(paddle.Tensor, 'sigmoid'):
logger.warn( logger.debug(
"register user sigmoid to paddle.Tensor, remove this when fixed!") "register user sigmoid to paddle.Tensor, remove this when fixed!")
setattr(paddle.Tensor, 'sigmoid', paddle.nn.functional.sigmoid) setattr(paddle.Tensor, 'sigmoid', paddle.nn.functional.sigmoid)
if not hasattr(paddle.Tensor, 'relu'): if not hasattr(paddle.Tensor, 'relu'):
logger.warn("register user relu to paddle.Tensor, remove this when fixed!") logger.debug("register user relu to paddle.Tensor, remove this when fixed!")
setattr(paddle.Tensor, 'relu', paddle.nn.functional.relu) setattr(paddle.Tensor, 'relu', paddle.nn.functional.relu)
@ -305,7 +306,7 @@ def type_as(x: paddle.Tensor, other: paddle.Tensor) -> paddle.Tensor:
if not hasattr(paddle.Tensor, 'type_as'): if not hasattr(paddle.Tensor, 'type_as'):
logger.warn( logger.debug(
"register user type_as to paddle.Tensor, remove this when fixed!") "register user type_as to paddle.Tensor, remove this when fixed!")
setattr(paddle.Tensor, 'type_as', type_as) setattr(paddle.Tensor, 'type_as', type_as)
@ -321,7 +322,7 @@ def to(x: paddle.Tensor, *args, **kwargs) -> paddle.Tensor:
if not hasattr(paddle.Tensor, 'to'): if not hasattr(paddle.Tensor, 'to'):
logger.warn("register user to to paddle.Tensor, remove this when fixed!") logger.debug("register user to to paddle.Tensor, remove this when fixed!")
setattr(paddle.Tensor, 'to', to) setattr(paddle.Tensor, 'to', to)
@ -330,7 +331,8 @@ def func_float(x: paddle.Tensor) -> paddle.Tensor:
if not hasattr(paddle.Tensor, 'float'): if not hasattr(paddle.Tensor, 'float'):
logger.warn("register user float to paddle.Tensor, remove this when fixed!") logger.debug(
"register user float to paddle.Tensor, remove this when fixed!")
setattr(paddle.Tensor, 'float', func_float) setattr(paddle.Tensor, 'float', func_float)
@ -339,7 +341,7 @@ def func_int(x: paddle.Tensor) -> paddle.Tensor:
if not hasattr(paddle.Tensor, 'int'): if not hasattr(paddle.Tensor, 'int'):
logger.warn("register user int to paddle.Tensor, remove this when fixed!") logger.debug("register user int to paddle.Tensor, remove this when fixed!")
setattr(paddle.Tensor, 'int', func_int) setattr(paddle.Tensor, 'int', func_int)
@ -348,23 +350,6 @@ def tolist(x: paddle.Tensor) -> List[Any]:
if not hasattr(paddle.Tensor, 'tolist'): if not hasattr(paddle.Tensor, 'tolist'):
logger.warn( logger.debug(
"register user tolist to paddle.Tensor, remove this when fixed!") "register user tolist to paddle.Tensor, remove this when fixed!")
setattr(paddle.Tensor, 'tolist', tolist) setattr(paddle.Tensor, 'tolist', tolist)
########### hack paddle.nn #############
class GLU(nn.Layer):
"""Gated Linear Units (GLU) Layer"""
def __init__(self, dim: int=-1):
super().__init__()
self.dim = dim
def forward(self, xs):
return F.glu(xs, axis=self.dim)
if not hasattr(paddle.nn, 'GLU'):
logger.warn("register user GLU to paddle.nn, remove this when fixed!")
setattr(paddle.nn, 'GLU', GLU)
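As a usage sketch, the GLU layer registered above follows the usual gated linear unit definition: split the input in two along the chosen axis and gate one half with the sigmoid of the other. The shapes below are arbitrary, and paddle.nn.functional.glu is assumed to implement exactly this split-and-gate behavior.

import paddle
import paddle.nn.functional as F

x = paddle.randn([4, 8])            # the gated axis must have even length
glu_out = F.glu(x, axis=-1)         # shape [4, 4]

a, b = paddle.split(x, 2, axis=-1)  # same computation written out by hand
manual = a * F.sigmoid(b)
print(paddle.allclose(glu_out, manual))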

@ -35,7 +35,8 @@ std::vector<std::pair<double, std::string>> ctc_beam_search_decoder(
size_t beam_size, size_t beam_size,
double cutoff_prob, double cutoff_prob,
size_t cutoff_top_n, size_t cutoff_top_n,
Scorer *ext_scorer) { Scorer *ext_scorer,
size_t blank_id) {
// dimension check // dimension check
size_t num_time_steps = probs_seq.size(); size_t num_time_steps = probs_seq.size();
for (size_t i = 0; i < num_time_steps; ++i) { for (size_t i = 0; i < num_time_steps; ++i) {
@ -48,7 +49,7 @@ std::vector<std::pair<double, std::string>> ctc_beam_search_decoder(
// assign blank id // assign blank id
// size_t blank_id = vocabulary.size(); // size_t blank_id = vocabulary.size();
size_t blank_id = 0; // size_t blank_id = 0;
// assign space id // assign space id
auto it = std::find(vocabulary.begin(), vocabulary.end(), " "); auto it = std::find(vocabulary.begin(), vocabulary.end(), " ");
@ -57,7 +58,6 @@ std::vector<std::pair<double, std::string>> ctc_beam_search_decoder(
if ((size_t)space_id >= vocabulary.size()) { if ((size_t)space_id >= vocabulary.size()) {
space_id = -2; space_id = -2;
} }
// init prefixes' root // init prefixes' root
PathTrie root; PathTrie root;
root.score = root.log_prob_b_prev = 0.0; root.score = root.log_prob_b_prev = 0.0;
@ -218,7 +218,8 @@ ctc_beam_search_decoder_batch(
size_t num_processes, size_t num_processes,
double cutoff_prob, double cutoff_prob,
size_t cutoff_top_n, size_t cutoff_top_n,
Scorer *ext_scorer) { Scorer *ext_scorer,
size_t blank_id) {
VALID_CHECK_GT(num_processes, 0, "num_processes must be nonnegative!"); VALID_CHECK_GT(num_processes, 0, "num_processes must be nonnegative!");
// thread pool // thread pool
ThreadPool pool(num_processes); ThreadPool pool(num_processes);
@ -234,7 +235,8 @@ ctc_beam_search_decoder_batch(
beam_size, beam_size,
cutoff_prob, cutoff_prob,
cutoff_top_n, cutoff_top_n,
ext_scorer)); ext_scorer,
blank_id));
} }
// get decoding results // get decoding results

@ -43,7 +43,8 @@ std::vector<std::pair<double, std::string>> ctc_beam_search_decoder(
size_t beam_size, size_t beam_size,
double cutoff_prob = 1.0, double cutoff_prob = 1.0,
size_t cutoff_top_n = 40, size_t cutoff_top_n = 40,
Scorer *ext_scorer = nullptr); Scorer *ext_scorer = nullptr,
size_t blank_id = 0);
/* CTC Beam Search Decoder for batch data /* CTC Beam Search Decoder for batch data
@ -70,6 +71,7 @@ ctc_beam_search_decoder_batch(
size_t num_processes, size_t num_processes,
double cutoff_prob = 1.0, double cutoff_prob = 1.0,
size_t cutoff_top_n = 40, size_t cutoff_top_n = 40,
Scorer *ext_scorer = nullptr); Scorer *ext_scorer = nullptr,
size_t blank_id = 0);
#endif // CTC_BEAM_SEARCH_DECODER_H_ #endif // CTC_BEAM_SEARCH_DECODER_H_

@ -17,17 +17,18 @@
std::string ctc_greedy_decoder( std::string ctc_greedy_decoder(
const std::vector<std::vector<double>> &probs_seq, const std::vector<std::vector<double>> &probs_seq,
const std::vector<std::string> &vocabulary) { const std::vector<std::string> &vocabulary,
size_t blank_id) {
// dimension check // dimension check
size_t num_time_steps = probs_seq.size(); size_t num_time_steps = probs_seq.size();
for (size_t i = 0; i < num_time_steps; ++i) { for (size_t i = 0; i < num_time_steps; ++i) {
VALID_CHECK_EQ(probs_seq[i].size(), VALID_CHECK_EQ(probs_seq[i].size(),
vocabulary.size() + 1, vocabulary.size(),
"The shape of probs_seq does not match with " "The shape of probs_seq does not match with "
"the shape of the vocabulary"); "the shape of the vocabulary");
} }
size_t blank_id = vocabulary.size(); // size_t blank_id = vocabulary.size();
std::vector<size_t> max_idx_vec(num_time_steps, 0); std::vector<size_t> max_idx_vec(num_time_steps, 0);
std::vector<size_t> idx_vec; std::vector<size_t> idx_vec;

@ -29,6 +29,7 @@
*/ */
std::string ctc_greedy_decoder( std::string ctc_greedy_decoder(
const std::vector<std::vector<double>>& probs_seq, const std::vector<std::vector<double>>& probs_seq,
const std::vector<std::string>& vocabulary); const std::vector<std::string>& vocabulary,
size_t blank_id);
#endif // CTC_GREEDY_DECODER_H #endif // CTC_GREEDY_DECODER_H
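The role of the new blank_id argument is easiest to see in a small NumPy sketch of greedy (best-path) CTC decoding: take the argmax at every frame, collapse consecutive repeats, and drop the blank index. This is a didactic illustration, not the C++ code being patched.

import numpy as np

def ctc_greedy_decode(probs_seq, vocabulary, blank_id=0):
    """Best-path decode: argmax per frame, collapse repeats, drop the blank."""
    best_path = np.argmax(probs_seq, axis=1)
    out, prev = [], None
    for idx in best_path:
        if idx != prev and idx != blank_id:
            out.append(vocabulary[idx])
        prev = idx
    return "".join(out)

vocab = ["<blank>", "a", "b", " "]
frames = np.array([[0.1, 0.7, 0.1, 0.1],    # -> a
                   [0.1, 0.7, 0.1, 0.1],    # -> a (repeat, collapsed)
                   [0.8, 0.1, 0.05, 0.05],  # -> blank (dropped)
                   [0.1, 0.1, 0.7, 0.1]])   # -> b
print(ctc_greedy_decode(frames, vocab, blank_id=0))  # "ab"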

@ -85,9 +85,8 @@ FILES += glob.glob('openfst-1.6.3/src/lib/*.cc')
# yapf: disable # yapf: disable
FILES = [ FILES = [
fn for fn in FILES fn for fn in FILES if not (fn.endswith('main.cc') or fn.endswith('test.cc')
if not (fn.endswith('main.cc') or fn.endswith('test.cc') or fn.endswith( or fn.endswith('unittest.cc'))
'unittest.cc'))
] ]
# yapf: enable # yapf: enable

@ -32,7 +32,7 @@ class Scorer(swig_decoders.Scorer):
swig_decoders.Scorer.__init__(self, alpha, beta, model_path, vocabulary) swig_decoders.Scorer.__init__(self, alpha, beta, model_path, vocabulary)
def ctc_greedy_decoder(probs_seq, vocabulary): def ctc_greedy_decoder(probs_seq, vocabulary, blank_id):
"""Wrapper for ctc best path decoder in swig. """Wrapper for ctc best path decoder in swig.
:param probs_seq: 2-D list of probability distributions over each time :param probs_seq: 2-D list of probability distributions over each time
@ -44,7 +44,8 @@ def ctc_greedy_decoder(probs_seq, vocabulary):
:return: Decoding result string. :return: Decoding result string.
:rtype: str :rtype: str
""" """
result = swig_decoders.ctc_greedy_decoder(probs_seq.tolist(), vocabulary) result = swig_decoders.ctc_greedy_decoder(probs_seq.tolist(), vocabulary,
blank_id)
return result return result
@ -53,7 +54,8 @@ def ctc_beam_search_decoder(probs_seq,
beam_size, beam_size,
cutoff_prob=1.0, cutoff_prob=1.0,
cutoff_top_n=40, cutoff_top_n=40,
ext_scoring_func=None): ext_scoring_func=None,
blank_id=0):
"""Wrapper for the CTC Beam Search Decoder. """Wrapper for the CTC Beam Search Decoder.
:param probs_seq: 2-D list of probability distributions over each time :param probs_seq: 2-D list of probability distributions over each time
@ -81,7 +83,7 @@ def ctc_beam_search_decoder(probs_seq,
""" """
beam_results = swig_decoders.ctc_beam_search_decoder( beam_results = swig_decoders.ctc_beam_search_decoder(
probs_seq.tolist(), vocabulary, beam_size, cutoff_prob, cutoff_top_n, probs_seq.tolist(), vocabulary, beam_size, cutoff_prob, cutoff_top_n,
ext_scoring_func) ext_scoring_func, blank_id)
beam_results = [(res[0], res[1].decode('utf-8')) for res in beam_results] beam_results = [(res[0], res[1].decode('utf-8')) for res in beam_results]
return beam_results return beam_results
@ -92,7 +94,8 @@ def ctc_beam_search_decoder_batch(probs_split,
num_processes, num_processes,
cutoff_prob=1.0, cutoff_prob=1.0,
cutoff_top_n=40, cutoff_top_n=40,
ext_scoring_func=None): ext_scoring_func=None,
blank_id=0):
"""Wrapper for the batched CTC beam search decoder. """Wrapper for the batched CTC beam search decoder.
:param probs_seq: 3-D list with each element as an instance of 2-D list :param probs_seq: 3-D list with each element as an instance of 2-D list
@ -125,7 +128,7 @@ def ctc_beam_search_decoder_batch(probs_split,
batch_beam_results = swig_decoders.ctc_beam_search_decoder_batch( batch_beam_results = swig_decoders.ctc_beam_search_decoder_batch(
probs_split, vocabulary, beam_size, num_processes, cutoff_prob, probs_split, vocabulary, beam_size, num_processes, cutoff_prob,
cutoff_top_n, ext_scoring_func) cutoff_top_n, ext_scoring_func, blank_id)
batch_beam_results = [[(res[0], res[1]) for res in beam_results] batch_beam_results = [[(res[0], res[1]) for res in beam_results]
for beam_results in batch_beam_results] for beam_results in batch_beam_results]
return batch_beam_results return batch_beam_results
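A hypothetical call site for the updated wrappers might look as follows; the vocabulary and probabilities are invented, the import path is only a guess, and a built swig_decoders extension is assumed.

import numpy as np
# assumes the built swig extension and these wrappers are importable, e.g.
# from deepspeech.decoders.swig_wrapper import ctc_greedy_decoder, ctc_beam_search_decoder

vocab = ["<blank>", "a", "b", " "]
probs = np.random.rand(50, len(vocab))
probs /= probs.sum(axis=1, keepdims=True)   # one distribution per frame

# greedy decoding with an explicit blank index
text = ctc_greedy_decoder(probs, vocab, blank_id=0)

# beam search decoding, same blank index, optional external scorer
beams = ctc_beam_search_decoder(probs, vocab, beam_size=10,
                                cutoff_prob=1.0, cutoff_top_n=40,
                                ext_scoring_func=None, blank_id=0)
print(text, beams[0])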

@ -27,7 +27,7 @@ def main_sp(config, args):
def main(config, args): def main(config, args):
if args.device == "gpu" and args.nprocs > 1: if args.nprocs > 0:
dist.spawn(main_sp, args=(config, args), nprocs=args.nprocs) dist.spawn(main_sp, args=(config, args), nprocs=args.nprocs)
else: else:
main_sp(config, args) main_sp(config, args)

@ -1,191 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Beam search parameters tuning for DeepSpeech2 model."""
import functools
import sys
import numpy as np
from paddle.io import DataLoader
from deepspeech.exps.deepspeech2.config import get_cfg_defaults
from deepspeech.io.collator import SpeechCollator
from deepspeech.io.dataset import ManifestDataset
from deepspeech.models.ds2 import DeepSpeech2Model
from deepspeech.training.cli import default_argument_parser
from deepspeech.utils import error_rate
from deepspeech.utils.utility import add_arguments
from deepspeech.utils.utility import print_arguments
def tune(config, args):
"""Tune parameters alpha and beta incrementally."""
if not args.num_alphas >= 0:
raise ValueError("num_alphas must be non-negative!")
if not args.num_betas >= 0:
raise ValueError("num_betas must be non-negative!")
config.defrost()
config.data.manifest = config.data.dev_manifest
config.data.augmentation_config = ""
config.data.keep_transcription_text = True
dev_dataset = ManifestDataset.from_config(config)
valid_loader = DataLoader(
dev_dataset,
batch_size=config.data.batch_size,
shuffle=False,
drop_last=False,
collate_fn=SpeechCollator(keep_transcription_text=True))
model = DeepSpeech2Model.from_pretrained(valid_loader, config,
args.checkpoint_path)
model.eval()
# decoders only accept string encoded in utf-8
vocab_list = valid_loader.dataset.vocab_list
errors_func = error_rate.char_errors if config.decoding.error_rate_type == 'cer' else error_rate.word_errors
# create grid for search
cand_alphas = np.linspace(args.alpha_from, args.alpha_to, args.num_alphas)
cand_betas = np.linspace(args.beta_from, args.beta_to, args.num_betas)
params_grid = [(alpha, beta) for alpha in cand_alphas
for beta in cand_betas]
err_sum = [0.0 for i in range(len(params_grid))]
err_ave = [0.0 for i in range(len(params_grid))]
num_ins, len_refs, cur_batch = 0, 0, 0
# initialize external scorer
model.decoder.init_decode(args.alpha_from, args.beta_from,
config.decoding.lang_model_path, vocab_list,
config.decoding.decoding_method)
## incrementally tune parameters over multiple batches
print("start tuning ...")
for infer_data in valid_loader():
if (args.num_batches >= 0) and (cur_batch >= args.num_batches):
break
def ordid2token(texts, texts_len):
""" ord() id to chr() chr """
trans = []
for text, n in zip(texts, texts_len):
n = n.numpy().item()
ids = text[:n]
trans.append(''.join([chr(i) for i in ids]))
return trans
audio, audio_len, text, text_len = infer_data
target_transcripts = ordid2token(text, text_len)
num_ins += audio.shape[0]
# model infer
eouts, eouts_len = model.encoder(audio, audio_len)
probs = model.decoder.softmax(eouts)
# grid search
for index, (alpha, beta) in enumerate(params_grid):
print(f"tuneing: alpha={alpha} beta={beta}")
result_transcripts = model.decoder.decode_probs(
probs.numpy(), eouts_len, vocab_list,
config.decoding.decoding_method,
config.decoding.lang_model_path, alpha, beta,
config.decoding.beam_size, config.decoding.cutoff_prob,
config.decoding.cutoff_top_n, config.decoding.num_proc_bsearch)
for target, result in zip(target_transcripts, result_transcripts):
errors, len_ref = errors_func(target, result)
err_sum[index] += errors
# accumulate the length of references of every batch
# in the first iteration
if args.alpha_from == alpha and args.beta_from == beta:
len_refs += len_ref
err_ave[index] = err_sum[index] / len_refs
if index % 2 == 0:
sys.stdout.write('.')
sys.stdout.flush()
print("tuneing: one grid done!")
# output on-line tuning result at the end of current batch
err_ave_min = min(err_ave)
min_index = err_ave.index(err_ave_min)
print("\nBatch %d [%d/?], current opt (alpha, beta) = (%s, %s), "
" min [%s] = %f" %
(cur_batch, num_ins, "%.3f" % params_grid[min_index][0],
"%.3f" % params_grid[min_index][1],
config.decoding.error_rate_type, err_ave_min))
cur_batch += 1
# output WER/CER at every (alpha, beta)
print("\nFinal %s:\n" % config.decoding.error_rate_type)
for index in range(len(params_grid)):
print("(alpha, beta) = (%s, %s), [%s] = %f" %
("%.3f" % params_grid[index][0], "%.3f" % params_grid[index][1],
config.decoding.error_rate_type, err_ave[index]))
err_ave_min = min(err_ave)
min_index = err_ave.index(err_ave_min)
print("\nFinish tuning on %d batches, final opt (alpha, beta) = (%s, %s)" %
(cur_batch, "%.3f" % params_grid[min_index][0],
"%.3f" % params_grid[min_index][1]))
print("finish tuning")
def main(config, args):
tune(config, args)
if __name__ == "__main__":
parser = default_argument_parser()
add_arg = functools.partial(add_arguments, argparser=parser)
add_arg('num_batches', int, -1, "# of batches tuning on. "
"Default -1, on whole dev set.")
add_arg('num_alphas', int, 45, "# of alpha candidates for tuning.")
add_arg('num_betas', int, 8, "# of beta candidates for tuning.")
add_arg('alpha_from', float, 1.0, "Where alpha starts tuning from.")
add_arg('alpha_to', float, 3.2, "Where alpha ends tuning with.")
add_arg('beta_from', float, 0.1, "Where beta starts tuning from.")
add_arg('beta_to', float, 0.45, "Where beta ends tuning with.")
add_arg('batch_size', int, 256, "# of samples per batch.")
add_arg('beam_size', int, 500, "Beam search width.")
add_arg('num_proc_bsearch', int, 8, "# of CPUs for beam search.")
add_arg('cutoff_prob', float, 1.0, "Cutoff probability for pruning.")
add_arg('cutoff_top_n', int, 40, "Cutoff number for pruning.")
args = parser.parse_args()
print_arguments(args, globals())
# https://yaml.org/type/float.html
config = get_cfg_defaults()
if args.config:
config.merge_from_file(args.config)
if args.opts:
config.merge_from_list(args.opts)
config.data.batch_size = args.batch_size
config.decoding.beam_size = args.beam_size
config.decoding.num_proc_bsearch = args.num_proc_bsearch
config.decoding.cutoff_prob = args.cutoff_prob
config.decoding.cutoff_top_n = args.cutoff_top_n
config.freeze()
print(config)
if args.dump_config:
with open(args.dump_config, 'w') as f:
print(config, file=f)
main(config, args)

@ -15,9 +15,11 @@
import os import os
import time import time
from collections import defaultdict from collections import defaultdict
from contextlib import nullcontext
from pathlib import Path from pathlib import Path
from typing import Optional from typing import Optional
import jsonlines
import numpy as np import numpy as np
import paddle import paddle
from paddle import distributed as dist from paddle import distributed as dist
@ -34,12 +36,14 @@ from deepspeech.models.ds2 import DeepSpeech2Model
from deepspeech.models.ds2_online import DeepSpeech2InferModelOnline from deepspeech.models.ds2_online import DeepSpeech2InferModelOnline
from deepspeech.models.ds2_online import DeepSpeech2ModelOnline from deepspeech.models.ds2_online import DeepSpeech2ModelOnline
from deepspeech.training.gradclip import ClipGradByGlobalNormWithLog from deepspeech.training.gradclip import ClipGradByGlobalNormWithLog
from deepspeech.training.reporter import report
from deepspeech.training.trainer import Trainer from deepspeech.training.trainer import Trainer
from deepspeech.utils import error_rate from deepspeech.utils import error_rate
from deepspeech.utils import layer_tools from deepspeech.utils import layer_tools
from deepspeech.utils import mp_tools from deepspeech.utils import mp_tools
from deepspeech.utils.log import Autolog from deepspeech.utils.log import Autolog
from deepspeech.utils.log import Log from deepspeech.utils.log import Log
from deepspeech.utils.utility import UpdateConfig
logger = Log(__name__).getlog() logger = Log(__name__).getlog()
@ -65,29 +69,52 @@ class DeepSpeech2Trainer(Trainer):
super().__init__(config, args) super().__init__(config, args)
def train_batch(self, batch_index, batch_data, msg): def train_batch(self, batch_index, batch_data, msg):
batch_size = self.config.collator.batch_size
accum_grad = self.config.training.accum_grad
start = time.time() start = time.time()
# forward
utt, audio, audio_len, text, text_len = batch_data utt, audio, audio_len, text, text_len = batch_data
loss = self.model(audio, audio_len, text, text_len) loss = self.model(audio, audio_len, text, text_len)
losses_np = {
'train_loss': float(loss),
}
# loss backward
if (batch_index + 1) % accum_grad != 0:
# Disable gradient synchronizations across DDP processes.
# Within this context, gradients will be accumulated on module
# variables, which will later be synchronized.
context = self.model.no_sync
else:
# Used for single gpu training and DDP gradient synchronization
# processes.
context = nullcontext
with context():
loss.backward() loss.backward()
layer_tools.print_grads(self.model, print_func=None) layer_tools.print_grads(self.model, print_func=None)
# optimizer step
if (batch_index + 1) % accum_grad == 0:
self.optimizer.step() self.optimizer.step()
self.optimizer.clear_grad() self.optimizer.clear_grad()
self.iteration += 1
iteration_time = time.time() - start iteration_time = time.time() - start
losses_np = { for k, v in losses_np.items():
'train_loss': float(loss), report(k, v)
} report("batch_size", batch_size)
msg += "train time: {:>.3f}s, ".format(iteration_time) report("accum", accum_grad)
msg += "batch size: {}, ".format(self.config.collator.batch_size) report("step_cost", iteration_time)
msg += ', '.join('{}: {:>.6f}'.format(k, v)
for k, v in losses_np.items())
logger.info(msg)
if dist.get_rank() == 0 and self.visualizer: if dist.get_rank() == 0 and self.visualizer:
for k, v in losses_np.items(): for k, v in losses_np.items():
# `step -1` since we update `step` after optimizer.step().
self.visualizer.add_scalar("train/{}".format(k), v, self.visualizer.add_scalar("train/{}".format(k), v,
self.iteration) self.iteration - 1)
self.iteration += 1
@paddle.no_grad() @paddle.no_grad()
def valid(self): def valid(self):
@ -124,10 +151,9 @@ class DeepSpeech2Trainer(Trainer):
def setup_model(self): def setup_model(self):
config = self.config.clone() config = self.config.clone()
config.defrost() with UpdateConfig(config):
config.model.feat_size = self.train_loader.collate_fn.feature_size config.model.feat_size = self.train_loader.collate_fn.feature_size
config.model.dict_size = self.train_loader.collate_fn.vocab_size config.model.dict_size = self.train_loader.collate_fn.vocab_size
config.freeze()
if self.args.model_type == 'offline': if self.args.model_type == 'offline':
model = DeepSpeech2Model.from_config(config.model) model = DeepSpeech2Model.from_config(config.model)
@ -280,9 +306,10 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
len_refs += len_ref len_refs += len_ref
num_ins += 1 num_ins += 1
if fout: if fout:
fout.write(utt + " " + result + "\n") fout.write({"utt": utt, "ref": target, "hyp": result})
logger.info("\nTarget Transcription: %s\nOutput Transcription: %s" % logger.info(f"Utt: {utt}")
(target, result)) logger.info(f"Ref: {target}")
logger.info(f"Hyp: {result}")
logger.info("Current error rate [%s] = %f" % logger.info("Current error rate [%s] = %f" %
(cfg.error_rate_type, error_rate_func(target, result))) (cfg.error_rate_type, error_rate_func(target, result)))
@ -325,7 +352,7 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
cfg = self.config cfg = self.config
error_rate_type = None error_rate_type = None
errors_sum, len_refs, num_ins = 0.0, 0, 0 errors_sum, len_refs, num_ins = 0.0, 0, 0
with open(self.args.result_file, 'w') as fout: with jsonlines.open(self.args.result_file, 'w') as fout:
for i, batch in enumerate(self.test_loader): for i, batch in enumerate(self.test_loader):
utts, audio, audio_len, texts, texts_len = batch utts, audio, audio_len, texts, texts_len = batch
metrics = self.compute_metrics(utts, audio, audio_len, texts, metrics = self.compute_metrics(utts, audio, audio_len, texts,
@ -378,7 +405,7 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
def setup(self): def setup(self):
"""Setup the experiment. """Setup the experiment.
""" """
paddle.set_device(self.args.device) paddle.set_device('gpu' if self.args.nprocs > 0 else 'cpu')
self.setup_output_dir() self.setup_output_dir()
self.setup_checkpointer() self.setup_checkpointer()
@ -610,7 +637,7 @@ class DeepSpeech2ExportTester(DeepSpeech2Tester):
def setup(self): def setup(self):
"""Setup the experiment. """Setup the experiment.
""" """
paddle.set_device(self.args.device) paddle.set_device('gpu' if self.args.nprocs > 0 else 'cpu')
self.setup_output_dir() self.setup_output_dir()
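The gradient-accumulation pattern added to train_batch above (scale the loss by accum_grad, skip the DDP all-reduce via model.no_sync on non-update steps, step the optimizer every accum_grad batches) can be exercised in isolation. A toy sketch with a placeholder model, data, and loss, not the trainer's own:

from contextlib import nullcontext
import paddle

# toy model and optimizer; in the trainer these come from setup_model()
model = paddle.nn.Linear(8, 1)
optimizer = paddle.optimizer.SGD(learning_rate=1e-3,
                                 parameters=model.parameters())
accum_grad = 4

def train_step(batch_index, x, y, is_data_parallel=False):
    loss = paddle.nn.functional.mse_loss(model(x), y)
    loss = loss / accum_grad  # scale so accumulated grads match a full batch
    if is_data_parallel and (batch_index + 1) % accum_grad != 0:
        # skip DDP gradient all-reduce while still accumulating
        context = model.no_sync
    else:
        context = nullcontext
    with context():
        loss.backward()
    if (batch_index + 1) % accum_grad == 0:
        optimizer.step()
        optimizer.clear_grad()

for i in range(8):
    train_step(i, paddle.randn([2, 8]), paddle.randn([2, 1]))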

@ -22,6 +22,8 @@ from deepspeech.exps.u2.model import U2Trainer as Trainer
from deepspeech.training.cli import default_argument_parser from deepspeech.training.cli import default_argument_parser
from deepspeech.utils.utility import print_arguments from deepspeech.utils.utility import print_arguments
# from deepspeech.exps.u2.trainer import U2Trainer as Trainer
def main_sp(config, args): def main_sp(config, args):
exp = Trainer(config, args) exp = Trainer(config, args)
@ -30,7 +32,7 @@ def main_sp(config, args):
def main(config, args): def main(config, args):
if args.device == "gpu" and args.nprocs > 1: if args.nprocs > 0:
dist.spawn(main_sp, args=(config, args), nprocs=args.nprocs) dist.spawn(main_sp, args=(config, args), nprocs=args.nprocs)
else: else:
main_sp(config, args) main_sp(config, args)
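Several entry points above now derive the device from --nprocs instead of an explicit --device flag. The dispatch they share is roughly the following sketch, with launch and main_sp standing in for the scripts' own functions:

import paddle
from paddle import distributed as dist

def launch(config, args, main_sp):
    """Dispatch to single-process or multi-GPU training based on --nprocs."""
    paddle.set_device('gpu' if args.nprocs > 0 else 'cpu')
    if args.nprocs > 0:
        # one worker process per GPU
        dist.spawn(main_sp, args=(config, args), nprocs=args.nprocs)
    else:
        main_sp(config, args)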

@ -17,9 +17,12 @@ import os
import sys import sys
import time import time
from collections import defaultdict from collections import defaultdict
from collections import OrderedDict
from contextlib import nullcontext
from pathlib import Path from pathlib import Path
from typing import Optional from typing import Optional
import jsonlines
import numpy as np import numpy as np
import paddle import paddle
from paddle import distributed as dist from paddle import distributed as dist
@ -32,7 +35,10 @@ from deepspeech.io.sampler import SortagradBatchSampler
from deepspeech.io.sampler import SortagradDistributedBatchSampler from deepspeech.io.sampler import SortagradDistributedBatchSampler
from deepspeech.models.u2 import U2Model from deepspeech.models.u2 import U2Model
from deepspeech.training.optimizer import OptimizerFactory from deepspeech.training.optimizer import OptimizerFactory
from deepspeech.training.reporter import ObsScope
from deepspeech.training.reporter import report
from deepspeech.training.scheduler import LRSchedulerFactory from deepspeech.training.scheduler import LRSchedulerFactory
from deepspeech.training.timer import Timer
from deepspeech.training.trainer import Trainer from deepspeech.training.trainer import Trainer
from deepspeech.utils import ctc_utils from deepspeech.utils import ctc_utils
from deepspeech.utils import error_rate from deepspeech.utils import error_rate
@ -41,6 +47,7 @@ from deepspeech.utils import mp_tools
from deepspeech.utils import text_grid from deepspeech.utils import text_grid
from deepspeech.utils import utility from deepspeech.utils import utility
from deepspeech.utils.log import Log from deepspeech.utils.log import Log
from deepspeech.utils.utility import UpdateConfig
logger = Log(__name__).getlog() logger = Log(__name__).getlog()
@ -79,21 +86,36 @@ class U2Trainer(Trainer):
def train_batch(self, batch_index, batch_data, msg): def train_batch(self, batch_index, batch_data, msg):
train_conf = self.config.training train_conf = self.config.training
start = time.time() start = time.time()
utt, audio, audio_len, text, text_len = batch_data
# forward
utt, audio, audio_len, text, text_len = batch_data
loss, attention_loss, ctc_loss = self.model(audio, audio_len, text, loss, attention_loss, ctc_loss = self.model(audio, audio_len, text,
text_len) text_len)
# loss div by `batch_size * accum_grad` # loss div by `batch_size * accum_grad`
loss /= train_conf.accum_grad loss /= train_conf.accum_grad
loss.backward()
layer_tools.print_grads(self.model, print_func=None)
losses_np = {'loss': float(loss) * train_conf.accum_grad} losses_np = {'loss': float(loss) * train_conf.accum_grad}
if attention_loss: if attention_loss:
losses_np['att_loss'] = float(attention_loss) losses_np['att_loss'] = float(attention_loss)
if ctc_loss: if ctc_loss:
losses_np['ctc_loss'] = float(ctc_loss) losses_np['ctc_loss'] = float(ctc_loss)
# loss backward
if (batch_index + 1) % train_conf.accum_grad != 0:
# Disable gradient synchronizations across DDP processes.
# Within this context, gradients will be accumulated on module
# variables, which will later be synchronized.
# When using cpu w/o DDP, model does not have `no_sync`
context = self.model.no_sync if self.parallel else nullcontext
else:
# Used for single gpu training and DDP gradient synchronization
# processes.
context = nullcontext
with context():
loss.backward()
layer_tools.print_grads(self.model, print_func=None)
# optimizer step
if (batch_index + 1) % train_conf.accum_grad == 0: if (batch_index + 1) % train_conf.accum_grad == 0:
self.optimizer.step() self.optimizer.step()
self.optimizer.clear_grad() self.optimizer.clear_grad()
@ -102,14 +124,13 @@ class U2Trainer(Trainer):
iteration_time = time.time() - start iteration_time = time.time() - start
if (batch_index + 1) % train_conf.log_interval == 0: for k, v in losses_np.items():
msg += "train time: {:>.3f}s, ".format(iteration_time) report(k, v)
msg += "batch size: {}, ".format(self.config.collator.batch_size) report("batch_size", self.config.collator.batch_size)
msg += "accum: {}, ".format(train_conf.accum_grad) report("accum", train_conf.accum_grad)
msg += ', '.join('{}: {:>.6f}'.format(k, v) report("step_cost", iteration_time)
for k, v in losses_np.items())
logger.info(msg)
if (batch_index + 1) % train_conf.accum_grad == 0:
if dist.get_rank() == 0 and self.visualizer: if dist.get_rank() == 0 and self.visualizer:
losses_np_v = losses_np.copy() losses_np_v = losses_np.copy()
losses_np_v.update({"lr": self.lr_scheduler()}) losses_np_v.update({"lr": self.lr_scheduler()})
@ -163,35 +184,47 @@ class U2Trainer(Trainer):
# script_model_path = str(self.checkpoint_dir / 'init') # script_model_path = str(self.checkpoint_dir / 'init')
# paddle.jit.save(script_model, script_model_path) # paddle.jit.save(script_model, script_model_path)
from_scratch = self.resume_or_scratch() self.before_train()
if from_scratch:
# save init model, i.e. 0 epoch
self.save(tag='init')
self.lr_scheduler.step(self.iteration)
if self.parallel:
self.train_loader.batch_sampler.set_epoch(self.epoch)
logger.info(f"Train Total Examples: {len(self.train_loader.dataset)}") logger.info(f"Train Total Examples: {len(self.train_loader.dataset)}")
while self.epoch < self.config.training.n_epoch: while self.epoch < self.config.training.n_epoch:
with Timer("Epoch-Train Time Cost: {}"):
self.model.train() self.model.train()
try: try:
data_start_time = time.time() data_start_time = time.time()
for batch_index, batch in enumerate(self.train_loader): for batch_index, batch in enumerate(self.train_loader):
dataload_time = time.time() - data_start_time dataload_time = time.time() - data_start_time
msg = "Train: Rank: {}, ".format(dist.get_rank()) msg = "Train:"
msg += "epoch: {}, ".format(self.epoch) observation = OrderedDict()
msg += "step: {}, ".format(self.iteration) with ObsScope(observation):
msg += "batch : {}/{}, ".format(batch_index + 1, report("Rank", dist.get_rank())
len(self.train_loader)) report("epoch", self.epoch)
msg += "lr: {:>.8f}, ".format(self.lr_scheduler()) report('step', self.iteration)
msg += "data time: {:>.3f}s, ".format(dataload_time) report("lr", self.lr_scheduler())
self.train_batch(batch_index, batch, msg) self.train_batch(batch_index, batch, msg)
self.after_train_batch()
report('iter', batch_index + 1)
report('total', len(self.train_loader))
report('reader_cost', dataload_time)
observation['batch_cost'] = observation[
'reader_cost'] + observation['step_cost']
observation['samples'] = observation['batch_size']
observation['ips[sent./sec]'] = observation[
'batch_size'] / observation['batch_cost']
for k, v in observation.items():
msg += f" {k}: "
msg += f"{v:>.8f}" if isinstance(v,
float) else f"{v}"
msg += ","
if (batch_index + 1
) % self.config.training.log_interval == 0:
logger.info(msg)
data_start_time = time.time() data_start_time = time.time()
except Exception as e: except Exception as e:
logger.error(e) logger.error(e)
raise e raise e
with Timer("Eval Time Cost: {}"):
total_loss, num_seen_utts = self.valid() total_loss, num_seen_utts = self.valid()
if dist.get_world_size() > 1: if dist.get_world_size() > 1:
num_seen_utts = paddle.to_tensor(num_seen_utts) num_seen_utts = paddle.to_tensor(num_seen_utts)
@ -294,10 +327,11 @@ class U2Trainer(Trainer):
def setup_model(self): def setup_model(self):
config = self.config config = self.config
model_conf = config.model model_conf = config.model
model_conf.defrost()
with UpdateConfig(model_conf):
model_conf.input_dim = self.train_loader.collate_fn.feature_size model_conf.input_dim = self.train_loader.collate_fn.feature_size
model_conf.output_dim = self.train_loader.collate_fn.vocab_size model_conf.output_dim = self.train_loader.collate_fn.vocab_size
model_conf.freeze()
model = U2Model.from_config(model_conf) model = U2Model.from_config(model_conf)
if self.parallel: if self.parallel:
@ -433,9 +467,10 @@ class U2Tester(U2Trainer):
len_refs += len_ref len_refs += len_ref
num_ins += 1 num_ins += 1
if fout: if fout:
fout.write(utt + " " + result + "\n") fout.write({"utt": utt, "ref": target, "hyp": result})
logger.info("\nTarget Transcription: %s\nOutput Transcription: %s" % logger.info(f"Utt: {utt}")
(target, result)) logger.info(f"Ref: {target}")
logger.info(f"Hyp: {result}")
logger.info("One example error rate [%s] = %f" % logger.info("One example error rate [%s] = %f" %
(cfg.error_rate_type, error_rate_func(target, result))) (cfg.error_rate_type, error_rate_func(target, result)))
@ -460,7 +495,7 @@ class U2Tester(U2Trainer):
errors_sum, len_refs, num_ins = 0.0, 0, 0 errors_sum, len_refs, num_ins = 0.0, 0, 0
num_frames = 0.0 num_frames = 0.0
num_time = 0.0 num_time = 0.0
with open(self.args.result_file, 'w') as fout: with jsonlines.open(self.args.result_file, 'w') as fout:
for i, batch in enumerate(self.test_loader): for i, batch in enumerate(self.test_loader):
metrics = self.compute_metrics(*batch, fout=fout) metrics = self.compute_metrics(*batch, fout=fout)
num_frames += metrics['num_frames'] num_frames += metrics['num_frames']
@ -540,7 +575,7 @@ class U2Tester(U2Trainer):
# 1. Encoder # 1. Encoder
encoder_out, encoder_mask = self.model._forward_encoder( encoder_out, encoder_mask = self.model._forward_encoder(
feat, feats_length) # (B, maxlen, encoder_dim) feat, feats_length) # (B, maxlen, encoder_dim)
maxlen = encoder_out.size(1) maxlen = encoder_out.shape[1]
ctc_probs = self.model.ctc.log_softmax( ctc_probs = self.model.ctc.log_softmax(
encoder_out) # (1, maxlen, vocab_size) encoder_out) # (1, maxlen, vocab_size)
@ -548,26 +583,25 @@ class U2Tester(U2Trainer):
ctc_probs = ctc_probs.squeeze(0) ctc_probs = ctc_probs.squeeze(0)
target = target.squeeze(0) target = target.squeeze(0)
alignment = ctc_utils.forced_align(ctc_probs, target) alignment = ctc_utils.forced_align(ctc_probs, target)
logger.info("align ids", key[0], alignment) logger.info(f"align ids: {key[0]} {alignment}")
fout.write('{} {}\n'.format(key[0], alignment)) fout.write('{} {}\n'.format(key[0], alignment))
# 3. gen praat # 3. gen praat
# segment alignment # segment alignment
align_segs = text_grid.segment_alignment(alignment) align_segs = text_grid.segment_alignment(alignment)
logger.info("align tokens", key[0], align_segs) logger.info(f"align tokens: {key[0]}, {align_segs}")
# IntervalTier, List["start end token\n"] # IntervalTier, List["start end token\n"]
subsample = utility.get_subsample(self.config) subsample = utility.get_subsample(self.config)
tierformat = text_grid.align_to_tierformat( tierformat = text_grid.align_to_tierformat(
align_segs, subsample, token_dict) align_segs, subsample, token_dict)
# write tier # write tier
align_output_path = os.path.join( align_output_path = Path(self.args.result_file).parent / "align"
os.path.dirname(self.args.result_file), "align") align_output_path.mkdir(parents=True, exist_ok=True)
tier_path = os.path.join(align_output_path, key[0] + ".tier") tier_path = align_output_path / (key[0] + ".tier")
with open(tier_path, 'w') as f: with tier_path.open('w') as f:
f.writelines(tierformat) f.writelines(tierformat)
# write textgrid # write textgrid
textgrid_path = os.path.join(align_output_path, textgrid_path = align_output_path / (key[0] + ".TextGrid")
key[0] + ".TextGrid")
second_per_frame = 1. / (1000. / second_per_frame = 1. / (1000. /
stride_ms) # 25ms window, 10ms stride stride_ms) # 25ms window, 10ms stride
second_per_example = ( second_per_example = (
@ -575,7 +609,7 @@ class U2Tester(U2Trainer):
text_grid.generate_textgrid( text_grid.generate_textgrid(
maxtime=second_per_example, maxtime=second_per_example,
intervals=tierformat, intervals=tierformat,
output=textgrid_path) output=str(textgrid_path))
def run_align(self): def run_align(self):
self.resume_or_scratch() self.resume_or_scratch()
@ -621,7 +655,7 @@ class U2Tester(U2Trainer):
def setup(self): def setup(self):
"""Setup the experiment. """Setup the experiment.
""" """
paddle.set_device(self.args.device) paddle.set_device('gpu' if self.args.nprocs > 0 else 'cpu')
self.setup_output_dir() self.setup_output_dir()
self.setup_checkpointer() self.setup_checkpointer()
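ObsScope and report from deepspeech.training.reporter are used in the training loop above as a scoped key/value collector. A minimal stand-in implementation, written only to show the behavior the loop relies on (the real module may differ):

from collections import OrderedDict
from contextlib import contextmanager

_observations = []  # stack of active observation dicts

@contextmanager
def ObsScope(observation: dict):
    _observations.append(observation)
    try:
        yield observation
    finally:
        _observations.pop()

def report(key, value):
    # write into the innermost active scope, if any
    if _observations:
        _observations[-1][key] = value

# usage mirroring the loop above
obs = OrderedDict()
with ObsScope(obs):
    report("epoch", 0)
    report("lr", 2e-3)
print(obs)  # OrderedDict([('epoch', 0), ('lr', 0.002)])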

@ -0,0 +1,220 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains U2 model."""
import paddle
from paddle import distributed as dist
from paddle.io import DataLoader
from deepspeech.io.collator import SpeechCollator
from deepspeech.io.dataset import ManifestDataset
from deepspeech.io.sampler import SortagradBatchSampler
from deepspeech.io.sampler import SortagradDistributedBatchSampler
from deepspeech.models.u2 import U2Evaluator
from deepspeech.models.u2 import U2Model
from deepspeech.models.u2 import U2Updater
from deepspeech.training.extensions.snapshot import Snapshot
from deepspeech.training.extensions.visualizer import VisualDL
from deepspeech.training.optimizer import OptimizerFactory
from deepspeech.training.scheduler import LRSchedulerFactory
from deepspeech.training.timer import Timer
from deepspeech.training.trainer import Trainer
from deepspeech.training.updaters.trainer import Trainer as NewTrainer
from deepspeech.utils import layer_tools
from deepspeech.utils.log import Log
from deepspeech.utils.utility import UpdateConfig
logger = Log(__name__).getlog()
class U2Trainer(Trainer):
def __init__(self, config, args):
super().__init__(config, args)
def setup_dataloader(self):
config = self.config.clone()
config.defrost()
config.collator.keep_transcription_text = False
# train/valid dataset, return token ids
config.data.manifest = config.data.train_manifest
train_dataset = ManifestDataset.from_config(config)
config.data.manifest = config.data.dev_manifest
dev_dataset = ManifestDataset.from_config(config)
collate_fn_train = SpeechCollator.from_config(config)
config.collator.augmentation_config = ""
collate_fn_dev = SpeechCollator.from_config(config)
if self.parallel:
batch_sampler = SortagradDistributedBatchSampler(
train_dataset,
batch_size=config.collator.batch_size,
num_replicas=None,
rank=None,
shuffle=True,
drop_last=True,
sortagrad=config.collator.sortagrad,
shuffle_method=config.collator.shuffle_method)
else:
batch_sampler = SortagradBatchSampler(
train_dataset,
shuffle=True,
batch_size=config.collator.batch_size,
drop_last=True,
sortagrad=config.collator.sortagrad,
shuffle_method=config.collator.shuffle_method)
self.train_loader = DataLoader(
train_dataset,
batch_sampler=batch_sampler,
collate_fn=collate_fn_train,
num_workers=config.collator.num_workers, )
self.valid_loader = DataLoader(
dev_dataset,
batch_size=config.collator.batch_size,
shuffle=False,
drop_last=False,
collate_fn=collate_fn_dev)
# test dataset, return raw text
config.data.manifest = config.data.test_manifest
# filter test examples, will cause less examples, but no mismatch with training
# and can use large batch size , save training time, so filter test egs now.
config.data.min_input_len = 0.0 # second
config.data.max_input_len = float('inf') # second
config.data.min_output_len = 0.0 # tokens
config.data.max_output_len = float('inf') # tokens
config.data.min_output_input_ratio = 0.00
config.data.max_output_input_ratio = float('inf')
test_dataset = ManifestDataset.from_config(config)
# return text ord id
config.collator.keep_transcription_text = True
config.collator.augmentation_config = ""
self.test_loader = DataLoader(
test_dataset,
batch_size=config.decoding.batch_size,
shuffle=False,
drop_last=False,
collate_fn=SpeechCollator.from_config(config))
# return text token id
config.collator.keep_transcription_text = False
self.align_loader = DataLoader(
test_dataset,
batch_size=config.decoding.batch_size,
shuffle=False,
drop_last=False,
collate_fn=SpeechCollator.from_config(config))
logger.info("Setup train/valid/test/align Dataloader!")
def setup_model(self):
config = self.config
model_conf = config.model
with UpdateConfig(model_conf):
model_conf.input_dim = self.train_loader.collate_fn.feature_size
model_conf.output_dim = self.train_loader.collate_fn.vocab_size
model = U2Model.from_config(model_conf)
if self.parallel:
model = paddle.DataParallel(model)
model.train()
logger.info(f"{model}")
layer_tools.print_params(model, logger.info)
train_config = config.training
optim_type = train_config.optim
optim_conf = train_config.optim_conf
scheduler_type = train_config.scheduler
scheduler_conf = train_config.scheduler_conf
scheduler_args = {
"learning_rate": optim_conf.lr,
"verbose": False,
"warmup_steps": scheduler_conf.warmup_steps,
"gamma": scheduler_conf.lr_decay,
"d_model": model_conf.encoder_conf.output_size,
}
lr_scheduler = LRSchedulerFactory.from_args(scheduler_type,
scheduler_args)
def optimizer_args(
config,
parameters,
lr_scheduler=None, ):
train_config = config.training
optim_type = train_config.optim
optim_conf = train_config.optim_conf
scheduler_type = train_config.scheduler
scheduler_conf = train_config.scheduler_conf
return {
"grad_clip": train_config.global_grad_clip,
"weight_decay": optim_conf.weight_decay,
"learning_rate": lr_scheduler
if lr_scheduler else optim_conf.lr,
"parameters": parameters,
"epsilon": 1e-9 if optim_type == 'noam' else None,
"beta1": 0.9 if optim_type == 'noam' else None,
"beat2": 0.98 if optim_type == 'noam' else None,
}
optimzer_args = optimizer_args(config, model.parameters(), lr_scheduler)
optimizer = OptimizerFactory.from_args(optim_type, optimzer_args)
self.model = model
self.optimizer = optimizer
self.lr_scheduler = lr_scheduler
logger.info("Setup model/optimizer/lr_scheduler!")
def setup_updater(self):
output_dir = self.output_dir
config = self.config.training
updater = U2Updater(
model=self.model,
optimizer=self.optimizer,
scheduler=self.lr_scheduler,
dataloader=self.train_loader,
output_dir=output_dir,
accum_grad=config.accum_grad)
trainer = NewTrainer(updater, (config.n_epoch, 'epoch'), output_dir)
evaluator = U2Evaluator(self.model, self.valid_loader)
trainer.extend(evaluator, trigger=(1, "epoch"))
if dist.get_rank() == 0:
trainer.extend(VisualDL(output_dir), trigger=(1, "iteration"))
num_snapshots = config.checkpoint.kbest_n
trainer.extend(
Snapshot(
mode='kbest',
max_size=num_snapshots,
indicator='VALID/LOSS',
less_better=True),
trigger=(1, 'epoch'))
# print(trainer.extensions)
# trainer.run()
self.trainer = trainer
def run(self):
"""The routine of the experiment after setup. This method is intended
to be used by the user.
"""
self.setup_updater()
with Timer("Training Done: {}"):
self.trainer.run()
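setup_model above passes warmup_steps, lr_decay, and d_model to a 'noam' scheduler. Assuming it follows the standard Transformer (Noam) schedule, the learning-rate curve looks like this (noam_lr and base_lr are illustrative names, not the project's API):

def noam_lr(step, d_model=256, warmup_steps=25000, base_lr=1.0):
    """Linear warmup followed by inverse-square-root decay."""
    step = max(step, 1)
    return base_lr * d_model ** -0.5 * min(step ** -0.5,
                                           step * warmup_steps ** -1.5)

for s in (1, 1000, 25000, 100000):
    print(s, noam_lr(s))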

@ -36,7 +36,7 @@ def main_sp(config, args):
def main(config, args): def main(config, args):
if args.device == "gpu" and args.nprocs > 1: if args.nprocs > 0:
dist.spawn(main_sp, args=(config, args), nprocs=args.nprocs) dist.spawn(main_sp, args=(config, args), nprocs=args.nprocs)
else: else:
main_sp(config, args) main_sp(config, args)

@ -17,9 +17,11 @@ import os
import sys import sys
import time import time
from collections import defaultdict from collections import defaultdict
from contextlib import nullcontext
from pathlib import Path from pathlib import Path
from typing import Optional from typing import Optional
import jsonlines
import numpy as np import numpy as np
import paddle import paddle
from paddle import distributed as dist from paddle import distributed as dist
@ -31,6 +33,7 @@ from deepspeech.io.dataloader import BatchDataLoader
from deepspeech.models.u2 import U2Model from deepspeech.models.u2 import U2Model
from deepspeech.training.optimizer import OptimizerFactory from deepspeech.training.optimizer import OptimizerFactory
from deepspeech.training.scheduler import LRSchedulerFactory from deepspeech.training.scheduler import LRSchedulerFactory
from deepspeech.training.timer import Timer
from deepspeech.training.trainer import Trainer from deepspeech.training.trainer import Trainer
from deepspeech.utils import ctc_utils from deepspeech.utils import ctc_utils
from deepspeech.utils import error_rate from deepspeech.utils import error_rate
@ -39,6 +42,7 @@ from deepspeech.utils import mp_tools
from deepspeech.utils import text_grid from deepspeech.utils import text_grid
from deepspeech.utils import utility from deepspeech.utils import utility
from deepspeech.utils.log import Log from deepspeech.utils.log import Log
from deepspeech.utils.utility import UpdateConfig
logger = Log(__name__).getlog() logger = Log(__name__).getlog()
@ -83,20 +87,34 @@ class U2Trainer(Trainer):
train_conf = self.config.training train_conf = self.config.training
start = time.time() start = time.time()
# forward
utt, audio, audio_len, text, text_len = batch_data utt, audio, audio_len, text, text_len = batch_data
loss, attention_loss, ctc_loss = self.model(audio, audio_len, text, loss, attention_loss, ctc_loss = self.model(audio, audio_len, text,
text_len) text_len)
# loss div by `batch_size * accum_grad` # loss div by `batch_size * accum_grad`
loss /= train_conf.accum_grad loss /= train_conf.accum_grad
loss.backward()
layer_tools.print_grads(self.model, print_func=None)
losses_np = {'loss': float(loss) * train_conf.accum_grad} losses_np = {'loss': float(loss) * train_conf.accum_grad}
if attention_loss: if attention_loss:
losses_np['att_loss'] = float(attention_loss) losses_np['att_loss'] = float(attention_loss)
if ctc_loss: if ctc_loss:
losses_np['ctc_loss'] = float(ctc_loss) losses_np['ctc_loss'] = float(ctc_loss)
# loss backward
if (batch_index + 1) % train_conf.accum_grad != 0:
# Disable gradient synchronizations across DDP processes.
# Within this context, gradients will be accumulated on module
# variables, which will later be synchronized.
context = self.model.no_sync
else:
# Used for single gpu training and DDP gradient synchronization
# processes.
context = nullcontext
with context():
loss.backward()
layer_tools.print_grads(self.model, print_func=None)
# optimizer step
if (batch_index + 1) % train_conf.accum_grad == 0: if (batch_index + 1) % train_conf.accum_grad == 0:
self.optimizer.step() self.optimizer.step()
self.optimizer.clear_grad() self.optimizer.clear_grad()
@ -167,14 +185,11 @@ class U2Trainer(Trainer):
# script_model_path = str(self.checkpoint_dir / 'init') # script_model_path = str(self.checkpoint_dir / 'init')
# paddle.jit.save(script_model, script_model_path) # paddle.jit.save(script_model, script_model_path)
from_scratch = self.resume_or_scratch() self.before_train()
if from_scratch:
# save init model, i.e. 0 epoch
self.save(tag='init')
self.lr_scheduler.step(self.iteration)
logger.info(f"Train Total Examples: {len(self.train_loader.dataset)}") logger.info(f"Train Total Examples: {len(self.train_loader.dataset)}")
while self.epoch < self.config.training.n_epoch: while self.epoch < self.config.training.n_epoch:
with Timer("Epoch-Train Time Cost: {}"):
self.model.train() self.model.train()
try: try:
data_start_time = time.time() data_start_time = time.time()
@ -188,11 +203,13 @@ class U2Trainer(Trainer):
msg += "lr: {:>.8f}, ".format(self.lr_scheduler()) msg += "lr: {:>.8f}, ".format(self.lr_scheduler())
msg += "data time: {:>.3f}s, ".format(dataload_time) msg += "data time: {:>.3f}s, ".format(dataload_time)
self.train_batch(batch_index, batch, msg) self.train_batch(batch_index, batch, msg)
self.after_train_batch()
data_start_time = time.time() data_start_time = time.time()
except Exception as e: except Exception as e:
logger.error(e) logger.error(e)
raise e raise e
with Timer("Eval Time Cost: {}"):
total_loss, num_seen_utts = self.valid() total_loss, num_seen_utts = self.valid()
if dist.get_world_size() > 1: if dist.get_world_size() > 1:
num_seen_utts = paddle.to_tensor(num_seen_utts) num_seen_utts = paddle.to_tensor(num_seen_utts)
@ -300,10 +317,10 @@ class U2Trainer(Trainer):
# model # model
model_conf = config.model model_conf = config.model
model_conf.defrost() with UpdateConfig(model_conf):
model_conf.input_dim = self.train_loader.feat_dim model_conf.input_dim = self.train_loader.feat_dim
model_conf.output_dim = self.train_loader.vocab_size model_conf.output_dim = self.train_loader.vocab_size
model_conf.freeze()
model = U2Model.from_config(model_conf) model = U2Model.from_config(model_conf)
if self.parallel: if self.parallel:
model = paddle.DataParallel(model) model = paddle.DataParallel(model)
@ -429,9 +446,10 @@ class U2Tester(U2Trainer):
len_refs += len_ref len_refs += len_ref
num_ins += 1 num_ins += 1
if fout: if fout:
fout.write(utt + " " + result + "\n") fout.write({"utt": utt, "ref": target, "hyp": result})
logger.info("\nTarget Transcription: %s\nOutput Transcription: %s" % logger.info(f"Utt: {utt}")
(target, result)) logger.info(f"Ref: {target}")
logger.info(f"Hyp: {result}")
logger.info("One example error rate [%s] = %f" % logger.info("One example error rate [%s] = %f" %
(cfg.error_rate_type, error_rate_func(target, result))) (cfg.error_rate_type, error_rate_func(target, result)))
@ -456,7 +474,7 @@ class U2Tester(U2Trainer):
errors_sum, len_refs, num_ins = 0.0, 0, 0 errors_sum, len_refs, num_ins = 0.0, 0, 0
num_frames = 0.0 num_frames = 0.0
num_time = 0.0 num_time = 0.0
with open(self.args.result_file, 'w') as fout: with jsonlines.open(self.args.result_file, 'w') as fout:
for i, batch in enumerate(self.test_loader): for i, batch in enumerate(self.test_loader):
metrics = self.compute_metrics(*batch, fout=fout) metrics = self.compute_metrics(*batch, fout=fout)
num_frames += metrics['num_frames'] num_frames += metrics['num_frames']
@ -526,9 +544,8 @@ class U2Tester(U2Trainer):
self.model.eval() self.model.eval()
logger.info(f"Align Total Examples: {len(self.align_loader.dataset)}") logger.info(f"Align Total Examples: {len(self.align_loader.dataset)}")
stride_ms = self.config.collater.stride_ms stride_ms = self.align_loader.collate_fn.stride_ms
token_dict = self.args.char_list token_dict = self.align_loader.collate_fn.vocab_list
with open(self.args.result_file, 'w') as fout: with open(self.args.result_file, 'w') as fout:
# one example in batch # one example in batch
for i, batch in enumerate(self.align_loader): for i, batch in enumerate(self.align_loader):
@ -537,7 +554,7 @@ class U2Tester(U2Trainer):
# 1. Encoder # 1. Encoder
encoder_out, encoder_mask = self.model._forward_encoder( encoder_out, encoder_mask = self.model._forward_encoder(
feat, feats_length) # (B, maxlen, encoder_dim) feat, feats_length) # (B, maxlen, encoder_dim)
maxlen = encoder_out.size(1) maxlen = encoder_out.shape[1]
ctc_probs = self.model.ctc.log_softmax( ctc_probs = self.model.ctc.log_softmax(
encoder_out) # (1, maxlen, vocab_size) encoder_out) # (1, maxlen, vocab_size)
@ -545,26 +562,25 @@ class U2Tester(U2Trainer):
ctc_probs = ctc_probs.squeeze(0) ctc_probs = ctc_probs.squeeze(0)
target = target.squeeze(0) target = target.squeeze(0)
alignment = ctc_utils.forced_align(ctc_probs, target) alignment = ctc_utils.forced_align(ctc_probs, target)
logger.info("align ids", key[0], alignment) logger.info(f"align ids: {key[0]} {alignment}")
fout.write('{} {}\n'.format(key[0], alignment)) fout.write('{} {}\n'.format(key[0], alignment))
# 3. gen praat # 3. gen praat
# segment alignment # segment alignment
align_segs = text_grid.segment_alignment(alignment) align_segs = text_grid.segment_alignment(alignment)
logger.info("align tokens", key[0], align_segs) logger.info(f"align tokens: {key[0]}, {align_segs}")
# IntervalTier, List["start end token\n"] # IntervalTier, List["start end token\n"]
subsample = utility.get_subsample(self.config) subsample = utility.get_subsample(self.config)
tierformat = text_grid.align_to_tierformat( tierformat = text_grid.align_to_tierformat(
align_segs, subsample, token_dict) align_segs, subsample, token_dict)
# write tier # write tier
align_output_path = os.path.join( align_output_path = Path(self.args.result_file).parent / "align"
os.path.dirname(self.args.result_file), "align") align_output_path.mkdir(parents=True, exist_ok=True)
tier_path = os.path.join(align_output_path, key[0] + ".tier") tier_path = align_output_path / (key[0] + ".tier")
with open(tier_path, 'w') as f: with tier_path.open('w') as f:
f.writelines(tierformat) f.writelines(tierformat)
# write textgrid # write textgrid
textgrid_path = os.path.join(align_output_path, textgrid_path = align_output_path / (key[0] + ".TextGrid")
key[0] + ".TextGrid")
second_per_frame = 1. / (1000. / second_per_frame = 1. / (1000. /
stride_ms) # 25ms window, 10ms stride stride_ms) # 25ms window, 10ms stride
second_per_example = ( second_per_example = (
@ -572,7 +588,7 @@ class U2Tester(U2Trainer):
text_grid.generate_textgrid( text_grid.generate_textgrid(
maxtime=second_per_example, maxtime=second_per_example,
intervals=tierformat, intervals=tierformat,
output=textgrid_path) output=str(textgrid_path))
def run_align(self): def run_align(self):
self.resume_or_scratch() self.resume_or_scratch()
@ -623,7 +639,7 @@ class U2Tester(U2Trainer):
def setup(self): def setup(self):
"""Setup the experiment. """Setup the experiment.
""" """
paddle.set_device(self.args.device) paddle.set_device('gpu' if self.args.nprocs > 0 else 'cpu')
self.setup_output_dir() self.setup_output_dir()
self.setup_checkpointer() self.setup_checkpointer()
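UpdateConfig replaces the explicit defrost()/freeze() pairs in the hunks above. Judging from what it replaces, it is presumably a context manager along these lines; this is a sketch, not the project's actual implementation:

from contextlib import contextmanager
from yacs.config import CfgNode

@contextmanager
def UpdateConfig(config: CfgNode):
    """Temporarily unfreeze a yacs CfgNode so fields can be edited."""
    config.defrost()
    try:
        yield config
    finally:
        config.freeze()

cfg = CfgNode({'model': {'input_dim': -1}})
cfg.freeze()
with UpdateConfig(cfg):
    cfg.model.input_dim = 80
print(cfg.model.input_dim)  # 80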

@ -30,7 +30,7 @@ def main_sp(config, args):
def main(config, args): def main(config, args):
if args.device == "gpu" and args.nprocs > 1: if args.nprocs > 0:
dist.spawn(main_sp, args=(config, args), nprocs=args.nprocs) dist.spawn(main_sp, args=(config, args), nprocs=args.nprocs)
else: else:
main_sp(config, args) main_sp(config, args)

@ -17,9 +17,11 @@ import os
import sys import sys
import time import time
from collections import defaultdict from collections import defaultdict
from contextlib import nullcontext
from pathlib import Path from pathlib import Path
from typing import Optional from typing import Optional
import jsonlines
import numpy as np import numpy as np
import paddle import paddle
from paddle import distributed as dist from paddle import distributed as dist
@ -37,6 +39,7 @@ from deepspeech.io.sampler import SortagradDistributedBatchSampler
from deepspeech.models.u2_st import U2STModel from deepspeech.models.u2_st import U2STModel
from deepspeech.training.gradclip import ClipGradByGlobalNormWithLog from deepspeech.training.gradclip import ClipGradByGlobalNormWithLog
from deepspeech.training.scheduler import WarmupLR from deepspeech.training.scheduler import WarmupLR
from deepspeech.training.timer import Timer
from deepspeech.training.trainer import Trainer from deepspeech.training.trainer import Trainer
from deepspeech.utils import bleu_score from deepspeech.utils import bleu_score
from deepspeech.utils import ctc_utils from deepspeech.utils import ctc_utils
@ -45,6 +48,7 @@ from deepspeech.utils import mp_tools
from deepspeech.utils import text_grid from deepspeech.utils import text_grid
from deepspeech.utils import utility from deepspeech.utils import utility
from deepspeech.utils.log import Log from deepspeech.utils.log import Log
from deepspeech.utils.utility import UpdateConfig
logger = Log(__name__).getlog() logger = Log(__name__).getlog()
@ -83,6 +87,7 @@ class U2STTrainer(Trainer):
def train_batch(self, batch_index, batch_data, msg): def train_batch(self, batch_index, batch_data, msg):
train_conf = self.config.training train_conf = self.config.training
start = time.time() start = time.time()
# forward
utt, audio, audio_len, text, text_len = batch_data utt, audio, audio_len, text, text_len = batch_data
if isinstance(text, list) and isinstance(text_len, list): if isinstance(text, list) and isinstance(text_len, list):
# joint training with ASR. Two decoding texts [translation, transcription] # joint training with ASR. Two decoding texts [translation, transcription]
@ -94,18 +99,30 @@ class U2STTrainer(Trainer):
else: else:
loss, st_loss, attention_loss, ctc_loss = self.model( loss, st_loss, attention_loss, ctc_loss = self.model(
audio, audio_len, text, text_len) audio, audio_len, text, text_len)
# loss div by `batch_size * accum_grad` # loss div by `batch_size * accum_grad`
loss /= train_conf.accum_grad loss /= train_conf.accum_grad
loss.backward()
layer_tools.print_grads(self.model, print_func=None)
losses_np = {'loss': float(loss) * train_conf.accum_grad} losses_np = {'loss': float(loss) * train_conf.accum_grad}
losses_np['st_loss'] = float(st_loss)
if attention_loss: if attention_loss:
losses_np['att_loss'] = float(attention_loss) losses_np['att_loss'] = float(attention_loss)
if ctc_loss: if ctc_loss:
losses_np['ctc_loss'] = float(ctc_loss) losses_np['ctc_loss'] = float(ctc_loss)
# loss backward
if (batch_index + 1) % train_conf.accum_grad != 0:
# Disable gradient synchronizations across DDP processes.
# Within this context, gradients will be accumulated on module
# variables, which will later be synchronized.
context = self.model.no_sync
else:
# Used for single gpu training and DDP gradient synchronization
# processes.
context = nullcontext
with context():
loss.backward()
layer_tools.print_grads(self.model, print_func=None)
# optimizer step
if (batch_index + 1) % train_conf.accum_grad == 0: if (batch_index + 1) % train_conf.accum_grad == 0:
self.optimizer.step() self.optimizer.step()
self.optimizer.clear_grad() self.optimizer.clear_grad()
@ -182,17 +199,11 @@ class U2STTrainer(Trainer):
# script_model_path = str(self.checkpoint_dir / 'init') # script_model_path = str(self.checkpoint_dir / 'init')
# paddle.jit.save(script_model, script_model_path) # paddle.jit.save(script_model, script_model_path)
from_scratch = self.resume_or_scratch() self.before_train()
if from_scratch:
# save init model, i.e. 0 epoch
self.save(tag='init')
self.lr_scheduler.step(self.iteration)
if self.parallel:
self.train_loader.batch_sampler.set_epoch(self.epoch)
logger.info(f"Train Total Examples: {len(self.train_loader.dataset)}") logger.info(f"Train Total Examples: {len(self.train_loader.dataset)}")
while self.epoch < self.config.training.n_epoch: while self.epoch < self.config.training.n_epoch:
with Timer("Epoch-Train Time Cost: {}"):
self.model.train() self.model.train()
try: try:
data_start_time = time.time() data_start_time = time.time()
@ -206,11 +217,13 @@ class U2STTrainer(Trainer):
msg += "lr: {:>.8f}, ".format(self.lr_scheduler()) msg += "lr: {:>.8f}, ".format(self.lr_scheduler())
msg += "data time: {:>.3f}s, ".format(dataload_time) msg += "data time: {:>.3f}s, ".format(dataload_time)
self.train_batch(batch_index, batch, msg) self.train_batch(batch_index, batch, msg)
self.after_train_batch()
data_start_time = time.time() data_start_time = time.time()
except Exception as e: except Exception as e:
logger.error(e) logger.error(e)
raise e raise e
with Timer("Eval Time Cost: {}"):
total_loss, num_seen_utts = self.valid() total_loss, num_seen_utts = self.valid()
if dist.get_world_size() > 1: if dist.get_world_size() > 1:
num_seen_utts = paddle.to_tensor(num_seen_utts) num_seen_utts = paddle.to_tensor(num_seen_utts)
@ -327,10 +340,10 @@ class U2STTrainer(Trainer):
def setup_model(self): def setup_model(self):
config = self.config config = self.config
model_conf = config.model model_conf = config.model
model_conf.defrost() with UpdateConfig(model_conf):
model_conf.input_dim = self.train_loader.collate_fn.feature_size model_conf.input_dim = self.train_loader.collate_fn.feature_size
model_conf.output_dim = self.train_loader.collate_fn.vocab_size model_conf.output_dim = self.train_loader.collate_fn.vocab_size
model_conf.freeze()
model = U2STModel.from_config(model_conf) model = U2STModel.from_config(model_conf)
if self.parallel: if self.parallel:
@ -467,8 +480,10 @@ class U2STTester(U2STTrainer):
len_refs += len(target.split()) len_refs += len(target.split())
num_ins += 1 num_ins += 1
if fout: if fout:
fout.write(utt + " " + result + "\n") fout.write({"utt": utt, "ref": target, "hyp": result})
logger.info("\nReference: %s\nHypothesis: %s" % (target, result)) logger.info(f"Utt: {utt}")
logger.info(f"Ref: {target}")
logger.info(f"Hyp: {result}")
logger.info("One example BLEU = %s" % logger.info("One example BLEU = %s" %
(bleu_func([result], [[target]]).prec_str)) (bleu_func([result], [[target]]).prec_str))
@ -496,7 +511,7 @@ class U2STTester(U2STTrainer):
len_refs, num_ins = 0, 0 len_refs, num_ins = 0, 0
num_frames = 0.0 num_frames = 0.0
num_time = 0.0 num_time = 0.0
with open(self.args.result_file, 'w') as fout: with jsonlines.open(self.args.result_file, 'w') as fout:
for i, batch in enumerate(self.test_loader): for i, batch in enumerate(self.test_loader):
metrics = self.compute_translation_metrics( metrics = self.compute_translation_metrics(
*batch, bleu_func=bleu_func, fout=fout) *batch, bleu_func=bleu_func, fout=fout)
@ -569,7 +584,7 @@ class U2STTester(U2STTrainer):
# 1. Encoder # 1. Encoder
encoder_out, encoder_mask = self.model._forward_encoder( encoder_out, encoder_mask = self.model._forward_encoder(
feat, feats_length) # (B, maxlen, encoder_dim) feat, feats_length) # (B, maxlen, encoder_dim)
maxlen = encoder_out.size(1) maxlen = encoder_out.shape[1]
ctc_probs = self.model.ctc.log_softmax( ctc_probs = self.model.ctc.log_softmax(
encoder_out) # (1, maxlen, vocab_size) encoder_out) # (1, maxlen, vocab_size)
@ -577,26 +592,25 @@ class U2STTester(U2STTrainer):
ctc_probs = ctc_probs.squeeze(0) ctc_probs = ctc_probs.squeeze(0)
target = target.squeeze(0) target = target.squeeze(0)
alignment = ctc_utils.forced_align(ctc_probs, target) alignment = ctc_utils.forced_align(ctc_probs, target)
logger.info("align ids", key[0], alignment) logger.info(f"align ids: {key[0]} {alignment}")
fout.write('{} {}\n'.format(key[0], alignment)) fout.write('{} {}\n'.format(key[0], alignment))
# 3. gen praat # 3. gen praat
# segment alignment # segment alignment
align_segs = text_grid.segment_alignment(alignment) align_segs = text_grid.segment_alignment(alignment)
logger.info("align tokens", key[0], align_segs) logger.info(f"align tokens: {key[0]}, {align_segs}")
# IntervalTier, List["start end token\n"] # IntervalTier, List["start end token\n"]
subsample = utility.get_subsample(self.config) subsample = utility.get_subsample(self.config)
tierformat = text_grid.align_to_tierformat( tierformat = text_grid.align_to_tierformat(
align_segs, subsample, token_dict) align_segs, subsample, token_dict)
# write tier # write tier
align_output_path = os.path.join( align_output_path = Path(self.args.result_file).parent / "align"
os.path.dirname(self.args.result_file), "align") align_output_path.mkdir(parents=True, exist_ok=True)
tier_path = os.path.join(align_output_path, key[0] + ".tier") tier_path = align_output_path / (key[0] + ".tier")
with open(tier_path, 'w') as f: with tier_path.open('w') as f:
f.writelines(tierformat) f.writelines(tierformat)
# write textgrid # write textgrid
textgrid_path = os.path.join(align_output_path, textgrid_path = align_output_path / (key[0] + ".TextGrid")
key[0] + ".TextGrid")
second_per_frame = 1. / (1000. / second_per_frame = 1. / (1000. /
stride_ms) # 25ms window, 10ms stride stride_ms) # 25ms window, 10ms stride
second_per_example = ( second_per_example = (
@ -604,7 +618,7 @@ class U2STTester(U2STTrainer):
text_grid.generate_textgrid( text_grid.generate_textgrid(
maxtime=second_per_example, maxtime=second_per_example,
intervals=tierformat, intervals=tierformat,
output=textgrid_path) output=str(textgrid_path))
def run_align(self): def run_align(self):
self.resume_or_scratch() self.resume_or_scratch()
@ -650,7 +664,7 @@ class U2STTester(U2STTrainer):
def setup(self): def setup(self):
"""Setup the experiment. """Setup the experiment.
""" """
paddle.set_device(self.args.device) paddle.set_device('gpu' if self.args.nprocs > 0 else 'cpu')
self.setup_output_dir() self.setup_output_dir()
self.setup_checkpointer() self.setup_checkpointer()
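With the switch to jsonlines, every line of the result file is now a JSON object with utt/ref/hyp fields, so it can be inspected or re-scored after the fact. The path below is a placeholder for whatever --result_file was set to:

import jsonlines

with jsonlines.open('exp/default/test.rsl') as reader:
    for rec in reader:
        print(rec["utt"], "|", rec["ref"], "->", rec["hyp"])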

@ -196,7 +196,12 @@ class TextFeaturizer():
[(idx, token) for (idx, token) in enumerate(vocab_list)]) [(idx, token) for (idx, token) in enumerate(vocab_list)])
token2id = dict( token2id = dict(
[(token, idx) for (idx, token) in enumerate(vocab_list)]) [(token, idx) for (idx, token) in enumerate(vocab_list)])
if UNK in vocab_list:
unk_id = vocab_list.index(UNK) unk_id = vocab_list.index(UNK)
else:
unk_id = -1
if EOS in vocab_list:
eos_id = vocab_list.index(EOS) eos_id = vocab_list.index(EOS)
else:
eos_id = -1
return token2id, id2token, vocab_list, unk_id, eos_id return token2id, id2token, vocab_list, unk_id, eos_id
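The new UNK/EOS handling above falls back to -1 when a symbol is missing from the vocabulary. A small self-contained example with a toy vocab; the UNK/EOS constants here are placeholders for the featurizer's own:

UNK = '<unk>'
EOS = '</s>'

vocab_list = ['<blank>', '<unk>', 'a', 'b', 'c']
token2id = {token: idx for idx, token in enumerate(vocab_list)}
id2token = dict(enumerate(vocab_list))
unk_id = vocab_list.index(UNK) if UNK in vocab_list else -1
eos_id = vocab_list.index(EOS) if EOS in vocab_list else -1
print(unk_id, eos_id)  # 1 -1  (this toy vocab has <unk> but no </s>)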

@ -130,7 +130,8 @@ class FeatureNormalizer(object):
def _read_mean_std_from_file(self, filepath, eps=1e-20): def _read_mean_std_from_file(self, filepath, eps=1e-20):
"""Load mean and std from file.""" """Load mean and std from file."""
mean, istd = load_cmvn(filepath, filetype='json') filetype = filepath.split(".")[-1]
mean, istd = load_cmvn(filepath, filetype=filetype)
self._mean = np.expand_dims(mean, axis=0) self._mean = np.expand_dims(mean, axis=0)
self._istd = np.expand_dims(istd, axis=0) self._istd = np.expand_dims(istd, axis=0)

@ -12,13 +12,13 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
"""Contains data helper functions.""" """Contains data helper functions."""
import codecs
import json import json
import math import math
from typing import List from typing import List
from typing import Optional from typing import Optional
from typing import Text from typing import Text
import jsonlines
import numpy as np import numpy as np
from deepspeech.utils.log import Log from deepspeech.utils.log import Log
@ -92,12 +92,8 @@ def read_manifest(
""" """
manifest = [] manifest = []
for json_line in codecs.open(manifest_path, 'r', 'utf-8'): with jsonlines.open(manifest_path, 'r') as reader:
try: for json_data in reader:
json_data = json.loads(json_line)
except Exception as e:
raise IOError("Error reading manifest: %s" % str(e))
feat_len = json_data["feat_shape"][ feat_len = json_data["feat_shape"][
0] if 'feat_shape' in json_data else 1.0 0] if 'feat_shape' in json_data else 1.0
token_len = json_data["token_shape"][ token_len = json_data["token_shape"][
@ -284,6 +280,13 @@ def load_cmvn(cmvn_file: str, filetype: str):
cmvn = _load_json_cmvn(cmvn_file) cmvn = _load_json_cmvn(cmvn_file)
elif filetype == "kaldi": elif filetype == "kaldi":
cmvn = _load_kaldi_cmvn(cmvn_file) cmvn = _load_kaldi_cmvn(cmvn_file)
elif filetype == "npz":
eps = 1e-14
npzfile = np.load(cmvn_file)
mean = np.squeeze(npzfile["mean"])
std = np.squeeze(npzfile["std"])
istd = 1 / (std + eps)
cmvn = [mean, istd]
else: else:
raise ValueError(f"cmvn file type no support: {filetype}") raise ValueError(f"cmvn file type no support: {filetype}")
return cmvn[0], cmvn[1] return cmvn[0], cmvn[1]
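The new 'npz' branch of load_cmvn only requires an archive with 'mean' and 'std' arrays. A sketch of producing and reading back a compatible file, with random statistics standing in for real ones computed over a manifest:

import numpy as np

mean = np.random.rand(1, 80).astype('float32')
std = np.random.rand(1, 80).astype('float32') + 0.5
np.savez('mean_std.npz', mean=mean, std=std)

npzfile = np.load('mean_std.npz')
istd = 1.0 / (np.squeeze(npzfile['std']) + 1e-14)
print(np.squeeze(npzfile['mean']).shape, istd.shape)  # (80,) (80,)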

@ -292,10 +292,6 @@ class SpeechCollator():
olens = np.array(text_lens).astype(np.int64) olens = np.array(text_lens).astype(np.int64)
return utts, xs_pad, ilens, ys_pad, olens return utts, xs_pad, ilens, ys_pad, olens
@property
def manifest(self):
return self._manifest
@property @property
def vocab_size(self): def vocab_size(self):
return self._speech_featurizer.vocab_size return self._speech_featurizer.vocab_size

@ -44,7 +44,7 @@ def feat_dim_and_vocab_size(data_json: List[Dict[Text, Any]],
def batch_collate(x): def batch_collate(x):
"""de-tuple. """de-minibatch, since user compose batch.
Args: Args:
x (List[Tuple]): [(utts, xs, ilens, ys, olens)] x (List[Tuple]): [(utts, xs, ilens, ys, olens)]
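batch_collate's body is not shown here; given that the dataset composes whole minibatches itself, it presumably just unwraps the one-element list produced by a batch_size=1 DataLoader. A guessed sketch, not the repository's actual definition:

def batch_collate(x):
    """The dataset item is already a full minibatch; a batch_size=1
    DataLoader wraps it in a one-element list, so unwrap it."""
    return x[0]

print(batch_collate([("utt1", "feat1", "text1")]))  # ('utt1', 'feat1', 'text1')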

@ -147,3 +147,131 @@ class TransformDataset(Dataset):
def __getitem__(self, idx): def __getitem__(self, idx):
"""[] operator.""" """[] operator."""
return self.converter([self.reader(self.data[idx], return_uttid=True)]) return self.converter([self.reader(self.data[idx], return_uttid=True)])
class AudioDataset(Dataset):
def __init__(self,
data_file,
max_length=10240,
min_length=0,
token_max_length=200,
token_min_length=1,
batch_type='static',
batch_size=1,
max_frames_in_batch=0,
sort=True,
raw_wav=True,
stride_ms=10):
"""Dataset for loading audio data.
Attributes:
data_file: input data file
Plain text data file, each line contains following 7 fields,
which is split by '\t':
utt:utt1
feat:tmp/data/file1.wav or feat:tmp/data/fbank.ark:30
feat_shape: 4.95(in seconds) or feat_shape:495,80(495 is in frames)
text:i love you
token: i <space> l o v e <space> y o u
tokenid: int id of this token
token_shape: M,N # M is the number of token, N is vocab size
max_length: drop utterances longer than max_length, in units of 10ms frames.
min_length: drop utterances shorter than min_length, in units of 10ms frames.
token_max_length: drop utterances whose token sequence is longer than
token_max_length, especially when using char units for English modeling.
token_min_length: drop utterances whose token sequence is shorter than token_min_length.
batch_type: 'static' or 'dynamic'; see max_frames_in_batch (dynamic).
batch_size: number of utterances in a batch; used for static batching.
max_frames_in_batch: maximum number of feature frames in a batch;
used for dynamic batching when batch_type is 'dynamic'.
batch_size is then ignored and the batch keeps filling
until its total frame count reaches max_frames_in_batch.
sort: whether to sort all data so that utterances of similar
length end up in the same batch.
raw_wav: use raw waveforms or pre-extracted features.
With raw waveforms, dynamic waveform-level augmentation can be used
and features are extracted on the fly.
With pre-extracted features (e.g. by kaldi), only feature-level
augmentation such as SpecAug can be used.
"""
assert batch_type in ['static', 'dynamic']
# read manifest
data = read_manifest(data_file)
if sort:
data = sorted(data, key=lambda x: x["feat_shape"][0])
if raw_wav:
# raw-wav manifests must not point at pre-extracted kaldi ark/scp features
feat_path = data[0]['feat'].split(':')[0]
assert not feat_path.endswith(('.ark', '.scp'))
# convert duration (seconds) into a frame count, one frame per stride_ms
for x in data:
    x['feat_shape'][0] = float(x['feat_shape'][0]) * 1000 / stride_ms
self.input_dim = data[0]['feat_shape'][1]
self.output_dim = data[0]['token_shape'][1]
# with open(data_file, 'r') as f:
# for line in f:
# arr = line.strip().split('\t')
# if len(arr) != 7:
# continue
# key = arr[0].split(':')[1]
# tokenid = arr[5].split(':')[1]
# output_dim = int(arr[6].split(':')[1].split(',')[1])
# if raw_wav:
# wav_path = ':'.join(arr[1].split(':')[1:])
# duration = int(float(arr[2].split(':')[1]) * 1000 / 10)
# data.append((key, wav_path, duration, tokenid))
# else:
# feat_ark = ':'.join(arr[1].split(':')[1:])
# feat_info = arr[2].split(':')[1].split(',')
# feat_dim = int(feat_info[1].strip())
# num_frames = int(feat_info[0].strip())
# data.append((key, feat_ark, num_frames, tokenid))
# self.input_dim = feat_dim
# self.output_dim = output_dim
valid_data = []
for i in range(len(data)):
length = data[i]['feat_shape'][0]
token_length = data[i]['token_shape'][0]
# remove utterances that are too long or too short on either input or output
# to prevent from out of memory
if length > max_length or length < min_length:
# logging.warn('ignore utterance {} feature {}'.format(
# data[i][0], length))
pass
elif token_length > token_max_length or token_length < token_min_length:
pass
else:
valid_data.append(data[i])
data = valid_data
self.minibatch = []
num_data = len(data)
# Dynamic batch size
if batch_type == 'dynamic':
assert (max_frames_in_batch > 0)
self.minibatch.append([])
num_frames_in_batch = 0
for i in range(num_data):
length = data[i]['feat_shape'][0]
num_frames_in_batch += length
if num_frames_in_batch > max_frames_in_batch:
self.minibatch.append([])
num_frames_in_batch = length
self.minibatch[-1].append(data[i])
# Static batch size
else:
cur = 0
while cur < num_data:
end = min(cur + batch_size, num_data)
item = []
for i in range(cur, end):
item.append(data[i])
self.minibatch.append(item)
cur = end
def __len__(self):
return len(self.minibatch)
def __getitem__(self, idx):
# each item is a pre-composed minibatch (a list of manifest dicts)
instance = self.minibatch[idx]
return ([x["utt"] for x in instance], [x["feat"] for x in instance],
        [x["text"] for x in instance])
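The dynamic-batching branch above keeps appending utterances until the running frame count would exceed max_frames_in_batch, then starts a new minibatch. The same grouping logic in a standalone, runnable form (dynamic_minibatches is an illustrative helper, not part of the dataset):

def dynamic_minibatches(lengths, max_frames_in_batch):
    minibatch, batches, num_frames = [], [], 0
    for length in lengths:
        num_frames += length
        if minibatch and num_frames > max_frames_in_batch:
            # current batch is full: flush it and start a new one
            batches.append(minibatch)
            minibatch, num_frames = [], length
        minibatch.append(length)
    if minibatch:
        batches.append(minibatch)
    return batches

print(dynamic_minibatches([100, 200, 300, 400, 500], max_frames_in_batch=700))
# [[100, 200, 300], [400], [500]]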

@ -106,11 +106,9 @@ class ConvBn(nn.Layer):
# reset padding part to 0 # reset padding part to 0
masks = make_non_pad_mask(x_len) #[B, T] masks = make_non_pad_mask(x_len) #[B, T]
masks = masks.unsqueeze(1).unsqueeze(1) # [B, 1, 1, T] masks = masks.unsqueeze(1).unsqueeze(1) # [B, 1, 1, T]
# TODO(Hui Zhang): not support bool multiply # https://github.com/PaddlePaddle/Paddle/pull/29265
# masks = masks.type_as(x) # rhs will type promote to lhs
masks = masks.astype(x.dtype) x = x * masks
x = x.multiply(masks)
return x, x_len return x, x_len
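The masking change above multiplies activations by a boolean non-pad mask directly and lets type promotion handle the cast. A numpy stand-in illustrating what such a mask looks like and how the multiply zeroes padded frames (the project's own make_non_pad_mask operates on paddle tensors):

import numpy as np

def make_non_pad_mask(lengths, max_len):
    """True for real frames, False for padding: shape [B, T]."""
    steps = np.arange(max_len)                    # [T]
    return steps[None, :] < np.asarray(lengths)[:, None]

x = np.ones((2, 4, 3))                            # [B, T, D] batch with padding
masks = make_non_pad_mask([4, 2], max_len=4)      # [B, T]
x = x * masks[:, :, None]                         # bool mask promotes to float
print(x[1])                                       # frames 2..3 of the short utterance are zeroed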

@@ -128,8 +128,8 @@ class DeepSpeech2Model(nn.Layer):
                 num_rnn_layers=3,  #Number of stacking RNN layers.
                 rnn_layer_size=1024,  #RNN layer size (number of RNN cells).
                 use_gru=True,  #Use gru if set True. Use simple rnn if set False.
-                share_rnn_weights=True  #Whether to share input-hidden weights between forward and backward directional RNNs.Notice that for GRU, weight sharing is not supported.
-            ))
+                share_rnn_weights=True,  #Whether to share input-hidden weights between forward and backward directional RNNs.Notice that for GRU, weight sharing is not supported.
+                ctc_grad_norm_type='instance', ))
         if config is not None:
             config.merge_from_other_cfg(default)
         return default
@@ -141,7 +141,9 @@ class DeepSpeech2Model(nn.Layer):
                 num_rnn_layers=3,
                 rnn_size=1024,
                 use_gru=False,
-                share_rnn_weights=True):
+                share_rnn_weights=True,
+                blank_id=0,
+                ctc_grad_norm_type='instance'):
         super().__init__()
         self.encoder = CRNNEncoder(
             feat_size=feat_size,
@@ -156,10 +158,11 @@ class DeepSpeech2Model(nn.Layer):
         self.decoder = CTCDecoder(
             odim=dict_size,  # <blank> is in vocab
             enc_n_units=self.encoder.output_size,
-            blank_id=0,  # first token is <blank>
+            blank_id=blank_id,
             dropout_rate=0.0,
             reduction=True,  # sum
-            batch_average=True)  # sum / batch_size
+            batch_average=True,  # sum / batch_size
+            grad_norm_type=ctc_grad_norm_type)

     def forward(self, audio, audio_len, text, text_len):
         """Compute Model loss
@@ -221,7 +224,8 @@ class DeepSpeech2Model(nn.Layer):
             num_rnn_layers=config.model.num_rnn_layers,
             rnn_size=config.model.rnn_layer_size,
             use_gru=config.model.use_gru,
-            share_rnn_weights=config.model.share_rnn_weights)
+            share_rnn_weights=config.model.share_rnn_weights,
+            blank_id=config.model.blank_id)
         infos = Checkpoint().load_parameters(
             model, checkpoint_path=checkpoint_path)
         logger.info(f"checkpoint info: {infos}")
@@ -246,7 +250,8 @@ class DeepSpeech2Model(nn.Layer):
             num_rnn_layers=config.num_rnn_layers,
             rnn_size=config.rnn_layer_size,
             use_gru=config.use_gru,
-            share_rnn_weights=config.share_rnn_weights)
+            share_rnn_weights=config.share_rnn_weights,
+            blank_id=config.blank_id)
         return model
@@ -258,7 +263,8 @@ class DeepSpeech2InferModel(DeepSpeech2Model):
                 num_rnn_layers=3,
                 rnn_size=1024,
                 use_gru=False,
-                share_rnn_weights=True):
+                share_rnn_weights=True,
+                blank_id=0):
         super().__init__(
             feat_size=feat_size,
             dict_size=dict_size,
@@ -266,7 +272,8 @@ class DeepSpeech2InferModel(DeepSpeech2Model):
             num_rnn_layers=num_rnn_layers,
             rnn_size=rnn_size,
             use_gru=use_gru,
-            share_rnn_weights=share_rnn_weights)
+            share_rnn_weights=share_rnn_weights,
+            blank_id=blank_id)

     def forward(self, audio, audio_len):
         """export model function

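For reference, a hedged construction sketch showing where the two new arguments end up; feat_size and dict_size values are placeholders, and omitted arguments keep the defaults from the hunks above.

model = DeepSpeech2Model(
    feat_size=161,                   # placeholder feature dim
    dict_size=4233,                  # placeholder vocab size
    num_rnn_layers=3,
    rnn_size=1024,
    use_gru=False,
    share_rnn_weights=True,
    blank_id=0,                      # forwarded to CTCDecoder(blank_id=...)
    ctc_grad_norm_type='instance')   # forwarded to CTCDecoder(grad_norm_type=...)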
@@ -308,7 +308,8 @@ class RNNStack(nn.Layer):
             x, x_len = rnn(x, x_len)
             masks = make_non_pad_mask(x_len)  #[B, T]
             masks = masks.unsqueeze(-1)  # [B, T, 1]
-            # TODO(Hui Zhang): not support bool multiply
-            masks = masks.astype(x.dtype)
-            x = x.multiply(masks)
+            # https://github.com/PaddlePaddle/Paddle/pull/29265
+            # rhs will type promote to lhs
+            x = x * masks
         return x, x_len

@@ -254,6 +254,7 @@ class DeepSpeech2ModelOnline(nn.Layer):
                 num_fc_layers=2,
                 fc_layers_size_list=[512, 256],
                 use_gru=True,  #Use gru if set True. Use simple rnn if set False.
+                blank_id=0,  # index of blank in vocab.txt
             ))
         if config is not None:
             config.merge_from_other_cfg(default)
@@ -268,7 +269,8 @@ class DeepSpeech2ModelOnline(nn.Layer):
                 rnn_direction='forward',
                 num_fc_layers=2,
                 fc_layers_size_list=[512, 256],
-                use_gru=False):
+                use_gru=False,
+                blank_id=0):
         super().__init__()
         self.encoder = CRNNEncoder(
             feat_size=feat_size,
@@ -284,10 +286,11 @@ class DeepSpeech2ModelOnline(nn.Layer):
         self.decoder = CTCDecoder(
             odim=dict_size,  # <blank> is in vocab
             enc_n_units=self.encoder.output_size,
-            blank_id=0,  # first token is <blank>
+            blank_id=blank_id,
             dropout_rate=0.0,
             reduction=True,  # sum
-            batch_average=True)  # sum / batch_size
+            batch_average=True,  # sum / batch_size
+            grad_norm_type='instance')

     def forward(self, audio, audio_len, text, text_len):
         """Compute Model loss
@@ -353,7 +356,8 @@ class DeepSpeech2ModelOnline(nn.Layer):
             rnn_direction=config.model.rnn_direction,
             num_fc_layers=config.model.num_fc_layers,
             fc_layers_size_list=config.model.fc_layers_size_list,
-            use_gru=config.model.use_gru)
+            use_gru=config.model.use_gru,
+            blank_id=config.model.blank_id)
         infos = Checkpoint().load_parameters(
             model, checkpoint_path=checkpoint_path)
         logger.info(f"checkpoint info: {infos}")
@@ -380,7 +384,8 @@ class DeepSpeech2ModelOnline(nn.Layer):
             rnn_direction=config.rnn_direction,
             num_fc_layers=config.num_fc_layers,
             fc_layers_size_list=config.fc_layers_size_list,
-            use_gru=config.use_gru)
+            use_gru=config.use_gru,
+            blank_id=config.blank_id)
         return model
@@ -394,7 +399,8 @@ class DeepSpeech2InferModelOnline(DeepSpeech2ModelOnline):
                 rnn_direction='forward',
                 num_fc_layers=2,
                 fc_layers_size_list=[512, 256],
-                use_gru=False):
+                use_gru=False,
+                blank_id=0):
         super().__init__(
             feat_size=feat_size,
             dict_size=dict_size,
@@ -404,7 +410,8 @@ class DeepSpeech2InferModelOnline(DeepSpeech2ModelOnline):
             rnn_direction=rnn_direction,
             num_fc_layers=num_fc_layers,
             fc_layers_size_list=fc_layers_size_list,
-            use_gru=use_gru)
+            use_gru=use_gru,
+            blank_id=blank_id)

     def forward(self, audio_chunk, audio_chunk_lens, chunk_state_h_box,
                 chunk_state_c_box):

@ -0,0 +1,19 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .u2 import U2InferModel
from .u2 import U2Model
from .updater import U2Evaluator
from .updater import U2Updater
__all__ = ["U2Model", "U2InferModel", "U2Evaluator", "U2Updater"]

@ -48,6 +48,7 @@ from deepspeech.utils.tensor_utils import add_sos_eos
from deepspeech.utils.tensor_utils import pad_sequence from deepspeech.utils.tensor_utils import pad_sequence
from deepspeech.utils.tensor_utils import th_accuracy from deepspeech.utils.tensor_utils import th_accuracy
from deepspeech.utils.utility import log_add from deepspeech.utils.utility import log_add
from deepspeech.utils.utility import UpdateConfig
__all__ = ["U2Model", "U2InferModel"] __all__ = ["U2Model", "U2InferModel"]
@ -115,7 +116,8 @@ class U2BaseModel(nn.Layer):
ctc_weight: float=0.5, ctc_weight: float=0.5,
ignore_id: int=IGNORE_ID, ignore_id: int=IGNORE_ID,
lsm_weight: float=0.0, lsm_weight: float=0.0,
length_normalized_loss: bool=False): length_normalized_loss: bool=False,
**kwargs):
assert 0.0 <= ctc_weight <= 1.0, ctc_weight assert 0.0 <= ctc_weight <= 1.0, ctc_weight
super().__init__() super().__init__()
@@ -162,10 +164,7 @@ class U2BaseModel(nn.Layer):
         encoder_out, encoder_mask = self.encoder(speech, speech_lengths)
         encoder_time = time.time() - start
         #logger.debug(f"encoder time: {encoder_time}")
-        #TODO(Hui Zhang): sum not support bool type
-        #encoder_out_lens = encoder_mask.squeeze(1).sum(1)  #[B, 1, T] -> [B]
-        encoder_out_lens = encoder_mask.squeeze(1).cast(paddle.int64).sum(
-            1)  #[B, 1, T] -> [B]
+        encoder_out_lens = encoder_mask.squeeze(1).sum(1)  #[B, 1, T] -> [B]

         # 2a. Attention-decoder branch
         loss_att = None
@ -299,8 +298,8 @@ class U2BaseModel(nn.Layer):
speech, speech_lengths, decoding_chunk_size, speech, speech_lengths, decoding_chunk_size,
num_decoding_left_chunks, num_decoding_left_chunks,
simulate_streaming) # (B, maxlen, encoder_dim) simulate_streaming) # (B, maxlen, encoder_dim)
maxlen = encoder_out.size(1) maxlen = encoder_out.shape[1]
encoder_dim = encoder_out.size(2) encoder_dim = encoder_out.shape[2]
running_size = batch_size * beam_size running_size = batch_size * beam_size
encoder_out = encoder_out.unsqueeze(1).repeat(1, beam_size, 1, 1).view( encoder_out = encoder_out.unsqueeze(1).repeat(1, beam_size, 1, 1).view(
running_size, maxlen, encoder_dim) # (B*N, maxlen, encoder_dim) running_size, maxlen, encoder_dim) # (B*N, maxlen, encoder_dim)
@ -320,8 +319,7 @@ class U2BaseModel(nn.Layer):
# 2. Decoder forward step by step # 2. Decoder forward step by step
for i in range(1, maxlen + 1): for i in range(1, maxlen + 1):
# Stop if all batch and all beam produce eos # Stop if all batch and all beam produce eos
# TODO(Hui Zhang): if end_flag.sum() == running_size: if end_flag.sum() == running_size:
if end_flag.cast(paddle.int64).sum() == running_size:
break break
# 2.1 Forward decoder step # 2.1 Forward decoder step
@ -406,10 +404,8 @@ class U2BaseModel(nn.Layer):
encoder_out, encoder_mask = self._forward_encoder( encoder_out, encoder_mask = self._forward_encoder(
speech, speech_lengths, decoding_chunk_size, speech, speech_lengths, decoding_chunk_size,
num_decoding_left_chunks, simulate_streaming) num_decoding_left_chunks, simulate_streaming)
maxlen = encoder_out.size(1) maxlen = encoder_out.shape[1]
# (TODO Hui Zhang): bool no support reduce_sum encoder_out_lens = encoder_mask.squeeze(1).sum(1)
# encoder_out_lens = encoder_mask.squeeze(1).sum(1)
encoder_out_lens = encoder_mask.squeeze(1).astype(paddle.int).sum(1)
ctc_probs = self.ctc.log_softmax(encoder_out) # (B, maxlen, vocab_size) ctc_probs = self.ctc.log_softmax(encoder_out) # (B, maxlen, vocab_size)
topk_prob, topk_index = ctc_probs.topk(1, axis=2) # (B, maxlen, 1) topk_prob, topk_index = ctc_probs.topk(1, axis=2) # (B, maxlen, 1)
@ -459,7 +455,7 @@ class U2BaseModel(nn.Layer):
speech, speech_lengths, decoding_chunk_size, speech, speech_lengths, decoding_chunk_size,
num_decoding_left_chunks, num_decoding_left_chunks,
simulate_streaming) # (B, maxlen, encoder_dim) simulate_streaming) # (B, maxlen, encoder_dim)
maxlen = encoder_out.size(1) maxlen = encoder_out.shape[1]
ctc_probs = self.ctc.log_softmax(encoder_out) # (1, maxlen, vocab_size) ctc_probs = self.ctc.log_softmax(encoder_out) # (1, maxlen, vocab_size)
ctc_probs = ctc_probs.squeeze(0) ctc_probs = ctc_probs.squeeze(0)
@ -587,7 +583,7 @@ class U2BaseModel(nn.Layer):
encoder_out = encoder_out.repeat(beam_size, 1, 1) encoder_out = encoder_out.repeat(beam_size, 1, 1)
encoder_mask = paddle.ones( encoder_mask = paddle.ones(
(beam_size, 1, encoder_out.size(1)), dtype=paddle.bool) (beam_size, 1, encoder_out.shape[1]), dtype=paddle.bool)
decoder_out, _ = self.decoder( decoder_out, _ = self.decoder(
encoder_out, encoder_mask, hyps_pad, encoder_out, encoder_mask, hyps_pad,
hyps_lens) # (beam_size, max_hyps_len, vocab_size) hyps_lens) # (beam_size, max_hyps_len, vocab_size)
@ -667,9 +663,7 @@ class U2BaseModel(nn.Layer):
xs, offset, required_cache_size, subsampling_cache, xs, offset, required_cache_size, subsampling_cache,
elayers_output_cache, conformer_cnn_cache) elayers_output_cache, conformer_cnn_cache)
# @jit.to_static([ # @jit.to_static
# paddle.static.InputSpec(shape=[1, None, feat_dim],dtype='float32'), # audio feat, [B,T,D]
# ])
def ctc_activation(self, xs: paddle.Tensor) -> paddle.Tensor: def ctc_activation(self, xs: paddle.Tensor) -> paddle.Tensor:
""" Export interface for c++ call, apply linear transform and log """ Export interface for c++ call, apply linear transform and log
softmax before ctc softmax before ctc
@ -696,13 +690,13 @@ class U2BaseModel(nn.Layer):
Returns: Returns:
paddle.Tensor: decoder output, (B, L) paddle.Tensor: decoder output, (B, L)
""" """
assert encoder_out.size(0) == 1 assert encoder_out.shape[0] == 1
num_hyps = hyps.size(0) num_hyps = hyps.shape[0]
assert hyps_lens.size(0) == num_hyps assert hyps_lens.shape[0] == num_hyps
encoder_out = encoder_out.repeat(num_hyps, 1, 1) encoder_out = encoder_out.repeat(num_hyps, 1, 1)
# (B, 1, T) # (B, 1, T)
encoder_mask = paddle.ones( encoder_mask = paddle.ones(
[num_hyps, 1, encoder_out.size(1)], dtype=paddle.bool) [num_hyps, 1, encoder_out.shape[1]], dtype=paddle.bool)
# (num_hyps, max_hyps_len, vocab_size) # (num_hyps, max_hyps_len, vocab_size)
decoder_out, _ = self.decoder(encoder_out, encoder_mask, hyps, decoder_out, _ = self.decoder(encoder_out, encoder_mask, hyps,
hyps_lens) hyps_lens)
@ -757,7 +751,7 @@ class U2BaseModel(nn.Layer):
Returns: Returns:
List[List[int]]: transcripts. List[List[int]]: transcripts.
""" """
batch_size = feats.size(0) batch_size = feats.shape[0]
if decoding_method in ['ctc_prefix_beam_search', if decoding_method in ['ctc_prefix_beam_search',
'attention_rescoring'] and batch_size > 1: 'attention_rescoring'] and batch_size > 1:
logger.fatal( logger.fatal(
@ -785,7 +779,7 @@ class U2BaseModel(nn.Layer):
# result in List[int], change it to List[List[int]] for compatible # result in List[int], change it to List[List[int]] for compatible
# with other batch decoding mode # with other batch decoding mode
elif decoding_method == 'ctc_prefix_beam_search': elif decoding_method == 'ctc_prefix_beam_search':
assert feats.size(0) == 1 assert feats.shape[0] == 1
hyp = self.ctc_prefix_beam_search( hyp = self.ctc_prefix_beam_search(
feats, feats,
feats_lengths, feats_lengths,
@ -795,7 +789,7 @@ class U2BaseModel(nn.Layer):
simulate_streaming=simulate_streaming) simulate_streaming=simulate_streaming)
hyps = [hyp] hyps = [hyp]
elif decoding_method == 'attention_rescoring': elif decoding_method == 'attention_rescoring':
assert feats.size(0) == 1 assert feats.shape[0] == 1
hyp = self.attention_rescoring( hyp = self.attention_rescoring(
feats, feats,
feats_lengths, feats_lengths,
@ -836,6 +830,7 @@ class U2Model(U2BaseModel):
Returns: Returns:
int, nn.Layer, nn.Layer, nn.Layer: vocab size, encoder, decoder, ctc int, nn.Layer, nn.Layer, nn.Layer: vocab size, encoder, decoder, ctc
""" """
# cmvn
if configs['cmvn_file'] is not None: if configs['cmvn_file'] is not None:
mean, istd = load_cmvn(configs['cmvn_file'], mean, istd = load_cmvn(configs['cmvn_file'],
configs['cmvn_file_type']) configs['cmvn_file_type'])
@ -845,11 +840,13 @@ class U2Model(U2BaseModel):
else: else:
global_cmvn = None global_cmvn = None
# input & output dim
input_dim = configs['input_dim'] input_dim = configs['input_dim']
vocab_size = configs['output_dim'] vocab_size = configs['output_dim']
assert input_dim != 0, input_dim assert input_dim != 0, input_dim
assert vocab_size != 0, vocab_size assert vocab_size != 0, vocab_size
# encoder
encoder_type = configs.get('encoder', 'transformer') encoder_type = configs.get('encoder', 'transformer')
logger.info(f"U2 Encoder type: {encoder_type}") logger.info(f"U2 Encoder type: {encoder_type}")
if encoder_type == 'transformer': if encoder_type == 'transformer':
@@ -861,16 +858,21 @@ class U2Model(U2BaseModel):
         else:
             raise ValueError(f"not support encoder type:{encoder_type}")

+        # decoder
         decoder = TransformerDecoder(vocab_size,
                                      encoder.output_size(),
                                      **configs['decoder_conf'])
+
+        # ctc decoder and ctc loss
+        model_conf = configs['model_conf']
         ctc = CTCDecoder(
             odim=vocab_size,
             enc_n_units=encoder.output_size(),
             blank_id=0,
-            dropout_rate=0.0,
+            dropout_rate=model_conf['ctc_dropoutrate'],
             reduction=True,  # sum
-            batch_average=True)  # sum / batch_size
+            batch_average=True,  # sum / batch_size
+            grad_norm_type=model_conf['ctc_grad_norm_type'])

         return vocab_size, encoder, decoder, ctc
@@ -902,10 +904,10 @@ class U2Model(U2BaseModel):
         Returns:
             DeepSpeech2Model: The model built from pretrained result.
         """
-        config.defrost()
-        config.input_dim = dataloader.collate_fn.feature_size
-        config.output_dim = dataloader.collate_fn.vocab_size
-        config.freeze()
+        with UpdateConfig(config):
+            config.input_dim = dataloader.collate_fn.feature_size
+            config.output_dim = dataloader.collate_fn.vocab_size

         model = cls.from_config(config)

         if checkpoint_path:

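The `with UpdateConfig(config):` blocks above replace the explicit defrost()/freeze() pairs. The helper itself is not shown in this patch; the following is only a plausible sketch of such a context manager, not the repo's implementation.

from contextlib import contextmanager

# Hypothetical sketch: the real UpdateConfig lives in deepspeech/utils/utility.py
# and is not included in this diff.
@contextmanager
def UpdateConfig(config):
    """Temporarily unfreeze a yacs CfgNode so fields can be edited."""
    config.defrost()
    try:
        yield config
    finally:
        config.freeze()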
@ -0,0 +1,149 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from contextlib import nullcontext
import paddle
from paddle import distributed as dist
from deepspeech.training.extensions.evaluator import StandardEvaluator
from deepspeech.training.reporter import report
from deepspeech.training.timer import Timer
from deepspeech.training.updaters.standard_updater import StandardUpdater
from deepspeech.utils import layer_tools
from deepspeech.utils.log import Log
logger = Log(__name__).getlog()
class U2Evaluator(StandardEvaluator):
def __init__(self, model, dataloader):
super().__init__(model, dataloader)
self.msg = ""
self.num_seen_utts = 0
self.total_loss = 0.0
def evaluate_core(self, batch):
self.msg = "Valid: Rank: {}, ".format(dist.get_rank())
losses_dict = {}
loss, attention_loss, ctc_loss = self.model(*batch[1:])
if paddle.isfinite(loss):
num_utts = batch[1].shape[0]
self.num_seen_utts += num_utts
self.total_loss += float(loss) * num_utts
losses_dict['loss'] = float(loss)
if attention_loss:
losses_dict['att_loss'] = float(attention_loss)
if ctc_loss:
losses_dict['ctc_loss'] = float(ctc_loss)
for k, v in losses_dict.items():
report("eval/" + k, v)
self.msg += ', '.join('{}: {:>.6f}'.format(k, v)
for k, v in losses_dict.items())
logger.info(self.msg)
return self.total_loss, self.num_seen_utts
class U2Updater(StandardUpdater):
def __init__(self,
model,
optimizer,
scheduler,
dataloader,
init_state=None,
accum_grad=1,
**kwargs):
super().__init__(
model, optimizer, scheduler, dataloader, init_state=init_state)
self.accum_grad = accum_grad
self.forward_count = 0
self.msg = ""
def update_core(self, batch):
"""One Step
Args:
batch (List[Object]): utts, xs, xlens, ys, ylens
"""
losses_dict = {}
self.msg = "Rank: {}, ".format(dist.get_rank())
# forward
batch_size = batch[1].shape[0]
loss, attention_loss, ctc_loss = self.model(*batch[1:])
# loss div by `batch_size * accum_grad`
loss /= self.accum_grad
# loss backward
if (self.forward_count + 1) != self.accum_grad:
# Disable gradient synchronizations across DDP processes.
# Within this context, gradients will be accumulated on module
# variables, which will later be synchronized.
context = self.model.no_sync
else:
# Used for single gpu training and DDP gradient synchronization
# processes.
context = nullcontext
with context():
loss.backward()
layer_tools.print_grads(self.model, print_func=None)
# loss info
losses_dict['loss'] = float(loss) * self.accum_grad
if attention_loss:
losses_dict['att_loss'] = float(attention_loss)
if ctc_loss:
losses_dict['ctc_loss'] = float(ctc_loss)
# report loss
for k, v in losses_dict.items():
report("train/" + k, v)
# loss msg
self.msg += "batch size: {}, ".format(batch_size)
self.msg += "accum: {}, ".format(self.accum_grad)
self.msg += ', '.join('{}: {:>.6f}'.format(k, v)
for k, v in losses_dict.items())
# Truncate the graph
loss.detach()
# update parameters
self.forward_count += 1
if self.forward_count != self.accum_grad:
return
self.forward_count = 0
self.optimizer.step()
self.optimizer.clear_grad()
self.scheduler.step()
def update(self):
# model is default in train mode
# training for a step is implemented here
with Timer("data time cost:{}"):
batch = self.read_batch()
with Timer("step time cost:{}"):
self.update_core(batch)
# #iterations with accum_grad > 1
# Ref.: https://github.com/espnet/espnet/issues/777
if self.forward_count == 0:
self.state.iteration += 1
if self.updates_per_epoch is not None:
if self.state.iteration % self.updates_per_epoch == 0:
self.state.epoch += 1

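A toy illustration (not from the patch) of the accum_grad bookkeeping in U2Updater.update_core above: the optimizer only steps every accum_grad forward passes, and the loss is pre-divided so the accumulated gradients match one large batch.

accum_grad, forward_count = 4, 0
for step in range(1, 9):
    # loss / accum_grad is backpropagated here; gradients keep accumulating
    forward_count += 1
    if forward_count == accum_grad:
        forward_count = 0
        print(f"micro-batch {step}: optimizer.step() + clear_grad()")
# the optimizer steps only after micro-batches 4 and 8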
@ -42,6 +42,7 @@ from deepspeech.utils import layer_tools
from deepspeech.utils.log import Log from deepspeech.utils.log import Log
from deepspeech.utils.tensor_utils import add_sos_eos from deepspeech.utils.tensor_utils import add_sos_eos
from deepspeech.utils.tensor_utils import th_accuracy from deepspeech.utils.tensor_utils import th_accuracy
from deepspeech.utils.utility import UpdateConfig
__all__ = ["U2STModel", "U2STInferModel"] __all__ = ["U2STModel", "U2STInferModel"]
@ -163,10 +164,7 @@ class U2STBaseModel(nn.Layer):
encoder_out, encoder_mask = self.encoder(speech, speech_lengths) encoder_out, encoder_mask = self.encoder(speech, speech_lengths)
encoder_time = time.time() - start encoder_time = time.time() - start
#logger.debug(f"encoder time: {encoder_time}") #logger.debug(f"encoder time: {encoder_time}")
#TODO(Hui Zhang): sum not support bool type encoder_out_lens = encoder_mask.squeeze(1).sum(1) #[B, 1, T] -> [B]
#encoder_out_lens = encoder_mask.squeeze(1).sum(1) #[B, 1, T] -> [B]
encoder_out_lens = encoder_mask.squeeze(1).cast(paddle.int64).sum(
1) #[B, 1, T] -> [B]
# 2a. ST-decoder branch # 2a. ST-decoder branch
start = time.time() start = time.time()
@ -342,8 +340,8 @@ class U2STBaseModel(nn.Layer):
speech, speech_lengths, decoding_chunk_size, speech, speech_lengths, decoding_chunk_size,
num_decoding_left_chunks, num_decoding_left_chunks,
simulate_streaming) # (B, maxlen, encoder_dim) simulate_streaming) # (B, maxlen, encoder_dim)
maxlen = encoder_out.size(1) maxlen = encoder_out.shape[1]
encoder_dim = encoder_out.size(2) encoder_dim = encoder_out.shape[2]
running_size = batch_size * beam_size running_size = batch_size * beam_size
encoder_out = encoder_out.unsqueeze(1).repeat(1, beam_size, 1, 1).view( encoder_out = encoder_out.unsqueeze(1).repeat(1, beam_size, 1, 1).view(
running_size, maxlen, encoder_dim) # (B*N, maxlen, encoder_dim) running_size, maxlen, encoder_dim) # (B*N, maxlen, encoder_dim)
@ -363,8 +361,7 @@ class U2STBaseModel(nn.Layer):
# 2. Decoder forward step by step # 2. Decoder forward step by step
for i in range(1, maxlen + 1): for i in range(1, maxlen + 1):
# Stop if all batch and all beam produce eos # Stop if all batch and all beam produce eos
# TODO(Hui Zhang): if end_flag.sum() == running_size: if end_flag.sum() == running_size:
if end_flag.cast(paddle.int64).sum() == running_size:
break break
# 2.1 Forward decoder step # 2.1 Forward decoder step
@@ -417,26 +414,26 @@ class U2STBaseModel(nn.Layer):
         best_hyps = best_hyps[:, 1:]
         return best_hyps

-    @jit.to_static
+    # @jit.to_static
     def subsampling_rate(self) -> int:
         """ Export interface for c++ call, return subsampling_rate of the
         model
         """
         return self.encoder.embed.subsampling_rate

-    @jit.to_static
+    # @jit.to_static
     def right_context(self) -> int:
         """ Export interface for c++ call, return right_context of the model
         """
         return self.encoder.embed.right_context

-    @jit.to_static
+    # @jit.to_static
     def sos_symbol(self) -> int:
         """ Export interface for c++ call, return sos symbol id of the model
         """
         return self.sos

-    @jit.to_static
+    # @jit.to_static
     def eos_symbol(self) -> int:
         """ Export interface for c++ call, return eos symbol id of the model
         """
@ -472,7 +469,7 @@ class U2STBaseModel(nn.Layer):
xs, offset, required_cache_size, subsampling_cache, xs, offset, required_cache_size, subsampling_cache,
elayers_output_cache, conformer_cnn_cache) elayers_output_cache, conformer_cnn_cache)
@jit.to_static # @jit.to_static
def ctc_activation(self, xs: paddle.Tensor) -> paddle.Tensor: def ctc_activation(self, xs: paddle.Tensor) -> paddle.Tensor:
""" Export interface for c++ call, apply linear transform and log """ Export interface for c++ call, apply linear transform and log
softmax before ctc softmax before ctc
@ -499,13 +496,13 @@ class U2STBaseModel(nn.Layer):
Returns: Returns:
paddle.Tensor: decoder output, (B, L) paddle.Tensor: decoder output, (B, L)
""" """
assert encoder_out.size(0) == 1 assert encoder_out.shape[0] == 1
num_hyps = hyps.size(0) num_hyps = hyps.shape[0]
assert hyps_lens.size(0) == num_hyps assert hyps_lens.shape[0] == num_hyps
encoder_out = encoder_out.repeat(num_hyps, 1, 1) encoder_out = encoder_out.repeat(num_hyps, 1, 1)
# (B, 1, T) # (B, 1, T)
encoder_mask = paddle.ones( encoder_mask = paddle.ones(
[num_hyps, 1, encoder_out.size(1)], dtype=paddle.bool) [num_hyps, 1, encoder_out.shape[1]], dtype=paddle.bool)
# (num_hyps, max_hyps_len, vocab_size) # (num_hyps, max_hyps_len, vocab_size)
decoder_out, _ = self.decoder(encoder_out, encoder_mask, hyps, decoder_out, _ = self.decoder(encoder_out, encoder_mask, hyps,
hyps_lens) hyps_lens)
@ -560,7 +557,7 @@ class U2STBaseModel(nn.Layer):
Returns: Returns:
List[List[int]]: transcripts. List[List[int]]: transcripts.
""" """
batch_size = feats.size(0) batch_size = feats.shape[0]
if decoding_method == 'fullsentence': if decoding_method == 'fullsentence':
hyps = self.translate( hyps = self.translate(
@@ -647,13 +644,16 @@ class U2STModel(U2STBaseModel):
         decoder = TransformerDecoder(vocab_size,
                                      encoder.output_size(),
                                      **configs['decoder_conf'])
+
+        # ctc decoder and ctc loss
+        model_conf = configs['model_conf']
         ctc = CTCDecoder(
             odim=vocab_size,
             enc_n_units=encoder.output_size(),
             blank_id=0,
-            dropout_rate=0.0,
+            dropout_rate=model_conf['ctc_dropout_rate'],
             reduction=True,  # sum
-            batch_average=True)  # sum / batch_size
+            batch_average=True,  # sum / batch_size
+            grad_norm_type=model_conf['ctc_grad_norm_type'])

         return vocab_size, encoder, (st_decoder, decoder, ctc)
     else:
@@ -687,10 +687,10 @@ class U2STModel(U2STBaseModel):
         Returns:
             DeepSpeech2Model: The model built from pretrained result.
         """
-        config.defrost()
-        config.input_dim = dataloader.collate_fn.feature_size
-        config.output_dim = dataloader.collate_fn.vocab_size
-        config.freeze()
+        with UpdateConfig(config):
+            config.input_dim = dataloader.collate_fn.feature_size
+            config.output_dim = dataloader.collate_fn.vocab_size

         model = cls.from_config(config)

         if checkpoint_path:

@@ -15,12 +15,13 @@ from collections import OrderedDict
 import paddle
 from paddle import nn
+from paddle.nn import functional as F

 from deepspeech.utils.log import Log

 logger = Log(__name__).getlog()

-__all__ = ["get_activation", "brelu", "LinearGLUBlock", "ConvGLUBlock"]
+__all__ = ["get_activation", "brelu", "LinearGLUBlock", "ConvGLUBlock", "GLU"]


 def brelu(x, t_min=0.0, t_max=24.0, name=None):
@@ -30,6 +31,17 @@ def brelu(x, t_min=0.0, t_max=24.0, name=None):
     return x.maximum(t_min).minimum(t_max)


+class GLU(nn.Layer):
+    """Gated Linear Units (GLU) Layer"""
+
+    def __init__(self, dim: int=-1):
+        super().__init__()
+        self.dim = dim
+
+    def forward(self, xs):
+        return F.glu(xs, axis=self.dim)
+
+
 class LinearGLUBlock(nn.Layer):
     """A linear Gated Linear Units (GLU) block."""
@@ -133,13 +145,18 @@ def get_activation(act):
     """Return activation function."""
     # Lazy load to avoid unused import
     activation_funcs = {
+        "hardshrink": paddle.nn.Hardshrink,
+        "hardswish": paddle.nn.Hardswish,
         "hardtanh": paddle.nn.Hardtanh,
         "tanh": paddle.nn.Tanh,
         "relu": paddle.nn.ReLU,
+        "relu6": paddle.nn.ReLU6,
+        "leakyrelu": paddle.nn.LeakyReLU,
         "selu": paddle.nn.SELU,
         "swish": paddle.nn.Swish,
         "gelu": paddle.nn.GELU,
-        "brelu": brelu,
+        "glu": GLU,
+        "elu": paddle.nn.ELU,
     }

     return activation_funcs[act]()

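A small usage sketch of the extended activation registry above, assuming get_activation from this module is importable; the shapes are arbitrary. Note that "glu" halves the chosen axis, unlike the purely elementwise entries.

import paddle

act = get_activation("glu")                  # GLU(dim=-1) from the table above
print(act(paddle.randn([4, 10])).shape)      # [4, 5] -- F.glu splits the last axis
print(get_activation("relu6")(paddle.randn([4, 10])).shape)  # [4, 10], elementwise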
@ -70,7 +70,7 @@ class MultiHeadedAttention(nn.Layer):
paddle.Tensor: Transformed value tensor, size paddle.Tensor: Transformed value tensor, size
(#batch, n_head, time2, d_k). (#batch, n_head, time2, d_k).
""" """
n_batch = query.size(0) n_batch = query.shape[0]
q = self.linear_q(query).view(n_batch, -1, self.h, self.d_k) q = self.linear_q(query).view(n_batch, -1, self.h, self.d_k)
k = self.linear_k(key).view(n_batch, -1, self.h, self.d_k) k = self.linear_k(key).view(n_batch, -1, self.h, self.d_k)
v = self.linear_v(value).view(n_batch, -1, self.h, self.d_k) v = self.linear_v(value).view(n_batch, -1, self.h, self.d_k)
@ -96,7 +96,7 @@ class MultiHeadedAttention(nn.Layer):
paddle.Tensor: Transformed value weighted paddle.Tensor: Transformed value weighted
by the attention score, (#batch, time1, d_model). by the attention score, (#batch, time1, d_model).
""" """
n_batch = value.size(0) n_batch = value.shape[0]
if mask is not None: if mask is not None:
mask = mask.unsqueeze(1).eq(0) # (batch, 1, *, time2) mask = mask.unsqueeze(1).eq(0) # (batch, 1, *, time2)
scores = scores.masked_fill(mask, -float('inf')) scores = scores.masked_fill(mask, -float('inf'))
@ -109,8 +109,8 @@ class MultiHeadedAttention(nn.Layer):
p_attn = self.dropout(attn) p_attn = self.dropout(attn)
x = paddle.matmul(p_attn, value) # (batch, head, time1, d_k) x = paddle.matmul(p_attn, value) # (batch, head, time1, d_k)
x = x.transpose([0, 2, 1, 3]).contiguous().view( x = x.transpose([0, 2, 1, 3]).view(n_batch, -1, self.h *
n_batch, -1, self.h * self.d_k) # (batch, time1, d_model) self.d_k) # (batch, time1, d_model)
return self.linear_out(x) # (batch, time1, d_model) return self.linear_out(x) # (batch, time1, d_model)
@ -172,15 +172,16 @@ class RelPositionMultiHeadedAttention(MultiHeadedAttention):
paddle.Tensor: Output tensor. (batch, head, time1, time1) paddle.Tensor: Output tensor. (batch, head, time1, time1)
""" """
zero_pad = paddle.zeros( zero_pad = paddle.zeros(
(x.size(0), x.size(1), x.size(2), 1), dtype=x.dtype) (x.shape[0], x.shape[1], x.shape[2], 1), dtype=x.dtype)
x_padded = paddle.cat([zero_pad, x], dim=-1) x_padded = paddle.cat([zero_pad, x], dim=-1)
x_padded = x_padded.view(x.size(0), x.size(1), x.size(3) + 1, x.size(2)) x_padded = x_padded.view(x.shape[0], x.shape[1], x.shape[3] + 1,
x.shape[2])
x = x_padded[:, :, 1:].view_as(x) # [B, H, T1, T1] x = x_padded[:, :, 1:].view_as(x) # [B, H, T1, T1]
if zero_triu: if zero_triu:
ones = paddle.ones((x.size(2), x.size(3))) ones = paddle.ones((x.shape[2], x.shape[3]))
x = x * paddle.tril(ones, x.size(3) - x.size(2))[None, None, :, :] x = x * paddle.tril(ones, x.shape[3] - x.shape[2])[None, None, :, :]
return x return x
@ -205,7 +206,7 @@ class RelPositionMultiHeadedAttention(MultiHeadedAttention):
q, k, v = self.forward_qkv(query, key, value) q, k, v = self.forward_qkv(query, key, value)
q = q.transpose([0, 2, 1, 3]) # (batch, time1, head, d_k) q = q.transpose([0, 2, 1, 3]) # (batch, time1, head, d_k)
n_batch_pos = pos_emb.size(0) n_batch_pos = pos_emb.shape[0]
p = self.linear_pos(pos_emb).view(n_batch_pos, -1, self.h, self.d_k) p = self.linear_pos(pos_emb).view(n_batch_pos, -1, self.h, self.d_k)
p = p.transpose([0, 2, 1, 3]) # (batch, head, time1, d_k) p = p.transpose([0, 2, 1, 3]) # (batch, head, time1, d_k)

@@ -113,11 +113,9 @@ class ConvBn(nn.Layer):
         # reset padding part to 0
         masks = make_non_pad_mask(x_len)  #[B, T]
         masks = masks.unsqueeze(1).unsqueeze(1)  # [B, 1, 1, T]
-        # TODO(Hui Zhang): not support bool multiply
-        # masks = masks.type_as(x)
-        masks = masks.astype(x.dtype)
-        x = x.multiply(masks)
+        # https://github.com/PaddlePaddle/Paddle/pull/29265
+        # rhs will type promote to lhs
+        x = x * masks
         return x, x_len

@@ -16,15 +16,19 @@ from paddle import nn
 from paddle.nn import functional as F
 from typeguard import check_argument_types

-from deepspeech.decoders.swig_wrapper import ctc_beam_search_decoder_batch
-from deepspeech.decoders.swig_wrapper import ctc_greedy_decoder
-from deepspeech.decoders.swig_wrapper import Scorer
 from deepspeech.modules.loss import CTCLoss
 from deepspeech.utils import ctc_utils
 from deepspeech.utils.log import Log

 logger = Log(__name__).getlog()

+try:
+    from deepspeech.decoders.swig_wrapper import ctc_beam_search_decoder_batch  # noqa: F401
+    from deepspeech.decoders.swig_wrapper import ctc_greedy_decoder  # noqa: F401
+    from deepspeech.decoders.swig_wrapper import Scorer  # noqa: F401
+except Exception as e:
+    logger.info("ctcdecoder not installed!")
+
 __all__ = ['CTCDecoder']
@@ -35,7 +39,8 @@ class CTCDecoder(nn.Layer):
                  blank_id=0,
                  dropout_rate: float=0.0,
                  reduction: bool=True,
-                 batch_average: bool=True):
+                 batch_average: bool=True,
+                 grad_norm_type: str="instance"):
         """CTC decoder

         Args:
@@ -44,6 +49,7 @@ class CTCDecoder(nn.Layer):
             dropout_rate (float): dropout rate (0.0 ~ 1.0)
             reduction (bool): reduce the CTC loss into a scalar, True for 'sum' or 'none'
             batch_average (bool): do batch dim wise average.
+            grad_norm_type (str): one of 'instance', 'batchsize', 'frame', None.
         """
         assert check_argument_types()
         super().__init__()
@@ -56,7 +62,8 @@ class CTCDecoder(nn.Layer):
         self.criterion = CTCLoss(
             blank=self.blank_id,
             reduction=reduction_type,
-            batch_average=batch_average)
+            batch_average=batch_average,
+            grad_norm_type=grad_norm_type)

         # CTCDecoder LM Score handle
         self._ext_scorer = None
@@ -132,7 +139,7 @@ class CTCDecoder(nn.Layer):
         results = []
         for i, probs in enumerate(probs_split):
             output_transcription = ctc_greedy_decoder(
-                probs_seq=probs, vocabulary=vocab_list)
+                probs_seq=probs, vocabulary=vocab_list, blank_id=self.blank_id)
             results.append(output_transcription)
         return results
@@ -212,13 +219,15 @@ class CTCDecoder(nn.Layer):
             num_processes=num_processes,
             ext_scoring_func=self._ext_scorer,
             cutoff_prob=cutoff_prob,
-            cutoff_top_n=cutoff_top_n)
+            cutoff_top_n=cutoff_top_n,
+            blank_id=self.blank_id)
         results = [result[0][1] for result in beam_search_results]
         return results

     def init_decode(self, beam_alpha, beam_beta, lang_model_path, vocab_list,
                     decoding_method):
         if decoding_method == "ctc_beam_search":
             self._init_ext_scorer(beam_alpha, beam_beta, lang_model_path,
                                   vocab_list)

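The blank_id now threaded into both decoder calls determines which symbol is dropped when CTC paths are collapsed. A toy, pure-Python illustration of that rule (the real decoding is done by the swig_wrapper functions above):

def greedy_collapse(best_path, blank_id=0):
    # drop repeats, then drop blanks -- the standard CTC best-path rule
    out, prev = [], None
    for tok in best_path:
        if tok != prev and tok != blank_id:
            out.append(tok)
        prev = tok
    return out

print(greedy_collapse([0, 3, 3, 0, 5, 5, 0, 3], blank_id=0))  # [3, 5, 3]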
@@ -122,11 +122,9 @@ class TransformerDecoder(nn.Layer):
         # tgt_mask: (B, 1, L)
         tgt_mask = (make_non_pad_mask(ys_in_lens).unsqueeze(1))
         # m: (1, L, L)
-        m = subsequent_mask(tgt_mask.size(-1)).unsqueeze(0)
+        m = subsequent_mask(tgt_mask.shape[-1]).unsqueeze(0)
         # tgt_mask: (B, L, L)
-        # TODO(Hui Zhang): not support & for tensor
-        # tgt_mask = tgt_mask & m
-        tgt_mask = tgt_mask.logical_and(m)
+        tgt_mask = tgt_mask & m

         x, _ = self.embed(tgt)
         for layer in self.decoders:
@@ -137,9 +135,7 @@ class TransformerDecoder(nn.Layer):
         if self.use_output_layer:
             x = self.output_layer(x)

-        # TODO(Hui Zhang): reduce_sum not support bool type
-        # olens = tgt_mask.sum(1)
-        olens = tgt_mask.astype(paddle.int).sum(1)
+        olens = tgt_mask.sum(1)
         return x, olens
def forward_one_step( def forward_one_step(

@ -68,7 +68,7 @@ class PositionalEncoding(nn.Layer):
paddle.Tensor: for compatibility to RelPositionalEncoding, (batch=1, time, ...) paddle.Tensor: for compatibility to RelPositionalEncoding, (batch=1, time, ...)
""" """
T = x.shape[1] T = x.shape[1]
assert offset + x.size(1) < self.max_len assert offset + x.shape[1] < self.max_len
#TODO(Hui Zhang): using T = x.size(1), __getitem__ not support Tensor #TODO(Hui Zhang): using T = x.size(1), __getitem__ not support Tensor
pos_emb = self.pe[:, offset:offset + T] pos_emb = self.pe[:, offset:offset + T]
x = x * self.xscale + pos_emb x = x * self.xscale + pos_emb
@ -114,7 +114,7 @@ class RelPositionalEncoding(PositionalEncoding):
paddle.Tensor: Encoded tensor (batch, time, `*`). paddle.Tensor: Encoded tensor (batch, time, `*`).
paddle.Tensor: Positional embedding tensor (1, time, `*`). paddle.Tensor: Positional embedding tensor (1, time, `*`).
""" """
assert offset + x.size(1) < self.max_len assert offset + x.shape[1] < self.max_len
x = x * self.xscale x = x * self.xscale
#TODO(Hui Zhang): using x.size(1), __getitem__ not support Tensor #TODO(Hui Zhang): using x.size(1), __getitem__ not support Tensor
pos_emb = self.pe[:, offset:offset + x.shape[1]] pos_emb = self.pe[:, offset:offset + x.shape[1]]

@@ -159,11 +159,10 @@ class BaseEncoder(nn.Layer):
         if self.global_cmvn is not None:
             xs = self.global_cmvn(xs)
         #TODO(Hui Zhang): self.embed(xs, masks, offset=0), stride_slice not support bool tensor
-        xs, pos_emb, masks = self.embed(xs, masks.type_as(xs), offset=0)
+        xs, pos_emb, masks = self.embed(xs, masks.astype(xs.dtype), offset=0)
         #TODO(Hui Zhang): remove mask.astype, stride_slice not support bool tensor
         masks = masks.astype(paddle.bool)
-        #TODO(Hui Zhang): mask_pad = ~masks
-        mask_pad = masks.logical_not()
+        mask_pad = ~masks
         chunk_masks = add_optional_chunk_mask(
             xs, masks, self.use_dynamic_chunk, self.use_dynamic_left_chunk,
             decoding_chunk_size, self.static_chunk_size,
@ -207,11 +206,11 @@ class BaseEncoder(nn.Layer):
chunk computation chunk computation
List[paddle.Tensor]: conformer cnn cache List[paddle.Tensor]: conformer cnn cache
""" """
assert xs.size(0) == 1 # batch size must be one assert xs.shape[0] == 1 # batch size must be one
# tmp_masks is just for interface compatibility # tmp_masks is just for interface compatibility
# TODO(Hui Zhang): stride_slice not support bool tensor # TODO(Hui Zhang): stride_slice not support bool tensor
# tmp_masks = paddle.ones([1, xs.size(1)], dtype=paddle.bool) # tmp_masks = paddle.ones([1, xs.size(1)], dtype=paddle.bool)
tmp_masks = paddle.ones([1, xs.size(1)], dtype=paddle.int32) tmp_masks = paddle.ones([1, xs.shape[1]], dtype=paddle.int32)
tmp_masks = tmp_masks.unsqueeze(1) #[B=1, C=1, T] tmp_masks = tmp_masks.unsqueeze(1) #[B=1, C=1, T]
if self.global_cmvn is not None: if self.global_cmvn is not None:
@ -221,25 +220,25 @@ class BaseEncoder(nn.Layer):
xs, tmp_masks, offset=offset) #xs=(B, T, D), pos_emb=(B=1, T, D) xs, tmp_masks, offset=offset) #xs=(B, T, D), pos_emb=(B=1, T, D)
if subsampling_cache is not None: if subsampling_cache is not None:
cache_size = subsampling_cache.size(1) #T cache_size = subsampling_cache.shape[1] #T
xs = paddle.cat((subsampling_cache, xs), dim=1) xs = paddle.cat((subsampling_cache, xs), dim=1)
else: else:
cache_size = 0 cache_size = 0
# only used when using `RelPositionMultiHeadedAttention` # only used when using `RelPositionMultiHeadedAttention`
pos_emb = self.embed.position_encoding( pos_emb = self.embed.position_encoding(
offset=offset - cache_size, size=xs.size(1)) offset=offset - cache_size, size=xs.shape[1])
if required_cache_size < 0: if required_cache_size < 0:
next_cache_start = 0 next_cache_start = 0
elif required_cache_size == 0: elif required_cache_size == 0:
next_cache_start = xs.size(1) next_cache_start = xs.shape[1]
else: else:
next_cache_start = xs.size(1) - required_cache_size next_cache_start = xs.shape[1] - required_cache_size
r_subsampling_cache = xs[:, next_cache_start:, :] r_subsampling_cache = xs[:, next_cache_start:, :]
# Real mask for transformer/conformer layers # Real mask for transformer/conformer layers
masks = paddle.ones([1, xs.size(1)], dtype=paddle.bool) masks = paddle.ones([1, xs.shape[1]], dtype=paddle.bool)
masks = masks.unsqueeze(1) #[B=1, L'=1, T] masks = masks.unsqueeze(1) #[B=1, L'=1, T]
r_elayers_output_cache = [] r_elayers_output_cache = []
r_conformer_cnn_cache = [] r_conformer_cnn_cache = []
@ -303,7 +302,7 @@ class BaseEncoder(nn.Layer):
stride = subsampling * decoding_chunk_size stride = subsampling * decoding_chunk_size
decoding_window = (decoding_chunk_size - 1) * subsampling + context decoding_window = (decoding_chunk_size - 1) * subsampling + context
num_frames = xs.size(1) num_frames = xs.shape[1]
required_cache_size = decoding_chunk_size * num_decoding_left_chunks required_cache_size = decoding_chunk_size * num_decoding_left_chunks
subsampling_cache: Optional[paddle.Tensor] = None subsampling_cache: Optional[paddle.Tensor] = None
elayers_output_cache: Optional[List[paddle.Tensor]] = None elayers_output_cache: Optional[List[paddle.Tensor]] = None
@ -319,10 +318,10 @@ class BaseEncoder(nn.Layer):
chunk_xs, offset, required_cache_size, subsampling_cache, chunk_xs, offset, required_cache_size, subsampling_cache,
elayers_output_cache, conformer_cnn_cache) elayers_output_cache, conformer_cnn_cache)
outputs.append(y) outputs.append(y)
offset += y.size(1) offset += y.shape[1]
ys = paddle.cat(outputs, 1) ys = paddle.cat(outputs, 1)
# fake mask, just for jit script and compatibility with `forward` api # fake mask, just for jit script and compatibility with `forward` api
masks = paddle.ones([1, ys.size(1)], dtype=paddle.bool) masks = paddle.ones([1, ys.shape[1]], dtype=paddle.bool)
masks = masks.unsqueeze(1) masks = masks.unsqueeze(1)
return ys, masks return ys, masks

@@ -23,11 +23,32 @@ __all__ = ['CTCLoss', "LabelSmoothingLoss"]

 class CTCLoss(nn.Layer):
-    def __init__(self, blank=0, reduction='sum', batch_average=False):
+    def __init__(self,
+                 blank=0,
+                 reduction='sum',
+                 batch_average=False,
+                 grad_norm_type=None):
         super().__init__()
         # last token id as blank id
         self.loss = nn.CTCLoss(blank=blank, reduction=reduction)
         self.batch_average = batch_average
+
+        logger.info(
+            f"CTCLoss Loss reduction: {reduction}, div-bs: {batch_average}")
+
+        # instance for norm_by_times
+        # batch for norm_by_batchsize
+        # frame for norm_by_total_logits_len
+        assert grad_norm_type in ('instance', 'batch', 'frame', None)
+        self.norm_by_times = False
+        self.norm_by_batchsize = False
+        self.norm_by_total_logits_len = False
+        logger.info(f"CTCLoss Grad Norm Type: {grad_norm_type}")
+        if grad_norm_type == 'instance':
+            self.norm_by_times = True
+        if grad_norm_type == 'batch':
+            self.norm_by_batchsize = True
+        if grad_norm_type == 'frame':
+            self.norm_by_total_logits_len = True

     def forward(self, logits, ys_pad, hlens, ys_lens):
         """Compute CTC loss.
@@ -46,10 +67,15 @@ class CTCLoss(nn.Layer):
         # warp-ctc need activation with shape [T, B, V + 1]
         # logits: (B, L, D) -> (L, B, D)
         logits = logits.transpose([1, 0, 2])
         # (TODO:Hui Zhang) ctc loss does not support int64 labels
         ys_pad = ys_pad.astype(paddle.int32)
         loss = self.loss(
-            logits, ys_pad, hlens, ys_lens, norm_by_times=self.batch_average)
+            logits,
+            ys_pad,
+            hlens,
+            ys_lens,
+            norm_by_times=self.norm_by_times,
+            norm_by_batchsize=self.norm_by_batchsize,
+            norm_by_total_logits_len=self.norm_by_total_logits_len)
         if self.batch_average:
             # Batch-size average
             loss = loss / B
@@ -124,9 +150,9 @@ class LabelSmoothingLoss(nn.Layer):
         # use zeros_like instead of torch.no_grad() for true_dist,
         # since no_grad() can not be exported by JIT
         true_dist = paddle.full_like(x, self.smoothing / (self.size - 1))
-        ignore = target == self.padding_idx  # (B,)
-        # target = target * (1 - ignore)  # avoid -1 index
+        ignore = (target == self.padding_idx)  # (B,)
+        #TODO(Hui Zhang): target = target * (1 - ignore)  # avoid -1 index
         target = target.masked_fill(ignore, 0)  # avoid -1 index
         # true_dist.scatter_(1, target.unsqueeze(1), self.confidence)
         target_mask = F.one_hot(target, self.size)
@@ -135,10 +161,8 @@ class LabelSmoothingLoss(nn.Layer):
         kl = self.criterion(F.log_softmax(x, axis=1), true_dist)
-        #TODO(Hui Zhang): sum not support bool type
-        #total = len(target) - int(ignore.sum())
-        total = len(target) - int(ignore.type_as(target).sum())
+        total = len(target) - int(ignore.sum())
         denom = total if self.normalize_length else B
-        #numer = (kl * (1 - ignore)).sum()
+        #TODO(Hui Zhang): numer = (kl * (1 - ignore)).sum()
         numer = kl.masked_fill(ignore.unsqueeze(1), 0).sum()
         return numer / denom

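A hedged construction sketch for the extended CTCLoss above; 'instance' maps to norm_by_times=True, which is what the DeepSpeech2 configs now pass down as ctc_grad_norm_type. The tensor arguments are omitted; the shapes in the comments are the ones forward() expects.

ctc_loss = CTCLoss(
    blank=0,
    reduction='sum',
    batch_average=True,
    grad_norm_type='instance')   # -> norm_by_times=True in forward()
# loss = ctc_loss(logits, ys_pad, hlens, ys_lens)
#   logits: [B, L, V+1], ys_pad: [B, U], hlens: [B], ys_lens: [B]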
@@ -69,8 +69,7 @@ def make_non_pad_mask(lengths: paddle.Tensor) -> paddle.Tensor:
                  [1, 1, 1, 0, 0],
                  [1, 1, 0, 0, 0]]
     """
-    #TODO(Hui Zhang): return ~make_pad_mask(lengths), not support ~
-    return make_pad_mask(lengths).logical_not()
+    return ~make_pad_mask(lengths)


 def subsequent_mask(size: int) -> paddle.Tensor:
@@ -92,12 +91,7 @@ def subsequent_mask(size: int) -> paddle.Tensor:
                 [1, 1, 1]]
     """
     ret = paddle.ones([size, size], dtype=paddle.bool)
-    #TODO(Hui Zhang): tril not support bool
-    #return paddle.tril(ret)
-    ret = ret.astype(paddle.float)
-    ret = paddle.tril(ret)
-    ret = ret.astype(paddle.bool)
-    return ret
+    return paddle.tril(ret)


 def subsequent_chunk_mask(
@@ -186,15 +180,13 @@ def add_optional_chunk_mask(xs: paddle.Tensor,
         chunk_masks = subsequent_chunk_mask(xs.shape[1], chunk_size,
                                             num_left_chunks)  # (L, L)
         chunk_masks = chunk_masks.unsqueeze(0)  # (1, L, L)
-        # chunk_masks = masks & chunk_masks  # (B, L, L)
-        chunk_masks = masks.logical_and(chunk_masks)  # (B, L, L)
+        chunk_masks = masks & chunk_masks  # (B, L, L)
     elif static_chunk_size > 0:
         num_left_chunks = num_decoding_left_chunks
         chunk_masks = subsequent_chunk_mask(xs.shape[1], static_chunk_size,
                                             num_left_chunks)  # (L, L)
         chunk_masks = chunk_masks.unsqueeze(0)  # (1, L, L)
-        # chunk_masks = masks & chunk_masks  # (B, L, L)
-        chunk_masks = masks.logical_and(chunk_masks)  # (B, L, L)
+        chunk_masks = masks & chunk_masks  # (B, L, L)
     else:
         chunk_masks = masks
     return chunk_masks

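To make the mask plumbing concrete, a small sketch with illustrative lengths; it assumes the Paddle behaviour these hunks rely on (bool tril, ~ and & on bool tensors).

import paddle

lengths, L = paddle.to_tensor([3, 2]), 3
non_pad = paddle.arange(L).unsqueeze(0) < lengths.unsqueeze(1)  # like make_non_pad_mask
causal = paddle.tril(paddle.ones([L, L], dtype=paddle.bool))    # like subsequent_mask(3)
chunk_masks = non_pad.unsqueeze(1) & causal.unsqueeze(0)        # as in add_optional_chunk_mask
print(chunk_masks.astype('int32').numpy())
# batch 0 keeps the full causal mask; batch 1 masks out its padded third column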
@@ -308,7 +308,7 @@ class RNNStack(nn.Layer):
             x, x_len = rnn(x, x_len)
             masks = make_non_pad_mask(x_len)  #[B, T]
             masks = masks.unsqueeze(-1)  # [B, T, 1]
-            # TODO(Hui Zhang): not support bool multiply
-            masks = masks.astype(x.dtype)
-            x = x.multiply(masks)
+            # https://github.com/PaddlePaddle/Paddle/pull/29265
+            # rhs will type promote to lhs
+            x = x * masks
         return x, x_len

@@ -14,6 +14,20 @@
 import argparse


+class ExtendAction(argparse.Action):
+    """
+    [Since Python 3.8, the "extend" action is available directly in stdlib]
+    (https://docs.python.org/3.8/library/argparse.html#action).
+    If you only have to support 3.8+, defining it yourself is no longer
+    required; using the stdlib "extend" action works exactly the same way.
+    """
+
+    def __call__(self, parser, namespace, values, option_string=None):
+        items = getattr(namespace, self.dest) or []
+        items.extend(values)
+        setattr(namespace, self.dest, items)
+
+
 def default_argument_parser():
     r"""A simple yet general argument parser for experiments with parakeet.
@@ -30,7 +44,7 @@ def default_argument_parser():
     The ``--checkpoint_path`` specifies the checkpoint to load from.

-    The ``--device`` and ``--nprocs`` specifies how to run the training.
+    The ``--nprocs`` specifies how to run the training.

     See Also
@@ -42,29 +56,53 @@ def default_argument_parser():
         the parser
     """
     parser = argparse.ArgumentParser()
+    parser.register('action', 'extend', ExtendAction)

-    # yapf: disable
-    # data and output
-    parser.add_argument("--config", metavar="FILE", help="path of the config file to overwrite to default config with.")
-    parser.add_argument("--dump-config", metavar="FILE", help="dump config to yaml file.")
-    parser.add_argument("--output", metavar="OUTPUT_DIR", help="path to save checkpoint and logs.")
-
-    # load from saved checkpoint
-    parser.add_argument("--checkpoint_path", type=str, help="path of the checkpoint to load")
-
-    # running
-    parser.add_argument("--device", type=str, default='gpu', choices=["cpu", "gpu"],
-                        help="device type to use, cpu and gpu are supported.")
-    parser.add_argument("--nprocs", type=int, default=1, help="number of parallel processes to use.")
-
-    # overwrite extra config and default config
-    # parser.add_argument("--opts", nargs=argparse.REMAINDER,
-    #     help="options to overwrite --config file and the default config, passing in KEY VALUE pairs")
-    parser.add_argument("--opts", type=str, default=[], nargs='+',
-                        help="options to overwrite --config file and the default config, passing in KEY VALUE pairs")
-
-    parser.add_argument("--seed", type=int, default=None,
-                        help="seed to use for paddle, np and random. None or 0 for random, else set seed.")
-    # yapd: enable
+    train_group = parser.add_argument_group(
+        title='Train Options', description=None)
+    train_group.add_argument(
+        "--seed",
+        type=int,
+        default=None,
+        help="seed to use for paddle, np and random. None or 0 for random, else set seed."
+    )
+    train_group.add_argument(
+        "--nprocs",
+        type=int,
+        default=1,
+        help="number of parallel processes. 0 for cpu.")
+    train_group.add_argument(
+        "--config", metavar="CONFIG_FILE", help="config file.")
+    train_group.add_argument(
+        "--output", metavar="CKPT_DIR", help="path to save checkpoint.")
+    train_group.add_argument(
+        "--checkpoint_path", type=str, help="path to load checkpoint")
+    train_group.add_argument(
+        "--opts",
+        action='extend',
+        nargs=2,
+        metavar=('key', 'val'),
+        help="overwrite --config field, passing (KEY VALUE) pairs")
+    train_group.add_argument(
+        "--dump-config", metavar="FILE", help="dump config to `this` file.")
+
+    profile_group = parser.add_argument_group(
+        title='Benchmark Options', description=None)
+    profile_group.add_argument(
+        '--profiler-options',
+        type=str,
+        default=None,
+        help='The option of profiler, which should be in format \"key1=value1;key2=value2;key3=value3\".'
+    )
+    profile_group.add_argument(
+        '--benchmark-batch-size',
+        type=int,
+        default=None,
+        help='batch size for benchmark.')
+    profile_group.add_argument(
+        '--benchmark-max-step',
+        type=int,
+        default=None,
+        help='max iteration for benchmark.')

     return parser

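A usage sketch of the rewritten parser, assuming default_argument_parser from this module is imported; the config path and option keys are placeholders. Each --opts consumes exactly one KEY VALUE pair, and the 'extend' action accumulates them.

parser = default_argument_parser()
args = parser.parse_args([
    "--config", "conf/deepspeech2.yaml",
    "--nprocs", "1",
    "--opts", "training.n_epoch", "10",
    "--opts", "data.batch_size", "32",
])
print(args.nprocs)  # 1
print(args.opts)    # ['training.n_epoch', '10', 'data.batch_size', '32']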
@@ -13,14 +13,18 @@
 # limitations under the License.
 from typing import Dict

-import extension
 import paddle
+from paddle import distributed as dist
 from paddle.io import DataLoader
 from paddle.nn import Layer

+from . import extension
 from ..reporter import DictSummary
+from ..reporter import ObsScope
 from ..reporter import report
-from ..reporter import scope
+from ..timer import Timer
+from deepspeech.utils.log import Log

+logger = Log(__name__).getlog()


 class StandardEvaluator(extension.Extension):
@@ -43,6 +47,27 @@ class StandardEvaluator(extension.Extension):
     def evaluate_core(self, batch):
         # compute
         self.model(batch)  # you may report here
+        return
+
+    def evaluate_sync(self, data):
+        # dist sync `evaluate_core` outputs
+        if data is None:
+            return
+
+        numerator, denominator = data
+        if dist.get_world_size() > 1:
+            numerator = paddle.to_tensor(numerator)
+            denominator = paddle.to_tensor(denominator)
+            # the default operator in all_reduce function is sum.
+            dist.all_reduce(numerator)
+            dist.all_reduce(denominator)
+            value = numerator / denominator
+            value = float(value)
+        else:
+            value = numerator / denominator
+
+        # used for `snapshot` to do kbest save.
+        report("VALID/LOSS", value)
+        logger.info(f"Valid: all-reduce loss {value}")

     def evaluate(self):
         # switch to eval mode
@@ -53,12 +78,16 @@ class StandardEvaluator(extension.Extension):
         summary = DictSummary()
         for batch in self.dataloader:
             observation = {}
-            with scope(observation):
+            with ObsScope(observation):
                 # main evaluation computation here.
                 with paddle.no_grad():
-                    self.evaluate_core(batch)
+                    self.evaluate_sync(self.evaluate_core(batch))
             summary.add(observation)
         summary = summary.compute_mean()
+
+        # switch to train mode
+        for model in self.models.values():
+            model.train()
         return summary

     def __call__(self, trainer=None):
@@ -66,6 +95,7 @@ class StandardEvaluator(extension.Extension):
         # if it is used to extend a trainer, the metrics is reported to
         # to observation of the trainer
         # or otherwise, you can use your own observation
-        summary = self.evaluate()
+        with Timer("Eval Time Cost: {}"):
+            summary = self.evaluate()
         for k, v in summary.items():
             report(k, v)

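Why evaluate_sync all-reduces the numerator and denominator separately rather than averaging per-rank means: a tiny worked example with made-up numbers.

rank_stats = [(9.0, 3), (10.0, 5)]            # (sum of loss, #utts) per rank
num = sum(l for l, _ in rank_stats)           # 19.0 -- all_reduce of total_loss
den = sum(n for _, n in rank_stats)           # 8    -- all_reduce of num_seen_utts
print(num / den)                              # 2.375, the true utterance-level mean
print(sum(l / n for l, n in rank_stats) / 2)  # 2.5, biased mean-of-means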
@@ -20,8 +20,9 @@ from typing import List

 import jsonlines

-from deepspeech.training.extensions import extension
-from deepspeech.training.updaters.trainer import Trainer
+from . import extension
+from ..reporter import get_observations
+from ..updaters.trainer import Trainer
 from deepspeech.utils.log import Log
 from deepspeech.utils.mp_tools import rank_zero_only
@@ -52,8 +53,19 @@ class Snapshot(extension.Extension):
     priority = -100
     default_name = "snapshot"

-    def __init__(self, max_size: int=5, snapshot_on_error: bool=False):
+    def __init__(self,
+                 mode='latest',
+                 max_size: int=5,
+                 indicator=None,
+                 less_better=True,
+                 snapshot_on_error: bool=False):
         self.records: List[Dict[str, Any]] = []
+        assert mode in ('latest', 'kbest'), mode
+        if mode == 'kbest':
+            assert indicator is not None
+        self.mode = mode
+        self.indicator = indicator
+        self.less_is_better = less_better
         self.max_size = max_size
         self._snapshot_on_error = snapshot_on_error
         self._save_all = (max_size == -1)
@@ -66,16 +78,17 @@ class Snapshot(extension.Extension):
         # load existing records
         record_path: Path = self.checkpoint_dir / "records.jsonl"
         if record_path.exists():
-            logger.debug("Loading from an existing checkpoint dir")
             self.records = load_records(record_path)
-            trainer.updater.load(self.records[-1]['path'])
+            ckpt_path = self.records[-1]['path']
+            logger.info(f"Loading from an existing checkpoint {ckpt_path}")
+            trainer.updater.load(ckpt_path)

     def on_error(self, trainer, exc, tb):
         if self._snapshot_on_error:
-            self.save_checkpoint_and_update(trainer)
+            self.save_checkpoint_and_update(trainer, 'latest')

     def __call__(self, trainer: Trainer):
-        self.save_checkpoint_and_update(trainer)
+        self.save_checkpoint_and_update(trainer, self.mode)

     def full(self):
         """Whether the number of snapshots it keeps track of is greater
@@ -83,12 +96,12 @@ class Snapshot(extension.Extension):
         return (not self._save_all) and len(self.records) > self.max_size

     @rank_zero_only
-    def save_checkpoint_and_update(self, trainer: Trainer):
+    def save_checkpoint_and_update(self, trainer: Trainer, mode: str):
         """Saving new snapshot and remove the oldest snapshot if needed."""
         iteration = trainer.updater.state.iteration
         epoch = trainer.updater.state.epoch
         num = epoch if self.trigger[1] == 'epoch' else iteration
-        path = self.checkpoint_dir / f"{num}.pdz"
+        path = self.checkpoint_dir / f"{num}.np"

         # add the new one
         trainer.updater.save(path)
@@ -97,11 +110,17 @@ class Snapshot(extension.Extension):
             'path': str(path.resolve()),  # use absolute path
             'iteration': iteration,
             'epoch': epoch,
+            'indicator': get_observations()[self.indicator]
         }
         self.records.append(record)
# remove the earliest # remove the earliest
if self.full(): if self.full():
if mode == 'kbest':
self.records = sorted(
self.records,
key=lambda record: record['indicator'],
reverse=not self.less_is_better)
eariest_record = self.records[0] eariest_record = self.records[0]
os.remove(eariest_record["path"]) os.remove(eariest_record["path"])
self.records.pop(0) self.records.pop(0)

@ -11,8 +11,10 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
from deepspeech.training.extensions import extension from visualdl import LogWriter
from deepspeech.training.updaters.trainer import Trainer
from . import extension
from ..updaters.trainer import Trainer
class VisualDL(extension.Extension): class VisualDL(extension.Extension):
@ -26,8 +28,8 @@ class VisualDL(extension.Extension):
default_name = 'visualdl' default_name = 'visualdl'
priority = extension.PRIORITY_READER priority = extension.PRIORITY_READER
def __init__(self, writer): def __init__(self, output_dir):
self.writer = writer self.writer = LogWriter(str(output_dir))
def __call__(self, trainer: Trainer): def __call__(self, trainer: Trainer):
for k, v in trainer.observation.items(): for k, v in trainer.observation.items():

@ -47,7 +47,7 @@ class ClipGradByGlobalNormWithLog(paddle.nn.ClipGradByGlobalNorm):
sum_square = layers.reduce_sum(square) sum_square = layers.reduce_sum(square)
sum_square_list.append(sum_square) sum_square_list.append(sum_square)
# debug log # debug log, do not dump all since it slows down the training process
if i < 10: if i < 10:
logger.debug( logger.debug(
f"Grad Before Clip: {p.name}: {float(sum_square.sqrt()) }") f"Grad Before Clip: {p.name}: {float(sum_square.sqrt()) }")
@ -76,7 +76,7 @@ class ClipGradByGlobalNormWithLog(paddle.nn.ClipGradByGlobalNorm):
new_grad = layers.elementwise_mul(x=g, y=clip_var) new_grad = layers.elementwise_mul(x=g, y=clip_var)
params_and_grads.append((p, new_grad)) params_and_grads.append((p, new_grad))
# debug log # debug log, do not dump all since it slows down the training process
if i < 10: if i < 10:
logger.debug( logger.debug(
f"Grad After Clip: {p.name}: {float(new_grad.square().sum().sqrt())}" f"Grad After Clip: {p.name}: {float(new_grad.square().sum().sqrt())}"

@ -19,7 +19,7 @@ OBSERVATIONS = None
@contextlib.contextmanager @contextlib.contextmanager
def scope(observations): def ObsScope(observations):
# make `observation` the target to report to. # make `observation` the target to report to.
# it is basically a dictionary that stores temporary observations # it is basically a dictionary that stores temporary observations
global OBSERVATIONS global OBSERVATIONS

@ -0,0 +1,50 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import datetime
import time
from deepspeech.utils.log import Log
__all__ = ["Timer"]
logger = Log(__name__).getlog()
class Timer():
"""To be used like this:
with Timer("Message") as value:
do something
"""
def __init__(self, message=None):
self.message = message
def duration(self) -> str:
elapsed_time = time.time() - self.start
time_str = str(datetime.timedelta(seconds=elapsed_time))
return time_str
def __enter__(self):
self.start = time.time()
return self
def __exit__(self, type, value, traceback):
if self.message:
logger.info(self.message.format(self.duration()))
def __call__(self) -> float:
return time.time() - self.start
def __str__(self):
return self.duration()

@ -11,17 +11,24 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import sys
import time import time
from collections import OrderedDict
from pathlib import Path from pathlib import Path
import paddle import paddle
from paddle import distributed as dist from paddle import distributed as dist
from tensorboardX import SummaryWriter from tensorboardX import SummaryWriter
from deepspeech.training.reporter import ObsScope
from deepspeech.training.reporter import report
from deepspeech.training.timer import Timer
from deepspeech.utils import mp_tools from deepspeech.utils import mp_tools
from deepspeech.utils import profiler
from deepspeech.utils.checkpoint import Checkpoint from deepspeech.utils.checkpoint import Checkpoint
from deepspeech.utils.log import Log from deepspeech.utils.log import Log
from deepspeech.utils.utility import seed_all from deepspeech.utils.utility import seed_all
from deepspeech.utils.utility import UpdateConfig
__all__ = ["Trainer"] __all__ = ["Trainer"]
@ -79,7 +86,7 @@ class Trainer():
>>> config.merge_from_list(args.opts) >>> config.merge_from_list(args.opts)
>>> config.freeze() >>> config.freeze()
>>> >>>
>>> if args.nprocs > 1 and args.device == "gpu": >>> if args.nprocs > 0:
>>> dist.spawn(main_sp, args=(config, args), nprocs=args.nprocs) >>> dist.spawn(main_sp, args=(config, args), nprocs=args.nprocs)
>>> else: >>> else:
>>> main_sp(config, args) >>> main_sp(config, args)
@ -94,15 +101,25 @@ class Trainer():
self.checkpoint_dir = None self.checkpoint_dir = None
self.iteration = 0 self.iteration = 0
self.epoch = 0 self.epoch = 0
self.rank = dist.get_rank()
logger.info(f"Rank: {self.rank}/{dist.get_world_size()}")
if args.seed: if args.seed:
seed_all(args.seed) seed_all(args.seed)
logger.info(f"Set seed {args.seed}") logger.info(f"Set seed {args.seed}")
if self.args.benchmark_batch_size:
with UpdateConfig(self.config):
self.config.collator.batch_size = self.args.benchmark_batch_size
self.config.training.log_interval = 1
logger.info(
f"Benchmark reset batch-size: {self.args.benchmark_batch_size}")
def setup(self): def setup(self):
"""Setup the experiment. """Setup the experiment.
""" """
paddle.set_device(self.args.device) paddle.set_device('gpu' if self.args.nprocs > 0 else 'cpu')
if self.parallel: if self.parallel:
self.init_parallel() self.init_parallel()
@ -122,7 +139,7 @@ class Trainer():
"""A flag indicating whether the experiment should run with """A flag indicating whether the experiment should run with
multiprocessing. multiprocessing.
""" """
return self.args.device == "gpu" and self.args.nprocs > 1 return self.args.nprocs > 0
def init_parallel(self): def init_parallel(self):
"""Init environment for multiprocess training. """Init environment for multiprocess training.
@ -162,56 +179,97 @@ class Trainer():
checkpoint_dir=self.checkpoint_dir, checkpoint_dir=self.checkpoint_dir,
checkpoint_path=self.args.checkpoint_path) checkpoint_path=self.args.checkpoint_path)
if infos: if infos:
# restore from ckpt # just restore ckpt
# lr will restore from optimizer ckpt
self.iteration = infos["step"] self.iteration = infos["step"]
self.epoch = infos["epoch"] self.epoch = infos["epoch"]
scratch = False scratch = False
logger.info(
f"Restore ckpt: epoch {self.epoch }, step {self.iteration}!")
else: else:
self.iteration = 0 self.iteration = 0
self.epoch = 0 self.epoch = 0
scratch = True scratch = True
logger.info("Init from scratch!")
return scratch return scratch
def new_epoch(self): def maybe_batch_sampler_step(self):
"""Reset the train loader seed and increment `epoch`. """ batch_sampler seed by epoch """
""" if hasattr(self.train_loader, "batch_sampler"):
self.epoch += 1
if self.parallel and hasattr(self.train_loader, "batch_sampler"):
batch_sampler = self.train_loader.batch_sampler batch_sampler = self.train_loader.batch_sampler
if isinstance(batch_sampler, paddle.io.DistributedBatchSampler): if isinstance(batch_sampler, paddle.io.DistributedBatchSampler):
batch_sampler.set_epoch(self.epoch) batch_sampler.set_epoch(self.epoch)
def train(self): def before_train(self):
"""The training process control by epoch."""
from_scratch = self.resume_or_scratch() from_scratch = self.resume_or_scratch()
if from_scratch: if from_scratch:
# save init model, i.e. 0 epoch # scratch: save init model, i.e. 0 epoch
self.save(tag='init', infos=None) self.save(tag='init', infos=None)
self.lr_scheduler.step(self.epoch) else:
if self.parallel and hasattr(self.train_loader, "batch_sampler"): # resume: train next_epoch and next_iteration
self.train_loader.batch_sampler.set_epoch(self.epoch) self.epoch += 1
self.iteration += 1
logger.info(
f"Resume train: epoch {self.epoch }, step {self.iteration}!")
self.maybe_batch_sampler_step()
def new_epoch(self):
"""Reset the train loader seed and increment `epoch`.
"""
# `iteration` increased by train step
self.epoch += 1
self.maybe_batch_sampler_step()
def after_train_batch(self):
if self.args.benchmark_max_step and self.iteration > self.args.benchmark_max_step:
profiler.add_profiler_step(self.args.profiler_options)
logger.info(
f"Reach benchmark-max-step: {self.args.benchmark_max_step}")
sys.exit(
f"Reach benchmark-max-step: {self.args.benchmark_max_step}")
def train(self):
"""The training process control by epoch."""
self.before_train()
logger.info(f"Train Total Examples: {len(self.train_loader.dataset)}") logger.info(f"Train Total Examples: {len(self.train_loader.dataset)}")
while self.epoch < self.config.training.n_epoch: while self.epoch < self.config.training.n_epoch:
with Timer("Epoch-Train Time Cost: {}"):
self.model.train() self.model.train()
try: try:
data_start_time = time.time() data_start_time = time.time()
for batch_index, batch in enumerate(self.train_loader): for batch_index, batch in enumerate(self.train_loader):
dataload_time = time.time() - data_start_time dataload_time = time.time() - data_start_time
msg = "Train: Rank: {}, ".format(dist.get_rank()) msg = "Train:"
msg += "epoch: {}, ".format(self.epoch) observation = OrderedDict()
msg += "step: {}, ".format(self.iteration) with ObsScope(observation):
msg += "batch : {}/{}, ".format(batch_index + 1, report("Rank", dist.get_rank())
len(self.train_loader)) report("epoch", self.epoch)
msg += "lr: {:>.8f}, ".format(self.lr_scheduler()) report('step', self.iteration)
msg += "data time: {:>.3f}s, ".format(dataload_time) report("lr", self.lr_scheduler())
self.train_batch(batch_index, batch, msg) self.train_batch(batch_index, batch, msg)
self.after_train_batch()
report('iter', batch_index + 1)
report('total', len(self.train_loader))
report('reader_cost', dataload_time)
observation['batch_cost'] = observation[
'reader_cost'] + observation['step_cost']
observation['samples'] = observation['batch_size']
observation['ips[sent./sec]'] = observation[
'batch_size'] / observation['batch_cost']
for k, v in observation.items():
msg += f" {k}: "
msg += f"{v:>.8f}" if isinstance(v,
float) else f"{v}"
msg += ","
logger.info(msg)
data_start_time = time.time() data_start_time = time.time()
except Exception as e: except Exception as e:
logger.error(e) logger.error(e)
raise e raise e
with Timer("Eval Time Cost: {}"):
total_loss, num_seen_utts = self.valid() total_loss, num_seen_utts = self.valid()
if dist.get_world_size() > 1: if dist.get_world_size() > 1:
num_seen_utts = paddle.to_tensor(num_seen_utts) num_seen_utts = paddle.to_tensor(num_seen_utts)
@ -231,6 +289,7 @@ class Trainer():
'epoch', {'cv_loss': cv_loss, 'epoch', {'cv_loss': cv_loss,
'lr': self.lr_scheduler()}, self.epoch) 'lr': self.lr_scheduler()}, self.epoch)
# after epoch
self.save(tag=self.epoch, infos={'val_loss': cv_loss}) self.save(tag=self.epoch, infos={'val_loss': cv_loss})
# step lr every epoch # step lr every epoch
self.lr_scheduler.step() self.lr_scheduler.step()
@ -240,14 +299,13 @@ class Trainer():
"""The routine of the experiment after setup. This method is intended """The routine of the experiment after setup. This method is intended
to be used by the user. to be used by the user.
""" """
with Timer("Training Done: {}"):
try: try:
self.train() self.train()
except KeyboardInterrupt: except KeyboardInterrupt:
self.save()
exit(-1) exit(-1)
finally: finally:
self.destory() self.destory()
logger.info("Training Done.")
def setup_output_dir(self): def setup_output_dir(self):
"""Create a directory used for output. """Create a directory used for output.

@ -14,12 +14,12 @@
from typing import Dict from typing import Dict
from typing import Optional from typing import Optional
from paddle import Tensor import paddle
from paddle.io import DataLoader from paddle.io import DataLoader
from paddle.io import DistributedBatchSampler from paddle.io import DistributedBatchSampler
from paddle.nn import Layer from paddle.nn import Layer
from paddle.optimizer import Optimizer from paddle.optimizer import Optimizer
from timer import timer from paddle.optimizer.lr import LRScheduler
from deepspeech.training.reporter import report from deepspeech.training.reporter import report
from deepspeech.training.updaters.updater import UpdaterBase from deepspeech.training.updaters.updater import UpdaterBase
@ -39,8 +39,10 @@ class StandardUpdater(UpdaterBase):
def __init__(self, def __init__(self,
model: Layer, model: Layer,
optimizer: Optimizer, optimizer: Optimizer,
scheduler: LRScheduler,
dataloader: DataLoader, dataloader: DataLoader,
init_state: Optional[UpdaterState]=None): init_state: Optional[UpdaterState]=None):
super().__init__(init_state)
# it is designed to hold multiple models # it is designed to hold multiple models
models = {"main": model} models = {"main": model}
self.models: Dict[str, Layer] = models self.models: Dict[str, Layer] = models
@ -51,15 +53,14 @@ class StandardUpdater(UpdaterBase):
self.optimizer = optimizer self.optimizer = optimizer
self.optimizers: Dict[str, Optimizer] = optimizers self.optimizers: Dict[str, Optimizer] = optimizers
# it is designed to hold multiple scheduler
schedulers = {"main": scheduler}
self.scheduler = scheduler
self.schedulers: Dict[str, LRScheduler] = schedulers
# dataloaders # dataloaders
self.dataloader = dataloader self.dataloader = dataloader
# init state
if init_state is None:
self.state = UpdaterState()
else:
self.state = init_state
self.train_iterator = iter(dataloader) self.train_iterator = iter(dataloader)
def update(self): def update(self):
@ -103,7 +104,9 @@ class StandardUpdater(UpdaterBase):
model.train() model.train()
# training for a step is implemented here # training for a step is implemented here
with Timer("data time cost:{}"):
batch = self.read_batch() batch = self.read_batch()
with Timer("step time cost:{}"):
self.update_core(batch) self.update_core(batch)
self.state.iteration += 1 self.state.iteration += 1
@ -115,13 +118,14 @@ class StandardUpdater(UpdaterBase):
"""A simple case for a training step. Basic assumptions are: """A simple case for a training step. Basic assumptions are:
Single model; Single model;
Single optimizer; Single optimizer;
Single scheduler, and update learning rate each step;
A batch from the dataloader is just the input of the model; A batch from the dataloader is just the input of the model;
The model returns a single loss, or a dict containing several losses. The model returns a single loss, or a dict containing several losses.
Parameters updates at every batch, no gradient accumulation. Parameters updates at every batch, no gradient accumulation.
""" """
loss = self.model(*batch) loss = self.model(*batch)
if isinstance(loss, Tensor): if isinstance(loss, paddle.Tensor):
loss_dict = {"main": loss} loss_dict = {"main": loss}
else: else:
# Dict[str, Tensor] # Dict[str, Tensor]
@ -135,14 +139,15 @@ class StandardUpdater(UpdaterBase):
for name, loss_item in loss_dict.items(): for name, loss_item in loss_dict.items():
report(name, float(loss_item)) report(name, float(loss_item))
self.optimizer.clear_gradient() self.optimizer.clear_grad()
loss_dict["main"].backward() loss_dict["main"].backward()
self.optimizer.update() self.optimizer.step()
self.scheduler.step()
@property @property
def updates_per_epoch(self): def updates_per_epoch(self):
"""Number of updater per epoch, determined by the length of the """Number of steps per epoch,
dataloader.""" determined by the length of the dataloader."""
length_of_dataloader = None length_of_dataloader = None
try: try:
length_of_dataloader = len(self.dataloader) length_of_dataloader = len(self.dataloader)
@ -163,18 +168,16 @@ class StandardUpdater(UpdaterBase):
def read_batch(self): def read_batch(self):
"""Read a batch from the data loader, auto renew when data is exhausted.""" """Read a batch from the data loader, auto renew when data is exhausted."""
with timer() as t:
try: try:
batch = next(self.train_iterator) batch = next(self.train_iterator)
except StopIteration: except StopIteration:
self.new_epoch() self.new_epoch()
batch = next(self.train_iterator) batch = next(self.train_iterator)
logger.debug(
f"Read a batch takes {t.elapse}s.") # replace it with logger
return batch return batch
def state_dict(self): def state_dict(self):
"""State dict of a Updater, model, optimizer and updater state are included.""" """State dict of a Updater, model, optimizers/schedulers
and updater state are included."""
state_dict = super().state_dict() state_dict = super().state_dict()
for name, model in self.models.items(): for name, model in self.models.items():
state_dict[f"{name}_params"] = model.state_dict() state_dict[f"{name}_params"] = model.state_dict()
@ -184,7 +187,7 @@ class StandardUpdater(UpdaterBase):
def set_state_dict(self, state_dict): def set_state_dict(self, state_dict):
"""Set state dict for a Updater. Parameters of models, states for """Set state dict for a Updater. Parameters of models, states for
optimizers and UpdaterState are restored.""" optimizers/schedulers and UpdaterState are restored."""
for name, model in self.models.items(): for name, model in self.models.items():
model.set_state_dict(state_dict[f"{name}_params"]) model.set_state_dict(state_dict[f"{name}_params"])
for name, optim in self.optimizers.items(): for name, optim in self.optimizers.items():

@ -24,7 +24,7 @@ import tqdm
from deepspeech.training.extensions.extension import Extension from deepspeech.training.extensions.extension import Extension
from deepspeech.training.extensions.extension import PRIORITY_READER from deepspeech.training.extensions.extension import PRIORITY_READER
from deepspeech.training.reporter import scope from deepspeech.training.reporter import ObsScope
from deepspeech.training.triggers import get_trigger from deepspeech.training.triggers import get_trigger
from deepspeech.training.triggers.limit_trigger import LimitTrigger from deepspeech.training.triggers.limit_trigger import LimitTrigger
from deepspeech.training.updaters.updater import UpdaterBase from deepspeech.training.updaters.updater import UpdaterBase
@ -140,11 +140,11 @@ class Trainer():
try: try:
while not stop_trigger(self): while not stop_trigger(self):
self.observation = {} self.observation = {}
# set observation as the report target # set observation as the `report` target
# you can use report freely in Updater.update() # you can use `report` freely in Updater.update()
# updating parameters and state # updating parameters and state
with scope(self.observation): with ObsScope(self.observation):
update() update()
p.update() p.update()

@ -52,6 +52,7 @@ class UpdaterBase():
""" """
def __init__(self, init_state=None): def __init__(self, init_state=None):
# init state
if init_state is None: if init_state is None:
self.state = UpdaterState() self.state = UpdaterState()
else: else:

@ -114,13 +114,13 @@ class Checkpoint():
params_path = checkpoint_path + ".pdparams" params_path = checkpoint_path + ".pdparams"
model_dict = paddle.load(params_path) model_dict = paddle.load(params_path)
model.set_state_dict(model_dict) model.set_state_dict(model_dict)
logger.info("Rank {}: loaded model from {}".format(rank, params_path)) logger.info("Rank {}: Restore model from {}".format(rank, params_path))
optimizer_path = checkpoint_path + ".pdopt" optimizer_path = checkpoint_path + ".pdopt"
if optimizer and os.path.isfile(optimizer_path): if optimizer and os.path.isfile(optimizer_path):
optimizer_dict = paddle.load(optimizer_path) optimizer_dict = paddle.load(optimizer_path)
optimizer.set_state_dict(optimizer_dict) optimizer.set_state_dict(optimizer_dict)
logger.info("Rank {}: loaded optimizer state from {}".format( logger.info("Rank {}: Restore optimizer state from {}".format(
rank, optimizer_path)) rank, optimizer_path))
info_path = re.sub('.pdparams$', '.json', params_path) info_path = re.sub('.pdparams$', '.json', params_path)

@ -84,19 +84,19 @@ def forced_align(ctc_probs: paddle.Tensor, y: paddle.Tensor,
y_insert_blank = insert_blank(y, blank_id) #(2L+1) y_insert_blank = insert_blank(y, blank_id) #(2L+1)
log_alpha = paddle.zeros( log_alpha = paddle.zeros(
(ctc_probs.size(0), len(y_insert_blank))) #(T, 2L+1) (ctc_probs.shape[0], len(y_insert_blank))) #(T, 2L+1)
log_alpha = log_alpha - float('inf') # log of zero log_alpha = log_alpha - float('inf') # log of zero
# TODO(Hui Zhang): zeros not support paddle.int16
# self.__setitem_varbase__(item, value) When assign a value to a paddle.Tensor, the data type of the paddle.Tensor not support int16
state_path = (paddle.zeros( state_path = (paddle.zeros(
(ctc_probs.size(0), len(y_insert_blank)), dtype=paddle.int32) - 1 (ctc_probs.shape[0], len(y_insert_blank)), dtype=paddle.int32) - 1
) # state path, Tuple((T, 2L+1)) ) # state path, Tuple((T, 2L+1))
# init start state # init start state
# TODO(Hui Zhang): VarBase.__getitem__() not support np.int64 log_alpha[0, 0] = ctc_probs[0][y_insert_blank[0]] # State-b, Sb
log_alpha[0, 0] = ctc_probs[0][int(y_insert_blank[0])] # State-b, Sb log_alpha[0, 1] = ctc_probs[0][y_insert_blank[1]] # State-nb, Snb
log_alpha[0, 1] = ctc_probs[0][int(y_insert_blank[1])] # State-nb, Snb
for t in range(1, ctc_probs.size(0)): # T for t in range(1, ctc_probs.shape[0]): # T
for s in range(len(y_insert_blank)): # 2L+1 for s in range(len(y_insert_blank)): # 2L+1
if y_insert_blank[s] == blank_id or s < 2 or y_insert_blank[ if y_insert_blank[s] == blank_id or s < 2 or y_insert_blank[
s] == y_insert_blank[s - 2]: s] == y_insert_blank[s - 2]:
@ -110,13 +110,11 @@ def forced_align(ctc_probs: paddle.Tensor, y: paddle.Tensor,
log_alpha[t - 1, s - 2], log_alpha[t - 1, s - 2],
]) ])
prev_state = [s, s - 1, s - 2] prev_state = [s, s - 1, s - 2]
# TODO(Hui Zhang): VarBase.__getitem__() not support np.int64 log_alpha[t, s] = paddle.max(candidates) + ctc_probs[t][
log_alpha[t, s] = paddle.max(candidates) + ctc_probs[t][int( y_insert_blank[s]]
y_insert_blank[s])]
state_path[t, s] = prev_state[paddle.argmax(candidates)] state_path[t, s] = prev_state[paddle.argmax(candidates)]
# self.__setitem_varbase__(item, value) When assign a value to a paddle.Tensor, the data type of the paddle.Tensor not support int16
# TODO(Hui Zhang): zeros not support paddle.int16 state_seq = -1 * paddle.ones((ctc_probs.shape[0], 1), dtype=paddle.int32)
state_seq = -1 * paddle.ones((ctc_probs.size(0), 1), dtype=paddle.int32)
candidates = paddle.to_tensor([ candidates = paddle.to_tensor([
log_alpha[-1, len(y_insert_blank) - 1], # Sb log_alpha[-1, len(y_insert_blank) - 1], # Sb
@ -124,11 +122,11 @@ def forced_align(ctc_probs: paddle.Tensor, y: paddle.Tensor,
]) ])
prev_state = [len(y_insert_blank) - 1, len(y_insert_blank) - 2] prev_state = [len(y_insert_blank) - 1, len(y_insert_blank) - 2]
state_seq[-1] = prev_state[paddle.argmax(candidates)] state_seq[-1] = prev_state[paddle.argmax(candidates)]
for t in range(ctc_probs.size(0) - 2, -1, -1): for t in range(ctc_probs.shape[0] - 2, -1, -1):
state_seq[t] = state_path[t + 1, state_seq[t + 1, 0]] state_seq[t] = state_path[t + 1, state_seq[t + 1, 0]]
output_alignment = [] output_alignment = []
for t in range(0, ctc_probs.size(0)): for t in range(0, ctc_probs.shape[0]):
output_alignment.append(y_insert_blank[state_seq[t, 0]]) output_alignment.append(y_insert_blank[state_seq[t, 0]])
return output_alignment return output_alignment

@ -12,19 +12,13 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import getpass import getpass
import logging
import os import os
import socket import socket
import sys import sys
from loguru import logger
from paddle import inference from paddle import inference
FORMAT_STR = '[%(levelname)s %(asctime)s %(filename)s:%(lineno)d] %(message)s'
DATE_FMT_STR = '%Y/%m/%d %H:%M:%S'
logging.basicConfig(
level=logging.DEBUG, format=FORMAT_STR, datefmt=DATE_FMT_STR)
def find_log_dir(log_dir=None): def find_log_dir(log_dir=None):
"""Returns the most suitable directory to put log files into. """Returns the most suitable directory to put log files into.
@ -98,59 +92,28 @@ def find_log_dir_and_names(program_name=None, log_dir=None):
class Log(): class Log():
"""Default Logger for all."""
log_name = None logger.remove()
logger.add(
def __init__(self, logger=None): sys.stdout,
self.logger = logging.getLogger(logger) level='INFO',
self.logger.setLevel(logging.DEBUG) enqueue=True,
filter=lambda record: record['level'].no >= 20)
file_dir = os.getcwd() + '/log' _, file_prefix, _ = find_log_dir_and_names()
if not os.path.exists(file_dir): sink_prefix = os.path.join("exp/log", file_prefix)
os.mkdir(file_dir) sink_path = sink_prefix[:-3] + "{time}.log"
self.log_dir = file_dir logger.add(sink_path, level='DEBUG', enqueue=True, rotation="500 MB")
actual_log_dir, file_prefix, symlink_prefix = find_log_dir_and_names( def __init__(self, name=None):
program_name=None, log_dir=self.log_dir)
basename = '%s.DEBUG.%d' % (file_prefix, os.getpid())
filename = os.path.join(actual_log_dir, basename)
if Log.log_name is None:
Log.log_name = filename
# Create a symlink to the log file with a canonical name.
symlink = os.path.join(actual_log_dir, symlink_prefix + '.DEBUG')
try:
if os.path.islink(symlink):
os.unlink(symlink)
os.symlink(os.path.basename(Log.log_name), symlink)
except EnvironmentError:
# If it fails, we're sad but it's no error. Commonly, this
# fails because the symlink was created by another user and so
# we can't modify it
pass pass
if not self.logger.hasHandlers():
formatter = logging.Formatter(fmt=FORMAT_STR, datefmt=DATE_FMT_STR)
fh = logging.FileHandler(Log.log_name)
fh.setLevel(logging.DEBUG)
fh.setFormatter(formatter)
self.logger.addHandler(fh)
ch = logging.StreamHandler()
ch.setLevel(logging.INFO)
ch.setFormatter(formatter)
self.logger.addHandler(ch)
# stop propagate for propagating may print
# log multiple times
self.logger.propagate = False
def getlog(self): def getlog(self):
return self.logger return logger
class Autolog: class Autolog:
"""Just used by fullchain project"""
def __init__(self, def __init__(self,
batch_size, batch_size,
model_name="DeepSpeech", model_name="DeepSpeech",

@ -0,0 +1,119 @@
# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import paddle
from deepspeech.utils.log import Log
logger = Log(__name__).getlog()
# A global variable to record the number of calling times for profiler
# functions. It is used to specify the tracing range of training steps.
_profiler_step_id = 0
# A global variable to avoid parsing from string every time.
_profiler_options = None
class ProfilerOptions(object):
'''
Use a string to initialize a ProfilerOptions.
The string should be in the format: "key1=value1;key2=value;key3=value3".
For example:
"profile_path=model.profile"
"batch_range=[50, 60]; profile_path=model.profile"
"batch_range=[50, 60]; tracer_option=OpDetail; profile_path=model.profile"
ProfilerOptions supports following key-value pair:
batch_range - an integer list, e.g. [100, 110].
state - a string, the optional values are 'CPU', 'GPU' or 'All'.
sorted_key - a string, the optional values are 'calls', 'total',
'max', 'min' or 'ave'.
tracer_option - a string, the optional values are 'Default', 'OpDetail',
'AllOpDetail'.
profile_path - a string, the path to save the serialized profile data,
which can be used to generate a timeline.
exit_on_finished - a boolean.
'''
def __init__(self, options_str):
assert isinstance(options_str, str)
self._options = {
'batch_range': [10, 20],
'state': 'All',
'sorted_key': 'total',
'tracer_option': 'Default',
'profile_path': '/tmp/profile',
'exit_on_finished': True
}
self._parse_from_string(options_str)
def _parse_from_string(self, options_str):
if not options_str:
return
for kv in options_str.replace(' ', '').split(';'):
key, value = kv.split('=')
if key == 'batch_range':
value_list = value.replace('[', '').replace(']', '').split(',')
value_list = list(map(int, value_list))
if len(value_list) >= 2 and value_list[0] >= 0 and value_list[
1] > value_list[0]:
self._options[key] = value_list
elif key == 'exit_on_finished':
self._options[key] = value.lower() in ("yes", "true", "t", "1")
elif key in [
'state', 'sorted_key', 'tracer_option', 'profile_path'
]:
self._options[key] = value
def __getitem__(self, name):
if self._options.get(name, None) is None:
raise ValueError(
"ProfilerOptions does not have an option named %s." % name)
return self._options[name]
def add_profiler_step(options_str=None):
'''
Enable the operator-level timing using PaddlePaddle's profiler.
The profiler uses an independent variable to count the profiler steps.
One call of this function is treated as a profiler step.
Args:
options_str - a string used to initialize the ProfilerOptions.
Default is None, and the profiler is disabled.
'''
if options_str is None:
return
global _profiler_step_id
global _profiler_options
if _profiler_options is None:
_profiler_options = ProfilerOptions(options_str)
logger.info(f"Profiler: {options_str}")
logger.info(f"Profiler: {_profiler_options._options}")
if _profiler_step_id == _profiler_options['batch_range'][0]:
paddle.utils.profiler.start_profiler(_profiler_options['state'],
_profiler_options['tracer_option'])
elif _profiler_step_id == _profiler_options['batch_range'][1]:
paddle.utils.profiler.stop_profiler(_profiler_options['sorted_key'],
_profiler_options['profile_path'])
if _profiler_options['exit_on_finished']:
sys.exit(0)
_profiler_step_id += 1
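# Example usage (illustrative only, not part of this module; `loader` and
# `train_one_step` are hypothetical names): call it once per training step, e.g.
#
#     for step, batch in enumerate(loader):
#         train_one_step(batch)
#         add_profiler_step("batch_range=[10, 20]; profile_path=/tmp/profile")
#
# Profiling starts when the internal step counter reaches 10, stops at 20, dumps
# the result to /tmp/profile and, since exit_on_finished defaults to True, exits.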

@ -83,7 +83,7 @@ def pad_sequence(sequences: List[paddle.Tensor],
# (TODO Hui Zhang): slice not supprot `end==start` # (TODO Hui Zhang): slice not supprot `end==start`
# trailing_dims = max_size[1:] # trailing_dims = max_size[1:]
trailing_dims = max_size[1:] if max_size.ndim >= 2 else () trailing_dims = max_size[1:] if max_size.ndim >= 2 else ()
max_len = max([s.size(0) for s in sequences]) max_len = max([s.shape[0] for s in sequences])
if batch_first: if batch_first:
out_dims = (len(sequences), max_len) + trailing_dims out_dims = (len(sequences), max_len) + trailing_dims
else: else:
@ -91,12 +91,22 @@ def pad_sequence(sequences: List[paddle.Tensor],
out_tensor = sequences[0].new_full(out_dims, padding_value) out_tensor = sequences[0].new_full(out_dims, padding_value)
for i, tensor in enumerate(sequences): for i, tensor in enumerate(sequences):
length = tensor.size(0) length = tensor.shape[0]
# use index notation to prevent duplicate references to the tensor # use index notation to prevent duplicate references to the tensor
if batch_first: if batch_first:
# TODO (Hui Zhang): set_value op does not support `end==start`
# out_tensor[i, :length, ...] = tensor
if length != 0:
out_tensor[i, :length, ...] = tensor out_tensor[i, :length, ...] = tensor
else: else:
out_tensor[i, length, ...] = tensor
else:
# TODO (Hui Zhang): set_value op does not support `end==start`
# out_tensor[:length, i, ...] = tensor
if length != 0:
out_tensor[:length, i, ...] = tensor out_tensor[:length, i, ...] = tensor
else:
out_tensor[length, i, ...] = tensor
return out_tensor return out_tensor
@ -139,7 +149,7 @@ def add_sos_eos(ys_pad: paddle.Tensor, sos: int, eos: int,
#ys_in = [paddle.cat([_sos, y], dim=0) for y in ys] #ys_in = [paddle.cat([_sos, y], dim=0) for y in ys]
#ys_out = [paddle.cat([y, _eos], dim=0) for y in ys] #ys_out = [paddle.cat([y, _eos], dim=0) for y in ys]
#return pad_sequence(ys_in, padding_value=eos), pad_sequence(ys_out, padding_value=ignore_id) #return pad_sequence(ys_in, padding_value=eos), pad_sequence(ys_out, padding_value=ignore_id)
B = ys_pad.size(0) B = ys_pad.shape[0]
_sos = paddle.ones([B, 1], dtype=ys_pad.dtype) * sos _sos = paddle.ones([B, 1], dtype=ys_pad.dtype) * sos
_eos = paddle.ones([B, 1], dtype=ys_pad.dtype) * eos _eos = paddle.ones([B, 1], dtype=ys_pad.dtype) * eos
ys_in = paddle.cat([_sos, ys_pad], dim=1) ys_in = paddle.cat([_sos, ys_pad], dim=1)
@ -165,16 +175,10 @@ def th_accuracy(pad_outputs: paddle.Tensor,
Returns: Returns:
float: Accuracy value (0.0 - 1.0). float: Accuracy value (0.0 - 1.0).
""" """
pad_pred = pad_outputs.view( pad_pred = pad_outputs.view(pad_targets.shape[0], pad_targets.shape[1],
pad_targets.size(0), pad_targets.size(1), pad_outputs.size(1)).argmax(2) pad_outputs.shape[1]).argmax(2)
mask = pad_targets != ignore_label mask = pad_targets != ignore_label
#TODO(Hui Zhang): sum not support bool type numerator = paddle.sum(
# numerator = paddle.sum(
# pad_pred.masked_select(mask) == pad_targets.masked_select(mask))
numerator = (
pad_pred.masked_select(mask) == pad_targets.masked_select(mask)) pad_pred.masked_select(mask) == pad_targets.masked_select(mask))
numerator = paddle.sum(numerator.type_as(pad_targets)) denominator = paddle.sum(mask)
#TODO(Hui Zhang): sum not support bool type
# denominator = paddle.sum(mask)
denominator = paddle.sum(mask.type_as(pad_targets))
return float(numerator) / float(denominator) return float(numerator) / float(denominator)

@ -16,15 +16,27 @@ import distutils.util
import math import math
import os import os
import random import random
from contextlib import contextmanager
from typing import List from typing import List
import numpy as np import numpy as np
import paddle import paddle
__all__ = ["seed_all", 'print_arguments', 'add_arguments', "log_add"] __all__ = [
"UpdateConfig", "seed_all", 'print_arguments', 'add_arguments', "log_add"
]
@contextmanager
def UpdateConfig(config):
"""Update yacs config"""
config.defrost()
yield
config.freeze()
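# Illustrative usage (mirrors how the Trainer overrides the benchmark batch size):
#
#     with UpdateConfig(config):
#         config.collator.batch_size = 32
#
# The yacs config is defrosted on entry and frozen again on exit.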
def seed_all(seed: int=210329): def seed_all(seed: int=210329):
"""freeze random generator seed."""
np.random.seed(seed) np.random.seed(seed)
random.seed(seed) random.seed(seed)
paddle.seed(seed) paddle.seed(seed)

Binary file not shown.


Binary file not shown.


@ -1,16 +0,0 @@
# Benchmarks
## Acceleration with Multi-GPUs
We compare the training time with 1, 2, 4, 8 Tesla V100 GPUs (with a subset of LibriSpeech samples whose audio durations are between 6.0 and 7.0 seconds). And it shows that a **near-linear** acceleration with multiple GPUs has been achieved. In the following figure, the time (in seconds) cost for training is printed on the blue bars.
<img src="../images/multi_gpu_speedup.png" width=450>
| # of GPU | Acceleration Rate |
| -------- | --------------: |
| 1 | 1.00 X |
| 2 | 1.98 X |
| 4 | 3.73 X |
| 8 | 6.95 X |
`utils/profile.sh` provides such a demo profiling tool; you can modify it as needed.

@ -1,37 +0,0 @@
# FAQ
1. How much does audio speed perturbation affect the recognition rate?
Speed perturbation generally improves recognition; speed factors of 0.9, 1.0 and 1.1 are commonly used.
2. How much does volume affect the recognition rate?
Training usually normalizes the volume into a fixed range; fluctuations that are too large (roughly beyond 10 dB ~ 20 dB) will hurt training.
3. What is the minimum amount of speech data required for training?
Aishell-1 has about 178 hours of data; the more data, the better.
4. Which noises or background sounds affect the recognition rate?
Mainly interfering human voices and low signal-to-noise ratio.
5. What is the length limit for a single utterance?
Training utterances are usually limited to 1 s ~ 6 s, depending on the training configuration.
6. Do background sounds need to be separated or denoised before recognition?
Yes, separation is needed; the approach depends on the specific scenario.
7. Does the model include VAD (voice activity detection)?
VAD is a separate model or module; this model does not include it.
8. Is long-form speech recognition supported?
Usually the audio is segmented by VAD first and then recognized.
9. What hardware is required for the Mandarin LM Large language model?
Enough memory to hold the LM is sufficient.

@ -1,3 +0,0 @@
# Reference
* [wenet](https://github.com/mobvoi/wenet)

@ -1,9 +0,0 @@
# Released Models
## Language Model Released
Language Model | Training Data | Token-based | Size | Descriptions
:-------------:| :------------:| :-----: | -----: | :-----------------
[English LM](https://deepspeech.bj.bcebos.com/en_lm/common_crawl_00.prune01111.trie.klm) | [CommonCrawl(en.00)](http://web-language-models.s3-website-us-east-1.amazonaws.com/ngrams/en/deduped/en.00.deduped.xz) | Word-based | 8.3 GB | Pruned with 0 1 1 1 1; <br/> About 1.85 billion n-grams; <br/> 'trie' binary with '-a 22 -q 8 -b 8'
[Mandarin LM Small](https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm) | Baidu Internal Corpus | Char-based | 2.8 GB | Pruned with 0 1 2 4 4; <br/> About 0.13 billion n-grams; <br/> 'probing' binary with default settings
[Mandarin LM Large](https://deepspeech.bj.bcebos.com/zh_lm/zhidao_giga.klm) | Baidu Internal Corpus | Char-based | 70.4 GB | No Pruning; <br/> About 3.7 billion n-grams; <br/> 'probing' binary with default settings

@ -1,34 +0,0 @@
# Trying Live Demo with Your Own Voice
Until now, an ASR model has been trained and tested qualitatively (`infer`) and quantitatively (`test`) with existing audio files, but not yet with your own speech. We build a real-time demo ASR engine with the trained model, enabling you to test and play around with the demo using your own voice.
First, change your directory to `examples/aishell` and `source path.sh`.
To start the demo's server, please run this in one console:
```bash
CUDA_VISIBLE_DEVICES=0 bash local/server.sh
```
For the machine (might not be the same machine) to run the demo's client, please do the following installation before moving on.
For example, on macOS:
```bash
brew install portaudio
pip install pyaudio
pip install keyboard
```
Then to start the client, please run this in another console:
```bash
CUDA_VISIBLE_DEVICES=0 bash local/client.sh
```
Now, in the client console, press and hold the `whitespace` key and start speaking. When you finish your utterance, release the key, and the speech-to-text results will be shown in the console. To quit the client, just press the `ESC` key.
Notice that `deepspeech/exps/deepspeech2/deploy/client.py` must be run on a machine with a microphone device, while `deepspeech/exps/deepspeech2/deploy/server.py` could be run on one without any audio recording hardware, e.g. any remote server machine. Just be careful to set the `host_ip` and `host_port` arguments to the actual accessible IP address and port if the server and client are running on two separate machines. Nothing needs to be done if they are running on one single machine.
Please also refer to `examples/aishell/local/server.sh`, which will first download a pre-trained Chinese model (trained with AISHELL1) and then start the demo server with the model. With running `examples/aishell/local/client.sh`, you can speak Chinese to test it. If you would like to try some other models, just update `--checkpoint_path` argument in the script.  

Binary file not shown.


Binary file not shown.


@ -0,0 +1,190 @@
# Deepspeech2
## Streaming
The implemented architecture of the DeepSpeech2 online model is based on the [Deepspeech2 model](https://arxiv.org/pdf/1512.02595.pdf) with some changes.
The model is mainly composed of a 2D convolution subsampling layer and stacked single-direction RNN layers.
To illustrate the model implementation clearly, 3 parts are described in detail.
- Data Preparation
- Encoder
- Decoder
In addition, the training process and the testing process are also introduced.
The architecture of the model is shown in Fig.1.
<p align="center">
<img src="../images/ds2onlineModel.png" width=800>
<br/>Fig.1 The architecture of the DeepSpeech2 online model
</p>
### Data Preparation
#### Vocabulary
For English data, the vocabulary dictionary is composed of the 26 English letters, " ' ", space, \<blank\>, \<unk\> and \<eos\>. \<blank\> represents the blank label in CTC, \<unk\> represents the unknown character, and \<eos\> represents the start and end characters. For Mandarin, the vocabulary dictionary is composed of the Chinese characters collected from the training set, plus the same three additional characters \<blank\>, \<unk\> and \<eos\>. For both English and Mandarin data, the default indices are \<blank\>=0, \<unk\>=1 and \<eos\>=last index.
```
# The code to build vocabulary
cd examples/aishell/s0
python3 ../../../utils/build_vocab.py \
--unit_type="char" \
--count_threshold=0 \
--vocab_path="data/vocab.txt" \
--manifest_paths "data/manifest.train.raw" "data/manifest.dev.raw"
# vocabulary for aishell dataset (Mandarin)
vi examples/aishell/s0/data/vocab.txt
# vocabulary for librispeech dataset (English)
vi examples/librispeech/s0/data/vocab.txt
```
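For illustration only (the tokens below are hypothetical, not the released vocabulary), the default index convention can be expressed as a simple token-to-id mapping:
```
# hypothetical vocabulary, not the released one: <blank>=0, <unk>=1, <eos>=last index
tokens = ["<blank>", "<unk>", " ", "'", "a", "b", "c", "<eos>"]
token2id = {tok: idx for idx, tok in enumerate(tokens)}

assert token2id["<blank>"] == 0
assert token2id["<unk>"] == 1
assert token2id["<eos>"] == len(tokens) - 1
```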
#### CMVN
For CMVN, a subset of the training set (or the full set) is chosen and used to compute the feature mean and std.
```
# The code to compute the feature mean and std
cd examples/aishell/s0
python3 ../../../utils/compute_mean_std.py \
--manifest_path="data/manifest.train.raw" \
--specgram_type="linear" \
--delta_delta=false \
--stride_ms=10.0 \
--window_ms=20.0 \
--sample_rate=16000 \
--use_dB_normalization=True \
--num_samples=2000 \
--num_workers=10 \
--output_path="data/mean_std.json"
```
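For intuition only (not the project's exact featurizer code; the array below is fake data), applying CMVN simply normalizes each feature dimension with the stored mean and std:
```
import numpy as np

feat = np.random.randn(100, 161).astype("float32")  # (num_frames, feat_dim), fake data
mean = feat.mean(axis=0)   # in practice these statistics come from data/mean_std.json
std = feat.std(axis=0)

cmvn_feat = (feat - mean) / (std + 1e-20)            # zero mean, unit variance per dim
print(cmvn_feat.mean(axis=0)[:3], cmvn_feat.std(axis=0)[:3])
```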
#### Feature Extraction
For feature extraction, three methods are implemented: linear (FFT without using a filter bank), fbank and mfcc.
Currently, the released DeepSpeech2 online model uses the linear feature extraction method.
```
# The code for feature extraction
vi deepspeech/frontend/featurizer/audio_featurizer.py
```
### Encoder
The encoder is composed of two 2D convolution subsampling layers and a number of stacked single-direction RNN layers. The convolution subsampling layers extract a feature representation from the raw audio features and reduce the length of the feature sequence at the same time. The resulting representation is then fed into the stacked RNN layers, for which both LSTM and GRU cells are available. Adding one fully connected (fc) layer after the stacked RNN layers is optional; if the number of stacked RNN layers is less than 5, adding one fc layer after them is recommended.
The code of Encoder is in:
```
vi deepspeech/models/ds2_online/deepspeech2.py
```
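As a rough illustration only (not the actual code in `deepspeech2.py`; all layer sizes below are made-up placeholders), a minimal encoder with one convolution subsampling layer and stacked unidirectional GRU layers could look like this:
```
import paddle
from paddle import nn


class ToyOnlineEncoder(nn.Layer):
    """Sketch only: one conv subsampling layer + stacked unidirectional GRUs."""

    def __init__(self, feat_dim=161, hidden_size=1024, num_rnn_layers=5):
        super().__init__()
        # 2D conv over (time, freq); stride 2 halves the time axis (subsampling).
        self.conv = nn.Conv2D(1, 32, kernel_size=(3, 3), stride=(2, 2), padding=1)
        conv_out_dim = 32 * ((feat_dim + 1) // 2)
        self.rnn = nn.GRU(conv_out_dim, hidden_size,
                          num_layers=num_rnn_layers, direction="forward")
        self.fc = nn.Linear(hidden_size, hidden_size)  # optional fc layer

    def forward(self, x):
        # x: (batch, time, feat_dim)
        x = self.conv(x.unsqueeze(1))                  # (B, 32, T', F')
        b, c, t, f = x.shape
        x = x.transpose([0, 2, 1, 3]).reshape([b, t, c * f])
        x, _ = self.rnn(x)                             # (B, T', hidden_size)
        return self.fc(x)


encoder = ToyOnlineEncoder()
feats = paddle.randn([4, 100, 161])                    # (batch, time, feat_dim)
print(encoder(feats).shape)                            # [4, 50, 1024]
```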
### Decoder
To get the character probabilities of each frame, the frame-level feature representations output by the encoder are fed into a projection layer, implemented as a dense layer whose output dimension equals the vocabulary size. After the projection layer, a softmax transforms the frame-level features into per-frame character probabilities. During inference, the character probabilities of each frame are fed into the CTC decoder to get the final speech recognition results.
The code of the decoder is in:
```
# The code of constructing the decoder in model
vi deepspeech/models/ds2_online/deepspeech2.py
# The code of CTC Decoder
vi deepspeech/modules/ctc.py
```
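For intuition only, here is a minimal sketch of the projection + softmax step followed by greedy CTC decoding; the released model uses a proper CTC beam-search decoder, and `vocab_size`/`hidden_size` below are placeholder values:
```
import paddle
from paddle import nn

vocab_size = 4233      # placeholder: must equal the real vocabulary size
hidden_size = 1024     # placeholder: must equal the encoder output size

projection = nn.Linear(hidden_size, vocab_size)


def greedy_ctc_decode(encoder_out, blank_id=0):
    """encoder_out: (batch, time, hidden_size) -> list of token-id sequences."""
    probs = nn.functional.softmax(projection(encoder_out), axis=-1)
    best = paddle.argmax(probs, axis=-1).numpy()       # (batch, time)
    results = []
    for path in best:
        tokens, prev = [], blank_id
        for t in path:                                  # collapse repeats, drop blanks
            if t != blank_id and t != prev:
                tokens.append(int(t))
            prev = t
        results.append(tokens)
    return results


print(greedy_ctc_decode(paddle.randn([2, 50, hidden_size])))
```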
## Training Process
Using the command below, you can train the deepspeech2 online model.
```
cd examples/aishell/s0
bash run.sh --stage 0 --stop_stage 2 --model_type online --conf_path conf/deepspeech2_online.yaml
```
The detail commands are:
```
# The code for training in run.sh
set -e
source path.sh
gpus=2,3,5,7
stage=0
stop_stage=5
conf_path=conf/deepspeech2_online.yaml # conf/deepspeech2.yaml | conf/deepspeech2_online.yaml
avg_num=1
model_type=online # online | offline
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
avg_ckpt=avg_${avg_num}
ckpt=$(basename ${conf_path} | awk -F'.' '{print $1}')
echo "checkpoint name ${ckpt}"
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# prepare data
bash ./local/data.sh || exit -1
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# train model, all `ckpt` under `exp` dir
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt} ${model_type}
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# avg n best model
avg.sh exp/${ckpt}/checkpoints ${avg_num}
fi
```
By using the command above, the training process can be started. The first 3 stages of `run.sh` are used for training. Stage 0 prepares the data: the dataset is downloaded, and the manifest files, vocabulary dictionary and CMVN file are generated in `./data/`. Stage 1 trains the model; the log files and model checkpoints are saved in `exp/deepspeech2_online/`. Stage 2 generates the final model for prediction by averaging the top-k checkpoints selected by validation loss.
## Testing Process
Using the command below, you can test the deepspeech2 online model.
```
bash run.sh --stage 3 --stop_stage 5 --model_type online --conf_path conf/deepspeech2_online.yaml
```
The detail commands are:
```
conf_path=conf/deepspeech2_online.yaml
avg_num=1
model_type=online
avg_ckpt=avg_${avg_num}
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# test ckpt avg_n
CUDA_VISIBLE_DEVICES=2 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${model_type}|| exit -1
fi
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
# export ckpt avg_n
CUDA_VISIBLE_DEVICES=5 ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit ${model_type}
fi
if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
# test export ckpt avg_n
CUDA_VISIBLE_DEVICES=0 ./local/test_export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt}.jit ${model_type}|| exit -1
fi
```
After training, stages 3, 4 and 5 are used for testing. Stage 3 tests the model generated in stage 2 and reports the CER on the test set. Stage 4 exports the model from dynamic graph to static graph using the `paddle.jit` library. Stage 5 tests the exported static-graph model.
## Non-Streaming
The DeepSpeech2 offline model is similar to the online model. The main difference is that the offline model uses stacked bi-directional RNN layers, whereas the online model uses single-direction RNN layers and no fc layer. For the stacked bi-directional RNN layers in the offline model, both the vanilla RNN cell and the GRU cell are available.
The architecture of the model is shown in Fig.2.
<p align="center">
<img src="../images/ds2offlineModel.png" width=800>
<br/>Fig.2 The architecture of the DeepSpeech2 offline model
</p>
For data preparation and the decoder, the DeepSpeech2 offline model is the same as the online model.
The code of the encoder and decoder for the DeepSpeech2 offline model is in:
```
vi deepspeech/models/ds2/deepspeech2.py
```
The training and testing processes of the DeepSpeech2 offline model are very similar to those of the online model; only a few changes should be noticed.
For training and testing, the "model_type" and the "conf_path" must be set accordingly.
```
# Training offline
cd examples/aishell/s0
bash run.sh --stage 0 --stop_stage 2 --model_type offline --conf_path conf/deepspeech2.yaml
```
```
# Testing offline
cd examples/aishell/s0
bash run.sh --stage 3 --stop_stage 5 --model_type offline --conf_path conf/deepspeech2.yaml
```

@ -1,13 +1,20 @@
# Features # Features
### Dataset
* Aishell
* Librispeech
* THCHS30
* TIMIT
### Speech Recognition ### Speech Recognition
* Offline * Non-Streaming
* [Baidu's DeepSpeech2](http://proceedings.mlr.press/v48/amodei16.pdf) * [Baidu's DeepSpeech2](http://proceedings.mlr.press/v48/amodei16.pdf)
* [Transformer](https://arxiv.org/abs/1706.03762) * [Transformer](https://arxiv.org/abs/1706.03762)
* [Conformer](https://arxiv.org/abs/2005.08100) * [Conformer](https://arxiv.org/abs/2005.08100)
* Online * Streaming
* [Baidu's DeepSpeech2](http://proceedings.mlr.press/v48/amodei16.pdf)
* [U2](https://arxiv.org/pdf/2012.05481.pdf) * [U2](https://arxiv.org/pdf/2012.05481.pdf)
### Language Model ### Language Model
@ -22,6 +29,15 @@
* beam search * beam search
* attention rescore * attention rescore
### Deployment
* Paddle Inference
### Alignment
* MFA
* CTC Alignment
### Speech Frontend ### Speech Frontend
* Audio * Audio

@ -4,15 +4,16 @@ To avoid the trouble of environment setup, [running in Docker container](#runnin
## Prerequisites ## Prerequisites
- Python >= 3.7 - Python >= 3.7
- PaddlePaddle 2.0.0 or later (please refer to the [Installation Guide](https://www.paddlepaddle.org.cn/documentation/docs/en/beginners_guide/index_en.html)) - PaddlePaddle latest version (please refer to the [Installation Guide](https://www.paddlepaddle.org.cn/documentation/docs/en/beginners_guide/index_en.html))
## Setup ## Setup (Important)
- Make sure these libraries or tools are installed: `pkg-config`, `flac`, `ogg`, `vorbis`, `boost`, `sox`, and `swig`, e.g. installing them via `apt-get`: - Make sure these libraries or tools are installed: `pkg-config`, `flac`, `ogg`, `vorbis`, `boost`, `sox`, and `swig`, e.g. installing them via `apt-get`:
```bash ```bash
sudo apt-get install -y sox pkg-config libflac-dev libogg-dev libvorbis-dev libboost-dev swig python3-dev sudo apt-get install -y sox pkg-config libflac-dev libogg-dev libvorbis-dev libboost-dev swig python3-dev
``` ```
The version of `swig` should be >= 3.0.
or, installing them via `yum`: or, installing them via `yum`:

@ -35,52 +35,3 @@ Different from the English language model, Mandarin language model is character-
* A whitespace character between two tokens is inserted. * A whitespace character between two tokens is inserted.
Please notice that the released language models only contain Chinese simplified characters. After preprocessing done we can begin to train the language model. The key training arguments for small LM is '-o 5 --prune 0 1 2 4 4' and '-o 5' for large LM. Please refer above section for the meaning of each argument. We also convert the arpa file to binary file using default settings. Please notice that the released language models only contain Chinese simplified characters. After preprocessing done we can begin to train the language model. The key training arguments for small LM is '-o 5 --prune 0 1 2 4 4' and '-o 5' for large LM. Please refer above section for the meaning of each argument. We also convert the arpa file to binary file using default settings.
## [KenLM](http://kheafield.com/code/kenlm/)
Among statistical language model toolkits, SRILM and KenLM are currently the most widely used. KenLM appeared later than SRILM, trains faster, and supports training on large data with a single machine. Below is a brief guide to using KenLM.
1. Download: http://kheafield.com/code/kenlm.tar.gz
2. Usage. The tool is convenient to use on Linux. First make sure Boost 1.36.0 (or later) and zlib are installed:
```
boost:
yum install boost
yum install boost-devel
zlib:
yum install zlib
yum install zlib-devel
```
The gcc version needs to be 4.8.2 or later.
```
wget -O - https://kheafield.com/code/kenlm.tar.gz |tar xz
mkdir kenlm/build
cd kenlm/build
cmake ..
make -j2
```
3. Training. Train with the following command:
```
build/bin/lmplz -o 3 --verbose_header --text people2014corpus_words.txt --arpa result/people2014corpus_words.arps
```
Where:
1) people2014corpus_words.txt must be a word-segmented text file.
The training corpus is the People's Daily 2014 corpus, which includes: 1) manually segmented text with POS tags (people2014.tar.gz); 2) unsegmented plain text (people2014_words.txt); 3) a character-level KenLM language model and its binary (people2014corpus_chars.arps/klm); 4) a word-level KenLM language model and its binary (people2014corpus_words.arps/klm).
2) The number after `-o` is the n-gram order (e.g. 5 for a 5-gram model); an order of 3 is usually sufficient, but choose it according to your own situation.
4. Compression. Compress the model into a binary file so that it loads quickly:
```
build/bin/build_binary ./result/people2014corpus_words.arps ./result/people2014corpus_words.klm
```
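For completeness, a minimal query example, assuming the KenLM Python bindings are built and installed from the same source tree (the model path below matches the training commands above):
```
import kenlm  # Python bindings built from the KenLM source tree

model = kenlm.Model("result/people2014corpus_words.klm")
sentence = "今天 天气 不错"                       # word-segmented, space-separated text
print(model.score(sentence, bos=True, eos=True))  # log10 probability
print(model.perplexity(sentence))
```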

@ -0,0 +1,8 @@
# Reference
We refer these repos to build `model` and `engine`:
* [delta](https://github.com/Delta-ML/delta.git)
* [espnet](https://github.com/espnet/espnet.git)
* [kaldi](https://github.com/kaldi-asr/kaldi.git)
* [wenet](https://github.com/mobvoi/wenet)

@ -0,0 +1,28 @@
# Released Models
## Acoustic Model Released in paddle 2.X
Acoustic Model | Training Data | Token-based | Size | Descriptions | CER or WER | Hours of speech
:-------------:| :------------:| :-----: | -----: | :----------------- | :---------- | :---------
[Ds2 Online Aishell Model](https://deepspeech.bj.bcebos.com/release2.1/aishell/s0/aishell.s0.ds_online.5rnn.debug.tar.gz) | Aishell Dataset | Char-based | 345 MB | 2 Conv + 5 LSTM layers with only forward direction | 0.0824 | 151 h
[Ds2 Offline Aishell Model](https://deepspeech.bj.bcebos.com/release2.1/aishell/s0/aishell.s0.ds2.offline.cer6p65.release.tar.gz)| Aishell Dataset | Char-based | 306 MB | 2 Conv + 3 bidirectional GRU layers| 0.065 | 151 h
[Conformer Online Aishell Model](https://deepspeech.bj.bcebos.com/release2.1/aishell/s1/aishell.chunk.release.tar.gz) | Aishell Dataset | Char-based | 283 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention + CTC | 0.0594 | 151 h
[Conformer Offline Aishell Model](https://deepspeech.bj.bcebos.com/release2.1/aishell/s1/aishell.release.tar.gz) | Aishell Dataset | Char-based | 284 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention | 0.0547 | 151 h
[Conformer Librispeech Model](https://deepspeech.bj.bcebos.com/release2.1/librispeech/s1/conformer.release.tar.gz) | Librispeech Dataset | Word-based | 287 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention | 0.0325 | 960 h
[Transformer Librispeech Model](https://deepspeech.bj.bcebos.com/release2.1/librispeech/s1/transformer.release.tar.gz) | Librispeech Dataset | Word-based | 195 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention | 0.0544 | 960 h
## Acoustic Model Transformed from paddle 1.8
Acoustic Model | Training Data | Token-based | Size | Descriptions | CER or WER | Hours of speech
:-------------:| :------------:| :-----: | -----: | :----------------- | :---------- | :---------
[Ds2 Offline Aishell model](https://deepspeech.bj.bcebos.com/mandarin_models/aishell_model_v1.8_to_v2.x.tar.gz)|Aishell Dataset| Char-based| 234 MB| 2 Conv + 3 bidirectional GRU layers| 0.0804 | 151 h|
[Ds2 Offline Librispeech model](https://deepspeech.bj.bcebos.com/eng_models/librispeech_v1.8_to_v2.x.tar.gz)|Librispeech Dataset| Word-based| 307 MB| 2 Conv + 3 bidirectional sharing weight RNN layers | 0.0685| 960 h|
[Ds2 Offline Baidu en8k model](https://deepspeech.bj.bcebos.com/eng_models/baidu_en8k_v1.8_to_v2.x.tar.gz)|Baidu Internal English Dataset| Word-based| 273 MB| 2 Conv + 3 bidirectional GRU layers | 0.0541 | 8628 h|
## Language Model Released
Language Model | Training Data | Token-based | Size | Descriptions
:-------------:| :------------:| :-----: | -----: | :-----------------
[English LM](https://deepspeech.bj.bcebos.com/en_lm/common_crawl_00.prune01111.trie.klm) | [CommonCrawl(en.00)](http://web-language-models.s3-website-us-east-1.amazonaws.com/ngrams/en/deduped/en.00.deduped.xz) | Word-based | 8.3 GB | Pruned with 0 1 1 1 1; <br/> About 1.85 billion n-grams; <br/> 'trie' binary with '-a 22 -q 8 -b 8'
[Mandarin LM Small](https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm) | Baidu Internal Corpus | Char-based | 2.8 GB | Pruned with 0 1 2 4 4; <br/> About 0.13 billion n-grams; <br/> 'probing' binary with default settings
[Mandarin LM Large](https://deepspeech.bj.bcebos.com/zh_lm/zhidao_giga.klm) | Baidu Internal Corpus | Char-based | 70.4 GB | No Pruning; <br/> About 3.7 billion n-grams; <br/> 'probing' binary with default settings

@ -0,0 +1,11 @@
# 1xt2x
Convert Deepspeech 1.8 released model to 2.x.
## Model
* Deepspeech2x
## Exp
* baidu_en8k
* aishell
* librispeech

@ -0,0 +1,4 @@
exp
data
*log
tmp

Some files were not shown because too many files have changed in this diff.
