Merge branch 'develop' of https://github.com/PaddlePaddle/DeepSpeech into fix_bug
commit
f4e59293bf
@ -1,605 +0,0 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "academic-surname",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import paddle\n",
|
||||
"from paddle import nn"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "fundamental-treasure",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/workspace/DeepSpeech-2.x/tools/venv-dev/lib/python3.7/site-packages/ipykernel/ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.\n",
|
||||
" and should_run_async(code)\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"L = nn.Linear(256, 2048)\n",
|
||||
"L2 = nn.Linear(2048, 256)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "consolidated-elephant",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import numpy as np\n",
|
||||
"import torch\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "moderate-noise",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"float64\n",
|
||||
"Tensor(shape=[2, 51, 256], dtype=float32, place=CUDAPlace(0), stop_gradient=True,\n",
|
||||
" [[[-1.54171216, -2.61531472, -1.79881978, ..., -0.31395876, 0.56513089, -0.44516513],\n",
|
||||
" [-0.79492962, 1.91157901, 0.66567147, ..., 0.54825783, -1.01471853, -0.84924090],\n",
|
||||
" [-1.22556651, -0.36225814, 0.65063190, ..., 0.65726501, 0.05563191, 0.09009409],\n",
|
||||
" ...,\n",
|
||||
" [ 0.38615900, -0.77905393, 0.99732304, ..., -1.38463700, -3.32365036, -1.31089687],\n",
|
||||
" [ 0.05579993, 0.06885809, -1.66662002, ..., -0.23346378, -3.29372883, 1.30561364],\n",
|
||||
" [ 1.90676069, 1.95093191, -0.28849599, ..., -0.06860496, 0.95347673, 1.00475824]],\n",
|
||||
"\n",
|
||||
" [[-0.91453546, 0.55298805, -1.06146812, ..., -0.86378336, 1.00454640, 1.26062179],\n",
|
||||
" [ 0.10223761, 0.81301165, 2.36865163, ..., 0.16821407, 0.29240361, 1.05408621],\n",
|
||||
" [-1.33196676, 1.94433689, 0.01934209, ..., 0.48036841, 0.51585966, 1.22893548],\n",
|
||||
" ...,\n",
|
||||
" [-0.19558455, -0.47075930, 0.90796155, ..., -1.28598249, -0.24321797, 0.17734711],\n",
|
||||
" [ 0.89819717, -1.39516675, 0.17138045, ..., 2.39761519, 1.76364994, -0.52177650],\n",
|
||||
" [ 0.94122332, -0.18581429, 1.36099780, ..., 0.67647684, -0.04699665, 1.51205540]]])\n",
|
||||
"tensor([[[-1.5417, -2.6153, -1.7988, ..., -0.3140, 0.5651, -0.4452],\n",
|
||||
" [-0.7949, 1.9116, 0.6657, ..., 0.5483, -1.0147, -0.8492],\n",
|
||||
" [-1.2256, -0.3623, 0.6506, ..., 0.6573, 0.0556, 0.0901],\n",
|
||||
" ...,\n",
|
||||
" [ 0.3862, -0.7791, 0.9973, ..., -1.3846, -3.3237, -1.3109],\n",
|
||||
" [ 0.0558, 0.0689, -1.6666, ..., -0.2335, -3.2937, 1.3056],\n",
|
||||
" [ 1.9068, 1.9509, -0.2885, ..., -0.0686, 0.9535, 1.0048]],\n",
|
||||
"\n",
|
||||
" [[-0.9145, 0.5530, -1.0615, ..., -0.8638, 1.0045, 1.2606],\n",
|
||||
" [ 0.1022, 0.8130, 2.3687, ..., 0.1682, 0.2924, 1.0541],\n",
|
||||
" [-1.3320, 1.9443, 0.0193, ..., 0.4804, 0.5159, 1.2289],\n",
|
||||
" ...,\n",
|
||||
" [-0.1956, -0.4708, 0.9080, ..., -1.2860, -0.2432, 0.1773],\n",
|
||||
" [ 0.8982, -1.3952, 0.1714, ..., 2.3976, 1.7636, -0.5218],\n",
|
||||
" [ 0.9412, -0.1858, 1.3610, ..., 0.6765, -0.0470, 1.5121]]])\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/workspace/DeepSpeech-2.x/tools/venv-dev/lib/python3.7/site-packages/ipykernel/ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.\n",
|
||||
" and should_run_async(code)\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"x = np.random.randn(2, 51, 256)\n",
|
||||
"print(x.dtype)\n",
|
||||
"px = paddle.to_tensor(x, dtype='float32')\n",
|
||||
"tx = torch.tensor(x, dtype=torch.float32)\n",
|
||||
"print(px)\n",
|
||||
"print(tx)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "cooked-progressive",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"id": "mechanical-prisoner",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"data = np.load('enc_0_ff_out.npz', allow_pickle=True)\n",
|
||||
"t_norm_ff = data['norm_ff']\n",
|
||||
"t_ff_out = data['ff_out']\n",
|
||||
"t_ff_l_x = data['ff_l_x']\n",
|
||||
"t_ff_l_a_x = data['ff_l_a_x']\n",
|
||||
"t_ff_l_a_l_x = data['ff_l_a_l_x']\n",
|
||||
"t_ps = data['ps']"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "indie-marriage",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"id": "assured-zambia",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"True\n",
|
||||
"True\n",
|
||||
"True\n",
|
||||
"True\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"L.set_state_dict({'weight': t_ps[0].T, 'bias': t_ps[1]})\n",
|
||||
"L2.set_state_dict({'weight': t_ps[2].T, 'bias': t_ps[3]})\n",
|
||||
"\n",
|
||||
"ps = []\n",
|
||||
"for n, p in L.named_parameters():\n",
|
||||
" ps.append(p)\n",
|
||||
"\n",
|
||||
"for n, p in L2.state_dict().items():\n",
|
||||
" ps.append(p)\n",
|
||||
" \n",
|
||||
"for p, tp in zip(ps, t_ps):\n",
|
||||
" print(np.allclose(p.numpy(), tp.T))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "committed-jacob",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "extreme-traffic",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "optimum-milwaukee",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"id": "viral-indian",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"True\n",
|
||||
"True\n",
|
||||
"True\n",
|
||||
"True\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# data = np.load('enc_0_ff_out.npz', allow_pickle=True)\n",
|
||||
"# t_norm_ff = data['norm_ff']\n",
|
||||
"# t_ff_out = data['ff_out']\n",
|
||||
"# t_ff_l_x = data['ff_l_x']\n",
|
||||
"# t_ff_l_a_x = data['ff_l_a_x']\n",
|
||||
"# t_ff_l_a_l_x = data['ff_l_a_l_x']\n",
|
||||
"# t_ps = data['ps']\n",
|
||||
"TL = torch.nn.Linear(256, 2048)\n",
|
||||
"TL2 = torch.nn.Linear(2048, 256)\n",
|
||||
"TL.load_state_dict({'weight': torch.tensor(t_ps[0]), 'bias': torch.tensor(t_ps[1])})\n",
|
||||
"TL2.load_state_dict({'weight': torch.tensor(t_ps[2]), 'bias': torch.tensor(t_ps[3])})\n",
|
||||
"\n",
|
||||
"# for n, p in TL.named_parameters():\n",
|
||||
"# print(n, p)\n",
|
||||
"# for n, p in TL2.named_parameters():\n",
|
||||
"# print(n, p)\n",
|
||||
"\n",
|
||||
"ps = []\n",
|
||||
"for n, p in TL.state_dict().items():\n",
|
||||
" ps.append(p.data.numpy())\n",
|
||||
" \n",
|
||||
"for n, p in TL2.state_dict().items():\n",
|
||||
" ps.append(p.data.numpy())\n",
|
||||
" \n",
|
||||
"for p, tp in zip(ps, t_ps):\n",
|
||||
" print(np.allclose(p, tp))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"id": "skilled-vietnamese",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"[[[ 0.67277956 0.08313607 -0.62761104 ... -0.17480263 0.42718208\n",
|
||||
" -0.5787626 ]\n",
|
||||
" [ 0.91516656 0.5393416 1.7159258 ... 0.06144593 0.06486575\n",
|
||||
" -0.03350811]\n",
|
||||
" [ 0.438351 0.6227843 0.24096036 ... 1.0912522 -0.90929437\n",
|
||||
" -1.012989 ]\n",
|
||||
" ...\n",
|
||||
" [ 0.68631977 0.14240924 0.10763275 ... -0.11513516 0.48065388\n",
|
||||
" 0.04070369]\n",
|
||||
" [-0.9525228 0.23197874 0.31264272 ... 0.5312439 0.18773697\n",
|
||||
" -0.8450228 ]\n",
|
||||
" [ 0.42024016 -0.04561988 0.54541194 ... -0.41933843 -0.00436018\n",
|
||||
" -0.06663495]]\n",
|
||||
"\n",
|
||||
" [[-0.11638781 -0.33566502 -0.20887226 ... 0.17423287 -0.9195841\n",
|
||||
" -0.8161046 ]\n",
|
||||
" [-0.3469874 0.88269687 -0.11887559 ... -0.15566081 0.16357468\n",
|
||||
" -0.20766167]\n",
|
||||
" [-0.3847657 0.3984318 -0.06963477 ... -0.00360622 1.2360432\n",
|
||||
" -0.26811332]\n",
|
||||
" ...\n",
|
||||
" [ 0.08230796 -0.46158582 0.54582864 ... 0.15747628 -0.44790155\n",
|
||||
" 0.06020184]\n",
|
||||
" [-0.8095085 0.43163058 -0.42837143 ... 0.8627463 0.90656304\n",
|
||||
" 0.15847842]\n",
|
||||
" [-1.485811 -0.18216592 -0.8882585 ... 0.32596245 0.7822631\n",
|
||||
" -0.6460344 ]]]\n",
|
||||
"[[[ 0.67278004 0.08313602 -0.6276114 ... -0.17480245 0.42718196\n",
|
||||
" -0.5787625 ]\n",
|
||||
" [ 0.91516703 0.5393413 1.7159253 ... 0.06144581 0.06486579\n",
|
||||
" -0.03350812]\n",
|
||||
" [ 0.43835106 0.62278455 0.24096027 ... 1.0912521 -0.9092943\n",
|
||||
" -1.0129892 ]\n",
|
||||
" ...\n",
|
||||
" [ 0.6863195 0.14240888 0.10763284 ... -0.11513527 0.48065376\n",
|
||||
" 0.04070365]\n",
|
||||
" [-0.9525231 0.23197863 0.31264275 ... 0.53124386 0.18773702\n",
|
||||
" -0.84502304]\n",
|
||||
" [ 0.42024007 -0.04561983 0.545412 ... -0.41933888 -0.00436005\n",
|
||||
" -0.066635 ]]\n",
|
||||
"\n",
|
||||
" [[-0.11638767 -0.33566508 -0.20887226 ... 0.17423296 -0.9195838\n",
|
||||
" -0.8161046 ]\n",
|
||||
" [-0.34698725 0.88269705 -0.11887549 ... -0.15566081 0.16357464\n",
|
||||
" -0.20766166]\n",
|
||||
" [-0.3847657 0.3984319 -0.06963488 ... -0.00360619 1.2360426\n",
|
||||
" -0.26811326]\n",
|
||||
" ...\n",
|
||||
" [ 0.08230786 -0.4615857 0.5458287 ... 0.15747619 -0.44790167\n",
|
||||
" 0.06020182]\n",
|
||||
" [-0.8095083 0.4316307 -0.42837155 ... 0.862746 0.9065631\n",
|
||||
" 0.15847899]\n",
|
||||
" [-1.485811 -0.18216613 -0.8882584 ... 0.32596254 0.7822631\n",
|
||||
" -0.6460344 ]]]\n",
|
||||
"True\n",
|
||||
"False\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"y = L(px)\n",
|
||||
"print(y.numpy())\n",
|
||||
"\n",
|
||||
"ty = TL(tx)\n",
|
||||
"print(ty.data.numpy())\n",
|
||||
"print(np.allclose(px.numpy(), tx.detach().numpy()))\n",
|
||||
"print(np.allclose(y.numpy(), ty.detach().numpy()))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "incorrect-allah",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "prostate-cameroon",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"id": "governmental-surge",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"[[ 0.04476918 0.554463 -0.3027508 ... -0.49600336 0.3751858\n",
|
||||
" 0.8254095 ]\n",
|
||||
" [ 0.95594174 -0.29528382 -1.2899452 ... 0.43718258 0.05584608\n",
|
||||
" -0.06974669]]\n",
|
||||
"[[ 0.04476918 0.5544631 -0.3027507 ... -0.49600336 0.37518573\n",
|
||||
" 0.8254096 ]\n",
|
||||
" [ 0.95594174 -0.29528376 -1.2899454 ... 0.4371827 0.05584623\n",
|
||||
" -0.0697467 ]]\n",
|
||||
"True\n",
|
||||
"False\n",
|
||||
"True\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"x = np.random.randn(2, 256)\n",
|
||||
"px = paddle.to_tensor(x, dtype='float32')\n",
|
||||
"tx = torch.tensor(x, dtype=torch.float32)\n",
|
||||
"y = L(px)\n",
|
||||
"print(y.numpy())\n",
|
||||
"ty = TL(tx)\n",
|
||||
"print(ty.data.numpy())\n",
|
||||
"print(np.allclose(px.numpy(), tx.detach().numpy()))\n",
|
||||
"print(np.allclose(y.numpy(), ty.detach().numpy()))\n",
|
||||
"print(np.allclose(y.numpy(), ty.detach().numpy(), atol=1e-5))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "confidential-jacket",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"id": "improved-civilization",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"5e7e7c9fde8350084abf1898cf52651cfc84b17a\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(paddle.version.commit)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"id": "d1e2d3b4",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"['__builtins__',\n",
|
||||
" '__cached__',\n",
|
||||
" '__doc__',\n",
|
||||
" '__file__',\n",
|
||||
" '__loader__',\n",
|
||||
" '__name__',\n",
|
||||
" '__package__',\n",
|
||||
" '__spec__',\n",
|
||||
" 'commit',\n",
|
||||
" 'full_version',\n",
|
||||
" 'istaged',\n",
|
||||
" 'major',\n",
|
||||
" 'minor',\n",
|
||||
" 'mkl',\n",
|
||||
" 'patch',\n",
|
||||
" 'rc',\n",
|
||||
" 'show',\n",
|
||||
" 'with_mkl']"
|
||||
]
|
||||
},
|
||||
"execution_count": 11,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"dir(paddle.version)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"id": "c880c719",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"2.1.0\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(paddle.version.full_version)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 13,
|
||||
"id": "f26977bf",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"commit: 5e7e7c9fde8350084abf1898cf52651cfc84b17a\n",
|
||||
"None\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(paddle.version.show())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 14,
|
||||
"id": "04ad47f6",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"1.6.0\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(torch.__version__)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 15,
|
||||
"id": "e1e03830",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"['__builtins__',\n",
|
||||
" '__cached__',\n",
|
||||
" '__doc__',\n",
|
||||
" '__file__',\n",
|
||||
" '__loader__',\n",
|
||||
" '__name__',\n",
|
||||
" '__package__',\n",
|
||||
" '__spec__',\n",
|
||||
" '__version__',\n",
|
||||
" 'cuda',\n",
|
||||
" 'debug',\n",
|
||||
" 'git_version',\n",
|
||||
" 'hip']"
|
||||
]
|
||||
},
|
||||
"execution_count": 15,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"dir(torch.version)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 19,
|
||||
"id": "4ad0389b",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"'b31f58de6fa8bbda5353b3c77d9be4914399724d'"
|
||||
]
|
||||
},
|
||||
"execution_count": 19,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"torch.version.git_version"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 21,
|
||||
"id": "7870ea10",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"'10.2'"
|
||||
]
|
||||
},
|
||||
"execution_count": 21,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"torch.version.cuda"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "db8ee5a7",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "6321ec2a",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.7.0"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
File diff suppressed because one or more lines are too long
File diff suppressed because it is too large
Load Diff
File diff suppressed because one or more lines are too long
@ -1,389 +0,0 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "emerging-meter",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/paddle/fluid/layers/utils.py:26: DeprecationWarning: `np.int` is a deprecated alias for the builtin `int`. To silence this warning, use `int` by itself. Doing this will not modify any behavior and is safe. When replacing `np.int`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.\n",
|
||||
"Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
|
||||
" def convert_to_list(value, n, name, dtype=np.int):\n",
|
||||
"/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/scipy/fftpack/__init__.py:103: DeprecationWarning: The module numpy.dual is deprecated. Instead of using dual, use the functions directly from numpy or scipy.\n",
|
||||
" from numpy.dual import register_func\n",
|
||||
"/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/scipy/special/orthogonal.py:81: DeprecationWarning: `np.int` is a deprecated alias for the builtin `int`. To silence this warning, use `int` by itself. Doing this will not modify any behavior and is safe. When replacing `np.int`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.\n",
|
||||
"Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
|
||||
" from numpy import (exp, inf, pi, sqrt, floor, sin, cos, around, int,\n",
|
||||
"/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/numba/core/types/__init__.py:108: DeprecationWarning: `np.long` is a deprecated alias for `np.compat.long`. To silence this warning, use `np.compat.long` by itself. In the likely event your code does not need to work on Python 2 you can use the builtin `int` for which `np.compat.long` is itself an alias. Doing this will not modify any behaviour and is safe. When replacing `np.long`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.\n",
|
||||
"Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
|
||||
" long_ = _make_signed(np.long)\n",
|
||||
"/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/numba/core/types/__init__.py:109: DeprecationWarning: `np.long` is a deprecated alias for `np.compat.long`. To silence this warning, use `np.compat.long` by itself. In the likely event your code does not need to work on Python 2 you can use the builtin `int` for which `np.compat.long` is itself an alias. Doing this will not modify any behaviour and is safe. When replacing `np.long`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.\n",
|
||||
"Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
|
||||
" ulong = _make_unsigned(np.long)\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import math\n",
|
||||
"import random\n",
|
||||
"import tarfile\n",
|
||||
"import logging\n",
|
||||
"import numpy as np\n",
|
||||
"from collections import namedtuple\n",
|
||||
"from functools import partial\n",
|
||||
"\n",
|
||||
"import paddle\n",
|
||||
"from paddle.io import Dataset\n",
|
||||
"from paddle.io import DataLoader\n",
|
||||
"from paddle.io import BatchSampler\n",
|
||||
"from paddle.io import DistributedBatchSampler\n",
|
||||
"from paddle import distributed as dist\n",
|
||||
"\n",
|
||||
"from data_utils.utility import read_manifest\n",
|
||||
"from data_utils.augmentor.augmentation import AugmentationPipeline\n",
|
||||
"from data_utils.featurizer.speech_featurizer import SpeechFeaturizer\n",
|
||||
"from data_utils.speech import SpeechSegment\n",
|
||||
"from data_utils.normalizer import FeatureNormalizer\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"from data_utils.dataset import (\n",
|
||||
" DeepSpeech2Dataset,\n",
|
||||
" DeepSpeech2DistributedBatchSampler,\n",
|
||||
" DeepSpeech2BatchSampler,\n",
|
||||
" SpeechCollator,\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 20,
|
||||
"id": "excessive-american",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def create_dataloader(manifest_path,\t\n",
|
||||
" vocab_filepath,\t\n",
|
||||
" mean_std_filepath,\t\n",
|
||||
" augmentation_config='{}',\t\n",
|
||||
" max_duration=float('inf'),\t\n",
|
||||
" min_duration=0.0,\t\n",
|
||||
" stride_ms=10.0,\t\n",
|
||||
" window_ms=20.0,\t\n",
|
||||
" max_freq=None,\t\n",
|
||||
" specgram_type='linear',\t\n",
|
||||
" use_dB_normalization=True,\t\n",
|
||||
" random_seed=0,\t\n",
|
||||
" keep_transcription_text=False,\t\n",
|
||||
" is_training=False,\t\n",
|
||||
" batch_size=1,\t\n",
|
||||
" num_workers=0,\t\n",
|
||||
" sortagrad=False,\t\n",
|
||||
" shuffle_method=None,\t\n",
|
||||
" dist=False):\t\n",
|
||||
"\n",
|
||||
" dataset = DeepSpeech2Dataset(\t\n",
|
||||
" manifest_path,\t\n",
|
||||
" vocab_filepath,\t\n",
|
||||
" mean_std_filepath,\t\n",
|
||||
" augmentation_config=augmentation_config,\t\n",
|
||||
" max_duration=max_duration,\t\n",
|
||||
" min_duration=min_duration,\t\n",
|
||||
" stride_ms=stride_ms,\t\n",
|
||||
" window_ms=window_ms,\t\n",
|
||||
" max_freq=max_freq,\t\n",
|
||||
" specgram_type=specgram_type,\t\n",
|
||||
" use_dB_normalization=use_dB_normalization,\t\n",
|
||||
" random_seed=random_seed,\t\n",
|
||||
" keep_transcription_text=keep_transcription_text)\t\n",
|
||||
"\n",
|
||||
" if dist:\t\n",
|
||||
" batch_sampler = DeepSpeech2DistributedBatchSampler(\t\n",
|
||||
" dataset,\t\n",
|
||||
" batch_size,\t\n",
|
||||
" num_replicas=None,\t\n",
|
||||
" rank=None,\t\n",
|
||||
" shuffle=is_training,\t\n",
|
||||
" drop_last=is_training,\t\n",
|
||||
" sortagrad=is_training,\t\n",
|
||||
" shuffle_method=shuffle_method)\t\n",
|
||||
" else:\t\n",
|
||||
" batch_sampler = DeepSpeech2BatchSampler(\t\n",
|
||||
" dataset,\t\n",
|
||||
" shuffle=is_training,\t\n",
|
||||
" batch_size=batch_size,\t\n",
|
||||
" drop_last=is_training,\t\n",
|
||||
" sortagrad=is_training,\t\n",
|
||||
" shuffle_method=shuffle_method)\t\n",
|
||||
"\n",
|
||||
" def padding_batch(batch, padding_to=-1, flatten=False, is_training=True):\t\n",
|
||||
" \"\"\"\t\n",
|
||||
" Padding audio features with zeros to make them have the same shape (or\t\n",
|
||||
" a user-defined shape) within one batch.\t\n",
|
||||
"\n",
|
||||
" If ``padding_to`` is -1, the maximum shape in the batch will be used\t\n",
|
||||
" as the target shape for padding. Otherwise, `padding_to` will be the\t\n",
|
||||
" target shape (only refers to the second axis).\t\n",
|
||||
"\n",
|
||||
" If `flatten` is True, features will be flattened to a 1-D array.\t\n",
|
||||
" \"\"\"\t\n",
|
||||
" new_batch = []\t\n",
|
||||
" # get target shape\t\n",
|
||||
" max_length = max([audio.shape[1] for audio, text in batch])\t\n",
|
||||
" if padding_to != -1:\t\n",
|
||||
" if padding_to < max_length:\t\n",
|
||||
" raise ValueError(\"If padding_to is not -1, it should be larger \"\t\n",
|
||||
" \"than any instance's shape in the batch\")\t\n",
|
||||
" max_length = padding_to\t\n",
|
||||
" max_text_length = max([len(text) for audio, text in batch])\t\n",
|
||||
" # padding\t\n",
|
||||
" padded_audios = []\t\n",
|
||||
" audio_lens = []\t\n",
|
||||
" texts, text_lens = [], []\t\n",
|
||||
" for audio, text in batch:\t\n",
|
||||
" padded_audio = np.zeros([audio.shape[0], max_length])\t\n",
|
||||
" padded_audio[:, :audio.shape[1]] = audio\t\n",
|
||||
" if flatten:\t\n",
|
||||
" padded_audio = padded_audio.flatten()\t\n",
|
||||
" padded_audios.append(padded_audio)\t\n",
|
||||
" audio_lens.append(audio.shape[1])\t\n",
|
||||
"\n",
|
||||
" padded_text = np.zeros([max_text_length])\n",
|
||||
" if is_training:\n",
|
||||
" padded_text[:len(text)] = text\t# ids\n",
|
||||
" else:\n",
|
||||
" padded_text[:len(text)] = [ord(t) for t in text] # string\n",
|
||||
" \n",
|
||||
" texts.append(padded_text)\t\n",
|
||||
" text_lens.append(len(text))\t\n",
|
||||
"\n",
|
||||
" padded_audios = np.array(padded_audios).astype('float32')\t\n",
|
||||
" audio_lens = np.array(audio_lens).astype('int64')\t\n",
|
||||
" texts = np.array(texts).astype('int32')\t\n",
|
||||
" text_lens = np.array(text_lens).astype('int64')\t\n",
|
||||
" return padded_audios, texts, audio_lens, text_lens\t\n",
|
||||
"\n",
|
||||
" loader = DataLoader(\t\n",
|
||||
" dataset,\t\n",
|
||||
" batch_sampler=batch_sampler,\t\n",
|
||||
" collate_fn=partial(padding_batch, is_training=is_training),\t\n",
|
||||
" num_workers=num_workers)\t\n",
|
||||
" return loader"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 21,
|
||||
"id": "naval-brave",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"{'num_samples': 5, 'beam_size': 500, 'num_proc_bsearch': 8, 'num_conv_layers': 2, 'num_rnn_layers': 3, 'rnn_layer_size': 2048, 'alpha': 2.5, 'beta': 0.3, 'cutoff_prob': 1.0, 'cutoff_top_n': 40, 'use_gru': False, 'use_gpu': True, 'share_rnn_weights': True, 'infer_manifest': 'examples/aishell/data/manifest.dev', 'mean_std_path': 'examples/aishell/data/mean_std.npz', 'vocab_path': 'examples/aishell/data/vocab.txt', 'lang_model_path': 'models/lm/common_crawl_00.prune01111.trie.klm', 'model_path': 'examples/aishell/checkpoints/step_final', 'decoding_method': 'ctc_beam_search', 'error_rate_type': 'wer', 'specgram_type': 'linear'}\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import sys\n",
|
||||
"import argparse\n",
|
||||
"import functools\n",
|
||||
"from utils.utility import add_arguments, print_arguments\n",
|
||||
"parser = argparse.ArgumentParser(description=__doc__)\n",
|
||||
"add_arg = functools.partial(add_arguments, argparser=parser)\n",
|
||||
"# yapf: disable\n",
|
||||
"add_arg('num_samples', int, 5, \"# of samples to infer.\")\n",
|
||||
"add_arg('beam_size', int, 500, \"Beam search width.\")\n",
|
||||
"add_arg('num_proc_bsearch', int, 8, \"# of CPUs for beam search.\")\n",
|
||||
"add_arg('num_conv_layers', int, 2, \"# of convolution layers.\")\n",
|
||||
"add_arg('num_rnn_layers', int, 3, \"# of recurrent layers.\")\n",
|
||||
"add_arg('rnn_layer_size', int, 2048, \"# of recurrent cells per layer.\")\n",
|
||||
"add_arg('alpha', float, 2.5, \"Coef of LM for beam search.\")\n",
|
||||
"add_arg('beta', float, 0.3, \"Coef of WC for beam search.\")\n",
|
||||
"add_arg('cutoff_prob', float, 1.0, \"Cutoff probability for pruning.\")\n",
|
||||
"add_arg('cutoff_top_n', int, 40, \"Cutoff number for pruning.\")\n",
|
||||
"add_arg('use_gru', bool, False, \"Use GRUs instead of simple RNNs.\")\n",
|
||||
"add_arg('use_gpu', bool, True, \"Use GPU or not.\")\n",
|
||||
"add_arg('share_rnn_weights',bool, True, \"Share input-hidden weights across \"\n",
|
||||
" \"bi-directional RNNs. Not for GRU.\")\n",
|
||||
"add_arg('infer_manifest', str,\n",
|
||||
" 'examples/aishell/data/manifest.dev',\n",
|
||||
" \"Filepath of manifest to infer.\")\n",
|
||||
"add_arg('mean_std_path', str,\n",
|
||||
" 'examples/aishell/data/mean_std.npz',\n",
|
||||
" \"Filepath of normalizer's mean & std.\")\n",
|
||||
"add_arg('vocab_path', str,\n",
|
||||
" 'examples/aishell/data/vocab.txt',\n",
|
||||
" \"Filepath of vocabulary.\")\n",
|
||||
"add_arg('lang_model_path', str,\n",
|
||||
" 'models/lm/common_crawl_00.prune01111.trie.klm',\n",
|
||||
" \"Filepath for language model.\")\n",
|
||||
"add_arg('model_path', str,\n",
|
||||
" 'examples/aishell/checkpoints/step_final',\n",
|
||||
" \"If None, the training starts from scratch, \"\n",
|
||||
" \"otherwise, it resumes from the pre-trained model.\")\n",
|
||||
"add_arg('decoding_method', str,\n",
|
||||
" 'ctc_beam_search',\n",
|
||||
" \"Decoding method. Options: ctc_beam_search, ctc_greedy\",\n",
|
||||
" choices = ['ctc_beam_search', 'ctc_greedy'])\n",
|
||||
"add_arg('error_rate_type', str,\n",
|
||||
" 'wer',\n",
|
||||
" \"Error rate type for evaluation.\",\n",
|
||||
" choices=['wer', 'cer'])\n",
|
||||
"add_arg('specgram_type', str,\n",
|
||||
" 'linear',\n",
|
||||
" \"Audio feature type. Options: linear, mfcc.\",\n",
|
||||
" choices=['linear', 'mfcc'])\n",
|
||||
"# yapf: enable\n",
|
||||
"args = parser.parse_args([])\n",
|
||||
"print(vars(args))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 22,
|
||||
"id": "bearing-physics",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"batch_reader = create_dataloader(\n",
|
||||
" manifest_path=args.infer_manifest,\n",
|
||||
" vocab_filepath=args.vocab_path,\n",
|
||||
" mean_std_filepath=args.mean_std_path,\n",
|
||||
" augmentation_config='{}',\n",
|
||||
" #max_duration=float('inf'),\n",
|
||||
" max_duration=27.0,\n",
|
||||
" min_duration=0.0,\n",
|
||||
" stride_ms=10.0,\n",
|
||||
" window_ms=20.0,\n",
|
||||
" max_freq=None,\n",
|
||||
" specgram_type=args.specgram_type,\n",
|
||||
" use_dB_normalization=True,\n",
|
||||
" random_seed=0,\n",
|
||||
" keep_transcription_text=True,\n",
|
||||
" is_training=False,\n",
|
||||
" batch_size=args.num_samples,\n",
|
||||
" sortagrad=True,\n",
|
||||
" shuffle_method=None,\n",
|
||||
" dist=False)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 30,
|
||||
"id": "classified-melissa",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"test Tensor(shape=[5, 6], dtype=int32, place=CUDAPinnedPlace, stop_gradient=True,\n",
|
||||
" [[22823, 26102, 20195, 37324, 0 , 0 ],\n",
|
||||
" [22238, 26469, 23601, 22909, 0 , 0 ],\n",
|
||||
" [20108, 26376, 22235, 26085, 0 , 0 ],\n",
|
||||
" [36824, 35201, 20445, 25345, 32654, 24863],\n",
|
||||
" [29042, 27748, 21463, 23456, 0 , 0 ]])\n",
|
||||
"test raw 大时代里\n",
|
||||
"test raw 煲汤受宠\n",
|
||||
"audio len Tensor(shape=[5], dtype=int64, place=CUDAPinnedPlace, stop_gradient=True,\n",
|
||||
" [163, 167, 180, 186, 186])\n",
|
||||
"test len Tensor(shape=[5], dtype=int64, place=CUDAPlace(0), stop_gradient=True,\n",
|
||||
" [4, 4, 4, 6, 4])\n",
|
||||
"audio Tensor(shape=[5, 161, 186], dtype=float32, place=CUDAPinnedPlace, stop_gradient=True,\n",
|
||||
" [[[ 1.11669052, 0.79015088, 0.93658292, ..., 0. , 0. , 0. ],\n",
|
||||
" [ 0.83549136, 0.72643483, 0.83578080, ..., 0. , 0. , 0. ],\n",
|
||||
" [-0.89155018, -0.18894747, -0.53357804, ..., 0. , 0. , 0. ],\n",
|
||||
" ...,\n",
|
||||
" [ 0.33386710, -0.81240511, 0.12869737, ..., 0. , 0. , 0. ],\n",
|
||||
" [-0.17537928, 0.58380985, 0.70696265, ..., 0. , 0. , 0. ],\n",
|
||||
" [-0.84175998, 1.22041416, 0.07929770, ..., 0. , 0. , 0. ]],\n",
|
||||
"\n",
|
||||
" [[-0.35964420, 0.77392709, 0.71409988, ..., 0. , 0. , 0. ],\n",
|
||||
" [-0.15990183, 0.42962283, 0.06222462, ..., 0. , 0. , 0. ],\n",
|
||||
" [-0.31166190, -0.74864638, -0.52836996, ..., 0. , 0. , 0. ],\n",
|
||||
" ...,\n",
|
||||
" [-0.27546275, 0.32889456, 0.12410031, ..., 0. , 0. , 0. ],\n",
|
||||
" [ 0.16264282, 0.49418071, -0.15960945, ..., 0. , 0. , 0. ],\n",
|
||||
" [ 0.12476666, 0.00516864, 1.16021466, ..., 0. , 0. , 0. ]],\n",
|
||||
"\n",
|
||||
" [[ 0.90202141, 1.48541915, 0.92062062, ..., 0. , 0. , 0. ],\n",
|
||||
" [ 0.82661545, 1.37171340, 0.86746097, ..., 0. , 0. , 0. ],\n",
|
||||
" [-0.62287915, -0.48645937, 0.35041964, ..., 0. , 0. , 0. ],\n",
|
||||
" ...,\n",
|
||||
" [ 0.07376949, 0.07138316, 0.76355994, ..., 0. , 0. , 0. ],\n",
|
||||
" [-0.32306790, 0.43247896, 1.27311838, ..., 0. , 0. , 0. ],\n",
|
||||
" [-0.97667056, 0.60747612, 0.79181534, ..., 0. , 0. , 0. ]],\n",
|
||||
"\n",
|
||||
" [[ 0.72022128, 0.95428467, 0.92766261, ..., 0.29105374, -0.45564806, -0.62151009],\n",
|
||||
" [ 0.42083180, 0.49279949, 0.82724041, ..., -0.17333922, -1.45363355, -0.61673522],\n",
|
||||
" [-0.76116520, -0.84750438, -0.09512503, ..., -1.01497340, -1.42781055, -0.80859023],\n",
|
||||
" ...,\n",
|
||||
" [-0.23009977, 1.06155431, 1.09065628, ..., 0.25581080, 0.53794998, -1.22650719],\n",
|
||||
" [-1.37693381, 0.30778193, 0.17152318, ..., 0.51650339, 0.25580606, 0.83097816],\n",
|
||||
" [-1.62180591, 1.30567718, 1.09928656, ..., -0.77590007, 1.27712476, 0.53189957]],\n",
|
||||
"\n",
|
||||
" [[ 1.03205252, -0.51535392, 0.21077573, ..., 0.76618457, 1.27425683, 1.52250278],\n",
|
||||
" [ 0.82059991, 0.43990925, 0.13090958, ..., 0.86662549, 1.01687658, 1.48495352],\n",
|
||||
" [-0.75489789, -0.01997089, -0.65174174, ..., 0.09061214, -0.55211234, -0.01614586],\n",
|
||||
" ...,\n",
|
||||
" [ 0.50985396, 1.84555030, 0.79185146, ..., 1.13666189, 1.19898069, 1.98158395],\n",
|
||||
" [ 1.98721015, 2.52385354, 1.11714780, ..., 0.19416514, 1.11329341, 0.64460152],\n",
|
||||
" [ 2.69512844, 1.90993905, 0.50245082, ..., -0.50902629, 0.03333465, -1.24584770]]])\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"for idx, (audio, audio_len, text, text_len) in enumerate(batch_reader()):\n",
|
||||
" print('test', text)\n",
|
||||
" print(\"test raw\", ''.join( chr(i) for i in text[0][:int(text_len[0])] ))\n",
|
||||
" print(\"test raw\", ''.join( chr(i) for i in text[-1][:int(text_len[-1])] ))\n",
|
||||
" print('audio len', audio_len)\n",
|
||||
" print('test len', text_len)\n",
|
||||
" print('audio', audio)\n",
|
||||
" break"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "unexpected-skating",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "minus-modern",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.7.0"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -1,290 +0,0 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "breeding-haven",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/home/ssd5/zhanghui/DeepSpeech2.x\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"'/home/ssd5/zhanghui/DeepSpeech2.x'"
|
||||
]
|
||||
},
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"%cd ..\n",
|
||||
"%pwd"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "appropriate-theta",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"LICENSE deepspeech examples\t\t requirements.txt tools\r\n",
|
||||
"README.md docs\t libsndfile-1.0.28\t setup.sh\t utils\r\n",
|
||||
"README_cn.md env.sh\t libsndfile-1.0.28.tar.gz tests\r\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"!ls"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "entire-bloom",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/home/ssd5/zhanghui/DeepSpeech2.x/tools/venv/lib/python3.7/site-packages/paddle/fluid/layers/utils.py:26: DeprecationWarning: `np.int` is a deprecated alias for the builtin `int`. To silence this warning, use `int` by itself. Doing this will not modify any behavior and is safe. When replacing `np.int`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.\n",
|
||||
"Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
|
||||
" def convert_to_list(value, n, name, dtype=np.int):\n",
|
||||
"WARNING:root:override cat of paddle.Tensor if exists or register, remove this when fixed!\n",
|
||||
"WARNING:root:register user masked_fill to paddle.Tensor, remove this when fixed!\n",
|
||||
"WARNING:root:register user masked_fill_ to paddle.Tensor, remove this when fixed!\n",
|
||||
"WARNING:root:register user repeat to paddle.Tensor, remove this when fixed!\n",
|
||||
"WARNING:root:register user glu to paddle.nn.functional, remove this when fixed!\n",
|
||||
"WARNING:root:register user GLU to paddle.nn, remove this when fixed!\n",
|
||||
"WARNING:root:register user ConstantPad2d to paddle.nn, remove this when fixed!\n",
|
||||
"WARNING:root:override ctc_loss of paddle.nn.functional if exists, remove this when fixed!\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from deepspeech.modules import loss"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "governmental-aircraft",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/home/ssd5/zhanghui/DeepSpeech2.x/tools/venv/lib/python3.7/site-packages/ipykernel/ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.\n",
|
||||
" and should_run_async(code)\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import paddle"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"id": "proprietary-disaster",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"<function deepspeech.modules.repeat(xs: paddle.VarBase, *size: Any) -> paddle.VarBase>"
|
||||
]
|
||||
},
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"paddle.Tensor.repeat"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"id": "first-diagram",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"<property at 0x7fb515eeeb88>"
|
||||
]
|
||||
},
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"paddle.Tensor.size"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"id": "intelligent-david",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"<function paddle.tensor.manipulation.concat(x, axis=0, name=None)>"
|
||||
]
|
||||
},
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"paddle.Tensor.cat"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"id": "bronze-tenant",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"a = paddle.to_tensor([12,32, 10, 12, 123,32 ,4])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 13,
|
||||
"id": "balanced-bearing",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"7"
|
||||
]
|
||||
},
|
||||
"execution_count": 13,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"a.size"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 20,
|
||||
"id": "extreme-republic",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def size(xs: paddle.Tensor, *args: int) -> paddle.Tensor:\n",
|
||||
" nargs = len(args)\n",
|
||||
" assert (nargs <= 1)\n",
|
||||
" s = paddle.shape(xs)\n",
|
||||
" if nargs == 1:\n",
|
||||
" return s[args[0]]\n",
|
||||
" else:\n",
|
||||
" return s\n",
|
||||
"\n",
|
||||
"# logger.warn(\n",
|
||||
"# \"override size of paddle.Tensor if exists or register, remove this when fixed!\"\n",
|
||||
"# )\n",
|
||||
"paddle.Tensor.size = size"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 21,
|
||||
"id": "gross-addiction",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"Tensor(shape=[1], dtype=int32, place=CPUPlace, stop_gradient=True,\n",
|
||||
" [7])"
|
||||
]
|
||||
},
|
||||
"execution_count": 21,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"a.size(0)\n",
|
||||
"a.size()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 22,
|
||||
"id": "adverse-dining",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"Tensor(shape=[1], dtype=int32, place=CPUPlace, stop_gradient=True,\n",
|
||||
" [7])"
|
||||
]
|
||||
},
|
||||
"execution_count": 22,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"a.size()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "popular-potato",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.7.0"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
@ -1,229 +0,0 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 32,
|
||||
"id": "academic-surname",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import paddle\n",
|
||||
"from paddle import nn"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 33,
|
||||
"id": "fundamental-treasure",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Parameter containing:\n",
|
||||
"Tensor(shape=[256], dtype=float32, place=CUDAPlace(0), stop_gradient=False,\n",
|
||||
" [1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])\n",
|
||||
"Parameter containing:\n",
|
||||
"Tensor(shape=[256], dtype=float32, place=CUDAPlace(0), stop_gradient=False,\n",
|
||||
" [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"L = nn.LayerNorm(256, epsilon=1e-12)\n",
|
||||
"for p in L.parameters():\n",
|
||||
" print(p)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 34,
|
||||
"id": "consolidated-elephant",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import numpy as np\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 46,
|
||||
"id": "moderate-noise",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"float64\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"x = np.random.randn(2, 51, 256)\n",
|
||||
"print(x.dtype)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 47,
|
||||
"id": "cooked-progressive",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"y = L(paddle.to_tensor(x, dtype='float32'))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 48,
|
||||
"id": "optimum-milwaukee",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import torch"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 49,
|
||||
"id": "viral-indian",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Parameter containing:\n",
|
||||
"tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n",
|
||||
" 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n",
|
||||
" 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n",
|
||||
" 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n",
|
||||
" 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n",
|
||||
" 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n",
|
||||
" 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n",
|
||||
" 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n",
|
||||
" 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n",
|
||||
" 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n",
|
||||
" 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n",
|
||||
" 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n",
|
||||
" 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n",
|
||||
" 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n",
|
||||
" 1., 1., 1., 1.], requires_grad=True)\n",
|
||||
"Parameter containing:\n",
|
||||
"tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
|
||||
" 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
|
||||
" 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
|
||||
" 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
|
||||
" 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
|
||||
" 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
|
||||
" 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
|
||||
" 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
|
||||
" 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
|
||||
" 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
|
||||
" 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],\n",
|
||||
" requires_grad=True)\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"TL = torch.nn.LayerNorm(256, eps=1e-12)\n",
|
||||
"for p in TL.parameters():\n",
|
||||
" print(p)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 50,
|
||||
"id": "skilled-vietnamese",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"ty = TL(torch.tensor(x, dtype=torch.float32))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 51,
|
||||
"id": "incorrect-allah",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"False"
|
||||
]
|
||||
},
|
||||
"execution_count": 51,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"np.allclose(y.numpy(), ty.detach().numpy())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "prostate-cameroon",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 52,
|
||||
"id": "governmental-surge",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"True"
|
||||
]
|
||||
},
|
||||
"execution_count": 52,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"x = np.random.randn(2, 256)\n",
|
||||
"y = L(paddle.to_tensor(x, dtype='float32'))\n",
|
||||
"ty = TL(torch.tensor(x, dtype=torch.float32))\n",
|
||||
"np.allclose(y.numpy(), ty.detach().numpy())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "confidential-jacket",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.7.0"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
@ -1,449 +0,0 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "primary-organic",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import torch"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 38,
|
||||
"id": "stopped-semester",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def mask_finished_scores(score: torch.Tensor,\n",
|
||||
" flag: torch.Tensor) -> torch.Tensor:\n",
|
||||
" \"\"\"\n",
|
||||
" If a sequence is finished, we only allow one alive branch. This function\n",
|
||||
" aims to give one branch a zero score and the rest -inf score.\n",
|
||||
" Args:\n",
|
||||
" score (torch.Tensor): A real value array with shape\n",
|
||||
" (batch_size * beam_size, beam_size).\n",
|
||||
" flag (torch.Tensor): A bool array with shape\n",
|
||||
" (batch_size * beam_size, 1).\n",
|
||||
" Returns:\n",
|
||||
" torch.Tensor: (batch_size * beam_size, beam_size).\n",
|
||||
" \"\"\"\n",
|
||||
" beam_size = score.size(-1)\n",
|
||||
" zero_mask = torch.zeros_like(flag, dtype=torch.bool)\n",
|
||||
" if beam_size > 1:\n",
|
||||
" unfinished = torch.cat((zero_mask, flag.repeat([1, beam_size - 1])),\n",
|
||||
" dim=1)\n",
|
||||
" finished = torch.cat((flag, zero_mask.repeat([1, beam_size - 1])),\n",
|
||||
" dim=1)\n",
|
||||
" else:\n",
|
||||
" unfinished = zero_mask\n",
|
||||
" finished = flag\n",
|
||||
" print(unfinished)\n",
|
||||
" print(finished)\n",
|
||||
" score.masked_fill_(unfinished, -float('inf'))\n",
|
||||
" score.masked_fill_(finished, 0)\n",
|
||||
" return score"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 58,
|
||||
"id": "agreed-portuguese",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"tensor([[ True],\n",
|
||||
" [False]])\n",
|
||||
"tensor([[-0.8841, 0.7381, -0.9986],\n",
|
||||
" [ 0.2675, -0.7971, 0.3798]])\n",
|
||||
"tensor([[ True, True],\n",
|
||||
" [False, False]])\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"score = torch.randn((2, 3))\n",
|
||||
"flag = torch.ones((2, 1), dtype=torch.bool)\n",
|
||||
"flag[1] = False\n",
|
||||
"print(flag)\n",
|
||||
"print(score)\n",
|
||||
"print(flag.repeat([1, 2]))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 59,
|
||||
"id": "clean-aspect",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"tensor([[False, True, True],\n",
|
||||
" [False, False, False]])\n",
|
||||
"tensor([[ True, False, False],\n",
|
||||
" [False, False, False]])\n",
|
||||
"tensor([[ 0.0000, -inf, -inf],\n",
|
||||
" [ 0.2675, -0.7971, 0.3798]])\n",
|
||||
"tensor([[ 0.0000, -inf, -inf],\n",
|
||||
" [ 0.2675, -0.7971, 0.3798]])\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"r = mask_finished_scores(score, flag)\n",
|
||||
"print(r)\n",
|
||||
"print(score)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 55,
|
||||
"id": "thrown-airline",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Tensor(shape=[2, 1], dtype=bool, place=CUDAPlace(0), stop_gradient=True,\n",
|
||||
" [[True ],\n",
|
||||
" [False]])\n",
|
||||
"Tensor(shape=[2, 3], dtype=float32, place=CUDAPlace(0), stop_gradient=True,\n",
|
||||
" [[ 2.05994511, 1.87704289, 0.01988174],\n",
|
||||
" [-0.40165186, 0.77547729, -0.64469045]])\n",
|
||||
"Tensor(shape=[2, 2], dtype=bool, place=CUDAPlace(0), stop_gradient=True,\n",
|
||||
" [[True , True ],\n",
|
||||
" [False, False]])\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import paddle\n",
|
||||
"\n",
|
||||
"score = paddle.randn((2, 3))\n",
|
||||
"flag = paddle.ones((2, 1), dtype='bool')\n",
|
||||
"flag[1] = False\n",
|
||||
"print(flag)\n",
|
||||
"print(score)\n",
|
||||
"print(flag.tile([1, 2]))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 56,
|
||||
"id": "internal-patent",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Tensor(shape=[2, 3], dtype=bool, place=CUDAPlace(0), stop_gradient=True,\n",
|
||||
" [[False, True , True ],\n",
|
||||
" [False, False, False]])\n",
|
||||
"Tensor(shape=[2, 3], dtype=bool, place=CUDAPlace(0), stop_gradient=True,\n",
|
||||
" [[True , False, False],\n",
|
||||
" [False, False, False]])\n",
|
||||
"x Tensor(shape=[2, 3], dtype=float32, place=CUDAPlace(0), stop_gradient=True,\n",
|
||||
" [[ 2.05994511, 1.87704289, 0.01988174],\n",
|
||||
" [-0.40165186, 0.77547729, -0.64469045]])\n",
|
||||
"2 Tensor(shape=[2, 3], dtype=float32, place=CUDAPlace(0), stop_gradient=True,\n",
|
||||
" [[ 2.05994511, 1.87704289, 0.01988174],\n",
|
||||
" [-0.40165186, 0.77547729, -0.64469045]])\n",
|
||||
"3 Tensor(shape=[2, 3], dtype=float32, place=CUDAPlace(0), stop_gradient=True,\n",
|
||||
" [[ 2.05994511, -inf. , -inf. ],\n",
|
||||
" [-0.40165186, 0.77547729, -0.64469045]])\n",
|
||||
"x Tensor(shape=[2, 3], dtype=float32, place=CUDAPlace(0), stop_gradient=True,\n",
|
||||
" [[ 2.05994511, -inf. , -inf. ],\n",
|
||||
" [-0.40165186, 0.77547729, -0.64469045]])\n",
|
||||
"2 Tensor(shape=[2, 3], dtype=float32, place=CUDAPlace(0), stop_gradient=True,\n",
|
||||
" [[ 2.05994511, -inf. , -inf. ],\n",
|
||||
" [-0.40165186, 0.77547729, -0.64469045]])\n",
|
||||
"3 Tensor(shape=[2, 3], dtype=float32, place=CUDAPlace(0), stop_gradient=True,\n",
|
||||
" [[ 0. , -inf. , -inf. ],\n",
|
||||
" [-0.40165186, 0.77547729, -0.64469045]])\n",
|
||||
"Tensor(shape=[2, 3], dtype=float32, place=CUDAPlace(0), stop_gradient=True,\n",
|
||||
" [[ 0. , -inf. , -inf. ],\n",
|
||||
" [-0.40165186, 0.77547729, -0.64469045]])\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"paddle.bool = 'bool'\n",
|
||||
"\n",
|
||||
"def masked_fill(xs:paddle.Tensor, mask:paddle.Tensor, value:float):\n",
|
||||
" print(xs)\n",
|
||||
" trues = paddle.ones_like(xs) * value\n",
|
||||
" assert xs.shape == mask.shape\n",
|
||||
" xs = paddle.where(mask, trues, xs)\n",
|
||||
" return xs\n",
|
||||
"\n",
|
||||
"def masked_fill_(xs:paddle.Tensor, mask:paddle.Tensor, value:float):\n",
|
||||
" print('x', xs)\n",
|
||||
" trues = paddle.ones_like(xs) * value\n",
|
||||
" assert xs.shape == mask.shape\n",
|
||||
" ret = paddle.where(mask, trues, xs)\n",
|
||||
" print('2', xs)\n",
|
||||
" paddle.assign(ret, output=xs)\n",
|
||||
" print('3', xs)\n",
|
||||
"\n",
|
||||
"paddle.Tensor.masked_fill = masked_fill\n",
|
||||
"paddle.Tensor.masked_fill_ = masked_fill_\n",
|
||||
"\n",
|
||||
"def mask_finished_scores_pd(score: paddle.Tensor,\n",
|
||||
" flag: paddle.Tensor) -> paddle.Tensor:\n",
|
||||
" \"\"\"\n",
|
||||
" If a sequence is finished, we only allow one alive branch. This function\n",
|
||||
" aims to give one branch a zero score and the rest -inf score.\n",
|
||||
" Args:\n",
|
||||
" score (paddle.Tensor): A real value array with shape\n",
|
||||
" (batch_size * beam_size, beam_size).\n",
|
||||
" flag (paddle.Tensor): A bool array with shape\n",
|
||||
" (batch_size * beam_size, 1).\n",
|
||||
" Returns:\n",
|
||||
" paddle.Tensor: (batch_size * beam_size, beam_size).\n",
|
||||
" \"\"\"\n",
|
||||
" beam_size = score.shape[-1]\n",
|
||||
" zero_mask = paddle.zeros_like(flag, dtype=paddle.bool)\n",
|
||||
" if beam_size > 1:\n",
|
||||
" unfinished = paddle.concat((zero_mask, flag.tile([1, beam_size - 1])),\n",
|
||||
" axis=1)\n",
|
||||
" finished = paddle.concat((flag, zero_mask.tile([1, beam_size - 1])),\n",
|
||||
" axis=1)\n",
|
||||
" else:\n",
|
||||
" unfinished = zero_mask\n",
|
||||
" finished = flag\n",
|
||||
" print(unfinished)\n",
|
||||
" print(finished)\n",
|
||||
" \n",
|
||||
" #score.masked_fill_(unfinished, -float('inf'))\n",
|
||||
" #score.masked_fill_(finished, 0)\n",
|
||||
"# infs = paddle.ones_like(score) * -float('inf')\n",
|
||||
"# score = paddle.where(unfinished, infs, score)\n",
|
||||
"# score = paddle.where(finished, paddle.zeros_like(score), score)\n",
|
||||
"\n",
|
||||
"# score = score.masked_fill(unfinished, -float('inf'))\n",
|
||||
"# score = score.masked_fill(finished, 0)\n",
|
||||
" score.masked_fill_(unfinished, -float('inf'))\n",
|
||||
" score.masked_fill_(finished, 0)\n",
|
||||
" return score\n",
|
||||
"\n",
|
||||
"r = mask_finished_scores_pd(score, flag)\n",
|
||||
"print(r)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 57,
|
||||
"id": "vocal-prime",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"<bound method PyCapsule.value of Tensor(shape=[2, 3], dtype=float32, place=CUDAPlace(0), stop_gradient=True,\n",
|
||||
" [[ 0. , -inf. , -inf. ],\n",
|
||||
" [-0.40165186, 0.77547729, -0.64469045]])>"
|
||||
]
|
||||
},
|
||||
"execution_count": 57,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"score.value"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 71,
|
||||
"id": "bacterial-adolescent",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from typing import Union, Any"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 72,
|
||||
"id": "absent-fiber",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def repeat(xs : paddle.Tensor, *size: Any):\n",
|
||||
" print(size)\n",
|
||||
" return paddle.tile(xs, size)\n",
|
||||
"paddle.Tensor.repeat = repeat"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 73,
|
||||
"id": "material-harbor",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"(1, 2)\n",
|
||||
"Tensor(shape=[2, 2], dtype=bool, place=CUDAPlace(0), stop_gradient=True,\n",
|
||||
" [[True , True ],\n",
|
||||
" [False, False]])\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"flag = paddle.ones((2, 1), dtype='bool')\n",
|
||||
"flag[1] = False\n",
|
||||
"print(flag.repeat(1, 2))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 84,
|
||||
"id": "acute-brighton",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"(Tensor(shape=[1], dtype=int64, place=CUDAPlace(0), stop_gradient=True,\n",
|
||||
" [1]), 2)\n",
|
||||
"Tensor(shape=[2, 2], dtype=bool, place=CUDAPlace(0), stop_gradient=True,\n",
|
||||
" [[True , True ],\n",
|
||||
" [False, False]])\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"flag = paddle.ones((2, 1), dtype='bool')\n",
|
||||
"flag[1] = False\n",
|
||||
"print(flag.repeat(paddle.to_tensor(1), 2))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 85,
|
||||
"id": "european-rugby",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def size(xs, *args: int):\n",
|
||||
" nargs = len(args)\n",
|
||||
" s = paddle.shape(xs)\n",
|
||||
" assert(nargs <= 1)\n",
|
||||
" if nargs == 1:\n",
|
||||
" return s[args[0]]\n",
|
||||
" else:\n",
|
||||
" return s\n",
|
||||
"paddle.Tensor.size = size"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 86,
|
||||
"id": "moral-special",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"Tensor(shape=[2], dtype=int32, place=CPUPlace, stop_gradient=True,\n",
|
||||
" [2, 1])"
|
||||
]
|
||||
},
|
||||
"execution_count": 86,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"flag.size()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 87,
|
||||
"id": "ahead-coach",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"Tensor(shape=[1], dtype=int32, place=CPUPlace, stop_gradient=True,\n",
|
||||
" [1])"
|
||||
]
|
||||
},
|
||||
"execution_count": 87,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"flag.size(1)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 88,
|
||||
"id": "incomplete-fitness",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"Tensor(shape=[1], dtype=int32, place=CPUPlace, stop_gradient=True,\n",
|
||||
" [2])"
|
||||
]
|
||||
},
|
||||
"execution_count": 88,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"flag.size(0)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "upset-connectivity",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.7.0"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
@ -1,231 +0,0 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "designing-borough",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/ipykernel/ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.\n",
|
||||
" and should_run_async(code)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"[[ 0.0000000e+00 0.0000000e+00 0.0000000e+00 ... 0.0000000e+00\n",
|
||||
" 0.0000000e+00 0.0000000e+00]\n",
|
||||
" [ 8.4147096e-01 8.0196178e-01 7.6172036e-01 ... 1.2409373e-04\n",
|
||||
" 1.1547816e-04 1.0746076e-04]\n",
|
||||
" [ 9.0929741e-01 9.5814437e-01 9.8704624e-01 ... 2.4818745e-04\n",
|
||||
" 2.3095631e-04 2.1492151e-04]\n",
|
||||
" ...\n",
|
||||
" [ 3.7960774e-01 7.4510968e-01 7.3418564e-01 ... 1.2036801e-02\n",
|
||||
" 1.1201146e-02 1.0423505e-02]\n",
|
||||
" [-5.7338190e-01 -8.9752287e-02 -4.1488394e-02 ... 1.2160885e-02\n",
|
||||
" 1.1316618e-02 1.0530960e-02]\n",
|
||||
" [-9.9920684e-01 -8.5234123e-01 -7.8794664e-01 ... 1.2284970e-02\n",
|
||||
" 1.1432089e-02 1.0638415e-02]]\n",
|
||||
"True\n",
|
||||
"True\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import torch\n",
|
||||
"import math\n",
|
||||
"import numpy as np\n",
|
||||
"\n",
|
||||
"max_len=100\n",
|
||||
"d_model=256\n",
|
||||
"\n",
|
||||
"pe = torch.zeros(max_len, d_model)\n",
|
||||
"position = torch.arange(0, max_len,\n",
|
||||
" dtype=torch.float32).unsqueeze(1)\n",
|
||||
"toruch_position = position\n",
|
||||
"div_term = torch.exp(\n",
|
||||
" torch.arange(0, d_model, 2, dtype=torch.float32) *\n",
|
||||
" -(math.log(10000.0) / d_model))\n",
|
||||
"tourch_div_term = div_term.cpu().detach().numpy()\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"torhc_sin = torch.sin(position * div_term)\n",
|
||||
"torhc_cos = torch.cos(position * div_term)\n",
|
||||
"print(torhc_sin.cpu().detach().numpy())\n",
|
||||
"np_sin = np.sin((position * div_term).cpu().detach().numpy())\n",
|
||||
"np_cos = np.cos((position * div_term).cpu().detach().numpy())\n",
|
||||
"print(np.allclose(np_sin, torhc_sin.cpu().detach().numpy()))\n",
|
||||
"print(np.allclose(np_cos, torhc_cos.cpu().detach().numpy()))\n",
|
||||
"pe[:, 0::2] = torhc_sin\n",
|
||||
"pe[:, 1::2] = torhc_cos\n",
|
||||
"tourch_pe = pe.cpu().detach().numpy()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"id": "swiss-referral",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"True\n",
|
||||
"True\n",
|
||||
"False\n",
|
||||
"False\n",
|
||||
"False\n",
|
||||
"False\n",
|
||||
"[[ 1. 1. 1. ... 1. 1.\n",
|
||||
" 1. ]\n",
|
||||
" [ 0.5403023 0.59737533 0.6479059 ... 1. 1.\n",
|
||||
" 1. ]\n",
|
||||
" [-0.41614684 -0.28628543 -0.1604359 ... 0.99999994 1.\n",
|
||||
" 1. ]\n",
|
||||
" ...\n",
|
||||
" [-0.92514753 -0.66694194 -0.67894876 ... 0.9999276 0.99993724\n",
|
||||
" 0.9999457 ]\n",
|
||||
" [-0.81928825 -0.9959641 -0.999139 ... 0.99992603 0.999936\n",
|
||||
" 0.99994457]\n",
|
||||
" [ 0.03982088 -0.52298605 -0.6157435 ... 0.99992454 0.9999347\n",
|
||||
" 0.99994344]]\n",
|
||||
"----\n",
|
||||
"[[ 1. 1. 1. ... 1. 1.\n",
|
||||
" 1. ]\n",
|
||||
" [ 0.54030234 0.59737533 0.6479059 ... 1. 1.\n",
|
||||
" 1. ]\n",
|
||||
" [-0.41614684 -0.28628543 -0.1604359 ... 1. 1.\n",
|
||||
" 1. ]\n",
|
||||
" ...\n",
|
||||
" [-0.92514753 -0.66694194 -0.67894876 ... 0.9999276 0.9999373\n",
|
||||
" 0.9999457 ]\n",
|
||||
" [-0.81928825 -0.9959641 -0.999139 ... 0.99992603 0.999936\n",
|
||||
" 0.99994457]\n",
|
||||
" [ 0.03982088 -0.5229861 -0.6157435 ... 0.99992454 0.9999347\n",
|
||||
" 0.99994344]]\n",
|
||||
")))))))\n",
|
||||
"[[ 0.0000000e+00 0.0000000e+00 0.0000000e+00 ... 0.0000000e+00\n",
|
||||
" 0.0000000e+00 0.0000000e+00]\n",
|
||||
" [ 8.4147096e-01 8.0196178e-01 7.6172036e-01 ... 1.2409373e-04\n",
|
||||
" 1.1547816e-04 1.0746076e-04]\n",
|
||||
" [ 9.0929741e-01 9.5814437e-01 9.8704624e-01 ... 2.4818745e-04\n",
|
||||
" 2.3095631e-04 2.1492151e-04]\n",
|
||||
" ...\n",
|
||||
" [ 3.7960774e-01 7.4510968e-01 7.3418564e-01 ... 1.2036801e-02\n",
|
||||
" 1.1201146e-02 1.0423505e-02]\n",
|
||||
" [-5.7338190e-01 -8.9752287e-02 -4.1488394e-02 ... 1.2160885e-02\n",
|
||||
" 1.1316618e-02 1.0530960e-02]\n",
|
||||
" [-9.9920684e-01 -8.5234123e-01 -7.8794664e-01 ... 1.2284970e-02\n",
|
||||
" 1.1432089e-02 1.0638415e-02]]\n",
|
||||
"----\n",
|
||||
"[[ 0.0000000e+00 0.0000000e+00 0.0000000e+00 ... 0.0000000e+00\n",
|
||||
" 0.0000000e+00 0.0000000e+00]\n",
|
||||
" [ 8.4147096e-01 8.0196178e-01 7.6172036e-01 ... 1.2409373e-04\n",
|
||||
" 1.1547816e-04 1.0746076e-04]\n",
|
||||
" [ 9.0929741e-01 9.5814437e-01 9.8704624e-01 ... 2.4818745e-04\n",
|
||||
" 2.3095631e-04 2.1492151e-04]\n",
|
||||
" ...\n",
|
||||
" [ 3.7960774e-01 7.4510968e-01 7.3418564e-01 ... 1.2036801e-02\n",
|
||||
" 1.1201146e-02 1.0423505e-02]\n",
|
||||
" [-5.7338190e-01 -8.9752287e-02 -4.1488394e-02 ... 1.2160885e-02\n",
|
||||
" 1.1316618e-02 1.0530960e-02]\n",
|
||||
" [-9.9920684e-01 -8.5234123e-01 -7.8794664e-01 ... 1.2284970e-02\n",
|
||||
" 1.1432089e-02 1.0638415e-02]]\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import paddle\n",
|
||||
"paddle.set_device('cpu')\n",
|
||||
"ppe = paddle.zeros((max_len, d_model), dtype='float32')\n",
|
||||
"position = paddle.arange(0, max_len,\n",
|
||||
" dtype='float32').unsqueeze(1)\n",
|
||||
"print(np.allclose(position.numpy(), toruch_position))\n",
|
||||
"div_term = paddle.exp(\n",
|
||||
" paddle.arange(0, d_model, 2, dtype='float32') *\n",
|
||||
" -(math.log(10000.0) / d_model))\n",
|
||||
"print(np.allclose(div_term.numpy(), tourch_div_term))\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"p_sin = paddle.sin(position * div_term)\n",
|
||||
"p_cos = paddle.cos(position * div_term)\n",
|
||||
"print(np.allclose(np_sin, p_sin.numpy(), rtol=1.e-6, atol=0))\n",
|
||||
"print(np.allclose(np_cos, p_cos.numpy(), rtol=1.e-6, atol=0))\n",
|
||||
"ppe[:, 0::2] = p_sin\n",
|
||||
"ppe[:, 1::2] = p_cos\n",
|
||||
"print(np.allclose(p_sin.numpy(), torhc_sin.cpu().detach().numpy()))\n",
|
||||
"print(np.allclose(p_cos.numpy(), torhc_cos.cpu().detach().numpy()))\n",
|
||||
"print(p_cos.numpy())\n",
|
||||
"print(\"----\")\n",
|
||||
"print(torhc_cos.cpu().detach().numpy())\n",
|
||||
"print(\")))))))\")\n",
|
||||
"print(p_sin.numpy())\n",
|
||||
"print(\"----\")\n",
|
||||
"print(torhc_sin.cpu().detach().numpy())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "integrated-boards",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"False\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(np.allclose(ppe.numpy(), pe.numpy()))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "flying-reserve",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "revised-divide",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.7.0"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
File diff suppressed because one or more lines are too long
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because one or more lines are too long
File diff suppressed because it is too large
Load Diff
@ -1,51 +0,0 @@
|
||||
[English](README.md)
|
||||
|
||||
# PaddlePaddle Speech to Any toolkit
|
||||
|
||||

|
||||

|
||||

|
||||
|
||||
*DeepSpeech*是一个采用[PaddlePaddle](https://github.com/PaddlePaddle/Paddle)平台的端到端自动语音识别引擎的开源项目,
|
||||
我们的愿景是为语音识别在工业应用和学术研究上,提供易于使用、高效、小型化和可扩展的工具,包括训练、推理以及部署。
|
||||
|
||||
## 特性
|
||||
|
||||
参看 [特性列表](doc/src/feature_list.md)。
|
||||
|
||||
|
||||
## 安装
|
||||
|
||||
在以下环境测试验证过:
|
||||
|
||||
* Ubuntu 16.04
|
||||
* python>=3.7
|
||||
* paddlepaddle>=2.1.2
|
||||
|
||||
参看 [安装](doc/src/install.md)。
|
||||
|
||||
## 开始
|
||||
|
||||
请查看 [开始](doc/src/getting_started.md) 和 [tiny egs](examples/tiny/s0/README.md)。
|
||||
|
||||
## 更多信息
|
||||
|
||||
* [数据处理](doc/src/data_preparation.md)
|
||||
* [数据增强](doc/src/augmentation.md)
|
||||
* [语言模型](doc/src/ngram_lm.md)
|
||||
* [服务部署](doc/src/server.md)
|
||||
* [Benchmark](doc/src/benchmark.md)
|
||||
* [Released Model](doc/src/released_model.md)
|
||||
* [FAQ](doc/src/faq.md)
|
||||
|
||||
## 问题和帮助
|
||||
|
||||
欢迎您在[Github讨论](https://github.com/PaddlePaddle/DeepSpeech/discussions)提交问题,[Github问题](https://github.com/PaddlePaddle/models/issues)中反馈bug。也欢迎您为这个项目做出贡献。
|
||||
|
||||
## License
|
||||
|
||||
DeepSpeech 遵循[Apache-2.0开源协议](./LICENSE)。
|
||||
|
||||
## 感谢
|
||||
|
||||
开发中参考一些优秀的仓库,详情参见 [References](doc/src/reference.md)。
|
@ -1,191 +0,0 @@
|
||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Beam search parameters tuning for DeepSpeech2 model."""
|
||||
import functools
|
||||
import sys
|
||||
|
||||
import numpy as np
|
||||
from paddle.io import DataLoader
|
||||
|
||||
from deepspeech.exps.deepspeech2.config import get_cfg_defaults
|
||||
from deepspeech.io.collator import SpeechCollator
|
||||
from deepspeech.io.dataset import ManifestDataset
|
||||
from deepspeech.models.ds2 import DeepSpeech2Model
|
||||
from deepspeech.training.cli import default_argument_parser
|
||||
from deepspeech.utils import error_rate
|
||||
from deepspeech.utils.utility import add_arguments
|
||||
from deepspeech.utils.utility import print_arguments
|
||||
|
||||
|
||||
def tune(config, args):
    """Tune the decoder parameters alpha and beta incrementally by grid search.

    Candidate values are spaced evenly over ``[args.alpha_from, args.alpha_to]``
    and ``[args.beta_from, args.beta_to]``. Every dev batch is decoded once per
    (alpha, beta) pair, the errors are accumulated per pair, and the running
    best pair is printed after each batch. A full table and the final best
    pair are printed at the end.

    Args:
        config: experiment config. ``config.data`` is modified here (after
            ``defrost()``) to point at the dev manifest, and
            ``config.decoding`` supplies the decoding settings.
        args: parsed command-line arguments. Reads ``num_alphas``,
            ``num_betas``, ``alpha_from``, ``alpha_to``, ``beta_from``,
            ``beta_to``, ``num_batches`` and ``checkpoint_path``.

    Raises:
        ValueError: if ``num_alphas`` or ``num_betas`` is negative, or if
            either is zero (the search grid would be empty).
    """
    if not args.num_alphas >= 0:
        raise ValueError("num_alphas must be non-negative!")
    if not args.num_betas >= 0:
        raise ValueError("num_betas must be non-negative!")
    # An empty grid used to surface much later as an opaque
    # "min() arg is an empty sequence" ValueError; fail early instead.
    if args.num_alphas == 0 or args.num_betas == 0:
        raise ValueError(
            "num_alphas and num_betas must both be positive to build a grid!")

    config.defrost()
    # Select the dev manifest (was misspelled `manfiest`, which left the
    # manifest actually used by the dataset untouched).
    config.data.manifest = config.data.dev_manifest
    config.data.augmentation_config = ""
    config.data.keep_transcription_text = True
    dev_dataset = ManifestDataset.from_config(config)

    valid_loader = DataLoader(
        dev_dataset,
        batch_size=config.data.batch_size,
        shuffle=False,
        drop_last=False,
        collate_fn=SpeechCollator(keep_transcription_text=True))

    model = DeepSpeech2Model.from_pretrained(valid_loader, config,
                                             args.checkpoint_path)
    model.eval()

    # decoders only accept string encoded in utf-8
    vocab_list = valid_loader.dataset.vocab_list
    if config.decoding.error_rate_type == 'cer':
        errors_func = error_rate.char_errors
    else:
        errors_func = error_rate.word_errors

    # create grid for search
    cand_alphas = np.linspace(args.alpha_from, args.alpha_to, args.num_alphas)
    cand_betas = np.linspace(args.beta_from, args.beta_to, args.num_betas)
    params_grid = [(alpha, beta) for alpha in cand_alphas
                   for beta in cand_betas]

    err_sum = [0.0] * len(params_grid)
    err_ave = [0.0] * len(params_grid)

    num_ins, len_refs, cur_batch = 0, 0, 0
    # initialize external scorer
    model.decoder.init_decode(args.alpha_from, args.beta_from,
                              config.decoding.lang_model_path, vocab_list,
                              config.decoding.decoding_method)

    def ordid2token(texts, texts_len):
        """Turn padded sequences of ord() ids back into strings via chr()."""
        trans = []
        for text, n in zip(texts, texts_len):
            n = n.numpy().item()
            ids = text[:n]
            trans.append(''.join([chr(i) for i in ids]))
        return trans

    ## incremental tuning parameters over multiple batches
    print("start tuning ...")
    for infer_data in valid_loader():
        if (args.num_batches >= 0) and (cur_batch >= args.num_batches):
            break

        audio, audio_len, text, text_len = infer_data
        target_transcripts = ordid2token(text, text_len)
        num_ins += audio.shape[0]

        # model infer
        eouts, eouts_len = model.encoder(audio, audio_len)
        probs = model.decoder.softmax(eouts)

        # grid search
        for index, (alpha, beta) in enumerate(params_grid):
            print(f"tuneing: alpha={alpha} beta={beta}")
            result_transcripts = model.decoder.decode_probs(
                probs.numpy(), eouts_len, vocab_list,
                config.decoding.decoding_method,
                config.decoding.lang_model_path, alpha, beta,
                config.decoding.beam_size, config.decoding.cutoff_prob,
                config.decoding.cutoff_top_n, config.decoding.num_proc_bsearch)

            for target, result in zip(target_transcripts, result_transcripts):
                errors, len_ref = errors_func(target, result)
                err_sum[index] += errors

                # Accumulate the reference length of every batch exactly once,
                # while visiting the first grid point. Comparing by index
                # instead of by float value avoids counting the batch several
                # times when more than one grid point equals
                # (alpha_from, beta_from).
                if index == 0:
                    len_refs += len_ref

            err_ave[index] = err_sum[index] / len_refs
            if index % 2 == 0:
                sys.stdout.write('.')
                sys.stdout.flush()
            print("tuneing: one grid done!")

        # output on-line tuning result at the end of current batch
        err_ave_min = min(err_ave)
        min_index = err_ave.index(err_ave_min)
        print("\nBatch %d [%d/?], current opt (alpha, beta) = (%s, %s), "
              " min [%s] = %f" %
              (cur_batch, num_ins, "%.3f" % params_grid[min_index][0],
               "%.3f" % params_grid[min_index][1],
               config.decoding.error_rate_type, err_ave_min))
        cur_batch += 1

    # output WER/CER at every (alpha, beta)
    print("\nFinal %s:\n" % config.decoding.error_rate_type)
    for index in range(len(params_grid)):
        print("(alpha, beta) = (%s, %s), [%s] = %f" %
              ("%.3f" % params_grid[index][0], "%.3f" % params_grid[index][1],
               config.decoding.error_rate_type, err_ave[index]))

    err_ave_min = min(err_ave)
    min_index = err_ave.index(err_ave_min)
    print("\nFinish tuning on %d batches, final opt (alpha, beta) = (%s, %s)" %
          (cur_batch, "%.3f" % params_grid[min_index][0],
           "%.3f" % params_grid[min_index][1]))

    print("finish tuning")
|
||||
|
||||
|
||||
def main(config, args):
    """Script entry point: run the alpha/beta grid search.

    Args:
        config: experiment config, forwarded unchanged to ``tune``.
        args: parsed command-line arguments, forwarded unchanged to ``tune``.
    """
    tune(config, args)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    parser = default_argument_parser()
    add_arg = functools.partial(add_arguments, argparser=parser)

    # (name, type, default, help) of every flag this script adds on top of the
    # default parser, registered in this order.
    extra_flags = (
        ('num_batches', int, -1, "# of batches tuning on. "
         "Default -1, on whole dev set."),
        ('num_alphas', int, 45, "# of alpha candidates for tuning."),
        ('num_betas', int, 8, "# of beta candidates for tuning."),
        ('alpha_from', float, 1.0, "Where alpha starts tuning from."),
        ('alpha_to', float, 3.2, "Where alpha ends tuning with."),
        ('beta_from', float, 0.1, "Where beta starts tuning from."),
        ('beta_to', float, 0.45, "Where beta ends tuning with."),
        ('batch_size', int, 256, "# of samples per batch."),
        ('beam_size', int, 500, "Beam search width."),
        ('num_proc_bsearch', int, 8, "# of CPUs for beam search."),
        ('cutoff_prob', float, 1.0, "Cutoff probability for pruning."),
        ('cutoff_top_n', int, 40, "Cutoff number for pruning."),
    )
    for flag_name, flag_type, flag_default, flag_help in extra_flags:
        add_arg(flag_name, flag_type, flag_default, flag_help)

    args = parser.parse_args()
    print_arguments(args, globals())

    # https://yaml.org/type/float.html
    config = get_cfg_defaults()
    if args.config:
        config.merge_from_file(args.config)
    if args.opts:
        config.merge_from_list(args.opts)

    # Command-line values take precedence over whatever the config files set.
    # Each entry is (config section, option name shared by config and args).
    cli_overrides = (
        ('data', 'batch_size'),
        ('decoding', 'beam_size'),
        ('decoding', 'num_proc_bsearch'),
        ('decoding', 'cutoff_prob'),
        ('decoding', 'cutoff_top_n'),
    )
    for section_name, option_name in cli_overrides:
        setattr(
            getattr(config, section_name), option_name,
            getattr(args, option_name))

    config.freeze()
    print(config)

    if args.dump_config:
        with open(args.dump_config, 'w') as f:
            print(config, file=f)

    main(config, args)
|
@ -0,0 +1,220 @@
|
||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Contains U2 model."""
|
||||
import paddle
|
||||
from paddle import distributed as dist
|
||||
from paddle.io import DataLoader
|
||||
|
||||
from deepspeech.io.collator import SpeechCollator
|
||||
from deepspeech.io.dataset import ManifestDataset
|
||||
from deepspeech.io.sampler import SortagradBatchSampler
|
||||
from deepspeech.io.sampler import SortagradDistributedBatchSampler
|
||||
from deepspeech.models.u2 import U2Evaluator
|
||||
from deepspeech.models.u2 import U2Model
|
||||
from deepspeech.models.u2 import U2Updater
|
||||
from deepspeech.training.extensions.snapshot import Snapshot
|
||||
from deepspeech.training.extensions.visualizer import VisualDL
|
||||
from deepspeech.training.optimizer import OptimizerFactory
|
||||
from deepspeech.training.scheduler import LRSchedulerFactory
|
||||
from deepspeech.training.timer import Timer
|
||||
from deepspeech.training.trainer import Trainer
|
||||
from deepspeech.training.updaters.trainer import Trainer as NewTrainer
|
||||
from deepspeech.utils import layer_tools
|
||||
from deepspeech.utils.log import Log
|
||||
from deepspeech.utils.utility import UpdateConfig
|
||||
|
||||
logger = Log(__name__).getlog()
|
||||
|
||||
|
||||
class U2Trainer(Trainer):
    """Experiment driver that trains a U2 model through the updater-based trainer.

    ``setup_dataloader`` and ``setup_model`` prepare the data pipelines and the
    model/optimizer/scheduler; ``run`` then builds the updater, evaluator and
    trainer extensions and starts training.
    """

    def __init__(self, config, args):
        super().__init__(config, args)

    def setup_dataloader(self):
        """Create the train, valid, test and align dataloaders.

        Works on a defrosted clone of ``self.config`` so the overrides made
        here do not leak into the trainer's own config. The statements are
        order-sensitive: each dataset/collator is built from the config state
        at that point.
        """
        config = self.config.clone()
        config.defrost()
        config.collator.keep_transcription_text = False

        # train/valid dataset, return token ids
        config.data.manifest = config.data.train_manifest
        train_dataset = ManifestDataset.from_config(config)

        config.data.manifest = config.data.dev_manifest
        dev_dataset = ManifestDataset.from_config(config)

        # The train collator is built before augmentation is cleared, the dev
        # collator after it.
        collate_fn_train = SpeechCollator.from_config(config)

        config.collator.augmentation_config = ""
        collate_fn_dev = SpeechCollator.from_config(config)

        if self.parallel:
            batch_sampler = SortagradDistributedBatchSampler(
                train_dataset,
                batch_size=config.collator.batch_size,
                num_replicas=None,
                rank=None,
                shuffle=True,
                drop_last=True,
                sortagrad=config.collator.sortagrad,
                shuffle_method=config.collator.shuffle_method)
        else:
            batch_sampler = SortagradBatchSampler(
                train_dataset,
                shuffle=True,
                batch_size=config.collator.batch_size,
                drop_last=True,
                sortagrad=config.collator.sortagrad,
                shuffle_method=config.collator.shuffle_method)
        self.train_loader = DataLoader(
            train_dataset,
            batch_sampler=batch_sampler,
            collate_fn=collate_fn_train,
            num_workers=config.collator.num_workers, )
        self.valid_loader = DataLoader(
            dev_dataset,
            batch_size=config.collator.batch_size,
            shuffle=False,
            drop_last=False,
            collate_fn=collate_fn_dev)

        # test dataset, return raw text
        config.data.manifest = config.data.test_manifest
        # NOTE(review): the original comment here said test examples are
        # filtered, but the limits below are set to 0 / inf, which looks like
        # it disables length and ratio filtering for the test set -- confirm
        # which behaviour is intended.
        config.data.min_input_len = 0.0  # second
        config.data.max_input_len = float('inf')  # second
        config.data.min_output_len = 0.0  # tokens
        config.data.max_output_len = float('inf')  # tokens
        config.data.min_output_input_ratio = 0.00
        config.data.max_output_input_ratio = float('inf')

        test_dataset = ManifestDataset.from_config(config)
        # return text ord id
        config.collator.keep_transcription_text = True
        config.collator.augmentation_config = ""
        self.test_loader = DataLoader(
            test_dataset,
            batch_size=config.decoding.batch_size,
            shuffle=False,
            drop_last=False,
            collate_fn=SpeechCollator.from_config(config))
        # return text token id
        config.collator.keep_transcription_text = False
        self.align_loader = DataLoader(
            test_dataset,
            batch_size=config.decoding.batch_size,
            shuffle=False,
            drop_last=False,
            collate_fn=SpeechCollator.from_config(config))
        logger.info("Setup train/valid/test/align Dataloader!")

    def setup_model(self):
        """Build the U2 model, its LR scheduler and its optimizer.

        The model's input/output dimensions are taken from the train collator,
        so ``setup_dataloader`` must have run first. The results are stored on
        ``self.model``, ``self.optimizer`` and ``self.lr_scheduler``.
        """
        config = self.config
        model_conf = config.model
        with UpdateConfig(model_conf):
            model_conf.input_dim = self.train_loader.collate_fn.feature_size
            model_conf.output_dim = self.train_loader.collate_fn.vocab_size

        model = U2Model.from_config(model_conf)

        if self.parallel:
            model = paddle.DataParallel(model)

        model.train()
        logger.info(f"{model}")
        layer_tools.print_params(model, logger.info)

        train_config = config.training
        optim_type = train_config.optim
        optim_conf = train_config.optim_conf
        scheduler_type = train_config.scheduler
        scheduler_conf = train_config.scheduler_conf

        scheduler_args = {
            "learning_rate": optim_conf.lr,
            "verbose": False,
            "warmup_steps": scheduler_conf.warmup_steps,
            "gamma": scheduler_conf.lr_decay,
            "d_model": model_conf.encoder_conf.output_size,
        }
        lr_scheduler = LRSchedulerFactory.from_args(scheduler_type,
                                                    scheduler_args)

        def optimizer_args(
                config,
                parameters,
                lr_scheduler=None, ):
            """Collect the keyword arguments handed to the optimizer factory."""
            train_config = config.training
            optim_type = train_config.optim
            optim_conf = train_config.optim_conf
            return {
                "grad_clip": train_config.global_grad_clip,
                "weight_decay": optim_conf.weight_decay,
                "learning_rate": lr_scheduler
                if lr_scheduler else optim_conf.lr,
                "parameters": parameters,
                "epsilon": 1e-9 if optim_type == 'noam' else None,
                "beta1": 0.9 if optim_type == 'noam' else None,
                # Key was misspelled "beat2", so this value could never reach
                # an optimizer parameter named beta2.
                # NOTE(review): presumably the factory drops None-valued
                # entries, as it must already do for beta1/epsilon -- confirm.
                "beta2": 0.98 if optim_type == 'noam' else None,
            }

        optimizer_kwargs = optimizer_args(config,
                                          model.parameters(), lr_scheduler)
        optimizer = OptimizerFactory.from_args(optim_type, optimizer_kwargs)

        self.model = model
        self.optimizer = optimizer
        self.lr_scheduler = lr_scheduler
        logger.info("Setup model/optimizer/lr_scheduler!")

    def setup_updater(self):
        """Assemble the updater, the trainer and its extensions into ``self.trainer``."""
        output_dir = self.output_dir
        config = self.config.training

        updater = U2Updater(
            model=self.model,
            optimizer=self.optimizer,
            scheduler=self.lr_scheduler,
            dataloader=self.train_loader,
            output_dir=output_dir,
            accum_grad=config.accum_grad)

        trainer = NewTrainer(updater, (config.n_epoch, 'epoch'), output_dir)

        evaluator = U2Evaluator(self.model, self.valid_loader)

        trainer.extend(evaluator, trigger=(1, "epoch"))

        # Visual logging and k-best snapshots are registered on rank 0 only.
        if dist.get_rank() == 0:
            trainer.extend(VisualDL(output_dir), trigger=(1, "iteration"))
            num_snapshots = config.checkpoint.kbest_n
            # NOTE(review): the indicator is 'VALID/LOSS' while U2Evaluator
            # reports under "eval/..." keys -- confirm Snapshot can resolve it.
            trainer.extend(
                Snapshot(
                    mode='kbest',
                    max_size=num_snapshots,
                    indicator='VALID/LOSS',
                    less_better=True),
                trigger=(1, 'epoch'))
        # print(trainer.extensions)
        # trainer.run()
        self.trainer = trainer

    def run(self):
        """The routine of the experiment after setup. This method is intended
        to be used by the user.
        """
        self.setup_updater()
        with Timer("Training Done: {}"):
            self.trainer.run()
|
@ -0,0 +1,19 @@
|
||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
from .u2 import U2InferModel
|
||||
from .u2 import U2Model
|
||||
from .updater import U2Evaluator
|
||||
from .updater import U2Updater
|
||||
|
||||
__all__ = ["U2Model", "U2InferModel", "U2Evaluator", "U2Updater"]
|
@ -0,0 +1,149 @@
|
||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
from contextlib import nullcontext
|
||||
|
||||
import paddle
|
||||
from paddle import distributed as dist
|
||||
|
||||
from deepspeech.training.extensions.evaluator import StandardEvaluator
|
||||
from deepspeech.training.reporter import report
|
||||
from deepspeech.training.timer import Timer
|
||||
from deepspeech.training.updaters.standard_updater import StandardUpdater
|
||||
from deepspeech.utils import layer_tools
|
||||
from deepspeech.utils.log import Log
|
||||
|
||||
logger = Log(__name__).getlog()
|
||||
|
||||
|
||||
class U2Evaluator(StandardEvaluator):
    """Evaluator that runs the U2 model on validation batches and reports its losses."""

    def __init__(self, model, dataloader):
        super().__init__(model, dataloader)
        self.msg = ""
        self.num_seen_utts = 0
        self.total_loss = 0.0

    def evaluate_core(self, batch):
        """Evaluate one batch and update the running loss statistics.

        Args:
            batch: batch from the dataloader; its first element is skipped and
                the rest is passed to the model.

        Returns:
            Tuple of (accumulated utterance-weighted loss, number of
            utterances seen so far).
        """
        self.msg = "Valid: Rank: {}, ".format(dist.get_rank())
        losses_dict = {}

        loss, attention_loss, ctc_loss = self.model(*batch[1:])
        # Batches with a non-finite loss contribute nothing to the statistics.
        if paddle.isfinite(loss):
            num_utts = batch[1].shape[0]
            self.num_seen_utts += num_utts
            self.total_loss += float(loss) * num_utts

            losses_dict['loss'] = float(loss)
            optional_losses = (('att_loss', attention_loss),
                               ('ctc_loss', ctc_loss))
            for loss_name, loss_value in optional_losses:
                if loss_value:
                    losses_dict[loss_name] = float(loss_value)

        summary = []
        for loss_name, loss_value in losses_dict.items():
            report("eval/" + loss_name, loss_value)
            summary.append('{}: {:>.6f}'.format(loss_name, loss_value))

        self.msg += ', '.join(summary)
        logger.info(self.msg)
        return self.total_loss, self.num_seen_utts
|
||||
|
||||
|
||||
class U2Updater(StandardUpdater):
    """Updater for the U2 model with gradient accumulation.

    Parameters are updated once every ``accum_grad`` forward/backward passes.
    ``forward_count`` tracks the position inside the current accumulation
    window.
    """

    def __init__(self,
                 model,
                 optimizer,
                 scheduler,
                 dataloader,
                 init_state=None,
                 accum_grad=1,
                 **kwargs):
        """
        Args:
            model: model to train; called as ``model(*batch[1:])``.
            optimizer: optimizer stepped at the end of each accumulation window.
            scheduler: LR scheduler stepped together with the optimizer.
            dataloader: source of training batches.
            init_state: forwarded to ``StandardUpdater``.
            accum_grad (int): number of batches to accumulate per update.
            **kwargs: accepted and ignored.
        """
        super().__init__(
            model, optimizer, scheduler, dataloader, init_state=init_state)
        self.accum_grad = accum_grad
        self.forward_count = 0
        self.msg = ""

    def update_core(self, batch):
        """One Step

        Args:
            batch (List[Object]): utts, xs, xlens, ys, ylens
        """
        losses_dict = {}
        self.msg = "Rank: {}, ".format(dist.get_rank())

        # forward
        batch_size = batch[1].shape[0]
        loss, attention_loss, ctc_loss = self.model(*batch[1:])
        # scale the loss so gradients summed over `accum_grad` steps average out
        loss /= self.accum_grad

        # loss backward
        is_accumulating = (self.forward_count + 1) != self.accum_grad
        if is_accumulating and hasattr(self.model, "no_sync"):
            # Disable gradient synchronizations across DDP processes.
            # Within this context, gradients will be accumulated on module
            # variables, which will later be synchronized.
            context = self.model.no_sync
        else:
            # Used for single gpu training (a model that is not wrapped for
            # data parallelism has no `no_sync`) and for the DDP step that
            # synchronizes gradients.
            context = nullcontext

        with context():
            loss.backward()
            layer_tools.print_grads(self.model, print_func=None)

        # loss info (undo the accumulation scaling for reporting)
        losses_dict['loss'] = float(loss) * self.accum_grad
        if attention_loss:
            losses_dict['att_loss'] = float(attention_loss)
        if ctc_loss:
            losses_dict['ctc_loss'] = float(ctc_loss)
        # report loss
        for k, v in losses_dict.items():
            report("train/" + k, v)
        # loss msg
        self.msg += "batch size: {}, ".format(batch_size)
        self.msg += "accum: {}, ".format(self.accum_grad)
        self.msg += ', '.join('{}: {:>.6f}'.format(k, v)
                              for k, v in losses_dict.items())

        # Truncate the graph
        # NOTE(review): the result of detach() is discarded -- confirm this
        # call has the intended effect.
        loss.detach()

        # update parameters only at the end of an accumulation window
        self.forward_count += 1
        if self.forward_count != self.accum_grad:
            return
        self.forward_count = 0

        self.optimizer.step()
        self.optimizer.clear_grad()
        self.scheduler.step()

    def update(self):
        """Read one batch, run one step on it and advance the training state."""
        # model is default in train mode

        # training for a step is implemented here
        with Timer("data time cost:{}"):
            batch = self.read_batch()
        with Timer("step time cost:{}"):
            self.update_core(batch)

        # #iterations with accum_grad > 1
        # Ref.: https://github.com/espnet/espnet/issues/777
        # forward_count is reset to 0 only after a parameter update, so an
        # iteration is counted once per accumulation window.
        if self.forward_count == 0:
            self.state.iteration += 1
            if self.updates_per_epoch is not None:
                if self.state.iteration % self.updates_per_epoch == 0:
                    self.state.epoch += 1
|
@ -0,0 +1,50 @@
|
||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import datetime
|
||||
import time
|
||||
|
||||
from deepspeech.utils.log import Log
|
||||
|
||||
__all__ = ["Timer"]
|
||||
|
||||
logger = Log(__name__).getlog()
|
||||
|
||||
|
||||
class Timer():
    """Context manager that measures wall-clock time and optionally logs it.

    Usage:
        with Timer("step time cost:{}") as timer:
            do_something()

    ``message`` is a format string with one ``{}`` placeholder. When the
    ``with`` block exits, the placeholder is filled with the elapsed time
    and the result is logged through ``logger.info``. A falsy message
    disables logging.
    """

    def __init__(self, message=None):
        self.message = message

    def __enter__(self):
        # Every other method measures elapsed time from this timestamp.
        self.start = time.time()
        return self

    def __exit__(self, type, value, traceback):
        if not self.message:
            return
        logger.info(self.message.format(self.duration()))

    def __call__(self) -> float:
        """Return the seconds elapsed since the context was entered."""
        return time.time() - self.start

    def duration(self) -> str:
        """Return the elapsed time as a ``datetime.timedelta`` string."""
        seconds = time.time() - self.start
        return str(datetime.timedelta(seconds=seconds))

    def __str__(self):
        return self.duration()
|
@ -0,0 +1,119 @@
|
||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import sys
|
||||
|
||||
import paddle
|
||||
|
||||
from deepspeech.utils.log import Log
|
||||
|
||||
logger = Log(__name__).getlog()
|
||||
|
||||
# A global variable to record the number of calling times for profiler
|
||||
# functions. It is used to specify the tracing range of training steps.
|
||||
_profiler_step_id = 0
|
||||
|
||||
# A global variable to avoid parsing from string every time.
|
||||
_profiler_options = None
|
||||
|
||||
|
||||
class ProfilerOptions(object):
    '''
    Use a string to initialize a ProfilerOptions.
    The string should be in the format: "key1=value1;key2=value2;key3=value3".
    For example:
      "profile_path=model.profile"
      "batch_range=[50, 60]; profile_path=model.profile"
      "batch_range=[50, 60]; tracer_option=OpDetail; profile_path=model.profile"
    ProfilerOptions supports following key-value pair:
      batch_range      - an integer list, e.g. [100, 110].
      state            - a string, the optional values are 'CPU', 'GPU' or 'All'.
      sorted_key       - a string, the optional values are 'calls', 'total',
                         'max', 'min' or 'ave'.
      tracer_option    - a string, the optional values are 'Default', 'OpDetail',
                         'AllOpDetail'.
      profile_path     - a string, the path to save the serialized profile data,
                         which can be used to generate a timeline.
      exit_on_finished - a boolean.

    All spaces are stripped from the option string before parsing. Empty
    segments (for example a trailing ';') are ignored, unknown keys are
    ignored, and a ``batch_range`` that is not a valid increasing,
    non-negative range leaves the default in place.
    '''

    def __init__(self, options_str):
        assert isinstance(options_str, str)

        # Defaults; overridden by whatever the option string provides.
        self._options = {
            'batch_range': [10, 20],
            'state': 'All',
            'sorted_key': 'total',
            'tracer_option': 'Default',
            'profile_path': '/tmp/profile',
            'exit_on_finished': True
        }
        self._parse_from_string(options_str)

    def _parse_from_string(self, options_str):
        """Update ``self._options`` from a ``key=value;key=value`` string.

        Raises:
            ValueError: if a non-empty segment has no '=' or a
                ``batch_range`` entry is not an integer.
        """
        if not options_str:
            return

        for kv in options_str.replace(' ', '').split(';'):
            if not kv:
                # Tolerate a trailing ';' or an accidental ';;'.
                continue
            # Split on the first '=' only, so values may contain '='.
            key, sep, value = kv.partition('=')
            if not sep:
                raise ValueError(
                    "Invalid profiler option %r, expected 'key=value'." % kv)
            if key == 'batch_range':
                value_list = value.replace('[', '').replace(']', '').split(',')
                value_list = list(map(int, value_list))
                if len(value_list) >= 2 and value_list[0] >= 0 and value_list[
                        1] > value_list[0]:
                    self._options[key] = value_list
            elif key == 'exit_on_finished':
                self._options[key] = value.lower() in ("yes", "true", "t", "1")
            elif key in [
                    'state', 'sorted_key', 'tracer_option', 'profile_path'
            ]:
                self._options[key] = value

    def __getitem__(self, name):
        """Return the value of option ``name``; raise ValueError if unknown."""
        if self._options.get(name, None) is None:
            raise ValueError(
                "ProfilerOptions does not have an option named %s." % name)
        return self._options[name]
|
||||
|
||||
|
||||
def add_profiler_step(options_str=None):
    '''
    Count one profiler step and start/stop PaddlePaddle's operator-level
    profiler when the step counter reaches the configured boundaries.

    The profiler keeps its own module-level step counter; every call of this
    function with a non-None ``options_str`` counts as one step.

    Args:
      options_str - a string to initialize the ProfilerOptions. It is parsed
                    only on the first call. Default is None, in which case
                    the profiler is disabled and the call is a no-op.
    '''
    if options_str is None:
        return

    global _profiler_step_id, _profiler_options

    # Parse the option string once and cache the result for later steps.
    if _profiler_options is None:
        _profiler_options = ProfilerOptions(options_str)
        logger.info(f"Profiler: {options_str}")
        logger.info(f"Profiler: {_profiler_options._options}")

    batch_range = _profiler_options['batch_range']
    if _profiler_step_id == batch_range[0]:
        paddle.utils.profiler.start_profiler(
            _profiler_options['state'], _profiler_options['tracer_option'])
    elif _profiler_step_id == batch_range[1]:
        paddle.utils.profiler.stop_profiler(
            _profiler_options['sorted_key'], _profiler_options['profile_path'])
        if _profiler_options['exit_on_finished']:
            sys.exit(0)

    _profiler_step_id += 1
|
Before Width: | Height: | Size: 206 KiB |
Before Width: | Height: | Size: 108 KiB |
@ -1,16 +0,0 @@
|
||||
# Benchmarks
|
||||
|
||||
## Acceleration with Multi-GPUs
|
||||
|
||||
We compare the training time with 1, 2, 4, 8 Tesla V100 GPUs (with a subset of LibriSpeech samples whose audio durations are between 6.0 and 7.0 seconds). And it shows that a **near-linear** acceleration with multiple GPUs has been achieved. In the following figure, the time (in seconds) cost for training is printed on the blue bars.
|
||||
|
||||
<img src="../images/multi_gpu_speedup.png" width=450>
|
||||
|
||||
| # of GPU | Acceleration Rate |
|
||||
| -------- | --------------: |
|
||||
| 1 | 1.00 X |
|
||||
| 2 | 1.98 X |
|
||||
| 4 | 3.73 X |
|
||||
| 8 | 6.95 X |
|
||||
|
||||
`utils/profile.sh` provides such a demo profiling tool; you can change it as needed.
|
@ -1,3 +0,0 @@
|
||||
# Reference
|
||||
|
||||
* [wenet](https://github.com/mobvoi/wenet)
|
@ -1,9 +0,0 @@
|
||||
# Released Models
|
||||
|
||||
## Language Model Released
|
||||
|
||||
Language Model | Training Data | Token-based | Size | Descriptions
|
||||
:-------------:| :------------:| :-----: | -----: | :-----------------
|
||||
[English LM](https://deepspeech.bj.bcebos.com/en_lm/common_crawl_00.prune01111.trie.klm) | [CommonCrawl(en.00)](http://web-language-models.s3-website-us-east-1.amazonaws.com/ngrams/en/deduped/en.00.deduped.xz) | Word-based | 8.3 GB | Pruned with 0 1 1 1 1; <br/> About 1.85 billion n-grams; <br/> 'trie' binary with '-a 22 -q 8 -b 8'
|
||||
[Mandarin LM Small](https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm) | Baidu Internal Corpus | Char-based | 2.8 GB | Pruned with 0 1 2 4 4; <br/> About 0.13 billion n-grams; <br/> 'probing' binary with default settings
|
||||
[Mandarin LM Large](https://deepspeech.bj.bcebos.com/zh_lm/zhidao_giga.klm) | Baidu Internal Corpus | Char-based | 70.4 GB | No Pruning; <br/> About 3.7 billion n-grams; <br/> 'probing' binary with default settings
|
After Width: | Height: | Size: 93 KiB |
After Width: | Height: | Size: 93 KiB |
@ -0,0 +1,190 @@
|
||||
# Deepspeech2
|
||||
## Streaming
|
||||
|
||||
The implemented architecture of the Deepspeech2 online model is based on the [Deepspeech2 model](https://arxiv.org/pdf/1512.02595.pdf) with some changes.
|
||||
The model is mainly composed of 2D convolution subsampling layers and stacked single direction rnn layers.
|
||||
|
||||
To illustrate the model implementation clearly, 3 parts are described in detail.
|
||||
- Data Preparation
|
||||
- Encoder
|
||||
- Decoder
|
||||
|
||||
In addition, the training process and the testing process are also introduced.
|
||||
|
||||
The architecture of the model is shown in Fig.1.
|
||||
|
||||
<p align="center">
|
||||
<img src="../images/ds2onlineModel.png" width=800>
|
||||
<br/>Fig.1 The Architecture of deepspeech2 online model
|
||||
</p>
|
||||
|
||||
### Data Preparation
|
||||
#### Vocabulary
|
||||
For English data, the vocabulary dictionary is composed of the 26 English characters plus " ' ", space, \<blank\> and \<eos\>. \<blank\> represents the blank label in CTC, \<unk\> represents the unknown character, and \<eos\> represents the start and end characters. For Mandarin, the vocabulary dictionary is composed of the Chinese characters collected from the training set, with three additional characters added: \<blank\>, \<unk\> and \<eos\>. For both English and Mandarin data, the default indices are \<blank\>=0, \<unk\>=1 and \<eos\>=last index.
|
||||
```
|
||||
# The code to build vocabulary
|
||||
cd examples/aishell/s0
|
||||
python3 ../../../utils/build_vocab.py \
|
||||
--unit_type="char" \
|
||||
--count_threshold=0 \
|
||||
--vocab_path="data/vocab.txt" \
|
||||
--manifest_paths "data/manifest.train.raw" "data/manifest.dev.raw"
|
||||
|
||||
# vocabulary for aishell dataset (Mandarin)
|
||||
vi examples/aishell/s0/data/vocab.txt
|
||||
|
||||
# vocabulary for librispeech dataset (English)
|
||||
vi examples/librispeech/s0/data/vocab.txt
|
||||
```
|
||||
|
||||
#### CMVN
|
||||
For CMVN, a subset or the whole of the training set is chosen and used to compute the feature mean and std.
|
||||
```
|
||||
# The code to compute the feature mean and std
|
||||
cd examples/aishell/s0
|
||||
python3 ../../../utils/compute_mean_std.py \
|
||||
--manifest_path="data/manifest.train.raw" \
|
||||
--specgram_type="linear" \
|
||||
--delta_delta=false \
|
||||
--stride_ms=10.0 \
|
||||
--window_ms=20.0 \
|
||||
--sample_rate=16000 \
|
||||
--use_dB_normalization=True \
|
||||
--num_samples=2000 \
|
||||
--num_workers=10 \
|
||||
--output_path="data/mean_std.json"
|
||||
|
||||
```
|
||||
|
||||
#### Feature Extraction
|
||||
For feature extraction, three methods are implemented, which are linear (FFT without using filter bank), fbank and mfcc.
|
||||
Currently, the released deepspeech2 online model uses the linear feature extraction method.
|
||||
```
|
||||
# The code for feature extraction
|
||||
vi deepspeech/frontend/featurizer/audio_featurizer.py
|
||||
```
|
||||
|
||||
### Encoder
|
||||
The encoder is composed of two 2D convolution subsampling layers and a number of stacked single direction rnn layers. The 2D convolution subsampling layers extract feature representations from the raw audio features and reduce the length of the audio features at the same time. After passing through the convolution subsampling layers, the feature representations are fed into the stacked rnn layers. For the stacked rnn layers, both LSTM cells and GRU cells are available. Adding one fully connected (fc) layer after the stacked rnn layers is optional. If the number of stacked rnn layers is less than 5, adding one fc layer after the stacked rnn layers is recommended.
|
||||
|
||||
The code of Encoder is in:
|
||||
```
|
||||
vi deepspeech/models/ds2_online/deepspeech2.py
|
||||
```
|
||||
|
||||
### Decoder
|
||||
To get the character probabilities of each frame, the feature representation of each frame output by the encoder is fed into a projection layer, which is implemented as a dense layer that performs feature projection. The output dimension of the projection layer is the same as the vocabulary size. After the projection layer, the softmax function is used to transform the frame-level feature representation into character probabilities. During model inference, the character probabilities of each frame are fed into the CTC decoder to get the final speech recognition results.
|
||||
|
||||
The code of the decoder is in:
|
||||
```
|
||||
# The code of constructing the decoder in model
|
||||
vi deepspeech/models/ds2_online/deepspeech2.py
|
||||
# The code of CTC Decoder
|
||||
vi deepspeech/modules/ctc.py
|
||||
```
|
||||
|
||||
## Training Process
|
||||
Using the command below, you can train the deepspeech2 online model.
|
||||
```
|
||||
cd examples/aishell/s0
|
||||
bash run.sh --stage 0 --stop_stage 2 --model_type online --conf_path conf/deepspeech2_online.yaml
|
||||
```
|
||||
The detailed commands are:
|
||||
```
|
||||
# The code for training in run.sh
|
||||
set -e
|
||||
source path.sh
|
||||
|
||||
gpus=2,3,5,7
|
||||
stage=0
|
||||
stop_stage=5
|
||||
conf_path=conf/deepspeech2_online.yaml # conf/deepspeech2.yaml | conf/deepspeech2_online.yaml
|
||||
avg_num=1
|
||||
model_type=online # online | offline
|
||||
|
||||
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
|
||||
|
||||
avg_ckpt=avg_${avg_num}
|
||||
ckpt=$(basename ${conf_path} | awk -F'.' '{print $1}')
|
||||
echo "checkpoint name ${ckpt}"
|
||||
|
||||
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
|
||||
# prepare data
|
||||
bash ./local/data.sh || exit -1
|
||||
fi
|
||||
|
||||
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
|
||||
# train model, all `ckpt` under `exp` dir
|
||||
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt} ${model_type}
|
||||
fi
|
||||
|
||||
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
|
||||
# avg n best model
|
||||
avg.sh exp/${ckpt}/checkpoints ${avg_num}
|
||||
fi
|
||||
```
|
||||
|
||||
By using the command above, the training process can be started. There are 6 stages (0 to 5) in "run.sh", and the first 3 stages are used for the training process. Stage 0 is used for data preparation, in which the dataset is downloaded, and the manifest files of the datasets, the vocabulary dictionary and the CMVN file are generated in "./data/". Stage 1 is used for training the model; the log files and model checkpoints are saved in "exp/deepspeech2_online/". Stage 2 is used to generate the final model for prediction by averaging the top-k model parameters based on validation loss.
|
||||
|
||||
## Testing Process
|
||||
Using the command below, you can test the deepspeech2 online model.
|
||||
```
|
||||
bash run.sh --stage 3 --stop_stage 5 --model_type online --conf_path conf/deepspeech2_online.yaml
|
||||
```
|
||||
The detailed commands are:
|
||||
```
|
||||
conf_path=conf/deepspeech2_online.yaml
|
||||
avg_num=1
|
||||
model_type=online
|
||||
avg_ckpt=avg_${avg_num}
|
||||
|
||||
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
|
||||
# test ckpt avg_n
|
||||
CUDA_VISIBLE_DEVICES=2 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${model_type}|| exit -1
|
||||
fi
|
||||
|
||||
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
|
||||
# export ckpt avg_n
|
||||
CUDA_VISIBLE_DEVICES=5 ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit ${model_type}
|
||||
fi
|
||||
|
||||
if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
|
||||
# test export ckpt avg_n
|
||||
CUDA_VISIBLE_DEVICES=0 ./local/test_export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt}.jit ${model_type}|| exit -1
|
||||
fi
|
||||
```
|
||||
After the training process, we use stages 3, 4 and 5 for the testing process. Stage 3 tests the model generated in stage 2 and reports the CER on the test set. Stage 4 transforms the model from a dynamic graph to a static graph by using the "paddle.jit" library. Stage 5 tests the model in static graph form.
|
||||
|
||||
|
||||
## Non-Streaming
|
||||
The deepspeech2 offline model is similar to the deepspeech2 online model. The main difference between them is that the offline model uses stacked bi-directional rnn layers while the online model uses single direction rnn layers, and the fc layer is not used. For the stacked bi-directional rnn layers in the offline model, the rnn cell and the gru cell are available.
|
||||
|
||||
The architecture of the model is shown in Fig.2.
|
||||
<p align="center">
|
||||
<img src="../images/ds2offlineModel.png" width=800>
|
||||
<br/>Fig.2 The Architecture of deepspeech2 offline model
|
||||
</p>
|
||||
|
||||
|
||||
|
||||
For data preparation and the decoder, the deepspeech2 offline model is the same as the deepspeech2 online model.
|
||||
|
||||
The code of encoder and decoder for deepspeech2 offline model is in:
|
||||
```
|
||||
vi deepspeech/models/ds2/deepspeech2.py
|
||||
```
|
||||
|
||||
The training process and testing process of the deepspeech2 offline model are very similar to those of the deepspeech2 online model.
|
||||
Only a few changes need to be noted.
|
||||
|
||||
For training and testing, the "model_type" and the "conf_path" must be set.
|
||||
```
|
||||
# Training offline
|
||||
cd examples/aishell/s0
|
||||
bash run.sh --stage 0 --stop_stage 2 --model_type offline --conf_path conf/deepspeech2.yaml
|
||||
```
|
||||
```
|
||||
# Testing offline
|
||||
cd examples/aishell/s0
|
||||
bash run.sh --stage 3 --stop_stage 5 --model_type offline --conf_path conf/deepspeech2.yaml
|
||||
```
|
@ -0,0 +1,8 @@
|
||||
# Reference
|
||||
|
||||
We referred to these repos to build `model` and `engine`:
|
||||
|
||||
* [delta](https://github.com/Delta-ML/delta.git)
|
||||
* [espnet](https://github.com/espnet/espnet.git)
|
||||
* [kaldi](https://github.com/kaldi-asr/kaldi.git)
|
||||
* [wenet](https://github.com/mobvoi/wenet)
|
@ -0,0 +1,28 @@
|
||||
# Released Models
|
||||
|
||||
## Acoustic Model Released in paddle 2.X
|
||||
Acoustic Model | Training Data | Token-based | Size | Descriptions | CER or WER | Hours of speech
|
||||
:-------------:| :------------:| :-----: | -----: | :----------------- | :---------- | :---------
|
||||
[Ds2 Online Aishell Model](https://deepspeech.bj.bcebos.com/release2.1/aishell/s0/aishell.s0.ds_online.5rnn.debug.tar.gz) | Aishell Dataset | Char-based | 345 MB | 2 Conv + 5 LSTM layers with only forward direction | 0.0824 | 151 h
|
||||
[Ds2 Offline Aishell Model](https://deepspeech.bj.bcebos.com/release2.1/aishell/s0/aishell.s0.ds2.offline.cer6p65.release.tar.gz)| Aishell Dataset | Char-based | 306 MB | 2 Conv + 3 bidirectional GRU layers| 0.065 | 151 h
|
||||
[Conformer Online Aishell Model](https://deepspeech.bj.bcebos.com/release2.1/aishell/s1/aishell.chunk.release.tar.gz) | Aishell Dataset | Char-based | 283 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention + CTC | 0.0594 | 151 h
|
||||
[Conformer Offline Aishell Model](https://deepspeech.bj.bcebos.com/release2.1/aishell/s1/aishell.release.tar.gz) | Aishell Dataset | Char-based | 284 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention | 0.0547 | 151 h
|
||||
[Conformer Librispeech Model](https://deepspeech.bj.bcebos.com/release2.1/librispeech/s1/conformer.release.tar.gz) | Librispeech Dataset | Word-based | 287 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention | 0.0325 | 960 h
|
||||
[Transformer Librispeech Model](https://deepspeech.bj.bcebos.com/release2.1/librispeech/s1/transformer.release.tar.gz) | Librispeech Dataset | Word-based | 195 MB | Encoder:Transformer, Decoder:Transformer, Decoding method: Attention | 0.0544 | 960 h
|
||||
|
||||
## Acoustic Model Transformed from paddle 1.8
|
||||
Acoustic Model | Training Data | Token-based | Size | Descriptions | CER or WER | Hours of speech
|
||||
:-------------:| :------------:| :-----: | -----: | :----------------- | :---------- | :---------
|
||||
[Ds2 Offline Aishell model](https://deepspeech.bj.bcebos.com/mandarin_models/aishell_model_v1.8_to_v2.x.tar.gz)|Aishell Dataset| Char-based| 234 MB| 2 Conv + 3 bidirectional GRU layers| 0.0804 | 151 h|
|
||||
[Ds2 Offline Librispeech model](https://deepspeech.bj.bcebos.com/eng_models/librispeech_v1.8_to_v2.x.tar.gz)|Librispeech Dataset| Word-based| 307 MB| 2 Conv + 3 bidirectional sharing weight RNN layers | 0.0685| 960 h|
|
||||
[Ds2 Offline Baidu en8k model](https://deepspeech.bj.bcebos.com/eng_models/baidu_en8k_v1.8_to_v2.x.tar.gz)|Baidu Internal English Dataset| Word-based| 273 MB| 2 Conv + 3 bidirectional GRU layers | 0.0541 | 8628 h|
|
||||
|
||||
|
||||
|
||||
## Language Model Released
|
||||
|
||||
Language Model | Training Data | Token-based | Size | Descriptions
|
||||
:-------------:| :------------:| :-----: | -----: | :-----------------
|
||||
[English LM](https://deepspeech.bj.bcebos.com/en_lm/common_crawl_00.prune01111.trie.klm) | [CommonCrawl(en.00)](http://web-language-models.s3-website-us-east-1.amazonaws.com/ngrams/en/deduped/en.00.deduped.xz) | Word-based | 8.3 GB | Pruned with 0 1 1 1 1; <br/> About 1.85 billion n-grams; <br/> 'trie' binary with '-a 22 -q 8 -b 8'
|
||||
[Mandarin LM Small](https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm) | Baidu Internal Corpus | Char-based | 2.8 GB | Pruned with 0 1 2 4 4; <br/> About 0.13 billion n-grams; <br/> 'probing' binary with default settings
|
||||
[Mandarin LM Large](https://deepspeech.bj.bcebos.com/zh_lm/zhidao_giga.klm) | Baidu Internal Corpus | Char-based | 70.4 GB | No Pruning; <br/> About 3.7 billion n-grams; <br/> 'probing' binary with default settings
|
@ -0,0 +1 @@
|
||||
tmp
|
@ -0,0 +1,11 @@
|
||||
# 1xt2x
|
||||
|
||||
Convert Deepspeech 1.8 released model to 2.x.
|
||||
|
||||
## Model
|
||||
* Deepspeech2x
|
||||
|
||||
## Exp
|
||||
* baidu_en8k
|
||||
* aishell
|
||||
* librispeech
|
@ -0,0 +1,4 @@
|
||||
exp
|
||||
data
|
||||
*log
|
||||
tmp
|
@ -0,0 +1 @@
|
||||
[]
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in new issue