parent 5ef4a34e93
commit 472cf70ebd
@@ -0,0 +1,15 @@
unset GREP_OPTIONS

# https://zhuanlan.zhihu.com/p/33050965
alias nvs='nvidia-smi'
alias his='history'
alias jobs='jobs -l'
alias ports='netstat -tulanp'
alias wget='wget -c'

## Colorize the grep command output for ease of use (good for log files)##
alias grep='grep --color=auto'
alias egrep='egrep --color=auto'
alias fgrep='fgrep --color=auto'

@@ -1,605 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "academic-surname",
"metadata": {},
"outputs": [],
"source": [
"import paddle\n",
"from paddle import nn"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "fundamental-treasure",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/workspace/DeepSpeech-2.x/tools/venv-dev/lib/python3.7/site-packages/ipykernel/ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.\n",
" and should_run_async(code)\n"
]
}
],
"source": [
"L = nn.Linear(256, 2048)\n",
"L2 = nn.Linear(2048, 256)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "consolidated-elephant",
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import torch\n"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "moderate-noise",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"float64\n",
"Tensor(shape=[2, 51, 256], dtype=float32, place=CUDAPlace(0), stop_gradient=True,\n",
" [[[-1.54171216, -2.61531472, -1.79881978, ..., -0.31395876, 0.56513089, -0.44516513],\n",
" [-0.79492962, 1.91157901, 0.66567147, ..., 0.54825783, -1.01471853, -0.84924090],\n",
" [-1.22556651, -0.36225814, 0.65063190, ..., 0.65726501, 0.05563191, 0.09009409],\n",
" ...,\n",
" [ 0.38615900, -0.77905393, 0.99732304, ..., -1.38463700, -3.32365036, -1.31089687],\n",
" [ 0.05579993, 0.06885809, -1.66662002, ..., -0.23346378, -3.29372883, 1.30561364],\n",
" [ 1.90676069, 1.95093191, -0.28849599, ..., -0.06860496, 0.95347673, 1.00475824]],\n",
"\n",
" [[-0.91453546, 0.55298805, -1.06146812, ..., -0.86378336, 1.00454640, 1.26062179],\n",
" [ 0.10223761, 0.81301165, 2.36865163, ..., 0.16821407, 0.29240361, 1.05408621],\n",
" [-1.33196676, 1.94433689, 0.01934209, ..., 0.48036841, 0.51585966, 1.22893548],\n",
" ...,\n",
" [-0.19558455, -0.47075930, 0.90796155, ..., -1.28598249, -0.24321797, 0.17734711],\n",
" [ 0.89819717, -1.39516675, 0.17138045, ..., 2.39761519, 1.76364994, -0.52177650],\n",
" [ 0.94122332, -0.18581429, 1.36099780, ..., 0.67647684, -0.04699665, 1.51205540]]])\n",
"tensor([[[-1.5417, -2.6153, -1.7988, ..., -0.3140, 0.5651, -0.4452],\n",
" [-0.7949, 1.9116, 0.6657, ..., 0.5483, -1.0147, -0.8492],\n",
" [-1.2256, -0.3623, 0.6506, ..., 0.6573, 0.0556, 0.0901],\n",
" ...,\n",
" [ 0.3862, -0.7791, 0.9973, ..., -1.3846, -3.3237, -1.3109],\n",
" [ 0.0558, 0.0689, -1.6666, ..., -0.2335, -3.2937, 1.3056],\n",
" [ 1.9068, 1.9509, -0.2885, ..., -0.0686, 0.9535, 1.0048]],\n",
"\n",
" [[-0.9145, 0.5530, -1.0615, ..., -0.8638, 1.0045, 1.2606],\n",
" [ 0.1022, 0.8130, 2.3687, ..., 0.1682, 0.2924, 1.0541],\n",
" [-1.3320, 1.9443, 0.0193, ..., 0.4804, 0.5159, 1.2289],\n",
" ...,\n",
" [-0.1956, -0.4708, 0.9080, ..., -1.2860, -0.2432, 0.1773],\n",
" [ 0.8982, -1.3952, 0.1714, ..., 2.3976, 1.7636, -0.5218],\n",
" [ 0.9412, -0.1858, 1.3610, ..., 0.6765, -0.0470, 1.5121]]])\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/workspace/DeepSpeech-2.x/tools/venv-dev/lib/python3.7/site-packages/ipykernel/ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.\n",
" and should_run_async(code)\n"
]
}
],
"source": [
"x = np.random.randn(2, 51, 256)\n",
"print(x.dtype)\n",
"px = paddle.to_tensor(x, dtype='float32')\n",
"tx = torch.tensor(x, dtype=torch.float32)\n",
"print(px)\n",
"print(tx)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "cooked-progressive",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 5,
"id": "mechanical-prisoner",
"metadata": {},
"outputs": [],
"source": [
"data = np.load('enc_0_ff_out.npz', allow_pickle=True)\n",
"t_norm_ff = data['norm_ff']\n",
"t_ff_out = data['ff_out']\n",
"t_ff_l_x = data['ff_l_x']\n",
"t_ff_l_a_x = data['ff_l_a_x']\n",
"t_ff_l_a_l_x = data['ff_l_a_l_x']\n",
"t_ps = data['ps']"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "indie-marriage",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 6,
"id": "assured-zambia",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"True\n",
"True\n",
"True\n",
"True\n"
]
}
],
"source": [
"L.set_state_dict({'weight': t_ps[0].T, 'bias': t_ps[1]})\n",
"L2.set_state_dict({'weight': t_ps[2].T, 'bias': t_ps[3]})\n",
"\n",
"ps = []\n",
"for n, p in L.named_parameters():\n",
" ps.append(p)\n",
"\n",
"for n, p in L2.state_dict().items():\n",
" ps.append(p)\n",
" \n",
"for p, tp in zip(ps, t_ps):\n",
" print(np.allclose(p.numpy(), tp.T))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "committed-jacob",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "extreme-traffic",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "optimum-milwaukee",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 7,
"id": "viral-indian",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"True\n",
"True\n",
"True\n",
"True\n"
]
}
],
"source": [
"# data = np.load('enc_0_ff_out.npz', allow_pickle=True)\n",
"# t_norm_ff = data['norm_ff']\n",
"# t_ff_out = data['ff_out']\n",
"# t_ff_l_x = data['ff_l_x']\n",
"# t_ff_l_a_x = data['ff_l_a_x']\n",
"# t_ff_l_a_l_x = data['ff_l_a_l_x']\n",
"# t_ps = data['ps']\n",
"TL = torch.nn.Linear(256, 2048)\n",
"TL2 = torch.nn.Linear(2048, 256)\n",
"TL.load_state_dict({'weight': torch.tensor(t_ps[0]), 'bias': torch.tensor(t_ps[1])})\n",
"TL2.load_state_dict({'weight': torch.tensor(t_ps[2]), 'bias': torch.tensor(t_ps[3])})\n",
"\n",
"# for n, p in TL.named_parameters():\n",
"# print(n, p)\n",
"# for n, p in TL2.named_parameters():\n",
"# print(n, p)\n",
"\n",
"ps = []\n",
"for n, p in TL.state_dict().items():\n",
" ps.append(p.data.numpy())\n",
" \n",
"for n, p in TL2.state_dict().items():\n",
" ps.append(p.data.numpy())\n",
" \n",
"for p, tp in zip(ps, t_ps):\n",
" print(np.allclose(p, tp))"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "skilled-vietnamese",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[[[ 0.67277956 0.08313607 -0.62761104 ... -0.17480263 0.42718208\n",
" -0.5787626 ]\n",
" [ 0.91516656 0.5393416 1.7159258 ... 0.06144593 0.06486575\n",
" -0.03350811]\n",
" [ 0.438351 0.6227843 0.24096036 ... 1.0912522 -0.90929437\n",
" -1.012989 ]\n",
" ...\n",
" [ 0.68631977 0.14240924 0.10763275 ... -0.11513516 0.48065388\n",
" 0.04070369]\n",
" [-0.9525228 0.23197874 0.31264272 ... 0.5312439 0.18773697\n",
" -0.8450228 ]\n",
" [ 0.42024016 -0.04561988 0.54541194 ... -0.41933843 -0.00436018\n",
" -0.06663495]]\n",
"\n",
" [[-0.11638781 -0.33566502 -0.20887226 ... 0.17423287 -0.9195841\n",
" -0.8161046 ]\n",
" [-0.3469874 0.88269687 -0.11887559 ... -0.15566081 0.16357468\n",
" -0.20766167]\n",
" [-0.3847657 0.3984318 -0.06963477 ... -0.00360622 1.2360432\n",
" -0.26811332]\n",
" ...\n",
" [ 0.08230796 -0.46158582 0.54582864 ... 0.15747628 -0.44790155\n",
" 0.06020184]\n",
" [-0.8095085 0.43163058 -0.42837143 ... 0.8627463 0.90656304\n",
" 0.15847842]\n",
" [-1.485811 -0.18216592 -0.8882585 ... 0.32596245 0.7822631\n",
" -0.6460344 ]]]\n",
"[[[ 0.67278004 0.08313602 -0.6276114 ... -0.17480245 0.42718196\n",
" -0.5787625 ]\n",
" [ 0.91516703 0.5393413 1.7159253 ... 0.06144581 0.06486579\n",
" -0.03350812]\n",
" [ 0.43835106 0.62278455 0.24096027 ... 1.0912521 -0.9092943\n",
" -1.0129892 ]\n",
" ...\n",
" [ 0.6863195 0.14240888 0.10763284 ... -0.11513527 0.48065376\n",
" 0.04070365]\n",
" [-0.9525231 0.23197863 0.31264275 ... 0.53124386 0.18773702\n",
" -0.84502304]\n",
" [ 0.42024007 -0.04561983 0.545412 ... -0.41933888 -0.00436005\n",
" -0.066635 ]]\n",
"\n",
" [[-0.11638767 -0.33566508 -0.20887226 ... 0.17423296 -0.9195838\n",
" -0.8161046 ]\n",
" [-0.34698725 0.88269705 -0.11887549 ... -0.15566081 0.16357464\n",
" -0.20766166]\n",
" [-0.3847657 0.3984319 -0.06963488 ... -0.00360619 1.2360426\n",
" -0.26811326]\n",
" ...\n",
" [ 0.08230786 -0.4615857 0.5458287 ... 0.15747619 -0.44790167\n",
" 0.06020182]\n",
" [-0.8095083 0.4316307 -0.42837155 ... 0.862746 0.9065631\n",
" 0.15847899]\n",
" [-1.485811 -0.18216613 -0.8882584 ... 0.32596254 0.7822631\n",
" -0.6460344 ]]]\n",
"True\n",
"False\n"
]
}
],
"source": [
"y = L(px)\n",
"print(y.numpy())\n",
"\n",
"ty = TL(tx)\n",
"print(ty.data.numpy())\n",
"print(np.allclose(px.numpy(), tx.detach().numpy()))\n",
"print(np.allclose(y.numpy(), ty.detach().numpy()))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "incorrect-allah",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "prostate-cameroon",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 9,
"id": "governmental-surge",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[[ 0.04476918 0.554463 -0.3027508 ... -0.49600336 0.3751858\n",
" 0.8254095 ]\n",
" [ 0.95594174 -0.29528382 -1.2899452 ... 0.43718258 0.05584608\n",
" -0.06974669]]\n",
"[[ 0.04476918 0.5544631 -0.3027507 ... -0.49600336 0.37518573\n",
" 0.8254096 ]\n",
" [ 0.95594174 -0.29528376 -1.2899454 ... 0.4371827 0.05584623\n",
" -0.0697467 ]]\n",
"True\n",
"False\n",
"True\n"
]
}
],
"source": [
"x = np.random.randn(2, 256)\n",
"px = paddle.to_tensor(x, dtype='float32')\n",
"tx = torch.tensor(x, dtype=torch.float32)\n",
"y = L(px)\n",
"print(y.numpy())\n",
"ty = TL(tx)\n",
"print(ty.data.numpy())\n",
"print(np.allclose(px.numpy(), tx.detach().numpy()))\n",
"print(np.allclose(y.numpy(), ty.detach().numpy()))\n",
"print(np.allclose(y.numpy(), ty.detach().numpy(), atol=1e-5))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "confidential-jacket",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 10,
"id": "improved-civilization",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"5e7e7c9fde8350084abf1898cf52651cfc84b17a\n"
]
}
],
"source": [
"print(paddle.version.commit)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "d1e2d3b4",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['__builtins__',\n",
" '__cached__',\n",
" '__doc__',\n",
" '__file__',\n",
" '__loader__',\n",
" '__name__',\n",
" '__package__',\n",
" '__spec__',\n",
" 'commit',\n",
" 'full_version',\n",
" 'istaged',\n",
" 'major',\n",
" 'minor',\n",
" 'mkl',\n",
" 'patch',\n",
" 'rc',\n",
" 'show',\n",
" 'with_mkl']"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dir(paddle.version)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "c880c719",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2.1.0\n"
]
}
],
"source": [
"print(paddle.version.full_version)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "f26977bf",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"commit: 5e7e7c9fde8350084abf1898cf52651cfc84b17a\n",
"None\n"
]
}
],
"source": [
"print(paddle.version.show())"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "04ad47f6",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1.6.0\n"
]
}
],
"source": [
"print(torch.__version__)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "e1e03830",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['__builtins__',\n",
" '__cached__',\n",
" '__doc__',\n",
" '__file__',\n",
" '__loader__',\n",
" '__name__',\n",
" '__package__',\n",
" '__spec__',\n",
" '__version__',\n",
" 'cuda',\n",
" 'debug',\n",
" 'git_version',\n",
" 'hip']"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dir(torch.version)"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "4ad0389b",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'b31f58de6fa8bbda5353b3c77d9be4914399724d'"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"torch.version.git_version"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "7870ea10",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'10.2'"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"torch.version.cuda"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "db8ee5a7",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "6321ec2a",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.0"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
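For context on the deleted notebook above: paddle.nn.Linear stores its weight as [in_features, out_features] while torch.nn.Linear stores [out_features, in_features], which is why the notebook transposes t_ps[0] and t_ps[2] before set_state_dict and compares Paddle parameters against tp.T. A minimal self-contained sketch of the same round trip (illustrative shapes; the atol=1e-5 tolerance is an assumption suited to float32, matching the notebook's last cell):

import numpy as np
import paddle
import torch

t_linear = torch.nn.Linear(4, 8)   # weight shape [8, 4] = [out, in]
p_linear = paddle.nn.Linear(4, 8)  # weight shape [4, 8] = [in, out]
p_linear.set_state_dict({
    'weight': t_linear.weight.detach().numpy().T,  # transpose when porting
    'bias': t_linear.bias.detach().numpy(),
})

x = np.random.randn(2, 4).astype('float32')
py = p_linear(paddle.to_tensor(x)).numpy()
ty = t_linear(torch.tensor(x)).detach().numpy()
# Bitwise equality is not expected across frameworks in float32.
print(np.allclose(py, ty, atol=1e-5))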
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@@ -1,389 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "emerging-meter",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/paddle/fluid/layers/utils.py:26: DeprecationWarning: `np.int` is a deprecated alias for the builtin `int`. To silence this warning, use `int` by itself. Doing this will not modify any behavior and is safe. When replacing `np.int`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.\n",
"Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
" def convert_to_list(value, n, name, dtype=np.int):\n",
"/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/scipy/fftpack/__init__.py:103: DeprecationWarning: The module numpy.dual is deprecated. Instead of using dual, use the functions directly from numpy or scipy.\n",
" from numpy.dual import register_func\n",
"/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/scipy/special/orthogonal.py:81: DeprecationWarning: `np.int` is a deprecated alias for the builtin `int`. To silence this warning, use `int` by itself. Doing this will not modify any behavior and is safe. When replacing `np.int`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.\n",
"Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
" from numpy import (exp, inf, pi, sqrt, floor, sin, cos, around, int,\n",
"/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/numba/core/types/__init__.py:108: DeprecationWarning: `np.long` is a deprecated alias for `np.compat.long`. To silence this warning, use `np.compat.long` by itself. In the likely event your code does not need to work on Python 2 you can use the builtin `int` for which `np.compat.long` is itself an alias. Doing this will not modify any behaviour and is safe. When replacing `np.long`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.\n",
"Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
" long_ = _make_signed(np.long)\n",
"/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/numba/core/types/__init__.py:109: DeprecationWarning: `np.long` is a deprecated alias for `np.compat.long`. To silence this warning, use `np.compat.long` by itself. In the likely event your code does not need to work on Python 2 you can use the builtin `int` for which `np.compat.long` is itself an alias. Doing this will not modify any behaviour and is safe. When replacing `np.long`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.\n",
"Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
" ulong = _make_unsigned(np.long)\n"
]
}
],
"source": [
"import math\n",
"import random\n",
"import tarfile\n",
"import logging\n",
"import numpy as np\n",
"from collections import namedtuple\n",
"from functools import partial\n",
"\n",
"import paddle\n",
"from paddle.io import Dataset\n",
"from paddle.io import DataLoader\n",
"from paddle.io import BatchSampler\n",
"from paddle.io import DistributedBatchSampler\n",
"from paddle import distributed as dist\n",
"\n",
"from data_utils.utility import read_manifest\n",
"from data_utils.augmentor.augmentation import AugmentationPipeline\n",
"from data_utils.featurizer.speech_featurizer import SpeechFeaturizer\n",
"from data_utils.speech import SpeechSegment\n",
"from data_utils.normalizer import FeatureNormalizer\n",
"\n",
"\n",
"from data_utils.dataset import (\n",
" DeepSpeech2Dataset,\n",
" DeepSpeech2DistributedBatchSampler,\n",
" DeepSpeech2BatchSampler,\n",
" SpeechCollator,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "excessive-american",
"metadata": {},
"outputs": [],
"source": [
"def create_dataloader(manifest_path,\t\n",
" vocab_filepath,\t\n",
" mean_std_filepath,\t\n",
" augmentation_config='{}',\t\n",
" max_duration=float('inf'),\t\n",
" min_duration=0.0,\t\n",
" stride_ms=10.0,\t\n",
" window_ms=20.0,\t\n",
" max_freq=None,\t\n",
" specgram_type='linear',\t\n",
" use_dB_normalization=True,\t\n",
" random_seed=0,\t\n",
" keep_transcription_text=False,\t\n",
" is_training=False,\t\n",
" batch_size=1,\t\n",
" num_workers=0,\t\n",
" sortagrad=False,\t\n",
" shuffle_method=None,\t\n",
" dist=False):\t\n",
"\n",
" dataset = DeepSpeech2Dataset(\t\n",
" manifest_path,\t\n",
" vocab_filepath,\t\n",
" mean_std_filepath,\t\n",
" augmentation_config=augmentation_config,\t\n",
" max_duration=max_duration,\t\n",
" min_duration=min_duration,\t\n",
" stride_ms=stride_ms,\t\n",
" window_ms=window_ms,\t\n",
" max_freq=max_freq,\t\n",
" specgram_type=specgram_type,\t\n",
" use_dB_normalization=use_dB_normalization,\t\n",
" random_seed=random_seed,\t\n",
" keep_transcription_text=keep_transcription_text)\t\n",
"\n",
" if dist:\t\n",
" batch_sampler = DeepSpeech2DistributedBatchSampler(\t\n",
" dataset,\t\n",
" batch_size,\t\n",
" num_replicas=None,\t\n",
" rank=None,\t\n",
" shuffle=is_training,\t\n",
" drop_last=is_training,\t\n",
" sortagrad=is_training,\t\n",
" shuffle_method=shuffle_method)\t\n",
" else:\t\n",
" batch_sampler = DeepSpeech2BatchSampler(\t\n",
" dataset,\t\n",
" shuffle=is_training,\t\n",
" batch_size=batch_size,\t\n",
" drop_last=is_training,\t\n",
" sortagrad=is_training,\t\n",
" shuffle_method=shuffle_method)\t\n",
"\n",
" def padding_batch(batch, padding_to=-1, flatten=False, is_training=True):\t\n",
" \"\"\"\t\n",
" Padding audio features with zeros to make them have the same shape (or\t\n",
" a user-defined shape) within one batch.\t\n",
"\n",
" If ``padding_to`` is -1, the maximum shape in the batch will be used\t\n",
" as the target shape for padding. Otherwise, `padding_to` will be the\t\n",
" target shape (only refers to the second axis).\t\n",
"\n",
" If `flatten` is True, features will be flattened to a 1darray.\t\n",
" \"\"\"\t\n",
" new_batch = []\t\n",
" # get target shape\t\n",
" max_length = max([audio.shape[1] for audio, text in batch])\t\n",
" if padding_to != -1:\t\n",
" if padding_to < max_length:\t\n",
" raise ValueError(\"If padding_to is not -1, it should be larger \"\t\n",
" \"than any instance's shape in the batch\")\t\n",
" max_length = padding_to\t\n",
" max_text_length = max([len(text) for audio, text in batch])\t\n",
" # padding\t\n",
" padded_audios = []\t\n",
" audio_lens = []\t\n",
" texts, text_lens = [], []\t\n",
" for audio, text in batch:\t\n",
" padded_audio = np.zeros([audio.shape[0], max_length])\t\n",
" padded_audio[:, :audio.shape[1]] = audio\t\n",
" if flatten:\t\n",
" padded_audio = padded_audio.flatten()\t\n",
" padded_audios.append(padded_audio)\t\n",
" audio_lens.append(audio.shape[1])\t\n",
"\n",
" padded_text = np.zeros([max_text_length])\n",
" if is_training:\n",
" padded_text[:len(text)] = text\t# ids\n",
" else:\n",
" padded_text[:len(text)] = [ord(t) for t in text] # string\n",
" \n",
" texts.append(padded_text)\t\n",
" text_lens.append(len(text))\t\n",
"\n",
" padded_audios = np.array(padded_audios).astype('float32')\t\n",
" audio_lens = np.array(audio_lens).astype('int64')\t\n",
" texts = np.array(texts).astype('int32')\t\n",
" text_lens = np.array(text_lens).astype('int64')\t\n",
" return padded_audios, texts, audio_lens, text_lens\t\n",
"\n",
" loader = DataLoader(\t\n",
" dataset,\t\n",
" batch_sampler=batch_sampler,\t\n",
" collate_fn=partial(padding_batch, is_training=is_training),\t\n",
" num_workers=num_workers)\t\n",
" return loader"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "naval-brave",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'num_samples': 5, 'beam_size': 500, 'num_proc_bsearch': 8, 'num_conv_layers': 2, 'num_rnn_layers': 3, 'rnn_layer_size': 2048, 'alpha': 2.5, 'beta': 0.3, 'cutoff_prob': 1.0, 'cutoff_top_n': 40, 'use_gru': False, 'use_gpu': True, 'share_rnn_weights': True, 'infer_manifest': 'examples/aishell/data/manifest.dev', 'mean_std_path': 'examples/aishell/data/mean_std.npz', 'vocab_path': 'examples/aishell/data/vocab.txt', 'lang_model_path': 'models/lm/common_crawl_00.prune01111.trie.klm', 'model_path': 'examples/aishell/checkpoints/step_final', 'decoding_method': 'ctc_beam_search', 'error_rate_type': 'wer', 'specgram_type': 'linear'}\n"
]
}
],
"source": [
"import sys\n",
"import argparse\n",
"import functools\n",
"from utils.utility import add_arguments, print_arguments\n",
"parser = argparse.ArgumentParser(description=__doc__)\n",
"add_arg = functools.partial(add_arguments, argparser=parser)\n",
"# yapf: disable\n",
"add_arg('num_samples', int, 5, \"# of samples to infer.\")\n",
"add_arg('beam_size', int, 500, \"Beam search width.\")\n",
"add_arg('num_proc_bsearch', int, 8, \"# of CPUs for beam search.\")\n",
"add_arg('num_conv_layers', int, 2, \"# of convolution layers.\")\n",
"add_arg('num_rnn_layers', int, 3, \"# of recurrent layers.\")\n",
"add_arg('rnn_layer_size', int, 2048, \"# of recurrent cells per layer.\")\n",
"add_arg('alpha', float, 2.5, \"Coef of LM for beam search.\")\n",
"add_arg('beta', float, 0.3, \"Coef of WC for beam search.\")\n",
"add_arg('cutoff_prob', float, 1.0, \"Cutoff probability for pruning.\")\n",
"add_arg('cutoff_top_n', int, 40, \"Cutoff number for pruning.\")\n",
"add_arg('use_gru', bool, False, \"Use GRUs instead of simple RNNs.\")\n",
"add_arg('use_gpu', bool, True, \"Use GPU or not.\")\n",
"add_arg('share_rnn_weights',bool, True, \"Share input-hidden weights across \"\n",
" \"bi-directional RNNs. Not for GRU.\")\n",
"add_arg('infer_manifest', str,\n",
" 'examples/aishell/data/manifest.dev',\n",
" \"Filepath of manifest to infer.\")\n",
"add_arg('mean_std_path', str,\n",
" 'examples/aishell/data/mean_std.npz',\n",
" \"Filepath of normalizer's mean & std.\")\n",
"add_arg('vocab_path', str,\n",
" 'examples/aishell/data/vocab.txt',\n",
" \"Filepath of vocabulary.\")\n",
"add_arg('lang_model_path', str,\n",
" 'models/lm/common_crawl_00.prune01111.trie.klm',\n",
" \"Filepath for language model.\")\n",
"add_arg('model_path', str,\n",
" 'examples/aishell/checkpoints/step_final',\n",
" \"If None, the training starts from scratch, \"\n",
" \"otherwise, it resumes from the pre-trained model.\")\n",
"add_arg('decoding_method', str,\n",
" 'ctc_beam_search',\n",
" \"Decoding method. Options: ctc_beam_search, ctc_greedy\",\n",
" choices = ['ctc_beam_search', 'ctc_greedy'])\n",
"add_arg('error_rate_type', str,\n",
" 'wer',\n",
" \"Error rate type for evaluation.\",\n",
" choices=['wer', 'cer'])\n",
"add_arg('specgram_type', str,\n",
" 'linear',\n",
" \"Audio feature type. Options: linear, mfcc.\",\n",
" choices=['linear', 'mfcc'])\n",
"# yapf: disable\n",
"args = parser.parse_args([])\n",
"print(vars(args))"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "bearing-physics",
"metadata": {},
"outputs": [],
"source": [
"batch_reader = create_dataloader(\n",
" manifest_path=args.infer_manifest,\n",
" vocab_filepath=args.vocab_path,\n",
" mean_std_filepath=args.mean_std_path,\n",
" augmentation_config='{}',\n",
" #max_duration=float('inf'),\n",
" max_duration=27.0,\n",
" min_duration=0.0,\n",
" stride_ms=10.0,\n",
" window_ms=20.0,\n",
" max_freq=None,\n",
" specgram_type=args.specgram_type,\n",
" use_dB_normalization=True,\n",
" random_seed=0,\n",
" keep_transcription_text=True,\n",
" is_training=False,\n",
" batch_size=args.num_samples,\n",
" sortagrad=True,\n",
" shuffle_method=None,\n",
" dist=False)"
]
},
{
"cell_type": "code",
"execution_count": 30,
"id": "classified-melissa",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"test Tensor(shape=[5, 6], dtype=int32, place=CUDAPinnedPlace, stop_gradient=True,\n",
" [[22823, 26102, 20195, 37324, 0 , 0 ],\n",
" [22238, 26469, 23601, 22909, 0 , 0 ],\n",
" [20108, 26376, 22235, 26085, 0 , 0 ],\n",
" [36824, 35201, 20445, 25345, 32654, 24863],\n",
" [29042, 27748, 21463, 23456, 0 , 0 ]])\n",
"test raw 大时代里\n",
"test raw 煲汤受宠\n",
"audio len Tensor(shape=[5], dtype=int64, place=CUDAPinnedPlace, stop_gradient=True,\n",
" [163, 167, 180, 186, 186])\n",
"test len Tensor(shape=[5], dtype=int64, place=CUDAPlace(0), stop_gradient=True,\n",
" [4, 4, 4, 6, 4])\n",
"audio Tensor(shape=[5, 161, 186], dtype=float32, place=CUDAPinnedPlace, stop_gradient=True,\n",
" [[[ 1.11669052, 0.79015088, 0.93658292, ..., 0. , 0. , 0. ],\n",
" [ 0.83549136, 0.72643483, 0.83578080, ..., 0. , 0. , 0. ],\n",
" [-0.89155018, -0.18894747, -0.53357804, ..., 0. , 0. , 0. ],\n",
" ...,\n",
" [ 0.33386710, -0.81240511, 0.12869737, ..., 0. , 0. , 0. ],\n",
" [-0.17537928, 0.58380985, 0.70696265, ..., 0. , 0. , 0. ],\n",
" [-0.84175998, 1.22041416, 0.07929770, ..., 0. , 0. , 0. ]],\n",
"\n",
" [[-0.35964420, 0.77392709, 0.71409988, ..., 0. , 0. , 0. ],\n",
" [-0.15990183, 0.42962283, 0.06222462, ..., 0. , 0. , 0. ],\n",
" [-0.31166190, -0.74864638, -0.52836996, ..., 0. , 0. , 0. ],\n",
" ...,\n",
" [-0.27546275, 0.32889456, 0.12410031, ..., 0. , 0. , 0. ],\n",
" [ 0.16264282, 0.49418071, -0.15960945, ..., 0. , 0. , 0. ],\n",
" [ 0.12476666, 0.00516864, 1.16021466, ..., 0. , 0. , 0. ]],\n",
"\n",
" [[ 0.90202141, 1.48541915, 0.92062062, ..., 0. , 0. , 0. ],\n",
" [ 0.82661545, 1.37171340, 0.86746097, ..., 0. , 0. , 0. ],\n",
" [-0.62287915, -0.48645937, 0.35041964, ..., 0. , 0. , 0. ],\n",
" ...,\n",
" [ 0.07376949, 0.07138316, 0.76355994, ..., 0. , 0. , 0. ],\n",
" [-0.32306790, 0.43247896, 1.27311838, ..., 0. , 0. , 0. ],\n",
" [-0.97667056, 0.60747612, 0.79181534, ..., 0. , 0. , 0. ]],\n",
"\n",
" [[ 0.72022128, 0.95428467, 0.92766261, ..., 0.29105374, -0.45564806, -0.62151009],\n",
" [ 0.42083180, 0.49279949, 0.82724041, ..., -0.17333922, -1.45363355, -0.61673522],\n",
" [-0.76116520, -0.84750438, -0.09512503, ..., -1.01497340, -1.42781055, -0.80859023],\n",
" ...,\n",
" [-0.23009977, 1.06155431, 1.09065628, ..., 0.25581080, 0.53794998, -1.22650719],\n",
" [-1.37693381, 0.30778193, 0.17152318, ..., 0.51650339, 0.25580606, 0.83097816],\n",
" [-1.62180591, 1.30567718, 1.09928656, ..., -0.77590007, 1.27712476, 0.53189957]],\n",
"\n",
" [[ 1.03205252, -0.51535392, 0.21077573, ..., 0.76618457, 1.27425683, 1.52250278],\n",
" [ 0.82059991, 0.43990925, 0.13090958, ..., 0.86662549, 1.01687658, 1.48495352],\n",
" [-0.75489789, -0.01997089, -0.65174174, ..., 0.09061214, -0.55211234, -0.01614586],\n",
" ...,\n",
" [ 0.50985396, 1.84555030, 0.79185146, ..., 1.13666189, 1.19898069, 1.98158395],\n",
" [ 1.98721015, 2.52385354, 1.11714780, ..., 0.19416514, 1.11329341, 0.64460152],\n",
" [ 2.69512844, 1.90993905, 0.50245082, ..., -0.50902629, 0.03333465, -1.24584770]]])\n"
]
}
],
"source": [
"for idx, (audio, audio_len, text, text_len) in enumerate(batch_reader()):\n",
" print('test', text)\n",
" print(\"test raw\", ''.join( chr(i) for i in text[0][:int(text_len[0])] ))\n",
" print(\"test raw\", ''.join( chr(i) for i in text[-1][:int(text_len[-1])] ))\n",
" print('audio len', audio_len)\n",
" print('test len', text_len)\n",
" print('audio', audio)\n",
" break"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "unexpected-skating",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "minus-modern",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.0"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
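The heart of the deleted create_dataloader above is the padding_batch collate: each audio feature array of shape [feat_dim, time] is zero-padded on the time axis to the longest item in the batch, and the true lengths are returned alongside. A standalone sketch of just that padding step (NumPy only; pad_audio_batch is an illustrative name, not the project's API):

import numpy as np

def pad_audio_batch(batch):
    # batch: list of (audio, text) pairs, audio shaped [feat_dim, time]
    max_len = max(audio.shape[1] for audio, _ in batch)
    padded, lens = [], []
    for audio, _ in batch:
        out = np.zeros((audio.shape[0], max_len), dtype='float32')
        out[:, :audio.shape[1]] = audio  # zeros remain past the true length
        padded.append(out)
        lens.append(audio.shape[1])
    return np.stack(padded), np.array(lens, dtype='int64')

batch = [(np.random.randn(161, 163), 'a'), (np.random.randn(161, 186), 'b')]
audios, audio_lens = pad_audio_batch(batch)
print(audios.shape, audio_lens)  # (2, 161, 186) [163 186]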
File diff suppressed because it is too large
@@ -1,290 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "breeding-haven",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"/home/ssd5/zhanghui/DeepSpeech2.x\n"
]
},
{
"data": {
"text/plain": [
"'/home/ssd5/zhanghui/DeepSpeech2.x'"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%cd ..\n",
"%pwd"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "appropriate-theta",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"LICENSE deepspeech examples\t\t requirements.txt tools\r\n",
"README.md docs\t libsndfile-1.0.28\t setup.sh\t utils\r\n",
"README_cn.md env.sh\t libsndfile-1.0.28.tar.gz tests\r\n"
]
}
],
"source": [
"!ls"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "entire-bloom",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/ssd5/zhanghui/DeepSpeech2.x/tools/venv/lib/python3.7/site-packages/paddle/fluid/layers/utils.py:26: DeprecationWarning: `np.int` is a deprecated alias for the builtin `int`. To silence this warning, use `int` by itself. Doing this will not modify any behavior and is safe. When replacing `np.int`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.\n",
"Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
" def convert_to_list(value, n, name, dtype=np.int):\n",
"WARNING:root:override cat of paddle.Tensor if exists or register, remove this when fixed!\n",
"WARNING:root:register user masked_fill to paddle.Tensor, remove this when fixed!\n",
"WARNING:root:register user masked_fill_ to paddle.Tensor, remove this when fixed!\n",
"WARNING:root:register user repeat to paddle.Tensor, remove this when fixed!\n",
"WARNING:root:register user glu to paddle.nn.functional, remove this when fixed!\n",
"WARNING:root:register user GLU to paddle.nn, remove this when fixed!\n",
"WARNING:root:register user ConstantPad2d to paddle.nn, remove this when fixed!\n",
"WARNING:root:override ctc_loss of paddle.nn.functional if exists, remove this when fixed!\n"
]
}
],
"source": [
"from deepspeech.modules import loss"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "governmental-aircraft",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/ssd5/zhanghui/DeepSpeech2.x/tools/venv/lib/python3.7/site-packages/ipykernel/ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.\n",
" and should_run_async(code)\n"
]
}
],
"source": [
"import paddle"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "proprietary-disaster",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<function deepspeech.modules.repeat(xs: paddle.VarBase, *size: Any) -> paddle.VarBase>"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"paddle.Tensor.repeat"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "first-diagram",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<property at 0x7fb515eeeb88>"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"paddle.Tensor.size"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "intelligent-david",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<function paddle.tensor.manipulation.concat(x, axis=0, name=None)>"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"paddle.Tensor.cat"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "bronze-tenant",
"metadata": {},
"outputs": [],
"source": [
"a = paddle.to_tensor([12,32, 10, 12, 123,32 ,4])"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "balanced-bearing",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"7"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"a.size"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "extreme-republic",
"metadata": {},
"outputs": [],
"source": [
"def size(xs: paddle.Tensor, *args: int) -> paddle.Tensor:\n",
" nargs = len(args)\n",
" assert (nargs <= 1)\n",
" s = paddle.shape(xs)\n",
" if nargs == 1:\n",
" return s[args[0]]\n",
" else:\n",
" return s\n",
"\n",
"# logger.warn(\n",
"# \"override size of paddle.Tensor if exists or register, remove this when fixed!\"\n",
"# )\n",
"paddle.Tensor.size = size"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "gross-addiction",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Tensor(shape=[1], dtype=int32, place=CPUPlace, stop_gradient=True,\n",
" [7])"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"a.size(0)\n",
"a.size()"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "adverse-dining",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Tensor(shape=[1], dtype=int32, place=CPUPlace, stop_gradient=True,\n",
" [7])"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"a.size()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "popular-potato",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.0"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
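What the deleted notebook above is probing: torch.Tensor.size() is a method that returns the shape (or one dimension of it), while in the Paddle version shown here Tensor.size is a property holding the total element count, so ported code monkey-patches a size() function onto paddle.Tensor, as in the notebook's size() helper. A short sketch of the semantic gap being bridged (behavior as shown in the outputs above, for Paddle 2.1-era releases; newer versions may differ):

import paddle
import torch

t = torch.zeros(2, 3)
print(t.size())   # torch.Size([2, 3]) -- the shape
print(t.size(0))  # 2                  -- one dimension

p = paddle.zeros([2, 3])
print(p.size)     # 6                  -- total number of elements
print(p.shape)    # [2, 3]             -- the torch .size() counterpart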
@@ -1,229 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 32,
"id": "academic-surname",
"metadata": {},
"outputs": [],
"source": [
"import paddle\n",
"from paddle import nn"
]
},
{
"cell_type": "code",
"execution_count": 33,
"id": "fundamental-treasure",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Parameter containing:\n",
"Tensor(shape=[256], dtype=float32, place=CUDAPlace(0), stop_gradient=False,\n",
" [1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])\n",
"Parameter containing:\n",
"Tensor(shape=[256], dtype=float32, place=CUDAPlace(0), stop_gradient=False,\n",
" [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])\n"
]
}
],
"source": [
"L = nn.LayerNorm(256, epsilon=1e-12)\n",
"for p in L.parameters():\n",
" print(p)"
]
},
{
"cell_type": "code",
"execution_count": 34,
"id": "consolidated-elephant",
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n"
]
},
{
"cell_type": "code",
"execution_count": 46,
"id": "moderate-noise",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"float64\n"
]
}
],
"source": [
"x = np.random.randn(2, 51, 256)\n",
"print(x.dtype)"
]
},
{
"cell_type": "code",
"execution_count": 47,
"id": "cooked-progressive",
"metadata": {},
"outputs": [],
"source": [
"y = L(paddle.to_tensor(x, dtype='float32'))"
]
},
{
"cell_type": "code",
"execution_count": 48,
"id": "optimum-milwaukee",
"metadata": {},
"outputs": [],
"source": [
"import torch"
]
},
{
"cell_type": "code",
"execution_count": 49,
"id": "viral-indian",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Parameter containing:\n",
"tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n",
" 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n",
" 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n",
" 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n",
" 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n",
" 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n",
" 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n",
" 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n",
" 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n",
" 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n",
" 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n",
" 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n",
" 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n",
" 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n",
" 1., 1., 1., 1.], requires_grad=True)\n",
"Parameter containing:\n",
"tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
" 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
" 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
" 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
" 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
" 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
" 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
" 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
" 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
" 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
" 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],\n",
" requires_grad=True)\n"
]
}
],
"source": [
"TL = torch.nn.LayerNorm(256, eps=1e-12)\n",
"for p in TL.parameters():\n",
" print(p)"
]
},
{
"cell_type": "code",
"execution_count": 50,
"id": "skilled-vietnamese",
"metadata": {},
"outputs": [],
"source": [
"ty = TL(torch.tensor(x, dtype=torch.float32))"
]
},
{
"cell_type": "code",
"execution_count": 51,
"id": "incorrect-allah",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"False"
]
},
"execution_count": 51,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np.allclose(y.numpy(), ty.detach().numpy())"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "prostate-cameroon",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 52,
"id": "governmental-surge",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 52,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"x = np.random.randn(2, 256)\n",
"y = L(paddle.to_tensor(x, dtype='float32'))\n",
"ty = TL(torch.tensor(x, dtype=torch.float32))\n",
"np.allclose(y.numpy(), ty.detach().numpy())"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "confidential-jacket",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.0"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
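The False result for the [2, 51, 256] LayerNorm comparison above is consistent with the Linear notebook earlier in this commit: np.allclose defaults to rtol=1e-5 and atol=1e-8, which is tighter than float32 kernels reducing over 256 values can guarantee, and the earlier notebook saw the same False flip to True at atol=1e-5. A small helper in that spirit (a sketch; the 1e-5 default is an assumption borrowed from that earlier cell, not a universal constant):

import numpy as np

def compare(a: np.ndarray, b: np.ndarray, atol: float = 1e-5) -> None:
    # Report both the strict default check and a float32-scale check.
    print('allclose (defaults):', np.allclose(a, b))
    print('allclose (atol=%g):' % atol, np.allclose(a, b, atol=atol))
    print('max abs diff:', np.abs(a - b).max())

# e.g. compare(y.numpy(), ty.detach().numpy()) for the LayerNorm outputs above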
@@ -1,449 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "primary-organic",
"metadata": {},
"outputs": [],
"source": [
"import torch"
]
},
{
"cell_type": "code",
"execution_count": 38,
"id": "stopped-semester",
"metadata": {},
"outputs": [],
"source": [
"def mask_finished_scores(score: torch.Tensor,\n",
" flag: torch.Tensor) -> torch.Tensor:\n",
" \"\"\"\n",
" If a sequence is finished, we only allow one alive branch. This function\n",
" aims to give one branch a zero score and the rest -inf score.\n",
" Args:\n",
" score (torch.Tensor): A real value array with shape\n",
" (batch_size * beam_size, beam_size).\n",
" flag (torch.Tensor): A bool array with shape\n",
" (batch_size * beam_size, 1).\n",
" Returns:\n",
" torch.Tensor: (batch_size * beam_size, beam_size).\n",
" \"\"\"\n",
" beam_size = score.size(-1)\n",
" zero_mask = torch.zeros_like(flag, dtype=torch.bool)\n",
" if beam_size > 1:\n",
" unfinished = torch.cat((zero_mask, flag.repeat([1, beam_size - 1])),\n",
" dim=1)\n",
" finished = torch.cat((flag, zero_mask.repeat([1, beam_size - 1])),\n",
" dim=1)\n",
" else:\n",
" unfinished = zero_mask\n",
" finished = flag\n",
" print(unfinished)\n",
" print(finished)\n",
" score.masked_fill_(unfinished, -float('inf'))\n",
" score.masked_fill_(finished, 0)\n",
" return score"
]
},
{
"cell_type": "code",
"execution_count": 58,
"id": "agreed-portuguese",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"tensor([[ True],\n",
" [False]])\n",
"tensor([[-0.8841, 0.7381, -0.9986],\n",
" [ 0.2675, -0.7971, 0.3798]])\n",
"tensor([[ True, True],\n",
" [False, False]])\n"
]
}
],
"source": [
"score = torch.randn((2, 3))\n",
"flag = torch.ones((2, 1), dtype=torch.bool)\n",
"flag[1] = False\n",
"print(flag)\n",
"print(score)\n",
"print(flag.repeat([1, 2]))"
]
},
{
"cell_type": "code",
"execution_count": 59,
"id": "clean-aspect",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"tensor([[False, True, True],\n",
" [False, False, False]])\n",
"tensor([[ True, False, False],\n",
" [False, False, False]])\n",
"tensor([[ 0.0000, -inf, -inf],\n",
" [ 0.2675, -0.7971, 0.3798]])\n",
"tensor([[ 0.0000, -inf, -inf],\n",
" [ 0.2675, -0.7971, 0.3798]])\n"
]
}
],
"source": [
"r = mask_finished_scores(score, flag)\n",
"print(r)\n",
"print(score)"
]
},
{
"cell_type": "code",
"execution_count": 55,
"id": "thrown-airline",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Tensor(shape=[2, 1], dtype=bool, place=CUDAPlace(0), stop_gradient=True,\n",
" [[True ],\n",
" [False]])\n",
"Tensor(shape=[2, 3], dtype=float32, place=CUDAPlace(0), stop_gradient=True,\n",
" [[ 2.05994511, 1.87704289, 0.01988174],\n",
" [-0.40165186, 0.77547729, -0.64469045]])\n",
"Tensor(shape=[2, 2], dtype=bool, place=CUDAPlace(0), stop_gradient=True,\n",
" [[True , True ],\n",
" [False, False]])\n"
]
}
],
"source": [
"import paddle\n",
"\n",
"score = paddle.randn((2, 3))\n",
"flag = paddle.ones((2, 1), dtype='bool')\n",
"flag[1] = False\n",
"print(flag)\n",
"print(score)\n",
"print(flag.tile([1, 2]))"
]
},
{
"cell_type": "code",
"execution_count": 56,
"id": "internal-patent",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Tensor(shape=[2, 3], dtype=bool, place=CUDAPlace(0), stop_gradient=True,\n",
" [[False, True , True ],\n",
" [False, False, False]])\n",
"Tensor(shape=[2, 3], dtype=bool, place=CUDAPlace(0), stop_gradient=True,\n",
" [[True , False, False],\n",
" [False, False, False]])\n",
"x Tensor(shape=[2, 3], dtype=float32, place=CUDAPlace(0), stop_gradient=True,\n",
" [[ 2.05994511, 1.87704289, 0.01988174],\n",
" [-0.40165186, 0.77547729, -0.64469045]])\n",
"2 Tensor(shape=[2, 3], dtype=float32, place=CUDAPlace(0), stop_gradient=True,\n",
" [[ 2.05994511, 1.87704289, 0.01988174],\n",
" [-0.40165186, 0.77547729, -0.64469045]])\n",
"3 Tensor(shape=[2, 3], dtype=float32, place=CUDAPlace(0), stop_gradient=True,\n",
" [[ 2.05994511, -inf. , -inf. ],\n",
" [-0.40165186, 0.77547729, -0.64469045]])\n",
"x Tensor(shape=[2, 3], dtype=float32, place=CUDAPlace(0), stop_gradient=True,\n",
" [[ 2.05994511, -inf. , -inf. ],\n",
" [-0.40165186, 0.77547729, -0.64469045]])\n",
"2 Tensor(shape=[2, 3], dtype=float32, place=CUDAPlace(0), stop_gradient=True,\n",
" [[ 2.05994511, -inf. , -inf. ],\n",
" [-0.40165186, 0.77547729, -0.64469045]])\n",
"3 Tensor(shape=[2, 3], dtype=float32, place=CUDAPlace(0), stop_gradient=True,\n",
" [[ 0. , -inf. , -inf. ],\n",
" [-0.40165186, 0.77547729, -0.64469045]])\n",
"Tensor(shape=[2, 3], dtype=float32, place=CUDAPlace(0), stop_gradient=True,\n",
" [[ 0. , -inf. , -inf. ],\n",
" [-0.40165186, 0.77547729, -0.64469045]])\n"
]
}
],
"source": [
"paddle.bool = 'bool'\n",
"\n",
"def masked_fill(xs:paddle.Tensor, mask:paddle.Tensor, value:float):\n",
" print(xs)\n",
" trues = paddle.ones_like(xs) * value\n",
|
||||
" assert xs.shape == mask.shape\n",
|
||||
" xs = paddle.where(mask, trues, xs)\n",
|
||||
" return xs\n",
|
||||
"\n",
|
||||
"def masked_fill_(xs:paddle.Tensor, mask:paddle.Tensor, value:float):\n",
|
||||
" print('x', xs)\n",
|
||||
" trues = paddle.ones_like(xs) * value\n",
|
||||
" assert xs.shape == mask.shape\n",
|
||||
" ret = paddle.where(mask, trues, xs)\n",
|
||||
" print('2', xs)\n",
|
||||
" paddle.assign(ret, output=xs)\n",
|
||||
" print('3', xs)\n",
|
||||
"\n",
|
||||
"paddle.Tensor.masked_fill = masked_fill\n",
|
||||
"paddle.Tensor.masked_fill_ = masked_fill_\n",
|
||||
"\n",
|
||||
"def mask_finished_scores_pd(score: paddle.Tensor,\n",
|
||||
" flag: paddle.Tensor) -> paddle.Tensor:\n",
|
||||
" \"\"\"\n",
|
||||
" If a sequence is finished, we only allow one alive branch. This function\n",
|
||||
" aims to give one branch a zero score and the rest -inf score.\n",
|
||||
" Args:\n",
|
||||
" score (torch.Tensor): A real value array with shape\n",
|
||||
" (batch_size * beam_size, beam_size).\n",
|
||||
" flag (torch.Tensor): A bool array with shape\n",
|
||||
" (batch_size * beam_size, 1).\n",
|
||||
" Returns:\n",
|
||||
" torch.Tensor: (batch_size * beam_size, beam_size).\n",
|
||||
" \"\"\"\n",
|
||||
" beam_size = score.shape[-1]\n",
|
||||
" zero_mask = paddle.zeros_like(flag, dtype=paddle.bool)\n",
|
||||
" if beam_size > 1:\n",
|
||||
" unfinished = paddle.concat((zero_mask, flag.tile([1, beam_size - 1])),\n",
|
||||
" axis=1)\n",
|
||||
" finished = paddle.concat((flag, zero_mask.tile([1, beam_size - 1])),\n",
|
||||
" axis=1)\n",
|
||||
" else:\n",
|
||||
" unfinished = zero_mask\n",
|
||||
" finished = flag\n",
|
||||
" print(unfinished)\n",
|
||||
" print(finished)\n",
|
||||
" \n",
|
||||
" #score.masked_fill_(unfinished, -float('inf'))\n",
|
||||
" #score.masked_fill_(finished, 0)\n",
|
||||
"# infs = paddle.ones_like(score) * -float('inf')\n",
|
||||
"# score = paddle.where(unfinished, infs, score)\n",
|
||||
"# score = paddle.where(finished, paddle.zeros_like(score), score)\n",
|
||||
"\n",
|
||||
"# score = score.masked_fill(unfinished, -float('inf'))\n",
|
||||
"# score = score.masked_fill(finished, 0)\n",
|
||||
" score.masked_fill_(unfinished, -float('inf'))\n",
|
||||
" score.masked_fill_(finished, 0)\n",
|
||||
" return score\n",
|
||||
"\n",
|
||||
"r = mask_finished_scores_pd(score, flag)\n",
|
||||
"print(r)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 57,
|
||||
"id": "vocal-prime",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"<bound method PyCapsule.value of Tensor(shape=[2, 3], dtype=float32, place=CUDAPlace(0), stop_gradient=True,\n",
|
||||
" [[ 0. , -inf. , -inf. ],\n",
|
||||
" [-0.40165186, 0.77547729, -0.64469045]])>"
|
||||
]
|
||||
},
|
||||
"execution_count": 57,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"score.value"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 71,
|
||||
"id": "bacterial-adolescent",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from typing import Union, Any"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 72,
|
||||
"id": "absent-fiber",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def repeat(xs : paddle.Tensor, *size: Any):\n",
|
||||
" print(size)\n",
|
||||
" return paddle.tile(xs, size)\n",
|
||||
"paddle.Tensor.repeat = repeat"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 73,
|
||||
"id": "material-harbor",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"(1, 2)\n",
|
||||
"Tensor(shape=[2, 2], dtype=bool, place=CUDAPlace(0), stop_gradient=True,\n",
|
||||
" [[True , True ],\n",
|
||||
" [False, False]])\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"flag = paddle.ones((2, 1), dtype='bool')\n",
|
||||
"flag[1] = False\n",
|
||||
"print(flag.repeat(1, 2))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 84,
|
||||
"id": "acute-brighton",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"(Tensor(shape=[1], dtype=int64, place=CUDAPlace(0), stop_gradient=True,\n",
|
||||
" [1]), 2)\n",
|
||||
"Tensor(shape=[2, 2], dtype=bool, place=CUDAPlace(0), stop_gradient=True,\n",
|
||||
" [[True , True ],\n",
|
||||
" [False, False]])\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"flag = paddle.ones((2, 1), dtype='bool')\n",
|
||||
"flag[1] = False\n",
|
||||
"print(flag.repeat(paddle.to_tensor(1), 2))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 85,
|
||||
"id": "european-rugby",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def size(xs, *args: int):\n",
|
||||
" nargs = len(args)\n",
|
||||
" s = paddle.shape(xs)\n",
|
||||
" assert(nargs <= 1)\n",
|
||||
" if nargs == 1:\n",
|
||||
" return s[args[0]]\n",
|
||||
" else:\n",
|
||||
" return s\n",
|
||||
"paddle.Tensor.size = size"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 86,
|
||||
"id": "moral-special",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"Tensor(shape=[2], dtype=int32, place=CPUPlace, stop_gradient=True,\n",
|
||||
" [2, 1])"
|
||||
]
|
||||
},
|
||||
"execution_count": 86,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"flag.size()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 87,
|
||||
"id": "ahead-coach",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"Tensor(shape=[1], dtype=int32, place=CPUPlace, stop_gradient=True,\n",
|
||||
" [1])"
|
||||
]
|
||||
},
|
||||
"execution_count": 87,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"flag.size(1)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 88,
|
||||
"id": "incomplete-fitness",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"Tensor(shape=[1], dtype=int32, place=CPUPlace, stop_gradient=True,\n",
|
||||
" [2])"
|
||||
]
|
||||
},
|
||||
"execution_count": 88,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"flag.size(0)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "upset-connectivity",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.7.0"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
@ -1,231 +0,0 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "designing-borough",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/ipykernel/ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.\n",
|
||||
" and should_run_async(code)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"[[ 0.0000000e+00 0.0000000e+00 0.0000000e+00 ... 0.0000000e+00\n",
|
||||
" 0.0000000e+00 0.0000000e+00]\n",
|
||||
" [ 8.4147096e-01 8.0196178e-01 7.6172036e-01 ... 1.2409373e-04\n",
|
||||
" 1.1547816e-04 1.0746076e-04]\n",
|
||||
" [ 9.0929741e-01 9.5814437e-01 9.8704624e-01 ... 2.4818745e-04\n",
|
||||
" 2.3095631e-04 2.1492151e-04]\n",
|
||||
" ...\n",
|
||||
" [ 3.7960774e-01 7.4510968e-01 7.3418564e-01 ... 1.2036801e-02\n",
|
||||
" 1.1201146e-02 1.0423505e-02]\n",
|
||||
" [-5.7338190e-01 -8.9752287e-02 -4.1488394e-02 ... 1.2160885e-02\n",
|
||||
" 1.1316618e-02 1.0530960e-02]\n",
|
||||
" [-9.9920684e-01 -8.5234123e-01 -7.8794664e-01 ... 1.2284970e-02\n",
|
||||
" 1.1432089e-02 1.0638415e-02]]\n",
|
||||
"True\n",
|
||||
"True\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import torch\n",
|
||||
"import math\n",
|
||||
"import numpy as np\n",
|
||||
"\n",
|
||||
"max_len=100\n",
|
||||
"d_model=256\n",
|
||||
"\n",
|
||||
"pe = torch.zeros(max_len, d_model)\n",
|
||||
"position = torch.arange(0, max_len,\n",
|
||||
" dtype=torch.float32).unsqueeze(1)\n",
|
||||
"toruch_position = position\n",
|
||||
"div_term = torch.exp(\n",
|
||||
" torch.arange(0, d_model, 2, dtype=torch.float32) *\n",
|
||||
" -(math.log(10000.0) / d_model))\n",
|
||||
"tourch_div_term = div_term.cpu().detach().numpy()\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"torhc_sin = torch.sin(position * div_term)\n",
|
||||
"torhc_cos = torch.cos(position * div_term)\n",
|
||||
"print(torhc_sin.cpu().detach().numpy())\n",
|
||||
"np_sin = np.sin((position * div_term).cpu().detach().numpy())\n",
|
||||
"np_cos = np.cos((position * div_term).cpu().detach().numpy())\n",
|
||||
"print(np.allclose(np_sin, torhc_sin.cpu().detach().numpy()))\n",
|
||||
"print(np.allclose(np_cos, torhc_cos.cpu().detach().numpy()))\n",
|
||||
"pe[:, 0::2] = torhc_sin\n",
|
||||
"pe[:, 1::2] = torhc_cos\n",
|
||||
"tourch_pe = pe.cpu().detach().numpy()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"id": "swiss-referral",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"True\n",
|
||||
"True\n",
|
||||
"False\n",
|
||||
"False\n",
|
||||
"False\n",
|
||||
"False\n",
|
||||
"[[ 1. 1. 1. ... 1. 1.\n",
|
||||
" 1. ]\n",
|
||||
" [ 0.5403023 0.59737533 0.6479059 ... 1. 1.\n",
|
||||
" 1. ]\n",
|
||||
" [-0.41614684 -0.28628543 -0.1604359 ... 0.99999994 1.\n",
|
||||
" 1. ]\n",
|
||||
" ...\n",
|
||||
" [-0.92514753 -0.66694194 -0.67894876 ... 0.9999276 0.99993724\n",
|
||||
" 0.9999457 ]\n",
|
||||
" [-0.81928825 -0.9959641 -0.999139 ... 0.99992603 0.999936\n",
|
||||
" 0.99994457]\n",
|
||||
" [ 0.03982088 -0.52298605 -0.6157435 ... 0.99992454 0.9999347\n",
|
||||
" 0.99994344]]\n",
|
||||
"----\n",
|
||||
"[[ 1. 1. 1. ... 1. 1.\n",
|
||||
" 1. ]\n",
|
||||
" [ 0.54030234 0.59737533 0.6479059 ... 1. 1.\n",
|
||||
" 1. ]\n",
|
||||
" [-0.41614684 -0.28628543 -0.1604359 ... 1. 1.\n",
|
||||
" 1. ]\n",
|
||||
" ...\n",
|
||||
" [-0.92514753 -0.66694194 -0.67894876 ... 0.9999276 0.9999373\n",
|
||||
" 0.9999457 ]\n",
|
||||
" [-0.81928825 -0.9959641 -0.999139 ... 0.99992603 0.999936\n",
|
||||
" 0.99994457]\n",
|
||||
" [ 0.03982088 -0.5229861 -0.6157435 ... 0.99992454 0.9999347\n",
|
||||
" 0.99994344]]\n",
|
||||
")))))))\n",
|
||||
"[[ 0.0000000e+00 0.0000000e+00 0.0000000e+00 ... 0.0000000e+00\n",
|
||||
" 0.0000000e+00 0.0000000e+00]\n",
|
||||
" [ 8.4147096e-01 8.0196178e-01 7.6172036e-01 ... 1.2409373e-04\n",
|
||||
" 1.1547816e-04 1.0746076e-04]\n",
|
||||
" [ 9.0929741e-01 9.5814437e-01 9.8704624e-01 ... 2.4818745e-04\n",
|
||||
" 2.3095631e-04 2.1492151e-04]\n",
|
||||
" ...\n",
|
||||
" [ 3.7960774e-01 7.4510968e-01 7.3418564e-01 ... 1.2036801e-02\n",
|
||||
" 1.1201146e-02 1.0423505e-02]\n",
|
||||
" [-5.7338190e-01 -8.9752287e-02 -4.1488394e-02 ... 1.2160885e-02\n",
|
||||
" 1.1316618e-02 1.0530960e-02]\n",
|
||||
" [-9.9920684e-01 -8.5234123e-01 -7.8794664e-01 ... 1.2284970e-02\n",
|
||||
" 1.1432089e-02 1.0638415e-02]]\n",
|
||||
"----\n",
|
||||
"[[ 0.0000000e+00 0.0000000e+00 0.0000000e+00 ... 0.0000000e+00\n",
|
||||
" 0.0000000e+00 0.0000000e+00]\n",
|
||||
" [ 8.4147096e-01 8.0196178e-01 7.6172036e-01 ... 1.2409373e-04\n",
|
||||
" 1.1547816e-04 1.0746076e-04]\n",
|
||||
" [ 9.0929741e-01 9.5814437e-01 9.8704624e-01 ... 2.4818745e-04\n",
|
||||
" 2.3095631e-04 2.1492151e-04]\n",
|
||||
" ...\n",
|
||||
" [ 3.7960774e-01 7.4510968e-01 7.3418564e-01 ... 1.2036801e-02\n",
|
||||
" 1.1201146e-02 1.0423505e-02]\n",
|
||||
" [-5.7338190e-01 -8.9752287e-02 -4.1488394e-02 ... 1.2160885e-02\n",
|
||||
" 1.1316618e-02 1.0530960e-02]\n",
|
||||
" [-9.9920684e-01 -8.5234123e-01 -7.8794664e-01 ... 1.2284970e-02\n",
|
||||
" 1.1432089e-02 1.0638415e-02]]\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import paddle\n",
|
||||
"paddle.set_device('cpu')\n",
|
||||
"ppe = paddle.zeros((max_len, d_model), dtype='float32')\n",
|
||||
"position = paddle.arange(0, max_len,\n",
|
||||
" dtype='float32').unsqueeze(1)\n",
|
||||
"print(np.allclose(position.numpy(), toruch_position))\n",
|
||||
"div_term = paddle.exp(\n",
|
||||
" paddle.arange(0, d_model, 2, dtype='float32') *\n",
|
||||
" -(math.log(10000.0) / d_model))\n",
|
||||
"print(np.allclose(div_term.numpy(), tourch_div_term))\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"p_sin = paddle.sin(position * div_term)\n",
|
||||
"p_cos = paddle.cos(position * div_term)\n",
|
||||
"print(np.allclose(np_sin, p_sin.numpy(), rtol=1.e-6, atol=0))\n",
|
||||
"print(np.allclose(np_cos, p_cos.numpy(), rtol=1.e-6, atol=0))\n",
|
||||
"ppe[:, 0::2] = p_sin\n",
|
||||
"ppe[:, 1::2] = p_cos\n",
|
||||
"print(np.allclose(p_sin.numpy(), torhc_sin.cpu().detach().numpy()))\n",
|
||||
"print(np.allclose(p_cos.numpy(), torhc_cos.cpu().detach().numpy()))\n",
|
||||
"print(p_cos.numpy())\n",
|
||||
"print(\"----\")\n",
|
||||
"print(torhc_cos.cpu().detach().numpy())\n",
|
||||
"print(\")))))))\")\n",
|
||||
"print(p_sin.numpy())\n",
|
||||
"print(\"----\")\n",
|
||||
"print(torhc_sin.cpu().detach().numpy())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "integrated-boards",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"False\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(np.allclose(ppe.numpy(), pe.numpy()))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "flying-reserve",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "revised-divide",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.7.0"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
File diff suppressed because one or more lines are too long
File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because one or more lines are too long
File diff suppressed because it is too large
Before Width: | Height: | Size: 206 KiB |
Before Width: | Height: | Size: 108 KiB |
@ -1,16 +0,0 @@
# Benchmarks

## Acceleration with Multi-GPUs

We compare the training time with 1, 2, 4 and 8 Tesla V100 GPUs (on a subset of LibriSpeech samples whose audio durations are between 6.0 and 7.0 seconds). The results show that a **near-linear** acceleration with multiple GPUs has been achieved. In the following figure, the training time (in seconds) is printed on the blue bars.

<img src="../images/multi_gpu_speedup.png" width=450>

| # of GPU | Acceleration Rate |
| -------- | --------------: |
| 1 | 1.00 X |
| 2 | 1.98 X |
| 4 | 3.73 X |
| 8 | 6.95 X |

`utils/profile.sh` provides such a demo profiling tool; you can modify it as needed.
@ -1,3 +0,0 @@
# Reference

* [wenet](https://github.com/mobvoi/wenet)
@ -1,9 +0,0 @@
# Released Models

## Language Model Released

Language Model | Training Data | Token-based | Size | Descriptions
:-------------:| :------------:| :-----: | -----: | :-----------------
[English LM](https://deepspeech.bj.bcebos.com/en_lm/common_crawl_00.prune01111.trie.klm) | [CommonCrawl(en.00)](http://web-language-models.s3-website-us-east-1.amazonaws.com/ngrams/en/deduped/en.00.deduped.xz) | Word-based | 8.3 GB | Pruned with 0 1 1 1 1; <br/> About 1.85 billion n-grams; <br/> 'trie' binary with '-a 22 -q 8 -b 8'
[Mandarin LM Small](https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm) | Baidu Internal Corpus | Char-based | 2.8 GB | Pruned with 0 1 2 4 4; <br/> About 0.13 billion n-grams; <br/> 'probing' binary with default settings
[Mandarin LM Large](https://deepspeech.bj.bcebos.com/zh_lm/zhidao_giga.klm) | Baidu Internal Corpus | Char-based | 70.4 GB | No Pruning; <br/> About 3.7 billion n-grams; <br/> 'probing' binary with default settings
After Width: | Height: | Size: 93 KiB |
After Width: | Height: | Size: 93 KiB |
@ -0,0 +1,190 @@
# Deepspeech2
## Streaming

The implemented architecture of the Deepspeech2 online model is based on the [Deepspeech2 model](https://arxiv.org/pdf/1512.02595.pdf) with some changes.
The model is mainly composed of a 2D convolution subsampling layer and stacked single-direction RNN layers.

To illustrate the model implementation clearly, 3 parts are described in detail.
- Data Preparation
- Encoder
- Decoder

In addition, the training process and the testing process are also introduced.

The architecture of the model is shown in Fig.1.

<p align="center">
<img src="../images/ds2onlineModel.png" width=800>
<br/>Fig.1 The architecture of the Deepspeech2 online model
</p>

### Data Preparation
#### Vocabulary
For English data, the vocabulary dictionary is composed of the 26 English characters, " ' ", space, \<blank\>, \<unk\> and \<eos\>. The \<blank\> represents the blank label in CTC, the \<unk\> represents the unknown character and the \<eos\> represents the start and end of a sentence. For Mandarin, the vocabulary dictionary is composed of the Chinese characters that occur in the training set, plus the same three special characters \<blank\>, \<unk\> and \<eos\>. For both English and Mandarin data, the default indices are \<blank\>=0, \<unk\>=1 and \<eos\>=last index.
```
# The code to build vocabulary
cd examples/aishell/s0
python3 ../../../utils/build_vocab.py \
--unit_type="char" \
--count_threshold=0 \
--vocab_path="data/vocab.txt" \
--manifest_paths "data/manifest.train.raw" "data/manifest.dev.raw"

# vocabulary for aishell dataset (Mandarin)
vi examples/aishell/s0/data/vocab.txt

# vocabulary for librispeech dataset (English)
vi examples/librispeech/s0/data/vocab.txt
```
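
As a rough illustration of the index convention above, the snippet below reads a vocabulary file into a token-to-id mapping; the helper is hypothetical and not part of the repo.
```
# Minimal sketch (assumption: data/vocab.txt holds one token per line,
# as produced by build_vocab.py; line order fixes the token ids).
def load_vocab(vocab_path):
    with open(vocab_path, encoding='utf-8') as f:
        tokens = f.read().splitlines()   # one token per line, order fixes the id
    return {tok: i for i, tok in enumerate(tokens)}

token2id = load_vocab('data/vocab.txt')
blank_id = token2id['<blank>']   # expected: 0
unk_id = token2id['<unk>']       # expected: 1
eos_id = token2id['<eos>']       # expected: len(token2id) - 1
```
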
#### CMVN
For CMVN, a subset of the training set (or the full set) is chosen and used to compute the feature mean and std.
```
# The code to compute the feature mean and std
cd examples/aishell/s0
python3 ../../../utils/compute_mean_std.py \
--manifest_path="data/manifest.train.raw" \
--spectrum_type="linear" \
--delta_delta=false \
--stride_ms=10.0 \
--window_ms=20.0 \
--sample_rate=16000 \
--use_dB_normalization=True \
--num_samples=2000 \
--num_workers=10 \
--output_path="data/mean_std.json"

```
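
For intuition, here is a simplified stand-in for what the CMVN statistics do; it is not the actual `compute_mean_std.py` implementation, and the per-utterance feature matrices of shape (num_frames, feat_dim) are an assumption.
```
import numpy as np

def compute_cmvn(feats_list):
    # Accumulate per-dimension statistics over the sampled utterances.
    stacked = np.concatenate(feats_list, axis=0)   # (total_frames, feat_dim)
    return stacked.mean(axis=0), stacked.std(axis=0)

def apply_cmvn(feat, mean, std, eps=1e-20):
    # Normalize each feature dimension to zero mean and unit variance.
    return (feat - mean) / (std + eps)
```
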
#### Feature Extraction
For feature extraction, three methods are implemented: linear (FFT magnitude without a filter bank), fbank and mfcc.
Currently, the released Deepspeech2 online model uses the linear feature extraction method.
```
# The code for feature extraction
vi deepspeech/frontend/featurizer/audio_featurizer.py
```
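
The sketch below shows the idea behind the linear feature (framed FFT magnitude); it is a simplification, not the `audio_featurizer.py` code, with the 20 ms window and 10 ms stride mirroring the CMVN command above.
```
import numpy as np

def linear_spectrogram(samples, sample_rate=16000, window_ms=20.0, stride_ms=10.0):
    # Frame the waveform, taper each frame, and take the FFT magnitude.
    # Assumes len(samples) >= one window.
    win = int(sample_rate * window_ms / 1000)
    hop = int(sample_rate * stride_ms / 1000)
    n_frames = 1 + (len(samples) - win) // hop
    frames = np.stack([samples[i * hop:i * hop + win] for i in range(n_frames)])
    frames = frames * np.hanning(win)
    return np.abs(np.fft.rfft(frames, axis=1))   # (n_frames, win // 2 + 1)
```
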
### Encoder
The encoder is composed of two 2D convolution subsampling layers and a number of stacked single-direction RNN layers. The 2D convolution subsampling layers extract feature representations from the raw audio features and reduce the length of the audio features at the same time. After passing through the convolution subsampling layers, the feature representations are fed into the stacked RNN layers. For the stacked RNN layers, both LSTM cells and GRU cells can be used. Adding one fully connected (fc) layer after the stacked RNN layers is optional; if the number of stacked RNN layers is less than 5, adding one fc layer after the stacked RNN layers is recommended.

The code of the encoder is in:
```
vi deepspeech/models/ds2_online/deepspeech2.py
```
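
For orientation, here is a condensed sketch of the encoder structure described above; it is not the actual `ds2_online` implementation, and all sizes are illustrative.
```
import paddle
from paddle import nn

class OnlineEncoderSketch(nn.Layer):
    def __init__(self, feat_dim=161, rnn_size=1024, num_rnn_layers=3):
        super().__init__()
        # Two conv layers, each subsampling the time and feature axes by 2.
        self.conv = nn.Sequential(
            nn.Conv2D(1, 32, kernel_size=3, stride=2, padding=1), nn.ReLU(),
            nn.Conv2D(32, 32, kernel_size=3, stride=2, padding=1), nn.ReLU())
        conv_out_dim = 32 * ((feat_dim + 3) // 4)   # channels * reduced feat axis
        # Forward-only RNN keeps the encoder streamable.
        self.rnn = nn.GRU(conv_out_dim, rnn_size,
                          num_layers=num_rnn_layers, direction='forward')
        self.fc = nn.Linear(rnn_size, rnn_size)     # the optional fc layer

    def forward(self, x):                           # x: (batch, time, feat_dim)
        x = self.conv(x.unsqueeze(1))               # (B, 32, T/4, D/4)
        b, c, t, d = x.shape
        x = x.transpose([0, 2, 1, 3]).reshape([b, t, c * d])
        x, _ = self.rnn(x)                          # (B, T/4, rnn_size)
        return self.fc(x)
```
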
### Decoder
To get the character probabilities of each frame, the frame-level feature representations output by the encoder are fed into a projection layer, which is implemented as a dense layer. The output dimension of the projection layer is the same as the vocabulary size. After the projection layer, the softmax function transforms the frame-level feature representations into character probabilities. At inference time, the character probabilities of each frame are fed into the CTC decoder to get the final speech recognition results.

The code of the decoder is in:
```
# The code of constructing the decoder in model
vi deepspeech/models/ds2_online/deepspeech2.py
# The code of CTC Decoder
vi deepspeech/modules/ctc.py
```
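
A minimal sketch of the projection-plus-softmax head described above (the CTC search itself is omitted, and the sizes are assumptions):
```
import paddle
from paddle import nn
import paddle.nn.functional as F

rnn_size, vocab_size = 1024, 4334              # illustrative sizes
projection = nn.Linear(rnn_size, vocab_size)   # dense layer to vocab size

encoder_out = paddle.randn([8, 50, rnn_size])  # (batch, frames, features)
logits = projection(encoder_out)               # (batch, frames, vocab_size)
probs = F.softmax(logits, axis=-1)             # per-frame character probabilities
# At inference, `probs` would be handed to a CTC beam-search decoder.
```
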
## Training Process
Using the command below, you can train the Deepspeech2 online model.
```
cd examples/aishell/s0
bash run.sh --stage 0 --stop_stage 2 --model_type online --conf_path conf/deepspeech2_online.yaml
```
The detailed commands are:
```
# The code for training in run.sh
set -e
source path.sh

gpus=2,3,5,7
stage=0
stop_stage=5
conf_path=conf/deepspeech2_online.yaml # conf/deepspeech2.yaml | conf/deepspeech2_online.yaml
avg_num=1
model_type=online # online | offline

source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;

avg_ckpt=avg_${avg_num}
ckpt=$(basename ${conf_path} | awk -F'.' '{print $1}')
echo "checkpoint name ${ckpt}"

if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    # prepare data
    bash ./local/data.sh || exit -1
fi

if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    # train model, all `ckpt` under `exp` dir
    CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt} ${model_type}
fi

if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    # avg n best model
    avg.sh exp/${ckpt}/checkpoints ${avg_num}
fi
```

By using the command above, the training process can be started. There are 6 stages in "run.sh" (stage 0 to stage 5), and the first 3 stages are used for the training process. Stage 0 is used for data preparation, in which the dataset is downloaded, and the manifest files of the datasets, the vocabulary dictionary and the CMVN file are generated in "./data/". Stage 1 is used for training the model; the log files and model checkpoints are saved in "exp/deepspeech2_online/". Stage 2 is used to generate the final model for prediction by averaging the top-k model parameters based on the validation loss.

## Testing Process
Using the command below, you can test the Deepspeech2 online model.
```
bash run.sh --stage 3 --stop_stage 5 --model_type online --conf_path conf/deepspeech2_online.yaml
```
The detailed commands are:
```
conf_path=conf/deepspeech2_online.yaml
avg_num=1
model_type=online
avg_ckpt=avg_${avg_num}

if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    # test ckpt avg_n
    CUDA_VISIBLE_DEVICES=2 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${model_type} || exit -1
fi

if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
    # export ckpt avg_n
    CUDA_VISIBLE_DEVICES=5 ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit ${model_type}
fi

if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
    # test export ckpt avg_n
    CUDA_VISIBLE_DEVICES=0 ./local/test_export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt}.jit ${model_type} || exit -1
fi
```
After the training process, we use stages 3, 4 and 5 for the testing process. Stage 3 tests the model generated in stage 2 and reports the CER on the test set. Stage 4 transforms the model from a dynamic graph to a static graph by using the "paddle.jit" library. Stage 5 tests the model in the static graph.

## Non-Streaming
The Deepspeech2 offline model is similar to the Deepspeech2 online model. The main differences between them are that the offline model uses stacked bidirectional RNN layers while the online model uses single-direction RNN layers, and that the fc layer is not used in the offline model. For the stacked bidirectional RNN layers in the offline model, both vanilla RNN cells and GRU cells can be used.

The architecture of the model is shown in Fig.2.
<p align="center">
<img src="../images/ds2offlineModel.png" width=800>
<br/>Fig.2 The architecture of the Deepspeech2 offline model
</p>

For data preparation and the decoder, the Deepspeech2 offline model is the same as the Deepspeech2 online model.

The code of the encoder and decoder for the Deepspeech2 offline model is in:
```
vi deepspeech/models/ds2/deepspeech2.py
```
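
The snippet below illustrates the key difference in RNN direction between the two variants; it is a toy example under assumed sizes, not the repo's code.
```
import paddle
from paddle import nn

# Online (streaming): forward-only RNN. Offline: bidirectional RNN.
online_rnn = nn.GRU(input_size=1312, hidden_size=1024,
                    num_layers=3, direction='forward')
offline_rnn = nn.GRU(input_size=1312, hidden_size=1024,
                     num_layers=3, direction='bidirect')

x = paddle.randn([8, 50, 1312])   # (batch, frames, features)
y_on, _ = online_rnn(x)           # (8, 50, 1024)
y_off, _ = offline_rnn(x)         # (8, 50, 2048): both directions concatenated
```
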
The training process and testing process of the Deepspeech2 offline model are very similar to those of the Deepspeech2 online model.
Only a few changes need to be noticed.

For training and testing, the "model_type" and the "conf_path" must be set.
```
# Training offline
cd examples/aishell/s0
bash run.sh --stage 0 --stop_stage 2 --model_type offline --conf_path conf/deepspeech2.yaml
```
```
# Testing offline
cd examples/aishell/s0
bash run.sh --stage 3 --stop_stage 5 --model_type offline --conf_path conf/deepspeech2.yaml
```
@ -0,0 +1,8 @@
# Reference

We refer to these repos to build `model` and `engine`:

* [delta](https://github.com/Delta-ML/delta.git)
* [espnet](https://github.com/espnet/espnet.git)
* [kaldi](https://github.com/kaldi-asr/kaldi.git)
* [wenet](https://github.com/mobvoi/wenet)
@ -0,0 +1,28 @@
# Released Models

## Acoustic Model Released in paddle 2.X
Acoustic Model | Training Data | Token-based | Size | Descriptions | CER | WER | Hours of speech
:-------------:| :------------:| :-----: | -----: | :----------------- |:--------- | :---------- | :---------
[Ds2 Online Aishell Model](https://deepspeech.bj.bcebos.com/release2.1/aishell/s0/aishell.s0.ds_online.5rnn.debug.tar.gz) | Aishell Dataset | Char-based | 345 MB | 2 Conv + 5 LSTM layers with only forward direction | 0.0824 |-| 151 h
[Ds2 Offline Aishell Model](https://deepspeech.bj.bcebos.com/release2.1/aishell/s0/aishell.s0.ds2.offline.cer6p65.release.tar.gz)| Aishell Dataset | Char-based | 306 MB | 2 Conv + 3 bidirectional GRU layers| 0.065 |-| 151 h
[Conformer Online Aishell Model](https://deepspeech.bj.bcebos.com/release2.1/aishell/s1/aishell.chunk.release.tar.gz) | Aishell Dataset | Char-based | 283 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention + CTC | 0.0594 |-| 151 h
[Conformer Offline Aishell Model](https://deepspeech.bj.bcebos.com/release2.1/aishell/s1/aishell.release.tar.gz) | Aishell Dataset | Char-based | 284 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention | 0.0547 |-| 151 h
[Conformer Librispeech Model](https://deepspeech.bj.bcebos.com/release2.1/librispeech/s1/conformer.release.tar.gz) | Librispeech Dataset | Word-based | 287 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention |-| 0.0325 | 960 h
[Transformer Librispeech Model](https://deepspeech.bj.bcebos.com/release2.1/librispeech/s1/transformer.release.tar.gz) | Librispeech Dataset | Word-based | 195 MB | Encoder:Transformer, Decoder:Transformer, Decoding method: Attention |-| 0.0544 | 960 h

## Acoustic Model Transformed from paddle 1.8
Acoustic Model | Training Data | Token-based | Size | Descriptions | CER | WER | Hours of speech
:-------------:| :------------:| :-----: | -----: | :----------------- | :---------- | :---------- | :---------
[Ds2 Offline Aishell model](https://deepspeech.bj.bcebos.com/mandarin_models/aishell_model_v1.8_to_v2.x.tar.gz)|Aishell Dataset| Char-based| 234 MB| 2 Conv + 3 bidirectional GRU layers| 0.0804 |-| 151 h|
[Ds2 Offline Librispeech model](https://deepspeech.bj.bcebos.com/eng_models/librispeech_v1.8_to_v2.x.tar.gz)|Librispeech Dataset| Word-based| 307 MB| 2 Conv + 3 bidirectional sharing weight RNN layers |-| 0.0685| 960 h|
[Ds2 Offline Baidu en8k model](https://deepspeech.bj.bcebos.com/eng_models/baidu_en8k_v1.8_to_v2.x.tar.gz)|Baidu Internal English Dataset| Word-based| 273 MB| 2 Conv + 3 bidirectional GRU layers |-| 0.0541 | 8628 h|

## Language Model Released

Language Model | Training Data | Token-based | Size | Descriptions
:-------------:| :------------:| :-----: | -----: | :-----------------
[English LM](https://deepspeech.bj.bcebos.com/en_lm/common_crawl_00.prune01111.trie.klm) | [CommonCrawl(en.00)](http://web-language-models.s3-website-us-east-1.amazonaws.com/ngrams/en/deduped/en.00.deduped.xz) | Word-based | 8.3 GB | Pruned with 0 1 1 1 1; <br/> About 1.85 billion n-grams; <br/> 'trie' binary with '-a 22 -q 8 -b 8'
[Mandarin LM Small](https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm) | Baidu Internal Corpus | Char-based | 2.8 GB | Pruned with 0 1 2 4 4; <br/> About 0.13 billion n-grams; <br/> 'probing' binary with default settings
[Mandarin LM Large](https://deepspeech.bj.bcebos.com/zh_lm/zhidao_giga.klm) | Baidu Internal Corpus | Char-based | 70.4 GB | No Pruning; <br/> About 3.7 billion n-grams; <br/> 'probing' binary with default settings
@ -1,10 +1,10 @@
|
||||
export MAIN_ROOT=${PWD}
|
||||
|
||||
export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
|
||||
export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:/usr/local/bin:${PATH}
|
||||
export LC_ALL=C
|
||||
|
||||
# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
|
||||
export PYTHONIOENCODING=UTF-8
|
||||
export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
|
||||
|
||||
export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib/
|
||||
|
@ -0,0 +1 @@
|
||||
tmp
|
@ -0,0 +1,11 @@
|
||||
# 1xt2x
|
||||
|
||||
Convert released Deepspeech 1.8 models to 2.x.
|
||||
|
||||
## Model
|
||||
* Deepspeech2x
|
||||
|
||||
## Exp
|
||||
* baidu_en8k
|
||||
* aishell
|
||||
* librispeech
|
@ -0,0 +1,5 @@
|
||||
exp
|
||||
data
|
||||
*log
|
||||
tmp
|
||||
nohup*
|
@ -0,0 +1 @@
|
||||
[]
|
@ -0,0 +1,67 @@
|
||||
# https://yaml.org/type/float.html
|
||||
data:
|
||||
train_manifest: data/manifest.train
|
||||
dev_manifest: data/manifest.dev
|
||||
test_manifest: data/manifest.test
|
||||
min_input_len: 0.0
|
||||
max_input_len: 27.0 # second
|
||||
min_output_len: 0.0
|
||||
max_output_len: .inf
|
||||
min_output_input_ratio: 0.00
|
||||
max_output_input_ratio: .inf
|
||||
|
||||
collator:
|
||||
batch_size: 64 # one gpu
|
||||
mean_std_filepath: data/mean_std.npz
|
||||
unit_type: char
|
||||
vocab_filepath: data/vocab.txt
|
||||
augmentation_config: conf/augmentation.json
|
||||
random_seed: 0
|
||||
spm_model_prefix:
|
||||
spectrum_type: linear
|
||||
feat_dim:
|
||||
delta_delta: False
|
||||
stride_ms: 10.0
|
||||
window_ms: 20.0
|
||||
n_fft: None
|
||||
max_freq: None
|
||||
target_sample_rate: 16000
|
||||
use_dB_normalization: True
|
||||
target_dB: -20
|
||||
dither: 1.0
|
||||
keep_transcription_text: False
|
||||
sortagrad: True
|
||||
shuffle_method: batch_shuffle
|
||||
num_workers: 2
|
||||
|
||||
model:
|
||||
num_conv_layers: 2
|
||||
num_rnn_layers: 3
|
||||
rnn_layer_size: 1024
|
||||
use_gru: True
|
||||
share_rnn_weights: False
|
||||
blank_id: 4333
|
||||
|
||||
training:
|
||||
n_epoch: 80
|
||||
accum_grad: 1
|
||||
lr: 2e-3
|
||||
lr_decay: 0.83
|
||||
weight_decay: 1e-06
|
||||
global_grad_clip: 3.0
|
||||
log_interval: 100
|
||||
checkpoint:
|
||||
kbest_n: 50
|
||||
latest_n: 5
|
||||
|
||||
decoding:
|
||||
batch_size: 32
|
||||
error_rate_type: cer
|
||||
decoding_method: ctc_beam_search
|
||||
lang_model_path: data/lm/zh_giga.no_cna_cmn.prune01244.klm
|
||||
alpha: 2.6
|
||||
beta: 5.0
|
||||
beam_size: 300
|
||||
cutoff_prob: 0.99
|
||||
cutoff_top_n: 40
|
||||
num_proc_bsearch: 8
|
@ -0,0 +1,70 @@
|
||||
#!/bin/bash
|
||||
if [ $# != 1 ];then
|
||||
echo "usage: ${0} ckpt_dir"
|
||||
exit -1
|
||||
fi
|
||||
|
||||
ckpt_dir=$1
|
||||
|
||||
stage=-1
|
||||
stop_stage=100
|
||||
|
||||
source ${MAIN_ROOT}/utils/parse_options.sh
|
||||
|
||||
mkdir -p data
|
||||
TARGET_DIR=${MAIN_ROOT}/examples/dataset
|
||||
mkdir -p ${TARGET_DIR}
|
||||
|
||||
bash local/download_model.sh ${ckpt_dir}
|
||||
if [ $? -ne 0 ]; then
|
||||
exit 1
|
||||
fi
|
||||
|
||||
cd ${ckpt_dir}
|
||||
tar xzvf aishell_model_v1.8_to_v2.x.tar.gz
|
||||
cd -
|
||||
mv ${ckpt_dir}/mean_std.npz data/
|
||||
mv ${ckpt_dir}/vocab.txt data/
|
||||
|
||||
|
||||
if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
|
||||
# download data, generate manifests
|
||||
python3 ${TARGET_DIR}/aishell/aishell.py \
|
||||
--manifest_prefix="data/manifest" \
|
||||
--target_dir="${TARGET_DIR}/aishell"
|
||||
|
||||
if [ $? -ne 0 ]; then
|
||||
echo "Prepare Aishell failed. Terminated."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
for dataset in train dev test; do
|
||||
mv data/manifest.${dataset} data/manifest.${dataset}.raw
|
||||
done
|
||||
fi
|
||||
|
||||
|
||||
|
||||
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
|
||||
# format manifest with tokenids, vocab size
|
||||
for dataset in train dev test; do
|
||||
{
|
||||
python3 ${MAIN_ROOT}/utils/format_data.py \
|
||||
--feat_type "raw" \
|
||||
--cmvn_path "data/mean_std.npz" \
|
||||
--unit_type "char" \
|
||||
--vocab_path="data/vocab.txt" \
|
||||
--manifest_path="data/manifest.${dataset}.raw" \
|
||||
--output_path="data/manifest.${dataset}"
|
||||
|
||||
if [ $? -ne 0 ]; then
|
||||
echo "Formt mnaifest failed. Terminated."
|
||||
exit 1
|
||||
fi
|
||||
} &
|
||||
done
|
||||
wait
|
||||
fi
|
||||
|
||||
echo "Aishell data preparation done."
|
||||
exit 0
|
@ -0,0 +1,21 @@
|
||||
#!/bin/bash
|
||||
|
||||
. ${MAIN_ROOT}/utils/utility.sh
|
||||
|
||||
DIR=data/lm
|
||||
mkdir -p ${DIR}
|
||||
|
||||
URL='https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm'
|
||||
MD5="29e02312deb2e59b3c8686c7966d4fe3"
|
||||
TARGET=${DIR}/zh_giga.no_cna_cmn.prune01244.klm
|
||||
|
||||
|
||||
echo "Download language model ..."
|
||||
download $URL $MD5 $TARGET
|
||||
if [ $? -ne 0 ]; then
|
||||
echo "Fail to download the language model!"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
|
||||
exit 0
|
@ -0,0 +1,25 @@
|
||||
#! /usr/bin/env bash
|
||||
|
||||
if [ $# != 1 ];then
|
||||
echo "usage: ${0} ckpt_dir"
|
||||
exit -1
|
||||
fi
|
||||
|
||||
ckpt_dir=$1
|
||||
|
||||
. ${MAIN_ROOT}/utils/utility.sh
|
||||
|
||||
URL='https://deepspeech.bj.bcebos.com/mandarin_models/aishell_model_v1.8_to_v2.x.tar.gz'
|
||||
MD5=87e7577d4bea737dbf3e8daab37aa808
|
||||
TARGET=${ckpt_dir}/aishell_model_v1.8_to_v2.x.tar.gz
|
||||
|
||||
|
||||
echo "Download Aishell model ..."
|
||||
download $URL $MD5 $TARGET
|
||||
if [ $? -ne 0 ]; then
|
||||
echo "Fail to download Aishell model!"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
|
||||
exit 0
|
@ -0,0 +1,34 @@
|
||||
#!/bin/bash
|
||||
|
||||
if [ $# != 3 ];then
|
||||
echo "usage: ${0} config_path ckpt_path_prefix model_type"
|
||||
exit -1
|
||||
fi
|
||||
|
||||
ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
|
||||
echo "using $ngpu gpus..."
|
||||
|
||||
config_path=$1
|
||||
ckpt_prefix=$2
|
||||
model_type=$3
|
||||
|
||||
# download language model
|
||||
bash local/download_lm_ch.sh
|
||||
if [ $? -ne 0 ]; then
|
||||
exit 1
|
||||
fi
|
||||
|
||||
python3 -u ${BIN_DIR}/test.py \
|
||||
--nproc ${ngpu} \
|
||||
--config ${config_path} \
|
||||
--result_file ${ckpt_prefix}.rsl \
|
||||
--checkpoint_path ${ckpt_prefix} \
|
||||
--model_type ${model_type}
|
||||
|
||||
if [ $? -ne 0 ]; then
|
||||
echo "Failed in evaluation!"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
|
||||
exit 0
|
@ -0,0 +1,16 @@
|
||||
export MAIN_ROOT=`realpath ${PWD}/../../../`
|
||||
export LOCAL_DEEPSPEECH2=`realpath ${PWD}/../`
|
||||
|
||||
export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
|
||||
export LC_ALL=C
|
||||
|
||||
# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
|
||||
export PYTHONIOENCODING=UTF-8
|
||||
export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
|
||||
export PYTHONPATH=${LOCAL_DEEPSPEECH2}:${PYTHONPATH}
|
||||
|
||||
export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib/
|
||||
|
||||
MODEL=deepspeech2
|
||||
export BIN_DIR=${LOCAL_DEEPSPEECH2}/src_deepspeech2x/bin
|
||||
echo "BIN_DIR "${BIN_DIR}
|
@ -0,0 +1,28 @@
|
||||
#!/bin/bash
|
||||
set -e
|
||||
source path.sh
|
||||
|
||||
stage=0
|
||||
stop_stage=100
|
||||
conf_path=conf/deepspeech2.yaml
|
||||
avg_num=1
|
||||
model_type=offline
|
||||
gpus=2
|
||||
|
||||
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
|
||||
|
||||
v18_ckpt=aishell_v1.8
|
||||
ckpt=$(basename ${conf_path} | awk -F'.' '{print $1}')
|
||||
echo "checkpoint name ${ckpt}"
|
||||
|
||||
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
|
||||
# prepare data
|
||||
mkdir -p exp/${ckpt}/checkpoints
|
||||
bash ./local/data.sh exp/${ckpt}/checkpoints || exit -1
|
||||
fi
|
||||
|
||||
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
|
||||
# test ckpt avg_n
|
||||
CUDA_VISIBLE_DEVICES=${gpus} ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${v18_ckpt} ${model_type}|| exit -1
|
||||
fi
|
||||
|
@ -0,0 +1,5 @@
|
||||
exp
|
||||
data
|
||||
*log
|
||||
tmp
|
||||
nohup*
|
@ -0,0 +1 @@
|
||||
[]
|
@ -0,0 +1,67 @@
|
||||
# https://yaml.org/type/float.html
|
||||
data:
|
||||
train_manifest: data/manifest.train
|
||||
dev_manifest: data/manifest.dev
|
||||
test_manifest: data/manifest.test-clean
|
||||
min_input_len: 0.0
|
||||
max_input_len: .inf # second
|
||||
min_output_len: 0.0
|
||||
max_output_len: .inf
|
||||
min_output_input_ratio: 0.00
|
||||
max_output_input_ratio: .inf
|
||||
|
||||
collator:
|
||||
batch_size: 64 # one gpu
|
||||
mean_std_filepath: data/mean_std.npz
|
||||
unit_type: char
|
||||
vocab_filepath: data/vocab.txt
|
||||
augmentation_config: conf/augmentation.json
|
||||
random_seed: 0
|
||||
spm_model_prefix:
|
||||
spectrum_type: linear
|
||||
feat_dim:
|
||||
delta_delta: False
|
||||
stride_ms: 10.0
|
||||
window_ms: 20.0
|
||||
n_fft: None
|
||||
max_freq: None
|
||||
target_sample_rate: 16000
|
||||
use_dB_normalization: True
|
||||
target_dB: -20
|
||||
dither: 1.0
|
||||
keep_transcription_text: False
|
||||
sortagrad: True
|
||||
shuffle_method: batch_shuffle
|
||||
num_workers: 2
|
||||
|
||||
model:
|
||||
num_conv_layers: 2
|
||||
num_rnn_layers: 3
|
||||
rnn_layer_size: 1024
|
||||
use_gru: True
|
||||
share_rnn_weights: False
|
||||
blank_id: 28
|
||||
|
||||
training:
|
||||
n_epoch: 80
|
||||
accum_grad: 1
|
||||
lr: 2e-3
|
||||
lr_decay: 0.83
|
||||
weight_decay: 1e-06
|
||||
global_grad_clip: 3.0
|
||||
log_interval: 100
|
||||
checkpoint:
|
||||
kbest_n: 50
|
||||
latest_n: 5
|
||||
|
||||
decoding:
|
||||
batch_size: 32
|
||||
error_rate_type: wer
|
||||
decoding_method: ctc_beam_search
|
||||
lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm
|
||||
alpha: 1.4
|
||||
beta: 0.35
|
||||
beam_size: 500
|
||||
cutoff_prob: 1.0
|
||||
cutoff_top_n: 40
|
||||
num_proc_bsearch: 8
|
@ -0,0 +1,86 @@
|
||||
#!/bin/bash
|
||||
if [ $# != 1 ];then
|
||||
echo "usage: ${0} ckpt_dir"
|
||||
exit -1
|
||||
fi
|
||||
|
||||
ckpt_dir=$1
|
||||
|
||||
stage=-1
|
||||
stop_stage=100
|
||||
unit_type=char
|
||||
|
||||
source ${MAIN_ROOT}/utils/parse_options.sh
|
||||
|
||||
mkdir -p data
|
||||
TARGET_DIR=${MAIN_ROOT}/examples/dataset
|
||||
mkdir -p ${TARGET_DIR}
|
||||
|
||||
|
||||
bash local/download_model.sh ${ckpt_dir}
|
||||
if [ $? -ne 0 ]; then
|
||||
exit 1
|
||||
fi
|
||||
|
||||
cd ${ckpt_dir}
|
||||
tar xzvf baidu_en8k_v1.8_to_v2.x.tar.gz
|
||||
cd -
|
||||
mv ${ckpt_dir}/mean_std.npz data/
|
||||
mv ${ckpt_dir}/vocab.txt data/
|
||||
|
||||
|
||||
if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
|
||||
# download data, generate manifests
|
||||
python3 ${TARGET_DIR}/librispeech/librispeech.py \
|
||||
--manifest_prefix="data/manifest" \
|
||||
--target_dir="${TARGET_DIR}/librispeech" \
|
||||
--full_download="True"
|
||||
|
||||
if [ $? -ne 0 ]; then
|
||||
echo "Prepare LibriSpeech failed. Terminated."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
for set in train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other; do
|
||||
mv data/manifest.${set} data/manifest.${set}.raw
|
||||
done
|
||||
|
||||
rm -rf data/manifest.train.raw data/manifest.dev.raw data/manifest.test.raw
|
||||
for set in train-clean-100 train-clean-360 train-other-500; do
|
||||
cat data/manifest.${set}.raw >> data/manifest.train.raw
|
||||
done
|
||||
|
||||
for set in dev-clean dev-other; do
|
||||
cat data/manifest.${set}.raw >> data/manifest.dev.raw
|
||||
done
|
||||
|
||||
for set in test-clean test-other; do
|
||||
cat data/manifest.${set}.raw >> data/manifest.test.raw
|
||||
done
|
||||
fi
|
||||
|
||||
|
||||
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
|
||||
# format manifest with tokenids, vocab size
|
||||
for set in train dev test dev-clean dev-other test-clean test-other; do
|
||||
{
|
||||
python3 ${MAIN_ROOT}/utils/format_data.py \
|
||||
--feat_type "raw" \
|
||||
--cmvn_path "data/mean_std.npz" \
|
||||
--unit_type ${unit_type} \
|
||||
--vocab_path="data/vocab.txt" \
|
||||
--manifest_path="data/manifest.${set}.raw" \
|
||||
--output_path="data/manifest.${set}"
|
||||
|
||||
if [ $? -ne 0 ]; then
|
||||
echo "Formt mnaifest.${set} failed. Terminated."
|
||||
exit 1
|
||||
fi
|
||||
}&
|
||||
done
|
||||
wait
|
||||
fi
|
||||
|
||||
echo "LibriSpeech Data preparation done."
|
||||
exit 0
|
||||
|
@ -0,0 +1,20 @@
|
||||
#!/bin/bash
|
||||
|
||||
. ${MAIN_ROOT}/utils/utility.sh
|
||||
|
||||
DIR=data/lm
|
||||
mkdir -p ${DIR}
|
||||
|
||||
URL=https://deepspeech.bj.bcebos.com/en_lm/common_crawl_00.prune01111.trie.klm
|
||||
MD5="099a601759d467cd0a8523ff939819c5"
|
||||
TARGET=${DIR}/common_crawl_00.prune01111.trie.klm
|
||||
|
||||
echo "Download language model ..."
|
||||
download $URL $MD5 $TARGET
|
||||
if [ $? -ne 0 ]; then
|
||||
echo "Fail to download the language model!"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
|
||||
exit 0
|
@ -0,0 +1,25 @@
|
||||
#! /usr/bin/env bash
|
||||
if [ $# != 1 ];then
|
||||
echo "usage: ${0} ckpt_dir"
|
||||
exit -1
|
||||
fi
|
||||
|
||||
ckpt_dir=$1
|
||||
|
||||
|
||||
. ${MAIN_ROOT}/utils/utility.sh
|
||||
|
||||
URL='https://deepspeech.bj.bcebos.com/eng_models/baidu_en8k_v1.8_to_v2.x.tar.gz'
|
||||
MD5=c1676be8505cee436e6f312823e9008c
|
||||
TARGET=${ckpt_dir}/baidu_en8k_v1.8_to_v2.x.tar.gz
|
||||
|
||||
|
||||
echo "Download BaiduEn8k model ..."
|
||||
download $URL $MD5 $TARGET
|
||||
if [ $? -ne 0 ]; then
|
||||
echo "Fail to download BaiduEn8k model!"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
|
||||
exit 0
|
@ -0,0 +1,34 @@
|
||||
#!/bin/bash
|
||||
|
||||
if [ $# != 3 ];then
|
||||
echo "usage: ${0} config_path ckpt_path_prefix model_type"
|
||||
exit -1
|
||||
fi
|
||||
|
||||
ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
|
||||
echo "using $ngpu gpus..."
|
||||
|
||||
config_path=$1
|
||||
ckpt_prefix=$2
|
||||
model_type=$3
|
||||
|
||||
# download language model
|
||||
bash local/download_lm_en.sh
|
||||
if [ $? -ne 0 ]; then
|
||||
exit 1
|
||||
fi
|
||||
|
||||
python3 -u ${BIN_DIR}/test.py \
|
||||
--nproc ${ngpu} \
|
||||
--config ${config_path} \
|
||||
--result_file ${ckpt_prefix}.rsl \
|
||||
--checkpoint_path ${ckpt_prefix} \
|
||||
--model_type ${model_type}
|
||||
|
||||
if [ $? -ne 0 ]; then
|
||||
echo "Failed in evaluation!"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
|
||||
exit 0
|
@ -0,0 +1,16 @@
|
||||
export MAIN_ROOT=`realpath ${PWD}/../../../`
|
||||
export LOCAL_DEEPSPEECH2=`realpath ${PWD}/../`
|
||||
|
||||
export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
|
||||
export LC_ALL=C
|
||||
|
||||
# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
|
||||
export PYTHONIOENCODING=UTF-8
|
||||
export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
|
||||
export PYTHONPATH=${LOCAL_DEEPSPEECH2}:${PYTHONPATH}
|
||||
|
||||
export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib/
|
||||
|
||||
MODEL=deepspeech2
|
||||
export BIN_DIR=${LOCAL_DEEPSPEECH2}/src_deepspeech2x/bin
|
||||
echo "BIN_DIR "${BIN_DIR}
|
@ -0,0 +1,28 @@
|
||||
#!/bin/bash
|
||||
set -e
|
||||
source path.sh
|
||||
|
||||
stage=0
|
||||
stop_stage=100
|
||||
conf_path=conf/deepspeech2.yaml
|
||||
avg_num=1
|
||||
model_type=offline
|
||||
gpus=0
|
||||
|
||||
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
|
||||
|
||||
v18_ckpt=baidu_en8k_v1.8
|
||||
ckpt=$(basename ${conf_path} | awk -F'.' '{print $1}')
|
||||
echo "checkpoint name ${ckpt}"
|
||||
|
||||
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
|
||||
# prepare data
|
||||
mkdir -p exp/${ckpt}/checkpoints
|
||||
bash ./local/data.sh exp/${ckpt}/checkpoints || exit -1
|
||||
fi
|
||||
|
||||
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
|
||||
# test ckpt avg_n
|
||||
CUDA_VISIBLE_DEVICES=${gpus} ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${v18_ckpt} ${model_type}|| exit -1
|
||||
fi
|
||||
|
@ -0,0 +1,5 @@
|
||||
exp
|
||||
data
|
||||
*log
|
||||
tmp
|
||||
nohup*
|
@ -0,0 +1 @@
|
||||
[]
|
@ -0,0 +1,67 @@
|
||||
# https://yaml.org/type/float.html
|
||||
data:
|
||||
train_manifest: data/manifest.train
|
||||
dev_manifest: data/manifest.dev
|
||||
test_manifest: data/manifest.test-clean
|
||||
min_input_len: 0.0
|
||||
max_input_len: 1000.0 # second
|
||||
min_output_len: 0.0
|
||||
max_output_len: .inf
|
||||
min_output_input_ratio: 0.00
|
||||
max_output_input_ratio: .inf
|
||||
|
||||
collator:
|
||||
batch_size: 64 # one gpu
|
||||
mean_std_filepath: data/mean_std.npz
|
||||
unit_type: char
|
||||
vocab_filepath: data/vocab.txt
|
||||
augmentation_config: conf/augmentation.json
|
||||
random_seed: 0
|
||||
spm_model_prefix:
|
||||
spectrum_type: linear
|
||||
feat_dim:
|
||||
delta_delta: False
|
||||
stride_ms: 10.0
|
||||
window_ms: 20.0
|
||||
n_fft: None
|
||||
max_freq: None
|
||||
target_sample_rate: 16000
|
||||
use_dB_normalization: True
|
||||
target_dB: -20
|
||||
dither: 1.0
|
||||
keep_transcription_text: False
|
||||
sortagrad: True
|
||||
shuffle_method: batch_shuffle
|
||||
num_workers: 2
|
||||
|
||||
model:
|
||||
num_conv_layers: 2
|
||||
num_rnn_layers: 3
|
||||
rnn_layer_size: 2048
|
||||
use_gru: False
|
||||
share_rnn_weights: True
|
||||
blank_id: 28
|
||||
|
||||
training:
|
||||
n_epoch: 80
|
||||
accum_grad: 1
|
||||
lr: 2e-3
|
||||
lr_decay: 0.83
|
||||
weight_decay: 1e-06
|
||||
global_grad_clip: 3.0
|
||||
log_interval: 100
|
||||
checkpoint:
|
||||
kbest_n: 50
|
||||
latest_n: 5
|
||||
|
||||
decoding:
|
||||
batch_size: 32
|
||||
error_rate_type: wer
|
||||
decoding_method: ctc_beam_search
|
||||
lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm
|
||||
alpha: 2.5
|
||||
beta: 0.3
|
||||
beam_size: 500
|
||||
cutoff_prob: 1.0
|
||||
cutoff_top_n: 40
|
||||
num_proc_bsearch: 8
|
@ -0,0 +1,84 @@
|
||||
#!/bin/bash
|
||||
|
||||
if [ $# != 1 ];then
|
||||
echo "usage: ${0} ckpt_dir"
|
||||
exit -1
|
||||
fi
|
||||
|
||||
ckpt_dir=$1
|
||||
|
||||
stage=-1
|
||||
stop_stage=100
|
||||
unit_type=char
|
||||
|
||||
source ${MAIN_ROOT}/utils/parse_options.sh
|
||||
|
||||
mkdir -p data
|
||||
TARGET_DIR=${MAIN_ROOT}/examples/dataset
|
||||
mkdir -p ${TARGET_DIR}
|
||||
|
||||
bash local/download_model.sh ${ckpt_dir}
|
||||
if [ $? -ne 0 ]; then
|
||||
exit 1
|
||||
fi
|
||||
|
||||
cd ${ckpt_dir}
|
||||
tar xzvf librispeech_v1.8_to_v2.x.tar.gz
|
||||
cd -
|
||||
mv ${ckpt_dir}/mean_std.npz data/
|
||||
mv ${ckpt_dir}/vocab.txt data/
|
||||
|
||||
if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
|
||||
# download data, generate manifests
|
||||
python3 ${TARGET_DIR}/librispeech/librispeech.py \
|
||||
--manifest_prefix="data/manifest" \
|
||||
--target_dir="${TARGET_DIR}/librispeech" \
|
||||
--full_download="True"
|
||||
|
||||
if [ $? -ne 0 ]; then
|
||||
echo "Prepare LibriSpeech failed. Terminated."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
for set in train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other; do
|
||||
mv data/manifest.${set} data/manifest.${set}.raw
|
||||
done
|
||||
|
||||
rm -rf data/manifest.train.raw data/manifest.dev.raw data/manifest.test.raw
|
||||
for set in train-clean-100 train-clean-360 train-other-500; do
|
||||
cat data/manifest.${set}.raw >> data/manifest.train.raw
|
||||
done
|
||||
|
||||
for set in dev-clean dev-other; do
|
||||
cat data/manifest.${set}.raw >> data/manifest.dev.raw
|
||||
done
|
||||
|
||||
for set in test-clean test-other; do
|
||||
cat data/manifest.${set}.raw >> data/manifest.test.raw
|
||||
done
|
||||
fi
|
||||
|
||||
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
|
||||
# format manifest with tokenids, vocab size
|
||||
for set in train dev test dev-clean dev-other test-clean test-other; do
|
||||
{
|
||||
python3 ${MAIN_ROOT}/utils/format_data.py \
|
||||
--feat_type "raw" \
|
||||
--cmvn_path "data/mean_std.npz" \
|
||||
--unit_type ${unit_type} \
|
||||
--vocab_path="data/vocab.txt" \
|
||||
--manifest_path="data/manifest.${set}.raw" \
|
||||
--output_path="data/manifest.${set}"
|
||||
|
||||
if [ $? -ne 0 ]; then
|
||||
echo "Formt mnaifest.${set} failed. Terminated."
|
||||
exit 1
|
||||
fi
|
||||
}&
|
||||
done
|
||||
wait
|
||||
fi
|
||||
|
||||
echo "LibriSpeech Data preparation done."
|
||||
exit 0
|
||||
|
@ -0,0 +1,20 @@
|
||||
#!/bin/bash
|
||||
|
||||
. ${MAIN_ROOT}/utils/utility.sh
|
||||
|
||||
DIR=data/lm
|
||||
mkdir -p ${DIR}
|
||||
|
||||
URL=https://deepspeech.bj.bcebos.com/en_lm/common_crawl_00.prune01111.trie.klm
|
||||
MD5="099a601759d467cd0a8523ff939819c5"
|
||||
TARGET=${DIR}/common_crawl_00.prune01111.trie.klm
|
||||
|
||||
echo "Download language model ..."
|
||||
download $URL $MD5 $TARGET
|
||||
if [ $? -ne 0 ]; then
|
||||
echo "Fail to download the language model!"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
|
||||
exit 0
|
@ -0,0 +1,25 @@
|
||||
#! /usr/bin/env bash
|
||||
|
||||
if [ $# != 1 ];then
|
||||
echo "usage: ${0} ckpt_dir"
|
||||
exit -1
|
||||
fi
|
||||
|
||||
ckpt_dir=$1
|
||||
|
||||
. ${MAIN_ROOT}/utils/utility.sh
|
||||
|
||||
URL='https://deepspeech.bj.bcebos.com/eng_models/librispeech_v1.8_to_v2.x.tar.gz'
|
||||
MD5=a06d9aadb560ea113984dc98d67232c8
|
||||
TARGET=${ckpt_dir}/librispeech_v1.8_to_v2.x.tar.gz
|
||||
|
||||
|
||||
echo "Download LibriSpeech model ..."
|
||||
download $URL $MD5 $TARGET
|
||||
if [ $? -ne 0 ]; then
|
||||
echo "Fail to download LibriSpeech model!"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
|
||||
exit 0
|
@ -0,0 +1,34 @@
#!/bin/bash

if [ $# != 3 ]; then
    echo "usage: ${0} config_path ckpt_path_prefix model_type"
    exit -1
fi

ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..."

config_path=$1
ckpt_prefix=$2
model_type=$3

# download language model
bash local/download_lm_en.sh
if [ $? -ne 0 ]; then
    exit 1
fi

python3 -u ${BIN_DIR}/test.py \
    --nproc ${ngpu} \
    --config ${config_path} \
    --result_file ${ckpt_prefix}.rsl \
    --checkpoint_path ${ckpt_prefix} \
    --model_type ${model_type}

if [ $? -ne 0 ]; then
    echo "Failed in evaluation!"
    exit 1
fi


exit 0
@ -0,0 +1,15 @@
export MAIN_ROOT=`realpath ${PWD}/../../../`
export LOCAL_DEEPSPEECH2=`realpath ${PWD}/../`

export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
export LC_ALL=C

# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
export PYTHONPATH=${LOCAL_DEEPSPEECH2}:${PYTHONPATH}

export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib/

MODEL=deepspeech2
export BIN_DIR=${LOCAL_DEEPSPEECH2}/src_deepspeech2x/bin
@ -0,0 +1,27 @@
#!/bin/bash
set -e
source path.sh

stage=0
stop_stage=100
conf_path=conf/deepspeech2.yaml
avg_num=1
model_type=offline
gpus=1

source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;

v18_ckpt=librispeech_v1.8
ckpt=$(basename ${conf_path} | awk -F'.' '{print $1}')
echo "checkpoint name ${ckpt}"

if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    # prepare data
    mkdir -p exp/${ckpt}/checkpoints
    bash ./local/data.sh exp/${ckpt}/checkpoints || exit -1
fi

if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    # test ckpt avg_n
    CUDA_VISIBLE_DEVICES=${gpus} ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${v18_ckpt} ${model_type} || exit -1
fi
@ -0,0 +1,370 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Any
from typing import List
from typing import Tuple
from typing import Union

import paddle
from paddle import nn
from paddle.fluid import core
from paddle.nn import functional as F

from deepspeech.utils.log import Log

#TODO(Hui Zhang): remove fluid import
logger = Log(__name__).getlog()

########### hack logging #############
logger.warn = logger.warning

########### hack paddle #############
paddle.half = 'float16'
paddle.float = 'float32'
paddle.double = 'float64'
paddle.short = 'int16'
paddle.int = 'int32'
paddle.long = 'int64'
paddle.uint16 = 'uint16'
paddle.cdouble = 'complex128'


def convert_dtype_to_string(tensor_dtype):
    """
    Convert a framework tensor dtype to the matching Paddle dtype alias.
    Args:
        tensor_dtype(core.VarDesc.VarType): the framework tensor data type.
    Returns:
        The Paddle dtype alias registered above (e.g. paddle.float32).
    """
    dtype = tensor_dtype
    if dtype == core.VarDesc.VarType.FP32:
        return paddle.float32
    elif dtype == core.VarDesc.VarType.FP64:
        return paddle.float64
    elif dtype == core.VarDesc.VarType.FP16:
        return paddle.float16
    elif dtype == core.VarDesc.VarType.INT32:
        return paddle.int32
    elif dtype == core.VarDesc.VarType.INT16:
        return paddle.int16
    elif dtype == core.VarDesc.VarType.INT64:
        return paddle.int64
    elif dtype == core.VarDesc.VarType.BOOL:
        return paddle.bool
    elif dtype == core.VarDesc.VarType.BF16:
        # since there is still no support for bfloat16 in NumPy,
        # uint16 is used for casting bfloat16
        return paddle.uint16
    elif dtype == core.VarDesc.VarType.UINT8:
        return paddle.uint8
    elif dtype == core.VarDesc.VarType.INT8:
        return paddle.int8
    elif dtype == core.VarDesc.VarType.COMPLEX64:
        return paddle.complex64
    elif dtype == core.VarDesc.VarType.COMPLEX128:
        return paddle.complex128
    else:
        raise ValueError("Not supported tensor dtype %s" % dtype)


if not hasattr(paddle, 'softmax'):
    logger.warn("register user softmax to paddle, remove this when fixed!")
    setattr(paddle, 'softmax', paddle.nn.functional.softmax)

if not hasattr(paddle, 'log_softmax'):
    logger.warn("register user log_softmax to paddle, remove this when fixed!")
    setattr(paddle, 'log_softmax', paddle.nn.functional.log_softmax)

if not hasattr(paddle, 'sigmoid'):
    logger.warn("register user sigmoid to paddle, remove this when fixed!")
    setattr(paddle, 'sigmoid', paddle.nn.functional.sigmoid)

if not hasattr(paddle, 'log_sigmoid'):
    logger.warn("register user log_sigmoid to paddle, remove this when fixed!")
    setattr(paddle, 'log_sigmoid', paddle.nn.functional.log_sigmoid)

if not hasattr(paddle, 'relu'):
    logger.warn("register user relu to paddle, remove this when fixed!")
    setattr(paddle, 'relu', paddle.nn.functional.relu)


def cat(xs, dim=0):
    return paddle.concat(xs, axis=dim)


if not hasattr(paddle, 'cat'):
    logger.warn(
        "override cat of paddle if exists or register, remove this when fixed!")
    paddle.cat = cat


########### hack paddle.Tensor #############
def item(x: paddle.Tensor):
    return x.numpy().item()


if not hasattr(paddle.Tensor, 'item'):
    logger.warn(
        "override item of paddle.Tensor if exists or register, remove this when fixed!"
    )
    paddle.Tensor.item = item


def func_long(x: paddle.Tensor):
    return paddle.cast(x, paddle.long)


if not hasattr(paddle.Tensor, 'long'):
    logger.warn(
        "override long of paddle.Tensor if exists or register, remove this when fixed!"
    )
    paddle.Tensor.long = func_long

if not hasattr(paddle.Tensor, 'numel'):
    logger.warn(
        "override numel of paddle.Tensor if exists or register, remove this when fixed!"
    )
    paddle.Tensor.numel = paddle.numel


def new_full(x: paddle.Tensor,
             size: Union[List[int], Tuple[int], paddle.Tensor],
             fill_value: Union[float, int, bool, paddle.Tensor],
             dtype=None):
    # fall back to the dtype of `x` only when none is given, instead of
    # silently ignoring the `dtype` argument
    return paddle.full(
        size, fill_value, dtype=dtype if dtype is not None else x.dtype)


if not hasattr(paddle.Tensor, 'new_full'):
    logger.warn(
        "override new_full of paddle.Tensor if exists or register, remove this when fixed!"
    )
    paddle.Tensor.new_full = new_full


def eq(xs: paddle.Tensor, ys: Union[paddle.Tensor, float]) -> paddle.Tensor:
    if convert_dtype_to_string(xs.dtype) == paddle.bool:
        xs = xs.astype(paddle.int)
    return xs.equal(
        paddle.to_tensor(
            ys, dtype=convert_dtype_to_string(xs.dtype), place=xs.place))


if not hasattr(paddle.Tensor, 'eq'):
    logger.warn(
        "override eq of paddle.Tensor if exists or register, remove this when fixed!"
    )
    paddle.Tensor.eq = eq

if not hasattr(paddle, 'eq'):
    logger.warn(
        "override eq of paddle if exists or register, remove this when fixed!")
    paddle.eq = eq


def contiguous(xs: paddle.Tensor) -> paddle.Tensor:
    return xs


if not hasattr(paddle.Tensor, 'contiguous'):
    logger.warn(
        "override contiguous of paddle.Tensor if exists or register, remove this when fixed!"
    )
    paddle.Tensor.contiguous = contiguous


def size(xs: paddle.Tensor, *args: int) -> paddle.Tensor:
    nargs = len(args)
    assert (nargs <= 1)
    s = paddle.shape(xs)
    if nargs == 1:
        return s[args[0]]
    else:
        return s


#`to_static` do not process `size` property, maybe some `paddle` api dependent on it.
logger.warn(
    "override size of paddle.Tensor "
    "(`to_static` do not process `size` property, maybe some `paddle` api dependent on it), remove this when fixed!"
)
paddle.Tensor.size = size


def view(xs: paddle.Tensor, *args: int) -> paddle.Tensor:
    return xs.reshape(args)


if not hasattr(paddle.Tensor, 'view'):
    logger.warn("register user view to paddle.Tensor, remove this when fixed!")
    paddle.Tensor.view = view


def view_as(xs: paddle.Tensor, ys: paddle.Tensor) -> paddle.Tensor:
    return xs.reshape(ys.size())


if not hasattr(paddle.Tensor, 'view_as'):
    logger.warn(
        "register user view_as to paddle.Tensor, remove this when fixed!")
    paddle.Tensor.view_as = view_as


def is_broadcastable(shp1, shp2):
    for a, b in zip(shp1[::-1], shp2[::-1]):
        if a == 1 or b == 1 or a == b:
            pass
        else:
            return False
    return True


def masked_fill(xs: paddle.Tensor,
                mask: paddle.Tensor,
                value: Union[float, int]):
    assert is_broadcastable(xs.shape, mask.shape) is True
    bshape = paddle.broadcast_shape(xs.shape, mask.shape)
    mask = mask.broadcast_to(bshape)
    trues = paddle.ones_like(xs) * value
    xs = paddle.where(mask, trues, xs)
    return xs


if not hasattr(paddle.Tensor, 'masked_fill'):
    logger.warn(
        "register user masked_fill to paddle.Tensor, remove this when fixed!")
    paddle.Tensor.masked_fill = masked_fill


def masked_fill_(xs: paddle.Tensor,
                 mask: paddle.Tensor,
                 value: Union[float, int]) -> paddle.Tensor:
    assert is_broadcastable(xs.shape, mask.shape) is True
    bshape = paddle.broadcast_shape(xs.shape, mask.shape)
    mask = mask.broadcast_to(bshape)
    trues = paddle.ones_like(xs) * value
    ret = paddle.where(mask, trues, xs)
    paddle.assign(ret.detach(), output=xs)
    return xs


if not hasattr(paddle.Tensor, 'masked_fill_'):
    logger.warn(
        "register user masked_fill_ to paddle.Tensor, remove this when fixed!")
    paddle.Tensor.masked_fill_ = masked_fill_


def fill_(xs: paddle.Tensor, value: Union[float, int]) -> paddle.Tensor:
    val = paddle.full_like(xs, value)
    paddle.assign(val.detach(), output=xs)
    return xs


if not hasattr(paddle.Tensor, 'fill_'):
    logger.warn("register user fill_ to paddle.Tensor, remove this when fixed!")
    paddle.Tensor.fill_ = fill_


def repeat(xs: paddle.Tensor, *size: Any) -> paddle.Tensor:
    return paddle.tile(xs, size)


if not hasattr(paddle.Tensor, 'repeat'):
    logger.warn(
        "register user repeat to paddle.Tensor, remove this when fixed!")
    paddle.Tensor.repeat = repeat

if not hasattr(paddle.Tensor, 'softmax'):
    logger.warn(
        "register user softmax to paddle.Tensor, remove this when fixed!")
    setattr(paddle.Tensor, 'softmax', paddle.nn.functional.softmax)

if not hasattr(paddle.Tensor, 'sigmoid'):
    logger.warn(
        "register user sigmoid to paddle.Tensor, remove this when fixed!")
    setattr(paddle.Tensor, 'sigmoid', paddle.nn.functional.sigmoid)

if not hasattr(paddle.Tensor, 'relu'):
    logger.warn("register user relu to paddle.Tensor, remove this when fixed!")
    setattr(paddle.Tensor, 'relu', paddle.nn.functional.relu)


def type_as(x: paddle.Tensor, other: paddle.Tensor) -> paddle.Tensor:
    return x.astype(other.dtype)


if not hasattr(paddle.Tensor, 'type_as'):
    logger.warn(
        "register user type_as to paddle.Tensor, remove this when fixed!")
    setattr(paddle.Tensor, 'type_as', type_as)


def to(x: paddle.Tensor, *args, **kwargs) -> paddle.Tensor:
    assert len(args) == 1
    if isinstance(args[0], str):  # dtype
        return x.astype(args[0])
    elif isinstance(args[0], paddle.Tensor):  # Tensor
        return x.astype(args[0].dtype)
    else:  # Device
        return x


if not hasattr(paddle.Tensor, 'to'):
    logger.warn("register user to to paddle.Tensor, remove this when fixed!")
    setattr(paddle.Tensor, 'to', to)


def func_float(x: paddle.Tensor) -> paddle.Tensor:
    return x.astype(paddle.float)


if not hasattr(paddle.Tensor, 'float'):
    logger.warn("register user float to paddle.Tensor, remove this when fixed!")
    setattr(paddle.Tensor, 'float', func_float)


def func_int(x: paddle.Tensor) -> paddle.Tensor:
    return x.astype(paddle.int)


if not hasattr(paddle.Tensor, 'int'):
    logger.warn("register user int to paddle.Tensor, remove this when fixed!")
    setattr(paddle.Tensor, 'int', func_int)


def tolist(x: paddle.Tensor) -> List[Any]:
    return x.numpy().tolist()


if not hasattr(paddle.Tensor, 'tolist'):
    logger.warn(
        "register user tolist to paddle.Tensor, remove this when fixed!")
    setattr(paddle.Tensor, 'tolist', tolist)


########### hack paddle.nn #############
class GLU(nn.Layer):
    """Gated Linear Units (GLU) Layer"""

    def __init__(self, dim: int=-1):
        super().__init__()
        self.dim = dim

    def forward(self, xs):
        return F.glu(xs, axis=self.dim)


if not hasattr(paddle.nn, 'GLU'):
    logger.warn("register user GLU to paddle.nn, remove this when fixed!")
    setattr(paddle.nn, 'GLU', GLU)

@ -0,0 +1,56 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Evaluation for DeepSpeech2 model."""
from src_deepspeech2x.test_model import DeepSpeech2Tester as Tester

from deepspeech.exps.deepspeech2.config import get_cfg_defaults
from deepspeech.training.cli import default_argument_parser
from deepspeech.utils.utility import print_arguments


def main_sp(config, args):
    exp = Tester(config, args)
    exp.setup()
    exp.run_test()


def main(config, args):
    main_sp(config, args)


if __name__ == "__main__":
    parser = default_argument_parser()
    parser.add_argument("--model_type")
    # save asr result to
    parser.add_argument(
        "--result_file", type=str, help="path to save the asr result")
    args = parser.parse_args()
    print_arguments(args, globals())
    if args.model_type is None:
        args.model_type = 'offline'
    print("model_type:{}".format(args.model_type))

    # https://yaml.org/type/float.html
    config = get_cfg_defaults(args.model_type)
    if args.config:
        config.merge_from_file(args.config)
    if args.opts:
        config.merge_from_list(args.opts)
    config.freeze()
    print(config)
    if args.dump_config:
        with open(args.dump_config, 'w') as f:
            print(config, file=f)

    main(config, args)

@ -0,0 +1,13 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
@ -0,0 +1,17 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .deepspeech2 import DeepSpeech2InferModel
from .deepspeech2 import DeepSpeech2Model

__all__ = ['DeepSpeech2Model', 'DeepSpeech2InferModel']

@ -0,0 +1,314 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Deepspeech2 ASR Model"""
from typing import Optional

import paddle
from paddle import nn
from src_deepspeech2x.models.ds2.rnn import RNNStack
from yacs.config import CfgNode

from deepspeech.models.ds2.conv import ConvStack
from deepspeech.modules.ctc import CTCDecoder
from deepspeech.utils import layer_tools
from deepspeech.utils.checkpoint import Checkpoint
from deepspeech.utils.log import Log
logger = Log(__name__).getlog()

__all__ = ['DeepSpeech2Model', 'DeepSpeech2InferModel']


class CRNNEncoder(nn.Layer):
    def __init__(self,
                 feat_size,
                 dict_size,
                 num_conv_layers=2,
                 num_rnn_layers=3,
                 rnn_size=1024,
                 use_gru=False,
                 share_rnn_weights=True):
        super().__init__()
        self.rnn_size = rnn_size
        self.feat_size = feat_size  # 161 for linear
        self.dict_size = dict_size

        self.conv = ConvStack(feat_size, num_conv_layers)

        i_size = self.conv.output_height  # H after conv stack
        self.rnn = RNNStack(
            i_size=i_size,
            h_size=rnn_size,
            num_stacks=num_rnn_layers,
            use_gru=use_gru,
            share_rnn_weights=share_rnn_weights)

    @property
    def output_size(self):
        return self.rnn_size * 2

    def forward(self, audio, audio_len):
        """Compute Encoder outputs

        Args:
            audio (Tensor): [B, Tmax, D]
            audio_len (Tensor): [B]
        Returns:
            x (Tensor): encoder outputs, [B, T, D]
            x_lens (Tensor): encoder length, [B]
        """
        # [B, T, D] -> [B, D, T]
        audio = audio.transpose([0, 2, 1])
        # [B, D, T] -> [B, C=1, D, T]
        x = audio.unsqueeze(1)
        x_lens = audio_len

        # convolution group
        x, x_lens = self.conv(x, x_lens)

        # convert data from convolution feature map to sequence of vectors
        #B, C, D, T = paddle.shape(x)  # not work under jit
        x = x.transpose([0, 3, 1, 2])  #[B, T, C, D]
        #x = x.reshape([B, T, C * D])  #[B, T, C*D]  # not work under jit
        x = x.reshape([0, 0, -1])  #[B, T, C*D]

        # remove padding part
        x, x_lens = self.rnn(x, x_lens)  #[B, T, D]
        return x, x_lens


class DeepSpeech2Model(nn.Layer):
    """The DeepSpeech2 network structure.

    :param audio_data: Audio spectrogram data layer.
    :type audio_data: Variable
    :param text_data: Transcription text data layer.
    :type text_data: Variable
    :param audio_len: Valid sequence length data layer.
    :type audio_len: Variable
    :param masks: Masks data layer to reset padding.
    :type masks: Variable
    :param dict_size: Dictionary size for tokenized transcription.
    :type dict_size: int
    :param num_conv_layers: Number of stacking convolution layers.
    :type num_conv_layers: int
    :param num_rnn_layers: Number of stacking RNN layers.
    :type num_rnn_layers: int
    :param rnn_size: RNN layer size (dimension of RNN cells).
    :type rnn_size: int
    :param use_gru: Use gru if set True. Use simple rnn if set False.
    :type use_gru: bool
    :param share_rnn_weights: Whether to share input-hidden weights between
                              forward and backward direction RNNs.
                              It is only available when use_gru=False.
    :type share_rnn_weights: bool
    :return: A tuple of an output unnormalized log probability layer (
             before softmax) and a ctc cost layer.
    :rtype: tuple of LayerOutput
    """

    @classmethod
    def params(cls, config: Optional[CfgNode]=None) -> CfgNode:
        default = CfgNode(
            dict(
                num_conv_layers=2,  #Number of stacking convolution layers.
                num_rnn_layers=3,  #Number of stacking RNN layers.
                rnn_layer_size=1024,  #RNN layer size (number of RNN cells).
                use_gru=True,  #Use gru if set True. Use simple rnn if set False.
                share_rnn_weights=True  #Whether to share input-hidden weights between forward and backward directional RNNs. Notice that for GRU, weight sharing is not supported.
            ))
        if config is not None:
            config.merge_from_other_cfg(default)
        return default

    def __init__(self,
                 feat_size,
                 dict_size,
                 num_conv_layers=2,
                 num_rnn_layers=3,
                 rnn_size=1024,
                 use_gru=False,
                 share_rnn_weights=True,
                 blank_id=0):
        super().__init__()
        self.encoder = CRNNEncoder(
            feat_size=feat_size,
            dict_size=dict_size,
            num_conv_layers=num_conv_layers,
            num_rnn_layers=num_rnn_layers,
            rnn_size=rnn_size,
            use_gru=use_gru,
            share_rnn_weights=share_rnn_weights)
        assert (self.encoder.output_size == rnn_size * 2)

        self.decoder = CTCDecoder(
            odim=dict_size,  # <blank> is in vocab
            enc_n_units=self.encoder.output_size,
            blank_id=blank_id,  # first token is <blank>
            dropout_rate=0.0,
            reduction=True,  # sum
            batch_average=True)  # sum / batch_size

    def forward(self, audio, audio_len, text, text_len):
        """Compute Model loss

        Args:
            audio (Tensor): [B, T, D]
            audio_len (Tensor): [B]
            text (Tensor): [B, U]
            text_len (Tensor): [B]

        Returns:
            loss (Tensor): [1]
        """
        eouts, eouts_len = self.encoder(audio, audio_len)
        loss = self.decoder(eouts, eouts_len, text, text_len)
        return loss

    @paddle.no_grad()
    def decode(self, audio, audio_len, vocab_list, decoding_method,
               lang_model_path, beam_alpha, beam_beta, beam_size, cutoff_prob,
               cutoff_top_n, num_processes):
        # init once
        # decoders only accept string encoded in utf-8
        self.decoder.init_decode(
            beam_alpha=beam_alpha,
            beam_beta=beam_beta,
            lang_model_path=lang_model_path,
            vocab_list=vocab_list,
            decoding_method=decoding_method)

        eouts, eouts_len = self.encoder(audio, audio_len)
        probs = self.decoder.softmax(eouts)
        logger.debug(f"probs.shape: {probs.shape}")
        return self.decoder.decode_probs(
            probs.numpy(), eouts_len, vocab_list, decoding_method,
            lang_model_path, beam_alpha, beam_beta, beam_size, cutoff_prob,
            cutoff_top_n, num_processes)

    def decode_probs_split(self, probs_split, vocab_list, decoding_method,
                           lang_model_path, beam_alpha, beam_beta, beam_size,
                           cutoff_prob, cutoff_top_n, num_processes):
        self.decoder.init_decode(
            beam_alpha=beam_alpha,
            beam_beta=beam_beta,
            lang_model_path=lang_model_path,
            vocab_list=vocab_list,
            decoding_method=decoding_method)
        return self.decoder.decode_probs_split(
            probs_split, vocab_list, decoding_method, lang_model_path,
            beam_alpha, beam_beta, beam_size, cutoff_prob, cutoff_top_n,
            num_processes)

    @classmethod
    def from_pretrained(cls, dataloader, config, checkpoint_path):
        """Build a DeepSpeech2Model model from a pretrained model.
        Parameters
        ----------
        dataloader: paddle.io.DataLoader

        config: yacs.config.CfgNode
            model configs

        checkpoint_path: Path or str
            the path of pretrained model checkpoint, without extension name

        Returns
        -------
        DeepSpeech2Model
            The model built from pretrained result.
        """
        model = cls(feat_size=dataloader.collate_fn.feature_size,
                    dict_size=len(dataloader.collate_fn.vocab_list),
                    num_conv_layers=config.model.num_conv_layers,
                    num_rnn_layers=config.model.num_rnn_layers,
                    rnn_size=config.model.rnn_layer_size,
                    use_gru=config.model.use_gru,
                    share_rnn_weights=config.model.share_rnn_weights)
        infos = Checkpoint().load_parameters(
            model, checkpoint_path=checkpoint_path)
        logger.info(f"checkpoint info: {infos}")
        layer_tools.summary(model)
        return model

    @classmethod
    def from_config(cls, config):
        """Build a DeepSpeech2Model from config
        Parameters
        ----------
        config: yacs.config.CfgNode
            config.model
        Returns
        -------
        DeepSpeech2Model
            The model built from config.
        """
        model = cls(feat_size=config.feat_size,
                    dict_size=config.dict_size,
                    num_conv_layers=config.num_conv_layers,
                    num_rnn_layers=config.num_rnn_layers,
                    rnn_size=config.rnn_layer_size,
                    use_gru=config.use_gru,
                    share_rnn_weights=config.share_rnn_weights,
                    blank_id=config.blank_id)
        return model


class DeepSpeech2InferModel(DeepSpeech2Model):
    def __init__(self,
                 feat_size,
                 dict_size,
                 num_conv_layers=2,
                 num_rnn_layers=3,
                 rnn_size=1024,
                 use_gru=False,
                 share_rnn_weights=True,
                 blank_id=0):
        super().__init__(
            feat_size=feat_size,
            dict_size=dict_size,
            num_conv_layers=num_conv_layers,
            num_rnn_layers=num_rnn_layers,
            rnn_size=rnn_size,
            use_gru=use_gru,
            share_rnn_weights=share_rnn_weights,
            blank_id=blank_id)

    def forward(self, audio, audio_len):
        """export model function

        Args:
            audio (Tensor): [B, T, D]
            audio_len (Tensor): [B]

        Returns:
            probs: probs after softmax
        """
        eouts, eouts_len = self.encoder(audio, audio_len)
        probs = self.decoder.softmax(eouts)
        return probs, eouts_len

    def export(self):
        static_model = paddle.jit.to_static(
            self,
            input_spec=[
                paddle.static.InputSpec(
                    shape=[None, None, self.encoder.feat_size],
                    dtype='float32'),  # audio, [B,T,D]
                paddle.static.InputSpec(shape=[None],
                                        dtype='int64'),  # audio_length, [B]
            ])
        return static_model

@ -0,0 +1,334 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math

import paddle
from paddle import nn
from paddle.nn import functional as F
from paddle.nn import initializer as I

from deepspeech.modules.activation import brelu
from deepspeech.modules.mask import make_non_pad_mask
from deepspeech.utils.log import Log
logger = Log(__name__).getlog()

__all__ = ['RNNStack']


class RNNCell(nn.RNNCellBase):
    r"""
    Elman RNN (SimpleRNN) cell. Given the inputs and previous states, it
    computes the outputs and updates states.
    The formula used is as follows:
    .. math::
        h_{t} & = act(x_{t} + b_{ih} + W_{hh}h_{t-1} + b_{hh})
        y_{t} & = h_{t}

    where :math:`act` is for :attr:`activation`.
    """

    def __init__(self,
                 hidden_size: int,
                 activation="tanh",
                 weight_ih_attr=None,
                 weight_hh_attr=None,
                 bias_ih_attr=None,
                 bias_hh_attr=None,
                 name=None):
        super().__init__()
        std = 1.0 / math.sqrt(hidden_size)
        self.weight_hh = self.create_parameter(
            (hidden_size, hidden_size),
            weight_hh_attr,
            default_initializer=I.Uniform(-std, std))
        self.bias_ih = None
        self.bias_hh = self.create_parameter(
            (hidden_size, ),
            bias_hh_attr,
            is_bias=True,
            default_initializer=I.Uniform(-std, std))

        self.hidden_size = hidden_size
        if activation not in ["tanh", "relu", "brelu"]:
            raise ValueError(
                "activation for SimpleRNNCell should be tanh, relu or brelu, "
                "but got {}".format(activation))
        self.activation = activation
        self._activation_fn = paddle.tanh \
            if activation == "tanh" \
            else F.relu
        if activation == 'brelu':
            self._activation_fn = brelu

    def forward(self, inputs, states=None):
        if states is None:
            states = self.get_initial_states(inputs, self.state_shape)
        pre_h = states
        i2h = inputs
        if self.bias_ih is not None:
            i2h += self.bias_ih
        h2h = paddle.matmul(pre_h, self.weight_hh, transpose_y=True)
        if self.bias_hh is not None:
            h2h += self.bias_hh
        h = self._activation_fn(i2h + h2h)
        return h, h

    @property
    def state_shape(self):
        return (self.hidden_size, )


class GRUCell(nn.RNNCellBase):
    r"""
    Gated Recurrent Unit (GRU) RNN cell. Given the inputs and previous states,
    it computes the outputs and updates states.
    The formula for GRU used is as follows:
    .. math::
        r_{t} & = \sigma(W_{ir}x_{t} + b_{ir} + W_{hr}h_{t-1} + b_{hr})
        z_{t} & = \sigma(W_{iz}x_{t} + b_{iz} + W_{hz}h_{t-1} + b_{hz})
        \widetilde{h}_{t} & = \tanh(W_{ic}x_{t} + b_{ic} + r_{t} * (W_{hc}h_{t-1} + b_{hc}))
        h_{t} & = z_{t} * h_{t-1} + (1 - z_{t}) * \widetilde{h}_{t}
        y_{t} & = h_{t}

    where :math:`\sigma` is the sigmoid function, and * is the elementwise
    multiplication operator. Note that this implementation uses ReLU rather
    than tanh as the candidate activation (see ``self._activation`` below).
    """

    def __init__(self,
                 input_size: int,
                 hidden_size: int,
                 weight_ih_attr=None,
                 weight_hh_attr=None,
                 bias_ih_attr=None,
                 bias_hh_attr=None,
                 name=None):
        super().__init__()
        std = 1.0 / math.sqrt(hidden_size)
        self.weight_hh = self.create_parameter(
            (3 * hidden_size, hidden_size),
            weight_hh_attr,
            default_initializer=I.Uniform(-std, std))
        self.bias_ih = None
        self.bias_hh = self.create_parameter(
            (3 * hidden_size, ),
            bias_hh_attr,
            is_bias=True,
            default_initializer=I.Uniform(-std, std))

        self.hidden_size = hidden_size
        self.input_size = input_size
        self._gate_activation = F.sigmoid
        self._activation = paddle.relu

    def forward(self, inputs, states=None):
        if states is None:
            states = self.get_initial_states(inputs, self.state_shape)

        pre_hidden = states  # shape [batch_size, hidden_size]

        x_gates = inputs
        if self.bias_ih is not None:
            x_gates = x_gates + self.bias_ih
        bias_u, bias_r, bias_c = paddle.split(
            self.bias_hh, num_or_sections=3, axis=0)

        weight_hh = paddle.transpose(
            self.weight_hh,
            perm=[1, 0])  #weight_hh: shape [hidden_size, 3 * hidden_size]
        w_u_r_c = paddle.flatten(weight_hh)
        size_u_r = self.hidden_size * 2 * self.hidden_size
        w_u_r = paddle.reshape(w_u_r_c[:size_u_r],
                               (self.hidden_size, self.hidden_size * 2))
        w_u, w_r = paddle.split(w_u_r, num_or_sections=2, axis=1)
        w_c = paddle.reshape(w_u_r_c[size_u_r:],
                             (self.hidden_size, self.hidden_size))

        h_u = paddle.matmul(
            pre_hidden, w_u,
            transpose_y=False) + bias_u  #shape [batch_size, hidden_size]
        h_r = paddle.matmul(
            pre_hidden, w_r,
            transpose_y=False) + bias_r  #shape [batch_size, hidden_size]

        x_u, x_r, x_c = paddle.split(
            x_gates, num_or_sections=3, axis=1)  #shape [batch_size, hidden_size]

        u = self._gate_activation(x_u + h_u)  #shape [batch_size, hidden_size]
        r = self._gate_activation(x_r + h_r)  #shape [batch_size, hidden_size]
        c = self._activation(
            x_c + paddle.matmul(r * pre_hidden, w_c, transpose_y=False) +
            bias_c)  # [batch_size, hidden_size]

        h = (1 - u) * pre_hidden + u * c
        # https://www.paddlepaddle.org.cn/documentation/docs/zh/api/paddle/fluid/layers/dynamic_gru_cn.html#dynamic-gru
        return h, h

    @property
    def state_shape(self):
        r"""
        The `state_shape` of GRUCell is a shape `[hidden_size]` (-1 for batch
        size would be automatically inserted into shape). The shape corresponds
        to the shape of :math:`h_{t-1}`.
        """
        return (self.hidden_size, )


class BiRNNWithBN(nn.Layer):
    """Bidirectional simple rnn layer with sequence-wise batch normalization.
    The batch normalization is only performed on input-state weights.

    :param size: Dimension of RNN cells.
    :type size: int
    :param share_weights: Whether to share input-hidden weights between
                          forward and backward directional RNNs.
    :type share_weights: bool
    :return: Bidirectional simple rnn layer.
    :rtype: Variable
    """

    def __init__(self, i_size: int, h_size: int, share_weights: bool):
        super().__init__()
        self.share_weights = share_weights
        if self.share_weights:
            #input-hidden weights shared between bi-directional rnn.
            self.fw_fc = nn.Linear(i_size, h_size, bias_attr=False)
            # batch norm is only performed on input-state projection
            self.fw_bn = nn.BatchNorm1D(
                h_size, bias_attr=None, data_format='NLC')
            self.bw_fc = self.fw_fc
            self.bw_bn = self.fw_bn
        else:
            self.fw_fc = nn.Linear(i_size, h_size, bias_attr=False)
            self.fw_bn = nn.BatchNorm1D(
                h_size, bias_attr=None, data_format='NLC')
            self.bw_fc = nn.Linear(i_size, h_size, bias_attr=False)
            self.bw_bn = nn.BatchNorm1D(
                h_size, bias_attr=None, data_format='NLC')

        self.fw_cell = RNNCell(hidden_size=h_size, activation='brelu')
        self.bw_cell = RNNCell(hidden_size=h_size, activation='brelu')
        self.fw_rnn = nn.RNN(
            self.fw_cell, is_reverse=False, time_major=False)  #[B, T, D]
        self.bw_rnn = nn.RNN(
            self.bw_cell, is_reverse=True, time_major=False)  #[B, T, D]

    def forward(self, x: paddle.Tensor, x_len: paddle.Tensor):
        # x, shape [B, T, D]
        fw_x = self.fw_bn(self.fw_fc(x))
        bw_x = self.bw_bn(self.bw_fc(x))
        fw_x, _ = self.fw_rnn(inputs=fw_x, sequence_length=x_len)
        bw_x, _ = self.bw_rnn(inputs=bw_x, sequence_length=x_len)
        x = paddle.concat([fw_x, bw_x], axis=-1)
        return x, x_len


class BiGRUWithBN(nn.Layer):
    """Bidirectional gru layer with sequence-wise batch normalization.
    The batch normalization is only performed on input-state weights.

    :param name: Name of the layer.
    :type name: string
    :param input: Input layer.
    :type input: Variable
    :param size: Dimension of GRU cells.
    :type size: int
    :param act: Activation type.
    :type act: string
    :return: Bidirectional GRU layer.
    :rtype: Variable
    """

    def __init__(self, i_size: int, h_size: int):
        super().__init__()
        hidden_size = h_size * 3

        self.fw_fc = nn.Linear(i_size, hidden_size, bias_attr=False)
        self.fw_bn = nn.BatchNorm1D(
            hidden_size, bias_attr=None, data_format='NLC')
        self.bw_fc = nn.Linear(i_size, hidden_size, bias_attr=False)
        self.bw_bn = nn.BatchNorm1D(
            hidden_size, bias_attr=None, data_format='NLC')

        self.fw_cell = GRUCell(input_size=hidden_size, hidden_size=h_size)
        self.bw_cell = GRUCell(input_size=hidden_size, hidden_size=h_size)
        self.fw_rnn = nn.RNN(
            self.fw_cell, is_reverse=False, time_major=False)  #[B, T, D]
        self.bw_rnn = nn.RNN(
            self.bw_cell, is_reverse=True, time_major=False)  #[B, T, D]

    def forward(self, x, x_len):
        # x, shape [B, T, D]
        fw_x = self.fw_bn(self.fw_fc(x))

        bw_x = self.bw_bn(self.bw_fc(x))
        fw_x, _ = self.fw_rnn(inputs=fw_x, sequence_length=x_len)
        bw_x, _ = self.bw_rnn(inputs=bw_x, sequence_length=x_len)
        x = paddle.concat([fw_x, bw_x], axis=-1)
        return x, x_len


class RNNStack(nn.Layer):
    """RNN group with stacked bidirectional simple RNN or GRU layers.

    :param input: Input layer.
    :type input: Variable
    :param size: Dimension of RNN cells in each layer.
    :type size: int
    :param num_stacks: Number of stacked rnn layers.
    :type num_stacks: int
    :param use_gru: Use gru if set True. Use simple rnn if set False.
    :type use_gru: bool
    :param share_rnn_weights: Whether to share input-hidden weights between
                              forward and backward directional RNNs.
                              It is only available when use_gru=False.
    :type share_rnn_weights: bool
    :return: Output layer of the RNN group.
    :rtype: Variable
    """

    def __init__(self,
                 i_size: int,
                 h_size: int,
                 num_stacks: int,
                 use_gru: bool,
                 share_rnn_weights: bool):
        super().__init__()
        rnn_stacks = []
        for i in range(num_stacks):
            if use_gru:
                #default: GRU using tanh
                rnn_stacks.append(BiGRUWithBN(i_size=i_size, h_size=h_size))
            else:
                rnn_stacks.append(
                    BiRNNWithBN(
                        i_size=i_size,
                        h_size=h_size,
                        share_weights=share_rnn_weights))
            i_size = h_size * 2

        self.rnn_stacks = nn.LayerList(rnn_stacks)

    def forward(self, x: paddle.Tensor, x_len: paddle.Tensor):
        """
        x: shape [B, T, D]
        x_len: shape [B]
        """
        for i, rnn in enumerate(self.rnn_stacks):
            x, x_len = rnn(x, x_len)
            masks = make_non_pad_mask(x_len)  #[B, T]
            masks = masks.unsqueeze(-1)  # [B, T, 1]
            # TODO(Hui Zhang): not support bool multiply
            masks = masks.astype(x.dtype)
            x = x.multiply(masks)
        return x, x_len

@ -0,0 +1,429 @@
|
||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Contains DeepSpeech2 and DeepSpeech2Online model."""
|
||||
import time
|
||||
from collections import defaultdict
|
||||
from contextlib import nullcontext
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
import numpy as np
|
||||
import paddle
|
||||
from paddle import distributed as dist
|
||||
from paddle.io import DataLoader
|
||||
from src_deepspeech2x.models.ds2 import DeepSpeech2InferModel
|
||||
from src_deepspeech2x.models.ds2 import DeepSpeech2Model
|
||||
from yacs.config import CfgNode
|
||||
|
||||
from deepspeech.frontend.featurizer.text_featurizer import TextFeaturizer
|
||||
from deepspeech.io.collator import SpeechCollator
|
||||
from deepspeech.io.dataset import ManifestDataset
|
||||
from deepspeech.io.sampler import SortagradBatchSampler
|
||||
from deepspeech.io.sampler import SortagradDistributedBatchSampler
|
||||
from deepspeech.models.ds2_online import DeepSpeech2InferModelOnline
|
||||
from deepspeech.models.ds2_online import DeepSpeech2ModelOnline
|
||||
from deepspeech.training.gradclip import ClipGradByGlobalNormWithLog
|
||||
from deepspeech.training.trainer import Trainer
|
||||
from deepspeech.utils import error_rate
|
||||
from deepspeech.utils import layer_tools
|
||||
from deepspeech.utils import mp_tools
|
||||
from deepspeech.utils.log import Log
|
||||
|
||||
logger = Log(__name__).getlog()
|
||||
|
||||
|
||||
class DeepSpeech2Trainer(Trainer):
|
||||
@classmethod
|
||||
def params(cls, config: Optional[CfgNode]=None) -> CfgNode:
|
||||
# training config
|
||||
default = CfgNode(
|
||||
dict(
|
||||
lr=5e-4, # learning rate
|
||||
lr_decay=1.0, # learning rate decay
|
||||
weight_decay=1e-6, # the coeff of weight decay
|
||||
global_grad_clip=5.0, # the global norm clip
|
||||
n_epoch=50, # train epochs
|
||||
))
|
||||
|
||||
if config is not None:
|
||||
config.merge_from_other_cfg(default)
|
||||
return default
|
||||
|
||||
def __init__(self, config, args):
|
||||
super().__init__(config, args)
|
||||
|
||||
def train_batch(self, batch_index, batch_data, msg):
|
||||
train_conf = self.config.training
|
||||
start = time.time()
|
||||
|
||||
# forward
|
||||
utt, audio, audio_len, text, text_len = batch_data
|
||||
loss = self.model(audio, audio_len, text, text_len)
|
||||
losses_np = {
|
||||
'train_loss': float(loss),
|
||||
}
|
||||
|
||||
# loss backward
|
||||
if (batch_index + 1) % train_conf.accum_grad != 0:
|
||||
# Disable gradient synchronizations across DDP processes.
|
||||
# Within this context, gradients will be accumulated on module
|
||||
# variables, which will later be synchronized.
|
||||
context = self.model.no_sync
|
||||
else:
|
||||
# Used for single gpu training and DDP gradient synchronization
|
||||
# processes.
|
||||
context = nullcontext
|
||||
|
||||
with context():
|
||||
loss.backward()
|
||||
layer_tools.print_grads(self.model, print_func=None)
|
||||
|
||||
# optimizer step
|
||||
if (batch_index + 1) % train_conf.accum_grad == 0:
|
||||
self.optimizer.step()
|
||||
self.optimizer.clear_grad()
|
||||
self.iteration += 1
|
||||
|
||||
iteration_time = time.time() - start
|
||||
|
||||
msg += "train time: {:>.3f}s, ".format(iteration_time)
|
||||
msg += "batch size: {}, ".format(self.config.collator.batch_size)
|
||||
msg += "accum: {}, ".format(train_conf.accum_grad)
|
||||
msg += ', '.join('{}: {:>.6f}'.format(k, v)
|
||||
for k, v in losses_np.items())
|
||||
logger.info(msg)
|
||||
|
||||
if dist.get_rank() == 0 and self.visualizer:
|
||||
for k, v in losses_np.items():
|
||||
# `step -1` since we update `step` after optimizer.step().
|
||||
self.visualizer.add_scalar("train/{}".format(k), v,
|
||||
self.iteration - 1)
|
||||
|
||||
@paddle.no_grad()
|
||||
def valid(self):
|
||||
logger.info(f"Valid Total Examples: {len(self.valid_loader.dataset)}")
|
||||
self.model.eval()
|
||||
valid_losses = defaultdict(list)
|
||||
num_seen_utts = 1
|
||||
total_loss = 0.0
|
||||
for i, batch in enumerate(self.valid_loader):
|
||||
utt, audio, audio_len, text, text_len = batch
|
||||
loss = self.model(audio, audio_len, text, text_len)
|
||||
if paddle.isfinite(loss):
|
||||
num_utts = batch[1].shape[0]
|
||||
num_seen_utts += num_utts
|
||||
total_loss += float(loss) * num_utts
|
||||
valid_losses['val_loss'].append(float(loss))
|
||||
|
||||
if (i + 1) % self.config.training.log_interval == 0:
|
||||
valid_dump = {k: np.mean(v) for k, v in valid_losses.items()}
|
||||
valid_dump['val_history_loss'] = total_loss / num_seen_utts
|
||||
|
||||
# logging
|
||||
msg = f"Valid: Rank: {dist.get_rank()}, "
|
||||
msg += "epoch: {}, ".format(self.epoch)
|
||||
msg += "step: {}, ".format(self.iteration)
|
||||
msg += "batch : {}/{}, ".format(i + 1, len(self.valid_loader))
|
||||
msg += ', '.join('{}: {:>.6f}'.format(k, v)
|
||||
for k, v in valid_dump.items())
|
||||
logger.info(msg)
|
||||
|
||||
logger.info('Rank {} Val info val_loss {}'.format(
|
||||
dist.get_rank(), total_loss / num_seen_utts))
|
||||
return total_loss, num_seen_utts
|
||||
|
||||
def setup_model(self):
|
||||
config = self.config.clone()
|
||||
config.defrost()
|
||||
config.model.feat_size = self.train_loader.collate_fn.feature_size
|
||||
#config.model.dict_size = self.train_loader.collate_fn.vocab_size
|
||||
config.model.dict_size = len(self.train_loader.collate_fn.vocab_list)
|
||||
config.freeze()
|
||||
|
||||
if self.args.model_type == 'offline':
|
||||
model = DeepSpeech2Model.from_config(config.model)
|
||||
elif self.args.model_type == 'online':
|
||||
model = DeepSpeech2ModelOnline.from_config(config.model)
|
||||
else:
|
||||
raise Exception("wrong model type")
|
||||
if self.parallel:
|
||||
model = paddle.DataParallel(model)
|
||||
|
||||
logger.info(f"{model}")
|
||||
layer_tools.print_params(model, logger.info)
|
||||
|
||||
grad_clip = ClipGradByGlobalNormWithLog(
|
||||
config.training.global_grad_clip)
|
||||
lr_scheduler = paddle.optimizer.lr.ExponentialDecay(
|
||||
learning_rate=config.training.lr,
|
||||
gamma=config.training.lr_decay,
|
||||
verbose=True)
|
||||
optimizer = paddle.optimizer.Adam(
|
||||
learning_rate=lr_scheduler,
|
||||
parameters=model.parameters(),
|
||||
weight_decay=paddle.regularizer.L2Decay(
|
||||
config.training.weight_decay),
|
||||
grad_clip=grad_clip)
|
||||
|
||||
self.model = model
|
||||
self.optimizer = optimizer
|
||||
self.lr_scheduler = lr_scheduler
|
||||
logger.info("Setup model/optimizer/lr_scheduler!")
|
||||
|
||||
def setup_dataloader(self):
|
||||
config = self.config.clone()
|
||||
config.defrost()
|
||||
config.collator.keep_transcription_text = False
|
||||
|
||||
config.data.manifest = config.data.train_manifest
|
||||
train_dataset = ManifestDataset.from_config(config)
|
||||
|
||||
config.data.manifest = config.data.dev_manifest
|
||||
dev_dataset = ManifestDataset.from_config(config)
|
||||
|
||||
config.data.manifest = config.data.test_manifest
|
||||
test_dataset = ManifestDataset.from_config(config)
|
||||
|
||||
if self.parallel:
|
||||
batch_sampler = SortagradDistributedBatchSampler(
|
||||
train_dataset,
|
||||
batch_size=config.collator.batch_size,
|
||||
num_replicas=None,
|
||||
rank=None,
|
||||
shuffle=True,
|
||||
drop_last=True,
|
||||
sortagrad=config.collator.sortagrad,
|
||||
shuffle_method=config.collator.shuffle_method)
|
||||
else:
|
||||
batch_sampler = SortagradBatchSampler(
|
||||
train_dataset,
|
||||
shuffle=True,
|
||||
batch_size=config.collator.batch_size,
|
||||
drop_last=True,
|
||||
sortagrad=config.collator.sortagrad,
|
||||
shuffle_method=config.collator.shuffle_method)
|
||||
|
||||
collate_fn_train = SpeechCollator.from_config(config)
|
||||
|
||||
config.collator.augmentation_config = ""
|
||||
collate_fn_dev = SpeechCollator.from_config(config)
|
||||
|
||||
config.collator.keep_transcription_text = True
|
||||
config.collator.augmentation_config = ""
|
||||
collate_fn_test = SpeechCollator.from_config(config)
|
||||
|
||||
self.train_loader = DataLoader(
|
||||
train_dataset,
|
||||
batch_sampler=batch_sampler,
|
||||
collate_fn=collate_fn_train,
|
||||
num_workers=config.collator.num_workers)
|
||||
self.valid_loader = DataLoader(
|
||||
dev_dataset,
|
||||
batch_size=config.collator.batch_size,
|
||||
shuffle=False,
|
||||
drop_last=False,
|
||||
collate_fn=collate_fn_dev)
|
||||
self.test_loader = DataLoader(
|
||||
test_dataset,
|
||||
batch_size=config.decoding.batch_size,
|
||||
shuffle=False,
|
||||
drop_last=False,
|
||||
collate_fn=collate_fn_test)
|
||||
if "<eos>" in self.test_loader.collate_fn.vocab_list:
|
||||
self.test_loader.collate_fn.vocab_list.remove("<eos>")
|
||||
if "<eos>" in self.valid_loader.collate_fn.vocab_list:
|
||||
self.valid_loader.collate_fn.vocab_list.remove("<eos>")
|
||||
if "<eos>" in self.train_loader.collate_fn.vocab_list:
|
||||
self.train_loader.collate_fn.vocab_list.remove("<eos>")
|
||||
logger.info("Setup train/valid/test Dataloader!")
|
||||
|
||||
|
||||
class DeepSpeech2Tester(DeepSpeech2Trainer):
|
||||
@classmethod
|
||||
def params(cls, config: Optional[CfgNode]=None) -> CfgNode:
|
||||
# testing config
|
||||
default = CfgNode(
|
||||
dict(
|
||||
alpha=2.5, # Coef of LM for beam search.
|
||||
beta=0.3, # Coef of WC for beam search.
|
||||
cutoff_prob=1.0, # Cutoff probability for pruning.
|
||||
cutoff_top_n=40, # Cutoff number for pruning.
|
||||
lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm', # Filepath for language model.
|
||||
decoding_method='ctc_beam_search', # Decoding method. Options: ctc_beam_search, ctc_greedy
|
||||
error_rate_type='wer', # Error rate type for evaluation. Options `wer`, 'cer'
|
||||
num_proc_bsearch=8, # # of CPUs for beam search.
|
||||
beam_size=500, # Beam search width.
|
||||
batch_size=128, # decoding batch size
|
||||
))
|
||||
|
||||
if config is not None:
|
||||
config.merge_from_other_cfg(default)
|
||||
return default
|
||||
|
||||
def __init__(self, config, args):
|
||||
|
||||
self._text_featurizer = TextFeaturizer(
|
||||
unit_type=config.collator.unit_type, vocab_filepath=None)
|
||||
super().__init__(config, args)
|
||||
|
||||
def ordid2token(self, texts, texts_len):
|
||||
""" ord() id to chr() chr """
|
||||
trans = []
|
||||
for text, n in zip(texts, texts_len):
|
||||
n = n.numpy().item()
|
||||
ids = text[:n]
|
||||
trans.append(''.join([chr(i) for i in ids]))
|
||||
return trans
|
||||
|
||||
def compute_metrics(self,
|
||||
utts,
|
||||
audio,
|
||||
audio_len,
|
||||
texts,
|
||||
texts_len,
|
||||
fout=None):
|
||||
cfg = self.config.decoding
|
||||
errors_sum, len_refs, num_ins = 0.0, 0, 0
|
||||
errors_func = error_rate.char_errors if cfg.error_rate_type == 'cer' else error_rate.word_errors
|
||||
error_rate_func = error_rate.cer if cfg.error_rate_type == 'cer' else error_rate.wer
|
||||
|
||||
vocab_list = self.test_loader.collate_fn.vocab_list
|
||||
|
||||
target_transcripts = self.ordid2token(texts, texts_len)
|
||||
|
||||
result_transcripts = self.compute_result_transcripts(audio, audio_len,
|
||||
vocab_list, cfg)
|
||||
for utt, target, result in zip(utts, target_transcripts,
|
||||
result_transcripts):
|
||||
            errors, len_ref = errors_func(target, result)
            errors_sum += errors
            len_refs += len_ref
            num_ins += 1
            if fout:
                fout.write(utt + " " + result + "\n")
            logger.info("\nTarget Transcription: %s\nOutput Transcription: %s" %
                        (target, result))
            logger.info("Current error rate [%s] = %f" %
                        (cfg.error_rate_type, error_rate_func(target, result)))

        return dict(
            errors_sum=errors_sum,
            len_refs=len_refs,
            num_ins=num_ins,
            error_rate=errors_sum / len_refs,
            error_rate_type=cfg.error_rate_type)

    def compute_result_transcripts(self, audio, audio_len, vocab_list, cfg):
        result_transcripts = self.model.decode(
            audio,
            audio_len,
            vocab_list,
            decoding_method=cfg.decoding_method,
            lang_model_path=cfg.lang_model_path,
            beam_alpha=cfg.alpha,
            beam_beta=cfg.beta,
            beam_size=cfg.beam_size,
            cutoff_prob=cfg.cutoff_prob,
            cutoff_top_n=cfg.cutoff_top_n,
            num_processes=cfg.num_proc_bsearch)
        result_transcripts = [
            self._text_featurizer.detokenize(item)
            for item in result_transcripts
        ]
        return result_transcripts

    @mp_tools.rank_zero_only
    @paddle.no_grad()
    def test(self):
        logger.info(f"Test Total Examples: {len(self.test_loader.dataset)}")
        self.model.eval()
        cfg = self.config
        error_rate_type = None
        errors_sum, len_refs, num_ins = 0.0, 0, 0
        with open(self.args.result_file, 'w') as fout:
            for i, batch in enumerate(self.test_loader):
                utts, audio, audio_len, texts, texts_len = batch
                metrics = self.compute_metrics(utts, audio, audio_len, texts,
                                               texts_len, fout)
                errors_sum += metrics['errors_sum']
                len_refs += metrics['len_refs']
                num_ins += metrics['num_ins']
                error_rate_type = metrics['error_rate_type']
                logger.info("Error rate [%s] (%d/?) = %f" %
                            (error_rate_type, num_ins, errors_sum / len_refs))

        # logging
        msg = "Test: "
        msg += "epoch: {}, ".format(self.epoch)
        msg += "step: {}, ".format(self.iteration)
        msg += "Final error rate [%s] (%d/%d) = %f" % (
            error_rate_type, num_ins, num_ins, errors_sum / len_refs)
        logger.info(msg)

    def run_test(self):
        self.resume_or_scratch()
        try:
            self.test()
        except KeyboardInterrupt:
            exit(-1)

    def export(self):
        if self.args.model_type == 'offline':
            infer_model = DeepSpeech2InferModel.from_pretrained(
                self.test_loader, self.config, self.args.checkpoint_path)
        elif self.args.model_type == 'online':
            infer_model = DeepSpeech2InferModelOnline.from_pretrained(
                self.test_loader, self.config, self.args.checkpoint_path)
        else:
            raise Exception("wrong model type")

        infer_model.eval()
        feat_dim = self.test_loader.collate_fn.feature_size
        static_model = infer_model.export()
        logger.info(f"Export code: {static_model.forward.code}")
        paddle.jit.save(static_model, self.args.export_path)

    def run_export(self):
        try:
            self.export()
        except KeyboardInterrupt:
            exit(-1)

    def setup(self):
        """Setup the experiment."""
        paddle.set_device('gpu' if self.args.nprocs > 0 else 'cpu')

        self.setup_output_dir()
        self.setup_checkpointer()

        self.setup_dataloader()
        self.setup_model()

        self.iteration = 0
        self.epoch = 0

    def setup_output_dir(self):
        """Create a directory used for output."""
        # output dir
        if self.args.output:
            output_dir = Path(self.args.output).expanduser()
            output_dir.mkdir(parents=True, exist_ok=True)
        else:
            output_dir = Path(
                self.args.checkpoint_path).expanduser().parent.parent
            output_dir.mkdir(parents=True, exist_ok=True)

        self.output_dir = output_dir
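For reference, the static graph saved by `export()` above can be reloaded for inference without the Python model class; a minimal sketch, assuming an `export_path` produced by `paddle.jit.save` and an input layout of `(batch, time, feat_dim)` — the path, shapes, and forward signature below are illustrative assumptions, not values from this repository:

```
import paddle

# Hypothetical export path; must match the --export_path used above.
export_path = "exp/deepspeech2/checkpoints/avg_1.jit"
static_model = paddle.jit.load(export_path)
static_model.eval()

# Dummy batch: (batch, time, feat_dim); feat_dim is assumed to be 161 here,
# matching the collate_fn.feature_size used at export time.
audio = paddle.randn([1, 100, 161], dtype='float32')
audio_len = paddle.to_tensor([100], dtype='int64')
probs = static_model(audio, audio_len)  # forward signature assumed
print(probs.shape)
```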
@ -1,10 +0,0 @@
[
    {
        "type": "shift",
        "params": {
            "min_shift_ms": -5,
            "max_shift_ms": 5
        },
        "prob": 1.0
    }
]
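The config above enables a random time-shift augmentation (applied with probability 1.0). A minimal sketch of what such a shift does to a waveform, assuming 16 kHz audio — this helper is illustrative, not the repository's actual augmentor class:

```
import random

import numpy as np


def random_shift(samples, sample_rate=16000, min_shift_ms=-5, max_shift_ms=5):
    """Shift the waveform left or right by a random offset, zero-padding the gap."""
    shift_ms = random.uniform(min_shift_ms, max_shift_ms)
    shift = int(sample_rate * shift_ms / 1000.0)
    out = np.zeros_like(samples)
    if shift > 0:      # shift right, pad the head with zeros
        out[shift:] = samples[:-shift]
    elif shift < 0:    # shift left, pad the tail with zeros
        out[:shift] = samples[-shift:]
    else:
        out[:] = samples
    return out
```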
@ -0,0 +1,58 @@
# [CC-CEDICT](https://cc-cedict.org/wiki/)

What is CC-CEDICT?
CC-CEDICT is a continuation of the CEDICT project. The objective of the CEDICT project was to create an online, downloadable (as opposed to searchable-only) public-domain Chinese-English dictionary. CEDICT was started by Paul Andrew Denisowski in October 1997. For the most part, the project is modeled on Jim Breen's highly successful EDICT (Japanese-English dictionary) project and is intended to be a collaborative effort, with users providing entries and corrections to the main file.


## Parse CC-CEDICT to JSON format

1. Parse to JSON

```
run.sh
```

2. Result

```
exp/
|-- cedict
`-- cedict.json

0 directories, 2 files
```

```
4c4bffc84e24467fe1b2ea9ba37ed6b6 exp/cedict
3adf504dacd13886f88cc9fe3b37c75d exp/cedict.json
```

```
==> exp/cedict <==
# CC-CEDICT
# Community maintained free Chinese-English dictionary.
#
# Published by MDBG
#
# License:
# Creative Commons Attribution-ShareAlike 4.0 International License
# https://creativecommons.org/licenses/by-sa/4.0/
#
# Referenced works:

==> exp/cedict.json <==
{"traditional": "2019\u51a0\u72c0\u75c5\u6bd2\u75c5", "simplified": "2019\u51a0\u72b6\u75c5\u6bd2\u75c5", "pinyin": "er4 ling2 yi1 jiu3 guan1 zhuang4 bing4 du2 bing4", "english": "COVID-19, the coronavirus disease identified in 2019"}
{"traditional": "21\u4e09\u9ad4\u7d9c\u5408\u75c7", "simplified": "21\u4e09\u4f53\u7efc\u5408\u75c7", "pinyin": "er4 shi2 yi1 san1 ti3 zong1 he2 zheng4", "english": "trisomy"}
{"traditional": "3C", "simplified": "3C", "pinyin": "san1 C", "english": "abbr. for computers, communications, and consumer electronics"}
{"traditional": "3P", "simplified": "3P", "pinyin": "san1 P", "english": "(slang) threesome"}
{"traditional": "3Q", "simplified": "3Q", "pinyin": "san1 Q", "english": "(Internet slang) thank you (loanword)"}
{"traditional": "421", "simplified": "421", "pinyin": "si4 er4 yi1", "english": "four grandparents, two parents and an only child"}
{"traditional": "502\u81a0", "simplified": "502\u80f6", "pinyin": "wu3 ling2 er4 jiao1", "english": "cyanoacrylate glue"}
{"traditional": "88", "simplified": "88", "pinyin": "ba1 ba1", "english": "(Internet slang) bye-bye (alternative for \u62dc\u62dc[bai2 bai2])"}
{"traditional": "996", "simplified": "996", "pinyin": "jiu3 jiu3 liu4", "english": "9am-9pm, six days a week (work schedule)"}
{"traditional": "A", "simplified": "A", "pinyin": "A", "english": "(slang) (Tw) to steal"}
```
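For reference, a minimal sketch of the parsing step that `run.sh` performs; the regular expression and gloss joining below are assumptions based on the standard CC-CEDICT line format (`traditional simplified [pinyin] /gloss1/gloss2/`) and the JSON output shown above, not the script's actual implementation:

```
import json
import re

# CC-CEDICT entry format: "traditional simplified [pin1 yin1] /english/.../"
ENTRY = re.compile(r'^(\S+) (\S+) \[([^\]]+)\] /(.+)/$')


def parse_line(line):
    """Parse one CC-CEDICT line into the dict shape shown above."""
    match = ENTRY.match(line.strip())
    if match is None:  # e.g. '#' comment lines in the header
        return None
    traditional, simplified, pinyin, english = match.groups()
    return {
        "traditional": traditional,
        "simplified": simplified,
        "pinyin": pinyin,
        "english": english.replace("/", "; "),  # join multiple glosses
    }


line = "3Q 3Q [san1 Q] /(Internet slang) thank you (loanword)/"
print(json.dumps(parse_line(line), ensure_ascii=False))
```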
@ -1,5 +0,0 @@
# Download Baker dataset

The Baker dataset has to be downloaded manually and moved to `data/`, because you have to pass a CAPTCHA in a browser to download it.

Download URL: https://test.data-baker.com/#/data/index/source.
@ -0,0 +1,4 @@
*.tgz
manifest.*
*.meta
aidatatang_200zh/
@ -0,0 +1,14 @@
# [Aidatatang_200zh](http://www.openslr.org/62/)

Aidatatang_200zh is a free Chinese Mandarin speech corpus provided by Beijing DataTang Technology Co., Ltd under the Creative Commons Attribution-NonCommercial-NoDerivatives 4.0 International Public License.
The contents and the corresponding descriptions of the corpus include:

* The corpus contains 200 hours of acoustic data, which is mostly mobile-recorded data.
* 600 speakers from different accent areas in China were invited to participate in the recording.
* The transcription accuracy of each sentence is higher than 98%.
* Recordings were conducted in a quiet indoor environment.
* The database is divided into a training set, a validation set, and a testing set in a ratio of 7:1:2.
* Detailed information such as speech data coding and speaker information is preserved in the metadata file.
* Segmented transcripts are also provided.

The corpus aims to support researchers in speech recognition, machine translation, voiceprint recognition, and other speech-related fields. Therefore, the corpus is totally free for academic use.
@ -0,0 +1,153 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Prepare aidatatang_200zh mandarin dataset

Download, unpack and create manifest files.
Manifest file is a json-format file with each line containing the
meta data (i.e. audio filepath, transcript and audio duration)
of each audio file in the data set.
"""
import argparse
import codecs
import json
import os

import soundfile

from utils.utility import download
from utils.utility import unpack

DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')

URL_ROOT = 'http://www.openslr.org/resources/62'
# URL_ROOT = 'https://openslr.magicdatatech.com/resources/62'
DATA_URL = URL_ROOT + '/aidatatang_200zh.tgz'
MD5_DATA = '6e0f4f39cd5f667a7ee53c397c8d0949'

parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
    "--target_dir",
    default=DATA_HOME + "/aidatatang_200zh",
    type=str,
    help="Directory to save the dataset. (default: %(default)s)")
parser.add_argument(
    "--manifest_prefix",
    default="manifest",
    type=str,
    help="Filepath prefix for output manifests. (default: %(default)s)")
args = parser.parse_args()


def create_manifest(data_dir, manifest_path_prefix):
    print("Creating manifest %s ..." % manifest_path_prefix)
    json_lines = []
    transcript_path = os.path.join(data_dir, 'transcript',
                                   'aidatatang_200_zh_transcript.txt')
    transcript_dict = {}
    for line in codecs.open(transcript_path, 'r', 'utf-8'):
        line = line.strip()
        if line == '':
            continue
        audio_id, text = line.split(' ', 1)
        # remove whitespace; keep only the character text
        text = ''.join(text.split())
        transcript_dict[audio_id] = text

    data_types = ['train', 'dev', 'test']
    for dtype in data_types:
        del json_lines[:]
        total_sec = 0.0
        total_text = 0.0
        total_num = 0

        audio_dir = os.path.join(data_dir, 'corpus/', dtype)
        for subfolder, _, filelist in sorted(os.walk(audio_dir)):
            for fname in filelist:
                if not fname.endswith('.wav'):
                    continue

                audio_path = os.path.abspath(os.path.join(subfolder, fname))
                audio_id = os.path.basename(fname)[:-4]

                audio_data, samplerate = soundfile.read(audio_path)
                duration = float(len(audio_data) / samplerate)
                text = transcript_dict[audio_id]
                json_lines.append(
                    json.dumps(
                        {
                            'utt': audio_id,
                            'feat': audio_path,
                            'feat_shape': (duration, ),  # second
                            'text': text,
                        },
                        ensure_ascii=False))

                total_sec += duration
                total_text += len(text)
                total_num += 1

        manifest_path = manifest_path_prefix + '.' + dtype
        with codecs.open(manifest_path, 'w', 'utf-8') as fout:
            for line in json_lines:
                fout.write(line + '\n')

        manifest_dir = os.path.dirname(manifest_path_prefix)
        meta_path = os.path.join(manifest_dir, dtype) + '.meta'
        with open(meta_path, 'w') as f:
            print(f"{dtype}:", file=f)
            print(f"{total_num} utts", file=f)
            print(f"{total_sec / (60*60)} h", file=f)
            print(f"{total_text} text", file=f)
            print(f"{total_text / total_sec} text/sec", file=f)
            print(f"{total_sec / total_num} sec/utt", file=f)


def prepare_dataset(url, md5sum, target_dir, manifest_path, subset):
    """Download, unpack and create manifest file."""
    data_dir = os.path.join(target_dir, subset)
    if not os.path.exists(data_dir):
        filepath = download(url, md5sum, target_dir)
        unpack(filepath, target_dir)
        # unpack all audio tar files
        audio_dir = os.path.join(data_dir, 'corpus')
        for subfolder, dirlist, filelist in sorted(os.walk(audio_dir)):
            for sub in dirlist:
                print(f"unpack dir {sub}...")
                for folder, _, filelist in sorted(
                        os.walk(os.path.join(subfolder, sub))):
                    for ftar in filelist:
                        unpack(os.path.join(folder, ftar), folder, True)
    else:
        print("Skip downloading and unpacking. Data already exists in %s." %
              target_dir)

    create_manifest(data_dir, manifest_path)


def main():
    if args.target_dir.startswith('~'):
        args.target_dir = os.path.expanduser(args.target_dir)

    prepare_dataset(
        url=DATA_URL,
        md5sum=MD5_DATA,
        target_dir=args.target_dir,
        manifest_path=args.manifest_prefix,
        subset='aidatatang_200zh')

    print("Data download and manifest prepare done!")


if __name__ == '__main__':
    main()
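Each line of the resulting manifest is a standalone JSON object; a minimal sketch of reading one back — the utterance id and path below are made-up examples, not real corpus entries:

```
import json

# One hypothetical line of manifest.train as written by create_manifest().
line = ('{"utt": "T0055G0013S0001", '
        '"feat": "/data/aidatatang_200zh/corpus/train/T0055G0013S0001.wav", '
        '"feat_shape": [3.21], "text": "..."}')
record = json.loads(line)
print(record['utt'], record['feat_shape'][0], 'seconds')
```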
@ -1 +1,5 @@
data_aishell*
*.meta
manifest.*
*.tgz
resource_aishell
@ -0,0 +1,3 @@
# [Aishell1](http://www.openslr.org/33/)

This open-source Mandarin speech corpus, AISHELL-ASR0009-OS1, is 178 hours long. It is a part of AISHELL-ASR0009, whose utterances cover 11 domains, including smart home, autonomous driving, and industrial production. All recordings were made in a quiet indoor environment, using three different devices at the same time: a high-fidelity microphone (44.1 kHz, 16-bit), an Android-system mobile phone (16 kHz, 16-bit), and an iOS-system mobile phone (16 kHz, 16-bit). The high-fidelity audio was re-sampled to 16 kHz to build AISHELL-ASR0009-OS1. 400 speakers from different accent areas in China were invited to participate in the recording. The manual transcription accuracy rate is above 95%, achieved through professional speech annotation and strict quality inspection. The corpus is divided into training, development, and testing sets. (This database is free for academic research; commercial use requires permission.)
@ -0,0 +1,3 @@
# [Aishell3](http://www.openslr.org/93/)

AISHELL-3 is a large-scale, high-fidelity multi-speaker Mandarin speech corpus that can be used to train multi-speaker Text-to-Speech (TTS) systems. The corpus contains roughly **85 hours** of emotion-neutral recordings spoken by 218 native Chinese Mandarin speakers, for a total of 88,035 utterances. Auxiliary attributes such as gender, age group, and native accent are explicitly marked and provided in the corpus. Accordingly, character-level and pinyin-level transcripts are provided along with the recordings. The word and tone transcription accuracy rate is above 98%, achieved through professional speech annotation and strict quality inspection for tone and prosody. (This database is free for academic research; commercial use requires permission.)
@ -0,0 +1 @@
GigaSpeech/
@ -0,0 +1,10 @@
# [GigaSpeech](https://github.com/SpeechColab/GigaSpeech)

```
git clone https://github.com/SpeechColab/GigaSpeech.git

cd GigaSpeech
utils/gigaspeech_download.sh /disk1/audio_data/gigaspeech
toolkits/kaldi/gigaspeech_data_prep.sh --train-subset XL /disk1/audio_data/gigaspeech ../data
cd ..
```
@ -0,0 +1,13 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
@ -0,0 +1,14 @@
#!/bin/bash

set -e

curdir=$PWD

test -d GigaSpeech || git clone https://github.com/SpeechColab/GigaSpeech.git


pushd GigaSpeech
source env_vars.sh
./utils/download_gigaspeech.sh ${curdir}/
#toolkits/kaldi/gigaspeech_data_prep.sh --train-subset XL /disk1/audio_data/gigaspeech ../data
popd
@ -1,7 +1,9 @@
dev-clean/
dev-other/
test-clean/
test-other/
train-clean-100/
train-clean-360/
train-other-500/
dev-clean
dev-other
test-clean
test-other
train-clean-100
train-clean-360
train-other-500
*.meta
manifest.*
@ -0,0 +1,15 @@
# [MagicData](http://www.openslr.org/68/)

The MAGICDATA Mandarin Chinese Read Speech Corpus was developed by MAGIC DATA Technology Co., Ltd. and freely published for non-commercial use.
The contents and the corresponding descriptions of the corpus include:

* The corpus contains 755 hours of speech data, which is mostly mobile-recorded data.
* 1080 speakers from different accent areas in China were invited to participate in the recording.
* The sentence transcription accuracy is higher than 98%.
* Recordings were conducted in a quiet indoor environment.
* The database is divided into a training set, a validation set, and a testing set in a ratio of 51:1:2.
* Detailed information such as speech data coding and speaker information is preserved in the metadata file.
* The domain of the recording texts is diversified, including interactive Q&A, music search, SNS messages, home command and control, etc.
* Segmented transcripts are also provided.

The corpus aims to support researchers in speech recognition, machine translation, speaker recognition, and other speech-related fields. Therefore, the corpus is totally free for academic use.
@ -0,0 +1,11 @@
# multi-cn

This is a Chinese speech recognition recipe that trains on all Chinese corpora on OpenSLR, including:

* Aidatatang (140 hours)
* Aishell (151 hours)
* MagicData (712 hours)
* Primewords (99 hours)
* ST-CMDS (110 hours)
* THCHS-30 (26 hours)
* optional AISHELL2 (~1000 hours) if available
@ -0,0 +1,6 @@
# [Primewords](http://www.openslr.org/47/)

This free Chinese Mandarin speech corpus is released by Shanghai Primewords Information Technology Co., Ltd.
The corpus was recorded on smart mobile phones by 296 native Chinese speakers. The transcription accuracy is higher than 98%, at a confidence level of 95%. It is free for academic use.

The mapping between the transcripts and utterances is given in JSON format.
@ -0,0 +1 @@
# [FreeST](http://www.openslr.org/38/)
@ -0,0 +1,6 @@
*.tar.gz.*
manifest.*
*.md
EN-ZH/
train-split/
test-segment/
@ -0,0 +1,116 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Prepare Ted-En-Zh speech translation dataset

Create manifest files from the split dataset.
dev set: tst2010, test set: tst2015
Manifest file is a json-format file with each line containing the
meta data (i.e. audio filepath, transcript and audio duration)
of each audio file in the data set.
"""
import argparse
import codecs
import json
import os

import soundfile

parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
    "--src_dir",
    default="",
    type=str,
    help="Directory to the Kaldi-split data. (default: %(default)s)")
parser.add_argument(
    "--manifest_prefix",
    default="manifest",
    type=str,
    help="Filepath prefix for output manifests. (default: %(default)s)")
args = parser.parse_args()


def create_manifest(data_dir, manifest_path_prefix):
    print("Creating manifest %s ..." % manifest_path_prefix)
    json_lines = []

    data_types_infos = [
        ('train', 'train-split/train-segment', 'En-Zh/train.en-zh'),
        ('dev', 'test-segment/tst2010', 'En-Zh/tst2010.en-zh'),
        ('test', 'test-segment/tst2015', 'En-Zh/tst2015.en-zh')
    ]
    for data_info in data_types_infos:
        dtype, audio_relative_dir, text_relative_path = data_info
        del json_lines[:]
        total_sec = 0.0
        total_text = 0.0
        total_num = 0

        text_path = os.path.join(data_dir, text_relative_path)
        audio_dir = os.path.join(data_dir, audio_relative_dir)

        for line in codecs.open(text_path, 'r', 'utf-8', errors='ignore'):
            line = line.strip()
            if len(line) < 1:
                continue
            audio_id, transcription, translation = line.split('\t')
            utt = audio_id.split('.')[0]

            audio_path = os.path.join(audio_dir, audio_id)
            if os.path.exists(audio_path):
                if os.path.getsize(audio_path) < 30000:
                    continue
                audio_data, samplerate = soundfile.read(audio_path)
                duration = float(len(audio_data) / samplerate)
                json_lines.append(
                    json.dumps(
                        {
                            'utt': utt,
                            'feat': audio_path,
                            'feat_shape': (duration, ),  # second
                            'text': " ".join(translation.split()),
                            'text1': " ".join(transcription.split())
                        },
                        ensure_ascii=False))

                total_sec += duration
                total_text += len(translation.split())
                total_num += 1
                if not total_num % 1000:
                    print(dtype, 'Processed:', total_num)

        manifest_path = manifest_path_prefix + '.' + dtype + '.raw'
        with codecs.open(manifest_path, 'w', 'utf-8') as fout:
            for line in json_lines:
                fout.write(line + '\n')


def prepare_dataset(src_dir, manifest_path=None):
    """Create manifest file."""
    if os.path.isdir(manifest_path):
        manifest_path = os.path.join(manifest_path, 'manifest')
    if manifest_path:
        create_manifest(src_dir, manifest_path)


def main():
    if args.src_dir.startswith('~'):
        args.src_dir = os.path.expanduser(args.src_dir)

    prepare_dataset(src_dir=args.src_dir, manifest_path=args.manifest_prefix)

    print("manifest prepare done!")


if __name__ == '__main__':
    main()
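For reference, each line of the `En-Zh/*.en-zh` text files is expected to be tab-separated as `audio_id<TAB>transcription<TAB>translation`, matching the `line.split('\t')` above; a minimal sketch with a made-up line:

```
# Hypothetical line in the En-Zh text format parsed by create_manifest().
line = "ted_001_0001.wav\tthank you very much\t非常感谢"
audio_id, transcription, translation = line.split('\t')
utt = audio_id.split('.')[0]
print(utt, transcription, translation)
```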