E2E/Streaming Transformer/Conformer ASR (#578)
* add cmvn and label smoothing loss layer * add layer for transformer * add glu and conformer conv * add torch compatiable hack, mask funcs * not hack size since it exists * add test; attention * add attention, common utils, hack paddle * add audio utils * conformer batch padding mask bug fix #223 * fix typo, python infer fix rnn mem opt name error and batchnorm1d, will be available at 2.0.2 * fix ci * fix ci * add encoder * refactor egs * add decoder * refactor ctc, add ctc align, refactor ckpt, add warmup lr scheduler, cmvn utils * refactor docs * add fix * fix readme * fix bugs, refactor collator, add pad_sequence, fix ckpt bugs * fix docstring * refactor data feed order * add u2 model * refactor cmvn, test * add utils * add u2 config * fix bugs * fix bugs * fix autograd maybe has problem when using inplace operation * refactor data, build vocab; add format data * fix text featurizer * refactor build vocab * add fbank, refactor feature of speech * refactor audio feat * refactor data preprare * refactor data * model init from config * add u2 bins * flake8 * can train * fix bugs, add coverage, add scripts * test can run * fix data * speed perturb with sox * add spec aug * fix for train * fix train logitc * fix logger * log valid loss, time dataset process * using np for speed perturb, remove some debug log of grad clip * fix logger * fix build vocab * fix logger name * using module logger as default * fix * fix install * reorder imports * fix board logger * fix logger * kaldi fbank and mfcc * fix cmvn and print prarams * fix add_eos_sos and cmvn * fix cmvn compute * fix logger and cmvn * fix subsampling, label smoothing loss, remove useless * add notebook test * fix log * fix tb logger * multi gpu valid * fix log * fix log * fix config * fix compute cmvn, need paddle 2.1 * add cmvn notebook * fix layer tools * fix compute cmvn * add rtf * fix decoding * fix layer tools * fix log, add avg script * more avg and test info * fix dataset pickle problem; using 2.1 
paddle; num_workers can > 0; ckpt save in exp dir;fix setup.sh; * add vimrc * refactor tiny script, add transformer and stream conf * spm demo; librisppech scripts and confs * fix log * add librispeech scripts * refactor data pipe; fix conf; fix u2 default params * fix bugs * refactor aishell scripts * fix test * fix cmvn * fix s0 scripts * fix ds2 scripts and bugs * fix dev & test dataset filter * fix dataset filter * filter dev * fix ckpt path * filter test, since librispeech will cause OOM, but all test wer will be worse, since mismatch train with test * add comment * add syllable doc * fix ds2 configs * add doc * add pypinyin tools * fix decoder using blank_id=0 * mmseg with pybind11 * format codepull/604/head
parent
3a2de9e461
commit
71e046b0ba
@ -0,0 +1,50 @@
|
|||||||
|
[flake8]
|
||||||
|
|
||||||
|
########## OPTIONS ##########
|
||||||
|
# Set the maximum length that any line (with some exceptions) may be.
|
||||||
|
max-line-length = 120
|
||||||
|
|
||||||
|
|
||||||
|
################### FILE PATTERNS ##########################
|
||||||
|
# Provide a comma-separated list of glob patterns to exclude from checks.
|
||||||
|
exclude =
|
||||||
|
# git folder
|
||||||
|
.git,
|
||||||
|
# python cache
|
||||||
|
__pycache__,
|
||||||
|
third_party/,
|
||||||
|
# Provide a comma-separate list of glob patterns to include for checks.
|
||||||
|
filename =
|
||||||
|
*.py
|
||||||
|
|
||||||
|
|
||||||
|
########## RULES ##########
|
||||||
|
|
||||||
|
# ERROR CODES
|
||||||
|
#
|
||||||
|
# E/W - PEP8 errors/warnings (pycodestyle)
|
||||||
|
# F - linting errors (pyflakes)
|
||||||
|
# C - McCabe complexity error (mccabe)
|
||||||
|
#
|
||||||
|
# W503 - line break before binary operator
|
||||||
|
|
||||||
|
# Specify a list of codes to ignore.
|
||||||
|
ignore =
|
||||||
|
W503
|
||||||
|
E252,E262,E127,E265,E126,E266,E241,E261,E128,E125
|
||||||
|
W291,W293,W605
|
||||||
|
E203,E305,E402,E501,E721,E741,F403,F405,F821,F841,F999,W503,W504,C408,E302,W291,E303,
|
||||||
|
# shebang has extra meaning in fbcode lints, so I think it's not worth trying
|
||||||
|
# to line this up with executable bit
|
||||||
|
EXE001,
|
||||||
|
# these ignores are from flake8-bugbear; please fix!
|
||||||
|
B007,B008,
|
||||||
|
# these ignores are from flake8-comprehensions; please fix!
|
||||||
|
C400,C401,C402,C403,C404,C405,C407,C411,C413,C414,C415
|
||||||
|
|
||||||
|
# Specify the list of error codes you wish Flake8 to report.
|
||||||
|
select =
|
||||||
|
E,
|
||||||
|
W,
|
||||||
|
F,
|
||||||
|
C
|
@ -0,0 +1,48 @@
|
|||||||
|
[alias]
|
||||||
|
st = status
|
||||||
|
ci = commit
|
||||||
|
br = branch
|
||||||
|
co = checkout
|
||||||
|
df = diff
|
||||||
|
l = log --pretty=format:\"%h %ad | %s%d [%an]\" --graph --date=short
|
||||||
|
ll = log --stat
|
||||||
|
|
||||||
|
[merge]
|
||||||
|
tool = vimdiff
|
||||||
|
|
||||||
|
[core]
|
||||||
|
excludesfile = ~/.gitignore
|
||||||
|
editor = vim
|
||||||
|
|
||||||
|
[color]
|
||||||
|
branch = auto
|
||||||
|
diff = auto
|
||||||
|
status = auto
|
||||||
|
|
||||||
|
[color "branch"]
|
||||||
|
current = yellow reverse
|
||||||
|
local = yellow
|
||||||
|
remote = green
|
||||||
|
|
||||||
|
[color "diff"]
|
||||||
|
meta = yellow bold
|
||||||
|
frag = magenta bold
|
||||||
|
old = red bold
|
||||||
|
new = green bold
|
||||||
|
|
||||||
|
[color "status"]
|
||||||
|
added = yellow
|
||||||
|
changed = green
|
||||||
|
untracked = cyan
|
||||||
|
|
||||||
|
[push]
|
||||||
|
default = matching
|
||||||
|
|
||||||
|
[credential]
|
||||||
|
helper = store
|
||||||
|
|
||||||
|
[user]
|
||||||
|
name =
|
||||||
|
email =
|
||||||
|
|
||||||
|
|
@ -0,0 +1,605 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 1,
|
||||||
|
"id": "academic-surname",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import paddle\n",
|
||||||
|
"from paddle import nn"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
|
"id": "fundamental-treasure",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stderr",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"/workspace/DeepSpeech-2.x/tools/venv-dev/lib/python3.7/site-packages/ipykernel/ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.\n",
|
||||||
|
" and should_run_async(code)\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"L = nn.Linear(256, 2048)\n",
|
||||||
|
"L2 = nn.Linear(2048, 256)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 3,
|
||||||
|
"id": "consolidated-elephant",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import numpy as np\n",
|
||||||
|
"import torch\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 4,
|
||||||
|
"id": "moderate-noise",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"float64\n",
|
||||||
|
"Tensor(shape=[2, 51, 256], dtype=float32, place=CUDAPlace(0), stop_gradient=True,\n",
|
||||||
|
" [[[-1.54171216, -2.61531472, -1.79881978, ..., -0.31395876, 0.56513089, -0.44516513],\n",
|
||||||
|
" [-0.79492962, 1.91157901, 0.66567147, ..., 0.54825783, -1.01471853, -0.84924090],\n",
|
||||||
|
" [-1.22556651, -0.36225814, 0.65063190, ..., 0.65726501, 0.05563191, 0.09009409],\n",
|
||||||
|
" ...,\n",
|
||||||
|
" [ 0.38615900, -0.77905393, 0.99732304, ..., -1.38463700, -3.32365036, -1.31089687],\n",
|
||||||
|
" [ 0.05579993, 0.06885809, -1.66662002, ..., -0.23346378, -3.29372883, 1.30561364],\n",
|
||||||
|
" [ 1.90676069, 1.95093191, -0.28849599, ..., -0.06860496, 0.95347673, 1.00475824]],\n",
|
||||||
|
"\n",
|
||||||
|
" [[-0.91453546, 0.55298805, -1.06146812, ..., -0.86378336, 1.00454640, 1.26062179],\n",
|
||||||
|
" [ 0.10223761, 0.81301165, 2.36865163, ..., 0.16821407, 0.29240361, 1.05408621],\n",
|
||||||
|
" [-1.33196676, 1.94433689, 0.01934209, ..., 0.48036841, 0.51585966, 1.22893548],\n",
|
||||||
|
" ...,\n",
|
||||||
|
" [-0.19558455, -0.47075930, 0.90796155, ..., -1.28598249, -0.24321797, 0.17734711],\n",
|
||||||
|
" [ 0.89819717, -1.39516675, 0.17138045, ..., 2.39761519, 1.76364994, -0.52177650],\n",
|
||||||
|
" [ 0.94122332, -0.18581429, 1.36099780, ..., 0.67647684, -0.04699665, 1.51205540]]])\n",
|
||||||
|
"tensor([[[-1.5417, -2.6153, -1.7988, ..., -0.3140, 0.5651, -0.4452],\n",
|
||||||
|
" [-0.7949, 1.9116, 0.6657, ..., 0.5483, -1.0147, -0.8492],\n",
|
||||||
|
" [-1.2256, -0.3623, 0.6506, ..., 0.6573, 0.0556, 0.0901],\n",
|
||||||
|
" ...,\n",
|
||||||
|
" [ 0.3862, -0.7791, 0.9973, ..., -1.3846, -3.3237, -1.3109],\n",
|
||||||
|
" [ 0.0558, 0.0689, -1.6666, ..., -0.2335, -3.2937, 1.3056],\n",
|
||||||
|
" [ 1.9068, 1.9509, -0.2885, ..., -0.0686, 0.9535, 1.0048]],\n",
|
||||||
|
"\n",
|
||||||
|
" [[-0.9145, 0.5530, -1.0615, ..., -0.8638, 1.0045, 1.2606],\n",
|
||||||
|
" [ 0.1022, 0.8130, 2.3687, ..., 0.1682, 0.2924, 1.0541],\n",
|
||||||
|
" [-1.3320, 1.9443, 0.0193, ..., 0.4804, 0.5159, 1.2289],\n",
|
||||||
|
" ...,\n",
|
||||||
|
" [-0.1956, -0.4708, 0.9080, ..., -1.2860, -0.2432, 0.1773],\n",
|
||||||
|
" [ 0.8982, -1.3952, 0.1714, ..., 2.3976, 1.7636, -0.5218],\n",
|
||||||
|
" [ 0.9412, -0.1858, 1.3610, ..., 0.6765, -0.0470, 1.5121]]])\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "stderr",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"/workspace/DeepSpeech-2.x/tools/venv-dev/lib/python3.7/site-packages/ipykernel/ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.\n",
|
||||||
|
" and should_run_async(code)\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"x = np.random.randn(2, 51, 256)\n",
|
||||||
|
"print(x.dtype)\n",
|
||||||
|
"px = paddle.to_tensor(x, dtype='float32')\n",
|
||||||
|
"tx = torch.tensor(x, dtype=torch.float32)\n",
|
||||||
|
"print(px)\n",
|
||||||
|
"print(tx)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "cooked-progressive",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": []
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 5,
|
||||||
|
"id": "mechanical-prisoner",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"data = np.load('enc_0_ff_out.npz', allow_pickle=True)\n",
|
||||||
|
"t_norm_ff = data['norm_ff']\n",
|
||||||
|
"t_ff_out = data['ff_out']\n",
|
||||||
|
"t_ff_l_x = data['ff_l_x']\n",
|
||||||
|
"t_ff_l_a_x = data['ff_l_a_x']\n",
|
||||||
|
"t_ff_l_a_l_x = data['ff_l_a_l_x']\n",
|
||||||
|
"t_ps = data['ps']"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "indie-marriage",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": []
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 6,
|
||||||
|
"id": "assured-zambia",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"True\n",
|
||||||
|
"True\n",
|
||||||
|
"True\n",
|
||||||
|
"True\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"L.set_state_dict({'weight': t_ps[0].T, 'bias': t_ps[1]})\n",
|
||||||
|
"L2.set_state_dict({'weight': t_ps[2].T, 'bias': t_ps[3]})\n",
|
||||||
|
"\n",
|
||||||
|
"ps = []\n",
|
||||||
|
"for n, p in L.named_parameters():\n",
|
||||||
|
" ps.append(p)\n",
|
||||||
|
"\n",
|
||||||
|
"for n, p in L2.state_dict().items():\n",
|
||||||
|
" ps.append(p)\n",
|
||||||
|
" \n",
|
||||||
|
"for p, tp in zip(ps, t_ps):\n",
|
||||||
|
" print(np.allclose(p.numpy(), tp.T))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "committed-jacob",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": []
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "extreme-traffic",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": []
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "optimum-milwaukee",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": []
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 7,
|
||||||
|
"id": "viral-indian",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"True\n",
|
||||||
|
"True\n",
|
||||||
|
"True\n",
|
||||||
|
"True\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# data = np.load('enc_0_ff_out.npz', allow_pickle=True)\n",
|
||||||
|
"# t_norm_ff = data['norm_ff']\n",
|
||||||
|
"# t_ff_out = data['ff_out']\n",
|
||||||
|
"# t_ff_l_x = data['ff_l_x']\n",
|
||||||
|
"# t_ff_l_a_x = data['ff_l_a_x']\n",
|
||||||
|
"# t_ff_l_a_l_x = data['ff_l_a_l_x']\n",
|
||||||
|
"# t_ps = data['ps']\n",
|
||||||
|
"TL = torch.nn.Linear(256, 2048)\n",
|
||||||
|
"TL2 = torch.nn.Linear(2048, 256)\n",
|
||||||
|
"TL.load_state_dict({'weight': torch.tensor(t_ps[0]), 'bias': torch.tensor(t_ps[1])})\n",
|
||||||
|
"TL2.load_state_dict({'weight': torch.tensor(t_ps[2]), 'bias': torch.tensor(t_ps[3])})\n",
|
||||||
|
"\n",
|
||||||
|
"# for n, p in TL.named_parameters():\n",
|
||||||
|
"# print(n, p)\n",
|
||||||
|
"# for n, p in TL2.named_parameters():\n",
|
||||||
|
"# print(n, p)\n",
|
||||||
|
"\n",
|
||||||
|
"ps = []\n",
|
||||||
|
"for n, p in TL.state_dict().items():\n",
|
||||||
|
" ps.append(p.data.numpy())\n",
|
||||||
|
" \n",
|
||||||
|
"for n, p in TL2.state_dict().items():\n",
|
||||||
|
" ps.append(p.data.numpy())\n",
|
||||||
|
" \n",
|
||||||
|
"for p, tp in zip(ps, t_ps):\n",
|
||||||
|
" print(np.allclose(p, tp))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 8,
|
||||||
|
"id": "skilled-vietnamese",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"[[[ 0.67277956 0.08313607 -0.62761104 ... -0.17480263 0.42718208\n",
|
||||||
|
" -0.5787626 ]\n",
|
||||||
|
" [ 0.91516656 0.5393416 1.7159258 ... 0.06144593 0.06486575\n",
|
||||||
|
" -0.03350811]\n",
|
||||||
|
" [ 0.438351 0.6227843 0.24096036 ... 1.0912522 -0.90929437\n",
|
||||||
|
" -1.012989 ]\n",
|
||||||
|
" ...\n",
|
||||||
|
" [ 0.68631977 0.14240924 0.10763275 ... -0.11513516 0.48065388\n",
|
||||||
|
" 0.04070369]\n",
|
||||||
|
" [-0.9525228 0.23197874 0.31264272 ... 0.5312439 0.18773697\n",
|
||||||
|
" -0.8450228 ]\n",
|
||||||
|
" [ 0.42024016 -0.04561988 0.54541194 ... -0.41933843 -0.00436018\n",
|
||||||
|
" -0.06663495]]\n",
|
||||||
|
"\n",
|
||||||
|
" [[-0.11638781 -0.33566502 -0.20887226 ... 0.17423287 -0.9195841\n",
|
||||||
|
" -0.8161046 ]\n",
|
||||||
|
" [-0.3469874 0.88269687 -0.11887559 ... -0.15566081 0.16357468\n",
|
||||||
|
" -0.20766167]\n",
|
||||||
|
" [-0.3847657 0.3984318 -0.06963477 ... -0.00360622 1.2360432\n",
|
||||||
|
" -0.26811332]\n",
|
||||||
|
" ...\n",
|
||||||
|
" [ 0.08230796 -0.46158582 0.54582864 ... 0.15747628 -0.44790155\n",
|
||||||
|
" 0.06020184]\n",
|
||||||
|
" [-0.8095085 0.43163058 -0.42837143 ... 0.8627463 0.90656304\n",
|
||||||
|
" 0.15847842]\n",
|
||||||
|
" [-1.485811 -0.18216592 -0.8882585 ... 0.32596245 0.7822631\n",
|
||||||
|
" -0.6460344 ]]]\n",
|
||||||
|
"[[[ 0.67278004 0.08313602 -0.6276114 ... -0.17480245 0.42718196\n",
|
||||||
|
" -0.5787625 ]\n",
|
||||||
|
" [ 0.91516703 0.5393413 1.7159253 ... 0.06144581 0.06486579\n",
|
||||||
|
" -0.03350812]\n",
|
||||||
|
" [ 0.43835106 0.62278455 0.24096027 ... 1.0912521 -0.9092943\n",
|
||||||
|
" -1.0129892 ]\n",
|
||||||
|
" ...\n",
|
||||||
|
" [ 0.6863195 0.14240888 0.10763284 ... -0.11513527 0.48065376\n",
|
||||||
|
" 0.04070365]\n",
|
||||||
|
" [-0.9525231 0.23197863 0.31264275 ... 0.53124386 0.18773702\n",
|
||||||
|
" -0.84502304]\n",
|
||||||
|
" [ 0.42024007 -0.04561983 0.545412 ... -0.41933888 -0.00436005\n",
|
||||||
|
" -0.066635 ]]\n",
|
||||||
|
"\n",
|
||||||
|
" [[-0.11638767 -0.33566508 -0.20887226 ... 0.17423296 -0.9195838\n",
|
||||||
|
" -0.8161046 ]\n",
|
||||||
|
" [-0.34698725 0.88269705 -0.11887549 ... -0.15566081 0.16357464\n",
|
||||||
|
" -0.20766166]\n",
|
||||||
|
" [-0.3847657 0.3984319 -0.06963488 ... -0.00360619 1.2360426\n",
|
||||||
|
" -0.26811326]\n",
|
||||||
|
" ...\n",
|
||||||
|
" [ 0.08230786 -0.4615857 0.5458287 ... 0.15747619 -0.44790167\n",
|
||||||
|
" 0.06020182]\n",
|
||||||
|
" [-0.8095083 0.4316307 -0.42837155 ... 0.862746 0.9065631\n",
|
||||||
|
" 0.15847899]\n",
|
||||||
|
" [-1.485811 -0.18216613 -0.8882584 ... 0.32596254 0.7822631\n",
|
||||||
|
" -0.6460344 ]]]\n",
|
||||||
|
"True\n",
|
||||||
|
"False\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"y = L(px)\n",
|
||||||
|
"print(y.numpy())\n",
|
||||||
|
"\n",
|
||||||
|
"ty = TL(tx)\n",
|
||||||
|
"print(ty.data.numpy())\n",
|
||||||
|
"print(np.allclose(px.numpy(), tx.detach().numpy()))\n",
|
||||||
|
"print(np.allclose(y.numpy(), ty.detach().numpy()))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "incorrect-allah",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": []
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "prostate-cameroon",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": []
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 9,
|
||||||
|
"id": "governmental-surge",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"[[ 0.04476918 0.554463 -0.3027508 ... -0.49600336 0.3751858\n",
|
||||||
|
" 0.8254095 ]\n",
|
||||||
|
" [ 0.95594174 -0.29528382 -1.2899452 ... 0.43718258 0.05584608\n",
|
||||||
|
" -0.06974669]]\n",
|
||||||
|
"[[ 0.04476918 0.5544631 -0.3027507 ... -0.49600336 0.37518573\n",
|
||||||
|
" 0.8254096 ]\n",
|
||||||
|
" [ 0.95594174 -0.29528376 -1.2899454 ... 0.4371827 0.05584623\n",
|
||||||
|
" -0.0697467 ]]\n",
|
||||||
|
"True\n",
|
||||||
|
"False\n",
|
||||||
|
"True\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"x = np.random.randn(2, 256)\n",
|
||||||
|
"px = paddle.to_tensor(x, dtype='float32')\n",
|
||||||
|
"tx = torch.tensor(x, dtype=torch.float32)\n",
|
||||||
|
"y = L(px)\n",
|
||||||
|
"print(y.numpy())\n",
|
||||||
|
"ty = TL(tx)\n",
|
||||||
|
"print(ty.data.numpy())\n",
|
||||||
|
"print(np.allclose(px.numpy(), tx.detach().numpy()))\n",
|
||||||
|
"print(np.allclose(y.numpy(), ty.detach().numpy()))\n",
|
||||||
|
"print(np.allclose(y.numpy(), ty.detach().numpy(), atol=1e-5))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "confidential-jacket",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": []
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 10,
|
||||||
|
"id": "improved-civilization",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"5e7e7c9fde8350084abf1898cf52651cfc84b17a\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"print(paddle.version.commit)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 11,
|
||||||
|
"id": "d1e2d3b4",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"['__builtins__',\n",
|
||||||
|
" '__cached__',\n",
|
||||||
|
" '__doc__',\n",
|
||||||
|
" '__file__',\n",
|
||||||
|
" '__loader__',\n",
|
||||||
|
" '__name__',\n",
|
||||||
|
" '__package__',\n",
|
||||||
|
" '__spec__',\n",
|
||||||
|
" 'commit',\n",
|
||||||
|
" 'full_version',\n",
|
||||||
|
" 'istaged',\n",
|
||||||
|
" 'major',\n",
|
||||||
|
" 'minor',\n",
|
||||||
|
" 'mkl',\n",
|
||||||
|
" 'patch',\n",
|
||||||
|
" 'rc',\n",
|
||||||
|
" 'show',\n",
|
||||||
|
" 'with_mkl']"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 11,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"dir(paddle.version)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 12,
|
||||||
|
"id": "c880c719",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"2.1.0\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"print(paddle.version.full_version)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 13,
|
||||||
|
"id": "f26977bf",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"commit: 5e7e7c9fde8350084abf1898cf52651cfc84b17a\n",
|
||||||
|
"None\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"print(paddle.version.show())"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 14,
|
||||||
|
"id": "04ad47f6",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"1.6.0\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"print(torch.__version__)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 15,
|
||||||
|
"id": "e1e03830",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"['__builtins__',\n",
|
||||||
|
" '__cached__',\n",
|
||||||
|
" '__doc__',\n",
|
||||||
|
" '__file__',\n",
|
||||||
|
" '__loader__',\n",
|
||||||
|
" '__name__',\n",
|
||||||
|
" '__package__',\n",
|
||||||
|
" '__spec__',\n",
|
||||||
|
" '__version__',\n",
|
||||||
|
" 'cuda',\n",
|
||||||
|
" 'debug',\n",
|
||||||
|
" 'git_version',\n",
|
||||||
|
" 'hip']"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 15,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"dir(torch.version)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 19,
|
||||||
|
"id": "4ad0389b",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"'b31f58de6fa8bbda5353b3c77d9be4914399724d'"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 19,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"torch.version.git_version"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 21,
|
||||||
|
"id": "7870ea10",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"'10.2'"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 21,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"torch.version.cuda"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "db8ee5a7",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": []
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "6321ec2a",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": []
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.7.0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 5
|
||||||
|
}
|
File diff suppressed because one or more lines are too long
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,290 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 1,
|
||||||
|
"id": "breeding-haven",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"/home/ssd5/zhanghui/DeepSpeech2.x\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"'/home/ssd5/zhanghui/DeepSpeech2.x'"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 1,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"%cd ..\n",
|
||||||
|
"%pwd"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
|
"id": "appropriate-theta",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"LICENSE deepspeech examples\t\t requirements.txt tools\r\n",
|
||||||
|
"README.md docs\t libsndfile-1.0.28\t setup.sh\t utils\r\n",
|
||||||
|
"README_cn.md env.sh\t libsndfile-1.0.28.tar.gz tests\r\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"!ls"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 3,
|
||||||
|
"id": "entire-bloom",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stderr",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"/home/ssd5/zhanghui/DeepSpeech2.x/tools/venv/lib/python3.7/site-packages/paddle/fluid/layers/utils.py:26: DeprecationWarning: `np.int` is a deprecated alias for the builtin `int`. To silence this warning, use `int` by itself. Doing this will not modify any behavior and is safe. When replacing `np.int`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.\n",
|
||||||
|
"Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
|
||||||
|
" def convert_to_list(value, n, name, dtype=np.int):\n",
|
||||||
|
"WARNING:root:override cat of paddle.Tensor if exists or register, remove this when fixed!\n",
|
||||||
|
"WARNING:root:register user masked_fill to paddle.Tensor, remove this when fixed!\n",
|
||||||
|
"WARNING:root:register user masked_fill_ to paddle.Tensor, remove this when fixed!\n",
|
||||||
|
"WARNING:root:register user repeat to paddle.Tensor, remove this when fixed!\n",
|
||||||
|
"WARNING:root:register user glu to paddle.nn.functional, remove this when fixed!\n",
|
||||||
|
"WARNING:root:register user GLU to paddle.nn, remove this when fixed!\n",
|
||||||
|
"WARNING:root:register user ConstantPad2d to paddle.nn, remove this when fixed!\n",
|
||||||
|
"WARNING:root:override ctc_loss of paddle.nn.functional if exists, remove this when fixed!\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"from deepspeech.modules import loss"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 4,
|
||||||
|
"id": "governmental-aircraft",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stderr",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"/home/ssd5/zhanghui/DeepSpeech2.x/tools/venv/lib/python3.7/site-packages/ipykernel/ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.\n",
|
||||||
|
" and should_run_async(code)\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"import paddle"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 5,
|
||||||
|
"id": "proprietary-disaster",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"<function deepspeech.modules.repeat(xs: paddle.VarBase, *size: Any) -> paddle.VarBase>"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 5,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"paddle.Tensor.repeat"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 6,
|
||||||
|
"id": "first-diagram",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"<property at 0x7fb515eeeb88>"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 6,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"paddle.Tensor.size"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 7,
|
||||||
|
"id": "intelligent-david",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"<function paddle.tensor.manipulation.concat(x, axis=0, name=None)>"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 7,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"paddle.Tensor.cat"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 12,
|
||||||
|
"id": "bronze-tenant",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"a = paddle.to_tensor([12,32, 10, 12, 123,32 ,4])"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 13,
|
||||||
|
"id": "balanced-bearing",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"7"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 13,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"a.size"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 20,
|
||||||
|
"id": "extreme-republic",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"def size(xs: paddle.Tensor, *args: int) -> paddle.Tensor:\n",
|
||||||
|
" nargs = len(args)\n",
|
||||||
|
" assert (nargs <= 1)\n",
|
||||||
|
" s = paddle.shape(xs)\n",
|
||||||
|
" if nargs == 1:\n",
|
||||||
|
" return s[args[0]]\n",
|
||||||
|
" else:\n",
|
||||||
|
" return s\n",
|
||||||
|
"\n",
|
||||||
|
"# logger.warn(\n",
|
||||||
|
"# \"override size of paddle.Tensor if exists or register, remove this when fixed!\"\n",
|
||||||
|
"# )\n",
|
||||||
|
"paddle.Tensor.size = size"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 21,
|
||||||
|
"id": "gross-addiction",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"Tensor(shape=[1], dtype=int32, place=CPUPlace, stop_gradient=True,\n",
|
||||||
|
" [7])"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 21,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"a.size(0)\n",
|
||||||
|
"a.size()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 22,
|
||||||
|
"id": "adverse-dining",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"Tensor(shape=[1], dtype=int32, place=CPUPlace, stop_gradient=True,\n",
|
||||||
|
" [7])"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 22,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"a.size()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "popular-potato",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": []
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.7.0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 5
|
||||||
|
}
|
@ -0,0 +1,229 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 32,
|
||||||
|
"id": "academic-surname",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import paddle\n",
|
||||||
|
"from paddle import nn"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 33,
|
||||||
|
"id": "fundamental-treasure",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Parameter containing:\n",
|
||||||
|
"Tensor(shape=[256], dtype=float32, place=CUDAPlace(0), stop_gradient=False,\n",
|
||||||
|
" [1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])\n",
|
||||||
|
"Parameter containing:\n",
|
||||||
|
"Tensor(shape=[256], dtype=float32, place=CUDAPlace(0), stop_gradient=False,\n",
|
||||||
|
" [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"L = nn.LayerNorm(256, epsilon=1e-12)\n",
|
||||||
|
"for p in L.parameters():\n",
|
||||||
|
" print(p)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 34,
|
||||||
|
"id": "consolidated-elephant",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import numpy as np\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 46,
|
||||||
|
"id": "moderate-noise",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"float64\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"x = np.random.randn(2, 51, 256)\n",
|
||||||
|
"print(x.dtype)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 47,
|
||||||
|
"id": "cooked-progressive",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"y = L(paddle.to_tensor(x, dtype='float32'))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 48,
|
||||||
|
"id": "optimum-milwaukee",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import torch"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 49,
|
||||||
|
"id": "viral-indian",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Parameter containing:\n",
|
||||||
|
"tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n",
|
||||||
|
" 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n",
|
||||||
|
" 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n",
|
||||||
|
" 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n",
|
||||||
|
" 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n",
|
||||||
|
" 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n",
|
||||||
|
" 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n",
|
||||||
|
" 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n",
|
||||||
|
" 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n",
|
||||||
|
" 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n",
|
||||||
|
" 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n",
|
||||||
|
" 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n",
|
||||||
|
" 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n",
|
||||||
|
" 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n",
|
||||||
|
" 1., 1., 1., 1.], requires_grad=True)\n",
|
||||||
|
"Parameter containing:\n",
|
||||||
|
"tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
|
||||||
|
" 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
|
||||||
|
" 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
|
||||||
|
" 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
|
||||||
|
" 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
|
||||||
|
" 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
|
||||||
|
" 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
|
||||||
|
" 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
|
||||||
|
" 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
|
||||||
|
" 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
|
||||||
|
" 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],\n",
|
||||||
|
" requires_grad=True)\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"TL = torch.nn.LayerNorm(256, eps=1e-12)\n",
|
||||||
|
"for p in TL.parameters():\n",
|
||||||
|
" print(p)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 50,
|
||||||
|
"id": "skilled-vietnamese",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"ty = TL(torch.tensor(x, dtype=torch.float32))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 51,
|
||||||
|
"id": "incorrect-allah",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"False"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 51,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"np.allclose(y.numpy(), ty.detach().numpy())"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "prostate-cameroon",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": []
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 52,
|
||||||
|
"id": "governmental-surge",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"True"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 52,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"x = np.random.randn(2, 256)\n",
|
||||||
|
"y = L(paddle.to_tensor(x, dtype='float32'))\n",
|
||||||
|
"ty = TL(torch.tensor(x, dtype=torch.float32))\n",
|
||||||
|
"np.allclose(y.numpy(), ty.detach().numpy())"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "confidential-jacket",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": []
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.7.0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 5
|
||||||
|
}
|
@ -0,0 +1,449 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 1,
|
||||||
|
"id": "primary-organic",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import torch"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 38,
|
||||||
|
"id": "stopped-semester",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"def mask_finished_scores(score: torch.Tensor,\n",
|
||||||
|
" flag: torch.Tensor) -> torch.Tensor:\n",
|
||||||
|
" \"\"\"\n",
|
||||||
|
" If a sequence is finished, we only allow one alive branch. This function\n",
|
||||||
|
" aims to give one branch a zero score and the rest -inf score.\n",
|
||||||
|
" Args:\n",
|
||||||
|
" score (torch.Tensor): A real value array with shape\n",
|
||||||
|
" (batch_size * beam_size, beam_size).\n",
|
||||||
|
" flag (torch.Tensor): A bool array with shape\n",
|
||||||
|
" (batch_size * beam_size, 1).\n",
|
||||||
|
" Returns:\n",
|
||||||
|
" torch.Tensor: (batch_size * beam_size, beam_size).\n",
|
||||||
|
" \"\"\"\n",
|
||||||
|
" beam_size = score.size(-1)\n",
|
||||||
|
" zero_mask = torch.zeros_like(flag, dtype=torch.bool)\n",
|
||||||
|
" if beam_size > 1:\n",
|
||||||
|
" unfinished = torch.cat((zero_mask, flag.repeat([1, beam_size - 1])),\n",
|
||||||
|
" dim=1)\n",
|
||||||
|
" finished = torch.cat((flag, zero_mask.repeat([1, beam_size - 1])),\n",
|
||||||
|
" dim=1)\n",
|
||||||
|
" else:\n",
|
||||||
|
" unfinished = zero_mask\n",
|
||||||
|
" finished = flag\n",
|
||||||
|
" print(unfinished)\n",
|
||||||
|
" print(finished)\n",
|
||||||
|
" score.masked_fill_(unfinished, -float('inf'))\n",
|
||||||
|
" score.masked_fill_(finished, 0)\n",
|
||||||
|
" return score"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 58,
|
||||||
|
"id": "agreed-portuguese",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"tensor([[ True],\n",
|
||||||
|
" [False]])\n",
|
||||||
|
"tensor([[-0.8841, 0.7381, -0.9986],\n",
|
||||||
|
" [ 0.2675, -0.7971, 0.3798]])\n",
|
||||||
|
"tensor([[ True, True],\n",
|
||||||
|
" [False, False]])\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"score = torch.randn((2, 3))\n",
|
||||||
|
"flag = torch.ones((2, 1), dtype=torch.bool)\n",
|
||||||
|
"flag[1] = False\n",
|
||||||
|
"print(flag)\n",
|
||||||
|
"print(score)\n",
|
||||||
|
"print(flag.repeat([1, 2]))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 59,
|
||||||
|
"id": "clean-aspect",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"tensor([[False, True, True],\n",
|
||||||
|
" [False, False, False]])\n",
|
||||||
|
"tensor([[ True, False, False],\n",
|
||||||
|
" [False, False, False]])\n",
|
||||||
|
"tensor([[ 0.0000, -inf, -inf],\n",
|
||||||
|
" [ 0.2675, -0.7971, 0.3798]])\n",
|
||||||
|
"tensor([[ 0.0000, -inf, -inf],\n",
|
||||||
|
" [ 0.2675, -0.7971, 0.3798]])\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"r = mask_finished_scores(score, flag)\n",
|
||||||
|
"print(r)\n",
|
||||||
|
"print(score)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 55,
|
||||||
|
"id": "thrown-airline",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Tensor(shape=[2, 1], dtype=bool, place=CUDAPlace(0), stop_gradient=True,\n",
|
||||||
|
" [[True ],\n",
|
||||||
|
" [False]])\n",
|
||||||
|
"Tensor(shape=[2, 3], dtype=float32, place=CUDAPlace(0), stop_gradient=True,\n",
|
||||||
|
" [[ 2.05994511, 1.87704289, 0.01988174],\n",
|
||||||
|
" [-0.40165186, 0.77547729, -0.64469045]])\n",
|
||||||
|
"Tensor(shape=[2, 2], dtype=bool, place=CUDAPlace(0), stop_gradient=True,\n",
|
||||||
|
" [[True , True ],\n",
|
||||||
|
" [False, False]])\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"import paddle\n",
|
||||||
|
"\n",
|
||||||
|
"score = paddle.randn((2, 3))\n",
|
||||||
|
"flag = paddle.ones((2, 1), dtype='bool')\n",
|
||||||
|
"flag[1] = False\n",
|
||||||
|
"print(flag)\n",
|
||||||
|
"print(score)\n",
|
||||||
|
"print(flag.tile([1, 2]))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 56,
|
||||||
|
"id": "internal-patent",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Tensor(shape=[2, 3], dtype=bool, place=CUDAPlace(0), stop_gradient=True,\n",
|
||||||
|
" [[False, True , True ],\n",
|
||||||
|
" [False, False, False]])\n",
|
||||||
|
"Tensor(shape=[2, 3], dtype=bool, place=CUDAPlace(0), stop_gradient=True,\n",
|
||||||
|
" [[True , False, False],\n",
|
||||||
|
" [False, False, False]])\n",
|
||||||
|
"x Tensor(shape=[2, 3], dtype=float32, place=CUDAPlace(0), stop_gradient=True,\n",
|
||||||
|
" [[ 2.05994511, 1.87704289, 0.01988174],\n",
|
||||||
|
" [-0.40165186, 0.77547729, -0.64469045]])\n",
|
||||||
|
"2 Tensor(shape=[2, 3], dtype=float32, place=CUDAPlace(0), stop_gradient=True,\n",
|
||||||
|
" [[ 2.05994511, 1.87704289, 0.01988174],\n",
|
||||||
|
" [-0.40165186, 0.77547729, -0.64469045]])\n",
|
||||||
|
"3 Tensor(shape=[2, 3], dtype=float32, place=CUDAPlace(0), stop_gradient=True,\n",
|
||||||
|
" [[ 2.05994511, -inf. , -inf. ],\n",
|
||||||
|
" [-0.40165186, 0.77547729, -0.64469045]])\n",
|
||||||
|
"x Tensor(shape=[2, 3], dtype=float32, place=CUDAPlace(0), stop_gradient=True,\n",
|
||||||
|
" [[ 2.05994511, -inf. , -inf. ],\n",
|
||||||
|
" [-0.40165186, 0.77547729, -0.64469045]])\n",
|
||||||
|
"2 Tensor(shape=[2, 3], dtype=float32, place=CUDAPlace(0), stop_gradient=True,\n",
|
||||||
|
" [[ 2.05994511, -inf. , -inf. ],\n",
|
||||||
|
" [-0.40165186, 0.77547729, -0.64469045]])\n",
|
||||||
|
"3 Tensor(shape=[2, 3], dtype=float32, place=CUDAPlace(0), stop_gradient=True,\n",
|
||||||
|
" [[ 0. , -inf. , -inf. ],\n",
|
||||||
|
" [-0.40165186, 0.77547729, -0.64469045]])\n",
|
||||||
|
"Tensor(shape=[2, 3], dtype=float32, place=CUDAPlace(0), stop_gradient=True,\n",
|
||||||
|
" [[ 0. , -inf. , -inf. ],\n",
|
||||||
|
" [-0.40165186, 0.77547729, -0.64469045]])\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"paddle.bool = 'bool'\n",
|
||||||
|
"\n",
|
||||||
|
"def masked_fill(xs:paddle.Tensor, mask:paddle.Tensor, value:float):\n",
|
||||||
|
" print(xs)\n",
|
||||||
|
" trues = paddle.ones_like(xs) * value\n",
|
||||||
|
" assert xs.shape == mask.shape\n",
|
||||||
|
" xs = paddle.where(mask, trues, xs)\n",
|
||||||
|
" return xs\n",
|
||||||
|
"\n",
|
||||||
|
"def masked_fill_(xs:paddle.Tensor, mask:paddle.Tensor, value:float):\n",
|
||||||
|
" print('x', xs)\n",
|
||||||
|
" trues = paddle.ones_like(xs) * value\n",
|
||||||
|
" assert xs.shape == mask.shape\n",
|
||||||
|
" ret = paddle.where(mask, trues, xs)\n",
|
||||||
|
" print('2', xs)\n",
|
||||||
|
" paddle.assign(ret, output=xs)\n",
|
||||||
|
" print('3', xs)\n",
|
||||||
|
"\n",
|
||||||
|
"paddle.Tensor.masked_fill = masked_fill\n",
|
||||||
|
"paddle.Tensor.masked_fill_ = masked_fill_\n",
|
||||||
|
"\n",
|
||||||
|
"def mask_finished_scores_pd(score: paddle.Tensor,\n",
|
||||||
|
" flag: paddle.Tensor) -> paddle.Tensor:\n",
|
||||||
|
" \"\"\"\n",
|
||||||
|
" If a sequence is finished, we only allow one alive branch. This function\n",
|
||||||
|
" aims to give one branch a zero score and the rest -inf score.\n",
|
||||||
|
" Args:\n",
|
||||||
|
" score (torch.Tensor): A real value array with shape\n",
|
||||||
|
" (batch_size * beam_size, beam_size).\n",
|
||||||
|
" flag (torch.Tensor): A bool array with shape\n",
|
||||||
|
" (batch_size * beam_size, 1).\n",
|
||||||
|
" Returns:\n",
|
||||||
|
" torch.Tensor: (batch_size * beam_size, beam_size).\n",
|
||||||
|
" \"\"\"\n",
|
||||||
|
" beam_size = score.shape[-1]\n",
|
||||||
|
" zero_mask = paddle.zeros_like(flag, dtype=paddle.bool)\n",
|
||||||
|
" if beam_size > 1:\n",
|
||||||
|
" unfinished = paddle.concat((zero_mask, flag.tile([1, beam_size - 1])),\n",
|
||||||
|
" axis=1)\n",
|
||||||
|
" finished = paddle.concat((flag, zero_mask.tile([1, beam_size - 1])),\n",
|
||||||
|
" axis=1)\n",
|
||||||
|
" else:\n",
|
||||||
|
" unfinished = zero_mask\n",
|
||||||
|
" finished = flag\n",
|
||||||
|
" print(unfinished)\n",
|
||||||
|
" print(finished)\n",
|
||||||
|
" \n",
|
||||||
|
" #score.masked_fill_(unfinished, -float('inf'))\n",
|
||||||
|
" #score.masked_fill_(finished, 0)\n",
|
||||||
|
"# infs = paddle.ones_like(score) * -float('inf')\n",
|
||||||
|
"# score = paddle.where(unfinished, infs, score)\n",
|
||||||
|
"# score = paddle.where(finished, paddle.zeros_like(score), score)\n",
|
||||||
|
"\n",
|
||||||
|
"# score = score.masked_fill(unfinished, -float('inf'))\n",
|
||||||
|
"# score = score.masked_fill(finished, 0)\n",
|
||||||
|
" score.masked_fill_(unfinished, -float('inf'))\n",
|
||||||
|
" score.masked_fill_(finished, 0)\n",
|
||||||
|
" return score\n",
|
||||||
|
"\n",
|
||||||
|
"r = mask_finished_scores_pd(score, flag)\n",
|
||||||
|
"print(r)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 57,
|
||||||
|
"id": "vocal-prime",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"<bound method PyCapsule.value of Tensor(shape=[2, 3], dtype=float32, place=CUDAPlace(0), stop_gradient=True,\n",
|
||||||
|
" [[ 0. , -inf. , -inf. ],\n",
|
||||||
|
" [-0.40165186, 0.77547729, -0.64469045]])>"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 57,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"score.value"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 71,
|
||||||
|
"id": "bacterial-adolescent",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from typing import Union, Any"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 72,
|
||||||
|
"id": "absent-fiber",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"def repeat(xs : paddle.Tensor, *size: Any):\n",
|
||||||
|
" print(size)\n",
|
||||||
|
" return paddle.tile(xs, size)\n",
|
||||||
|
"paddle.Tensor.repeat = repeat"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 73,
|
||||||
|
"id": "material-harbor",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"(1, 2)\n",
|
||||||
|
"Tensor(shape=[2, 2], dtype=bool, place=CUDAPlace(0), stop_gradient=True,\n",
|
||||||
|
" [[True , True ],\n",
|
||||||
|
" [False, False]])\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"flag = paddle.ones((2, 1), dtype='bool')\n",
|
||||||
|
"flag[1] = False\n",
|
||||||
|
"print(flag.repeat(1, 2))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 84,
|
||||||
|
"id": "acute-brighton",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"(Tensor(shape=[1], dtype=int64, place=CUDAPlace(0), stop_gradient=True,\n",
|
||||||
|
" [1]), 2)\n",
|
||||||
|
"Tensor(shape=[2, 2], dtype=bool, place=CUDAPlace(0), stop_gradient=True,\n",
|
||||||
|
" [[True , True ],\n",
|
||||||
|
" [False, False]])\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"flag = paddle.ones((2, 1), dtype='bool')\n",
|
||||||
|
"flag[1] = False\n",
|
||||||
|
"print(flag.repeat(paddle.to_tensor(1), 2))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 85,
|
||||||
|
"id": "european-rugby",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"def size(xs, *args: int):\n",
|
||||||
|
" nargs = len(args)\n",
|
||||||
|
" s = paddle.shape(xs)\n",
|
||||||
|
" assert(nargs <= 1)\n",
|
||||||
|
" if nargs == 1:\n",
|
||||||
|
" return s[args[0]]\n",
|
||||||
|
" else:\n",
|
||||||
|
" return s\n",
|
||||||
|
"paddle.Tensor.size = size"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 86,
|
||||||
|
"id": "moral-special",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"Tensor(shape=[2], dtype=int32, place=CPUPlace, stop_gradient=True,\n",
|
||||||
|
" [2, 1])"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 86,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"flag.size()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 87,
|
||||||
|
"id": "ahead-coach",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"Tensor(shape=[1], dtype=int32, place=CPUPlace, stop_gradient=True,\n",
|
||||||
|
" [1])"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 87,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"flag.size(1)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 88,
|
||||||
|
"id": "incomplete-fitness",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"Tensor(shape=[1], dtype=int32, place=CPUPlace, stop_gradient=True,\n",
|
||||||
|
" [2])"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 88,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"flag.size(0)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "upset-connectivity",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": []
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.7.0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 5
|
||||||
|
}
|
@ -0,0 +1,231 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
|
"id": "designing-borough",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stderr",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/ipykernel/ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.\n",
|
||||||
|
" and should_run_async(code)\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"[[ 0.0000000e+00 0.0000000e+00 0.0000000e+00 ... 0.0000000e+00\n",
|
||||||
|
" 0.0000000e+00 0.0000000e+00]\n",
|
||||||
|
" [ 8.4147096e-01 8.0196178e-01 7.6172036e-01 ... 1.2409373e-04\n",
|
||||||
|
" 1.1547816e-04 1.0746076e-04]\n",
|
||||||
|
" [ 9.0929741e-01 9.5814437e-01 9.8704624e-01 ... 2.4818745e-04\n",
|
||||||
|
" 2.3095631e-04 2.1492151e-04]\n",
|
||||||
|
" ...\n",
|
||||||
|
" [ 3.7960774e-01 7.4510968e-01 7.3418564e-01 ... 1.2036801e-02\n",
|
||||||
|
" 1.1201146e-02 1.0423505e-02]\n",
|
||||||
|
" [-5.7338190e-01 -8.9752287e-02 -4.1488394e-02 ... 1.2160885e-02\n",
|
||||||
|
" 1.1316618e-02 1.0530960e-02]\n",
|
||||||
|
" [-9.9920684e-01 -8.5234123e-01 -7.8794664e-01 ... 1.2284970e-02\n",
|
||||||
|
" 1.1432089e-02 1.0638415e-02]]\n",
|
||||||
|
"True\n",
|
||||||
|
"True\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"import torch\n",
|
||||||
|
"import math\n",
|
||||||
|
"import numpy as np\n",
|
||||||
|
"\n",
|
||||||
|
"max_len=100\n",
|
||||||
|
"d_model=256\n",
|
||||||
|
"\n",
|
||||||
|
"pe = torch.zeros(max_len, d_model)\n",
|
||||||
|
"position = torch.arange(0, max_len,\n",
|
||||||
|
" dtype=torch.float32).unsqueeze(1)\n",
|
||||||
|
"toruch_position = position\n",
|
||||||
|
"div_term = torch.exp(\n",
|
||||||
|
" torch.arange(0, d_model, 2, dtype=torch.float32) *\n",
|
||||||
|
" -(math.log(10000.0) / d_model))\n",
|
||||||
|
"tourch_div_term = div_term.cpu().detach().numpy()\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"torhc_sin = torch.sin(position * div_term)\n",
|
||||||
|
"torhc_cos = torch.cos(position * div_term)\n",
|
||||||
|
"print(torhc_sin.cpu().detach().numpy())\n",
|
||||||
|
"np_sin = np.sin((position * div_term).cpu().detach().numpy())\n",
|
||||||
|
"np_cos = np.cos((position * div_term).cpu().detach().numpy())\n",
|
||||||
|
"print(np.allclose(np_sin, torhc_sin.cpu().detach().numpy()))\n",
|
||||||
|
"print(np.allclose(np_cos, torhc_cos.cpu().detach().numpy()))\n",
|
||||||
|
"pe[:, 0::2] = torhc_sin\n",
|
||||||
|
"pe[:, 1::2] = torhc_cos\n",
|
||||||
|
"tourch_pe = pe.cpu().detach().numpy()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 5,
|
||||||
|
"id": "swiss-referral",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"True\n",
|
||||||
|
"True\n",
|
||||||
|
"False\n",
|
||||||
|
"False\n",
|
||||||
|
"False\n",
|
||||||
|
"False\n",
|
||||||
|
"[[ 1. 1. 1. ... 1. 1.\n",
|
||||||
|
" 1. ]\n",
|
||||||
|
" [ 0.5403023 0.59737533 0.6479059 ... 1. 1.\n",
|
||||||
|
" 1. ]\n",
|
||||||
|
" [-0.41614684 -0.28628543 -0.1604359 ... 0.99999994 1.\n",
|
||||||
|
" 1. ]\n",
|
||||||
|
" ...\n",
|
||||||
|
" [-0.92514753 -0.66694194 -0.67894876 ... 0.9999276 0.99993724\n",
|
||||||
|
" 0.9999457 ]\n",
|
||||||
|
" [-0.81928825 -0.9959641 -0.999139 ... 0.99992603 0.999936\n",
|
||||||
|
" 0.99994457]\n",
|
||||||
|
" [ 0.03982088 -0.52298605 -0.6157435 ... 0.99992454 0.9999347\n",
|
||||||
|
" 0.99994344]]\n",
|
||||||
|
"----\n",
|
||||||
|
"[[ 1. 1. 1. ... 1. 1.\n",
|
||||||
|
" 1. ]\n",
|
||||||
|
" [ 0.54030234 0.59737533 0.6479059 ... 1. 1.\n",
|
||||||
|
" 1. ]\n",
|
||||||
|
" [-0.41614684 -0.28628543 -0.1604359 ... 1. 1.\n",
|
||||||
|
" 1. ]\n",
|
||||||
|
" ...\n",
|
||||||
|
" [-0.92514753 -0.66694194 -0.67894876 ... 0.9999276 0.9999373\n",
|
||||||
|
" 0.9999457 ]\n",
|
||||||
|
" [-0.81928825 -0.9959641 -0.999139 ... 0.99992603 0.999936\n",
|
||||||
|
" 0.99994457]\n",
|
||||||
|
" [ 0.03982088 -0.5229861 -0.6157435 ... 0.99992454 0.9999347\n",
|
||||||
|
" 0.99994344]]\n",
|
||||||
|
")))))))\n",
|
||||||
|
"[[ 0.0000000e+00 0.0000000e+00 0.0000000e+00 ... 0.0000000e+00\n",
|
||||||
|
" 0.0000000e+00 0.0000000e+00]\n",
|
||||||
|
" [ 8.4147096e-01 8.0196178e-01 7.6172036e-01 ... 1.2409373e-04\n",
|
||||||
|
" 1.1547816e-04 1.0746076e-04]\n",
|
||||||
|
" [ 9.0929741e-01 9.5814437e-01 9.8704624e-01 ... 2.4818745e-04\n",
|
||||||
|
" 2.3095631e-04 2.1492151e-04]\n",
|
||||||
|
" ...\n",
|
||||||
|
" [ 3.7960774e-01 7.4510968e-01 7.3418564e-01 ... 1.2036801e-02\n",
|
||||||
|
" 1.1201146e-02 1.0423505e-02]\n",
|
||||||
|
" [-5.7338190e-01 -8.9752287e-02 -4.1488394e-02 ... 1.2160885e-02\n",
|
||||||
|
" 1.1316618e-02 1.0530960e-02]\n",
|
||||||
|
" [-9.9920684e-01 -8.5234123e-01 -7.8794664e-01 ... 1.2284970e-02\n",
|
||||||
|
" 1.1432089e-02 1.0638415e-02]]\n",
|
||||||
|
"----\n",
|
||||||
|
"[[ 0.0000000e+00 0.0000000e+00 0.0000000e+00 ... 0.0000000e+00\n",
|
||||||
|
" 0.0000000e+00 0.0000000e+00]\n",
|
||||||
|
" [ 8.4147096e-01 8.0196178e-01 7.6172036e-01 ... 1.2409373e-04\n",
|
||||||
|
" 1.1547816e-04 1.0746076e-04]\n",
|
||||||
|
" [ 9.0929741e-01 9.5814437e-01 9.8704624e-01 ... 2.4818745e-04\n",
|
||||||
|
" 2.3095631e-04 2.1492151e-04]\n",
|
||||||
|
" ...\n",
|
||||||
|
" [ 3.7960774e-01 7.4510968e-01 7.3418564e-01 ... 1.2036801e-02\n",
|
||||||
|
" 1.1201146e-02 1.0423505e-02]\n",
|
||||||
|
" [-5.7338190e-01 -8.9752287e-02 -4.1488394e-02 ... 1.2160885e-02\n",
|
||||||
|
" 1.1316618e-02 1.0530960e-02]\n",
|
||||||
|
" [-9.9920684e-01 -8.5234123e-01 -7.8794664e-01 ... 1.2284970e-02\n",
|
||||||
|
" 1.1432089e-02 1.0638415e-02]]\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"import paddle\n",
|
||||||
|
"paddle.set_device('cpu')\n",
|
||||||
|
"ppe = paddle.zeros((max_len, d_model), dtype='float32')\n",
|
||||||
|
"position = paddle.arange(0, max_len,\n",
|
||||||
|
" dtype='float32').unsqueeze(1)\n",
|
||||||
|
"print(np.allclose(position.numpy(), toruch_position))\n",
|
||||||
|
"div_term = paddle.exp(\n",
|
||||||
|
" paddle.arange(0, d_model, 2, dtype='float32') *\n",
|
||||||
|
" -(math.log(10000.0) / d_model))\n",
|
||||||
|
"print(np.allclose(div_term.numpy(), tourch_div_term))\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"p_sin = paddle.sin(position * div_term)\n",
|
||||||
|
"p_cos = paddle.cos(position * div_term)\n",
|
||||||
|
"print(np.allclose(np_sin, p_sin.numpy(), rtol=1.e-6, atol=0))\n",
|
||||||
|
"print(np.allclose(np_cos, p_cos.numpy(), rtol=1.e-6, atol=0))\n",
|
||||||
|
"ppe[:, 0::2] = p_sin\n",
|
||||||
|
"ppe[:, 1::2] = p_cos\n",
|
||||||
|
"print(np.allclose(p_sin.numpy(), torhc_sin.cpu().detach().numpy()))\n",
|
||||||
|
"print(np.allclose(p_cos.numpy(), torhc_cos.cpu().detach().numpy()))\n",
|
||||||
|
"print(p_cos.numpy())\n",
|
||||||
|
"print(\"----\")\n",
|
||||||
|
"print(torhc_cos.cpu().detach().numpy())\n",
|
||||||
|
"print(\")))))))\")\n",
|
||||||
|
"print(p_sin.numpy())\n",
|
||||||
|
"print(\"----\")\n",
|
||||||
|
"print(torhc_sin.cpu().detach().numpy())"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 4,
|
||||||
|
"id": "integrated-boards",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"False\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"print(np.allclose(ppe.numpy(), pe.numpy()))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "flying-reserve",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": []
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "revised-divide",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": []
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.7.0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 5
|
||||||
|
}
|
File diff suppressed because one or more lines are too long
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,37 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
# Build the bundled third-party tools (creates tools/venv used below).
setup_env(){
    cd tools && make && cd -
}
|
||||||
|
|
||||||
|
# Install project dependencies via setup.sh when it exists.
# Fix: the original checked `$?` after the closing `fi`, which tests the
# status of the whole `if` compound (always 0 when setup.sh is absent),
# not `bash setup.sh` itself. Tie the failure check to the command.
install(){
    if [ -f "setup.sh" ]; then
        bash setup.sh || exit 1
        #export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH
    fi
}
|
||||||
|
|
||||||
|
# Dump OS release and compiler versions for CI log debugging.
print_env(){
    cat /etc/lsb-release
    gcc -v
    g++ -v
}
|
||||||
|
|
||||||
|
# EXIT-trap handler: report the failure on stderr and exit non-zero.
abort(){
    echo "Run install failed" 1>&2
    echo "Please check your code" 1>&2
    exit 1
}
|
||||||
|
|
||||||
|
# Any exit (including via `set -e`) before the trap is cleared runs abort().
trap 'abort' 0
set -e

print_env
setup_env
source tools/venv/bin/activate
install

# Success: clear the EXIT trap so abort() does not fire on normal exit.
trap : 0
|
@ -0,0 +1,468 @@
|
|||||||
|
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
|
||||||
|
" Maintainer:
|
||||||
|
" Amir Salihefendic — @amix3k
|
||||||
|
"
|
||||||
|
" Awesome_version:
|
||||||
|
" Get this config, nice color schemes and lots of plugins!
|
||||||
|
"
|
||||||
|
" Install the awesome version from:
|
||||||
|
"
|
||||||
|
" https://github.com/amix/vimrc
|
||||||
|
"
|
||||||
|
" Sections:
|
||||||
|
" -> General
|
||||||
|
" -> VIM user interface
|
||||||
|
" -> Colors and Fonts
|
||||||
|
" -> Files and backups
|
||||||
|
" -> Text, tab and indent related
|
||||||
|
" -> Visual mode related
|
||||||
|
" -> Moving around, tabs and buffers
|
||||||
|
" -> Status line
|
||||||
|
" -> Editing mappings
|
||||||
|
" -> vimgrep searching and cope displaying
|
||||||
|
" -> Spell checking
|
||||||
|
" -> Misc
|
||||||
|
" -> Helper functions
|
||||||
|
"
|
||||||
|
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
|
||||||
|
|
||||||
|
|
||||||
|
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
|
||||||
|
" => General
|
||||||
|
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
|
||||||
|
" Sets how many lines of history VIM has to remember
|
||||||
|
set history=500
|
||||||
|
|
||||||
|
" Enable filetype plugins
|
||||||
|
filetype plugin on
|
||||||
|
filetype indent on
|
||||||
|
|
||||||
|
" Set to auto read when a file is changed from the outside
|
||||||
|
set autoread
|
||||||
|
au FocusGained,BufEnter * checktime
|
||||||
|
|
||||||
|
" With a map leader it's possible to do extra key combinations
|
||||||
|
" like <leader>w saves the current file
|
||||||
|
let mapleader = ","
|
||||||
|
|
||||||
|
" Fast saving
|
||||||
|
nmap <leader>w :w!<cr>
|
||||||
|
|
||||||
|
" :W sudo saves the file
|
||||||
|
" (useful for handling the permission-denied error)
|
||||||
|
command! W execute 'w !sudo tee % > /dev/null' <bar> edit!
|
||||||
|
|
||||||
|
|
||||||
|
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
|
||||||
|
" => VIM user interface
|
||||||
|
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
|
||||||
|
" Set 7 lines to the cursor - when moving vertically using j/k
|
||||||
|
set so=7
|
||||||
|
|
||||||
|
" Avoid garbled characters in Chinese language windows OS
|
||||||
|
let $LANG='en'
|
||||||
|
set langmenu=en
|
||||||
|
source $VIMRUNTIME/delmenu.vim
|
||||||
|
source $VIMRUNTIME/menu.vim
|
||||||
|
|
||||||
|
" Turn on the Wild menu
|
||||||
|
set wildmenu
|
||||||
|
|
||||||
|
" Ignore compiled files
|
||||||
|
set wildignore=*.o,*~,*.pyc
|
||||||
|
if has("win16") || has("win32")
|
||||||
|
set wildignore+=.git\*,.hg\*,.svn\*
|
||||||
|
else
|
||||||
|
set wildignore+=*/.git/*,*/.hg/*,*/.svn/*,*/.DS_Store
|
||||||
|
endif
|
||||||
|
|
||||||
|
"Always show current position
|
||||||
|
set ruler
|
||||||
|
|
||||||
|
" Height of the command bar
|
||||||
|
set cmdheight=1
|
||||||
|
|
||||||
|
" A buffer becomes hidden when it is abandoned
|
||||||
|
set hid
|
||||||
|
|
||||||
|
" Configure backspace so it acts as it should act
|
||||||
|
set backspace=eol,start,indent
|
||||||
|
set whichwrap+=<,>,h,l
|
||||||
|
|
||||||
|
" Ignore case when searching
|
||||||
|
set ignorecase
|
||||||
|
|
||||||
|
" When searching try to be smart about cases
|
||||||
|
set smartcase
|
||||||
|
|
||||||
|
" Highlight search results
|
||||||
|
set hlsearch
|
||||||
|
|
||||||
|
" Makes search act like search in modern browsers
|
||||||
|
set incsearch
|
||||||
|
|
||||||
|
" Don't redraw while executing macros (good performance config)
|
||||||
|
set lazyredraw
|
||||||
|
|
||||||
|
" For regular expressions turn magic on
|
||||||
|
set magic
|
||||||
|
|
||||||
|
" Show matching brackets when text indicator is over them
|
||||||
|
set showmatch
|
||||||
|
" How many tenths of a second to blink when matching brackets
|
||||||
|
set mat=2
|
||||||
|
|
||||||
|
" No annoying sound on errors
|
||||||
|
set noerrorbells
|
||||||
|
set novisualbell
|
||||||
|
set t_vb=
|
||||||
|
set tm=500
|
||||||
|
|
||||||
|
" Properly disable sound on errors on MacVim
|
||||||
|
if has("gui_macvim")
|
||||||
|
autocmd GUIEnter * set vb t_vb=
|
||||||
|
endif
|
||||||
|
|
||||||
|
|
||||||
|
" Add a bit extra margin to the left
|
||||||
|
set foldcolumn=1
|
||||||
|
|
||||||
|
|
||||||
|
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
|
||||||
|
" => Colors and Fonts
|
||||||
|
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
|
||||||
|
" Enable syntax highlighting
|
||||||
|
syntax enable
|
||||||
|
|
||||||
|
" Enable 256 colors palette in Gnome Terminal
|
||||||
|
if $COLORTERM == 'gnome-terminal'
|
||||||
|
set t_Co=256
|
||||||
|
endif
|
||||||
|
|
||||||
|
try
|
||||||
|
colorscheme desert
|
||||||
|
catch
|
||||||
|
endtry
|
||||||
|
|
||||||
|
set background=dark
|
||||||
|
|
||||||
|
" Set extra options when running in GUI mode
|
||||||
|
if has("gui_running")
|
||||||
|
set guioptions-=T
|
||||||
|
set guioptions-=e
|
||||||
|
set t_Co=256
|
||||||
|
set guitablabel=%M\ %t
|
||||||
|
endif
|
||||||
|
|
||||||
|
" Set utf8 as standard encoding and en_US as the standard language
|
||||||
|
set encoding=utf8
|
||||||
|
set fileencodings=ucs-bom,utf-8,cp936
|
||||||
|
set fileencoding=gb2312
|
||||||
|
set termencoding=utf-8
|
||||||
|
|
||||||
|
" Use Unix as the standard file type
|
||||||
|
set ffs=unix,dos,mac
|
||||||
|
|
||||||
|
|
||||||
|
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
|
||||||
|
" => Files, backups and undo
|
||||||
|
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
|
||||||
|
" Turn backup off, since most stuff is in SVN, git etc. anyway...
|
||||||
|
set nobackup
|
||||||
|
set nowb
|
||||||
|
set noswapfile
|
||||||
|
|
||||||
|
|
||||||
|
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
|
||||||
|
" => Text, tab and indent related
|
||||||
|
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
|
||||||
|
" Use spaces instead of tabs
|
||||||
|
set expandtab
|
||||||
|
|
||||||
|
" Be smart when using tabs ;)
|
||||||
|
set smarttab
|
||||||
|
|
||||||
|
" 1 tab == 4 spaces
|
||||||
|
set shiftwidth=4
|
||||||
|
set tabstop=4
|
||||||
|
|
||||||
|
" Linebreak on 500 characters
|
||||||
|
set lbr
|
||||||
|
set tw=500
|
||||||
|
|
||||||
|
set ai "Auto indent
|
||||||
|
set si "Smart indent
|
||||||
|
set wrap "Wrap lines
|
||||||
|
|
||||||
|
|
||||||
|
""""""""""""""""""""""""""""""
|
||||||
|
" => Visual mode related
|
||||||
|
""""""""""""""""""""""""""""""
|
||||||
|
" Visual mode pressing * or # searches for the current selection
|
||||||
|
" Super useful! From an idea by Michael Naumann
|
||||||
|
vnoremap <silent> * :<C-u>call VisualSelection('', '')<CR>/<C-R>=@/<CR><CR>
|
||||||
|
vnoremap <silent> # :<C-u>call VisualSelection('', '')<CR>?<C-R>=@/<CR><CR>
|
||||||
|
|
||||||
|
|
||||||
|
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
|
||||||
|
" => Moving around, tabs, windows and buffers
|
||||||
|
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
|
||||||
|
" Map <Space> to / (search) and Ctrl-<Space> to ? (backwards search)
|
||||||
|
map <space> /
|
||||||
|
map <C-space> ?
|
||||||
|
|
||||||
|
" Disable highlight when <leader><cr> is pressed
|
||||||
|
map <silent> <leader><cr> :noh<cr>
|
||||||
|
|
||||||
|
" Smart way to move between windows
|
||||||
|
map <C-j> <C-W>j
|
||||||
|
map <C-k> <C-W>k
|
||||||
|
map <C-h> <C-W>h
|
||||||
|
map <C-l> <C-W>l
|
||||||
|
|
||||||
|
" Close the current buffer
|
||||||
|
map <leader>bd :Bclose<cr>:tabclose<cr>gT
|
||||||
|
|
||||||
|
" Close all the buffers
|
||||||
|
map <leader>ba :bufdo bd<cr>
|
||||||
|
|
||||||
|
map <leader>l :bnext<cr>
|
||||||
|
map <leader>h :bprevious<cr>
|
||||||
|
|
||||||
|
" Useful mappings for managing tabs
|
||||||
|
map <leader>tn :tabnew<cr>
|
||||||
|
map <leader>to :tabonly<cr>
|
||||||
|
map <leader>tc :tabclose<cr>
|
||||||
|
map <leader>tm :tabmove
|
||||||
|
map <leader>t<leader> :tabnext
|
||||||
|
|
||||||
|
" Let 'tl' toggle between this and the last accessed tab
|
||||||
|
let g:lasttab = 1
|
||||||
|
nmap <Leader>tl :exe "tabn ".g:lasttab<CR>
|
||||||
|
au TabLeave * let g:lasttab = tabpagenr()
|
||||||
|
|
||||||
|
|
||||||
|
" Opens a new tab with the current buffer's path
|
||||||
|
" Super useful when editing files in the same directory
|
||||||
|
map <leader>te :tabedit <C-r>=expand("%:p:h")<cr>/
|
||||||
|
|
||||||
|
" Switch CWD to the directory of the open buffer
|
||||||
|
map <leader>cd :cd %:p:h<cr>:pwd<cr>
|
||||||
|
|
||||||
|
" Specify the behavior when switching between buffers
|
||||||
|
try
|
||||||
|
set switchbuf=useopen,usetab,newtab
|
||||||
|
set stal=2
|
||||||
|
catch
|
||||||
|
endtry
|
||||||
|
|
||||||
|
" Return to last edit position when opening files (You want this!)
|
||||||
|
au BufReadPost * if line("'\"") > 1 && line("'\"") <= line("$") | exe "normal! g'\"" | endif
|
||||||
|
|
||||||
|
|
||||||
|
""""""""""""""""""""""""""""""
|
||||||
|
" => Status line
|
||||||
|
""""""""""""""""""""""""""""""
|
||||||
|
" Always show the status line
|
||||||
|
set laststatus=2
|
||||||
|
|
||||||
|
" Format the status line
|
||||||
|
set statusline=\ %{HasPaste()}%F%m%r%h\ %w\ \ CWD:\ %r%{getcwd()}%h\ \ \ Line:\ %l\ \ Column:\ %c
|
||||||
|
|
||||||
|
|
||||||
|
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
|
||||||
|
" => Editing mappings
|
||||||
|
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
|
||||||
|
" Remap VIM 0 to first non-blank character
|
||||||
|
map 0 ^
|
||||||
|
|
||||||
|
" Move a line of text using ALT+[jk] or Command+[jk] on mac
|
||||||
|
nmap <M-j> mz:m+<cr>`z
|
||||||
|
nmap <M-k> mz:m-2<cr>`z
|
||||||
|
vmap <M-j> :m'>+<cr>`<my`>mzgv`yo`z
|
||||||
|
vmap <M-k> :m'<-2<cr>`>my`<mzgv`yo`z
|
||||||
|
|
||||||
|
if has("mac") || has("macunix")
|
||||||
|
nmap <D-j> <M-j>
|
||||||
|
nmap <D-k> <M-k>
|
||||||
|
vmap <D-j> <M-j>
|
||||||
|
vmap <D-k> <M-k>
|
||||||
|
endif
|
||||||
|
|
||||||
|
" Delete trailing white space on save, useful for some filetypes ;)
|
||||||
|
" Strip trailing whitespace from every line, preserving the cursor
" position and the last-search register across the substitution.
fun! CleanExtraSpaces()
    let save_cursor = getpos(".")
    let old_query = getreg('/')
    silent! %s/\s\+$//e
    call setpos('.', save_cursor)
    call setreg('/', old_query)
endfun
|
||||||
|
|
||||||
|
if has("autocmd")
|
||||||
|
autocmd BufWritePre *.txt,*.js,*.py,*.wiki,*.sh,*.coffee :call CleanExtraSpaces()
|
||||||
|
endif
|
||||||
|
|
||||||
|
|
||||||
|
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
|
||||||
|
" => Spell checking
|
||||||
|
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
|
||||||
|
" Pressing ,ss will toggle and untoggle spell checking
|
||||||
|
map <leader>ss :setlocal spell!<cr>
|
||||||
|
|
||||||
|
" Shortcuts using <leader>
|
||||||
|
map <leader>sn ]s
|
||||||
|
map <leader>sp [s
|
||||||
|
map <leader>sa zg
|
||||||
|
map <leader>s? z=
|
||||||
|
|
||||||
|
|
||||||
|
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
|
||||||
|
" => Misc
|
||||||
|
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
|
||||||
|
" Remove the Windows ^M - when the encodings gets messed up
|
||||||
|
noremap <Leader>m mmHmt:%s/<C-V><cr>//ge<cr>'tzt'm
|
||||||
|
|
||||||
|
" Quickly open a buffer for scribble
|
||||||
|
map <leader>q :e ~/buffer<cr>
|
||||||
|
|
||||||
|
" Quickly open a markdown buffer for scribble
|
||||||
|
map <leader>x :e ~/buffer.md<cr>
|
||||||
|
|
||||||
|
" Toggle paste mode on and off
|
||||||
|
map <leader>pp :setlocal paste!<cr>
|
||||||
|
|
||||||
|
|
||||||
|
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
|
||||||
|
" => Helper functions
|
||||||
|
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
|
||||||
|
" Returns true if paste mode is enabled
|
||||||
|
" Statusline helper: returns 'PASTE MODE ' when 'paste' is set, else ''.
function! HasPaste()
    if &paste
        return 'PASTE MODE '
    endif
    return ''
endfunction
|
||||||
|
|
||||||
|
" Don't close window, when deleting a buffer
|
||||||
|
" :Bclose deletes the current buffer without closing its window.
command! Bclose call <SID>BufcloseCloseIt()
function! <SID>BufcloseCloseIt()
    let l:currentBufNum = bufnr("%")
    let l:alternateBufNum = bufnr("#")

    " Switch the window to another buffer first (alternate if listed).
    if buflisted(l:alternateBufNum)
        buffer #
    else
        bnext
    endif

    " Still on the same buffer (nothing else to show): open a fresh one.
    if bufnr("%") == l:currentBufNum
        new
    endif

    " Now it is safe to delete the original buffer.
    if buflisted(l:currentBufNum)
        execute("bdelete! ".l:currentBufNum)
    endif
endfunction
|
||||||
|
|
||||||
|
" Pre-type ':' followed by a:str on the command line (not executed,
" letting the user review/edit before pressing Enter).
function! CmdLine(str)
    call feedkeys(":" . a:str)
endfunction
|
||||||
|
|
||||||
|
" Act on the current visual selection.
" a:direction: 'gv' pre-types an :Ack search, 'replace' pre-types :%s,
" anything else just sets the search register for * / # searching.
function! VisualSelection(direction, extra_filter) range
    let l:saved_reg = @"
    execute "normal! vgvy"

    " Escape regex metacharacters; drop the trailing newline of the yank.
    let l:pattern = escape(@", "\\/.*'$^~[]")
    let l:pattern = substitute(l:pattern, "\n$", "", "")

    if a:direction == 'gv'
        call CmdLine("Ack '" . l:pattern . "' " )
    elseif a:direction == 'replace'
        call CmdLine("%s" . '/'. l:pattern . '/')
    endif

    " Publish the pattern as the search register; restore the unnamed one.
    let @/ = l:pattern
    let @" = l:saved_reg
endfunction
|
||||||
|
|
||||||
|
|
||||||
|
""""""""""""""""""""""""""""""
|
||||||
|
" => Python section
|
||||||
|
""""""""""""""""""""""""""""""
|
||||||
|
let python_highlight_all = 1
|
||||||
|
au FileType python syn keyword pythonDecorator True None False self
|
||||||
|
|
||||||
|
au BufNewFile,BufRead *.jinja set syntax=htmljinja
|
||||||
|
au BufNewFile,BufRead *.mako set ft=mako
|
||||||
|
|
||||||
|
au FileType python map <buffer> F :set foldmethod=indent<cr>
|
||||||
|
|
||||||
|
au FileType python inoremap <buffer> $r return
|
||||||
|
au FileType python inoremap <buffer> $i import
|
||||||
|
au FileType python inoremap <buffer> $p print
|
||||||
|
au FileType python inoremap <buffer> $f # --- <esc>a
|
||||||
|
au FileType python map <buffer> <leader>1 /class
|
||||||
|
au FileType python map <buffer> <leader>2 /def
|
||||||
|
au FileType python map <buffer> <leader>C ?class
|
||||||
|
au FileType python map <buffer> <leader>D ?def
|
||||||
|
|
||||||
|
|
||||||
|
""""""""""""""""""""""""""""""
|
||||||
|
" => JavaScript section
|
||||||
|
"""""""""""""""""""""""""""""""
|
||||||
|
au FileType javascript call JavaScriptFold()
|
||||||
|
au FileType javascript setl fen
|
||||||
|
au FileType javascript setl nocindent
|
||||||
|
|
||||||
|
au FileType javascript imap <C-t> $log();<esc>hi
|
||||||
|
au FileType javascript imap <C-a> alert();<esc>hi
|
||||||
|
|
||||||
|
au FileType javascript inoremap <buffer> $r return
|
||||||
|
au FileType javascript inoremap <buffer> $f // --- PH<esc>FP2xi
|
||||||
|
|
||||||
|
" Syntax-based brace folding for JavaScript buffers, with a compact fold
" text that shows the folded region's first line as 'prefix {...}'.
function! JavaScriptFold()
    setl foldmethod=syntax
    setl foldlevelstart=1
    syn region foldBraces start=/{/ end=/}/ transparent fold keepend extend

    function! FoldText()
        return substitute(getline(v:foldstart), '{.*', '{...}', '')
    endfunction
    setl foldtext=FoldText()
endfunction
|
||||||
|
|
||||||
|
|
||||||
|
""""""""""""""""""""""""""""""
|
||||||
|
" => CoffeeScript section
|
||||||
|
"""""""""""""""""""""""""""""""
|
||||||
|
" Indent-based folding for CoffeeScript buffers.
function! CoffeeScriptFold()
    setl foldmethod=indent
    setl foldlevelstart=1
endfunction
|
||||||
|
au FileType coffee call CoffeeScriptFold()
|
||||||
|
|
||||||
|
au FileType gitcommit call setpos('.', [0, 1, 1, 0])
|
||||||
|
|
||||||
|
|
||||||
|
""""""""""""""""""""""""""""""
|
||||||
|
" => Shell section
|
||||||
|
""""""""""""""""""""""""""""""
|
||||||
|
if exists('$TMUX')
|
||||||
|
if has('nvim')
|
||||||
|
set termguicolors
|
||||||
|
else
|
||||||
|
set term=screen-256color
|
||||||
|
endif
|
||||||
|
endif
|
||||||
|
|
||||||
|
|
||||||
|
""""""""""""""""""""""""""""""
|
||||||
|
" => Twig section
|
||||||
|
""""""""""""""""""""""""""""""
|
||||||
|
autocmd BufRead *.twig set syntax=html filetype=html
|
||||||
|
|
||||||
|
|
||||||
|
""""""""""""""""""""""""""""""
|
||||||
|
" => Markdown
|
||||||
|
""""""""""""""""""""""""""""""
|
||||||
|
let vim_markdown_folding_disabled = 1
|
@ -0,0 +1,13 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
@ -0,0 +1,48 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
"""Export for U2 model."""
|
||||||
|
from deepspeech.exps.u2.config import get_cfg_defaults
|
||||||
|
from deepspeech.exps.u2.model import U2Tester as Tester
|
||||||
|
from deepspeech.training.cli import default_argument_parser
|
||||||
|
from deepspeech.utils.utility import print_arguments
|
||||||
|
|
||||||
|
|
||||||
|
def main_sp(config, args):
    """Build a Tester from config/args and export the trained model."""
    tester = Tester(config, args)
    tester.setup()
    tester.run_export()
|
||||||
|
|
||||||
|
|
||||||
|
def main(config, args):
    """Entry point: model export always runs in a single process."""
    main_sp(config, args)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    parser = default_argument_parser()
    args = parser.parse_args()
    print_arguments(args, globals())

    # Build config: defaults, then YAML file, then CLI overrides.
    # https://yaml.org/type/float.html
    config = get_cfg_defaults()
    if args.config:
        config.merge_from_file(args.config)
    if args.opts:
        config.merge_from_list(args.opts)
    config.freeze()
    print(config)
    # Optionally dump the fully-resolved config for reproducibility.
    if args.dump_config:
        with open(args.dump_config, 'w') as f:
            print(config, file=f)

    main(config, args)
|
@ -0,0 +1,59 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
"""Trainer for U2 model."""
|
||||||
|
import cProfile
|
||||||
|
import os
|
||||||
|
|
||||||
|
from paddle import distributed as dist
|
||||||
|
|
||||||
|
from deepspeech.exps.u2.config import get_cfg_defaults
|
||||||
|
from deepspeech.exps.u2.model import U2Trainer as Trainer
|
||||||
|
from deepspeech.training.cli import default_argument_parser
|
||||||
|
from deepspeech.utils.utility import print_arguments
|
||||||
|
|
||||||
|
|
||||||
|
def main_sp(config, args):
    """Train the U2 model within a single (possibly spawned) process."""
    trainer = Trainer(config, args)
    trainer.setup()
    trainer.run()
|
||||||
|
|
||||||
|
|
||||||
|
def main(config, args):
    """Launch training, spawning one worker per GPU when requested."""
    multi_gpu = args.device == "gpu" and args.nprocs > 1
    if multi_gpu:
        dist.spawn(main_sp, args=(config, args), nprocs=args.nprocs)
    else:
        main_sp(config, args)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    parser = default_argument_parser()
    args = parser.parse_args()
    print_arguments(args, globals())

    # Build config: defaults, then YAML file, then CLI overrides.
    # https://yaml.org/type/float.html
    config = get_cfg_defaults()
    if args.config:
        config.merge_from_file(args.config)
    if args.opts:
        config.merge_from_list(args.opts)
    config.freeze()
    print(config)
    # Optionally dump the fully-resolved config for reproducibility.
    if args.dump_config:
        with open(args.dump_config, 'w') as f:
            print(config, file=f)

    # Setting for profiling
    pr = cProfile.Profile()
    pr.runcall(main, config, args)
    pr.dump_stats(os.path.join(args.output, 'train.profile'))
|
@ -0,0 +1,38 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
from yacs.config import CfgNode
|
||||||
|
|
||||||
|
from deepspeech.exps.u2.model import U2Tester
|
||||||
|
from deepspeech.exps.u2.model import U2Trainer
|
||||||
|
from deepspeech.io.dataset import ManifestDataset
|
||||||
|
from deepspeech.models.u2 import U2Model
|
||||||
|
|
||||||
|
# Root config node; each pipeline component contributes its own defaults.
_C = CfgNode()

# Dataset/feature defaults.
_C.data = ManifestDataset.params()

# Model architecture defaults.
_C.model = U2Model.params()

# Optimizer/scheduler/epoch defaults.
_C.training = U2Trainer.params()

# Decoding/evaluation defaults.
_C.decoding = U2Tester.params()
|
||||||
|
|
||||||
|
|
||||||
|
def get_cfg_defaults():
    """Get a yacs CfgNode object with default values for my_project.

    A clone is returned so callers can mutate it freely without
    altering the shared module-level defaults (the "local variable"
    use pattern).
    """
    cfg = _C.clone()
    cfg.set_new_allowed(True)
    return cfg
|
@ -0,0 +1,545 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
"""Contains U2 model."""
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
from collections import defaultdict
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import paddle
|
||||||
|
from paddle import distributed as dist
|
||||||
|
from paddle.io import DataLoader
|
||||||
|
from yacs.config import CfgNode
|
||||||
|
|
||||||
|
from deepspeech.io.collator import SpeechCollator
|
||||||
|
from deepspeech.io.dataset import ManifestDataset
|
||||||
|
from deepspeech.io.sampler import SortagradBatchSampler
|
||||||
|
from deepspeech.io.sampler import SortagradDistributedBatchSampler
|
||||||
|
from deepspeech.models.u2 import U2Model
|
||||||
|
from deepspeech.training.gradclip import ClipGradByGlobalNormWithLog
|
||||||
|
from deepspeech.training.scheduler import WarmupLR
|
||||||
|
from deepspeech.training.trainer import Trainer
|
||||||
|
from deepspeech.utils import error_rate
|
||||||
|
from deepspeech.utils import layer_tools
|
||||||
|
from deepspeech.utils import mp_tools
|
||||||
|
from deepspeech.utils.log import Log
|
||||||
|
|
||||||
|
logger = Log(__name__).getlog()
|
||||||
|
|
||||||
|
|
||||||
|
class U2Trainer(Trainer):
|
||||||
|
    @classmethod
    def params(cls, config: Optional[CfgNode]=None) -> CfgNode:
        """Return default training hyper-parameters as a CfgNode.

        If *config* is given, the defaults are merged into it in place;
        the defaults node is returned either way.
        """
        # training config
        default = CfgNode(
            dict(
                n_epoch=50,  # train epochs
                log_interval=100,  # steps
                accum_grad=1,  # accum grad by # steps
                global_grad_clip=5.0,  # the global norm clip
            ))
        default.optim = 'adam'
        default.optim_conf = CfgNode(
            dict(
                lr=5e-4,  # learning rate
                weight_decay=1e-6,  # the coeff of weight decay
            ))
        default.scheduler = 'warmuplr'
        default.scheduler_conf = CfgNode(
            dict(
                warmup_steps=25000,
                lr_decay=1.0,  # learning rate decay
            ))

        if config is not None:
            config.merge_from_other_cfg(default)
        return default
|
||||||
|
|
||||||
|
    def __init__(self, config, args):
        """Delegate all setup to the base Trainer."""
        super().__init__(config, args)
|
||||||
|
|
||||||
|
    def train_batch(self, batch_index, batch_data, msg):
        """Run one forward/backward pass; step the optimizer every
        `accum_grad` batches (gradient accumulation).

        Args:
            batch_index: index of the batch within the current epoch.
            batch_data: collated inputs, unpacked into the model forward.
            msg: log-message prefix built by the caller (epoch/step info).
        """
        train_conf = self.config.training
        start = time.time()

        loss, attention_loss, ctc_loss = self.model(*batch_data)
        # loss div by `batch_size * accum_grad`
        loss /= train_conf.accum_grad
        loss.backward()
        layer_tools.print_grads(self.model, print_func=None)

        # Log the un-scaled loss; component losses may be falsy when the
        # model runs attention-only or CTC-only.
        losses_np = {'loss': float(loss) * train_conf.accum_grad}
        if attention_loss:
            losses_np['att_loss'] = float(attention_loss)
        if ctc_loss:
            losses_np['ctc_loss'] = float(ctc_loss)

        # Only update parameters (and the global iteration counter) once
        # `accum_grad` micro-batches of gradients have accumulated.
        if (batch_index + 1) % train_conf.accum_grad == 0:
            self.optimizer.step()
            self.optimizer.clear_grad()
            self.lr_scheduler.step()
            self.iteration += 1

        iteration_time = time.time() - start

        if (batch_index + 1) % train_conf.log_interval == 0:
            msg += "train time: {:>.3f}s, ".format(iteration_time)
            msg += "batch size: {}, ".format(self.config.data.batch_size)
            msg += "accum: {}, ".format(train_conf.accum_grad)
            msg += ', '.join('{}: {:>.6f}'.format(k, v)
                             for k, v in losses_np.items())
            logger.info(msg)

            # Only rank 0 writes visualizer (tensorboard-style) scalars.
            if dist.get_rank() == 0 and self.visualizer:
                losses_np_v = losses_np.copy()
                losses_np_v.update({"lr": self.lr_scheduler()})
                self.visualizer.add_scalars("step", losses_np_v,
                                            self.iteration - 1)
|
||||||
|
|
||||||
|
    @paddle.no_grad()
    def valid(self):
        """Evaluate the model on the validation loader.

        Returns:
            tuple: (total_loss, num_seen_utts) summed over batches whose
            loss was finite; the caller averages/all-reduces them.
        """
        self.model.eval()
        logger.info(f"Valid Total Examples: {len(self.valid_loader.dataset)}")
        valid_losses = defaultdict(list)
        # Starts at 1 so the running-average division below never hits zero.
        num_seen_utts = 1
        total_loss = 0.0
        for i, batch in enumerate(self.valid_loader):
            loss, attention_loss, ctc_loss = self.model(*batch)
            # Skip batches that produced NaN/Inf loss.
            if paddle.isfinite(loss):
                num_utts = batch[0].shape[0]
                num_seen_utts += num_utts
                total_loss += float(loss) * num_utts
                valid_losses['val_loss'].append(float(loss))
                if attention_loss:
                    valid_losses['val_att_loss'].append(float(attention_loss))
                if ctc_loss:
                    valid_losses['val_ctc_loss'].append(float(ctc_loss))

            if (i + 1) % self.config.training.log_interval == 0:
                valid_dump = {k: np.mean(v) for k, v in valid_losses.items()}
                valid_dump['val_history_loss'] = total_loss / num_seen_utts

                # logging
                msg = f"Valid: Rank: {dist.get_rank()}, "
                msg += "epoch: {}, ".format(self.epoch)
                msg += "step: {}, ".format(self.iteration)
                msg += "batch: {}/{}, ".format(i + 1, len(self.valid_loader))
                msg += ', '.join('{}: {:>.6f}'.format(k, v)
                                 for k, v in valid_dump.items())
                logger.info(msg)

        logger.info('Rank {} Val info val_loss {}'.format(
            dist.get_rank(), total_loss / num_seen_utts))
        return total_loss, num_seen_utts
|
||||||
|
|
||||||
|
    def train(self):
        """The training process control by step.

        Outer loop runs epochs until `training.n_epoch`; each epoch trains
        every batch via train_batch(), then validates, all-reduces the
        validation loss across workers, logs/visualizes it, checkpoints,
        and advances the epoch.
        """
        # !!!IMPORTANT!!!
        # Try to export the model by script, if fails, we should refine
        # the code to satisfy the script export requirements
        # script_model = paddle.jit.to_static(self.model)
        # script_model_path = str(self.checkpoint_dir / 'init')
        # paddle.jit.save(script_model, script_model_path)

        from_scratch = self.resume_or_scratch()
        if from_scratch:
            # save init model, i.e. 0 epoch
            self.save(tag='init')

        # Align the LR scheduler with the (possibly resumed) iteration.
        self.lr_scheduler.step(self.iteration)
        if self.parallel:
            self.train_loader.batch_sampler.set_epoch(self.epoch)

        logger.info(f"Train Total Examples: {len(self.train_loader.dataset)}")
        while self.epoch < self.config.training.n_epoch:
            self.model.train()
            try:
                data_start_time = time.time()
                for batch_index, batch in enumerate(self.train_loader):
                    # Time spent waiting on the data pipeline for this batch.
                    dataload_time = time.time() - data_start_time
                    msg = "Train: Rank: {}, ".format(dist.get_rank())
                    msg += "epoch: {}, ".format(self.epoch)
                    msg += "step: {}, ".format(self.iteration)
                    msg += "batch : {}/{}, ".format(batch_index + 1,
                                                    len(self.train_loader))
                    msg += "lr: {:>.8f}, ".format(self.lr_scheduler())
                    msg += "data time: {:>.3f}s, ".format(dataload_time)
                    self.train_batch(batch_index, batch, msg)
                    data_start_time = time.time()
            except Exception as e:
                logger.error(e)
                raise e

            # End-of-epoch validation, aggregated across all workers.
            total_loss, num_seen_utts = self.valid()
            if dist.get_world_size() > 1:
                num_seen_utts = paddle.to_tensor(num_seen_utts)
                # the default operator in all_reduce function is sum.
                dist.all_reduce(num_seen_utts)
                total_loss = paddle.to_tensor(total_loss)
                dist.all_reduce(total_loss)
                cv_loss = total_loss / num_seen_utts
                cv_loss = float(cv_loss)
            else:
                cv_loss = total_loss / num_seen_utts

            logger.info(
                'Epoch {} Val info val_loss {}'.format(self.epoch, cv_loss))
            if self.visualizer:
                self.visualizer.add_scalars(
                    'epoch', {'cv_loss': cv_loss,
                              'lr': self.lr_scheduler()}, self.epoch)
            self.save(tag=self.epoch, infos={'val_loss': cv_loss})
            self.new_epoch()
|
||||||
|
|
||||||
|
    def setup_dataloader(self):
        """Build train/valid/test DataLoaders from ``self.config``.

        Side effects: sets ``self.train_loader``, ``self.valid_loader`` and
        ``self.test_loader``.

        NOTE: a single cloned config object is mutated in place between the
        three dataset constructions, so statement order matters here.
        """
        # Work on a defrosted clone so the trainer's config stays untouched.
        config = self.config.clone()
        config.defrost()
        config.data.keep_transcription_text = False

        # train/valid dataset, return token ids
        config.data.manifest = config.data.train_manifest
        train_dataset = ManifestDataset.from_config(config)

        # Dev set: same pipeline as train but without augmentation.
        config.data.manifest = config.data.dev_manifest
        config.data.augmentation_config = ""
        dev_dataset = ManifestDataset.from_config(config)

        collate_fn = SpeechCollator(keep_transcription_text=False)
        if self.parallel:
            # Distributed sampler shards batches across ranks
            # (num_replicas/rank inferred from the dist environment).
            batch_sampler = SortagradDistributedBatchSampler(
                train_dataset,
                batch_size=config.data.batch_size,
                num_replicas=None,
                rank=None,
                shuffle=True,
                drop_last=True,
                sortagrad=config.data.sortagrad,
                shuffle_method=config.data.shuffle_method)
        else:
            batch_sampler = SortagradBatchSampler(
                train_dataset,
                shuffle=True,
                batch_size=config.data.batch_size,
                drop_last=True,
                sortagrad=config.data.sortagrad,
                shuffle_method=config.data.shuffle_method)
        self.train_loader = DataLoader(
            train_dataset,
            batch_sampler=batch_sampler,
            collate_fn=collate_fn,
            num_workers=config.data.num_workers, )
        self.valid_loader = DataLoader(
            dev_dataset,
            batch_size=config.data.batch_size,
            shuffle=False,
            drop_last=False,
            collate_fn=collate_fn)

        # test dataset, return raw text
        config.data.manifest = config.data.test_manifest
        config.data.keep_transcription_text = True
        config.data.augmentation_config = ""
        # filter test examples, will cause less examples, but no mismatch with training
        # and can use large batch size , save training time, so filter test egs now.
        # config.data.min_input_len = 0.0  # second
        # config.data.max_input_len = float('inf')  # second
        # config.data.min_output_len = 0.0  # tokens
        # config.data.max_output_len = float('inf')  # tokens
        # config.data.min_output_input_ratio = 0.00
        # config.data.max_output_input_ratio = float('inf')
        test_dataset = ManifestDataset.from_config(config)
        # return text ord id
        self.test_loader = DataLoader(
            test_dataset,
            batch_size=config.decoding.batch_size,
            shuffle=False,
            drop_last=False,
            collate_fn=SpeechCollator(keep_transcription_text=True))
        logger.info("Setup train/valid/test Dataloader!")
|
||||||
|
|
||||||
|
def setup_model(self):
|
||||||
|
config = self.config
|
||||||
|
model_conf = config.model
|
||||||
|
model_conf.defrost()
|
||||||
|
model_conf.input_dim = self.train_loader.dataset.feature_size
|
||||||
|
model_conf.output_dim = self.train_loader.dataset.vocab_size
|
||||||
|
model_conf.freeze()
|
||||||
|
model = U2Model.from_config(model_conf)
|
||||||
|
|
||||||
|
if self.parallel:
|
||||||
|
model = paddle.DataParallel(model)
|
||||||
|
|
||||||
|
logger.info(f"{model}")
|
||||||
|
layer_tools.print_params(model, logger.info)
|
||||||
|
|
||||||
|
train_config = config.training
|
||||||
|
optim_type = train_config.optim
|
||||||
|
optim_conf = train_config.optim_conf
|
||||||
|
scheduler_type = train_config.scheduler
|
||||||
|
scheduler_conf = train_config.scheduler_conf
|
||||||
|
|
||||||
|
grad_clip = ClipGradByGlobalNormWithLog(train_config.global_grad_clip)
|
||||||
|
weight_decay = paddle.regularizer.L2Decay(optim_conf.weight_decay)
|
||||||
|
|
||||||
|
if scheduler_type == 'expdecaylr':
|
||||||
|
lr_scheduler = paddle.optimizer.lr.ExponentialDecay(
|
||||||
|
learning_rate=optim_conf.lr,
|
||||||
|
gamma=scheduler_conf.lr_decay,
|
||||||
|
verbose=False)
|
||||||
|
elif scheduler_type == 'warmuplr':
|
||||||
|
lr_scheduler = WarmupLR(
|
||||||
|
learning_rate=optim_conf.lr,
|
||||||
|
warmup_steps=scheduler_conf.warmup_steps,
|
||||||
|
verbose=False)
|
||||||
|
else:
|
||||||
|
raise ValueError(f"Not support scheduler: {scheduler_type}")
|
||||||
|
|
||||||
|
if optim_type == 'adam':
|
||||||
|
optimizer = paddle.optimizer.Adam(
|
||||||
|
learning_rate=lr_scheduler,
|
||||||
|
parameters=model.parameters(),
|
||||||
|
weight_decay=weight_decay,
|
||||||
|
grad_clip=grad_clip)
|
||||||
|
else:
|
||||||
|
raise ValueError(f"Not support optim: {optim_type}")
|
||||||
|
|
||||||
|
self.model = model
|
||||||
|
self.optimizer = optimizer
|
||||||
|
self.lr_scheduler = lr_scheduler
|
||||||
|
logger.info("Setup model/optimizer/lr_scheduler!")
|
||||||
|
|
||||||
|
|
||||||
|
class U2Tester(U2Trainer):
    """Evaluator for a trained U2 model.

    Decodes the test set, accumulates error-rate and RTF statistics,
    writes per-utterance results plus a ``.err`` metadata file, and can
    export a static-graph inference model.
    """

    @classmethod
    def params(cls, config: Optional[CfgNode]=None) -> CfgNode:
        """Return the default decoding config; merged into ``config`` if given."""
        # decoding config
        default = CfgNode(
            dict(
                alpha=2.5,  # Coef of LM for beam search.
                beta=0.3,  # Coef of WC for beam search.
                cutoff_prob=1.0,  # Cutoff probability for pruning.
                cutoff_top_n=40,  # Cutoff number for pruning.
                lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm',  # Filepath for language model.
                decoding_method='attention',  # Decoding method. Options: 'attention', 'ctc_greedy_search',
                # 'ctc_prefix_beam_search', 'attention_rescoring'
                error_rate_type='wer',  # Error rate type for evaluation. Options `wer`, 'cer'
                num_proc_bsearch=8,  # # of CPUs for beam search.
                beam_size=10,  # Beam search width.
                batch_size=16,  # decoding batch size
                ctc_weight=0.0,  # ctc weight for attention rescoring decode mode.
                decoding_chunk_size=-1,  # decoding chunk size. Defaults to -1.
                # <0: for decoding, use full chunk.
                # >0: for decoding, use fixed chunk size as set.
                # 0: used for training, it's prohibited here.
                num_decoding_left_chunks=-1,  # number of left chunks for decoding. Defaults to -1.
                simulate_streaming=False,  # simulate streaming inference. Defaults to False.
            ))

        if config is not None:
            config.merge_from_other_cfg(default)
        return default

    def __init__(self, config, args):
        super().__init__(config, args)

    def ordid2token(self, texts, texts_len):
        """ ord() id to chr() chr

        Convert batches of character-ordinal id sequences back to strings,
        truncating each sequence to its valid length.
        """
        trans = []
        for text, n in zip(texts, texts_len):
            n = n.numpy().item()
            ids = text[:n]
            trans.append(''.join([chr(i) for i in ids]))
        return trans

    def compute_metrics(self, audio, audio_len, texts, texts_len, fout=None):
        """Decode one batch and compute error-rate statistics.

        Args:
            audio: batch of input features.
            audio_len: per-utterance feature lengths.
            texts: reference transcripts as ordinal ids.
            texts_len: reference lengths.
            fout: optional open file; decoded hypotheses are appended to it.

        Returns:
            dict with errors_sum / len_refs / num_ins / error_rate /
            error_rate_type / num_frames / decode_time, for accumulation
            by ``test()``.
        """
        cfg = self.config.decoding
        errors_sum, len_refs, num_ins = 0.0, 0, 0
        # Pick char- vs word-level metrics based on the configured type.
        errors_func = error_rate.char_errors if cfg.error_rate_type == 'cer' else error_rate.word_errors
        error_rate_func = error_rate.cer if cfg.error_rate_type == 'cer' else error_rate.wer

        start_time = time.time()
        text_feature = self.test_loader.dataset.text_feature
        target_transcripts = self.ordid2token(texts, texts_len)
        result_transcripts = self.model.decode(
            audio,
            audio_len,
            text_feature=text_feature,
            decoding_method=cfg.decoding_method,
            lang_model_path=cfg.lang_model_path,
            beam_alpha=cfg.alpha,
            beam_beta=cfg.beta,
            beam_size=cfg.beam_size,
            cutoff_prob=cfg.cutoff_prob,
            cutoff_top_n=cfg.cutoff_top_n,
            num_processes=cfg.num_proc_bsearch,
            ctc_weight=cfg.ctc_weight,
            decoding_chunk_size=cfg.decoding_chunk_size,
            num_decoding_left_chunks=cfg.num_decoding_left_chunks,
            simulate_streaming=cfg.simulate_streaming)
        decode_time = time.time() - start_time

        for target, result in zip(target_transcripts, result_transcripts):
            errors, len_ref = errors_func(target, result)
            errors_sum += errors
            len_refs += len_ref
            num_ins += 1
            if fout:
                fout.write(result + "\n")
            logger.info("\nTarget Transcription: %s\nOutput Transcription: %s" %
                        (target, result))
            logger.info("One example error rate [%s] = %f" %
                        (cfg.error_rate_type, error_rate_func(target, result)))

        return dict(
            errors_sum=errors_sum,
            len_refs=len_refs,
            num_ins=num_ins,  # num examples
            error_rate=errors_sum / len_refs,
            error_rate_type=cfg.error_rate_type,
            num_frames=audio_len.sum().numpy().item(),
            decode_time=decode_time)

    @mp_tools.rank_zero_only
    @paddle.no_grad()
    def test(self):
        """Decode the whole test set; log running and final error rate / RTF.

        Writes hypotheses to ``self.args.result_file`` and a JSON summary
        to ``<checkpoint_path>.err``. Runs on rank 0 only, without grads.
        """
        assert self.args.result_file
        self.model.eval()
        logger.info(f"Test Total Examples: {len(self.test_loader.dataset)}")

        stride_ms = self.test_loader.dataset.stride_ms
        error_rate_type = None
        errors_sum, len_refs, num_ins = 0.0, 0, 0
        num_frames = 0.0
        num_time = 0.0
        with open(self.args.result_file, 'w') as fout:
            for i, batch in enumerate(self.test_loader):
                metrics = self.compute_metrics(*batch, fout=fout)
                num_frames += metrics['num_frames']
                num_time += metrics["decode_time"]
                errors_sum += metrics['errors_sum']
                len_refs += metrics['len_refs']
                num_ins += metrics['num_ins']
                error_rate_type = metrics['error_rate_type']
                # RTF = decode seconds / audio seconds; num_frames * stride_ms
                # is presumably audio duration — NOTE(review): units suggest
                # stride_ms is actually in seconds here, confirm with dataset.
                rtf = num_time / (num_frames * stride_ms)
                logger.info(
                    "RTF: %f, Error rate [%s] (%d/?) = %f" %
                    (rtf, error_rate_type, num_ins, errors_sum / len_refs))

        rtf = num_time / (num_frames * stride_ms)
        msg = "Test: "
        msg += "epoch: {}, ".format(self.epoch)
        msg += "step: {}, ".format(self.iteration)
        msg += "RTF: {}, ".format(rtf)
        msg += "Final error rate [%s] (%d/%d) = %f" % (
            error_rate_type, num_ins, num_ins, errors_sum / len_refs)
        logger.info(msg)

        # test meta results
        err_meta_path = os.path.splitext(self.args.checkpoint_path)[0] + '.err'
        err_type_str = "{}".format(error_rate_type)  # NOTE: currently unused
        with open(err_meta_path, 'w') as f:
            data = json.dumps({
                "epoch": self.epoch,
                "step": self.iteration,
                "rtf": rtf,
                error_rate_type: errors_sum / len_refs,
                "dataset_hour": (num_frames * stride_ms) / 1000.0 / 3600.0,
                "process_hour": num_time / 1000.0 / 3600.0,
                "num_examples": num_ins,
                "err_sum": errors_sum,
                "ref_len": len_refs,
            })
            f.write(data + '\n')

    def run_test(self):
        """Restore a checkpoint (or start fresh) and run evaluation."""
        self.resume_or_scratch()
        try:
            self.test()
        except KeyboardInterrupt:
            sys.exit(-1)

    def load_inferspec(self):
        """infer model and input spec.

        Returns:
            nn.Layer: inference model
            List[paddle.static.InputSpec]: input spec.
        """
        # Local import to avoid a module-level circular dependency.
        from deepspeech.models.u2 import U2InferModel
        infer_model = U2InferModel.from_pretrained(self.test_loader.dataset,
                                                   self.config.model.clone(),
                                                   self.args.checkpoint_path)
        feat_dim = self.test_loader.dataset.feature_size
        input_spec = [
            paddle.static.InputSpec(
                shape=[None, feat_dim, None],
                dtype='float32'),  # audio, [B,D,T]
            paddle.static.InputSpec(shape=[None],
                                    dtype='int64'),  # audio_length, [B]
        ]
        return infer_model, input_spec

    def export(self):
        """Trace the inference model to a static graph and save it."""
        infer_model, input_spec = self.load_inferspec()
        assert isinstance(input_spec, list), type(input_spec)
        infer_model.eval()
        static_model = paddle.jit.to_static(infer_model, input_spec=input_spec)
        logger.info(f"Export code: {static_model.forward.code}")
        paddle.jit.save(static_model, self.args.export_path)

    def run_export(self):
        """Entry point for export; exits cleanly on Ctrl-C."""
        try:
            self.export()
        except KeyboardInterrupt:
            sys.exit(-1)

    def setup(self):
        """Setup the experiment.
        """
        paddle.set_device(self.args.device)

        self.setup_output_dir()
        self.setup_checkpointer()

        self.setup_dataloader()
        self.setup_model()

        self.iteration = 0
        self.epoch = 0

    def setup_output_dir(self):
        """Create a directory used for output.

        Uses ``--output`` when given, otherwise the grandparent directory
        of the checkpoint path.
        """
        # output dir
        if self.args.output:
            output_dir = Path(self.args.output).expanduser()
            output_dir.mkdir(parents=True, exist_ok=True)
        else:
            output_dir = Path(
                self.args.checkpoint_path).expanduser().parent.parent
            output_dir.mkdir(parents=True, exist_ok=True)

        self.output_dir = output_dir
|
@ -0,0 +1,170 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
"""Contains the volume perturb augmentation model."""
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
from deepspeech.frontend.augmentor.base import AugmentorBase
|
||||||
|
from deepspeech.utils.log import Log
|
||||||
|
|
||||||
|
logger = Log(__name__).getlog()
|
||||||
|
|
||||||
|
|
||||||
|
class SpecAugmentor(AugmentorBase):
    """Augmentation model for Time warping, Frequency masking, Time masking.

    SpecAugment: A Simple Data Augmentation Method for Automatic Speech Recognition
    https://arxiv.org/abs/1904.08779

    SpecAugment on Large Scale Datasets
    https://arxiv.org/abs/1912.05533
    """

    def __init__(self,
                 rng,
                 F,
                 T,
                 n_freq_masks,
                 n_time_masks,
                 p=1.0,
                 W=40,
                 adaptive_number_ratio=0,
                 adaptive_size_ratio=0,
                 max_n_time_masks=20):
        """SpecAugment class.

        Args:
            rng: random generator supporting ``uniform(low=..., high=...)``
                (e.g. ``np.random.RandomState``).
            F (int): parameter for frequency masking
            T (int): parameter for time masking
            n_freq_masks (int): number of frequency masks
            n_time_masks (int): number of time masks
            p (float): parameter for upperbound of the time mask
            W (int): parameter for time warping
            adaptive_number_ratio (float): adaptive multiplicity ratio for time masking
            adaptive_size_ratio (float): adaptive size ratio for time masking
            max_n_time_masks (int): maximum number of time masking
        """
        super().__init__()
        self._rng = rng

        self.W = W
        self.F = F
        self.T = T
        self.n_freq_masks = n_freq_masks
        self.n_time_masks = n_time_masks
        self.p = p

        # adaptive SpecAugment (arXiv:1912.05533): when enabled, mask count
        # and size are derived from the utterance length at transform time,
        # so the corresponding fixed parameters are zeroed out here.
        self.adaptive_number_ratio = adaptive_number_ratio
        self.adaptive_size_ratio = adaptive_size_ratio
        self.max_n_time_masks = max_n_time_masks

        if adaptive_number_ratio > 0:
            self.n_time_masks = 0
            # BUGFIX: message typo "set ot zero" -> "set to zero"
            logger.info('n_time_masks is set to zero for adaptive SpecAugment.')
        if adaptive_size_ratio > 0:
            self.T = 0
            logger.info('T is set to zero for adaptive SpecAugment.')

        # last applied masks (start_bin, end_bin) / (start_frame, end_frame),
        # exposed via the read-only properties below for inspection.
        self._freq_mask = None
        self._time_mask = None

    def librispeech_basic(self):
        """LibriSpeech basic (LB) policy from the SpecAugment paper."""
        self.W = 80
        self.F = 27
        self.T = 100
        self.n_freq_masks = 1
        self.n_time_masks = 1
        self.p = 1.0

    def librispeech_double(self):
        """LibriSpeech double (LD) policy from the SpecAugment paper."""
        self.W = 80
        self.F = 27
        self.T = 100
        self.n_freq_masks = 2
        self.n_time_masks = 2
        self.p = 1.0

    def switchboard_mild(self):
        """Switchboard mild (SM) policy from the SpecAugment paper."""
        self.W = 40
        self.F = 15
        self.T = 70
        self.n_freq_masks = 2
        self.n_time_masks = 2
        self.p = 0.2

    def switchboard_strong(self):
        """Switchboard strong (SS) policy from the SpecAugment paper."""
        self.W = 40
        self.F = 27
        self.T = 70
        self.n_freq_masks = 2
        self.n_time_masks = 2
        self.p = 0.2

    @property
    def freq_mask(self):
        """Last applied frequency mask as ``(f_start, f_end)`` or None."""
        return self._freq_mask

    @property
    def time_mask(self):
        """Last applied time mask as ``(t_start, t_end)`` or None."""
        return self._time_mask

    def time_warp(self, xs, W=40):
        """Time warping (not implemented).

        BUGFIX: the original signature was ``time_warp(xs, W=40)`` with no
        ``self`` — calling it as a method would have passed the instance as
        ``xs``.
        """
        raise NotImplementedError

    def mask_freq(self, xs, replace_with_zero=False):
        """Apply ``n_freq_masks`` random frequency masks to ``xs`` in place.

        Args:
            xs: spectrogram, shape [F, T] (freq bins x frames).
            replace_with_zero: unused; masked bins are always zeroed.

        Returns:
            The masked input (same array, modified in place).
        """
        n_bins = xs.shape[0]
        for i in range(0, self.n_freq_masks):
            f = int(self._rng.uniform(low=0, high=self.F))
            f_0 = int(self._rng.uniform(low=0, high=n_bins - f))
            xs[f_0:f_0 + f, :] = 0
            assert f_0 <= f_0 + f
            self._freq_mask = (f_0, f_0 + f)
        return xs

    def mask_time(self, xs, replace_with_zero=False):
        """Apply random time masks to ``xs`` in place.

        The number and size of masks are adaptive to the utterance length
        when the corresponding ratios are > 0 (arXiv:1912.05533);
        otherwise fixed ``n_time_masks`` / ``T`` are used. Each mask width
        is additionally capped at ``p * n_frames``.

        Args:
            xs: spectrogram, shape [F, T] (freq bins x frames).
            replace_with_zero: unused; masked frames are always zeroed.

        Returns:
            The masked input (same array, modified in place).
        """
        n_frames = xs.shape[1]

        if self.adaptive_number_ratio > 0:
            n_masks = int(n_frames * self.adaptive_number_ratio)
            n_masks = min(n_masks, self.max_n_time_masks)
        else:
            n_masks = self.n_time_masks

        if self.adaptive_size_ratio > 0:
            T = self.adaptive_size_ratio * n_frames
        else:
            T = self.T

        for i in range(n_masks):
            t = int(self._rng.uniform(low=0, high=T))
            t = min(t, int(n_frames * self.p))
            t_0 = int(self._rng.uniform(low=0, high=n_frames - t))
            xs[:, t_0:t_0 + t] = 0
            assert t_0 <= t_0 + t
            self._time_mask = (t_0, t_0 + t)
        return xs

    def transform_feature(self, xs: np.ndarray):
        """
        Args:
            xs (FloatTensor): `[F, T]`
        Returns:
            xs (FloatTensor): `[F, T]`
        """
        # xs = self.time_warp(xs)  # time warping not implemented yet
        xs = self.mask_freq(xs)
        xs = self.mask_time(xs)
        return xs
|
@ -0,0 +1,82 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
from typing import List
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
from deepspeech.utils.log import Log
|
||||||
|
|
||||||
|
__all__ = ["pad_sequence"]
|
||||||
|
|
||||||
|
logger = Log(__name__).getlog()
|
||||||
|
|
||||||
|
|
||||||
|
def pad_sequence(sequences: List[np.ndarray],
                 batch_first: bool=True,
                 padding_value: float=0.0) -> np.ndarray:
    r"""Pad a list of variable length Tensors with ``padding_value``

    ``pad_sequence`` stacks a list of Tensors along a new dimension,
    and pads them to equal length. For example, if the input is list of
    sequences with size ``L x *`` and if batch_first is False, and ``T x B x *``
    otherwise.

    `B` is batch size. It is equal to the number of elements in ``sequences``.
    `T` is length of the longest sequence.
    `L` is length of the sequence.
    `*` is any number of trailing dimensions, including none.

    Example:
        >>> a = np.ones([25, 300])
        >>> b = np.ones([22, 300])
        >>> c = np.ones([15, 300])
        >>> pad_sequence([a, b, c]).shape
        (3, 25, 300)

    Note:
        This function returns a np.ndarray of size ``T x B x *`` or ``B x T x *``
        where `T` is the length of the longest sequence. This function assumes
        trailing dimensions and type of all the Tensors in sequences are same.

    Args:
        sequences (list[np.ndarray]): list of variable length sequences.
        batch_first (bool, optional): output will be in ``B x T x *`` if True, or in
            ``T x B x *`` otherwise. Default: True.
        padding_value (float, optional): value for padded elements. Default: 0.

    Returns:
        np.ndarray of size ``T x B x *`` if :attr:`batch_first` is ``False``.
        np.ndarray of size ``B x T x *`` otherwise

    Raises:
        IndexError: if ``sequences`` is empty.
    """
    # BUGFIX (doc): the original example showed the ``[25, 3, 300]`` shape of
    # torch's pad_sequence, whose default is batch_first=False; here the
    # default is batch_first=True, so the stacked result is (B, T, *).

    # assuming trailing dimensions and type of all the Tensors
    # in sequences are same and fetching those from sequences[0]
    max_size = sequences[0].shape
    trailing_dims = max_size[1:]
    max_len = max(s.shape[0] for s in sequences)
    if batch_first:
        out_dims = (len(sequences), max_len) + trailing_dims
    else:
        out_dims = (max_len, len(sequences)) + trailing_dims

    # Allocate the padded output once, then copy each sequence into its slot.
    out_tensor = np.full(out_dims, padding_value, dtype=sequences[0].dtype)
    for i, tensor in enumerate(sequences):
        length = tensor.shape[0]
        # use index notation to prevent duplicate references to the tensor
        if batch_first:
            out_tensor[i, :length, ...] = tensor
        else:
            out_tensor[:length, i, ...] = tensor

    return out_tensor
|
@ -0,0 +1,928 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
"""U2 ASR Model
|
||||||
|
Unified Streaming and Non-streaming Two-pass End-to-end Model for Speech Recognition
|
||||||
|
(https://arxiv.org/pdf/2012.05481.pdf)
|
||||||
|
"""
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
from collections import defaultdict
|
||||||
|
from typing import Dict
|
||||||
|
from typing import List
|
||||||
|
from typing import Optional
|
||||||
|
from typing import Tuple
|
||||||
|
|
||||||
|
import paddle
|
||||||
|
from paddle import jit
|
||||||
|
from paddle import nn
|
||||||
|
from yacs.config import CfgNode
|
||||||
|
|
||||||
|
from deepspeech.frontend.utility import IGNORE_ID
|
||||||
|
from deepspeech.frontend.utility import load_cmvn
|
||||||
|
from deepspeech.modules.cmvn import GlobalCMVN
|
||||||
|
from deepspeech.modules.ctc import CTCDecoder
|
||||||
|
from deepspeech.modules.decoder import TransformerDecoder
|
||||||
|
from deepspeech.modules.encoder import ConformerEncoder
|
||||||
|
from deepspeech.modules.encoder import TransformerEncoder
|
||||||
|
from deepspeech.modules.loss import LabelSmoothingLoss
|
||||||
|
from deepspeech.modules.mask import make_pad_mask
|
||||||
|
from deepspeech.modules.mask import mask_finished_preds
|
||||||
|
from deepspeech.modules.mask import mask_finished_scores
|
||||||
|
from deepspeech.modules.mask import subsequent_mask
|
||||||
|
from deepspeech.utils import checkpoint
|
||||||
|
from deepspeech.utils import layer_tools
|
||||||
|
from deepspeech.utils.ctc_utils import remove_duplicates_and_blank
|
||||||
|
from deepspeech.utils.log import Log
|
||||||
|
from deepspeech.utils.tensor_utils import add_sos_eos
|
||||||
|
from deepspeech.utils.tensor_utils import pad_sequence
|
||||||
|
from deepspeech.utils.tensor_utils import th_accuracy
|
||||||
|
from deepspeech.utils.utility import log_add
|
||||||
|
|
||||||
|
__all__ = ["U2Model", "U2InferModel"]
|
||||||
|
|
||||||
|
logger = Log(__name__).getlog()
|
||||||
|
|
||||||
|
|
||||||
|
class U2BaseModel(nn.Module):
|
||||||
|
"""CTC-Attention hybrid Encoder-Decoder model"""
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def params(cls, config: Optional[CfgNode]=None) -> CfgNode:
|
||||||
|
# network architecture
|
||||||
|
default = CfgNode()
|
||||||
|
# allow add new item when merge_with_file
|
||||||
|
default.cmvn_file = ""
|
||||||
|
default.cmvn_file_type = "json"
|
||||||
|
default.input_dim = 0
|
||||||
|
default.output_dim = 0
|
||||||
|
# encoder related
|
||||||
|
default.encoder = 'transformer'
|
||||||
|
default.encoder_conf = CfgNode(
|
||||||
|
dict(
|
||||||
|
output_size=256, # dimension of attention
|
||||||
|
attention_heads=4,
|
||||||
|
linear_units=2048, # the number of units of position-wise feed forward
|
||||||
|
num_blocks=12, # the number of encoder blocks
|
||||||
|
dropout_rate=0.1,
|
||||||
|
positional_dropout_rate=0.1,
|
||||||
|
attention_dropout_rate=0.0,
|
||||||
|
input_layer='conv2d', # encoder input type, you can chose conv2d, conv2d6 and conv2d8
|
||||||
|
normalize_before=True,
|
||||||
|
# use_cnn_module=True,
|
||||||
|
# cnn_module_kernel=15,
|
||||||
|
# activation_type='swish',
|
||||||
|
# pos_enc_layer_type='rel_pos',
|
||||||
|
# selfattention_layer_type='rel_selfattn',
|
||||||
|
))
|
||||||
|
# decoder related
|
||||||
|
default.decoder = 'transformer'
|
||||||
|
default.decoder_conf = CfgNode(
|
||||||
|
dict(
|
||||||
|
attention_heads=4,
|
||||||
|
linear_units=2048,
|
||||||
|
num_blocks=6,
|
||||||
|
dropout_rate=0.1,
|
||||||
|
positional_dropout_rate=0.1,
|
||||||
|
self_attention_dropout_rate=0.0,
|
||||||
|
src_attention_dropout_rate=0.0, ))
|
||||||
|
# hybrid CTC/attention
|
||||||
|
default.model_conf = CfgNode(
|
||||||
|
dict(
|
||||||
|
ctc_weight=0.3,
|
||||||
|
lsm_weight=0.1, # label smoothing option
|
||||||
|
length_normalized_loss=False, ))
|
||||||
|
|
||||||
|
if config is not None:
|
||||||
|
config.merge_from_other_cfg(default)
|
||||||
|
return default
|
||||||
|
|
||||||
|
def __init__(self,
|
||||||
|
vocab_size: int,
|
||||||
|
encoder: TransformerEncoder,
|
||||||
|
decoder: TransformerDecoder,
|
||||||
|
ctc: CTCDecoder,
|
||||||
|
ctc_weight: float=0.5,
|
||||||
|
ignore_id: int=IGNORE_ID,
|
||||||
|
lsm_weight: float=0.0,
|
||||||
|
length_normalized_loss: bool=False):
|
||||||
|
assert 0.0 <= ctc_weight <= 1.0, ctc_weight
|
||||||
|
|
||||||
|
super().__init__()
|
||||||
|
# note that eos is the same as sos (equivalent ID)
|
||||||
|
self.sos = vocab_size - 1
|
||||||
|
self.eos = vocab_size - 1
|
||||||
|
self.vocab_size = vocab_size
|
||||||
|
self.ignore_id = ignore_id
|
||||||
|
self.ctc_weight = ctc_weight
|
||||||
|
|
||||||
|
self.encoder = encoder
|
||||||
|
self.decoder = decoder
|
||||||
|
self.ctc = ctc
|
||||||
|
self.criterion_att = LabelSmoothingLoss(
|
||||||
|
size=vocab_size,
|
||||||
|
padding_idx=ignore_id,
|
||||||
|
smoothing=lsm_weight,
|
||||||
|
normalize_length=length_normalized_loss, )
|
||||||
|
|
||||||
|
def forward(
|
||||||
|
self,
|
||||||
|
speech: paddle.Tensor,
|
||||||
|
speech_lengths: paddle.Tensor,
|
||||||
|
text: paddle.Tensor,
|
||||||
|
text_lengths: paddle.Tensor,
|
||||||
|
) -> Tuple[Optional[paddle.Tensor], Optional[paddle.Tensor], Optional[
|
||||||
|
paddle.Tensor]]:
|
||||||
|
"""Frontend + Encoder + Decoder + Calc loss
|
||||||
|
Args:
|
||||||
|
speech: (Batch, Length, ...)
|
||||||
|
speech_lengths: (Batch, )
|
||||||
|
text: (Batch, Length)
|
||||||
|
text_lengths: (Batch,)
|
||||||
|
Returns:
|
||||||
|
total_loss, attention_loss, ctc_loss
|
||||||
|
"""
|
||||||
|
assert text_lengths.dim() == 1, text_lengths.shape
|
||||||
|
# Check that batch_size is unified
|
||||||
|
assert (speech.shape[0] == speech_lengths.shape[0] == text.shape[0] ==
|
||||||
|
text_lengths.shape[0]), (speech.shape, speech_lengths.shape,
|
||||||
|
text.shape, text_lengths.shape)
|
||||||
|
# 1. Encoder
|
||||||
|
start = time.time()
|
||||||
|
encoder_out, encoder_mask = self.encoder(speech, speech_lengths)
|
||||||
|
encoder_time = time.time() - start
|
||||||
|
#logger.debug(f"encoder time: {encoder_time}")
|
||||||
|
#TODO(Hui Zhang): sum not support bool type
|
||||||
|
#encoder_out_lens = encoder_mask.squeeze(1).sum(1) #[B, 1, T] -> [B]
|
||||||
|
encoder_out_lens = encoder_mask.squeeze(1).cast(paddle.int64).sum(
|
||||||
|
1) #[B, 1, T] -> [B]
|
||||||
|
|
||||||
|
# 2a. Attention-decoder branch
|
||||||
|
loss_att = None
|
||||||
|
if self.ctc_weight != 1.0:
|
||||||
|
start = time.time()
|
||||||
|
loss_att, acc_att = self._calc_att_loss(encoder_out, encoder_mask,
|
||||||
|
text, text_lengths)
|
||||||
|
decoder_time = time.time() - start
|
||||||
|
#logger.debug(f"decoder time: {decoder_time}")
|
||||||
|
|
||||||
|
# 2b. CTC branch
|
||||||
|
loss_ctc = None
|
||||||
|
if self.ctc_weight != 0.0:
|
||||||
|
start = time.time()
|
||||||
|
loss_ctc = self.ctc(encoder_out, encoder_out_lens, text,
|
||||||
|
text_lengths)
|
||||||
|
ctc_time = time.time() - start
|
||||||
|
#logger.debug(f"ctc time: {ctc_time}")
|
||||||
|
|
||||||
|
if loss_ctc is None:
|
||||||
|
loss = loss_att
|
||||||
|
elif loss_att is None:
|
||||||
|
loss = loss_ctc
|
||||||
|
else:
|
||||||
|
loss = self.ctc_weight * loss_ctc + (1 - self.ctc_weight) * loss_att
|
||||||
|
return loss, loss_att, loss_ctc
|
||||||
|
|
||||||
|
def _calc_att_loss(
|
||||||
|
self,
|
||||||
|
encoder_out: paddle.Tensor,
|
||||||
|
encoder_mask: paddle.Tensor,
|
||||||
|
ys_pad: paddle.Tensor,
|
||||||
|
ys_pad_lens: paddle.Tensor, ) -> Tuple[paddle.Tensor, float]:
|
||||||
|
"""Calc attention loss.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
encoder_out (paddle.Tensor): [B, Tmax, D]
|
||||||
|
encoder_mask (paddle.Tensor): [B, 1, Tmax]
|
||||||
|
ys_pad (paddle.Tensor): [B, Umax]
|
||||||
|
ys_pad_lens (paddle.Tensor): [B]
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Tuple[paddle.Tensor, float]: attention_loss, accuracy rate
|
||||||
|
"""
|
||||||
|
ys_in_pad, ys_out_pad = add_sos_eos(ys_pad, self.sos, self.eos,
|
||||||
|
self.ignore_id)
|
||||||
|
ys_in_lens = ys_pad_lens + 1
|
||||||
|
|
||||||
|
# 1. Forward decoder
|
||||||
|
decoder_out, _ = self.decoder(encoder_out, encoder_mask, ys_in_pad,
|
||||||
|
ys_in_lens)
|
||||||
|
|
||||||
|
# 2. Compute attention loss
|
||||||
|
loss_att = self.criterion_att(decoder_out, ys_out_pad)
|
||||||
|
acc_att = th_accuracy(
|
||||||
|
decoder_out.view(-1, self.vocab_size),
|
||||||
|
ys_out_pad,
|
||||||
|
ignore_label=self.ignore_id, )
|
||||||
|
return loss_att, acc_att
|
||||||
|
|
||||||
|
def _forward_encoder(
|
||||||
|
self,
|
||||||
|
speech: paddle.Tensor,
|
||||||
|
speech_lengths: paddle.Tensor,
|
||||||
|
decoding_chunk_size: int=-1,
|
||||||
|
num_decoding_left_chunks: int=-1,
|
||||||
|
simulate_streaming: bool=False,
|
||||||
|
) -> Tuple[paddle.Tensor, paddle.Tensor]:
|
||||||
|
"""Encoder pass.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
speech (paddle.Tensor): [B, Tmax, D]
|
||||||
|
speech_lengths (paddle.Tensor): [B]
|
||||||
|
decoding_chunk_size (int, optional): chuck size. Defaults to -1.
|
||||||
|
num_decoding_left_chunks (int, optional): nums chunks. Defaults to -1.
|
||||||
|
simulate_streaming (bool, optional): streaming or not. Defaults to False.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Tuple[paddle.Tensor, paddle.Tensor]:
|
||||||
|
encoder hiddens (B, Tmax, D),
|
||||||
|
encoder hiddens mask (B, 1, Tmax).
|
||||||
|
"""
|
||||||
|
# Let's assume B = batch_size
|
||||||
|
# 1. Encoder
|
||||||
|
if simulate_streaming and decoding_chunk_size > 0:
|
||||||
|
encoder_out, encoder_mask = self.encoder.forward_chunk_by_chunk(
|
||||||
|
speech,
|
||||||
|
decoding_chunk_size=decoding_chunk_size,
|
||||||
|
num_decoding_left_chunks=num_decoding_left_chunks
|
||||||
|
) # (B, maxlen, encoder_dim)
|
||||||
|
else:
|
||||||
|
encoder_out, encoder_mask = self.encoder(
|
||||||
|
speech,
|
||||||
|
speech_lengths,
|
||||||
|
decoding_chunk_size=decoding_chunk_size,
|
||||||
|
num_decoding_left_chunks=num_decoding_left_chunks
|
||||||
|
) # (B, maxlen, encoder_dim)
|
||||||
|
return encoder_out, encoder_mask
|
||||||
|
|
||||||
|
    def recognize(
            self,
            speech: paddle.Tensor,
            speech_lengths: paddle.Tensor,
            beam_size: int=10,
            decoding_chunk_size: int=-1,
            num_decoding_left_chunks: int=-1,
            simulate_streaming: bool=False, ) -> paddle.Tensor:
        """ Apply beam search on attention decoder
        Args:
            speech (paddle.Tensor): (batch, max_len, feat_dim)
            speech_lengths (paddle.Tensor): (batch, )
            beam_size (int): beam size for beam search
            decoding_chunk_size (int): decoding chunk for dynamic chunk
                trained model.
                <0: for decoding, use full chunk.
                >0: for decoding, use fixed chunk size as set.
                0: used for training, it's prohibited here
            num_decoding_left_chunks (int): number of left chunks used with
                decoding_chunk_size. Defaults to -1.
            simulate_streaming (bool): whether do encoder forward in a
                streaming fashion
        Returns:
            paddle.Tensor: decoding result, (batch, max_result_len)
        """
        assert speech.shape[0] == speech_lengths.shape[0]
        assert decoding_chunk_size != 0  # 0 is reserved for training
        device = speech.place
        batch_size = speech.shape[0]

        # Let's assume B = batch_size and N = beam_size
        # 1. Encoder
        encoder_out, encoder_mask = self._forward_encoder(
            speech, speech_lengths, decoding_chunk_size,
            num_decoding_left_chunks,
            simulate_streaming)  # (B, maxlen, encoder_dim)
        maxlen = encoder_out.size(1)
        encoder_dim = encoder_out.size(2)
        running_size = batch_size * beam_size
        # Tile encoder states so every beam candidate attends to its own copy.
        encoder_out = encoder_out.unsqueeze(1).repeat(1, beam_size, 1, 1).view(
            running_size, maxlen, encoder_dim)  # (B*N, maxlen, encoder_dim)
        encoder_mask = encoder_mask.unsqueeze(1).repeat(
            1, beam_size, 1, 1).view(running_size, 1,
                                     maxlen)  # (B*N, 1, max_len)

        # Every beam starts from a single <sos> token.
        hyps = paddle.ones(
            [running_size, 1], dtype=paddle.long).fill_(self.sos)  # (B*N, 1)
        # log scale score; only the first beam per utterance starts alive
        # (score 0), the rest at -inf so the first expansion is unique.
        scores = paddle.to_tensor(
            [0.0] + [-float('inf')] * (beam_size - 1), dtype=paddle.float)
        # NOTE(review): the trailing .to(device) looks redundant after the
        # leading one — confirm and simplify.
        scores = scores.to(device).repeat(batch_size).unsqueeze(1).to(
            device)  # (B*N, 1)
        end_flag = paddle.zeros_like(scores, dtype=paddle.bool)  # (B*N, 1)
        cache: Optional[List[paddle.Tensor]] = None
        # 2. Decoder forward step by step
        for i in range(1, maxlen + 1):
            # Stop if all batch and all beam produce eos
            # TODO(Hui Zhang): if end_flag.sum() == running_size:
            if end_flag.cast(paddle.int64).sum() == running_size:
                break

            # 2.1 Forward decoder step
            hyps_mask = subsequent_mask(i).unsqueeze(0).repeat(
                running_size, 1, 1).to(device)  # (B*N, i, i)
            # logp: (B*N, vocab)
            logp, cache = self.decoder.forward_one_step(
                encoder_out, encoder_mask, hyps, hyps_mask, cache)

            # 2.2 First beam prune: select topk best prob at current time
            top_k_logp, top_k_index = logp.topk(beam_size)  # (B*N, N)
            # Freeze finished beams: keep their score, force <eos> prediction.
            top_k_logp = mask_finished_scores(top_k_logp, end_flag)
            top_k_index = mask_finished_preds(top_k_index, end_flag, self.eos)

            # 2.3 Seconde beam prune: select topk score with history
            scores = scores + top_k_logp  # (B*N, N), broadcast add
            scores = scores.view(batch_size, beam_size * beam_size)  # (B, N*N)
            scores, offset_k_index = scores.topk(k=beam_size)  # (B, N)
            scores = scores.view(-1, 1)  # (B*N, 1)

            # 2.4. Compute base index in top_k_index,
            # regard top_k_index as (B*N*N),regard offset_k_index as (B*N),
            # then find offset_k_index in top_k_index
            base_k_index = paddle.arange(batch_size).view(-1, 1).repeat(
                1, beam_size)  # (B, N)
            base_k_index = base_k_index * beam_size * beam_size
            best_k_index = base_k_index.view(-1) + offset_k_index.view(
                -1)  # (B*N)

            # 2.5 Update best hyps
            best_k_pred = paddle.index_select(
                top_k_index.view(-1), index=best_k_index, axis=0)  # (B*N)
            # Map flat (B*N*N) winners back to the beam rows they extend.
            best_hyps_index = best_k_index // beam_size
            last_best_k_hyps = paddle.index_select(
                hyps, index=best_hyps_index, axis=0)  # (B*N, i)
            hyps = paddle.cat(
                (last_best_k_hyps, best_k_pred.view(-1, 1)),
                dim=1)  # (B*N, i+1)

            # 2.6 Update end flag
            end_flag = paddle.eq(hyps[:, -1], self.eos).view(-1, 1)

        # 3. Select best of best
        scores = scores.view(batch_size, beam_size)
        # TODO: length normalization
        best_index = paddle.argmax(scores, axis=-1).long()  # (B)
        best_hyps_index = best_index + paddle.arange(
            batch_size, dtype=paddle.long) * beam_size
        best_hyps = paddle.index_select(hyps, index=best_hyps_index, axis=0)
        # Strip the leading <sos> before returning.
        best_hyps = best_hyps[:, 1:]
        return best_hyps
|
||||||
|
|
||||||
|
    def ctc_greedy_search(
            self,
            speech: paddle.Tensor,
            speech_lengths: paddle.Tensor,
            decoding_chunk_size: int=-1,
            num_decoding_left_chunks: int=-1,
            simulate_streaming: bool=False, ) -> List[List[int]]:
        """ Apply CTC greedy search
        Args:
            speech (paddle.Tensor): (batch, max_len, feat_dim)
            speech_lengths (paddle.Tensor): (batch, )
            decoding_chunk_size (int): decoding chunk for dynamic chunk
                trained model.
                <0: for decoding, use full chunk.
                >0: for decoding, use fixed chunk size as set.
                0: used for training, it's prohibited here
            num_decoding_left_chunks (int): number of left chunks used with
                decoding_chunk_size. Defaults to -1.
            simulate_streaming (bool): whether do encoder forward in a
                streaming fashion
        Returns:
            List[List[int]]: best path result, one token id list per utterance
        """
        assert speech.shape[0] == speech_lengths.shape[0]
        assert decoding_chunk_size != 0  # 0 is reserved for training
        batch_size = speech.shape[0]
        # Let's assume B = batch_size
        # encoder_out: (B, maxlen, encoder_dim)
        # encoder_mask: (B, 1, Tmax)
        encoder_out, encoder_mask = self._forward_encoder(
            speech, speech_lengths, decoding_chunk_size,
            num_decoding_left_chunks, simulate_streaming)
        maxlen = encoder_out.size(1)
        # (TODO Hui Zhang): bool no support reduce_sum
        # encoder_out_lens = encoder_mask.squeeze(1).sum(1)
        encoder_out_lens = encoder_mask.squeeze(1).astype(paddle.int).sum(1)
        ctc_probs = self.ctc.log_softmax(encoder_out)  # (B, maxlen, vocab_size)
        # Greedy best path: most probable token per frame.
        topk_prob, topk_index = ctc_probs.topk(1, axis=2)  # (B, maxlen, 1)
        topk_index = topk_index.view(batch_size, maxlen)  # (B, maxlen)
        # Overwrite padded frames with <eos>; presumably stripped downstream —
        # confirm against remove_duplicates_and_blank / defeaturize.
        pad_mask = make_pad_mask(encoder_out_lens)  # (B, maxlen)
        topk_index = topk_index.masked_fill_(pad_mask, self.eos)  # (B, maxlen)
        hyps = [hyp.tolist() for hyp in topk_index]
        # Standard CTC rule: collapse repeats, then drop blanks.
        hyps = [remove_duplicates_and_blank(hyp) for hyp in hyps]
        return hyps
|
||||||
|
|
||||||
|
    def _ctc_prefix_beam_search(
            self,
            speech: paddle.Tensor,
            speech_lengths: paddle.Tensor,
            beam_size: int,
            decoding_chunk_size: int=-1,
            num_decoding_left_chunks: int=-1,
            simulate_streaming: bool=False,
            blank_id: int=0, ) -> Tuple[List[Tuple[int, float]], paddle.Tensor]:
        """ CTC prefix beam search inner implementation
        Args:
            speech (paddle.Tensor): (batch, max_len, feat_dim)
            speech_lengths (paddle.Tensor): (batch, )
            beam_size (int): beam size for beam search
            decoding_chunk_size (int): decoding chunk for dynamic chunk
                trained model.
                <0: for decoding, use full chunk.
                >0: for decoding, use fixed chunk size as set.
                0: used for training, it's prohibited here
            num_decoding_left_chunks (int): number of left chunks used with
                decoding_chunk_size. Defaults to -1.
            simulate_streaming (bool): whether do encoder forward in a
                streaming fashion
            blank_id (int): id of the CTC blank symbol. Defaults to 0.
        Returns:
            List[Tuple[int, float]]: nbest results, (N,1), (text, likelihood)
            paddle.Tensor: encoder output, (1, max_len, encoder_dim),
                it will be used for rescoring in attention rescoring mode
        """
        assert speech.shape[0] == speech_lengths.shape[0]
        assert decoding_chunk_size != 0  # 0 is reserved for training
        batch_size = speech.shape[0]
        # For CTC prefix beam search, we only support batch_size=1
        assert batch_size == 1
        # Let's assume B = batch_size and N = beam_size
        # 1. Encoder forward and get CTC score
        encoder_out, encoder_mask = self._forward_encoder(
            speech, speech_lengths, decoding_chunk_size,
            num_decoding_left_chunks,
            simulate_streaming)  # (B, maxlen, encoder_dim)
        maxlen = encoder_out.size(1)
        ctc_probs = self.ctc.log_softmax(encoder_out)  # (1, maxlen, vocab_size)
        ctc_probs = ctc_probs.squeeze(0)
        # cur_hyps: (prefix, (blank_ending_score, none_blank_ending_score))
        # Both scores are log probabilities: pb = p(prefix, last frame blank),
        # pnb = p(prefix, last frame non-blank). Start: empty prefix, pb=log 1.
        cur_hyps = [(tuple(), (0.0, -float('inf')))]
        # 2. CTC beam search step by step
        for t in range(0, maxlen):
            logp = ctc_probs[t]  # (vocab_size,)
            # key: prefix, value (pb, pnb), default value(-inf, -inf)
            next_hyps = defaultdict(lambda: (-float('inf'), -float('inf')))
            # 2.1 First beam prune: select topk best
            top_k_logp, top_k_index = logp.topk(beam_size)  # (beam_size,)
            for s in top_k_index:
                s = s.item()
                ps = logp[s].item()
                for prefix, (pb, pnb) in cur_hyps:
                    last = prefix[-1] if len(prefix) > 0 else None
                    if s == blank_id:  # blank
                        # Blank keeps the prefix unchanged; fold in both
                        # previous ending states.
                        n_pb, n_pnb = next_hyps[prefix]
                        n_pb = log_add([n_pb, pb + ps, pnb + ps])
                        next_hyps[prefix] = (n_pb, n_pnb)
                    elif s == last:
                        # Update *ss -> *s;
                        n_pb, n_pnb = next_hyps[prefix]
                        n_pnb = log_add([n_pnb, pnb + ps])
                        next_hyps[prefix] = (n_pb, n_pnb)
                        # Update *s-s -> *ss, - is for blank
                        n_prefix = prefix + (s, )
                        n_pb, n_pnb = next_hyps[n_prefix]
                        n_pnb = log_add([n_pnb, pb + ps])
                        next_hyps[n_prefix] = (n_pb, n_pnb)
                    else:
                        # New token extends the prefix from either ending state.
                        n_prefix = prefix + (s, )
                        n_pb, n_pnb = next_hyps[n_prefix]
                        n_pnb = log_add([n_pnb, pb + ps, pnb + ps])
                        next_hyps[n_prefix] = (n_pb, n_pnb)

            # 2.2 Second beam prune
            next_hyps = sorted(
                next_hyps.items(),
                key=lambda x: log_add(list(x[1])),
                reverse=True)
            cur_hyps = next_hyps[:beam_size]
        # Fold pb/pnb into one total log likelihood per surviving prefix.
        hyps = [(y[0], log_add([y[1][0], y[1][1]])) for y in cur_hyps]
        return hyps, encoder_out
|
||||||
|
|
||||||
|
def ctc_prefix_beam_search(
|
||||||
|
self,
|
||||||
|
speech: paddle.Tensor,
|
||||||
|
speech_lengths: paddle.Tensor,
|
||||||
|
beam_size: int,
|
||||||
|
decoding_chunk_size: int=-1,
|
||||||
|
num_decoding_left_chunks: int=-1,
|
||||||
|
simulate_streaming: bool=False, ) -> List[int]:
|
||||||
|
""" Apply CTC prefix beam search
|
||||||
|
Args:
|
||||||
|
speech (paddle.Tensor): (batch, max_len, feat_dim)
|
||||||
|
speech_length (paddle.Tensor): (batch, )
|
||||||
|
beam_size (int): beam size for beam search
|
||||||
|
decoding_chunk_size (int): decoding chunk for dynamic chunk
|
||||||
|
trained model.
|
||||||
|
<0: for decoding, use full chunk.
|
||||||
|
>0: for decoding, use fixed chunk size as set.
|
||||||
|
0: used for training, it's prohibited here
|
||||||
|
simulate_streaming (bool): whether do encoder forward in a
|
||||||
|
streaming fashion
|
||||||
|
Returns:
|
||||||
|
List[int]: CTC prefix beam search nbest results
|
||||||
|
"""
|
||||||
|
hyps, _ = self._ctc_prefix_beam_search(
|
||||||
|
speech, speech_lengths, beam_size, decoding_chunk_size,
|
||||||
|
num_decoding_left_chunks, simulate_streaming)
|
||||||
|
return hyps[0][0]
|
||||||
|
|
||||||
|
    def attention_rescoring(
            self,
            speech: paddle.Tensor,
            speech_lengths: paddle.Tensor,
            beam_size: int,
            decoding_chunk_size: int=-1,
            num_decoding_left_chunks: int=-1,
            ctc_weight: float=0.0,
            simulate_streaming: bool=False, ) -> List[int]:
        """ Apply attention rescoring decoding, CTC prefix beam search
        is applied first to get nbest, then we resoring the nbest on
        attention decoder with corresponding encoder out
        Args:
            speech (paddle.Tensor): (batch, max_len, feat_dim)
            speech_lengths (paddle.Tensor): (batch, )
            beam_size (int): beam size for beam search
            decoding_chunk_size (int): decoding chunk for dynamic chunk
                trained model.
                <0: for decoding, use full chunk.
                >0: for decoding, use fixed chunk size as set.
                0: used for training, it's prohibited here
            num_decoding_left_chunks (int): number of left chunks used with
                decoding_chunk_size. Defaults to -1.
            ctc_weight (float): weight of the CTC likelihood added to the
                attention-decoder score. Defaults to 0.0.
            simulate_streaming (bool): whether do encoder forward in a
                streaming fashion
        Returns:
            List[int]: Attention rescoring result
        """
        assert speech.shape[0] == speech_lengths.shape[0]
        assert decoding_chunk_size != 0  # 0 is reserved for training
        device = speech.place
        batch_size = speech.shape[0]
        # For attention rescoring we only support batch_size=1
        assert batch_size == 1
        # encoder_out: (1, maxlen, encoder_dim), len(hyps) = beam_size
        hyps, encoder_out = self._ctc_prefix_beam_search(
            speech, speech_lengths, beam_size, decoding_chunk_size,
            num_decoding_left_chunks, simulate_streaming)

        assert len(hyps) == beam_size
        # Pad the nbest token sequences into one batch for the decoder.
        hyps_pad = pad_sequence([
            paddle.to_tensor(hyp[0], place=device, dtype=paddle.long)
            for hyp in hyps
        ], True, self.ignore_id)  # (beam_size, max_hyps_len)
        hyps_lens = paddle.to_tensor(
            [len(hyp[0]) for hyp in hyps], place=device,
            dtype=paddle.long)  # (beam_size,)
        hyps_pad, _ = add_sos_eos(hyps_pad, self.sos, self.eos, self.ignore_id)
        hyps_lens = hyps_lens + 1  # Add <sos> at begining
        # Share the single utterance's encoder output across all hypotheses.
        encoder_out = encoder_out.repeat(beam_size, 1, 1)
        encoder_mask = paddle.ones(
            (beam_size, 1, encoder_out.size(1)), dtype=paddle.bool)
        decoder_out, _ = self.decoder(
            encoder_out, encoder_mask, hyps_pad,
            hyps_lens)  # (beam_size, max_hyps_len, vocab_size)
        decoder_out = paddle.nn.functional.log_softmax(decoder_out, axis=-1)
        decoder_out = decoder_out.numpy()
        # Only use decoder score for rescoring
        best_score = -float('inf')
        best_index = 0
        for i, hyp in enumerate(hyps):
            # Sum log prob of each hypothesis token under the decoder,
            # plus the <eos> closing the sequence.
            score = 0.0
            for j, w in enumerate(hyp[0]):
                score += decoder_out[i][j][w]
            score += decoder_out[i][len(hyp[0])][self.eos]
            # add ctc score
            score += hyp[1] * ctc_weight
            if score > best_score:
                best_score = score
                best_index = i
        return hyps[best_index][0]
|
||||||
|
|
||||||
|
@jit.export
|
||||||
|
def subsampling_rate(self) -> int:
|
||||||
|
""" Export interface for c++ call, return subsampling_rate of the
|
||||||
|
model
|
||||||
|
"""
|
||||||
|
return self.encoder.embed.subsampling_rate
|
||||||
|
|
||||||
|
@jit.export
|
||||||
|
def right_context(self) -> int:
|
||||||
|
""" Export interface for c++ call, return right_context of the model
|
||||||
|
"""
|
||||||
|
return self.encoder.embed.right_context
|
||||||
|
|
||||||
|
@jit.export
|
||||||
|
def sos_symbol(self) -> int:
|
||||||
|
""" Export interface for c++ call, return sos symbol id of the model
|
||||||
|
"""
|
||||||
|
return self.sos
|
||||||
|
|
||||||
|
@jit.export
|
||||||
|
def eos_symbol(self) -> int:
|
||||||
|
""" Export interface for c++ call, return eos symbol id of the model
|
||||||
|
"""
|
||||||
|
return self.eos
|
||||||
|
|
||||||
|
@jit.export
|
||||||
|
def forward_encoder_chunk(
|
||||||
|
self,
|
||||||
|
xs: paddle.Tensor,
|
||||||
|
offset: int,
|
||||||
|
required_cache_size: int,
|
||||||
|
subsampling_cache: Optional[paddle.Tensor]=None,
|
||||||
|
elayers_output_cache: Optional[List[paddle.Tensor]]=None,
|
||||||
|
conformer_cnn_cache: Optional[List[paddle.Tensor]]=None,
|
||||||
|
) -> Tuple[paddle.Tensor, paddle.Tensor, List[paddle.Tensor], List[
|
||||||
|
paddle.Tensor]]:
|
||||||
|
""" Export interface for c++ call, give input chunk xs, and return
|
||||||
|
output from time 0 to current chunk.
|
||||||
|
Args:
|
||||||
|
xs (paddle.Tensor): chunk input
|
||||||
|
subsampling_cache (Optional[paddle.Tensor]): subsampling cache
|
||||||
|
elayers_output_cache (Optional[List[paddle.Tensor]]):
|
||||||
|
transformer/conformer encoder layers output cache
|
||||||
|
conformer_cnn_cache (Optional[List[paddle.Tensor]]): conformer
|
||||||
|
cnn cache
|
||||||
|
Returns:
|
||||||
|
paddle.Tensor: output, it ranges from time 0 to current chunk.
|
||||||
|
paddle.Tensor: subsampling cache
|
||||||
|
List[paddle.Tensor]: attention cache
|
||||||
|
List[paddle.Tensor]: conformer cnn cache
|
||||||
|
"""
|
||||||
|
return self.encoder.forward_chunk(
|
||||||
|
xs, offset, required_cache_size, subsampling_cache,
|
||||||
|
elayers_output_cache, conformer_cnn_cache)
|
||||||
|
|
||||||
|
@jit.export
|
||||||
|
def ctc_activation(self, xs: paddle.Tensor) -> paddle.Tensor:
|
||||||
|
""" Export interface for c++ call, apply linear transform and log
|
||||||
|
softmax before ctc
|
||||||
|
Args:
|
||||||
|
xs (paddle.Tensor): encoder output
|
||||||
|
Returns:
|
||||||
|
paddle.Tensor: activation before ctc
|
||||||
|
"""
|
||||||
|
return self.ctc.log_softmax(xs)
|
||||||
|
|
||||||
|
@jit.export
|
||||||
|
def forward_attention_decoder(
|
||||||
|
self,
|
||||||
|
hyps: paddle.Tensor,
|
||||||
|
hyps_lens: paddle.Tensor,
|
||||||
|
encoder_out: paddle.Tensor, ) -> paddle.Tensor:
|
||||||
|
""" Export interface for c++ call, forward decoder with multiple
|
||||||
|
hypothesis from ctc prefix beam search and one encoder output
|
||||||
|
Args:
|
||||||
|
hyps (paddle.Tensor): hyps from ctc prefix beam search, already
|
||||||
|
pad sos at the begining, (B, T)
|
||||||
|
hyps_lens (paddle.Tensor): length of each hyp in hyps, (B)
|
||||||
|
encoder_out (paddle.Tensor): corresponding encoder output, (B=1, T, D)
|
||||||
|
Returns:
|
||||||
|
paddle.Tensor: decoder output, (B, L)
|
||||||
|
"""
|
||||||
|
assert encoder_out.size(0) == 1
|
||||||
|
num_hyps = hyps.size(0)
|
||||||
|
assert hyps_lens.size(0) == num_hyps
|
||||||
|
encoder_out = encoder_out.repeat(num_hyps, 1, 1)
|
||||||
|
# (B, 1, T)
|
||||||
|
encoder_mask = paddle.ones(
|
||||||
|
[num_hyps, 1, encoder_out.size(1)], dtype=paddle.bool)
|
||||||
|
# (num_hyps, max_hyps_len, vocab_size)
|
||||||
|
decoder_out, _ = self.decoder(encoder_out, encoder_mask, hyps,
|
||||||
|
hyps_lens)
|
||||||
|
decoder_out = paddle.nn.functional.log_softmax(decoder_out, dim=-1)
|
||||||
|
return decoder_out
|
||||||
|
|
||||||
|
    @paddle.no_grad()
    def decode(self,
               feats: paddle.Tensor,
               feats_lengths: paddle.Tensor,
               text_feature: Dict[str, int],
               decoding_method: str,
               lang_model_path: str,
               beam_alpha: float,
               beam_beta: float,
               beam_size: int,
               cutoff_prob: float,
               cutoff_top_n: int,
               num_processes: int,
               ctc_weight: float=0.0,
               decoding_chunk_size: int=-1,
               num_decoding_left_chunks: int=-1,
               simulate_streaming: bool=False):
        """u2 decoding.

        Args:
            feats (Tensor): audio features, (B, T, D)
            feats_lengths (Tensor): (B)
            text_feature (TextFeaturizer): text feature object.
            decoding_method (str): decoding mode, e.g.
                'attention', 'ctc_greedy_search',
                'ctc_prefix_beam_search', 'attention_rescoring'
            lang_model_path (str): lm path.
            beam_alpha (float): lm weight.
            beam_beta (float): length penalty.
            beam_size (int): beam size for search
            cutoff_prob (float): for prune.
            cutoff_top_n (int): for prune.
            num_processes (int): number of decode processes.
                NOTE(review): lang_model_path/beam_alpha/beam_beta/cutoff_*/
                num_processes are not used by any branch below — presumably
                kept for interface parity with other decoders; confirm.
            ctc_weight (float, optional): ctc weight for attention rescoring decode mode. Defaults to 0.0.
            decoding_chunk_size (int, optional): decoding chunk size. Defaults to -1.
                <0: for decoding, use full chunk.
                >0: for decoding, use fixed chunk size as set.
                0: used for training, it's prohibited here.
            num_decoding_left_chunks (int, optional):
                number of left chunks for decoding. Defaults to -1.
            simulate_streaming (bool, optional): simulate streaming inference. Defaults to False.

        Raises:
            ValueError: when not support decoding_method.

        Returns:
            List[List[int]]: transcripts.
        """
        batch_size = feats.size(0)
        # Single-utterance-only modes hard-stop on a bigger batch.
        if decoding_method in ['ctc_prefix_beam_search',
                               'attention_rescoring'] and batch_size > 1:
            logger.fatal(
                f'decoding mode {decoding_method} must be running with batch_size == 1'
            )
            sys.exit(1)

        if decoding_method == 'attention':
            hyps = self.recognize(
                feats,
                feats_lengths,
                beam_size=beam_size,
                decoding_chunk_size=decoding_chunk_size,
                num_decoding_left_chunks=num_decoding_left_chunks,
                simulate_streaming=simulate_streaming)
            hyps = [hyp.tolist() for hyp in hyps]
        elif decoding_method == 'ctc_greedy_search':
            hyps = self.ctc_greedy_search(
                feats,
                feats_lengths,
                decoding_chunk_size=decoding_chunk_size,
                num_decoding_left_chunks=num_decoding_left_chunks,
                simulate_streaming=simulate_streaming)
        # ctc_prefix_beam_search and attention_rescoring only return one
        # result in List[int], change it to List[List[int]] for compatible
        # with other batch decoding mode
        elif decoding_method == 'ctc_prefix_beam_search':
            assert feats.size(0) == 1
            hyp = self.ctc_prefix_beam_search(
                feats,
                feats_lengths,
                beam_size,
                decoding_chunk_size=decoding_chunk_size,
                num_decoding_left_chunks=num_decoding_left_chunks,
                simulate_streaming=simulate_streaming)
            hyps = [hyp]
        elif decoding_method == 'attention_rescoring':
            assert feats.size(0) == 1
            hyp = self.attention_rescoring(
                feats,
                feats_lengths,
                beam_size,
                decoding_chunk_size=decoding_chunk_size,
                num_decoding_left_chunks=num_decoding_left_chunks,
                ctc_weight=ctc_weight,
                simulate_streaming=simulate_streaming)
            hyps = [hyp]
        else:
            raise ValueError(f"Not support decoding method: {decoding_method}")

        # Map token id sequences back to text.
        res = [text_feature.defeaturize(hyp) for hyp in hyps]
        return res
|
||||||
|
|
||||||
|
|
||||||
|
class U2Model(U2BaseModel):
    """U2 joint CTC/attention ASR model built from a config dict."""

    def __init__(self, configs: dict):
        # Build the sub-modules first, then hand them to the base model
        # together with the loss-related options under 'model_conf'.
        vocab_size, encoder, decoder, ctc = U2Model._init_from_config(configs)

        super().__init__(
            vocab_size=vocab_size,
            encoder=encoder,
            decoder=decoder,
            ctc=ctc,
            **configs['model_conf'])

    @classmethod
    def _init_from_config(cls, configs: dict):
        """init sub module for model.

        Args:
            configs (dict): config dict.

        Raises:
            ValueError: raise when using not support encoder type.

        Returns:
            int, nn.Layer, nn.Layer, nn.Layer: vocab size, encoder, decoder, ctc
        """
        # Optional global CMVN front-end, shared by both encoder types.
        if configs['cmvn_file'] is not None:
            mean, istd = load_cmvn(configs['cmvn_file'],
                                   configs['cmvn_file_type'])
            global_cmvn = GlobalCMVN(
                paddle.to_tensor(mean, dtype=paddle.float),
                paddle.to_tensor(istd, dtype=paddle.float))
        else:
            global_cmvn = None

        input_dim = configs['input_dim']
        vocab_size = configs['output_dim']
        assert input_dim != 0, input_dim
        assert vocab_size != 0, vocab_size

        encoder_type = configs.get('encoder', 'transformer')
        logger.info(f"U2 Encoder type: {encoder_type}")
        if encoder_type == 'transformer':
            encoder = TransformerEncoder(
                input_dim, global_cmvn=global_cmvn, **configs['encoder_conf'])
        elif encoder_type == 'conformer':
            encoder = ConformerEncoder(
                input_dim, global_cmvn=global_cmvn, **configs['encoder_conf'])
        else:
            raise ValueError(f"not support encoder type:{encoder_type}")

        decoder = TransformerDecoder(vocab_size,
                                     encoder.output_size(),
                                     **configs['decoder_conf'])
        ctc = CTCDecoder(
            odim=vocab_size,
            enc_n_units=encoder.output_size(),
            blank_id=0,
            dropout_rate=0.0,
            reduction=True,  # sum
            batch_average=True)  # sum / batch_size

        return vocab_size, encoder, decoder, ctc

    @classmethod
    def from_config(cls, configs: dict):
        """init model.

        Args:
            configs (dict): config dict.

        Raises:
            ValueError: raise when using not support encoder type.

        Returns:
            nn.Layer: U2Model
        """
        model = cls(configs)
        return model

    @classmethod
    def from_pretrained(cls, dataset, config, checkpoint_path):
        """Build a U2Model from a pretrained model checkpoint.

        Args:
            dataset (paddle.io.Dataset): provides feature_size/vocab_size
                used to fill in the model dims.
            config (yacs.config.CfgNode): model configs
            checkpoint_path (Path or str): the path of pretrained model checkpoint, without extension name

        Returns:
            U2Model: The model built from pretrained result.
        """
        # Inject data-dependent dims into the (frozen) config.
        config.defrost()
        config.input_dim = dataset.feature_size
        config.output_dim = dataset.vocab_size
        config.freeze()
        model = cls.from_config(config)

        # Loading is optional: an empty checkpoint_path yields a fresh model.
        if checkpoint_path:
            infos = checkpoint.load_parameters(
                model, checkpoint_path=checkpoint_path)
            logger.info(f"checkpoint info: {infos}")
        layer_tools.summary(model)
        return model
|
||||||
|
|
||||||
|
|
||||||
|
class U2InferModel(U2Model):
    """U2 model wrapper whose forward runs CTC greedy search, for export."""

    def __init__(self, configs: dict):
        super().__init__(configs)

    def forward(self,
                feats,
                feats_lengths,
                decoding_chunk_size=-1,
                num_decoding_left_chunks=-1,
                simulate_streaming=False):
        """Export/inference entry point: decode a batch greedily.

        Args:
            feats (Tensor): [B, T, D]
            feats_lengths (Tensor): [B]

        Returns:
            List[List[int]]: best path result
        """
        # Forward is just greedy CTC decoding; gather the search options
        # and delegate.
        search_opts = dict(
            decoding_chunk_size=decoding_chunk_size,
            num_decoding_left_chunks=num_decoding_left_chunks,
            simulate_streaming=simulate_streaming)
        return self.ctc_greedy_search(feats, feats_lengths, **search_opts)
|
@ -0,0 +1,233 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
"""Multi-Head Attention layer definition."""
|
||||||
|
import math
|
||||||
|
from typing import Optional
|
||||||
|
from typing import Tuple
|
||||||
|
|
||||||
|
import paddle
|
||||||
|
from paddle import nn
|
||||||
|
from paddle.nn import initializer as I
|
||||||
|
|
||||||
|
from deepspeech.utils.log import Log
|
||||||
|
|
||||||
|
logger = Log(__name__).getlog()
|
||||||
|
|
||||||
|
__all__ = ["MultiHeadedAttention", "RelPositionMultiHeadedAttention"]
|
||||||
|
|
||||||
|
# Relative Positional Encodings
|
||||||
|
# https://www.jianshu.com/p/c0608efcc26f
|
||||||
|
# https://zhuanlan.zhihu.com/p/344604604
|
||||||
|
|
||||||
|
|
||||||
|
class MultiHeadedAttention(nn.Layer):
    """Multi-Head Attention layer.

    NOTE(review): this module relies on the project's paddle compatibility
    shims for tensor methods such as ``size``, ``view``, ``masked_fill`` and
    ``contiguous`` — plain paddle Tensors do not provide them; confirm the
    hack module is imported before this one.
    """

    def __init__(self, n_head: int, n_feat: int, dropout_rate: float):
        """Construct an MultiHeadedAttention object.
        Args:
            n_head (int): The number of heads.
            n_feat (int): The number of features (model dimension); must be
                divisible by ``n_head``.
            dropout_rate (float): Dropout rate applied to attention weights.
        """
        super().__init__()
        assert n_feat % n_head == 0
        # We assume d_v always equals d_k
        self.d_k = n_feat // n_head
        self.h = n_head
        self.linear_q = nn.Linear(n_feat, n_feat)
        self.linear_k = nn.Linear(n_feat, n_feat)
        self.linear_v = nn.Linear(n_feat, n_feat)
        self.linear_out = nn.Linear(n_feat, n_feat)
        self.dropout = nn.Dropout(p=dropout_rate)

    def forward_qkv(self,
                    query: paddle.Tensor,
                    key: paddle.Tensor,
                    value: paddle.Tensor
                    ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]:
        """Transform query, key and value.
        Args:
            query (paddle.Tensor): Query tensor (#batch, time1, size).
            key (paddle.Tensor): Key tensor (#batch, time2, size).
            value (paddle.Tensor): Value tensor (#batch, time2, size).
        Returns:
            paddle.Tensor: Transformed query tensor, size
                (#batch, n_head, time1, d_k).
            paddle.Tensor: Transformed key tensor, size
                (#batch, n_head, time2, d_k).
            paddle.Tensor: Transformed value tensor, size
                (#batch, n_head, time2, d_k).
        """
        n_batch = query.size(0)
        # Project, then split the feature dim into (head, d_k).
        q = self.linear_q(query).view(n_batch, -1, self.h, self.d_k)
        k = self.linear_k(key).view(n_batch, -1, self.h, self.d_k)
        v = self.linear_v(value).view(n_batch, -1, self.h, self.d_k)
        q = q.transpose([0, 2, 1, 3])  # (batch, head, time1, d_k)
        k = k.transpose([0, 2, 1, 3])  # (batch, head, time2, d_k)
        v = v.transpose([0, 2, 1, 3])  # (batch, head, time2, d_k)

        return q, k, v

    def forward_attention(self,
                          value: paddle.Tensor,
                          scores: paddle.Tensor,
                          mask: Optional[paddle.Tensor]) -> paddle.Tensor:
        """Compute attention context vector.
        Args:
            value (paddle.Tensor): Transformed value, size
                (#batch, n_head, time2, d_k).
            scores (paddle.Tensor): Attention score, size
                (#batch, n_head, time1, time2).
            mask (paddle.Tensor): Mask, size (#batch, 1, time2) or
                (#batch, time1, time2).
        Returns:
            paddle.Tensor: Transformed value weighted
                by the attention score, (#batch, time1, d_model).
        """
        n_batch = value.size(0)
        if mask is not None:
            mask = mask.unsqueeze(1).eq(0)  # (batch, 1, *, time2)
            # Masked positions get -inf before softmax (zero probability);
            # the second fill re-zeroes those positions afterwards, which
            # also keeps rows that are entirely masked from producing NaNs.
            scores = scores.masked_fill(mask, -float('inf'))
            attn = paddle.softmax(
                scores, axis=-1).masked_fill(mask,
                                             0.0)  # (batch, head, time1, time2)
        else:
            attn = paddle.softmax(
                scores, axis=-1)  # (batch, head, time1, time2)

        p_attn = self.dropout(attn)
        x = paddle.matmul(p_attn, value)  # (batch, head, time1, d_k)
        # Merge the heads back into the feature dimension.
        x = x.transpose([0, 2, 1, 3]).contiguous().view(
            n_batch, -1, self.h * self.d_k)  # (batch, time1, d_model)

        return self.linear_out(x)  # (batch, time1, d_model)

    def forward(self,
                query: paddle.Tensor,
                key: paddle.Tensor,
                value: paddle.Tensor,
                mask: Optional[paddle.Tensor]) -> paddle.Tensor:
        """Compute scaled dot product attention.
        Args:
            query (paddle.Tensor): Query tensor (#batch, time1, size).
            key (paddle.Tensor): Key tensor (#batch, time2, size).
            value (paddle.Tensor): Value tensor (#batch, time2, size).
            mask (paddle.Tensor): Mask tensor (#batch, 1, time2) or
                (#batch, time1, time2).
        Returns:
            paddle.Tensor: Output tensor (#batch, time1, d_model).
        """
        q, k, v = self.forward_qkv(query, key, value)
        # Scale by sqrt(d_k) as in "Attention Is All You Need".
        scores = paddle.matmul(q,
                               k.transpose([0, 1, 3, 2])) / math.sqrt(self.d_k)
        return self.forward_attention(v, scores, mask)
|
||||||
|
|
||||||
|
|
||||||
|
class RelPositionMultiHeadedAttention(MultiHeadedAttention):
    """Multi-Head Attention layer with relative position encoding
    (Transformer-XL style, https://arxiv.org/abs/1901.02860)."""

    def __init__(self, n_head, n_feat, dropout_rate):
        """Construct an RelPositionMultiHeadedAttention object.
        Paper: https://arxiv.org/abs/1901.02860
        Args:
            n_head (int): The number of heads.
            n_feat (int): The number of features.
            dropout_rate (float): Dropout rate.
        """
        super().__init__(n_head, n_feat, dropout_rate)
        # linear transformation for positional encoding
        self.linear_pos = nn.Linear(n_feat, n_feat, bias_attr=False)
        # these two learnable bias are used in matrix c and matrix d
        # as described in https://arxiv.org/abs/1901.02860 Section 3.3;
        # create_parameter + XavierUniform is the paddle equivalent of
        # torch's nn.Parameter with xavier_uniform_ init.
        pos_bias_u = self.create_parameter(
            [self.h, self.d_k], default_initializer=I.XavierUniform())
        self.add_parameter('pos_bias_u', pos_bias_u)
        pos_bias_v = self.create_parameter(
            (self.h, self.d_k), default_initializer=I.XavierUniform())
        self.add_parameter('pos_bias_v', pos_bias_v)

    def rel_shift(self, x, zero_triu: bool=False):
        """Compute relative positinal encoding.
        Args:
            x (paddle.Tensor): Input tensor (batch, head, time1, time1).
            zero_triu (bool): If true, return the lower triangular part of
                the matrix.
        Returns:
            paddle.Tensor: Output tensor. (batch, head, time1, time1)

        NOTE(review): currently dead code — ``forward`` keeps its call
        commented out. ``paddle.cat(..., dim=...)`` is torch-style; the
        canonical paddle API is ``paddle.concat(..., axis=...)`` — confirm
        the compatibility shim provides ``cat`` before re-enabling this.
        """
        zero_pad = paddle.zeros(
            (x.size(0), x.size(1), x.size(2), 1), dtype=x.dtype)
        x_padded = paddle.cat([zero_pad, x], dim=-1)

        x_padded = x_padded.view(x.size(0), x.size(1), x.size(3) + 1, x.size(2))
        x = x_padded[:, :, 1:].view_as(x)  # [B, H, T1, T1]

        if zero_triu:
            ones = paddle.ones((x.size(2), x.size(3)))
            x = x * paddle.tril(ones, x.size(3) - x.size(2))[None, None, :, :]

        return x

    def forward(self,
                query: paddle.Tensor,
                key: paddle.Tensor,
                value: paddle.Tensor,
                pos_emb: paddle.Tensor,
                mask: Optional[paddle.Tensor]):
        """Compute 'Scaled Dot Product Attention' with rel. positional encoding.
        Args:
            query (paddle.Tensor): Query tensor (#batch, time1, size).
            key (paddle.Tensor): Key tensor (#batch, time2, size).
            value (paddle.Tensor): Value tensor (#batch, time2, size).
            pos_emb (paddle.Tensor): Positional embedding tensor
                (#batch, time1, size).
            mask (paddle.Tensor): Mask tensor (#batch, 1, time2) or
                (#batch, time1, time2).
        Returns:
            paddle.Tensor: Output tensor (#batch, time1, d_model).
        """
        q, k, v = self.forward_qkv(query, key, value)
        q = q.transpose([0, 2, 1, 3])  # (batch, time1, head, d_k)

        n_batch_pos = pos_emb.size(0)
        # Project the positional embedding and split it into heads.
        p = self.linear_pos(pos_emb).view(n_batch_pos, -1, self.h, self.d_k)
        p = p.transpose([0, 2, 1, 3])  # (batch, head, time1, d_k)

        # (batch, head, time1, d_k)
        q_with_bias_u = (q + self.pos_bias_u).transpose([0, 2, 1, 3])
        # (batch, head, time1, d_k)
        q_with_bias_v = (q + self.pos_bias_v).transpose([0, 2, 1, 3])

        # compute attention score
        # first compute matrix a and matrix c
        # as described in https://arxiv.org/abs/1901.02860 Section 3.3
        # (batch, head, time1, time2)
        matrix_ac = paddle.matmul(q_with_bias_u, k.transpose([0, 1, 3, 2]))

        # compute matrix b and matrix d
        # (batch, head, time1, time2)
        matrix_bd = paddle.matmul(q_with_bias_v, p.transpose([0, 1, 3, 2]))
        # Remove rel_shift since it is useless in speech recognition,
        # and it requires special attention for streaming.
        # matrix_bd = self.rel_shift(matrix_bd)

        scores = (matrix_ac + matrix_bd) / math.sqrt(
            self.d_k)  # (batch, head, time1, time2)

        return self.forward_attention(v, scores, mask)
|
@ -0,0 +1,51 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
import paddle
|
||||||
|
from paddle import nn
|
||||||
|
|
||||||
|
from deepspeech.utils.log import Log
|
||||||
|
|
||||||
|
logger = Log(__name__).getlog()
|
||||||
|
|
||||||
|
__all__ = ['GlobalCMVN']
|
||||||
|
|
||||||
|
|
||||||
|
class GlobalCMVN(nn.Layer):
    """Global cepstral mean and variance normalization layer."""

    def __init__(self,
                 mean: paddle.Tensor,
                 istd: paddle.Tensor,
                 norm_var: bool=True):
        """
        Args:
            mean (paddle.Tensor): mean stats
            istd (paddle.Tensor): inverse std, std which is 1.0 / std
            norm_var (bool): if False, only subtract the mean
        """
        super().__init__()
        assert mean.shape == istd.shape
        self.norm_var = norm_var
        # Registered as buffers: stored in the layer's state dict and
        # accessible as self.mean / self.istd, but not trainable.
        self.register_buffer("mean", mean)
        self.register_buffer("istd", istd)

    def forward(self, x: paddle.Tensor):
        """
        Args:
            x (paddle.Tensor): (batch, max_len, feat_dim)
        Returns:
            (paddle.Tensor): normalized feature
        """
        centered = x - self.mean
        if not self.norm_var:
            return centered
        return centered * self.istd
|
@ -0,0 +1,161 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
"""ConvolutionModule definition."""
|
||||||
|
from typing import Optional
|
||||||
|
from typing import Tuple
|
||||||
|
|
||||||
|
import paddle
|
||||||
|
from paddle import nn
|
||||||
|
from typeguard import check_argument_types
|
||||||
|
|
||||||
|
from deepspeech.utils.log import Log
|
||||||
|
|
||||||
|
logger = Log(__name__).getlog()
|
||||||
|
|
||||||
|
__all__ = ['ConvolutionModule']
|
||||||
|
|
||||||
|
|
||||||
|
class ConvolutionModule(nn.Layer):
    """ConvolutionModule in Conformer model.

    Pipeline (all on [B, C, T] internally): pointwise conv -> GLU ->
    depthwise conv -> norm -> activation -> pointwise conv.
    """

    def __init__(self,
                 channels: int,
                 kernel_size: int=15,
                 activation: nn.Layer=nn.ReLU(),
                 norm: str="batch_norm",
                 causal: bool=False,
                 bias: bool=True):
        """Construct an ConvolutionModule object.
        Args:
            channels (int): The number of channels of conv layers.
            kernel_size (int): Kernel size of conv layers.
            activation (nn.Layer): Activation Layer.
            norm (str): Normalization type, 'batch_norm' or 'layer_norm'
            causal (bool): Whether use causal convolution or not
            bias (bool): Whether Conv with bias or not
        """
        assert check_argument_types()
        super().__init__()
        self.pointwise_conv1 = nn.Conv1D(
            channels,
            2 * channels,  # doubled so the GLU split halves it back
            kernel_size=1,
            stride=1,
            padding=0,
            bias_attr=None
            if bias else False,  # None for True, using bias as default config
        )

        # self.lorder is used to distinguish if it's a causal convolution,
        # if self.lorder > 0:
        #    it's a causal convolution, the input will be padded with
        #    `self.lorder` frames on the left in forward (causal conv impl).
        # else: it's a symmetrical convolution
        if causal:
            padding = 0
            self.lorder = kernel_size - 1
        else:
            # kernel_size should be an odd number for none causal convolution
            assert (kernel_size - 1) % 2 == 0
            padding = (kernel_size - 1) // 2
            self.lorder = 0

        # groups=channels makes this a depthwise convolution.
        self.depthwise_conv = nn.Conv1D(
            channels,
            channels,
            kernel_size,
            stride=1,
            padding=padding,
            groups=channels,
            bias_attr=None
            if bias else False,  # None for True, using bias as default config
        )

        assert norm in ['batch_norm', 'layer_norm']
        if norm == "batch_norm":
            self.use_layer_norm = False
            self.norm = nn.BatchNorm1D(channels)
        else:
            self.use_layer_norm = True
            self.norm = nn.LayerNorm(channels)

        self.pointwise_conv2 = nn.Conv1D(
            channels,
            channels,
            kernel_size=1,
            stride=1,
            padding=0,
            bias_attr=None
            if bias else False,  # None for True, using bias as default config
        )
        self.activation = activation

    def forward(self,
                x: paddle.Tensor,
                mask_pad: Optional[paddle.Tensor]=None,
                cache: Optional[paddle.Tensor]=None
                ) -> Tuple[paddle.Tensor, paddle.Tensor]:
        """Compute convolution module.
        Args:
            x (paddle.Tensor): Input tensor (#batch, time, channels).
            mask_pad (paddle.Tensor): used for batch padding, (#batch, channels, time).
            cache (paddle.Tensor): left context cache, it is only
                used in causal convolution. (#batch, channels, time')
        Returns:
            paddle.Tensor: Output tensor (#batch, time, channels).
            paddle.Tensor: Output cache tensor (#batch, channels, time')
        """
        # exchange the temporal dimension and the feature dimension
        x = x.transpose([0, 2, 1])  # [B, C, T]

        # mask batch padding
        # NOTE(review): frames where mask_pad is True are zeroed, so the
        # mask is expected to be True at padded positions — verify callers.
        if mask_pad is not None:
            x = x.masked_fill(mask_pad, 0.0)

        if self.lorder > 0:
            if cache is None:
                # Left-pad `lorder` zero frames: the causal conv impl.
                x = nn.functional.pad(
                    x, (self.lorder, 0), 'constant', 0.0, data_format='NCL')
            else:
                assert cache.shape[0] == x.shape[0]  # B
                assert cache.shape[1] == x.shape[1]  # C
                # Prepend cached left context from the previous chunk.
                x = paddle.concat((cache, x), axis=2)

            assert (x.shape[2] > self.lorder)
            # Keep the last `lorder` frames as left context for next call.
            new_cache = x[:, :, -self.lorder:]  #[B, C, T]
        else:
            # It's better we just return None if no cache is requried,
            # However, for JIT export, here we just fake one tensor instead of
            # None.
            new_cache = paddle.zeros([1], dtype=x.dtype)

        # GLU mechanism
        x = self.pointwise_conv1(x)  # (batch, 2*channel, dim)
        x = nn.functional.glu(x, axis=1)  # (batch, channel, dim)

        # 1D Depthwise Conv
        x = self.depthwise_conv(x)
        # LayerNorm normalizes the last axis, so temporarily move channels
        # last; BatchNorm1D consumes [B, C, T] directly.
        if self.use_layer_norm:
            x = x.transpose([0, 2, 1])  # [B, T, C]
        x = self.activation(self.norm(x))
        if self.use_layer_norm:
            x = x.transpose([0, 2, 1])  # [B, C, T]
        x = self.pointwise_conv2(x)

        # mask batch padding
        if mask_pad is not None:
            x = x.masked_fill(mask_pad, 0.0)

        x = x.transpose([0, 2, 1])  # [B, T, C]
        return x, new_cache
|
@ -0,0 +1,182 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
"""Decoder definition."""
|
||||||
|
from typing import List
|
||||||
|
from typing import Optional
|
||||||
|
from typing import Tuple
|
||||||
|
|
||||||
|
import paddle
|
||||||
|
from paddle import nn
|
||||||
|
from typeguard import check_argument_types
|
||||||
|
|
||||||
|
from deepspeech.modules.attention import MultiHeadedAttention
|
||||||
|
from deepspeech.modules.decoder_layer import DecoderLayer
|
||||||
|
from deepspeech.modules.embedding import PositionalEncoding
|
||||||
|
from deepspeech.modules.mask import make_non_pad_mask
|
||||||
|
from deepspeech.modules.mask import subsequent_mask
|
||||||
|
from deepspeech.modules.positionwise_feed_forward import PositionwiseFeedForward
|
||||||
|
from deepspeech.utils.log import Log
|
||||||
|
|
||||||
|
logger = Log(__name__).getlog()
|
||||||
|
|
||||||
|
__all__ = ["TransformerDecoder"]
|
||||||
|
|
||||||
|
|
||||||
|
class TransformerDecoder(nn.Layer):
    """Base class of Transfomer decoder module.
    Args:
        vocab_size: output dim
        encoder_output_size: dimension of attention
        attention_heads: the number of heads of multi head attention
        linear_units: the hidden units number of position-wise feedforward
        num_blocks: the number of decoder blocks
        dropout_rate: dropout rate
        self_attention_dropout_rate: dropout rate for attention
        input_layer: input layer type, `embed`
        use_output_layer: whether to use output layer
        pos_enc_class: PositionalEncoding module
        normalize_before:
            True: use layer_norm before each sub-block of a layer.
            False: use layer_norm after each sub-block of a layer.
        concat_after: whether to concat attention layer's input and output
            True: x -> x + linear(concat(x, att(x)))
            False: x -> x + att(x)
    """

    def __init__(
            self,
            vocab_size: int,
            encoder_output_size: int,
            attention_heads: int=4,
            linear_units: int=2048,
            num_blocks: int=6,
            dropout_rate: float=0.1,
            positional_dropout_rate: float=0.1,
            self_attention_dropout_rate: float=0.0,
            src_attention_dropout_rate: float=0.0,
            input_layer: str="embed",
            use_output_layer: bool=True,
            normalize_before: bool=True,
            concat_after: bool=False, ):

        assert check_argument_types()
        super().__init__()
        attention_dim = encoder_output_size

        if input_layer == "embed":
            self.embed = nn.Sequential(
                nn.Embedding(vocab_size, attention_dim),
                PositionalEncoding(attention_dim, positional_dropout_rate), )
        else:
            raise ValueError(f"only 'embed' is supported: {input_layer}")

        self.normalize_before = normalize_before
        self.after_norm = nn.LayerNorm(attention_dim, epsilon=1e-12)
        self.use_output_layer = use_output_layer
        self.output_layer = nn.Linear(attention_dim, vocab_size)

        # Use paddle's native LayerList (not the torch-style ModuleList
        # name) so the sublayers are registered without relying on the
        # compatibility shim.
        self.decoders = nn.LayerList([
            DecoderLayer(
                size=attention_dim,
                self_attn=MultiHeadedAttention(attention_heads, attention_dim,
                                               self_attention_dropout_rate),
                src_attn=MultiHeadedAttention(attention_heads, attention_dim,
                                              src_attention_dropout_rate),
                feed_forward=PositionwiseFeedForward(
                    attention_dim, linear_units, dropout_rate),
                dropout_rate=dropout_rate,
                normalize_before=normalize_before,
                concat_after=concat_after, ) for _ in range(num_blocks)
        ])

    def forward(
            self,
            memory: paddle.Tensor,
            memory_mask: paddle.Tensor,
            ys_in_pad: paddle.Tensor,
            ys_in_lens: paddle.Tensor, ) -> Tuple[paddle.Tensor, paddle.Tensor]:
        """Forward decoder.
        Args:
            memory: encoded memory, float32  (batch, maxlen_in, feat)
            memory_mask: encoder memory mask, (batch, 1, maxlen_in)
            ys_in_pad: padded input token ids, int64 (batch, maxlen_out)
            ys_in_lens: input lengths of this batch (batch)
        Returns:
            (tuple): tuple containing:
                x: decoded token score before softmax (batch, maxlen_out, vocab_size)
                    if use_output_layer is True,
                olens: (batch, )
        """
        tgt = ys_in_pad
        # tgt_mask: (B, 1, L)
        tgt_mask = (make_non_pad_mask(ys_in_lens).unsqueeze(1))
        # m: (1, L, L)
        m = subsequent_mask(tgt_mask.size(-1)).unsqueeze(0)
        # tgt_mask: (B, L, L)
        # TODO(Hui Zhang): not support & for tensor
        # tgt_mask = tgt_mask & m
        tgt_mask = tgt_mask.logical_and(m)

        x, _ = self.embed(tgt)
        for layer in self.decoders:
            x, tgt_mask, memory, memory_mask = layer(x, tgt_mask, memory,
                                                     memory_mask)
        if self.normalize_before:
            x = self.after_norm(x)
        if self.use_output_layer:
            x = self.output_layer(x)

        # TODO(Hui Zhang): reduce_sum not support bool type
        # olens = tgt_mask.sum(1)
        # Use an explicit paddle dtype: `paddle.int` is not a paddle dtype
        # alias, only int32/int64 are.
        olens = tgt_mask.astype(paddle.int64).sum(1)
        return x, olens

    def forward_one_step(
            self,
            memory: paddle.Tensor,
            memory_mask: paddle.Tensor,
            tgt: paddle.Tensor,
            tgt_mask: paddle.Tensor,
            cache: Optional[List[paddle.Tensor]]=None,
    ) -> Tuple[paddle.Tensor, List[paddle.Tensor]]:
        """Forward one step.
        This is only used for decoding.
        Args:
            memory: encoded memory, float32  (batch, maxlen_in, feat)
            memory_mask: encoded memory mask, (batch, 1, maxlen_in)
            tgt: input token ids, int64 (batch, maxlen_out)
            tgt_mask: input token mask,  (batch, maxlen_out, maxlen_out)
                      dtype=paddle.bool
            cache: cached output list of (batch, max_time_out-1, size)
        Returns:
            y, cache: NN output value and cache per `self.decoders`.
            y.shape` is (batch, token)
        """
        x, _ = self.embed(tgt)
        new_cache = []
        for i, decoder in enumerate(self.decoders):
            if cache is None:
                c = None
            else:
                c = cache[i]
            x, tgt_mask, memory, memory_mask = decoder(
                x, tgt_mask, memory, memory_mask, cache=c)
            new_cache.append(x)
        if self.normalize_before:
            y = self.after_norm(x[:, -1])
        else:
            y = x[:, -1]
        if self.use_output_layer:
            # canonical paddle API (paddle.log_softmax is torch-style)
            y = nn.functional.log_softmax(self.output_layer(y), axis=-1)
        return y, new_cache
|
@ -0,0 +1,151 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
"""Decoder self-attention layer definition."""
|
||||||
|
from typing import Optional
|
||||||
|
from typing import Tuple
|
||||||
|
|
||||||
|
import paddle
|
||||||
|
from paddle import nn
|
||||||
|
|
||||||
|
from deepspeech.utils.log import Log
|
||||||
|
|
||||||
|
logger = Log(__name__).getlog()
|
||||||
|
|
||||||
|
__all__ = ["DecoderLayer"]
|
||||||
|
|
||||||
|
|
||||||
|
class DecoderLayer(nn.Layer):
    """Single decoder layer module.
    Args:
        size (int): Input dimension.
        self_attn (nn.Layer): Self-attention module instance.
            `MultiHeadedAttention` instance can be used as the argument.
        src_attn (nn.Layer): Self-attention module instance.
            `MultiHeadedAttention` instance can be used as the argument.
        feed_forward (nn.Layer): Feed-forward module instance.
            `PositionwiseFeedForward` instance can be used as the argument.
        dropout_rate (float): Dropout rate.
        normalize_before (bool):
            True: use layer_norm before each sub-block.
            False: to use layer_norm after each sub-block.
        concat_after (bool): Whether to concat attention layer's input
            and output.
            True: x -> x + linear(concat(x, att(x)))
            False: x -> x + att(x)
    """

    def __init__(
            self,
            size: int,
            self_attn: nn.Layer,
            src_attn: nn.Layer,
            feed_forward: nn.Layer,
            dropout_rate: float,
            normalize_before: bool=True,
            concat_after: bool=False, ):
        """Construct an DecoderLayer object."""
        super().__init__()
        self.size = size
        self.self_attn = self_attn
        self.src_attn = src_attn
        self.feed_forward = feed_forward
        self.norm1 = nn.LayerNorm(size, epsilon=1e-12)
        self.norm2 = nn.LayerNorm(size, epsilon=1e-12)
        self.norm3 = nn.LayerNorm(size, epsilon=1e-12)
        self.dropout = nn.Dropout(dropout_rate)
        self.normalize_before = normalize_before
        self.concat_after = concat_after
        self.concat_linear1 = nn.Linear(size + size, size)
        self.concat_linear2 = nn.Linear(size + size, size)

    def forward(
            self,
            tgt: paddle.Tensor,
            tgt_mask: paddle.Tensor,
            memory: paddle.Tensor,
            memory_mask: paddle.Tensor,
            cache: Optional[paddle.Tensor]=None
    ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor, paddle.Tensor]:
        """Compute decoded features.
        Args:
            tgt (paddle.Tensor): Input tensor (#batch, maxlen_out, size).
            tgt_mask (paddle.Tensor): Mask for input tensor
                (#batch, maxlen_out).
            memory (paddle.Tensor): Encoded memory
                (#batch, maxlen_in, size).
            memory_mask (paddle.Tensor): Encoded memory mask
                (#batch, maxlen_in).
            cache (paddle.Tensor): cached tensors.
                (#batch, maxlen_out - 1, size).
        Returns:
            paddle.Tensor: Output tensor (#batch, maxlen_out, size).
            paddle.Tensor: Mask for output tensor (#batch, maxlen_out).
            paddle.Tensor: Encoded memory (#batch, maxlen_in, size).
            paddle.Tensor: Encoded memory mask (#batch, maxlen_in).
        """
        # ---- masked self-attention sub-block (with pre/post layer norm)
        residual = tgt
        if self.normalize_before:
            tgt = self.norm1(tgt)

        if cache is None:
            tgt_q = tgt
            tgt_q_mask = tgt_mask
        else:
            # compute only the last frame query keeping dim: max_time_out -> 1
            assert cache.shape == [
                tgt.shape[0],
                tgt.shape[1] - 1,
                self.size,
            ], f"{cache.shape} == {[tgt.shape[0], tgt.shape[1] - 1, self.size]}"
            tgt_q = tgt[:, -1:, :]
            residual = residual[:, -1:, :]
            # TODO(Hui Zhang): slice not support bool type
            # tgt_q_mask = tgt_mask[:, -1:, :]
            tgt_q_mask = tgt_mask.cast(paddle.int64)[:, -1:, :].cast(
                paddle.bool)

        # Use canonical paddle.concat(axis=...) rather than the torch-style
        # paddle.cat(dim=...) compatibility alias.
        if self.concat_after:
            tgt_concat = paddle.concat(
                (tgt_q, self.self_attn(tgt_q, tgt, tgt, tgt_q_mask)), axis=-1)
            x = residual + self.concat_linear1(tgt_concat)
        else:
            x = residual + self.dropout(
                self.self_attn(tgt_q, tgt, tgt, tgt_q_mask))
        if not self.normalize_before:
            x = self.norm1(x)

        # ---- source (encoder-decoder) attention sub-block
        residual = x
        if self.normalize_before:
            x = self.norm2(x)
        if self.concat_after:
            x_concat = paddle.concat(
                (x, self.src_attn(x, memory, memory, memory_mask)), axis=-1)
            x = residual + self.concat_linear2(x_concat)
        else:
            x = residual + self.dropout(
                self.src_attn(x, memory, memory, memory_mask))
        if not self.normalize_before:
            x = self.norm2(x)

        # ---- position-wise feed-forward sub-block
        residual = x
        if self.normalize_before:
            x = self.norm3(x)
        x = residual + self.dropout(self.feed_forward(x))
        if not self.normalize_before:
            x = self.norm3(x)

        # Re-attach the cached prefix so callers always receive the full
        # (#batch, maxlen_out, size) sequence.
        if cache is not None:
            x = paddle.concat([cache, x], axis=1)

        return x, tgt_mask, memory, memory_mask
|
@ -0,0 +1,448 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
"""Encoder definition."""
|
||||||
|
from typing import List
|
||||||
|
from typing import Optional
|
||||||
|
from typing import Tuple
|
||||||
|
|
||||||
|
import paddle
|
||||||
|
from paddle import nn
|
||||||
|
from typeguard import check_argument_types
|
||||||
|
|
||||||
|
from deepspeech.modules.activation import get_activation
|
||||||
|
from deepspeech.modules.attention import MultiHeadedAttention
|
||||||
|
from deepspeech.modules.attention import RelPositionMultiHeadedAttention
|
||||||
|
from deepspeech.modules.conformer_convolution import ConvolutionModule
|
||||||
|
from deepspeech.modules.embedding import PositionalEncoding
|
||||||
|
from deepspeech.modules.embedding import RelPositionalEncoding
|
||||||
|
from deepspeech.modules.encoder_layer import ConformerEncoderLayer
|
||||||
|
from deepspeech.modules.encoder_layer import TransformerEncoderLayer
|
||||||
|
from deepspeech.modules.mask import add_optional_chunk_mask
|
||||||
|
from deepspeech.modules.mask import make_non_pad_mask
|
||||||
|
from deepspeech.modules.positionwise_feed_forward import PositionwiseFeedForward
|
||||||
|
from deepspeech.modules.subsampling import Conv2dSubsampling4
|
||||||
|
from deepspeech.modules.subsampling import Conv2dSubsampling6
|
||||||
|
from deepspeech.modules.subsampling import Conv2dSubsampling8
|
||||||
|
from deepspeech.modules.subsampling import LinearNoSubsampling
|
||||||
|
from deepspeech.utils.log import Log
|
||||||
|
|
||||||
|
logger = Log(__name__).getlog()
|
||||||
|
|
||||||
|
__all__ = ["BaseEncoder", 'TransformerEncoder', "ConformerEncoder"]
|
||||||
|
|
||||||
|
|
||||||
|
class BaseEncoder(nn.Layer):
    """Shared base for Transformer/Conformer encoders.

    Handles input subsampling, positional encoding, optional global CMVN,
    chunk masking for streaming, and the encoder-layer stack (built by
    subclasses into ``self.encoders``).
    """

    def __init__(
            self,
            input_size: int,
            output_size: int=256,
            attention_heads: int=4,
            linear_units: int=2048,
            num_blocks: int=6,
            dropout_rate: float=0.1,
            positional_dropout_rate: float=0.1,
            attention_dropout_rate: float=0.0,
            input_layer: str="conv2d",
            pos_enc_layer_type: str="abs_pos",
            normalize_before: bool=True,
            concat_after: bool=False,
            static_chunk_size: int=0,
            use_dynamic_chunk: bool=False,
            global_cmvn: paddle.nn.Layer=None,
            use_dynamic_left_chunk: bool=False, ):
        """
        Args:
            input_size (int): input dim, d_feature
            output_size (int): dimension of attention, d_model
            attention_heads (int): the number of heads of multi head attention
            linear_units (int): the hidden units number of position-wise feed
                forward
            num_blocks (int): the number of encoder blocks
            dropout_rate (float): dropout rate
            attention_dropout_rate (float): dropout rate in attention
            positional_dropout_rate (float): dropout rate after adding
                positional encoding
            input_layer (str): input layer type.
                optional [linear, conv2d, conv2d6, conv2d8]
            pos_enc_layer_type (str): Encoder positional encoding layer type.
                opitonal [abs_pos, scaled_abs_pos, rel_pos]
            normalize_before (bool):
                True: use layer_norm before each sub-block of a layer.
                False: use layer_norm after each sub-block of a layer.
            concat_after (bool): whether to concat attention layer's input
                and output.
                True: x -> x + linear(concat(x, att(x)))
                False: x -> x + att(x)
            static_chunk_size (int): chunk size for static chunk training and
                decoding
            use_dynamic_chunk (bool): whether use dynamic chunk size for
                training or not, You can only use fixed chunk(chunk_size > 0)
                or dyanmic chunk size(use_dynamic_chunk = True)
            global_cmvn (Optional[paddle.nn.Layer]): Optional GlobalCMVN layer
            use_dynamic_left_chunk (bool): whether use dynamic left chunk in
                dynamic chunk training
        """
        assert check_argument_types()
        super().__init__()
        self._output_size = output_size

        # Map config strings to positional-encoding classes.
        if pos_enc_layer_type == "abs_pos":
            pos_enc_class = PositionalEncoding
        elif pos_enc_layer_type == "rel_pos":
            pos_enc_class = RelPositionalEncoding
        else:
            raise ValueError("unknown pos_enc_layer: " + pos_enc_layer_type)

        # Map config strings to subsampling front-ends (1x/4x/6x/8x).
        if input_layer == "linear":
            subsampling_class = LinearNoSubsampling
        elif input_layer == "conv2d":
            subsampling_class = Conv2dSubsampling4
        elif input_layer == "conv2d6":
            subsampling_class = Conv2dSubsampling6
        elif input_layer == "conv2d8":
            subsampling_class = Conv2dSubsampling8
        else:
            raise ValueError("unknown input_layer: " + input_layer)

        self.global_cmvn = global_cmvn
        self.embed = subsampling_class(
            idim=input_size,
            odim=output_size,
            dropout_rate=dropout_rate,
            pos_enc_class=pos_enc_class(
                d_model=output_size, dropout_rate=positional_dropout_rate), )

        self.normalize_before = normalize_before
        # Final LayerNorm, applied only in pre-norm mode (see forward).
        self.after_norm = nn.LayerNorm(output_size, epsilon=1e-12)
        self.static_chunk_size = static_chunk_size
        self.use_dynamic_chunk = use_dynamic_chunk
        self.use_dynamic_left_chunk = use_dynamic_left_chunk

    def output_size(self) -> int:
        # d_model of the encoder output.
        return self._output_size

    def forward(
            self,
            xs: paddle.Tensor,
            xs_lens: paddle.Tensor,
            decoding_chunk_size: int=0,
            num_decoding_left_chunks: int=-1,
    ) -> Tuple[paddle.Tensor, paddle.Tensor]:
        """Embed positions in tensor.
        Args:
            xs: padded input tensor (B, L, D)
            xs_lens: input length (B)
            decoding_chunk_size: decoding chunk size for dynamic chunk
                0: default for training, use random dynamic chunk.
                <0: for decoding, use full chunk.
                >0: for decoding, use fixed chunk size as set.
            num_decoding_left_chunks: number of left chunks, this is for decoding,
                the chunk size is decoding_chunk_size.
                >=0: use num_decoding_left_chunks
                <0: use all left chunks
        Returns:
            encoder output tensor, lens and mask
        """
        masks = make_non_pad_mask(xs_lens).unsqueeze(1)  # (B, 1, L)

        if self.global_cmvn is not None:
            xs = self.global_cmvn(xs)
        #TODO(Hui Zhang): self.embed(xs, masks, offset=0), stride_slice not support bool tensor
        xs, pos_emb, masks = self.embed(xs, masks.type_as(xs), offset=0)
        #TODO(Hui Zhang): remove mask.astype, stride_slice not support bool tensor
        masks = masks.astype(paddle.bool)
        #TODO(Hui Zhang): mask_pad = ~masks
        mask_pad = masks.logical_not()
        # Chunk mask restricts attention to the configured chunk window
        # (random chunks in training, fixed chunks in decoding).
        chunk_masks = add_optional_chunk_mask(
            xs, masks, self.use_dynamic_chunk, self.use_dynamic_left_chunk,
            decoding_chunk_size, self.static_chunk_size,
            num_decoding_left_chunks)
        for layer in self.encoders:
            xs, chunk_masks, _ = layer(xs, chunk_masks, pos_emb, mask_pad)
        if self.normalize_before:
            xs = self.after_norm(xs)
        # Here we assume the mask is not changed in encoder layers, so just
        # return the masks before encoder layers, and the masks will be used
        # for cross attention with decoder later
        return xs, masks

    def forward_chunk(
            self,
            xs: paddle.Tensor,
            offset: int,
            required_cache_size: int,
            subsampling_cache: Optional[paddle.Tensor]=None,
            elayers_output_cache: Optional[List[paddle.Tensor]]=None,
            conformer_cnn_cache: Optional[List[paddle.Tensor]]=None,
    ) -> Tuple[paddle.Tensor, paddle.Tensor, List[paddle.Tensor], List[
            paddle.Tensor]]:
        """ Forward just one chunk
        Args:
            xs (paddle.Tensor): chunk input, [B=1, T, D]
            offset (int): current offset in encoder output time stamp
            required_cache_size (int): cache size required for next chunk
                compuation
                >=0: actual cache size
                <0: means all history cache is required
            subsampling_cache (Optional[paddle.Tensor]): subsampling cache
            elayers_output_cache (Optional[List[paddle.Tensor]]):
                transformer/conformer encoder layers output cache
            conformer_cnn_cache (Optional[List[paddle.Tensor]]): conformer
                cnn cache
        Returns:
            paddle.Tensor: output of current input xs
            paddle.Tensor: subsampling cache required for next chunk computation
            List[paddle.Tensor]: encoder layers output cache required for next
                chunk computation
            List[paddle.Tensor]: conformer cnn cache
        """
        assert xs.size(0) == 1  # batch size must be one
        # tmp_masks is just for interface compatibility
        tmp_masks = paddle.ones([1, xs.size(1)], dtype=paddle.bool)
        tmp_masks = tmp_masks.unsqueeze(1)  #[B=1, C=1, T]

        if self.global_cmvn is not None:
            xs = self.global_cmvn(xs)

        xs, pos_emb, _ = self.embed(
            xs, tmp_masks, offset=offset)  #xs=(B, T, D), pos_emb=(B=1, T, D)
        # Prepend cached subsampled frames so attention sees left context.
        if subsampling_cache is not None:
            cache_size = subsampling_cache.size(1)  #T
            xs = paddle.cat((subsampling_cache, xs), dim=1)
        else:
            cache_size = 0
        # Re-derive position encoding over cache + current chunk.
        pos_emb = self.embed.position_encoding(
            offset=offset - cache_size, size=xs.size(1))

        # Decide how many trailing frames to carry into the next chunk.
        if required_cache_size < 0:
            next_cache_start = 0
        elif required_cache_size == 0:
            next_cache_start = xs.size(1)
        else:
            next_cache_start = xs.size(1) - required_cache_size
        r_subsampling_cache = xs[:, next_cache_start:, :]

        # Real mask for transformer/conformer layers
        masks = paddle.ones([1, xs.size(1)], dtype=paddle.bool)
        masks = masks.unsqueeze(1)  #[B=1, C=1, T]
        r_elayers_output_cache = []
        r_conformer_cnn_cache = []
        for i, layer in enumerate(self.encoders):
            attn_cache = None if elayers_output_cache is None else elayers_output_cache[
                i]
            cnn_cache = None if conformer_cnn_cache is None else conformer_cnn_cache[
                i]
            xs, _, new_cnn_cache = layer(
                xs,
                masks,
                pos_emb,
                output_cache=attn_cache,
                cnn_cache=cnn_cache)
            r_elayers_output_cache.append(xs[:, next_cache_start:, :])
            r_conformer_cnn_cache.append(new_cnn_cache)
        if self.normalize_before:
            xs = self.after_norm(xs)

        # Strip the cached prefix: only the current chunk's frames are output.
        return (xs[:, cache_size:, :], r_subsampling_cache,
                r_elayers_output_cache, r_conformer_cnn_cache)

    def forward_chunk_by_chunk(
            self,
            xs: paddle.Tensor,
            decoding_chunk_size: int,
            num_decoding_left_chunks: int=-1,
    ) -> Tuple[paddle.Tensor, paddle.Tensor]:
        """ Forward input chunk by chunk with chunk_size like a streaming
            fashion
        Here we should pay special attention to computation cache in the
        streaming style forward chunk by chunk. Three things should be taken
        into account for computation in the current network:
            1. transformer/conformer encoder layers output cache
            2. convolution in conformer
            3. convolution in subsampling
        However, we don't implement subsampling cache for:
            1. We can control subsampling module to output the right result by
               overlapping input instead of cache left context, even though it
               wastes some computation, but subsampling only takes a very
               small fraction of computation in the whole model.
            2. Typically, there are several covolution layers with subsampling
               in subsampling module, it is tricky and complicated to do cache
               with different convolution layers with different subsampling
               rate.
            3. Currently, nn.Sequential is used to stack all the convolution
               layers in subsampling, we need to rewrite it to make it work
               with cache, which is not prefered.
        Args:
            xs (paddle.Tensor): (1, max_len, dim)
            chunk_size (int): decoding chunk size.
            num_left_chunks (int): decoding with num left chunks.
        """
        assert decoding_chunk_size > 0
        # The model is trained by static or dynamic chunk
        assert self.static_chunk_size > 0 or self.use_dynamic_chunk

        # feature stride and window for `subsampling` module
        subsampling = self.embed.subsampling_rate
        context = self.embed.right_context + 1  # Add current frame
        stride = subsampling * decoding_chunk_size
        decoding_window = (decoding_chunk_size - 1) * subsampling + context

        num_frames = xs.size(1)
        required_cache_size = decoding_chunk_size * num_decoding_left_chunks
        subsampling_cache: Optional[paddle.Tensor] = None
        elayers_output_cache: Optional[List[paddle.Tensor]] = None
        conformer_cnn_cache: Optional[List[paddle.Tensor]] = None
        outputs = []
        offset = 0
        # Feed forward overlap input step by step
        for cur in range(0, num_frames - context + 1, stride):
            end = min(cur + decoding_window, num_frames)
            chunk_xs = xs[:, cur:end, :]
            (y, subsampling_cache, elayers_output_cache,
             conformer_cnn_cache) = self.forward_chunk(
                 chunk_xs, offset, required_cache_size, subsampling_cache,
                 elayers_output_cache, conformer_cnn_cache)
            outputs.append(y)
            offset += y.size(1)
        ys = paddle.cat(outputs, 1)
        # fake mask, just for jit script and compatibility with `forward` api
        masks = paddle.ones([1, ys.size(1)], dtype=paddle.bool)
        masks = masks.unsqueeze(1)
        return ys, masks
|
||||||
|
|
||||||
|
|
||||||
|
class TransformerEncoder(BaseEncoder):
    """Transformer encoder module."""

    def __init__(
            self,
            input_size: int,
            output_size: int=256,
            attention_heads: int=4,
            linear_units: int=2048,
            num_blocks: int=6,
            dropout_rate: float=0.1,
            positional_dropout_rate: float=0.1,
            attention_dropout_rate: float=0.0,
            input_layer: str="conv2d",
            pos_enc_layer_type: str="abs_pos",
            normalize_before: bool=True,
            concat_after: bool=False,
            static_chunk_size: int=0,
            use_dynamic_chunk: bool=False,
            global_cmvn: nn.Layer=None,
            use_dynamic_left_chunk: bool=False, ):
        """ Construct TransformerEncoder
        See Encoder for the meaning of each parameter.
        """
        assert check_argument_types()
        super().__init__(input_size, output_size, attention_heads, linear_units,
                         num_blocks, dropout_rate, positional_dropout_rate,
                         attention_dropout_rate, input_layer,
                         pos_enc_layer_type, normalize_before, concat_after,
                         static_chunk_size, use_dynamic_chunk, global_cmvn,
                         use_dynamic_left_chunk)
        # Build the stack consumed by BaseEncoder.forward / forward_chunk.
        # NOTE(review): `nn.ModuleList` is torch naming — presumably aliased to
        # paddle's LayerList by the project's compatibility hack; verify.
        self.encoders = nn.ModuleList([
            TransformerEncoderLayer(
                size=output_size,
                self_attn=MultiHeadedAttention(attention_heads, output_size,
                                               attention_dropout_rate),
                feed_forward=PositionwiseFeedForward(output_size, linear_units,
                                                     dropout_rate),
                dropout_rate=dropout_rate,
                normalize_before=normalize_before,
                concat_after=concat_after) for _ in range(num_blocks)
        ])
|
||||||
|
|
||||||
|
|
||||||
|
class ConformerEncoder(BaseEncoder):
    """Conformer encoder module."""

    def __init__(
            self,
            input_size: int,
            output_size: int=256,
            attention_heads: int=4,
            linear_units: int=2048,
            num_blocks: int=6,
            dropout_rate: float=0.1,
            positional_dropout_rate: float=0.1,
            attention_dropout_rate: float=0.0,
            input_layer: str="conv2d",
            pos_enc_layer_type: str="rel_pos",
            normalize_before: bool=True,
            concat_after: bool=False,
            static_chunk_size: int=0,
            use_dynamic_chunk: bool=False,
            global_cmvn: nn.Layer=None,
            use_dynamic_left_chunk: bool=False,
            positionwise_conv_kernel_size: int=1,
            macaron_style: bool=True,
            selfattention_layer_type: str="rel_selfattn",
            activation_type: str="swish",
            use_cnn_module: bool=True,
            cnn_module_kernel: int=15,
            causal: bool=False,
            cnn_module_norm: str="batch_norm", ):
        """Construct ConformerEncoder
        Args:
            input_size to use_dynamic_chunk, see in BaseEncoder
            positionwise_conv_kernel_size (int): Kernel size of positionwise
                conv1d layer.
            macaron_style (bool): Whether to use macaron style for
                positionwise layer.
            selfattention_layer_type (str): Encoder attention layer type,
                the parameter has no effect now, it's just for configure
                compatibility.
            activation_type (str): Encoder activation function type.
            use_cnn_module (bool): Whether to use convolution module.
            cnn_module_kernel (int): Kernel size of convolution module.
            causal (bool): whether to use causal convolution or not.
            cnn_module_norm (str): cnn conv norm type, Optional['batch_norm','layer_norm']
        """
        assert check_argument_types()
        super().__init__(input_size, output_size, attention_heads, linear_units,
                         num_blocks, dropout_rate, positional_dropout_rate,
                         attention_dropout_rate, input_layer,
                         pos_enc_layer_type, normalize_before, concat_after,
                         static_chunk_size, use_dynamic_chunk, global_cmvn,
                         use_dynamic_left_chunk)
        activation = get_activation(activation_type)

        # self-attention module definition
        encoder_selfattn_layer = RelPositionMultiHeadedAttention
        encoder_selfattn_layer_args = (attention_heads, output_size,
                                       attention_dropout_rate)
        # feed-forward module definition
        positionwise_layer = PositionwiseFeedForward
        positionwise_layer_args = (output_size, linear_units, dropout_rate,
                                   activation)
        # convolution module definition
        convolution_layer = ConvolutionModule
        convolution_layer_args = (output_size, cnn_module_kernel, activation,
                                  cnn_module_norm, causal)

        # One Conformer block = self-attn + (macaron) FFN(s) + optional conv.
        self.encoders = nn.ModuleList([
            ConformerEncoderLayer(
                size=output_size,
                self_attn=encoder_selfattn_layer(*encoder_selfattn_layer_args),
                feed_forward=positionwise_layer(*positionwise_layer_args),
                feed_forward_macaron=positionwise_layer(
                    *positionwise_layer_args) if macaron_style else None,
                conv_module=convolution_layer(*convolution_layer_args)
                if use_cnn_module else None,
                dropout_rate=dropout_rate,
                normalize_before=normalize_before,
                concat_after=concat_after) for _ in range(num_blocks)
        ])
|
@ -0,0 +1,57 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
"""Positionwise feed forward layer definition."""
|
||||||
|
import paddle
|
||||||
|
from paddle import nn
|
||||||
|
|
||||||
|
from deepspeech.utils.log import Log
|
||||||
|
|
||||||
|
logger = Log(__name__).getlog()
|
||||||
|
|
||||||
|
__all__ = ["PositionwiseFeedForward"]
|
||||||
|
|
||||||
|
|
||||||
|
class PositionwiseFeedForward(nn.Layer):
    """Position-wise feed-forward network.

    Applies the same two-layer MLP (linear -> activation -> dropout ->
    linear) independently at every time step; output dim equals input dim.
    """

    def __init__(self,
                 idim: int,
                 hidden_units: int,
                 dropout_rate: float,
                 activation: nn.Layer=nn.ReLU()):
        """Construct a PositionwiseFeedForward object.

        Args:
            idim (int): Input dimenstion.
            hidden_units (int): The number of hidden units.
            dropout_rate (float): Dropout rate.
            activation (paddle.nn.Layer): Activation function
        """
        super().__init__()
        self.w_1 = nn.Linear(idim, hidden_units)
        self.activation = activation
        self.dropout = nn.Dropout(dropout_rate)
        self.w_2 = nn.Linear(hidden_units, idim)

    def forward(self, xs: paddle.Tensor) -> paddle.Tensor:
        """Forward function.
        Args:
            xs: input tensor (B, Lmax, D)
        Returns:
            output tensor, (B, Lmax, D)
        """
        hidden = self.activation(self.w_1(xs))
        return self.w_2(self.dropout(hidden))
|
@ -0,0 +1,239 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
"""Subsampling layer definition."""
|
||||||
|
from typing import Tuple
|
||||||
|
|
||||||
|
import paddle
|
||||||
|
from paddle import nn
|
||||||
|
|
||||||
|
from deepspeech.modules.embedding import PositionalEncoding
|
||||||
|
from deepspeech.utils.log import Log
|
||||||
|
|
||||||
|
logger = Log(__name__).getlog()
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
"LinearNoSubsampling", "Conv2dSubsampling4", "Conv2dSubsampling6",
|
||||||
|
"Conv2dSubsampling8"
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
class BaseSubsampling(nn.Layer):
    """Base class for input subsampling front-ends.

    Holds the positional-encoding module plus the two constants streaming
    decoding needs: `subsampling_rate` (time-axis stride of the front-end)
    and `right_context` (extra future frames each output frame depends on).
    """

    def __init__(self, pos_enc_class: nn.Layer=PositionalEncoding):
        super().__init__()
        # Positional-encoding *instance* applied after subsampling.
        self.pos_enc = pos_enc_class
        # window size = (1 + right_context) + (chunk_size -1) * subsampling_rate
        self.right_context = 0
        # stride = subsampling_rate * chunk_size
        self.subsampling_rate = 1

    def position_encoding(self, offset: int, size: int) -> paddle.Tensor:
        # Expose the underlying encoding so streaming decode can re-derive
        # positions for cache + current chunk (see BaseEncoder.forward_chunk).
        return self.pos_enc.position_encoding(offset, size)
|
||||||
|
|
||||||
|
|
||||||
|
class LinearNoSubsampling(BaseSubsampling):
    """Linear transform the input without subsampling."""

    def __init__(self,
                 idim: int,
                 odim: int,
                 dropout_rate: float,
                 pos_enc_class: nn.Layer=PositionalEncoding):
        """Construct an linear object.
        Args:
            idim (int): Input dimension.
            odim (int): Output dimension.
            dropout_rate (float): Dropout rate.
            pos_enc_class (PositionalEncoding): position encoding class
        """
        super().__init__(pos_enc_class)
        # Projection + LayerNorm + Dropout; time axis is left untouched.
        self.out = nn.Sequential(
            nn.Linear(idim, odim),
            nn.LayerNorm(odim, epsilon=1e-12),
            nn.Dropout(dropout_rate), )
        # No subsampling: identity stride, no future-frame dependence.
        self.right_context = 0
        self.subsampling_rate = 1

    def forward(self, x: paddle.Tensor, x_mask: paddle.Tensor, offset: int=0
                ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]:
        """Input x.
        Args:
            x (paddle.Tensor): Input tensor (#batch, time, idim).
            x_mask (paddle.Tensor): Input mask (#batch, 1, time).
            offset (int): position encoding offset.
        Returns:
            paddle.Tensor: linear input tensor (#batch, time', odim),
                where time' = time .
            paddle.Tensor: positional encoding
            paddle.Tensor: linear input mask (#batch, 1, time'),
                where time' = time .
        """
        projected = self.out(x)
        encoded, pos_emb = self.pos_enc(projected, offset)
        # Time length unchanged, so the incoming mask passes straight through.
        return encoded, pos_emb, x_mask
|
||||||
|
|
||||||
|
|
||||||
|
class Conv2dSubsampling4(BaseSubsampling):
    """Convolutional 2D subsampling (to 1/4 length)."""

    def __init__(self,
                 idim: int,
                 odim: int,
                 dropout_rate: float,
                 pos_enc_class: nn.Layer=PositionalEncoding):
        """Construct an Conv2dSubsampling4 object.

        Args:
            idim (int): Input dimension.
            odim (int): Output dimension.
            dropout_rate (float): Dropout rate.
        """
        super().__init__(pos_enc_class)
        # Two stride-2 convs: each halves the time axis -> 1/4 overall.
        self.conv = nn.Sequential(
            nn.Conv2D(1, odim, 3, 2),
            nn.ReLU(),
            nn.Conv2D(odim, odim, 3, 2),
            nn.ReLU(), )
        # Flatten (channels x remaining freq bins) back to odim.
        self.out = nn.Sequential(
            nn.Linear(odim * (((idim - 1) // 2 - 1) // 2), odim))
        self.subsampling_rate = 4
        # The right context for every conv layer is computed by:
        # (kernel_size - 1) / 2 * stride * frame_rate_of_this_layer
        # 6 = (3 - 1) / 2 * 2 * 1 + (3 - 1) / 2 * 2 * 2
        self.right_context = 6

    def forward(self, x: paddle.Tensor, x_mask: paddle.Tensor, offset: int=0
                ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]:
        """Subsample x.
        Args:
            x (paddle.Tensor): Input tensor (#batch, time, idim).
            x_mask (paddle.Tensor): Input mask (#batch, 1, time).
            offset (int): position encoding offset.
        Returns:
            paddle.Tensor: Subsampled tensor (#batch, time', odim),
                where time' = time // 4.
            paddle.Tensor: positional encoding
            paddle.Tensor: Subsampled mask (#batch, 1, time'),
                where time' = time // 4.
        """
        x = x.unsqueeze(1)  # (b, c=1, t, f)
        x = self.conv(x)
        b, c, t, f = paddle.shape(x)
        # (b, c, t', f') -> (b, t', c*f') then project to odim.
        x = self.out(x.transpose([0, 2, 1, 3]).reshape([b, t, c * f]))
        x, pos_emb = self.pos_enc(x, offset)
        # Each [:, :, :-2:2] mirrors one conv layer (kernel 3, stride 2).
        return x, pos_emb, x_mask[:, :, :-2:2][:, :, :-2:2]
|
||||||
|
|
||||||
|
|
||||||
|
class Conv2dSubsampling6(BaseSubsampling):
    """Convolutional 2D subsampling (to 1/6 length)."""

    def __init__(self,
                 idim: int,
                 odim: int,
                 dropout_rate: float,
                 pos_enc_class: nn.Layer=PositionalEncoding):
        """Construct an Conv2dSubsampling6 object.

        Args:
            idim (int): Input dimension.
            odim (int): Output dimension.
            dropout_rate (float): Dropout rate.
            pos_enc (PositionalEncoding): Custom position encoding layer.
        """
        super().__init__(pos_enc_class)
        # Stride-2 conv then stride-3 conv: 2 * 3 = 6x time reduction.
        self.conv = nn.Sequential(
            nn.Conv2D(1, odim, 3, 2),
            nn.ReLU(),
            nn.Conv2D(odim, odim, 5, 3),
            nn.ReLU(), )
        # O = (I - F + Pstart + Pend) // S + 1
        # when Padding == 0, O = (I - F - S) // S
        self.linear = nn.Linear(odim * (((idim - 1) // 2 - 2) // 3), odim)
        # The right context for every conv layer is computed by:
        # (kernel_size - 1) / 2 * stride * frame_rate_of_this_layer
        # 14 = (3 - 1) / 2 * 2 * 1 + (5 - 1) / 2 * 3 * 2
        self.subsampling_rate = 6
        self.right_context = 14

    def forward(self, x: paddle.Tensor, x_mask: paddle.Tensor, offset: int=0
                ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]:
        """Subsample x.
        Args:
            x (paddle.Tensor): Input tensor (#batch, time, idim).
            x_mask (paddle.Tensor): Input mask (#batch, 1, time).
            offset (int): position encoding offset.
        Returns:
            paddle.Tensor: Subsampled tensor (#batch, time', odim),
                where time' = time // 6.
            paddle.Tensor: positional encoding
            paddle.Tensor: Subsampled mask (#batch, 1, time'),
                where time' = time // 6.
        """
        x = x.unsqueeze(1)  # (b, c, t, f)
        x = self.conv(x)
        b, c, t, f = paddle.shape(x)
        # (b, c, t', f') -> (b, t', c*f') then project to odim.
        x = self.linear(x.transpose([0, 2, 1, 3]).reshape([b, t, c * f]))
        x, pos_emb = self.pos_enc(x, offset)
        # First slice matches the stride-2 conv, second the stride-3 conv.
        return x, pos_emb, x_mask[:, :, :-2:2][:, :, :-4:3]
|
||||||
|
|
||||||
|
|
||||||
|
class Conv2dSubsampling8(BaseSubsampling):
    """Convolutional 2D subsampling (to 1/8 length)."""

    def __init__(self,
                 idim: int,
                 odim: int,
                 dropout_rate: float,
                 pos_enc_class: nn.Layer=PositionalEncoding):
        """Construct an Conv2dSubsampling8 object.

        Args:
            idim (int): Input dimension.
            odim (int): Output dimension.
            dropout_rate (float): Dropout rate.
        """
        super().__init__(pos_enc_class)
        # Three stride-2 convs: each halves the time axis -> 1/8 overall.
        self.conv = nn.Sequential(
            nn.Conv2D(1, odim, 3, 2),
            nn.ReLU(),
            nn.Conv2D(odim, odim, 3, 2),
            nn.ReLU(),
            nn.Conv2D(odim, odim, 3, 2),
            nn.ReLU(), )
        # Flatten (channels x remaining freq bins) back to odim.
        self.linear = nn.Linear(odim * ((((idim - 1) // 2 - 1) // 2 - 1) // 2),
                                odim)
        self.subsampling_rate = 8
        # The right context for every conv layer is computed by:
        # (kernel_size - 1) / 2 * stride * frame_rate_of_this_layer
        # 14 = (3 - 1) / 2 * 2 * 1 + (3 - 1) / 2 * 2 * 2 + (3 - 1) / 2 * 2 * 4
        self.right_context = 14

    def forward(self, x: paddle.Tensor, x_mask: paddle.Tensor, offset: int=0
                ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]:
        """Subsample x.
        Args:
            x (paddle.Tensor): Input tensor (#batch, time, idim).
            x_mask (paddle.Tensor): Input mask (#batch, 1, time).
            offset (int): position encoding offset.
        Returns:
            paddle.Tensor: Subsampled tensor (#batch, time', odim),
                where time' = time // 8.
            paddle.Tensor: positional encoding
            paddle.Tensor: Subsampled mask (#batch, 1, time'),
                where time' = time // 8.
        """
        x = x.unsqueeze(1)  # (b, c, t, f)
        x = self.conv(x)
        # Bug fix: b/c/t/f were used below without ever being assigned
        # (NameError at runtime). Recover the post-conv shape first, as the
        # sibling Conv2dSubsampling4/6 classes do.
        b, c, t, f = paddle.shape(x)
        # (b, c, t', f') -> (b, t', c*f') then project to odim.
        x = self.linear(x.transpose([0, 2, 1, 3]).reshape([b, t, c * f]))
        x, pos_emb = self.pos_enc(x, offset)
        # One [:, :, :-2:2] slice per stride-2 conv layer.
        return x, pos_emb, x_mask[:, :, :-2:2][:, :, :-2:2][:, :, :-2:2]
|
@ -0,0 +1,66 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
from typing import Union
|
||||||
|
|
||||||
|
from paddle.optimizer.lr import LRScheduler
|
||||||
|
from typeguard import check_argument_types
|
||||||
|
|
||||||
|
from deepspeech.utils.log import Log
|
||||||
|
|
||||||
|
# Public API of this module.
__all__ = ["WarmupLR"]

# Module-scoped logger, named after this module.
logger = Log(__name__).getlog()
|
||||||
|
|
||||||
|
|
||||||
|
class WarmupLR(LRScheduler):
    """Warmup learning-rate scheduler.

    Almost the same as the Noam scheduler, differing only in the constant
    scale factor:

        NoamLR:   lr = optimizer.lr * model_size ** -0.5
                       * min(step ** -0.5, step * warmup_step ** -1.5)
        WarmupLR: lr = optimizer.lr * warmup_step ** 0.5
                       * min(step ** -0.5, step * warmup_step ** -1.5)

    Note that the maximum lr equals to optimizer.lr in this scheduler.
    """

    def __init__(self,
                 warmup_steps: Union[int, float]=25000,
                 learning_rate=1.0,
                 last_epoch=-1,
                 verbose=False):
        # Validate the annotated argument types via typeguard.
        assert check_argument_types()
        self.warmup_steps = warmup_steps
        super().__init__(learning_rate, last_epoch, verbose)

    def __repr__(self):
        return f"{self.__class__.__name__}(warmup_steps={self.warmup_steps})"

    def get_lr(self):
        # LRScheduler counts epochs from -1, while the warmup formula is
        # 1-based in the step number.
        step = self.last_epoch + 1
        warmup = self.warmup_steps
        decay = min(step**-0.5, step * warmup**-1.5)
        return self.base_lr * warmup**0.5 * decay

    def set_step(self, step: int=None):
        '''
        It will update the learning rate in optimizer according to current ``epoch`` .
        The new learning rate will take effect on next ``optimizer.step`` .

        Args:
            step (int, None): specify current epoch. Default: None. Auto-increment from last_epoch=-1.
        Returns:
            None
        '''
        self.step(epoch=step)
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in new issue