From 28658cc1690b6eff378ebc390a3549bb3aea175f Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Fri, 16 Apr 2021 08:34:25 +0000 Subject: [PATCH] fix cmvn and print prarams --- .notebook/u2_model.ipynb | 1555 +++++++++++++++++++++++++++++ deepspeech/frontend/normalizer.py | 16 +- deepspeech/frontend/utility.py | 6 +- deepspeech/modules/mask.py | 2 +- deepspeech/utils/layer_tools.py | 51 +- 5 files changed, 1593 insertions(+), 37 deletions(-) create mode 100644 .notebook/u2_model.ipynb diff --git a/.notebook/u2_model.ipynb b/.notebook/u2_model.ipynb new file mode 100644 index 000000000..9658af0ef --- /dev/null +++ b/.notebook/u2_model.ipynb @@ -0,0 +1,1555 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "comic-scotland", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/workspace/DeepSpeech-2.x\n" + ] + }, + { + "data": { + "text/plain": [ + "'/workspace/DeepSpeech-2.x'" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%cd ..\n", + "%pwd" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "trying-palestinian", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/paddle/fluid/layers/utils.py:26: DeprecationWarning: `np.int` is a deprecated alias for the builtin `int`. To silence this warning, use `int` by itself. Doing this will not modify any behavior and is safe. When replacing `np.int`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. 
If you wish to review your current use, check the release note link for additional information.\n", + "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n", + " def convert_to_list(value, n, name, dtype=np.int):\n", + "[WARNING 2021/04/16 08:20:33 __init__.py:93] register user softmax to paddle, remove this when fixed!\n", + "[WARNING 2021/04/16 08:20:33 __init__.py:97] register user log_softmax to paddle, remove this when fixed!\n", + "[WARNING 2021/04/16 08:20:33 __init__.py:101] register user sigmoid to paddle, remove this when fixed!\n", + "[WARNING 2021/04/16 08:20:33 __init__.py:105] register user log_sigmoid to paddle, remove this when fixed!\n", + "[WARNING 2021/04/16 08:20:33 __init__.py:109] register user relu to paddle, remove this when fixed!\n", + "[WARNING 2021/04/16 08:20:33 __init__.py:119] override cat of paddle if exists or register, remove this when fixed!\n", + "[WARNING 2021/04/16 08:20:33 __init__.py:133] override item of paddle.Tensor if exists or register, remove this when fixed!\n", + "[WARNING 2021/04/16 08:20:33 __init__.py:144] override long of paddle.Tensor if exists or register, remove this when fixed!\n", + "[WARNING 2021/04/16 08:20:33 __init__.py:164] override new_full of paddle.Tensor if exists or register, remove this when fixed!\n", + "[WARNING 2021/04/16 08:20:33 __init__.py:179] override eq of paddle.Tensor if exists or register, remove this when fixed!\n", + "[WARNING 2021/04/16 08:20:33 __init__.py:185] override eq of paddle if exists or register, remove this when fixed!\n", + "[WARNING 2021/04/16 08:20:33 __init__.py:195] override contiguous of paddle.Tensor if exists or register, remove this when fixed!\n", + "[WARNING 2021/04/16 08:20:33 __init__.py:212] override size of paddle.Tensor (`to_static` do not process `size` property, maybe some `paddle` api dependent on it), remove this when fixed!\n", + "[WARNING 2021/04/16 08:20:33 __init__.py:223] 
register user view to paddle.Tensor, remove this when fixed!\n", + "[WARNING 2021/04/16 08:20:33 __init__.py:233] register user view_as to paddle.Tensor, remove this when fixed!\n", + "[WARNING 2021/04/16 08:20:33 __init__.py:259] register user masked_fill to paddle.Tensor, remove this when fixed!\n", + "[WARNING 2021/04/16 08:20:33 __init__.py:277] register user masked_fill_ to paddle.Tensor, remove this when fixed!\n", + "[WARNING 2021/04/16 08:20:33 __init__.py:288] register user fill_ to paddle.Tensor, remove this when fixed!\n", + "[WARNING 2021/04/16 08:20:33 __init__.py:298] register user repeat to paddle.Tensor, remove this when fixed!\n", + "[WARNING 2021/04/16 08:20:33 __init__.py:303] register user softmax to paddle.Tensor, remove this when fixed!\n", + "[WARNING 2021/04/16 08:20:33 __init__.py:308] register user sigmoid to paddle.Tensor, remove this when fixed!\n", + "[WARNING 2021/04/16 08:20:33 __init__.py:312] register user relu to paddle.Tensor, remove this when fixed!\n", + "[WARNING 2021/04/16 08:20:33 __init__.py:322] register user type_as to paddle.Tensor, remove this when fixed!\n", + "[WARNING 2021/04/16 08:20:33 __init__.py:337] register user to to paddle.Tensor, remove this when fixed!\n", + "[WARNING 2021/04/16 08:20:33 __init__.py:346] register user float to paddle.Tensor, remove this when fixed!\n", + "[WARNING 2021/04/16 08:20:33 __init__.py:356] register user tolist to paddle.Tensor, remove this when fixed!\n", + "[WARNING 2021/04/16 08:20:33 __init__.py:371] register user glu to paddle.nn.functional, remove this when fixed!\n", + "[WARNING 2021/04/16 08:20:33 __init__.py:422] override ctc_loss of paddle.nn.functional if exists, remove this when fixed!\n", + "[WARNING 2021/04/16 08:20:33 __init__.py:428] register user Module to paddle.nn, remove this when fixed!\n", + "[WARNING 2021/04/16 08:20:33 __init__.py:434] register user ModuleList to paddle.nn, remove this when fixed!\n", + "[WARNING 2021/04/16 08:20:33 __init__.py:450] register 
user GLU to paddle.nn, remove this when fixed!\n", + "[WARNING 2021/04/16 08:20:33 __init__.py:483] register user ConstantPad2d to paddle.nn, remove this when fixed!\n", + "[WARNING 2021/04/16 08:20:33 __init__.py:489] register user export to paddle.jit, remove this when fixed!\n" + ] + } + ], + "source": [ + "import numpy as np\n", + "import paddle\n", + "from yacs.config import CfgNode as CN\n", + "\n", + "from deepspeech.models.u2 import U2Model\n", + "from deepspeech.utils.layer_tools import print_params\n", + "from deepspeech.utils.layer_tools import summary" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "committed-glance", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/ipykernel/ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.\n", + " and should_run_async(code)\n", + "[INFO 2021/04/16 08:20:34 u2.py:834] U2 Encoder type: conformer\n", + "[INFO 2021/04/16 08:20:34 u2.py:834] U2 Encoder type: conformer\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "encoder.embed.conv.0.weight | [256, 1, 3, 3] | 2304 | True\n", + "encoder.embed.conv.0.bias | [256] | 256 | True\n", + "encoder.embed.conv.2.weight | [256, 256, 3, 3] | 589824 | True\n", + "encoder.embed.conv.2.bias | [256] | 256 | True\n", + "encoder.embed.linear.weight | [4864, 256] | 1245184 | True\n", + "encoder.embed.linear.bias | [256] | 256 | True\n", + "encoder.after_norm.weight | [256] | 256 | True\n", + "encoder.after_norm.bias | [256] | 256 | True\n", + "encoder.encoders.0.self_attn.pos_bias_u | [4, 64] | 256 | True\n", + "encoder.encoders.0.self_attn.pos_bias_v | [4, 64] | 256 | True\n", + 
"encoder.encoders.0.self_attn.linear_q.weight | [256, 256] | 65536 | True\n", + "encoder.encoders.0.self_attn.linear_q.bias | [256] | 256 | True\n", + "encoder.encoders.0.self_attn.linear_k.weight | [256, 256] | 65536 | True\n", + "encoder.encoders.0.self_attn.linear_k.bias | [256] | 256 | True\n", + "encoder.encoders.0.self_attn.linear_v.weight | [256, 256] | 65536 | True\n", + "encoder.encoders.0.self_attn.linear_v.bias | [256] | 256 | True\n", + "encoder.encoders.0.self_attn.linear_out.weight | [256, 256] | 65536 | True\n", + "encoder.encoders.0.self_attn.linear_out.bias | [256] | 256 | True\n", + "encoder.encoders.0.self_attn.linear_pos.weight | [256, 256] | 65536 | True\n", + "encoder.encoders.0.feed_forward.w_1.weight | [256, 2048] | 524288 | True\n", + "encoder.encoders.0.feed_forward.w_1.bias | [2048] | 2048 | True\n", + "encoder.encoders.0.feed_forward.w_2.weight | [2048, 256] | 524288 | True\n", + "encoder.encoders.0.feed_forward.w_2.bias | [256] | 256 | True\n", + "encoder.encoders.0.feed_forward_macaron.w_1.weight | [256, 2048] | 524288 | True\n", + "encoder.encoders.0.feed_forward_macaron.w_1.bias | [2048] | 2048 | True\n", + "encoder.encoders.0.feed_forward_macaron.w_2.weight | [2048, 256] | 524288 | True\n", + "encoder.encoders.0.feed_forward_macaron.w_2.bias | [256] | 256 | True\n", + "encoder.encoders.0.conv_module.pointwise_conv1.weight | [512, 256, 1] | 131072 | True\n", + "encoder.encoders.0.conv_module.pointwise_conv1.bias | [512] | 512 | True\n", + "encoder.encoders.0.conv_module.depthwise_conv.weight | [256, 1, 15] | 3840 | True\n", + "encoder.encoders.0.conv_module.depthwise_conv.bias | [256] | 256 | True\n", + "encoder.encoders.0.conv_module.norm.weight | [256] | 256 | True\n", + "encoder.encoders.0.conv_module.norm.bias | [256] | 256 | True\n", + "encoder.encoders.0.conv_module.norm._mean | [256] | 256 | False\n", + "encoder.encoders.0.conv_module.norm._variance | [256] | 256 | False\n", + 
"encoder.encoders.0.conv_module.pointwise_conv2.weight | [256, 256, 1] | 65536 | True\n", + "encoder.encoders.0.conv_module.pointwise_conv2.bias | [256] | 256 | True\n", + "encoder.encoders.0.norm_ff.weight | [256] | 256 | True\n", + "encoder.encoders.0.norm_ff.bias | [256] | 256 | True\n", + "encoder.encoders.0.norm_mha.weight | [256] | 256 | True\n", + "encoder.encoders.0.norm_mha.bias | [256] | 256 | True\n", + "encoder.encoders.0.norm_ff_macaron.weight | [256] | 256 | True\n", + "encoder.encoders.0.norm_ff_macaron.bias | [256] | 256 | True\n", + "encoder.encoders.0.norm_conv.weight | [256] | 256 | True\n", + "encoder.encoders.0.norm_conv.bias | [256] | 256 | True\n", + "encoder.encoders.0.norm_final.weight | [256] | 256 | True\n", + "encoder.encoders.0.norm_final.bias | [256] | 256 | True\n", + "encoder.encoders.0.concat_linear.weight | [512, 256] | 131072 | True\n", + "encoder.encoders.0.concat_linear.bias | [256] | 256 | True\n", + "encoder.encoders.1.self_attn.pos_bias_u | [4, 64] | 256 | True\n", + "encoder.encoders.1.self_attn.pos_bias_v | [4, 64] | 256 | True\n", + "encoder.encoders.1.self_attn.linear_q.weight | [256, 256] | 65536 | True\n", + "encoder.encoders.1.self_attn.linear_q.bias | [256] | 256 | True\n", + "encoder.encoders.1.self_attn.linear_k.weight | [256, 256] | 65536 | True\n", + "encoder.encoders.1.self_attn.linear_k.bias | [256] | 256 | True\n", + "encoder.encoders.1.self_attn.linear_v.weight | [256, 256] | 65536 | True\n", + "encoder.encoders.1.self_attn.linear_v.bias | [256] | 256 | True\n", + "encoder.encoders.1.self_attn.linear_out.weight | [256, 256] | 65536 | True\n", + "encoder.encoders.1.self_attn.linear_out.bias | [256] | 256 | True\n", + "encoder.encoders.1.self_attn.linear_pos.weight | [256, 256] | 65536 | True\n", + "encoder.encoders.1.feed_forward.w_1.weight | [256, 2048] | 524288 | True\n", + "encoder.encoders.1.feed_forward.w_1.bias | [2048] | 2048 | True\n", + "encoder.encoders.1.feed_forward.w_2.weight | [2048, 256] | 524288 
| True\n", + "encoder.encoders.1.feed_forward.w_2.bias | [256] | 256 | True\n", + "encoder.encoders.1.feed_forward_macaron.w_1.weight | [256, 2048] | 524288 | True\n", + "encoder.encoders.1.feed_forward_macaron.w_1.bias | [2048] | 2048 | True\n", + "encoder.encoders.1.feed_forward_macaron.w_2.weight | [2048, 256] | 524288 | True\n", + "encoder.encoders.1.feed_forward_macaron.w_2.bias | [256] | 256 | True\n", + "encoder.encoders.1.conv_module.pointwise_conv1.weight | [512, 256, 1] | 131072 | True\n", + "encoder.encoders.1.conv_module.pointwise_conv1.bias | [512] | 512 | True\n", + "encoder.encoders.1.conv_module.depthwise_conv.weight | [256, 1, 15] | 3840 | True\n", + "encoder.encoders.1.conv_module.depthwise_conv.bias | [256] | 256 | True\n", + "encoder.encoders.1.conv_module.norm.weight | [256] | 256 | True\n", + "encoder.encoders.1.conv_module.norm.bias | [256] | 256 | True\n", + "encoder.encoders.1.conv_module.norm._mean | [256] | 256 | False\n", + "encoder.encoders.1.conv_module.norm._variance | [256] | 256 | False\n", + "encoder.encoders.1.conv_module.pointwise_conv2.weight | [256, 256, 1] | 65536 | True\n", + "encoder.encoders.1.conv_module.pointwise_conv2.bias | [256] | 256 | True\n", + "encoder.encoders.1.norm_ff.weight | [256] | 256 | True\n", + "encoder.encoders.1.norm_ff.bias | [256] | 256 | True\n", + "encoder.encoders.1.norm_mha.weight | [256] | 256 | True\n", + "encoder.encoders.1.norm_mha.bias | [256] | 256 | True\n", + "encoder.encoders.1.norm_ff_macaron.weight | [256] | 256 | True\n", + "encoder.encoders.1.norm_ff_macaron.bias | [256] | 256 | True\n", + "encoder.encoders.1.norm_conv.weight | [256] | 256 | True\n", + "encoder.encoders.1.norm_conv.bias | [256] | 256 | True\n", + "encoder.encoders.1.norm_final.weight | [256] | 256 | True\n", + "encoder.encoders.1.norm_final.bias | [256] | 256 | True\n", + "encoder.encoders.1.concat_linear.weight | [512, 256] | 131072 | True\n", + "encoder.encoders.1.concat_linear.bias | [256] | 256 | True\n", + 
"encoder.encoders.2.self_attn.pos_bias_u | [4, 64] | 256 | True\n", + "encoder.encoders.2.self_attn.pos_bias_v | [4, 64] | 256 | True\n", + "encoder.encoders.2.self_attn.linear_q.weight | [256, 256] | 65536 | True\n", + "encoder.encoders.2.self_attn.linear_q.bias | [256] | 256 | True\n", + "encoder.encoders.2.self_attn.linear_k.weight | [256, 256] | 65536 | True\n", + "encoder.encoders.2.self_attn.linear_k.bias | [256] | 256 | True\n", + "encoder.encoders.2.self_attn.linear_v.weight | [256, 256] | 65536 | True\n", + "encoder.encoders.2.self_attn.linear_v.bias | [256] | 256 | True\n", + "encoder.encoders.2.self_attn.linear_out.weight | [256, 256] | 65536 | True\n", + "encoder.encoders.2.self_attn.linear_out.bias | [256] | 256 | True\n", + "encoder.encoders.2.self_attn.linear_pos.weight | [256, 256] | 65536 | True\n", + "encoder.encoders.2.feed_forward.w_1.weight | [256, 2048] | 524288 | True\n", + "encoder.encoders.2.feed_forward.w_1.bias | [2048] | 2048 | True\n", + "encoder.encoders.2.feed_forward.w_2.weight | [2048, 256] | 524288 | True\n", + "encoder.encoders.2.feed_forward.w_2.bias | [256] | 256 | True\n", + "encoder.encoders.2.feed_forward_macaron.w_1.weight | [256, 2048] | 524288 | True\n", + "encoder.encoders.2.feed_forward_macaron.w_1.bias | [2048] | 2048 | True\n", + "encoder.encoders.2.feed_forward_macaron.w_2.weight | [2048, 256] | 524288 | True\n", + "encoder.encoders.2.feed_forward_macaron.w_2.bias | [256] | 256 | True\n", + "encoder.encoders.2.conv_module.pointwise_conv1.weight | [512, 256, 1] | 131072 | True\n", + "encoder.encoders.2.conv_module.pointwise_conv1.bias | [512] | 512 | True\n", + "encoder.encoders.2.conv_module.depthwise_conv.weight | [256, 1, 15] | 3840 | True\n", + "encoder.encoders.2.conv_module.depthwise_conv.bias | [256] | 256 | True\n", + "encoder.encoders.2.conv_module.norm.weight | [256] | 256 | True\n", + "encoder.encoders.2.conv_module.norm.bias | [256] | 256 | True\n", + "encoder.encoders.2.conv_module.norm._mean | [256] | 256 
| False\n", + "encoder.encoders.2.conv_module.norm._variance | [256] | 256 | False\n", + "encoder.encoders.2.conv_module.pointwise_conv2.weight | [256, 256, 1] | 65536 | True\n", + "encoder.encoders.2.conv_module.pointwise_conv2.bias | [256] | 256 | True\n", + "encoder.encoders.2.norm_ff.weight | [256] | 256 | True\n", + "encoder.encoders.2.norm_ff.bias | [256] | 256 | True\n", + "encoder.encoders.2.norm_mha.weight | [256] | 256 | True\n", + "encoder.encoders.2.norm_mha.bias | [256] | 256 | True\n", + "encoder.encoders.2.norm_ff_macaron.weight | [256] | 256 | True\n", + "encoder.encoders.2.norm_ff_macaron.bias | [256] | 256 | True\n", + "encoder.encoders.2.norm_conv.weight | [256] | 256 | True\n", + "encoder.encoders.2.norm_conv.bias | [256] | 256 | True\n", + "encoder.encoders.2.norm_final.weight | [256] | 256 | True\n", + "encoder.encoders.2.norm_final.bias | [256] | 256 | True\n", + "encoder.encoders.2.concat_linear.weight | [512, 256] | 131072 | True\n", + "encoder.encoders.2.concat_linear.bias | [256] | 256 | True\n", + "encoder.encoders.3.self_attn.pos_bias_u | [4, 64] | 256 | True\n", + "encoder.encoders.3.self_attn.pos_bias_v | [4, 64] | 256 | True\n", + "encoder.encoders.3.self_attn.linear_q.weight | [256, 256] | 65536 | True\n", + "encoder.encoders.3.self_attn.linear_q.bias | [256] | 256 | True\n", + "encoder.encoders.3.self_attn.linear_k.weight | [256, 256] | 65536 | True\n", + "encoder.encoders.3.self_attn.linear_k.bias | [256] | 256 | True\n", + "encoder.encoders.3.self_attn.linear_v.weight | [256, 256] | 65536 | True\n", + "encoder.encoders.3.self_attn.linear_v.bias | [256] | 256 | True\n", + "encoder.encoders.3.self_attn.linear_out.weight | [256, 256] | 65536 | True\n", + "encoder.encoders.3.self_attn.linear_out.bias | [256] | 256 | True\n", + "encoder.encoders.3.self_attn.linear_pos.weight | [256, 256] | 65536 | True\n", + "encoder.encoders.3.feed_forward.w_1.weight | [256, 2048] | 524288 | True\n", + "encoder.encoders.3.feed_forward.w_1.bias | 
[2048] | 2048 | True\n", + "encoder.encoders.3.feed_forward.w_2.weight | [2048, 256] | 524288 | True\n", + "encoder.encoders.3.feed_forward.w_2.bias | [256] | 256 | True\n", + "encoder.encoders.3.feed_forward_macaron.w_1.weight | [256, 2048] | 524288 | True\n", + "encoder.encoders.3.feed_forward_macaron.w_1.bias | [2048] | 2048 | True\n", + "encoder.encoders.3.feed_forward_macaron.w_2.weight | [2048, 256] | 524288 | True\n", + "encoder.encoders.3.feed_forward_macaron.w_2.bias | [256] | 256 | True\n", + "encoder.encoders.3.conv_module.pointwise_conv1.weight | [512, 256, 1] | 131072 | True\n", + "encoder.encoders.3.conv_module.pointwise_conv1.bias | [512] | 512 | True\n", + "encoder.encoders.3.conv_module.depthwise_conv.weight | [256, 1, 15] | 3840 | True\n", + "encoder.encoders.3.conv_module.depthwise_conv.bias | [256] | 256 | True\n", + "encoder.encoders.3.conv_module.norm.weight | [256] | 256 | True\n", + "encoder.encoders.3.conv_module.norm.bias | [256] | 256 | True\n", + "encoder.encoders.3.conv_module.norm._mean | [256] | 256 | False\n", + "encoder.encoders.3.conv_module.norm._variance | [256] | 256 | False\n", + "encoder.encoders.3.conv_module.pointwise_conv2.weight | [256, 256, 1] | 65536 | True\n", + "encoder.encoders.3.conv_module.pointwise_conv2.bias | [256] | 256 | True\n", + "encoder.encoders.3.norm_ff.weight | [256] | 256 | True\n", + "encoder.encoders.3.norm_ff.bias | [256] | 256 | True\n", + "encoder.encoders.3.norm_mha.weight | [256] | 256 | True\n", + "encoder.encoders.3.norm_mha.bias | [256] | 256 | True\n", + "encoder.encoders.3.norm_ff_macaron.weight | [256] | 256 | True\n", + "encoder.encoders.3.norm_ff_macaron.bias | [256] | 256 | True\n", + "encoder.encoders.3.norm_conv.weight | [256] | 256 | True\n", + "encoder.encoders.3.norm_conv.bias | [256] | 256 | True\n", + "encoder.encoders.3.norm_final.weight | [256] | 256 | True\n", + "encoder.encoders.3.norm_final.bias | [256] | 256 | True\n", + "encoder.encoders.3.concat_linear.weight | [512, 256] 
| 131072 | True\n", + "encoder.encoders.3.concat_linear.bias | [256] | 256 | True\n", + "encoder.encoders.4.self_attn.pos_bias_u | [4, 64] | 256 | True\n", + "encoder.encoders.4.self_attn.pos_bias_v | [4, 64] | 256 | True\n", + "encoder.encoders.4.self_attn.linear_q.weight | [256, 256] | 65536 | True\n", + "encoder.encoders.4.self_attn.linear_q.bias | [256] | 256 | True\n", + "encoder.encoders.4.self_attn.linear_k.weight | [256, 256] | 65536 | True\n", + "encoder.encoders.4.self_attn.linear_k.bias | [256] | 256 | True\n", + "encoder.encoders.4.self_attn.linear_v.weight | [256, 256] | 65536 | True\n", + "encoder.encoders.4.self_attn.linear_v.bias | [256] | 256 | True\n", + "encoder.encoders.4.self_attn.linear_out.weight | [256, 256] | 65536 | True\n", + "encoder.encoders.4.self_attn.linear_out.bias | [256] | 256 | True\n", + "encoder.encoders.4.self_attn.linear_pos.weight | [256, 256] | 65536 | True\n", + "encoder.encoders.4.feed_forward.w_1.weight | [256, 2048] | 524288 | True\n", + "encoder.encoders.4.feed_forward.w_1.bias | [2048] | 2048 | True\n", + "encoder.encoders.4.feed_forward.w_2.weight | [2048, 256] | 524288 | True\n", + "encoder.encoders.4.feed_forward.w_2.bias | [256] | 256 | True\n", + "encoder.encoders.4.feed_forward_macaron.w_1.weight | [256, 2048] | 524288 | True\n", + "encoder.encoders.4.feed_forward_macaron.w_1.bias | [2048] | 2048 | True\n", + "encoder.encoders.4.feed_forward_macaron.w_2.weight | [2048, 256] | 524288 | True\n", + "encoder.encoders.4.feed_forward_macaron.w_2.bias | [256] | 256 | True\n", + "encoder.encoders.4.conv_module.pointwise_conv1.weight | [512, 256, 1] | 131072 | True\n", + "encoder.encoders.4.conv_module.pointwise_conv1.bias | [512] | 512 | True\n", + "encoder.encoders.4.conv_module.depthwise_conv.weight | [256, 1, 15] | 3840 | True\n", + "encoder.encoders.4.conv_module.depthwise_conv.bias | [256] | 256 | True\n", + "encoder.encoders.4.conv_module.norm.weight | [256] | 256 | True\n", + 
"encoder.encoders.4.conv_module.norm.bias | [256] | 256 | True\n", + "encoder.encoders.4.conv_module.norm._mean | [256] | 256 | False\n", + "encoder.encoders.4.conv_module.norm._variance | [256] | 256 | False\n", + "encoder.encoders.4.conv_module.pointwise_conv2.weight | [256, 256, 1] | 65536 | True\n", + "encoder.encoders.4.conv_module.pointwise_conv2.bias | [256] | 256 | True\n", + "encoder.encoders.4.norm_ff.weight | [256] | 256 | True\n", + "encoder.encoders.4.norm_ff.bias | [256] | 256 | True\n", + "encoder.encoders.4.norm_mha.weight | [256] | 256 | True\n", + "encoder.encoders.4.norm_mha.bias | [256] | 256 | True\n", + "encoder.encoders.4.norm_ff_macaron.weight | [256] | 256 | True\n", + "encoder.encoders.4.norm_ff_macaron.bias | [256] | 256 | True\n", + "encoder.encoders.4.norm_conv.weight | [256] | 256 | True\n", + "encoder.encoders.4.norm_conv.bias | [256] | 256 | True\n", + "encoder.encoders.4.norm_final.weight | [256] | 256 | True\n", + "encoder.encoders.4.norm_final.bias | [256] | 256 | True\n", + "encoder.encoders.4.concat_linear.weight | [512, 256] | 131072 | True\n", + "encoder.encoders.4.concat_linear.bias | [256] | 256 | True\n", + "encoder.encoders.5.self_attn.pos_bias_u | [4, 64] | 256 | True\n", + "encoder.encoders.5.self_attn.pos_bias_v | [4, 64] | 256 | True\n", + "encoder.encoders.5.self_attn.linear_q.weight | [256, 256] | 65536 | True\n", + "encoder.encoders.5.self_attn.linear_q.bias | [256] | 256 | True\n", + "encoder.encoders.5.self_attn.linear_k.weight | [256, 256] | 65536 | True\n", + "encoder.encoders.5.self_attn.linear_k.bias | [256] | 256 | True\n", + "encoder.encoders.5.self_attn.linear_v.weight | [256, 256] | 65536 | True\n", + "encoder.encoders.5.self_attn.linear_v.bias | [256] | 256 | True\n", + "encoder.encoders.5.self_attn.linear_out.weight | [256, 256] | 65536 | True\n", + "encoder.encoders.5.self_attn.linear_out.bias | [256] | 256 | True\n", + "encoder.encoders.5.self_attn.linear_pos.weight | [256, 256] | 65536 | True\n", + 
"encoder.encoders.5.feed_forward.w_1.weight | [256, 2048] | 524288 | True\n", + "encoder.encoders.5.feed_forward.w_1.bias | [2048] | 2048 | True\n", + "encoder.encoders.5.feed_forward.w_2.weight | [2048, 256] | 524288 | True\n", + "encoder.encoders.5.feed_forward.w_2.bias | [256] | 256 | True\n", + "encoder.encoders.5.feed_forward_macaron.w_1.weight | [256, 2048] | 524288 | True\n", + "encoder.encoders.5.feed_forward_macaron.w_1.bias | [2048] | 2048 | True\n", + "encoder.encoders.5.feed_forward_macaron.w_2.weight | [2048, 256] | 524288 | True\n", + "encoder.encoders.5.feed_forward_macaron.w_2.bias | [256] | 256 | True\n", + "encoder.encoders.5.conv_module.pointwise_conv1.weight | [512, 256, 1] | 131072 | True\n", + "encoder.encoders.5.conv_module.pointwise_conv1.bias | [512] | 512 | True\n", + "encoder.encoders.5.conv_module.depthwise_conv.weight | [256, 1, 15] | 3840 | True\n", + "encoder.encoders.5.conv_module.depthwise_conv.bias | [256] | 256 | True\n", + "encoder.encoders.5.conv_module.norm.weight | [256] | 256 | True\n", + "encoder.encoders.5.conv_module.norm.bias | [256] | 256 | True\n", + "encoder.encoders.5.conv_module.norm._mean | [256] | 256 | False\n", + "encoder.encoders.5.conv_module.norm._variance | [256] | 256 | False\n", + "encoder.encoders.5.conv_module.pointwise_conv2.weight | [256, 256, 1] | 65536 | True\n", + "encoder.encoders.5.conv_module.pointwise_conv2.bias | [256] | 256 | True\n", + "encoder.encoders.5.norm_ff.weight | [256] | 256 | True\n", + "encoder.encoders.5.norm_ff.bias | [256] | 256 | True\n", + "encoder.encoders.5.norm_mha.weight | [256] | 256 | True\n", + "encoder.encoders.5.norm_mha.bias | [256] | 256 | True\n", + "encoder.encoders.5.norm_ff_macaron.weight | [256] | 256 | True\n", + "encoder.encoders.5.norm_ff_macaron.bias | [256] | 256 | True\n", + "encoder.encoders.5.norm_conv.weight | [256] | 256 | True\n", + "encoder.encoders.5.norm_conv.bias | [256] | 256 | True\n", + "encoder.encoders.5.norm_final.weight | [256] | 256 | 
True\n", + "encoder.encoders.5.norm_final.bias | [256] | 256 | True\n", + "encoder.encoders.5.concat_linear.weight | [512, 256] | 131072 | True\n", + "encoder.encoders.5.concat_linear.bias | [256] | 256 | True\n", + "encoder.encoders.6.self_attn.pos_bias_u | [4, 64] | 256 | True\n", + "encoder.encoders.6.self_attn.pos_bias_v | [4, 64] | 256 | True\n", + "encoder.encoders.6.self_attn.linear_q.weight | [256, 256] | 65536 | True\n", + "encoder.encoders.6.self_attn.linear_q.bias | [256] | 256 | True\n", + "encoder.encoders.6.self_attn.linear_k.weight | [256, 256] | 65536 | True\n", + "encoder.encoders.6.self_attn.linear_k.bias | [256] | 256 | True\n", + "encoder.encoders.6.self_attn.linear_v.weight | [256, 256] | 65536 | True\n", + "encoder.encoders.6.self_attn.linear_v.bias | [256] | 256 | True\n", + "encoder.encoders.6.self_attn.linear_out.weight | [256, 256] | 65536 | True\n", + "encoder.encoders.6.self_attn.linear_out.bias | [256] | 256 | True\n", + "encoder.encoders.6.self_attn.linear_pos.weight | [256, 256] | 65536 | True\n", + "encoder.encoders.6.feed_forward.w_1.weight | [256, 2048] | 524288 | True\n", + "encoder.encoders.6.feed_forward.w_1.bias | [2048] | 2048 | True\n", + "encoder.encoders.6.feed_forward.w_2.weight | [2048, 256] | 524288 | True\n", + "encoder.encoders.6.feed_forward.w_2.bias | [256] | 256 | True\n", + "encoder.encoders.6.feed_forward_macaron.w_1.weight | [256, 2048] | 524288 | True\n", + "encoder.encoders.6.feed_forward_macaron.w_1.bias | [2048] | 2048 | True\n", + "encoder.encoders.6.feed_forward_macaron.w_2.weight | [2048, 256] | 524288 | True\n", + "encoder.encoders.6.feed_forward_macaron.w_2.bias | [256] | 256 | True\n", + "encoder.encoders.6.conv_module.pointwise_conv1.weight | [512, 256, 1] | 131072 | True\n", + "encoder.encoders.6.conv_module.pointwise_conv1.bias | [512] | 512 | True\n", + "encoder.encoders.6.conv_module.depthwise_conv.weight | [256, 1, 15] | 3840 | True\n", + "encoder.encoders.6.conv_module.depthwise_conv.bias | [256] 
| 256 | True\n", + "encoder.encoders.6.conv_module.norm.weight | [256] | 256 | True\n", + "encoder.encoders.6.conv_module.norm.bias | [256] | 256 | True\n", + "encoder.encoders.6.conv_module.norm._mean | [256] | 256 | False\n", + "encoder.encoders.6.conv_module.norm._variance | [256] | 256 | False\n", + "encoder.encoders.6.conv_module.pointwise_conv2.weight | [256, 256, 1] | 65536 | True\n", + "encoder.encoders.6.conv_module.pointwise_conv2.bias | [256] | 256 | True\n", + "encoder.encoders.6.norm_ff.weight | [256] | 256 | True\n", + "encoder.encoders.6.norm_ff.bias | [256] | 256 | True\n", + "encoder.encoders.6.norm_mha.weight | [256] | 256 | True\n", + "encoder.encoders.6.norm_mha.bias | [256] | 256 | True\n", + "encoder.encoders.6.norm_ff_macaron.weight | [256] | 256 | True\n", + "encoder.encoders.6.norm_ff_macaron.bias | [256] | 256 | True\n", + "encoder.encoders.6.norm_conv.weight | [256] | 256 | True\n", + "encoder.encoders.6.norm_conv.bias | [256] | 256 | True\n", + "encoder.encoders.6.norm_final.weight | [256] | 256 | True\n", + "encoder.encoders.6.norm_final.bias | [256] | 256 | True\n", + "encoder.encoders.6.concat_linear.weight | [512, 256] | 131072 | True\n", + "encoder.encoders.6.concat_linear.bias | [256] | 256 | True\n", + "encoder.encoders.7.self_attn.pos_bias_u | [4, 64] | 256 | True\n", + "encoder.encoders.7.self_attn.pos_bias_v | [4, 64] | 256 | True\n", + "encoder.encoders.7.self_attn.linear_q.weight | [256, 256] | 65536 | True\n", + "encoder.encoders.7.self_attn.linear_q.bias | [256] | 256 | True\n", + "encoder.encoders.7.self_attn.linear_k.weight | [256, 256] | 65536 | True\n", + "encoder.encoders.7.self_attn.linear_k.bias | [256] | 256 | True\n", + "encoder.encoders.7.self_attn.linear_v.weight | [256, 256] | 65536 | True\n", + "encoder.encoders.7.self_attn.linear_v.bias | [256] | 256 | True\n", + "encoder.encoders.7.self_attn.linear_out.weight | [256, 256] | 65536 | True\n", + "encoder.encoders.7.self_attn.linear_out.bias | [256] | 256 | 
True\n", + "encoder.encoders.7.self_attn.linear_pos.weight | [256, 256] | 65536 | True\n", + "encoder.encoders.7.feed_forward.w_1.weight | [256, 2048] | 524288 | True\n", + "encoder.encoders.7.feed_forward.w_1.bias | [2048] | 2048 | True\n", + "encoder.encoders.7.feed_forward.w_2.weight | [2048, 256] | 524288 | True\n", + "encoder.encoders.7.feed_forward.w_2.bias | [256] | 256 | True\n", + "encoder.encoders.7.feed_forward_macaron.w_1.weight | [256, 2048] | 524288 | True\n", + "encoder.encoders.7.feed_forward_macaron.w_1.bias | [2048] | 2048 | True\n", + "encoder.encoders.7.feed_forward_macaron.w_2.weight | [2048, 256] | 524288 | True\n", + "encoder.encoders.7.feed_forward_macaron.w_2.bias | [256] | 256 | True\n", + "encoder.encoders.7.conv_module.pointwise_conv1.weight | [512, 256, 1] | 131072 | True\n", + "encoder.encoders.7.conv_module.pointwise_conv1.bias | [512] | 512 | True\n", + "encoder.encoders.7.conv_module.depthwise_conv.weight | [256, 1, 15] | 3840 | True\n", + "encoder.encoders.7.conv_module.depthwise_conv.bias | [256] | 256 | True\n", + "encoder.encoders.7.conv_module.norm.weight | [256] | 256 | True\n", + "encoder.encoders.7.conv_module.norm.bias | [256] | 256 | True\n", + "encoder.encoders.7.conv_module.norm._mean | [256] | 256 | False\n", + "encoder.encoders.7.conv_module.norm._variance | [256] | 256 | False\n", + "encoder.encoders.7.conv_module.pointwise_conv2.weight | [256, 256, 1] | 65536 | True\n", + "encoder.encoders.7.conv_module.pointwise_conv2.bias | [256] | 256 | True\n", + "encoder.encoders.7.norm_ff.weight | [256] | 256 | True\n", + "encoder.encoders.7.norm_ff.bias | [256] | 256 | True\n", + "encoder.encoders.7.norm_mha.weight | [256] | 256 | True\n", + "encoder.encoders.7.norm_mha.bias | [256] | 256 | True\n", + "encoder.encoders.7.norm_ff_macaron.weight | [256] | 256 | True\n", + "encoder.encoders.7.norm_ff_macaron.bias | [256] | 256 | True\n", + "encoder.encoders.7.norm_conv.weight | [256] | 256 | True\n", + 
"encoder.encoders.7.norm_conv.bias | [256] | 256 | True\n", + "encoder.encoders.7.norm_final.weight | [256] | 256 | True\n", + "encoder.encoders.7.norm_final.bias | [256] | 256 | True\n", + "encoder.encoders.7.concat_linear.weight | [512, 256] | 131072 | True\n", + "encoder.encoders.7.concat_linear.bias | [256] | 256 | True\n", + "encoder.encoders.8.self_attn.pos_bias_u | [4, 64] | 256 | True\n", + "encoder.encoders.8.self_attn.pos_bias_v | [4, 64] | 256 | True\n", + "encoder.encoders.8.self_attn.linear_q.weight | [256, 256] | 65536 | True\n", + "encoder.encoders.8.self_attn.linear_q.bias | [256] | 256 | True\n", + "encoder.encoders.8.self_attn.linear_k.weight | [256, 256] | 65536 | True\n", + "encoder.encoders.8.self_attn.linear_k.bias | [256] | 256 | True\n", + "encoder.encoders.8.self_attn.linear_v.weight | [256, 256] | 65536 | True\n", + "encoder.encoders.8.self_attn.linear_v.bias | [256] | 256 | True\n", + "encoder.encoders.8.self_attn.linear_out.weight | [256, 256] | 65536 | True\n", + "encoder.encoders.8.self_attn.linear_out.bias | [256] | 256 | True\n", + "encoder.encoders.8.self_attn.linear_pos.weight | [256, 256] | 65536 | True\n", + "encoder.encoders.8.feed_forward.w_1.weight | [256, 2048] | 524288 | True\n", + "encoder.encoders.8.feed_forward.w_1.bias | [2048] | 2048 | True\n", + "encoder.encoders.8.feed_forward.w_2.weight | [2048, 256] | 524288 | True\n", + "encoder.encoders.8.feed_forward.w_2.bias | [256] | 256 | True\n", + "encoder.encoders.8.feed_forward_macaron.w_1.weight | [256, 2048] | 524288 | True\n", + "encoder.encoders.8.feed_forward_macaron.w_1.bias | [2048] | 2048 | True\n", + "encoder.encoders.8.feed_forward_macaron.w_2.weight | [2048, 256] | 524288 | True\n", + "encoder.encoders.8.feed_forward_macaron.w_2.bias | [256] | 256 | True\n", + "encoder.encoders.8.conv_module.pointwise_conv1.weight | [512, 256, 1] | 131072 | True\n", + "encoder.encoders.8.conv_module.pointwise_conv1.bias | [512] | 512 | True\n", + 
"encoder.encoders.8.conv_module.depthwise_conv.weight | [256, 1, 15] | 3840 | True\n", + "encoder.encoders.8.conv_module.depthwise_conv.bias | [256] | 256 | True\n", + "encoder.encoders.8.conv_module.norm.weight | [256] | 256 | True\n", + "encoder.encoders.8.conv_module.norm.bias | [256] | 256 | True\n", + "encoder.encoders.8.conv_module.norm._mean | [256] | 256 | False\n", + "encoder.encoders.8.conv_module.norm._variance | [256] | 256 | False\n", + "encoder.encoders.8.conv_module.pointwise_conv2.weight | [256, 256, 1] | 65536 | True\n", + "encoder.encoders.8.conv_module.pointwise_conv2.bias | [256] | 256 | True\n", + "encoder.encoders.8.norm_ff.weight | [256] | 256 | True\n", + "encoder.encoders.8.norm_ff.bias | [256] | 256 | True\n", + "encoder.encoders.8.norm_mha.weight | [256] | 256 | True\n", + "encoder.encoders.8.norm_mha.bias | [256] | 256 | True\n", + "encoder.encoders.8.norm_ff_macaron.weight | [256] | 256 | True\n", + "encoder.encoders.8.norm_ff_macaron.bias | [256] | 256 | True\n", + "encoder.encoders.8.norm_conv.weight | [256] | 256 | True\n", + "encoder.encoders.8.norm_conv.bias | [256] | 256 | True\n", + "encoder.encoders.8.norm_final.weight | [256] | 256 | True\n", + "encoder.encoders.8.norm_final.bias | [256] | 256 | True\n", + "encoder.encoders.8.concat_linear.weight | [512, 256] | 131072 | True\n", + "encoder.encoders.8.concat_linear.bias | [256] | 256 | True\n", + "encoder.encoders.9.self_attn.pos_bias_u | [4, 64] | 256 | True\n", + "encoder.encoders.9.self_attn.pos_bias_v | [4, 64] | 256 | True\n", + "encoder.encoders.9.self_attn.linear_q.weight | [256, 256] | 65536 | True\n", + "encoder.encoders.9.self_attn.linear_q.bias | [256] | 256 | True\n", + "encoder.encoders.9.self_attn.linear_k.weight | [256, 256] | 65536 | True\n", + "encoder.encoders.9.self_attn.linear_k.bias | [256] | 256 | True\n", + "encoder.encoders.9.self_attn.linear_v.weight | [256, 256] | 65536 | True\n", + "encoder.encoders.9.self_attn.linear_v.bias | [256] | 256 | True\n", + 
"encoder.encoders.9.self_attn.linear_out.weight | [256, 256] | 65536 | True\n", + "encoder.encoders.9.self_attn.linear_out.bias | [256] | 256 | True\n", + "encoder.encoders.9.self_attn.linear_pos.weight | [256, 256] | 65536 | True\n", + "encoder.encoders.9.feed_forward.w_1.weight | [256, 2048] | 524288 | True\n", + "encoder.encoders.9.feed_forward.w_1.bias | [2048] | 2048 | True\n", + "encoder.encoders.9.feed_forward.w_2.weight | [2048, 256] | 524288 | True\n", + "encoder.encoders.9.feed_forward.w_2.bias | [256] | 256 | True\n", + "encoder.encoders.9.feed_forward_macaron.w_1.weight | [256, 2048] | 524288 | True\n", + "encoder.encoders.9.feed_forward_macaron.w_1.bias | [2048] | 2048 | True\n", + "encoder.encoders.9.feed_forward_macaron.w_2.weight | [2048, 256] | 524288 | True\n", + "encoder.encoders.9.feed_forward_macaron.w_2.bias | [256] | 256 | True\n", + "encoder.encoders.9.conv_module.pointwise_conv1.weight | [512, 256, 1] | 131072 | True\n", + "encoder.encoders.9.conv_module.pointwise_conv1.bias | [512] | 512 | True\n", + "encoder.encoders.9.conv_module.depthwise_conv.weight | [256, 1, 15] | 3840 | True\n", + "encoder.encoders.9.conv_module.depthwise_conv.bias | [256] | 256 | True\n", + "encoder.encoders.9.conv_module.norm.weight | [256] | 256 | True\n", + "encoder.encoders.9.conv_module.norm.bias | [256] | 256 | True\n", + "encoder.encoders.9.conv_module.norm._mean | [256] | 256 | False\n", + "encoder.encoders.9.conv_module.norm._variance | [256] | 256 | False\n", + "encoder.encoders.9.conv_module.pointwise_conv2.weight | [256, 256, 1] | 65536 | True\n", + "encoder.encoders.9.conv_module.pointwise_conv2.bias | [256] | 256 | True\n", + "encoder.encoders.9.norm_ff.weight | [256] | 256 | True\n", + "encoder.encoders.9.norm_ff.bias | [256] | 256 | True\n", + "encoder.encoders.9.norm_mha.weight | [256] | 256 | True\n", + "encoder.encoders.9.norm_mha.bias | [256] | 256 | True\n", + "encoder.encoders.9.norm_ff_macaron.weight | [256] | 256 | True\n", + 
"encoder.encoders.9.norm_ff_macaron.bias | [256] | 256 | True\n", + "encoder.encoders.9.norm_conv.weight | [256] | 256 | True\n", + "encoder.encoders.9.norm_conv.bias | [256] | 256 | True\n", + "encoder.encoders.9.norm_final.weight | [256] | 256 | True\n", + "encoder.encoders.9.norm_final.bias | [256] | 256 | True\n", + "encoder.encoders.9.concat_linear.weight | [512, 256] | 131072 | True\n", + "encoder.encoders.9.concat_linear.bias | [256] | 256 | True\n", + "encoder.encoders.10.self_attn.pos_bias_u | [4, 64] | 256 | True\n", + "encoder.encoders.10.self_attn.pos_bias_v | [4, 64] | 256 | True\n", + "encoder.encoders.10.self_attn.linear_q.weight | [256, 256] | 65536 | True\n", + "encoder.encoders.10.self_attn.linear_q.bias | [256] | 256 | True\n", + "encoder.encoders.10.self_attn.linear_k.weight | [256, 256] | 65536 | True\n", + "encoder.encoders.10.self_attn.linear_k.bias | [256] | 256 | True\n", + "encoder.encoders.10.self_attn.linear_v.weight | [256, 256] | 65536 | True\n", + "encoder.encoders.10.self_attn.linear_v.bias | [256] | 256 | True\n", + "encoder.encoders.10.self_attn.linear_out.weight | [256, 256] | 65536 | True\n", + "encoder.encoders.10.self_attn.linear_out.bias | [256] | 256 | True\n", + "encoder.encoders.10.self_attn.linear_pos.weight | [256, 256] | 65536 | True\n", + "encoder.encoders.10.feed_forward.w_1.weight | [256, 2048] | 524288 | True\n", + "encoder.encoders.10.feed_forward.w_1.bias | [2048] | 2048 | True\n", + "encoder.encoders.10.feed_forward.w_2.weight | [2048, 256] | 524288 | True\n", + "encoder.encoders.10.feed_forward.w_2.bias | [256] | 256 | True\n", + "encoder.encoders.10.feed_forward_macaron.w_1.weight | [256, 2048] | 524288 | True\n", + "encoder.encoders.10.feed_forward_macaron.w_1.bias | [2048] | 2048 | True\n", + "encoder.encoders.10.feed_forward_macaron.w_2.weight | [2048, 256] | 524288 | True\n", + "encoder.encoders.10.feed_forward_macaron.w_2.bias | [256] | 256 | True\n", + 
"encoder.encoders.10.conv_module.pointwise_conv1.weight | [512, 256, 1] | 131072 | True\n", + "encoder.encoders.10.conv_module.pointwise_conv1.bias | [512] | 512 | True\n", + "encoder.encoders.10.conv_module.depthwise_conv.weight | [256, 1, 15] | 3840 | True\n", + "encoder.encoders.10.conv_module.depthwise_conv.bias | [256] | 256 | True\n", + "encoder.encoders.10.conv_module.norm.weight | [256] | 256 | True\n", + "encoder.encoders.10.conv_module.norm.bias | [256] | 256 | True\n", + "encoder.encoders.10.conv_module.norm._mean | [256] | 256 | False\n", + "encoder.encoders.10.conv_module.norm._variance | [256] | 256 | False\n", + "encoder.encoders.10.conv_module.pointwise_conv2.weight | [256, 256, 1] | 65536 | True\n", + "encoder.encoders.10.conv_module.pointwise_conv2.bias | [256] | 256 | True\n", + "encoder.encoders.10.norm_ff.weight | [256] | 256 | True\n", + "encoder.encoders.10.norm_ff.bias | [256] | 256 | True\n", + "encoder.encoders.10.norm_mha.weight | [256] | 256 | True\n", + "encoder.encoders.10.norm_mha.bias | [256] | 256 | True\n", + "encoder.encoders.10.norm_ff_macaron.weight | [256] | 256 | True\n", + "encoder.encoders.10.norm_ff_macaron.bias | [256] | 256 | True\n", + "encoder.encoders.10.norm_conv.weight | [256] | 256 | True\n", + "encoder.encoders.10.norm_conv.bias | [256] | 256 | True\n", + "encoder.encoders.10.norm_final.weight | [256] | 256 | True\n", + "encoder.encoders.10.norm_final.bias | [256] | 256 | True\n", + "encoder.encoders.10.concat_linear.weight | [512, 256] | 131072 | True\n", + "encoder.encoders.10.concat_linear.bias | [256] | 256 | True\n", + "encoder.encoders.11.self_attn.pos_bias_u | [4, 64] | 256 | True\n", + "encoder.encoders.11.self_attn.pos_bias_v | [4, 64] | 256 | True\n", + "encoder.encoders.11.self_attn.linear_q.weight | [256, 256] | 65536 | True\n", + "encoder.encoders.11.self_attn.linear_q.bias | [256] | 256 | True\n", + "encoder.encoders.11.self_attn.linear_k.weight | [256, 256] | 65536 | True\n", + 
"encoder.encoders.11.self_attn.linear_k.bias | [256] | 256 | True\n", + "encoder.encoders.11.self_attn.linear_v.weight | [256, 256] | 65536 | True\n", + "encoder.encoders.11.self_attn.linear_v.bias | [256] | 256 | True\n", + "encoder.encoders.11.self_attn.linear_out.weight | [256, 256] | 65536 | True\n", + "encoder.encoders.11.self_attn.linear_out.bias | [256] | 256 | True\n", + "encoder.encoders.11.self_attn.linear_pos.weight | [256, 256] | 65536 | True\n", + "encoder.encoders.11.feed_forward.w_1.weight | [256, 2048] | 524288 | True\n", + "encoder.encoders.11.feed_forward.w_1.bias | [2048] | 2048 | True\n", + "encoder.encoders.11.feed_forward.w_2.weight | [2048, 256] | 524288 | True\n", + "encoder.encoders.11.feed_forward.w_2.bias | [256] | 256 | True\n", + "encoder.encoders.11.feed_forward_macaron.w_1.weight | [256, 2048] | 524288 | True\n", + "encoder.encoders.11.feed_forward_macaron.w_1.bias | [2048] | 2048 | True\n", + "encoder.encoders.11.feed_forward_macaron.w_2.weight | [2048, 256] | 524288 | True\n", + "encoder.encoders.11.feed_forward_macaron.w_2.bias | [256] | 256 | True\n", + "encoder.encoders.11.conv_module.pointwise_conv1.weight | [512, 256, 1] | 131072 | True\n", + "encoder.encoders.11.conv_module.pointwise_conv1.bias | [512] | 512 | True\n", + "encoder.encoders.11.conv_module.depthwise_conv.weight | [256, 1, 15] | 3840 | True\n", + "encoder.encoders.11.conv_module.depthwise_conv.bias | [256] | 256 | True\n", + "encoder.encoders.11.conv_module.norm.weight | [256] | 256 | True\n", + "encoder.encoders.11.conv_module.norm.bias | [256] | 256 | True\n", + "encoder.encoders.11.conv_module.norm._mean | [256] | 256 | False\n", + "encoder.encoders.11.conv_module.norm._variance | [256] | 256 | False\n", + "encoder.encoders.11.conv_module.pointwise_conv2.weight | [256, 256, 1] | 65536 | True\n", + "encoder.encoders.11.conv_module.pointwise_conv2.bias | [256] | 256 | True\n", + "encoder.encoders.11.norm_ff.weight | [256] | 256 | True\n", + 
"encoder.encoders.11.norm_ff.bias | [256] | 256 | True\n", + "encoder.encoders.11.norm_mha.weight | [256] | 256 | True\n", + "encoder.encoders.11.norm_mha.bias | [256] | 256 | True\n", + "encoder.encoders.11.norm_ff_macaron.weight | [256] | 256 | True\n", + "encoder.encoders.11.norm_ff_macaron.bias | [256] | 256 | True\n", + "encoder.encoders.11.norm_conv.weight | [256] | 256 | True\n", + "encoder.encoders.11.norm_conv.bias | [256] | 256 | True\n", + "encoder.encoders.11.norm_final.weight | [256] | 256 | True\n", + "encoder.encoders.11.norm_final.bias | [256] | 256 | True\n", + "encoder.encoders.11.concat_linear.weight | [512, 256] | 131072 | True\n", + "encoder.encoders.11.concat_linear.bias | [256] | 256 | True\n", + "decoder.embed.0.weight | [4223, 256] | 1081088 | True\n", + "decoder.after_norm.weight | [256] | 256 | True\n", + "decoder.after_norm.bias | [256] | 256 | True\n", + "decoder.output_layer.weight | [256, 4223] | 1081088 | True\n", + "decoder.output_layer.bias | [4223] | 4223 | True\n", + "decoder.decoders.0.self_attn.linear_q.weight | [256, 256] | 65536 | True\n", + "decoder.decoders.0.self_attn.linear_q.bias | [256] | 256 | True\n", + "decoder.decoders.0.self_attn.linear_k.weight | [256, 256] | 65536 | True\n", + "decoder.decoders.0.self_attn.linear_k.bias | [256] | 256 | True\n", + "decoder.decoders.0.self_attn.linear_v.weight | [256, 256] | 65536 | True\n", + "decoder.decoders.0.self_attn.linear_v.bias | [256] | 256 | True\n", + "decoder.decoders.0.self_attn.linear_out.weight | [256, 256] | 65536 | True\n", + "decoder.decoders.0.self_attn.linear_out.bias | [256] | 256 | True\n", + "decoder.decoders.0.src_attn.linear_q.weight | [256, 256] | 65536 | True\n", + "decoder.decoders.0.src_attn.linear_q.bias | [256] | 256 | True\n", + "decoder.decoders.0.src_attn.linear_k.weight | [256, 256] | 65536 | True\n", + "decoder.decoders.0.src_attn.linear_k.bias | [256] | 256 | True\n", + "decoder.decoders.0.src_attn.linear_v.weight | [256, 256] | 65536 | 
True\n", + "decoder.decoders.0.src_attn.linear_v.bias | [256] | 256 | True\n", + "decoder.decoders.0.src_attn.linear_out.weight | [256, 256] | 65536 | True\n", + "decoder.decoders.0.src_attn.linear_out.bias | [256] | 256 | True\n", + "decoder.decoders.0.feed_forward.w_1.weight | [256, 2048] | 524288 | True\n", + "decoder.decoders.0.feed_forward.w_1.bias | [2048] | 2048 | True\n", + "decoder.decoders.0.feed_forward.w_2.weight | [2048, 256] | 524288 | True\n", + "decoder.decoders.0.feed_forward.w_2.bias | [256] | 256 | True\n", + "decoder.decoders.0.norm1.weight | [256] | 256 | True\n", + "decoder.decoders.0.norm1.bias | [256] | 256 | True\n", + "decoder.decoders.0.norm2.weight | [256] | 256 | True\n", + "decoder.decoders.0.norm2.bias | [256] | 256 | True\n", + "decoder.decoders.0.norm3.weight | [256] | 256 | True\n", + "decoder.decoders.0.norm3.bias | [256] | 256 | True\n", + "decoder.decoders.0.concat_linear1.weight | [512, 256] | 131072 | True\n", + "decoder.decoders.0.concat_linear1.bias | [256] | 256 | True\n", + "decoder.decoders.0.concat_linear2.weight | [512, 256] | 131072 | True\n", + "decoder.decoders.0.concat_linear2.bias | [256] | 256 | True\n", + "decoder.decoders.1.self_attn.linear_q.weight | [256, 256] | 65536 | True\n", + "decoder.decoders.1.self_attn.linear_q.bias | [256] | 256 | True\n", + "decoder.decoders.1.self_attn.linear_k.weight | [256, 256] | 65536 | True\n", + "decoder.decoders.1.self_attn.linear_k.bias | [256] | 256 | True\n", + "decoder.decoders.1.self_attn.linear_v.weight | [256, 256] | 65536 | True\n", + "decoder.decoders.1.self_attn.linear_v.bias | [256] | 256 | True\n", + "decoder.decoders.1.self_attn.linear_out.weight | [256, 256] | 65536 | True\n", + "decoder.decoders.1.self_attn.linear_out.bias | [256] | 256 | True\n", + "decoder.decoders.1.src_attn.linear_q.weight | [256, 256] | 65536 | True\n", + "decoder.decoders.1.src_attn.linear_q.bias | [256] | 256 | True\n", + "decoder.decoders.1.src_attn.linear_k.weight | [256, 256] | 65536 
| True\n", + "decoder.decoders.1.src_attn.linear_k.bias | [256] | 256 | True\n", + "decoder.decoders.1.src_attn.linear_v.weight | [256, 256] | 65536 | True\n", + "decoder.decoders.1.src_attn.linear_v.bias | [256] | 256 | True\n", + "decoder.decoders.1.src_attn.linear_out.weight | [256, 256] | 65536 | True\n", + "decoder.decoders.1.src_attn.linear_out.bias | [256] | 256 | True\n", + "decoder.decoders.1.feed_forward.w_1.weight | [256, 2048] | 524288 | True\n", + "decoder.decoders.1.feed_forward.w_1.bias | [2048] | 2048 | True\n", + "decoder.decoders.1.feed_forward.w_2.weight | [2048, 256] | 524288 | True\n", + "decoder.decoders.1.feed_forward.w_2.bias | [256] | 256 | True\n", + "decoder.decoders.1.norm1.weight | [256] | 256 | True\n", + "decoder.decoders.1.norm1.bias | [256] | 256 | True\n", + "decoder.decoders.1.norm2.weight | [256] | 256 | True\n", + "decoder.decoders.1.norm2.bias | [256] | 256 | True\n", + "decoder.decoders.1.norm3.weight | [256] | 256 | True\n", + "decoder.decoders.1.norm3.bias | [256] | 256 | True\n", + "decoder.decoders.1.concat_linear1.weight | [512, 256] | 131072 | True\n", + "decoder.decoders.1.concat_linear1.bias | [256] | 256 | True\n", + "decoder.decoders.1.concat_linear2.weight | [512, 256] | 131072 | True\n", + "decoder.decoders.1.concat_linear2.bias | [256] | 256 | True\n", + "decoder.decoders.2.self_attn.linear_q.weight | [256, 256] | 65536 | True\n", + "decoder.decoders.2.self_attn.linear_q.bias | [256] | 256 | True\n", + "decoder.decoders.2.self_attn.linear_k.weight | [256, 256] | 65536 | True\n", + "decoder.decoders.2.self_attn.linear_k.bias | [256] | 256 | True\n", + "decoder.decoders.2.self_attn.linear_v.weight | [256, 256] | 65536 | True\n", + "decoder.decoders.2.self_attn.linear_v.bias | [256] | 256 | True\n", + "decoder.decoders.2.self_attn.linear_out.weight | [256, 256] | 65536 | True\n", + "decoder.decoders.2.self_attn.linear_out.bias | [256] | 256 | True\n", + "decoder.decoders.2.src_attn.linear_q.weight | [256, 256] | 
65536 | True\n", + "decoder.decoders.2.src_attn.linear_q.bias | [256] | 256 | True\n", + "decoder.decoders.2.src_attn.linear_k.weight | [256, 256] | 65536 | True\n", + "decoder.decoders.2.src_attn.linear_k.bias | [256] | 256 | True\n", + "decoder.decoders.2.src_attn.linear_v.weight | [256, 256] | 65536 | True\n", + "decoder.decoders.2.src_attn.linear_v.bias | [256] | 256 | True\n", + "decoder.decoders.2.src_attn.linear_out.weight | [256, 256] | 65536 | True\n", + "decoder.decoders.2.src_attn.linear_out.bias | [256] | 256 | True\n", + "decoder.decoders.2.feed_forward.w_1.weight | [256, 2048] | 524288 | True\n", + "decoder.decoders.2.feed_forward.w_1.bias | [2048] | 2048 | True\n", + "decoder.decoders.2.feed_forward.w_2.weight | [2048, 256] | 524288 | True\n", + "decoder.decoders.2.feed_forward.w_2.bias | [256] | 256 | True\n", + "decoder.decoders.2.norm1.weight | [256] | 256 | True\n", + "decoder.decoders.2.norm1.bias | [256] | 256 | True\n", + "decoder.decoders.2.norm2.weight | [256] | 256 | True\n", + "decoder.decoders.2.norm2.bias | [256] | 256 | True\n", + "decoder.decoders.2.norm3.weight | [256] | 256 | True\n", + "decoder.decoders.2.norm3.bias | [256] | 256 | True\n", + "decoder.decoders.2.concat_linear1.weight | [512, 256] | 131072 | True\n", + "decoder.decoders.2.concat_linear1.bias | [256] | 256 | True\n", + "decoder.decoders.2.concat_linear2.weight | [512, 256] | 131072 | True\n", + "decoder.decoders.2.concat_linear2.bias | [256] | 256 | True\n", + "decoder.decoders.3.self_attn.linear_q.weight | [256, 256] | 65536 | True\n", + "decoder.decoders.3.self_attn.linear_q.bias | [256] | 256 | True\n", + "decoder.decoders.3.self_attn.linear_k.weight | [256, 256] | 65536 | True\n", + "decoder.decoders.3.self_attn.linear_k.bias | [256] | 256 | True\n", + "decoder.decoders.3.self_attn.linear_v.weight | [256, 256] | 65536 | True\n", + "decoder.decoders.3.self_attn.linear_v.bias | [256] | 256 | True\n", + "decoder.decoders.3.self_attn.linear_out.weight | [256, 256] | 
65536 | True\n", + "decoder.decoders.3.self_attn.linear_out.bias | [256] | 256 | True\n", + "decoder.decoders.3.src_attn.linear_q.weight | [256, 256] | 65536 | True\n", + "decoder.decoders.3.src_attn.linear_q.bias | [256] | 256 | True\n", + "decoder.decoders.3.src_attn.linear_k.weight | [256, 256] | 65536 | True\n", + "decoder.decoders.3.src_attn.linear_k.bias | [256] | 256 | True\n", + "decoder.decoders.3.src_attn.linear_v.weight | [256, 256] | 65536 | True\n", + "decoder.decoders.3.src_attn.linear_v.bias | [256] | 256 | True\n", + "decoder.decoders.3.src_attn.linear_out.weight | [256, 256] | 65536 | True\n", + "decoder.decoders.3.src_attn.linear_out.bias | [256] | 256 | True\n", + "decoder.decoders.3.feed_forward.w_1.weight | [256, 2048] | 524288 | True\n", + "decoder.decoders.3.feed_forward.w_1.bias | [2048] | 2048 | True\n", + "decoder.decoders.3.feed_forward.w_2.weight | [2048, 256] | 524288 | True\n", + "decoder.decoders.3.feed_forward.w_2.bias | [256] | 256 | True\n", + "decoder.decoders.3.norm1.weight | [256] | 256 | True\n", + "decoder.decoders.3.norm1.bias | [256] | 256 | True\n", + "decoder.decoders.3.norm2.weight | [256] | 256 | True\n", + "decoder.decoders.3.norm2.bias | [256] | 256 | True\n", + "decoder.decoders.3.norm3.weight | [256] | 256 | True\n", + "decoder.decoders.3.norm3.bias | [256] | 256 | True\n", + "decoder.decoders.3.concat_linear1.weight | [512, 256] | 131072 | True\n", + "decoder.decoders.3.concat_linear1.bias | [256] | 256 | True\n", + "decoder.decoders.3.concat_linear2.weight | [512, 256] | 131072 | True\n", + "decoder.decoders.3.concat_linear2.bias | [256] | 256 | True\n", + "decoder.decoders.4.self_attn.linear_q.weight | [256, 256] | 65536 | True\n", + "decoder.decoders.4.self_attn.linear_q.bias | [256] | 256 | True\n", + "decoder.decoders.4.self_attn.linear_k.weight | [256, 256] | 65536 | True\n", + "decoder.decoders.4.self_attn.linear_k.bias | [256] | 256 | True\n", + "decoder.decoders.4.self_attn.linear_v.weight | [256, 256] | 
65536 | True\n", + "decoder.decoders.4.self_attn.linear_v.bias | [256] | 256 | True\n", + "decoder.decoders.4.self_attn.linear_out.weight | [256, 256] | 65536 | True\n", + "decoder.decoders.4.self_attn.linear_out.bias | [256] | 256 | True\n", + "decoder.decoders.4.src_attn.linear_q.weight | [256, 256] | 65536 | True\n", + "decoder.decoders.4.src_attn.linear_q.bias | [256] | 256 | True\n", + "decoder.decoders.4.src_attn.linear_k.weight | [256, 256] | 65536 | True\n", + "decoder.decoders.4.src_attn.linear_k.bias | [256] | 256 | True\n", + "decoder.decoders.4.src_attn.linear_v.weight | [256, 256] | 65536 | True\n", + "decoder.decoders.4.src_attn.linear_v.bias | [256] | 256 | True\n", + "decoder.decoders.4.src_attn.linear_out.weight | [256, 256] | 65536 | True\n", + "decoder.decoders.4.src_attn.linear_out.bias | [256] | 256 | True\n", + "decoder.decoders.4.feed_forward.w_1.weight | [256, 2048] | 524288 | True\n", + "decoder.decoders.4.feed_forward.w_1.bias | [2048] | 2048 | True\n", + "decoder.decoders.4.feed_forward.w_2.weight | [2048, 256] | 524288 | True\n", + "decoder.decoders.4.feed_forward.w_2.bias | [256] | 256 | True\n", + "decoder.decoders.4.norm1.weight | [256] | 256 | True\n", + "decoder.decoders.4.norm1.bias | [256] | 256 | True\n", + "decoder.decoders.4.norm2.weight | [256] | 256 | True\n", + "decoder.decoders.4.norm2.bias | [256] | 256 | True\n", + "decoder.decoders.4.norm3.weight | [256] | 256 | True\n", + "decoder.decoders.4.norm3.bias | [256] | 256 | True\n", + "decoder.decoders.4.concat_linear1.weight | [512, 256] | 131072 | True\n", + "decoder.decoders.4.concat_linear1.bias | [256] | 256 | True\n", + "decoder.decoders.4.concat_linear2.weight | [512, 256] | 131072 | True\n", + "decoder.decoders.4.concat_linear2.bias | [256] | 256 | True\n", + "decoder.decoders.5.self_attn.linear_q.weight | [256, 256] | 65536 | True\n", + "decoder.decoders.5.self_attn.linear_q.bias | [256] | 256 | True\n", + "decoder.decoders.5.self_attn.linear_k.weight | [256, 256] | 
65536 | True\n", + "decoder.decoders.5.self_attn.linear_k.bias | [256] | 256 | True\n", + "decoder.decoders.5.self_attn.linear_v.weight | [256, 256] | 65536 | True\n", + "decoder.decoders.5.self_attn.linear_v.bias | [256] | 256 | True\n", + "decoder.decoders.5.self_attn.linear_out.weight | [256, 256] | 65536 | True\n", + "decoder.decoders.5.self_attn.linear_out.bias | [256] | 256 | True\n", + "decoder.decoders.5.src_attn.linear_q.weight | [256, 256] | 65536 | True\n", + "decoder.decoders.5.src_attn.linear_q.bias | [256] | 256 | True\n", + "decoder.decoders.5.src_attn.linear_k.weight | [256, 256] | 65536 | True\n", + "decoder.decoders.5.src_attn.linear_k.bias | [256] | 256 | True\n", + "decoder.decoders.5.src_attn.linear_v.weight | [256, 256] | 65536 | True\n", + "decoder.decoders.5.src_attn.linear_v.bias | [256] | 256 | True\n", + "decoder.decoders.5.src_attn.linear_out.weight | [256, 256] | 65536 | True\n", + "decoder.decoders.5.src_attn.linear_out.bias | [256] | 256 | True\n", + "decoder.decoders.5.feed_forward.w_1.weight | [256, 2048] | 524288 | True\n", + "decoder.decoders.5.feed_forward.w_1.bias | [2048] | 2048 | True\n", + "decoder.decoders.5.feed_forward.w_2.weight | [2048, 256] | 524288 | True\n", + "decoder.decoders.5.feed_forward.w_2.bias | [256] | 256 | True\n", + "decoder.decoders.5.norm1.weight | [256] | 256 | True\n", + "decoder.decoders.5.norm1.bias | [256] | 256 | True\n", + "decoder.decoders.5.norm2.weight | [256] | 256 | True\n", + "decoder.decoders.5.norm2.bias | [256] | 256 | True\n", + "decoder.decoders.5.norm3.weight | [256] | 256 | True\n", + "decoder.decoders.5.norm3.bias | [256] | 256 | True\n", + "decoder.decoders.5.concat_linear1.weight | [512, 256] | 131072 | True\n", + "decoder.decoders.5.concat_linear1.bias | [256] | 256 | True\n", + "decoder.decoders.5.concat_linear2.weight | [512, 256] | 131072 | True\n", + "decoder.decoders.5.concat_linear2.bias | [256] | 256 | True\n", + "ctc.ctc_lo.weight | [256, 4223] | 1081088 | True\n", + 
"ctc.ctc_lo.bias | [4223] | 4223 | True\n", + "Total parameters: 687.0, 49347582.0 elements.\n" + ] + } + ], + "source": [ + "conf_str='examples/aishell/s1/conf/conformer.yaml'\n", + "cfg = CN().load_cfg(open(conf_str))\n", + "cfg.model.input_dim = 80\n", + "cfg.model.output_dim = 4223\n", + "cfg.model.cmvn_file = \"/workspace/wenet/examples/aishell/s0/raw_wav/train/global_cmvn\"\n", + "cfg.model.cmvn_file_type = 'json'\n", + "cfg.freeze()\n", + "\n", + "model = U2Model(cfg.model)\n", + "print_params(model)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "reserved-nightlife", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "encoder.global_cmvn.mean | [80] | 80\n", + "encoder.global_cmvn.istd | [80] | 80\n", + "encoder.embed.conv.0.weight | [256, 1, 3, 3] | 2304\n", + "encoder.embed.conv.0.bias | [256] | 256\n", + "encoder.embed.conv.2.weight | [256, 256, 3, 3] | 589824\n", + "encoder.embed.conv.2.bias | [256] | 256\n", + "encoder.embed.linear.weight | [4864, 256] | 1245184\n", + "encoder.embed.linear.bias | [256] | 256\n", + "encoder.after_norm.weight | [256] | 256\n", + "encoder.after_norm.bias | [256] | 256\n", + "encoder.encoders.0.self_attn.pos_bias_u | [4, 64] | 256\n", + "encoder.encoders.0.self_attn.pos_bias_v | [4, 64] | 256\n", + "encoder.encoders.0.self_attn.linear_q.weight | [256, 256] | 65536\n", + "encoder.encoders.0.self_attn.linear_q.bias | [256] | 256\n", + "encoder.encoders.0.self_attn.linear_k.weight | [256, 256] | 65536\n", + "encoder.encoders.0.self_attn.linear_k.bias | [256] | 256\n", + "encoder.encoders.0.self_attn.linear_v.weight | [256, 256] | 65536\n", + "encoder.encoders.0.self_attn.linear_v.bias | [256] | 256\n", + "encoder.encoders.0.self_attn.linear_out.weight | [256, 256] | 65536\n", + "encoder.encoders.0.self_attn.linear_out.bias | [256] | 256\n", + "encoder.encoders.0.self_attn.linear_pos.weight | [256, 256] | 65536\n", + 
"encoder.encoders.0.feed_forward.w_1.weight | [256, 2048] | 524288\n", + "encoder.encoders.0.feed_forward.w_1.bias | [2048] | 2048\n", + "encoder.encoders.0.feed_forward.w_2.weight | [2048, 256] | 524288\n", + "encoder.encoders.0.feed_forward.w_2.bias | [256] | 256\n", + "encoder.encoders.0.feed_forward_macaron.w_1.weight | [256, 2048] | 524288\n", + "encoder.encoders.0.feed_forward_macaron.w_1.bias | [2048] | 2048\n", + "encoder.encoders.0.feed_forward_macaron.w_2.weight | [2048, 256] | 524288\n", + "encoder.encoders.0.feed_forward_macaron.w_2.bias | [256] | 256\n", + "encoder.encoders.0.conv_module.pointwise_conv1.weight | [512, 256, 1] | 131072\n", + "encoder.encoders.0.conv_module.pointwise_conv1.bias | [512] | 512\n", + "encoder.encoders.0.conv_module.depthwise_conv.weight | [256, 1, 15] | 3840\n", + "encoder.encoders.0.conv_module.depthwise_conv.bias | [256] | 256\n", + "encoder.encoders.0.conv_module.norm.weight | [256] | 256\n", + "encoder.encoders.0.conv_module.norm.bias | [256] | 256\n", + "encoder.encoders.0.conv_module.norm._mean | [256] | 256\n", + "encoder.encoders.0.conv_module.norm._variance | [256] | 256\n", + "encoder.encoders.0.conv_module.pointwise_conv2.weight | [256, 256, 1] | 65536\n", + "encoder.encoders.0.conv_module.pointwise_conv2.bias | [256] | 256\n", + "encoder.encoders.0.norm_ff.weight | [256] | 256\n", + "encoder.encoders.0.norm_ff.bias | [256] | 256\n", + "encoder.encoders.0.norm_mha.weight | [256] | 256\n", + "encoder.encoders.0.norm_mha.bias | [256] | 256\n", + "encoder.encoders.0.norm_ff_macaron.weight | [256] | 256\n", + "encoder.encoders.0.norm_ff_macaron.bias | [256] | 256\n", + "encoder.encoders.0.norm_conv.weight | [256] | 256\n", + "encoder.encoders.0.norm_conv.bias | [256] | 256\n", + "encoder.encoders.0.norm_final.weight | [256] | 256\n", + "encoder.encoders.0.norm_final.bias | [256] | 256\n", + "encoder.encoders.0.concat_linear.weight | [512, 256] | 131072\n", + "encoder.encoders.0.concat_linear.bias | [256] | 256\n", + 
"encoder.encoders.1.self_attn.pos_bias_u | [4, 64] | 256\n", + "encoder.encoders.1.self_attn.pos_bias_v | [4, 64] | 256\n", + "encoder.encoders.1.self_attn.linear_q.weight | [256, 256] | 65536\n", + "encoder.encoders.1.self_attn.linear_q.bias | [256] | 256\n", + "encoder.encoders.1.self_attn.linear_k.weight | [256, 256] | 65536\n", + "encoder.encoders.1.self_attn.linear_k.bias | [256] | 256\n", + "encoder.encoders.1.self_attn.linear_v.weight | [256, 256] | 65536\n", + "encoder.encoders.1.self_attn.linear_v.bias | [256] | 256\n", + "encoder.encoders.1.self_attn.linear_out.weight | [256, 256] | 65536\n", + "encoder.encoders.1.self_attn.linear_out.bias | [256] | 256\n", + "encoder.encoders.1.self_attn.linear_pos.weight | [256, 256] | 65536\n", + "encoder.encoders.1.feed_forward.w_1.weight | [256, 2048] | 524288\n", + "encoder.encoders.1.feed_forward.w_1.bias | [2048] | 2048\n", + "encoder.encoders.1.feed_forward.w_2.weight | [2048, 256] | 524288\n", + "encoder.encoders.1.feed_forward.w_2.bias | [256] | 256\n", + "encoder.encoders.1.feed_forward_macaron.w_1.weight | [256, 2048] | 524288\n", + "encoder.encoders.1.feed_forward_macaron.w_1.bias | [2048] | 2048\n", + "encoder.encoders.1.feed_forward_macaron.w_2.weight | [2048, 256] | 524288\n", + "encoder.encoders.1.feed_forward_macaron.w_2.bias | [256] | 256\n", + "encoder.encoders.1.conv_module.pointwise_conv1.weight | [512, 256, 1] | 131072\n", + "encoder.encoders.1.conv_module.pointwise_conv1.bias | [512] | 512\n", + "encoder.encoders.1.conv_module.depthwise_conv.weight | [256, 1, 15] | 3840\n", + "encoder.encoders.1.conv_module.depthwise_conv.bias | [256] | 256\n", + "encoder.encoders.1.conv_module.norm.weight | [256] | 256\n", + "encoder.encoders.1.conv_module.norm.bias | [256] | 256\n", + "encoder.encoders.1.conv_module.norm._mean | [256] | 256\n", + "encoder.encoders.1.conv_module.norm._variance | [256] | 256\n", + "encoder.encoders.1.conv_module.pointwise_conv2.weight | [256, 256, 1] | 65536\n", + 
"encoder.encoders.1.conv_module.pointwise_conv2.bias | [256] | 256\n", + "encoder.encoders.1.norm_ff.weight | [256] | 256\n", + "encoder.encoders.1.norm_ff.bias | [256] | 256\n", + "encoder.encoders.1.norm_mha.weight | [256] | 256\n", + "encoder.encoders.1.norm_mha.bias | [256] | 256\n", + "encoder.encoders.1.norm_ff_macaron.weight | [256] | 256\n", + "encoder.encoders.1.norm_ff_macaron.bias | [256] | 256\n", + "encoder.encoders.1.norm_conv.weight | [256] | 256\n", + "encoder.encoders.1.norm_conv.bias | [256] | 256\n", + "encoder.encoders.1.norm_final.weight | [256] | 256\n", + "encoder.encoders.1.norm_final.bias | [256] | 256\n", + "encoder.encoders.1.concat_linear.weight | [512, 256] | 131072\n", + "encoder.encoders.1.concat_linear.bias | [256] | 256\n", + "encoder.encoders.2.self_attn.pos_bias_u | [4, 64] | 256\n", + "encoder.encoders.2.self_attn.pos_bias_v | [4, 64] | 256\n", + "encoder.encoders.2.self_attn.linear_q.weight | [256, 256] | 65536\n", + "encoder.encoders.2.self_attn.linear_q.bias | [256] | 256\n", + "encoder.encoders.2.self_attn.linear_k.weight | [256, 256] | 65536\n", + "encoder.encoders.2.self_attn.linear_k.bias | [256] | 256\n", + "encoder.encoders.2.self_attn.linear_v.weight | [256, 256] | 65536\n", + "encoder.encoders.2.self_attn.linear_v.bias | [256] | 256\n", + "encoder.encoders.2.self_attn.linear_out.weight | [256, 256] | 65536\n", + "encoder.encoders.2.self_attn.linear_out.bias | [256] | 256\n", + "encoder.encoders.2.self_attn.linear_pos.weight | [256, 256] | 65536\n", + "encoder.encoders.2.feed_forward.w_1.weight | [256, 2048] | 524288\n", + "encoder.encoders.2.feed_forward.w_1.bias | [2048] | 2048\n", + "encoder.encoders.2.feed_forward.w_2.weight | [2048, 256] | 524288\n", + "encoder.encoders.2.feed_forward.w_2.bias | [256] | 256\n", + "encoder.encoders.2.feed_forward_macaron.w_1.weight | [256, 2048] | 524288\n", + "encoder.encoders.2.feed_forward_macaron.w_1.bias | [2048] | 2048\n", + "encoder.encoders.2.feed_forward_macaron.w_2.weight 
| [2048, 256] | 524288\n", + "encoder.encoders.2.feed_forward_macaron.w_2.bias | [256] | 256\n", + "encoder.encoders.2.conv_module.pointwise_conv1.weight | [512, 256, 1] | 131072\n", + "encoder.encoders.2.conv_module.pointwise_conv1.bias | [512] | 512\n", + "encoder.encoders.2.conv_module.depthwise_conv.weight | [256, 1, 15] | 3840\n", + "encoder.encoders.2.conv_module.depthwise_conv.bias | [256] | 256\n", + "encoder.encoders.2.conv_module.norm.weight | [256] | 256\n", + "encoder.encoders.2.conv_module.norm.bias | [256] | 256\n", + "encoder.encoders.2.conv_module.norm._mean | [256] | 256\n", + "encoder.encoders.2.conv_module.norm._variance | [256] | 256\n", + "encoder.encoders.2.conv_module.pointwise_conv2.weight | [256, 256, 1] | 65536\n", + "encoder.encoders.2.conv_module.pointwise_conv2.bias | [256] | 256\n", + "encoder.encoders.2.norm_ff.weight | [256] | 256\n", + "encoder.encoders.2.norm_ff.bias | [256] | 256\n", + "encoder.encoders.2.norm_mha.weight | [256] | 256\n", + "encoder.encoders.2.norm_mha.bias | [256] | 256\n", + "encoder.encoders.2.norm_ff_macaron.weight | [256] | 256\n", + "encoder.encoders.2.norm_ff_macaron.bias | [256] | 256\n", + "encoder.encoders.2.norm_conv.weight | [256] | 256\n", + "encoder.encoders.2.norm_conv.bias | [256] | 256\n", + "encoder.encoders.2.norm_final.weight | [256] | 256\n", + "encoder.encoders.2.norm_final.bias | [256] | 256\n", + "encoder.encoders.2.concat_linear.weight | [512, 256] | 131072\n", + "encoder.encoders.2.concat_linear.bias | [256] | 256\n", + "encoder.encoders.3.self_attn.pos_bias_u | [4, 64] | 256\n", + "encoder.encoders.3.self_attn.pos_bias_v | [4, 64] | 256\n", + "encoder.encoders.3.self_attn.linear_q.weight | [256, 256] | 65536\n", + "encoder.encoders.3.self_attn.linear_q.bias | [256] | 256\n", + "encoder.encoders.3.self_attn.linear_k.weight | [256, 256] | 65536\n", + "encoder.encoders.3.self_attn.linear_k.bias | [256] | 256\n", + "encoder.encoders.3.self_attn.linear_v.weight | [256, 256] | 65536\n", + 
"encoder.encoders.3.self_attn.linear_v.bias | [256] | 256\n", + "encoder.encoders.3.self_attn.linear_out.weight | [256, 256] | 65536\n", + "encoder.encoders.3.self_attn.linear_out.bias | [256] | 256\n", + "encoder.encoders.3.self_attn.linear_pos.weight | [256, 256] | 65536\n", + "encoder.encoders.3.feed_forward.w_1.weight | [256, 2048] | 524288\n", + "encoder.encoders.3.feed_forward.w_1.bias | [2048] | 2048\n", + "encoder.encoders.3.feed_forward.w_2.weight | [2048, 256] | 524288\n", + "encoder.encoders.3.feed_forward.w_2.bias | [256] | 256\n", + "encoder.encoders.3.feed_forward_macaron.w_1.weight | [256, 2048] | 524288\n", + "encoder.encoders.3.feed_forward_macaron.w_1.bias | [2048] | 2048\n", + "encoder.encoders.3.feed_forward_macaron.w_2.weight | [2048, 256] | 524288\n", + "encoder.encoders.3.feed_forward_macaron.w_2.bias | [256] | 256\n", + "encoder.encoders.3.conv_module.pointwise_conv1.weight | [512, 256, 1] | 131072\n", + "encoder.encoders.3.conv_module.pointwise_conv1.bias | [512] | 512\n", + "encoder.encoders.3.conv_module.depthwise_conv.weight | [256, 1, 15] | 3840\n", + "encoder.encoders.3.conv_module.depthwise_conv.bias | [256] | 256\n", + "encoder.encoders.3.conv_module.norm.weight | [256] | 256\n", + "encoder.encoders.3.conv_module.norm.bias | [256] | 256\n", + "encoder.encoders.3.conv_module.norm._mean | [256] | 256\n", + "encoder.encoders.3.conv_module.norm._variance | [256] | 256\n", + "encoder.encoders.3.conv_module.pointwise_conv2.weight | [256, 256, 1] | 65536\n", + "encoder.encoders.3.conv_module.pointwise_conv2.bias | [256] | 256\n", + "encoder.encoders.3.norm_ff.weight | [256] | 256\n", + "encoder.encoders.3.norm_ff.bias | [256] | 256\n", + "encoder.encoders.3.norm_mha.weight | [256] | 256\n", + "encoder.encoders.3.norm_mha.bias | [256] | 256\n", + "encoder.encoders.3.norm_ff_macaron.weight | [256] | 256\n", + "encoder.encoders.3.norm_ff_macaron.bias | [256] | 256\n", + "encoder.encoders.3.norm_conv.weight | [256] | 256\n", + 
"encoder.encoders.3.norm_conv.bias | [256] | 256\n", + "encoder.encoders.3.norm_final.weight | [256] | 256\n", + "encoder.encoders.3.norm_final.bias | [256] | 256\n", + "encoder.encoders.3.concat_linear.weight | [512, 256] | 131072\n", + "encoder.encoders.3.concat_linear.bias | [256] | 256\n", + "encoder.encoders.4.self_attn.pos_bias_u | [4, 64] | 256\n", + "encoder.encoders.4.self_attn.pos_bias_v | [4, 64] | 256\n", + "encoder.encoders.4.self_attn.linear_q.weight | [256, 256] | 65536\n", + "encoder.encoders.4.self_attn.linear_q.bias | [256] | 256\n", + "encoder.encoders.4.self_attn.linear_k.weight | [256, 256] | 65536\n", + "encoder.encoders.4.self_attn.linear_k.bias | [256] | 256\n", + "encoder.encoders.4.self_attn.linear_v.weight | [256, 256] | 65536\n", + "encoder.encoders.4.self_attn.linear_v.bias | [256] | 256\n", + "encoder.encoders.4.self_attn.linear_out.weight | [256, 256] | 65536\n", + "encoder.encoders.4.self_attn.linear_out.bias | [256] | 256\n", + "encoder.encoders.4.self_attn.linear_pos.weight | [256, 256] | 65536\n", + "encoder.encoders.4.feed_forward.w_1.weight | [256, 2048] | 524288\n", + "encoder.encoders.4.feed_forward.w_1.bias | [2048] | 2048\n", + "encoder.encoders.4.feed_forward.w_2.weight | [2048, 256] | 524288\n", + "encoder.encoders.4.feed_forward.w_2.bias | [256] | 256\n", + "encoder.encoders.4.feed_forward_macaron.w_1.weight | [256, 2048] | 524288\n", + "encoder.encoders.4.feed_forward_macaron.w_1.bias | [2048] | 2048\n", + "encoder.encoders.4.feed_forward_macaron.w_2.weight | [2048, 256] | 524288\n", + "encoder.encoders.4.feed_forward_macaron.w_2.bias | [256] | 256\n", + "encoder.encoders.4.conv_module.pointwise_conv1.weight | [512, 256, 1] | 131072\n", + "encoder.encoders.4.conv_module.pointwise_conv1.bias | [512] | 512\n", + "encoder.encoders.4.conv_module.depthwise_conv.weight | [256, 1, 15] | 3840\n", + "encoder.encoders.4.conv_module.depthwise_conv.bias | [256] | 256\n", + "encoder.encoders.4.conv_module.norm.weight | [256] | 
256\n", + "encoder.encoders.4.conv_module.norm.bias | [256] | 256\n", + "encoder.encoders.4.conv_module.norm._mean | [256] | 256\n", + "encoder.encoders.4.conv_module.norm._variance | [256] | 256\n", + "encoder.encoders.4.conv_module.pointwise_conv2.weight | [256, 256, 1] | 65536\n", + "encoder.encoders.4.conv_module.pointwise_conv2.bias | [256] | 256\n", + "encoder.encoders.4.norm_ff.weight | [256] | 256\n", + "encoder.encoders.4.norm_ff.bias | [256] | 256\n", + "encoder.encoders.4.norm_mha.weight | [256] | 256\n", + "encoder.encoders.4.norm_mha.bias | [256] | 256\n", + "encoder.encoders.4.norm_ff_macaron.weight | [256] | 256\n", + "encoder.encoders.4.norm_ff_macaron.bias | [256] | 256\n", + "encoder.encoders.4.norm_conv.weight | [256] | 256\n", + "encoder.encoders.4.norm_conv.bias | [256] | 256\n", + "encoder.encoders.4.norm_final.weight | [256] | 256\n", + "encoder.encoders.4.norm_final.bias | [256] | 256\n", + "encoder.encoders.4.concat_linear.weight | [512, 256] | 131072\n", + "encoder.encoders.4.concat_linear.bias | [256] | 256\n", + "encoder.encoders.5.self_attn.pos_bias_u | [4, 64] | 256\n", + "encoder.encoders.5.self_attn.pos_bias_v | [4, 64] | 256\n", + "encoder.encoders.5.self_attn.linear_q.weight | [256, 256] | 65536\n", + "encoder.encoders.5.self_attn.linear_q.bias | [256] | 256\n", + "encoder.encoders.5.self_attn.linear_k.weight | [256, 256] | 65536\n", + "encoder.encoders.5.self_attn.linear_k.bias | [256] | 256\n", + "encoder.encoders.5.self_attn.linear_v.weight | [256, 256] | 65536\n", + "encoder.encoders.5.self_attn.linear_v.bias | [256] | 256\n", + "encoder.encoders.5.self_attn.linear_out.weight | [256, 256] | 65536\n", + "encoder.encoders.5.self_attn.linear_out.bias | [256] | 256\n", + "encoder.encoders.5.self_attn.linear_pos.weight | [256, 256] | 65536\n", + "encoder.encoders.5.feed_forward.w_1.weight | [256, 2048] | 524288\n", + "encoder.encoders.5.feed_forward.w_1.bias | [2048] | 2048\n", + "encoder.encoders.5.feed_forward.w_2.weight | [2048, 
256] | 524288\n", + "encoder.encoders.5.feed_forward.w_2.bias | [256] | 256\n", + "encoder.encoders.5.feed_forward_macaron.w_1.weight | [256, 2048] | 524288\n", + "encoder.encoders.5.feed_forward_macaron.w_1.bias | [2048] | 2048\n", + "encoder.encoders.5.feed_forward_macaron.w_2.weight | [2048, 256] | 524288\n", + "encoder.encoders.5.feed_forward_macaron.w_2.bias | [256] | 256\n", + "encoder.encoders.5.conv_module.pointwise_conv1.weight | [512, 256, 1] | 131072\n", + "encoder.encoders.5.conv_module.pointwise_conv1.bias | [512] | 512\n", + "encoder.encoders.5.conv_module.depthwise_conv.weight | [256, 1, 15] | 3840\n", + "encoder.encoders.5.conv_module.depthwise_conv.bias | [256] | 256\n", + "encoder.encoders.5.conv_module.norm.weight | [256] | 256\n", + "encoder.encoders.5.conv_module.norm.bias | [256] | 256\n", + "encoder.encoders.5.conv_module.norm._mean | [256] | 256\n", + "encoder.encoders.5.conv_module.norm._variance | [256] | 256\n", + "encoder.encoders.5.conv_module.pointwise_conv2.weight | [256, 256, 1] | 65536\n", + "encoder.encoders.5.conv_module.pointwise_conv2.bias | [256] | 256\n", + "encoder.encoders.5.norm_ff.weight | [256] | 256\n", + "encoder.encoders.5.norm_ff.bias | [256] | 256\n", + "encoder.encoders.5.norm_mha.weight | [256] | 256\n", + "encoder.encoders.5.norm_mha.bias | [256] | 256\n", + "encoder.encoders.5.norm_ff_macaron.weight | [256] | 256\n", + "encoder.encoders.5.norm_ff_macaron.bias | [256] | 256\n", + "encoder.encoders.5.norm_conv.weight | [256] | 256\n", + "encoder.encoders.5.norm_conv.bias | [256] | 256\n", + "encoder.encoders.5.norm_final.weight | [256] | 256\n", + "encoder.encoders.5.norm_final.bias | [256] | 256\n", + "encoder.encoders.5.concat_linear.weight | [512, 256] | 131072\n", + "encoder.encoders.5.concat_linear.bias | [256] | 256\n", + "encoder.encoders.6.self_attn.pos_bias_u | [4, 64] | 256\n", + "encoder.encoders.6.self_attn.pos_bias_v | [4, 64] | 256\n", + "encoder.encoders.6.self_attn.linear_q.weight | [256, 256] | 
65536\n", + "encoder.encoders.6.self_attn.linear_q.bias | [256] | 256\n", + "encoder.encoders.6.self_attn.linear_k.weight | [256, 256] | 65536\n", + "encoder.encoders.6.self_attn.linear_k.bias | [256] | 256\n", + "encoder.encoders.6.self_attn.linear_v.weight | [256, 256] | 65536\n", + "encoder.encoders.6.self_attn.linear_v.bias | [256] | 256\n", + "encoder.encoders.6.self_attn.linear_out.weight | [256, 256] | 65536\n", + "encoder.encoders.6.self_attn.linear_out.bias | [256] | 256\n", + "encoder.encoders.6.self_attn.linear_pos.weight | [256, 256] | 65536\n", + "encoder.encoders.6.feed_forward.w_1.weight | [256, 2048] | 524288\n", + "encoder.encoders.6.feed_forward.w_1.bias | [2048] | 2048\n", + "encoder.encoders.6.feed_forward.w_2.weight | [2048, 256] | 524288\n", + "encoder.encoders.6.feed_forward.w_2.bias | [256] | 256\n", + "encoder.encoders.6.feed_forward_macaron.w_1.weight | [256, 2048] | 524288\n", + "encoder.encoders.6.feed_forward_macaron.w_1.bias | [2048] | 2048\n", + "encoder.encoders.6.feed_forward_macaron.w_2.weight | [2048, 256] | 524288\n", + "encoder.encoders.6.feed_forward_macaron.w_2.bias | [256] | 256\n", + "encoder.encoders.6.conv_module.pointwise_conv1.weight | [512, 256, 1] | 131072\n", + "encoder.encoders.6.conv_module.pointwise_conv1.bias | [512] | 512\n", + "encoder.encoders.6.conv_module.depthwise_conv.weight | [256, 1, 15] | 3840\n", + "encoder.encoders.6.conv_module.depthwise_conv.bias | [256] | 256\n", + "encoder.encoders.6.conv_module.norm.weight | [256] | 256\n", + "encoder.encoders.6.conv_module.norm.bias | [256] | 256\n", + "encoder.encoders.6.conv_module.norm._mean | [256] | 256\n", + "encoder.encoders.6.conv_module.norm._variance | [256] | 256\n", + "encoder.encoders.6.conv_module.pointwise_conv2.weight | [256, 256, 1] | 65536\n", + "encoder.encoders.6.conv_module.pointwise_conv2.bias | [256] | 256\n", + "encoder.encoders.6.norm_ff.weight | [256] | 256\n", + "encoder.encoders.6.norm_ff.bias | [256] | 256\n", + 
"encoder.encoders.6.norm_mha.weight | [256] | 256\n", + "encoder.encoders.6.norm_mha.bias | [256] | 256\n", + "encoder.encoders.6.norm_ff_macaron.weight | [256] | 256\n", + "encoder.encoders.6.norm_ff_macaron.bias | [256] | 256\n", + "encoder.encoders.6.norm_conv.weight | [256] | 256\n", + "encoder.encoders.6.norm_conv.bias | [256] | 256\n", + "encoder.encoders.6.norm_final.weight | [256] | 256\n", + "encoder.encoders.6.norm_final.bias | [256] | 256\n", + "encoder.encoders.6.concat_linear.weight | [512, 256] | 131072\n", + "encoder.encoders.6.concat_linear.bias | [256] | 256\n", + "encoder.encoders.7.self_attn.pos_bias_u | [4, 64] | 256\n", + "encoder.encoders.7.self_attn.pos_bias_v | [4, 64] | 256\n", + "encoder.encoders.7.self_attn.linear_q.weight | [256, 256] | 65536\n", + "encoder.encoders.7.self_attn.linear_q.bias | [256] | 256\n", + "encoder.encoders.7.self_attn.linear_k.weight | [256, 256] | 65536\n", + "encoder.encoders.7.self_attn.linear_k.bias | [256] | 256\n", + "encoder.encoders.7.self_attn.linear_v.weight | [256, 256] | 65536\n", + "encoder.encoders.7.self_attn.linear_v.bias | [256] | 256\n", + "encoder.encoders.7.self_attn.linear_out.weight | [256, 256] | 65536\n", + "encoder.encoders.7.self_attn.linear_out.bias | [256] | 256\n", + "encoder.encoders.7.self_attn.linear_pos.weight | [256, 256] | 65536\n", + "encoder.encoders.7.feed_forward.w_1.weight | [256, 2048] | 524288\n", + "encoder.encoders.7.feed_forward.w_1.bias | [2048] | 2048\n", + "encoder.encoders.7.feed_forward.w_2.weight | [2048, 256] | 524288\n", + "encoder.encoders.7.feed_forward.w_2.bias | [256] | 256\n", + "encoder.encoders.7.feed_forward_macaron.w_1.weight | [256, 2048] | 524288\n", + "encoder.encoders.7.feed_forward_macaron.w_1.bias | [2048] | 2048\n", + "encoder.encoders.7.feed_forward_macaron.w_2.weight | [2048, 256] | 524288\n", + "encoder.encoders.7.feed_forward_macaron.w_2.bias | [256] | 256\n", + "encoder.encoders.7.conv_module.pointwise_conv1.weight | [512, 256, 1] | 
131072\n", + "encoder.encoders.7.conv_module.pointwise_conv1.bias | [512] | 512\n", + "encoder.encoders.7.conv_module.depthwise_conv.weight | [256, 1, 15] | 3840\n", + "encoder.encoders.7.conv_module.depthwise_conv.bias | [256] | 256\n", + "encoder.encoders.7.conv_module.norm.weight | [256] | 256\n", + "encoder.encoders.7.conv_module.norm.bias | [256] | 256\n", + "encoder.encoders.7.conv_module.norm._mean | [256] | 256\n", + "encoder.encoders.7.conv_module.norm._variance | [256] | 256\n", + "encoder.encoders.7.conv_module.pointwise_conv2.weight | [256, 256, 1] | 65536\n", + "encoder.encoders.7.conv_module.pointwise_conv2.bias | [256] | 256\n", + "encoder.encoders.7.norm_ff.weight | [256] | 256\n", + "encoder.encoders.7.norm_ff.bias | [256] | 256\n", + "encoder.encoders.7.norm_mha.weight | [256] | 256\n", + "encoder.encoders.7.norm_mha.bias | [256] | 256\n", + "encoder.encoders.7.norm_ff_macaron.weight | [256] | 256\n", + "encoder.encoders.7.norm_ff_macaron.bias | [256] | 256\n", + "encoder.encoders.7.norm_conv.weight | [256] | 256\n", + "encoder.encoders.7.norm_conv.bias | [256] | 256\n", + "encoder.encoders.7.norm_final.weight | [256] | 256\n", + "encoder.encoders.7.norm_final.bias | [256] | 256\n", + "encoder.encoders.7.concat_linear.weight | [512, 256] | 131072\n", + "encoder.encoders.7.concat_linear.bias | [256] | 256\n", + "encoder.encoders.8.self_attn.pos_bias_u | [4, 64] | 256\n", + "encoder.encoders.8.self_attn.pos_bias_v | [4, 64] | 256\n", + "encoder.encoders.8.self_attn.linear_q.weight | [256, 256] | 65536\n", + "encoder.encoders.8.self_attn.linear_q.bias | [256] | 256\n", + "encoder.encoders.8.self_attn.linear_k.weight | [256, 256] | 65536\n", + "encoder.encoders.8.self_attn.linear_k.bias | [256] | 256\n", + "encoder.encoders.8.self_attn.linear_v.weight | [256, 256] | 65536\n", + "encoder.encoders.8.self_attn.linear_v.bias | [256] | 256\n", + "encoder.encoders.8.self_attn.linear_out.weight | [256, 256] | 65536\n", + 
"encoder.encoders.8.self_attn.linear_out.bias | [256] | 256\n", + "encoder.encoders.8.self_attn.linear_pos.weight | [256, 256] | 65536\n", + "encoder.encoders.8.feed_forward.w_1.weight | [256, 2048] | 524288\n", + "encoder.encoders.8.feed_forward.w_1.bias | [2048] | 2048\n", + "encoder.encoders.8.feed_forward.w_2.weight | [2048, 256] | 524288\n", + "encoder.encoders.8.feed_forward.w_2.bias | [256] | 256\n", + "encoder.encoders.8.feed_forward_macaron.w_1.weight | [256, 2048] | 524288\n", + "encoder.encoders.8.feed_forward_macaron.w_1.bias | [2048] | 2048\n", + "encoder.encoders.8.feed_forward_macaron.w_2.weight | [2048, 256] | 524288\n", + "encoder.encoders.8.feed_forward_macaron.w_2.bias | [256] | 256\n", + "encoder.encoders.8.conv_module.pointwise_conv1.weight | [512, 256, 1] | 131072\n", + "encoder.encoders.8.conv_module.pointwise_conv1.bias | [512] | 512\n", + "encoder.encoders.8.conv_module.depthwise_conv.weight | [256, 1, 15] | 3840\n", + "encoder.encoders.8.conv_module.depthwise_conv.bias | [256] | 256\n", + "encoder.encoders.8.conv_module.norm.weight | [256] | 256\n", + "encoder.encoders.8.conv_module.norm.bias | [256] | 256\n", + "encoder.encoders.8.conv_module.norm._mean | [256] | 256\n", + "encoder.encoders.8.conv_module.norm._variance | [256] | 256\n", + "encoder.encoders.8.conv_module.pointwise_conv2.weight | [256, 256, 1] | 65536\n", + "encoder.encoders.8.conv_module.pointwise_conv2.bias | [256] | 256\n", + "encoder.encoders.8.norm_ff.weight | [256] | 256\n", + "encoder.encoders.8.norm_ff.bias | [256] | 256\n", + "encoder.encoders.8.norm_mha.weight | [256] | 256\n", + "encoder.encoders.8.norm_mha.bias | [256] | 256\n", + "encoder.encoders.8.norm_ff_macaron.weight | [256] | 256\n", + "encoder.encoders.8.norm_ff_macaron.bias | [256] | 256\n", + "encoder.encoders.8.norm_conv.weight | [256] | 256\n", + "encoder.encoders.8.norm_conv.bias | [256] | 256\n", + "encoder.encoders.8.norm_final.weight | [256] | 256\n", + "encoder.encoders.8.norm_final.bias | [256] 
| 256\n", + "encoder.encoders.8.concat_linear.weight | [512, 256] | 131072\n", + "encoder.encoders.8.concat_linear.bias | [256] | 256\n", + "encoder.encoders.9.self_attn.pos_bias_u | [4, 64] | 256\n", + "encoder.encoders.9.self_attn.pos_bias_v | [4, 64] | 256\n", + "encoder.encoders.9.self_attn.linear_q.weight | [256, 256] | 65536\n", + "encoder.encoders.9.self_attn.linear_q.bias | [256] | 256\n", + "encoder.encoders.9.self_attn.linear_k.weight | [256, 256] | 65536\n", + "encoder.encoders.9.self_attn.linear_k.bias | [256] | 256\n", + "encoder.encoders.9.self_attn.linear_v.weight | [256, 256] | 65536\n", + "encoder.encoders.9.self_attn.linear_v.bias | [256] | 256\n", + "encoder.encoders.9.self_attn.linear_out.weight | [256, 256] | 65536\n", + "encoder.encoders.9.self_attn.linear_out.bias | [256] | 256\n", + "encoder.encoders.9.self_attn.linear_pos.weight | [256, 256] | 65536\n", + "encoder.encoders.9.feed_forward.w_1.weight | [256, 2048] | 524288\n", + "encoder.encoders.9.feed_forward.w_1.bias | [2048] | 2048\n", + "encoder.encoders.9.feed_forward.w_2.weight | [2048, 256] | 524288\n", + "encoder.encoders.9.feed_forward.w_2.bias | [256] | 256\n", + "encoder.encoders.9.feed_forward_macaron.w_1.weight | [256, 2048] | 524288\n", + "encoder.encoders.9.feed_forward_macaron.w_1.bias | [2048] | 2048\n", + "encoder.encoders.9.feed_forward_macaron.w_2.weight | [2048, 256] | 524288\n", + "encoder.encoders.9.feed_forward_macaron.w_2.bias | [256] | 256\n", + "encoder.encoders.9.conv_module.pointwise_conv1.weight | [512, 256, 1] | 131072\n", + "encoder.encoders.9.conv_module.pointwise_conv1.bias | [512] | 512\n", + "encoder.encoders.9.conv_module.depthwise_conv.weight | [256, 1, 15] | 3840\n", + "encoder.encoders.9.conv_module.depthwise_conv.bias | [256] | 256\n", + "encoder.encoders.9.conv_module.norm.weight | [256] | 256\n", + "encoder.encoders.9.conv_module.norm.bias | [256] | 256\n", + "encoder.encoders.9.conv_module.norm._mean | [256] | 256\n", + 
"encoder.encoders.9.conv_module.norm._variance | [256] | 256\n", + "encoder.encoders.9.conv_module.pointwise_conv2.weight | [256, 256, 1] | 65536\n", + "encoder.encoders.9.conv_module.pointwise_conv2.bias | [256] | 256\n", + "encoder.encoders.9.norm_ff.weight | [256] | 256\n", + "encoder.encoders.9.norm_ff.bias | [256] | 256\n", + "encoder.encoders.9.norm_mha.weight | [256] | 256\n", + "encoder.encoders.9.norm_mha.bias | [256] | 256\n", + "encoder.encoders.9.norm_ff_macaron.weight | [256] | 256\n", + "encoder.encoders.9.norm_ff_macaron.bias | [256] | 256\n", + "encoder.encoders.9.norm_conv.weight | [256] | 256\n", + "encoder.encoders.9.norm_conv.bias | [256] | 256\n", + "encoder.encoders.9.norm_final.weight | [256] | 256\n", + "encoder.encoders.9.norm_final.bias | [256] | 256\n", + "encoder.encoders.9.concat_linear.weight | [512, 256] | 131072\n", + "encoder.encoders.9.concat_linear.bias | [256] | 256\n", + "encoder.encoders.10.self_attn.pos_bias_u | [4, 64] | 256\n", + "encoder.encoders.10.self_attn.pos_bias_v | [4, 64] | 256\n", + "encoder.encoders.10.self_attn.linear_q.weight | [256, 256] | 65536\n", + "encoder.encoders.10.self_attn.linear_q.bias | [256] | 256\n", + "encoder.encoders.10.self_attn.linear_k.weight | [256, 256] | 65536\n", + "encoder.encoders.10.self_attn.linear_k.bias | [256] | 256\n", + "encoder.encoders.10.self_attn.linear_v.weight | [256, 256] | 65536\n", + "encoder.encoders.10.self_attn.linear_v.bias | [256] | 256\n", + "encoder.encoders.10.self_attn.linear_out.weight | [256, 256] | 65536\n", + "encoder.encoders.10.self_attn.linear_out.bias | [256] | 256\n", + "encoder.encoders.10.self_attn.linear_pos.weight | [256, 256] | 65536\n", + "encoder.encoders.10.feed_forward.w_1.weight | [256, 2048] | 524288\n", + "encoder.encoders.10.feed_forward.w_1.bias | [2048] | 2048\n", + "encoder.encoders.10.feed_forward.w_2.weight | [2048, 256] | 524288\n", + "encoder.encoders.10.feed_forward.w_2.bias | [256] | 256\n", + 
"encoder.encoders.10.feed_forward_macaron.w_1.weight | [256, 2048] | 524288\n", + "encoder.encoders.10.feed_forward_macaron.w_1.bias | [2048] | 2048\n", + "encoder.encoders.10.feed_forward_macaron.w_2.weight | [2048, 256] | 524288\n", + "encoder.encoders.10.feed_forward_macaron.w_2.bias | [256] | 256\n", + "encoder.encoders.10.conv_module.pointwise_conv1.weight | [512, 256, 1] | 131072\n", + "encoder.encoders.10.conv_module.pointwise_conv1.bias | [512] | 512\n", + "encoder.encoders.10.conv_module.depthwise_conv.weight | [256, 1, 15] | 3840\n", + "encoder.encoders.10.conv_module.depthwise_conv.bias | [256] | 256\n", + "encoder.encoders.10.conv_module.norm.weight | [256] | 256\n", + "encoder.encoders.10.conv_module.norm.bias | [256] | 256\n", + "encoder.encoders.10.conv_module.norm._mean | [256] | 256\n", + "encoder.encoders.10.conv_module.norm._variance | [256] | 256\n", + "encoder.encoders.10.conv_module.pointwise_conv2.weight | [256, 256, 1] | 65536\n", + "encoder.encoders.10.conv_module.pointwise_conv2.bias | [256] | 256\n", + "encoder.encoders.10.norm_ff.weight | [256] | 256\n", + "encoder.encoders.10.norm_ff.bias | [256] | 256\n", + "encoder.encoders.10.norm_mha.weight | [256] | 256\n", + "encoder.encoders.10.norm_mha.bias | [256] | 256\n", + "encoder.encoders.10.norm_ff_macaron.weight | [256] | 256\n", + "encoder.encoders.10.norm_ff_macaron.bias | [256] | 256\n", + "encoder.encoders.10.norm_conv.weight | [256] | 256\n", + "encoder.encoders.10.norm_conv.bias | [256] | 256\n", + "encoder.encoders.10.norm_final.weight | [256] | 256\n", + "encoder.encoders.10.norm_final.bias | [256] | 256\n", + "encoder.encoders.10.concat_linear.weight | [512, 256] | 131072\n", + "encoder.encoders.10.concat_linear.bias | [256] | 256\n", + "encoder.encoders.11.self_attn.pos_bias_u | [4, 64] | 256\n", + "encoder.encoders.11.self_attn.pos_bias_v | [4, 64] | 256\n", + "encoder.encoders.11.self_attn.linear_q.weight | [256, 256] | 65536\n", + "encoder.encoders.11.self_attn.linear_q.bias 
| [256] | 256\n", + "encoder.encoders.11.self_attn.linear_k.weight | [256, 256] | 65536\n", + "encoder.encoders.11.self_attn.linear_k.bias | [256] | 256\n", + "encoder.encoders.11.self_attn.linear_v.weight | [256, 256] | 65536\n", + "encoder.encoders.11.self_attn.linear_v.bias | [256] | 256\n", + "encoder.encoders.11.self_attn.linear_out.weight | [256, 256] | 65536\n", + "encoder.encoders.11.self_attn.linear_out.bias | [256] | 256\n", + "encoder.encoders.11.self_attn.linear_pos.weight | [256, 256] | 65536\n", + "encoder.encoders.11.feed_forward.w_1.weight | [256, 2048] | 524288\n", + "encoder.encoders.11.feed_forward.w_1.bias | [2048] | 2048\n", + "encoder.encoders.11.feed_forward.w_2.weight | [2048, 256] | 524288\n", + "encoder.encoders.11.feed_forward.w_2.bias | [256] | 256\n", + "encoder.encoders.11.feed_forward_macaron.w_1.weight | [256, 2048] | 524288\n", + "encoder.encoders.11.feed_forward_macaron.w_1.bias | [2048] | 2048\n", + "encoder.encoders.11.feed_forward_macaron.w_2.weight | [2048, 256] | 524288\n", + "encoder.encoders.11.feed_forward_macaron.w_2.bias | [256] | 256\n", + "encoder.encoders.11.conv_module.pointwise_conv1.weight | [512, 256, 1] | 131072\n", + "encoder.encoders.11.conv_module.pointwise_conv1.bias | [512] | 512\n", + "encoder.encoders.11.conv_module.depthwise_conv.weight | [256, 1, 15] | 3840\n", + "encoder.encoders.11.conv_module.depthwise_conv.bias | [256] | 256\n", + "encoder.encoders.11.conv_module.norm.weight | [256] | 256\n", + "encoder.encoders.11.conv_module.norm.bias | [256] | 256\n", + "encoder.encoders.11.conv_module.norm._mean | [256] | 256\n", + "encoder.encoders.11.conv_module.norm._variance | [256] | 256\n", + "encoder.encoders.11.conv_module.pointwise_conv2.weight | [256, 256, 1] | 65536\n", + "encoder.encoders.11.conv_module.pointwise_conv2.bias | [256] | 256\n", + "encoder.encoders.11.norm_ff.weight | [256] | 256\n", + "encoder.encoders.11.norm_ff.bias | [256] | 256\n", + "encoder.encoders.11.norm_mha.weight | [256] | 
256\n", + "encoder.encoders.11.norm_mha.bias | [256] | 256\n", + "encoder.encoders.11.norm_ff_macaron.weight | [256] | 256\n", + "encoder.encoders.11.norm_ff_macaron.bias | [256] | 256\n", + "encoder.encoders.11.norm_conv.weight | [256] | 256\n", + "encoder.encoders.11.norm_conv.bias | [256] | 256\n", + "encoder.encoders.11.norm_final.weight | [256] | 256\n", + "encoder.encoders.11.norm_final.bias | [256] | 256\n", + "encoder.encoders.11.concat_linear.weight | [512, 256] | 131072\n", + "encoder.encoders.11.concat_linear.bias | [256] | 256\n", + "decoder.embed.0.weight | [4223, 256] | 1081088\n", + "decoder.after_norm.weight | [256] | 256\n", + "decoder.after_norm.bias | [256] | 256\n", + "decoder.output_layer.weight | [256, 4223] | 1081088\n", + "decoder.output_layer.bias | [4223] | 4223\n", + "decoder.decoders.0.self_attn.linear_q.weight | [256, 256] | 65536\n", + "decoder.decoders.0.self_attn.linear_q.bias | [256] | 256\n", + "decoder.decoders.0.self_attn.linear_k.weight | [256, 256] | 65536\n", + "decoder.decoders.0.self_attn.linear_k.bias | [256] | 256\n", + "decoder.decoders.0.self_attn.linear_v.weight | [256, 256] | 65536\n", + "decoder.decoders.0.self_attn.linear_v.bias | [256] | 256\n", + "decoder.decoders.0.self_attn.linear_out.weight | [256, 256] | 65536\n", + "decoder.decoders.0.self_attn.linear_out.bias | [256] | 256\n", + "decoder.decoders.0.src_attn.linear_q.weight | [256, 256] | 65536\n", + "decoder.decoders.0.src_attn.linear_q.bias | [256] | 256\n", + "decoder.decoders.0.src_attn.linear_k.weight | [256, 256] | 65536\n", + "decoder.decoders.0.src_attn.linear_k.bias | [256] | 256\n", + "decoder.decoders.0.src_attn.linear_v.weight | [256, 256] | 65536\n", + "decoder.decoders.0.src_attn.linear_v.bias | [256] | 256\n", + "decoder.decoders.0.src_attn.linear_out.weight | [256, 256] | 65536\n", + "decoder.decoders.0.src_attn.linear_out.bias | [256] | 256\n", + "decoder.decoders.0.feed_forward.w_1.weight | [256, 2048] | 524288\n", + 
"decoder.decoders.0.feed_forward.w_1.bias | [2048] | 2048\n", + "decoder.decoders.0.feed_forward.w_2.weight | [2048, 256] | 524288\n", + "decoder.decoders.0.feed_forward.w_2.bias | [256] | 256\n", + "decoder.decoders.0.norm1.weight | [256] | 256\n", + "decoder.decoders.0.norm1.bias | [256] | 256\n", + "decoder.decoders.0.norm2.weight | [256] | 256\n", + "decoder.decoders.0.norm2.bias | [256] | 256\n", + "decoder.decoders.0.norm3.weight | [256] | 256\n", + "decoder.decoders.0.norm3.bias | [256] | 256\n", + "decoder.decoders.0.concat_linear1.weight | [512, 256] | 131072\n", + "decoder.decoders.0.concat_linear1.bias | [256] | 256\n", + "decoder.decoders.0.concat_linear2.weight | [512, 256] | 131072\n", + "decoder.decoders.0.concat_linear2.bias | [256] | 256\n", + "decoder.decoders.1.self_attn.linear_q.weight | [256, 256] | 65536\n", + "decoder.decoders.1.self_attn.linear_q.bias | [256] | 256\n", + "decoder.decoders.1.self_attn.linear_k.weight | [256, 256] | 65536\n", + "decoder.decoders.1.self_attn.linear_k.bias | [256] | 256\n", + "decoder.decoders.1.self_attn.linear_v.weight | [256, 256] | 65536\n", + "decoder.decoders.1.self_attn.linear_v.bias | [256] | 256\n", + "decoder.decoders.1.self_attn.linear_out.weight | [256, 256] | 65536\n", + "decoder.decoders.1.self_attn.linear_out.bias | [256] | 256\n", + "decoder.decoders.1.src_attn.linear_q.weight | [256, 256] | 65536\n", + "decoder.decoders.1.src_attn.linear_q.bias | [256] | 256\n", + "decoder.decoders.1.src_attn.linear_k.weight | [256, 256] | 65536\n", + "decoder.decoders.1.src_attn.linear_k.bias | [256] | 256\n", + "decoder.decoders.1.src_attn.linear_v.weight | [256, 256] | 65536\n", + "decoder.decoders.1.src_attn.linear_v.bias | [256] | 256\n", + "decoder.decoders.1.src_attn.linear_out.weight | [256, 256] | 65536\n", + "decoder.decoders.1.src_attn.linear_out.bias | [256] | 256\n", + "decoder.decoders.1.feed_forward.w_1.weight | [256, 2048] | 524288\n", + "decoder.decoders.1.feed_forward.w_1.bias | [2048] | 
2048\n", + "decoder.decoders.1.feed_forward.w_2.weight | [2048, 256] | 524288\n", + "decoder.decoders.1.feed_forward.w_2.bias | [256] | 256\n", + "decoder.decoders.1.norm1.weight | [256] | 256\n", + "decoder.decoders.1.norm1.bias | [256] | 256\n", + "decoder.decoders.1.norm2.weight | [256] | 256\n", + "decoder.decoders.1.norm2.bias | [256] | 256\n", + "decoder.decoders.1.norm3.weight | [256] | 256\n", + "decoder.decoders.1.norm3.bias | [256] | 256\n", + "decoder.decoders.1.concat_linear1.weight | [512, 256] | 131072\n", + "decoder.decoders.1.concat_linear1.bias | [256] | 256\n", + "decoder.decoders.1.concat_linear2.weight | [512, 256] | 131072\n", + "decoder.decoders.1.concat_linear2.bias | [256] | 256\n", + "decoder.decoders.2.self_attn.linear_q.weight | [256, 256] | 65536\n", + "decoder.decoders.2.self_attn.linear_q.bias | [256] | 256\n", + "decoder.decoders.2.self_attn.linear_k.weight | [256, 256] | 65536\n", + "decoder.decoders.2.self_attn.linear_k.bias | [256] | 256\n", + "decoder.decoders.2.self_attn.linear_v.weight | [256, 256] | 65536\n", + "decoder.decoders.2.self_attn.linear_v.bias | [256] | 256\n", + "decoder.decoders.2.self_attn.linear_out.weight | [256, 256] | 65536\n", + "decoder.decoders.2.self_attn.linear_out.bias | [256] | 256\n", + "decoder.decoders.2.src_attn.linear_q.weight | [256, 256] | 65536\n", + "decoder.decoders.2.src_attn.linear_q.bias | [256] | 256\n", + "decoder.decoders.2.src_attn.linear_k.weight | [256, 256] | 65536\n", + "decoder.decoders.2.src_attn.linear_k.bias | [256] | 256\n", + "decoder.decoders.2.src_attn.linear_v.weight | [256, 256] | 65536\n", + "decoder.decoders.2.src_attn.linear_v.bias | [256] | 256\n", + "decoder.decoders.2.src_attn.linear_out.weight | [256, 256] | 65536\n", + "decoder.decoders.2.src_attn.linear_out.bias | [256] | 256\n", + "decoder.decoders.2.feed_forward.w_1.weight | [256, 2048] | 524288\n", + "decoder.decoders.2.feed_forward.w_1.bias | [2048] | 2048\n", + "decoder.decoders.2.feed_forward.w_2.weight | 
[2048, 256] | 524288\n", + "decoder.decoders.2.feed_forward.w_2.bias | [256] | 256\n", + "decoder.decoders.2.norm1.weight | [256] | 256\n", + "decoder.decoders.2.norm1.bias | [256] | 256\n", + "decoder.decoders.2.norm2.weight | [256] | 256\n", + "decoder.decoders.2.norm2.bias | [256] | 256\n", + "decoder.decoders.2.norm3.weight | [256] | 256\n", + "decoder.decoders.2.norm3.bias | [256] | 256\n", + "decoder.decoders.2.concat_linear1.weight | [512, 256] | 131072\n", + "decoder.decoders.2.concat_linear1.bias | [256] | 256\n", + "decoder.decoders.2.concat_linear2.weight | [512, 256] | 131072\n", + "decoder.decoders.2.concat_linear2.bias | [256] | 256\n", + "decoder.decoders.3.self_attn.linear_q.weight | [256, 256] | 65536\n", + "decoder.decoders.3.self_attn.linear_q.bias | [256] | 256\n", + "decoder.decoders.3.self_attn.linear_k.weight | [256, 256] | 65536\n", + "decoder.decoders.3.self_attn.linear_k.bias | [256] | 256\n", + "decoder.decoders.3.self_attn.linear_v.weight | [256, 256] | 65536\n", + "decoder.decoders.3.self_attn.linear_v.bias | [256] | 256\n", + "decoder.decoders.3.self_attn.linear_out.weight | [256, 256] | 65536\n", + "decoder.decoders.3.self_attn.linear_out.bias | [256] | 256\n", + "decoder.decoders.3.src_attn.linear_q.weight | [256, 256] | 65536\n", + "decoder.decoders.3.src_attn.linear_q.bias | [256] | 256\n", + "decoder.decoders.3.src_attn.linear_k.weight | [256, 256] | 65536\n", + "decoder.decoders.3.src_attn.linear_k.bias | [256] | 256\n", + "decoder.decoders.3.src_attn.linear_v.weight | [256, 256] | 65536\n", + "decoder.decoders.3.src_attn.linear_v.bias | [256] | 256\n", + "decoder.decoders.3.src_attn.linear_out.weight | [256, 256] | 65536\n", + "decoder.decoders.3.src_attn.linear_out.bias | [256] | 256\n", + "decoder.decoders.3.feed_forward.w_1.weight | [256, 2048] | 524288\n", + "decoder.decoders.3.feed_forward.w_1.bias | [2048] | 2048\n", + "decoder.decoders.3.feed_forward.w_2.weight | [2048, 256] | 524288\n", + 
"decoder.decoders.3.feed_forward.w_2.bias | [256] | 256\n", + "decoder.decoders.3.norm1.weight | [256] | 256\n", + "decoder.decoders.3.norm1.bias | [256] | 256\n", + "decoder.decoders.3.norm2.weight | [256] | 256\n", + "decoder.decoders.3.norm2.bias | [256] | 256\n", + "decoder.decoders.3.norm3.weight | [256] | 256\n", + "decoder.decoders.3.norm3.bias | [256] | 256\n", + "decoder.decoders.3.concat_linear1.weight | [512, 256] | 131072\n", + "decoder.decoders.3.concat_linear1.bias | [256] | 256\n", + "decoder.decoders.3.concat_linear2.weight | [512, 256] | 131072\n", + "decoder.decoders.3.concat_linear2.bias | [256] | 256\n", + "decoder.decoders.4.self_attn.linear_q.weight | [256, 256] | 65536\n", + "decoder.decoders.4.self_attn.linear_q.bias | [256] | 256\n", + "decoder.decoders.4.self_attn.linear_k.weight | [256, 256] | 65536\n", + "decoder.decoders.4.self_attn.linear_k.bias | [256] | 256\n", + "decoder.decoders.4.self_attn.linear_v.weight | [256, 256] | 65536\n", + "decoder.decoders.4.self_attn.linear_v.bias | [256] | 256\n", + "decoder.decoders.4.self_attn.linear_out.weight | [256, 256] | 65536\n", + "decoder.decoders.4.self_attn.linear_out.bias | [256] | 256\n", + "decoder.decoders.4.src_attn.linear_q.weight | [256, 256] | 65536\n", + "decoder.decoders.4.src_attn.linear_q.bias | [256] | 256\n", + "decoder.decoders.4.src_attn.linear_k.weight | [256, 256] | 65536\n", + "decoder.decoders.4.src_attn.linear_k.bias | [256] | 256\n", + "decoder.decoders.4.src_attn.linear_v.weight | [256, 256] | 65536\n", + "decoder.decoders.4.src_attn.linear_v.bias | [256] | 256\n", + "decoder.decoders.4.src_attn.linear_out.weight | [256, 256] | 65536\n", + "decoder.decoders.4.src_attn.linear_out.bias | [256] | 256\n", + "decoder.decoders.4.feed_forward.w_1.weight | [256, 2048] | 524288\n", + "decoder.decoders.4.feed_forward.w_1.bias | [2048] | 2048\n", + "decoder.decoders.4.feed_forward.w_2.weight | [2048, 256] | 524288\n", + "decoder.decoders.4.feed_forward.w_2.bias | [256] | 256\n", 
+ "decoder.decoders.4.norm1.weight | [256] | 256\n", + "decoder.decoders.4.norm1.bias | [256] | 256\n", + "decoder.decoders.4.norm2.weight | [256] | 256\n", + "decoder.decoders.4.norm2.bias | [256] | 256\n", + "decoder.decoders.4.norm3.weight | [256] | 256\n", + "decoder.decoders.4.norm3.bias | [256] | 256\n", + "decoder.decoders.4.concat_linear1.weight | [512, 256] | 131072\n", + "decoder.decoders.4.concat_linear1.bias | [256] | 256\n", + "decoder.decoders.4.concat_linear2.weight | [512, 256] | 131072\n", + "decoder.decoders.4.concat_linear2.bias | [256] | 256\n", + "decoder.decoders.5.self_attn.linear_q.weight | [256, 256] | 65536\n", + "decoder.decoders.5.self_attn.linear_q.bias | [256] | 256\n", + "decoder.decoders.5.self_attn.linear_k.weight | [256, 256] | 65536\n", + "decoder.decoders.5.self_attn.linear_k.bias | [256] | 256\n", + "decoder.decoders.5.self_attn.linear_v.weight | [256, 256] | 65536\n", + "decoder.decoders.5.self_attn.linear_v.bias | [256] | 256\n", + "decoder.decoders.5.self_attn.linear_out.weight | [256, 256] | 65536\n", + "decoder.decoders.5.self_attn.linear_out.bias | [256] | 256\n", + "decoder.decoders.5.src_attn.linear_q.weight | [256, 256] | 65536\n", + "decoder.decoders.5.src_attn.linear_q.bias | [256] | 256\n", + "decoder.decoders.5.src_attn.linear_k.weight | [256, 256] | 65536\n", + "decoder.decoders.5.src_attn.linear_k.bias | [256] | 256\n", + "decoder.decoders.5.src_attn.linear_v.weight | [256, 256] | 65536\n", + "decoder.decoders.5.src_attn.linear_v.bias | [256] | 256\n", + "decoder.decoders.5.src_attn.linear_out.weight | [256, 256] | 65536\n", + "decoder.decoders.5.src_attn.linear_out.bias | [256] | 256\n", + "decoder.decoders.5.feed_forward.w_1.weight | [256, 2048] | 524288\n", + "decoder.decoders.5.feed_forward.w_1.bias | [2048] | 2048\n", + "decoder.decoders.5.feed_forward.w_2.weight | [2048, 256] | 524288\n", + "decoder.decoders.5.feed_forward.w_2.bias | [256] | 256\n", + "decoder.decoders.5.norm1.weight | [256] | 256\n", + 
"decoder.decoders.5.norm1.bias | [256] | 256\n", + "decoder.decoders.5.norm2.weight | [256] | 256\n", + "decoder.decoders.5.norm2.bias | [256] | 256\n", + "decoder.decoders.5.norm3.weight | [256] | 256\n", + "decoder.decoders.5.norm3.bias | [256] | 256\n", + "decoder.decoders.5.concat_linear1.weight | [512, 256] | 131072\n", + "decoder.decoders.5.concat_linear1.bias | [256] | 256\n", + "decoder.decoders.5.concat_linear2.weight | [512, 256] | 131072\n", + "decoder.decoders.5.concat_linear2.bias | [256] | 256\n", + "ctc.ctc_lo.weight | [256, 4223] | 1081088\n", + "ctc.ctc_lo.bias | [4223] | 4223\n", + "Total parameters: 689, 49347742 elements.\n" + ] + } + ], + "source": [ + "summary(model)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ranking-beads", + "metadata": {}, + "outputs": [], + "source": [ + "total_loss, attention_loss, ctc_loss = model(self.audio, self.audio_len,\n", + " self.text, self.text_len)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/deepspeech/frontend/normalizer.py b/deepspeech/frontend/normalizer.py index 9161c1e46..83c1ff905 100644 --- a/deepspeech/frontend/normalizer.py +++ b/deepspeech/frontend/normalizer.py @@ -77,15 +77,19 @@ class FeatureNormalizer(object): :param filepath: File to write mean and stddev. 
:type filepath: str """ - np.savez(filepath, mean=self._mean, std=self._std) + np.savez(filepath, mean=self._mean, istd=self._istd) def _read_mean_std_from_file(self, filepath, eps=1e-20): """Load mean and std from file.""" - mean, std = load_cmvn(filepath, filetype='npz') + mean, istd = load_cmvn(filepath, filetype='npz') self._mean = mean.T - self._istd = 1.0 / std.T + self._istd = istd.T - def _compute_mean_std(self, manifest_path, featurize_func, num_samples): + def _compute_mean_std(self, + manifest_path, + featurize_func, + num_samples, + eps=1e-20): """Compute mean and std from randomly sampled instances.""" manifest = read_manifest(manifest_path) if num_samples == -1: @@ -98,4 +102,6 @@ class FeatureNormalizer(object): featurize_func(AudioSegment.from_file(instance["feat"]))) features = np.hstack(features) #(D, T) self._mean = np.mean(features, axis=1).reshape([1, -1]) #(1, D) - self._std = np.std(features, axis=1).reshape([1, -1]) #(1, D) + std = np.std(features, axis=1).reshape([1, -1]) #(1, D) + std = np.clip(std, eps, None) + self._istd = 1.0 / std diff --git a/deepspeech/frontend/utility.py b/deepspeech/frontend/utility.py index 4b17c841a..5a4989d62 100644 --- a/deepspeech/frontend/utility.py +++ b/deepspeech/frontend/utility.py @@ -238,10 +238,8 @@ def _load_kaldi_cmvn(kaldi_cmvn_file): def _load_npz_cmvn(npz_cmvn_file, eps=1e-20): npzfile = np.load(npz_cmvn_file) means = npzfile["mean"] #(1, D) - std = npzfile["std"] #(1, D) - std = np.clip(std, eps, None) - variance = 1.0 / std - cmvn = np.array([means, variance]) + istd = npzfile["istd"] #(1, D) + cmvn = np.array([means, istd]) return cmvn diff --git a/deepspeech/modules/mask.py b/deepspeech/modules/mask.py index fa6e0d552..f19e56f2f 100644 --- a/deepspeech/modules/mask.py +++ b/deepspeech/modules/mask.py @@ -25,7 +25,7 @@ __all__ = [ def sequence_mask(x_len, max_len=None, dtype='float32'): - """[summary] + """batch sequence mask. 
     Args:
         x_len ([paddle.Tensor]): xs lenght, [B]
diff --git a/deepspeech/utils/layer_tools.py b/deepspeech/utils/layer_tools.py
index 0ff4f6f54..1e8e55ed1 100644
--- a/deepspeech/utils/layer_tools.py
+++ b/deepspeech/utils/layer_tools.py
@@ -22,8 +22,6 @@ __all__ = [
 
 def summary(layer: nn.Layer, print_func=print):
     num_params = num_elements = 0
-    if print_func:
-        print_func(f"{layer.__class__.__name__} summary:")
     for name, param in layer.state_dict().items():
         if print_func:
             print_func(
@@ -31,9 +29,7 @@ def summary(layer: nn.Layer, print_func=print):
         num_elements += np.prod(param.shape)
         num_params += 1
     if print_func:
-        print_func(
-            f"{layer.__class__.__name__} has {num_params} parameters, {num_elements} elements."
-        )
+        print_func(f"Total parameters: {num_params}, {num_elements} elements.")
 
 
 def gradient_norm(layer: nn.Layer):
@@ -45,25 +41,6 @@ def gradient_norm(layer: nn.Layer):
     return grad_norm_dict
 
 
-def recursively_remove_weight_norm(layer: nn.Layer):
-    for layer in layer.sublayers():
-        try:
-            nn.utils.remove_weight_norm(layer)
-        except ValueError as e:
-            # ther is not weight norm hoom in this layer
-            pass
-
-
-def freeze(layer: nn.Layer):
-    for param in layer.parameters():
-        param.trainable = False
-
-
-def unfreeze(layer: nn.Layer):
-    for param in layer.parameters():
-        param.trainable = True
-
-
 def print_grads(model, print_func=print):
     if print_func is None:
         return
@@ -75,12 +52,32 @@ def print_grads(model, print_func=print):
 def print_params(model, print_func=print):
     if print_func is None:
         return
-    total = 0.0
+    total = num_params = 0
     for n, p in model.named_parameters():
-        msg = f"param: {n}: shape: {p.shape} stop_grad: {p.stop_gradient}"
+        msg = f"{n} | {p.shape} | {np.prod(p.shape)} | {not p.stop_gradient}"
         total += np.prod(p.shape)
+        num_params += 1
         if print_func:
             print_func(msg)
     if print_func:
-        print_func(f"Total parameters: {total}!")
+        print_func(f"Total parameters: {num_params}, {total} elements.")
+
+
+def recursively_remove_weight_norm(layer: nn.Layer):
+    for layer in layer.sublayers():
+        try:
+            nn.utils.remove_weight_norm(layer)
+        except ValueError as e:
+            # there is no weight norm hook in this layer
+            pass
+
+
+def freeze(layer: nn.Layer):
+    for param in layer.parameters():
+        param.trainable = False
+
+
+def unfreeze(layer: nn.Layer):
+    for param in layer.parameters():
+        param.trainable = True