add encoder

pull/578/head
Hui Zhang 5 years ago
parent 426d370413
commit b6d729a675

@ -37,26 +37,26 @@
"name": "stderr",
"output_type": "stream",
"text": [
"2021-03-24 06:30:47,727 - WARNING - register user softmax to paddle, remove this when fixed!\n",
"2021-03-24 06:30:47,728 - WARNING - register user sigmoid to paddle, remove this when fixed!\n",
"2021-03-24 06:30:47,729 - WARNING - register user relu to paddle, remove this when fixed!\n",
"2021-03-24 06:30:47,729 - WARNING - override cat of paddle if exists or register, remove this when fixed!\n",
"2021-03-24 06:30:47,730 - WARNING - override eq of paddle.Tensor if exists or register, remove this when fixed!\n",
"2021-03-24 06:30:47,731 - WARNING - override contiguous of paddle.Tensor if exists or register, remove this when fixed!\n",
"2021-03-24 06:30:47,731 - WARNING - override size of paddle.Tensor (`to_static` do not process `size` property, maybe some `paddle` api dependent on it), remove this when fixed!\n",
"2021-03-24 06:30:47,732 - WARNING - register user view to paddle.Tensor, remove this when fixed!\n",
"2021-03-24 06:30:47,732 - WARNING - register user view_as to paddle.Tensor, remove this when fixed!\n",
"2021-03-24 06:30:47,733 - WARNING - register user masked_fill to paddle.Tensor, remove this when fixed!\n",
"2021-03-24 06:30:47,733 - WARNING - register user masked_fill_ to paddle.Tensor, remove this when fixed!\n",
"2021-03-24 06:30:47,734 - WARNING - register user fill_ to paddle.Tensor, remove this when fixed!\n",
"2021-03-24 06:30:47,734 - WARNING - register user repeat to paddle.Tensor, remove this when fixed!\n",
"2021-03-24 06:30:47,735 - WARNING - register user softmax to paddle.Tensor, remove this when fixed!\n",
"2021-03-24 06:30:47,735 - WARNING - register user sigmoid to paddle.Tensor, remove this when fixed!\n",
"2021-03-24 06:30:47,736 - WARNING - register user relu to paddle.Tensor, remove this when fixed!\n",
"2021-03-24 06:30:47,736 - WARNING - register user glu to paddle.nn.functional, remove this when fixed!\n",
"2021-03-24 06:30:47,737 - WARNING - override ctc_loss of paddle.nn.functional if exists, remove this when fixed!\n",
"2021-03-24 06:30:47,737 - WARNING - register user GLU to paddle.nn, remove this when fixed!\n",
"2021-03-24 06:30:47,738 - WARNING - register user ConstantPad2d to paddle.nn, remove this when fixed!\n",
"2021-03-26 02:55:23,873 - WARNING - register user softmax to paddle, remove this when fixed!\n",
"2021-03-26 02:55:23,875 - WARNING - register user sigmoid to paddle, remove this when fixed!\n",
"2021-03-26 02:55:23,875 - WARNING - register user relu to paddle, remove this when fixed!\n",
"2021-03-26 02:55:23,876 - WARNING - override cat of paddle if exists or register, remove this when fixed!\n",
"2021-03-26 02:55:23,876 - WARNING - override eq of paddle.Tensor if exists or register, remove this when fixed!\n",
"2021-03-26 02:55:23,877 - WARNING - override contiguous of paddle.Tensor if exists or register, remove this when fixed!\n",
"2021-03-26 02:55:23,877 - WARNING - override size of paddle.Tensor (`to_static` do not process `size` property, maybe some `paddle` api dependent on it), remove this when fixed!\n",
"2021-03-26 02:55:23,878 - WARNING - register user view to paddle.Tensor, remove this when fixed!\n",
"2021-03-26 02:55:23,878 - WARNING - register user view_as to paddle.Tensor, remove this when fixed!\n",
"2021-03-26 02:55:23,879 - WARNING - register user masked_fill to paddle.Tensor, remove this when fixed!\n",
"2021-03-26 02:55:23,880 - WARNING - register user masked_fill_ to paddle.Tensor, remove this when fixed!\n",
"2021-03-26 02:55:23,880 - WARNING - register user fill_ to paddle.Tensor, remove this when fixed!\n",
"2021-03-26 02:55:23,881 - WARNING - register user repeat to paddle.Tensor, remove this when fixed!\n",
"2021-03-26 02:55:23,881 - WARNING - register user softmax to paddle.Tensor, remove this when fixed!\n",
"2021-03-26 02:55:23,882 - WARNING - register user sigmoid to paddle.Tensor, remove this when fixed!\n",
"2021-03-26 02:55:23,882 - WARNING - register user relu to paddle.Tensor, remove this when fixed!\n",
"2021-03-26 02:55:23,883 - WARNING - register user glu to paddle.nn.functional, remove this when fixed!\n",
"2021-03-26 02:55:23,883 - WARNING - override ctc_loss of paddle.nn.functional if exists, remove this when fixed!\n",
"2021-03-26 02:55:23,884 - WARNING - register user GLU to paddle.nn, remove this when fixed!\n",
"2021-03-26 02:55:23,884 - WARNING - register user ConstantPad2d to paddle.nn, remove this when fixed!\n",
"/home/ssd5/zhanghui/DeepSpeech2.x/tools/venv-dev/lib/python3.7/site-packages/scipy/fftpack/__init__.py:103: DeprecationWarning: The module numpy.dual is deprecated. Instead of using dual, use the functions directly from numpy or scipy.\n",
" from numpy.dual import register_func\n",
"/home/ssd5/zhanghui/DeepSpeech2.x/tools/venv-dev/lib/python3.7/site-packages/scipy/special/orthogonal.py:81: DeprecationWarning: `np.int` is a deprecated alias for the builtin `int`. To silence this warning, use `int` by itself. Doing this will not modify any behavior and is safe. When replacing `np.int`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.\n",
@ -102,10 +102,10 @@
"output_type": "stream",
"text": [
"0.0.0\n",
"607856a949ed7356237ed8148947f7fd2b0f4631\n",
"ON\n",
"ON\n",
"commit: 607856a949ed7356237ed8148947f7fd2b0f4631\n",
"e7f28d6c0db54eb9c9a810612300b526687e56a6\n",
"OFF\n",
"OFF\n",
"commit: e7f28d6c0db54eb9c9a810612300b526687e56a6\n",
"None\n",
"0\n"
]
@ -226,7 +226,7 @@
"output: None\n",
"params_file: examples/aishell/jit.model.pdiparams\n",
"speech_save_dir: demo_cache\n",
"use_gpu: True\n",
"use_gpu: False\n",
"warmup_manifest: examples/aishell/data/manifest.test\n",
"------------------------------------------------\n"
]
@ -266,7 +266,7 @@
" help=\n",
" \"Model dir, If you load a non-combined model, specify the directory of the model.\"\n",
")\n",
"add_arg(\"--use_gpu\",type=bool,default=True, help=\"Whether use gpu.\")\n",
"add_arg(\"--use_gpu\",type=bool,default=False, help=\"Whether use gpu.\")\n",
"\n",
"\n",
"args = parser.parse_args(\n",
@ -321,7 +321,7 @@
"name": "stderr",
"output_type": "stream",
"text": [
"2021-03-24 06:31:20,943 - INFO - [checkpoint] Rank 0: loaded model from examples/aishell/ckpt-loss2e-3-0.83-5/checkpoints/step-11725.pdparams\n"
"2021-03-26 02:55:57,930 - INFO - [checkpoint] Rank 0: loaded model from examples/aishell/ckpt-loss2e-3-0.83-5/checkpoints/step-11725.pdparams\n"
]
},
{
@ -407,7 +407,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 22,
"metadata": {},
"outputs": [
{
@ -418,7 +418,7 @@
"examples/aishell/jit.model.pdmodel\n",
"examples/aishell/jit.model.pdiparams\n",
"0\n",
"True\n"
"False\n"
]
}
],
@ -428,7 +428,8 @@
"from paddle.inference import PrecisionType\n",
"from paddle.inference import create_predictor\n",
"\n",
"args.use_gpu=True\n",
"args.use_gpu=False\n",
"paddle.set_device('cpu')\n",
"\n",
"def init_predictor(args):\n",
" if args.model_dir is not None:\n",
@ -438,8 +439,8 @@
"\n",
" if args.use_gpu:\n",
" config.enable_use_gpu(memory_pool_init_size_mb=1000, device_id=0)\n",
" config.enable_tensorrt_engine(precision_mode=PrecisionType.Float32,\n",
" use_calib_mode=True) # 开启TensorRT预测精度为fp32开启int8离线量化\n",
"# config.enable_tensorrt_engine(precision_mode=PrecisionType.Float32,\n",
"# use_calib_mode=True) # 开启TensorRT预测精度为fp32开启int8离线量化\n",
" else:\n",
" # If not specific mkldnn, you can set the blas thread.\n",
" # The thread num should not be greater than the number of cores in the CPU.\n",
@ -447,7 +448,7 @@
" config.enable_mkldnn()\n",
" \n",
" config.enable_memory_optim()\n",
" config.switch_ir_optim(False)\n",
" config.switch_ir_optim(True)\n",
" \n",
" print(config.model_dir())\n",
" print(config.prog_file())\n",
@ -534,7 +535,7 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 23,
"metadata": {},
"outputs": [
{
@ -546,49 +547,35 @@
"input: 0 audio\n",
"input: 1 audio_len\n",
"output: 0 tmp_75\n",
"jit: [[[8.91791242e-12 4.45650548e-12 3.67574104e-09 ... 8.91772593e-12\n",
" 8.91578738e-12 4.64319072e-08]\n",
" [1.55952011e-15 2.62797088e-14 4.50428670e-12 ... 1.55946061e-15\n",
" 1.55893121e-15 9.99992609e-01]\n",
" [1.24638590e-17 7.61802427e-16 2.93266930e-14 ... 1.24633842e-17\n",
" 1.24587735e-17 1.00000000e+00]\n",
"jit: [[[8.91786298e-12 4.45648032e-12 3.67572750e-09 ... 8.91767563e-12\n",
" 8.91573707e-12 4.64317296e-08]\n",
" [1.55950222e-15 2.62794089e-14 4.50423509e-12 ... 1.55944271e-15\n",
" 1.55891342e-15 9.99992609e-01]\n",
" [1.24638127e-17 7.61802427e-16 2.93265812e-14 ... 1.24633371e-17\n",
" 1.24587264e-17 1.00000000e+00]\n",
" ...\n",
" [4.37491543e-15 2.43678580e-12 1.98772032e-12 ... 4.37483242e-15\n",
" 4.37358093e-15 1.00000000e+00]\n",
" [3.89338410e-13 1.66756747e-11 1.42901749e-11 ... 3.89333233e-13\n",
" 3.89255983e-13 1.00000000e+00]\n",
" [1.00350561e-10 2.56295180e-10 2.91178692e-10 ... 1.00348452e-10\n",
" 1.00334671e-10 9.99998808e-01]]] <class 'numpy.ndarray'>\n",
" [4.37488240e-15 2.43676260e-12 1.98770514e-12 ... 4.37479896e-15\n",
" 4.37354747e-15 1.00000000e+00]\n",
" [3.89334696e-13 1.66754856e-11 1.42900388e-11 ... 3.89329492e-13\n",
" 3.89252270e-13 1.00000000e+00]\n",
" [1.00349985e-10 2.56293708e-10 2.91177582e-10 ... 1.00347876e-10\n",
" 1.00334095e-10 9.99998808e-01]]] <class 'numpy.ndarray'>\n",
"[1, 161, 522]\n",
"[1]\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/ssd5/zhanghui/DeepSpeech2.x/tools/venv-dev/lib/python3.7/site-packages/paddlepaddle_gpu-0.0.0-py3.7-linux-x86_64.egg/paddle/fluid/layers/utils.py:77: DeprecationWarning: Using or importing the ABCs from 'collections' instead of from 'collections.abc' is deprecated, and in 3.8 it will stop working\n",
" return (isinstance(seq, collections.Sequence) and\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"paddle: [[[8.91791242e-12 4.45650548e-12 3.67574104e-09 ... 8.91772593e-12\n",
" 8.91578738e-12 4.64319072e-08]\n",
" [1.55952011e-15 2.62797088e-14 4.50428670e-12 ... 1.55946061e-15\n",
" 1.55893121e-15 9.99992609e-01]\n",
" [1.24638590e-17 7.61802427e-16 2.93266930e-14 ... 1.24633842e-17\n",
"[1]\n",
"paddle: [[[8.91789680e-12 4.45649724e-12 3.67574149e-09 ... 8.91770945e-12\n",
" 8.91577090e-12 4.64319072e-08]\n",
" [1.55950222e-15 2.62794089e-14 4.50423509e-12 ... 1.55944271e-15\n",
" 1.55891342e-15 9.99992609e-01]\n",
" [1.24638599e-17 7.61805339e-16 2.93267472e-14 ... 1.24633842e-17\n",
" 1.24587735e-17 1.00000000e+00]\n",
" ...\n",
" [4.37491543e-15 2.43678580e-12 1.98772032e-12 ... 4.37483242e-15\n",
" 4.37358093e-15 1.00000000e+00]\n",
" [3.89338410e-13 1.66756747e-11 1.42901749e-11 ... 3.89333233e-13\n",
" 3.89255983e-13 1.00000000e+00]\n",
" [1.00350561e-10 2.56295180e-10 2.91178692e-10 ... 1.00348452e-10\n",
" 1.00334671e-10 9.99998808e-01]]]\n",
"True\n"
" [4.37488240e-15 2.43676737e-12 1.98770514e-12 ... 4.37479896e-15\n",
" 4.37354747e-15 1.00000000e+00]\n",
" [3.89336187e-13 1.66755481e-11 1.42900925e-11 ... 3.89330983e-13\n",
" 3.89253761e-13 1.00000000e+00]\n",
" [1.00349985e-10 2.56293708e-10 2.91177582e-10 ... 1.00347876e-10\n",
" 1.00334095e-10 9.99998808e-01]]]\n",
"False\n"
]
}
],
@ -607,7 +594,7 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 21,
"metadata": {},
"outputs": [
{
@ -618,19 +605,19 @@
"input: 0 audio\n",
"input: 1 audio_len\n",
"output: 0 tmp_75\n",
"jit: [[[8.91791242e-12 4.45650548e-12 3.67574104e-09 ... 8.91772593e-12\n",
" 8.91578738e-12 4.64319072e-08]\n",
" [1.55952011e-15 2.62797088e-14 4.50428670e-12 ... 1.55946061e-15\n",
" 1.55893121e-15 9.99992609e-01]\n",
" [1.24638590e-17 7.61802427e-16 2.93266930e-14 ... 1.24633842e-17\n",
"jit: [[[8.91789680e-12 4.45649724e-12 3.67574149e-09 ... 8.91770945e-12\n",
" 8.91577090e-12 4.64319072e-08]\n",
" [1.55950222e-15 2.62794089e-14 4.50423509e-12 ... 1.55944271e-15\n",
" 1.55891342e-15 9.99992609e-01]\n",
" [1.24638599e-17 7.61805339e-16 2.93267472e-14 ... 1.24633842e-17\n",
" 1.24587735e-17 1.00000000e+00]\n",
" ...\n",
" [4.37491543e-15 2.43678580e-12 1.98772032e-12 ... 4.37483242e-15\n",
" 4.37358093e-15 1.00000000e+00]\n",
" [3.89338410e-13 1.66756747e-11 1.42901749e-11 ... 3.89333233e-13\n",
" 3.89255983e-13 1.00000000e+00]\n",
" [1.00350561e-10 2.56295180e-10 2.91178692e-10 ... 1.00348452e-10\n",
" 1.00334671e-10 9.99998808e-01]]]\n"
" [4.37488240e-15 2.43676737e-12 1.98770514e-12 ... 4.37479896e-15\n",
" 4.37354747e-15 1.00000000e+00]\n",
" [3.89336187e-13 1.66755481e-11 1.42900925e-11 ... 3.89330983e-13\n",
" 3.89253761e-13 1.00000000e+00]\n",
" [1.00349985e-10 2.56293708e-10 2.91177582e-10 ... 1.00347876e-10\n",
" 1.00334095e-10 9.99998808e-01]]]\n"
]
}
],

@ -454,7 +454,7 @@
" act='brelu')\n",
"\n",
" out_channel = 32\n",
" self.conv_stack = nn.LayerList([\n",
" self.conv_stack = nn.Sequential([\n",
" ConvBn(\n",
" num_channels_in=32,\n",
" num_channels_out=out_channel,\n",
@ -1884,4 +1884,4 @@
},
"nbformat": 4,
"nbformat_minor": 5
}
}

@ -59,3 +59,6 @@ You are welcome to submit questions and bug reports in [Github Issues](https://g
## License
DeepSpeech is provided under the [Apache-2.0 License](./LICENSE).
## Acknowledgement
We depend on many open-source repos. See [References](docs/reference.md) for more information.

@ -56,3 +56,6 @@ source tools/venv/bin/activate
## License
DeepSpeech is provided under the [Apache-2.0 License](./LICENSE).
## Acknowledgement
Some excellent open-source repositories were referenced during development; see [References](docs/reference.md) for details.

@ -266,8 +266,17 @@ logger.warn(
)
F.ctc_loss = ctc_loss
########### hack paddle.nn #############
if not hasattr(paddle.nn, 'Module'):
logger.warn("register user Module to paddle.nn, remove this when fixed!")
setattr(paddle.nn, 'Module', paddle.nn.Layer)
if not hasattr(paddle.nn, 'ModuleList'):
logger.warn(
"register user ModuleList to paddle.nn, remove this when fixed!")
setattr(paddle.nn, 'ModuleList', paddle.nn.LayerList)
class GLU(nn.Layer):
"""Gated Linear Units (GLU) Layer"""

@ -143,7 +143,8 @@ def get_activation(act):
"relu": paddle.nn.ReLU,
"selu": paddle.nn.SELU,
"swish": paddle.nn.Swish,
"gelu": paddle.nn.GELU
"gelu": paddle.nn.GELU,
"brelu": brelu,
}
return activation_funcs[act]()
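A usage sketch: the table maps a name to a zero-argument callable and instantiates it, so the newly registered "brelu" resolves the same way as the built-ins (assuming the repo's `brelu` entry is, like the other entries, a zero-argument callable returning an activation layer):

```python
import paddle

act = get_activation("gelu")   # -> paddle.nn.GELU(), instantiated by the table
y = act(paddle.randn([2, 8]))  # same shape as the input

brelu_act = get_activation("brelu")  # this repo's bounded-ReLU entry
```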

@ -51,7 +51,7 @@ class PositionalEncoding(nn.Layer):
self.pe = paddle.zeros([self.max_len, self.d_model]) #[T,D]
position = paddle.arange(
0, self.max_len, dtype=paddle.float32).unsqueeze(1)
0, self.max_len, dtype=paddle.float32).unsqueeze(1) #[T, 1]
div_term = paddle.exp(
paddle.arange(0, self.d_model, 2, dtype=paddle.float32) *
-(math.log(10000.0) / self.d_model))
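To make the shapes concrete, a minimal NumPy sketch of how such a sinusoidal table is typically filled from `position` [T, 1] and `div_term` [D/2]; a generic reconstruction, not this file's exact code:

```python
import math
import numpy as np

max_len, d_model = 5000, 256
pe = np.zeros((max_len, d_model), dtype=np.float32)        # [T, D]
position = np.arange(max_len, dtype=np.float32)[:, None]   # [T, 1]
div_term = np.exp(np.arange(0, d_model, 2, dtype=np.float32)
                  * -(math.log(10000.0) / d_model))        # [D/2]
pe[:, 0::2] = np.sin(position * div_term)  # even dimensions
pe[:, 1::2] = np.cos(position * div_term)  # odd dimensions
```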
@ -68,7 +68,7 @@ class PositionalEncoding(nn.Layer):
offset (int): position offset
Returns:
paddle.Tensor: Encoded tensor. Its shape is (batch, time, ...)
paddle.Tensor: for compatibility to RelPositionalEncoding
paddle.Tensor: for compatibility to RelPositionalEncoding, (batch=1, time, ...)
"""
T = paddle.shape(x)[1]
assert offset + T < self.max_len

@ -59,16 +59,16 @@ class BaseEncoder(nn.Layer):
concat_after: bool=False,
static_chunk_size: int=0,
use_dynamic_chunk: bool=False,
global_cmvn: torch.nn.Module=None,
global_cmvn: paddle.nn.Layer=None,
use_dynamic_left_chunk: bool=False, ):
"""
Args:
input_size (int): input dim
output_size (int): dimension of attention
input_size (int): input dim, d_feature
output_size (int): dimension of attention, d_model
attention_heads (int): the number of heads of multi head attention
linear_units (int): the hidden units number of position-wise feed
forward
num_blocks (int): the number of decoder blocks
num_blocks (int): the number of encoder blocks
dropout_rate (float): dropout rate
attention_dropout_rate (float): dropout rate in attention
positional_dropout_rate (float): dropout rate after adding
@ -89,7 +89,7 @@ class BaseEncoder(nn.Layer):
use_dynamic_chunk (bool): whether to use dynamic chunk size for
training or not. You can only use a fixed chunk (chunk_size > 0)
or a dynamic chunk size (use_dynamic_chunk = True)
global_cmvn (Optional[torch.nn.Module]): Optional GlobalCMVN module
global_cmvn (Optional[paddle.nn.Layer]): Optional GlobalCMVN layer
use_dynamic_left_chunk (bool): whether use dynamic left chunk in
dynamic chunk training
"""
@ -117,13 +117,14 @@ class BaseEncoder(nn.Layer):
self.global_cmvn = global_cmvn
self.embed = subsampling_class(
input_size,
output_size,
dropout_rate,
pos_enc_class(output_size, positional_dropout_rate), )
idim=input_size,
odim=output_size,
dropout_rate=dropout_rate,
pos_enc_class=pos_enc_class(
d_model=output_size, dropout_rate=positional_dropout_rate), )
self.normalize_before = normalize_before
self.after_norm = torch.nn.LayerNorm(output_size, eps=1e-12)
self.after_norm = nn.LayerNorm(output_size, epsilon=1e-12)
self.static_chunk_size = static_chunk_size
self.use_dynamic_chunk = use_dynamic_chunk
self.use_dynamic_left_chunk = use_dynamic_left_chunk
@ -133,11 +134,11 @@ class BaseEncoder(nn.Layer):
def forward(
self,
xs: torch.Tensor,
xs_lens: torch.Tensor,
xs: paddle.Tensor,
xs_lens: paddle.Tensor,
decoding_chunk_size: int=0,
num_decoding_left_chunks: int=-1,
) -> Tuple[torch.Tensor, torch.Tensor]:
) -> Tuple[paddle.Tensor, paddle.Tensor]:
"""Embed positions in tensor.
Args:
xs: padded input tensor (B, L, D)
@ -147,16 +148,16 @@ class BaseEncoder(nn.Layer):
<0: for decoding, use full chunk.
>0: for decoding, use fixed chunk size as set.
num_decoding_left_chunks: number of left chunks, this is for decoding,
the chunk size is decoding_chunk_size.
the chunk size is decoding_chunk_size.
>=0: use num_decoding_left_chunks
<0: use all left chunks
Returns:
encoder output tensor and subsampled padding mask
"""
masks = ~make_pad_mask(xs_lens).unsqueeze(1) # (B, 1, L)
masks = make_non_pad_mask(xs_lens).unsqueeze(1) # (B, 1, L)
if self.global_cmvn is not None:
xs = self.global_cmvn(xs)
xs, pos_emb, masks = self.embed(xs, masks)
xs, pos_emb, masks = self.embed(xs, masks, offset=0)
mask_pad = ~masks
chunk_masks = add_optional_chunk_mask(
xs, masks, self.use_dynamic_chunk, self.use_dynamic_left_chunk,
@ -173,48 +174,52 @@ class BaseEncoder(nn.Layer):
def forward_chunk(
self,
xs: torch.Tensor,
xs: paddle.Tensor,
offset: int,
required_cache_size: int,
subsampling_cache: Optional[torch.Tensor]=None,
elayers_output_cache: Optional[List[torch.Tensor]]=None,
conformer_cnn_cache: Optional[List[torch.Tensor]]=None,
) -> Tuple[torch.Tensor, torch.Tensor, List[torch.Tensor], List[
torch.Tensor]]:
subsampling_cache: Optional[paddle.Tensor]=None,
elayers_output_cache: Optional[List[paddle.Tensor]]=None,
conformer_cnn_cache: Optional[List[paddle.Tensor]]=None,
) -> Tuple[paddle.Tensor, paddle.Tensor, List[paddle.Tensor], List[
paddle.Tensor]]:
""" Forward just one chunk
Args:
xs (torch.Tensor): chunk input
xs (paddle.Tensor): chunk input, [B=1, T, D]
offset (int): current offset in encoder output time stamp
required_cache_size (int): cache size required for next chunk
computation
>=0: actual cache size
<0: means all history cache is required
subsampling_cache (Optional[torch.Tensor]): subsampling cache
elayers_output_cache (Optional[List[torch.Tensor]]):
subsampling_cache (Optional[paddle.Tensor]): subsampling cache
elayers_output_cache (Optional[List[paddle.Tensor]]):
transformer/conformer encoder layers output cache
conformer_cnn_cache (Optional[List[torch.Tensor]]): conformer
conformer_cnn_cache (Optional[List[paddle.Tensor]]): conformer
cnn cache
Returns:
torch.Tensor: output of current input xs
torch.Tensor: subsampling cache required for next chunk computation
List[torch.Tensor]: encoder layers output cache required for next
paddle.Tensor: output of current input xs
paddle.Tensor: subsampling cache required for next chunk computation
List[paddle.Tensor]: encoder layers output cache required for next
chunk computation
List[torch.Tensor]: conformer cnn cache
List[paddle.Tensor]: conformer cnn cache
"""
assert xs.size(0) == 1
assert xs.size(0) == 1 # batch size must be one
# tmp_masks is just for interface compatibility
tmp_masks = torch.ones(
1, xs.size(1), device=xs.device, dtype=torch.bool)
tmp_masks = tmp_masks.unsqueeze(1)
tmp_masks = paddle.ones([1, xs.size(1)], dtype=paddle.bool)
tmp_masks = tmp_masks.unsqueeze(1) #[B=1, C=1, T]
if self.global_cmvn is not None:
xs = self.global_cmvn(xs)
xs, pos_emb, _ = self.embed(xs, tmp_masks, offset)
xs, pos_emb, _ = self.embed(
xs, tmp_masks, offset=offset) #xs=(B, T, D), pos_emb=(B=1, T, D)
if subsampling_cache is not None:
cache_size = subsampling_cache.size(1)
xs = torch.cat((subsampling_cache, xs), dim=1)
cache_size = subsampling_cache.size(1) #T
xs = paddle.cat((subsampling_cache, xs), dim=1)
else:
cache_size = 0
pos_emb = self.embed.position_encoding(offset - cache_size, xs.size(1))
pos_emb = self.embed.position_encoding(
offset=offset - cache_size, size=xs.size(1))
if required_cache_size < 0:
next_cache_start = 0
elif required_cache_size == 0:
@ -222,20 +227,17 @@ class BaseEncoder(nn.Layer):
else:
next_cache_start = xs.size(1) - required_cache_size
r_subsampling_cache = xs[:, next_cache_start:, :]
# Real mask for transformer/conformer layers
masks = torch.ones(1, xs.size(1), device=xs.device, dtype=torch.bool)
masks = masks.unsqueeze(1)
masks = paddle.ones([1, xs.size(1)], dtype=paddle.bool)
masks = masks.unsqueeze(1) #[B=1, C=1, T]
r_elayers_output_cache = []
r_conformer_cnn_cache = []
for i, layer in enumerate(self.encoders):
if elayers_output_cache is None:
attn_cache = None
else:
attn_cache = elayers_output_cache[i]
if conformer_cnn_cache is None:
cnn_cache = None
else:
cnn_cache = conformer_cnn_cache[i]
attn_cache = None if elayers_output_cache is None else elayers_output_cache[
i]
cnn_cache = None if conformer_cnn_cache is None else conformer_cnn_cache[
i]
xs, _, new_cnn_cache = layer(
xs,
masks,
@ -252,10 +254,10 @@ class BaseEncoder(nn.Layer):
def forward_chunk_by_chunk(
self,
xs: torch.Tensor,
xs: paddle.Tensor,
decoding_chunk_size: int,
num_decoding_left_chunks: int=-1,
) -> Tuple[torch.Tensor, torch.Tensor]:
) -> Tuple[paddle.Tensor, paddle.Tensor]:
""" Forward input chunk by chunk with chunk_size like a streaming
fashion
Here we should pay special attention to computation cache in the
@ -277,24 +279,27 @@ class BaseEncoder(nn.Layer):
layers in subsampling, we need to rewrite it to make it work
with cache, which is not preferred.
Args:
xs (torch.Tensor): (1, max_len, dim)
chunk_size (int): decoding chunk size
xs (paddle.Tensor): (1, max_len, dim)
chunk_size (int): decoding chunk size.
num_left_chunks (int): decoding with num left chunks.
"""
assert decoding_chunk_size > 0
# The model is trained by static or dynamic chunk
assert self.static_chunk_size > 0 or self.use_dynamic_chunk
# feature stride and window for `subsampling` module
subsampling = self.embed.subsampling_rate
context = self.embed.right_context + 1 # Add current frame
stride = subsampling * decoding_chunk_size
decoding_window = (decoding_chunk_size - 1) * subsampling + context
num_frames = xs.size(1)
subsampling_cache: Optional[torch.Tensor] = None
elayers_output_cache: Optional[List[torch.Tensor]] = None
conformer_cnn_cache: Optional[List[torch.Tensor]] = None
required_cache_size = decoding_chunk_size * num_decoding_left_chunks
subsampling_cache: Optional[paddle.Tensor] = None
elayers_output_cache: Optional[List[paddle.Tensor]] = None
conformer_cnn_cache: Optional[List[paddle.Tensor]] = None
outputs = []
offset = 0
required_cache_size = decoding_chunk_size * num_decoding_left_chunks
# Feed forward overlap input step by step
for cur in range(0, num_frames - context + 1, stride):
end = min(cur + decoding_window, num_frames)
@ -305,8 +310,9 @@ class BaseEncoder(nn.Layer):
elayers_output_cache, conformer_cnn_cache)
outputs.append(y)
offset += y.size(1)
ys = torch.cat(outputs, 1)
masks = torch.ones(1, ys.size(1), device=ys.device, dtype=torch.bool)
ys = paddle.cat(outputs, 1)
# fake mask, just for jit script and compatibility with `forward` api
masks = paddle.ones([1, ys.size(1)], dtype=paddle.bool)
masks = masks.unsqueeze(1)
return ys, masks
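A worked example of the stride/window arithmetic above, with hypothetical numbers (a 4x subsampling frontend whose right_context is 6, so context = 7):

```python
subsampling = 4             # frames consumed per encoder output step
context = 6 + 1             # right_context + 1 current frame
decoding_chunk_size = 16

stride = subsampling * decoding_chunk_size                           # 64
decoding_window = (decoding_chunk_size - 1) * subsampling + context  # 67
# each iteration consumes a 67-frame window and advances by 64 frames
```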
@ -330,7 +336,7 @@ class TransformerEncoder(BaseEncoder):
concat_after: bool=False,
static_chunk_size: int=0,
use_dynamic_chunk: bool=False,
global_cmvn: torch.nn.Module=None,
global_cmvn: nn.Layer=None,
use_dynamic_left_chunk: bool=False, ):
""" Construct TransformerEncoder
See Encoder for the meaning of each parameter.
@ -342,14 +348,16 @@ class TransformerEncoder(BaseEncoder):
pos_enc_layer_type, normalize_before, concat_after,
static_chunk_size, use_dynamic_chunk, global_cmvn,
use_dynamic_left_chunk)
self.encoders = torch.nn.ModuleList([
self.encoders = nn.ModuleList([
TransformerEncoderLayer(
output_size,
MultiHeadedAttention(attention_heads, output_size,
attention_dropout_rate),
PositionwiseFeedForward(output_size, linear_units,
dropout_rate), dropout_rate,
normalize_before, concat_after) for _ in range(num_blocks)
size=output_size,
self_attn=MultiHeadedAttention(attention_heads, output_size,
attention_dropout_rate),
feed_forward=PositionwiseFeedForward(output_size, linear_units,
dropout_rate),
dropout_rate=dropout_rate,
normalize_before=normalize_before,
concat_after=concat_after) for _ in range(num_blocks)
])
@ -396,6 +404,7 @@ class ConformerEncoder(BaseEncoder):
use_cnn_module (bool): Whether to use convolution module.
cnn_module_kernel (int): Kernel size of convolution module.
causal (bool): whether to use causal convolution or not.
cnn_module_norm (str): cnn conv norm type, one of ['batch_norm', 'layer_norm']
"""
assert check_argument_types()
super().__init__(input_size, output_size, attention_heads, linear_units,
@ -409,26 +418,26 @@ class ConformerEncoder(BaseEncoder):
# self-attention module definition
encoder_selfattn_layer = RelPositionMultiHeadedAttention
encoder_selfattn_layer_args = (attention_heads, output_size,
attention_dropout_rate, )
attention_dropout_rate)
# feed-forward module definition
positionwise_layer = PositionwiseFeedForward
positionwise_layer_args = (output_size, linear_units, dropout_rate,
activation, )
activation)
# convolution module definition
convolution_layer = ConvolutionModule
convolution_layer_args = (output_size, cnn_module_kernel, activation,
cnn_module_norm, causal)
self.encoders = torch.nn.ModuleList([
self.encoders = nn.ModuleList([
ConformerEncoderLayer(
output_size,
encoder_selfattn_layer(*encoder_selfattn_layer_args),
positionwise_layer(*positionwise_layer_args),
positionwise_layer(*positionwise_layer_args)
if macaron_style else None,
convolution_layer(*convolution_layer_args)
size=output_size,
self_attn=encoder_selfattn_layer(*encoder_selfattn_layer_args),
feed_forward=positionwise_layer(*positionwise_layer_args),
feed_forward_macaron=positionwise_layer(
*positionwise_layer_args) if macaron_style else None,
conv_module=convolution_layer(*convolution_layer_args)
if use_cnn_module else None,
dropout_rate,
normalize_before,
concat_after, ) for _ in range(num_blocks)
dropout_rate=dropout_rate,
normalize_before=normalize_before,
concat_after=concat_after) for _ in range(num_blocks)
])

@ -72,6 +72,7 @@ class TransformerEncoderLayer(nn.Layer):
x: paddle.Tensor,
mask: paddle.Tensor,
pos_emb: paddle.Tensor,
mask_pad: Optional[paddle.Tensor]=None,
output_cache: Optional[paddle.Tensor]=None,
cnn_cache: Optional[paddle.Tensor]=None,
) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]:
@ -81,6 +82,8 @@ class TransformerEncoderLayer(nn.Layer):
mask (paddle.Tensor): Mask tensor for the input (#batch, time).
pos_emb (paddle.Tensor): just for interface compatibility
to ConformerEncoderLayer
mask_pad (paddle.Tensor): not used in the transformer layer,
just for a unified api with the conformer layer.
output_cache (paddle.Tensor): Cache tensor of the output
(#batch, time2, size), time2 < time in x.
cnn_cache (paddle.Tensor): not used here, it's for interface
@ -88,6 +91,7 @@ class TransformerEncoderLayer(nn.Layer):
Returns:
paddle.Tensor: Output tensor (#batch, time, size).
paddle.Tensor: Mask tensor (#batch, time).
paddle.Tensor: Fake cnn cache tensor for api compatibility with Conformer (#batch, channels, time').
"""
residual = x
if self.normalize_before:
@ -202,12 +206,13 @@ class ConformerEncoderLayer(nn.Layer):
pos_emb (paddle.Tensor): positional encoding, must not be None
for ConformerEncoderLayer.
mask_pad (paddle.Tensor): batch padding mask used for conv module, (B, 1, T).
output_cache (paddle.Tensor): Cache tensor of the output
output_cache (paddle.Tensor): Cache tensor of the encoder output
(#batch, time2, size), time2 < time in x.
cnn_cache (paddle.Tensor): Convolution cache in conformer layer
Returns:
paddle.Tensor: Output tensor (#batch, time, size).
paddle.Tensor: Mask tensor (#batch, time).
paddle.Tensor: New cnn cache tensor (#batch, channels, time').
"""
# whether to use macaron style FFN
if self.feed_forward_macaron is not None:

@ -56,13 +56,13 @@ def subsequent_mask(
This mask is used only in decoder which works in an auto-regressive mode.
This means the current step could only do attention with its left steps.
In encoder, fully attention is used when streaming is not necessary and
the sequence is not long. In this case, no attention mask is needed.
the sequence is not long. In this case, no attention mask is needed.
When streaming is need, chunk-based attention is used in encoder. See
subsequent_chunk_mask for the chunk-based attention mask.
Args:
size (int): size of mask
Returns:
paddle.Tensor: mask
paddle.Tensor: mask, [size, size]
Examples:
>>> subsequent_mask(3)
[[1, 0, 0],
@ -86,7 +86,7 @@ def subsequent_chunk_mask(
<0: use full chunk
>=0: use num_left_chunks
Returns:
paddle.Tensor: mask
paddle.Tensor: mask, [size, size]
Examples:
>>> subsequent_chunk_mask(4, 2)
[[1, 1, 0, 0],
@ -99,8 +99,8 @@ def subsequent_chunk_mask(
if num_left_chunks < 0:
start = 0
else:
start = max((i // chunk_size - num_left_chunks) * chunk_size, 0)
ending = min((i // chunk_size + 1) * chunk_size, size)
start = max(0, (i // chunk_size - num_left_chunks) * chunk_size)
ending = min(size, (i // chunk_size + 1) * chunk_size)
ret[i, start:ending] = True
return ret
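A quick usage sketch contrasting the two limit cases, assuming the wenet-style signature `subsequent_chunk_mask(size, chunk_size, num_left_chunks=-1)`; the first call matches the docstring's example:

```python
m_full = subsequent_chunk_mask(4, 2)   # default: attend to all left chunks
# [[1, 1, 0, 0],
#  [1, 1, 0, 0],
#  [1, 1, 1, 1],
#  [1, 1, 1, 1]]

m_local = subsequent_chunk_mask(4, 2, num_left_chunks=0)  # current chunk only
# [[1, 1, 0, 0],
#  [1, 1, 0, 0],
#  [0, 0, 1, 1],
#  [0, 0, 1, 1]]
```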

@ -41,7 +41,7 @@ class RNNCell(nn.RNNCellBase):
"""
def __init__(self,
hidden_size,
hidden_size: int,
activation="tanh",
weight_ih_attr=None,
weight_hh_attr=None,
@ -108,8 +108,8 @@ class GRUCell(nn.RNNCellBase):
"""
def __init__(self,
input_size,
hidden_size,
input_size: int,
hidden_size: int,
weight_ih_attr=None,
weight_hh_attr=None,
bias_ih_attr=None,
@ -132,7 +132,6 @@ class GRUCell(nn.RNNCellBase):
self.input_size = input_size
self._gate_activation = F.sigmoid
self._activation = paddle.tanh
#self._activation = F.relu
def forward(self, inputs, states=None):
if states is None:
@ -171,8 +170,6 @@ class BiRNNWithBN(nn.Layer):
"""Bidirectonal simple rnn layer with sequence-wise batch normalization.
The batch normalization is only performed on input-state weights.
:param name: Name of the layer parameters.
:type name: string
:param size: Dimension of RNN cells.
:type size: int
:param share_weights: Whether to share input-hidden weights between
@ -182,7 +179,7 @@ class BiRNNWithBN(nn.Layer):
:rtype: Variable
"""
def __init__(self, i_size, h_size, share_weights):
def __init__(self, i_size: int, h_size: int, share_weights: bool):
super().__init__()
self.share_weights = share_weights
if self.share_weights:
@ -208,7 +205,7 @@ class BiRNNWithBN(nn.Layer):
self.bw_rnn = nn.RNN(
self.fw_cell, is_reverse=True, time_major=False) #[B, T, D]
def forward(self, x, x_len):
def forward(self, x: paddle.Tensor, x_len: paddle.Tensor):
# x, shape [B, T, D]
fw_x = self.fw_bn(self.fw_fc(x))
bw_x = self.bw_bn(self.bw_fc(x))
@ -234,7 +231,7 @@ class BiGRUWithBN(nn.Layer):
:rtype: Variable
"""
def __init__(self, i_size, h_size, act):
def __init__(self, i_size: int, h_size: int):
super().__init__()
hidden_size = h_size * 3
@ -281,23 +278,29 @@ class RNNStack(nn.Layer):
:rtype: Variable
"""
def __init__(self, i_size, h_size, num_stacks, use_gru, share_rnn_weights):
def __init__(self,
i_size: int,
h_size: int,
num_stacks: int,
use_gru: bool,
share_rnn_weights: bool):
super().__init__()
self.rnn_stacks = nn.LayerList()
rnn_stacks = []
for i in range(num_stacks):
if use_gru:
#default:GRU using tanh
self.rnn_stacks.append(
BiGRUWithBN(i_size=i_size, h_size=h_size, act="relu"))
rnn_stacks.append(BiGRUWithBN(i_size=i_size, h_size=h_size))
else:
self.rnn_stacks.append(
rnn_stacks.append(
BiRNNWithBN(
i_size=i_size,
h_size=h_size,
share_weights=share_rnn_weights))
i_size = h_size * 2
def forward(self, x, x_len):
self.rnn_stacks = nn.Sequential(*rnn_stacks)
def forward(self, x: paddle.Tensor, x_len: paddle.Tensor):
"""
x: shape [B, T, D]
x_len: shape [B]
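A hypothetical construction sketch using the argument types annotated above; all sizes are placeholders:

```python
import paddle

stack = RNNStack(i_size=161, h_size=1024, num_stacks=3,
                 use_gru=False, share_rnn_weights=True)
x = paddle.randn([4, 50, 161])              # [B, T, D]
x_len = paddle.to_tensor([50, 42, 30, 12])  # [B]
out = stack(x, x_len)                       # per this file's forward signature
```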

@ -32,10 +32,12 @@ __all__ = [
class BaseSubsampling(nn.Layer):
def __init__(self, pos_enc_class: PositionalEncoding):
def __init__(self, pos_enc_class: nn.Layer=PositionalEncoding):
super().__init__()
self.pos_enc = pos_enc_class
# window size = (1 + right_context) + (chunk_size -1) * subsampling_rate
self.right_context = 0
# stride = chunk_size * subsampling_rate
self.subsampling_rate = 1
def position_encoding(self, offset: int, size: int) -> paddle.Tensor:
@ -49,7 +51,7 @@ class LinearNoSubsampling(BaseSubsampling):
idim: int,
odim: int,
dropout_rate: float,
pos_enc_class: PositionalEncoding):
pos_enc_class: nn.Layer=PositionalEncoding):
"""Construct an linear object.
Args:
idim (int): Input dimension.
@ -71,6 +73,7 @@ class LinearNoSubsampling(BaseSubsampling):
Args:
x (paddle.Tensor): Input tensor (#batch, time, idim).
x_mask (paddle.Tensor): Input mask (#batch, 1, time).
offset (int): position encoding offset.
Returns:
paddle.Tensor: linear input tensor (#batch, time', odim),
where time' = time.
@ -90,7 +93,7 @@ class Conv2dSubsampling4(BaseSubsampling):
idim: int,
odim: int,
dropout_rate: float,
pos_enc_class: PositionalEncoding):
pos_enc_class: nn.Layer=PositionalEncoding):
"""Construct an Conv2dSubsampling4 object.
Args:
@ -117,6 +120,7 @@ class Conv2dSubsampling4(BaseSubsampling):
Args:
x (paddle.Tensor): Input tensor (#batch, time, idim).
x_mask (paddle.Tensor): Input mask (#batch, 1, time).
offset (int): position encoding offset.
Returns:
paddle.Tensor: Subsampled tensor (#batch, time', odim),
where time' = time // 4.
@ -139,7 +143,7 @@ class Conv2dSubsampling6(BaseSubsampling):
idim: int,
odim: int,
dropout_rate: float,
pos_enc_class: PositionalEncoding):
pos_enc_class: nn.Layer=PositionalEncoding):
"""Construct an Conv2dSubsampling6 object.
Args:
@ -169,6 +173,7 @@ class Conv2dSubsampling6(BaseSubsampling):
Args:
x (paddle.Tensor): Input tensor (#batch, time, idim).
x_mask (paddle.Tensor): Input mask (#batch, 1, time).
offset (int): position encoding offset.
Returns:
paddle.Tensor: Subsampled tensor (#batch, time', odim),
where time' = time // 6.
@ -191,7 +196,7 @@ class Conv2dSubsampling8(BaseSubsampling):
idim: int,
odim: int,
dropout_rate: float,
pos_enc_class: PositionalEncoding):
pos_enc_class: nn.Layer=PositionalEncoding):
"""Construct an Conv2dSubsampling8 object.
Args:
@ -221,6 +226,7 @@ class Conv2dSubsampling8(BaseSubsampling):
Args:
x (paddle.Tensor): Input tensor (#batch, time, idim).
x_mask (paddle.Tensor): Input mask (#batch, 1, time).
offset (int): position encoding offset.
Returns:
paddle.Tensor: Subsampled tensor (#batch, time', odim),
where time' = time // 8.
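A worked example of the time reduction across the three subsampling variants (T = 100 input frames):

```python
T = 100
for rate in (4, 6, 8):      # Conv2dSubsampling4, 6, and 8
    print(rate, T // rate)  # -> 25, 16, and 12 output steps
```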

@ -43,7 +43,7 @@ bash setup.sh
source tools/venv/bin/activate
```
## Running in Docker Container
## Running in Docker Container (optional)
Docker is an open source tool to build, ship, and run distributed applications in an isolated environment. A Docker image for this project is provided on [hub.docker.com](https://hub.docker.com) with all the dependencies installed, including the pre-built PaddlePaddle, CTC decoders, and other necessary Python and third-party packages. This Docker image requires NVIDIA GPU support, so please make sure a GPU is available and [nvidia-docker](https://github.com/NVIDIA/nvidia-docker) has been installed.

@ -0,0 +1,3 @@
# Reference
* [wenet](https://github.com/mobvoi/wenet)