add encoder

pull/578/head
Hui Zhang 5 years ago
parent 426d370413
commit b6d729a675

@ -37,26 +37,26 @@
"name": "stderr",
"output_type": "stream",
"text": [
"2021-03-24 06:30:47,727 - WARNING - register user softmax to paddle, remove this when fixed!\n",
"2021-03-24 06:30:47,728 - WARNING - register user sigmoid to paddle, remove this when fixed!\n",
"2021-03-24 06:30:47,729 - WARNING - register user relu to paddle, remove this when fixed!\n",
"2021-03-24 06:30:47,729 - WARNING - override cat of paddle if exists or register, remove this when fixed!\n",
"2021-03-24 06:30:47,730 - WARNING - override eq of paddle.Tensor if exists or register, remove this when fixed!\n",
"2021-03-24 06:30:47,731 - WARNING - override contiguous of paddle.Tensor if exists or register, remove this when fixed!\n",
"2021-03-24 06:30:47,731 - WARNING - override size of paddle.Tensor (`to_static` do not process `size` property, maybe some `paddle` api dependent on it), remove this when fixed!\n",
"2021-03-24 06:30:47,732 - WARNING - register user view to paddle.Tensor, remove this when fixed!\n",
"2021-03-24 06:30:47,732 - WARNING - register user view_as to paddle.Tensor, remove this when fixed!\n",
"2021-03-24 06:30:47,733 - WARNING - register user masked_fill to paddle.Tensor, remove this when fixed!\n",
"2021-03-24 06:30:47,733 - WARNING - register user masked_fill_ to paddle.Tensor, remove this when fixed!\n",
"2021-03-24 06:30:47,734 - WARNING - register user fill_ to paddle.Tensor, remove this when fixed!\n",
"2021-03-24 06:30:47,734 - WARNING - register user repeat to paddle.Tensor, remove this when fixed!\n",
"2021-03-24 06:30:47,735 - WARNING - register user softmax to paddle.Tensor, remove this when fixed!\n",
"2021-03-24 06:30:47,735 - WARNING - register user sigmoid to paddle.Tensor, remove this when fixed!\n",
"2021-03-24 06:30:47,736 - WARNING - register user relu to paddle.Tensor, remove this when fixed!\n",
"2021-03-24 06:30:47,736 - WARNING - register user glu to paddle.nn.functional, remove this when fixed!\n",
"2021-03-24 06:30:47,737 - WARNING - override ctc_loss of paddle.nn.functional if exists, remove this when fixed!\n",
"2021-03-24 06:30:47,737 - WARNING - register user GLU to paddle.nn, remove this when fixed!\n",
"2021-03-24 06:30:47,738 - WARNING - register user ConstantPad2d to paddle.nn, remove this when fixed!\n",
"2021-03-26 02:55:23,873 - WARNING - register user softmax to paddle, remove this when fixed!\n",
"2021-03-26 02:55:23,875 - WARNING - register user sigmoid to paddle, remove this when fixed!\n",
"2021-03-26 02:55:23,875 - WARNING - register user relu to paddle, remove this when fixed!\n",
"2021-03-26 02:55:23,876 - WARNING - override cat of paddle if exists or register, remove this when fixed!\n",
"2021-03-26 02:55:23,876 - WARNING - override eq of paddle.Tensor if exists or register, remove this when fixed!\n",
"2021-03-26 02:55:23,877 - WARNING - override contiguous of paddle.Tensor if exists or register, remove this when fixed!\n",
"2021-03-26 02:55:23,877 - WARNING - override size of paddle.Tensor (`to_static` do not process `size` property, maybe some `paddle` api dependent on it), remove this when fixed!\n",
"2021-03-26 02:55:23,878 - WARNING - register user view to paddle.Tensor, remove this when fixed!\n",
"2021-03-26 02:55:23,878 - WARNING - register user view_as to paddle.Tensor, remove this when fixed!\n",
"2021-03-26 02:55:23,879 - WARNING - register user masked_fill to paddle.Tensor, remove this when fixed!\n",
"2021-03-26 02:55:23,880 - WARNING - register user masked_fill_ to paddle.Tensor, remove this when fixed!\n",
"2021-03-26 02:55:23,880 - WARNING - register user fill_ to paddle.Tensor, remove this when fixed!\n",
"2021-03-26 02:55:23,881 - WARNING - register user repeat to paddle.Tensor, remove this when fixed!\n",
"2021-03-26 02:55:23,881 - WARNING - register user softmax to paddle.Tensor, remove this when fixed!\n",
"2021-03-26 02:55:23,882 - WARNING - register user sigmoid to paddle.Tensor, remove this when fixed!\n",
"2021-03-26 02:55:23,882 - WARNING - register user relu to paddle.Tensor, remove this when fixed!\n",
"2021-03-26 02:55:23,883 - WARNING - register user glu to paddle.nn.functional, remove this when fixed!\n",
"2021-03-26 02:55:23,883 - WARNING - override ctc_loss of paddle.nn.functional if exists, remove this when fixed!\n",
"2021-03-26 02:55:23,884 - WARNING - register user GLU to paddle.nn, remove this when fixed!\n",
"2021-03-26 02:55:23,884 - WARNING - register user ConstantPad2d to paddle.nn, remove this when fixed!\n",
"/home/ssd5/zhanghui/DeepSpeech2.x/tools/venv-dev/lib/python3.7/site-packages/scipy/fftpack/__init__.py:103: DeprecationWarning: The module numpy.dual is deprecated. Instead of using dual, use the functions directly from numpy or scipy.\n",
" from numpy.dual import register_func\n",
"/home/ssd5/zhanghui/DeepSpeech2.x/tools/venv-dev/lib/python3.7/site-packages/scipy/special/orthogonal.py:81: DeprecationWarning: `np.int` is a deprecated alias for the builtin `int`. To silence this warning, use `int` by itself. Doing this will not modify any behavior and is safe. When replacing `np.int`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.\n",
@ -102,10 +102,10 @@
"output_type": "stream",
"text": [
"0.0.0\n",
"607856a949ed7356237ed8148947f7fd2b0f4631\n",
"ON\n",
"ON\n",
"commit: 607856a949ed7356237ed8148947f7fd2b0f4631\n",
"e7f28d6c0db54eb9c9a810612300b526687e56a6\n",
"OFF\n",
"OFF\n",
"commit: e7f28d6c0db54eb9c9a810612300b526687e56a6\n",
"None\n",
"0\n"
]
@ -226,7 +226,7 @@
"output: None\n",
"params_file: examples/aishell/jit.model.pdiparams\n",
"speech_save_dir: demo_cache\n",
"use_gpu: True\n",
"use_gpu: False\n",
"warmup_manifest: examples/aishell/data/manifest.test\n",
"------------------------------------------------\n"
]
@ -266,7 +266,7 @@
" help=\n",
" \"Model dir, If you load a non-combined model, specify the directory of the model.\"\n",
")\n",
"add_arg(\"--use_gpu\",type=bool,default=True, help=\"Whether use gpu.\")\n",
"add_arg(\"--use_gpu\",type=bool,default=False, help=\"Whether use gpu.\")\n",
"\n",
"\n",
"args = parser.parse_args(\n",
@ -321,7 +321,7 @@
"name": "stderr",
"output_type": "stream",
"text": [
"2021-03-24 06:31:20,943 - INFO - [checkpoint] Rank 0: loaded model from examples/aishell/ckpt-loss2e-3-0.83-5/checkpoints/step-11725.pdparams\n"
"2021-03-26 02:55:57,930 - INFO - [checkpoint] Rank 0: loaded model from examples/aishell/ckpt-loss2e-3-0.83-5/checkpoints/step-11725.pdparams\n"
]
},
{
@ -407,7 +407,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 22,
"metadata": {},
"outputs": [
{
@ -418,7 +418,7 @@
"examples/aishell/jit.model.pdmodel\n",
"examples/aishell/jit.model.pdiparams\n",
"0\n",
"True\n"
"False\n"
]
}
],
@ -428,7 +428,8 @@
"from paddle.inference import PrecisionType\n",
"from paddle.inference import create_predictor\n",
"\n",
"args.use_gpu=True\n",
"args.use_gpu=False\n",
"paddle.set_device('cpu')\n",
"\n",
"def init_predictor(args):\n",
" if args.model_dir is not None:\n",
@ -438,8 +439,8 @@
"\n",
" if args.use_gpu:\n",
" config.enable_use_gpu(memory_pool_init_size_mb=1000, device_id=0)\n",
" config.enable_tensorrt_engine(precision_mode=PrecisionType.Float32,\n",
" use_calib_mode=True) # 开启TensorRT预测精度为fp32开启int8离线量化\n",
"# config.enable_tensorrt_engine(precision_mode=PrecisionType.Float32,\n",
"# use_calib_mode=True) # 开启TensorRT预测精度为fp32开启int8离线量化\n",
" else:\n",
" # If not specific mkldnn, you can set the blas thread.\n",
" # The thread num should not be greater than the number of cores in the CPU.\n",
@ -447,7 +448,7 @@
" config.enable_mkldnn()\n",
" \n",
" config.enable_memory_optim()\n",
" config.switch_ir_optim(False)\n",
" config.switch_ir_optim(True)\n",
" \n",
" print(config.model_dir())\n",
" print(config.prog_file())\n",
@ -534,7 +535,7 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 23,
"metadata": {},
"outputs": [
{
@ -546,49 +547,35 @@
"input: 0 audio\n",
"input: 1 audio_len\n",
"output: 0 tmp_75\n",
"jit: [[[8.91791242e-12 4.45650548e-12 3.67574104e-09 ... 8.91772593e-12\n",
" 8.91578738e-12 4.64319072e-08]\n",
" [1.55952011e-15 2.62797088e-14 4.50428670e-12 ... 1.55946061e-15\n",
" 1.55893121e-15 9.99992609e-01]\n",
" [1.24638590e-17 7.61802427e-16 2.93266930e-14 ... 1.24633842e-17\n",
" 1.24587735e-17 1.00000000e+00]\n",
"jit: [[[8.91786298e-12 4.45648032e-12 3.67572750e-09 ... 8.91767563e-12\n",
" 8.91573707e-12 4.64317296e-08]\n",
" [1.55950222e-15 2.62794089e-14 4.50423509e-12 ... 1.55944271e-15\n",
" 1.55891342e-15 9.99992609e-01]\n",
" [1.24638127e-17 7.61802427e-16 2.93265812e-14 ... 1.24633371e-17\n",
" 1.24587264e-17 1.00000000e+00]\n",
" ...\n",
" [4.37491543e-15 2.43678580e-12 1.98772032e-12 ... 4.37483242e-15\n",
" 4.37358093e-15 1.00000000e+00]\n",
" [3.89338410e-13 1.66756747e-11 1.42901749e-11 ... 3.89333233e-13\n",
" 3.89255983e-13 1.00000000e+00]\n",
" [1.00350561e-10 2.56295180e-10 2.91178692e-10 ... 1.00348452e-10\n",
" 1.00334671e-10 9.99998808e-01]]] <class 'numpy.ndarray'>\n",
" [4.37488240e-15 2.43676260e-12 1.98770514e-12 ... 4.37479896e-15\n",
" 4.37354747e-15 1.00000000e+00]\n",
" [3.89334696e-13 1.66754856e-11 1.42900388e-11 ... 3.89329492e-13\n",
" 3.89252270e-13 1.00000000e+00]\n",
" [1.00349985e-10 2.56293708e-10 2.91177582e-10 ... 1.00347876e-10\n",
" 1.00334095e-10 9.99998808e-01]]] <class 'numpy.ndarray'>\n",
"[1, 161, 522]\n",
"[1]\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/ssd5/zhanghui/DeepSpeech2.x/tools/venv-dev/lib/python3.7/site-packages/paddlepaddle_gpu-0.0.0-py3.7-linux-x86_64.egg/paddle/fluid/layers/utils.py:77: DeprecationWarning: Using or importing the ABCs from 'collections' instead of from 'collections.abc' is deprecated, and in 3.8 it will stop working\n",
" return (isinstance(seq, collections.Sequence) and\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"paddle: [[[8.91791242e-12 4.45650548e-12 3.67574104e-09 ... 8.91772593e-12\n",
" 8.91578738e-12 4.64319072e-08]\n",
" [1.55952011e-15 2.62797088e-14 4.50428670e-12 ... 1.55946061e-15\n",
" 1.55893121e-15 9.99992609e-01]\n",
" [1.24638590e-17 7.61802427e-16 2.93266930e-14 ... 1.24633842e-17\n",
"[1]\n",
"paddle: [[[8.91789680e-12 4.45649724e-12 3.67574149e-09 ... 8.91770945e-12\n",
" 8.91577090e-12 4.64319072e-08]\n",
" [1.55950222e-15 2.62794089e-14 4.50423509e-12 ... 1.55944271e-15\n",
" 1.55891342e-15 9.99992609e-01]\n",
" [1.24638599e-17 7.61805339e-16 2.93267472e-14 ... 1.24633842e-17\n",
" 1.24587735e-17 1.00000000e+00]\n",
" ...\n",
" [4.37491543e-15 2.43678580e-12 1.98772032e-12 ... 4.37483242e-15\n",
" 4.37358093e-15 1.00000000e+00]\n",
" [3.89338410e-13 1.66756747e-11 1.42901749e-11 ... 3.89333233e-13\n",
" 3.89255983e-13 1.00000000e+00]\n",
" [1.00350561e-10 2.56295180e-10 2.91178692e-10 ... 1.00348452e-10\n",
" 1.00334671e-10 9.99998808e-01]]]\n",
"True\n"
" [4.37488240e-15 2.43676737e-12 1.98770514e-12 ... 4.37479896e-15\n",
" 4.37354747e-15 1.00000000e+00]\n",
" [3.89336187e-13 1.66755481e-11 1.42900925e-11 ... 3.89330983e-13\n",
" 3.89253761e-13 1.00000000e+00]\n",
" [1.00349985e-10 2.56293708e-10 2.91177582e-10 ... 1.00347876e-10\n",
" 1.00334095e-10 9.99998808e-01]]]\n",
"False\n"
]
}
],
@ -607,7 +594,7 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 21,
"metadata": {},
"outputs": [
{
@ -618,19 +605,19 @@
"input: 0 audio\n",
"input: 1 audio_len\n",
"output: 0 tmp_75\n",
"jit: [[[8.91791242e-12 4.45650548e-12 3.67574104e-09 ... 8.91772593e-12\n",
" 8.91578738e-12 4.64319072e-08]\n",
" [1.55952011e-15 2.62797088e-14 4.50428670e-12 ... 1.55946061e-15\n",
" 1.55893121e-15 9.99992609e-01]\n",
" [1.24638590e-17 7.61802427e-16 2.93266930e-14 ... 1.24633842e-17\n",
"jit: [[[8.91789680e-12 4.45649724e-12 3.67574149e-09 ... 8.91770945e-12\n",
" 8.91577090e-12 4.64319072e-08]\n",
" [1.55950222e-15 2.62794089e-14 4.50423509e-12 ... 1.55944271e-15\n",
" 1.55891342e-15 9.99992609e-01]\n",
" [1.24638599e-17 7.61805339e-16 2.93267472e-14 ... 1.24633842e-17\n",
" 1.24587735e-17 1.00000000e+00]\n",
" ...\n",
" [4.37491543e-15 2.43678580e-12 1.98772032e-12 ... 4.37483242e-15\n",
" 4.37358093e-15 1.00000000e+00]\n",
" [3.89338410e-13 1.66756747e-11 1.42901749e-11 ... 3.89333233e-13\n",
" 3.89255983e-13 1.00000000e+00]\n",
" [1.00350561e-10 2.56295180e-10 2.91178692e-10 ... 1.00348452e-10\n",
" 1.00334671e-10 9.99998808e-01]]]\n"
" [4.37488240e-15 2.43676737e-12 1.98770514e-12 ... 4.37479896e-15\n",
" 4.37354747e-15 1.00000000e+00]\n",
" [3.89336187e-13 1.66755481e-11 1.42900925e-11 ... 3.89330983e-13\n",
" 3.89253761e-13 1.00000000e+00]\n",
" [1.00349985e-10 2.56293708e-10 2.91177582e-10 ... 1.00347876e-10\n",
" 1.00334095e-10 9.99998808e-01]]]\n"
]
}
],

@ -454,7 +454,7 @@
" act='brelu')\n",
"\n",
" out_channel = 32\n",
" self.conv_stack = nn.LayerList([\n",
" self.conv_stack = nn.Sequential([\n",
" ConvBn(\n",
" num_channels_in=32,\n",
" num_channels_out=out_channel,\n",

@ -59,3 +59,6 @@ You are welcome to submit questions and bug reports in [Github Issues](https://g
## License
DeepSpeech is provided under the [Apache-2.0 License](./LICENSE).
## Acknowledgement
We depend on many open-source repositories. See [References](docs/reference.md) for more information.

@ -56,3 +56,6 @@ source tools/venv/bin/activate
## License
DeepSpeech is provided under the [Apache-2.0 License](./LICENSE).
## Acknowledgement
We referenced a number of excellent open-source repositories during development. See [References](docs/reference.md) for details.

@ -266,8 +266,17 @@ logger.warn(
)
F.ctc_loss = ctc_loss
########### hack paddle.nn #############
if not hasattr(paddle.nn, 'Module'):
logger.warn("register user Module to paddle.nn, remove this when fixed!")
setattr(paddle.nn, 'Module', paddle.nn.Layer)
if not hasattr(paddle.nn, 'ModuleList'):
logger.warn(
"register user ModuleList to paddle.nn, remove this when fixed!")
setattr(paddle.nn, 'ModuleList', paddle.nn.LayerList)
class GLU(nn.Layer):
"""Gated Linear Units (GLU) Layer"""

@ -143,7 +143,8 @@ def get_activation(act):
"relu": paddle.nn.ReLU,
"selu": paddle.nn.SELU,
"swish": paddle.nn.Swish,
"gelu": paddle.nn.GELU
"gelu": paddle.nn.GELU,
"brelu": brelu,
}
return activation_funcs[act]()
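
Since `get_activation` instantiates its table entry via `activation_funcs[act]()`, the `brelu` entry must be callable with no arguments. A hedged sketch of a bounded-ReLU layer that would fit this table (the class shape and the clip bound t=24, taken from the DeepSpeech2 paper, are assumptions):

```python
import paddle
import paddle.nn as nn

class BReLU(nn.Layer):
    """Bounded ReLU: clip(x, 0, t)."""
    def __init__(self, t: float = 24.0):
        super().__init__()
        self.t = t

    def forward(self, x: paddle.Tensor) -> paddle.Tensor:
        return paddle.clip(x, min=0.0, max=self.t)
```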

@ -51,7 +51,7 @@ class PositionalEncoding(nn.Layer):
self.pe = paddle.zeros([self.max_len, self.d_model]) #[T,D]
position = paddle.arange(
0, self.max_len, dtype=paddle.float32).unsqueeze(1)
0, self.max_len, dtype=paddle.float32).unsqueeze(1) #[T, 1]
div_term = paddle.exp(
paddle.arange(0, self.d_model, 2, dtype=paddle.float32) *
-(math.log(10000.0) / self.d_model))
@ -68,7 +68,7 @@ class PositionalEncoding(nn.Layer):
offset (int): position offset
Returns:
paddle.Tensor: Encoded tensor. Its shape is (batch, time, ...)
paddle.Tensor: for compatibility to RelPositionalEncoding
paddle.Tensor: for compatibility to RelPositionalEncoding, (batch=1, time, ...)
"""
T = paddle.shape(x)[1]
assert offset + T < self.max_len
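
Putting the hunks above together, a small standalone sketch of the sinusoidal table this class builds (shapes annotated; the interleaving is written with stack/reshape for clarity, which may differ from the repo's slice-assignment code):

```python
import math
import paddle

d_model, max_len = 8, 100
position = paddle.arange(0, max_len, dtype=paddle.float32).unsqueeze(1)  # [T, 1]
div_term = paddle.exp(
    paddle.arange(0, d_model, 2, dtype=paddle.float32) *
    -(math.log(10000.0) / d_model))                                      # [D/2]
angles = position * div_term                                             # [T, D/2]
# even dims get sin, odd dims get cos; stacking then flattening interleaves them
pe = paddle.stack([paddle.sin(angles), paddle.cos(angles)], axis=-1)     # [T, D/2, 2]
pe = pe.reshape([max_len, d_model])                                      # [T, D]
```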

@ -59,16 +59,16 @@ class BaseEncoder(nn.Layer):
concat_after: bool=False,
static_chunk_size: int=0,
use_dynamic_chunk: bool=False,
global_cmvn: torch.nn.Module=None,
global_cmvn: paddle.nn.Layer=None,
use_dynamic_left_chunk: bool=False, ):
"""
Args:
input_size (int): input dim
output_size (int): dimension of attention
input_size (int): input dim, d_feature
output_size (int): dimension of attention, d_model
attention_heads (int): the number of heads of multi head attention
linear_units (int): the number of hidden units in the position-wise feed
forward layer
num_blocks (int): the number of decoder blocks
num_blocks (int): the number of encoder blocks
dropout_rate (float): dropout rate
attention_dropout_rate (float): dropout rate in attention
positional_dropout_rate (float): dropout rate after adding
@ -89,7 +89,7 @@ class BaseEncoder(nn.Layer):
use_dynamic_chunk (bool): whether to use dynamic chunk size for
training or not. You can only use a fixed chunk (chunk_size > 0)
or a dynamic chunk size (use_dynamic_chunk = True)
global_cmvn (Optional[torch.nn.Module]): Optional GlobalCMVN module
global_cmvn (Optional[paddle.nn.Layer]): Optional GlobalCMVN layer
use_dynamic_left_chunk (bool): whether use dynamic left chunk in
dynamic chunk training
"""
@ -117,13 +117,14 @@ class BaseEncoder(nn.Layer):
self.global_cmvn = global_cmvn
self.embed = subsampling_class(
input_size,
output_size,
dropout_rate,
pos_enc_class(output_size, positional_dropout_rate), )
idim=input_size,
odim=output_size,
dropout_rate=dropout_rate,
pos_enc_class=pos_enc_class(
d_model=output_size, dropout_rate=positional_dropout_rate), )
self.normalize_before = normalize_before
self.after_norm = torch.nn.LayerNorm(output_size, eps=1e-12)
self.after_norm = nn.LayerNorm(output_size, epsilon=1e-12)
self.static_chunk_size = static_chunk_size
self.use_dynamic_chunk = use_dynamic_chunk
self.use_dynamic_left_chunk = use_dynamic_left_chunk
@ -133,11 +134,11 @@ class BaseEncoder(nn.Layer):
def forward(
self,
xs: torch.Tensor,
xs_lens: torch.Tensor,
xs: paddle.Tensor,
xs_lens: paddle.Tensor,
decoding_chunk_size: int=0,
num_decoding_left_chunks: int=-1,
) -> Tuple[torch.Tensor, torch.Tensor]:
) -> Tuple[paddle.Tensor, paddle.Tensor]:
"""Embed positions in tensor.
Args:
xs: padded input tensor (B, L, D)
@ -153,10 +154,10 @@ class BaseEncoder(nn.Layer):
Returns:
encoder output tensor, lens and mask
"""
masks = ~make_pad_mask(xs_lens).unsqueeze(1) # (B, 1, L)
masks = make_non_pad_mask(xs_lens).unsqueeze(1) # (B, 1, L)
if self.global_cmvn is not None:
xs = self.global_cmvn(xs)
xs, pos_emb, masks = self.embed(xs, masks)
xs, pos_emb, masks = self.embed(xs, masks, offset=0)
mask_pad = ~masks
chunk_masks = add_optional_chunk_mask(
xs, masks, self.use_dynamic_chunk, self.use_dynamic_left_chunk,
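
For reference, `make_non_pad_mask` used above is the positive counterpart of the `~make_pad_mask(...)` idiom it replaces; a minimal sketch of its semantics (this is the standard length-to-mask construction, not necessarily the repo's exact implementation):

```python
import paddle

def make_non_pad_mask(lengths: paddle.Tensor) -> paddle.Tensor:
    """lengths: [B] int tensor -> bool mask [B, T_max], True on valid frames."""
    batch_size = lengths.shape[0]
    max_len = int(lengths.max())
    seq_range = paddle.arange(0, max_len, dtype=paddle.int64)         # [T]
    seq_range = seq_range.unsqueeze(0).expand([batch_size, max_len])  # [B, T]
    return seq_range < lengths.unsqueeze(-1)                          # [B, T] bool
```
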
@ -173,48 +174,52 @@ class BaseEncoder(nn.Layer):
def forward_chunk(
self,
xs: torch.Tensor,
xs: paddle.Tensor,
offset: int,
required_cache_size: int,
subsampling_cache: Optional[torch.Tensor]=None,
elayers_output_cache: Optional[List[torch.Tensor]]=None,
conformer_cnn_cache: Optional[List[torch.Tensor]]=None,
) -> Tuple[torch.Tensor, torch.Tensor, List[torch.Tensor], List[
torch.Tensor]]:
subsampling_cache: Optional[paddle.Tensor]=None,
elayers_output_cache: Optional[List[paddle.Tensor]]=None,
conformer_cnn_cache: Optional[List[paddle.Tensor]]=None,
) -> Tuple[paddle.Tensor, paddle.Tensor, List[paddle.Tensor], List[
paddle.Tensor]]:
""" Forward just one chunk
Args:
xs (torch.Tensor): chunk input
xs (paddle.Tensor): chunk input, [B=1, T, D]
offset (int): current offset in encoder output time stamp
required_cache_size (int): cache size required for next chunk
computation
>=0: actual cache size
<0: means all history cache is required
subsampling_cache (Optional[torch.Tensor]): subsampling cache
elayers_output_cache (Optional[List[torch.Tensor]]):
subsampling_cache (Optional[paddle.Tensor]): subsampling cache
elayers_output_cache (Optional[List[paddle.Tensor]]):
transformer/conformer encoder layers output cache
conformer_cnn_cache (Optional[List[torch.Tensor]]): conformer
conformer_cnn_cache (Optional[List[paddle.Tensor]]): conformer
cnn cache
Returns:
torch.Tensor: output of current input xs
torch.Tensor: subsampling cache required for next chunk computation
List[torch.Tensor]: encoder layers output cache required for next
paddle.Tensor: output of current input xs
paddle.Tensor: subsampling cache required for next chunk computation
List[paddle.Tensor]: encoder layers output cache required for next
chunk computation
List[torch.Tensor]: conformer cnn cache
List[paddle.Tensor]: conformer cnn cache
"""
assert xs.size(0) == 1
assert xs.size(0) == 1 # batch size must be one
# tmp_masks is just for interface compatibility
tmp_masks = torch.ones(
1, xs.size(1), device=xs.device, dtype=torch.bool)
tmp_masks = tmp_masks.unsqueeze(1)
tmp_masks = paddle.ones([1, xs.size(1)], dtype=paddle.bool)
tmp_masks = tmp_masks.unsqueeze(1) #[B=1, C=1, T]
if self.global_cmvn is not None:
xs = self.global_cmvn(xs)
xs, pos_emb, _ = self.embed(xs, tmp_masks, offset)
xs, pos_emb, _ = self.embed(
xs, tmp_masks, offset=offset) #xs=(B, T, D), pos_emb=(B=1, T, D)
if subsampling_cache is not None:
cache_size = subsampling_cache.size(1)
xs = torch.cat((subsampling_cache, xs), dim=1)
cache_size = subsampling_cache.size(1) #T
xs = paddle.cat((subsampling_cache, xs), dim=1)
else:
cache_size = 0
pos_emb = self.embed.position_encoding(offset - cache_size, xs.size(1))
pos_emb = self.embed.position_encoding(
offset=offset - cache_size, size=xs.size(1))
if required_cache_size < 0:
next_cache_start = 0
elif required_cache_size == 0:
@ -222,20 +227,17 @@ class BaseEncoder(nn.Layer):
else:
next_cache_start = xs.size(1) - required_cache_size
r_subsampling_cache = xs[:, next_cache_start:, :]
# Real mask for transformer/conformer layers
masks = torch.ones(1, xs.size(1), device=xs.device, dtype=torch.bool)
masks = masks.unsqueeze(1)
masks = paddle.ones([1, xs.size(1)], dtype=paddle.bool)
masks = masks.unsqueeze(1) #[B=1, C=1, T]
r_elayers_output_cache = []
r_conformer_cnn_cache = []
for i, layer in enumerate(self.encoders):
if elayers_output_cache is None:
attn_cache = None
else:
attn_cache = elayers_output_cache[i]
if conformer_cnn_cache is None:
cnn_cache = None
else:
cnn_cache = conformer_cnn_cache[i]
attn_cache = None if elayers_output_cache is None else elayers_output_cache[
i]
cnn_cache = None if conformer_cnn_cache is None else conformer_cnn_cache[
i]
xs, _, new_cnn_cache = layer(
xs,
masks,
@ -252,10 +254,10 @@ class BaseEncoder(nn.Layer):
def forward_chunk_by_chunk(
self,
xs: torch.Tensor,
xs: paddle.Tensor,
decoding_chunk_size: int,
num_decoding_left_chunks: int=-1,
) -> Tuple[torch.Tensor, torch.Tensor]:
) -> Tuple[paddle.Tensor, paddle.Tensor]:
""" Forward input chunk by chunk with chunk_size like a streaming
fashion
Here we should pay special attention to computation cache in the
@ -277,24 +279,27 @@ class BaseEncoder(nn.Layer):
layers in subsampling, we need to rewrite it to make it work
with cache, which is not preferred.
Args:
xs (torch.Tensor): (1, max_len, dim)
chunk_size (int): decoding chunk size
xs (paddle.Tensor): (1, max_len, dim)
chunk_size (int): decoding chunk size.
num_left_chunks (int): decoding with num left chunks.
"""
assert decoding_chunk_size > 0
# The model is trained by static or dynamic chunk
assert self.static_chunk_size > 0 or self.use_dynamic_chunk
# feature stride and window for `subsampling` module
subsampling = self.embed.subsampling_rate
context = self.embed.right_context + 1 # Add current frame
stride = subsampling * decoding_chunk_size
decoding_window = (decoding_chunk_size - 1) * subsampling + context
num_frames = xs.size(1)
subsampling_cache: Optional[torch.Tensor] = None
elayers_output_cache: Optional[List[torch.Tensor]] = None
conformer_cnn_cache: Optional[List[torch.Tensor]] = None
required_cache_size = decoding_chunk_size * num_decoding_left_chunks
subsampling_cache: Optional[paddle.Tensor] = None
elayers_output_cache: Optional[List[paddle.Tensor]] = None
conformer_cnn_cache: Optional[List[paddle.Tensor]] = None
outputs = []
offset = 0
required_cache_size = decoding_chunk_size * num_decoding_left_chunks
# Feed forward overlap input step by step
for cur in range(0, num_frames - context + 1, stride):
end = min(cur + decoding_window, num_frames)
@ -305,8 +310,9 @@ class BaseEncoder(nn.Layer):
elayers_output_cache, conformer_cnn_cache)
outputs.append(y)
offset += y.size(1)
ys = torch.cat(outputs, 1)
masks = torch.ones(1, ys.size(1), device=ys.device, dtype=torch.bool)
ys = paddle.cat(outputs, 1)
# fake mask, just for jit script and compatibility with `forward` api
masks = paddle.ones([1, ys.size(1)], dtype=paddle.bool)
masks = masks.unsqueeze(1)
return ys, masks
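
A worked example of the window arithmetic driving this loop, assuming the 4x conv subsampling front-end (`subsampling_rate=4`, `right_context=6` are the wenet-style `Conv2dSubsampling4` constants, an assumption here):

```python
subsampling, right_context = 4, 6
decoding_chunk_size = 16
context = right_context + 1                    # 7 input frames feed one output step
stride = subsampling * decoding_chunk_size     # 64: hop between chunk starts
decoding_window = (decoding_chunk_size - 1) * subsampling + context  # 67 frames per chunk
# each call to forward_chunk sees 67 input frames and advances the input by 64
```
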
@ -330,7 +336,7 @@ class TransformerEncoder(BaseEncoder):
concat_after: bool=False,
static_chunk_size: int=0,
use_dynamic_chunk: bool=False,
global_cmvn: torch.nn.Module=None,
global_cmvn: nn.Layer=None,
use_dynamic_left_chunk: bool=False, ):
""" Construct TransformerEncoder
See Encoder for the meaning of each parameter.
@ -342,14 +348,16 @@ class TransformerEncoder(BaseEncoder):
pos_enc_layer_type, normalize_before, concat_after,
static_chunk_size, use_dynamic_chunk, global_cmvn,
use_dynamic_left_chunk)
self.encoders = torch.nn.ModuleList([
self.encoders = nn.ModuleList([
TransformerEncoderLayer(
output_size,
MultiHeadedAttention(attention_heads, output_size,
size=output_size,
self_attn=MultiHeadedAttention(attention_heads, output_size,
attention_dropout_rate),
PositionwiseFeedForward(output_size, linear_units,
dropout_rate), dropout_rate,
normalize_before, concat_after) for _ in range(num_blocks)
feed_forward=PositionwiseFeedForward(output_size, linear_units,
dropout_rate),
dropout_rate=dropout_rate,
normalize_before=normalize_before,
concat_after=concat_after) for _ in range(num_blocks)
])
@ -396,6 +404,7 @@ class ConformerEncoder(BaseEncoder):
use_cnn_module (bool): Whether to use convolution module.
cnn_module_kernel (int): Kernel size of convolution module.
causal (bool): whether to use causal convolution or not.
cnn_module_norm (str): norm type of the conv module, one of ['batch_norm', 'layer_norm']
"""
assert check_argument_types()
super().__init__(input_size, output_size, attention_heads, linear_units,
@ -409,26 +418,26 @@ class ConformerEncoder(BaseEncoder):
# self-attention module definition
encoder_selfattn_layer = RelPositionMultiHeadedAttention
encoder_selfattn_layer_args = (attention_heads, output_size,
attention_dropout_rate, )
attention_dropout_rate)
# feed-forward module definition
positionwise_layer = PositionwiseFeedForward
positionwise_layer_args = (output_size, linear_units, dropout_rate,
activation, )
activation)
# convolution module definition
convolution_layer = ConvolutionModule
convolution_layer_args = (output_size, cnn_module_kernel, activation,
cnn_module_norm, causal)
self.encoders = torch.nn.ModuleList([
self.encoders = nn.ModuleList([
ConformerEncoderLayer(
output_size,
encoder_selfattn_layer(*encoder_selfattn_layer_args),
positionwise_layer(*positionwise_layer_args),
positionwise_layer(*positionwise_layer_args)
if macaron_style else None,
convolution_layer(*convolution_layer_args)
size=output_size,
self_attn=encoder_selfattn_layer(*encoder_selfattn_layer_args),
feed_forward=positionwise_layer(*positionwise_layer_args),
feed_forward_macaron=positionwise_layer(
*positionwise_layer_args) if macaron_style else None,
conv_module=convolution_layer(*convolution_layer_args)
if use_cnn_module else None,
dropout_rate,
normalize_before,
concat_after, ) for _ in range(num_blocks)
dropout_rate=dropout_rate,
normalize_before=normalize_before,
concat_after=concat_after) for _ in range(num_blocks)
])

@ -72,6 +72,7 @@ class TransformerEncoderLayer(nn.Layer):
x: paddle.Tensor,
mask: paddle.Tensor,
pos_emb: paddle.Tensor,
mask_pad: Optional[paddle.Tensor]=None,
output_cache: Optional[paddle.Tensor]=None,
cnn_cache: Optional[paddle.Tensor]=None,
) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]:
@ -81,6 +82,8 @@ class TransformerEncoderLayer(nn.Layer):
mask (paddle.Tensor): Mask tensor for the input (#batch, time).
pos_emb (paddle.Tensor): just for interface compatibility
to ConformerEncoderLayer
mask_pad (paddle.Tensor): not used in the transformer layer;
kept for a unified api with the conformer layer.
output_cache (paddle.Tensor): Cache tensor of the output
(#batch, time2, size), time2 < time in x.
cnn_cache (paddle.Tensor): not used here, it's for interface
@ -88,6 +91,7 @@ class TransformerEncoderLayer(nn.Layer):
Returns:
paddle.Tensor: Output tensor (#batch, time, size).
paddle.Tensor: Mask tensor (#batch, time).
paddle.Tensor: Fake cnn cache tensor for api compatibility with Conformer (#batch, channels, time').
"""
residual = x
if self.normalize_before:
@ -202,12 +206,13 @@ class ConformerEncoderLayer(nn.Layer):
pos_emb (paddle.Tensor): positional encoding, must not be None
for ConformerEncoderLayer.
mask_pad (paddle.Tensor): batch padding mask used for conv module, (B, 1, T).
output_cache (paddle.Tensor): Cache tensor of the output
output_cache (paddle.Tensor): Cache tensor of the encoder output
(#batch, time2, size), time2 < time in x.
cnn_cache (paddle.Tensor): Convolution cache in conformer layer
Returns:
paddle.Tensor: Output tensor (#batch, time, size).
paddle.Tensor: Mask tensor (#batch, time).
paddle.Tensor: New cnn cache tensor (#batch, channels, time').
"""
# whether to use macaron style FFN
if self.feed_forward_macaron is not None:

@ -62,7 +62,7 @@ def subsequent_mask(
Args:
size (int): size of mask
Returns:
paddle.Tensor: mask
paddle.Tensor: mask, [size, size]
Examples:
>>> subsequent_mask(3)
[[1, 0, 0],
@ -86,7 +86,7 @@ def subsequent_chunk_mask(
<0: use full chunk
>=0: use num_left_chunks
Returns:
paddle.Tensor: mask
paddle.Tensor: mask, [size, size]
Examples:
>>> subsequent_chunk_mask(4, 2)
[[1, 1, 0, 0],
@ -99,8 +99,8 @@ def subsequent_chunk_mask(
if num_left_chunks < 0:
start = 0
else:
start = max((i // chunk_size - num_left_chunks) * chunk_size, 0)
ending = min((i // chunk_size + 1) * chunk_size, size)
start = max(0, (i // chunk_size - num_left_chunks) * chunk_size)
ending = min(size, (i // chunk_size + 1) * chunk_size)
ret[i, start:ending] = True
return ret

@ -41,7 +41,7 @@ class RNNCell(nn.RNNCellBase):
"""
def __init__(self,
hidden_size,
hidden_size: int,
activation="tanh",
weight_ih_attr=None,
weight_hh_attr=None,
@ -108,8 +108,8 @@ class GRUCell(nn.RNNCellBase):
"""
def __init__(self,
input_size,
hidden_size,
input_size: int,
hidden_size: int,
weight_ih_attr=None,
weight_hh_attr=None,
bias_ih_attr=None,
@ -132,7 +132,6 @@ class GRUCell(nn.RNNCellBase):
self.input_size = input_size
self._gate_activation = F.sigmoid
self._activation = paddle.tanh
#self._activation = F.relu
def forward(self, inputs, states=None):
if states is None:
@ -171,8 +170,6 @@ class BiRNNWithBN(nn.Layer):
"""Bidirectonal simple rnn layer with sequence-wise batch normalization.
The batch normalization is only performed on input-state weights.
:param name: Name of the layer parameters.
:type name: string
:param size: Dimension of RNN cells.
:type size: int
:param share_weights: Whether to share input-hidden weights between
@ -182,7 +179,7 @@ class BiRNNWithBN(nn.Layer):
:rtype: Variable
"""
def __init__(self, i_size, h_size, share_weights):
def __init__(self, i_size: int, h_size: int, share_weights: bool):
super().__init__()
self.share_weights = share_weights
if self.share_weights:
@ -208,7 +205,7 @@ class BiRNNWithBN(nn.Layer):
self.bw_rnn = nn.RNN(
self.fw_cell, is_reverse=True, time_major=False) #[B, T, D]
def forward(self, x, x_len):
def forward(self, x: paddle.Tensor, x_len: paddle.Tensor):
# x, shape [B, T, D]
fw_x = self.fw_bn(self.fw_fc(x))
bw_x = self.bw_bn(self.bw_fc(x))
@ -234,7 +231,7 @@ class BiGRUWithBN(nn.Layer):
:rtype: Variable
"""
def __init__(self, i_size, h_size, act):
def __init__(self, i_size: int, h_size: int):
super().__init__()
hidden_size = h_size * 3
@ -281,23 +278,29 @@ class RNNStack(nn.Layer):
:rtype: Variable
"""
def __init__(self, i_size, h_size, num_stacks, use_gru, share_rnn_weights):
def __init__(self,
i_size: int,
h_size: int,
num_stacks: int,
use_gru: bool,
share_rnn_weights: bool):
super().__init__()
self.rnn_stacks = nn.LayerList()
rnn_stacks = []
for i in range(num_stacks):
if use_gru:
#default:GRU using tanh
self.rnn_stacks.append(
BiGRUWithBN(i_size=i_size, h_size=h_size, act="relu"))
rnn_stacks.append(BiGRUWithBN(i_size=i_size, h_size=h_size))
else:
self.rnn_stacks.append(
rnn_stacks.append(
BiRNNWithBN(
i_size=i_size,
h_size=h_size,
share_weights=share_rnn_weights))
i_size = h_size * 2
def forward(self, x, x_len):
self.rnn_stacks = nn.Sequential(*rnn_stacks)
def forward(self, x: paddle.Tensor, x_len: paddle.Tensor):
"""
x: shape [B, T, D]
x_len: shape [B]

@ -32,10 +32,12 @@ __all__ = [
class BaseSubsampling(nn.Layer):
def __init__(self, pos_enc_class: PositionalEncoding):
def __init__(self, pos_enc_class: nn.Layer=PositionalEncoding):
super().__init__()
self.pos_enc = pos_enc_class
# window size = (1 + right_context) + (chunk_size -1) * subsampling_rate
self.right_context = 0
# stride = chunk_size * subsampling_rate
self.subsampling_rate = 1
def position_encoding(self, offset: int, size: int) -> paddle.Tensor:
@ -49,7 +51,7 @@ class LinearNoSubsampling(BaseSubsampling):
idim: int,
odim: int,
dropout_rate: float,
pos_enc_class: PositionalEncoding):
pos_enc_class: nn.Layer=PositionalEncoding):
"""Construct an linear object.
Args:
idim (int): Input dimension.
@ -71,6 +73,7 @@ class LinearNoSubsampling(BaseSubsampling):
Args:
x (paddle.Tensor): Input tensor (#batch, time, idim).
x_mask (paddle.Tensor): Input mask (#batch, 1, time).
offset (int): position encoding offset.
Returns:
paddle.Tensor: linear input tensor (#batch, time', odim),
where time' = time.
@ -90,7 +93,7 @@ class Conv2dSubsampling4(BaseSubsampling):
idim: int,
odim: int,
dropout_rate: float,
pos_enc_class: PositionalEncoding):
pos_enc_class: nn.Layer=PositionalEncoding):
"""Construct an Conv2dSubsampling4 object.
Args:
@ -117,6 +120,7 @@ class Conv2dSubsampling4(BaseSubsampling):
Args:
x (paddle.Tensor): Input tensor (#batch, time, idim).
x_mask (paddle.Tensor): Input mask (#batch, 1, time).
offset (int): position encoding offset.
Returns:
paddle.Tensor: Subsampled tensor (#batch, time', odim),
where time' = time // 4.
@ -139,7 +143,7 @@ class Conv2dSubsampling6(BaseSubsampling):
idim: int,
odim: int,
dropout_rate: float,
pos_enc_class: PositionalEncoding):
pos_enc_class: nn.Layer=PositionalEncoding):
"""Construct an Conv2dSubsampling6 object.
Args:
@ -169,6 +173,7 @@ class Conv2dSubsampling6(BaseSubsampling):
Args:
x (paddle.Tensor): Input tensor (#batch, time, idim).
x_mask (paddle.Tensor): Input mask (#batch, 1, time).
offset (int): position encoding offset.
Returns:
paddle.Tensor: Subsampled tensor (#batch, time', odim),
where time' = time // 6.
@ -191,7 +196,7 @@ class Conv2dSubsampling8(BaseSubsampling):
idim: int,
odim: int,
dropout_rate: float,
pos_enc_class: PositionalEncoding):
pos_enc_class: nn.Layer=PositionalEncoding):
"""Construct an Conv2dSubsampling8 object.
Args:
@ -221,6 +226,7 @@ class Conv2dSubsampling8(BaseSubsampling):
Args:
x (paddle.Tensor): Input tensor (#batch, time, idim).
x_mask (paddle.Tensor): Input mask (#batch, 1, time).
offset (int): position encoding offset.
Returns:
paddle.Tensor: Subsampled tensor (#batch, time', odim),
where time' = time // 8.

@ -43,7 +43,7 @@ bash setup.sh
source tools/venv/bin/activate
```
## Running in Docker Container
## Running in Docker Container (optional)
Docker is an open source tool to build, ship, and run distributed applications in an isolated environment. A Docker image for this project has been provided in [hub.docker.com](https://hub.docker.com) with all the dependencies installed, including the pre-built PaddlePaddle, CTC decoders, and other necessary Python and third-party packages. This Docker image requires NVIDIA GPU support, so please make sure it is available and that [nvidia-docker](https://github.com/NVIDIA/nvidia-docker) has been installed.

@ -0,0 +1,3 @@
# Reference
* [wenet](https://github.com/mobvoi/wenet)