diff --git a/.notebook/jit_infer.ipynb b/.notebook/jit_infer.ipynb index 6c48bb407..397c59603 100644 --- a/.notebook/jit_infer.ipynb +++ b/.notebook/jit_infer.ipynb @@ -37,26 +37,26 @@ "name": "stderr", "output_type": "stream", "text": [ - "2021-03-24 06:30:47,727 - WARNING - register user softmax to paddle, remove this when fixed!\n", - "2021-03-24 06:30:47,728 - WARNING - register user sigmoid to paddle, remove this when fixed!\n", - "2021-03-24 06:30:47,729 - WARNING - register user relu to paddle, remove this when fixed!\n", - "2021-03-24 06:30:47,729 - WARNING - override cat of paddle if exists or register, remove this when fixed!\n", - "2021-03-24 06:30:47,730 - WARNING - override eq of paddle.Tensor if exists or register, remove this when fixed!\n", - "2021-03-24 06:30:47,731 - WARNING - override contiguous of paddle.Tensor if exists or register, remove this when fixed!\n", - "2021-03-24 06:30:47,731 - WARNING - override size of paddle.Tensor (`to_static` do not process `size` property, maybe some `paddle` api dependent on it), remove this when fixed!\n", - "2021-03-24 06:30:47,732 - WARNING - register user view to paddle.Tensor, remove this when fixed!\n", - "2021-03-24 06:30:47,732 - WARNING - register user view_as to paddle.Tensor, remove this when fixed!\n", - "2021-03-24 06:30:47,733 - WARNING - register user masked_fill to paddle.Tensor, remove this when fixed!\n", - "2021-03-24 06:30:47,733 - WARNING - register user masked_fill_ to paddle.Tensor, remove this when fixed!\n", - "2021-03-24 06:30:47,734 - WARNING - register user fill_ to paddle.Tensor, remove this when fixed!\n", - "2021-03-24 06:30:47,734 - WARNING - register user repeat to paddle.Tensor, remove this when fixed!\n", - "2021-03-24 06:30:47,735 - WARNING - register user softmax to paddle.Tensor, remove this when fixed!\n", - "2021-03-24 06:30:47,735 - WARNING - register user sigmoid to paddle.Tensor, remove this when fixed!\n", - "2021-03-24 06:30:47,736 - WARNING - register user relu to 
paddle.Tensor, remove this when fixed!\n", - "2021-03-24 06:30:47,736 - WARNING - register user glu to paddle.nn.functional, remove this when fixed!\n", - "2021-03-24 06:30:47,737 - WARNING - override ctc_loss of paddle.nn.functional if exists, remove this when fixed!\n", - "2021-03-24 06:30:47,737 - WARNING - register user GLU to paddle.nn, remove this when fixed!\n", - "2021-03-24 06:30:47,738 - WARNING - register user ConstantPad2d to paddle.nn, remove this when fixed!\n", + "2021-03-26 02:55:23,873 - WARNING - register user softmax to paddle, remove this when fixed!\n", + "2021-03-26 02:55:23,875 - WARNING - register user sigmoid to paddle, remove this when fixed!\n", + "2021-03-26 02:55:23,875 - WARNING - register user relu to paddle, remove this when fixed!\n", + "2021-03-26 02:55:23,876 - WARNING - override cat of paddle if exists or register, remove this when fixed!\n", + "2021-03-26 02:55:23,876 - WARNING - override eq of paddle.Tensor if exists or register, remove this when fixed!\n", + "2021-03-26 02:55:23,877 - WARNING - override contiguous of paddle.Tensor if exists or register, remove this when fixed!\n", + "2021-03-26 02:55:23,877 - WARNING - override size of paddle.Tensor (`to_static` do not process `size` property, maybe some `paddle` api dependent on it), remove this when fixed!\n", + "2021-03-26 02:55:23,878 - WARNING - register user view to paddle.Tensor, remove this when fixed!\n", + "2021-03-26 02:55:23,878 - WARNING - register user view_as to paddle.Tensor, remove this when fixed!\n", + "2021-03-26 02:55:23,879 - WARNING - register user masked_fill to paddle.Tensor, remove this when fixed!\n", + "2021-03-26 02:55:23,880 - WARNING - register user masked_fill_ to paddle.Tensor, remove this when fixed!\n", + "2021-03-26 02:55:23,880 - WARNING - register user fill_ to paddle.Tensor, remove this when fixed!\n", + "2021-03-26 02:55:23,881 - WARNING - register user repeat to paddle.Tensor, remove this when fixed!\n", + "2021-03-26 02:55:23,881 - 
WARNING - register user softmax to paddle.Tensor, remove this when fixed!\n", + "2021-03-26 02:55:23,882 - WARNING - register user sigmoid to paddle.Tensor, remove this when fixed!\n", + "2021-03-26 02:55:23,882 - WARNING - register user relu to paddle.Tensor, remove this when fixed!\n", + "2021-03-26 02:55:23,883 - WARNING - register user glu to paddle.nn.functional, remove this when fixed!\n", + "2021-03-26 02:55:23,883 - WARNING - override ctc_loss of paddle.nn.functional if exists, remove this when fixed!\n", + "2021-03-26 02:55:23,884 - WARNING - register user GLU to paddle.nn, remove this when fixed!\n", + "2021-03-26 02:55:23,884 - WARNING - register user ConstantPad2d to paddle.nn, remove this when fixed!\n", "/home/ssd5/zhanghui/DeepSpeech2.x/tools/venv-dev/lib/python3.7/site-packages/scipy/fftpack/__init__.py:103: DeprecationWarning: The module numpy.dual is deprecated. Instead of using dual, use the functions directly from numpy or scipy.\n", " from numpy.dual import register_func\n", "/home/ssd5/zhanghui/DeepSpeech2.x/tools/venv-dev/lib/python3.7/site-packages/scipy/special/orthogonal.py:81: DeprecationWarning: `np.int` is a deprecated alias for the builtin `int`. To silence this warning, use `int` by itself. Doing this will not modify any behavior and is safe. When replacing `np.int`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. 
If you wish to review your current use, check the release note link for additional information.\n", @@ -102,10 +102,10 @@ "output_type": "stream", "text": [ "0.0.0\n", - "607856a949ed7356237ed8148947f7fd2b0f4631\n", - "ON\n", - "ON\n", - "commit: 607856a949ed7356237ed8148947f7fd2b0f4631\n", + "e7f28d6c0db54eb9c9a810612300b526687e56a6\n", + "OFF\n", + "OFF\n", + "commit: e7f28d6c0db54eb9c9a810612300b526687e56a6\n", "None\n", "0\n" ] @@ -226,7 +226,7 @@ "output: None\n", "params_file: examples/aishell/jit.model.pdiparams\n", "speech_save_dir: demo_cache\n", - "use_gpu: True\n", + "use_gpu: False\n", "warmup_manifest: examples/aishell/data/manifest.test\n", "------------------------------------------------\n" ] @@ -266,7 +266,7 @@ " help=\n", " \"Model dir, If you load a non-combined model, specify the directory of the model.\"\n", ")\n", - "add_arg(\"--use_gpu\",type=bool,default=True, help=\"Whether use gpu.\")\n", + "add_arg(\"--use_gpu\",type=bool,default=False, help=\"Whether use gpu.\")\n", "\n", "\n", "args = parser.parse_args(\n", @@ -321,7 +321,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "2021-03-24 06:31:20,943 - INFO - [checkpoint] Rank 0: loaded model from examples/aishell/ckpt-loss2e-3-0.83-5/checkpoints/step-11725.pdparams\n" + "2021-03-26 02:55:57,930 - INFO - [checkpoint] Rank 0: loaded model from examples/aishell/ckpt-loss2e-3-0.83-5/checkpoints/step-11725.pdparams\n" ] }, { @@ -407,7 +407,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 22, "metadata": {}, "outputs": [ { @@ -418,7 +418,7 @@ "examples/aishell/jit.model.pdmodel\n", "examples/aishell/jit.model.pdiparams\n", "0\n", - "True\n" + "False\n" ] } ], @@ -428,7 +428,8 @@ "from paddle.inference import PrecisionType\n", "from paddle.inference import create_predictor\n", "\n", - "args.use_gpu=True\n", + "args.use_gpu=False\n", + "paddle.set_device('cpu')\n", "\n", "def init_predictor(args):\n", " if args.model_dir is not None:\n", @@ -438,8 +439,8 @@ 
"\n", " if args.use_gpu:\n", " config.enable_use_gpu(memory_pool_init_size_mb=1000, device_id=0)\n", - " config.enable_tensorrt_engine(precision_mode=PrecisionType.Float32,\n", - " use_calib_mode=True) # 开启TensorRT预测,精度为fp32,开启int8离线量化\n", + "# config.enable_tensorrt_engine(precision_mode=PrecisionType.Float32,\n", + "# use_calib_mode=True) # 开启TensorRT预测,精度为fp32,开启int8离线量化\n", " else:\n", " # If not specific mkldnn, you can set the blas thread.\n", " # The thread num should not be greater than the number of cores in the CPU.\n", @@ -447,7 +448,7 @@ " config.enable_mkldnn()\n", " \n", " config.enable_memory_optim()\n", - " config.switch_ir_optim(False)\n", + " config.switch_ir_optim(True)\n", " \n", " print(config.model_dir())\n", " print(config.prog_file())\n", @@ -534,7 +535,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 23, "metadata": {}, "outputs": [ { @@ -546,49 +547,35 @@ "input: 0 audio\n", "input: 1 audio_len\n", "output: 0 tmp_75\n", - "jit: [[[8.91791242e-12 4.45650548e-12 3.67574104e-09 ... 8.91772593e-12\n", - " 8.91578738e-12 4.64319072e-08]\n", - " [1.55952011e-15 2.62797088e-14 4.50428670e-12 ... 1.55946061e-15\n", - " 1.55893121e-15 9.99992609e-01]\n", - " [1.24638590e-17 7.61802427e-16 2.93266930e-14 ... 1.24633842e-17\n", - " 1.24587735e-17 1.00000000e+00]\n", + "jit: [[[8.91786298e-12 4.45648032e-12 3.67572750e-09 ... 8.91767563e-12\n", + " 8.91573707e-12 4.64317296e-08]\n", + " [1.55950222e-15 2.62794089e-14 4.50423509e-12 ... 1.55944271e-15\n", + " 1.55891342e-15 9.99992609e-01]\n", + " [1.24638127e-17 7.61802427e-16 2.93265812e-14 ... 1.24633371e-17\n", + " 1.24587264e-17 1.00000000e+00]\n", " ...\n", - " [4.37491543e-15 2.43678580e-12 1.98772032e-12 ... 4.37483242e-15\n", - " 4.37358093e-15 1.00000000e+00]\n", - " [3.89338410e-13 1.66756747e-11 1.42901749e-11 ... 3.89333233e-13\n", - " 3.89255983e-13 1.00000000e+00]\n", - " [1.00350561e-10 2.56295180e-10 2.91178692e-10 ... 
1.00348452e-10\n", - " 1.00334671e-10 9.99998808e-01]]] \n", + " [4.37488240e-15 2.43676260e-12 1.98770514e-12 ... 4.37479896e-15\n", + " 4.37354747e-15 1.00000000e+00]\n", + " [3.89334696e-13 1.66754856e-11 1.42900388e-11 ... 3.89329492e-13\n", + " 3.89252270e-13 1.00000000e+00]\n", + " [1.00349985e-10 2.56293708e-10 2.91177582e-10 ... 1.00347876e-10\n", + " 1.00334095e-10 9.99998808e-01]]] \n", "[1, 161, 522]\n", - "[1]\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/ssd5/zhanghui/DeepSpeech2.x/tools/venv-dev/lib/python3.7/site-packages/paddlepaddle_gpu-0.0.0-py3.7-linux-x86_64.egg/paddle/fluid/layers/utils.py:77: DeprecationWarning: Using or importing the ABCs from 'collections' instead of from 'collections.abc' is deprecated, and in 3.8 it will stop working\n", - " return (isinstance(seq, collections.Sequence) and\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "paddle: [[[8.91791242e-12 4.45650548e-12 3.67574104e-09 ... 8.91772593e-12\n", - " 8.91578738e-12 4.64319072e-08]\n", - " [1.55952011e-15 2.62797088e-14 4.50428670e-12 ... 1.55946061e-15\n", - " 1.55893121e-15 9.99992609e-01]\n", - " [1.24638590e-17 7.61802427e-16 2.93266930e-14 ... 1.24633842e-17\n", + "[1]\n", + "paddle: [[[8.91789680e-12 4.45649724e-12 3.67574149e-09 ... 8.91770945e-12\n", + " 8.91577090e-12 4.64319072e-08]\n", + " [1.55950222e-15 2.62794089e-14 4.50423509e-12 ... 1.55944271e-15\n", + " 1.55891342e-15 9.99992609e-01]\n", + " [1.24638599e-17 7.61805339e-16 2.93267472e-14 ... 1.24633842e-17\n", " 1.24587735e-17 1.00000000e+00]\n", " ...\n", - " [4.37491543e-15 2.43678580e-12 1.98772032e-12 ... 4.37483242e-15\n", - " 4.37358093e-15 1.00000000e+00]\n", - " [3.89338410e-13 1.66756747e-11 1.42901749e-11 ... 3.89333233e-13\n", - " 3.89255983e-13 1.00000000e+00]\n", - " [1.00350561e-10 2.56295180e-10 2.91178692e-10 ... 
1.00348452e-10\n", - " 1.00334671e-10 9.99998808e-01]]]\n", - "True\n" + " [4.37488240e-15 2.43676737e-12 1.98770514e-12 ... 4.37479896e-15\n", + " 4.37354747e-15 1.00000000e+00]\n", + " [3.89336187e-13 1.66755481e-11 1.42900925e-11 ... 3.89330983e-13\n", + " 3.89253761e-13 1.00000000e+00]\n", + " [1.00349985e-10 2.56293708e-10 2.91177582e-10 ... 1.00347876e-10\n", + " 1.00334095e-10 9.99998808e-01]]]\n", + "False\n" ] } ], @@ -607,7 +594,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 21, "metadata": {}, "outputs": [ { @@ -618,19 +605,19 @@ "input: 0 audio\n", "input: 1 audio_len\n", "output: 0 tmp_75\n", - "jit: [[[8.91791242e-12 4.45650548e-12 3.67574104e-09 ... 8.91772593e-12\n", - " 8.91578738e-12 4.64319072e-08]\n", - " [1.55952011e-15 2.62797088e-14 4.50428670e-12 ... 1.55946061e-15\n", - " 1.55893121e-15 9.99992609e-01]\n", - " [1.24638590e-17 7.61802427e-16 2.93266930e-14 ... 1.24633842e-17\n", + "jit: [[[8.91789680e-12 4.45649724e-12 3.67574149e-09 ... 8.91770945e-12\n", + " 8.91577090e-12 4.64319072e-08]\n", + " [1.55950222e-15 2.62794089e-14 4.50423509e-12 ... 1.55944271e-15\n", + " 1.55891342e-15 9.99992609e-01]\n", + " [1.24638599e-17 7.61805339e-16 2.93267472e-14 ... 1.24633842e-17\n", " 1.24587735e-17 1.00000000e+00]\n", " ...\n", - " [4.37491543e-15 2.43678580e-12 1.98772032e-12 ... 4.37483242e-15\n", - " 4.37358093e-15 1.00000000e+00]\n", - " [3.89338410e-13 1.66756747e-11 1.42901749e-11 ... 3.89333233e-13\n", - " 3.89255983e-13 1.00000000e+00]\n", - " [1.00350561e-10 2.56295180e-10 2.91178692e-10 ... 1.00348452e-10\n", - " 1.00334671e-10 9.99998808e-01]]]\n" + " [4.37488240e-15 2.43676737e-12 1.98770514e-12 ... 4.37479896e-15\n", + " 4.37354747e-15 1.00000000e+00]\n", + " [3.89336187e-13 1.66755481e-11 1.42900925e-11 ... 3.89330983e-13\n", + " 3.89253761e-13 1.00000000e+00]\n", + " [1.00349985e-10 2.56293708e-10 2.91177582e-10 ... 
1.00347876e-10\n", + " 1.00334095e-10 9.99998808e-01]]]\n" ] } ], diff --git a/.notebook/train_test.ipynb b/.notebook/train_test.ipynb index bedad6e11..b2e454395 100644 --- a/.notebook/train_test.ipynb +++ b/.notebook/train_test.ipynb @@ -454,7 +454,7 @@ " act='brelu')\n", "\n", " out_channel = 32\n", - " self.conv_stack = nn.LayerList([\n", + " self.conv_stack = nn.Sequential([\n", " ConvBn(\n", " num_channels_in=32,\n", " num_channels_out=out_channel,\n", @@ -1884,4 +1884,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} +} \ No newline at end of file diff --git a/README.md b/README.md index 98d523890..8757ef631 100644 --- a/README.md +++ b/README.md @@ -59,3 +59,6 @@ You are welcome to submit questions and bug reports in [Github Issues](https://g ## License DeepSpeech is provided under the [Apache-2.0 License](./LICENSE). + +## Acknowledgement +We depend on many open source repos. See [References](docs/reference.md) for more information. diff --git a/README_cn.md b/README_cn.md index 713e16ebd..e90489689 100644 --- a/README_cn.md +++ b/README_cn.md @@ -56,3 +56,6 @@ source tools/venv/bin/activate ## License DeepSpeech遵循[Apache-2.0开源协议](./LICENSE)。 + +## 感谢 +开发中参考一些优秀的仓库,详情参见 [References](docs/reference.md)。 diff --git a/deepspeech/__init__.py b/deepspeech/__init__.py index 563746f41..3e96c40f1 100644 --- a/deepspeech/__init__.py +++ b/deepspeech/__init__.py @@ -266,8 +266,17 @@ logger.warn( ) F.ctc_loss = ctc_loss - ########### hcak paddle.nn ############# +if not hasattr(paddle.nn, 'Module'): + logger.warn("register user Module to paddle.nn, remove this when fixed!") + setattr(paddle.nn, 'Module', paddle.nn.Layer) + +if not hasattr(paddle.nn, 'ModuleList'): + logger.warn( + "register user ModuleList to paddle.nn, remove this when fixed!") + setattr(paddle.nn, 'ModuleList', paddle.nn.LayerList) + + class GLU(nn.Layer): """Gated Linear Units (GLU) Layer""" diff --git a/deepspeech/modules/activation.py b/deepspeech/modules/activation.py index 7769a7855..60be811e0
100644 --- a/deepspeech/modules/activation.py +++ b/deepspeech/modules/activation.py @@ -143,7 +143,8 @@ def get_activation(act): "relu": paddle.nn.ReLU, "selu": paddle.nn.SELU, "swish": paddle.nn.Swish, - "gelu": paddle.nn.GELU + "gelu": paddle.nn.GELU, + "brelu": brelu, } return activation_funcs[act]() diff --git a/deepspeech/modules/embedding.py b/deepspeech/modules/embedding.py index be2103292..efefd75ac 100644 --- a/deepspeech/modules/embedding.py +++ b/deepspeech/modules/embedding.py @@ -51,7 +51,7 @@ class PositionalEncoding(nn.Layer): self.pe = paddle.zeros(self.max_len, self.d_model) #[T,D] position = paddle.arange( - 0, self.max_len, dtype=paddle.float32).unsqueeze(1) + 0, self.max_len, dtype=paddle.float32).unsqueeze(1) #[T, 1] div_term = paddle.exp( paddle.arange(0, self.d_model, 2, dtype=paddle.float32) * -(math.log(10000.0) / self.d_model)) @@ -68,7 +68,7 @@ class PositionalEncoding(nn.Layer): offset (int): position offset Returns: paddle.Tensor: Encoded tensor. Its shape is (batch, time, ...) - paddle.Tensor: for compatibility to RelPositionalEncoding + paddle.Tensor: for compatibility to RelPositionalEncoding, (batch=1, time, ...) 
""" T = paddle.shape(x)[1] assert offset + T < self.max_len diff --git a/deepspeech/modules/encoder.py b/deepspeech/modules/encoder.py index 0a20fe299..9a4017fec 100644 --- a/deepspeech/modules/encoder.py +++ b/deepspeech/modules/encoder.py @@ -59,16 +59,16 @@ class BaseEncoder(nn.Layer): concat_after: bool=False, static_chunk_size: int=0, use_dynamic_chunk: bool=False, - global_cmvn: torch.nn.Module=None, + global_cmvn: paddle.nn.Layer=None, use_dynamic_left_chunk: bool=False, ): """ Args: - input_size (int): input dim - output_size (int): dimension of attention + input_size (int): input dim, d_feature + output_size (int): dimension of attention, d_model attention_heads (int): the number of heads of multi head attention linear_units (int): the hidden units number of position-wise feed forward - num_blocks (int): the number of decoder blocks + num_blocks (int): the number of encoder blocks dropout_rate (float): dropout rate attention_dropout_rate (float): dropout rate in attention positional_dropout_rate (float): dropout rate after adding @@ -89,7 +89,7 @@ class BaseEncoder(nn.Layer): use_dynamic_chunk (bool): whether use dynamic chunk size for training or not, You can only use fixed chunk(chunk_size > 0) or dyanmic chunk size(use_dynamic_chunk = True) - global_cmvn (Optional[torch.nn.Module]): Optional GlobalCMVN module + global_cmvn (Optional[paddle.nn.Layer]): Optional GlobalCMVN layer use_dynamic_left_chunk (bool): whether use dynamic left chunk in dynamic chunk training """ @@ -117,13 +117,14 @@ class BaseEncoder(nn.Layer): self.global_cmvn = global_cmvn self.embed = subsampling_class( - input_size, - output_size, - dropout_rate, - pos_enc_class(output_size, positional_dropout_rate), ) + idim=input_size, + odim=output_size, + dropout_rate=dropout_rate, + pos_enc_class=pos_enc_class( + d_model=output_size, dropout_rate=positional_dropout_rate), ) self.normalize_before = normalize_before - self.after_norm = torch.nn.LayerNorm(output_size, eps=1e-12) + 
self.after_norm = nn.LayerNorm(output_size, epsilon=1e-12) self.static_chunk_size = static_chunk_size self.use_dynamic_chunk = use_dynamic_chunk self.use_dynamic_left_chunk = use_dynamic_left_chunk @@ -133,11 +134,11 @@ class BaseEncoder(nn.Layer): def forward( self, - xs: torch.Tensor, - xs_lens: torch.Tensor, + xs: paddle.Tensor, + xs_lens: paddle.Tensor, decoding_chunk_size: int=0, num_decoding_left_chunks: int=-1, - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> Tuple[paddle.Tensor, paddle.Tensor]: """Embed positions in tensor. Args: xs: padded input tensor (B, L, D) @@ -147,16 +148,16 @@ class BaseEncoder(nn.Layer): <0: for decoding, use full chunk. >0: for decoding, use fixed chunk size as set. num_decoding_left_chunks: number of left chunks, this is for decoding, - the chunk size is decoding_chunk_size. + the chunk size is decoding_chunk_size. >=0: use num_decoding_left_chunks <0: use all left chunks Returns: encoder output tensor, lens and mask """ - masks = ~make_pad_mask(xs_lens).unsqueeze(1) # (B, 1, L) + masks = make_non_pad_mask(xs_lens).unsqueeze(1) # (B, 1, L) if self.global_cmvn is not None: xs = self.global_cmvn(xs) - xs, pos_emb, masks = self.embed(xs, masks) + xs, pos_emb, masks = self.embed(xs, masks, offset=0) mask_pad = ~masks chunk_masks = add_optional_chunk_mask( xs, masks, self.use_dynamic_chunk, self.use_dynamic_left_chunk, @@ -173,48 +174,52 @@ class BaseEncoder(nn.Layer): def forward_chunk( self, - xs: torch.Tensor, + xs: paddle.Tensor, offset: int, required_cache_size: int, - subsampling_cache: Optional[torch.Tensor]=None, - elayers_output_cache: Optional[List[torch.Tensor]]=None, - conformer_cnn_cache: Optional[List[torch.Tensor]]=None, - ) -> Tuple[torch.Tensor, torch.Tensor, List[torch.Tensor], List[ - torch.Tensor]]: + subsampling_cache: Optional[paddle.Tensor]=None, + elayers_output_cache: Optional[List[paddle.Tensor]]=None, + conformer_cnn_cache: Optional[List[paddle.Tensor]]=None, + ) -> Tuple[paddle.Tensor, paddle.Tensor, 
List[paddle.Tensor], List[ + paddle.Tensor]]: """ Forward just one chunk Args: - xs (torch.Tensor): chunk input + xs (paddle.Tensor): chunk input, [B=1, T, D] offset (int): current offset in encoder output time stamp required_cache_size (int): cache size required for next chunk compuation >=0: actual cache size <0: means all history cache is required - subsampling_cache (Optional[torch.Tensor]): subsampling cache - elayers_output_cache (Optional[List[torch.Tensor]]): + subsampling_cache (Optional[paddle.Tensor]): subsampling cache + elayers_output_cache (Optional[List[paddle.Tensor]]): transformer/conformer encoder layers output cache - conformer_cnn_cache (Optional[List[torch.Tensor]]): conformer + conformer_cnn_cache (Optional[List[paddle.Tensor]]): conformer cnn cache Returns: - torch.Tensor: output of current input xs - torch.Tensor: subsampling cache required for next chunk computation - List[torch.Tensor]: encoder layers output cache required for next + paddle.Tensor: output of current input xs + paddle.Tensor: subsampling cache required for next chunk computation + List[paddle.Tensor]: encoder layers output cache required for next chunk computation - List[torch.Tensor]: conformer cnn cache + List[paddle.Tensor]: conformer cnn cache """ - assert xs.size(0) == 1 + assert xs.size(0) == 1 # batch size must be one # tmp_masks is just for interface compatibility - tmp_masks = torch.ones( - 1, xs.size(1), device=xs.device, dtype=torch.bool) - tmp_masks = tmp_masks.unsqueeze(1) + tmp_masks = paddle.ones([1, xs.size(1)], dtype=paddle.bool) + tmp_masks = tmp_masks.unsqueeze(1) #[B=1, C=1, T] + if self.global_cmvn is not None: xs = self.global_cmvn(xs) - xs, pos_emb, _ = self.embed(xs, tmp_masks, offset) + + xs, pos_emb, _ = self.embed( + xs, tmp_masks, offset=offset) #xs=(B, T, D), pos_emb=(B=1, T, D) if subsampling_cache is not None: - cache_size = subsampling_cache.size(1) - xs = torch.cat((subsampling_cache, xs), dim=1) + cache_size = subsampling_cache.size(1) #T + 
xs = paddle.cat((subsampling_cache, xs), dim=1) else: cache_size = 0 - pos_emb = self.embed.position_encoding(offset - cache_size, xs.size(1)) + pos_emb = self.embed.position_encoding( + offset=offset - cache_size, size=xs.size(1)) + if required_cache_size < 0: next_cache_start = 0 elif required_cache_size == 0: @@ -222,20 +227,17 @@ class BaseEncoder(nn.Layer): else: next_cache_start = xs.size(1) - required_cache_size r_subsampling_cache = xs[:, next_cache_start:, :] + # Real mask for transformer/conformer layers - masks = torch.ones(1, xs.size(1), device=xs.device, dtype=torch.bool) - masks = masks.unsqueeze(1) + masks = paddle.ones([1, xs.size(1)], dtype=paddle.bool) + masks = masks.unsqueeze(1) #[B=1, C=1, T] r_elayers_output_cache = [] r_conformer_cnn_cache = [] for i, layer in enumerate(self.encoders): - if elayers_output_cache is None: - attn_cache = None - else: - attn_cache = elayers_output_cache[i] - if conformer_cnn_cache is None: - cnn_cache = None - else: - cnn_cache = conformer_cnn_cache[i] + attn_cache = None if elayers_output_cache is None else elayers_output_cache[ + i] + cnn_cache = None if conformer_cnn_cache is None else conformer_cnn_cache[ + i] xs, _, new_cnn_cache = layer( xs, masks, @@ -252,10 +254,10 @@ class BaseEncoder(nn.Layer): def forward_chunk_by_chunk( self, - xs: torch.Tensor, + xs: paddle.Tensor, decoding_chunk_size: int, num_decoding_left_chunks: int=-1, - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> Tuple[paddle.Tensor, paddle.Tensor]: """ Forward input chunk by chunk with chunk_size like a streaming fashion Here we should pay special attention to computation cache in the @@ -277,24 +279,27 @@ class BaseEncoder(nn.Layer): layers in subsampling, we need to rewrite it to make it work with cache, which is not prefered. Args: - xs (torch.Tensor): (1, max_len, dim) - chunk_size (int): decoding chunk size + xs (paddle.Tensor): (1, max_len, dim) + chunk_size (int): decoding chunk size. 
+ num_left_chunks (int): decoding with num left chunks. """ assert decoding_chunk_size > 0 # The model is trained by static or dynamic chunk assert self.static_chunk_size > 0 or self.use_dynamic_chunk + + # feature stride and window for `subsampling` module subsampling = self.embed.subsampling_rate context = self.embed.right_context + 1 # Add current frame stride = subsampling * decoding_chunk_size decoding_window = (decoding_chunk_size - 1) * subsampling + context + num_frames = xs.size(1) - subsampling_cache: Optional[torch.Tensor] = None - elayers_output_cache: Optional[List[torch.Tensor]] = None - conformer_cnn_cache: Optional[List[torch.Tensor]] = None + required_cache_size = decoding_chunk_size * num_decoding_left_chunks + subsampling_cache: Optional[paddle.Tensor] = None + elayers_output_cache: Optional[List[paddle.Tensor]] = None + conformer_cnn_cache: Optional[List[paddle.Tensor]] = None outputs = [] offset = 0 - required_cache_size = decoding_chunk_size * num_decoding_left_chunks - # Feed forward overlap input step by step for cur in range(0, num_frames - context + 1, stride): end = min(cur + decoding_window, num_frames) @@ -305,8 +310,9 @@ class BaseEncoder(nn.Layer): elayers_output_cache, conformer_cnn_cache) outputs.append(y) offset += y.size(1) - ys = torch.cat(outputs, 1) - masks = torch.ones(1, ys.size(1), device=ys.device, dtype=torch.bool) + ys = paddle.cat(outputs, 1) + # fake mask, just for jit script and compatibility with `forward` api + masks = paddle.ones([1, ys.size(1)], dtype=paddle.bool) masks = masks.unsqueeze(1) return ys, masks @@ -330,7 +336,7 @@ class TransformerEncoder(BaseEncoder): concat_after: bool=False, static_chunk_size: int=0, use_dynamic_chunk: bool=False, - global_cmvn: torch.nn.Module=None, + global_cmvn: nn.Layer=None, use_dynamic_left_chunk: bool=False, ): """ Construct TransformerEncoder See Encoder for the meaning of each parameter. 
@@ -342,14 +348,16 @@ class TransformerEncoder(BaseEncoder): pos_enc_layer_type, normalize_before, concat_after, static_chunk_size, use_dynamic_chunk, global_cmvn, use_dynamic_left_chunk) - self.encoders = torch.nn.ModuleList([ + self.encoders = nn.ModuleList([ TransformerEncoderLayer( - output_size, - MultiHeadedAttention(attention_heads, output_size, - attention_dropout_rate), - PositionwiseFeedForward(output_size, linear_units, - dropout_rate), dropout_rate, - normalize_before, concat_after) for _ in range(num_blocks) + size=output_size, + self_attn=MultiHeadedAttention(attention_heads, output_size, + attention_dropout_rate), + feed_forward=PositionwiseFeedForward(output_size, linear_units, + dropout_rate), + dropout_rate=dropout_rate, + normalize_before=normalize_before, + concat_after=concat_after) for _ in range(num_blocks) ]) @@ -396,6 +404,7 @@ class ConformerEncoder(BaseEncoder): use_cnn_module (bool): Whether to use convolution module. cnn_module_kernel (int): Kernel size of convolution module. causal (bool): whether to use causal convolution or not. 
+ cnn_module_norm (str): cnn conv norm type, Optional['batch_norm','layer_norm'] """ assert check_argument_types() super().__init__(input_size, output_size, attention_heads, linear_units, @@ -409,26 +418,26 @@ class ConformerEncoder(BaseEncoder): # self-attention module definition encoder_selfattn_layer = RelPositionMultiHeadedAttention encoder_selfattn_layer_args = (attention_heads, output_size, - attention_dropout_rate, ) + attention_dropout_rate) # feed-forward module definition positionwise_layer = PositionwiseFeedForward positionwise_layer_args = (output_size, linear_units, dropout_rate, - activation, ) + activation) # convolution module definition convolution_layer = ConvolutionModule convolution_layer_args = (output_size, cnn_module_kernel, activation, cnn_module_norm, causal) - self.encoders = torch.nn.ModuleList([ + self.encoders = nn.ModuleList([ ConformerEncoderLayer( - output_size, - encoder_selfattn_layer(*encoder_selfattn_layer_args), - positionwise_layer(*positionwise_layer_args), - positionwise_layer(*positionwise_layer_args) - if macaron_style else None, - convolution_layer(*convolution_layer_args) + size=output_size, + self_attn=encoder_selfattn_layer(*encoder_selfattn_layer_args), + feed_forward=positionwise_layer(*positionwise_layer_args), + feed_forward_macaron=positionwise_layer( + *positionwise_layer_args) if macaron_style else None, + conv_module=convolution_layer(*convolution_layer_args) if use_cnn_module else None, - dropout_rate, - normalize_before, - concat_after, ) for _ in range(num_blocks) + dropout_rate=dropout_rate, + normalize_before=normalize_before, + concat_after=concat_after) for _ in range(num_blocks) ]) diff --git a/deepspeech/modules/encoder_layer.py b/deepspeech/modules/encoder_layer.py index 734caae6c..2828f0053 100644 --- a/deepspeech/modules/encoder_layer.py +++ b/deepspeech/modules/encoder_layer.py @@ -72,6 +72,7 @@ class TransformerEncoderLayer(nn.Layer): x: paddle.Tensor, mask: paddle.Tensor, pos_emb: paddle.Tensor, +
mask_pad: Optional[paddle.Tensor]=None, output_cache: Optional[paddle.Tensor]=None, cnn_cache: Optional[paddle.Tensor]=None, ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]: @@ -81,6 +82,8 @@ class TransformerEncoderLayer(nn.Layer): mask (paddle.Tensor): Mask tensor for the input (#batch, time). pos_emb (paddle.Tensor): just for interface compatibility to ConformerEncoderLayer + mask_pad (paddle.Tensor): not used in transformer layer, + just for unified api with conformer. output_cache (paddle.Tensor): Cache tensor of the output (#batch, time2, size), time2 < time in x. cnn_cache (paddle.Tensor): not used here, it's for interface @@ -88,6 +91,7 @@ class TransformerEncoderLayer(nn.Layer): Returns: paddle.Tensor: Output tensor (#batch, time, size). paddle.Tensor: Mask tensor (#batch, time). + paddle.Tensor: Fake cnn cache tensor for api compatibility with Conformer (#batch, channels, time'). """ residual = x if self.normalize_before: @@ -202,12 +206,13 @@ class ConformerEncoderLayer(nn.Layer): pos_emb (paddle.Tensor): positional encoding, must not be None for ConformerEncoderLayer. mask_pad (paddle.Tensor): batch padding mask used for conv module, (B, 1, T). - output_cache (paddle.Tensor): Cache tensor of the output + output_cache (paddle.Tensor): Cache tensor of the encoder output (#batch, time2, size), time2 < time in x. cnn_cache (paddle.Tensor): Convolution cache in conformer layer Returns: paddle.Tensor: Output tensor (#batch, time, size). paddle.Tensor: Mask tensor (#batch, time). + paddle.Tensor: New cnn cache tensor (#batch, channels, time'). """ # whether to use macaron style FFN if self.feed_forward_macaron is not None: diff --git a/deepspeech/modules/mask.py b/deepspeech/modules/mask.py index 5749c353c..007b18e0a 100644 --- a/deepspeech/modules/mask.py +++ b/deepspeech/modules/mask.py @@ -56,13 +56,13 @@ def subsequent_mask( This mask is used only in decoder which works in an auto-regressive mode.
This means the current step could only do attention with its left steps. In encoder, fully attention is used when streaming is not necessary and - the sequence is not long. In this case, no attention mask is needed. + the sequence is not long. In this case, no attention mask is needed. When streaming is need, chunk-based attention is used in encoder. See subsequent_chunk_mask for the chunk-based attention mask. Args: size (int): size of mask Returns: - paddle.Tensor: mask + paddle.Tensor: mask, [size, size] Examples: >>> subsequent_mask(3) [[1, 0, 0], @@ -86,7 +86,7 @@ def subsequent_chunk_mask( <0: use full chunk >=0: use num_left_chunks Returns: - paddle.Tensor: mask + paddle.Tensor: mask, [size, size] Examples: >>> subsequent_chunk_mask(4, 2) [[1, 1, 0, 0], @@ -99,8 +99,8 @@ def subsequent_chunk_mask( if num_left_chunks < 0: start = 0 else: - start = max((i // chunk_size - num_left_chunks) * chunk_size, 0) - ending = min((i // chunk_size + 1) * chunk_size, size) + start = max(0, (i // chunk_size - num_left_chunks) * chunk_size) + ending = min(size, (i // chunk_size + 1) * chunk_size) ret[i, start:ending] = True return ret diff --git a/deepspeech/modules/rnn.py b/deepspeech/modules/rnn.py index 3cb8c7d05..a6da4a11c 100644 --- a/deepspeech/modules/rnn.py +++ b/deepspeech/modules/rnn.py @@ -41,7 +41,7 @@ class RNNCell(nn.RNNCellBase): """ def __init__(self, - hidden_size, + hidden_size: int, activation="tanh", weight_ih_attr=None, weight_hh_attr=None, @@ -108,8 +108,8 @@ class GRUCell(nn.RNNCellBase): """ def __init__(self, - input_size, - hidden_size, + input_size: int, + hidden_size: int, weight_ih_attr=None, weight_hh_attr=None, bias_ih_attr=None, @@ -132,7 +132,6 @@ class GRUCell(nn.RNNCellBase): self.input_size = input_size self._gate_activation = F.sigmoid self._activation = paddle.tanh - #self._activation = F.relu def forward(self, inputs, states=None): if states is None: @@ -171,8 +170,6 @@ class BiRNNWithBN(nn.Layer): """Bidirectonal simple rnn layer with 
sequence-wise batch normalization. The batch normalization is only performed on input-state weights. - :param name: Name of the layer parameters. - :type name: string :param size: Dimension of RNN cells. :type size: int :param share_weights: Whether to share input-hidden weights between @@ -182,7 +179,7 @@ class BiRNNWithBN(nn.Layer): :rtype: Variable """ - def __init__(self, i_size, h_size, share_weights): + def __init__(self, i_size: int, h_size: int, share_weights: bool): super().__init__() self.share_weights = share_weights if self.share_weights: @@ -208,7 +205,7 @@ class BiRNNWithBN(nn.Layer): self.bw_rnn = nn.RNN( self.fw_cell, is_reverse=True, time_major=False) #[B, T, D] - def forward(self, x, x_len): + def forward(self, x: paddle.Tensor, x_len: paddle.Tensor): # x, shape [B, T, D] fw_x = self.fw_bn(self.fw_fc(x)) bw_x = self.bw_bn(self.bw_fc(x)) @@ -234,7 +231,7 @@ class BiGRUWithBN(nn.Layer): :rtype: Variable """ - def __init__(self, i_size, h_size, act): + def __init__(self, i_size: int, h_size: int): super().__init__() hidden_size = h_size * 3 @@ -281,23 +278,29 @@ class RNNStack(nn.Layer): :rtype: Variable """ - def __init__(self, i_size, h_size, num_stacks, use_gru, share_rnn_weights): + def __init__(self, + i_size: int, + h_size: int, + num_stacks: int, + use_gru: bool, + share_rnn_weights: bool): super().__init__() - self.rnn_stacks = nn.LayerList() + rnn_stacks = [] for i in range(num_stacks): if use_gru: #default:GRU using tanh - self.rnn_stacks.append( - BiGRUWithBN(i_size=i_size, h_size=h_size, act="relu")) + rnn_stacks.append(BiGRUWithBN(i_size=i_size, h_size=h_size)) else: - self.rnn_stacks.append( + rnn_stacks.append( BiRNNWithBN( i_size=i_size, h_size=h_size, share_weights=share_rnn_weights)) i_size = h_size * 2 - def forward(self, x, x_len): + self.rnn_stacks = nn.Sequential(rnn_stacks) + + def forward(self, x: paddle.Tensor, x_len: paddle.Tensor): """ x: shape [B, T, D] x_len: shape [B] diff --git a/deepspeech/modules/subsampling.py
b/deepspeech/modules/subsampling.py index a01374d71..a0b80b844 100644 --- a/deepspeech/modules/subsampling.py +++ b/deepspeech/modules/subsampling.py @@ -32,10 +32,12 @@ __all__ = [ class BaseSubsampling(nn.Layer): - def __init__(self, pos_enc_class: PositionalEncoding): + def __init__(self, pos_enc_class: nn.Layer=PositionalEncoding): super().__init__() self.pos_enc = pos_enc_class + # window size = (1 + right_context) + (chunk_size -1) * subsampling_rate self.right_context = 0 + # stride = chunk_size * subsampling_rate self.subsampling_rate = 1 def position_encoding(self, offset: int, size: int) -> paddle.Tensor: @@ -49,7 +51,7 @@ class LinearNoSubsampling(BaseSubsampling): idim: int, odim: int, dropout_rate: float, - pos_enc_class: PositionalEncoding): + pos_enc_class: nn.Layer=PositionalEncoding): """Construct a linear object. Args: idim (int): Input dimension. @@ -71,6 +73,7 @@ class LinearNoSubsampling(BaseSubsampling): Args: x (paddle.Tensor): Input tensor (#batch, time, idim). x_mask (paddle.Tensor): Input mask (#batch, 1, time). + offset (int): position encoding offset. Returns: paddle.Tensor: linear input tensor (#batch, time', odim), where time' = time. @@ -90,7 +93,7 @@ class Conv2dSubsampling4(BaseSubsampling): idim: int, odim: int, dropout_rate: float, - pos_enc_class: PositionalEncoding): + pos_enc_class: nn.Layer=PositionalEncoding): """Construct a Conv2dSubsampling4 object. Args: @@ -117,6 +120,7 @@ class Conv2dSubsampling4(BaseSubsampling): Args: x (paddle.Tensor): Input tensor (#batch, time, idim). x_mask (paddle.Tensor): Input mask (#batch, 1, time). + offset (int): position encoding offset. Returns: paddle.Tensor: Subsampled tensor (#batch, time', odim), where time' = time // 4. @@ -139,7 +143,7 @@ class Conv2dSubsampling6(BaseSubsampling): idim: int, odim: int, dropout_rate: float, - pos_enc_class: PositionalEncoding): + pos_enc_class: nn.Layer=PositionalEncoding): """Construct a Conv2dSubsampling6 object.
Args: @@ -169,6 +173,7 @@ class Conv2dSubsampling6(BaseSubsampling): Args: x (paddle.Tensor): Input tensor (#batch, time, idim). x_mask (paddle.Tensor): Input mask (#batch, 1, time). + offset (int): position encoding offset. Returns: paddle.Tensor: Subsampled tensor (#batch, time', odim), where time' = time // 6. @@ -191,7 +196,7 @@ class Conv2dSubsampling8(BaseSubsampling): idim: int, odim: int, dropout_rate: float, - pos_enc_class: PositionalEncoding): + pos_enc_class: nn.Layer=PositionalEncoding): """Construct a Conv2dSubsampling8 object. Args: @@ -221,6 +226,7 @@ class Conv2dSubsampling8(BaseSubsampling): Args: x (paddle.Tensor): Input tensor (#batch, time, idim). x_mask (paddle.Tensor): Input mask (#batch, 1, time). + offset (int): position encoding offset. Returns: paddle.Tensor: Subsampled tensor (#batch, time', odim), where time' = time // 8. diff --git a/docs/install.md b/docs/install.md index 71396590f..bd4d5a432 100644 --- a/docs/install.md +++ b/docs/install.md @@ -43,7 +43,7 @@ bash setup.sh source tools/venv/bin/activate ``` -## Running in Docker Container +## Running in Docker Container (optional) Docker is an open source tool to build, ship, and run distributed applications in an isolated environment. A Docker image for this project has been provided in [hub.docker.com](https://hub.docker.com) with all the dependencies installed, including the pre-built PaddlePaddle, CTC decoders, and other necessary Python and third-party packages. This Docker image requires the support of NVIDIA GPU, so please make sure of its availability and that [nvidia-docker](https://github.com/NVIDIA/nvidia-docker) has been installed. diff --git a/docs/reference.md b/docs/reference.md new file mode 100644 index 000000000..69ff6ab88 --- /dev/null +++ b/docs/reference.md @@ -0,0 +1,3 @@ +# Reference + +* [wenet](https://github.com/mobvoi/wenet)