add the subsampling as conv

4 years ago · e4ef8ed31e
parent 8716386464
commit e4ef8ed31e
9 changed files with 70 additions and 195 deletions
--- a/deepspeech/models/ds2/__init__.py
+++ b/deepspeech/models/ds2/__init__.py
@ -1,4 +1,17 @@
-from .deepspeech2 import DeepSpeech2Model
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from .deepspeech2 import DeepSpeech2InferModel
 from .deepspeech2 import DeepSpeech2Model
 __all__ = ['DeepSpeech2Model', 'DeepSpeech2InferModel']
--- a/deepspeech/models/ds2/deepspeech2.py
+++ b/deepspeech/models/ds2/deepspeech2.py
@ -19,8 +19,8 @@ from paddle import nn
 from yacs.config import CfgNode
 from deepspeech.models.ds2.conv import ConvStack
 from deepspeech.modules.ctc import CTCDecoder
 from deepspeech.models.ds2.rnn import RNNStack
 from deepspeech.modules.ctc import CTCDecoder
 from deepspeech.utils import layer_tools
 from deepspeech.utils.checkpoint import Checkpoint
 from deepspeech.utils.log import Log
--- a/deepspeech/models/ds2_online/__init__.py
+++ b/deepspeech/models/ds2_online/__init__.py
@ -1,7 +1,17 @@
-from .deepspeech2 import DeepSpeech2ModelOnline
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from .deepspeech2 import DeepSpeech2InferModelOnline
 from .deepspeech2 import DeepSpeech2ModelOnline
 __all__ = ['DeepSpeech2ModelOnline', 'DeepSpeech2InferModelOnline']
--- a/deepspeech/models/ds2_online/conv.py
+++ b/deepspeech/models/ds2_online/conv.py
@ -11,162 +11,27 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import paddle
 from paddle import nn
 from paddle.nn import functional as F
-from deepspeech.modules.activation import brelu
+from deepspeech.modules.embedding import PositionalEncoding
-from deepspeech.modules.mask import make_non_pad_mask
+from deepspeech.modules.subsampling import Conv2dSubsampling4
 from deepspeech.utils.log import Log
 logger = Log(__name__).getlog()
-__all__ = ['ConvStack', "conv_output_size"]
+class Conv2dSubsampling4Online(Conv2dSubsampling4):
    def __init__(self,
                 idim: int,
                 odim: int,
                 dropout_rate: float,
                 pos_enc_class: nn.Layer=PositionalEncoding):
        super().__init__(idim, odim, dropout_rate, pos_enc_class)
        self.output_dim = ((idim - 1) // 2 - 1) // 2 * odim
-
+    def forward(self, x: paddle.Tensor,
-def conv_output_size(I, F, P, S):
+                x_len: paddle.Tensor) -> [paddle.Tensor, paddle.Tensor]:
-    # https://stanford.edu/~shervine/teaching/cs-230/cheatsheet-convolutional-neural-networks#hyperparameters
+        x = x.unsqueeze(1)  # (b, c=1, t, f)
    # Output size after Conv:
    #   By noting I the length of the input volume size, 
    #   F the length of the filter, 
    #   P the amount of zero padding, 
    #   S the stride,
    #   then the output size O of the feature map along that dimension is given by:
    #       O = (I - F + Pstart + Pend) // S + 1
    #   When Pstart == Pend == P, we can replace Pstart + Pend by 2P.
    #   When Pstart == Pend == 0
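    #       O = (I - F) // S + 1
    #   NOTE(review): the return statement of this function subtracts S instead
    #   of adding 1, which does not match the formula above -- confirm.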
    # https://iq.opengenus.org/output-size-of-convolution/
    # Output height = (Input height + padding height top + padding height bottom - kernel height) / (stride height) + 1
    # Output width = (Input width + padding width right + padding width left - kernel width) / (stride width) + 1
    return (I - F + 2 * P - S) // S
 # receptive field calculator
 # https://fomoro.com/research/article/receptive-field-calculator
 # https://stanford.edu/~shervine/teaching/cs-230/cheatsheet-convolutional-neural-networks#hyperparameters
 # https://distill.pub/2019/computing-receptive-fields/
 # Rl-1 = Sl * Rl + (Kl - Sl) 
 class ConvBn(nn.Layer):
    """Convolution layer with batch normalization.
    :param kernel_size: The x dimension of a filter kernel. Or input a tuple for
                        two image dimension.
    :type kernel_size: int|tuple|list
    :param num_channels_in: Number of input channels.
    :type num_channels_in: int
    :param num_channels_out: Number of output channels.
    :type num_channels_out: int
    :param stride: The x dimension of the stride. Or input a tuple for two 
                image dimension. 
    :type stride: int|tuple|list
    :param padding: The x dimension of the padding. Or input a tuple for two
                    image dimension.
    :type padding: int|tuple|list
    :param act: Activation type, relu|brelu
    :type act: string
    :return: Batch norm layer after convolution layer.
    :rtype: Variable
    """
    def __init__(self, num_channels_in, num_channels_out, kernel_size, stride,
                 padding, act):
        """Create the convolution, the batch-norm layer and pick the activation.

        :param num_channels_in: Number of input channels of the convolution.
        :param num_channels_out: Number of output channels; also the number of
                                 features given to the batch-norm layer.
        :param kernel_size: Length-2 sequence (asserted below).
        :param stride: Length-2 sequence (asserted below).
        :param padding: Length-2 sequence (asserted below).
        :param act: ``'relu'`` selects ``F.relu``; any other value selects
                    ``brelu``.

        Note: because of the ``len(...) == 2`` assertions a bare ``int`` is not
        accepted for ``kernel_size``, ``stride`` or ``padding``.
        """
        super().__init__()
        # Each geometry argument must be a 2-element sequence.
        assert len(kernel_size) == 2
        assert len(stride) == 2
        assert len(padding) == 2
        # Kept on the instance; forward() reads index 1 of each of these to
        # recompute the valid sequence lengths.
        self.kernel_size = kernel_size
        self.stride = stride
        self.padding = padding
        # bias_attr=False: the convolution is created without a bias term.
        self.conv = nn.Conv2D(
            num_channels_in,
            num_channels_out,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            weight_attr=None,
            bias_attr=False,
            data_format='NCHW')
        self.bn = nn.BatchNorm2D(
            num_channels_out,
            weight_attr=None,
            bias_attr=None,
            data_format='NCHW')
        # Anything that is not exactly 'relu' falls back to brelu.
        self.act = F.relu if act == 'relu' else brelu
    def forward(self, x, x_len):
        """
        x(Tensor): audio, shape [B, C, D, T]
        """
        x = self.conv(x)
-        x = self.bn(x)
+        b, c, t, f = paddle.shape(x)
-        x = self.act(x)
+        x = x.transpose([0, 2, 1, 3]).reshape([b, t, c * f])
-
+        x_len = ((x_len - 1) // 2 - 1) // 2
        x_len = (x_len - self.kernel_size[1] + 2 * self.padding[1]
                 ) // self.stride[1] + 1
        # reset padding part to 0
        masks = make_non_pad_mask(x_len)  #[B, T]
        masks = masks.unsqueeze(1).unsqueeze(1)  # [B, 1, 1, T]
        # TODO(Hui Zhang): not support bool multiply
        # masks = masks.type_as(x)
        masks = masks.astype(x.dtype)
        x = x.multiply(masks)
        return x, x_len
 class ConvStack(nn.Layer):
    """Convolution group with stacked convolution layers.

    One input ``ConvBn`` layer (1 -> 32 channels) is followed by
    ``num_stacks - 1`` further ``ConvBn`` layers (32 -> 32 channels).
    After construction ``self.output_height`` holds the number of channels
    multiplied by the feature height left after all layers.

    :param feat_size: audio feature dim.
    :type feat_size: int
    :param num_stacks: Number of stacked convolution layers, counting the
                       input layer.
    :type num_stacks: int
    """
    def __init__(self, feat_size, num_stacks):
        """Build the input layer, the stacked layers and ``output_height``."""
        super().__init__()
        self.feat_size = feat_size  # D
        self.num_stacks = num_stacks
        # Input layer: stride 2 on the first kernel axis and 3 on the second
        # (annotated as [D, T] below).
        self.conv_in = ConvBn(
            num_channels_in=1,
            num_channels_out=32,
            kernel_size=(41, 11),  #[D, T]
            stride=(2, 3),
            padding=(20, 5),
            act='brelu')
        out_channel = 32
        # Remaining layers: stride 2 on the first axis, stride 1 on the second.
        convs = [
            ConvBn(
                num_channels_in=32,
                num_channels_out=out_channel,
                kernel_size=(21, 11),
                stride=(2, 1),
                padding=(10, 5),
                act='brelu') for i in range(num_stacks - 1)
        ]
        self.conv_stack = nn.LayerList(convs)
        # conv output feat_dim
        # With kernel 41 / padding 20 / stride 2 the height becomes
        # (D - 41 + 2 * 20) // 2 + 1 == (D - 1) // 2 + 1.
        output_height = (feat_size - 1) // 2 + 1
        # Kernel 21 / padding 10 / stride 2 gives the same expression for
        # every additional layer.
        for i in range(self.num_stacks - 1):
            output_height = (output_height - 1) // 2 + 1
        self.output_height = out_channel * output_height
    def forward(self, x, x_len):
        """Run the input through every convolution layer in order.

        x: shape [B, C, D, T]
        x_len : shape [B]

        Returns the ``(x, x_len)`` pair produced by the last layer; each
        layer receives the pair returned by the previous one.
        """
        x, x_len = self.conv_in(x, x_len)
        for i, conv in enumerate(self.conv_stack):
            x, x_len = conv(x, x_len)
        return x, x_len
--- a/deepspeech/models/ds2_online/deepspeech2.py
+++ b/deepspeech/models/ds2_online/deepspeech2.py
@ -11,27 +11,19 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""Deepspeech2 ASR Model"""
+"""Deepspeech2 ASR Online Model"""
 from typing import Optional
 import paddle
 import paddle.nn.functional as F
 from paddle import nn
 from paddle.fluid.layers import fc
 from paddle.nn import GRU
 from paddle.nn import LayerList
 from paddle.nn import LayerNorm
 from paddle.nn import Linear
 from paddle.nn import LSTM
 from yacs.config import CfgNode
-from deepspeech.models.ds2_online.conv import ConvStack
+from deepspeech.models.ds2_online.conv import Conv2dSubsampling4Online
 from deepspeech.models.ds2_online.rnn import RNNStack
 from deepspeech.modules.ctc import CTCDecoder
 from deepspeech.utils import layer_tools
 from deepspeech.utils.checkpoint import Checkpoint
 from deepspeech.utils.log import Log
 logger = Log(__name__).getlog()
 # Public names of this module. Every entry must match a name defined here,
 # otherwise `from ... import *` fails; the package initialiser imports
 # DeepSpeech2ModelOnline and DeepSpeech2InferModelOnline from this module.
 __all__ = ['DeepSpeech2ModelOnline', 'DeepSpeech2InferModelOnline']
@ -55,46 +47,48 @@ class CRNNEncoder(nn.Layer):
        self.num_rnn_layers = num_rnn_layers
        self.num_fc_layers = num_fc_layers
        self.fc_layers_size_list = fc_layers_size_list
-        self.conv = ConvStack(feat_size, num_conv_layers)
+        self.conv = Conv2dSubsampling4Online(feat_size, 32, dropout_rate=0.0)
-        i_size = self.conv.output_height  # H after conv stack
+        i_size = self.conv.output_dim
-        self.rnn = LayerList()
+        self.rnn = nn.LayerList()
-        self.layernorm_list = LayerList()
+        self.layernorm_list = nn.LayerList()
-        self.fc_layers_list = LayerList()
+        self.fc_layers_list = nn.LayerList()
        rnn_direction = 'forward'
        layernorm_size = rnn_size
        if use_gru == True:
            self.rnn.append(
-                GRU(input_size=i_size,
+                nn.GRU(
                    input_size=i_size,
                    hidden_size=rnn_size,
                    num_layers=1,
                    direction=rnn_direction))
-            self.layernorm_list.append(LayerNorm(layernorm_size))
+            self.layernorm_list.append(nn.LayerNorm(layernorm_size))
            for i in range(1, num_rnn_layers):
                self.rnn.append(
-                    GRU(input_size=layernorm_size,
+                    nn.GRU(
                        input_size=layernorm_size,
                        hidden_size=rnn_size,
                        num_layers=1,
                        direction=rnn_direction))
-                self.layernorm_list.append(LayerNorm(layernorm_size))
+                self.layernorm_list.append(nn.LayerNorm(layernorm_size))
        else:
            self.rnn.append(
-                LSTM(
+                nn.LSTM(
                    input_size=i_size,
                    hidden_size=rnn_size,
                    num_layers=1,
                    direction=rnn_direction))
-            self.layernorm_list.append(LayerNorm(layernorm_size))
+            self.layernorm_list.append(nn.LayerNorm(layernorm_size))
            for i in range(1, num_rnn_layers):
                self.rnn.append(
-                    LSTM(
+                    nn.LSTM(
                        input_size=layernorm_size,
                        hidden_size=rnn_size,
                        num_layers=1,
                        direction=rnn_direction))
-                self.layernorm_list.append(LayerNorm(layernorm_size))
+                self.layernorm_list.append(nn.LayerNorm(layernorm_size))
        fc_input_size = layernorm_size
        for i in range(self.num_fc_layers):
            self.fc_layers_list.append(
@ -117,20 +111,16 @@ class CRNNEncoder(nn.Layer):
            x (Tensor): encoder outputs, [B, T, D]
            x_lens (Tensor): encoder length, [B]
        """
-        # [B, T, D]  -> [B, D, T]
+        # [B, T, D]
-        audio = audio.transpose([0, 2, 1])
+        x = audio
        # [B, D, T] -> [B, C=1, D, T]
        x = audio.unsqueeze(1)
        x_lens = audio_len
        # convolution group
        x, x_lens = self.conv(x, x_lens)
        # convert data from convolution feature map to sequence of vectors
        #B, C, D, T = paddle.shape(x)  # not work under jit
-        x = x.transpose([0, 3, 1, 2])  #[B, T, C, D]
+        #x = x.transpose([0, 3, 1, 2])  #[B, T, C, D]
        #x = x.reshape([B, T, C * D])  #[B, T, C*D]  # not work under jit
-        x = x.reshape([0, 0, -1])  #[B, T, C*D]
+        #x = x.reshape([0, 0, -1])  #[B, T, C*D]
        # remove padding part
        x, output_state = self.rnn[0](x, None, x_lens)
--- a/examples/aishell/s0/conf/deepspeech2.yaml
+++ b/examples/aishell/s0/conf/deepspeech2.yaml
@ -36,11 +36,10 @@ collator:
 model:
  num_conv_layers: 2
-  num_rnn_layers: 4
+  num_rnn_layers: 3
  rnn_layer_size: 1024
  use_gru: True 
  share_rnn_weights: False
  apply_online: False
 training:
  n_epoch: 50
--- a/examples/librispeech/s0/conf/deepspeech2.yaml
+++ b/examples/librispeech/s0/conf/deepspeech2.yaml
@ -40,7 +40,6 @@ model:
  rnn_layer_size: 2048
  use_gru: False 
  share_rnn_weights: True
  apply_online: False
 training:
  n_epoch: 50
--- a/examples/tiny/s0/conf/deepspeech2.yaml
+++ b/examples/tiny/s0/conf/deepspeech2.yaml
@ -41,7 +41,6 @@ model:
  rnn_layer_size: 2048
  use_gru: False 
  share_rnn_weights: True 
  apply_online: True
 training:
  n_epoch: 10
--- a/tests/deepspeech2_model_test.py
+++ b/tests/deepspeech2_model_test.py
@ -16,8 +16,8 @@ import unittest
 import numpy as np
 import paddle
-#from deepspeech.models.deepspeech2 import DeepSpeech2Model
+from deepspeech.models.ds2 import DeepSpeech2Model
-from deepspeech.models.ds2_online import DeepSpeech2ModelOnline as DeepSpeech2Model
+
 class TestDeepSpeech2Model(unittest.TestCase):
    def setUp(self):