PaddleSpeech/deepspeech/modules/conv.py

# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging

import paddle
from paddle import nn
from paddle.nn import functional as F
from paddle.nn import initializer as I

from deepspeech.modules.mask import sequence_mask
from deepspeech.modules.activation import brelu

logger = logging.getLogger(__name__)

__all__ = ['ConvStack']


class ConvBn(nn.Layer):
    """2-D convolution followed by batch normalization and an activation.

    The forward pass additionally recomputes the valid length of every
    sequence along the last axis and zeroes the positions beyond it.

    :param num_channels_in: Number of input channels.
    :type num_channels_in: int
    :param num_channels_out: Number of output channels.
    :type num_channels_out: int
    :param kernel_size: Two-element kernel size. Element 1 is the one used
                        to update the sequence lengths in ``forward``.
    :type kernel_size: tuple|list
    :param stride: Two-element stride, same layout as ``kernel_size``.
    :type stride: tuple|list
    :param padding: Two-element padding, same layout as ``kernel_size``.
    :type padding: tuple|list
    :param act: Activation type. ``'relu'`` selects ``F.relu``; any other
                value selects ``brelu``.
    :type act: string
    """

    def __init__(self, num_channels_in, num_channels_out, kernel_size, stride,
                 padding, act):
        super().__init__()

        # Each geometry argument must be a sized pair; a bare int fails here.
        for pair in (kernel_size, stride, padding):
            assert len(pair) == 2

        # Kept on the instance because forward() needs them for the
        # sequence-length bookkeeping.
        self.kernel_size = kernel_size
        self.stride = stride
        self.padding = padding

        # No conv bias: the batch norm that follows has its own bias term.
        self.conv = nn.Conv2D(
            num_channels_in,
            num_channels_out,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            weight_attr=None,
            bias_attr=False,
            data_format='NCHW')

        self.bn = nn.BatchNorm2D(
            num_channels_out,
            weight_attr=None,
            bias_attr=None,
            data_format='NCHW')

        if act == 'relu':
            self.act = F.relu
        else:
            self.act = brelu

    def forward(self, x, x_len):
        """Apply conv -> batch norm -> activation and mask the padded tail.

        :param x: audio features, shape [B, C, D, T].
        :param x_len: valid length of each sequence along the last axis.
        :return: tuple of the masked output and the updated lengths.
        """
        x = self.act(self.bn(self.conv(x)))

        # Standard convolution output-size formula applied to the last
        # axis, i.e. index 1 of kernel_size / padding / stride.
        k_t = self.kernel_size[1]
        p_t = self.padding[1]
        s_t = self.stride[1]
        x_len = (x_len - k_t + 2 * p_t) // s_t + 1

        # Zero everything past each sequence's valid length. The mask from
        # sequence_mask is presumably [B, T]; two unsqueezes make it
        # [B, 1, 1, T] so it broadcasts over the channel and feature axes.
        masks = sequence_mask(x_len).unsqueeze(1).unsqueeze(1)
        x = x.multiply(masks)

        return x, x_len


class ConvStack(nn.Layer):
    """Stack of ``ConvBn`` layers: one input layer plus ``num_stacks - 1`` more.

    After construction, ``output_height`` holds the number of output
    channels multiplied by the feature-axis size left after all layers.

    :param feat_size: audio feature dim.
    :type feat_size: int
    :param num_stacks: Number of stacked convolution layers.
    :type num_stacks: int
    """

    def __init__(self, feat_size, num_stacks):
        super().__init__()
        self.feat_size = feat_size  # D
        self.num_stacks = num_stacks

        # Input layer: 1 -> 32 channels, stride 2 on the feature axis and
        # stride 3 on the time axis (kernel layout is [D, T]).
        self.conv_in = ConvBn(
            num_channels_in=1,
            num_channels_out=32,
            kernel_size=(41, 11),
            stride=(2, 3),
            padding=(20, 5),
            act='brelu')

        out_channel = 32

        # Remaining layers: 32 -> 32 channels, stride 2 on the feature axis
        # only, so the time axis keeps its length.
        stacked = []
        for _ in range(num_stacks - 1):
            stacked.append(
                ConvBn(
                    num_channels_in=32,
                    num_channels_out=out_channel,
                    kernel_size=(21, 11),
                    stride=(2, 1),
                    padding=(10, 5),
                    act='brelu'))
        self.conv_stack = nn.LayerList(stacked)

        # conv output feat_dim: every layer maps a feature-axis size h to
        # (h - 1) // 2 + 1; apply that once for conv_in and once per
        # stacked layer.
        height = (feat_size - 1) // 2 + 1
        for _ in range(self.num_stacks - 1):
            height = (height - 1) // 2 + 1
        self.output_height = out_channel * height

    def forward(self, x, x_len):
        """Run the input through ``conv_in`` and then every stacked layer.

        :param x: shape [B, C, D, T]
        :param x_len: shape [B]
        :return: tuple of the final output and the final sequence lengths.
        """
        out, out_len = self.conv_in(x, x_len)
        for layer in self.conv_stack:
            out, out_len = layer(out, out_len)
        return out, out_len
Support paddle 2.x (#538) * 2.x model * model test pass * fix data * fix soundfile with flac support * one thread dataloader test pass * export feasture size add trainer and utils add setup model and dataloader update travis using Bionic dist * add venv; test under venv * fix unittest; train and valid * add train and config * add config and train script * fix ctc cuda memcopy error * fix imports * fix train valid log * fix dataset batch shuffle shift start from 1 fix rank_zero_only decreator error close tensorboard when train over add decoding config and code * test process can run * test with decoding * test and infer with decoding * fix infer * fix ctc loss lr schedule sortagrad logger * aishell egs * refactor train add aishell egs * fix dataset batch shuffle and add batch sampler log print model parameter * fix model and ctc * sequence_mask make all inputs zeros, which cause grad be zero, this is a bug of LessThanOp add grad clip by global norm add model train test notebook * ctc loss remove run prefix using ord value as text id * using unk when training compute_loss need text ids ord id using in test mode, which compute wer/cer * fix tester * add lr_deacy refactor code * fix tools * fix ci add tune fix gru model bugs add dataset and model test * fix decoding * refactor repo fix decoding * fix musan and rir dataset * refactor io, loss, conv, rnn, gradclip, model, utils * fix ci and import * refactor model add export jit model * add deploy bin and test it * rm uselss egs * add layer tools * refactor socket server new model from pretrain * remve useless * fix instability loss and grad nan or inf for librispeech training * fix sampler * fix libri train.sh * fix doc * add license on cpp * fix doc * fix libri script * fix install * clip 5 wer 7.39, clip 400 wer 7.54, 1.8 clip 400 baseline 7.49 4 years ago			`# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.`
			`#`
			`# Licensed under the Apache License, Version 2.0 (the "License");`
			`# you may not use this file except in compliance with the License.`
			`# You may obtain a copy of the License at`
			`#`
			`# http://www.apache.org/licenses/LICENSE-2.0`
			`#`
			`# Unless required by applicable law or agreed to in writing, software`
			`# distributed under the License is distributed on an "AS IS" BASIS,`
			`# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`# See the License for the specific language governing permissions and`
			`# limitations under the License.`

			`import logging`

			`import paddle`
			`from paddle import nn`
			`from paddle.nn import functional as F`
			`from paddle.nn import initializer as I`

			`from deepspeech.modules.mask import sequence_mask`
			`from deepspeech.modules.activation import brelu`

			`logger = logging.getLogger(__name__)`

			`__all__ = ['ConvStack']`


			`class ConvBn(nn.Layer):`
			`"""Convolution layer with batch normalization.`

			`:param kernel_size: The x dimension of a filter kernel. Or input a tuple for`
			`two image dimension.`
			`:type kernel_size: int\|tuple\|list`
			`:param num_channels_in: Number of input channels.`
			`:type num_channels_in: int`
			`:param num_channels_out: Number of output channels.`
			`:type num_channels_out: int`
			`:param stride: The x dimension of the stride. Or input a tuple for two`
			`image dimension.`
			`:type stride: int\|tuple\|list`
			`:param padding: The x dimension of the padding. Or input a tuple for two`
			`image dimension.`
			`:type padding: int\|tuple\|list`
			`:param act: Activation type, relu\|brelu`
			`:type act: string`
			`:return: Batch norm layer after convolution layer.`
			`:rtype: Variable`

			`"""`

			`def __init__(self, num_channels_in, num_channels_out, kernel_size, stride,`
			`padding, act):`

			`super().__init__()`
			`assert len(kernel_size) == 2`
			`assert len(stride) == 2`
			`assert len(padding) == 2`
			`self.kernel_size = kernel_size`
			`self.stride = stride`
			`self.padding = padding`

			`self.conv = nn.Conv2D(`
			`num_channels_in,`
			`num_channels_out,`
			`kernel_size=kernel_size,`
			`stride=stride,`
			`padding=padding,`
			`weight_attr=None,`
			`bias_attr=False,`
			`data_format='NCHW')`

			`self.bn = nn.BatchNorm2D(`
			`num_channels_out,`
			`weight_attr=None,`
			`bias_attr=None,`
			`data_format='NCHW')`
			`self.act = F.relu if act == 'relu' else brelu`

			`def forward(self, x, x_len):`
			`"""`
			`x(Tensor): audio, shape [B, C, D, T]`
			`"""`
			`x = self.conv(x)`
			`x = self.bn(x)`
			`x = self.act(x)`

			`x_len = (x_len - self.kernel_size[1] + 2 * self.padding[1]`
			`) // self.stride[1] + 1`

			`# reset padding part to 0`
			`masks = sequence_mask(x_len) #[B, T]`
			`masks = masks.unsqueeze(1).unsqueeze(1) # [B, 1, 1, T]`
			`x = x.multiply(masks)`

			`return x, x_len`


			`class ConvStack(nn.Layer):`
			`"""Convolution group with stacked convolution layers.`

			`:param feat_size: audio feature dim.`
			`:type feat_size: int`
			`:param num_stacks: Number of stacked convolution layers.`
			`:type num_stacks: int`
			`"""`

			`def __init__(self, feat_size, num_stacks):`
			`super().__init__()`
			`self.feat_size = feat_size # D`
			`self.num_stacks = num_stacks`

			`self.conv_in = ConvBn(`
			`num_channels_in=1,`
			`num_channels_out=32,`
			`kernel_size=(41, 11), #[D, T]`
			`stride=(2, 3),`
			`padding=(20, 5),`
			`act='brelu')`

			`out_channel = 32`
			`self.conv_stack = nn.LayerList([`
			`ConvBn(`
			`num_channels_in=32,`
			`num_channels_out=out_channel,`
			`kernel_size=(21, 11),`
			`stride=(2, 1),`
			`padding=(10, 5),`
			`act='brelu') for i in range(num_stacks - 1)`
			`])`

			`# conv output feat_dim`
			`output_height = (feat_size - 1) // 2 + 1`
			`for i in range(self.num_stacks - 1):`
			`output_height = (output_height - 1) // 2 + 1`
			`self.output_height = out_channel * output_height`

			`def forward(self, x, x_len):`
			`"""`
			`x: shape [B, C, D, T]`
			`x_len : shape [B]`
			`"""`
			`x, x_len = self.conv_in(x, x_len)`
			`for i, conv in enumerate(self.conv_stack):`
			`x, x_len = conv(x, x_len)`
			`return x, x_len`