PaddleSpeech/paddlespeech/s2t/models/wav2vec2/modules/normalization.py

# Authors
#  * Mirco Ravanelli 2020
#  * Guillermo Cámbara 2021
#  * Sarthak Yadav 2022
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Modified from speechbrain(https://github.com/speechbrain/speechbrain/blob/develop/speechbrain/nnet/normalization.py)
import paddle.nn as nn

from paddlespeech.s2t.modules.align import BatchNorm1D


class BatchNorm1d(nn.Layer):
    """Applies 1d batch normalization to the input tensor.
    Arguments
    ---------
    input_shape : tuple
        The expected shape of the input. Alternatively, use ``input_size``.
    input_size : int
        The expected size of the input. Alternatively, use ``input_shape``.
    eps : float
        This value is added to std deviation estimation to improve the numerical
        stability.
    momentum : float
        It is a value used for the running_mean and running_var computation.
    affine : bool
        When set to True, the affine parameters are learned.
    track_running_stats : bool
        When set to True, this module tracks the running mean and variance,
        and when set to False, this module does not track such statistics.
    combine_batch_time : bool
        When true, it combines batch an time axis.
    Example
    -------
    >>> input = paddle.randn([100, 10])
    >>> norm = BatchNorm1d(input_shape=input.shape)
    >>> output = norm(input)
    >>> output.shape
    Paddle.Shape([100, 10])
    """

    def __init__(
            self,
            input_shape=None,
            input_size=None,
            eps=1e-05,
            momentum=0.9,
            combine_batch_time=False,
            skip_transpose=False, ):
        super().__init__()
        self.combine_batch_time = combine_batch_time
        self.skip_transpose = skip_transpose

        if input_size is None and skip_transpose:
            input_size = input_shape[1]
        elif input_size is None:
            input_size = input_shape[-1]

        self.norm = BatchNorm1D(input_size, momentum=momentum, epsilon=eps)

    def forward(self, x):
        """Returns the normalized input tensor.
        Arguments
        ---------
        x : paddle.Tensor (batch, time, [channels])
            input to normalize. 2d or 3d tensors are expected in input
            4d tensors can be used when combine_dims=True.
        """
        shape_or = x.shape
        if self.combine_batch_time:
            if x.ndim == 3:
                x = x.reshape(shape_or[0] * shape_or[1], shape_or[2])
            else:
                x = x.reshape(shape_or[0] * shape_or[1], shape_or[3],
                              shape_or[2])

        elif not self.skip_transpose:
            x = x.transpose([0, 2, 1])

        x_n = self.norm(x)
        if self.combine_batch_time:
            x_n = x_n.reshape(shape_or)
        elif not self.skip_transpose:
            x_n = x_n.transpose([0, 2, 1])

        return x_n