PaddleSpeech/paddlespeech/t2s/modules/losses.py

# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math

import paddle
from paddle import nn
from paddle.fluid.layers import sequence_mask
from paddle.nn import functional as F
from scipy import signal


# Loss for Tacotron2
def attention_guide(dec_lens, enc_lens, N, T, g, dtype=None):
    """Build that W matrix. shape(B, T_dec, T_enc)
    W[i, n, t] = 1 - exp(-(n/dec_lens[i] - t/enc_lens[i])**2 / (2g**2)) 

    See also:
    Tachibana, Hideyuki, Katsuya Uenoyama, and Shunsuke Aihara. 2017. “Efficiently Trainable Text-to-Speech System Based on Deep Convolutional Networks with Guided Attention.” ArXiv:1710.08969 [Cs, Eess], October. http://arxiv.org/abs/1710.08969.
    """
    dtype = dtype or paddle.get_default_dtype()
    dec_pos = paddle.arange(0, N).astype(dtype) / dec_lens.unsqueeze(
        -1)  # n/N # shape(B, T_dec)
    enc_pos = paddle.arange(0, T).astype(dtype) / enc_lens.unsqueeze(
        -1)  # t/T # shape(B, T_enc)
    W = 1 - paddle.exp(-(dec_pos.unsqueeze(-1) - enc_pos.unsqueeze(1))**2 /
                       (2 * g**2))

    dec_mask = sequence_mask(dec_lens, maxlen=N)
    enc_mask = sequence_mask(enc_lens, maxlen=T)
    mask = dec_mask.unsqueeze(-1) * enc_mask.unsqueeze(1)
    mask = paddle.cast(mask, W.dtype)

    W *= mask
    return W


def guided_attention_loss(attention_weight, dec_lens, enc_lens, g):
    """Guided attention loss, masked to excluded padding parts."""
    _, N, T = attention_weight.shape
    W = attention_guide(dec_lens, enc_lens, N, T, g, attention_weight.dtype)

    total_tokens = (dec_lens * enc_lens).astype(W.dtype)
    loss = paddle.mean(paddle.sum(W * attention_weight, [1, 2]) / total_tokens)
    return loss


# Losses for GAN Vocoder
def stft(x,
         fft_size,
         hop_length=None,
         win_length=None,
         window='hann',
         center=True,
         pad_mode='reflect'):
    """Perform STFT and convert to magnitude spectrogram.
    Parameters
    ----------
    x : Tensor
        Input signal tensor (B, T).
    fft_size : int
        FFT size.
    hop_size : int
        Hop size.
    win_length : int
        window : str, optional
    window : str
        Name of window function, see `scipy.signal.get_window` for more
        details. Defaults to "hann".
    center : bool, optional
        center (bool, optional): Whether to pad `x` to make that the
        :math:`t \times hop\_length` at the center of :math:`t`-th frame. Default: `True`.
    pad_mode : str, optional
        Choose padding pattern when `center` is `True`.
    Returns
    ----------
    Tensor:
        Magnitude spectrogram (B, #frames, fft_size // 2 + 1).
    """
    # calculate window
    window = signal.get_window(window, win_length, fftbins=True)
    window = paddle.to_tensor(window)
    x_stft = paddle.signal.stft(
        x,
        fft_size,
        hop_length,
        win_length,
        window=window,
        center=center,
        pad_mode=pad_mode)

    real = x_stft.real()
    imag = x_stft.imag()

    return paddle.sqrt(paddle.clip(real**2 + imag**2, min=1e-7)).transpose(
        [0, 2, 1])


class SpectralConvergenceLoss(nn.Layer):
    """Spectral convergence loss module."""

    def __init__(self):
        """Initilize spectral convergence loss module."""
        super().__init__()

    def forward(self, x_mag, y_mag):
        """Calculate forward propagation.
        Parameters
        ----------
        x_mag : Tensor
            Magnitude spectrogram of predicted signal (B, #frames, #freq_bins).
        y_mag : Tensor)
            Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins).
        Returns
        ----------
        Tensor
            Spectral convergence loss value.
        """
        return paddle.norm(
            y_mag - x_mag, p="fro") / paddle.clip(
                paddle.norm(y_mag, p="fro"), min=1e-10)


class LogSTFTMagnitudeLoss(nn.Layer):
    """Log STFT magnitude loss module."""

    def __init__(self, epsilon=1e-7):
        """Initilize los STFT magnitude loss module."""
        super().__init__()
        self.epsilon = epsilon

    def forward(self, x_mag, y_mag):
        """Calculate forward propagation.
        Parameters
        ----------
        x_mag : Tensor
            Magnitude spectrogram of predicted signal (B, #frames, #freq_bins).
        y_mag : Tensor
            Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins).
        Returns
        ----------
        Tensor
            Log STFT magnitude loss value.
        """
        return F.l1_loss(
            paddle.log(paddle.clip(y_mag, min=self.epsilon)),
            paddle.log(paddle.clip(x_mag, min=self.epsilon)))


class STFTLoss(nn.Layer):
    """STFT loss module."""

    def __init__(self,
                 fft_size=1024,
                 shift_size=120,
                 win_length=600,
                 window="hann"):
        """Initialize STFT loss module."""
        super().__init__()
        self.fft_size = fft_size
        self.shift_size = shift_size
        self.win_length = win_length
        self.window = window
        self.spectral_convergence_loss = SpectralConvergenceLoss()
        self.log_stft_magnitude_loss = LogSTFTMagnitudeLoss()

    def forward(self, x, y):
        """Calculate forward propagation.
        Parameters
        ----------
        x : Tensor
            Predicted signal (B, T).
        y : Tensor
            Groundtruth signal (B, T).
        Returns
        ----------
        Tensor
            Spectral convergence loss value.
        Tensor
            Log STFT magnitude loss value.
        """
        x_mag = stft(x, self.fft_size, self.shift_size, self.win_length,
                     self.window)
        y_mag = stft(y, self.fft_size, self.shift_size, self.win_length,
                     self.window)
        sc_loss = self.spectral_convergence_loss(x_mag, y_mag)
        mag_loss = self.log_stft_magnitude_loss(x_mag, y_mag)

        return sc_loss, mag_loss


class MultiResolutionSTFTLoss(nn.Layer):
    """Multi resolution STFT loss module."""

    def __init__(
            self,
            fft_sizes=[1024, 2048, 512],
            hop_sizes=[120, 240, 50],
            win_lengths=[600, 1200, 240],
            window="hann", ):
        """Initialize Multi resolution STFT loss module.
        Parameters
        ----------
        fft_sizes : list
            List of FFT sizes.
        hop_sizes : list
            List of hop sizes.
        win_lengths : list
            List of window lengths.
        window : str
            Window function type.
        """
        super().__init__()
        assert len(fft_sizes) == len(hop_sizes) == len(win_lengths)
        self.stft_losses = nn.LayerList()
        for fs, ss, wl in zip(fft_sizes, hop_sizes, win_lengths):
            self.stft_losses.append(STFTLoss(fs, ss, wl, window))

    def forward(self, x, y):
        """Calculate forward propagation.
        Parameters
        ----------
        x : Tensor
            Predicted signal (B, T) or (B, #subband, T).
        y : Tensor
            Groundtruth signal (B, T) or (B, #subband, T).
        Returns
        ----------
        Tensor
            Multi resolution spectral convergence loss value.
        Tensor
            Multi resolution log STFT magnitude loss value.
        """
        if len(x.shape) == 3:
            # (B, C, T) -> (B x C, T)
            x = x.reshape([-1, x.shape[2]])
            # (B, C, T) -> (B x C, T)
            y = y.reshape([-1, y.shape[2]])
        sc_loss = 0.0
        mag_loss = 0.0
        for f in self.stft_losses:
            sc_l, mag_l = f(x, y)
            sc_loss += sc_l
            mag_loss += mag_l
        sc_loss /= len(self.stft_losses)
        mag_loss /= len(self.stft_losses)

        return sc_loss, mag_loss


class GeneratorAdversarialLoss(nn.Layer):
    """Generator adversarial loss module."""

    def __init__(
            self,
            average_by_discriminators=True,
            loss_type="mse", ):
        """Initialize GeneratorAversarialLoss module."""
        super().__init__()
        self.average_by_discriminators = average_by_discriminators
        assert loss_type in ["mse", "hinge"], f"{loss_type} is not supported."
        if loss_type == "mse":
            self.criterion = self._mse_loss
        else:
            self.criterion = self._hinge_loss

    def forward(self, outputs):
        """Calcualate generator adversarial loss.
        Parameters
        ----------
        outputs: Tensor or List
        Discriminator outputs or list of discriminator outputs.
        Returns
        ----------
        Tensor
            Generator adversarial loss value.
        """
        if isinstance(outputs, (tuple, list)):
            adv_loss = 0.0
            for i, outputs_ in enumerate(outputs):
                if isinstance(outputs_, (tuple, list)):
                    # case including feature maps
                    outputs_ = outputs_[-1]
                adv_loss += self.criterion(outputs_)
            if self.average_by_discriminators:
                adv_loss /= i + 1
        else:
            adv_loss = self.criterion(outputs)

        return adv_loss

    def _mse_loss(self, x):
        return F.mse_loss(x, paddle.ones_like(x))

    def _hinge_loss(self, x):
        return -x.mean()


class DiscriminatorAdversarialLoss(nn.Layer):
    """Discriminator adversarial loss module."""

    def __init__(
            self,
            average_by_discriminators=True,
            loss_type="mse", ):
        """Initialize DiscriminatorAversarialLoss module."""
        super().__init__()
        self.average_by_discriminators = average_by_discriminators
        assert loss_type in ["mse"], f"{loss_type} is not supported."
        if loss_type == "mse":
            self.fake_criterion = self._mse_fake_loss
            self.real_criterion = self._mse_real_loss

    def forward(self, outputs_hat, outputs):
        """Calcualate discriminator adversarial loss.
        Parameters
        ----------
        outputs_hat : Tensor or list
            Discriminator outputs or list of
            discriminator outputs calculated from generator outputs.
        outputs : Tensor or list
            Discriminator outputs or list of
            discriminator outputs calculated from groundtruth.
        Returns
        ----------
        Tensor
            Discriminator real loss value.
        Tensor
            Discriminator fake loss value.
        """
        if isinstance(outputs, (tuple, list)):
            real_loss = 0.0
            fake_loss = 0.0
            for i, (outputs_hat_,
                    outputs_) in enumerate(zip(outputs_hat, outputs)):
                if isinstance(outputs_hat_, (tuple, list)):
                    # case including feature maps
                    outputs_hat_ = outputs_hat_[-1]
                    outputs_ = outputs_[-1]
                real_loss += self.real_criterion(outputs_)
                fake_loss += self.fake_criterion(outputs_hat_)
            if self.average_by_discriminators:
                fake_loss /= i + 1
                real_loss /= i + 1
        else:
            real_loss = self.real_criterion(outputs)
            fake_loss = self.fake_criterion(outputs_hat)

        return real_loss, fake_loss

    def _mse_real_loss(self, x):
        return F.mse_loss(x, paddle.ones_like(x))

    def _mse_fake_loss(self, x):
        return F.mse_loss(x, paddle.zeros_like(x))


# Losses for SpeedySpeech
# Structural Similarity Index Measure (SSIM)
def gaussian(window_size, sigma):
    gauss = paddle.to_tensor([
        math.exp(-(x - window_size // 2)**2 / float(2 * sigma**2))
        for x in range(window_size)
    ])
    return gauss / gauss.sum()


def create_window(window_size, channel):
    _1D_window = gaussian(window_size, 1.5).unsqueeze(1)
    _2D_window = paddle.matmul(_1D_window, paddle.transpose(
        _1D_window, [1, 0])).unsqueeze([0, 1])
    window = paddle.expand(_2D_window, [channel, 1, window_size, window_size])
    return window


def _ssim(img1, img2, window, window_size, channel, size_average=True):
    mu1 = F.conv2d(img1, window, padding=window_size // 2, groups=channel)
    mu2 = F.conv2d(img2, window, padding=window_size // 2, groups=channel)

    mu1_sq = mu1.pow(2)
    mu2_sq = mu2.pow(2)
    mu1_mu2 = mu1 * mu2

    sigma1_sq = F.conv2d(
        img1 * img1, window, padding=window_size // 2, groups=channel) - mu1_sq
    sigma2_sq = F.conv2d(
        img2 * img2, window, padding=window_size // 2, groups=channel) - mu2_sq
    sigma12 = F.conv2d(
        img1 * img2, window, padding=window_size // 2, groups=channel) - mu1_mu2

    C1 = 0.01**2
    C2 = 0.03**2

    ssim_map = ((2 * mu1_mu2 + C1) * (2 * sigma12 + C2)) \
             / ((mu1_sq + mu2_sq + C1) * (sigma1_sq + sigma2_sq + C2))

    if size_average:
        return ssim_map.mean()
    else:
        return ssim_map.mean(1).mean(1).mean(1)


def ssim(img1, img2, window_size=11, size_average=True):
    (_, channel, _, _) = img1.shape
    window = create_window(window_size, channel)
    return _ssim(img1, img2, window, window_size, channel, size_average)


def weighted_mean(input, weight):
    """Weighted mean. It can also be used as masked mean.

    Parameters
    -----------
    input : Tensor 
        The input tensor.
    weight : Tensor
        The weight tensor with broadcastable shape with the input.

    Returns
    ----------
    Tensor [shape=(1,)]
        Weighted mean tensor with the same dtype as input.
    """
    weight = paddle.cast(weight, input.dtype)
    broadcast_ratio = input.size / weight.size
    return paddle.sum(input * weight) / (paddle.sum(weight) * broadcast_ratio)


def masked_l1_loss(prediction, target, mask):
    """Compute maksed L1 loss.

    Parameters
    ----------
    prediction : Tensor
        The prediction.
    target : Tensor
        The target. The shape should be broadcastable to ``prediction``.
    mask : Tensor
        The mask. The shape should be broadcatable to the broadcasted shape of
        ``prediction`` and ``target``.

    Returns
    -------
    Tensor [shape=(1,)]
        The masked L1 loss.
    """
    abs_error = F.l1_loss(prediction, target, reduction='none')
    loss = weighted_mean(abs_error, mask)
    return loss
merge parakeet repo into deepspeech 3 years ago			`# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.`
			`#`
			`# Licensed under the Apache License, Version 2.0 (the "License");`
			`# you may not use this file except in compliance with the License.`
			`# You may obtain a copy of the License at`
			`#`
			`# http://www.apache.org/licenses/LICENSE-2.0`
			`#`
			`# Unless required by applicable law or agreed to in writing, software`
			`# distributed under the License is distributed on an "AS IS" BASIS,`
			`# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`# See the License for the specific language governing permissions and`
			`# limitations under the License.`
refactor encoder, rm old code 3 years ago			`import math`

merge parakeet repo into deepspeech 3 years ago			`import paddle`
refactor encoder, rm old code 3 years ago			`from paddle import nn`
merge parakeet repo into deepspeech 3 years ago			`from paddle.fluid.layers import sequence_mask`
			`from paddle.nn import functional as F`
refactor encoder, rm old code 3 years ago			`from scipy import signal`
merge parakeet repo into deepspeech 3 years ago

refactor encoder, rm old code 3 years ago			`# Loss for Tacotron2`
merge parakeet repo into deepspeech 3 years ago			`def attention_guide(dec_lens, enc_lens, N, T, g, dtype=None):`
			`"""Build that W matrix. shape(B, T_dec, T_enc)`
			`W[i, n, t] = 1 - exp(-(n/dec_lens[i] - t/enc_lens[i])2 / (2g2))`

			`See also:`
			`Tachibana, Hideyuki, Katsuya Uenoyama, and Shunsuke Aihara. 2017. “Efficiently Trainable Text-to-Speech System Based on Deep Convolutional Networks with Guided Attention.” ArXiv:1710.08969 [Cs, Eess], October. http://arxiv.org/abs/1710.08969.`
			`"""`
			`dtype = dtype or paddle.get_default_dtype()`
			`dec_pos = paddle.arange(0, N).astype(dtype) / dec_lens.unsqueeze(`
			`-1) # n/N # shape(B, T_dec)`
			`enc_pos = paddle.arange(0, T).astype(dtype) / enc_lens.unsqueeze(`
			`-1) # t/T # shape(B, T_enc)`
			`W = 1 - paddle.exp(-(dec_pos.unsqueeze(-1) - enc_pos.unsqueeze(1))**2 /`
			`(2 * g**2))`

			`dec_mask = sequence_mask(dec_lens, maxlen=N)`
			`enc_mask = sequence_mask(enc_lens, maxlen=T)`
			`mask = dec_mask.unsqueeze(-1) * enc_mask.unsqueeze(1)`
			`mask = paddle.cast(mask, W.dtype)`

			`W *= mask`
			`return W`


			`def guided_attention_loss(attention_weight, dec_lens, enc_lens, g):`
			`"""Guided attention loss, masked to excluded padding parts."""`
			`_, N, T = attention_weight.shape`
			`W = attention_guide(dec_lens, enc_lens, N, T, g, attention_weight.dtype)`

			`total_tokens = (dec_lens * enc_lens).astype(W.dtype)`
			`loss = paddle.mean(paddle.sum(W * attention_weight, [1, 2]) / total_tokens)`
			`return loss`


refactor encoder, rm old code 3 years ago			`# Losses for GAN Vocoder`
			`def stft(x,`
			`fft_size,`
			`hop_length=None,`
			`win_length=None,`
			`window='hann',`
			`center=True,`
			`pad_mode='reflect'):`
			`"""Perform STFT and convert to magnitude spectrogram.`
			`Parameters`
			`----------`
			`x : Tensor`
			`Input signal tensor (B, T).`
			`fft_size : int`
			`FFT size.`
			`hop_size : int`
			`Hop size.`
			`win_length : int`
			`window : str, optional`
			`window : str`
			Name of window function, see `scipy.signal.get_window` for more
			`details. Defaults to "hann".`
			`center : bool, optional`
			center (bool, optional): Whether to pad `x` to make that the
			:math:`t \times hop\_length` at the center of :math:`t`-th frame. Default: `True`.
			`pad_mode : str, optional`
			Choose padding pattern when `center` is `True`.
			`Returns`
			`----------`
			`Tensor:`
			`Magnitude spectrogram (B, #frames, fft_size // 2 + 1).`
			`"""`
			`# calculate window`
			`window = signal.get_window(window, win_length, fftbins=True)`
			`window = paddle.to_tensor(window)`
			`x_stft = paddle.signal.stft(`
			`x,`
			`fft_size,`
			`hop_length,`
			`win_length,`
			`window=window,`
			`center=center,`
			`pad_mode=pad_mode)`

			`real = x_stft.real()`
			`imag = x_stft.imag()`

			`return paddle.sqrt(paddle.clip(real2 + imag2, min=1e-7)).transpose(`
			`[0, 2, 1])`


			`class SpectralConvergenceLoss(nn.Layer):`
			`"""Spectral convergence loss module."""`

			`def __init__(self):`
			`"""Initilize spectral convergence loss module."""`
			`super().__init__()`

			`def forward(self, x_mag, y_mag):`
			`"""Calculate forward propagation.`
			`Parameters`
			`----------`
			`x_mag : Tensor`
			`Magnitude spectrogram of predicted signal (B, #frames, #freq_bins).`
			`y_mag : Tensor)`
			`Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins).`
			`Returns`
			`----------`
			`Tensor`
			`Spectral convergence loss value.`
			`"""`
			`return paddle.norm(`
			`y_mag - x_mag, p="fro") / paddle.clip(`
			`paddle.norm(y_mag, p="fro"), min=1e-10)`


			`class LogSTFTMagnitudeLoss(nn.Layer):`
			`"""Log STFT magnitude loss module."""`

			`def __init__(self, epsilon=1e-7):`
			`"""Initilize los STFT magnitude loss module."""`
			`super().__init__()`
			`self.epsilon = epsilon`

			`def forward(self, x_mag, y_mag):`
			`"""Calculate forward propagation.`
			`Parameters`
			`----------`
			`x_mag : Tensor`
			`Magnitude spectrogram of predicted signal (B, #frames, #freq_bins).`
			`y_mag : Tensor`
			`Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins).`
			`Returns`
			`----------`
			`Tensor`
			`Log STFT magnitude loss value.`
			`"""`
			`return F.l1_loss(`
			`paddle.log(paddle.clip(y_mag, min=self.epsilon)),`
			`paddle.log(paddle.clip(x_mag, min=self.epsilon)))`


			`class STFTLoss(nn.Layer):`
			`"""STFT loss module."""`

			`def __init__(self,`
			`fft_size=1024,`
			`shift_size=120,`
			`win_length=600,`
			`window="hann"):`
			`"""Initialize STFT loss module."""`
			`super().__init__()`
			`self.fft_size = fft_size`
			`self.shift_size = shift_size`
			`self.win_length = win_length`
			`self.window = window`
			`self.spectral_convergence_loss = SpectralConvergenceLoss()`
			`self.log_stft_magnitude_loss = LogSTFTMagnitudeLoss()`

			`def forward(self, x, y):`
			`"""Calculate forward propagation.`
			`Parameters`
			`----------`
			`x : Tensor`
			`Predicted signal (B, T).`
			`y : Tensor`
			`Groundtruth signal (B, T).`
			`Returns`
			`----------`
			`Tensor`
			`Spectral convergence loss value.`
			`Tensor`
			`Log STFT magnitude loss value.`
			`"""`
			`x_mag = stft(x, self.fft_size, self.shift_size, self.win_length,`
			`self.window)`
			`y_mag = stft(y, self.fft_size, self.shift_size, self.win_length,`
			`self.window)`
			`sc_loss = self.spectral_convergence_loss(x_mag, y_mag)`
			`mag_loss = self.log_stft_magnitude_loss(x_mag, y_mag)`

			`return sc_loss, mag_loss`


			`class MultiResolutionSTFTLoss(nn.Layer):`
			`"""Multi resolution STFT loss module."""`

			`def __init__(`
			`self,`
			`fft_sizes=[1024, 2048, 512],`
			`hop_sizes=[120, 240, 50],`
			`win_lengths=[600, 1200, 240],`
			`window="hann", ):`
			`"""Initialize Multi resolution STFT loss module.`
			`Parameters`
			`----------`
			`fft_sizes : list`
			`List of FFT sizes.`
			`hop_sizes : list`
			`List of hop sizes.`
			`win_lengths : list`
			`List of window lengths.`
			`window : str`
			`Window function type.`
			`"""`
			`super().__init__()`
			`assert len(fft_sizes) == len(hop_sizes) == len(win_lengths)`
			`self.stft_losses = nn.LayerList()`
			`for fs, ss, wl in zip(fft_sizes, hop_sizes, win_lengths):`
			`self.stft_losses.append(STFTLoss(fs, ss, wl, window))`

			`def forward(self, x, y):`
			`"""Calculate forward propagation.`
			`Parameters`
			`----------`
			`x : Tensor`
			`Predicted signal (B, T) or (B, #subband, T).`
			`y : Tensor`
			`Groundtruth signal (B, T) or (B, #subband, T).`
			`Returns`
			`----------`
			`Tensor`
			`Multi resolution spectral convergence loss value.`
			`Tensor`
			`Multi resolution log STFT magnitude loss value.`
			`"""`
			`if len(x.shape) == 3:`
			`# (B, C, T) -> (B x C, T)`
			`x = x.reshape([-1, x.shape[2]])`
			`# (B, C, T) -> (B x C, T)`
			`y = y.reshape([-1, y.shape[2]])`
			`sc_loss = 0.0`
			`mag_loss = 0.0`
			`for f in self.stft_losses:`
			`sc_l, mag_l = f(x, y)`
			`sc_loss += sc_l`
			`mag_loss += mag_l`
			`sc_loss /= len(self.stft_losses)`
			`mag_loss /= len(self.stft_losses)`

			`return sc_loss, mag_loss`


			`class GeneratorAdversarialLoss(nn.Layer):`
			`"""Generator adversarial loss module."""`

			`def __init__(`
			`self,`
			`average_by_discriminators=True,`
			`loss_type="mse", ):`
			`"""Initialize GeneratorAversarialLoss module."""`
			`super().__init__()`
			`self.average_by_discriminators = average_by_discriminators`
			`assert loss_type in ["mse", "hinge"], f"{loss_type} is not supported."`
			`if loss_type == "mse":`
			`self.criterion = self._mse_loss`
			`else:`
			`self.criterion = self._hinge_loss`

			`def forward(self, outputs):`
			`"""Calcualate generator adversarial loss.`
			`Parameters`
			`----------`
			`outputs: Tensor or List`
			`Discriminator outputs or list of discriminator outputs.`
			`Returns`
			`----------`
			`Tensor`
			`Generator adversarial loss value.`
			`"""`
			`if isinstance(outputs, (tuple, list)):`
			`adv_loss = 0.0`
			`for i, outputs_ in enumerate(outputs):`
			`if isinstance(outputs_, (tuple, list)):`
			`# case including feature maps`
			`outputs_ = outputs_[-1]`
			`adv_loss += self.criterion(outputs_)`
			`if self.average_by_discriminators:`
			`adv_loss /= i + 1`
			`else:`
			`adv_loss = self.criterion(outputs)`

			`return adv_loss`

			`def _mse_loss(self, x):`
			`return F.mse_loss(x, paddle.ones_like(x))`

			`def _hinge_loss(self, x):`
			`return -x.mean()`


			`class DiscriminatorAdversarialLoss(nn.Layer):`
			`"""Discriminator adversarial loss module."""`

			`def __init__(`
			`self,`
			`average_by_discriminators=True,`
			`loss_type="mse", ):`
			`"""Initialize DiscriminatorAversarialLoss module."""`
			`super().__init__()`
			`self.average_by_discriminators = average_by_discriminators`
			`assert loss_type in ["mse"], f"{loss_type} is not supported."`
			`if loss_type == "mse":`
			`self.fake_criterion = self._mse_fake_loss`
			`self.real_criterion = self._mse_real_loss`

			`def forward(self, outputs_hat, outputs):`
			`"""Calcualate discriminator adversarial loss.`
			`Parameters`
			`----------`
			`outputs_hat : Tensor or list`
			`Discriminator outputs or list of`
			`discriminator outputs calculated from generator outputs.`
			`outputs : Tensor or list`
			`Discriminator outputs or list of`
			`discriminator outputs calculated from groundtruth.`
			`Returns`
			`----------`
			`Tensor`
			`Discriminator real loss value.`
			`Tensor`
			`Discriminator fake loss value.`
			`"""`
			`if isinstance(outputs, (tuple, list)):`
			`real_loss = 0.0`
			`fake_loss = 0.0`
			`for i, (outputs_hat_,`
			`outputs_) in enumerate(zip(outputs_hat, outputs)):`
			`if isinstance(outputs_hat_, (tuple, list)):`
			`# case including feature maps`
			`outputs_hat_ = outputs_hat_[-1]`
			`outputs_ = outputs_[-1]`
			`real_loss += self.real_criterion(outputs_)`
			`fake_loss += self.fake_criterion(outputs_hat_)`
			`if self.average_by_discriminators:`
			`fake_loss /= i + 1`
			`real_loss /= i + 1`
			`else:`
			`real_loss = self.real_criterion(outputs)`
			`fake_loss = self.fake_criterion(outputs_hat)`

			`return real_loss, fake_loss`

			`def _mse_real_loss(self, x):`
			`return F.mse_loss(x, paddle.ones_like(x))`

			`def _mse_fake_loss(self, x):`
			`return F.mse_loss(x, paddle.zeros_like(x))`


			`# Losses for SpeedySpeech`
			`# Structural Similarity Index Measure (SSIM)`
			`def gaussian(window_size, sigma):`
			`gauss = paddle.to_tensor([`
			`math.exp(-(x - window_size // 2)*2 / float(2 sigma**2))`
			`for x in range(window_size)`
			`])`
			`return gauss / gauss.sum()`


			`def create_window(window_size, channel):`
			`_1D_window = gaussian(window_size, 1.5).unsqueeze(1)`
			`_2D_window = paddle.matmul(_1D_window, paddle.transpose(`
			`_1D_window, [1, 0])).unsqueeze([0, 1])`
			`window = paddle.expand(_2D_window, [channel, 1, window_size, window_size])`
			`return window`


			`def _ssim(img1, img2, window, window_size, channel, size_average=True):`
			`mu1 = F.conv2d(img1, window, padding=window_size // 2, groups=channel)`
			`mu2 = F.conv2d(img2, window, padding=window_size // 2, groups=channel)`

			`mu1_sq = mu1.pow(2)`
			`mu2_sq = mu2.pow(2)`
			`mu1_mu2 = mu1 * mu2`

			`sigma1_sq = F.conv2d(`
			`img1 * img1, window, padding=window_size // 2, groups=channel) - mu1_sq`
			`sigma2_sq = F.conv2d(`
			`img2 * img2, window, padding=window_size // 2, groups=channel) - mu2_sq`
			`sigma12 = F.conv2d(`
			`img1 * img2, window, padding=window_size // 2, groups=channel) - mu1_mu2`

			`C1 = 0.01**2`
			`C2 = 0.03**2`

			`ssim_map = ((2 * mu1_mu2 + C1) * (2 * sigma12 + C2)) \`
			`/ ((mu1_sq + mu2_sq + C1) * (sigma1_sq + sigma2_sq + C2))`

			`if size_average:`
			`return ssim_map.mean()`
			`else:`
			`return ssim_map.mean(1).mean(1).mean(1)`


			`def ssim(img1, img2, window_size=11, size_average=True):`
			`(_, channel, _, _) = img1.shape`
			`window = create_window(window_size, channel)`
			`return _ssim(img1, img2, window, window_size, channel, size_average)`


merge parakeet repo into deepspeech 3 years ago			`def weighted_mean(input, weight):`
			`"""Weighted mean. It can also be used as masked mean.`

			`Parameters`
			`-----------`
			`input : Tensor`
			`The input tensor.`
			`weight : Tensor`
			`The weight tensor with broadcastable shape with the input.`

			`Returns`
			`----------`
			`Tensor [shape=(1,)]`
			`Weighted mean tensor with the same dtype as input.`
			`"""`
			`weight = paddle.cast(weight, input.dtype)`
			`broadcast_ratio = input.size / weight.size`
			`return paddle.sum(input * weight) / (paddle.sum(weight) * broadcast_ratio)`


			`def masked_l1_loss(prediction, target, mask):`
			`"""Compute maksed L1 loss.`

			`Parameters`
			`----------`
			`prediction : Tensor`
			`The prediction.`
			`target : Tensor`
			The target. The shape should be broadcastable to ``prediction``.
			`mask : Tensor`
			`The mask. The shape should be broadcatable to the broadcasted shape of`
			``prediction`` and ``target``.

			`Returns`
			`-------`
			`Tensor [shape=(1,)]`
			`The masked L1 loss.`
			`"""`
			`abs_error = F.l1_loss(prediction, target, reduction='none')`
			`loss = weighted_mean(abs_error, mask)`
			`return loss`