# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
StarGAN v2
Copyright (c) 2020-present NAVER Corp.
This work is licensed under the Creative Commons Attribution-NonCommercial
4.0 International License. To view a copy of this license, visit
http://creativecommons.org/licenses/by-nc/4.0/ or send a letter to
Creative Commons, PO Box 1866, Mountain View, CA 94042, USA.
"""
import math

import paddle
import paddle.nn.functional as F
from paddle import nn


class DownSample(nn.Layer):
    def __init__(self, layer_type: str):
        super().__init__()
        self.layer_type = layer_type

    def forward(self, x: paddle.Tensor):
        """Calculate forward propagation.

        Args:
            x(Tensor(float32)): Shape (B, dim_in, n_mels, T).
        Returns:
            Tensor:
                layer_type == 'none': Shape (B, dim_in, n_mels, T)
                layer_type == 'timepreserve': Shape (B, dim_in, n_mels // 2, T)
                layer_type == 'half': Shape (B, dim_in, n_mels // 2, T // 2)
        """
        if self.layer_type == 'none':
            return x
        elif self.layer_type == 'timepreserve':
            out = F.avg_pool2d(x, (2, 1))
            return out
        elif self.layer_type == 'half':
            out = F.avg_pool2d(x, 2)
            return out
        else:
            raise RuntimeError(
                'Got unexpected downsample type %s, expected is [none, timepreserve, half]'
                % self.layer_type)


class UpSample(nn.Layer):
    def __init__(self, layer_type: str):
        super().__init__()
        self.layer_type = layer_type

    def forward(self, x: paddle.Tensor):
        """Calculate forward propagation.

        Args:
            x(Tensor(float32)): Shape (B, dim_in, n_mels, T).
        Returns:
            Tensor:
                layer_type == 'none': Shape (B, dim_in, n_mels, T)
                layer_type == 'timepreserve': Shape (B, dim_in, n_mels * 2, T)
                layer_type == 'half': Shape (B, dim_in, n_mels * 2, T * 2)
        """
        if self.layer_type == 'none':
            return x
        elif self.layer_type == 'timepreserve':
            out = F.interpolate(x, scale_factor=(2, 1), mode='nearest')
            return out
        elif self.layer_type == 'half':
            out = F.interpolate(x, scale_factor=2, mode='nearest')
            return out
        else:
            raise RuntimeError(
                'Got unexpected upsample type %s, expected is [none, timepreserve, half]'
                % self.layer_type)


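# Usage sketch (illustrative only, not part of the original module): with a
# batch of feature maps shaped (B, C, n_mels, T), 'half' halves / doubles both
# spatial axes while 'timepreserve' only touches the mel axis.
#
#     x = paddle.randn([4, 48, 80, 96])
#     DownSample('half')(x).shape                     # [4, 48, 40, 48]
#     DownSample('timepreserve')(x).shape             # [4, 48, 40, 96]
#     UpSample('half')(DownSample('half')(x)).shape   # back to [4, 48, 80, 96]

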
class ResBlk(nn.Layer):
    def __init__(self,
                 dim_in: int,
                 dim_out: int,
                 actv: nn.LeakyReLU=nn.LeakyReLU(0.2),
                 normalize: bool=False,
                 downsample: str='none'):
        super().__init__()
        self.actv = actv
        self.normalize = normalize
        self.downsample = DownSample(layer_type=downsample)
        self.learned_sc = dim_in != dim_out
        self._build_weights(dim_in, dim_out)

    def _build_weights(self, dim_in: int, dim_out: int):
        self.conv1 = nn.Conv2D(
            in_channels=dim_in,
            out_channels=dim_in,
            kernel_size=3,
            stride=1,
            padding=1)
        self.conv2 = nn.Conv2D(
            in_channels=dim_in,
            out_channels=dim_out,
            kernel_size=3,
            stride=1,
            padding=1)
        if self.normalize:
            self.norm1 = nn.InstanceNorm2D(dim_in)
            self.norm2 = nn.InstanceNorm2D(dim_in)
        if self.learned_sc:
            self.conv1x1 = nn.Conv2D(
                in_channels=dim_in,
                out_channels=dim_out,
                kernel_size=1,
                stride=1,
                padding=0,
                bias_attr=False)

    def _shortcut(self, x: paddle.Tensor):
        if self.learned_sc:
            x = self.conv1x1(x)
        if self.downsample:
            x = self.downsample(x)
        return x

    def _residual(self, x: paddle.Tensor):
        if self.normalize:
            x = self.norm1(x)
        x = self.actv(x)
        x = self.conv1(x)
        x = self.downsample(x)
        if self.normalize:
            x = self.norm2(x)
        x = self.actv(x)
        x = self.conv2(x)
        return x

    def forward(self, x: paddle.Tensor):
        """Calculate forward propagation.

        Args:
            x(Tensor(float32)): Shape (B, dim_in, n_mels, T).
        Returns:
            Tensor:
                downsample == 'none': Shape (B, dim_out, n_mels, T).
                downsample == 'timepreserve': Shape (B, dim_out, n_mels // 2, T).
                downsample == 'half': Shape (B, dim_out, n_mels // 2, T // 2).
        """
        x = self._shortcut(x) + self._residual(x)
        # unit variance
        out = x / math.sqrt(2)
        return out


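# Usage sketch (illustrative only, not part of the original module): a
# pre-activation residual block that can change both channel count and
# resolution.
#
#     blk = ResBlk(dim_in=48, dim_out=96, normalize=True, downsample='half')
#     x = paddle.randn([4, 48, 80, 96])
#     blk(x).shape  # [4, 96, 40, 48]

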
class AdaIN(nn.Layer):
    def __init__(self, style_dim: int, num_features: int):
        super().__init__()
        self.norm = nn.InstanceNorm2D(
            num_features=num_features, weight_attr=False, bias_attr=False)
        self.fc = nn.Linear(style_dim, num_features * 2)

    def forward(self, x: paddle.Tensor, s: paddle.Tensor):
        """Calculate forward propagation.

        Args:
            x(Tensor(float32)): Shape (B, num_features, n_mels, T).
            s(Tensor(float32)): Style vector. Shape (B, style_dim) or (style_dim, ).
        Returns:
            Tensor:
                Shape (B, num_features, n_mels, T).
        """
        if len(s.shape) == 1:
            s = s[None]
        h = self.fc(s)
        h = h.reshape((h.shape[0], h.shape[1], 1, 1))
        gamma, beta = paddle.split(h, 2, axis=1)
        out = (1 + gamma) * self.norm(x) + beta
        return out


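# Usage sketch (illustrative only, not part of the original module): the style
# vector is projected to per-channel (gamma, beta) pairs that modulate the
# instance-normalized feature map.
#
#     adain = AdaIN(style_dim=48, num_features=64)
#     x = paddle.randn([4, 64, 80, 96])
#     s = paddle.randn([4, 48])
#     adain(x, s).shape  # [4, 64, 80, 96]

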
class AdainResBlk(nn.Layer):
    def __init__(self,
                 dim_in: int,
                 dim_out: int,
                 style_dim: int=64,
                 w_hpf: int=0,
                 actv: nn.Layer=nn.LeakyReLU(0.2),
                 upsample: str='none'):
        super().__init__()
        self.w_hpf = w_hpf
        self.actv = actv
        self.upsample = UpSample(layer_type=upsample)
        self.learned_sc = dim_in != dim_out
        self._build_weights(dim_in, dim_out, style_dim)
        self.layer_type = upsample

    def _build_weights(self, dim_in: int, dim_out: int, style_dim: int=64):
        self.conv1 = nn.Conv2D(
            in_channels=dim_in,
            out_channels=dim_out,
            kernel_size=3,
            stride=1,
            padding=1)
        self.conv2 = nn.Conv2D(
            in_channels=dim_out,
            out_channels=dim_out,
            kernel_size=3,
            stride=1,
            padding=1)
        self.norm1 = AdaIN(style_dim=style_dim, num_features=dim_in)
        self.norm2 = AdaIN(style_dim=style_dim, num_features=dim_out)
        if self.learned_sc:
            self.conv1x1 = nn.Conv2D(
                in_channels=dim_in,
                out_channels=dim_out,
                kernel_size=1,
                stride=1,
                padding=0,
                bias_attr=False)

    def _shortcut(self, x: paddle.Tensor):
        x = self.upsample(x)
        if self.learned_sc:
            x = self.conv1x1(x)
        return x

    def _residual(self, x: paddle.Tensor, s: paddle.Tensor):
        x = self.norm1(x, s)
        x = self.actv(x)
        x = self.upsample(x)
        x = self.conv1(x)
        x = self.norm2(x, s)
        x = self.actv(x)
        x = self.conv2(x)
        return x

    def forward(self, x: paddle.Tensor, s: paddle.Tensor):
        """Calculate forward propagation.

        Args:
            x(Tensor(float32)):
                Shape (B, dim_in, n_mels, T).
            s(Tensor(float32)):
                Style vector. Shape (B, style_dim) or (style_dim, ).
        Returns:
            Tensor:
                upsample == 'none': Shape (B, dim_out, n_mels, T).
                upsample == 'timepreserve': Shape (B, dim_out, n_mels * 2, T).
                upsample == 'half': Shape (B, dim_out, n_mels * 2, T * 2).
        """
        out = self._residual(x, s)
        if self.w_hpf == 0:
            out = (out + self._shortcut(x)) / math.sqrt(2)
        return out


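# Usage sketch (illustrative only, not part of the original module): a
# style-conditioned residual block; with w_hpf == 0 the shortcut branch is
# averaged in, otherwise only the residual branch is returned.
#
#     blk = AdainResBlk(dim_in=96, dim_out=48, style_dim=48, w_hpf=0, upsample='half')
#     x = paddle.randn([4, 96, 40, 48])
#     s = paddle.randn([4, 48])
#     blk(x, s).shape  # [4, 48, 80, 96]

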
class HighPass(nn.Layer):
    def __init__(self, w_hpf: int):
        super().__init__()
        # fixed 3x3 high-pass (Laplacian-like) kernel, scaled by 1 / w_hpf
        self.filter = paddle.to_tensor([[-1, -1, -1], [-1, 8., -1],
                                        [-1, -1, -1]]) / w_hpf

    def forward(self, x: paddle.Tensor):
        # apply the kernel depthwise: one copy of the filter per input channel
        filter = self.filter.unsqueeze(0).unsqueeze(1).tile(
            [x.shape[1], 1, 1, 1])
        out = F.conv2d(x, filter, padding=1, groups=x.shape[1])
        return out


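# Usage sketch (illustrative only, not part of the original module): the
# filter keeps the spatial size because of the 1-pixel padding.
#
#     hpf = HighPass(w_hpf=1)
#     feats = paddle.randn([2, 48, 80, 96])
#     hpf(feats).shape  # [2, 48, 80, 96]

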
class Generator(nn.Layer):
    def __init__(self,
                 dim_in: int=48,
                 style_dim: int=48,
                 max_conv_dim: int=48 * 8,
                 w_hpf: int=1,
                 F0_channel: int=0):
        super().__init__()

        self.stem = nn.Conv2D(
            in_channels=1,
            out_channels=dim_in,
            kernel_size=3,
            stride=1,
            padding=1)
        self.encode = nn.LayerList()
        self.decode = nn.LayerList()
        self.to_out = nn.Sequential(
            nn.InstanceNorm2D(dim_in),
            nn.LeakyReLU(0.2),
            nn.Conv2D(
                in_channels=dim_in,
                out_channels=1,
                kernel_size=1,
                stride=1,
                padding=0))
        self.F0_channel = F0_channel
        # down/up-sampling blocks
        # int(np.log2(img_size)) - 4
        repeat_num = 4
        if w_hpf > 0:
            repeat_num += 1

        for lid in range(repeat_num):
            if lid in [1, 3]:
                _downtype = 'timepreserve'
            else:
                _downtype = 'half'

            dim_out = min(dim_in * 2, max_conv_dim)
            self.encode.append(
                ResBlk(
                    dim_in=dim_in,
                    dim_out=dim_out,
                    normalize=True,
                    downsample=_downtype))
            # build the decoder stack-like: the block mirroring the first
            # encoder stage ends up last in the decoder
            up_block = AdainResBlk(
                dim_in=dim_out,
                dim_out=dim_in,
                style_dim=style_dim,
                w_hpf=w_hpf,
                upsample=_downtype)
            if lid == 0:
                self.decode.append(up_block)
            else:
                self.decode.insert(0, up_block)
            dim_in = dim_out
        # bottleneck blocks (encoder)
        for _ in range(2):
            self.encode.append(
                ResBlk(dim_in=dim_out, dim_out=dim_out, normalize=True))
        # F0 blocks
        if F0_channel != 0:
            self.decode.insert(0,
                               AdainResBlk(
                                   dim_in=dim_out + int(F0_channel / 2),
                                   dim_out=dim_out,
                                   style_dim=style_dim,
                                   w_hpf=w_hpf))
        # bottleneck blocks (decoder)
        for _ in range(2):
            self.decode.insert(0,
                               AdainResBlk(
                                   dim_in=dim_out + int(F0_channel / 2),
                                   dim_out=dim_out + int(F0_channel / 2),
                                   style_dim=style_dim,
                                   w_hpf=w_hpf))
        if F0_channel != 0:
            self.F0_conv = nn.Sequential(
                ResBlk(
                    dim_in=F0_channel,
                    dim_out=int(F0_channel / 2),
                    normalize=True,
                    downsample="half"), )
        if w_hpf > 0:
            self.hpf = HighPass(w_hpf)

    def forward(self,
                x: paddle.Tensor,
                s: paddle.Tensor,
                masks: paddle.Tensor=None,
                F0: paddle.Tensor=None):
        """Calculate forward propagation.

        Args:
            x(Tensor(float32)):
                Input mel-spectrogram. Shape (B, 1, n_mels, T).
            s(Tensor(float32)):
                Style vector. Shape (B, style_dim).
            masks:
                None.
            F0(Tensor(float32)):
                Shape (B, num_features(256), n_mels // 8, T).
        Returns:
            Tensor:
                output of generator. Shape (B, 1, n_mels, T // 4 * 4)
        """
        x = self.stem(x)
        cache = {}
        # output: (B, max_conv_dim, n_mels // 16, T // 4)
        for block in self.encode:
            if (masks is not None) and (x.shape[2] in [32, 64, 128]):
                cache[x.shape[2]] = x
            x = block(x)

        if F0 is not None:
            # input: (B, num_features(256), n_mels // 8, T)
            # output: (B, num_features(256) // 2, n_mels // 16, T // 2)
            F0 = self.F0_conv(F0)
            # output: (B, num_features(256) // 2, n_mels // 16, T // 4)
            F0 = F.adaptive_avg_pool2d(F0, [x.shape[-2], x.shape[-1]])
            x = paddle.concat([x, F0], axis=1)

        # input: (B, max_conv_dim + num_features(256) // 2, n_mels // 16, T // 4)
        # output: (B, dim_in, n_mels, T // 4 * 4)
        for block in self.decode:
            x = block(x, s)
            if (masks is not None) and (x.shape[2] in [32, 64, 128]):
                mask = masks[0] if x.shape[2] in [32] else masks[1]
                mask = F.interpolate(mask, size=x.shape[2], mode='bilinear')
                x = x + self.hpf(mask * cache[x.shape[2]])

        out = self.to_out(x)
        return out


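# Usage sketch (illustrative only, not part of the original module; the
# argument values below are assumptions chosen for a small example, not
# recipe defaults). With w_hpf=0 the mask/high-pass branch is unused, so
# masks can stay None.
#
#     gen = Generator(dim_in=48, style_dim=48, max_conv_dim=384, w_hpf=0, F0_channel=256)
#     x = paddle.randn([2, 1, 80, 96])     # batch of mel-spectrograms
#     s = paddle.randn([2, 48])            # style vectors
#     f0 = paddle.randn([2, 256, 10, 96])  # F0 feature maps
#     gen(x, s, F0=f0).shape               # [2, 1, 80, 96]

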
class MappingNetwork(nn.Layer):
    def __init__(self,
                 latent_dim: int=16,
                 style_dim: int=48,
                 num_domains: int=2,
                 hidden_dim: int=384):
        super().__init__()
        layers = []
        layers += [nn.Linear(latent_dim, hidden_dim)]
        layers += [nn.ReLU()]
        for _ in range(3):
            layers += [nn.Linear(hidden_dim, hidden_dim)]
            layers += [nn.ReLU()]
        self.shared = nn.Sequential(*layers)

        self.unshared = nn.LayerList()
        for _ in range(num_domains):
            self.unshared.extend([
                nn.Sequential(
                    nn.Linear(hidden_dim, hidden_dim),
                    nn.ReLU(),
                    nn.Linear(hidden_dim, hidden_dim),
                    nn.ReLU(),
                    nn.Linear(hidden_dim, hidden_dim),
                    nn.ReLU(), nn.Linear(hidden_dim, style_dim))
            ])

    def forward(self, z: paddle.Tensor, y: paddle.Tensor):
        """Calculate forward propagation.

        Args:
            z(Tensor(float32)):
                Latent code. Shape (B, latent_dim).
            y(Tensor(int64)):
                speaker label. Shape (B, ).
        Returns:
            Tensor:
                Style vector of the selected domain. Shape (B, style_dim).
        """
        h = self.shared(z)
        out = []
        for layer in self.unshared:
            out += [layer(h)]
        # (B, num_domains, style_dim)
        out = paddle.stack(out, axis=1)
        idx = paddle.arange(y.shape[0])
        # (B, style_dim)
        s = out[idx, y]
        return s


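# Usage sketch (illustrative only, not part of the original module): the
# mapping network turns a random latent code into one style vector per domain
# and then picks the row matching each sample's target speaker label.
#
#     mapping = MappingNetwork(latent_dim=16, style_dim=48, num_domains=2)
#     z = paddle.randn([4, 16])
#     y = paddle.randint(0, 2, [4])
#     mapping(z, y).shape  # [4, 48]

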
class StyleEncoder(nn.Layer):
    def __init__(self,
                 dim_in: int=48,
                 style_dim: int=48,
                 num_domains: int=2,
                 max_conv_dim: int=384):
        super().__init__()
        blocks = []
        blocks += [
            nn.Conv2D(
                in_channels=1,
                out_channels=dim_in,
                kernel_size=3,
                stride=1,
                padding=1)
        ]
        repeat_num = 4
        for _ in range(repeat_num):
            dim_out = min(dim_in * 2, max_conv_dim)
            blocks += [
                ResBlk(dim_in=dim_in, dim_out=dim_out, downsample='half')
            ]
            dim_in = dim_out

        blocks += [nn.LeakyReLU(0.2)]
        blocks += [
            nn.Conv2D(
                in_channels=dim_out,
                out_channels=dim_out,
                kernel_size=5,
                stride=1,
                padding=0)
        ]
        blocks += [nn.AdaptiveAvgPool2D(1)]
        blocks += [nn.LeakyReLU(0.2)]
        self.shared = nn.Sequential(*blocks)
        self.unshared = nn.LayerList()
        for _ in range(num_domains):
            self.unshared.append(nn.Linear(dim_out, style_dim))

    def forward(self, x: paddle.Tensor, y: paddle.Tensor):
        """Calculate forward propagation.

        Args:
            x(Tensor(float32)):
                Input mel-spectrogram. Shape (B, 1, n_mels, T).
            y(Tensor(int64)):
                speaker label. Shape (B, ).
        Returns:
            Tensor:
                Style vector of the selected domain. Shape (B, style_dim).
        """
        h = self.shared(x)
        h = h.reshape((h.shape[0], -1))
        out = []
        for layer in self.unshared:
            out += [layer(h)]
        # (B, num_domains, style_dim)
        out = paddle.stack(out, axis=1)
        idx = paddle.arange(y.shape[0])
        # (B, style_dim)
        s = out[idx, y]
        return s


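# Usage sketch (illustrative only, not part of the original module): unlike
# MappingNetwork, the style vector here is extracted from a reference
# mel-spectrogram instead of a random latent code.
#
#     enc = StyleEncoder(dim_in=48, style_dim=48, num_domains=2)
#     x = paddle.randn([4, 1, 80, 96])
#     y = paddle.randint(0, 2, [4])
#     enc(x, y).shape  # [4, 48]

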
class Discriminator(nn.Layer):
    def __init__(self,
                 dim_in: int=48,
                 num_domains: int=2,
                 max_conv_dim: int=384,
                 repeat_num: int=4):
        super().__init__()
        # real/fake discriminator
        self.dis = Discriminator2D(
            dim_in=dim_in,
            num_domains=num_domains,
            max_conv_dim=max_conv_dim,
            repeat_num=repeat_num)
        # adversarial classifier
        self.cls = Discriminator2D(
            dim_in=dim_in,
            num_domains=num_domains,
            max_conv_dim=max_conv_dim,
            repeat_num=repeat_num)
        self.num_domains = num_domains

    def forward(self, x: paddle.Tensor, y: paddle.Tensor):
        out = self.dis(x, y)
        return out

    def classifier(self, x: paddle.Tensor):
        out = self.cls.get_feature(x)
        return out


class Discriminator2D(nn.Layer):
    def __init__(self,
                 dim_in: int=48,
                 num_domains: int=2,
                 max_conv_dim: int=384,
                 repeat_num: int=4):
        super().__init__()
        blocks = []
        blocks += [
            nn.Conv2D(
                in_channels=1,
                out_channels=dim_in,
                kernel_size=3,
                stride=1,
                padding=1)
        ]

        for _ in range(repeat_num):
            dim_out = min(dim_in * 2, max_conv_dim)
            blocks += [ResBlk(dim_in, dim_out, downsample='half')]
            dim_in = dim_out

        blocks += [nn.LeakyReLU(0.2)]
        blocks += [
            nn.Conv2D(
                in_channels=dim_out,
                out_channels=dim_out,
                kernel_size=5,
                stride=1,
                padding=0)
        ]
        blocks += [nn.LeakyReLU(0.2)]
        blocks += [nn.AdaptiveAvgPool2D(1)]
        blocks += [
            nn.Conv2D(
                in_channels=dim_out,
                out_channels=num_domains,
                kernel_size=1,
                stride=1,
                padding=0)
        ]
        self.main = nn.Sequential(*blocks)

    def get_feature(self, x: paddle.Tensor):
        out = self.main(x)
        # (B, num_domains)
        out = out.reshape((out.shape[0], -1))
        return out

    def forward(self, x: paddle.Tensor, y: paddle.Tensor):
        out = self.get_feature(x)
        idx = paddle.arange(y.shape[0])
        # select the logit of each sample's own domain -> (B, )
        out = out[idx, y]
        return out
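

if __name__ == "__main__":
    # Minimal smoke test (illustrative sketch, not part of the original
    # module): the shapes and hyper-parameters below are assumptions chosen
    # to keep the example small, not values from a PaddleSpeech recipe.
    d = Discriminator(dim_in=48, num_domains=2, max_conv_dim=384, repeat_num=4)
    x = paddle.randn([4, 1, 80, 96])  # fake batch of mel-spectrograms
    y = paddle.randint(0, 2, [4])     # one domain label per sample
    print(d(x, y).shape)              # real/fake logit per sample: [4]
    print(d.classifier(x).shape)      # per-domain logits: [4, 2]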