PaddleSpeech/paddlespeech/t2s/models/vits/text_encoder.py

# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Text encoder module in VITS.

This code is based on https://github.com/jaywalnut310/vits.

"""
import math
from typing import Tuple

import paddle
from paddle import nn

from paddlespeech.t2s.modules.nets_utils import make_non_pad_mask
from paddlespeech.t2s.modules.transformer.encoder import ConformerEncoder as Encoder


class TextEncoder(nn.Layer):
    """Text encoder module in VITS.

    This is a module of text encoder described in `Conditional Variational Autoencoder
    with Adversarial Learning for End-to-End Text-to-Speech`_.

    Instead of the relative positional Transformer, we use conformer architecture as
    the encoder module, which contains additional convolution layers.

    .. _`Conditional Variational Autoencoder with Adversarial Learning for End-to-End
        Text-to-Speech`: https://arxiv.org/abs/2006.04558

    """

    def __init__(
            self,
            vocabs: int,
            attention_dim: int=192,
            attention_heads: int=2,
            linear_units: int=768,
            blocks: int=6,
            positionwise_layer_type: str="conv1d",
            positionwise_conv_kernel_size: int=3,
            positional_encoding_layer_type: str="rel_pos",
            self_attention_layer_type: str="rel_selfattn",
            activation_type: str="swish",
            normalize_before: bool=True,
            use_macaron_style: bool=False,
            use_conformer_conv: bool=False,
            conformer_kernel_size: int=7,
            dropout_rate: float=0.1,
            positional_dropout_rate: float=0.0,
            attention_dropout_rate: float=0.0, ):
        """Initialize TextEncoder module.

        Args:
            vocabs (int): Vocabulary size.
            attention_dim (int): Attention dimension.
            attention_heads (int): Number of attention heads.
            linear_units (int): Number of linear units of positionwise layers.
            blocks (int): Number of encoder blocks.
            positionwise_layer_type (str): Positionwise layer type.
            positionwise_conv_kernel_size (int): Positionwise layer's kernel size.
            positional_encoding_layer_type (str): Positional encoding layer type.
            self_attention_layer_type (str): Self-attention layer type.
            activation_type (str): Activation function type.
            normalize_before (bool): Whether to apply LayerNorm before attention.
            use_macaron_style (bool): Whether to use macaron style components.
            use_conformer_conv (bool): Whether to use conformer conv layers.
            conformer_kernel_size (int): Conformer's conv kernel size.
            dropout_rate (float): Dropout rate.
            positional_dropout_rate (float): Dropout rate for positional encoding.
            attention_dropout_rate (float): Dropout rate for attention.

        """
        super().__init__()
        # store for forward
        self.attention_dim = attention_dim

        # define modules
        self.emb = nn.Embedding(vocabs, attention_dim)

        dist = paddle.distribution.Normal(loc=0.0, scale=attention_dim**-0.5)
        w = dist.sample(self.emb.weight.shape)
        self.emb.weight.set_value(w)

        self.encoder = Encoder(
            idim=-1,
            input_layer=None,
            attention_dim=attention_dim,
            attention_heads=attention_heads,
            linear_units=linear_units,
            num_blocks=blocks,
            dropout_rate=dropout_rate,
            positional_dropout_rate=positional_dropout_rate,
            attention_dropout_rate=attention_dropout_rate,
            normalize_before=normalize_before,
            positionwise_layer_type=positionwise_layer_type,
            positionwise_conv_kernel_size=positionwise_conv_kernel_size,
            macaron_style=use_macaron_style,
            pos_enc_layer_type=positional_encoding_layer_type,
            selfattention_layer_type=self_attention_layer_type,
            activation_type=activation_type,
            use_cnn_module=use_conformer_conv,
            cnn_module_kernel=conformer_kernel_size, )
        self.proj = nn.Conv1D(attention_dim, attention_dim * 2, 1)

    def forward(
            self,
            x: paddle.Tensor,
            x_lengths: paddle.Tensor,
    ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor, paddle.Tensor]:
        """Calculate forward propagation.

        Args:
            x (Tensor): Input index tensor (B, T_text).
            x_lengths (Tensor): Length tensor (B,).

        Returns:
            Tensor: Encoded hidden representation (B, attention_dim, T_text).
            Tensor: Projected mean tensor (B, attention_dim, T_text).
            Tensor: Projected scale tensor (B, attention_dim, T_text).
            Tensor: Mask tensor for input tensor (B, 1, T_text).

        """
        x = self.emb(x) * math.sqrt(self.attention_dim)
        x_mask = make_non_pad_mask(x_lengths).unsqueeze(1)
        # encoder assume the channel last (B, T_text, attention_dim)
        # but mask shape shoud be (B, 1, T_text)
        x, _ = self.encoder(x, x_mask)

        # convert the channel first (B, attention_dim, T_text)
        x = paddle.transpose(x, [0, 2, 1])
        stats = self.proj(x) * x_mask
        m, logs = paddle.split(stats, 2, axis=1)

        return x, m, logs, x_mask
add vits network scripts, test=tts 2 years ago			`# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.`
			`#`
			`# Licensed under the Apache License, Version 2.0 (the "License");`
			`# you may not use this file except in compliance with the License.`
			`# You may obtain a copy of the License at`
			`#`
			`# http://www.apache.org/licenses/LICENSE-2.0`
			`#`
			`# Unless required by applicable law or agreed to in writing, software`
			`# distributed under the License is distributed on an "AS IS" BASIS,`
			`# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`# See the License for the specific language governing permissions and`
			`# limitations under the License.`
			`"""Text encoder module in VITS.`

			`This code is based on https://github.com/jaywalnut310/vits.`

			`"""`
			`import math`
			`from typing import Tuple`

			`import paddle`
			`from paddle import nn`

			`from paddlespeech.t2s.modules.nets_utils import make_non_pad_mask`
			`from paddlespeech.t2s.modules.transformer.encoder import ConformerEncoder as Encoder`


			`class TextEncoder(nn.Layer):`
			`"""Text encoder module in VITS.`

			This is a module of text encoder described in `Conditional Variational Autoencoder
			with Adversarial Learning for End-to-End Text-to-Speech`_.

			`Instead of the relative positional Transformer, we use conformer architecture as`
			`the encoder module, which contains additional convolution layers.`

			.. _`Conditional Variational Autoencoder with Adversarial Learning for End-to-End
			Text-to-Speech`: https://arxiv.org/abs/2006.04558

			`"""`

			`def __init__(`
			`self,`
			`vocabs: int,`
			`attention_dim: int=192,`
			`attention_heads: int=2,`
			`linear_units: int=768,`
			`blocks: int=6,`
			`positionwise_layer_type: str="conv1d",`
			`positionwise_conv_kernel_size: int=3,`
			`positional_encoding_layer_type: str="rel_pos",`
			`self_attention_layer_type: str="rel_selfattn",`
			`activation_type: str="swish",`
			`normalize_before: bool=True,`
			`use_macaron_style: bool=False,`
			`use_conformer_conv: bool=False,`
			`conformer_kernel_size: int=7,`
			`dropout_rate: float=0.1,`
			`positional_dropout_rate: float=0.0,`
			`attention_dropout_rate: float=0.0, ):`
			`"""Initialize TextEncoder module.`

			`Args:`
			`vocabs (int): Vocabulary size.`
			`attention_dim (int): Attention dimension.`
			`attention_heads (int): Number of attention heads.`
			`linear_units (int): Number of linear units of positionwise layers.`
			`blocks (int): Number of encoder blocks.`
			`positionwise_layer_type (str): Positionwise layer type.`
			`positionwise_conv_kernel_size (int): Positionwise layer's kernel size.`
			`positional_encoding_layer_type (str): Positional encoding layer type.`
			`self_attention_layer_type (str): Self-attention layer type.`
			`activation_type (str): Activation function type.`
			`normalize_before (bool): Whether to apply LayerNorm before attention.`
			`use_macaron_style (bool): Whether to use macaron style components.`
			`use_conformer_conv (bool): Whether to use conformer conv layers.`
			`conformer_kernel_size (int): Conformer's conv kernel size.`
			`dropout_rate (float): Dropout rate.`
			`positional_dropout_rate (float): Dropout rate for positional encoding.`
			`attention_dropout_rate (float): Dropout rate for attention.`

			`"""`
			`super().__init__()`
			`# store for forward`
			`self.attention_dim = attention_dim`

			`# define modules`
			`self.emb = nn.Embedding(vocabs, attention_dim)`

			`dist = paddle.distribution.Normal(loc=0.0, scale=attention_dim**-0.5)`
			`w = dist.sample(self.emb.weight.shape)`
			`self.emb.weight.set_value(w)`

			`self.encoder = Encoder(`
			`idim=-1,`
			`input_layer=None,`
			`attention_dim=attention_dim,`
			`attention_heads=attention_heads,`
			`linear_units=linear_units,`
			`num_blocks=blocks,`
			`dropout_rate=dropout_rate,`
			`positional_dropout_rate=positional_dropout_rate,`
			`attention_dropout_rate=attention_dropout_rate,`
			`normalize_before=normalize_before,`
			`positionwise_layer_type=positionwise_layer_type,`
			`positionwise_conv_kernel_size=positionwise_conv_kernel_size,`
			`macaron_style=use_macaron_style,`
			`pos_enc_layer_type=positional_encoding_layer_type,`
			`selfattention_layer_type=self_attention_layer_type,`
			`activation_type=activation_type,`
			`use_cnn_module=use_conformer_conv,`
			`cnn_module_kernel=conformer_kernel_size, )`
			`self.proj = nn.Conv1D(attention_dim, attention_dim * 2, 1)`

			`def forward(`
			`self,`
			`x: paddle.Tensor,`
			`x_lengths: paddle.Tensor,`
			`) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor, paddle.Tensor]:`
			`"""Calculate forward propagation.`

			`Args:`
			`x (Tensor): Input index tensor (B, T_text).`
			`x_lengths (Tensor): Length tensor (B,).`

			`Returns:`
			`Tensor: Encoded hidden representation (B, attention_dim, T_text).`
			`Tensor: Projected mean tensor (B, attention_dim, T_text).`
			`Tensor: Projected scale tensor (B, attention_dim, T_text).`
			`Tensor: Mask tensor for input tensor (B, 1, T_text).`

			`"""`
			`x = self.emb(x) * math.sqrt(self.attention_dim)`
			`x_mask = make_non_pad_mask(x_lengths).unsqueeze(1)`
			`# encoder assume the channel last (B, T_text, attention_dim)`
			`# but mask shape shoud be (B, 1, T_text)`
			`x, _ = self.encoder(x, x_mask)`

			`# convert the channel first (B, attention_dim, T_text)`
			`x = paddle.transpose(x, [0, 2, 1])`
			`stats = self.proj(x) * x_mask`
			`m, logs = paddle.split(stats, 2, axis=1)`

			`return x, m, logs, x_mask`