PaddleSpeech/paddlespeech/t2s/modules/transformer/encoder.py

# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Modified from espnet(https://github.com/espnet/espnet)
from typing import List
from typing import Union

from paddle import nn

from paddlespeech.t2s.modules.activation import get_activation
from paddlespeech.t2s.modules.conformer.convolution import ConvolutionModule
from paddlespeech.t2s.modules.conformer.encoder_layer import EncoderLayer as ConformerEncoderLayer
from paddlespeech.t2s.modules.layer_norm import LayerNorm
from paddlespeech.t2s.modules.transformer.attention import MultiHeadedAttention
from paddlespeech.t2s.modules.transformer.attention import RelPositionMultiHeadedAttention
from paddlespeech.t2s.modules.transformer.embedding import PositionalEncoding
from paddlespeech.t2s.modules.transformer.embedding import RelPositionalEncoding
from paddlespeech.t2s.modules.transformer.embedding import ScaledPositionalEncoding
from paddlespeech.t2s.modules.transformer.encoder_layer import EncoderLayer
from paddlespeech.t2s.modules.transformer.multi_layer_conv import Conv1dLinear
from paddlespeech.t2s.modules.transformer.multi_layer_conv import MultiLayeredConv1d
from paddlespeech.t2s.modules.transformer.positionwise_feed_forward import PositionwiseFeedForward
from paddlespeech.t2s.modules.transformer.repeat import repeat
from paddlespeech.t2s.modules.transformer.subsampling import Conv2dSubsampling


class BaseEncoder(nn.Layer):
    """Base Encoder module.

    Args:
        idim (int): Input dimension.
        attention_dim (int): Dimention of attention.
        attention_heads (int): The number of heads of multi head attention.
        linear_units (int): The number of units of position-wise feed forward.
        num_blocks (int): The number of decoder blocks.
        dropout_rate (float): Dropout rate.
        positional_dropout_rate (float): Dropout rate after adding positional encoding.
        attention_dropout_rate (float): Dropout rate in attention.
        input_layer (Union[str, nn.Layer]): Input layer type.
        normalize_before (bool): Whether to use layer_norm before the first block.
        concat_after (bool): Whether to concat attention layer's input and output.
            if True, additional linear will be applied.
            i.e. x -> x + linear(concat(x, att(x)))
            if False, no additional linear will be applied. i.e. x -> x + att(x)
        positionwise_layer_type (str): "linear", "conv1d", or "conv1d-linear".
        positionwise_conv_kernel_size (int): Kernel size of positionwise conv1d layer.
        macaron_style (bool): Whether to use macaron style for positionwise layer.
        pos_enc_layer_type (str): Encoder positional encoding layer type.
        selfattention_layer_type (str): Encoder attention layer type.
        activation_type (str): Encoder activation function type.
        use_cnn_module (bool): Whether to use convolution module.
        zero_triu (bool): Whether to zero the upper triangular part of attention matrix.
        cnn_module_kernel (int): Kernerl size of convolution module.
        padding_idx (int): Padding idx for input_layer=embed.
        stochastic_depth_rate (float): Maximum probability to skip the encoder layer.
        intermediate_layers (Union[List[int], None]): indices of intermediate CTC layer.
            indices start from 1.
            if not None, intermediate outputs are returned (which changes return type
            signature.)
        encoder_type (str): "transformer", or "conformer".
    """

    def __init__(self,
                 idim: int,
                 attention_dim: int=256,
                 attention_heads: int=4,
                 linear_units: int=2048,
                 num_blocks: int=6,
                 dropout_rate: float=0.1,
                 positional_dropout_rate: float=0.1,
                 attention_dropout_rate: float=0.0,
                 input_layer: str="conv2d",
                 normalize_before: bool=True,
                 concat_after: bool=False,
                 positionwise_layer_type: str="linear",
                 positionwise_conv_kernel_size: int=1,
                 macaron_style: bool=False,
                 pos_enc_layer_type: str="abs_pos",
                 selfattention_layer_type: str="selfattn",
                 activation_type: str="swish",
                 use_cnn_module: bool=False,
                 zero_triu: bool=False,
                 cnn_module_kernel: int=31,
                 padding_idx: int=-1,
                 stochastic_depth_rate: float=0.0,
                 intermediate_layers: Union[List[int], None]=None,
                 encoder_type: str="transformer"):
        """Construct an Base Encoder object."""
        super().__init__()
        activation = get_activation(activation_type)
        pos_enc_class = self.get_pos_enc_class(pos_enc_layer_type,
                                               selfattention_layer_type)
        self.encoder_type = encoder_type

        self.conv_subsampling_factor = 1
        self.embed = self.get_embed(
            idim=idim,
            input_layer=input_layer,
            attention_dim=attention_dim,
            pos_enc_class=pos_enc_class,
            dropout_rate=dropout_rate,
            positional_dropout_rate=positional_dropout_rate,
            padding_idx=padding_idx)

        self.normalize_before = normalize_before

        # self-attention module definition
        encoder_selfattn_layer, encoder_selfattn_layer_args = self.get_encoder_selfattn_layer(
            selfattention_layer_type=selfattention_layer_type,
            attention_heads=attention_heads,
            attention_dim=attention_dim,
            attention_dropout_rate=attention_dropout_rate,
            zero_triu=zero_triu,
            pos_enc_layer_type=pos_enc_layer_type)
        # feed-forward module definition
        positionwise_layer, positionwise_layer_args = self.get_positionwise_layer(
            positionwise_layer_type, attention_dim, linear_units, dropout_rate,
            positionwise_conv_kernel_size, activation)

        # convolution module definition
        convolution_layer = ConvolutionModule
        convolution_layer_args = (attention_dim, cnn_module_kernel, activation)

        if self.encoder_type == "transformer":
            self.encoders = repeat(
                num_blocks,
                lambda lnum: EncoderLayer(
                    attention_dim,
                    encoder_selfattn_layer(*encoder_selfattn_layer_args),
                    positionwise_layer(*positionwise_layer_args),
                    dropout_rate,
                    normalize_before,
                    concat_after, ), )

        elif self.encoder_type == "conformer":
            self.encoders = repeat(
                num_blocks,
                lambda lnum: ConformerEncoderLayer(
                    attention_dim,
                    encoder_selfattn_layer(*encoder_selfattn_layer_args),
                    positionwise_layer(*positionwise_layer_args),
                    positionwise_layer(*positionwise_layer_args) if macaron_style else None,
                    convolution_layer(*convolution_layer_args) if use_cnn_module else None,
                    dropout_rate,
                    normalize_before,
                    concat_after,
                    stochastic_depth_rate * float(1 + lnum) / num_blocks, ), )
            self.intermediate_layers = intermediate_layers
        else:
            raise NotImplementedError("Support only linear or conv1d.")

        if self.normalize_before:
            self.after_norm = LayerNorm(attention_dim)

    def get_positionwise_layer(self,
                               positionwise_layer_type: str="linear",
                               attention_dim: int=256,
                               linear_units: int=2048,
                               dropout_rate: float=0.1,
                               positionwise_conv_kernel_size: int=1,
                               activation: nn.Layer=nn.ReLU()):
        """Define positionwise layer."""
        if positionwise_layer_type == "linear":
            positionwise_layer = PositionwiseFeedForward
            positionwise_layer_args = (attention_dim, linear_units,
                                       dropout_rate, activation)
        elif positionwise_layer_type == "conv1d":
            positionwise_layer = MultiLayeredConv1d
            positionwise_layer_args = (attention_dim, linear_units,
                                       positionwise_conv_kernel_size,
                                       dropout_rate, )
        elif positionwise_layer_type == "conv1d-linear":
            positionwise_layer = Conv1dLinear
            positionwise_layer_args = (attention_dim, linear_units,
                                       positionwise_conv_kernel_size,
                                       dropout_rate, )
        else:
            raise NotImplementedError("Support only linear or conv1d.")
        return positionwise_layer, positionwise_layer_args

    def get_encoder_selfattn_layer(self,
                                   selfattention_layer_type: str="selfattn",
                                   attention_heads: int=4,
                                   attention_dim: int=256,
                                   attention_dropout_rate: float=0.0,
                                   zero_triu: bool=False,
                                   pos_enc_layer_type: str="abs_pos"):
        if selfattention_layer_type == "selfattn":
            encoder_selfattn_layer = MultiHeadedAttention
            encoder_selfattn_layer_args = (attention_heads, attention_dim,
                                           attention_dropout_rate, )
        elif selfattention_layer_type == "rel_selfattn":
            assert pos_enc_layer_type == "rel_pos"
            encoder_selfattn_layer = RelPositionMultiHeadedAttention
            encoder_selfattn_layer_args = (attention_heads, attention_dim,
                                           attention_dropout_rate, zero_triu, )
        else:
            raise ValueError("unknown encoder_attn_layer: " +
                             selfattention_layer_type)
        return encoder_selfattn_layer, encoder_selfattn_layer_args

    def get_pos_enc_class(self,
                          pos_enc_layer_type: str="abs_pos",
                          selfattention_layer_type: str="selfattn"):
        if pos_enc_layer_type == "abs_pos":
            pos_enc_class = PositionalEncoding
        elif pos_enc_layer_type == "scaled_abs_pos":
            pos_enc_class = ScaledPositionalEncoding
        elif pos_enc_layer_type == "rel_pos":
            assert selfattention_layer_type == "rel_selfattn"
            pos_enc_class = RelPositionalEncoding
        else:
            raise ValueError("unknown pos_enc_layer: " + pos_enc_layer_type)
        return pos_enc_class

    def get_embed(self,
                  idim,
                  input_layer="conv2d",
                  attention_dim: int=256,
                  pos_enc_class=PositionalEncoding,
                  dropout_rate: int=0.1,
                  positional_dropout_rate: int=0.1,
                  padding_idx: int=-1):

        if input_layer == "linear":
            embed = nn.Sequential(
                nn.Linear(idim, attention_dim),
                nn.LayerNorm(attention_dim),
                nn.Dropout(dropout_rate),
                nn.ReLU(),
                pos_enc_class(attention_dim, positional_dropout_rate), )
        elif input_layer == "conv2d":
            embed = Conv2dSubsampling(
                idim,
                attention_dim,
                dropout_rate,
                pos_enc_class(attention_dim, positional_dropout_rate), )
            self.conv_subsampling_factor = 4
        elif input_layer == "embed":
            embed = nn.Sequential(
                nn.Embedding(idim, attention_dim, padding_idx=padding_idx),
                pos_enc_class(attention_dim, positional_dropout_rate), )
        elif isinstance(input_layer, nn.Layer):
            embed = nn.Sequential(
                input_layer,
                pos_enc_class(attention_dim, positional_dropout_rate), )
        elif input_layer is None:
            embed = nn.Sequential(
                pos_enc_class(attention_dim, positional_dropout_rate))
        else:
            raise ValueError("unknown input_layer: " + input_layer)

        return embed

    def forward(self, xs, masks):
        """Encode input sequence.

        Args:
            xs (Tensor): Input tensor (#batch, time, idim).
            masks (Tensor): Mask tensor (#batch, 1, time).

        Returns:
            Tensor: Output tensor (#batch, time, attention_dim).
            Tensor: Mask tensor (#batch, 1, time).
        """
        xs = self.embed(xs)
        xs, masks = self.encoders(xs, masks)
        if self.normalize_before:
            xs = self.after_norm(xs)
        return xs, masks


class TransformerEncoder(BaseEncoder):
    """Transformer encoder module.

    Args:
        idim (int): Input dimension.
        attention_dim (int): Dimention of attention.
        attention_heads (int): The number of heads of multi head attention.
        linear_units (int): The number of units of position-wise feed forward.
        num_blocks (int): The number of decoder blocks.
        dropout_rate (float): Dropout rate.
        positional_dropout_rate (float): Dropout rate after adding positional encoding.
        attention_dropout_rate (float): Dropout rate in attention.
        input_layer (Union[str, paddle.nn.Layer]): Input layer type.
        pos_enc_layer_type (str): Encoder positional encoding layer type.
        normalize_before (bool): Whether to use layer_norm before the first block.
        concat_after (bool): Whether to concat attention layer's input and output.
            if True, additional linear will be applied.
            i.e. x -> x + linear(concat(x, att(x)))
            if False, no additional linear will be applied. i.e. x -> x + att(x)
        positionwise_layer_type (str): "linear", "conv1d", or "conv1d-linear".
        positionwise_conv_kernel_size (int): Kernel size of positionwise conv1d layer.
        selfattention_layer_type (str): Encoder attention layer type.
        activation_type (str): Encoder activation function type.
        padding_idx (int): Padding idx for input_layer=embed.
    """

    def __init__(
            self,
            idim,
            attention_dim: int=256,
            attention_heads: int=4,
            linear_units: int=2048,
            num_blocks: int=6,
            dropout_rate: float=0.1,
            positional_dropout_rate: float=0.1,
            attention_dropout_rate: float=0.0,
            input_layer: str="conv2d",
            pos_enc_layer_type: str="abs_pos",
            normalize_before: bool=True,
            concat_after: bool=False,
            positionwise_layer_type: str="linear",
            positionwise_conv_kernel_size: int=1,
            selfattention_layer_type: str="selfattn",
            activation_type: str="relu",
            padding_idx: int=-1, ):
        """Construct an Transformer Encoder object."""
        super().__init__(
            idim,
            attention_dim=attention_dim,
            attention_heads=attention_heads,
            linear_units=linear_units,
            num_blocks=num_blocks,
            dropout_rate=dropout_rate,
            positional_dropout_rate=positional_dropout_rate,
            attention_dropout_rate=attention_dropout_rate,
            input_layer=input_layer,
            pos_enc_layer_type=pos_enc_layer_type,
            normalize_before=normalize_before,
            concat_after=concat_after,
            positionwise_layer_type=positionwise_layer_type,
            positionwise_conv_kernel_size=positionwise_conv_kernel_size,
            selfattention_layer_type=selfattention_layer_type,
            activation_type=activation_type,
            padding_idx=padding_idx,
            encoder_type="transformer")

    def forward(self, xs, masks):
        """Encode input sequence.

        Args:
            xs(Tensor): Input tensor (#batch, time, idim).
            masks(Tensor): Mask tensor (#batch, 1, time).

        Returns:
            Tensor: Output tensor (#batch, time, attention_dim).
            Tensor:Mask tensor (#batch, 1, time).
        """
        xs = self.embed(xs)
        xs, masks = self.encoders(xs, masks)
        if self.normalize_before:
            xs = self.after_norm(xs)
        return xs, masks

    def forward_one_step(self, xs, masks, cache=None):
        """Encode input frame.

        Args:
            xs (Tensor): Input tensor.
            masks (Tensor): Mask tensor.
            cache (List[Tensor]): List of cache tensors.

        Returns:
            Tensor: Output tensor.
            Tensor: Mask tensor.
            List[Tensor]: List of new cache tensors.
        """

        xs = self.embed(xs)
        if cache is None:
            cache = [None for _ in range(len(self.encoders))]
        new_cache = []
        for c, e in zip(cache, self.encoders):
            xs, masks = e(xs, masks, cache=c)
            new_cache.append(xs)
        if self.normalize_before:
            xs = self.after_norm(xs)
        return xs, masks, new_cache


class ConformerEncoder(BaseEncoder):
    """Conformer encoder module.

    Args:
        idim (int): Input dimension.
        attention_dim (int): Dimention of attention.
        attention_heads (int): The number of heads of multi head attention.
        linear_units (int): The number of units of position-wise feed forward.
        num_blocks (int): The number of decoder blocks.
        dropout_rate (float): Dropout rate.
        positional_dropout_rate (float): Dropout rate after adding positional encoding.
        attention_dropout_rate (float): Dropout rate in attention.
        input_layer (Union[str, nn.Layer]): Input layer type.
        normalize_before (bool): Whether to use layer_norm before the first block.
        concat_after (bool):Whether to concat attention layer's input and output.
            if True, additional linear will be applied.
            i.e. x -> x + linear(concat(x, att(x)))
            if False, no additional linear will be applied. i.e. x -> x + att(x)
        positionwise_layer_type (str): "linear", "conv1d", or "conv1d-linear".
        positionwise_conv_kernel_size (int): Kernel size of positionwise conv1d layer.
        macaron_style (bool): Whether to use macaron style for positionwise layer.
        pos_enc_layer_type (str): Encoder positional encoding layer type.
        selfattention_layer_type (str): Encoder attention layer type.
        activation_type (str): Encoder activation function type.
        use_cnn_module (bool): Whether to use convolution module.
        zero_triu (bool): Whether to zero the upper triangular part of attention matrix.
        cnn_module_kernel (int): Kernerl size of convolution module.
        padding_idx (int): Padding idx for input_layer=embed.
        stochastic_depth_rate (float): Maximum probability to skip the encoder layer.
        intermediate_layers (Union[List[int], None]):indices of intermediate CTC layer. indices start from 1.
            if not None, intermediate outputs are returned (which changes return type signature.)
    """

    def __init__(
            self,
            idim: int,
            attention_dim: int=256,
            attention_heads: int=4,
            linear_units: int=2048,
            num_blocks: int=6,
            dropout_rate: float=0.1,
            positional_dropout_rate: float=0.1,
            attention_dropout_rate: float=0.0,
            input_layer: str="conv2d",
            normalize_before: bool=True,
            concat_after: bool=False,
            positionwise_layer_type: str="linear",
            positionwise_conv_kernel_size: int=1,
            macaron_style: bool=False,
            pos_enc_layer_type: str="rel_pos",
            selfattention_layer_type: str="rel_selfattn",
            activation_type: str="swish",
            use_cnn_module: bool=False,
            zero_triu: bool=False,
            cnn_module_kernel: int=31,
            padding_idx: int=-1,
            stochastic_depth_rate: float=0.0,
            intermediate_layers: Union[List[int], None]=None, ):
        """Construct an Conformer Encoder object."""
        super().__init__(
            idim=idim,
            attention_dim=attention_dim,
            attention_heads=attention_heads,
            linear_units=linear_units,
            num_blocks=num_blocks,
            dropout_rate=dropout_rate,
            positional_dropout_rate=positional_dropout_rate,
            attention_dropout_rate=attention_dropout_rate,
            input_layer=input_layer,
            normalize_before=normalize_before,
            concat_after=concat_after,
            positionwise_layer_type=positionwise_layer_type,
            positionwise_conv_kernel_size=positionwise_conv_kernel_size,
            macaron_style=macaron_style,
            pos_enc_layer_type=pos_enc_layer_type,
            selfattention_layer_type=selfattention_layer_type,
            activation_type=activation_type,
            use_cnn_module=use_cnn_module,
            zero_triu=zero_triu,
            cnn_module_kernel=cnn_module_kernel,
            padding_idx=padding_idx,
            stochastic_depth_rate=stochastic_depth_rate,
            intermediate_layers=intermediate_layers,
            encoder_type="conformer")

    def forward(self, xs, masks):
        """Encode input sequence.

        Args:
            xs (Tensor): Input tensor (#batch, time, idim).
            masks (Tensor): Mask tensor (#batch, 1, time).
        Returns:
            Tensor: Output tensor (#batch, time, attention_dim).
            Tensor: Mask tensor (#batch, 1, time).
        """
        if isinstance(self.embed, (Conv2dSubsampling)):
            xs, masks = self.embed(xs, masks)
        else:
            xs = self.embed(xs)

        if self.intermediate_layers is None:
            xs, masks = self.encoders(xs, masks)
        else:
            intermediate_outputs = []
            for layer_idx, encoder_layer in enumerate(self.encoders):
                xs, masks = encoder_layer(xs, masks)

                if (self.intermediate_layers is not None and
                        layer_idx + 1 in self.intermediate_layers):
                    # intermediate branches also require normalization.
                    encoder_output = xs
                    if isinstance(encoder_output, tuple):
                        encoder_output = encoder_output[0]
                        if self.normalize_before:
                            encoder_output = self.after_norm(encoder_output)
                    intermediate_outputs.append(encoder_output)

        if isinstance(xs, tuple):
            xs = xs[0]

        if self.normalize_before:
            xs = self.after_norm(xs)

        if self.intermediate_layers is not None:
            return xs, masks, intermediate_outputs
        return xs, masks


class Conv1dResidualBlock(nn.Layer):
    """
    Special module for simplified version of Encoder class.
    """

    def __init__(self,
                 idim: int=256,
                 odim: int=256,
                 kernel_size: int=5,
                 dropout_rate: float=0.2):
        super().__init__()
        self.main_block = nn.Sequential(
            nn.Conv1D(
                idim, odim, kernel_size=kernel_size, padding=kernel_size // 2),
            nn.ReLU(),
            nn.BatchNorm1D(odim),
            nn.Dropout(p=dropout_rate))
        self.conv1d_residual = nn.Conv1D(idim, odim, kernel_size=1)

    def forward(self, xs):
        """Encode input sequence.
        Args:
            xs (Tensor): Input tensor (#batch, idim, T).
        Returns:
            Tensor: Output tensor (#batch, odim, T).
        """
        outputs = self.main_block(xs)
        outputs = self.conv1d_residual(xs) + outputs
        return outputs


class CNNDecoder(nn.Layer):
    """
    Much simplified decoder than the original one with Prenet.
    """

    def __init__(
            self,
            emb_dim: int=256,
            odim: int=80,
            kernel_size: int=5,
            dropout_rate: float=0.2,
            resblock_kernel_sizes: List[int]=[256, 256], ):

        super().__init__()

        input_shape = emb_dim
        out_sizes = resblock_kernel_sizes
        out_sizes.append(out_sizes[-1])

        in_sizes = [input_shape] + out_sizes[:-1]
        self.residual_blocks = nn.LayerList([
            Conv1dResidualBlock(
                idim=in_channels,
                odim=out_channels,
                kernel_size=kernel_size,
                dropout_rate=dropout_rate, )
            for in_channels, out_channels in zip(in_sizes, out_sizes)
        ])
        self.conv1d = nn.Conv1D(
            in_channels=out_sizes[-1], out_channels=odim, kernel_size=1)

    def forward(self, xs, masks=None):
        """Encode input sequence.
        Args:
            xs (Tensor): Input tensor (#batch, time, idim).
            masks (Tensor): Mask tensor (#batch, 1, time).
        Returns:
            Tensor: Output tensor (#batch, time, odim).
        """
        # exchange the temporal dimension and the feature dimension
        xs = xs.transpose([0, 2, 1])
        if masks is not None:
            xs = xs * masks

        for layer in self.residual_blocks:
            outputs = layer(xs)
            if masks is not None:
                # input_mask B * 1 * T
                outputs = outputs * masks
            xs = outputs
        outputs = self.conv1d(outputs)
        if masks is not None:
            outputs = outputs * masks
        outputs = outputs.transpose([0, 2, 1])
        return outputs, masks


class CNNPostnet(nn.Layer):
    def __init__(
            self,
            odim: int=80,
            kernel_size: int=5,
            dropout_rate: float=0.2,
            resblock_kernel_sizes: List[int]=[256, 256], ):
        super().__init__()
        out_sizes = resblock_kernel_sizes
        in_sizes = [odim] + out_sizes[:-1]
        self.residual_blocks = nn.LayerList([
            Conv1dResidualBlock(
                idim=in_channels,
                odim=out_channels,
                kernel_size=kernel_size,
                dropout_rate=dropout_rate)
            for in_channels, out_channels in zip(in_sizes, out_sizes)
        ])
        self.conv1d = nn.Conv1D(
            in_channels=out_sizes[-1], out_channels=odim, kernel_size=1)

    def forward(self, xs, masks=None):
        """Encode input sequence.
        Args:
            xs (Tensor): Input tensor (#batch, odim, time).
            masks (Tensor): Mask tensor (#batch, 1, time).
        Returns:
            Tensor: Output tensor (#batch, odim, time).
        """
        for layer in self.residual_blocks:
            outputs = layer(xs)
            if masks is not None:
                # input_mask B * 1 * T
                outputs = outputs * masks
            xs = outputs
        outputs = self.conv1d(outputs)
        if masks is not None:
            outputs = outputs * masks
        return outputs