# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Dict
from typing import List
from typing import Optional

import paddle
from paddle import nn

from paddlespeech.t2s.modules.activation import get_activation
from paddlespeech.t2s.modules.conformer.convolution import ConvolutionModule
from paddlespeech.t2s.modules.conformer.encoder_layer import EncoderLayer
from paddlespeech.t2s.modules.layer_norm import LayerNorm
from paddlespeech.t2s.modules.masked_fill import masked_fill
from paddlespeech.t2s.modules.nets_utils import initialize
from paddlespeech.t2s.modules.tacotron2.decoder import Postnet
from paddlespeech.t2s.modules.transformer.attention import LegacyRelPositionMultiHeadedAttention
from paddlespeech.t2s.modules.transformer.attention import MultiHeadedAttention
from paddlespeech.t2s.modules.transformer.attention import RelPositionMultiHeadedAttention
from paddlespeech.t2s.modules.transformer.embedding import LegacyRelPositionalEncoding
from paddlespeech.t2s.modules.transformer.embedding import PositionalEncoding
from paddlespeech.t2s.modules.transformer.embedding import RelPositionalEncoding
from paddlespeech.t2s.modules.transformer.embedding import ScaledPositionalEncoding
from paddlespeech.t2s.modules.transformer.multi_layer_conv import Conv1dLinear
from paddlespeech.t2s.modules.transformer.multi_layer_conv import MultiLayeredConv1d
from paddlespeech.t2s.modules.transformer.positionwise_feed_forward import PositionwiseFeedForward
from paddlespeech.t2s.modules.transformer.repeat import repeat
from paddlespeech.t2s.modules.transformer.subsampling import Conv2dSubsampling


# MLM -> Masked Language Model
class mySequential(nn.Sequential):
    def forward(self, *inputs):
        for module in self._sub_layers.values():
            if isinstance(inputs, tuple):
                inputs = module(*inputs)
            else:
                inputs = module(inputs)
        return inputs


class MaskInputLayer(nn.Layer):
    def __init__(self, out_features: int) -> None:
        super().__init__()
        self.mask_feature = paddle.create_parameter(
            shape=(1, 1, out_features),
            dtype=paddle.float32,
            default_initializer=paddle.nn.initializer.Assign(
                paddle.normal(shape=(1, 1, out_features))))

    def forward(self, input: paddle.Tensor,
                masked_pos: paddle.Tensor=None) -> paddle.Tensor:
        masked_pos = paddle.expand_as(paddle.unsqueeze(masked_pos, -1), input)
        masked_input = masked_fill(input, masked_pos, 0) + masked_fill(
            paddle.expand_as(self.mask_feature, input), ~masked_pos, 0)
        return masked_input
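
# A minimal usage sketch for ``MaskInputLayer`` (illustrative only, not
# executed at import time; the batch size, frame count, and 80-dim mel shape
# are assumptions):
#
#     layer = MaskInputLayer(out_features=80)
#     feats = paddle.randn([2, 100, 80])                 # (batch, frames, n_mels)
#     masked_pos = paddle.randint(0, 2, [2, 100]).astype('bool')
#     out = layer(feats, masked_pos)                     # masked frames are replaced
#                                                        # by the learned mask embedding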


class MLMEncoder(nn.Layer):
    """Conformer encoder module.

    Args:
        idim (int): Input dimension.
        attention_dim (int): Dimension of attention.
        attention_heads (int): The number of heads of multi head attention.
        linear_units (int): The number of units of position-wise feed forward.
        num_blocks (int): The number of encoder blocks.
        dropout_rate (float): Dropout rate.
        positional_dropout_rate (float): Dropout rate after adding positional encoding.
        attention_dropout_rate (float): Dropout rate in attention.
        input_layer (Union[str, paddle.nn.Layer]): Input layer type.
        normalize_before (bool): Whether to use layer_norm before the first block.
        concat_after (bool): Whether to concat attention layer's input and output.
            If True, an additional linear layer is applied,
            i.e. x -> x + linear(concat(x, att(x)));
            if False, no additional linear layer is applied, i.e. x -> x + att(x).
        positionwise_layer_type (str): "linear", "conv1d", or "conv1d-linear".
        positionwise_conv_kernel_size (int): Kernel size of positionwise conv1d layer.
        macaron_style (bool): Whether to use macaron style for positionwise layer.
        pos_enc_layer_type (str): Encoder positional encoding layer type.
        selfattention_layer_type (str): Encoder attention layer type.
        activation_type (str): Encoder activation function type.
        use_cnn_module (bool): Whether to use convolution module.
        zero_triu (bool): Whether to zero the upper triangular part of attention matrix.
        cnn_module_kernel (int): Kernel size of convolution module.
        padding_idx (int): Padding idx for input_layer=embed.
        stochastic_depth_rate (float): Maximum probability to skip the encoder layer.
    """

    def __init__(self,
                 idim: int,
                 vocab_size: int=0,
                 pre_speech_layer: int=0,
                 attention_dim: int=256,
                 attention_heads: int=4,
                 linear_units: int=2048,
                 num_blocks: int=6,
                 dropout_rate: float=0.1,
                 positional_dropout_rate: float=0.1,
                 attention_dropout_rate: float=0.0,
                 input_layer: str="conv2d",
                 normalize_before: bool=True,
                 concat_after: bool=False,
                 positionwise_layer_type: str="linear",
                 positionwise_conv_kernel_size: int=1,
                 macaron_style: bool=False,
                 pos_enc_layer_type: str="abs_pos",
                 pos_enc_class=None,
                 selfattention_layer_type: str="selfattn",
                 activation_type: str="swish",
                 use_cnn_module: bool=False,
                 zero_triu: bool=False,
                 cnn_module_kernel: int=31,
                 padding_idx: int=-1,
                 stochastic_depth_rate: float=0.0,
                 text_masking: bool=False):
        """Construct an Encoder object."""
        super().__init__()
        self._output_size = attention_dim
        self.text_masking = text_masking
        if self.text_masking:
            self.text_masking_layer = MaskInputLayer(attention_dim)
        activation = get_activation(activation_type)
        if pos_enc_layer_type == "abs_pos":
            pos_enc_class = PositionalEncoding
        elif pos_enc_layer_type == "scaled_abs_pos":
            pos_enc_class = ScaledPositionalEncoding
        elif pos_enc_layer_type == "rel_pos":
            assert selfattention_layer_type == "rel_selfattn"
            pos_enc_class = RelPositionalEncoding
        elif pos_enc_layer_type == "legacy_rel_pos":
            pos_enc_class = LegacyRelPositionalEncoding
            assert selfattention_layer_type == "legacy_rel_selfattn"
        else:
            raise ValueError("unknown pos_enc_layer: " + pos_enc_layer_type)
        self.conv_subsampling_factor = 1
        if input_layer == "linear":
            self.embed = nn.Sequential(
                nn.Linear(idim, attention_dim),
                nn.LayerNorm(attention_dim),
                nn.Dropout(dropout_rate),
                nn.ReLU(),
                pos_enc_class(attention_dim, positional_dropout_rate), )
        elif input_layer == "conv2d":
            self.embed = Conv2dSubsampling(
                idim,
                attention_dim,
                dropout_rate,
                pos_enc_class(attention_dim, positional_dropout_rate), )
            self.conv_subsampling_factor = 4
        elif input_layer == "embed":
            self.embed = nn.Sequential(
                nn.Embedding(idim, attention_dim, padding_idx=padding_idx),
                pos_enc_class(attention_dim, positional_dropout_rate), )
        elif input_layer == "mlm":
            self.segment_emb = None
            self.speech_embed = mySequential(
                MaskInputLayer(idim),
                nn.Linear(idim, attention_dim),
                nn.LayerNorm(attention_dim),
                nn.ReLU(),
                pos_enc_class(attention_dim, positional_dropout_rate))
            self.text_embed = nn.Sequential(
                nn.Embedding(
                    vocab_size, attention_dim, padding_idx=padding_idx),
                pos_enc_class(attention_dim, positional_dropout_rate), )
        elif input_layer == "sega_mlm":
            self.segment_emb = nn.Embedding(
                500, attention_dim, padding_idx=padding_idx)
            self.speech_embed = mySequential(
                MaskInputLayer(idim),
                nn.Linear(idim, attention_dim),
                nn.LayerNorm(attention_dim),
                nn.ReLU(),
                pos_enc_class(attention_dim, positional_dropout_rate))
            self.text_embed = nn.Sequential(
                nn.Embedding(
                    vocab_size, attention_dim, padding_idx=padding_idx),
                pos_enc_class(attention_dim, positional_dropout_rate), )
        elif isinstance(input_layer, nn.Layer):
            self.embed = nn.Sequential(
                input_layer,
                pos_enc_class(attention_dim, positional_dropout_rate), )
        elif input_layer is None:
            self.embed = nn.Sequential(
                pos_enc_class(attention_dim, positional_dropout_rate))
        else:
            raise ValueError("unknown input_layer: " + input_layer)
        self.normalize_before = normalize_before

        # self-attention module definition
        if selfattention_layer_type == "selfattn":
            encoder_selfattn_layer = MultiHeadedAttention
            encoder_selfattn_layer_args = (attention_heads, attention_dim,
                                           attention_dropout_rate, )
        elif selfattention_layer_type == "legacy_rel_selfattn":
            assert pos_enc_layer_type == "legacy_rel_pos"
            encoder_selfattn_layer = LegacyRelPositionMultiHeadedAttention
            encoder_selfattn_layer_args = (attention_heads, attention_dim,
                                           attention_dropout_rate, )
        elif selfattention_layer_type == "rel_selfattn":
            assert pos_enc_layer_type == "rel_pos"
            encoder_selfattn_layer = RelPositionMultiHeadedAttention
            encoder_selfattn_layer_args = (attention_heads, attention_dim,
                                           attention_dropout_rate, zero_triu, )
        else:
            raise ValueError("unknown encoder_attn_layer: " +
                             selfattention_layer_type)

        # feed-forward module definition
        if positionwise_layer_type == "linear":
            positionwise_layer = PositionwiseFeedForward
            positionwise_layer_args = (attention_dim, linear_units,
                                       dropout_rate, activation, )
        elif positionwise_layer_type == "conv1d":
            positionwise_layer = MultiLayeredConv1d
            positionwise_layer_args = (attention_dim, linear_units,
                                       positionwise_conv_kernel_size,
                                       dropout_rate, )
        elif positionwise_layer_type == "conv1d-linear":
            positionwise_layer = Conv1dLinear
            positionwise_layer_args = (attention_dim, linear_units,
                                       positionwise_conv_kernel_size,
                                       dropout_rate, )
        else:
            raise NotImplementedError(
                "Support only linear, conv1d, or conv1d-linear.")

        # convolution module definition
        convolution_layer = ConvolutionModule
        convolution_layer_args = (attention_dim, cnn_module_kernel, activation)

        self.encoders = repeat(
            num_blocks,
            lambda lnum: EncoderLayer(
                attention_dim,
                encoder_selfattn_layer(*encoder_selfattn_layer_args),
                positionwise_layer(*positionwise_layer_args),
                positionwise_layer(*positionwise_layer_args) if macaron_style else None,
                convolution_layer(*convolution_layer_args) if use_cnn_module else None,
                dropout_rate,
                normalize_before,
                concat_after,
                stochastic_depth_rate * float(1 + lnum) / num_blocks, ), )
        self.pre_speech_layer = pre_speech_layer
        self.pre_speech_encoders = repeat(
            self.pre_speech_layer,
            lambda lnum: EncoderLayer(
                attention_dim,
                encoder_selfattn_layer(*encoder_selfattn_layer_args),
                positionwise_layer(*positionwise_layer_args),
                positionwise_layer(*positionwise_layer_args) if macaron_style else None,
                convolution_layer(*convolution_layer_args) if use_cnn_module else None,
                dropout_rate,
                normalize_before,
                concat_after,
                stochastic_depth_rate * float(1 + lnum) / self.pre_speech_layer, ), )
        if self.normalize_before:
            self.after_norm = LayerNorm(attention_dim)
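
    # Illustrative construction of the speech+text MLM encoder, mirroring the
    # ErnieSAT encoder defaults further below (a sketch only; ``idim`` here is
    # the mel-spectrogram dimension and ``vocab_size`` the phone-vocabulary
    # size, both assumed values):
    #
    #     encoder = MLMEncoder(
    #         idim=80, vocab_size=100, input_layer='sega_mlm',
    #         attention_dim=384, attention_heads=2, linear_units=1536,
    #         num_blocks=4, pos_enc_layer_type='legacy_rel_pos',
    #         selfattention_layer_type='legacy_rel_selfattn',
    #         positionwise_layer_type='conv1d', positionwise_conv_kernel_size=3,
    #         macaron_style=True, use_cnn_module=True, cnn_module_kernel=7)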

    def forward(self,
                speech: paddle.Tensor,
                text: paddle.Tensor,
                masked_pos: paddle.Tensor,
                speech_mask: paddle.Tensor=None,
                text_mask: paddle.Tensor=None,
                speech_seg_pos: paddle.Tensor=None,
                text_seg_pos: paddle.Tensor=None):
        """Encode input sequence."""
        if masked_pos is not None:
            speech = self.speech_embed(speech, masked_pos)
        else:
            speech = self.speech_embed(speech)
        if text is not None:
            text = self.text_embed(text)
        if speech_seg_pos is not None and text_seg_pos is not None and self.segment_emb:
            speech_seg_emb = self.segment_emb(speech_seg_pos)
            text_seg_emb = self.segment_emb(text_seg_pos)
            text = (text[0] + text_seg_emb, text[1])
            speech = (speech[0] + speech_seg_emb, speech[1])
        if self.pre_speech_encoders:
            speech, _ = self.pre_speech_encoders(speech, speech_mask)

        if text is not None:
            xs = paddle.concat([speech[0], text[0]], axis=1)
            xs_pos_emb = paddle.concat([speech[1], text[1]], axis=1)
            masks = paddle.concat([speech_mask, text_mask], axis=-1)
        else:
            xs = speech[0]
            xs_pos_emb = speech[1]
            masks = speech_mask

        xs, masks = self.encoders((xs, xs_pos_emb), masks)

        if isinstance(xs, tuple):
            xs = xs[0]
        if self.normalize_before:
            xs = self.after_norm(xs)

        return xs, masks


class MLMDecoder(MLMEncoder):
    def forward(self, xs: paddle.Tensor, masks: paddle.Tensor):
        """Encode input sequence.

        Args:
            xs (paddle.Tensor): Input tensor (#batch, time, idim).
            masks (paddle.Tensor): Mask tensor (#batch, time).

        Returns:
            paddle.Tensor: Output tensor (#batch, time, attention_dim).
            paddle.Tensor: Mask tensor (#batch, time).
        """
        xs = self.embed(xs)
        xs, masks = self.encoders(xs, masks)

        if isinstance(xs, tuple):
            xs = xs[0]
        if self.normalize_before:
            xs = self.after_norm(xs)

        return xs, masks


# encoder and decoder are nn.Layer instances here, not type names (str)
class MLM(nn.Layer):
    def __init__(self,
                 odim: int,
                 encoder: nn.Layer,
                 decoder: Optional[nn.Layer],
                 postnet_layers: int=0,
                 postnet_chans: int=0,
                 postnet_filts: int=0,
                 text_masking: bool=False):
        super().__init__()
        self.odim = odim
        self.encoder = encoder
        self.decoder = decoder
        self.vocab_size = encoder.text_embed[0]._num_embeddings

        if self.decoder is None or not (hasattr(self.decoder, 'output_layer')
                                        and self.decoder.output_layer is not None):
            self.sfc = nn.Linear(self.encoder._output_size, odim)
        else:
            self.sfc = None
        if text_masking:
            self.text_sfc = nn.Linear(
                self.encoder.text_embed[0]._embedding_dim,
                self.vocab_size,
                weight_attr=self.encoder.text_embed[0]._weight_attr)
        else:
            self.text_sfc = None

        self.postnet = (None if postnet_layers == 0 else Postnet(
            idim=self.encoder._output_size,
            odim=odim,
            n_layers=postnet_layers,
            n_chans=postnet_chans,
            n_filts=postnet_filts,
            use_batch_norm=True,
            dropout_rate=0.5, ))
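
    # Head wiring above, for reference:
    #   - sfc (attention_dim -> odim) projects hidden states back to mel features;
    #     it is only created when the decoder does not provide its own output_layer.
    #   - text_sfc (embedding_dim -> vocab_size) produces masked-phone logits and is
    #     constructed with the text embedding's weight attribute; only used when
    #     text_masking is enabled.
    #   - postnet is an optional Tacotron2-style refinement applied on top of the
    #     projected speech outputs.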

    def inference(
            self,
            speech: paddle.Tensor,
            text: paddle.Tensor,
            masked_pos: paddle.Tensor,
            speech_mask: paddle.Tensor,
            text_mask: paddle.Tensor,
            speech_seg_pos: paddle.Tensor,
            text_seg_pos: paddle.Tensor,
            span_bdy: List[int],
            use_teacher_forcing: bool=True, ) -> List[paddle.Tensor]:
        '''
        Args:
            speech (paddle.Tensor): input speech (1, Tmax, D).
            text (paddle.Tensor): input text (1, Tmax2).
            masked_pos (paddle.Tensor): masked position of input speech (1, Tmax).
            speech_mask (paddle.Tensor): mask of speech (1, 1, Tmax).
            text_mask (paddle.Tensor): mask of text (1, 1, Tmax2).
            speech_seg_pos (paddle.Tensor): n-th phone of each mel, 0<=n<=Tmax2 (1, Tmax).
            text_seg_pos (paddle.Tensor): n-th phone of each phone, 0<=n<=Tmax2 (1, Tmax2).
            span_bdy (List[int]): masked mel boundary of input speech (2,).
            use_teacher_forcing (bool): whether to use teacher forcing.
        Returns:
            List[paddle.Tensor]:
                eg:
                [Tensor(shape=[1, 181, 80]), Tensor(shape=[80, 80]), Tensor(shape=[1, 67, 80])]
        '''
        z_cache = None
        if use_teacher_forcing:
            before_outs, zs, *_ = self.forward(
                speech=speech,
                text=text,
                masked_pos=masked_pos,
                speech_mask=speech_mask,
                text_mask=text_mask,
                speech_seg_pos=speech_seg_pos,
                text_seg_pos=text_seg_pos)
            if zs is None:
                zs = before_outs

            speech = speech.squeeze(0)
            outs = [speech[:span_bdy[0]]]
            outs += [zs[0][span_bdy[0]:span_bdy[1]]]
            outs += [speech[span_bdy[1]:]]
            return outs
        return None


class MLMEncAsDecoder(MLM):
    def forward(self,
                speech: paddle.Tensor,
                text: paddle.Tensor,
                masked_pos: paddle.Tensor,
                speech_mask: paddle.Tensor,
                text_mask: paddle.Tensor,
                speech_seg_pos: paddle.Tensor,
                text_seg_pos: paddle.Tensor):
        # feats: (Batch, Length, Dim)
        # -> encoder_out: (Batch, Length2, Dim2)
        encoder_out, h_masks = self.encoder(
            speech=speech,
            text=text,
            masked_pos=masked_pos,
            speech_mask=speech_mask,
            text_mask=text_mask,
            speech_seg_pos=speech_seg_pos,
            text_seg_pos=text_seg_pos)
        if self.decoder is not None:
            zs, _ = self.decoder(encoder_out, h_masks)
        else:
            zs = encoder_out
        speech_hidden_states = zs[:, :paddle.shape(speech)[1], :]
        if self.sfc is not None:
            before_outs = paddle.reshape(
                self.sfc(speech_hidden_states),
                (paddle.shape(speech_hidden_states)[0], -1, self.odim))
        else:
            before_outs = speech_hidden_states
        if self.postnet is not None:
            after_outs = before_outs + paddle.transpose(
                self.postnet(paddle.transpose(before_outs, [0, 2, 1])),
                [0, 2, 1])
        else:
            after_outs = None
        return before_outs, after_outs, None


class MLMDualMaksing(MLM):
    def forward(self,
                speech: paddle.Tensor,
                text: paddle.Tensor,
                masked_pos: paddle.Tensor,
                speech_mask: paddle.Tensor,
                text_mask: paddle.Tensor,
                speech_seg_pos: paddle.Tensor,
                text_seg_pos: paddle.Tensor):
        # feats: (Batch, Length, Dim)
        # -> encoder_out: (Batch, Length2, Dim2)
        encoder_out, h_masks = self.encoder(
            speech=speech,
            text=text,
            masked_pos=masked_pos,
            speech_mask=speech_mask,
            text_mask=text_mask,
            speech_seg_pos=speech_seg_pos,
            text_seg_pos=text_seg_pos)
        if self.decoder is not None:
            zs, _ = self.decoder(encoder_out, h_masks)
        else:
            zs = encoder_out
        speech_hidden_states = zs[:, :paddle.shape(speech)[1], :]
        text_outs = None
        if self.text_sfc:
            text_hidden_states = zs[:, paddle.shape(speech)[1]:, :]
            text_outs = paddle.reshape(
                self.text_sfc(text_hidden_states),
                (paddle.shape(text_hidden_states)[0], -1, self.vocab_size))
        if self.sfc is not None:
            before_outs = paddle.reshape(
                self.sfc(speech_hidden_states),
                (paddle.shape(speech_hidden_states)[0], -1, self.odim))
        else:
            before_outs = speech_hidden_states
        if self.postnet is not None:
            after_outs = before_outs + paddle.transpose(
                self.postnet(paddle.transpose(before_outs, [0, 2, 1])),
                [0, 2, 1])
        else:
            after_outs = None
        return before_outs, after_outs, text_outs
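

# The two MLM variants above differ only in their output heads (summary):
#   MLMEncAsDecoder -> (before_outs, after_outs, None)       speech reconstruction only
#   MLMDualMaksing  -> (before_outs, after_outs, text_outs)  speech reconstruction plus
#                                                            masked-phone logits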


class ErnieSAT(nn.Layer):
    def __init__(
            self,
            # network structure related
            idim: int,
            odim: int,
            postnet_layers: int=5,
            postnet_filts: int=5,
            postnet_chans: int=256,
            use_scaled_pos_enc: bool=False,
            encoder_type: str='conformer',
            decoder_type: str='conformer',
            enc_input_layer: str='sega_mlm',
            enc_pre_speech_layer: int=0,
            enc_cnn_module_kernel: int=7,
            enc_attention_dim: int=384,
            enc_attention_heads: int=2,
            enc_linear_units: int=1536,
            enc_num_blocks: int=4,
            enc_dropout_rate: float=0.2,
            enc_positional_dropout_rate: float=0.2,
            enc_attention_dropout_rate: float=0.2,
            enc_normalize_before: bool=True,
            enc_macaron_style: bool=True,
            enc_use_cnn_module: bool=True,
            enc_selfattention_layer_type: str='legacy_rel_selfattn',
            enc_activation_type: str='swish',
            enc_pos_enc_layer_type: str='legacy_rel_pos',
            enc_positionwise_layer_type: str='conv1d',
            enc_positionwise_conv_kernel_size: int=3,
            text_masking: bool=False,
            dec_cnn_module_kernel: int=31,
            dec_attention_dim: int=384,
            dec_attention_heads: int=2,
            dec_linear_units: int=1536,
            dec_num_blocks: int=4,
            dec_dropout_rate: float=0.2,
            dec_positional_dropout_rate: float=0.2,
            dec_attention_dropout_rate: float=0.2,
            dec_macaron_style: bool=True,
            dec_use_cnn_module: bool=True,
            dec_selfattention_layer_type: str='legacy_rel_selfattn',
            dec_activation_type: str='swish',
            dec_pos_enc_layer_type: str='legacy_rel_pos',
            dec_positionwise_layer_type: str='conv1d',
            dec_positionwise_conv_kernel_size: int=3,
            init_type: str="xavier_uniform", ):
        super().__init__()
        # store hyperparameters
        self.odim = odim
        self.use_scaled_pos_enc = use_scaled_pos_enc

        # initialize parameters
        initialize(self, init_type)

        # Encoder
        if encoder_type == "conformer":
            encoder = MLMEncoder(
                idim=odim,
                vocab_size=idim,
                pre_speech_layer=enc_pre_speech_layer,
                attention_dim=enc_attention_dim,
                attention_heads=enc_attention_heads,
                linear_units=enc_linear_units,
                num_blocks=enc_num_blocks,
                dropout_rate=enc_dropout_rate,
                positional_dropout_rate=enc_positional_dropout_rate,
                attention_dropout_rate=enc_attention_dropout_rate,
                input_layer=enc_input_layer,
                normalize_before=enc_normalize_before,
                positionwise_layer_type=enc_positionwise_layer_type,
                positionwise_conv_kernel_size=enc_positionwise_conv_kernel_size,
                macaron_style=enc_macaron_style,
                pos_enc_layer_type=enc_pos_enc_layer_type,
                selfattention_layer_type=enc_selfattention_layer_type,
                activation_type=enc_activation_type,
                use_cnn_module=enc_use_cnn_module,
                cnn_module_kernel=enc_cnn_module_kernel,
                text_masking=text_masking)
        else:
            raise ValueError(f"{encoder_type} is not supported.")

        # Decoder
        if decoder_type != 'no_decoder':
            decoder = MLMDecoder(
                idim=0,
                input_layer=None,
                cnn_module_kernel=dec_cnn_module_kernel,
                attention_dim=dec_attention_dim,
                attention_heads=dec_attention_heads,
                linear_units=dec_linear_units,
                num_blocks=dec_num_blocks,
                dropout_rate=dec_dropout_rate,
                positional_dropout_rate=dec_positional_dropout_rate,
                attention_dropout_rate=dec_attention_dropout_rate,
                macaron_style=dec_macaron_style,
                use_cnn_module=dec_use_cnn_module,
                selfattention_layer_type=dec_selfattention_layer_type,
                activation_type=dec_activation_type,
                pos_enc_layer_type=dec_pos_enc_layer_type,
                positionwise_layer_type=dec_positionwise_layer_type,
                positionwise_conv_kernel_size=dec_positionwise_conv_kernel_size)
        else:
            decoder = None

        model_class = MLMDualMaksing if text_masking else MLMEncAsDecoder

        self.model = model_class(
            odim=odim,
            encoder=encoder,
            decoder=decoder,
            postnet_layers=postnet_layers,
            postnet_filts=postnet_filts,
            postnet_chans=postnet_chans,
            text_masking=text_masking)

        nn.initializer.set_global_initializer(None)

    def forward(self,
                speech: paddle.Tensor,
                text: paddle.Tensor,
                masked_pos: paddle.Tensor,
                speech_mask: paddle.Tensor,
                text_mask: paddle.Tensor,
                speech_seg_pos: paddle.Tensor,
                text_seg_pos: paddle.Tensor):
        return self.model(
            speech=speech,
            text=text,
            masked_pos=masked_pos,
            speech_mask=speech_mask,
            text_mask=text_mask,
            speech_seg_pos=speech_seg_pos,
            text_seg_pos=text_seg_pos)

    def inference(
            self,
            speech: paddle.Tensor,
            text: paddle.Tensor,
            masked_pos: paddle.Tensor,
            speech_mask: paddle.Tensor,
            text_mask: paddle.Tensor,
            speech_seg_pos: paddle.Tensor,
            text_seg_pos: paddle.Tensor,
            span_bdy: List[int],
            use_teacher_forcing: bool=True, ) -> List[paddle.Tensor]:
        return self.model.inference(
            speech=speech,
            text=text,
            masked_pos=masked_pos,
            speech_mask=speech_mask,
            text_mask=text_mask,
            speech_seg_pos=speech_seg_pos,
            text_seg_pos=text_seg_pos,
            span_bdy=span_bdy,
            use_teacher_forcing=use_teacher_forcing)


class ErnieSATInference(nn.Layer):
    def __init__(self, normalizer, model):
        super().__init__()
        self.normalizer = normalizer
        self.acoustic_model = model

    def forward(self,
                speech: paddle.Tensor,
                text: paddle.Tensor,
                masked_pos: paddle.Tensor,
                speech_mask: paddle.Tensor,
                text_mask: paddle.Tensor,
                speech_seg_pos: paddle.Tensor,
                text_seg_pos: paddle.Tensor,
                span_bdy: List[int],
                use_teacher_forcing: bool=True, ):
        outs = self.acoustic_model.inference(
            speech=speech,
            text=text,
            masked_pos=masked_pos,
            speech_mask=speech_mask,
            text_mask=text_mask,
            speech_seg_pos=speech_seg_pos,
            text_seg_pos=text_seg_pos,
            span_bdy=span_bdy,
            use_teacher_forcing=use_teacher_forcing)
        normed_mel_pre, normed_mel_masked, normed_mel_post = outs
        logmel_pre = self.normalizer.inverse(normed_mel_pre)
        logmel_masked = self.normalizer.inverse(normed_mel_masked)
        logmel_post = self.normalizer.inverse(normed_mel_post)
        return logmel_pre, logmel_masked, logmel_post
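

# End-to-end usage sketch (illustrative only; the idim/odim values are assumptions,
# and the input tensors and ``normalizer`` are produced by the ERNIE-SAT data
# pipeline, not defined here):
#
#     model = ErnieSAT(idim=100, odim=80)      # idim: phone vocab size, odim: n_mels
#     inference = ErnieSATInference(normalizer=normalizer, model=model)
#     logmel_pre, logmel_masked, logmel_post = inference(
#         speech=speech, text=text, masked_pos=masked_pos,
#         speech_mask=speech_mask, text_mask=text_mask,
#         speech_seg_pos=speech_seg_pos, text_seg_pos=text_seg_pos,
#         span_bdy=[40, 60])
#     # logmel_masked holds the de-normalized mel predicted for the masked span,
#     # with logmel_pre / logmel_post being the unmasked prefix and suffix.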