commit
2bbfdbae91
@ -0,0 +1,109 @@
###########################################################
#                FEATURE EXTRACTION SETTING               #
###########################################################

fs: 24000          # sr
n_fft: 2048        # FFT size.
n_shift: 300       # Hop size.
win_length: 1200   # Window length.
                   # If set to null, it will be the same as fft_size.
window: "hann"     # Window function.

# Only used for feats_type != raw

fmin: 80           # Minimum frequency of Mel basis.
fmax: 7600         # Maximum frequency of Mel basis.
n_mels: 80         # The number of mel basis.

# Only used for the model using pitch features (e.g. FastSpeech2)
f0min: 80          # Minimum f0 for pitch extraction.
f0max: 400         # Maximum f0 for pitch extraction.


###########################################################
#                       DATA SETTING                      #
###########################################################
batch_size: 64
num_workers: 4


###########################################################
#                      MODEL SETTING                      #
###########################################################
model:
    adim: 384                                    # attention dimension
    aheads: 2                                    # number of attention heads
    elayers: 4                                   # number of encoder layers
    eunits: 1536                                 # number of encoder ff units
    dlayers: 4                                   # number of decoder layers
    dunits: 1536                                 # number of decoder ff units
    positionwise_layer_type: conv1d              # type of position-wise layer
    positionwise_conv_kernel_size: 3             # kernel size of position-wise conv layer
    duration_predictor_layers: 2                 # number of layers of duration predictor
    duration_predictor_chans: 256                # number of channels of duration predictor
    duration_predictor_kernel_size: 3            # filter size of duration predictor
    postnet_layers: 5                            # number of layers of postnet
    postnet_filts: 5                             # filter size of conv layers in postnet
    postnet_chans: 256                           # number of channels of conv layers in postnet
    encoder_normalize_before: True               # whether to perform layer normalization before the input
    decoder_normalize_before: True               # whether to perform layer normalization before the input
    reduction_factor: 1                          # reduction factor
    encoder_type: conformer                      # encoder type
    decoder_type: conformer                      # decoder type
    conformer_pos_enc_layer_type: rel_pos        # conformer positional encoding type
    conformer_self_attn_layer_type: rel_selfattn # conformer self-attention type
    conformer_activation_type: swish             # conformer activation type
    use_macaron_style_in_conformer: true         # whether to use macaron style in conformer
    use_cnn_in_conformer: true                   # whether to use CNN in conformer
    conformer_enc_kernel_size: 7                 # kernel size in CNN module of conformer-based encoder
    conformer_dec_kernel_size: 31                # kernel size in CNN module of conformer-based decoder
    init_type: xavier_uniform                    # initialization type
    transformer_enc_dropout_rate: 0.2            # dropout rate for transformer encoder layer
    transformer_enc_positional_dropout_rate: 0.2 # dropout rate for transformer encoder positional encoding
    transformer_enc_attn_dropout_rate: 0.2       # dropout rate for transformer encoder attention layer
    transformer_dec_dropout_rate: 0.2            # dropout rate for transformer decoder layer
    transformer_dec_positional_dropout_rate: 0.2 # dropout rate for transformer decoder positional encoding
    transformer_dec_attn_dropout_rate: 0.2       # dropout rate for transformer decoder attention layer
    pitch_predictor_layers: 5                    # number of conv layers in pitch predictor
    pitch_predictor_chans: 256                   # number of channels of conv layers in pitch predictor
    pitch_predictor_kernel_size: 5               # kernel size of conv layers in pitch predictor
    pitch_predictor_dropout: 0.5                 # dropout rate in pitch predictor
    pitch_embed_kernel_size: 1                   # kernel size of conv embedding layer for pitch
    pitch_embed_dropout: 0.0                     # dropout rate after conv embedding layer for pitch
    stop_gradient_from_pitch_predictor: true     # whether to stop the gradient from pitch predictor to encoder
    energy_predictor_layers: 2                   # number of conv layers in energy predictor
    energy_predictor_chans: 256                  # number of channels of conv layers in energy predictor
    energy_predictor_kernel_size: 3              # kernel size of conv layers in energy predictor
    energy_predictor_dropout: 0.5                # dropout rate in energy predictor
    energy_embed_kernel_size: 1                  # kernel size of conv embedding layer for energy
    energy_embed_dropout: 0.0                    # dropout rate after conv embedding layer for energy
    stop_gradient_from_energy_predictor: false   # whether to stop the gradient from energy predictor to encoder



###########################################################
#                     UPDATER SETTING                     #
###########################################################
updater:
    use_masking: True     # whether to apply masking for padded part in loss calculation



###########################################################
#                    OPTIMIZER SETTING                    #
###########################################################
optimizer:
    optim: adam              # optimizer type
    learning_rate: 0.001     # learning rate

###########################################################
#                     TRAINING SETTING                    #
###########################################################
max_epoch: 1000
num_snapshots: 5


###########################################################
#                      OTHER SETTING                      #
###########################################################
seed: 10086
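As a usage note (not part of the commit), a minimal sketch of how a config like this can be loaded with PyYAML; the file path "conf/default.yaml" is an assumption for illustration only.

# Hypothetical usage sketch: load the YAML config and read a few fields.
import yaml

with open("conf/default.yaml") as f:
    config = yaml.safe_load(f)

print(config["fs"])                  # 24000
print(config["model"]["adim"])       # 384
print(config["optimizer"]["optim"])  # adam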
@ -1,125 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Modified from espnet(https://github.com/espnet/espnet)
"""Adversarial loss modules."""
import paddle
import paddle.nn.functional as F
from paddle import nn


class GeneratorAdversarialLoss(nn.Layer):
    """Generator adversarial loss module."""

    def __init__(
            self,
            average_by_discriminators=True,
            loss_type="mse", ):
        """Initialize GeneratorAdversarialLoss module."""
        super().__init__()
        self.average_by_discriminators = average_by_discriminators
        assert loss_type in ["mse", "hinge"], f"{loss_type} is not supported."
        if loss_type == "mse":
            self.criterion = self._mse_loss
        else:
            self.criterion = self._hinge_loss

    def forward(self, outputs):
        """Calculate generator adversarial loss.

        Parameters
        ----------
        outputs: Tensor or List
            Discriminator outputs or list of discriminator outputs.

        Returns
        ----------
        Tensor
            Generator adversarial loss value.
        """
        if isinstance(outputs, (tuple, list)):
            adv_loss = 0.0
            for i, outputs_ in enumerate(outputs):
                if isinstance(outputs_, (tuple, list)):
                    # case including feature maps
                    outputs_ = outputs_[-1]
                adv_loss += self.criterion(outputs_)
            if self.average_by_discriminators:
                adv_loss /= i + 1
        else:
            adv_loss = self.criterion(outputs)

        return adv_loss

    def _mse_loss(self, x):
        return F.mse_loss(x, paddle.ones_like(x))

    def _hinge_loss(self, x):
        return -x.mean()


class DiscriminatorAdversarialLoss(nn.Layer):
    """Discriminator adversarial loss module."""

    def __init__(
            self,
            average_by_discriminators=True,
            loss_type="mse", ):
        """Initialize DiscriminatorAdversarialLoss module."""
        super().__init__()
        self.average_by_discriminators = average_by_discriminators
        assert loss_type in ["mse"], f"{loss_type} is not supported."
        if loss_type == "mse":
            self.fake_criterion = self._mse_fake_loss
            self.real_criterion = self._mse_real_loss

    def forward(self, outputs_hat, outputs):
        """Calculate discriminator adversarial loss.

        Parameters
        ----------
        outputs_hat : Tensor or list
            Discriminator outputs or list of
            discriminator outputs calculated from generator outputs.
        outputs : Tensor or list
            Discriminator outputs or list of
            discriminator outputs calculated from groundtruth.

        Returns
        ----------
        Tensor
            Discriminator real loss value.
        Tensor
            Discriminator fake loss value.
        """
        if isinstance(outputs, (tuple, list)):
            real_loss = 0.0
            fake_loss = 0.0
            for i, (outputs_hat_,
                    outputs_) in enumerate(zip(outputs_hat, outputs)):
                if isinstance(outputs_hat_, (tuple, list)):
                    # case including feature maps
                    outputs_hat_ = outputs_hat_[-1]
                    outputs_ = outputs_[-1]
                real_loss += self.real_criterion(outputs_)
                fake_loss += self.fake_criterion(outputs_hat_)
            if self.average_by_discriminators:
                fake_loss /= i + 1
                real_loss /= i + 1
        else:
            real_loss = self.real_criterion(outputs)
            fake_loss = self.fake_criterion(outputs_hat)

        return real_loss, fake_loss

    def _mse_real_loss(self, x):
        return F.mse_loss(x, paddle.ones_like(x))

    def _mse_fake_loss(self, x):
        return F.mse_loss(x, paddle.zeros_like(x))
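As a usage note (not part of the commit), a minimal sketch of how these losses are typically driven; the random tensors below stand in for real discriminator outputs and the shapes are assumptions for illustration only.

# Hypothetical usage sketch for the adversarial losses above.
import paddle

gen_loss_fn = GeneratorAdversarialLoss(loss_type="mse")
dis_loss_fn = DiscriminatorAdversarialLoss()

outputs_hat = paddle.rand([4, 1, 100])  # discriminator scores on generated audio
outputs = paddle.rand([4, 1, 100])      # discriminator scores on real audio

adv_loss = gen_loss_fn(outputs_hat)                      # generator side
real_loss, fake_loss = dis_loss_fn(outputs_hat, outputs)  # discriminator side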
@ -1,348 +0,0 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math

import numpy as np
import paddle
from paddle import nn
from paddle.nn import functional as F


def scaled_dot_product_attention(q, k, v, mask=None, dropout=0.0,
                                 training=True):
    r"""Scaled dot product attention with masking.

    Assume that q, k, v all have the same leading dimensions (denoted as * in
    descriptions below). Dropout is applied to attention weights before
    weighted sum of values.

    Parameters
    -----------
    q : Tensor [shape=(\*, T_q, d)]
        the query tensor.
    k : Tensor [shape=(\*, T_k, d)]
        the key tensor.
    v : Tensor [shape=(\*, T_k, d_v)]
        the value tensor.
    mask : Tensor, [shape=(\*, T_q, T_k) or broadcastable shape], optional
        the mask tensor, zeros correspond to paddings. Defaults to None.

    Returns
    ----------
    out : Tensor [shape=(\*, T_q, d_v)]
        the context vector.
    attn_weights : Tensor [shape=(\*, T_q, T_k)]
        the attention weights.
    """
    d = q.shape[-1]  # we only support imperative execution
    qk = paddle.matmul(q, k, transpose_y=True)
    scaled_logit = paddle.scale(qk, 1.0 / math.sqrt(d))

    if mask is not None:
        scaled_logit += paddle.scale((1.0 - mask), -1e9)  # hard coded here

    attn_weights = F.softmax(scaled_logit, axis=-1)
    attn_weights = F.dropout(attn_weights, dropout, training=training)
    out = paddle.matmul(attn_weights, v)
    return out, attn_weights


def drop_head(x, drop_n_heads, training=True):
    """Drop n context vectors from multiple ones.

    Parameters
    ----------
    x : Tensor [shape=(batch_size, num_heads, time_steps, channels)]
        The input, multiple context vectors.
    drop_n_heads : int [0 <= drop_n_heads <= num_heads]
        Number of vectors to drop.
    training : bool
        A flag indicating whether it is in training. If `False`, no dropout is
        applied.

    Returns
    -------
    Tensor
        The output.
    """
    if not training or (drop_n_heads == 0):
        return x

    batch_size, num_heads, _, _ = x.shape
    # drop all heads
    if num_heads == drop_n_heads:
        return paddle.zeros_like(x)

    mask = np.ones([batch_size, num_heads])
    mask[:, :drop_n_heads] = 0
    for subarray in mask:
        np.random.shuffle(subarray)
    scale = float(num_heads) / (num_heads - drop_n_heads)
    mask = scale * np.reshape(mask, [batch_size, num_heads, 1, 1])
    out = x * paddle.to_tensor(mask)
    return out


def _split_heads(x, num_heads):
    batch_size, time_steps, _ = x.shape
    x = paddle.reshape(x, [batch_size, time_steps, num_heads, -1])
    x = paddle.transpose(x, [0, 2, 1, 3])
    return x


def _concat_heads(x):
    batch_size, _, time_steps, _ = x.shape
    x = paddle.transpose(x, [0, 2, 1, 3])
    x = paddle.reshape(x, [batch_size, time_steps, -1])
    return x


# Standard implementations of Monohead Attention & Multihead Attention
class MonoheadAttention(nn.Layer):
    """Monohead Attention module.

    Parameters
    ----------
    model_dim : int
        Feature size of the query.
    dropout : float, optional
        Dropout probability of scaled dot product attention and final context
        vector. Defaults to 0.0.
    k_dim : int, optional
        Feature size of the key of each scaled dot product attention. If not
        provided, it is set to `model_dim / num_heads`. Defaults to None.
    v_dim : int, optional
        Feature size of the value of each scaled dot product attention. If not
        provided, it is set to `model_dim / num_heads`. Defaults to None.
    """

    def __init__(self,
                 model_dim: int,
                 dropout: float=0.0,
                 k_dim: int=None,
                 v_dim: int=None):
        super(MonoheadAttention, self).__init__()
        k_dim = k_dim or model_dim
        v_dim = v_dim or model_dim
        self.affine_q = nn.Linear(model_dim, k_dim)
        self.affine_k = nn.Linear(model_dim, k_dim)
        self.affine_v = nn.Linear(model_dim, v_dim)
        self.affine_o = nn.Linear(v_dim, model_dim)

        self.model_dim = model_dim
        self.dropout = dropout

    def forward(self, q, k, v, mask):
        """Compute context vector and attention weights.

        Parameters
        -----------
        q : Tensor [shape=(batch_size, time_steps_q, model_dim)]
            The queries.
        k : Tensor [shape=(batch_size, time_steps_k, model_dim)]
            The keys.
        v : Tensor [shape=(batch_size, time_steps_k, model_dim)]
            The values.
        mask : Tensor [shape=(batch_size, time_steps_q, time_steps_k)] or broadcastable shape
            The mask.

        Returns
        ----------
        out : Tensor [shape=(batch_size, time_steps_q, model_dim)]
            The context vector.
        attention_weights : Tensor [shape=(batch_size, time_steps_q, time_steps_k)]
            The attention weights.
        """
        q = self.affine_q(q)  # (B, T, C)
        k = self.affine_k(k)
        v = self.affine_v(v)

        context_vectors, attention_weights = scaled_dot_product_attention(
            q, k, v, mask, self.dropout, self.training)

        out = self.affine_o(context_vectors)
        return out, attention_weights


class MultiheadAttention(nn.Layer):
    """Multihead Attention module.

    Parameters
    -----------
    model_dim: int
        The feature size of query.
    num_heads : int
        The number of attention heads.
    dropout : float, optional
        Dropout probability of scaled dot product attention and final context
        vector. Defaults to 0.0.
    k_dim : int, optional
        Feature size of the key of each scaled dot product attention. If not
        provided, it is set to ``model_dim / num_heads``. Defaults to None.
    v_dim : int, optional
        Feature size of the value of each scaled dot product attention. If not
        provided, it is set to ``model_dim / num_heads``. Defaults to None.

    Raises
    ---------
    ValueError
        If ``model_dim`` is not divisible by ``num_heads``.
    """

    def __init__(self,
                 model_dim: int,
                 num_heads: int,
                 dropout: float=0.0,
                 k_dim: int=None,
                 v_dim: int=None):
        super(MultiheadAttention, self).__init__()
        if model_dim % num_heads != 0:
            raise ValueError("model_dim must be divisible by num_heads")
        depth = model_dim // num_heads
        k_dim = k_dim or depth
        v_dim = v_dim or depth
        self.affine_q = nn.Linear(model_dim, num_heads * k_dim)
        self.affine_k = nn.Linear(model_dim, num_heads * k_dim)
        self.affine_v = nn.Linear(model_dim, num_heads * v_dim)
        self.affine_o = nn.Linear(num_heads * v_dim, model_dim)

        self.num_heads = num_heads
        self.model_dim = model_dim
        self.dropout = dropout

    def forward(self, q, k, v, mask):
        """Compute context vector and attention weights.

        Parameters
        -----------
        q : Tensor [shape=(batch_size, time_steps_q, model_dim)]
            The queries.
        k : Tensor [shape=(batch_size, time_steps_k, model_dim)]
            The keys.
        v : Tensor [shape=(batch_size, time_steps_k, model_dim)]
            The values.
        mask : Tensor [shape=(batch_size, time_steps_q, time_steps_k)] or broadcastable shape
            The mask.

        Returns
        ----------
        out : Tensor [shape=(batch_size, time_steps_q, model_dim)]
            The context vector.
        attention_weights : Tensor [shape=(batch_size, time_steps_q, time_steps_k)]
            The attention weights.
        """
        q = _split_heads(self.affine_q(q), self.num_heads)  # (B, h, T, C)
        k = _split_heads(self.affine_k(k), self.num_heads)
        v = _split_heads(self.affine_v(v), self.num_heads)
        mask = paddle.unsqueeze(mask, 1)  # unsqueeze for the h dim

        context_vectors, attention_weights = scaled_dot_product_attention(
            q, k, v, mask, self.dropout, self.training)
        # NOTE: there is a more sophisticated implementation: Scheduled DropHead
        context_vectors = _concat_heads(context_vectors)  # (B, T, h*C)
        out = self.affine_o(context_vectors)
        return out, attention_weights


class LocationSensitiveAttention(nn.Layer):
    """Location Sensitive Attention module.

    Reference: `Attention-Based Models for Speech Recognition <https://arxiv.org/pdf/1506.07503.pdf>`_

    Parameters
    -----------
    d_query : int
        The feature size of query.
    d_key : int
        The feature size of key.
    d_attention : int
        The feature size of the attention.
    location_filters : int
        Filter size of attention convolution.
    location_kernel_size : int
        Kernel size of attention convolution.
    """

    def __init__(self,
                 d_query: int,
                 d_key: int,
                 d_attention: int,
                 location_filters: int,
                 location_kernel_size: int):
        super().__init__()

        self.query_layer = nn.Linear(d_query, d_attention, bias_attr=False)
        self.key_layer = nn.Linear(d_key, d_attention, bias_attr=False)
        self.value = nn.Linear(d_attention, 1, bias_attr=False)

        # Location Layer
        self.location_conv = nn.Conv1D(
            2,
            location_filters,
            kernel_size=location_kernel_size,
            padding=int((location_kernel_size - 1) / 2),
            bias_attr=False,
            data_format='NLC')
        self.location_layer = nn.Linear(
            location_filters, d_attention, bias_attr=False)

    def forward(self,
                query,
                processed_key,
                value,
                attention_weights_cat,
                mask=None):
        """Compute context vector and attention weights.

        Parameters
        -----------
        query : Tensor [shape=(batch_size, d_query)]
            The queries.
        processed_key : Tensor [shape=(batch_size, time_steps_k, d_attention)]
            The keys after linear layer.
        value : Tensor [shape=(batch_size, time_steps_k, d_key)]
            The values.
        attention_weights_cat : Tensor [shape=(batch_size, time_step_k, 2)]
            Attention weights concat.
        mask : Tensor, optional
            The mask. Shape should be (batch_size, time_steps_k, 1).
            Defaults to None.

        Returns
        ----------
        attention_context : Tensor [shape=(batch_size, d_attention)]
            The context vector.
        attention_weights : Tensor [shape=(batch_size, time_steps_k)]
            The attention weights.
        """

        processed_query = self.query_layer(paddle.unsqueeze(query, axis=[1]))
        processed_attention_weights = self.location_layer(
            self.location_conv(attention_weights_cat))
        # (B, T_enc, 1)
        alignment = self.value(
            paddle.tanh(processed_attention_weights + processed_key +
                        processed_query))

        if mask is not None:
            alignment = alignment + (1.0 - mask) * -1e9

        attention_weights = F.softmax(alignment, axis=1)
        attention_context = paddle.matmul(
            attention_weights, value, transpose_x=True)

        attention_weights = paddle.squeeze(attention_weights, axis=-1)
        attention_context = paddle.squeeze(attention_context, axis=1)

        return attention_context, attention_weights
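As a usage note (not part of the commit), a minimal sketch of the MultiheadAttention module defined above; all shapes are illustrative assumptions.

# Hypothetical usage sketch: batched multi-head attention with a padding mask.
import paddle

attn = MultiheadAttention(model_dim=256, num_heads=4, dropout=0.1)
q = paddle.randn([2, 10, 256])   # (batch, T_q, model_dim)
kv = paddle.randn([2, 20, 256])  # (batch, T_k, model_dim)
mask = paddle.ones([2, 10, 20])  # 1 for valid positions, 0 for padding

out, weights = attn(q, kv, kv, mask)
# out: (2, 10, 256); weights: (2, 4, 10, 20) after the head split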
@ -1,229 +0,0 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import librosa
import numpy as np
import paddle
from librosa.util import pad_center
from paddle import nn
from paddle.nn import functional as F
from scipy import signal

__all__ = ["quantize", "dequantize", "STFT", "MelScale"]


def quantize(values, n_bands):
    """Linearly quantize a float Tensor in [-1, 1) to an integer Tensor in
    [0, n_bands).

    Parameters
    -----------
    values : Tensor [dtype: float32 or float64]
        The floating point value.

    n_bands : int
        The number of bands. The output integer Tensor's value is in the range
        [0, n_bands).

    Returns
    ----------
    Tensor [dtype: int64]
        The quantized tensor.
    """
    quantized = paddle.cast((values + 1.0) / 2.0 * n_bands, "int64")
    return quantized


def dequantize(quantized, n_bands, dtype=None):
    """Linearly dequantize an integer Tensor into a float Tensor in the range
    [-1, 1).

    Parameters
    -----------
    quantized : Tensor [dtype: int]
        The quantized value in the range [0, n_bands).

    n_bands : int
        Number of bands. The input integer Tensor's value is in the range
        [0, n_bands).

    dtype : str, optional
        Data type of the output.

    Returns
    -----------
    Tensor
        The dequantized tensor, dtype is specified by `dtype`. If `dtype` is
        not specified, the default float data type is used.
    """
    dtype = dtype or paddle.get_default_dtype()
    value = (paddle.cast(quantized, dtype) + 0.5) * (2.0 / n_bands) - 1.0
    return value


class STFT(nn.Layer):
    """A module for computing stft transformation in a differentiable way.

    Parameters
    ------------
    n_fft : int
        Number of samples in a frame.
    hop_length : int
        Number of samples shifted between adjacent frames.
    win_length : int
        Length of the window.
    window : str, optional
        Name of window function, see `scipy.signal.get_window` for more
        details. Defaults to "hanning".
    center : bool
        If True, the signal y is padded so that frame D[:, t] is centered
        at y[t * hop_length]. If False, then D[:, t] begins at y[t * hop_length].
        Defaults to True.
    pad_mode : string or function
        If center=True, this argument is passed to np.pad for padding the edges
        of the signal y. By default (pad_mode="reflect"), y is padded on both
        sides with its own reflection, mirrored around its first and last
        sample respectively. If center=False, this argument is ignored.

    Notes
    -----------
    It behaves like ``librosa.core.stft``. See ``librosa.core.stft`` for more
    details.

    Given an audio of ``T`` samples, the STFT transformation outputs a
    spectrum of shape (C, frames) and complex dtype, where ``C = 1 + n_fft / 2``
    and ``frames = 1 + T // hop_length``.

    Only ``center`` and ``reflect`` padding is supported now.

    """

    def __init__(self,
                 n_fft,
                 hop_length=None,
                 win_length=None,
                 window="hanning",
                 center=True,
                 pad_mode="reflect"):
        super().__init__()
        # By default, use the entire frame
        if win_length is None:
            win_length = n_fft

        # Set the default hop, if it's not already specified
        if hop_length is None:
            hop_length = int(win_length // 4)

        self.hop_length = hop_length
        self.n_bin = 1 + n_fft // 2
        self.n_fft = n_fft
        self.center = center
        self.pad_mode = pad_mode

        # calculate window
        window = signal.get_window(window, win_length, fftbins=True)

        # pad window to n_fft size
        if n_fft != win_length:
            window = pad_center(window, n_fft, mode="constant")
            # lpad = (n_fft - win_length) // 2
            # rpad = n_fft - win_length - lpad
            # window = np.pad(window, ((lpad, pad), ), 'constant')

        # calculate weights
        # r = np.arange(0, n_fft)
        # M = np.expand_dims(r, -1) * np.expand_dims(r, 0)
        # w_real = np.reshape(window *
        #                     np.cos(2 * np.pi * M / n_fft)[:self.n_bin],
        #                     (self.n_bin, 1, self.n_fft))
        # w_imag = np.reshape(window *
        #                     np.sin(-2 * np.pi * M / n_fft)[:self.n_bin],
        #                     (self.n_bin, 1, self.n_fft))
        weight = np.fft.fft(np.eye(n_fft))[:self.n_bin]
        w_real = weight.real
        w_imag = weight.imag
        w = np.concatenate([w_real, w_imag], axis=0)
        w = w * window
        w = np.expand_dims(w, 1)
        weight = paddle.cast(paddle.to_tensor(w), paddle.get_default_dtype())
        self.register_buffer("weight", weight)

    def forward(self, x):
        """Compute the stft transform.
        Parameters
        ------------
        x : Tensor [shape=(B, T)]
            The input waveform.
        Returns
        ------------
        real : Tensor [shape=(B, C, frames)]
            The real part of the spectrogram.

        imag : Tensor [shape=(B, C, frames)]
            The imaginary part of the spectrogram.
        """
        x = paddle.unsqueeze(x, axis=1)
        if self.center:
            x = F.pad(
                x, [self.n_fft // 2, self.n_fft // 2],
                data_format='NCL',
                mode=self.pad_mode)

        # to BCT, C=1
        out = F.conv1d(x, self.weight, stride=self.hop_length)
        real, imag = paddle.chunk(out, 2, axis=1)  # BCT
        return real, imag

    def power(self, x):
        """Compute the power spectrum.
        Parameters
        ------------
        x : Tensor [shape=(B, T)]
            The input waveform.
        Returns
        ------------
        Tensor [shape=(B, C, T)]
            The power spectrum.
        """
        real, imag = self.forward(x)
        power = real**2 + imag**2
        return power

    def magnitude(self, x):
        """Compute the magnitude of the spectrum.
        Parameters
        ------------
        x : Tensor [shape=(B, T)]
            The input waveform.
        Returns
        ------------
        Tensor [shape=(B, C, T)]
            The magnitude of the spectrum.
        """
        power = self.power(x)
        magnitude = paddle.sqrt(power)  # TODO(chenfeiyu): maybe clipping
        return magnitude


class MelScale(nn.Layer):
    def __init__(self, sr, n_fft, n_mels, fmin, fmax):
        super().__init__()
        mel_basis = librosa.filters.mel(sr, n_fft, n_mels, fmin, fmax)
        # self.weight = paddle.to_tensor(mel_basis)
        weight = paddle.to_tensor(mel_basis, dtype=paddle.get_default_dtype())
        self.register_buffer("weight", weight)

    def forward(self, spec):
        # (n_mels, n_freq) * (batch_size, n_freq, n_frames)
        mel = paddle.matmul(self.weight, spec)
        return mel
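As a usage note (not part of the commit), a minimal sketch of combining STFT and MelScale; the random waveform stands in for real audio, and the parameter values simply mirror the feature-extraction settings in the config above.

# Hypothetical usage sketch: waveform -> magnitude spectrogram -> mel spectrogram.
import paddle

stft = STFT(n_fft=2048, hop_length=300, win_length=1200, window="hann")
mel_scale = MelScale(sr=24000, n_fft=2048, n_mels=80, fmin=80, fmax=7600)

wav = paddle.randn([1, 24000])  # (B, T), one second at 24 kHz
spec = stft.magnitude(wav)      # (B, 1 + n_fft // 2, frames)
mel = mel_scale(spec)           # (B, n_mels, frames)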
@ -0,0 +1,86 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Modified from espnet(https://github.com/espnet/espnet)
"""ConvolutionModule definition."""
from paddle import nn


class ConvolutionModule(nn.Layer):
    """ConvolutionModule in Conformer model.

    Parameters
    ----------
    channels : int
        The number of channels of conv layers.
    kernel_size : int
        Kernel size of conv layers.
    """

    def __init__(self, channels, kernel_size, activation=nn.ReLU(), bias=True):
        """Construct a ConvolutionModule object."""
        super().__init__()
        # kernel_size should be an odd number for 'SAME' padding
        assert (kernel_size - 1) % 2 == 0

        self.pointwise_conv1 = nn.Conv1D(
            channels,
            2 * channels,
            kernel_size=1,
            stride=1,
            padding=0,
            bias_attr=bias, )
        self.depthwise_conv = nn.Conv1D(
            channels,
            channels,
            kernel_size,
            stride=1,
            padding=(kernel_size - 1) // 2,
            groups=channels,
            bias_attr=bias, )
        self.norm = nn.BatchNorm1D(channels)
        self.pointwise_conv2 = nn.Conv1D(
            channels,
            channels,
            kernel_size=1,
            stride=1,
            padding=0,
            bias_attr=bias, )
        self.activation = activation

    def forward(self, x):
        """Compute convolution module.

        Parameters
        ----------
        x : paddle.Tensor
            Input tensor (#batch, time, channels).
        Returns
        ----------
        paddle.Tensor
            Output tensor (#batch, time, channels).
        """
        # exchange the temporal dimension and the feature dimension
        x = x.transpose([0, 2, 1])

        # GLU mechanism
        # (batch, 2*channel, time)
        x = self.pointwise_conv1(x)
        # (batch, channel, time)
        x = nn.functional.glu(x, axis=1)

        # 1D Depthwise Conv
        x = self.depthwise_conv(x)
        x = self.activation(self.norm(x))

        x = self.pointwise_conv2(x)

        return x.transpose([0, 2, 1])
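As a usage note (not part of the commit), a minimal sketch of the module above; the channel count matches the config's adim and the kernel size matches conformer_enc_kernel_size, chosen here only for illustration.

# Hypothetical usage sketch: the convolution module keeps the (batch, time, channels) shape.
import paddle

conv_module = ConvolutionModule(channels=384, kernel_size=7)
x = paddle.randn([2, 50, 384])  # (batch, time, channels)
y = conv_module(x)              # (2, 50, 384), same shape as the input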
@ -0,0 +1,196 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Modified from espnet(https://github.com/espnet/espnet)
"""Encoder self-attention layer definition."""
import paddle
from paddle import nn

from paddlespeech.t2s.modules.layer_norm import LayerNorm


class EncoderLayer(nn.Layer):
    """Encoder layer module.

    Parameters
    ----------
    size : int
        Input dimension.
    self_attn : nn.Layer
        Self-attention module instance.
        `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` instance
        can be used as the argument.
    feed_forward : nn.Layer
        Feed-forward module instance.
        `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance
        can be used as the argument.
    feed_forward_macaron : nn.Layer
        Additional feed-forward module instance.
        `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance
        can be used as the argument.
    conv_module : nn.Layer
        Convolution module instance.
        `ConvolutionModule` instance can be used as the argument.
    dropout_rate : float
        Dropout rate.
    normalize_before : bool
        Whether to use layer_norm before the first block.
    concat_after : bool
        Whether to concat attention layer's input and output.
        If True, an additional linear will be applied,
        i.e. x -> x + linear(concat(x, att(x)));
        if False, no additional linear will be applied, i.e. x -> x + att(x).
    stochastic_depth_rate : float
        Probability to skip this layer.
        During training, the layer may skip residual computation and return input
        as-is with given probability.
    """

    def __init__(
            self,
            size,
            self_attn,
            feed_forward,
            feed_forward_macaron,
            conv_module,
            dropout_rate,
            normalize_before=True,
            concat_after=False,
            stochastic_depth_rate=0.0, ):
        """Construct an EncoderLayer object."""
        super().__init__()
        self.self_attn = self_attn
        self.feed_forward = feed_forward
        self.feed_forward_macaron = feed_forward_macaron
        self.conv_module = conv_module
        self.norm_ff = LayerNorm(size)  # for the FNN module
        self.norm_mha = LayerNorm(size)  # for the MHA module
        if feed_forward_macaron is not None:
            self.norm_ff_macaron = LayerNorm(size)
            self.ff_scale = 0.5
        else:
            self.ff_scale = 1.0
        if self.conv_module is not None:
            self.norm_conv = LayerNorm(size)  # for the CNN module
            self.norm_final = LayerNorm(
                size)  # for the final output of the block
        self.dropout = nn.Dropout(dropout_rate)
        self.size = size
        self.normalize_before = normalize_before
        self.concat_after = concat_after
        if self.concat_after:
            self.concat_linear = nn.Linear(size + size, size)
        self.stochastic_depth_rate = stochastic_depth_rate

    def forward(self, x_input, mask, cache=None):
        """Compute encoded features.

        Parameters
        ----------
        x_input : Union[Tuple, paddle.Tensor]
            Input tensor w/ or w/o pos emb.
            - w/ pos emb: Tuple of tensors [(#batch, time, size), (1, time, size)].
            - w/o pos emb: Tensor (#batch, time, size).
        mask : paddle.Tensor
            Mask tensor for the input (#batch, time).
        cache : paddle.Tensor
            Cache tensor of the input (#batch, time - 1, size).
        Returns
        ----------
        paddle.Tensor
            Output tensor (#batch, time, size).
        paddle.Tensor
            Mask tensor (#batch, time).
        """
        if isinstance(x_input, tuple):
            x, pos_emb = x_input[0], x_input[1]
        else:
            x, pos_emb = x_input, None

        skip_layer = False
        # with stochastic depth, residual connection `x + f(x)` becomes
        # `x <- x + 1 / (1 - p) * f(x)` at training time.
        stoch_layer_coeff = 1.0
        if self.training and self.stochastic_depth_rate > 0:
            skip_layer = paddle.rand(1).item() < self.stochastic_depth_rate
            stoch_layer_coeff = 1.0 / (1 - self.stochastic_depth_rate)

        if skip_layer:
            if cache is not None:
                x = paddle.concat([cache, x], axis=1)
            if pos_emb is not None:
                return (x, pos_emb), mask
            return x, mask

        # whether to use macaron style
        if self.feed_forward_macaron is not None:
            residual = x
            if self.normalize_before:
                x = self.norm_ff_macaron(x)
            x = residual + stoch_layer_coeff * self.ff_scale * self.dropout(
                self.feed_forward_macaron(x))
            if not self.normalize_before:
                x = self.norm_ff_macaron(x)

        # multi-headed self-attention module
        residual = x
        if self.normalize_before:
            x = self.norm_mha(x)

        if cache is None:
            x_q = x
        else:
            assert cache.shape == (x.shape[0], x.shape[1] - 1, self.size)
            x_q = x[:, -1:, :]
            residual = residual[:, -1:, :]
            mask = None if mask is None else mask[:, -1:, :]

        if pos_emb is not None:
            x_att = self.self_attn(x_q, x, x, pos_emb, mask)
        else:
            x_att = self.self_attn(x_q, x, x, mask)

        if self.concat_after:
            x_concat = paddle.concat((x, x_att), axis=-1)
            x = residual + stoch_layer_coeff * self.concat_linear(x_concat)
        else:
            x = residual + stoch_layer_coeff * self.dropout(x_att)
        if not self.normalize_before:
            x = self.norm_mha(x)

        # convolution module
        if self.conv_module is not None:
            residual = x
            if self.normalize_before:
                x = self.norm_conv(x)
            x = residual + stoch_layer_coeff * self.dropout(self.conv_module(x))
            if not self.normalize_before:
                x = self.norm_conv(x)

        # feed forward module
        residual = x
        if self.normalize_before:
            x = self.norm_ff(x)
        x = residual + stoch_layer_coeff * self.ff_scale * self.dropout(
            self.feed_forward(x))
        if not self.normalize_before:
            x = self.norm_ff(x)

        if self.conv_module is not None:
            x = self.norm_final(x)

        if cache is not None:
            x = paddle.concat([cache, x], axis=1)

        if pos_emb is not None:
            return (x, pos_emb), mask

        return x, mask
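As a wiring note (not part of the commit), a minimal sketch of how this layer is assembled; the `_ToyAttn` class and the `nn.Linear` feed-forward stand-ins are hypothetical placeholders for the real attention and position-wise modules, and `ConvolutionModule` is assumed importable from the companion file added in this commit.

# Hypothetical wiring sketch with toy stand-in submodules.
import paddle
from paddle import nn

class _ToyAttn(nn.Layer):
    # stands in for MultiHeadedAttention; ignores the mask for brevity
    def __init__(self, size):
        super().__init__()
        self.proj = nn.Linear(size, size)

    def forward(self, q, k, v, mask):
        return self.proj(q)

size = 8
layer = EncoderLayer(
    size,
    _ToyAttn(size),
    nn.Linear(size, size),  # stands in for PositionwiseFeedForward
    nn.Linear(size, size),  # stands in for the macaron feed-forward
    ConvolutionModule(size, kernel_size=7),
    dropout_rate=0.1)

x = paddle.randn([2, 5, size])
mask = paddle.ones([2, 1, 5])
y, mask = layer(x, mask)  # y: (2, 5, 8)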
@ -1,37 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import paddle
from paddle import Tensor


def expand(encodings: Tensor, durations: Tensor) -> Tensor:
    """
    encodings: (B, T, C)
    durations: (B, T)
    """
    batch_size, t_enc = durations.shape
    durations = durations.numpy()
    slens = np.sum(durations, -1)
    t_dec = np.max(slens)
    M = np.zeros([batch_size, t_dec, t_enc])
    for i in range(batch_size):
        k = 0
        for j in range(t_enc):
            d = durations[i, j]
            M[i, k:k + d, j] = 1
            k += d
    M = paddle.to_tensor(M, dtype=encodings.dtype)
    encodings = paddle.matmul(M, encodings)
    return encodings
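As a worked example (not part of the commit), a small numeric sketch of `expand`: each encoder frame is repeated according to its predicted duration.

# Hypothetical numeric example for expand().
import paddle

encodings = paddle.arange(6, dtype="float32").reshape([1, 3, 2])  # (B=1, T=3, C=2)
durations = paddle.to_tensor([[1, 2, 0]])                         # (B=1, T=3)
out = expand(encodings, durations)
# out has shape (1, 3, 2): frame 0 appears once, frame 1 twice, frame 2 is dropped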
@ -1,224 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Modified from espnet(https://github.com/espnet/espnet)
from paddle import nn

from paddlespeech.t2s.modules.fastspeech2_transformer.attention import MultiHeadedAttention
from paddlespeech.t2s.modules.fastspeech2_transformer.embedding import PositionalEncoding
from paddlespeech.t2s.modules.fastspeech2_transformer.encoder_layer import EncoderLayer
from paddlespeech.t2s.modules.fastspeech2_transformer.multi_layer_conv import Conv1dLinear
from paddlespeech.t2s.modules.fastspeech2_transformer.multi_layer_conv import MultiLayeredConv1d
from paddlespeech.t2s.modules.fastspeech2_transformer.positionwise_feed_forward import PositionwiseFeedForward
from paddlespeech.t2s.modules.fastspeech2_transformer.repeat import repeat


class Encoder(nn.Layer):
    """Transformer encoder module.

    Parameters
    ----------
    idim : int
        Input dimension.
    attention_dim : int
        Dimension of attention.
    attention_heads : int
        The number of heads of multi head attention.
    linear_units : int
        The number of units of position-wise feed forward.
    num_blocks : int
        The number of encoder blocks.
    dropout_rate : float
        Dropout rate.
    positional_dropout_rate : float
        Dropout rate after adding positional encoding.
    attention_dropout_rate : float
        Dropout rate in attention.
    input_layer : Union[str, paddle.nn.Layer]
        Input layer type.
    pos_enc_class : paddle.nn.Layer
        Positional encoding module class.
        `PositionalEncoding` or `ScaledPositionalEncoding`
    normalize_before : bool
        Whether to use layer_norm before the first block.
    concat_after : bool
        Whether to concat attention layer's input and output.
        If True, an additional linear will be applied,
        i.e. x -> x + linear(concat(x, att(x)));
        if False, no additional linear will be applied, i.e. x -> x + att(x).
    positionwise_layer_type : str
        "linear", "conv1d", or "conv1d-linear".
    positionwise_conv_kernel_size : int
        Kernel size of positionwise conv1d layer.
    selfattention_layer_type : str
        Encoder attention layer type.
    padding_idx : int
        Padding idx for input_layer=embed.
    """

    def __init__(
            self,
            idim,
            attention_dim=256,
            attention_heads=4,
            linear_units=2048,
            num_blocks=6,
            dropout_rate=0.1,
            positional_dropout_rate=0.1,
            attention_dropout_rate=0.0,
            input_layer="conv2d",
            pos_enc_class=PositionalEncoding,
            normalize_before=True,
            concat_after=False,
            positionwise_layer_type="linear",
            positionwise_conv_kernel_size=1,
            selfattention_layer_type="selfattn",
            padding_idx=-1, ):
        """Construct an Encoder object."""
        super(Encoder, self).__init__()
        self.conv_subsampling_factor = 1
        if input_layer == "linear":
            self.embed = nn.Sequential(
                nn.Linear(idim, attention_dim, bias_attr=True),
                nn.LayerNorm(attention_dim),
                nn.Dropout(dropout_rate),
                nn.ReLU(),
                pos_enc_class(attention_dim, positional_dropout_rate), )
        elif input_layer == "embed":
            self.embed = nn.Sequential(
                nn.Embedding(idim, attention_dim, padding_idx=padding_idx),
                pos_enc_class(attention_dim, positional_dropout_rate), )
        elif isinstance(input_layer, nn.Layer):
            self.embed = nn.Sequential(
                input_layer,
                pos_enc_class(attention_dim, positional_dropout_rate), )
        elif input_layer is None:
            self.embed = nn.Sequential(
                pos_enc_class(attention_dim, positional_dropout_rate))
        else:
            raise ValueError("unknown input_layer: " + input_layer)

        self.normalize_before = normalize_before
        positionwise_layer, positionwise_layer_args = self.get_positionwise_layer(
            positionwise_layer_type,
            attention_dim,
            linear_units,
            dropout_rate,
            positionwise_conv_kernel_size, )
        if selfattention_layer_type in [
                "selfattn",
                "rel_selfattn",
                "legacy_rel_selfattn",
        ]:
            encoder_selfattn_layer = MultiHeadedAttention
            encoder_selfattn_layer_args = [
                (attention_heads, attention_dim, attention_dropout_rate, )
            ] * num_blocks

        else:
            raise NotImplementedError(selfattention_layer_type)

        self.encoders = repeat(
            num_blocks,
            lambda lnum: EncoderLayer(
                attention_dim,
                encoder_selfattn_layer(*encoder_selfattn_layer_args[lnum]),
                positionwise_layer(*positionwise_layer_args),
                dropout_rate,
                normalize_before,
                concat_after, ), )
        if self.normalize_before:
            self.after_norm = nn.LayerNorm(attention_dim)

    def get_positionwise_layer(
            self,
            positionwise_layer_type="linear",
            attention_dim=256,
            linear_units=2048,
            dropout_rate=0.1,
            positionwise_conv_kernel_size=1, ):
        """Define positionwise layer."""
        if positionwise_layer_type == "linear":
            positionwise_layer = PositionwiseFeedForward
            positionwise_layer_args = (attention_dim, linear_units,
                                       dropout_rate)
        elif positionwise_layer_type == "conv1d":
            positionwise_layer = MultiLayeredConv1d
            positionwise_layer_args = (attention_dim, linear_units,
                                       positionwise_conv_kernel_size,
                                       dropout_rate, )
        elif positionwise_layer_type == "conv1d-linear":
            positionwise_layer = Conv1dLinear
            positionwise_layer_args = (attention_dim, linear_units,
                                       positionwise_conv_kernel_size,
                                       dropout_rate, )
        else:
            raise NotImplementedError("Support only linear or conv1d.")
        return positionwise_layer, positionwise_layer_args

    def forward(self, xs, masks):
        """Encode input sequence.

        Parameters
        ----------
        xs : paddle.Tensor
            Input tensor (#batch, time, idim).
        masks : paddle.Tensor
            Mask tensor (#batch, time).

        Returns
        ----------
        paddle.Tensor
            Output tensor (#batch, time, attention_dim).
        paddle.Tensor
            Mask tensor (#batch, time).
        """

        xs = self.embed(xs)
        xs, masks = self.encoders(xs, masks)
        if self.normalize_before:
            xs = self.after_norm(xs)
        return xs, masks

    def forward_one_step(self, xs, masks, cache=None):
        """Encode input frame.

        Parameters
        ----------
        xs : paddle.Tensor
            Input tensor.
        masks : paddle.Tensor
            Mask tensor.
        cache : List[paddle.Tensor]
            List of cache tensors.

        Returns
        ----------
        paddle.Tensor
            Output tensor.
        paddle.Tensor
            Mask tensor.
        List[paddle.Tensor]
            List of new cache tensors.
        """

        xs = self.embed(xs)
        if cache is None:
            cache = [None for _ in range(len(self.encoders))]
        new_cache = []
        for c, e in zip(cache, self.encoders):
            xs, masks = e(xs, masks, cache=c)
            new_cache.append(xs)
        if self.normalize_before:
            xs = self.after_norm(xs)
        return xs, masks, new_cache
@ -1,120 +0,0 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle

__all__ = [
    "id_mask",
    "feature_mask",
    "combine_mask",
    "future_mask",
]


def id_mask(input, padding_index=0, dtype="bool"):
    """Generate mask with input ids.

    Those positions where the value equals ``padding_index`` correspond to 0 or
    ``False``, otherwise, 1 or ``True``.

    Parameters
    ----------
    input : Tensor [dtype: int]
        The input tensor. It represents the ids.
    padding_index : int, optional
        The id which represents padding, by default 0.
    dtype : str, optional
        Data type of the returned mask, by default "bool".

    Returns
    -------
    Tensor
        The generated mask. It has the same shape as ``input``.
    """
    return paddle.cast(input != padding_index, dtype)


def feature_mask(input, axis, dtype="bool"):
    """Compute mask from input features.

    For input features, represented as batched feature vectors, those vectors
    which are all zeros are considered padding vectors.

    Parameters
    ----------
    input : Tensor [dtype: float]
        The input tensor which represents features.
    axis : int
        The index of the feature dimension in ``input``. Other dimensions are
        considered ``spatial`` dimensions.
    dtype : str, optional
        Data type of the generated mask, by default "bool".

    Returns
    -------
    Tensor
        The generated mask with the ``spatial`` shape as mentioned above.

        It has one less dimension than ``input`` does.
    """
    feature_sum = paddle.sum(paddle.abs(input), axis)
    return paddle.cast(feature_sum != 0, dtype)


def combine_mask(mask1, mask2):
    """Combine two masks with multiplication or logical and.

    Parameters
    -----------
    mask1 : Tensor
        The first mask.
    mask2 : Tensor
        The second mask with broadcastable shape with ``mask1``.

    Returns
    --------
    Tensor
        Combined mask.

    Notes
    ------
    It is mainly used to combine the padding mask and no future mask for
    transformer decoder.

    Padding mask is used to mask padding positions of the decoder inputs and
    no future mask is used to prevent the decoder from seeing future information.
    """
    if mask1.dtype == paddle.fluid.core.VarDesc.VarType.BOOL:
        return paddle.logical_and(mask1, mask2)
    else:
        return mask1 * mask2


def future_mask(time_steps, dtype="bool"):
    """Generate lower triangular mask.

    It is used in the transformer decoder to prevent the decoder from seeing
    future information.

    Parameters
    ----------
    time_steps : int
        Decoder time steps.
    dtype : str, optional
        The data type of the generated mask, by default "bool".

    Returns
    -------
    Tensor
        The generated mask.
    """
    mask = paddle.tril(paddle.ones([time_steps, time_steps]))
    return paddle.cast(mask, dtype)
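A minimal usage sketch for the masking helpers above (illustrative only, not part of this commit; it assumes the functions defined in this file are in scope, and the ids and shapes are made up):

# Hedged usage sketch: build a decoder mask by combining a padding mask
# with a no-future mask, as the combine_mask docstring describes.
import paddle

ids = paddle.to_tensor([[5, 7, 9, 0, 0]])              # batch of token ids, 0 is padding
padding_mask = id_mask(ids)                            # (1, 5), False at padding positions
no_future = future_mask(5)                             # (5, 5) lower triangular bool mask
# broadcast (1, 1, 5) against (5, 5) -> (1, 5, 5) combined decoder mask
decoder_mask = combine_mask(padding_mask.unsqueeze(1), no_future)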
@ -1,80 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from math import exp

import paddle
import paddle.nn.functional as F
from paddle import nn


def gaussian(window_size, sigma):
    gauss = paddle.to_tensor([
        exp(-(x - window_size // 2)**2 / float(2 * sigma**2))
        for x in range(window_size)
    ])
    return gauss / gauss.sum()


def create_window(window_size, channel):
    _1D_window = gaussian(window_size, 1.5).unsqueeze(1)
    _2D_window = paddle.matmul(_1D_window, paddle.transpose(
        _1D_window, [1, 0])).unsqueeze([0, 1])
    window = paddle.expand(_2D_window, [channel, 1, window_size, window_size])
    return window


def _ssim(img1, img2, window, window_size, channel, size_average=True):
    mu1 = F.conv2d(img1, window, padding=window_size // 2, groups=channel)
    mu2 = F.conv2d(img2, window, padding=window_size // 2, groups=channel)

    mu1_sq = mu1.pow(2)
    mu2_sq = mu2.pow(2)
    mu1_mu2 = mu1 * mu2

    sigma1_sq = F.conv2d(
        img1 * img1, window, padding=window_size // 2, groups=channel) - mu1_sq
    sigma2_sq = F.conv2d(
        img2 * img2, window, padding=window_size // 2, groups=channel) - mu2_sq
    sigma12 = F.conv2d(
        img1 * img2, window, padding=window_size // 2, groups=channel) - mu1_mu2

    C1 = 0.01**2
    C2 = 0.03**2

    ssim_map = ((2 * mu1_mu2 + C1) * (2 * sigma12 + C2)) \
        / ((mu1_sq + mu2_sq + C1) * (sigma1_sq + sigma2_sq + C2))

    if size_average:
        return ssim_map.mean()
    else:
        return ssim_map.mean(1).mean(1).mean(1)


class SSIM(nn.Layer):
    def __init__(self, window_size=11, size_average=True):
        super().__init__()
        self.window_size = window_size
        self.size_average = size_average
        self.channel = 1
        self.window = create_window(window_size, self.channel)

    def forward(self, img1, img2):
        return _ssim(img1, img2, self.window, self.window_size, self.channel,
                     self.size_average)


def ssim(img1, img2, window_size=11, size_average=True):
    (_, channel, _, _) = img1.shape
    window = create_window(window_size, channel)
    return _ssim(img1, img2, window, window_size, channel, size_average)
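A minimal usage sketch for the SSIM utilities above (illustrative only, not part of this commit; the spectrogram-shaped tensors are random placeholders):

# Hedged usage sketch: SSIM between two mel-spectrogram "images" laid out as
# (batch, channel, time, n_mels).
import paddle

x = paddle.rand([4, 1, 200, 80])
y = x + 0.05 * paddle.randn([4, 1, 200, 80])
print(float(ssim(x, y)))       # functional form; should be close to 1.0 for nearly identical inputs
print(float(SSIM()(x, y)))     # module form, same default 11x11 Gaussian window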
@ -1,220 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Modified from espnet(https://github.com/espnet/espnet)
import paddle
from paddle import nn
from paddle.nn import functional as F
from scipy import signal


def stft(x,
         fft_size,
         hop_length=None,
         win_length=None,
         window='hann',
         center=True,
         pad_mode='reflect'):
    """Perform STFT and convert to magnitude spectrogram.
    Parameters
    ----------
    x : Tensor
        Input signal tensor (B, T).
    fft_size : int
        FFT size.
    hop_length : int
        Hop size.
    win_length : int
        Window length.
    window : str, optional
        Name of window function, see `scipy.signal.get_window` for more
        details. Defaults to "hann".
    center : bool, optional
        Whether to pad `x` so that the :math:`t`-th frame is centered at
        :math:`t \times hop\_length`. Default: `True`.
    pad_mode : str, optional
        Padding pattern used when `center` is `True`.
    Returns
    ----------
    Tensor
        Magnitude spectrogram (B, #frames, fft_size // 2 + 1).
    """
    # calculate window
    window = signal.get_window(window, win_length, fftbins=True)
    window = paddle.to_tensor(window)
    x_stft = paddle.signal.stft(
        x,
        fft_size,
        hop_length,
        win_length,
        window=window,
        center=center,
        pad_mode=pad_mode)

    real = x_stft.real()
    imag = x_stft.imag()

    return paddle.sqrt(paddle.clip(real**2 + imag**2, min=1e-7)).transpose(
        [0, 2, 1])


class SpectralConvergenceLoss(nn.Layer):
    """Spectral convergence loss module."""

    def __init__(self):
        """Initialize spectral convergence loss module."""
        super().__init__()

    def forward(self, x_mag, y_mag):
        """Calculate forward propagation.
        Parameters
        ----------
        x_mag : Tensor
            Magnitude spectrogram of predicted signal (B, #frames, #freq_bins).
        y_mag : Tensor
            Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins).
        Returns
        ----------
        Tensor
            Spectral convergence loss value.
        """
        return paddle.norm(
            y_mag - x_mag, p="fro") / paddle.clip(
                paddle.norm(y_mag, p="fro"), min=1e-10)


class LogSTFTMagnitudeLoss(nn.Layer):
    """Log STFT magnitude loss module."""

    def __init__(self, epsilon=1e-7):
        """Initialize log STFT magnitude loss module."""
        super().__init__()
        self.epsilon = epsilon

    def forward(self, x_mag, y_mag):
        """Calculate forward propagation.
        Parameters
        ----------
        x_mag : Tensor
            Magnitude spectrogram of predicted signal (B, #frames, #freq_bins).
        y_mag : Tensor
            Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins).
        Returns
        ----------
        Tensor
            Log STFT magnitude loss value.
        """
        return F.l1_loss(
            paddle.log(paddle.clip(y_mag, min=self.epsilon)),
            paddle.log(paddle.clip(x_mag, min=self.epsilon)))


class STFTLoss(nn.Layer):
    """STFT loss module."""

    def __init__(self,
                 fft_size=1024,
                 shift_size=120,
                 win_length=600,
                 window="hann"):
        """Initialize STFT loss module."""
        super().__init__()
        self.fft_size = fft_size
        self.shift_size = shift_size
        self.win_length = win_length
        self.window = window
        self.spectral_convergence_loss = SpectralConvergenceLoss()
        self.log_stft_magnitude_loss = LogSTFTMagnitudeLoss()

    def forward(self, x, y):
        """Calculate forward propagation.
        Parameters
        ----------
        x : Tensor
            Predicted signal (B, T).
        y : Tensor
            Groundtruth signal (B, T).
        Returns
        ----------
        Tensor
            Spectral convergence loss value.
        Tensor
            Log STFT magnitude loss value.
        """
        x_mag = stft(x, self.fft_size, self.shift_size, self.win_length,
                     self.window)
        y_mag = stft(y, self.fft_size, self.shift_size, self.win_length,
                     self.window)
        sc_loss = self.spectral_convergence_loss(x_mag, y_mag)
        mag_loss = self.log_stft_magnitude_loss(x_mag, y_mag)

        return sc_loss, mag_loss


class MultiResolutionSTFTLoss(nn.Layer):
    """Multi resolution STFT loss module."""

    def __init__(
            self,
            fft_sizes=[1024, 2048, 512],
            hop_sizes=[120, 240, 50],
            win_lengths=[600, 1200, 240],
            window="hann", ):
        """Initialize Multi resolution STFT loss module.
        Parameters
        ----------
        fft_sizes : list
            List of FFT sizes.
        hop_sizes : list
            List of hop sizes.
        win_lengths : list
            List of window lengths.
        window : str
            Window function type.
        """
        super().__init__()
        assert len(fft_sizes) == len(hop_sizes) == len(win_lengths)
        self.stft_losses = nn.LayerList()
        for fs, ss, wl in zip(fft_sizes, hop_sizes, win_lengths):
            self.stft_losses.append(STFTLoss(fs, ss, wl, window))

    def forward(self, x, y):
        """Calculate forward propagation.
        Parameters
        ----------
        x : Tensor
            Predicted signal (B, T) or (B, #subband, T).
        y : Tensor
            Groundtruth signal (B, T) or (B, #subband, T).
        Returns
        ----------
        Tensor
            Multi resolution spectral convergence loss value.
        Tensor
            Multi resolution log STFT magnitude loss value.
        """
        if len(x.shape) == 3:
            # (B, C, T) -> (B x C, T)
            x = x.reshape([-1, x.shape[2]])
            # (B, C, T) -> (B x C, T)
            y = y.reshape([-1, y.shape[2]])
        sc_loss = 0.0
        mag_loss = 0.0
        for f in self.stft_losses:
            sc_l, mag_l = f(x, y)
            sc_loss += sc_l
            mag_loss += mag_l
        sc_loss /= len(self.stft_losses)
        mag_loss /= len(self.stft_losses)

        return sc_loss, mag_loss
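A minimal usage sketch for the multi-resolution STFT loss above (illustrative only, not part of this commit; the waveforms are random placeholders and the default resolutions are used):

# Hedged usage sketch: compare a generated waveform batch against a reference.
import paddle

criterion = MultiResolutionSTFTLoss()          # default three resolutions
y_hat = paddle.randn([2, 24000])               # predicted audio, (B, T)
y = paddle.randn([2, 24000])                   # reference audio, (B, T)
sc_loss, mag_loss = criterion(y_hat, y)
loss = sc_loss + mag_loss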
@ -1,208 +0,0 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle import nn
from paddle.nn import functional as F

from paddlespeech.t2s.modules import attention as attn

__all__ = [
    "PositionwiseFFN",
    "TransformerEncoderLayer",
    "TransformerDecoderLayer",
]


class PositionwiseFFN(nn.Layer):
    """A faithful implementation of Position-wise Feed-Forward Network
    in `Attention is All You Need <https://arxiv.org/abs/1706.03762>`_.
    It is basically a 2-layer MLP, with relu activation and dropout in between.

    Parameters
    ----------
    input_size: int
        The feature size of the input. It is also the feature size of the
        output.
    hidden_size: int
        The hidden size.
    dropout: float
        The probability of the Dropout applied to the output of the first
        layer, by default 0.
    """

    def __init__(self, input_size: int, hidden_size: int, dropout=0.0):
        super(PositionwiseFFN, self).__init__()
        self.linear1 = nn.Linear(input_size, hidden_size)
        self.linear2 = nn.Linear(hidden_size, input_size)
        self.dropout = nn.Dropout(dropout)

        self.input_size = input_size
        self.hidden_size = hidden_size

    def forward(self, x):
        r"""Forward pass of positionwise feed forward network.

        Parameters
        ----------
        x : Tensor [shape=(\*, input_size)]
            The input tensor, where ``\*`` means arbitrary shape.

        Returns
        -------
        Tensor [shape=(\*, input_size)]
            The output tensor.
        """
        l1 = self.dropout(F.relu(self.linear1(x)))
        l2 = self.linear2(l1)
        return l2


class TransformerEncoderLayer(nn.Layer):
    """A faithful implementation of Transformer encoder layer in
    `Attention is All You Need <https://arxiv.org/abs/1706.03762>`_.

    Parameters
    ----------
    d_model : int
        The feature size of the input. It is also the feature size of the
        output.
    n_heads : int
        The number of heads of self attention (a ``MultiheadAttention``
        layer).
    d_ffn : int
        The hidden size of the positional feed forward network (a
        ``PositionwiseFFN`` layer).
    dropout : float, optional
        The probability of the dropout in MultiHeadAttention and
        PositionwiseFFN, by default 0.

    Notes
    ------
    It uses the PostLN (post layer norm) scheme.
    """

    def __init__(self, d_model, n_heads, d_ffn, dropout=0.):
        super(TransformerEncoderLayer, self).__init__()
        self.self_mha = attn.MultiheadAttention(d_model, n_heads, dropout)
        self.layer_norm1 = nn.LayerNorm([d_model], epsilon=1e-6)

        self.ffn = PositionwiseFFN(d_model, d_ffn, dropout)
        self.layer_norm2 = nn.LayerNorm([d_model], epsilon=1e-6)

        self.dropout = dropout

    def forward(self, x, mask):
        """Forward pass of TransformerEncoderLayer.

        Parameters
        ----------
        x : Tensor [shape=(batch_size, time_steps, d_model)]
            The input.
        mask : Tensor
            The padding mask. The shape is (batch_size, time_steps,
            time_steps) or broadcastable shape.

        Returns
        -------
        x : Tensor [shape=(batch_size, time_steps, d_model)]
            The encoded output.

        attn_weights : Tensor [shape=(batch_size, n_heads, time_steps, time_steps)]
            The attention weights of the self attention.
        """
        context_vector, attn_weights = self.self_mha(x, x, x, mask)
        x = self.layer_norm1(
            F.dropout(x + context_vector, self.dropout, training=self.training))

        x = self.layer_norm2(
            F.dropout(x + self.ffn(x), self.dropout, training=self.training))
        return x, attn_weights


class TransformerDecoderLayer(nn.Layer):
    """A faithful implementation of Transformer decoder layer in
    `Attention is All You Need <https://arxiv.org/abs/1706.03762>`_.

    Parameters
    ----------
    d_model : int
        The feature size of the input. It is also the feature size of the
        output.
    n_heads : int
        The number of heads of attentions (``MultiheadAttention``
        layers).
    d_ffn : int
        The hidden size of the positional feed forward network (a
        ``PositionwiseFFN`` layer).
    dropout : float, optional
        The probability of the dropout in MultiHeadAttention and
        PositionwiseFFN, by default 0.

    Notes
    ------
    It uses the PostLN (post layer norm) scheme.
    """

    def __init__(self, d_model, n_heads, d_ffn, dropout=0.):
        super(TransformerDecoderLayer, self).__init__()
        self.self_mha = attn.MultiheadAttention(d_model, n_heads, dropout)
        self.layer_norm1 = nn.LayerNorm([d_model], epsilon=1e-6)

        self.cross_mha = attn.MultiheadAttention(d_model, n_heads, dropout)
        self.layer_norm2 = nn.LayerNorm([d_model], epsilon=1e-6)

        self.ffn = PositionwiseFFN(d_model, d_ffn, dropout)
        self.layer_norm3 = nn.LayerNorm([d_model], epsilon=1e-6)

        self.dropout = dropout

    def forward(self, q, k, v, encoder_mask, decoder_mask):
        """Forward pass of TransformerDecoderLayer.

        Parameters
        ----------
        q : Tensor [shape=(batch_size, time_steps_q, d_model)]
            The decoder input.
        k : Tensor [shape=(batch_size, time_steps_k, d_model)]
            The keys.
        v : Tensor [shape=(batch_size, time_steps_k, d_model)]
            The values.
        encoder_mask : Tensor
            Encoder padding mask, shape is ``(batch_size, time_steps_k,
            time_steps_k)`` or broadcastable shape.
        decoder_mask : Tensor
            Decoder mask, shape is ``(batch_size, time_steps_q, time_steps_k)``
            or broadcastable shape.

        Returns
        --------
        q : Tensor [shape=(batch_size, time_steps_q, d_model)]
            The decoder output.
        self_attn_weights : Tensor [shape=(batch_size, n_heads, time_steps_q, time_steps_q)]
            Decoder self attention.

        cross_attn_weights : Tensor [shape=(batch_size, n_heads, time_steps_q, time_steps_k)]
            Decoder-encoder cross attention.
        """
        context_vector, self_attn_weights = self.self_mha(q, q, q, decoder_mask)
        q = self.layer_norm1(
            F.dropout(q + context_vector, self.dropout, training=self.training))

        context_vector, cross_attn_weights = self.cross_mha(q, k, v,
                                                            encoder_mask)
        q = self.layer_norm2(
            F.dropout(q + context_vector, self.dropout, training=self.training))

        q = self.layer_norm3(
            F.dropout(q + self.ffn(q), self.dropout, training=self.training))
        return q, self_attn_weights, cross_attn_weights
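A minimal usage sketch for the position-wise FFN above (illustrative only, not part of this commit; the feature sizes are made up):

# Hedged usage sketch: the FFN maps (*, input_size) -> (*, input_size)
# through a hidden layer, as its docstring states.
import paddle

ffn = PositionwiseFFN(input_size=256, hidden_size=1024, dropout=0.1)
y = ffn(paddle.randn([3, 10, 256]))            # shape preserved: (3, 10, 256)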
@ -0,0 +1,609 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Modified from espnet(https://github.com/espnet/espnet)
from typing import List
from typing import Union

from paddle import nn

from paddlespeech.t2s.modules.activation import get_activation
from paddlespeech.t2s.modules.conformer.convolution import ConvolutionModule
from paddlespeech.t2s.modules.conformer.encoder_layer import EncoderLayer as ConformerEncoderLayer
from paddlespeech.t2s.modules.layer_norm import LayerNorm
from paddlespeech.t2s.modules.transformer.attention import MultiHeadedAttention
from paddlespeech.t2s.modules.transformer.attention import RelPositionMultiHeadedAttention
from paddlespeech.t2s.modules.transformer.embedding import PositionalEncoding
from paddlespeech.t2s.modules.transformer.embedding import RelPositionalEncoding
from paddlespeech.t2s.modules.transformer.embedding import ScaledPositionalEncoding
from paddlespeech.t2s.modules.transformer.encoder_layer import EncoderLayer
from paddlespeech.t2s.modules.transformer.multi_layer_conv import Conv1dLinear
from paddlespeech.t2s.modules.transformer.multi_layer_conv import MultiLayeredConv1d
from paddlespeech.t2s.modules.transformer.positionwise_feed_forward import PositionwiseFeedForward
from paddlespeech.t2s.modules.transformer.repeat import repeat
from paddlespeech.t2s.modules.transformer.subsampling import Conv2dSubsampling


class BaseEncoder(nn.Layer):
    """Base Encoder module.

    Parameters
    ----------
    idim : int
        Input dimension.
    attention_dim : int
        Dimension of attention.
    attention_heads : int
        The number of heads of multi head attention.
    linear_units : int
        The number of units of position-wise feed forward.
    num_blocks : int
        The number of encoder blocks.
    dropout_rate : float
        Dropout rate.
    positional_dropout_rate : float
        Dropout rate after adding positional encoding.
    attention_dropout_rate : float
        Dropout rate in attention.
    input_layer : Union[str, nn.Layer]
        Input layer type.
    normalize_before : bool
        Whether to use layer_norm before the first block.
    concat_after : bool
        Whether to concat attention layer's input and output.
        if True, additional linear will be applied.
        i.e. x -> x + linear(concat(x, att(x)))
        if False, no additional linear will be applied. i.e. x -> x + att(x)
    positionwise_layer_type : str
        "linear", "conv1d", or "conv1d-linear".
    positionwise_conv_kernel_size : int
        Kernel size of positionwise conv1d layer.
    macaron_style : bool
        Whether to use macaron style for positionwise layer.
    pos_enc_layer_type : str
        Encoder positional encoding layer type.
    selfattention_layer_type : str
        Encoder attention layer type.
    activation_type : str
        Encoder activation function type.
    use_cnn_module : bool
        Whether to use convolution module.
    zero_triu : bool
        Whether to zero the upper triangular part of attention matrix.
    cnn_module_kernel : int
        Kernel size of convolution module.
    padding_idx : int
        Padding idx for input_layer=embed.
    stochastic_depth_rate : float
        Maximum probability to skip the encoder layer.
    intermediate_layers : Union[List[int], None]
        Indices of intermediate CTC layers.
        Indices start from 1.
        If not None, intermediate outputs are returned (which changes the return
        type signature.)
    encoder_type: str
        "transformer", or "conformer".
    """

    def __init__(self,
                 idim: int,
                 attention_dim: int=256,
                 attention_heads: int=4,
                 linear_units: int=2048,
                 num_blocks: int=6,
                 dropout_rate: float=0.1,
                 positional_dropout_rate: float=0.1,
                 attention_dropout_rate: float=0.0,
                 input_layer: str="conv2d",
                 normalize_before: bool=True,
                 concat_after: bool=False,
                 positionwise_layer_type: str="linear",
                 positionwise_conv_kernel_size: int=1,
                 macaron_style: bool=False,
                 pos_enc_layer_type: str="abs_pos",
                 selfattention_layer_type: str="selfattn",
                 activation_type: str="swish",
                 use_cnn_module: bool=False,
                 zero_triu: bool=False,
                 cnn_module_kernel: int=31,
                 padding_idx: int=-1,
                 stochastic_depth_rate: float=0.0,
                 intermediate_layers: Union[List[int], None]=None,
                 encoder_type: str="transformer"):
        """Construct a Base Encoder object."""
        super().__init__()
        activation = get_activation(activation_type)
        pos_enc_class = self.get_pos_enc_class(pos_enc_layer_type,
                                               selfattention_layer_type)
        self.encoder_type = encoder_type

        self.conv_subsampling_factor = 1
        self.embed = self.get_embed(
            idim=idim,
            input_layer=input_layer,
            attention_dim=attention_dim,
            pos_enc_class=pos_enc_class,
            dropout_rate=dropout_rate,
            positional_dropout_rate=positional_dropout_rate,
            padding_idx=padding_idx)

        self.normalize_before = normalize_before

        # self-attention module definition
        encoder_selfattn_layer, encoder_selfattn_layer_args = self.get_encoder_selfattn_layer(
            selfattention_layer_type=selfattention_layer_type,
            attention_heads=attention_heads,
            attention_dim=attention_dim,
            attention_dropout_rate=attention_dropout_rate,
            zero_triu=zero_triu,
            pos_enc_layer_type=pos_enc_layer_type)
        # feed-forward module definition
        positionwise_layer, positionwise_layer_args = self.get_positionwise_layer(
            positionwise_layer_type, attention_dim, linear_units, dropout_rate,
            positionwise_conv_kernel_size, activation)

        # convolution module definition
        convolution_layer = ConvolutionModule
        convolution_layer_args = (attention_dim, cnn_module_kernel, activation)

        if self.encoder_type == "transformer":
            self.encoders = repeat(
                num_blocks,
                lambda lnum: EncoderLayer(
                    attention_dim,
                    encoder_selfattn_layer(*encoder_selfattn_layer_args),
                    positionwise_layer(*positionwise_layer_args),
                    dropout_rate,
                    normalize_before,
                    concat_after, ), )

        elif self.encoder_type == "conformer":
            self.encoders = repeat(
                num_blocks,
                lambda lnum: ConformerEncoderLayer(
                    attention_dim,
                    encoder_selfattn_layer(*encoder_selfattn_layer_args),
                    positionwise_layer(*positionwise_layer_args),
                    positionwise_layer(*positionwise_layer_args) if macaron_style else None,
                    convolution_layer(*convolution_layer_args) if use_cnn_module else None,
                    dropout_rate,
                    normalize_before,
                    concat_after,
                    stochastic_depth_rate * float(1 + lnum) / num_blocks, ), )
            self.intermediate_layers = intermediate_layers
        else:
            raise NotImplementedError(
                "Support only transformer or conformer.")

        if self.normalize_before:
            self.after_norm = LayerNorm(attention_dim)

    def get_positionwise_layer(self,
                               positionwise_layer_type: str="linear",
                               attention_dim: int=256,
                               linear_units: int=2048,
                               dropout_rate: float=0.1,
                               positionwise_conv_kernel_size: int=1,
                               activation: nn.Layer=nn.ReLU()):
        """Define positionwise layer."""
        if positionwise_layer_type == "linear":
            positionwise_layer = PositionwiseFeedForward
            positionwise_layer_args = (attention_dim, linear_units,
                                       dropout_rate, activation)
        elif positionwise_layer_type == "conv1d":
            positionwise_layer = MultiLayeredConv1d
            positionwise_layer_args = (attention_dim, linear_units,
                                       positionwise_conv_kernel_size,
                                       dropout_rate, )
        elif positionwise_layer_type == "conv1d-linear":
            positionwise_layer = Conv1dLinear
            positionwise_layer_args = (attention_dim, linear_units,
                                       positionwise_conv_kernel_size,
                                       dropout_rate, )
        else:
            raise NotImplementedError("Support only linear or conv1d.")
        return positionwise_layer, positionwise_layer_args

    def get_encoder_selfattn_layer(self,
                                   selfattention_layer_type: str="selfattn",
                                   attention_heads: int=4,
                                   attention_dim: int=256,
                                   attention_dropout_rate: float=0.0,
                                   zero_triu: bool=False,
                                   pos_enc_layer_type: str="abs_pos"):
        if selfattention_layer_type == "selfattn":
            encoder_selfattn_layer = MultiHeadedAttention
            encoder_selfattn_layer_args = (attention_heads, attention_dim,
                                           attention_dropout_rate, )
        elif selfattention_layer_type == "rel_selfattn":
            assert pos_enc_layer_type == "rel_pos"
            encoder_selfattn_layer = RelPositionMultiHeadedAttention
            encoder_selfattn_layer_args = (attention_heads, attention_dim,
                                           attention_dropout_rate, zero_triu, )
        else:
            raise ValueError("unknown encoder_attn_layer: " +
                             selfattention_layer_type)
        return encoder_selfattn_layer, encoder_selfattn_layer_args

    def get_pos_enc_class(self,
                          pos_enc_layer_type: str="abs_pos",
                          selfattention_layer_type: str="selfattn"):
        if pos_enc_layer_type == "abs_pos":
            pos_enc_class = PositionalEncoding
        elif pos_enc_layer_type == "scaled_abs_pos":
            pos_enc_class = ScaledPositionalEncoding
        elif pos_enc_layer_type == "rel_pos":
            assert selfattention_layer_type == "rel_selfattn"
            pos_enc_class = RelPositionalEncoding
        else:
            raise ValueError("unknown pos_enc_layer: " + pos_enc_layer_type)
        return pos_enc_class

    def get_embed(self,
                  idim,
                  input_layer="conv2d",
                  attention_dim: int=256,
                  pos_enc_class=PositionalEncoding,
                  dropout_rate: float=0.1,
                  positional_dropout_rate: float=0.1,
                  padding_idx: int=-1):

        if input_layer == "linear":
            embed = nn.Sequential(
                nn.Linear(idim, attention_dim),
                nn.LayerNorm(attention_dim),
                nn.Dropout(dropout_rate),
                nn.ReLU(),
                pos_enc_class(attention_dim, positional_dropout_rate), )
        elif input_layer == "conv2d":
            embed = Conv2dSubsampling(
                idim,
                attention_dim,
                dropout_rate,
                pos_enc_class(attention_dim, positional_dropout_rate), )
            self.conv_subsampling_factor = 4
        elif input_layer == "embed":
            embed = nn.Sequential(
                nn.Embedding(idim, attention_dim, padding_idx=padding_idx),
                pos_enc_class(attention_dim, positional_dropout_rate), )
        elif isinstance(input_layer, nn.Layer):
            embed = nn.Sequential(
                input_layer,
                pos_enc_class(attention_dim, positional_dropout_rate), )
        elif input_layer is None:
            embed = nn.Sequential(
                pos_enc_class(attention_dim, positional_dropout_rate))
        else:
            raise ValueError("unknown input_layer: " + input_layer)

        return embed

    def forward(self, xs, masks):
        """Encode input sequence.

        Parameters
        ----------
        xs : paddle.Tensor
            Input tensor (#batch, time, idim).
        masks : paddle.Tensor
            Mask tensor (#batch, 1, time).

        Returns
        ----------
        paddle.Tensor
            Output tensor (#batch, time, attention_dim).
        paddle.Tensor
            Mask tensor (#batch, 1, time).
        """
        xs = self.embed(xs)
        xs, masks = self.encoders(xs, masks)
        if self.normalize_before:
            xs = self.after_norm(xs)
        return xs, masks


class TransformerEncoder(BaseEncoder):
    """Transformer encoder module.
    Parameters
    ----------
    idim : int
        Input dimension.
    attention_dim : int
        Dimension of attention.
    attention_heads : int
        The number of heads of multi head attention.
    linear_units : int
        The number of units of position-wise feed forward.
    num_blocks : int
        The number of encoder blocks.
    dropout_rate : float
        Dropout rate.
    positional_dropout_rate : float
        Dropout rate after adding positional encoding.
    attention_dropout_rate : float
        Dropout rate in attention.
    input_layer : Union[str, paddle.nn.Layer]
        Input layer type.
    pos_enc_layer_type : str
        Encoder positional encoding layer type.
    normalize_before : bool
        Whether to use layer_norm before the first block.
    concat_after : bool
        Whether to concat attention layer's input and output.
        if True, additional linear will be applied.
        i.e. x -> x + linear(concat(x, att(x)))
        if False, no additional linear will be applied. i.e. x -> x + att(x)
    positionwise_layer_type : str
        "linear", "conv1d", or "conv1d-linear".
    positionwise_conv_kernel_size : int
        Kernel size of positionwise conv1d layer.
    selfattention_layer_type : str
        Encoder attention layer type.
    activation_type : str
        Encoder activation function type.
    padding_idx : int
        Padding idx for input_layer=embed.
    """

    def __init__(
            self,
            idim,
            attention_dim: int=256,
            attention_heads: int=4,
            linear_units: int=2048,
            num_blocks: int=6,
            dropout_rate: float=0.1,
            positional_dropout_rate: float=0.1,
            attention_dropout_rate: float=0.0,
            input_layer: str="conv2d",
            pos_enc_layer_type: str="abs_pos",
            normalize_before: bool=True,
            concat_after: bool=False,
            positionwise_layer_type: str="linear",
            positionwise_conv_kernel_size: int=1,
            selfattention_layer_type: str="selfattn",
            activation_type: str="relu",
            padding_idx: int=-1, ):
        """Construct a Transformer Encoder object."""
        super().__init__(
            idim,
            attention_dim=attention_dim,
            attention_heads=attention_heads,
            linear_units=linear_units,
            num_blocks=num_blocks,
            dropout_rate=dropout_rate,
            positional_dropout_rate=positional_dropout_rate,
            attention_dropout_rate=attention_dropout_rate,
            input_layer=input_layer,
            pos_enc_layer_type=pos_enc_layer_type,
            normalize_before=normalize_before,
            concat_after=concat_after,
            positionwise_layer_type=positionwise_layer_type,
            positionwise_conv_kernel_size=positionwise_conv_kernel_size,
            selfattention_layer_type=selfattention_layer_type,
            activation_type=activation_type,
            padding_idx=padding_idx,
            encoder_type="transformer")

    def forward(self, xs, masks):
        """Encode input sequence.

        Parameters
        ----------
        xs : paddle.Tensor
            Input tensor (#batch, time, idim).
        masks : paddle.Tensor
            Mask tensor (#batch, 1, time).

        Returns
        ----------
        paddle.Tensor
            Output tensor (#batch, time, attention_dim).
        paddle.Tensor
            Mask tensor (#batch, 1, time).
        """
        xs = self.embed(xs)
        xs, masks = self.encoders(xs, masks)
        if self.normalize_before:
            xs = self.after_norm(xs)
        return xs, masks

    def forward_one_step(self, xs, masks, cache=None):
        """Encode input frame.

        Parameters
        ----------
        xs : paddle.Tensor
            Input tensor.
        masks : paddle.Tensor
            Mask tensor.
        cache : List[paddle.Tensor]
            List of cache tensors.

        Returns
        ----------
        paddle.Tensor
            Output tensor.
        paddle.Tensor
            Mask tensor.
        List[paddle.Tensor]
            List of new cache tensors.
        """
        xs = self.embed(xs)
        if cache is None:
            cache = [None for _ in range(len(self.encoders))]
        new_cache = []
        for c, e in zip(cache, self.encoders):
            xs, masks = e(xs, masks, cache=c)
            new_cache.append(xs)
        if self.normalize_before:
            xs = self.after_norm(xs)
        return xs, masks, new_cache


class ConformerEncoder(BaseEncoder):
    """Conformer encoder module.
    Parameters
    ----------
    idim : int
        Input dimension.
    attention_dim : int
        Dimension of attention.
    attention_heads : int
        The number of heads of multi head attention.
    linear_units : int
        The number of units of position-wise feed forward.
    num_blocks : int
        The number of encoder blocks.
    dropout_rate : float
        Dropout rate.
    positional_dropout_rate : float
        Dropout rate after adding positional encoding.
    attention_dropout_rate : float
        Dropout rate in attention.
    input_layer : Union[str, nn.Layer]
        Input layer type.
    normalize_before : bool
        Whether to use layer_norm before the first block.
    concat_after : bool
        Whether to concat attention layer's input and output.
        if True, additional linear will be applied.
        i.e. x -> x + linear(concat(x, att(x)))
        if False, no additional linear will be applied. i.e. x -> x + att(x)
    positionwise_layer_type : str
        "linear", "conv1d", or "conv1d-linear".
    positionwise_conv_kernel_size : int
        Kernel size of positionwise conv1d layer.
    macaron_style : bool
        Whether to use macaron style for positionwise layer.
    pos_enc_layer_type : str
        Encoder positional encoding layer type.
    selfattention_layer_type : str
        Encoder attention layer type.
    activation_type : str
        Encoder activation function type.
    use_cnn_module : bool
        Whether to use convolution module.
    zero_triu : bool
        Whether to zero the upper triangular part of attention matrix.
    cnn_module_kernel : int
        Kernel size of convolution module.
    padding_idx : int
        Padding idx for input_layer=embed.
    stochastic_depth_rate : float
        Maximum probability to skip the encoder layer.
    intermediate_layers : Union[List[int], None]
        Indices of intermediate CTC layers.
        Indices start from 1.
        If not None, intermediate outputs are returned (which changes the return
        type signature.)
    """

    def __init__(
            self,
            idim: int,
            attention_dim: int=256,
            attention_heads: int=4,
            linear_units: int=2048,
            num_blocks: int=6,
            dropout_rate: float=0.1,
            positional_dropout_rate: float=0.1,
            attention_dropout_rate: float=0.0,
            input_layer: str="conv2d",
            normalize_before: bool=True,
            concat_after: bool=False,
            positionwise_layer_type: str="linear",
            positionwise_conv_kernel_size: int=1,
            macaron_style: bool=False,
            pos_enc_layer_type: str="rel_pos",
            selfattention_layer_type: str="rel_selfattn",
            activation_type: str="swish",
            use_cnn_module: bool=False,
            zero_triu: bool=False,
            cnn_module_kernel: int=31,
            padding_idx: int=-1,
            stochastic_depth_rate: float=0.0,
            intermediate_layers: Union[List[int], None]=None, ):
        """Construct a Conformer Encoder object."""
        super().__init__(
            idim=idim,
            attention_dim=attention_dim,
            attention_heads=attention_heads,
            linear_units=linear_units,
            num_blocks=num_blocks,
            dropout_rate=dropout_rate,
            positional_dropout_rate=positional_dropout_rate,
            attention_dropout_rate=attention_dropout_rate,
            input_layer=input_layer,
            normalize_before=normalize_before,
            concat_after=concat_after,
            positionwise_layer_type=positionwise_layer_type,
            positionwise_conv_kernel_size=positionwise_conv_kernel_size,
            macaron_style=macaron_style,
            pos_enc_layer_type=pos_enc_layer_type,
            selfattention_layer_type=selfattention_layer_type,
            activation_type=activation_type,
            use_cnn_module=use_cnn_module,
            zero_triu=zero_triu,
            cnn_module_kernel=cnn_module_kernel,
            padding_idx=padding_idx,
            stochastic_depth_rate=stochastic_depth_rate,
            intermediate_layers=intermediate_layers,
            encoder_type="conformer")

    def forward(self, xs, masks):
        """Encode input sequence.
        Parameters
        ----------
        xs : paddle.Tensor
            Input tensor (#batch, time, idim).
        masks : paddle.Tensor
            Mask tensor (#batch, 1, time).
        Returns
        ----------
        paddle.Tensor
            Output tensor (#batch, time, attention_dim).
        paddle.Tensor
            Mask tensor (#batch, 1, time).
        """
        if isinstance(self.embed, (Conv2dSubsampling)):
            xs, masks = self.embed(xs, masks)
        else:
            xs = self.embed(xs)

        if self.intermediate_layers is None:
            xs, masks = self.encoders(xs, masks)
        else:
            intermediate_outputs = []
            for layer_idx, encoder_layer in enumerate(self.encoders):
                xs, masks = encoder_layer(xs, masks)

                if (self.intermediate_layers is not None and
                        layer_idx + 1 in self.intermediate_layers):
                    # intermediate branches also require normalization.
                    encoder_output = xs
                    if isinstance(encoder_output, tuple):
                        encoder_output = encoder_output[0]
                    if self.normalize_before:
                        encoder_output = self.after_norm(encoder_output)
                    intermediate_outputs.append(encoder_output)

        if isinstance(xs, tuple):
            xs = xs[0]

        if self.normalize_before:
            xs = self.after_norm(xs)

        if self.intermediate_layers is not None:
            return xs, masks, intermediate_outputs
        return xs, masks
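A minimal usage sketch for the encoders above (illustrative only, not part of this commit; the hyperparameters are made up, and `input_layer="linear"` is chosen so the time resolution is preserved, whereas the default "conv2d" front end subsamples time by 4):

# Hedged usage sketch: run a small conformer encoder over padded 80-dim
# feature sequences with a boolean non-padding mask.
import paddle

encoder = ConformerEncoder(
    idim=80,
    attention_dim=256,
    attention_heads=4,
    num_blocks=4,
    input_layer="linear",
    macaron_style=True,
    use_cnn_module=True,
    cnn_module_kernel=7)
xs = paddle.randn([2, 100, 80])                  # (#batch, time, idim)
masks = paddle.ones([2, 1, 100], dtype="bool")   # (#batch, 1, time), all frames valid
hs, hs_masks = encoder(xs, masks)                # hs: (2, 100, 256)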
@ -0,0 +1,83 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Modified from espnet(https://github.com/espnet/espnet)
"""Subsampling layer definition."""
import paddle
from paddle import nn

from paddlespeech.t2s.modules.transformer.embedding import PositionalEncoding


class Conv2dSubsampling(nn.Layer):
    """Convolutional 2D subsampling (to 1/4 length).
    Parameters
    ----------
    idim : int
        Input dimension.
    odim : int
        Output dimension.
    dropout_rate : float
        Dropout rate.
    pos_enc : nn.Layer
        Custom position encoding layer.
    """

    def __init__(self, idim, odim, dropout_rate, pos_enc=None):
        """Construct a Conv2dSubsampling object."""
        super().__init__()
        self.conv = nn.Sequential(
            nn.Conv2D(1, odim, 3, 2),
            nn.ReLU(),
            nn.Conv2D(odim, odim, 3, 2),
            nn.ReLU(), )
        self.out = nn.Sequential(
            nn.Linear(odim * (((idim - 1) // 2 - 1) // 2), odim),
            pos_enc if pos_enc is not None else
            PositionalEncoding(odim, dropout_rate), )

    def forward(self, x, x_mask):
        """Subsample x.
        Parameters
        ----------
        x : paddle.Tensor
            Input tensor (#batch, time, idim).
        x_mask : paddle.Tensor
            Input mask (#batch, 1, time).
        Returns
        ----------
        paddle.Tensor
            Subsampled tensor (#batch, time', odim),
            where time' = time // 4.
        paddle.Tensor
            Subsampled mask (#batch, 1, time'),
            where time' = time // 4.
        """
        # (b, c, t, f)
        x = x.unsqueeze(1)
        x = self.conv(x)
        b, c, t, f = paddle.shape(x)
        x = self.out(x.transpose([0, 2, 1, 3]).reshape([b, t, c * f]))
        if x_mask is None:
            return x, None
        return x, x_mask[:, :, :-2:2][:, :, :-2:2]

    def __getitem__(self, key):
        """Get item.
        When reset_parameters() is called, if use_scaled_pos_enc is used,
        return the positional encoding.
        """
        if key != -1:
            raise NotImplementedError(
                "Support only `-1` (for `reset_parameters`).")
        return self.out[key]
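A minimal usage sketch for the subsampling front end above (illustrative only, not part of this commit; shapes are made up):

# Hedged usage sketch: the conv front end reduces the time axis roughly by 4
# and trims the mask with the same striding.
import paddle

sub = Conv2dSubsampling(idim=80, odim=256, dropout_rate=0.1)
x = paddle.randn([2, 100, 80])                   # (#batch, time, idim)
x_mask = paddle.ones([2, 1, 100], dtype="bool")  # (#batch, 1, time)
y, y_mask = sub(x, x_mask)                       # y: (2, 24, 256), y_mask: (2, 1, 24)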