PaddleSpeech/paddlespeech/t2s/models/starganv2_vc/AuxiliaryASR/layers.py

# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.nn.functional as F
import paddleaudio.functional as audio_F
from paddle import nn
from paddlespeech.utils.initialize import _calculate_gain
from paddlespeech.utils.initialize import xavier_uniform_


def _get_activation_fn(activ):
    if activ == 'relu':
        return nn.ReLU()
    elif activ == 'lrelu':
        return nn.LeakyReLU(0.2)
    elif activ == 'swish':
        return nn.Swish()
    else:
        raise RuntimeError(
            'Unexpected activ type %s, expected [relu, lrelu, swish]' % activ)


class LinearNorm(nn.Layer):
    def __init__(self,
                 in_dim: int,
                 out_dim: int,
                 bias: bool=True,
                 w_init_gain: str='linear'):
        super().__init__()
        self.linear_layer = nn.Linear(in_dim, out_dim, bias_attr=bias)
        xavier_uniform_(
            self.linear_layer.weight, gain=_calculate_gain(w_init_gain))

    def forward(self, x: paddle.Tensor):
        out = self.linear_layer(x)
        return out


class ConvNorm(nn.Layer):
    def __init__(self,
                 in_channels: int,
                 out_channels: int,
                 kernel_size: int=1,
                 stride: int=1,
                 padding: int=None,
                 dilation: int=1,
                 bias: bool=True,
                 w_init_gain: str='linear',
                 param=None):
        super().__init__()
        if padding is None:
            assert (kernel_size % 2 == 1)
            padding = int(dilation * (kernel_size - 1) / 2)

        self.conv = nn.Conv1D(
            in_channels,
            out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            dilation=dilation,
            bias_attr=bias)

        xavier_uniform_(
            self.conv.weight, gain=_calculate_gain(w_init_gain, param=param))

    def forward(self, signal: paddle.Tensor):
        conv_signal = self.conv(signal)
        return conv_signal
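
# ConvNorm padding note (a minimal sketch with assumed, illustrative values):
# when `padding` is None the layer requires an odd kernel size and uses
# padding = dilation * (kernel_size - 1) / 2, which preserves the time
# dimension for stride 1, e.g.
#     conv = ConvNorm(in_channels=80, out_channels=80, kernel_size=5, dilation=2)
#     y = conv(paddle.randn([4, 80, 100]))  # -> shape [4, 80, 100]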


class ConvBlock(nn.Layer):
    def __init__(self,
                 hidden_dim: int,
                 n_conv: int=3,
                 dropout_p: float=0.2,
                 activ: str='relu'):
        super().__init__()
        self._n_groups = 8
        self.blocks = nn.LayerList([
            self._get_conv(
                hidden_dim=hidden_dim,
                dilation=3**i,
                activ=activ,
                dropout_p=dropout_p) for i in range(n_conv)
        ])

    def forward(self, x: paddle.Tensor):
        for block in self.blocks:
            res = x
            x = block(x)
            x += res
        return x

    def _get_conv(self,
                  hidden_dim: int,
                  dilation: int,
                  activ: str='relu',
                  dropout_p: float=0.2):
        layers = [
            ConvNorm(
                in_channels=hidden_dim,
                out_channels=hidden_dim,
                kernel_size=3,
                padding=dilation,
                dilation=dilation), _get_activation_fn(activ),
            nn.GroupNorm(num_groups=self._n_groups, num_channels=hidden_dim),
            nn.Dropout(p=dropout_p), ConvNorm(
                hidden_dim, hidden_dim, kernel_size=3, padding=1,
                dilation=1), _get_activation_fn(activ), nn.Dropout(p=dropout_p)
        ]
        return nn.Sequential(*layers)
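
# ConvBlock usage (a sketch with assumed shapes): each dilated sub-block maps
# (B, hidden_dim, T) back to the same shape, so the residual additions in
# forward() are well defined; hidden_dim must be divisible by the 8 GroupNorm
# groups, e.g.
#     block = ConvBlock(hidden_dim=256, n_conv=3, dropout_p=0.2, activ='relu')
#     y = block(paddle.randn([4, 256, 100]))  # -> shape [4, 256, 100]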


class LocationLayer(nn.Layer):
    def __init__(self,
                 attention_n_filters: int,
                 attention_kernel_size: int,
                 attention_dim: int):
        super().__init__()
        padding = int((attention_kernel_size - 1) / 2)
        self.location_conv = ConvNorm(
            in_channels=2,
            out_channels=attention_n_filters,
            kernel_size=attention_kernel_size,
            padding=padding,
            bias=False,
            stride=1,
            dilation=1)
        self.location_dense = LinearNorm(
            in_dim=attention_n_filters,
            out_dim=attention_dim,
            bias=False,
            w_init_gain='tanh')

    def forward(self, attention_weights_cat: paddle.Tensor):
        processed_attention = self.location_conv(attention_weights_cat)
        processed_attention = processed_attention.transpose([0, 2, 1])
        processed_attention = self.location_dense(processed_attention)
        return processed_attention


class Attention(nn.Layer):
    def __init__(self,
                 attention_rnn_dim: int,
                 embedding_dim: int,
                 attention_dim: int,
                 attention_location_n_filters: int,
                 attention_location_kernel_size: int):
        super().__init__()
        self.query_layer = LinearNorm(
            in_dim=attention_rnn_dim,
            out_dim=attention_dim,
            bias=False,
            w_init_gain='tanh')
        self.memory_layer = LinearNorm(
            in_dim=embedding_dim,
            out_dim=attention_dim,
            bias=False,
            w_init_gain='tanh')
        self.v = LinearNorm(in_dim=attention_dim, out_dim=1, bias=False)
        self.location_layer = LocationLayer(
            attention_n_filters=attention_location_n_filters,
            attention_kernel_size=attention_location_kernel_size,
            attention_dim=attention_dim)
        self.score_mask_value = -float("inf")

    def get_alignment_energies(self,
                               query: paddle.Tensor,
                               processed_memory: paddle.Tensor,
                               attention_weights_cat: paddle.Tensor):
        """
        Args:
            query:
                decoder output (B, n_mel_channels * n_frames_per_step)
            processed_memory:
                processed encoder outputs (B, T_in, attention_dim)
            attention_weights_cat:
                cumulative and prev. att weights (B, 2, max_time)
        Returns:
            Tensor:
                alignment (B, max_time)
        """
        processed_query = self.query_layer(query.unsqueeze(1))
        processed_attention_weights = self.location_layer(attention_weights_cat)
        energies = self.v(
            paddle.tanh(processed_query + processed_attention_weights +
                        processed_memory))
        energies = energies.squeeze(-1)
        return energies

    def forward(self,
                attention_hidden_state: paddle.Tensor,
                memory: paddle.Tensor,
                processed_memory: paddle.Tensor,
                attention_weights_cat: paddle.Tensor,
                mask: paddle.Tensor):
        """
        Args:
            attention_hidden_state:
                attention rnn last output
            memory:
                encoder outputs
            processed_memory:
                processed encoder outputs
            attention_weights_cat:
                previous and cumulative attention weights
            mask:
                binary mask for padded data
        """
        alignment = self.get_alignment_energies(
            query=attention_hidden_state,
            processed_memory=processed_memory,
            attention_weights_cat=attention_weights_cat)

        if mask is not None:
            alignment.data.masked_fill_(mask, self.score_mask_value)

        attention_weights = F.softmax(alignment, axis=1)
        attention_context = paddle.bmm(attention_weights.unsqueeze(1), memory)
        attention_context = attention_context.squeeze(1)
        return attention_context, attention_weights
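
# Attention call sketch (assumed dimensions, purely illustrative):
#     attn = Attention(attention_rnn_dim=1024, embedding_dim=512,
#                      attention_dim=128, attention_location_n_filters=32,
#                      attention_location_kernel_size=31)
#     query = paddle.randn([B, 1024])           # decoder RNN state
#     memory = paddle.randn([B, T_in, 512])     # encoder outputs
#     processed = attn.memory_layer(memory)     # (B, T_in, 128), computed once
#     weights_cat = paddle.zeros([B, 2, T_in])  # prev. + cumulative weights
#     context, weights = attn(query, memory, processed, weights_cat, mask=None)
#     # context: (B, 512), weights: (B, T_in)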


class MFCC(nn.Layer):
    def __init__(self, n_mfcc: int=40, n_mels: int=80):
        super().__init__()
        self.n_mfcc = n_mfcc
        self.n_mels = n_mels
        self.norm = 'ortho'
        dct_mat = audio_F.create_dct(self.n_mfcc, self.n_mels, self.norm)
        self.register_buffer('dct_mat', dct_mat)

    def forward(self, mel_specgram: paddle.Tensor):
        # pack a 2-D (n_mels, time) input into a batch of one
        if len(mel_specgram.shape) == 2:
            mel_specgram = mel_specgram.unsqueeze(0)
            unsqueezed = True
        else:
            unsqueezed = False
        # (channel, n_mels, time).transpose(...) dot (n_mels, n_mfcc)
        # -> (channel, time, n_mfcc).transpose(...)
        mfcc = paddle.matmul(mel_specgram.transpose([0, 2, 1]),
                             self.dct_mat).transpose([0, 2, 1])
        # unpack batch
        if unsqueezed:
            mfcc = mfcc.squeeze(0)
        return mfcc
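
# MFCC usage sketch (assumed shapes): the DCT matrix is applied along the mel
# axis, so a (B, n_mels, T) log-mel spectrogram becomes (B, n_mfcc, T); a 2-D
# input (n_mels, T) comes back as (n_mfcc, T), e.g.
#     mfcc_fn = MFCC(n_mfcc=40, n_mels=80)
#     feats = mfcc_fn(paddle.randn([4, 80, 100]))  # -> shape [4, 40, 100]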