You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
PaddleSpeech/paddlespeech/t2s/modules/diffnet.py

245 lines
9.0 KiB

# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import paddle
import paddle.nn.functional as F
from paddle import nn
from paddlespeech.utils.initialize import _calculate_fan_in_and_fan_out
from paddlespeech.utils.initialize import kaiming_normal_
from paddlespeech.utils.initialize import kaiming_uniform_
from paddlespeech.utils.initialize import uniform_
from paddlespeech.utils.initialize import zeros_
def Conv1D(*args, **kwargs):
    """Create an ``nn.Conv1D`` layer and re-initialize its parameters.

    All positional and keyword arguments are forwarded unchanged to
    ``nn.Conv1D``. The weight is re-initialized with ``kaiming_normal_``.
    When the layer has a bias and the weight's fan-in is non-zero, the bias
    is drawn uniformly from ``[-1/sqrt(fan_in), 1/sqrt(fan_in)]`` (the
    torch-compatible bias initialization).

    Returns:
        The initialized ``nn.Conv1D`` layer.
    """
    conv = nn.Conv1D(*args, **kwargs)
    # Weight init first, so it matches the reference implementation.
    kaiming_normal_(conv.weight)

    # No bias parameter: nothing else to initialize.
    if conv.bias is None:
        return conv

    # Bias init mirrors torch: uniform in +/- 1/sqrt(fan_in).
    fan_in, _ = _calculate_fan_in_and_fan_out(conv.weight)
    if fan_in == 0:
        # A zero fan-in would divide by zero; keep the bias as created.
        return conv
    limit = 1 / math.sqrt(fan_in)
    uniform_(conv.bias, -limit, limit)
    return conv
# Initialization is consistent with torch
def Linear(*args, **kwargs):
    """Create an ``nn.Linear`` layer and re-initialize its parameters.

    All positional and keyword arguments are forwarded unchanged to
    ``nn.Linear``. The weight is re-initialized with ``kaiming_uniform_``
    using ``a=sqrt(5)``. When the layer has a bias, it is drawn uniformly
    from ``[-bound, bound]`` where ``bound = 1/sqrt(fan_in)``, or ``0`` when
    the fan-in is not positive.

    Returns:
        The initialized ``nn.Linear`` layer.
    """
    fc = nn.Linear(*args, **kwargs)
    kaiming_uniform_(fc.weight, a=math.sqrt(5))

    # No bias parameter: only the weight needed re-initializing.
    if fc.bias is None:
        return fc

    fan_in, _ = _calculate_fan_in_and_fan_out(fc.weight)
    if fan_in > 0:
        limit = 1 / math.sqrt(fan_in)
    else:
        # Degenerate fan-in: fall back to a zero-width range.
        limit = 0
    uniform_(fc.bias, -limit, limit)
    return fc
class ResidualBlock(nn.Layer):
    """Gated, dilated 1D-convolution residual block conditioned on a
    diffusion-step embedding and an auxiliary feature sequence.

    Note:
        ``forward`` adds the block input to one half of the output
        projection, so ``gate_channels`` must equal ``2 * residual_channels``.
        ``kernel_size`` is expected to be odd so that the dilated convolution
        keeps the time length unchanged.

    Args:
        encoder_hidden (int, optional):
            Channel size of the conditioning input ``cond``, by default 256
        residual_channels (int, optional):
            Feature size of the residual output(and also the input), by default 256
        gate_channels (int, optional):
            Output feature size of the 1D convolution, by default 512
        kernel_size (int, optional):
            Kernel size of the 1D convolution, by default 3
        dilation (int, optional):
            Dilation of the 1D convolution, by default 4
    """

    def __init__(self,
                 encoder_hidden: int=256,
                 residual_channels: int=256,
                 gate_channels: int=512,
                 kernel_size: int=3,
                 dilation: int=4):
        super().__init__()
        # Length-preserving padding for an odd kernel. For kernel_size == 3
        # this equals `dilation`, i.e. the value that used to be hard-coded;
        # other odd kernel sizes now also keep T unchanged so the result can
        # be added to the projected conditioner.
        padding = (kernel_size - 1) // 2 * dilation
        self.dilated_conv = Conv1D(
            residual_channels,
            gate_channels,
            kernel_size,
            padding=padding,
            dilation=dilation)
        self.diffusion_projection = Linear(residual_channels, residual_channels)
        self.conditioner_projection = Conv1D(encoder_hidden, gate_channels, 1)
        self.output_projection = Conv1D(residual_channels, gate_channels, 1)

    def forward(
            self,
            x: paddle.Tensor,
            diffusion_step: paddle.Tensor,
            cond: paddle.Tensor, ):
        """Calculate forward propagation.

        Args:
            x (Tensor(float32)): input feature. (B, residual_channels, T)
            diffusion_step (Tensor(float32)): The embedded diffusion step; it is
                passed through a Linear(residual_channels, residual_channels). (B, residual_channels)
            cond (Tensor(float32)): The auxiliary input (e.g. fastspeech2 encoder output). (B, encoder_hidden, T)
        Returns:
            Tuple[Tensor, Tensor]:
                residual output ``(x + residual) / sqrt(2)`` and the skip
                connection, each with ``gate_channels // 2`` channels.
        """
        # (B, residual_channels) -> (B, residual_channels, 1): broadcast over time.
        step = self.diffusion_projection(diffusion_step).unsqueeze(-1)
        cond = self.conditioner_projection(cond)

        y = self.dilated_conv(x + step) + cond

        # Gated activation: split channels in two, sigmoid gate * tanh filter.
        gate, filt = paddle.chunk(y, 2, axis=1)
        y = F.sigmoid(gate) * paddle.tanh(filt)

        y = self.output_projection(y)
        residual, skip = paddle.chunk(y, 2, axis=1)
        return (x + residual) / math.sqrt(2.0), skip
class SinusoidalPosEmb(nn.Layer):
    """Sinusoidal embedding of a batch of scalar positions (e.g. timesteps).

    Args:
        dim (int, optional):
            Target embedding size, by default 256. ``dim // 2`` frequencies
            are used, so the output has ``2 * (dim // 2)`` features.
            ``dim`` values of 2 or 3 raise ZeroDivisionError because the
            frequency spacing divides by ``dim // 2 - 1``.
    """

    def __init__(self, dim: int=256):
        super().__init__()
        self.dim = dim

    def forward(self, x: paddle.Tensor):
        """Embed the input positions.

        Args:
            x (Tensor): positions to embed; cast to float32 and indexed as
                ``x[:, None]``, so a (B,) tensor is expected.
        Returns:
            Tensor(float32): sine features followed by cosine features
                along the last axis. (B, 2 * (dim // 2))
        """
        positions = paddle.cast(x, 'float32')
        n_freqs = self.dim // 2
        # Geometric frequency ladder from 1 down to 1/10000.
        log_step = math.log(10000) / (n_freqs - 1)
        inv_freq = paddle.exp(paddle.arange(n_freqs) * -log_step)
        # Outer product: one row per position, one column per frequency.
        angles = positions[:, None] * inv_freq[None, :]
        return paddle.concat([angles.sin(), angles.cos()], axis=-1)
class DiffNet(nn.Layer):
    """A Mel-Spectrogram Denoiser

    Args:
        in_channels (int, optional):
            Number of channels of the input mel-spectrogram, by default 80
        out_channels (int, optional):
            Number of channels of the output mel-spectrogram, by default 80
        kernel_size (int, optional):
            Kernel size of the residual blocks inside, by default 3
        layers (int, optional):
            Number of residual blocks inside, by default 20
        stacks (int, optional):
            The number of groups to split the residual blocks into, by default 5
            Within each group, the dilation of the residual block grows exponentially.
        residual_channels (int, optional):
            Residual channel of the residual blocks, by default 256
        gate_channels (int, optional):
            Gate channel of the residual blocks, by default 512
        skip_channels (int, optional):
            Output channel of the skip projection, by default 256
        aux_channels (int, optional):
            Auxiliary channel of the residual blocks, by default 256
        dropout (float, optional):
            Dropout of the residual blocks, by default 0.
            Accepted for interface compatibility; not referenced by this constructor.
        bias (bool, optional):
            Whether to use bias in residual blocks, by default True
            Accepted for interface compatibility; not referenced by this constructor.
        use_weight_norm (bool, optional):
            Whether to use weight norm in all convolutions, by default False
            Accepted for interface compatibility; not referenced by this constructor.
        init_type (str, optional):
            Name of the initialization scheme, by default "kaiming_normal"
            Accepted for interface compatibility; not referenced by this constructor.
    """

    def __init__(
            self,
            in_channels: int=80,
            out_channels: int=80,
            kernel_size: int=3,
            layers: int=20,
            stacks: int=5,
            residual_channels: int=256,
            gate_channels: int=512,
            skip_channels: int=256,
            aux_channels: int=256,
            dropout: float=0.,
            bias: bool=True,
            use_weight_norm: bool=False,
            init_type: str="kaiming_normal", ):
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.layers = layers
        self.aux_channels = aux_channels
        self.residual_channels = residual_channels
        self.gate_channels = gate_channels
        self.kernel_size = kernel_size
        # Number of blocks per stack; the dilation restarts at 1 every cycle.
        self.dilation_cycle_length = layers // stacks
        self.skip_channels = skip_channels

        self.input_projection = Conv1D(self.in_channels, self.residual_channels,
                                       1)
        self.diffusion_embedding = SinusoidalPosEmb(self.residual_channels)
        dim = self.residual_channels
        self.mlp = nn.Sequential(
            Linear(dim, dim * 4), nn.Mish(), Linear(dim * 4, dim))
        self.residual_layers = nn.LayerList([
            ResidualBlock(
                encoder_hidden=self.aux_channels,
                residual_channels=self.residual_channels,
                gate_channels=self.gate_channels,
                kernel_size=self.kernel_size,
                dilation=2**(i % self.dilation_cycle_length))
            for i in range(self.layers)
        ])
        self.skip_projection = Conv1D(self.residual_channels,
                                      self.skip_channels, 1)
        # The output projection consumes the skip projection's output, so its
        # input width must be skip_channels (identical to the previous
        # residual_channels under the defaults, where both are 256).
        self.output_projection = Conv1D(self.skip_channels, self.out_channels,
                                        1)
        # Zero the final weight so the initial prediction depends only on the bias.
        zeros_(self.output_projection.weight)

    def forward(
            self,
            spec: paddle.Tensor,
            diffusion_step: paddle.Tensor,
            cond: paddle.Tensor, ):
        """Calculate forward propagation.

        Args:
            spec (Tensor(float32)): The input mel-spectrogram. (B, n_mel, T)
            diffusion_step (Tensor(int64)): The timestep input (adding noise step). (B,)
            cond (Tensor(float32)): The auxiliary input (e.g. fastspeech2 encoder output). (B, aux_channels, T)
        Returns:
            x (Tensor(float32)): pred noise (B, out_channels, T)
        """
        x = self.input_projection(spec)  # x [B, residual_channel, T]
        x = F.relu(x)

        # Timestep -> sinusoidal embedding -> MLP, shared by all residual blocks.
        step_emb = self.diffusion_embedding(diffusion_step)
        step_emb = self.mlp(step_emb)

        skips = []
        for block in self.residual_layers:
            x, skip_connection = block(
                x=x,
                diffusion_step=step_emb,
                cond=cond, )
            skips.append(skip_connection)

        # Sum the skip connections and rescale by sqrt(number of blocks).
        x = paddle.sum(
            paddle.stack(skips), axis=0) / math.sqrt(len(self.residual_layers))
        x = self.skip_projection(x)
        x = F.relu(x)
        x = self.output_projection(x)  # [B, out_channels, T]
        return x