You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
246 lines
9.1 KiB
246 lines
9.1 KiB
2 years ago
|
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
|
||
|
#
|
||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||
|
# you may not use this file except in compliance with the License.
|
||
|
# You may obtain a copy of the License at
|
||
|
#
|
||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||
|
#
|
||
|
# Unless required by applicable law or agreed to in writing, software
|
||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||
|
# See the License for the specific language governing permissions and
|
||
|
# limitations under the License.
|
||
|
import math
|
||
|
|
||
|
import paddle
|
||
|
import paddle.nn.functional as F
|
||
|
from paddle import nn
|
||
|
|
||
|
from paddlespeech.t2s.modules.nets_utils import initialize
|
||
|
from paddlespeech.utils.initialize import _calculate_fan_in_and_fan_out
|
||
|
from paddlespeech.utils.initialize import kaiming_normal_
|
||
|
from paddlespeech.utils.initialize import kaiming_uniform_
|
||
|
from paddlespeech.utils.initialize import uniform_
|
||
|
from paddlespeech.utils.initialize import zeros_
|
||
|
|
||
|
|
||
|
def Conv1D(*args, **kwargs):
    """Build an ``nn.Conv1D`` layer and re-initialize its parameters.

    All positional and keyword arguments are forwarded unchanged to
    ``nn.Conv1D``.

    The weight is re-initialized with ``kaiming_normal_``. When the layer has
    a bias and the computed fan-in is non-zero, the bias is re-drawn with
    ``uniform_`` in ``[-1/sqrt(fan_in), 1/sqrt(fan_in)]`` (the original
    author notes this is meant to be consistent with torch).

    Returns:
        The constructed and re-initialized ``nn.Conv1D`` layer.
    """
    conv = nn.Conv1D(*args, **kwargs)
    # Weight init, kept consistent with the reference implementation.
    kaiming_normal_(conv.weight)

    # No bias: nothing more to initialize.
    if conv.bias is None:
        return conv

    fan_in, _fan_out = _calculate_fan_in_and_fan_out(conv.weight)
    # A zero fan-in would make the bound undefined; keep the default bias.
    if fan_in == 0:
        return conv

    limit = 1 / math.sqrt(fan_in)
    uniform_(conv.bias, -limit, limit)
    return conv
|
||
|
|
||
|
|
||
|
# Initialization is consistent with torch
def Linear(*args, **kwargs):
    """Build an ``nn.Linear`` layer and re-initialize its parameters.

    All positional and keyword arguments are forwarded unchanged to
    ``nn.Linear``.

    The weight is re-initialized with ``kaiming_uniform_(a=sqrt(5))``. When
    the layer has a bias it is re-drawn with ``uniform_`` in
    ``[-bound, bound]``, where ``bound`` is ``1/sqrt(fan_in)`` for a positive
    fan-in and ``0`` otherwise.

    Returns:
        The constructed and re-initialized ``nn.Linear`` layer.
    """
    fc = nn.Linear(*args, **kwargs)
    kaiming_uniform_(fc.weight, a=math.sqrt(5))

    # No bias: nothing more to initialize.
    if fc.bias is None:
        return fc

    # NOTE(review): paddle stores Linear weights as [in_features,
    # out_features]; confirm the fan helper accounts for that layout.
    fan_in, _fan_out = _calculate_fan_in_and_fan_out(fc.weight)
    if fan_in > 0:
        limit = 1 / math.sqrt(fan_in)
    else:
        limit = 0
    uniform_(fc.bias, -limit, limit)
    return fc
|
||
|
|
||
|
|
||
|
class ResidualBlock(nn.Layer):
    """Gated, dilated 1D-convolution residual block.

    The block adds a projected diffusion-step embedding to its input, runs a
    dilated convolution, adds a projected conditioning signal, applies a
    sigmoid/tanh gate and finally splits a 1x1 projection into a residual
    branch and a skip branch.

    Args:
        encoder_hidden (int, optional):
            Channel size of the conditioning input ``cond``, by default 256
        residual_channels (int, optional):
            Feature size of the residual output (and also the input), by default 256
        gate_channels (int, optional):
            Output feature size of the dilated convolution, by default 512.
            The gated activation has ``gate_channels / 2`` channels and is fed
            to a projection expecting ``residual_channels`` inputs, so the
            shapes only line up when ``gate_channels == 2 * residual_channels``.
        kernel_size (int, optional):
            Kernel size of the dilated convolution, by default 3
        dilation (int, optional):
            Dilation of the dilated convolution, by default 4. It is also used
            as the padding, which keeps the time length unchanged for
            ``kernel_size == 3``.
    """

    def __init__(self,
                 encoder_hidden: int=256,
                 residual_channels: int=256,
                 gate_channels: int=512,
                 kernel_size: int=3,
                 dilation: int=4):
        super().__init__()
        # Sub-layers are created in a fixed order so that parameter
        # registration (and random initialization) order stays stable.
        self.dilated_conv = Conv1D(
            residual_channels,
            gate_channels,
            kernel_size,
            padding=dilation,
            dilation=dilation)
        self.diffusion_projection = Linear(residual_channels, residual_channels)
        self.conditioner_projection = Conv1D(encoder_hidden, gate_channels, 1)
        self.output_projection = Conv1D(residual_channels, gate_channels, 1)

    def forward(
            self,
            x: paddle.Tensor,
            diffusion_step: paddle.Tensor,
            cond: paddle.Tensor, ):
        """Calculate forward propagation.

        Args:
            x (Tensor(float32)): input feature. (B, residual_channels, T)
            diffusion_step (Tensor(float32)): embedded diffusion step; its last
                dimension must be residual_channels. (B, residual_channels)
            cond (Tensor(float32)): The auxiliary input (e.g. fastspeech2
                encoder output). (B, encoder_hidden, T)

        Returns:
            Tuple[Tensor, Tensor]:
                the residual output ``(x + residual) / sqrt(2)`` and the skip
                output, each being one half (along the channel axis) of the
                final 1x1 projection.
        """
        # (B, C) -> (B, C, 1) so the step embedding broadcasts over time.
        step = self.diffusion_projection(diffusion_step).unsqueeze(-1)
        cond = self.conditioner_projection(cond)

        y = self.dilated_conv(x + step) + cond

        # Split channels into the two halves of the gated activation unit.
        # `filt` rather than `filter` so the builtin is not shadowed.
        gate, filt = paddle.chunk(y, 2, axis=1)
        y = F.sigmoid(gate) * paddle.tanh(filt)

        y = self.output_projection(y)
        residual, skip = paddle.chunk(y, 2, axis=1)
        return (x + residual) / math.sqrt(2.0), skip
|
||
|
|
||
|
|
||
|
class SinusoidalPosEmb(nn.Layer):
    """Sinusoidal positional embedding.

    Maps each scalar in the input to a vector made of ``dim // 2`` sines
    followed by ``dim // 2`` cosines, using geometrically spaced frequencies.

    Args:
        dim (int, optional):
            Requested embedding size, by default 256. The output width is
            ``2 * (dim // 2)``, i.e. ``dim - 1`` when ``dim`` is odd.
    """

    def __init__(self, dim: int=256):
        super().__init__()
        self.dim = dim

    def forward(self, x: paddle.Tensor):
        """Embed the input positions.

        Args:
            x (Tensor): positions / timesteps; cast to float32 internally.
                Indexed as ``x[:, None]``, so a 1-D tensor (B,) is expected.

        Returns:
            Tensor(float32): embedding of shape (B, 2 * (dim // 2)).
        """
        x = paddle.cast(x, 'float32')
        half_dim = self.dim // 2
        # The denominator is clamped to 1 so that dim in (2, 3), where
        # half_dim - 1 == 0, no longer raises ZeroDivisionError. For every
        # half_dim >= 2 the value is unchanged.
        scale = math.log(10000) / max(half_dim - 1, 1)
        freqs = paddle.exp(paddle.arange(half_dim) * -scale)
        # Outer product: (B, 1) * (1, half_dim) -> (B, half_dim)
        angles = x[:, None] * freqs[None, :]
        return paddle.concat([angles.sin(), angles.cos()], axis=-1)
|
||
|
|
||
|
|
||
|
class DiffNet(nn.Layer):
    """A Mel-Spectrogram Denoiser

    Args:
        in_channels (int, optional):
            Number of channels of the input mel-spectrogram, by default 80
        out_channels (int, optional):
            Number of channels of the output mel-spectrogram, by default 80
        kernel_size (int, optional):
            Kernel size of the residual blocks inside, by default 3
        layers (int, optional):
            Number of residual blocks inside, by default 20
        stacks (int, optional):
            The number of groups to split the residual blocks into, by default 5
            Within each group, the dilation of the residual block grows exponentially.
            ``layers // stacks`` is used as the dilation cycle length and
            must therefore be non-zero.
        residual_channels (int, optional):
            Residual channel of the residual blocks, by default 256
        gate_channels (int, optional):
            Gate channel of the residual blocks, by default 512
        skip_channels (int, optional):
            Output channels of the skip projection, by default 256
        aux_channels (int, optional):
            Auxiliary channel of the residual blocks, by default 256
        dropout (float, optional):
            Accepted for interface compatibility, by default 0.
            Not referenced by the code in this class.
        bias (bool, optional):
            Accepted for interface compatibility, by default True.
            Not referenced by the code in this class.
        use_weight_norm (bool, optional):
            Accepted for interface compatibility, by default False.
            Not referenced by the code in this class.
        init_type (str, optional):
            Accepted for interface compatibility, by default "kaiming_normal".
            Not referenced by the code in this class.
    """

    def __init__(
            self,
            in_channels: int=80,
            out_channels: int=80,
            kernel_size: int=3,
            layers: int=20,
            stacks: int=5,
            residual_channels: int=256,
            gate_channels: int=512,
            skip_channels: int=256,
            aux_channels: int=256,
            dropout: float=0.,
            bias: bool=True,
            use_weight_norm: bool=False,
            init_type: str="kaiming_normal", ):
        super().__init__()

        self.in_channels = in_channels
        self.out_channels = out_channels
        self.layers = layers
        self.aux_channels = aux_channels
        self.residual_channels = residual_channels
        self.gate_channels = gate_channels
        self.kernel_size = kernel_size
        # Dilation restarts from 1 every `dilation_cycle_length` blocks.
        self.dilation_cycle_length = layers // stacks
        self.skip_channels = skip_channels

        self.input_projection = Conv1D(self.in_channels, self.residual_channels,
                                       1)
        self.diffusion_embedding = SinusoidalPosEmb(self.residual_channels)
        dim = self.residual_channels
        self.mlp = nn.Sequential(
            Linear(dim, dim * 4), nn.Mish(), Linear(dim * 4, dim))
        self.residual_layers = nn.LayerList([
            ResidualBlock(
                encoder_hidden=self.aux_channels,
                residual_channels=self.residual_channels,
                gate_channels=self.gate_channels,
                kernel_size=self.kernel_size,
                dilation=2**(i % self.dilation_cycle_length))
            for i in range(self.layers)
        ])
        self.skip_projection = Conv1D(self.residual_channels,
                                      self.skip_channels, 1)
        # The output projection consumes the result of `skip_projection`,
        # which has `skip_channels` channels, so that must be its input size.
        # Using `residual_channels` here only worked when both were equal.
        self.output_projection = Conv1D(self.skip_channels,
                                        self.out_channels, 1)
        # Zero the final projection weights, so the initial output depends
        # only on the projection's bias.
        zeros_(self.output_projection.weight)

    def forward(
            self,
            spec: paddle.Tensor,
            diffusion_step: paddle.Tensor,
            cond: paddle.Tensor, ):
        """Calculate forward propagation.

        Args:
            spec (Tensor(float32)): The input mel-spectrogram. (B, n_mel, T)
            diffusion_step (Tensor(int64)): The timestep input (adding noise step). (B,)
            cond (Tensor(float32)): The auxiliary input (e.g. fastspeech2 encoder output). (B, D_enc_out, T)

        Returns:
            x (Tensor(float32)): pred noise (B, n_mel, T)

        """
        # x: [B, residual_channels, T]
        x = F.relu(self.input_projection(spec))

        # Timestep -> sinusoidal embedding -> MLP: [B, residual_channels]
        diffusion_step = self.mlp(self.diffusion_embedding(diffusion_step))

        skips = []
        for layer in self.residual_layers:
            x, skip_connection = layer(
                x=x,
                diffusion_step=diffusion_step,
                cond=cond, )
            skips.append(skip_connection)

        # Sum all skip outputs, scaled by sqrt of the number of blocks.
        x = paddle.sum(
            paddle.stack(skips), axis=0) / math.sqrt(len(self.residual_layers))
        x = F.relu(self.skip_projection(x))
        x = self.output_projection(x)  # [B, out_channels, T]
        return x
|