|
|
|
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
|
|
|
|
import math
|
|
|
|
import time
|
|
|
|
from typing import List
|
|
|
|
from typing import Tuple
|
|
|
|
from typing import Union
|
|
|
|
|
|
|
|
import numpy as np
|
|
|
|
import paddle
|
|
|
|
from paddle import nn
|
|
|
|
from paddle.nn import functional as F
|
|
|
|
from paddle.nn import initializer as I
|
|
|
|
|
|
|
|
from paddlespeech.t2s.modules import geometry as geo
|
|
|
|
from paddlespeech.t2s.utils import checkpoint
|
|
|
|
|
|
|
|
# Names exported by a star-import of this module.
__all__ = ["WaveFlow", "ConditionalWaveFlow", "WaveFlowLoss"]
|
|
|
|
|
|
|
|
|
|
|
|
def fold(x, n_group):
    """Split the last (temporal) axis of a tensor into fixed-size groups.

    Args:
        x (Tensor): Audio or spectrogram. shape=(..., time_steps)
        n_group (int): Number of time steps in one group.

    Returns:
        Tensor: The folded tensor.
            shape=(..., time_steps // n_group, n_group)
    """
    # Leading dimensions are taken from ``x.shape``; the temporal length is
    # read through ``paddle.shape`` so it is available as a tensor value.
    leading_dims = list(x.shape[:-1])
    n_steps = paddle.shape(x)[-1]
    return paddle.reshape(x, leading_dims + [n_steps // n_group, n_group])
|
|
|
|
|
|
|
|
|
|
|
|
class UpsampleNet(nn.LayerList):
    """Upsample a mel spectrogram to the temporal resolution of the waveform.

    The network is a stack of weight-normalized ``Conv2DTranspose`` layers
    (one input channel, one output channel) that deconvolve over the mel and
    time dimensions. Each layer is followed by a leaky ReLU.

    Args:
        upsample_factors (List[int]): Time upsampling factor of each
            ``Conv2DTranspose`` layer. One layer is created per entry; the
            entry is used as the layer's stride along time and twice the
            entry as its kernel width.

    Notes:
        ``np.prod(upsample_factors)`` should equal the ``hop_length`` of the
        stft transformation used to extract spectrogram features from audio.

        For example, ``16 * 16 = 256``, so a spectrogram extracted with an
        stft whose ``hop_length`` equals 256 is suitable for
        ``upsample_factors=[16, 16]``.

    See Also:
        ``librosa.core.stft``
    """

    def __init__(self, upsample_factors):
        super().__init__()
        for stride_w in upsample_factors:
            # Uniform init bound derived from the kernel area (3 x 2*stride).
            bound = math.sqrt(1 / (3 * 2 * stride_w))
            init = I.Uniform(-bound, bound)
            deconv = nn.Conv2DTranspose(
                1,
                1, (3, 2 * stride_w),
                padding=(1, stride_w // 2),
                stride=(1, stride_w),
                weight_attr=init,
                bias_attr=init)
            self.append(nn.utils.weight_norm(deconv))

        # Overall and per-layer upsampling factors.
        self.upsample_factor = np.prod(upsample_factors)
        self.upsample_factors = upsample_factors

    def forward(self, x, trim_conv_artifact=False):
        """Upsample a batch of spectrograms along time.

        Args:
            x (Tensor): The input spectrogram.
                shape=(batch_size, input_channels, time_steps)
            trim_conv_artifact (bool, optional): If True, drop the trailing
                ``kernel_width - stride`` frames after every layer.
                Defaults to False.

        Returns:
            Tensor: The upsampled spectrogram.
                shape=(batch_size, input_channels, time_steps * upsample_factor)

        Notes:
            If ``trim_conv_artifact`` is True, the output has fewer than
            ``time_steps * upsample_factor`` time steps.
        """
        # (B, C, T) -> (B, 1, C, T): the mel axis becomes the image height.
        out = paddle.unsqueeze(x, 1)
        for deconv in self:
            out = deconv(out)
            if trim_conv_artifact:
                n_extra = deconv._kernel_size[1] - deconv._stride[1]
                out = out[:, :, :, :-n_extra]
            out = F.leaky_relu(out, 0.4)
        # Back to (B, C, T).
        return paddle.squeeze(out, 1)
|
|
|
|
|
|
|
|
|
|
|
|
class ResidualBlock(nn.Layer):
    """ResidualBlock, the basic unit of ResidualNet used in WaveFlow.

    It has a weight-normalized conv2d layer with causal padding in the height
    dimension and same padding in the width dimension, a 1x1 projection of
    the condition that is added to the convolution output, a gated
    ``tanh * sigmoid`` activation, and a 1x1 output projection that yields
    the residual and the skip outputs.

    Besides the whole-sequence ``forward``, the block supports row-by-row
    computation through ``start_sequence`` and ``add_input``, which keep a
    buffer of the most recent input rows.

    Args:
        channels (int): Feature size of the input.
        cond_channels (int): Feature size of the condition.
        kernel_size (Tuple[int]): Kernel size (height, width) of the
            convolution applied to the input.
        dilations (Tuple[int]): Dilations (height, width) of the convolution
            applied to the input; paired element-wise with ``kernel_size``.
    """

    def __init__(self, channels, cond_channels, kernel_size, dilations):
        super().__init__()
        # input conv
        # NOTE(review): by operator precedence this evaluates
        # (1 / channels) * prod(kernel_size); it may have been meant as
        # 1 / (channels * prod(kernel_size)). Left unchanged because it
        # affects weight initialization - confirm before altering.
        std = math.sqrt(1 / channels * np.prod(kernel_size))
        init = I.Uniform(-std, std)
        # Receptive field of the dilated kernel along height and width.
        receptive_field = [
            1 + (k - 1) * d for (k, d) in zip(kernel_size, dilations)
        ]
        rh, rw = receptive_field
        # Height: rh - 1 on one side and 0 on the other (causal).
        # Width: split across both sides (same).
        # Assumes paddle's 4-element padding order is
        # [top, bottom, left, right] - TODO confirm.
        paddings = [rh - 1, 0, rw // 2, (rw - 1) // 2]  # causal & same
        conv = nn.Conv2D(
            channels,
            2 * channels,
            kernel_size,
            padding=paddings,
            dilation=dilations,
            weight_attr=init,
            bias_attr=init)
        self.conv = nn.utils.weight_norm(conv)
        # Kept for the incremental path (buffer height and width padding).
        self.rh = rh
        self.rw = rw
        self.dilations = dilations

        # condition projection
        std = math.sqrt(1 / cond_channels)
        init = I.Uniform(-std, std)
        condition_proj = nn.Conv2D(
            cond_channels,
            2 * channels, (1, 1),
            weight_attr=init,
            bias_attr=init)
        self.condition_proj = nn.utils.weight_norm(condition_proj)

        # parametric residual & skip connection
        std = math.sqrt(1 / channels)
        init = I.Uniform(-std, std)
        out_proj = nn.Conv2D(
            channels, 2 * channels, (1, 1), weight_attr=init, bias_attr=init)
        self.out_proj = nn.utils.weight_norm(out_proj)

    def forward(self, x, condition):
        """Compute output for a whole folded sequence.

        Args:
            x (Tensor): The input.
                shape=(batch_size, channel, height, width)
            condition (Tensor): The local condition.
                shape=(batch_size, condition_channel, height, width)

        Returns:
            res (Tensor): The residual output (input plus projected
                residual). shape=(batch_size, channel, height, width)
            skip (Tensor): The skip output.
                shape=(batch_size, channel, height, width)
        """
        x_in = x
        x = self.conv(x)
        x += self.condition_proj(condition)

        # Gated activation: first half of the channels is the content,
        # second half is the gate.
        content, gate = paddle.chunk(x, 2, axis=1)
        x = paddle.tanh(content) * F.sigmoid(gate)

        # One projection produces both the residual and the skip branch.
        x = self.out_proj(x)
        res, skip = paddle.chunk(x, 2, axis=1)
        res = x_in + res
        return res, skip

    def start_sequence(self):
        """Prepare the layer for incremental computation of causal
        convolution. Reset the buffer for causal convolution.

        Raises:
            ValueError: If not in evaluation mode.
        """
        if self.training:
            raise ValueError("Only use start sequence at evaluation mode.")
        # 1-D placeholder; ``add_input`` detects it by its rank and replaces
        # it with a correctly shaped buffer on the first row.
        self._conv_buffer = paddle.zeros([1])

        # NOTE: call self.conv's weight norm hook explicitly since
        # its weight will be visited directly in `add_input` without
        # calling its `__call__` method. If we do not trigger the weight
        # norm hook, the weight may be outdated. e.g. after loading from
        # a saved checkpoint
        # see also: https://github.com/pytorch/pytorch/issues/47588
        for hook in self.conv._forward_pre_hooks.values():
            hook(self.conv, None)

    def add_input(self, x_row, condition_row):
        """Compute the output for a row and update the buffer.

        ``start_sequence`` must have been called first so that the buffer
        attribute exists.

        Args:
            x_row (Tensor): A row of the input.
                shape=(batch_size, channel, 1, width)
            condition_row (Tensor): A row of the condition.
                shape=(batch_size, condition_channel, 1, width)

        Returns:
            res (Tensor): A row of the residual output.
                shape=(batch_size, channel, 1, width)
            skip (Tensor): A row of the skip output.
                shape=(batch_size, channel, 1, width)
        """
        x_row_in = x_row
        # A rank-1 buffer is the placeholder set by ``start_sequence``.
        if len(paddle.shape(self._conv_buffer)) == 1:
            self._init_buffer(x_row)
        self._update_buffer(x_row)
        rw = self.rw
        # The buffer already holds the rows the kernel needs along height,
        # so no height padding is applied here; width keeps "same" padding.
        x_row = F.conv2d(
            self._conv_buffer,
            self.conv.weight,
            self.conv.bias,
            padding=[0, 0, rw // 2, (rw - 1) // 2],
            dilation=self.dilations)
        x_row += self.condition_proj(condition_row)
        content, gate = paddle.chunk(x_row, 2, axis=1)
        x_row = paddle.tanh(content) * F.sigmoid(gate)

        x_row = self.out_proj(x_row)
        res, skip = paddle.chunk(x_row, 2, axis=1)
        res = x_row_in + res
        return res, skip

    def _init_buffer(self, input):
        # Allocate a zero buffer of ``rh`` rows matching the input's batch
        # size, channels, width and dtype.
        batch_size, channels, _, width = input.shape
        self._conv_buffer = paddle.zeros(
            [batch_size, channels, self.rh, width], dtype=input.dtype)

    def _update_buffer(self, input):
        # Drop the oldest row and append the new one along the height axis.
        self._conv_buffer = paddle.concat(
            [self._conv_buffer[:, :, 1:, :], input], axis=2)
|
|
|
|
|
|
|
|
|
|
|
|
class ResidualNet(nn.LayerList):
    """A stack of ResidualBlocks that merges the condition at every layer.

    The i-th block uses dilation ``(dilations_h[i], 2**i)``, i.e. the height
    dilation is configurable while the width dilation doubles per layer.

    Args:
        n_layer (int): Number of ResidualBlocks in the ResidualNet.
        residual_channels (int): Feature size of each ResidualBlock.
        condition_channels (int): Feature size of the condition.
        kernel_size (Tuple[int]): Kernel size of each ResidualBlock.
        dilations_h (List[int]): Dilation in the height dimension of every
            ResidualBlock.

    Raises:
        ValueError: If the length of ``dilations_h`` does not equal
            ``n_layer``.
    """

    def __init__(self,
                 n_layer: int,
                 residual_channels: int,
                 condition_channels: int,
                 kernel_size: Tuple[int],
                 dilations_h: List[int]):
        if len(dilations_h) != n_layer:
            raise ValueError(
                "number of dilations_h should equals num of layers")
        super().__init__()
        for depth, dilation_h in enumerate(dilations_h):
            block = ResidualBlock(residual_channels, condition_channels,
                                  kernel_size, (dilation_h, 2**depth))
            self.append(block)

    def forward(self, x, condition):
        """Compute the output given the input and the condition.

        Args:
            x (Tensor): The input.
                shape=(batch_size, channel, height, width)
            condition (Tensor): The local condition.
                shape=(batch_size, condition_channel, height, width)

        Returns:
            Tensor: The sum of the skip outputs of all blocks.
                shape=(batch_size, channel, height, width)
        """
        hidden = x
        skips = []
        for block in self:
            # The residual output feeds the next block; the skip is kept.
            hidden, skip = block(hidden, condition)
            skips.append(skip)
        return paddle.sum(paddle.stack(skips, 0), 0)

    def start_sequence(self):
        """Prepare every block for incremental computation."""
        for block in self:
            block.start_sequence()

    def add_input(self, x_row, condition_row):
        """Compute the output for a row and update the blocks' buffers.

        Args:
            x_row (Tensor): A row of the input.
                shape=(batch_size, channel, 1, width)
            condition_row (Tensor): A row of the condition.
                shape=(batch_size, condition_channel, 1, width)

        Returns:
            Tensor: The sum of the skip outputs of all blocks for this row.
                shape=(batch_size, channel, 1, width)
        """
        hidden_row = x_row
        skips = []
        for block in self:
            hidden_row, skip = block.add_input(hidden_row, condition_row)
            skips.append(skip)
        return paddle.sum(paddle.stack(skips, 0), 0)
|
|
|
|
|
|
|
|
|
|
|
|
class Flow(nn.Layer):
    """A bijection (reversible layer) that transforms a density of latent
    variables p(Z) into a complex data distribution p(X).

    It is an autoregressive flow along the height axis. The ``forward``
    method implements probability density estimation (x to z). The
    ``inverse`` method implements sampling (z to x), one row at a time.

    Args:
        n_layers (int): Number of ResidualBlocks in the Flow. It must match
            the length of the ``dilations_dict`` entry for ``n_group``
            (8 for every entry defined here), otherwise ``ResidualNet``
            raises ``ValueError``.
        channels (int): Feature size of the ResidualBlocks.
        mel_bands (int): Feature size of the mel spectrogram (mel bands).
        kernel_size (Tuple[int]): Kernel size of each ResidualBlock in the
            Flow.
        n_group (int): Number of timesteps to be folded into a group. Must
            be a key of ``dilations_dict``.
    """
    # Height dilations of the residual blocks, keyed by ``n_group``.
    dilations_dict = {
        8: [1, 1, 1, 1, 1, 1, 1, 1],
        16: [1, 1, 1, 1, 1, 1, 1, 1],
        32: [1, 2, 4, 1, 2, 4, 1, 2],
        64: [1, 2, 4, 8, 16, 1, 2, 4],
        128: [1, 2, 4, 8, 16, 32, 64, 1]
    }

    def __init__(self, n_layers, channels, mel_bands, kernel_size, n_group):
        super().__init__()
        # input projection: 1 channel -> ``channels`` via a 1x1 convolution
        self.input_proj = nn.utils.weight_norm(
            nn.Conv2D(
                1,
                channels, (1, 1),
                weight_attr=I.Uniform(-1., 1.),
                bias_attr=I.Uniform(-1., 1.)))

        # residual net; an unsupported ``n_group`` raises KeyError here
        self.resnet = ResidualNet(n_layers, channels, mel_bands, kernel_size,
                                  self.dilations_dict[n_group])

        # output projection: zero-initialized, so the predicted log-scale
        # and shift are both 0 before any training
        self.output_proj = nn.Conv2D(
            channels,
            2, (1, 1),
            weight_attr=I.Constant(0.),
            bias_attr=I.Constant(0.))

        # specs
        self.n_group = n_group

    def _predict_parameters(self, x, condition):
        # Predict the log-scale and shift of the affine transform for a
        # whole block of rows at once.
        x = self.input_proj(x)
        x = self.resnet(x, condition)
        bijection_params = self.output_proj(x)
        logs, b = paddle.chunk(bijection_params, 2, axis=1)
        return logs, b

    def _transform(self, x, logs, b):
        # Affine transform of rows 1.. using the predicted parameters.
        z_0 = x[:, :, :1, :]  # the first row, just copy it
        z_out = x[:, :, 1:, :] * paddle.exp(logs) + b
        z_out = paddle.concat([z_0, z_out], axis=2)
        return z_out

    def forward(self, x, condition):
        """Probability density estimation. It is done by inversely
        transforming a sample from p(X) into a sample from p(Z).

        Args:
            x (Tensor): An input sample of the distribution p(X).
                shape=(batch, 1, height, width)
            condition (Tensor): The local condition.
                shape=(batch, condition_channel, height, width)

        Returns:
            z (Tensor): The transformed sample.
                shape=(batch, 1, height, width)
            Tuple[Tensor, Tensor]: The parameters of the transformation.
                logs (Tensor): The log scale of the transformation from x
                    to z. shape=(batch, 1, height - 1, width)
                b (Tensor): The shift of the transformation from x to z.
                    shape=(batch, 1, height - 1, width)
        """
        # (B, C, H-1, W)
        # Row i of the output is parameterized from input rows up to i - 1
        # together with the condition at row i.
        logs, b = self._predict_parameters(x[:, :, :-1, :],
                                           condition[:, :, 1:, :])
        z = self._transform(x, logs, b)
        return z, (logs, b)

    def _predict_row_parameters(self, x_row, condition_row):
        # Incremental counterpart of ``_predict_parameters`` for one row.
        x_row = self.input_proj(x_row)
        x_row = self.resnet.add_input(x_row, condition_row)
        bijection_params = self.output_proj(x_row)
        logs, b = paddle.chunk(bijection_params, 2, axis=1)
        return logs, b

    def _inverse_transform_row(self, z_row, logs, b):
        # Invert z = x * exp(logs) + b for a single row.
        x_row = (z_row - b) * paddle.exp(-logs)
        return x_row

    def _inverse_row(self, z_row, x_row, condition_row):
        # Predict the parameters from the previous x row, then invert the
        # affine transform on the current z row.
        logs, b = self._predict_row_parameters(x_row, condition_row)
        x_next_row = self._inverse_transform_row(z_row, logs, b)
        return x_next_row, (logs, b)

    def _start_sequence(self):
        # Reset the residual blocks' buffers before row-by-row generation.
        self.resnet.start_sequence()

    def inverse(self, z, condition):
        """Sampling from the distribution p(X). It is done by sampling from
        p(Z) and transforming the sample. It is an autoregressive
        transformation along the height axis.

        Args:
            z (Tensor): A sample of the distribution p(Z).
                shape=(batch, 1, height, width)
            condition (Tensor): The local condition.
                shape=(batch, condition_channel, height, width)

        Returns:
            Tensor: The transformed sample. shape=(batch, 1, height, width)
        """
        # The first row is copied unchanged, mirroring ``_transform``.
        z_0 = z[:, :, :1, :]
        x = paddle.zeros_like(z)
        x[:, :, :1, :] = z_0

        self._start_sequence()

        # NOTE(review): the loop bound is built as an int32 tensor rather
        # than a plain Python int; presumably this is needed for
        # dynamic-to-static conversion - confirm before simplifying.
        num_step = paddle.ones([1], dtype='int32') * (self.n_group)
        for i in range(1, num_step):
            x_row = x[:, :, i - 1:i, :]
            z_row = z[:, :, i:i + 1, :]
            condition_row = condition[:, :, i:i + 1, :]
            x_next_row, (logs, b) = self._inverse_row(z_row, x_row,
                                                      condition_row)
            x[:, :, i:i + 1, :] = x_next_row

        return x
|
|
|
|
|
|
|
|
|
|
|
|
class WaveFlow(nn.LayerList):
    """A deep reversible layer composed of several autoregressive flows.

    After every flow the rows (height axis) of the folded signal and of the
    condition are permuted with a fixed, per-flow permutation.

    Args:
        n_flows (int): Number of flows in the WaveFlow model. Must be even.
        n_layers (int): Number of ResidualBlocks in each Flow.
        n_group (int): Number of timesteps to fold as a group. Must be even.
        channels (int): Feature size of each ResidualBlock.
        mel_bands (int): Feature size of mel spectrogram (mel bands).
        kernel_size (Union[int, List[int]]): Kernel size of the convolution
            layer in each ResidualBlock.

    Raises:
        ValueError: If ``n_group`` or ``n_flows`` is odd.
    """

    def __init__(self, n_flows, n_layers, n_group, channels, mel_bands,
                 kernel_size):
        if n_group % 2 or n_flows % 2:
            raise ValueError(
                "number of flows and number of group must be even "
                "since a permutation along group among flows is used.")
        super().__init__()
        for _ in range(n_flows):
            flow = Flow(n_layers, channels, mel_bands, kernel_size, n_group)
            self.append(flow)

        # One height permutation per flow.
        self.perms = self._create_perm(n_group, n_flows)

        # Model specs.
        self.n_group = n_group
        self.n_flows = n_flows

    def _create_perm(self, n_group, n_flows):
        """Build the per-flow row permutations and register them as buffers.

        The first half of the flows reverses all rows; the second half
        reverses each half of the rows separately.
        """
        order = list(range(n_group))
        mid = n_group // 2
        reverse_all = order[::-1]
        reverse_halves = order[:mid][::-1] + order[mid:][::-1]

        perms = []
        for flow_idx in range(n_flows):
            if flow_idx < n_flows // 2:
                pattern = reverse_all
            else:
                pattern = reverse_halves
            perm = paddle.to_tensor(pattern)
            # NOTE(review): the buffer is registered under the tensor's own
            # name (presumably auto-generated) - confirm this is intended.
            self.register_buffer(perm.name, perm)
            perms.append(perm)
        return perms

    def _trim(self, x, condition):
        """Cut ``x`` and ``condition`` to a length divisible by ``n_group``.

        ``x`` is sliced as a 2-D tensor and ``condition`` as a 3-D tensor;
        the common length is derived from the last axis of ``x``.
        """
        assert condition.shape[-1] >= x.shape[-1]
        usable_len = int(paddle.shape(x)[-1] // self.n_group * self.n_group)

        if x.shape[-1] > usable_len:
            x = x[:, :usable_len]
        if condition.shape[-1] > usable_len:
            condition = condition[:, :, :usable_len]
        return x, condition

    def forward(self, x, condition):
        """Probability density estimation of random variable x given the
        condition.

        Args:
            x (Tensor): The audio. shape=(batch_size, time_steps)
            condition (Tensor): The local condition (mel spectrogram here).
                shape=(batch_size, condition channel, time_steps)

        Returns:
            Tensor: The transformed random variable.
                shape=(batch_size, time_steps)
            Tensor: The log determinant of the jacobian of the
                transformation from x to z, i.e. the sum of the log scales
                of all flows. shape=(1,)
        """
        # x: (B, T); condition: (B, C, T), already upsampled.
        x, condition = self._trim(x, condition)

        # Fold time into groups and move the group axis to the height
        # position: x -> (B, 1, h, T//h), condition -> (B, C, h, T//h).
        folded_x = fold(x, self.n_group)
        x = paddle.unsqueeze(paddle.transpose(folded_x, [0, 2, 1]), 1)
        folded_condition = fold(condition, self.n_group)
        condition = paddle.transpose(folded_condition, [0, 1, 3, 2])

        # Apply the flows, permuting the rows after each one.
        collected_logs = []
        for idx, flow in enumerate(self):
            x, (logs, _shift) = flow(x, condition)
            collected_logs.append(logs)
            perm = self.perms[idx]
            x = geo.shuffle_dim(x, 2, perm=perm)
            condition = geo.shuffle_dim(condition, 2, perm=perm)

        # (B, 1, H, W) -> (B, H, W) -> (B, W, H) -> (B, T)
        z = paddle.squeeze(x, 1)
        n_batch = z.shape[0]
        z = paddle.reshape(paddle.transpose(z, [0, 2, 1]), [n_batch, -1])

        log_det_jacobian = paddle.sum(paddle.stack(collected_logs))
        return z, log_det_jacobian

    def inverse(self, z, condition):
        """Sampling from the distribution p(X).

        It is done by sampling a ``z`` from p(Z) and transforming it into
        ``x``. The flows are undone in reverse order, each one in an
        autoregressive manner.

        Args:
            z (Tensor): A sample of the distribution p(Z).
                shape=(batch, time_steps)
            condition (Tensor): The local condition.
                shape=(batch, condition_channel, time_steps)

        Returns:
            Tensor: The transformed sample (audio here).
                shape=(batch_size, time_steps)
        """
        z, condition = self._trim(z, condition)

        # Same (B, C, h, T//h) layout as in ``forward``.
        folded_z = fold(z, self.n_group)
        z = paddle.unsqueeze(paddle.transpose(folded_z, [0, 2, 1]), 1)
        folded_condition = fold(condition, self.n_group)
        condition = paddle.transpose(folded_condition, [0, 1, 3, 2])

        # Undo the flows from last to first: permute the rows first, then
        # invert the flow.
        for idx in range(self.n_flows - 1, -1, -1):
            perm = self.perms[idx]
            z = geo.shuffle_dim(z, 2, perm=perm)
            condition = geo.shuffle_dim(condition, 2, perm=perm)
            z = self[idx].inverse(z, condition)

        # (B, 1, H, W) -> (B, H, W) -> (B, W, H) -> (B, T)
        x = paddle.squeeze(z, 1)
        n_batch = x.shape[0]
        return paddle.reshape(paddle.transpose(x, [0, 2, 1]), [n_batch, -1])
|
|
|
|
|
|
|
|
|
|
|
|
class ConditionalWaveFlow(nn.LayerList):
    """ConditionalWaveFlow, an UpsampleNet followed by a WaveFlow model.

    The ``encoder`` upsamples the mel spectrogram to the audio resolution;
    the ``decoder`` is the WaveFlow conditioned on the upsampled result.

    Args:
        upsample_factors (List[int]): Upsample factors for the upsample net.
        n_flows (int): Number of flows in the WaveFlow model.
        n_layers (int): Number of ResidualBlocks in each Flow.
        n_group (int): Number of timesteps to fold as a group.
        channels (int): Feature size of each ResidualBlock.
        n_mels (int): Feature size of mel spectrogram (mel bands).
        kernel_size (Union[int, List[int]]): Kernel size of the convolution
            layer in each ResidualBlock.
    """

    def __init__(self,
                 upsample_factors: List[int],
                 n_flows: int,
                 n_layers: int,
                 n_group: int,
                 channels: int,
                 n_mels: int,
                 kernel_size: Union[int, List[int]]):
        super().__init__()
        self.encoder = UpsampleNet(upsample_factors)
        # Positional order: n_flows, n_layers, n_group, channels, mel_bands,
        # kernel_size.
        self.decoder = WaveFlow(n_flows, n_layers, n_group, channels, n_mels,
                                kernel_size)

    def forward(self, audio, mel):
        """Compute the transformed random variable z (x to z) and the log of
        the determinant of the jacobian of the transformation from x to z.

        Args:
            audio (Tensor): The audio. shape=(B, T)
            mel (Tensor): The mel spectrogram. shape=(B, C_mel, T_mel)

        Returns:
            Tensor: The inversely transformed random variable z (x to z).
                shape=(B, T)
            Tensor: The log of the determinant of the jacobian of the
                transformation from x to z. shape=(1,)
        """
        upsampled_mel = self.encoder(mel)
        return self.decoder(audio, upsampled_mel)

    @paddle.no_grad()
    def infer(self, mel):
        """Generate raw audio for a batch of mel spectrograms.

        The elapsed wall-clock time is printed to stdout.

        Args:
            mel (Tensor): Batched mel spectrogram (in log-magnitude).
                shape=(B, C_mel, T_mel)

        Returns:
            Tensor: The synthesized audio, where
                ``T <= T_mel * upsample_factors``. shape=(B, T)
        """
        tic = time.time()
        # Trim the deconvolution artifact while upsampling: (B, C, T).
        condition = self.encoder(mel, trim_conv_artifact=True)
        batch_size, _, time_steps = condition.shape
        noise = paddle.randn([batch_size, time_steps], dtype=mel.dtype)
        audio = self.decoder.inverse(noise, condition)
        toc = time.time()
        print("time: {}s".format(toc - tic))
        return audio

    @paddle.no_grad()
    def predict(self, mel):
        """Generate raw audio for a single mel spectrogram.

        Args:
            mel (np.ndarray): Mel spectrogram of an utterance (in
                log-magnitude). shape=(C_mel, T_mel)

        Returns:
            np.ndarray: The synthesized audio. shape=(T,)
        """
        # Add the batch axis, synthesize, then take the only item back out.
        batched_mel = paddle.unsqueeze(paddle.to_tensor(mel), 0)
        batched_audio = self.infer(batched_mel)
        return batched_audio[0].numpy()

    @classmethod
    def from_pretrained(cls, config, checkpoint_path):
        """Build a ConditionalWaveFlow model from a pretrained model.

        Args:
            config (yacs.config.CfgNode): Model configs; the fields read are
                ``model.upsample_factors``, ``model.n_flows``,
                ``model.n_layers``, ``model.n_group``, ``model.channels``,
                ``model.kernel_size`` and ``data.n_mels``.
            checkpoint_path (Path or str): The path of the pretrained model
                checkpoint, without extension name.

        Returns:
            ConditionalWaveFlow: The model built from the pretrained result.
        """
        model_cfg = config.model
        model = cls(
            upsample_factors=model_cfg.upsample_factors,
            n_flows=model_cfg.n_flows,
            n_layers=model_cfg.n_layers,
            n_group=model_cfg.n_group,
            channels=model_cfg.channels,
            n_mels=config.data.n_mels,
            kernel_size=model_cfg.kernel_size)
        checkpoint.load_parameters(model, checkpoint_path=checkpoint_path)
        return model
|
|
|
|
|
|
|
|
|
|
|
|
class WaveFlowLoss(nn.Layer):
    """Criterion of a WaveFlow model.

    The loss is ``(sum(z^2) / (2 * sigma^2) - log_det_jacobian)`` divided by
    the number of elements of ``z``, plus the constant
    ``0.5 * log(2 * pi) + log(sigma)``.

    Args:
        sigma (float): The standard deviation of the gaussian noise used in
            WaveFlow, by default 1.0.
    """

    def __init__(self, sigma=1.0):
        super().__init__()
        self.sigma = sigma
        # Per-element constant term, computed once.
        self.const = 0.5 * np.log(2 * np.pi) + np.log(self.sigma)

    def forward(self, z, log_det_jacobian):
        """Compute the loss given the transformed random variable z and the
        log_det_jacobian of the transformation from x to z.

        Args:
            z (Tensor): The transformed random variable (x to z).
                shape=(B, T)
            log_det_jacobian (Tensor): The log of the determinant of the
                jacobian matrix of the transformation from x to z.
                shape=(1,)

        Returns:
            Tensor: The loss. shape=(1,)
        """
        quadratic = paddle.sum(z * z) / (2 * self.sigma * self.sigma)
        n_elements = np.prod(z.shape)
        per_element = (quadratic - log_det_jacobian) / n_elements
        return per_element + self.const
|
|
|
|
|
|
|
|
|
|
|
|
class ConditionalWaveFlow2Infer(ConditionalWaveFlow):
    """ConditionalWaveFlow whose ``forward`` runs synthesis via ``predict``."""

    def forward(self, mel):
        """Generate raw audio given a mel spectrogram.

        Args:
            mel (np.ndarray): Mel spectrogram of an utterance (in
                log-magnitude). shape=(C_mel, T_mel)

        Returns:
            np.ndarray: The synthesized audio. shape=(T,)
        """
        return self.predict(mel)
|