@@ -19,31 +19,36 @@ This work is licensed under the Creative Commons Attribution-NonCommercial
http://creativecommons.org/licenses/by-nc/4.0/ or send a letter to
Creative Commons, PO Box 1866, Mountain View, CA 94042, USA.
"""
# import copy
import math

import paddle
import paddle.nn.functional as F
from paddle import nn

from paddlespeech.utils.initialize import _calculate_gain
from paddlespeech.utils.initialize import xavier_uniform_

# from munch import Munch


class DownSample(nn.Layer):
    def __init__(self, layer_type: str):
        super().__init__()
        self.layer_type = layer_type

    def forward(self, x: paddle.Tensor):
        """Calculate forward propagation.
        Args:
            x(Tensor(float32)): Shape (B, dim_in, n_mels, T).
        Returns:
            Tensor:
                layer_type == 'none': Shape (B, dim_in, n_mels, T)
                layer_type == 'timepreserve': Shape (B, dim_in, n_mels // 2, T)
                layer_type == 'half': Shape (B, dim_in, n_mels // 2, T // 2)
        """
        if self.layer_type == 'none':
            return x
        elif self.layer_type == 'timepreserve':
            out = F.avg_pool2d(x, (2, 1))
            return out
        elif self.layer_type == 'half':
            out = F.avg_pool2d(x, 2)
            return out
        else:
            raise RuntimeError(
                'Got unexpected downsampletype %s, expected one of [none, timepreserve, half]'
                % self.layer_type)
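
# Quick shape sketch (illustrative only; an 80-bin mel input with batch 2 and
# 64 channels is assumed):
#   DownSample('half')(paddle.randn([2, 64, 80, 100]))          # -> [2, 64, 40, 50]
#   DownSample('timepreserve')(paddle.randn([2, 64, 80, 100]))  # -> [2, 64, 40, 100]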
@@ -55,13 +60,24 @@ class UpSample(nn.Layer):
        super().__init__()
        self.layer_type = layer_type

    def forward(self, x: paddle.Tensor):
        """Calculate forward propagation.
        Args:
            x(Tensor(float32)): Shape (B, dim_in, n_mels, T).
        Returns:
            Tensor:
                layer_type == 'none': Shape (B, dim_in, n_mels, T)
                layer_type == 'timepreserve': Shape (B, dim_in, n_mels * 2, T)
                layer_type == 'half': Shape (B, dim_in, n_mels * 2, T * 2)
        """
        if self.layer_type == 'none':
            return x
        elif self.layer_type == 'timepreserve':
            out = F.interpolate(x, scale_factor=(2, 1), mode='nearest')
            return out
        elif self.layer_type == 'half':
            out = F.interpolate(x, scale_factor=2, mode='nearest')
            return out
        else:
            raise RuntimeError(
                'Got unexpected upsampletype %s, expected one of [none, timepreserve, half]'
                % self.layer_type)
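
# Note: 'nearest' upsampling by 2 mirrors the average pooling in DownSample,
# so a matched DownSample/UpSample pair restores the original (n_mels, T) grid.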
@@ -127,9 +143,19 @@ class ResBlk(nn.Layer):
        return x

    def forward(self, x: paddle.Tensor):
        """Calculate forward propagation.
        Args:
            x(Tensor(float32)): Shape (B, dim_in, n_mels, T).
        Returns:
            Tensor:
                downsample == 'none': Shape (B, dim_in, n_mels, T).
                downsample == 'timepreserve': Shape (B, dim_out, n_mels // 2, T).
                downsample == 'half': Shape (B, dim_out, n_mels // 2, T // 2).
        """
        x = self._shortcut(x) + self._residual(x)
        # unit variance: the sum of two roughly unit-variance branches has
        # variance 2, so rescale by 1 / sqrt(2)
        out = x / math.sqrt(2)
        return out
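
# ResBlk is the style-free building block used on the encoder side of the
# generator; AdainResBlk below is its style-conditioned counterpart used in
# the decoder.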


class AdaIN(nn.Layer):
@@ -140,12 +166,21 @@ class AdaIN(nn.Layer):
        self.fc = nn.Linear(style_dim, num_features * 2)

    def forward(self, x: paddle.Tensor, s: paddle.Tensor):
        """Calculate forward propagation.
        Args:
            x(Tensor(float32)): Shape (B, num_features, n_mels, T).
            s(Tensor(float32)): Shape (style_dim, ).
        Returns:
            Tensor:
                Shape (B, num_features, n_mels, T).
        """
        if len(s.shape) == 1:
            s = s[None]
        h = self.fc(s)
        h = h.reshape((h.shape[0], h.shape[1], 1, 1))
        gamma, beta = paddle.split(h, 2, axis=1)
        out = (1 + gamma) * self.norm(x) + beta
        return out
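
# AdaIN in one line: instance-normalize x per channel, then let the style
# vector s choose a new per-channel scale (1 + gamma) and shift (beta).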


class AdainResBlk(nn.Layer):
@@ -162,6 +197,7 @@ class AdainResBlk(nn.Layer):
        self.upsample = UpSample(layer_type=upsample)
        self.learned_sc = dim_in != dim_out
        self._build_weights(dim_in, dim_out, style_dim)
        self.layer_type = upsample

    def _build_weights(self, dim_in: int, dim_out: int, style_dim: int=64):
        self.conv1 = nn.Conv2D(
@@ -204,6 +240,18 @@ class AdainResBlk(nn.Layer):
        return x

    def forward(self, x: paddle.Tensor, s: paddle.Tensor):
        """Calculate forward propagation.
        Args:
            x(Tensor(float32)):
                Shape (B, dim_in, n_mels, T).
            s(Tensor(float32)):
                Shape (64,).
        Returns:
            Tensor:
                upsample == 'none': Shape (B, dim_out, n_mels, T).
                upsample == 'timepreserve': Shape (B, dim_out, n_mels * 2, T).
                upsample == 'half': Shape (B, dim_out, n_mels * 2, T * 2).
        """
        out = self._residual(x, s)
        if self.w_hpf == 0:
            out = (out + self._shortcut(x)) / math.sqrt(2)
        return out
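
# When w_hpf != 0 the shortcut sum is skipped here; the high-frequency detail
# is re-injected later through the HighPass path in Generator.forward.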
@@ -219,7 +267,8 @@ class HighPass(nn.Layer):
    def forward(self, x: paddle.Tensor):
        # depthwise high-pass filtering: tile one kernel per input channel and
        # apply a grouped convolution
        filter = self.filter.unsqueeze(0).unsqueeze(1).tile(
            [x.shape[1], 1, 1, 1])
        out = F.conv2d(x, filter, padding=1, groups=x.shape[1])
        return out


class Generator(nn.Layer):
@@ -276,12 +325,10 @@ class Generator(nn.Layer):
                    w_hpf=w_hpf,
                    upsample=_downtype))  # stack-like
            dim_in = dim_out

        # bottleneck blocks (encoder)
        for _ in range(2):
            self.encode.append(
                ResBlk(dim_in=dim_out, dim_out=dim_out, normalize=True))

        # F0 blocks
        if F0_channel != 0:
            self.decode.insert(0,
@@ -290,7 +337,6 @@ class Generator(nn.Layer):
                    dim_out=dim_out,
                    style_dim=style_dim,
                    w_hpf=w_hpf))

        # bottleneck blocks (decoder)
        for _ in range(2):
            self.decode.insert(0,
@@ -299,7 +345,6 @@ class Generator(nn.Layer):
                    dim_out=dim_out + int(F0_channel / 2),
                    style_dim=style_dim,
                    w_hpf=w_hpf))

        if F0_channel != 0:
            self.F0_conv = nn.Sequential(
                ResBlk(
@@ -307,7 +352,6 @@ class Generator(nn.Layer):
                    dim_out=int(F0_channel / 2),
                    normalize=True,
                    downsample="half"), )

        if w_hpf > 0:
            self.hpf = HighPass(w_hpf)
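
    # Encoder blocks are appended while decoder blocks are inserted at index 0,
    # so the decoder mirrors the encoder resolution-for-resolution ("stack-like").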
@@ -316,26 +360,44 @@
                s: paddle.Tensor,
                masks: paddle.Tensor=None,
                F0: paddle.Tensor=None):
        """Calculate forward propagation.
        Args:
            x(Tensor(float32)):
                Shape (B, 1, n_mels, T).
            s(Tensor(float32)):
                Shape (64,).
            masks:
                None.
            F0:
                Shape (B, num_features(256), n_mels // 8, T).
        Returns:
            Tensor:
                output of generator. Shape (B, 1, n_mels, T // 4 * 4)
        """
        x = self.stem(x)
        cache = {}
        # output: (B, max_conv_dim, n_mels // 16, T // 4)
        for block in self.encode:
            if (masks is not None) and (x.shape[2] in [32, 64, 128]):
                cache[x.shape[2]] = x
            x = block(x)

        if F0 is not None:
            # input: (B, num_features(256), n_mels // 8, T)
            # output: (B, num_features(256) // 2, n_mels // 16, T // 2)
            F0 = self.F0_conv(F0)
            # output: (B, num_features(256) // 2, n_mels // 16, T // 4)
            F0 = F.adaptive_avg_pool2d(F0, [x.shape[-2], x.shape[-1]])
            x = paddle.concat([x, F0], axis=1)

        # input: (B, max_conv_dim + num_features(256) // 2, n_mels // 16, T // 4)
        # output: (B, dim_in, n_mels, T // 4 * 4)
        for block in self.decode:
            x = block(x, s)
            if (masks is not None) and (x.shape[2] in [32, 64, 128]):
                mask = masks[0] if x.shape[2] in [32] else masks[1]
                mask = F.interpolate(mask, size=x.shape[2], mode='bilinear')
                x = x + self.hpf(mask * cache[x.shape[2]])

        out = self.to_out(x)
        return out
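
    # End-to-end shape sketch (illustrative only, not from the original file;
    # assumes the default dims used elsewhere in this file):
    #   g = Generator(dim_in=64, style_dim=64, max_conv_dim=512, w_hpf=0,
    #                 F0_channel=256)
    #   mel = paddle.randn([1, 1, 80, 192])
    #   s = paddle.randn([64])
    #   f0 = paddle.randn([1, 256, 10, 192])  # placeholder F0 feature map
    #   y = g(mel, s, F0=f0)                  # -> [1, 1, 80, 192]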


class MappingNetwork(nn.Layer):
@@ -366,14 +428,25 @@ class MappingNetwork(nn.Layer):
        ])

    def forward(self, z: paddle.Tensor, y: paddle.Tensor):
        """Calculate forward propagation.
        Args:
            z(Tensor(float32)):
                latent code. Shape (B, latent_dim).
            y(Tensor(float32)):
                speaker label. Shape (B, ).
        Returns:
            Tensor:
                Shape (B, style_dim).
        """
        h = self.shared(z)
        out = []
        for layer in self.unshared:
            out += [layer(h)]
        # (B, num_domains, style_dim)
        out = paddle.stack(out, axis=1)
        idx = paddle.arange(y.shape[0])
        # (B, style_dim)
        s = out[idx, y]
        return s
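
# MappingNetwork samples a style vector from a random latent code, while
# StyleEncoder (below) extracts one from reference speech; both return one
# style vector per speaker label in y.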
@@ -419,15 +492,25 @@ class StyleEncoder(nn.Layer):
        self.unshared.append(nn.Linear(dim_out, style_dim))

    def forward(self, x: paddle.Tensor, y: paddle.Tensor):
        """Calculate forward propagation.
        Args:
            x(Tensor(float32)):
                Shape (B, 1, n_mels, T).
            y(Tensor(float32)):
                speaker label. Shape (B, ).
        Returns:
            Tensor:
                Shape (B, style_dim).
        """
        h = self.shared(x)
        h = h.reshape((h.shape[0], -1))
        out = []
        for layer in self.unshared:
            out += [layer(h)]
        # (B, num_domains, style_dim)
        out = paddle.stack(out, axis=1)
        idx = paddle.arange(y.shape[0])
        # (B, style_dim)
        s = out[idx, y]
        return s
@@ -454,25 +537,12 @@ class Discriminator(nn.Layer):
        self.num_domains = num_domains

    def forward(self, x: paddle.Tensor, y: paddle.Tensor):
        out = self.dis(x, y)
        return out

    def classifier(self, x: paddle.Tensor):
        out = self.cls.get_feature(x)
        return out


class LinearNorm(nn.Layer):
    def __init__(self,
                 in_dim: int,
                 out_dim: int,
                 bias: bool=True,
                 w_init_gain: str='linear'):
        super().__init__()
        self.linear_layer = nn.Linear(in_dim, out_dim, bias_attr=bias)
        xavier_uniform_(
            self.linear_layer.weight, gain=_calculate_gain(w_init_gain))

    def forward(self, x: paddle.Tensor):
        out = self.linear_layer(x)
        return out
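
# LinearNorm is a plain Linear layer whose weight is Xavier-uniform
# initialized with the gain of the nonlinearity named by w_init_gain, so the
# activation variance is roughly preserved at initialization.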


class Discriminator2D(nn.Layer):
@@ -520,97 +590,13 @@ class Discriminator2D(nn.Layer):
    def get_feature(self, x: paddle.Tensor):
        out = self.main(x)
        # (B, num_domains)
        out = out.reshape((out.shape[0], -1))
        return out

    def forward(self, x: paddle.Tensor, y: paddle.Tensor):
        out = self.get_feature(x)
        idx = paddle.arange(y.shape[0])
        # (B, )
        out = out[idx, y]
        return out


'''
def build_model(args, F0_model: nn.Layer, ASR_model: nn.Layer):
    generator = Generator(
        dim_in=args.dim_in,
        style_dim=args.style_dim,
        max_conv_dim=args.max_conv_dim,
        w_hpf=args.w_hpf,
        F0_channel=args.F0_channel)
    mapping_network = MappingNetwork(
        latent_dim=args.latent_dim,
        style_dim=args.style_dim,
        num_domains=args.num_domains,
        hidden_dim=args.max_conv_dim)
    style_encoder = StyleEncoder(
        dim_in=args.dim_in,
        style_dim=args.style_dim,
        num_domains=args.num_domains,
        max_conv_dim=args.max_conv_dim)
    discriminator = Discriminator(
        dim_in=args.dim_in,
        num_domains=args.num_domains,
        max_conv_dim=args.max_conv_dim,
        n_repeat=args.n_repeat)
    generator_ema = copy.deepcopy(generator)
    mapping_network_ema = copy.deepcopy(mapping_network)
    style_encoder_ema = copy.deepcopy(style_encoder)

    nets = Munch(
        generator=generator,
        mapping_network=mapping_network,
        style_encoder=style_encoder,
        discriminator=discriminator,
        f0_model=F0_model,
        asr_model=ASR_model)

    nets_ema = Munch(
        generator=generator_ema,
        mapping_network=mapping_network_ema,
        style_encoder=style_encoder_ema)

    return nets, nets_ema
'''
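
# build_model is kept only as reference: it depends on copy and Munch, whose
# imports are commented out above; StarGANv2VC below is the live entry point.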


class StarGANv2VC(nn.Layer):
    def __init__(
            self,
            # spk_num
            num_domains: int=20,
            dim_in: int=64,
            style_dim: int=64,
            latent_dim: int=16,
            max_conv_dim: int=512,
            n_repeat: int=4,
            w_hpf: int=0,
            F0_channel: int=256):
        super().__init__()

        self.generator = Generator(
            dim_in=dim_in,
            style_dim=style_dim,
            max_conv_dim=max_conv_dim,
            w_hpf=w_hpf,
            F0_channel=F0_channel)
        # MappingNetwork and StyleEncoder are used to generate reference_embeddings
        self.mapping_network = MappingNetwork(
            latent_dim=latent_dim,
            style_dim=style_dim,
            num_domains=num_domains,
            hidden_dim=max_conv_dim)

        self.style_encoder = StyleEncoder(
            dim_in=dim_in,
            style_dim=style_dim,
            num_domains=num_domains,
            max_conv_dim=max_conv_dim)

        self.discriminator = Discriminator(
            dim_in=dim_in,
            num_domains=num_domains,
            max_conv_dim=max_conv_dim,
            repeat_num=n_repeat)
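
# Minimal usage sketch (illustrative only; ref_mel/src_mel are (B, 1, n_mels, T)
# mel tensors, spk_id is a (B,) int label, f0 matches the Generator docstring):
#   model = StarGANv2VC(num_domains=20)
#   s = model.style_encoder(ref_mel, spk_id)  # style from reference speech
#   out = model.generator(src_mel, s, F0=f0)  # converted mel spectrogram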