|
|
|
@ -14,28 +14,9 @@
|
|
|
|
|
import paddle
|
|
|
|
|
from paddle import nn
|
|
|
|
|
|
|
|
|
|
from paddlespeech.t2s.modules.nets_utils import initialize
|
|
|
|
|
from paddlespeech.t2s.modules.positional_encoding import sinusoid_position_encoding
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def expand(encodings: paddle.Tensor, durations: paddle.Tensor) -> paddle.Tensor:
|
|
|
|
|
"""
|
|
|
|
|
encodings: (B, T, C)
|
|
|
|
|
durations: (B, T)
|
|
|
|
|
"""
|
|
|
|
|
batch_size, t_enc = paddle.shape(durations)
|
|
|
|
|
slens = paddle.sum(durations, -1)
|
|
|
|
|
t_dec = paddle.max(slens)
|
|
|
|
|
M = paddle.zeros([batch_size, t_dec, t_enc])
|
|
|
|
|
for i in range(batch_size):
|
|
|
|
|
k = 0
|
|
|
|
|
for j in range(t_enc):
|
|
|
|
|
d = durations[i, j]
|
|
|
|
|
# If the d == 0, slice action is meaningless and not supported
|
|
|
|
|
if d >= 1:
|
|
|
|
|
M[0, k:k + d, j] = 1
|
|
|
|
|
k += d
|
|
|
|
|
encodings = paddle.matmul(M, encodings)
|
|
|
|
|
return encodings
|
|
|
|
|
from paddlespeech.t2s.modules.predictor.length_regulator import LengthRegulator
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class ResidualBlock(nn.Layer):
|
|
|
|
@ -175,19 +156,25 @@ class SpeedySpeechDecoder(nn.Layer):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class SpeedySpeech(nn.Layer):
|
|
|
|
|
def __init__(self,
|
|
|
|
|
vocab_size,
|
|
|
|
|
encoder_hidden_size,
|
|
|
|
|
encoder_kernel_size,
|
|
|
|
|
encoder_dilations,
|
|
|
|
|
duration_predictor_hidden_size,
|
|
|
|
|
decoder_hidden_size,
|
|
|
|
|
decoder_output_size,
|
|
|
|
|
decoder_kernel_size,
|
|
|
|
|
decoder_dilations,
|
|
|
|
|
tone_size=None,
|
|
|
|
|
spk_num=None):
|
|
|
|
|
def __init__(
|
|
|
|
|
self,
|
|
|
|
|
vocab_size,
|
|
|
|
|
encoder_hidden_size,
|
|
|
|
|
encoder_kernel_size,
|
|
|
|
|
encoder_dilations,
|
|
|
|
|
duration_predictor_hidden_size,
|
|
|
|
|
decoder_hidden_size,
|
|
|
|
|
decoder_output_size,
|
|
|
|
|
decoder_kernel_size,
|
|
|
|
|
decoder_dilations,
|
|
|
|
|
tone_size=None,
|
|
|
|
|
spk_num=None,
|
|
|
|
|
init_type: str="xavier_uniform", ):
|
|
|
|
|
super().__init__()
|
|
|
|
|
|
|
|
|
|
# initialize parameters
|
|
|
|
|
initialize(self, init_type)
|
|
|
|
|
|
|
|
|
|
encoder = SpeedySpeechEncoder(vocab_size, tone_size,
|
|
|
|
|
encoder_hidden_size, encoder_kernel_size,
|
|
|
|
|
encoder_dilations, spk_num)
|
|
|
|
@ -198,6 +185,10 @@ class SpeedySpeech(nn.Layer):
|
|
|
|
|
self.encoder = encoder
|
|
|
|
|
self.duration_predictor = duration_predictor
|
|
|
|
|
self.decoder = decoder
|
|
|
|
|
# define length regulator
|
|
|
|
|
self.length_regulator = LengthRegulator()
|
|
|
|
|
|
|
|
|
|
nn.initializer.set_global_initializer(None)
|
|
|
|
|
|
|
|
|
|
def forward(self, text, tones, durations, spk_id: paddle.Tensor=None):
|
|
|
|
|
# input of embedding must be int64
|
|
|
|
@ -212,7 +203,7 @@ class SpeedySpeech(nn.Layer):
|
|
|
|
|
|
|
|
|
|
# expand encodings
|
|
|
|
|
durations_to_expand = durations
|
|
|
|
|
encodings = expand(encodings, durations_to_expand)
|
|
|
|
|
encodings = self.length_regulator(encodings, durations_to_expand)
|
|
|
|
|
|
|
|
|
|
# decode
|
|
|
|
|
# remove positional encoding here
|
|
|
|
@ -240,7 +231,8 @@ class SpeedySpeech(nn.Layer):
|
|
|
|
|
durations_to_expand = durations_to_expand.astype(paddle.int64)
|
|
|
|
|
else:
|
|
|
|
|
durations_to_expand = durations
|
|
|
|
|
encodings = expand(encodings, durations_to_expand)
|
|
|
|
|
encodings = self.length_regulator(
|
|
|
|
|
encodings, durations_to_expand, is_inference=True)
|
|
|
|
|
|
|
|
|
|
shape = paddle.shape(encodings)
|
|
|
|
|
t_dec, feature_size = shape[1], shape[2]
|
|
|
|
|