|
|
@ -14,7 +14,7 @@
|
|
|
|
import numpy as np
|
|
|
|
import numpy as np
|
|
|
|
import paddle
|
|
|
|
import paddle
|
|
|
|
from paddle import nn
|
|
|
|
from paddle import nn
|
|
|
|
|
|
|
|
import paddle.nn.functional as F
|
|
|
|
from paddlespeech.t2s.modules.positional_encoding import sinusoid_position_encoding
|
|
|
|
from paddlespeech.t2s.modules.positional_encoding import sinusoid_position_encoding
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@ -96,7 +96,7 @@ class TextEmbedding(nn.Layer):
|
|
|
|
|
|
|
|
|
|
|
|
class SpeedySpeechEncoder(nn.Layer):
|
|
|
|
class SpeedySpeechEncoder(nn.Layer):
|
|
|
|
def __init__(self, vocab_size, tone_size, hidden_size, kernel_size,
|
|
|
|
def __init__(self, vocab_size, tone_size, hidden_size, kernel_size,
|
|
|
|
dilations):
|
|
|
|
dilations, spk_num=None):
|
|
|
|
super().__init__()
|
|
|
|
super().__init__()
|
|
|
|
self.embedding = TextEmbedding(
|
|
|
|
self.embedding = TextEmbedding(
|
|
|
|
vocab_size,
|
|
|
|
vocab_size,
|
|
|
@ -104,6 +104,15 @@ class SpeedySpeechEncoder(nn.Layer):
|
|
|
|
tone_size,
|
|
|
|
tone_size,
|
|
|
|
padding_idx=0,
|
|
|
|
padding_idx=0,
|
|
|
|
tone_padding_idx=0)
|
|
|
|
tone_padding_idx=0)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if spk_num:
|
|
|
|
|
|
|
|
self.spk_emb = nn.Embedding(
|
|
|
|
|
|
|
|
num_embeddings=spk_num,
|
|
|
|
|
|
|
|
embedding_dim=hidden_size,
|
|
|
|
|
|
|
|
padding_idx=0)
|
|
|
|
|
|
|
|
else:
|
|
|
|
|
|
|
|
self.spk_emb = None
|
|
|
|
|
|
|
|
|
|
|
|
self.prenet = nn.Sequential(
|
|
|
|
self.prenet = nn.Sequential(
|
|
|
|
nn.Linear(hidden_size, hidden_size),
|
|
|
|
nn.Linear(hidden_size, hidden_size),
|
|
|
|
nn.ReLU(), )
|
|
|
|
nn.ReLU(), )
|
|
|
@ -118,8 +127,10 @@ class SpeedySpeechEncoder(nn.Layer):
|
|
|
|
nn.BatchNorm1D(hidden_size, data_format="NLC"),
|
|
|
|
nn.BatchNorm1D(hidden_size, data_format="NLC"),
|
|
|
|
nn.Linear(hidden_size, hidden_size), )
|
|
|
|
nn.Linear(hidden_size, hidden_size), )
|
|
|
|
|
|
|
|
|
|
|
|
def forward(self, text, tones):
|
|
|
|
def forward(self, text, tones, spk_id=None):
|
|
|
|
embedding = self.embedding(text, tones)
|
|
|
|
embedding = self.embedding(text, tones)
|
|
|
|
|
|
|
|
if self.spk_emb:
|
|
|
|
|
|
|
|
embedding += self.spk_emb(spk_id).unsqueeze(1)
|
|
|
|
embedding = self.prenet(embedding)
|
|
|
|
embedding = self.prenet(embedding)
|
|
|
|
x = self.res_blocks(embedding)
|
|
|
|
x = self.res_blocks(embedding)
|
|
|
|
x = embedding + self.postnet1(x)
|
|
|
|
x = embedding + self.postnet1(x)
|
|
|
@ -171,11 +182,12 @@ class SpeedySpeech(nn.Layer):
|
|
|
|
decoder_output_size,
|
|
|
|
decoder_output_size,
|
|
|
|
decoder_kernel_size,
|
|
|
|
decoder_kernel_size,
|
|
|
|
decoder_dilations,
|
|
|
|
decoder_dilations,
|
|
|
|
tone_size=None, ):
|
|
|
|
tone_size=None,
|
|
|
|
|
|
|
|
spk_num=None):
|
|
|
|
super().__init__()
|
|
|
|
super().__init__()
|
|
|
|
encoder = SpeedySpeechEncoder(vocab_size, tone_size,
|
|
|
|
encoder = SpeedySpeechEncoder(vocab_size, tone_size,
|
|
|
|
encoder_hidden_size, encoder_kernel_size,
|
|
|
|
encoder_hidden_size, encoder_kernel_size,
|
|
|
|
encoder_dilations)
|
|
|
|
encoder_dilations, spk_num)
|
|
|
|
duration_predictor = DurationPredictor(duration_predictor_hidden_size)
|
|
|
|
duration_predictor = DurationPredictor(duration_predictor_hidden_size)
|
|
|
|
decoder = SpeedySpeechDecoder(decoder_hidden_size, decoder_output_size,
|
|
|
|
decoder = SpeedySpeechDecoder(decoder_hidden_size, decoder_output_size,
|
|
|
|
decoder_kernel_size, decoder_dilations)
|
|
|
|
decoder_kernel_size, decoder_dilations)
|
|
|
@ -184,13 +196,15 @@ class SpeedySpeech(nn.Layer):
|
|
|
|
self.duration_predictor = duration_predictor
|
|
|
|
self.duration_predictor = duration_predictor
|
|
|
|
self.decoder = decoder
|
|
|
|
self.decoder = decoder
|
|
|
|
|
|
|
|
|
|
|
|
def forward(self, text, tones, durations):
|
|
|
|
def forward(self, text, tones, durations, spk_id: paddle.Tensor=None):
|
|
|
|
# input of embedding must be int64
|
|
|
|
# input of embedding must be int64
|
|
|
|
text = paddle.cast(text, 'int64')
|
|
|
|
text = paddle.cast(text, 'int64')
|
|
|
|
tones = paddle.cast(tones, 'int64')
|
|
|
|
tones = paddle.cast(tones, 'int64')
|
|
|
|
|
|
|
|
if spk_id is not None:
|
|
|
|
|
|
|
|
spk_id = paddle.cast(spk_id, 'int64')
|
|
|
|
durations = paddle.cast(durations, 'int64')
|
|
|
|
durations = paddle.cast(durations, 'int64')
|
|
|
|
encodings = self.encoder(text, tones)
|
|
|
|
encodings = self.encoder(text, tones, spk_id)
|
|
|
|
# (B, T)
|
|
|
|
|
|
|
|
pred_durations = self.duration_predictor(encodings.detach())
|
|
|
|
pred_durations = self.duration_predictor(encodings.detach())
|
|
|
|
|
|
|
|
|
|
|
|
# expand encodings
|
|
|
|
# expand encodings
|
|
|
@ -204,7 +218,7 @@ class SpeedySpeech(nn.Layer):
|
|
|
|
decoded = self.decoder(encodings)
|
|
|
|
decoded = self.decoder(encodings)
|
|
|
|
return decoded, pred_durations
|
|
|
|
return decoded, pred_durations
|
|
|
|
|
|
|
|
|
|
|
|
def inference(self, text, tones=None):
|
|
|
|
def inference(self, text, tones=None, spk_id=None):
|
|
|
|
# text: [T]
|
|
|
|
# text: [T]
|
|
|
|
# tones: [T]
|
|
|
|
# tones: [T]
|
|
|
|
# input of embedding must be int64
|
|
|
|
# input of embedding must be int64
|
|
|
@ -214,7 +228,8 @@ class SpeedySpeech(nn.Layer):
|
|
|
|
tones = paddle.cast(tones, 'int64')
|
|
|
|
tones = paddle.cast(tones, 'int64')
|
|
|
|
tones = tones.unsqueeze(0)
|
|
|
|
tones = tones.unsqueeze(0)
|
|
|
|
|
|
|
|
|
|
|
|
encodings = self.encoder(text, tones)
|
|
|
|
encodings = self.encoder(text, tones, spk_id)
|
|
|
|
|
|
|
|
|
|
|
|
pred_durations = self.duration_predictor(encodings) # (1, T)
|
|
|
|
pred_durations = self.duration_predictor(encodings) # (1, T)
|
|
|
|
durations_to_expand = paddle.round(pred_durations.exp())
|
|
|
|
durations_to_expand = paddle.round(pred_durations.exp())
|
|
|
|
durations_to_expand = (durations_to_expand).astype(paddle.int64)
|
|
|
|
durations_to_expand = (durations_to_expand).astype(paddle.int64)
|
|
|
@ -240,14 +255,13 @@ class SpeedySpeech(nn.Layer):
|
|
|
|
decoded = self.decoder(encodings)
|
|
|
|
decoded = self.decoder(encodings)
|
|
|
|
return decoded[0]
|
|
|
|
return decoded[0]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class SpeedySpeechInference(nn.Layer):
    """Inference wrapper that denormalizes SpeedySpeech mel output.

    Bundles a trained SpeedySpeech acoustic model with the output
    normalizer used during training, so one forward call maps token
    sequences directly to log-mel spectrograms on the original scale.
    """

    def __init__(self, normalizer, speedyspeech_model):
        super().__init__()
        # Feature normalizer; its `inverse` undoes the training-time
        # normalization applied to mel targets.
        self.normalizer = normalizer
        # Trained SpeedySpeech model exposing `inference(text, tones, spk_id)`.
        self.acoustic_model = speedyspeech_model

    def forward(self, phones, tones, spk_id=None):
        """Synthesize a denormalized log-mel spectrogram.

        Args:
            phones: Phone id sequence — presumably shape [T]; matches the
                `text` argument of `SpeedySpeech.inference`.
            tones: Tone id sequence — presumably shape [T].
            spk_id: Optional speaker id tensor for multi-speaker models;
                pass None for single-speaker checkpoints.

        Returns:
            Log-mel spectrogram with the normalization inverted.
        """
        normalized_mel = self.acoustic_model.inference(phones, tones, spk_id)
        logmel = self.normalizer.inverse(normalized_mel)
        return logmel