|
|
@ -222,7 +222,7 @@ class SpeedySpeech(nn.Layer):
|
|
|
|
decoded = self.decoder(encodings)
|
|
|
|
decoded = self.decoder(encodings)
|
|
|
|
return decoded, pred_durations
|
|
|
|
return decoded, pred_durations
|
|
|
|
|
|
|
|
|
|
|
|
def inference(self, text, tones=None, spk_id=None):
|
|
|
|
def inference(self, text, tones=None, durations=None, spk_id=None):
|
|
|
|
# text: [T]
|
|
|
|
# text: [T]
|
|
|
|
# tones: [T]
|
|
|
|
# tones: [T]
|
|
|
|
# input of embedding must be int64
|
|
|
|
# input of embedding must be int64
|
|
|
@ -234,6 +234,7 @@ class SpeedySpeech(nn.Layer):
|
|
|
|
|
|
|
|
|
|
|
|
encodings = self.encoder(text, tones, spk_id)
|
|
|
|
encodings = self.encoder(text, tones, spk_id)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if type(durations) == type(None):
|
|
|
|
pred_durations = self.duration_predictor(encodings) # (1, T)
|
|
|
|
pred_durations = self.duration_predictor(encodings) # (1, T)
|
|
|
|
durations_to_expand = paddle.round(pred_durations.exp())
|
|
|
|
durations_to_expand = paddle.round(pred_durations.exp())
|
|
|
|
durations_to_expand = (durations_to_expand).astype(paddle.int64)
|
|
|
|
durations_to_expand = (durations_to_expand).astype(paddle.int64)
|
|
|
@ -252,6 +253,9 @@ class SpeedySpeech(nn.Layer):
|
|
|
|
k += d
|
|
|
|
k += d
|
|
|
|
|
|
|
|
|
|
|
|
encodings = paddle.matmul(M, encodings)
|
|
|
|
encodings = paddle.matmul(M, encodings)
|
|
|
|
|
|
|
|
else:
|
|
|
|
|
|
|
|
durations_to_expand = durations
|
|
|
|
|
|
|
|
encodings = expand(encodings, durations_to_expand)
|
|
|
|
|
|
|
|
|
|
|
|
shape = paddle.shape(encodings)
|
|
|
|
shape = paddle.shape(encodings)
|
|
|
|
t_dec, feature_size = shape[1], shape[2]
|
|
|
|
t_dec, feature_size = shape[1], shape[2]
|
|
|
@ -266,7 +270,11 @@ class SpeedySpeechInference(nn.Layer):
|
|
|
|
self.normalizer = normalizer
|
|
|
|
self.normalizer = normalizer
|
|
|
|
self.acoustic_model = speedyspeech_model
|
|
|
|
self.acoustic_model = speedyspeech_model
|
|
|
|
|
|
|
|
|
|
|
|
def forward(self, phones, tones, spk_id=None):
|
|
|
|
def forward(self, phones, tones, durations=None, spk_id=None):
|
|
|
|
normalized_mel = self.acoustic_model.inference(phones, tones, spk_id)
|
|
|
|
normalized_mel = self.acoustic_model.inference(
|
|
|
|
|
|
|
|
phones,
|
|
|
|
|
|
|
|
tones,
|
|
|
|
|
|
|
|
durations=durations,
|
|
|
|
|
|
|
|
spk_id=spk_id)
|
|
|
|
logmel = self.normalizer.inverse(normalized_mel)
|
|
|
|
logmel = self.normalizer.inverse(normalized_mel)
|
|
|
|
return logmel
|
|
|
|
return logmel
|