Add durations to gen_gta_mel.py inference

pull/1302/head
Jerryuhoo 3 years ago
parent 61b68ed3ef
commit be99807d61

@@ -73,7 +73,7 @@ def evaluate(args, speedyspeech_config):
speedyspeech_normalizer = ZScore(mu, std)
speedyspeech_inference = SpeedySpeechInference(speedyspeech_normalizer,
model)
model)
speedyspeech_inference.eval()
output_dir = Path(args.output_dir)
@@ -138,6 +138,8 @@ def evaluate(args, speedyspeech_config):
speaker_id = None
durations = paddle.to_tensor(np.array(durations))
durations = paddle.unsqueeze(durations, axis=0)
# the generated output and the real one may differ by 1 or 2 frames, but batch_fn will fix that
# split data into 3 sections
@@ -153,7 +155,7 @@ def evaluate(args, speedyspeech_config):
sub_output_dir.mkdir(parents=True, exist_ok=True)
with paddle.no_grad():
mel = speedyspeech_inference(phone_ids, tone_ids, spk_id=speaker_id)
mel = speedyspeech_inference(phone_ids, tone_ids, durations=durations, spk_id=speaker_id)
np.save(sub_output_dir / (utt_id + "_feats.npy"), mel)

@@ -222,7 +222,7 @@ class SpeedySpeech(nn.Layer):
decoded = self.decoder(encodings)
return decoded, pred_durations
def inference(self, text, tones=None, spk_id=None):
def inference(self, text, tones=None, durations=None, spk_id=None):
# text: [T]
# tones: [T]
# input of embedding must be int64
@@ -234,24 +234,28 @@ class SpeedySpeech(nn.Layer):
encodings = self.encoder(text, tones, spk_id)
pred_durations = self.duration_predictor(encodings) # (1, T)
durations_to_expand = paddle.round(pred_durations.exp())
durations_to_expand = (durations_to_expand).astype(paddle.int64)
slens = paddle.sum(durations_to_expand, -1) # [1]
t_dec = slens[0] # [1]
t_enc = paddle.shape(pred_durations)[-1]
M = paddle.zeros([1, t_dec, t_enc])
k = paddle.full([1], 0, dtype=paddle.int64)
for j in range(t_enc):
d = durations_to_expand[0, j]
# If the d == 0, slice action is meaningless and not supported
if d >= 1:
M[0, k:k + d, j] = 1
k += d
encodings = paddle.matmul(M, encodings)
if type(durations) == type(None):
pred_durations = self.duration_predictor(encodings) # (1, T)
durations_to_expand = paddle.round(pred_durations.exp())
durations_to_expand = (durations_to_expand).astype(paddle.int64)
slens = paddle.sum(durations_to_expand, -1) # [1]
t_dec = slens[0] # [1]
t_enc = paddle.shape(pred_durations)[-1]
M = paddle.zeros([1, t_dec, t_enc])
k = paddle.full([1], 0, dtype=paddle.int64)
for j in range(t_enc):
d = durations_to_expand[0, j]
# If the d == 0, slice action is meaningless and not supported
if d >= 1:
M[0, k:k + d, j] = 1
k += d
encodings = paddle.matmul(M, encodings)
else:
durations_to_expand = durations
encodings = expand(encodings, durations_to_expand)
shape = paddle.shape(encodings)
t_dec, feature_size = shape[1], shape[2]
@@ -266,7 +270,11 @@ class SpeedySpeechInference(nn.Layer):
self.normalizer = normalizer
self.acoustic_model = speedyspeech_model
def forward(self, phones, tones, spk_id=None):
normalized_mel = self.acoustic_model.inference(phones, tones, spk_id)
def forward(self, phones, tones, durations=None, spk_id=None):
normalized_mel = self.acoustic_model.inference(
phones,
tones,
durations=durations,
spk_id=spk_id)
logmel = self.normalizer.inverse(normalized_mel)
return logmel
return logmel
Loading…
Cancel
Save