From be99807d6172c65298e48610d2e15a859d71cb57 Mon Sep 17 00:00:00 2001 From: Jerryuhoo Date: Tue, 11 Jan 2022 16:32:33 +0800 Subject: [PATCH] Add durations to gen_gta_mel.py inference --- .../t2s/exps/speedyspeech/gen_gta_mel.py | 6 ++- .../t2s/models/speedyspeech/speedyspeech.py | 52 +++++++++++-------- 2 files changed, 34 insertions(+), 24 deletions(-) diff --git a/paddlespeech/t2s/exps/speedyspeech/gen_gta_mel.py b/paddlespeech/t2s/exps/speedyspeech/gen_gta_mel.py index ddd961a9..0c2bb02d 100644 --- a/paddlespeech/t2s/exps/speedyspeech/gen_gta_mel.py +++ b/paddlespeech/t2s/exps/speedyspeech/gen_gta_mel.py @@ -73,7 +73,7 @@ def evaluate(args, speedyspeech_config): speedyspeech_normalizer = ZScore(mu, std) speedyspeech_inference = SpeedySpeechInference(speedyspeech_normalizer, - model) + model) speedyspeech_inference.eval() output_dir = Path(args.output_dir) @@ -138,6 +138,8 @@ def evaluate(args, speedyspeech_config): speaker_id = None durations = paddle.to_tensor(np.array(durations)) + durations = paddle.unsqueeze(durations, axis=0) + # 生成的和真实的可能有 1, 2 帧的差距,但是 batch_fn 会修复 # split data into 3 sections @@ -153,7 +155,7 @@ def evaluate(args, speedyspeech_config): sub_output_dir.mkdir(parents=True, exist_ok=True) with paddle.no_grad(): - mel = speedyspeech_inference(phone_ids, tone_ids, spk_id=speaker_id) + mel = speedyspeech_inference(phone_ids, tone_ids, durations=durations, spk_id=speaker_id) np.save(sub_output_dir / (utt_id + "_feats.npy"), mel) diff --git a/paddlespeech/t2s/models/speedyspeech/speedyspeech.py b/paddlespeech/t2s/models/speedyspeech/speedyspeech.py index 107c5f1c..263b4c6b 100644 --- a/paddlespeech/t2s/models/speedyspeech/speedyspeech.py +++ b/paddlespeech/t2s/models/speedyspeech/speedyspeech.py @@ -222,7 +222,7 @@ class SpeedySpeech(nn.Layer): decoded = self.decoder(encodings) return decoded, pred_durations - def inference(self, text, tones=None, spk_id=None): + def inference(self, text, tones=None, durations=None, spk_id=None): # text: [T] # 
tones: [T] # input of embedding must be int64 @@ -234,24 +234,28 @@ class SpeedySpeech(nn.Layer): encodings = self.encoder(text, tones, spk_id) - pred_durations = self.duration_predictor(encodings) # (1, T) - durations_to_expand = paddle.round(pred_durations.exp()) - durations_to_expand = (durations_to_expand).astype(paddle.int64) - - slens = paddle.sum(durations_to_expand, -1) # [1] - t_dec = slens[0] # [1] - t_enc = paddle.shape(pred_durations)[-1] - M = paddle.zeros([1, t_dec, t_enc]) - - k = paddle.full([1], 0, dtype=paddle.int64) - for j in range(t_enc): - d = durations_to_expand[0, j] - # If the d == 0, slice action is meaningless and not supported - if d >= 1: - M[0, k:k + d, j] = 1 - k += d - - encodings = paddle.matmul(M, encodings) + if durations is None: + pred_durations = self.duration_predictor(encodings) # (1, T) + durations_to_expand = paddle.round(pred_durations.exp()) + durations_to_expand = (durations_to_expand).astype(paddle.int64) + + slens = paddle.sum(durations_to_expand, -1) # [1] + t_dec = slens[0] # [1] + t_enc = paddle.shape(pred_durations)[-1] + M = paddle.zeros([1, t_dec, t_enc]) + + k = paddle.full([1], 0, dtype=paddle.int64) + for j in range(t_enc): + d = durations_to_expand[0, j] + # If the d == 0, slice action is meaningless and not supported + if d >= 1: + M[0, k:k + d, j] = 1 + k += d + + encodings = paddle.matmul(M, encodings) + else: + durations_to_expand = durations + encodings = expand(encodings, durations_to_expand) shape = paddle.shape(encodings) t_dec, feature_size = shape[1], shape[2] @@ -266,7 +270,11 @@ class SpeedySpeechInference(nn.Layer): self.normalizer = normalizer self.acoustic_model = speedyspeech_model - def forward(self, phones, tones, spk_id=None): - normalized_mel = self.acoustic_model.inference(phones, tones, spk_id) + def forward(self, phones, tones, durations=None, spk_id=None): + normalized_mel = self.acoustic_model.inference( + phones, + tones, + durations=durations, + spk_id=spk_id) logmel = 
self.normalizer.inverse(normalized_mel) - return logmel + return logmel