From be99807d6172c65298e48610d2e15a859d71cb57 Mon Sep 17 00:00:00 2001 From: Jerryuhoo Date: Tue, 11 Jan 2022 16:32:33 +0800 Subject: [PATCH] Add durations to gen_gta_mel.py inference --- .../t2s/exps/speedyspeech/gen_gta_mel.py | 6 ++- .../t2s/models/speedyspeech/speedyspeech.py | 52 +++++++++++-------- 2 files changed, 34 insertions(+), 24 deletions(-) diff --git a/paddlespeech/t2s/exps/speedyspeech/gen_gta_mel.py b/paddlespeech/t2s/exps/speedyspeech/gen_gta_mel.py index ddd961a9..0c2bb02d 100644 --- a/paddlespeech/t2s/exps/speedyspeech/gen_gta_mel.py +++ b/paddlespeech/t2s/exps/speedyspeech/gen_gta_mel.py @@ -73,7 +73,7 @@ def evaluate(args, speedyspeech_config): speedyspeech_normalizer = ZScore(mu, std) speedyspeech_inference = SpeedySpeechInference(speedyspeech_normalizer, - model) + model) speedyspeech_inference.eval() output_dir = Path(args.output_dir) @@ -138,6 +138,8 @@ def evaluate(args, speedyspeech_config): speaker_id = None durations = paddle.to_tensor(np.array(durations)) + durations = paddle.unsqueeze(durations, axis=0) + # 生成的和真实的可能有 1, 2 帧的差距,但是 batch_fn 会修复 # split data into 3 sections @@ -153,7 +155,7 @@ def evaluate(args, speedyspeech_config): sub_output_dir.mkdir(parents=True, exist_ok=True) with paddle.no_grad(): - mel = speedyspeech_inference(phone_ids, tone_ids, spk_id=speaker_id) + mel = speedyspeech_inference(phone_ids, tone_ids, durations=durations, spk_id=speaker_id) np.save(sub_output_dir / (utt_id + "_feats.npy"), mel) diff --git a/paddlespeech/t2s/models/speedyspeech/speedyspeech.py b/paddlespeech/t2s/models/speedyspeech/speedyspeech.py index 107c5f1c..263b4c6b 100644 --- a/paddlespeech/t2s/models/speedyspeech/speedyspeech.py +++ b/paddlespeech/t2s/models/speedyspeech/speedyspeech.py @@ -222,7 +222,7 @@ class SpeedySpeech(nn.Layer): decoded = self.decoder(encodings) return decoded, pred_durations - def inference(self, text, tones=None, spk_id=None): + def inference(self, text, tones=None, durations=None, spk_id=None): # text: [T] # 
tones: [T] # input of embedding must be int64 @@ -234,24 +234,28 @@ class SpeedySpeech(nn.Layer): encodings = self.encoder(text, tones, spk_id) - pred_durations = self.duration_predictor(encodings) # (1, T) - durations_to_expand = paddle.round(pred_durations.exp()) - durations_to_expand = (durations_to_expand).astype(paddle.int64) - - slens = paddle.sum(durations_to_expand, -1) # [1] - t_dec = slens[0] # [1] - t_enc = paddle.shape(pred_durations)[-1] - M = paddle.zeros([1, t_dec, t_enc]) - - k = paddle.full([1], 0, dtype=paddle.int64) - for j in range(t_enc): - d = durations_to_expand[0, j] - # If the d == 0, slice action is meaningless and not supported - if d >= 1: - M[0, k:k + d, j] = 1 - k += d - - encodings = paddle.matmul(M, encodings) + if durations is None: + pred_durations = self.duration_predictor(encodings) # (1, T) + durations_to_expand = paddle.round(pred_durations.exp()) + durations_to_expand = (durations_to_expand).astype(paddle.int64) + + slens = paddle.sum(durations_to_expand, -1) # [1] + t_dec = slens[0] # [1] + t_enc = paddle.shape(pred_durations)[-1] + M = paddle.zeros([1, t_dec, t_enc]) + + k = paddle.full([1], 0, dtype=paddle.int64) + for j in range(t_enc): + d = durations_to_expand[0, j] + # If the d == 0, slice action is meaningless and not supported + if d >= 1: + M[0, k:k + d, j] = 1 + k += d + + encodings = paddle.matmul(M, encodings) + else: + durations_to_expand = durations + encodings = expand(encodings, durations_to_expand) shape = paddle.shape(encodings) t_dec, feature_size = shape[1], shape[2] @@ -266,7 +270,11 @@ class SpeedySpeechInference(nn.Layer): self.normalizer = normalizer self.acoustic_model = speedyspeech_model - def forward(self, phones, tones, spk_id=None): - normalized_mel = self.acoustic_model.inference(phones, tones, spk_id) + def forward(self, phones, tones, durations=None, spk_id=None): + normalized_mel = self.acoustic_model.inference( + phones, + tones, + durations=durations, + spk_id=spk_id) logmel = 
self.normalizer.inverse(normalized_mel) - return logmel + return logmel