Add durations to gen_gta_mel.py inference

4 years ago · be99807d61
parent 61b68ed3ef
commit be99807d61
2 changed files with 34 additions and 24 deletions
--- a/paddlespeech/t2s/exps/speedyspeech/gen_gta_mel.py
+++ b/paddlespeech/t2s/exps/speedyspeech/gen_gta_mel.py
@ -138,6 +138,8 @@ def evaluate(args, speedyspeech_config):
            speaker_id = None
        durations = paddle.to_tensor(np.array(durations))
        durations = paddle.unsqueeze(durations, axis=0)
        # The generated output and the ground truth may differ by 1 or 2 frames, but batch_fn will fix that
        # split data into 3 sections
@ -153,7 +155,7 @@ def evaluate(args, speedyspeech_config):
        sub_output_dir.mkdir(parents=True, exist_ok=True)
        with paddle.no_grad():
-            mel = speedyspeech_inference(phone_ids, tone_ids, spk_id=speaker_id)
+            mel = speedyspeech_inference(phone_ids, tone_ids, durations=durations, spk_id=speaker_id)
        np.save(sub_output_dir / (utt_id + "_feats.npy"), mel)
--- a/paddlespeech/t2s/models/speedyspeech/speedyspeech.py
+++ b/paddlespeech/t2s/models/speedyspeech/speedyspeech.py
@ -222,7 +222,7 @@ class SpeedySpeech(nn.Layer):
        decoded = self.decoder(encodings)
        return decoded, pred_durations
-    def inference(self, text, tones=None, spk_id=None):
+    def inference(self, text, tones=None, durations=None, spk_id=None):
        # text: [T]
        # tones: [T]
        # input of embedding must be int64
@ -234,6 +234,7 @@ class SpeedySpeech(nn.Layer):
        encodings = self.encoder(text, tones, spk_id)
        if type(durations) == type(None):
            pred_durations = self.duration_predictor(encodings)  # (1, T)
            durations_to_expand = paddle.round(pred_durations.exp())
            durations_to_expand = (durations_to_expand).astype(paddle.int64)
@ -252,6 +253,9 @@ class SpeedySpeech(nn.Layer):
                k += d
            encodings = paddle.matmul(M, encodings)
        else:
            durations_to_expand = durations
            encodings = expand(encodings, durations_to_expand)
        shape = paddle.shape(encodings)
        t_dec, feature_size = shape[1], shape[2]
@ -266,7 +270,11 @@ class SpeedySpeechInference(nn.Layer):
        self.normalizer = normalizer
        self.acoustic_model = speedyspeech_model
-    def forward(self, phones, tones, spk_id=None):
+    def forward(self, phones, tones, durations=None, spk_id=None):
-        normalized_mel = self.acoustic_model.inference(phones, tones, spk_id)
+        normalized_mel = self.acoustic_model.inference(
            phones, 
            tones, 
            durations=durations, 
            spk_id=spk_id)
        logmel = self.normalizer.inverse(normalized_mel)
        return logmel