Add speaker embedding (spk_emb) and speaker ID (spk_id) support to StyleFastSpeech2Inference

3 years ago · 3cbfd7bf35
parent db121226b8
commit 3cbfd7bf35
1 changed file with 9 additions and 3 deletions
--- a/paddlespeech/t2s/models/fastspeech2/fastspeech2.py
+++ b/paddlespeech/t2s/models/fastspeech2/fastspeech2.py
@ -907,7 +907,9 @@ class StyleFastSpeech2Inference(FastSpeech2Inference):
                energy: Union[paddle.Tensor, np.ndarray]=None,
                energy_scale: Union[int, float]=None,
                energy_bias: Union[int, float]=None,
-                robot: bool=False):
+                robot: bool=False,
+                spk_emb=None,
+                spk_id=None):
        """
        Parameters
        ----------
@ -938,8 +940,9 @@ class StyleFastSpeech2Inference(FastSpeech2Inference):
        Tensor
            Output sequence of features (L, odim).
        """
+        spk_id = paddle.to_tensor(spk_id)
        normalized_mel, d_outs, p_outs, e_outs = self.acoustic_model.inference(
-            text, durations=None, pitch=None, energy=None)
+            text, durations=None, pitch=None, energy=None, spk_emb=spk_emb, spk_id=spk_id)
        # priority: groundtruth > scale/bias > previous output
        # set durations
        if isinstance(durations, np.ndarray):
@ -991,7 +994,10 @@ class StyleFastSpeech2Inference(FastSpeech2Inference):
            durations=durations,
            pitch=pitch,
            energy=energy,
-            use_teacher_forcing=True)
+            use_teacher_forcing=True,
+            spk_emb=spk_emb, 
+            spk_id=spk_id
+            )

        logmel = self.normalizer.inverse(normalized_mel)
        return logmel