From 3cbfd7bf35d8900a2d7e07be62bd9736cc685d94 Mon Sep 17 00:00:00 2001 From: Jerryuhoo Date: Wed, 22 Dec 2021 22:26:36 +0800 Subject: [PATCH] Add speaker embedding and speaker id for style fastspeech2 inference --- paddlespeech/t2s/models/fastspeech2/fastspeech2.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/paddlespeech/t2s/models/fastspeech2/fastspeech2.py b/paddlespeech/t2s/models/fastspeech2/fastspeech2.py index cdec03ab..1679f037 100644 --- a/paddlespeech/t2s/models/fastspeech2/fastspeech2.py +++ b/paddlespeech/t2s/models/fastspeech2/fastspeech2.py @@ -907,7 +907,9 @@ class StyleFastSpeech2Inference(FastSpeech2Inference): energy: Union[paddle.Tensor, np.ndarray]=None, energy_scale: Union[int, float]=None, energy_bias: Union[int, float]=None, - robot: bool=False): + robot: bool=False, + spk_emb=None, + spk_id=None): """ Parameters ---------- @@ -938,8 +940,9 @@ class StyleFastSpeech2Inference(FastSpeech2Inference): Tensor Output sequence of features (L, odim). """ + spk_id = paddle.to_tensor(spk_id) normalized_mel, d_outs, p_outs, e_outs = self.acoustic_model.inference( - text, durations=None, pitch=None, energy=None) + text, durations=None, pitch=None, energy=None, spk_emb=spk_emb, spk_id=spk_id) # priority: groundtruth > scale/bias > previous output # set durations if isinstance(durations, np.ndarray): @@ -991,7 +994,10 @@ class StyleFastSpeech2Inference(FastSpeech2Inference): durations=durations, pitch=pitch, energy=energy, - use_teacher_forcing=True) + use_teacher_forcing=True, + spk_emb=spk_emb, + spk_id=spk_id + ) logmel = self.normalizer.inverse(normalized_mel) return logmel