Merge pull request #1197 from jerryuhoo/develop

Add speaker embedding and speaker id for style fastspeech2 inference
pull/1202/head
TianYuan 3 years ago committed by GitHub
commit de8e09fd97
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@@ -907,7 +907,9 @@ class StyleFastSpeech2Inference(FastSpeech2Inference):
energy: Union[paddle.Tensor, np.ndarray]=None, energy: Union[paddle.Tensor, np.ndarray]=None,
energy_scale: Union[int, float]=None, energy_scale: Union[int, float]=None,
energy_bias: Union[int, float]=None, energy_bias: Union[int, float]=None,
robot: bool=False): robot: bool=False,
spk_emb=None,
spk_id=None):
""" """
Parameters Parameters
---------- ----------
@@ -938,8 +940,9 @@ class StyleFastSpeech2Inference(FastSpeech2Inference):
Tensor Tensor
Output sequence of features (L, odim). Output sequence of features (L, odim).
""" """
spk_id = paddle.to_tensor(spk_id)
normalized_mel, d_outs, p_outs, e_outs = self.acoustic_model.inference( normalized_mel, d_outs, p_outs, e_outs = self.acoustic_model.inference(
text, durations=None, pitch=None, energy=None) text, durations=None, pitch=None, energy=None, spk_emb=spk_emb, spk_id=spk_id)
# priority: groundtruth > scale/bias > previous output # priority: groundtruth > scale/bias > previous output
# set durations # set durations
if isinstance(durations, np.ndarray): if isinstance(durations, np.ndarray):
@@ -991,7 +994,10 @@ class StyleFastSpeech2Inference(FastSpeech2Inference):
durations=durations, durations=durations,
pitch=pitch, pitch=pitch,
energy=energy, energy=energy,
use_teacher_forcing=True) use_teacher_forcing=True,
spk_emb=spk_emb,
spk_id=spk_id
)
logmel = self.normalizer.inverse(normalized_mel) logmel = self.normalizer.inverse(normalized_mel)
return logmel return logmel

Loading…
Cancel
Save