|
|
@ -841,6 +841,9 @@ class FastSpeech2(nn.Layer):
|
|
|
|
spk_emb = self.spk_projection(F.normalize(spk_emb))
|
|
|
|
spk_emb = self.spk_projection(F.normalize(spk_emb))
|
|
|
|
hs = hs + spk_emb.unsqueeze(1)
|
|
|
|
hs = hs + spk_emb.unsqueeze(1)
|
|
|
|
elif self.spk_embed_integration_type == "concat":
|
|
|
|
elif self.spk_embed_integration_type == "concat":
|
|
|
|
|
|
|
|
# at synthesis time a single utterance's `spk_emb` is 1-D (no batch axis), so add one
|
|
|
|
|
|
|
|
if spk_emb.dim() == 1:
|
|
|
|
|
|
|
|
spk_emb = spk_emb.unsqueeze(0)
|
|
|
|
# concat hidden states with spk embeds and then apply projection
|
|
|
|
# concat hidden states with spk embeds and then apply projection
|
|
|
|
spk_emb = F.normalize(spk_emb).unsqueeze(1).expand(
|
|
|
|
spk_emb = F.normalize(spk_emb).unsqueeze(1).expand(
|
|
|
|
shape=[-1, paddle.shape(hs)[1], -1])
|
|
|
|
shape=[-1, paddle.shape(hs)[1], -1])
|
|
|
|