From 3e53497a28ddfc69997d44def1e896df89e60353 Mon Sep 17 00:00:00 2001
From: megemini
Date: Fri, 29 Nov 2024 19:16:55 +0800
Subject: [PATCH] =?UTF-8?q?[Hackathon=207th]=20=E4=BF=AE=E5=A4=8D=20vctk?=
 =?UTF-8?q?=20=E4=B8=AD=20`spk=5Femb`=20=E7=BB=B4=E5=BA=A6=E9=97=AE?=
 =?UTF-8?q?=E9=A2=98=20(#3916)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* [Fix] vctk spk_emb dim

* [Update] dim == 1
---
 paddlespeech/t2s/models/fastspeech2/fastspeech2.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/paddlespeech/t2s/models/fastspeech2/fastspeech2.py b/paddlespeech/t2s/models/fastspeech2/fastspeech2.py
index a95a9b288..fcd54f0d2 100644
--- a/paddlespeech/t2s/models/fastspeech2/fastspeech2.py
+++ b/paddlespeech/t2s/models/fastspeech2/fastspeech2.py
@@ -841,6 +841,9 @@ class FastSpeech2(nn.Layer):
             spk_emb = self.spk_projection(F.normalize(spk_emb))
             hs = hs + spk_emb.unsqueeze(1)
         elif self.spk_embed_integration_type == "concat":
+            # When synthesizing a single wave, `spk_emb` arrives 1-D; add a
+            # leading batch dim so the concat path below works.
+            if spk_emb.dim() == 1:
+                spk_emb = spk_emb.unsqueeze(0)
             # concat hidden states with spk embeds and then apply projection
             spk_emb = F.normalize(spk_emb).unsqueeze(1).expand(
                 shape=[-1, paddle.shape(hs)[1], -1])