From a97c7b52064b974b55455b623bd39c6334f2b66c Mon Sep 17 00:00:00 2001 From: TianYuan Date: Thu, 18 Nov 2021 08:42:21 +0000 Subject: [PATCH] rename spembs --- examples/aishell3/vc1/local/preprocess.sh | 2 +- examples/csmsc/voc1/run.sh | 2 +- paddlespeech/t2s/datasets/am_batch_fn.py | 16 +++---- .../t2s/exps/fastspeech2/normalize.py | 6 +-- .../t2s/exps/fastspeech2/preprocess.py | 30 ++++++------ .../t2s/exps/fastspeech2/synthesize.py | 10 ++-- paddlespeech/t2s/exps/fastspeech2/train.py | 4 +- .../t2s/exps/fastspeech2/voice_cloning.py | 17 +++---- .../t2s/models/fastspeech2/fastspeech2.py | 46 +++++++++---------- .../models/fastspeech2/fastspeech2_updater.py | 12 ++--- .../models/transformer_tts/transformer_tts.py | 39 ++++++++-------- 11 files changed, 92 insertions(+), 92 deletions(-) diff --git a/examples/aishell3/vc1/local/preprocess.sh b/examples/aishell3/vc1/local/preprocess.sh index fd52d1538..5f939a1a8 100755 --- a/examples/aishell3/vc1/local/preprocess.sh +++ b/examples/aishell3/vc1/local/preprocess.sh @@ -35,7 +35,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then --config=${config_path} \ --num-cpu=20 \ --cut-sil=True \ - --embed-dir=dump/embed + --spk_emb_dir=dump/embed fi if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then diff --git a/examples/csmsc/voc1/run.sh b/examples/csmsc/voc1/run.sh index 163095439..409c8bf0f 100755 --- a/examples/csmsc/voc1/run.sh +++ b/examples/csmsc/voc1/run.sh @@ -3,7 +3,7 @@ set -e source path.sh -gpus=0,1 +gpus=4,5 stage=0 stop_stage=100 diff --git a/paddlespeech/t2s/datasets/am_batch_fn.py b/paddlespeech/t2s/datasets/am_batch_fn.py index 590f1bcef..9470f9234 100644 --- a/paddlespeech/t2s/datasets/am_batch_fn.py +++ b/paddlespeech/t2s/datasets/am_batch_fn.py @@ -100,7 +100,7 @@ def fastspeech2_single_spk_batch_fn(examples): def fastspeech2_multi_spk_batch_fn(examples): - # fields = ["text", "text_lengths", "speech", "speech_lengths", "durations", "pitch", "energy", "spk_id"/"spembs"] + # fields = ["text", "text_lengths", "speech", "speech_lengths", "durations", "pitch", "energy", "spk_id"/"spk_emb"] text = [np.array(item["text"], dtype=np.int64) for item in examples] speech = [np.array(item["speech"], dtype=np.float32) for item in examples] pitch = [np.array(item["pitch"], dtype=np.float32) for item in examples] @@ -139,14 +139,14 @@ def fastspeech2_multi_spk_batch_fn(examples): "pitch": pitch, "energy": energy } - # spembs has a higher priority than spk_id - if "spembs" in examples[0]: - spembs = [ - np.array(item["spembs"], dtype=np.float32) for item in examples + # spk_emb has a higher priority than spk_id + if "spk_emb" in examples[0]: + spk_emb = [ + np.array(item["spk_emb"], dtype=np.float32) for item in examples ] - spembs = batch_sequences(spembs) - spembs = paddle.to_tensor(spembs) - batch["spembs"] = spembs + spk_emb = batch_sequences(spk_emb) + spk_emb = paddle.to_tensor(spk_emb) + batch["spk_emb"] = spk_emb elif "spk_id" in examples[0]: spk_id = [np.array(item["spk_id"], dtype=np.int64) for item in examples] spk_id = paddle.to_tensor(spk_id) diff --git a/paddlespeech/t2s/exps/fastspeech2/normalize.py b/paddlespeech/t2s/exps/fastspeech2/normalize.py index d5b6e04a7..8ec20ebf0 100644 --- a/paddlespeech/t2s/exps/fastspeech2/normalize.py +++ b/paddlespeech/t2s/exps/fastspeech2/normalize.py @@ -167,9 +167,9 @@ def main(): "pitch": str(pitch_path), "energy": str(energy_path) } - # add spembs for voice cloning - if "spembs" in item: - record["spembs"] = str(item["spembs"]) + # add spk_emb for voice cloning + if "spk_emb" in item: 
+            record["spk_emb"] = str(item["spk_emb"])
         output_metadata.append(record)
     output_metadata.sort(key=itemgetter('utt_id'))
diff --git a/paddlespeech/t2s/exps/fastspeech2/preprocess.py b/paddlespeech/t2s/exps/fastspeech2/preprocess.py
index 65848ea11..b874b3a70 100644
--- a/paddlespeech/t2s/exps/fastspeech2/preprocess.py
+++ b/paddlespeech/t2s/exps/fastspeech2/preprocess.py
@@ -45,7 +45,7 @@ def process_sentence(config: Dict[str, Any],
                      pitch_extractor=None,
                      energy_extractor=None,
                      cut_sil: bool=True,
-                     embed_dir: Path=None):
+                     spk_emb_dir: Path=None):
     utt_id = fp.stem
     # for vctk
     if utt_id.endswith("_mic2"):
@@ -117,12 +117,12 @@ def process_sentence(config: Dict[str, Any],
             "energy": str(energy_path),
             "speaker": speaker
         }
-        if embed_dir:
-            if speaker in os.listdir(embed_dir):
+        if spk_emb_dir:
+            if speaker in os.listdir(spk_emb_dir):
                 embed_name = utt_id + ".npy"
-                embed_path = embed_dir / speaker / embed_name
+                embed_path = spk_emb_dir / speaker / embed_name
                 if embed_path.is_file():
-                    record["spembs"] = str(embed_path)
+                    record["spk_emb"] = str(embed_path)
                 else:
                     return None
     return record
@@ -137,13 +137,13 @@ def process_sentences(config,
                       energy_extractor=None,
                       nprocs: int=1,
                       cut_sil: bool=True,
-                      embed_dir: Path=None):
+                      spk_emb_dir: Path=None):
     if nprocs == 1:
         results = []
         for fp in fps:
             record = process_sentence(config, fp, sentences, output_dir,
                                       mel_extractor, pitch_extractor,
-                                      energy_extractor, cut_sil, embed_dir)
+                                      energy_extractor, cut_sil, spk_emb_dir)
             if record:
                 results.append(record)
     else:
@@ -154,7 +154,7 @@ def process_sentences(config,
                     future = pool.submit(process_sentence, config, fp,
                                          sentences, output_dir, mel_extractor,
                                          pitch_extractor, energy_extractor,
-                                         cut_sil, embed_dir)
+                                         cut_sil, spk_emb_dir)
                     future.add_done_callback(lambda p: progress.update())
                     futures.append(future)
@@ -213,7 +213,7 @@ def main():
         help="whether to trim silence from the edges of the audio")
 
     parser.add_argument(
-        "--embed-dir",
+        "--spk_emb_dir",
         default=None,
         type=str,
         help="directory of speaker embedding files.")
@@ -226,10 +226,10 @@ def main():
     dumpdir.mkdir(parents=True, exist_ok=True)
     dur_file = Path(args.dur_file).expanduser()
 
-    if args.embed_dir:
-        embed_dir = Path(args.embed_dir).expanduser().resolve()
+    if args.spk_emb_dir:
+        spk_emb_dir = Path(args.spk_emb_dir).expanduser().resolve()
     else:
-        embed_dir = None
+        spk_emb_dir = None
 
     assert rootdir.is_dir()
     assert dur_file.is_file()
@@ -339,7 +339,7 @@ def main():
            energy_extractor,
            nprocs=args.num_cpu,
            cut_sil=args.cut_sil,
-            embed_dir=embed_dir)
+            spk_emb_dir=spk_emb_dir)
    if dev_wav_files:
        process_sentences(
            config,
@@ -350,7 +350,7 @@ def main():
            pitch_extractor,
            energy_extractor,
            cut_sil=args.cut_sil,
-            embed_dir=embed_dir)
+            spk_emb_dir=spk_emb_dir)
    if test_wav_files:
        process_sentences(
            config,
@@ -362,7 +362,7 @@ def main():
            energy_extractor,
            nprocs=args.num_cpu,
            cut_sil=args.cut_sil,
-            embed_dir=embed_dir)
+            spk_emb_dir=spk_emb_dir)
 
 
 if __name__ == "__main__":
diff --git a/paddlespeech/t2s/exps/fastspeech2/synthesize.py b/paddlespeech/t2s/exps/fastspeech2/synthesize.py
index f7d462c51..c9f36f9ca 100644
--- a/paddlespeech/t2s/exps/fastspeech2/synthesize.py
+++ b/paddlespeech/t2s/exps/fastspeech2/synthesize.py
@@ -49,7 +49,7 @@ def evaluate(args, fastspeech2_config, pwg_config):
         fields += ["spk_id"]
     elif args.voice_cloning:
         print("voice cloning!")
-        fields += ["spembs"]
+        fields += ["spk_emb"]
     else:
         print("single speaker fastspeech2!")
     print("num_speakers:", num_speakers)
@@ -99,15 +99,15 @@ def evaluate(args, fastspeech2_config, pwg_config):
     for datum in 
test_dataset:
         utt_id = datum["utt_id"]
         text = paddle.to_tensor(datum["text"])
-        spembs = None
+        spk_emb = None
         spk_id = None
-        if args.voice_cloning and "spembs" in datum:
-            spembs = paddle.to_tensor(np.load(datum["spembs"]))
+        if args.voice_cloning and "spk_emb" in datum:
+            spk_emb = paddle.to_tensor(np.load(datum["spk_emb"]))
         elif "spk_id" in datum:
             spk_id = paddle.to_tensor(datum["spk_id"])
         with paddle.no_grad():
             wav = pwg_inference(
-                fastspeech2_inference(text, spk_id=spk_id, spembs=spembs))
+                fastspeech2_inference(text, spk_id=spk_id, spk_emb=spk_emb))
         sf.write(
             str(output_dir / (utt_id + ".wav")),
             wav.numpy(),
diff --git a/paddlespeech/t2s/exps/fastspeech2/train.py b/paddlespeech/t2s/exps/fastspeech2/train.py
index 5dc278106..6f42a71e4 100644
--- a/paddlespeech/t2s/exps/fastspeech2/train.py
+++ b/paddlespeech/t2s/exps/fastspeech2/train.py
@@ -73,8 +73,8 @@ def train_sp(args, config):
     elif args.voice_cloning:
         print("Training voice cloning!")
         collate_fn = fastspeech2_multi_spk_batch_fn
-        fields += ["spembs"]
-        converters["spembs"] = np.load
+        fields += ["spk_emb"]
+        converters["spk_emb"] = np.load
     else:
         print("single speaker fastspeech2!")
         collate_fn = fastspeech2_single_spk_batch_fn
diff --git a/paddlespeech/t2s/exps/fastspeech2/voice_cloning.py b/paddlespeech/t2s/exps/fastspeech2/voice_cloning.py
index 1fda598bc..9fbd49641 100644
--- a/paddlespeech/t2s/exps/fastspeech2/voice_cloning.py
+++ b/paddlespeech/t2s/exps/fastspeech2/voice_cloning.py
@@ -107,24 +107,25 @@ def voice_cloning(args, fastspeech2_config, pwg_config):
         mel_sequences = p.extract_mel_partials(p.preprocess_wav(ref_audio_path))
         # print("mel_sequences: ", mel_sequences.shape)
         with paddle.no_grad():
-            spembs = speaker_encoder.embed_utterance(
+            spk_emb = speaker_encoder.embed_utterance(
                 paddle.to_tensor(mel_sequences))
-        # print("spembs shape: ", spembs.shape)
+        # print("spk_emb shape: ", spk_emb.shape)
         with paddle.no_grad():
-            wav = pwg_inference(fastspeech2_inference(phone_ids, spembs=spembs))
+            wav = pwg_inference(
+                fastspeech2_inference(phone_ids, spk_emb=spk_emb))
         sf.write(
             str(output_dir / (utt_id + ".wav")),
             wav.numpy(),
             samplerate=fastspeech2_config.fs)
         print(f"{utt_id} done!")
-    # Randomly generate numbers of 0 ~ 0.2, 256 is the dim of spembs
-    random_spembs = np.random.rand(256) * 0.2
-    random_spembs = paddle.to_tensor(random_spembs)
-    utt_id = "random_spembs"
+    # Randomly generate a 256-dim spk_emb with values in [0, 0.2)
+    random_spk_emb = np.random.rand(256) * 0.2
+    random_spk_emb = paddle.to_tensor(random_spk_emb)
+    utt_id = "random_spk_emb"
 
     with paddle.no_grad():
-        wav = pwg_inference(fastspeech2_inference(phone_ids, spembs=spembs))
+        wav = pwg_inference(fastspeech2_inference(phone_ids, spk_emb=random_spk_emb))
     sf.write(
         str(output_dir / (utt_id + ".wav")),
         wav.numpy(),
diff --git a/paddlespeech/t2s/models/fastspeech2/fastspeech2.py b/paddlespeech/t2s/models/fastspeech2/fastspeech2.py
index 38c0cc0e6..d0084f721 100644
--- a/paddlespeech/t2s/models/fastspeech2/fastspeech2.py
+++ b/paddlespeech/t2s/models/fastspeech2/fastspeech2.py
@@ -297,7 +297,7 @@ class FastSpeech2(nn.Layer):
         pitch: paddle.Tensor,
         energy: paddle.Tensor,
         tone_id: paddle.Tensor=None,
-        spembs: paddle.Tensor=None,
+        spk_emb: paddle.Tensor=None,
         spk_id: paddle.Tensor=None
     ) -> Tuple[paddle.Tensor, Dict[str, paddle.Tensor], paddle.Tensor]:
         """Calculate forward propagation.
 
         Parameters
         ----------
         text : Tensor(int64)
             Batch of padded token ids (B, Tmax).
         text_lengths : Tensor(int64)
             Batch of lengths of each input (B,).
         speech : Tensor
             Batch of padded target features (B, Lmax, odim).
         speech_lengths : Tensor(int64)
             Batch of the lengths of each target (B,).
         durations : Tensor(int64)
             Batch of padded durations (B, Tmax).
         pitch : Tensor
             Batch of padded token-averaged pitch (B, Tmax, 1).
         energy : Tensor
             Batch of padded token-averaged energy (B, Tmax, 1).
         tone_id : Tensor, optional(int64)
             Batch of padded tone ids (B, Tmax).
- spembs : Tensor, optional + spk_emb : Tensor, optional Batch of speaker embeddings (B, spk_embed_dim). spk_id : Tnesor, optional(int64) Batch of speaker ids (B,) @@ -364,7 +364,7 @@ class FastSpeech2(nn.Layer): ps, es, is_inference=False, - spembs=spembs, + spk_emb=spk_emb, spk_id=spk_id, tone_id=tone_id) # modify mod part of groundtruth @@ -385,7 +385,7 @@ class FastSpeech2(nn.Layer): es: paddle.Tensor=None, is_inference: bool=False, alpha: float=1.0, - spembs=None, + spk_emb=None, spk_id=None, tone_id=None) -> Sequence[paddle.Tensor]: # forward encoder @@ -395,12 +395,12 @@ class FastSpeech2(nn.Layer): # integrate speaker embedding if self.spk_embed_dim is not None: - # spembs has a higher priority than spk_id - if spembs is not None: - hs = self._integrate_with_spk_embed(hs, spembs) + # spk_emb has a higher priority than spk_id + if spk_emb is not None: + hs = self._integrate_with_spk_embed(hs, spk_emb) elif spk_id is not None: - spembs = self.spk_embedding_table(spk_id) - hs = self._integrate_with_spk_embed(hs, spembs) + spk_emb = self.spk_embedding_table(spk_id) + hs = self._integrate_with_spk_embed(hs, spk_emb) # integrate tone embedding if self.tone_embed_dim is not None: @@ -488,7 +488,7 @@ class FastSpeech2(nn.Layer): energy: paddle.Tensor=None, alpha: float=1.0, use_teacher_forcing: bool=False, - spembs=None, + spk_emb=None, spk_id=None, tone_id=None, ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]: @@ -511,7 +511,7 @@ class FastSpeech2(nn.Layer): use_teacher_forcing : bool, optional Whether to use teacher forcing. If true, groundtruth of duration, pitch and energy will be used. - spembs : Tensor, optional + spk_emb : Tensor, optional peaker embedding vector (spk_embed_dim,). spk_id : Tensor, optional(int64) Batch of padded spk ids (1,). @@ -535,8 +535,8 @@ class FastSpeech2(nn.Layer): if y is not None: ys = y.unsqueeze(0) - if spembs is not None: - spembs = spembs.unsqueeze(0) + if spk_emb is not None: + spk_emb = spk_emb.unsqueeze(0) if tone_id is not None: tone_id = tone_id.unsqueeze(0) @@ -555,7 +555,7 @@ class FastSpeech2(nn.Layer): ds=ds, ps=ps, es=es, - spembs=spembs, + spk_emb=spk_emb, spk_id=spk_id, tone_id=tone_id, is_inference=True) @@ -567,19 +567,19 @@ class FastSpeech2(nn.Layer): ys, is_inference=True, alpha=alpha, - spembs=spembs, + spk_emb=spk_emb, spk_id=spk_id, tone_id=tone_id) return outs[0], d_outs[0], p_outs[0], e_outs[0] - def _integrate_with_spk_embed(self, hs, spembs): + def _integrate_with_spk_embed(self, hs, spk_emb): """Integrate speaker embedding with hidden states. Parameters ---------- hs : Tensor Batch of hidden state sequences (B, Tmax, adim). - spembs : Tensor + spk_emb : Tensor Batch of speaker embeddings (B, spk_embed_dim). 
Returns @@ -589,13 +589,13 @@ class FastSpeech2(nn.Layer): """ if self.spk_embed_integration_type == "add": # apply projection and then add to hidden states - spembs = self.spk_projection(F.normalize(spembs)) - hs = hs + spembs.unsqueeze(1) + spk_emb = self.spk_projection(F.normalize(spk_emb)) + hs = hs + spk_emb.unsqueeze(1) elif self.spk_embed_integration_type == "concat": # concat hidden states with spk embeds and then apply projection - spembs = F.normalize(spembs).unsqueeze(1).expand( + spk_emb = F.normalize(spk_emb).unsqueeze(1).expand( shape=[-1, hs.shape[1], -1]) - hs = self.spk_projection(paddle.concat([hs, spembs], axis=-1)) + hs = self.spk_projection(paddle.concat([hs, spk_emb], axis=-1)) else: raise NotImplementedError("support only add or concat.") @@ -680,9 +680,9 @@ class FastSpeech2Inference(nn.Layer): self.normalizer = normalizer self.acoustic_model = model - def forward(self, text, spk_id=None, spembs=None): + def forward(self, text, spk_id=None, spk_emb=None): normalized_mel, d_outs, p_outs, e_outs = self.acoustic_model.inference( - text, spk_id=spk_id, spembs=spembs) + text, spk_id=spk_id, spk_emb=spk_emb) logmel = self.normalizer.inverse(normalized_mel) return logmel diff --git a/paddlespeech/t2s/models/fastspeech2/fastspeech2_updater.py b/paddlespeech/t2s/models/fastspeech2/fastspeech2_updater.py index cf717d8d6..0dabf934c 100644 --- a/paddlespeech/t2s/models/fastspeech2/fastspeech2_updater.py +++ b/paddlespeech/t2s/models/fastspeech2/fastspeech2_updater.py @@ -54,9 +54,9 @@ class FastSpeech2Updater(StandardUpdater): losses_dict = {} # spk_id!=None in multiple spk fastspeech2 spk_id = batch["spk_id"] if "spk_id" in batch else None - spembs = batch["spembs"] if "spembs" in batch else None + spk_emb = batch["spk_emb"] if "spk_emb" in batch else None # No explicit speaker identifier labels are used during voice cloning training. - if spembs is not None: + if spk_emb is not None: spk_id = None before_outs, after_outs, d_outs, p_outs, e_outs, ys, olens = self.model( @@ -68,7 +68,7 @@ class FastSpeech2Updater(StandardUpdater): pitch=batch["pitch"], energy=batch["energy"], spk_id=spk_id, - spembs=spembs) + spk_emb=spk_emb) l1_loss, duration_loss, pitch_loss, energy_loss = self.criterion( after_outs=after_outs, @@ -131,8 +131,8 @@ class FastSpeech2Evaluator(StandardEvaluator): losses_dict = {} # spk_id!=None in multiple spk fastspeech2 spk_id = batch["spk_id"] if "spk_id" in batch else None - spembs = batch["spembs"] if "spembs" in batch else None - if spembs is not None: + spk_emb = batch["spk_emb"] if "spk_emb" in batch else None + if spk_emb is not None: spk_id = None before_outs, after_outs, d_outs, p_outs, e_outs, ys, olens = self.model( @@ -144,7 +144,7 @@ class FastSpeech2Evaluator(StandardEvaluator): pitch=batch["pitch"], energy=batch["energy"], spk_id=spk_id, - spembs=spembs) + spk_emb=spk_emb) l1_loss, duration_loss, pitch_loss, energy_loss = self.criterion( after_outs=after_outs, diff --git a/paddlespeech/t2s/models/transformer_tts/transformer_tts.py b/paddlespeech/t2s/models/transformer_tts/transformer_tts.py index 97233c766..5958a1660 100644 --- a/paddlespeech/t2s/models/transformer_tts/transformer_tts.py +++ b/paddlespeech/t2s/models/transformer_tts/transformer_tts.py @@ -391,7 +391,7 @@ class TransformerTTS(nn.Layer): text_lengths: paddle.Tensor, speech: paddle.Tensor, speech_lengths: paddle.Tensor, - spembs: paddle.Tensor=None, + spk_emb: paddle.Tensor=None, ) -> Tuple[paddle.Tensor, Dict[str, paddle.Tensor], paddle.Tensor]: """Calculate forward propagation. 
@@ -405,7 +405,7 @@ class TransformerTTS(nn.Layer):
             Batch of padded target features (B, Lmax, odim).
         speech_lengths : Tensor(int64)
             Batch of the lengths of each target (B,).
-        spembs : Tensor, optional
+        spk_emb : Tensor, optional
             Batch of speaker embeddings (B, spk_embed_dim).
 
         Returns
@@ -439,7 +439,7 @@
 
         # calculate transformer outputs
         after_outs, before_outs, logits = self._forward(xs, ilens, ys, olens,
-                                                        spembs)
+                                                        spk_emb)
 
         # modify mod part of groundtruth
 
@@ -467,7 +467,7 @@
         ilens: paddle.Tensor,
         ys: paddle.Tensor,
         olens: paddle.Tensor,
-        spembs: paddle.Tensor,
+        spk_emb: paddle.Tensor,
     ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]:
         # forward encoder
         x_masks = self._source_mask(ilens)
@@ -480,7 +480,7 @@
 
         # integrate speaker embedding
         if self.spk_embed_dim is not None:
-            hs = self._integrate_with_spk_embed(hs, spembs)
+            hs = self._integrate_with_spk_embed(hs, spk_emb)
 
         # thin out frames for reduction factor (B, Lmax, odim) -> (B, Lmax//r, odim)
         if self.reduction_factor > 1:
@@ -514,7 +514,7 @@
         self,
         text: paddle.Tensor,
         speech: paddle.Tensor=None,
-        spembs: paddle.Tensor=None,
+        spk_emb: paddle.Tensor=None,
         threshold: float=0.5,
         minlenratio: float=0.0,
         maxlenratio: float=10.0,
@@ -528,7 +528,7 @@
             Input sequence of characters (T,).
         speech : Tensor, optional
             Feature sequence to extract style (N, idim).
-        spembs : Tensor, optional
+        spk_emb : Tensor, optional
             Speaker embedding vector (spk_embed_dim,).
         threshold : float, optional
             Threshold in inference.
@@ -551,7 +551,6 @@
         """
         # input of embedding must be int64
         y = speech
-        spemb = spembs
 
         # add eos at the last of sequence
         text = numpy.pad(
@@ -564,12 +563,12 @@
 
             # get teacher forcing outputs
             xs, ys = x.unsqueeze(0), y.unsqueeze(0)
-            spembs = None if spemb is None else spemb.unsqueeze(0)
+            spk_emb = None if spk_emb is None else spk_emb.unsqueeze(0)
             ilens = paddle.to_tensor(
                 [xs.shape[1]], dtype=paddle.int64, place=xs.place)
             olens = paddle.to_tensor(
                 [ys.shape[1]], dtype=paddle.int64, place=ys.place)
-            outs, *_ = self._forward(xs, ilens, ys, olens, spembs)
+            outs, *_ = self._forward(xs, ilens, ys, olens, spk_emb)
 
             # get attention weights
             att_ws = []
@@ -590,9 +589,9 @@
             hs = hs + style_embs.unsqueeze(1)
 
         # integrate speaker embedding
-        if self.spk_embed_dim is not None:
-            spembs = spemb.unsqueeze(0)
-            hs = self._integrate_with_spk_embed(hs, spembs)
+        if spk_emb is not None:
+            spk_emb = spk_emb.unsqueeze(0)
+            hs = self._integrate_with_spk_embed(hs, spk_emb)
 
         # set limits of length
         maxlen = int(hs.shape[1] * maxlenratio / self.reduction_factor)
@@ -726,14 +725,14 @@
 
     def _integrate_with_spk_embed(self,
                                   hs: paddle.Tensor,
-                                  spembs: paddle.Tensor) -> paddle.Tensor:
+                                  spk_emb: paddle.Tensor) -> paddle.Tensor:
         """Integrate speaker embedding with hidden states.
 
         Parameters
         ----------
         hs : Tensor
             Batch of hidden state sequences (B, Tmax, adim).
-        spembs : Tensor
+        spk_emb : Tensor
             Batch of speaker embeddings (B, spk_embed_dim).
 
         Returns
         ----------
         Tensor
             Batch of integrated hidden state sequences (B, Tmax, adim).
 
         """
         if self.spk_embed_integration_type == "add":
             # apply projection and then add to hidden states
-            spembs = self.projection(F.normalize(spembs))
-            hs = hs + spembs.unsqueeze(1)
+            spk_emb = self.projection(F.normalize(spk_emb))
+            hs = hs + spk_emb.unsqueeze(1)
         elif self.spk_embed_integration_type == "concat":
             # concat hidden states with spk embeds and then apply projection
-            spembs = F.normalize(spembs).unsqueeze(1).expand(-1, hs.shape[1],
-                                                             -1)
-            hs = self.projection(paddle.concat([hs, spembs], axis=-1))
+            spk_emb = F.normalize(spk_emb).unsqueeze(1).expand(
+                shape=[-1, hs.shape[1], -1])
+            hs = self.projection(paddle.concat([hs, spk_emb], axis=-1))
         else:
             raise NotImplementedError("support only add or concat.")
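
Context on the plumbing this rename touches: a datum may carry either a
pre-extracted spk_emb (e.g. produced by the GE2E speaker encoder in the
voice cloning recipe) or a spk_id index into a learned embedding table, and
spk_emb takes priority when both are present. Below is a minimal numpy
sketch of the two _integrate_with_spk_embed modes ("add" and "concat"); the
shapes and random stand-in projection weights are illustrative assumptions,
not the model's actual parameters.

    # Minimal numpy sketch of the "add"/"concat" speaker-embedding
    # integration (illustrative shapes and weights only).
    import numpy as np

    B, Tmax, adim, spk_embed_dim = 2, 5, 8, 4
    hs = np.random.rand(B, Tmax, adim).astype(np.float32)          # hidden states
    spk_emb = np.random.rand(B, spk_embed_dim).astype(np.float32)  # speaker embeddings

    def l2_normalize(x, axis=-1, eps=1e-12):
        return x / np.maximum(np.linalg.norm(x, axis=axis, keepdims=True), eps)

    # "add": project spk_emb to adim, then add it to every frame
    W_add = np.random.rand(spk_embed_dim, adim).astype(np.float32)  # stand-in projection
    hs_add = hs + (l2_normalize(spk_emb) @ W_add)[:, None, :]       # (B, Tmax, adim)

    # "concat": tile spk_emb over time, concatenate, project back to adim
    W_cat = np.random.rand(adim + spk_embed_dim, adim).astype(np.float32)
    tiled = np.broadcast_to(
        l2_normalize(spk_emb)[:, None, :], (B, Tmax, spk_embed_dim))
    hs_cat = np.concatenate([hs, tiled], axis=-1) @ W_cat           # (B, Tmax, adim)

    assert hs_add.shape == hs_cat.shape == (B, Tmax, adim)

Either way the hidden states keep their (B, Tmax, adim) shape, which is why
spk_embed_integration_type can be swapped without touching the rest of the
network.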