diff --git a/paddlespeech/t2s/models/fastspeech2/fastspeech2.py b/paddlespeech/t2s/models/fastspeech2/fastspeech2.py
index 1db7973b8..a87db3e8e 100644
--- a/paddlespeech/t2s/models/fastspeech2/fastspeech2.py
+++ b/paddlespeech/t2s/models/fastspeech2/fastspeech2.py
@@ -735,6 +735,118 @@ class FastSpeech2(nn.Layer):
             tone_id=tone_id)
         return hs
 
+    def _inference(self,
+                   xs: paddle.Tensor,
+                   ilens: paddle.Tensor,
+                   olens: paddle.Tensor=None,
+                   ds: paddle.Tensor=None,
+                   ps: paddle.Tensor=None,
+                   es: paddle.Tensor=None,
+                   is_inference: bool=False,
+                   return_after_enc=False,
+                   alpha: float=1.0,
+                   spk_emb=None,
+                   spk_id=None,
+                   tone_id=None) -> Sequence[paddle.Tensor]:
+        # forward encoder
+        x_masks = self._source_mask(ilens)
+        # (B, Tmax, adim)
+        hs, _ = self.encoder(xs, x_masks)
+
+        # integrate speaker embedding
+        if self.spk_embed_dim is not None:
+            # spk_emb has a higher priority than spk_id
+            if spk_emb is not None:
+                hs = self._integrate_with_spk_embed(hs, spk_emb)
+            elif spk_id is not None:
+                spk_emb = self.spk_embedding_table(spk_id)
+                hs = self._integrate_with_spk_embed(hs, spk_emb)
+
+        # integrate tone embedding
+        if self.tone_embed_dim is not None:
+            if tone_id is not None:
+                tone_embs = self.tone_embedding_table(tone_id)
+                hs = self._integrate_with_tone_embed(hs, tone_embs)
+        # forward duration predictor and variance predictors
+        d_masks = make_pad_mask(ilens)
+
+        if self.stop_gradient_from_pitch_predictor:
+            p_outs = self.pitch_predictor(hs.detach(), d_masks.unsqueeze(-1))
+        else:
+            p_outs = self.pitch_predictor(hs, d_masks.unsqueeze(-1))
+        if self.stop_gradient_from_energy_predictor:
+            e_outs = self.energy_predictor(hs.detach(), d_masks.unsqueeze(-1))
+        else:
+            e_outs = self.energy_predictor(hs, d_masks.unsqueeze(-1))
+
+        if is_inference:
+            # (B, Tmax)
+            if ds is not None:
+                d_outs = ds
+            else:
+                d_outs = self.duration_predictor.inference(hs, d_masks)
+            if ps is not None:
+                p_outs = ps
+            if es is not None:
+                e_outs = es
+
+            # use prediction in inference
+            # (B, Tmax, 1)
+
+            p_embs = self.pitch_embed(p_outs.transpose((0, 2, 1))).transpose(
+                (0, 2, 1))
+            e_embs = self.energy_embed(e_outs.transpose((0, 2, 1))).transpose(
+                (0, 2, 1))
+            hs = hs + e_embs + p_embs
+
+            # (B, Lmax, adim)
+            hs = self.length_regulator(hs, d_outs, alpha, is_inference=True)
+        else:
+            d_outs = self.duration_predictor(hs, d_masks)
+            # use groundtruth in training
+            p_embs = self.pitch_embed(ps.transpose((0, 2, 1))).transpose(
+                (0, 2, 1))
+            e_embs = self.energy_embed(es.transpose((0, 2, 1))).transpose(
+                (0, 2, 1))
+            hs = hs + e_embs + p_embs
+
+            # (B, Lmax, adim)
+            hs = self.length_regulator(hs, ds, is_inference=False)
+
+        # forward decoder
+        if olens is not None and not is_inference:
+            if self.reduction_factor > 1:
+                olens_in = paddle.to_tensor(
+                    [olen // self.reduction_factor for olen in olens.numpy()])
+            else:
+                olens_in = olens
+            # (B, 1, T)
+            h_masks = self._source_mask(olens_in)
+        else:
+            h_masks = None
+        if return_after_enc:
+            return hs, h_masks
+
+        if self.decoder_type == 'cnndecoder':
+            # remove output masks for dygraph to static graph
+            zs = self.decoder(hs, h_masks)
+            before_outs = zs
+        else:
+            # (B, Lmax, adim)
+            zs, _ = self.decoder(hs, h_masks)
+            # (B, Lmax, odim)
+            before_outs = self.feat_out(zs).reshape(
+                (paddle.shape(zs)[0], -1, self.odim))
+
+        # postnet -> (B, Lmax//r * r, odim)
+        if self.postnet is None:
+            after_outs = before_outs
+        else:
+            after_outs = before_outs + self.postnet(
+                before_outs.transpose((0, 2, 1))).transpose((0, 2, 1))
+
+        return before_outs, after_outs, d_outs, p_outs, e_outs
+
     def inference(
             self,
             text: paddle.Tensor,
@@ -794,7 +906,7 @@ class FastSpeech2(nn.Layer):
             es = e.unsqueeze(0) if e is not None else None
 
             # (1, L, odim)
-            _, outs, d_outs, p_outs, e_outs = self._forward(
+            _, outs, d_outs, p_outs, e_outs = self._inference(
                 xs,
                 ilens,
                 ds=ds,
@@ -806,7 +918,7 @@ class FastSpeech2(nn.Layer):
                 is_inference=True)
         else:
             # (1, L, odim)
-            _, outs, d_outs, p_outs, e_outs = self._forward(
+            _, outs, d_outs, p_outs, e_outs = self._inference(
                 xs,
                 ilens,
                 is_inference=True,
@@ -814,6 +926,8 @@ class FastSpeech2(nn.Layer):
                 spk_emb=spk_emb,
                 spk_id=spk_id,
                 tone_id=tone_id)
+
+        return outs[0], d_outs[0], p_outs[0], e_outs[0]
 
     def _integrate_with_spk_embed(self, hs, spk_emb):
@@ -1135,9 +1249,7 @@ class FastSpeech2Loss(nn.Layer):
 
         """
         speaker_loss = 0.0
-
-        import pdb;pdb.set_trace()
-
+
         # apply mask to remove padded part
         if self.use_masking:
             out_masks = make_non_pad_mask(olens).unsqueeze(-1)
@@ -1165,8 +1277,7 @@ class FastSpeech2Loss(nn.Layer):
             mask_index = spk_logits.abs().sum(axis=1)!=0
             spk_ids = spk_ids[mask_index]
             spk_logits = spk_logits[mask_index]
-            speaker_loss = self.ce_criterion(spk_logits, spk_ids)/spk_logits.shape[0]
-            print("sssssssssssssssssss")
+
 
         # calculate loss
         l1_loss = self.l1_criterion(before_outs, ys)
@@ -1175,6 +1286,9 @@ class FastSpeech2Loss(nn.Layer):
         duration_loss = self.duration_criterion(d_outs, ds)
         pitch_loss = self.mse_criterion(p_outs, ps)
         energy_loss = self.mse_criterion(e_outs, es)
+
+        if spk_logits is not None and spk_ids is not None:
+            speaker_loss = self.ce_criterion(spk_logits, spk_ids)/spk_logits.shape[0]
 
         # if spk_logits is None or spk_ids is None:
         #     speaker_loss = 0.0
diff --git a/paddlespeech/t2s/modules/multi_speakers/__init__.py b/paddlespeech/t2s/modules/multi_speakers/__init__.py
new file mode 100644
index 000000000..abf198b97
--- /dev/null
+++ b/paddlespeech/t2s/modules/multi_speakers/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
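
Reviewer note: a minimal sketch of exercising the new inference path end to end. Only `FastSpeech2.inference` and its four return values come from the diff above; the tiny idim/odim config, the reliance on constructor defaults for everything else, and the phone-id input are assumptions for illustration (an untrained model will of course produce meaningless mels).

    import paddle
    from paddlespeech.t2s.models.fastspeech2 import FastSpeech2

    # Hypothetical config: 10-symbol vocab, 80-dim mels; all other
    # hyperparameters are assumed to have usable constructor defaults.
    model = FastSpeech2(idim=10, odim=80)
    model.eval()

    # Made-up phone-id sequence (ids must be < idim).
    phone_ids = paddle.to_tensor([2, 4, 6, 8], dtype='int64')

    with paddle.no_grad():
        # inference() now routes through the dedicated _inference() helper
        # (is_inference=True) instead of the shared _forward(), and the
        # newly added return yields four per-utterance tensors.
        mel, durations, pitch, energy = model.inference(phone_ids)

    print(mel.shape)  # (Lmax, odim): mel frames after the postnet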
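
Reviewer note: the speaker-loss rework in FastSpeech2Loss masks out utterances whose logits are entirely zero, then normalizes the cross-entropy by the number of kept rows. Below is a standalone toy reproduction of that computation; the random logits, the ids, and the `reduction='sum'` criterion are assumptions (a sum-reduced criterion is what makes the division by `spk_logits.shape[0]` a per-utterance average).

    import paddle
    import paddle.nn as nn

    # Toy batch: 4 utterances, 10 speakers; an all-zero row marks an
    # utterance with no speaker prediction, mirroring the diff's mask.
    spk_logits = paddle.randn([4, 10])
    spk_logits[2] = 0.0  # pretend this utterance produced no logits
    spk_ids = paddle.to_tensor([1, 3, 0, 7])

    # keep only rows whose logits are not entirely zero
    mask_index = spk_logits.abs().sum(axis=1) != 0
    spk_ids = spk_ids[mask_index]
    spk_logits = spk_logits[mask_index]

    # cross-entropy over the kept rows, averaged over their count
    ce = nn.CrossEntropyLoss(reduction='sum')
    speaker_loss = ce(spk_logits, spk_ids) / spk_logits.shape[0]
    print(float(speaker_loss))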