@@ -735,6 +735,118 @@ class FastSpeech2(nn.Layer):
            tone_id=tone_id)
        return hs

    def _inference(self,
                   xs: paddle.Tensor,
                   ilens: paddle.Tensor,
                   olens: paddle.Tensor=None,
                   ds: paddle.Tensor=None,
                   ps: paddle.Tensor=None,
                   es: paddle.Tensor=None,
                   is_inference: bool=False,
                   return_after_enc=False,
                   alpha: float=1.0,
                   spk_emb=None,
                   spk_id=None,
                   tone_id=None) -> Sequence[paddle.Tensor]:
        # forward encoder
        x_masks = self._source_mask(ilens)
        # (B, Tmax, adim)
        hs, _ = self.encoder(xs, x_masks)

        # integrate speaker embedding
        if self.spk_embed_dim is not None:
            # spk_emb has a higher priority than spk_id
            if spk_emb is not None:
                hs = self._integrate_with_spk_embed(hs, spk_emb)
            elif spk_id is not None:
                spk_emb = self.spk_embedding_table(spk_id)
                hs = self._integrate_with_spk_embed(hs, spk_emb)

        # integrate tone embedding
        if self.tone_embed_dim is not None:
            if tone_id is not None:
                tone_embs = self.tone_embedding_table(tone_id)
                hs = self._integrate_with_tone_embed(hs, tone_embs)
        # forward duration predictor and variance predictors
        d_masks = make_pad_mask(ilens)

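        # detach() blocks the pitch/energy losses from back-propagating into
        # the encoder states when the corresponding stop-gradient flag is set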
        if self.stop_gradient_from_pitch_predictor:
            p_outs = self.pitch_predictor(hs.detach(), d_masks.unsqueeze(-1))
        else:
            p_outs = self.pitch_predictor(hs, d_masks.unsqueeze(-1))
        if self.stop_gradient_from_energy_predictor:
            e_outs = self.energy_predictor(hs.detach(), d_masks.unsqueeze(-1))
        else:
            e_outs = self.energy_predictor(hs, d_masks.unsqueeze(-1))

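        # at inference time, ground-truth ds/ps/es passed by the caller
        # override the corresponding predictor outputs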
        if is_inference:
            # (B, Tmax)
            if ds is not None:
                d_outs = ds
            else:
                d_outs = self.duration_predictor.inference(hs, d_masks)
            if ps is not None:
                p_outs = ps
            if es is not None:
                e_outs = es

            # use prediction in inference
            # (B, Tmax, 1)
            p_embs = self.pitch_embed(p_outs.transpose((0, 2, 1))).transpose(
                (0, 2, 1))
            e_embs = self.energy_embed(e_outs.transpose((0, 2, 1))).transpose(
                (0, 2, 1))
            hs = hs + e_embs + p_embs

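            # alpha is the speed-control factor that scales the predicted
            # durations before length regulation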
            # (B, Lmax, adim)
            hs = self.length_regulator(hs, d_outs, alpha, is_inference=True)
        else:
            d_outs = self.duration_predictor(hs, d_masks)
            # use groundtruth in training
            p_embs = self.pitch_embed(ps.transpose((0, 2, 1))).transpose(
                (0, 2, 1))
            e_embs = self.energy_embed(es.transpose((0, 2, 1))).transpose(
                (0, 2, 1))
            hs = hs + e_embs + p_embs

            # (B, Lmax, adim)
            hs = self.length_regulator(hs, ds, is_inference=False)

        # forward decoder
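        # decoder masks are built from output lengths divided by the
        # reduction factor, since each decoder step covers reduction_factor
        # output frames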
        if olens is not None and not is_inference:
            if self.reduction_factor > 1:
                olens_in = paddle.to_tensor(
                    [olen // self.reduction_factor for olen in olens.numpy()])
            else:
                olens_in = olens
            # (B, 1, T)
            h_masks = self._source_mask(olens_in)
        else:
            h_masks = None
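        # skip the decoder and return the encoder-side states when the
        # caller only needs them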
        if return_after_enc:
            return hs, h_masks

        if self.decoder_type == 'cnndecoder':
            # remove output masks for dygraph to static graph
            zs = self.decoder(hs, h_masks)
            before_outs = zs
        else:
            # (B, Lmax, adim)
            zs, _ = self.decoder(hs, h_masks)
            # (B, Lmax, odim)
            before_outs = self.feat_out(zs).reshape(
                (paddle.shape(zs)[0], -1, self.odim))

        # postnet -> (B, Lmax//r * r, odim)
        if self.postnet is None:
            after_outs = before_outs
        else:
            after_outs = before_outs + self.postnet(
                before_outs.transpose((0, 2, 1))).transpose((0, 2, 1))

        return before_outs, after_outs, d_outs, p_outs, e_outs

    def inference(
            self,
            text: paddle.Tensor,
@@ -794,7 +906,7 @@ class FastSpeech2(nn.Layer):
            es = e.unsqueeze(0) if e is not None else None

            # (1, L, odim)
            _, outs, d_outs, p_outs, e_outs = self._forward(
            _, outs, d_outs, p_outs, e_outs = self._inference(
                xs,
                ilens,
                ds=ds,
@@ -806,7 +918,7 @@
                is_inference=True)
        else:
            # (1, L, odim)
            _, outs, d_outs, p_outs, e_outs = self._forward(
            _, outs, d_outs, p_outs, e_outs = self._inference(
                xs,
                ilens,
                is_inference=True,
@@ -814,6 +926,8 @@
                spk_emb=spk_emb,
                spk_id=spk_id,
                tone_id=tone_id)

        return outs[0], d_outs[0], p_outs[0], e_outs[0]

    def _integrate_with_spk_embed(self, hs, spk_emb):
@@ -1135,9 +1249,7 @@ class FastSpeech2Loss(nn.Layer):
        """
        speaker_loss = 0.0

        import pdb;pdb.set_trace()

        # apply mask to remove padded part
        if self.use_masking:
            out_masks = make_non_pad_mask(olens).unsqueeze(-1)
@@ -1165,8 +1277,7 @@
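            # keep only samples whose speaker logits are not all zero before
            # computing the cross-entropy speaker loss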
            mask_index = spk_logits.abs().sum(axis=1)!=0
            spk_ids = spk_ids[mask_index]
            spk_logits = spk_logits[mask_index]
            speaker_loss = self.ce_criterion(spk_logits, spk_ids)/spk_logits.shape[0]
            print("sssssssssssssssssss")

        # calculate loss
        l1_loss = self.l1_criterion(before_outs, ys)
@@ -1175,6 +1286,9 @@
        duration_loss = self.duration_criterion(d_outs, ds)
        pitch_loss = self.mse_criterion(p_outs, ps)
        energy_loss = self.mse_criterion(e_outs, es)

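        # speaker_loss keeps its 0.0 initialization when speaker logits or
        # speaker ids are not provided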
        if spk_logits is not None and spk_ids is not None:
            speaker_loss = self.ce_criterion(spk_logits, spk_ids)/spk_logits.shape[0]

        # if spk_logits is None or spk_ids is None:
        #     speaker_loss = 0.0