@@ -42,7 +42,9 @@ class FastSpeech2MIDI(FastSpeech2):
             # note emb
             note_num: int=300,
             # is_slur emb
-            is_slur_num: int=2, ):
+            is_slur_num: int=2,
+            use_energy_pred: bool=False,
+            use_postnet: bool=False, ):
         """Initialize FastSpeech2 module for svs.
         Args:
             fastspeech2_params (Dict):
@@ -57,6 +59,10 @@ class FastSpeech2MIDI(FastSpeech2):
         """
         assert check_argument_types()
         super().__init__(idim=idim, odim=odim, **fastspeech2_params)
+        self.use_energy_pred = use_energy_pred
+        self.use_postnet = use_postnet
+        if not self.use_postnet:
+            self.postnet = None
 
         self.note_embed_dim = self.is_slur_embed_dim = fastspeech2_params[
             "adim"]
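Note on the constructor hunk above: a minimal, runnable sketch of what the two new switches do at init time. `DummyModel` is a stand-in used only so the snippet runs without a full `fastspeech2_params` config; it is not part of this patch.

```python
# Stand-in for FastSpeech2MIDI: only the attribute handling added by this patch.
class DummyModel:
    def __init__(self, use_energy_pred: bool=False, use_postnet: bool=False):
        self.use_energy_pred = use_energy_pred
        self.use_postnet = use_postnet
        self.postnet = object()  # pretend the base class already built a postnet
        if not self.use_postnet:
            # mirrors the patch: the postnet is dropped entirely
            self.postnet = None


model = DummyModel()
assert model.postnet is None
assert model.use_energy_pred is False
```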
@@ -214,12 +220,14 @@ class FastSpeech2MIDI(FastSpeech2):
         if is_train_diffusion:
             hs = self.length_regulator(hs, ds, is_inference=False)
             p_outs = self.pitch_predictor(hs.detach(), pitch_masks)
-            e_outs = self.energy_predictor(hs.detach(), pitch_masks)
             p_embs = self.pitch_embed(p_outs.transpose((0, 2, 1))).transpose(
                 (0, 2, 1))
-            e_embs = self.energy_embed(e_outs.transpose((0, 2, 1))).transpose(
-                (0, 2, 1))
-            hs = hs + p_embs + e_embs
+            hs += p_embs
+            if self.use_energy_pred:
+                e_outs = self.energy_predictor(hs.detach(), pitch_masks)
+                e_embs = self.energy_embed(
+                    e_outs.transpose((0, 2, 1))).transpose((0, 2, 1))
+                hs += e_embs
 
         elif is_inference:
             # (B, Tmax)
@@ -238,7 +246,11 @@ class FastSpeech2MIDI(FastSpeech2):
                     p_outs = self.pitch_predictor(hs.detach(), pitch_masks)
                 else:
                     p_outs = self.pitch_predictor(hs, pitch_masks)
+            p_embs = self.pitch_embed(p_outs.transpose((0, 2, 1))).transpose(
+                (0, 2, 1))
+            hs += p_embs
 
+            if self.use_energy_pred:
                 if es is not None:
                     e_outs = es
                 else:
@@ -246,12 +258,9 @@ class FastSpeech2MIDI(FastSpeech2):
                         e_outs = self.energy_predictor(hs.detach(), pitch_masks)
                     else:
                         e_outs = self.energy_predictor(hs, pitch_masks)
-
-            p_embs = self.pitch_embed(p_outs.transpose((0, 2, 1))).transpose(
-                (0, 2, 1))
-            e_embs = self.energy_embed(e_outs.transpose((0, 2, 1))).transpose(
-                (0, 2, 1))
-            hs = hs + p_embs + e_embs
+                e_embs = self.energy_embed(
+                    e_outs.transpose((0, 2, 1))).transpose((0, 2, 1))
+                hs += e_embs
 
         # training
         else:
@@ -262,15 +271,18 @@ class FastSpeech2MIDI(FastSpeech2):
                 p_outs = self.pitch_predictor(hs.detach(), pitch_masks)
             else:
                 p_outs = self.pitch_predictor(hs, pitch_masks)
+            p_embs = self.pitch_embed(ps.transpose((0, 2, 1))).transpose(
+                (0, 2, 1))
+            hs += p_embs
+
+            if self.use_energy_pred:
                 if self.stop_gradient_from_energy_predictor:
                     e_outs = self.energy_predictor(hs.detach(), pitch_masks)
                 else:
                     e_outs = self.energy_predictor(hs, pitch_masks)
-            p_embs = self.pitch_embed(ps.transpose((0, 2, 1))).transpose(
-                (0, 2, 1))
                 e_embs = self.energy_embed(es.transpose((0, 2, 1))).transpose(
                     (0, 2, 1))
-            hs = hs + p_embs + e_embs
+                hs += e_embs
 
         # forward decoder
         if olens is not None and not is_inference:
@@ -304,7 +316,6 @@ class FastSpeech2MIDI(FastSpeech2):
         else:
-            after_outs = before_outs + self.postnet(
-                before_outs.transpose((0, 2, 1))).transpose((0, 2, 1))
+            after_outs = before_outs
 
         return before_outs, after_outs, d_outs, p_outs, e_outs, spk_logits
@@ -475,6 +486,9 @@ class FastSpeech2MIDI(FastSpeech2):
             spk_emb=spk_emb,
             spk_id=spk_id, )
 
+        if e_outs is None:
+            e_outs = [None]
+
         return outs[0], d_outs[0], p_outs[0], e_outs[0]
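The guard added above matters because, with `use_energy_pred=False`, no energy is predicted and `e_outs` presumably arrives here as `None`, so the final `e_outs[0]` would otherwise fail. A minimal sketch of the failure mode and the fix (plain Python, no Paddle required):

```python
# e_outs as it presumably arrives when use_energy_pred=False
e_outs = None
if e_outs is None:
    e_outs = [None]
# without the guard, e_outs[0] would raise:
#   TypeError: 'NoneType' object is not subscriptable
print(e_outs[0])  # prints: None
```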
@@ -552,14 +566,14 @@ class FastSpeech2MIDILoss(FastSpeech2Loss):
             # make feature for ssim loss
             out_pad_masks = make_pad_mask(olens).unsqueeze(-1)
             before_outs_ssim = masked_fill(before_outs, out_pad_masks, 0.0)
-            if after_outs is not None:
+            if not paddle.equal_all(after_outs, before_outs):
                 after_outs_ssim = masked_fill(after_outs, out_pad_masks, 0.0)
             ys_ssim = masked_fill(ys, out_pad_masks, 0.0)
 
             out_masks = make_non_pad_mask(olens).unsqueeze(-1)
             before_outs = before_outs.masked_select(
                 out_masks.broadcast_to(before_outs.shape))
-            if after_outs is not None:
+            if not paddle.equal_all(after_outs, before_outs):
                 after_outs = after_outs.masked_select(
                     out_masks.broadcast_to(after_outs.shape))
             ys = ys.masked_select(out_masks.broadcast_to(ys.shape))
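Hedged note on the new `paddle.equal_all` checks: with `use_postnet=False` the model now returns an `after_outs` that is literally equal to `before_outs`, so the old `after_outs is not None` test would double-count the same prediction in the L1/SSIM terms. A small sketch of the check in isolation (assumes PaddlePaddle is installed; the tensor shape is an arbitrary placeholder):

```python
import paddle

before_outs = paddle.ones([2, 5, 80])  # arbitrary placeholder prediction
after_outs = before_outs               # what forward returns without a postnet

# paddle.equal_all gives a single boolean: True when the postnet added nothing.
if not paddle.equal_all(after_outs, before_outs):
    print("postnet refined the output: add the second L1/SSIM term")
else:
    print("after_outs == before_outs: skip the duplicate term")
```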
@@ -570,9 +584,10 @@ class FastSpeech2MIDILoss(FastSpeech2Loss):
             pitch_masks = out_masks
             p_outs = p_outs.masked_select(
                 pitch_masks.broadcast_to(p_outs.shape))
-            ps = ps.masked_select(pitch_masks.broadcast_to(ps.shape))
+            if e_outs is not None:
                 e_outs = e_outs.masked_select(
                     pitch_masks.broadcast_to(e_outs.shape))
+            ps = ps.masked_select(pitch_masks.broadcast_to(ps.shape))
             es = es.masked_select(pitch_masks.broadcast_to(es.shape))
 
             if spk_logits is not None and spk_ids is not None:
@@ -589,7 +604,7 @@ class FastSpeech2MIDILoss(FastSpeech2Loss):
         l1_loss = self.l1_criterion(before_outs, ys)
         ssim_loss = 1.0 - ssim(
             before_outs_ssim.unsqueeze(1), ys_ssim.unsqueeze(1))
-        if after_outs is not None:
+        if not paddle.equal_all(after_outs, before_outs):
             l1_loss += self.l1_criterion(after_outs, ys)
             ssim_loss += (
                 1.0 - ssim(after_outs_ssim.unsqueeze(1), ys_ssim.unsqueeze(1)))
@@ -598,6 +613,7 @@ class FastSpeech2MIDILoss(FastSpeech2Loss):
 
         duration_loss = self.duration_criterion(d_outs, ds)
         pitch_loss = self.l1_criterion(p_outs, ps)
+        if e_outs is not None:
             energy_loss = self.l1_criterion(e_outs, es)
 
         if spk_logits is not None and spk_ids is not None:
@@ -630,6 +646,7 @@ class FastSpeech2MIDILoss(FastSpeech2Loss):
             pitch_loss = pitch_loss.multiply(pitch_weights)
             pitch_loss = pitch_loss.masked_select(
                 pitch_masks.broadcast_to(pitch_loss.shape)).sum()
+            if e_outs is not None:
                 energy_loss = energy_loss.multiply(pitch_weights)
                 energy_loss = energy_loss.masked_select(
                     pitch_masks.broadcast_to(energy_loss.shape)).sum()
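Similarly, the energy term is now computed only when an energy prediction exists. A sketch of the guarded loss with stand-in tensors (assumes PaddlePaddle is installed; `l1_criterion` here is a plain `paddle.nn.L1Loss`, and initialising the skipped term to zero is an illustrative choice, not something this patch prescribes):

```python
import paddle

l1_criterion = paddle.nn.L1Loss()
es = paddle.rand([2, 10, 1])   # ground-truth energy (placeholder values)
e_outs = None                  # no prediction when use_energy_pred=False

energy_loss = paddle.zeros([1])  # illustrative default for the skipped term
if e_outs is not None:           # predictor disabled -> no energy term
    energy_loss = l1_criterion(e_outs, es)
print(float(energy_loss))        # 0.0 here, since the branch was skipped
```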