update, test=tts

pull/2588/head
liangym 3 years ago
parent 1c5471e4b0
commit d54d94deb5

@@ -116,7 +116,7 @@ optional arguments:
 5. `--phones-dict` is the path of the phone vocabulary file.
 6. `--speaker-dict` is the path of the speaker id map file when training a multi-speaker FastSpeech2.
-We have **added module speaker classifier** with reference to [Learning to Speak Fluently in a Foreign Language: Multilingual Speech Synthesis and Cross-Language Voice Cloning](https://arxiv.org/pdf/1907.04448.pdf). The main parameter configuration: config["model"]["enable_speaker_classifier"], config["model"]["hidden_sc_dim"] and config["updater"]["spk_loss_scale"] in `conf/default.yaml`. The current experimental results show that this module can decouple text information and speaker information, and more experiments are still being sorted out. This module is currently not enabled by default, if you are interested, you can try it yourself.
+We have **added a speaker classifier module** following [Learning to Speak Fluently in a Foreign Language: Multilingual Speech Synthesis and Cross-Language Voice Cloning](https://arxiv.org/pdf/1907.04448.pdf). The main parameters are `config["model"]["enable_speaker_classifier"]`, `config["model"]["hidden_sc_dim"]`, and `config["updater"]["spk_loss_scale"]` in `conf/default.yaml`. Current experimental results show that this module can decouple text information from speaker information; more experiments are still being sorted out. The module is disabled by default; if you are interested, you can try it yourself.
 ### Synthesizing
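A minimal sketch of reading the three options named in the README paragraph above; the keys come from that paragraph, while the fallback values shown here are illustrative assumptions, not the repository's actual defaults:

```python
# Hypothetical sketch: load the speaker-classifier options from conf/default.yaml.
# Keys mirror the README text above; the fallback values are assumptions.
import yaml

with open("conf/default.yaml") as f:
    config = yaml.safe_load(f)

enable_spk_cls = config["model"].get("enable_speaker_classifier", False)  # off by default
hidden_sc_dim = config["model"].get("hidden_sc_dim", 256)        # classifier hidden size (illustrative)
spk_loss_scale = config["updater"].get("spk_loss_scale", 0.02)   # weight on the speaker loss (illustrative)
```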

@@ -155,8 +155,8 @@ def train_sp(args, config):
         optimizer=optimizer,
         dataloader=train_dataloader,
         output_dir=output_dir,
-        **config["updater"],
-        enable_spk_cls=enable_spk_cls)
+        enable_spk_cls=enable_spk_cls,
+        **config["updater"], )

     trainer = Trainer(updater, (config.max_epoch, 'epoch'), output_dir)
@@ -164,8 +164,8 @@ def train_sp(args, config):
         model,
         dev_dataloader,
         output_dir=output_dir,
-        **config["updater"],
-        enable_spk_cls=enable_spk_cls)
+        enable_spk_cls=enable_spk_cls,
+        **config["updater"], )
     if dist.get_rank() == 0:
         trainer.extend(evaluator, trigger=(1, "epoch"))
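Both call-site changes above only reorder arguments: since Python 3.5 (PEP 448), an explicit keyword argument may appear on either side of a `**` unpacking in a call, so the reorder is stylistic rather than behavioral. A minimal illustration:

```python
def f(**kwargs):
    return kwargs

d = {"a": 1}
print(f(b=2, **d))  # {'b': 2, 'a': 1}
print(f(**d, b=2))  # {'a': 1, 'b': 2}; both orders are valid calls
```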

@@ -109,7 +109,8 @@ class FastSpeech2Updater(StandardUpdater):
             spk_logits=spk_logits,
             spk_ids=spk_id, )

-        loss = l1_loss + duration_loss + pitch_loss + energy_loss + self.spk_loss_scale * speaker_loss
+        scaled_speaker_loss = self.spk_loss_scale * speaker_loss
+        loss = l1_loss + duration_loss + pitch_loss + energy_loss + scaled_speaker_loss

         optimizer = self.optimizer
         optimizer.clear_grad()
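The pattern here, repeated in the evaluator hunks below, is to compute the scaled speaker term once and reuse it, so the value added into the total loss and the value that gets logged can never drift apart. A self-contained sketch of the pattern, with placeholder scalars standing in for the real tensors:

```python
# Placeholder values; in the trainer these are Paddle tensors.
l1_loss, duration_loss, pitch_loss, energy_loss = 0.5, 0.1, 0.2, 0.2
speaker_loss, spk_loss_scale = 1.0, 0.02

scaled_speaker_loss = spk_loss_scale * speaker_loss   # computed exactly once
loss = l1_loss + duration_loss + pitch_loss + energy_loss + scaled_speaker_loss

# Logging reuses the same value instead of recomputing the product,
# so the logged number always matches the term inside the total loss.
losses_dict = {"scaled_speaker_loss": float(scaled_speaker_loss), "loss": float(loss)}
print(losses_dict)  # {'scaled_speaker_loss': 0.02, 'loss': 1.02}
```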
@@ -123,8 +124,7 @@ class FastSpeech2Updater(StandardUpdater):
         report("train/energy_loss", float(energy_loss))
         if self.enable_spk_cls:
             report("train/speaker_loss", float(speaker_loss))
-            report("train/scale_speaker_loss",
-                   float(self.spk_loss_scale * speaker_loss))
+            report("train/scaled_speaker_loss", float(scaled_speaker_loss))

         losses_dict["l1_loss"] = float(l1_loss)
         losses_dict["duration_loss"] = float(duration_loss)
@@ -133,8 +133,7 @@ class FastSpeech2Updater(StandardUpdater):
         losses_dict["energy_loss"] = float(energy_loss)
         if self.enable_spk_cls:
             losses_dict["speaker_loss"] = float(speaker_loss)
-            losses_dict["scale_speaker_loss"] = float(self.spk_loss_scale *
-                                                      speaker_loss)
+            losses_dict["scaled_speaker_loss"] = float(scaled_speaker_loss)
         losses_dict["loss"] = float(loss)
         self.msg += ', '.join('{}: {:>.6f}'.format(k, v)
                               for k, v in losses_dict.items())
@@ -211,7 +210,9 @@ class FastSpeech2Evaluator(StandardEvaluator):
             olens=olens,
             spk_logits=spk_logits,
             spk_ids=spk_id, )
-        loss = l1_loss + duration_loss + pitch_loss + energy_loss + self.spk_loss_scale * speaker_loss
+
+        scaled_speaker_loss = self.spk_loss_scale * speaker_loss
+        loss = l1_loss + duration_loss + pitch_loss + energy_loss + scaled_speaker_loss

         report("eval/loss", float(loss))
         report("eval/l1_loss", float(l1_loss))
@@ -220,8 +221,7 @@ class FastSpeech2Evaluator(StandardEvaluator):
         report("eval/energy_loss", float(energy_loss))
         if self.enable_spk_cls:
             report("train/speaker_loss", float(speaker_loss))
-            report("train/scale_speaker_loss",
-                   float(self.spk_loss_scale * speaker_loss))
+            report("train/scaled_speaker_loss", float(scaled_speaker_loss))

         losses_dict["l1_loss"] = float(l1_loss)
         losses_dict["duration_loss"] = float(duration_loss)
@@ -229,8 +229,7 @@ class FastSpeech2Evaluator(StandardEvaluator):
         losses_dict["energy_loss"] = float(energy_loss)
         if self.enable_spk_cls:
             losses_dict["speaker_loss"] = float(speaker_loss)
-            losses_dict["scale_speaker_loss"] = float(self.spk_loss_scale *
-                                                      speaker_loss)
+            losses_dict["scaled_speaker_loss"] = float(scaled_speaker_loss)
         losses_dict["loss"] = float(loss)
         self.msg += ', '.join('{}: {:>.6f}'.format(k, v)
                               for k, v in losses_dict.items())

@@ -37,7 +37,6 @@ class GradientReversalFunction(PyLayer):
         """
         lambda_, = ctx.saved_tensor()
         dx = -lambda_ * grads
-        #return dx
         return paddle.clip(dx, min=-0.5, max=0.5)
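For context on the hunk above: the class it patches is a gradient reversal layer in the domain-adversarial style of the referenced paper, identity on the forward pass and sign-flipped gradient on the backward pass. A hedged sketch of the full layer under that assumption; only the `backward` body is confirmed by the diff, the rest is illustrative:

```python
import paddle
from paddle.autograd import PyLayer

class GradientReversalFunction(PyLayer):
    """Identity forward; reversed (and clipped) gradient backward."""

    @staticmethod
    def forward(ctx, x, lambda_=1.0):
        # Stash the reversal strength for the backward pass (sketch).
        ctx.save_for_backward(paddle.to_tensor(lambda_))
        return x.clone()  # identity on the forward pass

    @staticmethod
    def backward(ctx, grads):
        lambda_, = ctx.saved_tensor()
        dx = -lambda_ * grads  # flip the gradient sign
        # The commit above replaces a bare `return dx` with a clipped version,
        # bounding the reversed gradient to [-0.5, 0.5]:
        return paddle.clip(dx, min=-0.5, max=0.5)
```

Placed between the text encoder and the speaker classifier, this lets the classifier learn to predict the speaker while the encoder receives the negated gradient and is pushed toward speaker-invariant text representations, which is the decoupling the README paragraph describes.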
