diff --git a/examples/zh_en_tts/tts3/README.md b/examples/zh_en_tts/tts3/README.md index f63d5d8fe..012028007 100644 --- a/examples/zh_en_tts/tts3/README.md +++ b/examples/zh_en_tts/tts3/README.md @@ -116,7 +116,7 @@ optional arguments: 5. `--phones-dict` is the path of the phone vocabulary file. 6. `--speaker-dict` is the path of the speaker id map file when training a multi-speaker FastSpeech2. -We have **added module speaker classifier** with reference to [Learning to Speak Fluently in a Foreign Language: Multilingual Speech Synthesis and Cross-Language Voice Cloning](https://arxiv.org/pdf/1907.04448.pdf). The main parameter configuration: config["model"]["enable_speaker_classifier"], config["model"]["hidden_sc_dim"] and config["updater"]["spk_loss_scale"] in `conf/default.yaml`. The current experimental results show that this module can decouple text information and speaker information, and more experiments are still being sorted out. This module is currently not enabled by default, if you are interested, you can try it yourself. +We have **added a speaker classifier module** with reference to [Learning to Speak Fluently in a Foreign Language: Multilingual Speech Synthesis and Cross-Language Voice Cloning](https://arxiv.org/pdf/1907.04448.pdf). The main parameter configuration: `config["model"]["enable_speaker_classifier"]`, `config["model"]["hidden_sc_dim"]` and `config["updater"]["spk_loss_scale"]` in `conf/default.yaml`. The current experimental results show that this module can decouple text information and speaker information, and more experiments are still being sorted out. This module is currently not enabled by default; if you are interested, you can try it yourself. 
### Synthesizing diff --git a/paddlespeech/t2s/exps/fastspeech2/train.py b/paddlespeech/t2s/exps/fastspeech2/train.py index 0f5edb37e..d31e62a82 100644 --- a/paddlespeech/t2s/exps/fastspeech2/train.py +++ b/paddlespeech/t2s/exps/fastspeech2/train.py @@ -155,8 +155,8 @@ def train_sp(args, config): optimizer=optimizer, dataloader=train_dataloader, output_dir=output_dir, - **config["updater"], - enable_spk_cls=enable_spk_cls) + enable_spk_cls=enable_spk_cls, + **config["updater"], ) trainer = Trainer(updater, (config.max_epoch, 'epoch'), output_dir) @@ -164,8 +164,8 @@ def train_sp(args, config): model, dev_dataloader, output_dir=output_dir, - **config["updater"], - enable_spk_cls=enable_spk_cls) + enable_spk_cls=enable_spk_cls, + **config["updater"], ) if dist.get_rank() == 0: trainer.extend(evaluator, trigger=(1, "epoch")) diff --git a/paddlespeech/t2s/models/fastspeech2/fastspeech2_updater.py b/paddlespeech/t2s/models/fastspeech2/fastspeech2_updater.py index bbff927b7..b398267e6 100644 --- a/paddlespeech/t2s/models/fastspeech2/fastspeech2_updater.py +++ b/paddlespeech/t2s/models/fastspeech2/fastspeech2_updater.py @@ -109,7 +109,8 @@ class FastSpeech2Updater(StandardUpdater): spk_logits=spk_logits, spk_ids=spk_id, ) - loss = l1_loss + duration_loss + pitch_loss + energy_loss + self.spk_loss_scale * speaker_loss + scaled_speaker_loss = self.spk_loss_scale * speaker_loss + loss = l1_loss + duration_loss + pitch_loss + energy_loss + scaled_speaker_loss optimizer = self.optimizer optimizer.clear_grad() @@ -123,8 +124,7 @@ class FastSpeech2Updater(StandardUpdater): report("train/energy_loss", float(energy_loss)) if self.enable_spk_cls: report("train/speaker_loss", float(speaker_loss)) - report("train/scale_speaker_loss", - float(self.spk_loss_scale * speaker_loss)) + report("train/scaled_speaker_loss", float(scaled_speaker_loss)) losses_dict["l1_loss"] = float(l1_loss) losses_dict["duration_loss"] = float(duration_loss) @@ -133,8 +133,7 @@ class 
FastSpeech2Updater(StandardUpdater): losses_dict["energy_loss"] = float(energy_loss) if self.enable_spk_cls: losses_dict["speaker_loss"] = float(speaker_loss) - losses_dict["scale_speaker_loss"] = float(self.spk_loss_scale * - speaker_loss) + losses_dict["scaled_speaker_loss"] = float(scaled_speaker_loss) losses_dict["loss"] = float(loss) self.msg += ', '.join('{}: {:>.6f}'.format(k, v) for k, v in losses_dict.items()) @@ -211,7 +210,9 @@ class FastSpeech2Evaluator(StandardEvaluator): olens=olens, spk_logits=spk_logits, spk_ids=spk_id, ) - loss = l1_loss + duration_loss + pitch_loss + energy_loss + self.spk_loss_scale * speaker_loss + + scaled_speaker_loss = self.spk_loss_scale * speaker_loss + loss = l1_loss + duration_loss + pitch_loss + energy_loss + scaled_speaker_loss report("eval/loss", float(loss)) report("eval/l1_loss", float(l1_loss)) @@ -220,8 +221,7 @@ class FastSpeech2Evaluator(StandardEvaluator): report("eval/energy_loss", float(energy_loss)) if self.enable_spk_cls: report("train/speaker_loss", float(speaker_loss)) - report("train/scale_speaker_loss", - float(self.spk_loss_scale * speaker_loss)) + report("train/scaled_speaker_loss", float(scaled_speaker_loss)) losses_dict["l1_loss"] = float(l1_loss) losses_dict["duration_loss"] = float(duration_loss) @@ -229,8 +229,7 @@ class FastSpeech2Evaluator(StandardEvaluator): losses_dict["energy_loss"] = float(energy_loss) if self.enable_spk_cls: losses_dict["speaker_loss"] = float(speaker_loss) - losses_dict["scale_speaker_loss"] = float(self.spk_loss_scale * - speaker_loss) + losses_dict["scaled_speaker_loss"] = float(scaled_speaker_loss) losses_dict["loss"] = float(loss) self.msg += ', '.join('{}: {:>.6f}'.format(k, v) for k, v in losses_dict.items()) diff --git a/paddlespeech/t2s/modules/adversarial_loss/gradient_reversal.py b/paddlespeech/t2s/modules/adversarial_loss/gradient_reversal.py index e98758099..64da16053 100644 --- a/paddlespeech/t2s/modules/adversarial_loss/gradient_reversal.py +++ 
b/paddlespeech/t2s/modules/adversarial_loss/gradient_reversal.py @@ -37,7 +37,6 @@ class GradientReversalFunction(PyLayer): """ lambda_, = ctx.saved_tensor() dx = -lambda_ * grads - #return dx return paddle.clip(dx, min=-0.5, max=0.5)