update, test=tts

pull/2588/head
liangym 3 years ago
parent 1c5471e4b0
commit d54d94deb5

@@ -116,7 +116,7 @@ optional arguments:
 5. `--phones-dict` is the path of the phone vocabulary file.
 6. `--speaker-dict` is the path of the speaker id map file when training a multi-speaker FastSpeech2.
-We have **added module speaker classifier** with reference to [Learning to Speak Fluently in a Foreign Language: Multilingual Speech Synthesis and Cross-Language Voice Cloning](https://arxiv.org/pdf/1907.04448.pdf). The main parameter configuration: config["model"]["enable_speaker_classifier"], config["model"]["hidden_sc_dim"] and config["updater"]["spk_loss_scale"] in `conf/default.yaml`. The current experimental results show that this module can decouple text information and speaker information, and more experiments are still being sorted out. This module is currently not enabled by default, if you are interested, you can try it yourself.
+We have **added a speaker classifier module** following [Learning to Speak Fluently in a Foreign Language: Multilingual Speech Synthesis and Cross-Language Voice Cloning](https://arxiv.org/pdf/1907.04448.pdf). The main parameters are `config["model"]["enable_speaker_classifier"]`, `config["model"]["hidden_sc_dim"]`, and `config["updater"]["spk_loss_scale"]` in `conf/default.yaml`. Current experimental results show that this module can decouple text information from speaker information; more experiments are still being sorted out. The module is disabled by default; if you are interested, you can try it yourself.
 ### Synthesizing
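A minimal sketch of reading the three options named in the README paragraph above; the keys come from that paragraph, while the fallback values shown here are illustrative assumptions, not the repository's actual defaults:

```python
# Hypothetical sketch: load the speaker-classifier options from conf/default.yaml.
# Keys mirror the README text above; the fallback values are assumptions.
import yaml

with open("conf/default.yaml") as f:
    config = yaml.safe_load(f)

enable_spk_cls = config["model"].get("enable_speaker_classifier", False)  # off by default
hidden_sc_dim = config["model"].get("hidden_sc_dim", 256)        # classifier hidden size (illustrative)
spk_loss_scale = config["updater"].get("spk_loss_scale", 0.02)   # weight on the speaker loss (illustrative)
```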

@@ -155,8 +155,8 @@ def train_sp(args, config):
         optimizer=optimizer,
         dataloader=train_dataloader,
         output_dir=output_dir,
-        **config["updater"],
-        enable_spk_cls=enable_spk_cls)
+        enable_spk_cls=enable_spk_cls,
+        **config["updater"], )

     trainer = Trainer(updater, (config.max_epoch, 'epoch'), output_dir)
@@ -164,8 +164,8 @@ def train_sp(args, config):
         model,
         dev_dataloader,
         output_dir=output_dir,
-        **config["updater"],
-        enable_spk_cls=enable_spk_cls)
+        enable_spk_cls=enable_spk_cls,
+        **config["updater"], )
     if dist.get_rank() == 0:
         trainer.extend(evaluator, trigger=(1, "epoch"))
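Both call-site changes above only reorder arguments: since Python 3.5 (PEP 448), an explicit keyword argument may appear on either side of a `**` unpacking in a call, so the reorder is stylistic rather than behavioral. A minimal illustration:

```python
def f(**kwargs):
    return kwargs

d = {"a": 1}
print(f(b=2, **d))  # {'b': 2, 'a': 1}
print(f(**d, b=2))  # {'a': 1, 'b': 2}; both orders are valid calls
```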

@@ -109,7 +109,8 @@ class FastSpeech2Updater(StandardUpdater):
             spk_logits=spk_logits,
             spk_ids=spk_id, )

-        loss = l1_loss + duration_loss + pitch_loss + energy_loss + self.spk_loss_scale * speaker_loss
+        scaled_speaker_loss = self.spk_loss_scale * speaker_loss
+        loss = l1_loss + duration_loss + pitch_loss + energy_loss + scaled_speaker_loss

         optimizer = self.optimizer
         optimizer.clear_grad()
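The pattern here, repeated in the evaluator hunks below, is to compute the scaled speaker term once and reuse it, so the value added into the total loss and the value that gets logged can never drift apart. A self-contained sketch of the pattern, with placeholder scalars standing in for the real tensors:

```python
# Placeholder values; in the trainer these are Paddle tensors.
l1_loss, duration_loss, pitch_loss, energy_loss = 0.5, 0.1, 0.2, 0.2
speaker_loss, spk_loss_scale = 1.0, 0.02

scaled_speaker_loss = spk_loss_scale * speaker_loss   # computed exactly once
loss = l1_loss + duration_loss + pitch_loss + energy_loss + scaled_speaker_loss

# Logging reuses the same value instead of recomputing the product,
# so the logged number always matches the term inside the total loss.
losses_dict = {"scaled_speaker_loss": float(scaled_speaker_loss), "loss": float(loss)}
print(losses_dict)  # {'scaled_speaker_loss': 0.02, 'loss': 1.02}
```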
@@ -123,8 +124,7 @@ class FastSpeech2Updater(StandardUpdater):
         report("train/energy_loss", float(energy_loss))
         if self.enable_spk_cls:
             report("train/speaker_loss", float(speaker_loss))
-            report("train/scale_speaker_loss",
-                   float(self.spk_loss_scale * speaker_loss))
+            report("train/scaled_speaker_loss", float(scaled_speaker_loss))

         losses_dict["l1_loss"] = float(l1_loss)
         losses_dict["duration_loss"] = float(duration_loss)
@@ -133,8 +133,7 @@ class FastSpeech2Updater(StandardUpdater):
         losses_dict["energy_loss"] = float(energy_loss)
         if self.enable_spk_cls:
             losses_dict["speaker_loss"] = float(speaker_loss)
-            losses_dict["scale_speaker_loss"] = float(self.spk_loss_scale *
-                                                      speaker_loss)
+            losses_dict["scaled_speaker_loss"] = float(scaled_speaker_loss)
         losses_dict["loss"] = float(loss)
         self.msg += ', '.join('{}: {:>.6f}'.format(k, v)
                               for k, v in losses_dict.items())
@@ -211,7 +210,9 @@ class FastSpeech2Evaluator(StandardEvaluator):
             olens=olens,
             spk_logits=spk_logits,
             spk_ids=spk_id, )
-        loss = l1_loss + duration_loss + pitch_loss + energy_loss + self.spk_loss_scale * speaker_loss
+
+        scaled_speaker_loss = self.spk_loss_scale * speaker_loss
+        loss = l1_loss + duration_loss + pitch_loss + energy_loss + scaled_speaker_loss

         report("eval/loss", float(loss))
         report("eval/l1_loss", float(l1_loss))
@@ -220,8 +221,7 @@ class FastSpeech2Evaluator(StandardEvaluator):
         report("eval/energy_loss", float(energy_loss))
         if self.enable_spk_cls:
             report("train/speaker_loss", float(speaker_loss))
-            report("train/scale_speaker_loss",
-                   float(self.spk_loss_scale * speaker_loss))
+            report("train/scaled_speaker_loss", float(scaled_speaker_loss))

         losses_dict["l1_loss"] = float(l1_loss)
         losses_dict["duration_loss"] = float(duration_loss)
@@ -229,8 +229,7 @@ class FastSpeech2Evaluator(StandardEvaluator):
         losses_dict["energy_loss"] = float(energy_loss)
         if self.enable_spk_cls:
             losses_dict["speaker_loss"] = float(speaker_loss)
-            losses_dict["scale_speaker_loss"] = float(self.spk_loss_scale *
-                                                      speaker_loss)
+            losses_dict["scaled_speaker_loss"] = float(scaled_speaker_loss)
         losses_dict["loss"] = float(loss)
         self.msg += ', '.join('{}: {:>.6f}'.format(k, v)
                               for k, v in losses_dict.items())

@@ -37,7 +37,6 @@ class GradientReversalFunction(PyLayer):
         """
         lambda_, = ctx.saved_tensor()
         dx = -lambda_ * grads
-        #return dx
         return paddle.clip(dx, min=-0.5, max=0.5)
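For context on the hunk above: the class it patches is a gradient reversal layer in the domain-adversarial style of the referenced paper, identity on the forward pass and sign-flipped gradient on the backward pass. A hedged sketch of the full layer under that assumption; only the `backward` body is confirmed by the diff, the rest is illustrative:

```python
import paddle
from paddle.autograd import PyLayer

class GradientReversalFunction(PyLayer):
    """Identity forward; reversed (and clipped) gradient backward."""

    @staticmethod
    def forward(ctx, x, lambda_=1.0):
        # Stash the reversal strength for the backward pass (sketch).
        ctx.save_for_backward(paddle.to_tensor(lambda_))
        return x.clone()  # identity on the forward pass

    @staticmethod
    def backward(ctx, grads):
        lambda_, = ctx.saved_tensor()
        dx = -lambda_ * grads  # flip the gradient sign
        # The commit above replaces a bare `return dx` with a clipped version,
        # bounding the reversed gradient to [-0.5, 0.5]:
        return paddle.clip(dx, min=-0.5, max=0.5)
```

Placed between the text encoder and the speaker classifier, this lets the classifier learn to predict the speaker while the encoder receives the negated gradient and is pushed toward speaker-invariant text representations, which is the decoupling the README paragraph describes.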
