From ef7d15dc023b0f2f2c96ea31ff28e9c9045aae98 Mon Sep 17 00:00:00 2001
From: liangym
Date: Wed, 1 Feb 2023 07:30:54 +0000
Subject: [PATCH] base diffsinger, test=tts

---
 examples/opencpop/svs1/conf/default.yaml      | 148 +++++++
 examples/opencpop/svs1/path.sh                |  13 +
 paddlespeech/t2s/exps/diffsinger/train.py     |  86 ++--
 paddlespeech/t2s/exps/syn_utils.py            |  15 +-
 paddlespeech/t2s/exps/synthesize.py           |  41 +-
 .../t2s/models/diffsinger/diffsinger.py       | 398 ++++++++++++------
 .../models/diffsinger/diffsinger_updater.py   | 308 +++++++-------
 7 files changed, 677 insertions(+), 332 deletions(-)
 create mode 100644 examples/opencpop/svs1/conf/default.yaml
 create mode 100755 examples/opencpop/svs1/path.sh

diff --git a/examples/opencpop/svs1/conf/default.yaml b/examples/opencpop/svs1/conf/default.yaml
new file mode 100644
index 000000000..fff355afa
--- /dev/null
+++ b/examples/opencpop/svs1/conf/default.yaml
@@ -0,0 +1,148 @@
+###########################################################
+#                FEATURE EXTRACTION SETTING               #
+###########################################################
+
+fs: 24000          # sr
+n_fft: 512         # FFT size (samples).
+n_shift: 128       # Hop size (samples). ~5.3ms at fs=24000
+win_length: 512    # Window length (samples). ~21.3ms at fs=24000
+                   # If set to null, it will be the same as fft_size.
+window: "hann"     # Window function.
+
+# Only used for feats_type != raw
+
+fmin: 30           # Minimum frequency of Mel basis.
+fmax: 12000        # Maximum frequency of Mel basis.
+n_mels: 80         # The number of mel basis.
+
+# Only used for the model using pitch features (e.g. FastSpeech2)
+f0min: 80          # Minimum f0 for pitch extraction.
+f0max: 750         # Maximum f0 for pitch extraction.
+
+
+###########################################################
+#                       DATA SETTING                      #
+###########################################################
+batch_size: 32
+num_workers: 4
+
+
+###########################################################
+#                       MODEL SETTING                     #
+###########################################################
+# fastspeech2 module
+fs2_model:
+    adim: 256          # attention dimension
+    aheads: 2          # number of attention heads
+    elayers: 4         # number of encoder layers
+    eunits: 1536       # number of encoder ff units
+    dlayers: 4         # number of decoder layers
+    dunits: 1536       # number of decoder ff units
+    positionwise_layer_type: conv1d    # type of position-wise layer
+    positionwise_conv_kernel_size: 3   # kernel size of position-wise conv layer
+    duration_predictor_layers: 2       # number of layers of duration predictor
+    duration_predictor_chans: 256      # number of channels of duration predictor
+    duration_predictor_kernel_size: 3  # filter size of duration predictor
+    postnet_layers: 5                  # number of layers of postnet
+    postnet_filts: 5                   # filter size of conv layers in postnet
+    postnet_chans: 256                 # number of channels of conv layers in postnet
+    use_scaled_pos_enc: True           # whether to use scaled positional encoding
+    encoder_normalize_before: True     # whether to perform layer normalization before encoder blocks
+    decoder_normalize_before: True     # whether to perform layer normalization before decoder blocks
+    reduction_factor: 1                # reduction factor
+    init_type: xavier_uniform          # initialization type
+    init_enc_alpha: 1.0                # initial value of alpha of encoder scaled position encoding
+    init_dec_alpha: 1.0                # initial value of alpha of decoder scaled position encoding
+    transformer_enc_dropout_rate: 0.2             # dropout rate for transformer encoder layer
+    transformer_enc_positional_dropout_rate: 0.2  # dropout rate for transformer encoder positional encoding
+    transformer_enc_attn_dropout_rate: 0.2        # dropout rate for transformer encoder attention layer
+    transformer_dec_dropout_rate: 0.2             # dropout rate for transformer decoder layer
+    transformer_dec_positional_dropout_rate: 0.2  # dropout rate for transformer decoder positional encoding
+    transformer_dec_attn_dropout_rate: 0.2        # dropout rate for transformer decoder attention layer
+    pitch_predictor_layers: 5                  # number of conv layers in pitch predictor
+    pitch_predictor_chans: 256                 # number of channels of conv layers in pitch predictor
+    pitch_predictor_kernel_size: 5             # kernel size of conv layers in pitch predictor
+    pitch_predictor_dropout: 0.5               # dropout rate in pitch predictor
+    pitch_embed_kernel_size: 1                 # kernel size of conv embedding layer for pitch
+    pitch_embed_dropout: 0.0                   # dropout rate after conv embedding layer for pitch
+    stop_gradient_from_pitch_predictor: True   # whether to stop the gradient from pitch predictor to encoder
+    energy_predictor_layers: 2                 # number of conv layers in energy predictor
+    energy_predictor_chans: 256                # number of channels of conv layers in energy predictor
+    energy_predictor_kernel_size: 3            # kernel size of conv layers in energy predictor
+    energy_predictor_dropout: 0.5              # dropout rate in energy predictor
+    energy_embed_kernel_size: 1                # kernel size of conv embedding layer for energy
+    energy_embed_dropout: 0.0                  # dropout rate after conv embedding layer for energy
+    stop_gradient_from_energy_predictor: False # whether to stop the gradient from energy predictor to encoder
+    note_num: 300      # size of the note embedding table
+    is_slur_num: 2     # size of the is_slur embedding table
+
+denoiser_model:
+    in_channels: 80
+    out_channels: 80
+    kernel_size: 3
+    layers: 20
+    stacks: 4
+    residual_channels: 256
+    gate_channels: 512
+    skip_channels: 256
+    aux_channels: 256
+    dropout: 0.1
+    bias: True
+    use_weight_norm: False
+    init_type: kaiming_uniform
+
+diffusion:
+    num_train_timesteps: 100
+    beta_start: 0.0001
+    beta_end: 0.06
+    beta_schedule: "squaredcos_cap_v2"
+    num_max_timesteps: 60
+
+
+###########################################################
+#                     UPDATER SETTING                     #
+###########################################################
+fs2_updater:
+    use_masking: True     # whether to apply masking for padded part in loss calculation
+
+ds_updater:
+    use_masking: True     # whether to apply masking for padded part in loss calculation
+
+
+###########################################################
+#                    OPTIMIZER SETTING                    #
+###########################################################
+# gpu_num=2 config
+# fastspeech2 optimizer
+fs2_optimizer:
+    optim: adam            # optimizer type
+    learning_rate: 0.001   # learning rate
+
+# diffusion optimizer
+ds_optimizer_params:
+    beta1: 0.9
+    beta2: 0.98
+    weight_decay: 0.0
+
+ds_scheduler_params:
+    learning_rate: 0.001
+    gamma: 0.5
+    step_size: 25000
+ds_grad_norm: 1
+
+
+###########################################################
+#                     INTERVAL SETTING                    #
+###########################################################
+ds_train_start_steps: 80000  # Step at which to start training the diffusion module.
+train_max_steps: 160000      # Number of training steps.
+save_interval_steps: 1000    # Interval steps to save checkpoint.
+eval_interval_steps: 250     # Interval steps to evaluate the network.
+num_snapshots: 5 + + +########################################################### +# OTHER SETTING # +########################################################### +seed: 10086 +find_unused_parameters: True diff --git a/examples/opencpop/svs1/path.sh b/examples/opencpop/svs1/path.sh new file mode 100755 index 000000000..8bda5dce6 --- /dev/null +++ b/examples/opencpop/svs1/path.sh @@ -0,0 +1,13 @@ +#!/bin/bash +export MAIN_ROOT=`realpath ${PWD}/../../../` + +export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} +export LC_ALL=C + +export PYTHONDONTWRITEBYTECODE=1 +# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C +export PYTHONIOENCODING=UTF-8 +export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} + +MODEL=diffsinger +export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL} diff --git a/paddlespeech/t2s/exps/diffsinger/train.py b/paddlespeech/t2s/exps/diffsinger/train.py index ce612d037..f0f2c0af7 100644 --- a/paddlespeech/t2s/exps/diffsinger/train.py +++ b/paddlespeech/t2s/exps/diffsinger/train.py @@ -23,8 +23,11 @@ import paddle import yaml from paddle import DataParallel from paddle import distributed as dist +from paddle import nn from paddle.io import DataLoader from paddle.io import DistributedBatchSampler +from paddle.optimizer import AdamW +from paddle.optimizer.lr import StepDecay from yacs.config import CfgNode from paddlespeech.t2s.datasets.am_batch_fn import diffsinger_multi_spk_batch_fn @@ -33,6 +36,8 @@ from paddlespeech.t2s.datasets.data_table import DataTable from paddlespeech.t2s.models.diffsinger import DiffSinger from paddlespeech.t2s.models.diffsinger import DiffSingerEvaluator from paddlespeech.t2s.models.diffsinger import DiffSingerUpdater +from paddlespeech.t2s.models.diffsinger import DiffusionLoss +from paddlespeech.t2s.models.diffsinger import FastSpeech2MIDILoss from paddlespeech.t2s.training.extensions.snapshot import Snapshot from paddlespeech.t2s.training.extensions.visualizer import VisualDL from paddlespeech.t2s.training.optimizer import build_optimizers @@ -40,7 +45,6 @@ from paddlespeech.t2s.training.seeding import seed_everything from paddlespeech.t2s.training.trainer import Trainer from paddlespeech.t2s.utils import str2bool - def train_sp(args, config): # decides device type and whether to run in parallel # setup running environment correctly @@ -59,8 +63,9 @@ def train_sp(args, config): f"rank: {dist.get_rank()}, pid: {os.getpid()}, parent_pid: {os.getppid()}", ) fields = [ - "text", "text_lengths", "speech", "speech_lengths", "durations", "pitch", "energy", - "note", "note_dur", "is_slur"] + "text", "text_lengths", "speech", "speech_lengths", "durations", + "pitch", "energy", "note", "note_dur", "is_slur" + ] converters = {"speech": np.load, "pitch": np.load, "energy": np.load} spk_num = None if args.speaker_dict is not None: @@ -99,7 +104,6 @@ def train_sp(args, config): converters=converters, ) # collate function and dataloader - train_sampler = DistributedBatchSampler( train_dataset, batch_size=config.batch_size, @@ -129,13 +133,32 @@ def train_sp(args, config): print("vocab_size:", vocab_size) odim = config.n_mels + config["fs2_model"]["idim"] = vocab_size + config["fs2_model"]["odim"] = odim + config["fs2_model"]["spk_num"] = spk_num + model = DiffSinger( - idim=vocab_size, odim=odim, spk_num=spk_num, **config["model"]) + fs2_config=config["fs2_model"], + denoiser_config=config["denoiser_model"], + diffusion_config=config["diffusion"]) if world_size > 1: model = DataParallel(model) - print("model done!") - - optimizer = build_optimizers(model, 
**config["optimizer"]) + print("models done!") + + criterion_fs2 = FastSpeech2MIDILoss(**config["fs2_updater"]) + criterion_ds = DiffusionLoss(**config["ds_updater"]) + print("criterions done!") + + optimizer_fs2 = build_optimizers(model._layers.fs2, + **config["fs2_optimizer"]) + lr_schedule_ds = StepDecay(**config["ds_scheduler_params"]) + gradient_clip_ds = nn.ClipGradByGlobalNorm(config["ds_grad_norm"]) + optimizer_ds = AdamW( + learning_rate=lr_schedule_ds, + grad_clip=gradient_clip_ds, + parameters=model._layers.diffusion.parameters(), + **config["ds_optimizer_params"]) + # optimizer_ds = build_optimizers(ds, **config["ds_optimizer"]) print("optimizer done!") output_dir = Path(args.output_dir) @@ -145,33 +168,42 @@ def train_sp(args, config): # copy conf to output_dir shutil.copyfile(args.config, output_dir / config_name) - if "enable_speaker_classifier" in config.model: - enable_spk_cls = config.model.enable_speaker_classifier - else: - enable_spk_cls = False - updater = DiffSingerUpdater( model=model, - optimizer=optimizer, + optimizers={ + "fs2": optimizer_fs2, + "ds": optimizer_ds, + }, + criterions={ + "fs2": criterion_fs2, + "ds": criterion_ds, + }, dataloader=train_dataloader, - output_dir=output_dir, - enable_spk_cls=enable_spk_cls, - **config["updater"], ) - - trainer = Trainer(updater, (config.max_epoch, 'epoch'), output_dir) + ds_train_start_steps=config.ds_train_start_steps, + output_dir=output_dir) evaluator = DiffSingerEvaluator( - model, - dev_dataloader, - output_dir=output_dir, - enable_spk_cls=enable_spk_cls, - **config["updater"], ) + model=model, + criterions={ + "fs2": criterion_fs2, + "ds": criterion_ds, + }, + dataloader=dev_dataloader, + output_dir=output_dir) + trainer = Trainer( + updater, + stop_trigger=(config.train_max_steps, "iteration"), + out=output_dir, ) if dist.get_rank() == 0: - trainer.extend(evaluator, trigger=(1, "epoch")) - trainer.extend(VisualDL(output_dir), trigger=(1, "iteration")) + trainer.extend( + evaluator, trigger=(config.eval_interval_steps, 'iteration')) + trainer.extend(VisualDL(output_dir), trigger=(1, 'iteration')) trainer.extend( - Snapshot(max_size=config.num_snapshots), trigger=(1, 'epoch')) + Snapshot(max_size=config.num_snapshots), + trigger=(config.save_interval_steps, 'iteration')) + + print("Trainer Done!") trainer.run() diff --git a/paddlespeech/t2s/exps/syn_utils.py b/paddlespeech/t2s/exps/syn_utils.py index a8508019d..410775138 100644 --- a/paddlespeech/t2s/exps/syn_utils.py +++ b/paddlespeech/t2s/exps/syn_utils.py @@ -23,6 +23,7 @@ from typing import Optional import numpy as np import onnxruntime as ort import paddle +import yaml from paddle import inference from paddle import jit from paddle.io import DataLoader @@ -59,6 +60,7 @@ model_alias = { "paddlespeech.t2s.models.diffsinger:DiffSinger", "diffsinger_inference": "paddlespeech.t2s.models.diffsinger:DiffSingerInference", + # voc "pwgan": "paddlespeech.t2s.models.parallel_wavegan:PWGGenerator", @@ -147,6 +149,8 @@ def get_test_dataset(test_metadata: List[Dict[str, Any]], print("single speaker fastspeech2!") elif am_name == 'diffsinger': fields = ["utt_id", "text", "note", "note_dur", "is_slur"] + elif am_name == 'fastspeech2midi': + fields = ["utt_id", "text", "note", "note_dur", "is_slur"] elif am_name == 'speedyspeech': fields = ["utt_id", "phones", "tones"] elif am_name == 'tacotron2': @@ -353,9 +357,14 @@ def get_am_inference(am: str='fastspeech2_csmsc', if am_name == 'fastspeech2': am = am_class( idim=vocab_size, odim=odim, spk_num=spk_num, 
**am_config["model"]) - if am_name == 'diffsinger': + elif am_name == 'diffsinger': + am_config["fs2_model"]["idim"] = vocab_size + am_config["fs2_model"]["odim"] = am_config.n_mels + am_config["fs2_model"]["spk_num"] = spk_num am = am_class( - idim=vocab_size, odim=odim, spk_num=spk_num, **am_config["model"]) + fs2_config=am_config["fs2_model"], + denoiser_config=am_config["denoiser_model"], + diffusion_config=am_config["diffusion"]) elif am_name == 'speedyspeech': am = am_class( vocab_size=vocab_size, @@ -366,8 +375,6 @@ def get_am_inference(am: str='fastspeech2_csmsc', am = am_class(idim=vocab_size, odim=odim, **am_config["model"]) elif am_name == 'erniesat': am = am_class(idim=vocab_size, odim=odim, **am_config["model"]) - else: - print("wrong am, please input right am!!!") am.set_state_dict(paddle.load(am_ckpt)["main_params"]) am.eval() diff --git a/paddlespeech/t2s/exps/synthesize.py b/paddlespeech/t2s/exps/synthesize.py index 9e060b42d..009e33a16 100644 --- a/paddlespeech/t2s/exps/synthesize.py +++ b/paddlespeech/t2s/exps/synthesize.py @@ -112,8 +112,14 @@ def evaluate(args): note = paddle.to_tensor(datum["note"]) note_dur = paddle.to_tensor(datum["note_dur"]) is_slur = paddle.to_tensor(datum["is_slur"]) - mel = am_inference(phone_ids, note=note, note_dur=note_dur, is_slur=is_slur) - # vocoder + get_mel_fs2 = False + # mel: [T, mel_bin] + mel = am_inference( + phone_ids, + note=note, + note_dur=note_dur, + is_slur=is_slur, + get_mel_fs2=get_mel_fs2) wav = voc_inference(mel) wav = wav.numpy() @@ -140,10 +146,16 @@ def parse_args(): type=str, default='fastspeech2_csmsc', choices=[ - 'speedyspeech_csmsc', 'fastspeech2_csmsc', 'fastspeech2_ljspeech', - 'fastspeech2_aishell3', 'fastspeech2_vctk', 'tacotron2_csmsc', - 'tacotron2_ljspeech', 'tacotron2_aishell3', 'fastspeech2_mix', - "diffsinger_opencpop" + 'speedyspeech_csmsc', + 'fastspeech2_csmsc', + 'fastspeech2_ljspeech', + 'fastspeech2_aishell3', + 'fastspeech2_vctk', + 'tacotron2_csmsc', + 'tacotron2_ljspeech', + 'tacotron2_aishell3', + 'fastspeech2_mix', + "diffsinger_opencpop", ], help='Choose acoustic model type of tts task.') parser.add_argument( @@ -176,10 +188,19 @@ def parse_args(): type=str, default='pwgan_csmsc', choices=[ - 'pwgan_csmsc', 'pwgan_ljspeech', 'pwgan_aishell3', 'pwgan_vctk', - 'mb_melgan_csmsc', 'wavernn_csmsc', 'hifigan_csmsc', - 'hifigan_ljspeech', 'hifigan_aishell3', 'hifigan_vctk', - 'style_melgan_csmsc', "pwgan_opencpop", + 'pwgan_csmsc', + 'pwgan_ljspeech', + 'pwgan_aishell3', + 'pwgan_vctk', + 'mb_melgan_csmsc', + 'wavernn_csmsc', + 'hifigan_csmsc', + 'hifigan_ljspeech', + 'hifigan_aishell3', + 'hifigan_vctk', + 'style_melgan_csmsc', + "pwgan_opencpop", + "hifigan_opencpop", ], help='Choose vocoder type of tts task.') parser.add_argument( diff --git a/paddlespeech/t2s/models/diffsinger/diffsinger.py b/paddlespeech/t2s/models/diffsinger/diffsinger.py index 24c5d4ee8..2f973bf0a 100644 --- a/paddlespeech/t2s/models/diffsinger/diffsinger.py +++ b/paddlespeech/t2s/models/diffsinger/diffsinger.py @@ -25,8 +25,11 @@ import paddle.nn.functional as F from paddle import nn from typeguard import check_argument_types +from paddlespeech.t2s.models.fastspeech2 import FastSpeech2 from paddlespeech.t2s.modules.adversarial_loss.gradient_reversal import GradientReversalLayer from paddlespeech.t2s.modules.adversarial_loss.speaker_classifier import SpeakerClassifier +from paddlespeech.t2s.modules.diffusion import GaussianDiffusion +from paddlespeech.t2s.modules.diffusion import WaveNetDenoiser from 
paddlespeech.t2s.modules.nets_utils import initialize
 from paddlespeech.t2s.modules.nets_utils import make_non_pad_mask
 from paddlespeech.t2s.modules.nets_utils import make_pad_mask
@@ -41,22 +44,13 @@ from paddlespeech.t2s.modules.transformer.encoder import ConformerEncoder
 from paddlespeech.t2s.modules.transformer.encoder import TransformerEncoder
 
 
-class DiffSinger(nn.Layer):
-    """DiffSinger module.
-
-    This is a module of DiffSinger described in `DiffSinger: Singing Voice Synthesis via Shallow Diffusion Mechanism`._
-    .. _`DiffSinger: Singing Voice Synthesis via Shallow Diffusion Mechanism`:
-        https://arxiv.org/pdf/2105.02446.pdf
-
-    Args:
-
-    Returns:
-
+class FastSpeech2MIDI(FastSpeech2):
+    """The FastSpeech2 module of DiffSinger.
     """
 
    def __init__(
            self,
-            # network structure related
+            # fastspeech2 network structure related
            idim: int,
            odim: int,
            adim: int=384,
@@ -133,12 +127,8 @@ class DiffSinger(nn.Layer):
            tone_embed_integration_type: str="add",
            # note emb
            note_num: int=300,
-            # note_embed_dim: int=384,
-            note_embed_integration_type: str="add",
            # is_slur emb
            is_slur_num: int=2,
-            # is_slur_embed_dim: int=384,
-            is_slur_embed_integration_type: str="add",
            # training related
            init_type: str="xavier_uniform",
            init_enc_alpha: float=1.0,
@@ -146,7 +136,7 @@ class DiffSinger(nn.Layer):
            # speaker classifier
            enable_speaker_classifier: bool=False,
            hidden_sc_dim: int=256, ):
-        """Initialize DiffSinger module.
+        """Initialize FastSpeech2 module for SVS (singing voice synthesis).
 
        Args:
            idim (int):
                Dimension of the inputs.
@@ -252,7 +242,7 @@ class DiffSinger(nn.Layer):
                Kernel size of energy embedding.
            energy_embed_dropout_rate (float):
                Dropout rate for energy embedding.
-            stop_gradient_from_energy_predictor(bool):
+            stop_gradient_from_energy_predictor (bool):
                Whether to stop gradient from energy predictor to encoder.
            spk_num (Optional[int]):
                Number of speakers. If not None, assume that the spk_embed_dim is not None,
@@ -271,7 +261,7 @@ class DiffSinger(nn.Layer):
                How to integrate tone embedding.
            init_type (str):
                How to initialize transformer parameters.
-            init_enc_alpha (float): 
+            init_enc_alpha (float):
                Initial value of alpha in scaled pos encoding of the encoder.
            init_dec_alpha (float):
                Initial value of alpha in scaled pos encoding of the decoder.
@@ -279,10 +269,16 @@ class DiffSinger(nn.Layer):
                Whether to use speaker classifier module
            hidden_sc_dim (int):
                The hidden layer dim of speaker classifier
+            note_num (Optional[int]):
+                Number of notes. If not None, assume that the
+                note_ids will be provided as the input and use note_embedding_table.
+            is_slur_num (Optional[int]):
+                Number of slur flags.
If not None, assume that the
+                is_slur_ids will be provided as the input
 
        """
        assert check_argument_types()
-        super().__init__()
+        super().__init__(idim, odim)
 
        # store hyperparameters
        self.odim = odim
@@ -306,12 +302,9 @@ class DiffSinger(nn.Layer):
 
        self.note_embed_dim = adim
        if self.note_embed_dim is not None:
-            self.note_embed_integration_type = note_embed_integration_type
            self.note_dur_layer = nn.Linear(1, self.note_embed_dim)
 
        self.is_slur_embed_dim = adim
-        if self.is_slur_embed_dim is not None:
-            self.is_slur_embed_integration_type = is_slur_embed_integration_type
 
        # use idx 0 as padding idx
        self.padding_idx = 0
@@ -627,6 +620,7 @@ class DiffSinger(nn.Layer):
            ps: paddle.Tensor=None,
            es: paddle.Tensor=None,
            is_inference: bool=False,
+            is_train_diffusion: bool=False,
            return_after_enc=False,
            alpha: float=1.0,
            spk_emb=None,
@@ -639,7 +633,12 @@ class DiffSinger(nn.Layer):
        is_slur_emb = self.is_slur_embedding_table(is_slur)
 
        # (B, Tmax, adim)
-        hs, _ = self.encoder(xs, x_masks, note_emb, note_dur_emb, is_slur_emb,)
+        hs, _ = self.encoder(
+            xs,
+            x_masks,
+            note_emb,
+            note_dur_emb,
+            is_slur_emb, )
 
        if self.spk_num and self.enable_speaker_classifier and not is_inference:
            hs_for_spk_cls = self.grad_reverse(hs)
@@ -668,12 +667,24 @@ class DiffSinger(nn.Layer):
        else:
            pitch_masks = None
 
-        if is_inference:
+        # inference of the decoder input used for diffusion training
+        if is_train_diffusion:
+            hs = self.length_regulator(hs, ds, is_inference=False)
+            p_outs = self.pitch_predictor(hs.detach(), pitch_masks)
+            e_outs = self.energy_predictor(hs.detach(), pitch_masks)
+            p_embs = self.pitch_embed(p_outs.transpose((0, 2, 1))).transpose(
+                (0, 2, 1))
+            e_embs = self.energy_embed(e_outs.transpose((0, 2, 1))).transpose(
+                (0, 2, 1))
+            hs = hs + e_embs + p_embs
+
+        elif is_inference:
            # (B, Tmax)
            if ds is not None:
                d_outs = ds
            else:
                d_outs = self.duration_predictor.inference(hs, d_masks)
+
            # (B, Lmax, adim)
            hs = self.length_regulator(hs, d_outs, alpha, is_inference=True)
 
@@ -699,6 +710,7 @@ class DiffSinger(nn.Layer):
                (0, 2, 1))
            hs = hs + e_embs + p_embs
 
+        # training
        else:
            d_outs = self.duration_predictor(hs, d_masks)
            # (B, Lmax, adim)
@@ -717,7 +729,6 @@ class DiffSinger(nn.Layer):
                (0, 2, 1))
            hs = hs + e_embs + p_embs
 
-
        # forward decoder
        if olens is not None and not is_inference:
            if self.reduction_factor > 1:
@@ -750,7 +761,7 @@ class DiffSinger(nn.Layer):
        else:
            after_outs = before_outs + self.postnet(
                before_outs.transpose((0, 2, 1))).transpose((0, 2, 1))
- 
+
        return before_outs, after_outs, d_outs, p_outs, e_outs, spk_logits
 
    def encoder_infer(
@@ -764,6 +775,7 @@ class DiffSinger(nn.Layer):
            spk_id=None,
            tone_id=None, ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]:
+
        # input of embedding must be int64
        x = paddle.cast(text, 'int64')
        note = paddle.cast(note, 'int64')
@@ -785,7 +797,7 @@ class DiffSinger(nn.Layer):
 
        # (1, L, odim)
        # use *_ to avoid bug in dygraph to static graph
-        hs, *_ = self._forward(
+        hs, _ = self._forward(
            xs,
            note,
            note_dur,
@@ -799,6 +811,55 @@ class DiffSinger(nn.Layer):
            tone_id=tone_id)
        return hs
 
+    # for diffusion
+    def encoder_infer_batch(
+            self,
+            text: paddle.Tensor,
+            note: paddle.Tensor,
+            note_dur: paddle.Tensor,
+            is_slur: paddle.Tensor,
+            text_lengths: paddle.Tensor,
+            speech_lengths: paddle.Tensor,
+            ds: paddle.Tensor=None,
+            ps: paddle.Tensor=None,
+            es: paddle.Tensor=None,
+            alpha: float=1.0,
+            spk_emb=None,
+            spk_id=None,
+            tone_id=None, ) -> Tuple[paddle.Tensor, paddle.Tensor]:
+
+        # input of embedding must be int64
+        xs = paddle.cast(text, 'int64')
+        note = paddle.cast(note, 
'int64') + note_dur = paddle.cast(note_dur, 'float32') + is_slur = paddle.cast(is_slur, 'int64') + + if spk_emb is not None: + spk_emb = spk_emb.unsqueeze(0) + + if tone_id is not None: + tone_id = tone_id.unsqueeze(0) + + # (1, L, odim) + # use *_ to avoid bug in dygraph to static graph + hs, h_masks = self._forward( + xs, + note, + note_dur, + is_slur, + ilens=text_lengths, + olens=speech_lengths, + ds=ds, + ps=ps, + es=es, + return_after_enc=True, + is_train_diffusion=True, + alpha=alpha, + spk_emb=spk_emb, + spk_id=spk_id, + tone_id=tone_id) + return hs, h_masks + def inference( self, text: paddle.Tensor, @@ -896,112 +957,8 @@ class DiffSinger(nn.Layer): return outs[0], d_outs[0], p_outs[0], e_outs[0] - def _integrate_with_spk_embed(self, hs, spk_emb): - """Integrate speaker embedding with hidden states. - - Args: - hs(Tensor): - Batch of hidden state sequences (B, Tmax, adim). - spk_emb(Tensor): - Batch of speaker embeddings (B, spk_embed_dim). - - Returns: - - - """ - if self.spk_embed_integration_type == "add": - # apply projection and then add to hidden states - spk_emb = self.spk_projection(F.normalize(spk_emb)) - hs = hs + spk_emb.unsqueeze(1) - elif self.spk_embed_integration_type == "concat": - # concat hidden states with spk embeds and then apply projection - spk_emb = F.normalize(spk_emb).unsqueeze(1).expand( - shape=[-1, paddle.shape(hs)[1], -1]) - hs = self.spk_projection(paddle.concat([hs, spk_emb], axis=-1)) - else: - raise NotImplementedError("support only add or concat.") - - return hs - - def _integrate_with_tone_embed(self, hs, tone_embs): - """Integrate speaker embedding with hidden states. - - Args: - hs(Tensor): - Batch of hidden state sequences (B, Tmax, adim). - tone_embs(Tensor): - Batch of speaker embeddings (B, Tmax, tone_embed_dim). - - Returns: - - """ - if self.tone_embed_integration_type == "add": - # apply projection and then add to hidden states - tone_embs = self.tone_projection(F.normalize(tone_embs)) - hs = hs + tone_embs - - elif self.tone_embed_integration_type == "concat": - # concat hidden states with tone embeds and then apply projection - tone_embs = F.normalize(tone_embs).expand( - shape=[-1, hs.shape[1], -1]) - hs = self.tone_projection(paddle.concat([hs, tone_embs], axis=-1)) - else: - raise NotImplementedError("support only add or concat.") - return hs - def _source_mask(self, ilens: paddle.Tensor) -> paddle.Tensor: - """Make masks for self-attention. - - Args: - ilens(Tensor): - Batch of lengths (B,). - - Returns: - Tensor: - Mask tensor for self-attention. 
dtype=paddle.bool
-
-        Examples:
-            >>> ilens = [5, 3]
-            >>> self._source_mask(ilens)
-            tensor([[[1, 1, 1, 1, 1],
-                     [1, 1, 1, 0, 0]]]) bool
-        """
-        x_masks = make_non_pad_mask(ilens)
-        return x_masks.unsqueeze(-2)
-
-    def _reset_parameters(self, init_enc_alpha: float, init_dec_alpha: float):
-
-        # initialize alpha in scaled positional encoding
-        if self.encoder_type == "transformer" and self.use_scaled_pos_enc:
-            init_enc_alpha = paddle.to_tensor(init_enc_alpha)
-            self.encoder.embed[-1].alpha = paddle.create_parameter(
-                shape=init_enc_alpha.shape,
-                dtype=str(init_enc_alpha.numpy().dtype),
-                default_initializer=paddle.nn.initializer.Assign(
-                    init_enc_alpha))
-        if self.decoder_type == "transformer" and self.use_scaled_pos_enc:
-            init_dec_alpha = paddle.to_tensor(init_dec_alpha)
-            self.decoder.embed[-1].alpha = paddle.create_parameter(
-                shape=init_dec_alpha.shape,
-                dtype=str(init_dec_alpha.numpy().dtype),
-                default_initializer=paddle.nn.initializer.Assign(
-                    init_dec_alpha))
-
-
-class DiffSingerInference(nn.Layer):
-    def __init__(self, normalizer, model):
-        super().__init__()
-        self.normalizer = normalizer
-        self.acoustic_model = model
-
-    def forward(self, text, note, note_dur, is_slur, spk_id=None, spk_emb=None):
-        normalized_mel, d_outs, p_outs, e_outs = self.acoustic_model.inference(
-            text, note=note, note_dur=note_dur, is_slur=is_slur, spk_id=spk_id, spk_emb=spk_emb)
-        logmel = self.normalizer.inverse(normalized_mel)
-        return logmel
-
-
-class DiffSingerLoss(nn.Layer):
+class FastSpeech2MIDILoss(nn.Layer):
     """Loss function module for DiffSinger."""
 
     def __init__(self, use_masking: bool=True,
                  use_weighted_masking: bool=False):
@@ -1152,3 +1109,178 @@ class DiffSingerLoss(nn.Layer):
                 pitch_masks.broadcast_to(energy_loss.shape)).sum()
 
         return l1_loss, duration_loss, pitch_loss, energy_loss, speaker_loss
+
+
+class DiffusionLoss(nn.Layer):
+    """Mel-spectrogram L1 loss module for the diffusion part of DiffSinger."""
+
+    def __init__(self, use_masking: bool=True,
+                 use_weighted_masking: bool=False):
+        """Initialize DiffusionLoss module.
+        Args:
+            use_masking (bool):
+                Whether to apply masking for padded part in loss calculation.
+            use_weighted_masking (bool):
+                Whether to apply weighted masking in loss calculation.
+        """
+        assert check_argument_types()
+        super().__init__()
+
+        assert (use_masking != use_weighted_masking) or not use_masking
+        self.use_masking = use_masking
+        self.use_weighted_masking = use_weighted_masking
+
+        # define criterions
+        reduction = "none" if self.use_weighted_masking else "mean"
+        self.l1_criterion = nn.L1Loss(reduction=reduction)
+
+    def forward(
+            self,
+            ref_mels: paddle.Tensor,
+            out_mels: paddle.Tensor,
+            mel_masks: paddle.Tensor, ) -> paddle.Tensor:
+        """Calculate forward propagation.
+
+        Args:
+            ref_mels(Tensor):
+                Batch of real mels (B, Lmax, odim).
+            out_mels(Tensor):
+                Batch of output mels (B, Lmax, odim).
+            mel_masks(Tensor):
+                Batch of masks of real mels (B, Lmax, 1).
+ Returns: + + """ + # apply mask to remove padded part + if self.use_masking: + out_mels = out_mels.masked_select( + mel_masks.broadcast_to(out_mels.shape)) + ref_mels = ref_mels.masked_select( + mel_masks.broadcast_to(ref_mels.shape)) + + # calculate loss + l1_loss = self.l1_criterion(out_mels, ref_mels) + + # make weighted mask and apply it + if self.use_weighted_masking: + mel_masks = mel_masks.unsqueeze(-1) + out_weights = mel_masks.cast(dtype=paddle.float32) / mel_masks.cast( + dtype=paddle.float32).sum( + axis=1, keepdim=True) + out_weights /= ref_mels.shape[0] * ref_mels.shape[2] + + # apply weight + l1_loss = l1_loss.multiply(out_weights) + l1_loss = l1_loss.masked_select( + mel_masks.broadcast_to(l1_loss.shape)).sum() + + return l1_loss + + +class DiffSinger(nn.Layer): + """DiffSinger module. + + This is a module of DiffSinger described in `DiffSinger: Singing Voice Synthesis via Shallow Diffusion Mechanism`._ + .. _`DiffSinger: Singing Voice Synthesis via Shallow Diffusion Mechanism`: + https://arxiv.org/pdf/2105.02446.pdf + + Args: + + Returns: + + """ + + def __init__( + self, + fs2_config, + denoiser_config, + diffusion_config, ): + + assert check_argument_types() + super().__init__() + self.fs2 = FastSpeech2MIDI(**fs2_config) + denoiser = WaveNetDenoiser(**denoiser_config) + self.diffusion = GaussianDiffusion(denoiser, **diffusion_config) + + def forward( + self, + text: paddle.Tensor, + note: paddle.Tensor, + note_dur: paddle.Tensor, + is_slur: paddle.Tensor, + text_lengths: paddle.Tensor, + speech: paddle.Tensor, + speech_lengths: paddle.Tensor, + durations: paddle.Tensor, + pitch: paddle.Tensor, + energy: paddle.Tensor, + tone_id: paddle.Tensor=None, + spk_emb: paddle.Tensor=None, + spk_id: paddle.Tensor=None, + train_fs2: bool=True, + ) -> Tuple[paddle.Tensor, Dict[str, paddle.Tensor], paddle.Tensor]: + + before_outs, after_outs, d_outs, p_outs, e_outs, ys, olens, spk_logits = self.fs2( + text=text, + note=note, + note_dur=note_dur, + is_slur=is_slur, + text_lengths=text_lengths, + speech=speech, + speech_lengths=speech_lengths, + durations=durations, + pitch=pitch, + energy=energy, + spk_id=spk_id, + spk_emb=spk_emb) + cond_fs2, mel_masks = self.fs2.encoder_infer_batch( + text=text, + note=note, + note_dur=note_dur, + is_slur=is_slur, + text_lengths=text_lengths, + speech_lengths=speech_lengths, + ds=durations, + ps=pitch, + es=energy) + cond_fs2 = cond_fs2.transpose((0, 2, 1)) + mel = self.diffusion(speech.transpose((0, 2, 1)), cond_fs2.detach()) + + if train_fs2: + return before_outs, after_outs, d_outs, p_outs, e_outs, ys, olens, spk_logits + else: + return mel[0], mel_masks + + def inference( + self, + text: paddle.Tensor, + note: paddle.Tensor, + note_dur: paddle.Tensor, + is_slur: paddle.Tensor, + get_mel_fs2: bool=False, ): + mel_fs2, _, _, _ = self.fs2.inference(text, note, note_dur, is_slur) + if get_mel_fs2: + return mel_fs2 + mel_fs2 = mel_fs2.unsqueeze(0).transpose((0, 2, 1)) + cond_fs2 = self.fs2.encoder_infer(text, note, note_dur, is_slur) + cond_fs2 = cond_fs2.transpose((0, 2, 1)) + mel, _ = self.diffusion(mel_fs2, cond_fs2) + mel = mel.transpose((0, 2, 1)) + return mel[0] + + +class DiffSingerInference(nn.Layer): + def __init__(self, normalizer, model): + super().__init__() + self.normalizer = normalizer + self.acoustic_model = model + + def forward(self, text, note, note_dur, is_slur, get_mel_fs2: bool=False): + normalized_mel = self.acoustic_model.inference( + text, + note=note, + note_dur=note_dur, + is_slur=is_slur, + get_mel_fs2=get_mel_fs2) + 
logmel = self.normalizer.inverse(normalized_mel) + return logmel diff --git a/paddlespeech/t2s/models/diffsinger/diffsinger_updater.py b/paddlespeech/t2s/models/diffsinger/diffsinger_updater.py index 1f5b15c7b..3a52c592d 100644 --- a/paddlespeech/t2s/models/diffsinger/diffsinger_updater.py +++ b/paddlespeech/t2s/models/diffsinger/diffsinger_updater.py @@ -13,17 +13,19 @@ # limitations under the License. import logging from pathlib import Path +from typing import Dict +import paddle from paddle import DataParallel from paddle import distributed as dist from paddle.io import DataLoader from paddle.nn import Layer from paddle.optimizer import Optimizer -from paddlespeech.t2s.models.diffsinger import DiffSingerLoss from paddlespeech.t2s.training.extensions.evaluator import StandardEvaluator from paddlespeech.t2s.training.reporter import report from paddlespeech.t2s.training.updaters.standard_updater import StandardUpdater +from paddlespeech.t2s.training.updaters.standard_updater import UpdaterState logging.basicConfig( format='%(asctime)s [%(levelname)s] [%(filename)s:%(lineno)d] %(message)s', @@ -36,27 +38,36 @@ class DiffSingerUpdater(StandardUpdater): def __init__( self, model: Layer, - optimizer: Optimizer, + optimizers: Dict[str, Optimizer], + criterions: Dict[str, Layer], dataloader: DataLoader, - init_state=None, - use_masking: bool=False, - spk_loss_scale: float=0.02, - use_weighted_masking: bool=False, - output_dir: Path=None, - enable_spk_cls: bool=False, ): - super().__init__(model, optimizer, dataloader, init_state=None) - - self.criterion = DiffSingerLoss( - use_masking=use_masking, - use_weighted_masking=use_weighted_masking, ) + fs2_train_start_steps: int=0, + ds_train_start_steps: int=160000, + output_dir: Path=None, ): + + super().__init__(model, optimizers, dataloader, init_state=None) + + self.optimizers = optimizers + self.optimizer_fs2: Optimizer = optimizers['fs2'] + self.optimizer_ds: Optimizer = optimizers['ds'] + + self.criterions = criterions + self.criterion_fs2 = criterions['fs2'] + self.criterion_ds = criterions['ds'] + + self.dataloader = dataloader + + self.fs2_train_start_steps = fs2_train_start_steps + self.ds_train_start_steps = ds_train_start_steps + + self.state = UpdaterState(iteration=0, epoch=0) + self.train_iterator = iter(self.dataloader) log_file = output_dir / 'worker_{}.log'.format(dist.get_rank()) self.filehandler = logging.FileHandler(str(log_file)) logger.addHandler(self.filehandler) self.logger = logger self.msg = "" - self.spk_loss_scale = spk_loss_scale - self.enable_spk_cls = enable_spk_cls def update_core(self, batch): self.msg = "Rank: {}, ".format(dist.get_rank()) @@ -68,24 +79,8 @@ class DiffSingerUpdater(StandardUpdater): if spk_emb is not None: spk_id = None - if type( - self.model - ) == DataParallel and self.model._layers.spk_num and self.model._layers.enable_speaker_classifier: - with self.model.no_sync(): - before_outs, after_outs, d_outs, p_outs, e_outs, ys, olens, spk_logits = self.model( - text=batch["text"], - note=batch["note"], - note_dur=batch["note_dur"], - is_slur=batch["is_slur"], - text_lengths=batch["text_lengths"], - speech=batch["speech"], - speech_lengths=batch["speech_lengths"], - durations=batch["durations"], - pitch=batch["pitch"], - energy=batch["energy"], - spk_id=spk_id, - spk_emb=spk_emb) - else: + # fastspeech2 + if self.state.iteration > self.fs2_train_start_steps and self.state.iteration < self.ds_train_start_steps: before_outs, after_outs, d_outs, p_outs, e_outs, ys, olens, spk_logits = self.model( 
text=batch["text"], note=batch["note"], @@ -98,74 +93,109 @@ class DiffSingerUpdater(StandardUpdater): pitch=batch["pitch"], energy=batch["energy"], spk_id=spk_id, - spk_emb=spk_emb) - - l1_loss, duration_loss, pitch_loss, energy_loss, speaker_loss = self.criterion( - after_outs=after_outs, - before_outs=before_outs, - d_outs=d_outs, - p_outs=p_outs, - e_outs=e_outs, - ys=ys, - ds=batch["durations"], - ps=batch["pitch"], - es=batch["energy"], - ilens=batch["text_lengths"], - olens=olens, - spk_logits=spk_logits, - spk_ids=spk_id, ) - - scaled_speaker_loss = self.spk_loss_scale * speaker_loss - loss = l1_loss + duration_loss + pitch_loss + energy_loss + scaled_speaker_loss - - optimizer = self.optimizer - optimizer.clear_grad() - loss.backward() - optimizer.step() - - report("train/loss", float(loss)) - report("train/l1_loss", float(l1_loss)) - report("train/duration_loss", float(duration_loss)) - report("train/pitch_loss", float(pitch_loss)) - report("train/energy_loss", float(energy_loss)) - if self.enable_spk_cls: - report("train/speaker_loss", float(speaker_loss)) - report("train/scaled_speaker_loss", float(scaled_speaker_loss)) - - losses_dict["l1_loss"] = float(l1_loss) - losses_dict["duration_loss"] = float(duration_loss) - losses_dict["pitch_loss"] = float(pitch_loss) - losses_dict["energy_loss"] = float(energy_loss) - losses_dict["energy_loss"] = float(energy_loss) - if self.enable_spk_cls: - losses_dict["speaker_loss"] = float(speaker_loss) - losses_dict["scaled_speaker_loss"] = float(scaled_speaker_loss) - losses_dict["loss"] = float(loss) - self.msg += ', '.join('{}: {:>.6f}'.format(k, v) - for k, v in losses_dict.items()) + spk_emb=spk_emb, + train_fs2=True, ) + + l1_loss_fs2, duration_loss, pitch_loss, energy_loss, speaker_loss = self.criterion_fs2( + after_outs=after_outs, + before_outs=before_outs, + d_outs=d_outs, + p_outs=p_outs, + e_outs=e_outs, + ys=ys, + ds=batch["durations"], + ps=batch["pitch"], + es=batch["energy"], + ilens=batch["text_lengths"], + olens=olens, + spk_logits=spk_logits, + spk_ids=spk_id, ) + + loss_fs2 = l1_loss_fs2 + duration_loss + pitch_loss + energy_loss + + self.optimizer_fs2.clear_grad() + loss_fs2.backward() + self.optimizer_fs2.step() + + report("train/loss_fs2", float(loss_fs2)) + report("train/l1_loss_fs2", float(l1_loss_fs2)) + report("train/duration_loss", float(duration_loss)) + report("train/pitch_loss", float(pitch_loss)) + report("train/energy_loss", float(energy_loss)) + + losses_dict["l1_loss_fs2"] = float(l1_loss_fs2) + losses_dict["duration_loss"] = float(duration_loss) + losses_dict["pitch_loss"] = float(pitch_loss) + losses_dict["energy_loss"] = float(energy_loss) + losses_dict["energy_loss"] = float(energy_loss) + + losses_dict["loss_fs2"] = float(loss_fs2) + self.msg += ', '.join('{}: {:>.6f}'.format(k, v) + for k, v in losses_dict.items()) + + if self.state.iteration > self.ds_train_start_steps: + for param in self.model._layers.fs2.parameters(): + param.trainable = False + + mel, mel_masks = self.model( + text=batch["text"], + note=batch["note"], + note_dur=batch["note_dur"], + is_slur=batch["is_slur"], + text_lengths=batch["text_lengths"], + speech=batch["speech"], + speech_lengths=batch["speech_lengths"], + durations=batch["durations"], + pitch=batch["pitch"], + energy=batch["energy"], + spk_id=spk_id, + spk_emb=spk_emb, + train_fs2=False, ) + + mel = mel.transpose((0, 2, 1)) + mel_masks = mel_masks.transpose((0, 2, 1)) + l1_loss_ds = self.criterion_ds( + ref_mels=batch["speech"], + out_mels=mel, + mel_masks=mel_masks, ) + 
+ loss_ds = l1_loss_ds + + self.optimizer_ds.clear_grad() + loss_ds.backward() + self.optimizer_ds.step() + + report("train/loss_ds", float(loss_ds)) + report("train/l1_loss_ds", float(l1_loss_ds)) + losses_dict["l1_loss_ds"] = float(l1_loss_ds) + losses_dict["loss_ds"] = float(loss_ds) + self.msg += ', '.join('{}: {:>.6f}'.format(k, v) + for k, v in losses_dict.items()) + + self.logger.info(self.msg) class DiffSingerEvaluator(StandardEvaluator): - def __init__(self, - model: Layer, - dataloader: DataLoader, - use_masking: bool=False, - use_weighted_masking: bool=False, - spk_loss_scale: float=0.02, - output_dir: Path=None, - enable_spk_cls: bool=False): + def __init__( + self, + model: Layer, + criterions: Dict[str, Layer], + dataloader: DataLoader, + output_dir: Path=None, ): super().__init__(model, dataloader) + self.model = model + + self.criterions = criterions + self.criterion_fs2 = criterions['fs2'] + self.criterion_ds = criterions['ds'] + + self.dataloader = dataloader log_file = output_dir / 'worker_{}.log'.format(dist.get_rank()) self.filehandler = logging.FileHandler(str(log_file)) logger.addHandler(self.filehandler) self.logger = logger self.msg = "" - self.spk_loss_scale = spk_loss_scale - self.enable_spk_cls = enable_spk_cls - - self.criterion = DiffSingerLoss( - use_masking=use_masking, use_weighted_masking=use_weighted_masking) def evaluate_core(self, batch): self.msg = "Evaluate: " @@ -176,73 +206,35 @@ class DiffSingerEvaluator(StandardEvaluator): if spk_emb is not None: spk_id = None - if type( - self.model - ) == DataParallel and self.model._layers.spk_num and self.model._layers.enable_speaker_classifier: - with self.model.no_sync(): - before_outs, after_outs, d_outs, p_outs, e_outs, ys, olens, spk_logits = self.model( - text=batch["text"], - note=batch["note"], - note_dur=batch["note_dur"], - is_slur=batch["is_slur"], - text_lengths=batch["text_lengths"], - speech=batch["speech"], - speech_lengths=batch["speech_lengths"], - durations=batch["durations"], - pitch=batch["pitch"], - energy=batch["energy"], - spk_id=spk_id, - spk_emb=spk_emb) - else: - before_outs, after_outs, d_outs, p_outs, e_outs, ys, olens, spk_logits = self.model( - text=batch["text"], - note=batch["note"], - note_dur=batch["note_dur"], - is_slur=batch["is_slur"], - text_lengths=batch["text_lengths"], - speech=batch["speech"], - speech_lengths=batch["speech_lengths"], - durations=batch["durations"], - pitch=batch["pitch"], - energy=batch["energy"], - spk_id=spk_id, - spk_emb=spk_emb) - - l1_loss, duration_loss, pitch_loss, energy_loss, speaker_loss = self.criterion( - after_outs=after_outs, - before_outs=before_outs, - d_outs=d_outs, - p_outs=p_outs, - e_outs=e_outs, - ys=ys, - ds=batch["durations"], - ps=batch["pitch"], - es=batch["energy"], - ilens=batch["text_lengths"], - olens=olens, - spk_logits=spk_logits, - spk_ids=spk_id, ) - - scaled_speaker_loss = self.spk_loss_scale * speaker_loss - loss = l1_loss + duration_loss + pitch_loss + energy_loss + scaled_speaker_loss - - report("eval/loss", float(loss)) - report("eval/l1_loss", float(l1_loss)) - report("eval/duration_loss", float(duration_loss)) - report("eval/pitch_loss", float(pitch_loss)) - report("eval/energy_loss", float(energy_loss)) - if self.enable_spk_cls: - report("train/speaker_loss", float(speaker_loss)) - report("train/scaled_speaker_loss", float(scaled_speaker_loss)) - - losses_dict["l1_loss"] = float(l1_loss) - losses_dict["duration_loss"] = float(duration_loss) - losses_dict["pitch_loss"] = float(pitch_loss) - 
losses_dict["energy_loss"] = float(energy_loss) - if self.enable_spk_cls: - losses_dict["speaker_loss"] = float(speaker_loss) - losses_dict["scaled_speaker_loss"] = float(scaled_speaker_loss) - losses_dict["loss"] = float(loss) + mel, mel_masks = self.model( + text=batch["text"], + note=batch["note"], + note_dur=batch["note_dur"], + is_slur=batch["is_slur"], + text_lengths=batch["text_lengths"], + speech=batch["speech"], + speech_lengths=batch["speech_lengths"], + durations=batch["durations"], + pitch=batch["pitch"], + energy=batch["energy"], + spk_id=spk_id, + spk_emb=spk_emb, + train_fs2=False, ) + + mel = mel.transpose((0, 2, 1)) + mel_masks = mel_masks.transpose((0, 2, 1)) + l1_loss_ds = self.criterion_ds( + ref_mels=batch["speech"], + out_mels=mel, + mel_masks=mel_masks, ) + + loss_ds = l1_loss_ds + + report("train/loss_ds", float(loss_ds)) + report("train/l1_loss_ds", float(l1_loss_ds)) + losses_dict["l1_loss_ds"] = float(l1_loss_ds) + losses_dict["loss_ds"] = float(loss_ds) self.msg += ', '.join('{}: {:>.6f}'.format(k, v) for k, v in losses_dict.items()) + self.logger.info(self.msg)