diff --git a/paddlespeech/t2s/models/new_tacotron2/tacotron2.py b/paddlespeech/t2s/models/new_tacotron2/tacotron2.py
index c8ef956c..4804ffb4 100644
--- a/paddlespeech/t2s/models/new_tacotron2/tacotron2.py
+++ b/paddlespeech/t2s/models/new_tacotron2/tacotron2.py
@@ -324,7 +324,10 @@ class Tacotron2(nn.Layer):
             ys = ys[:, :max_out]
             labels = labels[:, :max_out]
             labels = paddle.scatter(labels, 1, (olens - 1).unsqueeze(1), 1.0)
-        return after_outs, before_outs, logits, ys, labels, olens, att_ws, ilens
+            olens_in = olens // self.reduction_factor
+        else:
+            olens_in = olens
+        return after_outs, before_outs, logits, ys, labels, olens, att_ws, olens_in
 
     def _forward(
             self,
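A note on the new `olens_in` return value (illustrative, not part of the patch): when the reduction factor r is greater than 1, the decoder emits r frames per step, so the guided-attention bookkeeping needs lengths in decoder steps rather than in frames. A minimal sketch with made-up lengths:

```python
import paddle

# Hypothetical values, for illustration only.
reduction_factor = 2
olens = paddle.to_tensor([100, 57, 64])   # target lengths in frames

if reduction_factor > 1:
    olens_in = olens // reduction_factor  # lengths in decoder steps: [50, 28, 32]
else:
    olens_in = olens
```

Returning `olens_in` from the model is what lets the updaters below drop the duplicated reduction-factor branch they used to carry.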
diff --git a/paddlespeech/t2s/models/new_tacotron2/tacotron2_updater.py b/paddlespeech/t2s/models/new_tacotron2/tacotron2_updater.py
index f1a2a50e..6d41702c 100644
--- a/paddlespeech/t2s/models/new_tacotron2/tacotron2_updater.py
+++ b/paddlespeech/t2s/models/new_tacotron2/tacotron2_updater.py
@@ -72,11 +72,10 @@ class Tacotron2Updater(StandardUpdater):
         # spk_id!=None in multiple spk fastspeech2
         spk_id = batch["spk_id"] if "spk_id" in batch else None
         spk_emb = batch["spk_emb"] if "spk_emb" in batch else None
-        # No explicit speaker identifier labels are used during voice cloning training.
         if spk_emb is not None:
             spk_id = None
 
-        after_outs, before_outs, logits, ys, labels, olens, att_ws, ilens = self.model(
+        after_outs, before_outs, logits, ys, labels, olens, att_ws, olens_in = self.model(
             text=batch["text"],
             text_lengths=batch["text_lengths"],
             speech=batch["speech"],
@@ -101,11 +100,8 @@ class Tacotron2Updater(StandardUpdater):
         if self.use_guided_attn_loss:
             # NOTE: length of output for auto-regressive
             # input will be changed when r > 1
-            if self.model.reduction_factor > 1:
-                olens_in = olens // self.model.reduction_factor
-            else:
-                olens_in = olens
-            attn_loss = self.attn_loss(att_ws, ilens, olens_in)
+            attn_loss = self.attn_loss(
+                att_ws=att_ws, ilens=batch["text_lengths"] + 1, olens=olens_in)
             loss = loss + attn_loss
 
         optimizer = self.optimizer
@@ -169,7 +165,7 @@ class Tacotron2Evaluator(StandardEvaluator):
         if spk_emb is not None:
             spk_id = None
 
-        after_outs, before_outs, logits, ys, labels, olens, att_ws, ilens = self.model(
+        after_outs, before_outs, logits, ys, labels, olens, att_ws, olens_in = self.model(
             text=batch["text"],
             text_lengths=batch["text_lengths"],
             speech=batch["speech"],
@@ -194,11 +190,8 @@ class Tacotron2Evaluator(StandardEvaluator):
         if self.use_guided_attn_loss:
             # NOTE: length of output for auto-regressive
             # input will be changed when r > 1
-            if self.model.reduction_factor > 1:
-                olens_in = olens // self.model.reduction_factor
-            else:
-                olens_in = olens
-            attn_loss = self.attn_loss(att_ws, ilens, olens_in)
+            attn_loss = self.attn_loss(
+                att_ws=att_ws, ilens=batch["text_lengths"] + 1, olens=olens_in)
             loss = loss + attn_loss
 
         report("eval/l1_loss", float(l1_loss))
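The updater now calls the loss with keyword arguments and uses `batch["text_lengths"] + 1` as the encoder-side lengths, presumably because the model appends an `<eos>` token to the text inside `forward`, so the attention's input axis is one longer than the raw text. A shape-only sketch of calling the relocated loss directly, with random attention weights and hypothetical lengths:

```python
import paddle
from paddlespeech.t2s.modules.losses import GuidedAttentionLoss

attn_loss_fn = GuidedAttentionLoss(sigma=0.4, alpha=1.0)
att_ws = paddle.rand([2, 6, 5])      # (B, T_max_out, T_max_in), not normalized here
ilens = paddle.to_tensor([5, 3])     # text_lengths + 1 (appended <eos>)
olens_in = paddle.to_tensor([6, 4])  # output lengths in decoder steps (olens // r)
attn_loss = attn_loss_fn(att_ws=att_ws, ilens=ilens, olens=olens_in)
```

The keyword-argument call makes it explicit which axis of `att_ws` each length describes.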
diff --git a/paddlespeech/t2s/models/transformer_tts/transformer_tts.py b/paddlespeech/t2s/models/transformer_tts/transformer_tts.py
index ae6d7365..ba1f33ea 100644
--- a/paddlespeech/t2s/models/transformer_tts/transformer_tts.py
+++ b/paddlespeech/t2s/models/transformer_tts/transformer_tts.py
@@ -447,12 +447,15 @@ class TransformerTTS(nn.Layer):
 
         # modifiy mod part of groundtruth
         if self.reduction_factor > 1:
-            olens = paddle.to_tensor(
-                [olen - olen % self.reduction_factor for olen in olens.numpy()])
+            olens = olens - olens % self.reduction_factor
             max_olen = max(olens)
             ys = ys[:, :max_olen]
             labels = labels[:, :max_olen]
             labels[:, -1] = 1.0  # make sure at least one frame has 1
+            olens_in = olens // self.reduction_factor
+        else:
+            olens_in = olens
+
         need_dict = {}
         need_dict['encoder'] = self.encoder
         need_dict['decoder'] = self.decoder
@@ -462,7 +465,7 @@ class TransformerTTS(nn.Layer):
             'num_layers_applied_guided_attn'] = self.num_layers_applied_guided_attn
         need_dict['use_scaled_pos_enc'] = self.use_scaled_pos_enc
 
-        return after_outs, before_outs, logits, ys, labels, olens, ilens, need_dict
+        return after_outs, before_outs, logits, ys, labels, olens, olens_in, need_dict
 
     def _forward(
             self,
@@ -488,8 +491,7 @@ class TransformerTTS(nn.Layer):
         # thin out frames for reduction factor (B, Lmax, odim) -> (B, Lmax//r, odim)
         if self.reduction_factor > 1:
             ys_in = ys[:, self.reduction_factor - 1::self.reduction_factor]
-            olens_in = olens.new(
-                [olen // self.reduction_factor for olen in olens])
+            olens_in = olens // self.reduction_factor
         else:
             ys_in, olens_in = ys, olens
 
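The hunks above replace per-element Python loops (through `olens.numpy()` and `olens.new(...)`) with plain tensor arithmetic. A small sketch of the equivalence, using hypothetical lengths:

```python
import paddle

reduction_factor = 3
olens = paddle.to_tensor([10, 7, 9])

# Old style: round-trip through numpy and a list comprehension.
trimmed_old = paddle.to_tensor(
    [olen - olen % reduction_factor for olen in olens.numpy()])

# New style: the same result with tensor ops, no host round-trip.
trimmed_new = olens - olens % reduction_factor   # [9, 6, 9]
olens_in = trimmed_new // reduction_factor       # [3, 2, 3]
```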
@@ -769,318 +771,3 @@ class TransformerTTSInference(nn.Layer):
         normalized_mel = self.acoustic_model.inference(text)[0]
         logmel = self.normalizer.inverse(normalized_mel)
         return logmel
-
-
-class TransformerTTSLoss(nn.Layer):
-    """Loss function module for Tacotron2."""
-
-    def __init__(self,
-                 use_masking=True,
-                 use_weighted_masking=False,
-                 bce_pos_weight=5.0):
-        """Initialize Tactoron2 loss module.
-
-        Parameters
-        ----------
-        use_masking : bool
-            Whether to apply masking for padded part in loss calculation.
-        use_weighted_masking : bool
-            Whether to apply weighted masking in loss calculation.
-        bce_pos_weight : float
-            Weight of positive sample of stop token.
-
-        """
-        super().__init__()
-        assert (use_masking != use_weighted_masking) or not use_masking
-        self.use_masking = use_masking
-        self.use_weighted_masking = use_weighted_masking
-
-        # define criterions
-        reduction = "none" if self.use_weighted_masking else "mean"
-        self.l1_criterion = nn.L1Loss(reduction=reduction)
-        self.mse_criterion = nn.MSELoss(reduction=reduction)
-        self.bce_criterion = nn.BCEWithLogitsLoss(
-            reduction=reduction, pos_weight=paddle.to_tensor(bce_pos_weight))
-
-    def forward(self, after_outs, before_outs, logits, ys, labels, olens):
-        """Calculate forward propagation.
-
-        Parameters
-        ----------
-        after_outs : Tensor
-            Batch of outputs after postnets (B, Lmax, odim).
-        before_outs : Tensor
-            Batch of outputs before postnets (B, Lmax, odim).
-        logits : Tensor
-            Batch of stop logits (B, Lmax).
-        ys : Tensor
-            Batch of padded target features (B, Lmax, odim).
-        labels : LongTensor
-            Batch of the sequences of stop token labels (B, Lmax).
-        olens : LongTensor
-            Batch of the lengths of each target (B,).
-
-        Returns
-        ----------
-        Tensor
-            L1 loss value.
-        Tensor
-            Mean square error loss value.
-        Tensor
-            Binary cross entropy loss value.
-
-        """
-        # make mask and apply it
-        if self.use_masking:
-            masks = make_non_pad_mask(olens).unsqueeze(-1)
-            ys = ys.masked_select(masks.broadcast_to(ys.shape))
-            after_outs = after_outs.masked_select(
-                masks.broadcast_to(after_outs.shape))
-            before_outs = before_outs.masked_select(
-                masks.broadcast_to(before_outs.shape))
-            # Operator slice does not have kernel for data_type[bool]
-            tmp_masks = paddle.cast(masks, dtype='int64')
-            tmp_masks = tmp_masks[:, :, 0]
-            tmp_masks = paddle.cast(tmp_masks, dtype='bool')
-            labels = labels.masked_select(tmp_masks.broadcast_to(labels.shape))
-            logits = logits.masked_select(tmp_masks.broadcast_to(logits.shape))
-
-        # calculate loss
-        l1_loss = self.l1_criterion(after_outs, ys) + self.l1_criterion(
-            before_outs, ys)
-        mse_loss = self.mse_criterion(after_outs, ys) + self.mse_criterion(
-            before_outs, ys)
-        bce_loss = self.bce_criterion(logits, labels)
-
-        # make weighted mask and apply it
-        if self.use_weighted_masking:
-            masks = make_non_pad_mask(olens).unsqueeze(-1)
-            weights = masks.float() / masks.sum(dim=1, keepdim=True).float()
-            out_weights = weights.div(ys.shape[0] * ys.shape[2])
-            logit_weights = weights.div(ys.shape[0])
-
-            # apply weight
-            l1_loss = l1_loss.multiply(out_weights)
-            l1_loss = l1_loss.masked_select(
-                masks.broadcast_to(l1_loss.shape)).sum()
-
-            mse_loss = mse_loss.multiply(out_weights)
-            mse_loss = mse_loss.masked_select(
-                masks.broadcast_to(mse_loss.shape)).sum()
-
-            bce_loss = bce_loss.multiply(logit_weights.squeeze(-1))
-            bce_loss = bce_loss.masked_select(
-                masks.squeeze(-1).broadcast_to(bce_loss.shape)).sum()
-
-        return l1_loss, mse_loss, bce_loss
-
-
-class GuidedAttentionLoss(nn.Layer):
-    """Guided attention loss function module.
-
-    This module calculates the guided attention loss described
-    in `Efficiently Trainable Text-to-Speech System Based
-    on Deep Convolutional Networks with Guided Attention`_,
-    which forces the attention to be diagonal.
-
-    .. _`Efficiently Trainable Text-to-Speech System
-        Based on Deep Convolutional Networks with Guided Attention`:
-        https://arxiv.org/abs/1710.08969
-
-    """
-
-    def __init__(self, sigma=0.4, alpha=1.0, reset_always=True):
-        """Initialize guided attention loss module.
-
-        Parameters
-        ----------
-        sigma : float, optional
-            Standard deviation to control how close attention to a diagonal.
-        alpha : float, optional
-            Scaling coefficient (lambda).
-        reset_always : bool, optional
-            Whether to always reset masks.
-
-        """
-        super(GuidedAttentionLoss, self).__init__()
-        self.sigma = sigma
-        self.alpha = alpha
-        self.reset_always = reset_always
-        self.guided_attn_masks = None
-        self.masks = None
-
-    def _reset_masks(self):
-        self.guided_attn_masks = None
-        self.masks = None
-
-    def forward(self, att_ws, ilens, olens):
-        """Calculate forward propagation.
-
-        Parameters
-        ----------
-        att_ws : Tensor
-            Batch of attention weights (B, T_max_out, T_max_in).
-        ilens : LongTensor
-            Batch of input lenghts (B,).
-        olens : LongTensor
-            Batch of output lenghts (B,).
-
-        Returns
-        ----------
-        Tensor
-            Guided attention loss value.
-
-        """
-        if self.guided_attn_masks is None:
-            self.guided_attn_masks = self._make_guided_attention_masks(ilens,
-                                                                       olens)
-        if self.masks is None:
-            self.masks = self._make_masks(ilens, olens)
-        losses = self.guided_attn_masks * att_ws
-        loss = paddle.mean(
-            losses.masked_select(self.masks.broadcast_to(losses.shape)))
-        if self.reset_always:
-            self._reset_masks()
-        return self.alpha * loss
-
-    def _make_guided_attention_masks(self, ilens, olens):
-        n_batches = len(ilens)
-        max_ilen = max(ilens)
-        max_olen = max(olens)
-        guided_attn_masks = paddle.zeros((n_batches, max_olen, max_ilen))
-
-        for idx, (ilen, olen) in enumerate(zip(ilens, olens)):
-
-            ilen = int(ilen)
-            olen = int(olen)
-            guided_attn_masks[idx, :olen, :
-                              ilen] = self._make_guided_attention_mask(
-                                  ilen, olen, self.sigma)
-        return guided_attn_masks
-
-    @staticmethod
-    def _make_guided_attention_mask(ilen, olen, sigma):
-        """Make guided attention mask.
-
-        Examples
-        ----------
-        >>> guided_attn_mask =_make_guided_attention(5, 5, 0.4)
-        >>> guided_attn_mask.shape
-        [5, 5]
-        >>> guided_attn_mask
-        tensor([[0.0000, 0.1175, 0.3935, 0.6753, 0.8647],
-                [0.1175, 0.0000, 0.1175, 0.3935, 0.6753],
-                [0.3935, 0.1175, 0.0000, 0.1175, 0.3935],
-                [0.6753, 0.3935, 0.1175, 0.0000, 0.1175],
-                [0.8647, 0.6753, 0.3935, 0.1175, 0.0000]])
-        >>> guided_attn_mask =_make_guided_attention(3, 6, 0.4)
-        >>> guided_attn_mask.shape
-        [6, 3]
-        >>> guided_attn_mask
-        tensor([[0.0000, 0.2934, 0.7506],
-                [0.0831, 0.0831, 0.5422],
-                [0.2934, 0.0000, 0.2934],
-                [0.5422, 0.0831, 0.0831],
-                [0.7506, 0.2934, 0.0000],
-                [0.8858, 0.5422, 0.0831]])
-
-        """
-        grid_x, grid_y = paddle.meshgrid(
-            paddle.arange(olen), paddle.arange(ilen))
-        grid_x = grid_x.cast(dtype=paddle.float32)
-        grid_y = grid_y.cast(dtype=paddle.float32)
-        return 1.0 - paddle.exp(-(
-            (grid_y / ilen - grid_x / olen)**2) / (2 * (sigma**2)))
-
-    @staticmethod
-    def _make_masks(ilens, olens):
-        """Make masks indicating non-padded part.
-
-        Parameters
-        ----------
-        ilens (LongTensor or List): Batch of lengths (B,).
-        olens (LongTensor or List): Batch of lengths (B,).
-
-        Returns
-        ----------
-        Tensor
-            Mask tensor indicating non-padded part.
-
-        Examples
-        ----------
-        >>> ilens, olens = [5, 2], [8, 5]
-        >>> _make_mask(ilens, olens)
-        tensor([[[1, 1, 1, 1, 1],
-                 [1, 1, 1, 1, 1],
-                 [1, 1, 1, 1, 1],
-                 [1, 1, 1, 1, 1],
-                 [1, 1, 1, 1, 1],
-                 [1, 1, 1, 1, 1],
-                 [1, 1, 1, 1, 1],
-                 [1, 1, 1, 1, 1]],
-
-                [[1, 1, 0, 0, 0],
-                 [1, 1, 0, 0, 0],
-                 [1, 1, 0, 0, 0],
-                 [1, 1, 0, 0, 0],
-                 [1, 1, 0, 0, 0],
-                 [0, 0, 0, 0, 0],
-                 [0, 0, 0, 0, 0],
-                 [0, 0, 0, 0, 0]]], dtype=paddle.uint8)
-
-        """
-        # (B, T_in)
-        in_masks = make_non_pad_mask(ilens)
-        # (B, T_out)
-        out_masks = make_non_pad_mask(olens)
-        # (B, T_out, T_in)
-
-        return paddle.logical_and(
-            out_masks.unsqueeze(-1), in_masks.unsqueeze(-2))
-
-
-class GuidedMultiHeadAttentionLoss(GuidedAttentionLoss):
-    """Guided attention loss function module for multi head attention.
-
-    Parameters
-    ----------
-    sigma : float, optional
-        Standard deviation to controlGuidedAttentionLoss
-        how close attention to a diagonal.
-    alpha : float, optional
-        Scaling coefficient (lambda).
-    reset_always : bool, optional
-        Whether to always reset masks.
-
-    """
-
-    def forward(self, att_ws, ilens, olens):
-        """Calculate forward propagation.
-
-        Parameters
-        ----------
-        att_ws : Tensor
-            Batch of multi head attention weights (B, H, T_max_out, T_max_in).
-        ilens : Tensor
-            Batch of input lenghts (B,).
-        olens : Tensor
-            Batch of output lenghts (B,).
-
-        Returns
-        ----------
-        Tensor
-            Guided attention loss value.
-
-        """
-        if self.guided_attn_masks is None:
-            self.guided_attn_masks = (
-                self._make_guided_attention_masks(ilens, olens).unsqueeze(1))
-        if self.masks is None:
-            self.masks = self._make_masks(ilens, olens).unsqueeze(1)
-        losses = self.guided_attn_masks * att_ws
-        loss = paddle.mean(
-            losses.masked_select(self.masks.broadcast_to(losses.shape)))
-        if self.reset_always:
-            self._reset_masks()
-
-        return self.alpha * loss
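The classes removed above are not dropped; they now live in `paddlespeech/t2s/modules/losses.py` (the last file in this patch), and the updater below switches to importing them from there, keeping `TransformerTTSLoss` as an alias of the shared `Tacotron2Loss`. A sketch of the new import path, mirroring the hunk that follows:

```python
# Loss classes are now exposed from paddlespeech.t2s.modules.losses;
# this is how the TransformerTTS updater imports them after this patch.
from paddlespeech.t2s.modules.losses import GuidedMultiHeadAttentionLoss
from paddlespeech.t2s.modules.losses import Tacotron2Loss as TransformerTTSLoss
```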
diff --git a/paddlespeech/t2s/models/transformer_tts/transformer_tts_updater.py b/paddlespeech/t2s/models/transformer_tts/transformer_tts_updater.py
index 6022567e..bcc454c0 100644
--- a/paddlespeech/t2s/models/transformer_tts/transformer_tts_updater.py
+++ b/paddlespeech/t2s/models/transformer_tts/transformer_tts_updater.py
@@ -17,8 +17,8 @@ from typing import Sequence
 
 import paddle
 from paddle import distributed as dist
-from paddlespeech.t2s.models.transformer_tts import GuidedMultiHeadAttentionLoss
-from paddlespeech.t2s.models.transformer_tts import TransformerTTSLoss
+from paddlespeech.t2s.modules.losses import GuidedMultiHeadAttentionLoss
+from paddlespeech.t2s.modules.losses import Tacotron2Loss as TransformerTTSLoss
 from paddlespeech.t2s.training.extensions.evaluator import StandardEvaluator
 from paddlespeech.t2s.training.reporter import report
 from paddlespeech.t2s.training.updaters.standard_updater import StandardUpdater
@@ -71,7 +71,7 @@ class TransformerTTSUpdater(StandardUpdater):
         self.msg = "Rank: {}, ".format(dist.get_rank())
         losses_dict = {}
 
-        after_outs, before_outs, logits, ys, labels, olens, ilens, need_dict = self.model(
+        after_outs, before_outs, logits, ys, labels, olens, olens_in, need_dict = self.model(
             text=batch["text"],
             text_lengths=batch["text_lengths"],
             speech=batch["speech"],
@@ -116,7 +116,10 @@ class TransformerTTSUpdater(StandardUpdater):
                     break
             # (B, H*L, T_in, T_in)
             att_ws = paddle.concat(att_ws, axis=1)
-            enc_attn_loss = self.attn_criterion(att_ws, ilens, ilens)
+            enc_attn_loss = self.attn_criterion(
+                att_ws=att_ws,
+                ilens=batch["text_lengths"] + 1,
+                olens=batch["text_lengths"] + 1)
             loss = loss + enc_attn_loss
             report("train/enc_attn_loss", float(enc_attn_loss))
             losses_dict["enc_attn_loss"] = float(enc_attn_loss)
@@ -133,7 +136,8 @@ class TransformerTTSUpdater(StandardUpdater):
                     break
             # (B, H*L, T_out, T_out)
             att_ws = paddle.concat(att_ws, axis=1)
-            dec_attn_loss = self.attn_criterion(att_ws, olens, olens)
+            dec_attn_loss = self.attn_criterion(
+                att_ws=att_ws, ilens=olens_in, olens=olens_in)
             report("train/dec_attn_loss", float(dec_attn_loss))
             losses_dict["dec_attn_loss"] = float(dec_attn_loss)
             loss = loss + dec_attn_loss
@@ -150,7 +154,10 @@ class TransformerTTSUpdater(StandardUpdater):
                     break
             # (B, H*L, T_out, T_in)
             att_ws = paddle.concat(att_ws, axis=1)
-            enc_dec_attn_loss = self.attn_criterion(att_ws, ilens, olens)
+            enc_dec_attn_loss = self.attn_criterion(
+                att_ws=att_ws,
+                ilens=batch["text_lengths"] + 1,
+                olens=olens_in)
             report("train/enc_dec_attn_loss", float(enc_dec_attn_loss))
             losses_dict["enc_dec_attn_loss"] = float(enc_dec_attn_loss)
             loss = loss + enc_dec_attn_loss
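For the guided attention terms, the updater concatenates the per-layer attention maps along the head axis before calling the criterion, so `att_ws` is 4-D. A shape-only sketch of the enc-dec case with random weights and hypothetical lengths (real training uses the softmax attention maps collected from the network):

```python
import paddle
from paddlespeech.t2s.modules.losses import GuidedMultiHeadAttentionLoss

attn_criterion = GuidedMultiHeadAttentionLoss(sigma=0.4, alpha=1.0)
# 2 utterances, H*L = 4 concatenated heads/layers, 6 decoder steps, 5 text tokens.
att_ws = paddle.rand([2, 4, 6, 5])    # (B, H*L, T_max_out, T_max_in)
ilens = paddle.to_tensor([5, 3])      # batch["text_lengths"] + 1
olens_in = paddle.to_tensor([6, 4])   # output lengths in decoder steps
enc_dec_attn_loss = attn_criterion(att_ws=att_ws, ilens=ilens, olens=olens_in)
```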
@@ -215,7 +222,7 @@ class TransformerTTSEvaluator(StandardEvaluator):
     def evaluate_core(self, batch):
         self.msg = "Evaluate: "
         losses_dict = {}
-        after_outs, before_outs, logits, ys, labels, olens, ilens, need_dict = self.model(
+        after_outs, before_outs, logits, ys, labels, olens, olens_in, need_dict = self.model(
             text=batch["text"],
             text_lengths=batch["text_lengths"],
             speech=batch["speech"],
@@ -260,7 +267,10 @@ class TransformerTTSEvaluator(StandardEvaluator):
                     break
             # (B, H*L, T_in, T_in)
             att_ws = paddle.concat(att_ws, axis=1)
-            enc_attn_loss = self.attn_criterion(att_ws, ilens, ilens)
+            enc_attn_loss = self.attn_criterion(
+                att_ws=att_ws,
+                ilens=batch["text_lengths"] + 1,
+                olens=batch["text_lengths"] + 1)
             loss = loss + enc_attn_loss
             report("train/enc_attn_loss", float(enc_attn_loss))
             losses_dict["enc_attn_loss"] = float(enc_attn_loss)
@@ -277,7 +287,8 @@ class TransformerTTSEvaluator(StandardEvaluator):
                     break
             # (B, H*L, T_out, T_out)
             att_ws = paddle.concat(att_ws, axis=1)
-            dec_attn_loss = self.attn_criterion(att_ws, olens, olens)
+            dec_attn_loss = self.attn_criterion(
+                att_ws=att_ws, ilens=olens_in, olens=olens_in)
             report("eval/dec_attn_loss", float(dec_attn_loss))
             losses_dict["dec_attn_loss"] = float(dec_attn_loss)
             loss = loss + dec_attn_loss
@@ -295,7 +306,10 @@ class TransformerTTSEvaluator(StandardEvaluator):
                     break
             # (B, H*L, T_out, T_in)
             att_ws = paddle.concat(att_ws, axis=1)
-            enc_dec_attn_loss = self.attn_criterion(att_ws, ilens, olens)
+            enc_dec_attn_loss = self.attn_criterion(
+                att_ws=att_ws,
+                ilens=batch["text_lengths"] + 1,
+                olens=olens_in)
             report("eval/enc_dec_attn_loss", float(enc_dec_attn_loss))
             losses_dict["enc_dec_attn_loss"] = float(enc_dec_attn_loss)
             loss = loss + enc_dec_attn_loss
diff --git a/paddlespeech/t2s/modules/losses.py b/paddlespeech/t2s/modules/losses.py
index 781ac792..044a52e5 100644
--- a/paddlespeech/t2s/modules/losses.py
+++ b/paddlespeech/t2s/modules/losses.py
@@ -26,26 +26,30 @@ from paddlespeech.t2s.modules.nets_utils import make_non_pad_mask
 # Loss for new Tacotron2
 class GuidedAttentionLoss(nn.Layer):
     """Guided attention loss function module.
+
     This module calculates the guided attention loss described
     in `Efficiently Trainable Text-to-Speech System Based
     on Deep Convolutional Networks with Guided Attention`_,
     which forces the attention to be diagonal.
+
     .. _`Efficiently Trainable Text-to-Speech System
         Based on Deep Convolutional Networks with Guided Attention`:
         https://arxiv.org/abs/1710.08969
+
     """
     def __init__(self, sigma=0.4, alpha=1.0, reset_always=True):
         """Initialize guided attention loss module.
+
         Parameters
         ----------
         sigma : float, optional
-            Standard deviation to control
-            how close attention to a diagonal.
+            Standard deviation to control how close attention to a diagonal.
         alpha : float, optional
             Scaling coefficient (lambda).
         reset_always : bool, optional
             Whether to always reset masks.
+
         """
         super().__init__()
         self.sigma = sigma
@@ -60,18 +64,21 @@ class GuidedAttentionLoss(nn.Layer):
 
     def forward(self, att_ws, ilens, olens):
         """Calculate forward propagation.
+
         Parameters
         ----------
         att_ws : Tensor
             Batch of attention weights (B, T_max_out, T_max_in).
         ilens : Tensor(int64)
             Batch of input lengths (B,).
         olens : Tensor(int64)
             Batch of output lengths (B,).
+
         Returns
         ----------
         Tensor
             Guided attention loss value.
+
         """
         if self.guided_attn_masks is None:
             self.guided_attn_masks = self._make_guided_attention_masks(ilens,
@@ -79,7 +86,8 @@ class GuidedAttentionLoss(nn.Layer):
         if self.masks is None:
             self.masks = self._make_masks(ilens, olens)
         losses = self.guided_attn_masks * att_ws
-        loss = paddle.mean(losses.masked_select(self.masks))
+        loss = paddle.mean(
+            losses.masked_select(self.masks.broadcast_to(losses.shape)))
         if self.reset_always:
             self._reset_masks()
         return self.alpha * loss
@@ -89,6 +97,7 @@ class GuidedAttentionLoss(nn.Layer):
         max_ilen = max(ilens)
         max_olen = max(olens)
         guided_attn_masks = paddle.zeros((n_batches, max_olen, max_ilen))
+
         for idx, (ilen, olen) in enumerate(zip(ilens, olens)):
             guided_attn_masks[idx, :olen, :
                               ilen] = self._make_guided_attention_mask(
@@ -98,11 +107,12 @@ class GuidedAttentionLoss(nn.Layer):
     @staticmethod
     def _make_guided_attention_mask(ilen, olen, sigma):
         """Make guided attention mask.
-        Parameters
+
+        Examples
         ----------
         >>> guided_attn_mask =_make_guided_attention(5, 5, 0.4)
         >>> guided_attn_mask.shape
-        Size([5, 5])
+        [5, 5]
         >>> guided_attn_mask
         tensor([[0.0000, 0.1175, 0.3935, 0.6753, 0.8647],
                 [0.1175, 0.0000, 0.1175, 0.3935, 0.6753],
@@ -111,7 +121,7 @@ class GuidedAttentionLoss(nn.Layer):
                 [0.8647, 0.6753, 0.3935, 0.1175, 0.0000]])
         >>> guided_attn_mask =_make_guided_attention(3, 6, 0.4)
         >>> guided_attn_mask.shape
-        Size([6, 3])
+        [6, 3]
         >>> guided_attn_mask
         tensor([[0.0000, 0.2934, 0.7506],
                 [0.0831, 0.0831, 0.5422],
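For reference, the penalty matrix built by `_make_guided_attention_mask` is `W[t, n] = 1 - exp(-((n / N - t / T) ** 2) / (2 * sigma ** 2))`: positions far from the input/output diagonal get weights close to 1, so attention mass placed there is penalised. A standalone sketch that reproduces the 5x5 example from the docstring above:

```python
import paddle


def make_guided_attention_mask(ilen, olen, sigma=0.4):
    # Same construction as the staticmethod above: grid_x indexes decoder
    # steps, grid_y indexes encoder positions.
    grid_x, grid_y = paddle.meshgrid(paddle.arange(olen), paddle.arange(ilen))
    grid_x = grid_x.cast(dtype=paddle.float32)
    grid_y = grid_y.cast(dtype=paddle.float32)
    return 1.0 - paddle.exp(-(
        (grid_y / ilen - grid_x / olen)**2) / (2 * (sigma**2)))


mask = make_guided_attention_mask(5, 5)
# mask[0, 1] ~ 0.1175, mask[0, 4] ~ 0.8647, diagonal entries are exactly 0.
```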
@@ -119,55 +129,109 @@ class GuidedAttentionLoss(nn.Layer):
                 [0.5422, 0.0831, 0.0831],
                 [0.7506, 0.2934, 0.0000],
                 [0.8858, 0.5422, 0.0831]])
+
         """
         grid_x, grid_y = paddle.meshgrid(
             paddle.arange(olen), paddle.arange(ilen))
-        grid_x = paddle.cast(grid_x, dtype='float32')
-        grid_y = paddle.cast(grid_y, dtype='float32')
-
+        grid_x = grid_x.cast(dtype=paddle.float32)
+        grid_y = grid_y.cast(dtype=paddle.float32)
         return 1.0 - paddle.exp(-(
             (grid_y / ilen - grid_x / olen)**2) / (2 * (sigma**2)))
 
     @staticmethod
     def _make_masks(ilens, olens):
         """Make masks indicating non-padded part.
-        Examples
+
+        Parameters
         ----------
         ilens : Tensor(int64) or List
             Batch of lengths (B,).
         olens : Tensor(int64) or List
             Batch of lengths (B,).
+
         Returns
         ----------
         Tensor
             Mask tensor indicating non-padded part.
+
         Examples
         ----------
         >>> ilens, olens = [5, 2], [8, 5]
         >>> _make_mask(ilens, olens)
         tensor([[[1, 1, 1, 1, 1],
-                [1, 1, 1, 1, 1],
-                [1, 1, 1, 1, 1],
-                [1, 1, 1, 1, 1],
-                [1, 1, 1, 1, 1],
-                [1, 1, 1, 1, 1],
-                [1, 1, 1, 1, 1],
-                [1, 1, 1, 1, 1]],
+                 [1, 1, 1, 1, 1],
+                 [1, 1, 1, 1, 1],
+                 [1, 1, 1, 1, 1],
+                 [1, 1, 1, 1, 1],
+                 [1, 1, 1, 1, 1],
+                 [1, 1, 1, 1, 1],
+                 [1, 1, 1, 1, 1]],
+
                 [[1, 1, 0, 0, 0],
-                [1, 1, 0, 0, 0],
-                [1, 1, 0, 0, 0],
-                [1, 1, 0, 0, 0],
-                [1, 1, 0, 0, 0],
-                [0, 0, 0, 0, 0],
-                [0, 0, 0, 0, 0],
-                [0, 0, 0, 0, 0]]],)
+                 [1, 1, 0, 0, 0],
+                 [1, 1, 0, 0, 0],
+                 [1, 1, 0, 0, 0],
+                 [1, 1, 0, 0, 0],
+                 [0, 0, 0, 0, 0],
+                 [0, 0, 0, 0, 0],
+                 [0, 0, 0, 0, 0]]], dtype=paddle.uint8)
+
         """
         # (B, T_in)
         in_masks = make_non_pad_mask(ilens)
         # (B, T_out)
         out_masks = make_non_pad_mask(olens)
         # (B, T_out, T_in)
-        return out_masks.unsqueeze(-1) & in_masks.unsqueeze(-2)
+
+        return paddle.logical_and(
+            out_masks.unsqueeze(-1), in_masks.unsqueeze(-2))
+
+
+class GuidedMultiHeadAttentionLoss(GuidedAttentionLoss):
+    """Guided attention loss function module for multi head attention.
+
+    Parameters
+    ----------
+    sigma : float, optional
+        Standard deviation to control
+        how close attention to a diagonal.
+    alpha : float, optional
+        Scaling coefficient (lambda).
+    reset_always : bool, optional
+        Whether to always reset masks.
+
+    """
+
+    def forward(self, att_ws, ilens, olens):
+        """Calculate forward propagation.
+
+        Parameters
+        ----------
+        att_ws : Tensor
+            Batch of multi head attention weights (B, H, T_max_out, T_max_in).
+        ilens : Tensor
+            Batch of input lengths (B,).
+        olens : Tensor
+            Batch of output lengths (B,).
+
+        Returns
+        ----------
+        Tensor
+            Guided attention loss value.
+
+        """
+        if self.guided_attn_masks is None:
+            self.guided_attn_masks = (
+                self._make_guided_attention_masks(ilens, olens).unsqueeze(1))
+        if self.masks is None:
+            self.masks = self._make_masks(ilens, olens).unsqueeze(1)
+        losses = self.guided_attn_masks * att_ws
+        loss = paddle.mean(
+            losses.masked_select(self.masks.broadcast_to(losses.shape)))
+        if self.reset_always:
+            self._reset_masks()
+
+        return self.alpha * loss
 
 
 class Tacotron2Loss(nn.Layer):