# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Modified from espnet(https://github.com/espnet/espnet)
"""DiffSinger related modules for paddle"""
from typing import Any
from typing import Dict
from typing import Tuple

import numpy as np
import paddle
from paddle import nn
from typeguard import check_argument_types

from paddlespeech.t2s.models.diffsinger.fastspeech2midi import FastSpeech2MIDI
from paddlespeech.t2s.modules.diffnet import DiffNet
from paddlespeech.t2s.modules.diffusion import GaussianDiffusion


class DiffSinger(nn.Layer):
    """DiffSinger module.

    This is a module of DiffSinger described in `DiffSinger: Singing Voice Synthesis via Shallow Diffusion Mechanism`._
    .. _`DiffSinger: Singing Voice Synthesis via Shallow Diffusion Mechanism`:
        https://arxiv.org/pdf/2105.02446.pdf

    Args:

    Returns:

    """

    def __init__(
            self,
            # min and max spec for stretching before diffusion
            spec_min: paddle.Tensor,
            spec_max: paddle.Tensor,
            # fastspeech2midi config
            idim: int,
            odim: int,
            use_energy_pred: bool=False,
            use_postnet: bool=False,
            # music score related 
            note_num: int=300,
            is_slur_num: int=2,
            fastspeech2_params: Dict[str, Any]={
                "adim": 256,
                "aheads": 2,
                "elayers": 4,
                "eunits": 1024,
                "dlayers": 4,
                "dunits": 1024,
                "positionwise_layer_type": "conv1d",
                "positionwise_conv_kernel_size": 1,
                "use_scaled_pos_enc": True,
                "use_batch_norm": True,
                "encoder_normalize_before": True,
                "decoder_normalize_before": True,
                "encoder_concat_after": False,
                "decoder_concat_after": False,
                "reduction_factor": 1,
                # for transformer
                "transformer_enc_dropout_rate": 0.1,
                "transformer_enc_positional_dropout_rate": 0.1,
                "transformer_enc_attn_dropout_rate": 0.1,
                "transformer_dec_dropout_rate": 0.1,
                "transformer_dec_positional_dropout_rate": 0.1,
                "transformer_dec_attn_dropout_rate": 0.1,
                "transformer_activation_type": "gelu",
                # duration predictor
                "duration_predictor_layers": 2,
                "duration_predictor_chans": 384,
                "duration_predictor_kernel_size": 3,
                "duration_predictor_dropout_rate": 0.1,
                # pitch predictor
                "use_pitch_embed": True,
                "pitch_predictor_layers": 2,
                "pitch_predictor_chans": 384,
                "pitch_predictor_kernel_size": 3,
                "pitch_predictor_dropout": 0.5,
                "pitch_embed_kernel_size": 9,
                "pitch_embed_dropout": 0.5,
                "stop_gradient_from_pitch_predictor": False,
                # energy predictor
                "use_energy_embed": False,
                "energy_predictor_layers": 2,
                "energy_predictor_chans": 384,
                "energy_predictor_kernel_size": 3,
                "energy_predictor_dropout": 0.5,
                "energy_embed_kernel_size": 9,
                "energy_embed_dropout": 0.5,
                "stop_gradient_from_energy_predictor": False,
                # postnet
                "postnet_layers": 5,
                "postnet_chans": 512,
                "postnet_filts": 5,
                "postnet_dropout_rate": 0.5,
                # spk emb
                "spk_num": None,
                "spk_embed_dim": None,
                "spk_embed_integration_type": "add",
                # training related
                "init_type": "xavier_uniform",
                "init_enc_alpha": 1.0,
                "init_dec_alpha": 1.0,
                # speaker classifier
                "enable_speaker_classifier": False,
                "hidden_sc_dim": 256,
            },
            # denoiser config
            denoiser_params: Dict[str, Any]={
                "in_channels": 80,
                "out_channels": 80,
                "kernel_size": 3,
                "layers": 20,
                "stacks": 5,
                "residual_channels": 256,
                "gate_channels": 512,
                "skip_channels": 256,
                "aux_channels": 256,
                "dropout": 0.,
                "bias": True,
                "use_weight_norm": False,
                "init_type": "kaiming_normal",
            },
            # diffusion config
            diffusion_params: Dict[str, Any]={
                "num_train_timesteps": 100,
                "beta_start": 0.0001,
                "beta_end": 0.06,
                "beta_schedule": "squaredcos_cap_v2",
                "num_max_timesteps": 60,
                "stretch": True,
            }, ):
        """Initialize DiffSinger module.

        Args:
            spec_min (paddle.Tensor): The minimum value of the feature(mel) to stretch before diffusion.
            spec_max (paddle.Tensor): The maximum value of the feature(mel) to stretch before diffusion.
            idim (int): Dimension of the inputs (Input vocabrary size.).
            odim (int): Dimension of the outputs (Acoustic feature dimension.).
            use_energy_pred (bool, optional): whether use energy predictor. Defaults False.
            use_postnet (bool, optional): whether use postnet. Defaults False.
            note_num (int, optional): The number of note. Defaults to 300.
            is_slur_num (int, optional): The number of slur. Defaults to 2.
            fastspeech2_params (Dict[str, Any]): Parameter dict for fastspeech2 module.
            denoiser_params (Dict[str, Any]): Parameter dict for dinoiser module.
            diffusion_params (Dict[str, Any]): Parameter dict for diffusion module.
        """
        assert check_argument_types()
        super().__init__()
        self.fs2 = FastSpeech2MIDI(
            idim=idim,
            odim=odim,
            fastspeech2_params=fastspeech2_params,
            note_num=note_num,
            is_slur_num=is_slur_num,
            use_energy_pred=use_energy_pred,
            use_postnet=use_postnet, )
        denoiser = DiffNet(**denoiser_params)
        self.diffusion = GaussianDiffusion(
            denoiser,
            **diffusion_params,
            min_values=spec_min,
            max_values=spec_max, )

    def forward(
            self,
            text: paddle.Tensor,
            note: paddle.Tensor,
            note_dur: paddle.Tensor,
            is_slur: paddle.Tensor,
            text_lengths: paddle.Tensor,
            speech: paddle.Tensor,
            speech_lengths: paddle.Tensor,
            durations: paddle.Tensor,
            pitch: paddle.Tensor,
            energy: paddle.Tensor,
            spk_emb: paddle.Tensor=None,
            spk_id: paddle.Tensor=None,
            only_train_fs2: bool=True,
    ) -> Tuple[paddle.Tensor, Dict[str, paddle.Tensor], paddle.Tensor]:
        """Calculate forward propagation.

        Args:
            text(Tensor(int64)): 
                Batch of padded token (phone) ids (B, Tmax).
            note(Tensor(int64)): 
                Batch of padded note (element in music score) ids (B, Tmax).
            note_dur(Tensor(float32)): 
                Batch of padded note durations in seconds (element in music score) (B, Tmax).
            is_slur(Tensor(int64)): 
                Batch of padded slur (element in music score) ids (B, Tmax).
            text_lengths(Tensor(int64)): 
                Batch of phone lengths of each input (B,).
            speech(Tensor[float32]): 
                Batch of padded target features (e.g. mel) (B, Lmax, odim).
            speech_lengths(Tensor(int64)): 
                Batch of the lengths of each target features (B,).
            durations(Tensor(int64)): 
                Batch of padded token durations in frame (B, Tmax).
            pitch(Tensor[float32]): 
                Batch of padded frame-averaged pitch (B, Lmax, 1).
            energy(Tensor[float32]): 
                Batch of padded frame-averaged energy (B, Lmax, 1).
            spk_emb(Tensor[float32], optional): 
                Batch of speaker embeddings (B, spk_embed_dim).
            spk_id(Tnesor[int64], optional(int64)): 
                Batch of speaker ids (B,)
            only_train_fs2(bool):
                Whether to train only the fastspeech2 module

        Returns:

        """
        # only train fastspeech2 module firstly
        before_outs, after_outs, d_outs, p_outs, e_outs, ys, olens, spk_logits = self.fs2(
            text=text,
            note=note,
            note_dur=note_dur,
            is_slur=is_slur,
            text_lengths=text_lengths,
            speech=speech,
            speech_lengths=speech_lengths,
            durations=durations,
            pitch=pitch,
            energy=energy,
            spk_id=spk_id,
            spk_emb=spk_emb)
        if only_train_fs2:
            return before_outs, after_outs, d_outs, p_outs, e_outs, ys, olens, spk_logits

        # get the encoder output from fastspeech2 as the condition of denoiser module
        cond_fs2, mel_masks = self.fs2.encoder_infer_batch(
            text=text,
            note=note,
            note_dur=note_dur,
            is_slur=is_slur,
            text_lengths=text_lengths,
            speech_lengths=speech_lengths,
            ds=durations,
            ps=pitch,
            es=energy)
        cond_fs2 = cond_fs2.transpose((0, 2, 1))

        # get the output(final mel) from diffusion module
        noise_pred, noise_target = self.diffusion(
            speech.transpose((0, 2, 1)), cond_fs2)
        return noise_pred, noise_target, mel_masks

    def inference(
            self,
            text: paddle.Tensor,
            note: paddle.Tensor,
            note_dur: paddle.Tensor,
            is_slur: paddle.Tensor,
            get_mel_fs2: bool=False, ):
        """Run inference

        Args:
            text(Tensor(int64)): 
                Batch of padded token (phone) ids (B, Tmax).
            note(Tensor(int64)): 
                Batch of padded note (element in music score) ids (B, Tmax).
            note_dur(Tensor(float32)): 
                Batch of padded note durations in seconds (element in music score) (B, Tmax).
            is_slur(Tensor(int64)): 
                Batch of padded slur (element in music score) ids (B, Tmax).
            get_mel_fs2 (bool, optional): . Defaults to False.
                Whether to get mel from fastspeech2 module.

        Returns:
            
        """
        mel_fs2, _, _, _ = self.fs2.inference(text, note, note_dur, is_slur)
        if get_mel_fs2:
            return mel_fs2
        mel_fs2 = mel_fs2.unsqueeze(0).transpose((0, 2, 1))
        cond_fs2 = self.fs2.encoder_infer(text, note, note_dur, is_slur)
        cond_fs2 = cond_fs2.transpose((0, 2, 1))
        noise = paddle.randn(mel_fs2.shape)
        mel = self.diffusion.inference(
            noise=noise,
            cond=cond_fs2,
            ref_x=mel_fs2,
            scheduler_type="ddpm",
            num_inference_steps=60)
        mel = mel.transpose((0, 2, 1))
        return mel[0]


class DiffSingerInference(nn.Layer):
    def __init__(self, normalizer, model):
        super().__init__()
        self.normalizer = normalizer
        self.acoustic_model = model

    def forward(self, text, note, note_dur, is_slur, get_mel_fs2: bool=False):
        """Calculate forward propagation.

        Args:
            text(Tensor(int64)): 
                Batch of padded token (phone) ids (B, Tmax).
            note(Tensor(int64)): 
                Batch of padded note (element in music score) ids (B, Tmax).
            note_dur(Tensor(float32)): 
                Batch of padded note durations in seconds (element in music score) (B, Tmax).
            is_slur(Tensor(int64)): 
                Batch of padded slur (element in music score) ids (B, Tmax).
            get_mel_fs2 (bool, optional): . Defaults to False.
                Whether to get mel from fastspeech2 module.

        Returns:
            logmel(Tensor(float32)): denorm logmel, [T, mel_bin]
        """
        normalized_mel = self.acoustic_model.inference(
            text=text,
            note=note,
            note_dur=note_dur,
            is_slur=is_slur,
            get_mel_fs2=get_mel_fs2)
        logmel = normalized_mel
        return logmel


class DiffusionLoss(nn.Layer):
    """Loss function module for Diffusion module on DiffSinger."""

    def __init__(self, use_masking: bool=True,
                 use_weighted_masking: bool=False):
        """Initialize feed-forward Transformer loss module.
        Args:
            use_masking (bool): 
                Whether to apply masking for padded part in loss calculation.
            use_weighted_masking (bool): 
                Whether to weighted masking in loss calculation.
        """
        assert check_argument_types()
        super().__init__()

        assert (use_masking != use_weighted_masking) or not use_masking
        self.use_masking = use_masking
        self.use_weighted_masking = use_weighted_masking

        # define criterions
        reduction = "none" if self.use_weighted_masking else "mean"
        self.l1_criterion = nn.L1Loss(reduction=reduction)

    def forward(
            self,
            noise_pred: paddle.Tensor,
            noise_target: paddle.Tensor,
            mel_masks: paddle.Tensor, ) -> paddle.Tensor:
        """Calculate forward propagation.

        Args:
            noise_pred(Tensor): 
                Batch of outputs predict noise (B, Lmax, odim).
            noise_target(Tensor):  
                Batch of target noise (B, Lmax, odim).
            mel_masks(Tensor): 
                Batch of mask of real mel (B, Lmax, 1).
        Returns:
        
        """
        # apply mask to remove padded part
        if self.use_masking:
            noise_pred = noise_pred.masked_select(
                mel_masks.broadcast_to(noise_pred.shape))
            noise_target = noise_target.masked_select(
                mel_masks.broadcast_to(noise_target.shape))

        # calculate loss
        l1_loss = self.l1_criterion(noise_pred, noise_target)

        # make weighted mask and apply it
        if self.use_weighted_masking:
            mel_masks = mel_masks.unsqueeze(-1)
            out_weights = mel_masks.cast(dtype=paddle.float32) / mel_masks.cast(
                dtype=paddle.float32).sum(
                    axis=1, keepdim=True)
            out_weights /= noise_target.shape[0] * noise_target.shape[2]

            # apply weight
            l1_loss = l1_loss.multiply(out_weights)
            l1_loss = l1_loss.masked_select(
                mel_masks.broadcast_to(l1_loss.shape)).sum()

        return l1_loss