add multi-band melgan finetune scripts

4 years ago · a6ac497f8e
parent dc7daa2a61
commit a6ac497f8e
7 changed files with 586 additions and 123 deletions
--- a/demos/style_fs2/style_syn.py
+++ b/demos/style_fs2/style_syn.py
@ -13,7 +13,6 @@
 # limitations under the License.
 import argparse
 from pathlib import Path
 from typing import Union
 import numpy as np
 import paddle
@ -23,129 +22,12 @@ from yacs.config import CfgNode
 from paddlespeech.t2s.frontend.zh_frontend import Frontend
 from paddlespeech.t2s.models.fastspeech2 import FastSpeech2
-from paddlespeech.t2s.models.fastspeech2 import FastSpeech2Inference
+from paddlespeech.t2s.models.fastspeech2 import StyleFastSpeech2Inference
 from paddlespeech.t2s.models.parallel_wavegan import PWGGenerator
 from paddlespeech.t2s.models.parallel_wavegan import PWGInference
 from paddlespeech.t2s.modules.normalizer import ZScore
 class StyleFastSpeech2Inference(FastSpeech2Inference):
    def __init__(self, normalizer, model, pitch_stats_path, energy_stats_path):
        super().__init__(normalizer, model)
        pitch_mean, pitch_std = np.load(pitch_stats_path)
        self.pitch_mean = paddle.to_tensor(pitch_mean)
        self.pitch_std = paddle.to_tensor(pitch_std)
        energy_mean, energy_std = np.load(energy_stats_path)
        self.energy_mean = paddle.to_tensor(energy_mean)
        self.energy_std = paddle.to_tensor(energy_std)
    def denorm(self, data, mean, std):
        return data * std + mean
    def norm(self, data, mean, std):
        return (data - mean) / std
    def forward(self,
                text: paddle.Tensor,
                durations: Union[paddle.Tensor, np.ndarray]=None,
                durations_scale: Union[int, float]=None,
                durations_bias: Union[int, float]=None,
                pitch: Union[paddle.Tensor, np.ndarray]=None,
                pitch_scale: Union[int, float]=None,
                pitch_bias: Union[int, float]=None,
                energy: Union[paddle.Tensor, np.ndarray]=None,
                energy_scale: Union[int, float]=None,
                energy_bias: Union[int, float]=None,
                robot: bool=False):
        """
        Parameters
        ----------
        text : Tensor(int64)
            Input sequence of characters (T,).
        speech : Tensor, optional
            Feature sequence to extract style (N, idim).
        durations : paddle.Tensor/np.ndarray, optional (int64)
            Groundtruth of duration (T,), this will overwrite the set of durations_scale and durations_bias
        durations_scale: int/float, optional
        durations_bias: int/float, optional
        pitch : paddle.Tensor/np.ndarray, optional
            Groundtruth of token-averaged pitch (T, 1), this will overwrite the set of pitch_scale and pitch_bias
        pitch_scale: int/float, optional
            In denormed HZ domain.
        pitch_bias: int/float, optional
            In denormed HZ domain.
        energy : paddle.Tensor/np.ndarray, optional
            Groundtruth of token-averaged energy (T, 1), this will overwrite the set of energy_scale and energy_bias
        energy_scale: int/float, optional
            In denormed domain.
        energy_bias: int/float, optional
            In denormed domain.
        robot : bool, optional
            Weather output robot style
        Returns
        ----------
        Tensor
            Output sequence of features (L, odim).
        """
        normalized_mel, d_outs, p_outs, e_outs = self.acoustic_model.inference(
            text, durations=None, pitch=None, energy=None)
        # priority: groundtruth > scale/bias > previous output
        # set durations
        if isinstance(durations, np.ndarray):
            durations = paddle.to_tensor(durations)
        elif isinstance(durations, paddle.Tensor):
            durations = durations
        elif durations_scale or durations_bias:
            durations_scale = durations_scale if durations_scale is not None else 1
            durations_bias = durations_bias if durations_bias is not None else 0
            durations = durations_scale * d_outs + durations_bias
        else:
            durations = d_outs
        if robot:
            # set normed pitch to zeros have the same effect with set denormd ones to mean
            pitch = paddle.zeros(p_outs.shape)
        # set pitch, can overwrite robot set  
        if isinstance(pitch, np.ndarray):
            pitch = paddle.to_tensor(pitch)
        elif isinstance(pitch, paddle.Tensor):
            pitch = pitch
        elif pitch_scale or pitch_bias:
            pitch_scale = pitch_scale if pitch_scale is not None else 1
            pitch_bias = pitch_bias if pitch_bias is not None else 0
            p_Hz = paddle.exp(
                self.denorm(p_outs, self.pitch_mean, self.pitch_std))
            p_HZ = pitch_scale * p_Hz + pitch_bias
            pitch = self.norm(paddle.log(p_HZ), self.pitch_mean, self.pitch_std)
        else:
            pitch = p_outs
        # set energy
        if isinstance(energy, np.ndarray):
            energy = paddle.to_tensor(energy)
        elif isinstance(energy, paddle.Tensor):
            energy = energy
        elif energy_scale or energy_bias:
            energy_scale = energy_scale if energy_scale is not None else 1
            energy_bias = energy_bias if energy_bias is not None else 0
            e_dnorm = self.denorm(e_outs, self.energy_mean, self.energy_std)
            e_dnorm = energy_scale * e_dnorm + energy_bias
            energy = self.norm(e_dnorm, self.energy_mean, self.energy_std)
        else:
            energy = e_outs
        normalized_mel, d_outs, p_outs, e_outs = self.acoustic_model.inference(
            text,
            durations=durations,
            pitch=pitch,
            energy=energy,
            use_teacher_forcing=True)
        logmel = self.normalizer.inverse(normalized_mel)
        return logmel
 def evaluate(args, fastspeech2_config, pwg_config):
    # construct dataset for evaluation
--- a/examples/csmsc/voc3/conf/finetune.yaml
+++ b/examples/csmsc/voc3/conf/finetune.yaml
@ -0,0 +1,139 @@
 # This is the hyperparameter configuration file for MelGAN.
 # Please make sure this is adjusted for the CSMSC dataset. If you want to
 # apply to the other dataset, you might need to carefully change some parameters.
 # This configuration requires ~ 8GB memory and will finish within 7 days on Titan V.
 # This configuration is based on full-band MelGAN but the hop size and sampling
 # rate is different from the paper (16kHz vs 24kHz). The number of iteraions
 # is not shown in the paper so currently we train 1M iterations (not sure enough
 # to converge). The optimizer setting is based on @dathudeptrai advice.
 # https://github.com/kan-bayashi/ParallelWaveGAN/issues/143#issuecomment-632539906
 ###########################################################
 #                FEATURE EXTRACTION SETTING               #
 ###########################################################
 fs: 24000                # Sampling rate.
 n_fft: 2048              # FFT size. (in samples)
 n_shift: 300             # Hop size. (in samples)
 win_length: 1200         # Window length. (in samples)
                         # If set to null, it will be the same as fft_size.
 window: "hann"           # Window function.
 n_mels: 80               # Number of mel basis.
 fmin: 80                 # Minimum freq in mel basis calculation. (Hz)
 fmax: 7600               # Maximum frequency in mel basis calculation. (Hz)
 ###########################################################
 #         GENERATOR NETWORK ARCHITECTURE SETTING          #
 ###########################################################
 generator_params:
    in_channels: 80               # Number of input channels.
    out_channels: 4               # Number of output channels.
    kernel_size: 7                # Kernel size of initial and final conv layers.
    channels: 384                 # Initial number of channels for conv layers.
    upsample_scales: [5, 5, 3]    # List of Upsampling scales.
    stack_kernel_size: 3          # Kernel size of dilated conv layers in residual stack.
    stacks: 4                     # Number of stacks in a single residual stack module.
    use_weight_norm: True         # Whether to use weight normalization.
    use_causal_conv: False        # Whether to use causal convolution.
    use_final_nonlinear_activation: True
 ###########################################################
 #       DISCRIMINATOR NETWORK ARCHITECTURE SETTING        #
 ###########################################################
 discriminator_params:
    in_channels: 1                    # Number of input channels.
    out_channels: 1                   # Number of output channels.
    scales: 3                         # Number of multi-scales.
    downsample_pooling: "AvgPool1D"   # Pooling type for the input downsampling.
    downsample_pooling_params:        # Parameters of the above pooling function.
        kernel_size: 4
        stride: 2
        padding: 1
        exclusive: True
    kernel_sizes: [5, 3]              # List of kernel size.
    channels: 16                      # Number of channels of the initial conv layer.
    max_downsample_channels: 512      # Maximum number of channels of downsampling layers.
    downsample_scales: [4, 4, 4]      # List of downsampling scales.
    nonlinear_activation: "LeakyReLU" # Nonlinear activation function.
    nonlinear_activation_params:      # Parameters of nonlinear activation function.
        negative_slope: 0.2
    use_weight_norm: True             # Whether to use weight norm.
 ###########################################################
 #                   STFT LOSS SETTING                     #
 ###########################################################
 use_stft_loss: true
 stft_loss_params:
    fft_sizes: [1024, 2048, 512]  # List of FFT size for STFT-based loss.
    hop_sizes: [120, 240, 50]     # List of hop size for STFT-based loss
    win_lengths: [600, 1200, 240] # List of window length for STFT-based loss.
    window: "hann"                # Window function for STFT-based loss
 use_subband_stft_loss: true
 subband_stft_loss_params:
    fft_sizes: [384, 683, 171]  # List of FFT size for STFT-based loss.
    hop_sizes: [30, 60, 10]     # List of hop size for STFT-based loss
    win_lengths: [150, 300, 60] # List of window length for STFT-based loss.
    window: "hann"              # Window function for STFT-based loss
 ###########################################################
 #               ADVERSARIAL LOSS SETTING                  #
 ###########################################################
 use_feat_match_loss: false # Whether to use feature matching loss.
 lambda_adv: 2.5            # Loss balancing coefficient for adversarial loss.
 ###########################################################
 #                  DATA LOADER SETTING                    #
 ###########################################################
 batch_size: 64             # Batch size.
 batch_max_steps: 16200     # Length of each audio in batch. Make sure dividable by hop_size.
 num_workers: 2             # Number of workers in DataLoader.
 ###########################################################
 #             OPTIMIZER & SCHEDULER SETTING               #
 ###########################################################
 generator_optimizer_params:
    epsilon: 1.0e-7                     # Generator's epsilon.
    weight_decay: 0.0                   # Generator's weight decay coefficient.
 generator_grad_norm: -1                 # Generator's gradient norm.
 generator_scheduler_params:
    learning_rate: 1.0e-3               # Generator's learning rate.
    gamma: 0.5                          # Generator's scheduler gamma.
    milestones:                         # At each milestone, lr will be multiplied by gamma.
        - 100000
        - 200000
        - 300000
        - 400000
        - 500000
        - 600000
 discriminator_optimizer_params:
    epsilon: 1.0e-7                          # Discriminator's epsilon.
    weight_decay: 0.0                       # Discriminator's weight decay coefficient.
 discriminator_grad_norm: -1                 # Discriminator's gradient norm.
 discriminator_scheduler_params:
    learning_rate: 1.0e-3                   # Discriminator's learning rate.
    gamma: 0.5                              # Discriminator's scheduler gamma.
    milestones:                             # At each milestone, lr will be multiplied by gamma.
        - 100000
        - 200000
        - 300000
        - 400000
        - 500000
        - 600000
 ###########################################################
 #                    INTERVAL SETTING                     #
 ###########################################################
 discriminator_train_start_steps: 200000 # Number of steps to start to train discriminator.
 train_max_steps: 1200000                # Number of training steps.
 save_interval_steps: 1000              # Interval steps to save checkpoint.
 eval_interval_steps: 1000               # Interval steps to evaluate the network.
 ###########################################################
 #                     OTHER SETTING                       #
 ###########################################################
 num_snapshots: 10                 # max number of snapshots to keep while training
 seed: 42                          # random seed for paddle, random, and np.random
--- a/examples/csmsc/voc3/finetune.sh
+++ b/examples/csmsc/voc3/finetune.sh
@ -0,0 +1,65 @@
 #!/bin/bash
 source path.sh
 gpus=0
 stage=0
 stop_stage=100
 source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
 if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
  python3 ${MAIN_ROOT}/paddlespeech/t2s/exps/fastspeech2/gen_gt_duration_mel.py \
      --fastspeech2-config=fastspeech2_nosil_baker_ckpt_0.4/default.yaml \
      --fastspeech2-checkpoint=fastspeech2_nosil_baker_ckpt_0.4/snapshot_iter_76000.pdz \
      --fastspeech2-stat=fastspeech2_nosil_baker_ckpt_0.4/speech_stats.npy \
      --dur-file=durations.txt \
      --output-dir=dump_finetune \
      --phones-dict=fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt
 fi
 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
  python3 local/link_wav.py \
    --old-dump-dir=dump \
    --dump-dir=dump_finetune 
 fi
 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    # get features' stats(mean and std)
    echo "Get features' stats ..."
    python3 ${MAIN_ROOT}/utils/compute_statistics.py \
        --metadata=dump_finetune/train/raw/metadata.jsonl \
        --field-name="feats"
 fi
 if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    # normalize, dev and test should use train's stats
    echo "Normalize ..."
    python3 ${BIN_DIR}/../normalize.py \
        --metadata=dump_finetune/train/raw/metadata.jsonl \
        --dumpdir=dump_finetune/train/norm \
        --stats=dump_finetune/train/feats_stats.npy
    python3 ${BIN_DIR}/../normalize.py \
        --metadata=dump_finetune/dev/raw/metadata.jsonl \
        --dumpdir=dump_finetune/dev/norm \
        --stats=dump_finetune/train/feats_stats.npy
    python3 ${BIN_DIR}/../normalize.py \
        --metadata=dump_finetune/test/raw/metadata.jsonl \
        --dumpdir=dump_finetune/test/norm \
        --stats=dump_finetune/train/feats_stats.npy
 fi
 if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
  CUDA_VISIBLE_DEVICES=${gpus} \
  FLAGS_cudnn_exhaustive_search=true \
  FLAGS_conv_workspace_size_limit=4000 \
  python ${BIN_DIR}/train.py \
      --train-metadata=dump_finetune/train/norm/metadata.jsonl \
      --dev-metadata=dump_finetune/dev/norm/metadata.jsonl \
      --config=conf/finetune.yaml \
      --output-dir=exp/finetune \
      --ngpu=1
 fi 
--- a/examples/csmsc/voc3/local/link_wav.py
+++ b/examples/csmsc/voc3/local/link_wav.py
@ -0,0 +1,85 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import argparse
 import os
 from operator import itemgetter
 from pathlib import Path
 import jsonlines
 import numpy as np
 def main():
    # parse config and args
    parser = argparse.ArgumentParser(
        description="Preprocess audio and then extract features .")
    parser.add_argument(
        "--old-dump-dir",
        default=None,
        type=str,
        help="directory to dump feature files.")
    parser.add_argument(
        "--dump-dir",
        type=str,
        required=True,
        help="directory to finetune dump feature files.")
    args = parser.parse_args()
    old_dump_dir = Path(args.old_dump_dir).expanduser()
    old_dump_dir = old_dump_dir.resolve()
    dump_dir = Path(args.dump_dir).expanduser()
    # use absolute path
    dump_dir = dump_dir.resolve()
    dump_dir.mkdir(parents=True, exist_ok=True)
    assert old_dump_dir.is_dir()
    assert dump_dir.is_dir()
    for sub in ["train", "dev", "test"]:
        # 把 old_dump_dir 里面的 *-wave.npy 软连接到 dump_dir 的对应位置
        output_dir = dump_dir / sub
        output_dir.mkdir(parents=True, exist_ok=True)
        results = []
        for name in os.listdir(output_dir / "raw"):
            # 003918_feats.npy
            utt_id = name.split("_")[0]
            mel_path = output_dir / ("raw/" + name)
            gen_mel = np.load(mel_path)
            wave_name = utt_id + "_wave.npy"
            wav = np.load(old_dump_dir / sub / ("raw/" + wave_name))
            os.symlink(old_dump_dir / sub / ("raw/" + wave_name),
                       output_dir / ("raw/" + wave_name))
            num_sample = wav.shape[0]
            num_frames = gen_mel.shape[0]
            wav_path = output_dir / ("raw/" + wave_name)
            record = {
                "utt_id": utt_id,
                "num_samples": num_sample,
                "num_frames": num_frames,
                "feats": str(mel_path),
                "wave": str(wav_path),
            }
            results.append(record)
        results.sort(key=itemgetter("utt_id"))
        with jsonlines.open(output_dir / "raw/metadata.jsonl", 'w') as writer:
            for item in results:
                writer.write(item)
 if __name__ == "__main__":
    main()
--- a/paddlespeech/t2s/datasets/vocoder_batch_fn.py
+++ b/paddlespeech/t2s/datasets/vocoder_batch_fn.py
@ -110,10 +110,10 @@ class Clip(object):
        if len(x) < c.shape[0] * self.hop_size:
            x = np.pad(x, (0, c.shape[0] * self.hop_size - len(x)), mode="edge")
        elif len(x) > c.shape[0] * self.hop_size:
-            print(
+            # print(
-                f"wave length: ({len(x)}), mel length: ({c.shape[0]}), hop size: ({self.hop_size })"
+            #     f"wave length: ({len(x)}), mel length: ({c.shape[0]}), hop size: ({self.hop_size })"
-            )
+            # )
-            x = x[:c.shape[1] * self.hop_size]
+            x = x[:c.shape[0] * self.hop_size]
        # check the legnth is valid
        assert len(x) == c.shape[
--- a/paddlespeech/t2s/exps/fastspeech2/gen_gt_duration_mel.py
+++ b/paddlespeech/t2s/exps/fastspeech2/gen_gt_duration_mel.py
@ -0,0 +1,167 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # generate mels using durations.txt
 # for mb melgan finetune
 # 长度和原本的 mel 不一致怎么办？
 import argparse
 from pathlib import Path
 import numpy as np
 import paddle
 import yaml
 from yacs.config import CfgNode
 from paddlespeech.t2s.datasets.preprocess_utils import get_phn_dur
 from paddlespeech.t2s.datasets.preprocess_utils import merge_silence
 from paddlespeech.t2s.models.fastspeech2 import FastSpeech2
 from paddlespeech.t2s.models.fastspeech2 import StyleFastSpeech2Inference
 from paddlespeech.t2s.modules.normalizer import ZScore
 def evaluate(args, fastspeech2_config):
    # construct dataset for evaluation
    with open(args.phones_dict, "r") as f:
        phn_id = [line.strip().split() for line in f.readlines()]
    vocab_size = len(phn_id)
    print("vocab_size:", vocab_size)
    phone_dict = {}
    for phn, id in phn_id:
        phone_dict[phn] = int(id)
    odim = fastspeech2_config.n_mels
    model = FastSpeech2(
        idim=vocab_size, odim=odim, **fastspeech2_config["model"])
    model.set_state_dict(
        paddle.load(args.fastspeech2_checkpoint)["main_params"])
    model.eval()
    stat = np.load(args.fastspeech2_stat)
    mu, std = stat
    mu = paddle.to_tensor(mu)
    std = paddle.to_tensor(std)
    fastspeech2_normalizer = ZScore(mu, std)
    fastspeech2_inference = StyleFastSpeech2Inference(fastspeech2_normalizer,
                                                      model)
    fastspeech2_inference.eval()
    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    sentences, speaker_set = get_phn_dur(args.dur_file)
    merge_silence(sentences)
    for i, utt_id in enumerate(sentences):
        phones = sentences[utt_id][0]
        durations = sentences[utt_id][1]
        speaker = sentences[utt_id][2]
        # 裁剪掉开头和结尾的 sil
        if args.cut_sil:
            if phones[0] == "sil" and len(durations) > 1:
                durations = durations[1:]
                phones = phones[1:]
            if phones[-1] == 'sil' and len(durations) > 1:
                durations = durations[:-1]
                phones = phones[:-1]
            # sentences[utt_id][0] = phones
            # sentences[utt_id][1] = durations
        phone_ids = [phone_dict[phn] for phn in phones]
        phone_ids = paddle.to_tensor(np.array(phone_ids))
        durations = paddle.to_tensor(np.array(durations))
        # 生成的和真实的可能有 1, 2 帧的差距，但是 batch_fn 会修复
        # split data into 3 sections
        if args.dataset == "baker":
            num_train = 9800
            num_dev = 100
        if i in range(0, num_train):
            sub_output_dir = output_dir / ("train/raw")
        elif i in range(num_train, num_train + num_dev):
            sub_output_dir = output_dir / ("dev/raw")
        else:
            sub_output_dir = output_dir / ("test/raw")
        sub_output_dir.mkdir(parents=True, exist_ok=True)
        with paddle.no_grad():
            mel = fastspeech2_inference(phone_ids, durations=durations)
        np.save(sub_output_dir / (utt_id + "_feats.npy"), mel)
 def main():
    # parse args and config and redirect to train_sp
    parser = argparse.ArgumentParser(
        description="Synthesize with fastspeech2 & parallel wavegan.")
    parser.add_argument(
        "--dataset",
        default="baker",
        type=str,
        help="name of dataset, should in {baker, ljspeech, vctk} now")
    parser.add_argument(
        "--fastspeech2-config", type=str, help="fastspeech2 config file.")
    parser.add_argument(
        "--fastspeech2-checkpoint",
        type=str,
        help="fastspeech2 checkpoint to load.")
    parser.add_argument(
        "--fastspeech2-stat",
        type=str,
        help="mean and standard deviation used to normalize spectrogram when training fastspeech2."
    )
    parser.add_argument(
        "--phones-dict",
        type=str,
        default="phone_id_map.txt",
        help="phone vocabulary file.")
    parser.add_argument(
        "--dur-file", default=None, type=str, help="path to durations.txt.")
    parser.add_argument("--output-dir", type=str, help="output dir.")
    parser.add_argument(
        "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.")
    parser.add_argument("--verbose", type=int, default=1, help="verbose.")
    def str2bool(str):
        return True if str.lower() == 'true' else False
    parser.add_argument(
        "--cut-sil",
        type=str2bool,
        default=True,
        help="whether cut sil in the edge of audio")
    args = parser.parse_args()
    if args.ngpu == 0:
        paddle.set_device("cpu")
    elif args.ngpu > 0:
        paddle.set_device("gpu")
    else:
        print("ngpu should >= 0 !")
    with open(args.fastspeech2_config) as f:
        fastspeech2_config = CfgNode(yaml.safe_load(f))
    print("========Args========")
    print(yaml.safe_dump(vars(args)))
    print("========Config========")
    print(fastspeech2_config)
    evaluate(args, fastspeech2_config)
 if __name__ == "__main__":
    main()
--- a/paddlespeech/t2s/models/fastspeech2/fastspeech2.py
+++ b/paddlespeech/t2s/models/fastspeech2/fastspeech2.py
@ -16,7 +16,9 @@
 from typing import Dict
 from typing import Sequence
 from typing import Tuple
 from typing import Union
 import numpy as np
 import paddle
 import paddle.nn.functional as F
 from paddle import nn
@ -687,6 +689,129 @@ class FastSpeech2Inference(nn.Layer):
        return logmel
 class StyleFastSpeech2Inference(FastSpeech2Inference):
    def __init__(self,
                 normalizer,
                 model,
                 pitch_stats_path=None,
                 energy_stats_path=None):
        super().__init__(normalizer, model)
        if pitch_stats_path:
            pitch_mean, pitch_std = np.load(pitch_stats_path)
            self.pitch_mean = paddle.to_tensor(pitch_mean)
            self.pitch_std = paddle.to_tensor(pitch_std)
        if energy_stats_path:
            energy_mean, energy_std = np.load(energy_stats_path)
            self.energy_mean = paddle.to_tensor(energy_mean)
            self.energy_std = paddle.to_tensor(energy_std)
    def denorm(self, data, mean, std):
        return data * std + mean
    def norm(self, data, mean, std):
        return (data - mean) / std
    def forward(self,
                text: paddle.Tensor,
                durations: Union[paddle.Tensor, np.ndarray]=None,
                durations_scale: Union[int, float]=None,
                durations_bias: Union[int, float]=None,
                pitch: Union[paddle.Tensor, np.ndarray]=None,
                pitch_scale: Union[int, float]=None,
                pitch_bias: Union[int, float]=None,
                energy: Union[paddle.Tensor, np.ndarray]=None,
                energy_scale: Union[int, float]=None,
                energy_bias: Union[int, float]=None,
                robot: bool=False):
        """
        Parameters
        ----------
        text : Tensor(int64)
            Input sequence of characters (T,).
        speech : Tensor, optional
            Feature sequence to extract style (N, idim).
        durations : paddle.Tensor/np.ndarray, optional (int64)
            Groundtruth of duration (T,), this will overwrite the set of durations_scale and durations_bias
        durations_scale: int/float, optional
        durations_bias: int/float, optional
        pitch : paddle.Tensor/np.ndarray, optional
            Groundtruth of token-averaged pitch (T, 1), this will overwrite the set of pitch_scale and pitch_bias
        pitch_scale: int/float, optional
            In denormed HZ domain.
        pitch_bias: int/float, optional
            In denormed HZ domain.
        energy : paddle.Tensor/np.ndarray, optional
            Groundtruth of token-averaged energy (T, 1), this will overwrite the set of energy_scale and energy_bias
        energy_scale: int/float, optional
            In denormed domain.
        energy_bias: int/float, optional
            In denormed domain.
        robot : bool, optional
            Weather output robot style
        Returns
        ----------
        Tensor
            Output sequence of features (L, odim).
        """
        normalized_mel, d_outs, p_outs, e_outs = self.acoustic_model.inference(
            text, durations=None, pitch=None, energy=None)
        # priority: groundtruth > scale/bias > previous output
        # set durations
        if isinstance(durations, np.ndarray):
            durations = paddle.to_tensor(durations)
        elif isinstance(durations, paddle.Tensor):
            durations = durations
        elif durations_scale or durations_bias:
            durations_scale = durations_scale if durations_scale is not None else 1
            durations_bias = durations_bias if durations_bias is not None else 0
            durations = durations_scale * d_outs + durations_bias
        else:
            durations = d_outs
        if robot:
            # set normed pitch to zeros have the same effect with set denormd ones to mean
            pitch = paddle.zeros(p_outs.shape)
        # set pitch, can overwrite robot set  
        if isinstance(pitch, np.ndarray):
            pitch = paddle.to_tensor(pitch)
        elif isinstance(pitch, paddle.Tensor):
            pitch = pitch
        elif pitch_scale or pitch_bias:
            pitch_scale = pitch_scale if pitch_scale is not None else 1
            pitch_bias = pitch_bias if pitch_bias is not None else 0
            p_Hz = paddle.exp(
                self.denorm(p_outs, self.pitch_mean, self.pitch_std))
            p_HZ = pitch_scale * p_Hz + pitch_bias
            pitch = self.norm(paddle.log(p_HZ), self.pitch_mean, self.pitch_std)
        else:
            pitch = p_outs
        # set energy
        if isinstance(energy, np.ndarray):
            energy = paddle.to_tensor(energy)
        elif isinstance(energy, paddle.Tensor):
            energy = energy
        elif energy_scale or energy_bias:
            energy_scale = energy_scale if energy_scale is not None else 1
            energy_bias = energy_bias if energy_bias is not None else 0
            e_dnorm = self.denorm(e_outs, self.energy_mean, self.energy_std)
            e_dnorm = energy_scale * e_dnorm + energy_bias
            energy = self.norm(e_dnorm, self.energy_mean, self.energy_std)
        else:
            energy = e_outs
        normalized_mel, d_outs, p_outs, e_outs = self.acoustic_model.inference(
            text,
            durations=durations,
            pitch=pitch,
            energy=energy,
            use_teacher_forcing=True)
        logmel = self.normalizer.inverse(normalized_mel)
        return logmel
 class FastSpeech2Loss(nn.Layer):
    """Loss function module for FastSpeech2."""