Merge pull request #1379 from yt605155624/new_wavernn

[TTS] add wavernn
4 years ago · dcfc32f1ec
parent 0747600c95 1b0c034134
commit dcfc32f1ec
21 changed files with 1729 additions and 11 deletions
--- a/examples/csmsc/tts3/local/inference.sh
+++ b/examples/csmsc/tts3/local/inference.sh
@ -49,3 +49,14 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
        --output_dir=${train_output_path}/pd_infer_out \
        --phones_dict=dump/phone_id_map.txt
 fi
 # wavernn
 if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
    python3 ${BIN_DIR}/../inference.py \
        --inference_dir=${train_output_path}/inference \
        --am=fastspeech2_csmsc \
        --voc=wavernn_csmsc \
        --text=${BIN_DIR}/../sentences.txt \
        --output_dir=${train_output_path}/pd_infer_out \
        --phones_dict=dump/phone_id_map.txt
 fi
--- a/examples/csmsc/tts3/local/synthesize_e2e.sh
+++ b/examples/csmsc/tts3/local/synthesize_e2e.sh
@ -89,3 +89,25 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
        --inference_dir=${train_output_path}/inference \
        --phones_dict=dump/phone_id_map.txt
 fi
 # wavernn
 if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
    echo "in wavernn syn_e2e"
    FLAGS_allocator_strategy=naive_best_fit \
    FLAGS_fraction_of_gpu_memory_to_use=0.01 \
    python3 ${BIN_DIR}/../synthesize_e2e.py \
        --am=fastspeech2_csmsc \
        --am_config=${config_path} \
        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
        --am_stat=dump/train/speech_stats.npy \
        --voc=wavernn_csmsc \
        --voc_config=wavernn_test/default.yaml \
        --voc_ckpt=wavernn_test/snapshot_iter_5000.pdz \
        --voc_stat=wavernn_test/feats_stats.npy \
        --lang=zh \
        --text=${BIN_DIR}/../sentences.txt \
        --output_dir=${train_output_path}/test_e2e \
        --phones_dict=dump/phone_id_map.txt \
        --inference_dir=${train_output_path}/inference
 fi
--- a/examples/csmsc/voc6/conf/default.yaml
+++ b/examples/csmsc/voc6/conf/default.yaml
@ -0,0 +1,67 @@
 ###########################################################
 #                FEATURE EXTRACTION SETTING               #
 ###########################################################
 fs: 24000                # Sampling rate.
 n_fft: 2048              # FFT size (samples).
 n_shift: 300             # Hop size (samples). 12.5ms
 win_length: 1200         # Window length (samples). 50ms
                         # If set to null, it will be the same as fft_size.
 window: "hann"           # Window function.
 n_mels: 80               # Number of mel basis.
 fmin: 80                 # Minimum freq in mel basis calculation. (Hz)
 fmax: 7600               # Maximum frequency in mel basis calculation. (Hz)
 mu_law: True             # Recommended to suppress noise if using raw bits
 ###########################################################
 #                       MODEL SETTING                     #
 ###########################################################
 model:
    rnn_dims: 512                     # Hidden dims of RNN Layers.
    fc_dims: 512
    bits: 9                           # Bit depth of signal
    aux_context_window: 2             # Context window size for auxiliary feature.
                                      # If set to 2, previous 2 and future 2 frames will be considered.
    aux_channels: 80                  # Number of channels for auxiliary feature conv.
                                      # Must be the same as num_mels.
    upsample_scales: [4, 5, 3, 5]     # Upsampling scales. Product of these must be the same as hop size, same as pwgan here
    compute_dims: 128                 # Dims of Conv1D in MelResNet.
    res_out_dims: 128                 # Dims of output in MelResNet.
    res_blocks: 10                    # Number of residual blocks.
    mode: RAW                         # either 'RAW' (softmax on raw bits) or 'MOL' (sample from mixture of logistics)
 inference:
    gen_batched: True                 # whether to generate samples in batch mode
    target: 12000                     # target number of samples to be generated in each batch entry
    overlap: 600                      # number of samples for crossfading between batches
 ###########################################################
 #                  DATA LOADER SETTING                    #
 ###########################################################
 batch_size: 64              # Batch size.
 batch_max_steps: 4500       # Length of each audio in batch. Make sure it is divisible by hop_size.
 num_workers: 2              # Number of workers in DataLoader.
 ###########################################################
 #                     OPTIMIZER SETTING                   #
 ###########################################################
 grad_clip: 4.0
 learning_rate: 1.0e-4                
 ###########################################################
 #                    INTERVAL SETTING                     #
 ###########################################################
 train_max_steps: 400000               # Number of training steps.
 save_interval_steps: 5000             # Interval steps to save checkpoint.
 eval_interval_steps: 1000             # Interval steps to evaluate the network.
 gen_eval_samples_interval_steps: 5000 # the iteration interval of generating valid samples
 generate_num: 5                       # number of samples to generate at each checkpoint
 ###########################################################
 #                     OTHER SETTING                       #
 ###########################################################
 num_snapshots: 10                 # max number of snapshots to keep while training
 seed: 42                          # random seed for paddle, random, and np.random
--- a/examples/csmsc/voc6/local/preprocess.sh
+++ b/examples/csmsc/voc6/local/preprocess.sh
@ -0,0 +1,55 @@
 #!/bin/bash
 stage=0
 stop_stage=100
 config_path=$1
 if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    # get durations from MFA's result
    echo "Generate durations.txt from MFA results ..."
    python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \
        --inputdir=./baker_alignment_tone \
        --output=durations.txt \
        --config=${config_path}
 fi
 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    # extract features
    echo "Extract features ..."
    python3 ${BIN_DIR}/../gan_vocoder/preprocess.py \
        --rootdir=~/datasets/BZNSYP/ \
        --dataset=baker \
        --dumpdir=dump \
        --dur-file=durations.txt \
        --config=${config_path} \
        --cut-sil=True \
        --num-cpu=20
 fi
 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    # get features' stats(mean and std)
    echo "Get features' stats ..."
    python3 ${MAIN_ROOT}/utils/compute_statistics.py \
        --metadata=dump/train/raw/metadata.jsonl \
        --field-name="feats"
 fi
 if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    # normalize, dev and test should use train's stats
    echo "Normalize ..."
    python3 ${BIN_DIR}/../gan_vocoder/normalize.py \
        --metadata=dump/train/raw/metadata.jsonl \
        --dumpdir=dump/train/norm \
        --stats=dump/train/feats_stats.npy
    python3 ${BIN_DIR}/../gan_vocoder/normalize.py \
        --metadata=dump/dev/raw/metadata.jsonl \
        --dumpdir=dump/dev/norm \
        --stats=dump/train/feats_stats.npy
    python3 ${BIN_DIR}/../gan_vocoder/normalize.py \
        --metadata=dump/test/raw/metadata.jsonl \
        --dumpdir=dump/test/norm \
        --stats=dump/train/feats_stats.npy
 fi
--- a/examples/csmsc/voc6/local/synthesize.sh
+++ b/examples/csmsc/voc6/local/synthesize.sh
@ -0,0 +1,13 @@
 #!/bin/bash
 # Synthesize waveforms for the normalized test set with a trained WaveRNN checkpoint.
 # Usage: ./local/synthesize.sh <config_path> <train_output_path> <ckpt_name>
 # BIN_DIR is read from the environment and is not set in this script.
 config_path=$1
 train_output_path=$2
 ckpt_name=$3
 # The checkpoint is read from ${train_output_path}/checkpoints/ and the
 # generated audio is written to ${train_output_path}/test.
 FLAGS_allocator_strategy=naive_best_fit \
 FLAGS_fraction_of_gpu_memory_to_use=0.01 \
 python3 ${BIN_DIR}/synthesize.py \
    --config=${config_path} \
    --checkpoint=${train_output_path}/checkpoints/${ckpt_name} \
    --test-metadata=dump/test/norm/metadata.jsonl \
    --output-dir=${train_output_path}/test
--- a/examples/csmsc/voc6/local/train.sh
+++ b/examples/csmsc/voc6/local/train.sh
@ -0,0 +1,13 @@
 #!/bin/bash
 # Train WaveRNN on the normalized train/dev dumps.
 # Usage: ./local/train.sh <config_path> <train_output_path>
 # BIN_DIR is read from the environment and is not set in this script.
 config_path=$1
 train_output_path=$2
 # NOTE(review): --ngpu is hard-coded to 1 below, so training runs in a single
 # process regardless of how many devices the caller makes visible.
 FLAGS_cudnn_exhaustive_search=true \
 FLAGS_conv_workspace_size_limit=4000 \
 python ${BIN_DIR}/train.py \
    --train-metadata=dump/train/norm/metadata.jsonl \
    --dev-metadata=dump/dev/norm/metadata.jsonl \
    --config=${config_path} \
    --output-dir=${train_output_path} \
    --ngpu=1
--- a/examples/csmsc/voc6/path.sh
+++ b/examples/csmsc/voc6/path.sh
@ -0,0 +1,13 @@
 #!/bin/bash
 # Environment setup for the recipe; meant to be sourced (run.sh does `source path.sh`).
 # MAIN_ROOT is resolved three directories above the current working directory,
 # so this must be sourced from inside the recipe directory.
 export MAIN_ROOT=`realpath ${PWD}/../../../`
 export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
 export LC_ALL=C
 export PYTHONDONTWRITEBYTECODE=1
 # Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
 export PYTHONIOENCODING=UTF-8
 export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
 # BIN_DIR points at the experiment scripts of the model selected by MODEL.
 MODEL=wavernn
 export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL}
--- a/examples/csmsc/voc6/run.sh
+++ b/examples/csmsc/voc6/run.sh
@ -0,0 +1,30 @@
 #!/bin/bash
 # Entry point of the recipe: stage 0 preprocesses data, stage 1 trains,
 # stage 2 synthesizes. A stage runs when stage <= N <= stop_stage.
 set -e
 source path.sh
 # NOTE(review): local/train.sh passes --ngpu=1, so listing two devices here
 # only controls which GPUs are visible to the child processes.
 gpus=0,1
 stage=0
 stop_stage=100
 conf_path=conf/default.yaml
 train_output_path=exp/default
 # NOTE(review): test_input is not referenced by any stage in this script.
 test_input=dump/dump_gta_test
 ckpt_name=snapshot_iter_100000.pdz
 # The variables above are declared before parse_options.sh is sourced;
 # presumably it lets them be overridden from the command line -- confirm.
 source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
 if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    # prepare data
    ./local/preprocess.sh ${conf_path} || exit -1
 fi
 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    # train the model
    CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} || exit -1
 fi
 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    # synthesize
    CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
 fi
--- a/paddlespeech/t2s/audio/init.py
+++ b/paddlespeech/t2s/audio/init.py
@ -12,5 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from .audio import AudioProcessor
 from .codec import *
 from .spec_normalizer import LogMagnitude
 from .spec_normalizer import NormalizerBase
--- a/paddlespeech/t2s/audio/codec.py
+++ b/paddlespeech/t2s/audio/codec.py
@ -0,0 +1,51 @@
 # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import math
 import numpy as np
 import paddle
 def label_2_float(x, bits):
    """Map integer labels in [0, 2**bits - 1] linearly onto floats in [-1, 1].

    Label 0 maps to -1 and label 2**bits - 1 maps to 1.
    """
    max_label = 2**bits - 1.
    return 2 * x / max_label - 1.
 def float_2_label(x, bits):
    """Map floats in [-1, 1] linearly onto the label range [0, 2**bits - 1].

    `x` must be an array-like offering `.max()` and `.clip()`. The result is
    not rounded; callers cast it to an integer type themselves.
    """
    assert abs(x).max() <= 1.0
    top = 2**bits - 1
    scaled = (x + 1.) * top / 2
    return scaled.clip(0, top)
 def encode_mu_law(x, mu):
    """Compress a signal in [-1, 1] with mu-law and quantize it to labels.

    See https://en.wikipedia.org/wiki/%CE%9C-law_algorithm. Note that `mu`
    here is the number of quantization levels (2**bits), i.e. one more than
    the `mu` used in the linked formula. Returns floats holding integer
    values in [0, 2**bits - 1].
    """
    mu_max = mu - 1
    log_term = np.log(1 + mu_max * np.abs(x))
    fx = np.sign(x) * log_term / np.log(1 + mu_max)
    # shift [-1, 1] to [0, mu_max] and round to the nearest label
    return np.floor((fx + 1) / 2 * mu_max + 0.5)
 def decode_mu_law(y, mu, from_labels=True):
    """Expand a mu-law compressed signal back to the range [-1, 1].

    With `from_labels=True`, `y` holds labels in [0, 2**bits - 1] and is
    first mapped to [-1, 1]; with `from_labels=False`, `y` is already in
    [-1, 1]. `mu` is the number of quantization levels (2**bits).
    """
    # TODO: get rid of log2 - makes no sense
    signal = label_2_float(y, math.log2(mu)) if from_labels else y
    mu_max = mu - 1
    expanded = (1 + mu_max)**paddle.abs(signal) - 1
    return paddle.sign(signal) / mu_max * expanded
--- a/paddlespeech/t2s/datasets/vocoder_batch_fn.py
+++ b/paddlespeech/t2s/datasets/vocoder_batch_fn.py
@ -14,6 +14,10 @@
 import numpy as np
 import paddle
 from paddlespeech.t2s.audio.codec import encode_mu_law
 from paddlespeech.t2s.audio.codec import float_2_label
 from paddlespeech.t2s.audio.codec import label_2_float
 class Clip(object):
    """Collate functor for training vocoders.
@ -49,7 +53,7 @@ class Clip(object):
        self.end_offset = -(self.batch_max_frames + aux_context_window)
        self.mel_threshold = self.batch_max_frames + 2 * aux_context_window
-    def __call__(self, examples):
+    def __call__(self, batch):
        """Convert into batch tensors.
        Parameters
@ -67,11 +71,11 @@ class Clip(object):
        """
        # check length
-        examples = [
+        batch = [
-            self._adjust_length(b['wave'], b['feats']) for b in examples
+            self._adjust_length(b['wave'], b['feats']) for b in batch
            if b['feats'].shape[0] > self.mel_threshold
        ]
-        xs, cs = [b[0] for b in examples], [b[1] for b in examples]
+        xs, cs = [b[0] for b in batch], [b[1] for b in batch]
        # make batch with random cut
        c_lengths = [c.shape[0] for c in cs]
@ -89,7 +93,7 @@ class Clip(object):
        c_batch = np.stack(
            [c[start:end] for c, start, end in zip(cs, c_starts, c_ends)])
-        # convert each batch to tensor, asuume that each item in batch has the same length
+        # convert each batch to tensor, assume that each item in batch has the same length
        y_batch = paddle.to_tensor(
            y_batch, dtype=paddle.float32).unsqueeze(1)  # (B, 1, T)
        c_batch = paddle.to_tensor(
@ -120,3 +124,113 @@ class Clip(object):
            0] * self.hop_size, f"wave length: ({len(x)}), mel length: ({c.shape[0]})"
        return x, c
 class WaveRNNClip(Clip):
    """Collate functor that cuts random training segments for WaveRNN.

    For each example a window of `mel_win` mel frames is cut at a random
    position, together with the `batch_max_steps + 1` quantized audio samples
    that start `aux_context_window` frames after the start of that window.
    """

    def __init__(self,
                 mode: str='RAW',
                 batch_max_steps: int=4500,
                 hop_size: int=300,
                 aux_context_window: int=2,
                 bits: int=9,
                 mu_law: bool=True):
        self.mode = mode
        self.mel_win = batch_max_steps // hop_size + 2 * aux_context_window
        self.batch_max_steps = batch_max_steps
        self.hop_size = hop_size
        self.aux_context_window = aux_context_window
        self.mu_law = mu_law
        self.batch_max_frames = batch_max_steps // hop_size
        self.mel_threshold = self.batch_max_frames + 2 * aux_context_window
        # MOL mode always works on 16-bit labels, whatever `bits` says
        if self.mode == 'MOL':
            self.bits = 16
        else:
            self.bits = bits

    def to_quant(self, wav):
        """Quantize a float waveform in [-1, 1] to int64 labels.

        RAW mode uses mu-law encoding when `mu_law` is set and linear
        quantization otherwise; MOL mode uses 16-bit linear quantization.

        Raises
        ----------
        ValueError
            If `self.mode` is neither 'RAW' nor 'MOL'.
        """
        if self.mode == 'RAW':
            if self.mu_law:
                quant = encode_mu_law(wav, mu=2**self.bits)
            else:
                quant = float_2_label(wav, bits=self.bits)
        elif self.mode == 'MOL':
            quant = float_2_label(wav, bits=16)
        else:
            # previously fell through and failed on the unbound `quant`
            raise ValueError(
                f"Unknown WaveRNN mode: {self.mode!r}, expected 'RAW' or 'MOL'."
            )
        return quant.astype(np.int64)

    def __call__(self, batch):
        """Convert into batch tensors.

        Parameters
        ----------
        batch : list
            list of dicts with keys 'wave' and 'feats'.
            Audio shape (T, ), features shape(T', C).

        Returns
        ----------
        Tensor
            Input signal batch (B, batch_max_steps), float32 in [-1, 1].
        Tensor
            Target signal batch (B, batch_max_steps): int64 labels in RAW
            mode, float32 in [-1, 1] in MOL mode.
        Tensor
            Auxiliary feature batch (B, C, mel_win), float32, where
            mel_win = batch_max_steps // hop_size + 2 * aux_context_window.
        """
        # check length
        batch = [
            self._adjust_length(b['wave'], b['feats']) for b in batch
            if b['feats'].shape[0] > self.mel_threshold
        ]
        # number of frames that must remain after the random start point
        margin = 2 + self.mel_win + 2 * self.aux_context_window
        # the mel needs to be transposed here so that time is the last axis
        pairs = [(x, c.T) for x, c in batch]
        # np.random.randint(0, offset) raises ValueError when offset <= 0, so
        # utterances that pass the threshold above but are still shorter than
        # the margin are dropped instead of crashing the whole batch
        pairs = [(x, c) for x, c in pairs if c.shape[-1] - margin > 0]
        wav = [x for x, _ in pairs]
        mel = [c for _, c in pairs]

        max_offsets = [c.shape[-1] - margin for c in mel]
        # the slice point of mel selecting randomly
        mel_offsets = [np.random.randint(0, offset) for offset in max_offsets]
        # the slice point of wav, which is aux_context_window frames behind
        sig_offsets = [(offset + self.aux_context_window) * self.hop_size
                       for offset in mel_offsets]
        # mels.shape[-1] = batch_max_steps // hop_size + 2 * aux_context_window
        mels = [
            c[:, start:start + self.mel_win]
            for c, start in zip(mel, mel_offsets)
        ]
        # one extra sample so that input and target can be shifted by one step
        wav = [self.to_quant(x) for x in wav]
        labels = [
            x[start:start + self.batch_max_steps + 1]
            for x, start in zip(wav, sig_offsets)
        ]

        mels = np.stack(mels).astype(np.float32)
        labels = np.stack(labels).astype(np.int64)

        mels = paddle.to_tensor(mels)
        labels = paddle.to_tensor(labels, dtype='int64')
        # x is input, y is label (x shifted by one sample)
        x = labels[:, :self.batch_max_steps]
        y = labels[:, 1:]
        # label ranges at this point:
        #   mode = RAW: bits = 9  -> integers in [0, 511]
        #   mode = MOL: bits = 16 -> integers in [0, 65535]
        # the input is always rescaled to floats in [-1, 1]
        x = label_2_float(paddle.cast(x, dtype='float32'), self.bits)
        # the target is rescaled to [-1, 1] only in MOL mode
        if self.mode == 'MOL':
            y = label_2_float(paddle.cast(y, dtype='float32'), self.bits)

        return x, y, mels
--- a/paddlespeech/t2s/exps/inference.py
+++ b/paddlespeech/t2s/exps/inference.py
@ -54,7 +54,7 @@ def main():
        default='pwgan_csmsc',
        choices=[
            'pwgan_csmsc', 'mb_melgan_csmsc', 'hifigan_csmsc', 'pwgan_aishell3',
-            'pwgan_vctk'
+            'pwgan_vctk', 'wavernn_csmsc'
        ],
        help='Choose vocoder type of tts task.')
    # other
--- a/paddlespeech/t2s/exps/synthesize_e2e.py
+++ b/paddlespeech/t2s/exps/synthesize_e2e.py
@ -59,6 +59,10 @@ model_alias = {
    "paddlespeech.t2s.models.hifigan:HiFiGANGenerator",
    "hifigan_inference":
    "paddlespeech.t2s.models.hifigan:HiFiGANInference",
    "wavernn":
    "paddlespeech.t2s.models.wavernn:WaveRNN",
    "wavernn_inference":
    "paddlespeech.t2s.models.wavernn:WaveRNNInference",
 }
@ -151,10 +155,16 @@ def evaluate(args):
    voc_name = args.voc[:args.voc.rindex('_')]
    voc_class = dynamic_import(voc_name, model_alias)
    voc_inference_class = dynamic_import(voc_name + '_inference', model_alias)
    if voc_name != 'wavernn':
        voc = voc_class(**voc_config["generator_params"])
        voc.set_state_dict(paddle.load(args.voc_ckpt)["generator_params"])
        voc.remove_weight_norm()
        voc.eval()
    else:
        voc = voc_class(**voc_config["model"])
        voc.set_state_dict(paddle.load(args.voc_ckpt)["main_params"])
        voc.eval()
    voc_mu, voc_std = np.load(args.voc_stat)
    voc_mu = paddle.to_tensor(voc_mu)
    voc_std = paddle.to_tensor(voc_std)
@ -322,7 +332,8 @@ def main():
        default='pwgan_csmsc',
        choices=[
            'pwgan_csmsc', 'pwgan_ljspeech', 'pwgan_aishell3', 'pwgan_vctk',
-            'mb_melgan_csmsc', 'style_melgan_csmsc', 'hifigan_csmsc'
+            'mb_melgan_csmsc', 'style_melgan_csmsc', 'hifigan_csmsc',
            'wavernn_csmsc'
        ],
        help='Choose vocoder type of tts task.')
--- a/paddlespeech/t2s/exps/wavernn/init.py
+++ b/paddlespeech/t2s/exps/wavernn/init.py
@ -0,0 +1,13 @@
 # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
--- a/paddlespeech/t2s/exps/wavernn/synthesize.py
+++ b/paddlespeech/t2s/exps/wavernn/synthesize.py
@ -0,0 +1,108 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import argparse
 import os
 from pathlib import Path
 import jsonlines
 import numpy as np
 import paddle
 import soundfile as sf
 import yaml
 from paddle import distributed as dist
 from timer import timer
 from yacs.config import CfgNode
 from paddlespeech.t2s.datasets.data_table import DataTable
 from paddlespeech.t2s.models.wavernn import WaveRNN
 def main():
    """Synthesize every utterance of a test metadata file with WaveRNN.

    Loads the config and checkpoint given on the command line, generates one
    waveform per entry, writes it to `<output-dir>/<utt_id>.wav` and prints
    per-utterance and overall generation speed.
    """
    parser = argparse.ArgumentParser(description="Synthesize with WaveRNN.")
    parser.add_argument("--config", type=str, help="WaveRNN config file.")
    parser.add_argument("--checkpoint", type=str, help="snapshot to load.")
    parser.add_argument("--test-metadata", type=str, help="test data.")
    parser.add_argument("--output-dir", type=str, help="output dir.")
    parser.add_argument(
        "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.")
    args = parser.parse_args()

    with open(args.config) as f:
        config = CfgNode(yaml.safe_load(f))

    print("========Args========")
    print(yaml.safe_dump(vars(args)))
    print("========Config========")
    print(config)
    print(
        f"master see the word size: {dist.get_world_size()}, from pid: {os.getpid()}"
    )

    if args.ngpu == 0:
        paddle.set_device("cpu")
    elif args.ngpu > 0:
        paddle.set_device("gpu")
    else:
        print("ngpu should >= 0 !")

    model = WaveRNN(
        hop_length=config.n_shift, sample_rate=config.fs, **config["model"])
    state_dict = paddle.load(args.checkpoint)
    model.set_state_dict(state_dict["main_params"])
    model.eval()

    with jsonlines.open(args.test_metadata, 'r') as reader:
        metadata = list(reader)
    test_dataset = DataTable(
        metadata,
        fields=['utt_id', 'feats'],
        converters={
            'utt_id': None,
            'feats': np.load,
        })
    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    # N: total number of generated samples, T: total elapsed seconds
    N = 0
    T = 0
    for example in test_dataset:
        utt_id = example['utt_id']
        mel = example['feats']
        mel = paddle.to_tensor(mel)  # (T, C)
        with timer() as t:
            with paddle.no_grad():
                wav = model.generate(
                    c=mel,
                    batched=config.inference.gen_batched,
                    target=config.inference.target,
                    overlap=config.inference.overlap,
                    mu_law=config.mu_law,
                    gen_display=True)
            wav = wav.numpy()
            N += wav.size
            T += t.elapse
            speed = wav.size / t.elapse
            rtf = config.fs / speed
        print(
            f"{utt_id}, mel: {mel.shape}, wave: {wav.shape}, time: {t.elapse}s, Hz: {speed}, RTF: {rtf}."
        )
        sf.write(str(output_dir / (utt_id + ".wav")), wav, samplerate=config.fs)

    # an empty metadata file leaves N == T == 0; skip the summary instead of
    # dividing by zero
    if N > 0 and T > 0:
        print(f"generation speed: {N / T}Hz, RTF: {config.fs / (N / T) }")
    else:
        print("no audio was generated, skip the speed summary.")
 if __name__ == "__main__":
    main()
--- a/paddlespeech/t2s/exps/wavernn/train.py
+++ b/paddlespeech/t2s/exps/wavernn/train.py
@ -0,0 +1,212 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import argparse
 import os
 import shutil
 from pathlib import Path
 import jsonlines
 import numpy as np
 import paddle
 import yaml
 from paddle import DataParallel
 from paddle import distributed as dist
 from paddle.io import DataLoader
 from paddle.io import DistributedBatchSampler
 from paddle.optimizer import Adam
 from yacs.config import CfgNode
 from paddlespeech.t2s.datasets.data_table import DataTable
 from paddlespeech.t2s.datasets.vocoder_batch_fn import WaveRNNClip
 from paddlespeech.t2s.models.wavernn import WaveRNN
 from paddlespeech.t2s.models.wavernn import WaveRNNEvaluator
 from paddlespeech.t2s.models.wavernn import WaveRNNUpdater
 from paddlespeech.t2s.modules.losses import discretized_mix_logistic_loss
 from paddlespeech.t2s.training.extensions.snapshot import Snapshot
 from paddlespeech.t2s.training.extensions.visualizer import VisualDL
 from paddlespeech.t2s.training.seeding import seed_everything
 from paddlespeech.t2s.training.trainer import Trainer
 def train_sp(args, config):
    """Run WaveRNN training in the current process.

    Sets up the device (and the parallel environment when more than one
    process is running), builds datasets, dataloaders, model, loss and
    optimizer from `config`, then runs the trainer until
    `config.train_max_steps` iterations.

    Parameters
    ----------
    args : argparse.Namespace
        Reads `ngpu`, `train_metadata`, `dev_metadata`, `config` and
        `output_dir`.
    config : CfgNode
        Training configuration loaded from the YAML file.

    Raises
    ----------
    RuntimeError
        If `config.model.mode` is neither 'RAW' nor 'MOL'.
    """
    # decides device type and whether to run in parallel
    # setup running environment correctly
    world_size = paddle.distributed.get_world_size()
    if (not paddle.is_compiled_with_cuda()) or args.ngpu == 0:
        paddle.set_device("cpu")
    else:
        paddle.set_device("gpu")
        if world_size > 1:
            paddle.distributed.init_parallel_env()

    # set the random seed, it is a must for multiprocess training
    seed_everything(config.seed)

    print(
        f"rank: {dist.get_rank()}, pid: {os.getpid()}, parent_pid: {os.getppid()}",
    )

    # construct dataset for training and validation
    with jsonlines.open(args.train_metadata, 'r') as reader:
        train_metadata = list(reader)
    train_dataset = DataTable(
        data=train_metadata,
        fields=["wave", "feats"],
        converters={
            "wave": np.load,
            "feats": np.load,
        }, )

    with jsonlines.open(args.dev_metadata, 'r') as reader:
        dev_metadata = list(reader)
    dev_dataset = DataTable(
        data=dev_metadata,
        fields=["wave", "feats"],
        converters={
            "wave": np.load,
            "feats": np.load,
        }, )

    # the collate function must quantize with the same mu_law setting that
    # synthesis reads from the config; fall back to the collate default (True)
    # for configs that do not define it
    batch_fn = WaveRNNClip(
        mode=config.model.mode,
        aux_context_window=config.model.aux_context_window,
        hop_size=config.n_shift,
        batch_max_steps=config.batch_max_steps,
        bits=config.model.bits,
        mu_law=config.get("mu_law", True))

    # collate function and dataloader
    train_sampler = DistributedBatchSampler(
        train_dataset,
        batch_size=config.batch_size,
        shuffle=True,
        drop_last=True)
    dev_sampler = DistributedBatchSampler(
        dev_dataset,
        batch_size=config.batch_size,
        shuffle=False,
        drop_last=False)
    print("samplers done!")

    train_dataloader = DataLoader(
        train_dataset,
        batch_sampler=train_sampler,
        collate_fn=batch_fn,
        num_workers=config.num_workers)

    dev_dataloader = DataLoader(
        dev_dataset,
        collate_fn=batch_fn,
        batch_sampler=dev_sampler,
        num_workers=config.num_workers)

    # un-collated loader, handed to the evaluator for sample generation
    valid_generate_loader = DataLoader(dev_dataset, batch_size=1)

    print("dataloaders done!")

    model = WaveRNN(
        hop_length=config.n_shift, sample_rate=config.fs, **config["model"])
    if world_size > 1:
        model = DataParallel(model)
    print("model done!")

    if config.model.mode == 'RAW':
        criterion = paddle.nn.CrossEntropyLoss(axis=1)
    elif config.model.mode == 'MOL':
        criterion = discretized_mix_logistic_loss
    else:
        # the exception used to be constructed without `raise`, which let
        # training go on with criterion = None
        raise RuntimeError(
            f'Unknown model mode value - {config.model.mode}')
    print("criterions done!")

    clip = paddle.nn.ClipGradByGlobalNorm(config.grad_clip)
    optimizer = Adam(
        parameters=model.parameters(),
        learning_rate=config.learning_rate,
        grad_clip=clip)

    print("optimizer done!")

    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    if dist.get_rank() == 0:
        config_name = args.config.split("/")[-1]
        # copy conf to output_dir
        shutil.copyfile(args.config, output_dir / config_name)

    updater = WaveRNNUpdater(
        model=model,
        optimizer=optimizer,
        criterion=criterion,
        dataloader=train_dataloader,
        output_dir=output_dir,
        mode=config.model.mode)

    evaluator = WaveRNNEvaluator(
        model=model,
        dataloader=dev_dataloader,
        criterion=criterion,
        output_dir=output_dir,
        valid_generate_loader=valid_generate_loader,
        config=config)

    trainer = Trainer(
        updater,
        stop_trigger=(config.train_max_steps, "iteration"),
        out=output_dir)

    # evaluation, logging and snapshots are only attached on rank 0
    if dist.get_rank() == 0:
        trainer.extend(
            evaluator, trigger=(config.eval_interval_steps, 'iteration'))
        trainer.extend(VisualDL(output_dir), trigger=(1, 'iteration'))
        trainer.extend(
            Snapshot(max_size=config.num_snapshots),
            trigger=(config.save_interval_steps, 'iteration'))

    print("Trainer Done!")
    trainer.run()
 def main():
    """Parse command-line arguments and the config file, then start training.

    With `--ngpu` greater than 1 one training process per GPU is spawned;
    otherwise training runs in the current process.
    """
    # parse args and config and redirect to train_sp
    # (the description used to name HiFiGAN, copied from another trainer)
    parser = argparse.ArgumentParser(description="Train a WaveRNN model.")
    parser.add_argument(
        "--config", type=str, help="config file to overwrite default config.")
    parser.add_argument("--train-metadata", type=str, help="training data.")
    parser.add_argument("--dev-metadata", type=str, help="dev data.")
    parser.add_argument("--output-dir", type=str, help="output dir.")
    parser.add_argument(
        "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.")
    args = parser.parse_args()

    with open(args.config, 'rt') as f:
        config = CfgNode(yaml.safe_load(f))

    print("========Args========")
    print(yaml.safe_dump(vars(args)))
    print("========Config========")
    print(config)
    print(
        f"master see the word size: {dist.get_world_size()}, from pid: {os.getpid()}"
    )

    # dispatch
    if args.ngpu > 1:
        dist.spawn(train_sp, (args, config), nprocs=args.ngpu)
    else:
        train_sp(args, config)
 if __name__ == "__main__":
    main()
--- a/paddlespeech/t2s/models/init.py
+++ b/paddlespeech/t2s/models/init.py
@ -19,3 +19,4 @@ from .parallel_wavegan import *
 from .speedyspeech import *
 from .transformer_tts import *
 from .waveflow import *
 from .wavernn import *
--- a/paddlespeech/t2s/models/wavernn/init.py
+++ b/paddlespeech/t2s/models/wavernn/init.py
@ -0,0 +1,15 @@
 # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from .wavernn import *
 from .wavernn_updater import *
--- a/paddlespeech/t2s/models/wavernn/wavernn.py
+++ b/paddlespeech/t2s/models/wavernn/wavernn.py
@ -0,0 +1,627 @@
 # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import sys
 import time
 from typing import List
 import numpy as np
 import paddle
 from paddle import nn
 from paddle.nn import functional as F
 from paddlespeech.t2s.audio.codec import decode_mu_law
 from paddlespeech.t2s.modules.losses import sample_from_discretized_mix_logistic
 from paddlespeech.t2s.modules.nets_utils import initialize
 from paddlespeech.t2s.modules.upsample import Stretch2D
 class ResBlock(nn.Layer):
    '''
    Residual block made of two (1x1 Conv1D -> BatchNorm1D) stages.

    Parameters
    ----------
    dims : int
        Number of channels of both the input and the output.
    '''

    def __init__(self, dims):
        super().__init__()
        self.conv1 = nn.Conv1D(dims, dims, kernel_size=1, bias_attr=False)
        self.conv2 = nn.Conv1D(dims, dims, kernel_size=1, bias_attr=False)
        self.batch_norm1 = nn.BatchNorm1D(dims)
        self.batch_norm2 = nn.BatchNorm1D(dims)

    def forward(self, x):
        '''
        Apply conv -> bn -> relu -> conv -> bn and add the input back.

        Parameters
        ----------
        x : Tensor
            Input tensor whose channel axis has `dims` entries.
        Returns
        ----------
        Tensor
            Tensor with the same shape as `x`.
        '''
        hidden = F.relu(self.batch_norm1(self.conv1(x)))
        hidden = self.batch_norm2(self.conv2(hidden))
        # residual connection
        return hidden + x
 class MelResNet(nn.Layer):
    '''
    Stack of residual blocks applied to the conditioning features.

    Parameters
    ----------
    res_blocks : int
        Number of ResBlock layers.
    compute_dims : int
        Channel count used inside the residual stack.
    res_out_dims : int
        Channel count of the output.
    aux_channels : int
        Channel count of the input features.
    aux_context_window : int
        Half-width of the input convolution; its kernel size is
        `2 * aux_context_window + 1`.
    '''

    def __init__(self,
                 res_blocks: int=10,
                 compute_dims: int=128,
                 res_out_dims: int=128,
                 aux_channels: int=80,
                 aux_context_window: int=0):
        super().__init__()
        # The input convolution is unpadded, so it shortens the time axis
        # by aux_context_window * 2 frames.
        self.conv_in = nn.Conv1D(
            aux_channels,
            compute_dims,
            kernel_size=aux_context_window * 2 + 1,
            bias_attr=False)
        self.batch_norm = nn.BatchNorm1D(compute_dims)
        self.layers = nn.LayerList(
            [ResBlock(compute_dims) for _ in range(res_blocks)])
        self.conv_out = nn.Conv1D(compute_dims, res_out_dims, kernel_size=1)

    def forward(self, x):
        '''
        Parameters
        ----------
        x : Tensor
            Input tensor (B, in_dims, T).
        Returns
        ----------
        Tensor
            Output tensor (B, res_out_dims, T - 2 * aux_context_window).
        '''
        hidden = F.relu(self.batch_norm(self.conv_in(x)))
        for block in self.layers:
            hidden = block(hidden)
        return self.conv_out(hidden)
 class UpsampleNetwork(nn.Layer):
    '''
    Upsample the mel spectrogram and the MelResNet features to sample rate.

    Parameters
    ----------
    aux_channels : int
        Channel count of the input mel spectrogram.
    upsample_scales : List[int]
        Per-stage upsampling factors; their product is the total factor.
    compute_dims : int
        Channel count used inside MelResNet.
    res_blocks : int
        Number of residual blocks in MelResNet.
    res_out_dims : int
        Channel count of the MelResNet output.
    aux_context_window : int
        Number of context frames padded on each side of the input.
    '''

    def __init__(self,
                 aux_channels: int=80,
                 upsample_scales: List[int]=[4, 5, 3, 5],
                 compute_dims: int=128,
                 res_blocks: int=10,
                 res_out_dims: int=128,
                 aux_context_window: int=2):
        super().__init__()
        # total_scale is the overall upsampling factor
        total_scale = np.prod(upsample_scales)
        # np.prod yields a numpy integer, so convert to a plain int before
        # it is used as a slice bound in forward()
        self.indent = int(aux_context_window * total_scale)
        self.resnet = MelResNet(
            res_blocks=res_blocks,
            aux_channels=aux_channels,
            compute_dims=compute_dims,
            res_out_dims=res_out_dims,
            aux_context_window=aux_context_window)
        self.resnet_stretch = Stretch2D(total_scale, 1)
        self.up_layers = nn.LayerList()
        for scale in upsample_scales:
            k_size = (1, scale * 2 + 1)
            padding = (0, scale)
            stretch = Stretch2D(scale, 1)
            conv = nn.Conv2D(
                1, 1, kernel_size=k_size, padding=padding, bias_attr=False)
            # start every smoothing kernel as a moving average
            weight_ = paddle.full_like(conv.weight, 1. / k_size[1])
            conv.weight.set_value(weight_)
            self.up_layers.append(stretch)
            self.up_layers.append(conv)

    def forward(self, m):
        '''
        Parameters
        ----------
        m : Tensor
            Input mel spectrogram (B, C_aux, T).
        Returns
        ----------
        Tensor
            Output tensor (B, (T - 2 * pad) * prod(upsample_scales), C_aux).
        Tensor
            Output tensor (B, (T - 2 * pad) * prod(upsample_scales), res_out_dims).
        '''
        # aux: [B, C_aux, T]
        # -> [B, res_out_dims, T - 2 * aux_context_window]
        # -> [B, 1, res_out_dims, T - 2 * aux_context_window]
        aux = self.resnet(m).unsqueeze(1)
        # aux: [B, 1, res_out_dims, T - 2 * aux_context_window]
        # -> [B, 1, res_out_dims, (T - 2 * pad) * prod(upsample_scales)]
        aux = self.resnet_stretch(aux)
        # aux: [B, 1, res_out_dims, (T - 2 * pad) * prod(upsample_scales)]
        # -> [B, res_out_dims, (T - 2 * pad) * prod(upsample_scales)]
        aux = aux.squeeze(1)
        # m: [B, C_aux, T] -> [B, 1, C_aux, T]
        m = m.unsqueeze(1)
        for f in self.up_layers:
            m = f(m)
        # m: [B, 1, C_aux, T * prod(upsample_scales)]
        # -> [B, C_aux, T * prod(upsample_scales)]
        m = m.squeeze(1)
        # Trim the upsampled context frames on both sides. With a zero
        # indent the slice `0:-0` equals `0:0` and would return an empty
        # tensor, so the trim is skipped in that case.
        if self.indent > 0:
            # -> [B, C_aux, (T - 2 * pad) * prod(upsample_scales)]
            m = m[:, :, self.indent:-self.indent]
        # m: [B, (T - 2 * pad) * prod(upsample_scales), C_aux]
        # aux: [B, (T - 2 * pad) * prod(upsample_scales), res_out_dims]
        return m.transpose([0, 2, 1]), aux.transpose([0, 2, 1])
 class WaveRNN(nn.Layer):
    def __init__(
            self,
            rnn_dims: int=512,
            fc_dims: int=512,
            bits: int=9,
            aux_context_window: int=2,
            upsample_scales: List[int]=[4, 5, 3, 5],
            aux_channels: int=80,
            compute_dims: int=128,
            res_out_dims: int=128,
            res_blocks: int=10,
            hop_length: int=300,
            sample_rate: int=24000,
            mode='RAW',
            init_type: str="xavier_uniform", ):
        '''
        Parameters
        ----------
        rnn_dims : int, optional
            Hidden dims of RNN Layers.
        fc_dims : int, optional
             Dims of FC Layers.
        bits : int, optional
            bit depth of signal.
        aux_context_window : int, optional
            The context window size of the first convolution applied to the 
            auxiliary input, by default 2
        upsample_scales : List[int], optional
            Upsample scales of the upsample network.
        aux_channels : int, optional
            Auxiliary channel of the residual blocks.
        compute_dims : int, optional
            Dims of Conv1D in MelResNet.
        res_out_dims : int, optional
            Dims of output in MelResNet.
        res_blocks : int, optional
            Number of residual blocks.
        hop_length : int, optional
            Number of waveform samples per mel frame; used by `generate()`
            to compute the length of the synthesized waveform.
        sample_rate : int, optional
            Sample rate of the waveform; stored on the instance.
        mode : str, optional
            Output mode of the WaveRNN vocoder. `MOL` for Mixture of Logistic Distribution,
            and `RAW` for quantized bits as the model's output.
        init_type : str
            How to initialize parameters.
        Raises
        ----------
        RuntimeError
            If `mode` is neither `RAW` nor `MOL`.
        '''
        super().__init__()
        self.mode = mode
        self.aux_context_window = aux_context_window
        if self.mode == 'RAW':
            self.n_classes = 2**bits
        elif self.mode == 'MOL':
            self.n_classes = 10 * 3
        else:
            # Fail right away: without `raise` the error object was discarded
            # and construction only failed later on the missing `n_classes`.
            raise RuntimeError('Unknown model mode value - ', self.mode)
        # List of rnns to call 'flatten_parameters()' on
        self._to_flatten = []
        self.rnn_dims = rnn_dims
        # the MelResNet output is split into 4 equal parts along its channels
        self.aux_dims = res_out_dims // 4
        self.hop_length = hop_length
        self.sample_rate = sample_rate
        # initialize parameters
        # NOTE(review): this runs before any sub-layer exists, so presumably
        # it installs a global initializer that the layers below pick up and
        # that is reset at the end of this method - confirm in nets_utils.
        initialize(self, init_type)
        self.upsample = UpsampleNetwork(
            aux_channels=aux_channels,
            upsample_scales=upsample_scales,
            compute_dims=compute_dims,
            res_blocks=res_blocks,
            res_out_dims=res_out_dims,
            aux_context_window=aux_context_window)
        # input: previous sample (1) + mel frame + first aux slice
        self.I = nn.Linear(aux_channels + self.aux_dims + 1, rnn_dims)
        self.rnn1 = nn.GRU(rnn_dims, rnn_dims)
        self.rnn2 = nn.GRU(rnn_dims + self.aux_dims, rnn_dims)
        self._to_flatten += [self.rnn1, self.rnn2]
        self.fc1 = nn.Linear(rnn_dims + self.aux_dims, fc_dims)
        self.fc2 = nn.Linear(fc_dims + self.aux_dims, fc_dims)
        self.fc3 = nn.Linear(fc_dims, self.n_classes)
        # Avoid fragmentation of RNN parameters and associated warning
        self._flatten_parameters()
        nn.initializer.set_global_initializer(None)
    def forward(self, x, c):
        '''
        Teacher-forced pass used for training.

        Parameters
        ----------
        x : Tensor
            wav sequence, [B, T]
        c : Tensor
            mel spectrogram [B, C_aux, T']
        T = (T' - 2 * aux_context_window ) * hop_length
        Returns
        ----------
        Tensor
            [B, T, n_classes]
        '''
        # Parameters are flattened in __init__ already, but a replicated
        # model (DataParallel) no longer guarantees contiguous weights in
        # GPU memory, so flatten again on every call.
        self._flatten_parameters()
        batch = paddle.shape(x)[0]
        h1 = paddle.zeros([1, batch, self.rnn_dims])
        h2 = paddle.zeros([1, batch, self.rnn_dims])
        # c: [B, T, C_aux]
        # aux: [B, T, res_out_dims]
        c, aux = self.upsample(c)
        # split aux into four consecutive slices of aux_dims channels
        d = self.aux_dims
        a1 = aux[:, :, :d]
        a2 = aux[:, :, d:2 * d]
        a3 = aux[:, :, 2 * d:3 * d]
        a4 = aux[:, :, 3 * d:4 * d]
        rnn_in = self.I(paddle.concat([x.unsqueeze(-1), c, a1], axis=2))
        # first GRU with residual connection
        rnn1_out, _ = self.rnn1(rnn_in, h1)
        hidden = rnn1_out + rnn_in
        # second GRU with residual connection
        rnn2_out, _ = self.rnn2(paddle.concat([hidden, a2], axis=2), h2)
        hidden = rnn2_out + hidden
        hidden = F.relu(self.fc1(paddle.concat([hidden, a3], axis=2)))
        hidden = F.relu(self.fc2(paddle.concat([hidden, a4], axis=2)))
        return self.fc3(hidden)
    @paddle.no_grad()
    def generate(self,
                 c,
                 batched: bool=True,
                 target: int=12000,
                 overlap: int=600,
                 mu_law: bool=True,
                 gen_display: bool=False):
        """
        Autoregressively synthesize a waveform from a mel spectrogram.

        The layer is switched to eval mode while generating; the mode it
        had on entry is restored before returning.

        Parameters
        ----------
        c : Tensor
            input mels, (T', C_aux)
        batched : bool
            generate in batch or not
        target : int
            target number of samples to be generated in each batch entry
        overlap : int
            number of samples for crossfading between batches
        mu_law : bool
            use mu law or not (only honoured in `RAW` mode)
        gen_display : bool
            print a progress bar every 1000 steps or not
        Returns
        ----------
        Tensor
            wav sequence, (T, 1) with T at most T' * hop_length.
        Raises
        ----------
        RuntimeError
            If `self.mode` is neither `RAW` nor `MOL`.
        """
        # Remember the mode on entry so that an eval-mode model is not
        # silently put into train mode by generating.
        was_training = self.training
        self.eval()
        mu_law = mu_law if self.mode == 'RAW' else False
        output = []
        start = time.time()
        # pseudo batch
        # (T, C_aux) -> (1, C_aux, T)
        c = paddle.transpose(c, [1, 0]).unsqueeze(0)
        T = paddle.shape(c)[-1]
        wave_len = T * self.hop_length
        # TODO remove two transpose op by modifying function pad_tensor
        c = self.pad_tensor(
            c.transpose([0, 2, 1]), pad=self.aux_context_window,
            side='both').transpose([0, 2, 1])
        c, aux = self.upsample(c)
        if batched:
            # (num_folds, target + 2 * overlap, features)
            c = self.fold_with_overlap(c, target, overlap)
            aux = self.fold_with_overlap(aux, target, overlap)
        # for dygraph to static graph, if use seq_len of `b_size, seq_len, _ = paddle.shape(c)` in for
        # will not get TensorArray
        # see https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/04_dygraph_to_static/case_analysis_cn.html#list-lodtensorarray
        # b_size, seq_len, _ = paddle.shape(c)
        b_size = paddle.shape(c)[0]
        seq_len = paddle.shape(c)[1]
        h1 = paddle.zeros([b_size, self.rnn_dims])
        h2 = paddle.zeros([b_size, self.rnn_dims])
        # the first "previous sample" fed to the network is zero
        x = paddle.zeros([b_size, 1])
        d = self.aux_dims
        aux_split = [aux[:, :, d * i:d * (i + 1)] for i in range(4)]
        for i in range(seq_len):
            m_t = c[:, i, :]
            # for dygraph to static graph
            # a1_t, a2_t, a3_t, a4_t = (a[:, i, :] for a in aux_split)
            a1_t = aux_split[0][:, i, :]
            a2_t = aux_split[1][:, i, :]
            a3_t = aux_split[2][:, i, :]
            a4_t = aux_split[3][:, i, :]
            x = paddle.concat([x, m_t, a1_t], axis=1)
            x = self.I(x)
            # use GRUCell here
            h1, _ = self.rnn1[0].cell(x, h1)
            x = x + h1
            inp = paddle.concat([x, a2_t], axis=1)
            # use GRUCell here
            h2, _ = self.rnn2[0].cell(inp, h2)
            x = x + h2
            x = paddle.concat([x, a3_t], axis=1)
            x = F.relu(self.fc1(x))
            x = paddle.concat([x, a4_t], axis=1)
            x = F.relu(self.fc2(x))
            logits = self.fc3(x)
            if self.mode == 'MOL':
                sample = sample_from_discretized_mix_logistic(
                    logits.unsqueeze(0).transpose([0, 2, 1]))
                output.append(sample.reshape([-1]))
                x = sample.transpose([1, 0, 2])
            elif self.mode == 'RAW':
                posterior = F.softmax(logits, axis=1)
                distrib = paddle.distribution.Categorical(posterior)
                # corresponding operate [np.floor((fx + 1) / 2 * mu + 0.5)] in enocde_mu_law
                # distrib.sample([1])[0].cast('float32'): [0, 2**bits-1]
                # sample: [-1, 1]
                sample = 2 * distrib.sample([1])[0].cast('float32') / (
                    self.n_classes - 1.) - 1.
                output.append(sample)
                x = sample.unsqueeze(-1)
            else:
                raise RuntimeError('Unknown model mode value - ', self.mode)
            if gen_display:
                if i % 1000 == 0:
                    self.gen_display(i, int(seq_len), int(b_size), start)
        # (seq_len, b_size) -> (b_size, seq_len)
        output = paddle.stack(output).transpose([1, 0])
        if mu_law:
            output = decode_mu_law(output, self.n_classes, False)
        if batched:
            output = self.xfade_and_unfold(output, target, overlap)
        else:
            output = output[0]
        # Fade-out at the end to avoid signal cutting out suddenly
        # NOTE(review): this assumes the output holds at least
        # 10 * hop_length samples; shorter inputs look like they would hit a
        # shape mismatch here - confirm.
        fade_out = paddle.linspace(1, 0, 10 * self.hop_length)
        output = output[:wave_len]
        output[-10 * self.hop_length:] *= fade_out
        # Go back to train mode only if the layer was training on entry
        if was_training:
            self.train()
        # add the C_out dimension
        return output.unsqueeze(-1)
    def _flatten_parameters(self):
        [m.flatten_parameters() for m in self._to_flatten]
    def pad_tensor(self, x, pad, side='both'):
        '''
        Zero-pad a tensor along its second (time) axis.

        Parameters
        ----------
        x : Tensor
            mel, [1, n_frames, 80]
        pad : int
            Number of zero frames to add on each padded side.
        side : str 
            'both', 'before' or 'after'
        Returns
        ----------
        Tensor
            Padded tensor; the time axis grows by `2 * pad` for 'both' and
            by `pad` otherwise.
        Raises
        ----------
        ValueError
            If `side` is not one of 'both', 'before' or 'after'.
        '''
        # An unknown side used to fall through both branches below and
        # return an all-zero tensor, silently dropping the input.
        if side not in ('both', 'before', 'after'):
            raise ValueError(
                "side must be 'both', 'before' or 'after', got {!r}".format(
                    side))
        b, t, _ = paddle.shape(x)
        # for dygraph to static graph
        c = x.shape[-1]
        total = t + 2 * pad if side == 'both' else t + pad
        padded = paddle.zeros([b, total, c])
        if side == 'before' or side == 'both':
            # leave `pad` zero frames in front of the data
            padded[:, pad:pad + t, :] = x
        elif side == 'after':
            # data first, zero frames behind it
            padded[:, :t, :] = x
        return padded
    def fold_with_overlap(self, x, target, overlap):
        '''
        Cut one long sequence into overlapping folds for batched inference.
        The overlapping regions are crossfaded later in xfade_and_unfold().
        Parameters
        ----------
        x : Tensor
            Upsampled conditioning features. mels or aux
            shape=(1, T, features)
            mels: [1, T, 80]
            aux: [1, T, 128]
        target : int
            Target timesteps for each index of batch
        overlap : int
            Timesteps for both xfade and rnn warmup
            overlap = hop_length * 2
        Returns
        ----------
        Tensor 
            shape=(num_folds, target + 2 * overlap, features)
            num_folds = (time_seq - overlap) // (target + overlap)
            mel: [num_folds, target + 2 * overlap, 80]
            aux: [num_folds, target + 2 * overlap, 128]
        Details
        ----------
        x = [[h1, h2, ... hn]]
        Where each h is a vector of conditioning features
        Eg: target=2, overlap=1 with x.size(1)=10
        folded = [[h1, h2, h3, h4],
                  [h4, h5, h6, h7],
                  [h7, h8, h9, h10]]
        '''
        _, total_len, features = paddle.shape(x)
        # distance between the starts of two consecutive folds
        stride = target + overlap
        # length of every fold
        fold_len = target + 2 * overlap
        num_folds = (total_len - overlap) // stride
        extended_len = num_folds * stride + overlap
        remaining = total_len - extended_len
        # Leftover time steps get a fold of their own, padded with zeros
        if remaining != 0:
            num_folds += 1
            padding = fold_len - remaining
            x = self.pad_tensor(x, padding, side='after')
        folded = paddle.zeros([num_folds, fold_len, features])
        # Copy each window of the (padded) sequence into its fold
        for fold_idx in range(num_folds):
            begin = fold_idx * stride
            stop = begin + fold_len
            folded[fold_idx] = x[0][begin:stop, :]
        return folded
    def xfade_and_unfold(self, y, target: int=12000, overlap: int=600):
        ''' Applies a crossfade and unfolds into a 1d array.
        Parameters
        ----------
        y : Tensor
            Batched sequences of audio samples
            shape=(num_folds, target + 2 * overlap)
            dtype=paddle.float32
            The fade envelopes are applied to `y` in place.
        target : int
            Ignored; the value is recomputed from the shape of `y`.
        overlap : int
            Timesteps for both xfade and rnn warmup
        Returns
        ----------
        Tensor
            audio samples in a 1d array
            shape=(total_len)
            dtype=paddle.float32
        Details
        ----------
        y = [[seq1],
            [seq2],
            [seq3]]
        Apply a gain envelope at both ends of the sequences
        y = [[seq1_in, seq1_target, seq1_out],
            [seq2_in, seq2_target, seq2_out],
            [seq3_in, seq3_target, seq3_out]]
        Stagger and add up the groups of samples:
        [seq1_in, seq1_target, (seq1_out + seq2_in), seq2_target, ...]
        '''
        # num_folds = (total_len - overlap) // (target + overlap)
        num_folds, length = paddle.shape(y)
        target = length - 2 * overlap
        total_len = num_folds * (target + overlap) + overlap
        # Need some silence for the run warmup
        silence_len = overlap // 2
        fade_len = overlap - silence_len
        silence = paddle.zeros([silence_len], dtype=paddle.float32)
        # The flat part of the fade-out mirrors the silent part of the
        # fade-in, so both envelopes are exactly `overlap` samples long.
        # Sizing it with fade_len made the fade-out one sample too long
        # whenever overlap is odd.
        linear = paddle.ones([silence_len], dtype=paddle.float32)
        # Equal power crossfade
        # fade_in increase from 0 to 1, fade_out reduces from 1 to 0
        t = paddle.linspace(-1, 1, fade_len, dtype=paddle.float32)
        fade_in = paddle.sqrt(0.5 * (1 + t))
        fade_out = paddle.sqrt(0.5 * (1 - t))
        # Concat the silence to the fades
        fade_out = paddle.concat([linear, fade_out])
        fade_in = paddle.concat([silence, fade_in])
        # Apply the gain to the overlap samples
        y[:, :overlap] *= fade_in
        y[:, -overlap:] *= fade_out
        unfolded = paddle.zeros([total_len], dtype=paddle.float32)
        # Loop to add up all the samples
        for i in range(num_folds):
            start = i * (target + overlap)
            end = start + target + 2 * overlap
            unfolded[start:end] += y[i]
        return unfolded
    def gen_display(self, i, seq_len, b_size, start):
        '''
        Write a one-line progress report for `generate()` to stdout.

        Parameters
        ----------
        i : int
            Index of the current generation step.
        seq_len : int
            Total number of generation steps.
        b_size : int
            Number of sequences generated in parallel.
        start : float
            `time.time()` value taken when generation started.
        '''
        elapsed = time.time() - start
        # generated samples per second, in kHz
        gen_rate = (i + 1) / elapsed * b_size / 1000
        pbar = self.progbar(i, seq_len)
        done = i * b_size
        total = seq_len * b_size
        msg = f'| {pbar} {done}/{total} | Batch Size: {b_size} | Gen Rate: {gen_rate:.1f}kHz | '
        # the carriage return rewrites the same terminal line on each call
        sys.stdout.write(f"\r{msg}")
    def progbar(self, i, n, size=16):
        '''
        Render a text progress bar.

        Parameters
        ----------
        i : int
            Current step.
        n : int
            Total number of steps.
        size : int
            Number of characters in the bar.
        Returns
        ----------
        str
            `size` characters; positions up to and including
            `int(i * size) // n` are filled.
        '''
        done = int(i * size) // n
        return ''.join('█' if pos <= done else '░' for pos in range(size))
 class WaveRNNInference(nn.Layer):
    '''
    Inference wrapper: normalize a log-mel spectrogram, then run WaveRNN.

    Parameters
    ----------
    normalizer : Layer
        Callable applied to the input before generation.
    wavernn : WaveRNN
        Model whose `generate()` produces the waveform.
    '''

    def __init__(self, normalizer, wavernn):
        super().__init__()
        self.normalizer = normalizer
        self.wavernn = wavernn

    def forward(self,
                logmel,
                batched: bool=True,
                target: int=12000,
                overlap: int=600,
                mu_law: bool=True,
                gen_display: bool=False):
        '''
        Parameters
        ----------
        logmel : Tensor
            Log-mel spectrogram passed to the normalizer.
        batched, target, overlap, mu_law, gen_display
            Accepted for interface compatibility but currently NOT
            forwarded; `generate()` always runs with its own defaults.
        Returns
        ----------
        Tensor
            Waveform returned by `wavernn.generate()`.
        '''
        return self.wavernn.generate(self.normalizer(logmel))
--- a/paddlespeech/t2s/models/wavernn/wavernn_updater.py
+++ b/paddlespeech/t2s/models/wavernn/wavernn_updater.py
@ -0,0 +1,201 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import logging
 from pathlib import Path
 import paddle
 import soundfile as sf
 from paddle import distributed as dist
 from paddle.io import DataLoader
 from paddle.nn import Layer
 from paddle.optimizer import Optimizer
 from paddlespeech.t2s.training.extensions.evaluator import StandardEvaluator
 from paddlespeech.t2s.training.reporter import report
 from paddlespeech.t2s.training.updaters.standard_updater import StandardUpdater
 # Configure the root logger's record format (timestamp, level, file:line)
 # and create this module's logger, which emits INFO and above.
 logging.basicConfig(
    format='%(asctime)s [%(levelname)s] [%(filename)s:%(lineno)d] %(message)s',
    datefmt='[%Y-%m-%d %H:%M:%S]')
 logger = logging.getLogger(__name__)
 logger.setLevel(logging.INFO)
 def calculate_grad_norm(parameters, norm_type: float=2):
    '''
    Calculate the overall gradient norm of a model's parameters.

    Parameters
    ------------
    parameters : Iterable
        Model parameters; those whose `grad` is None are skipped.
    norm_type : float
        Order of the norm, used both for each gradient and for combining
        the per-parameter norms.
    Returns
    ------------
    Tensor
        grad_norm; a zero tensor of shape [1] when no parameter has a
        gradient.
    '''
    grad_list = [
        paddle.to_tensor(p.grad) for p in parameters if p.grad is not None
    ]
    # paddle.stack cannot take an empty list; no gradients means a zero norm
    if not grad_list:
        return paddle.zeros([1])
    norm_list = paddle.stack(
        [paddle.norm(grad, norm_type) for grad in grad_list])
    # Combine with the same order, so the result is the norm_type-norm of
    # all gradients rather than always a 2-norm of the per-parameter norms.
    total_norm = paddle.norm(norm_list, norm_type)
    return total_norm
 # Module-level step counter: written by WaveRNNUpdater.update_core() and read
 # by WaveRNNEvaluator.__call__() to name the files from gen_valid_samples().
 ITERATION = 0
 class WaveRNNUpdater(StandardUpdater):
    '''
    Updater that runs one WaveRNN optimization step per batch.

    Parameters
    ----------
    model : Layer
        WaveRNN model to train.
    optimizer : Optimizer
        Optimizer stepping the model's parameters.
    criterion : Layer
        Loss called as `criterion(y_hat, y)`.
    dataloader : DataLoader
        Yields `(wav, y, mel)` batches.
    init_state
        Initial updater state, forwarded to StandardUpdater.
    output_dir : Path
        Directory receiving `worker_<rank>.log`; must not be None.
    mode : str
        `RAW` or `MOL`, selects how predictions and targets are shaped.
    '''

    def __init__(self,
                 model: Layer,
                 optimizer: Optimizer,
                 criterion: Layer,
                 dataloader: DataLoader,
                 init_state=None,
                 output_dir: Path=None,
                 mode='RAW'):
        # Forward the caller's init_state; it used to be replaced by None,
        # so a supplied state was silently dropped.
        super().__init__(model, optimizer, dataloader, init_state=init_state)
        self.criterion = criterion
        # self.scheduler = scheduler
        log_file = output_dir / 'worker_{}.log'.format(dist.get_rank())
        self.filehandler = logging.FileHandler(str(log_file))
        logger.addHandler(self.filehandler)
        self.logger = logger
        self.msg = ""
        self.mode = mode

    def update_core(self, batch):
        '''
        Run forward, backward and an optimizer step on one batch, and
        report the loss and the gradient norm.

        Parameters
        ----------
        batch : tuple
            `(wav, y, mel)`: model input waveform, target and mel.
        '''
        self.msg = "Rank: {}, ".format(dist.get_rank())
        losses_dict = {}
        # parse batch
        self.model.train()
        self.optimizer.clear_grad()
        wav, y, mel = batch
        y_hat = self.model(wav, mel)
        if self.mode == 'RAW':
            # [B, T, n_classes] -> [B, n_classes, T, 1]
            y_hat = y_hat.transpose([0, 2, 1]).unsqueeze(-1)
        elif self.mode == 'MOL':
            # Cast the TARGET to float. Assigning the cast to y_hat threw the
            # prediction away, so the loss no longer depended on the model.
            y = paddle.cast(y, dtype='float32')
        y = y.unsqueeze(-1)
        loss = self.criterion(y_hat, y)
        loss.backward()
        # measure the gradients before the optimizer consumes them
        grad_norm = float(
            calculate_grad_norm(self.model.parameters(), norm_type=2))
        self.optimizer.step()
        loss_value = float(loss)
        report("train/loss", loss_value)
        report("train/grad_norm", grad_norm)
        losses_dict["loss"] = loss_value
        losses_dict["grad_norm"] = grad_norm
        self.msg += ', '.join('{}: {:>.6f}'.format(k, v)
                              for k, v in losses_dict.items())
        # publish the step count for WaveRNNEvaluator's sample file names
        global ITERATION
        ITERATION = self.state.iteration + 1
 class WaveRNNEvaluator(StandardEvaluator):
    '''
    Evaluator that reports the validation loss and periodically writes
    generated audio samples.

    Parameters
    ----------
    model : Layer
        WaveRNN model to evaluate.
    criterion : Layer
        Loss called as `criterion(y_hat, y)`.
    dataloader : DataLoader
        Yields `(wav, y, mel)` validation batches.
    output_dir : Path
        Directory receiving `worker_<rank>.log` and `valid_samples/`;
        must not be None.
    valid_generate_loader
        Iterable of items with `feats` and `wave` entries used for sample
        generation.
    config
        Configuration; `model.mode`, `generate_num`, `fs`, `mu_law`,
        `gen_eval_samples_interval_steps` and `inference.*` are read.
        Must not be None.
    '''

    def __init__(self,
                 model: Layer,
                 criterion: Layer,
                 dataloader: DataLoader,
                 output_dir: Path=None,
                 valid_generate_loader=None,
                 config=None):
        super().__init__(model, dataloader)
        log_file = output_dir / 'worker_{}.log'.format(dist.get_rank())
        self.filehandler = logging.FileHandler(str(log_file))
        logger.addHandler(self.filehandler)
        self.logger = logger
        self.msg = ""
        self.criterion = criterion
        self.valid_generate_loader = valid_generate_loader
        self.config = config
        self.mode = config.model.mode
        self.valid_samples_dir = output_dir / "valid_samples"
        self.valid_samples_dir.mkdir(parents=True, exist_ok=True)

    def evaluate_core(self, batch):
        '''
        Compute and report the loss of one validation batch.

        Parameters
        ----------
        batch : tuple
            `(wav, y, mel)`: model input waveform, target and mel.
        '''
        self.msg = "Evaluate: "
        losses_dict = {}
        # parse batch
        wav, y, mel = batch
        y_hat = self.model(wav, mel)
        if self.mode == 'RAW':
            # [B, T, n_classes] -> [B, n_classes, T, 1]
            y_hat = y_hat.transpose([0, 2, 1]).unsqueeze(-1)
        elif self.mode == 'MOL':
            # Cast the TARGET to float. Assigning the cast to y_hat threw the
            # prediction away, so the loss compared the target with itself.
            y = paddle.cast(y, dtype='float32')
        y = y.unsqueeze(-1)
        loss = self.criterion(y_hat, y)
        loss_value = float(loss)
        report("eval/loss", loss_value)
        losses_dict["loss"] = loss_value
        self.msg += ', '.join('{}: {:>.6f}'.format(k, v)
                              for k, v in losses_dict.items())
        self.logger.info(self.msg)

    def gen_valid_samples(self):
        '''
        Write the reference and the generated waveform for the first
        `config.generate_num` items of `valid_generate_loader`.

        Reads `self.iteration`, which `__call__` sets before calling this.
        '''
        for i, item in enumerate(self.valid_generate_loader):
            if i >= self.config.generate_num:
                break
            print(
                '\n| Generating: {}/{}'.format(i + 1, self.config.generate_num))
            mel = item['feats']
            wav = item['wave']
            wav = wav.squeeze(0)
            # pass a str path, as is done for the generated sample below
            origin_save_path = str(
                self.valid_samples_dir / '{}_steps_{}_target.wav'.format(
                    self.iteration, i))
            sf.write(origin_save_path, wav.numpy(), samplerate=self.config.fs)
            if self.config.inference.gen_batched:
                batch_str = 'gen_batched_target{}_overlap{}'.format(
                    self.config.inference.target, self.config.inference.overlap)
            else:
                batch_str = 'gen_not_batched'
            gen_save_path = str(self.valid_samples_dir /
                                '{}_steps_{}_{}.wav'.format(self.iteration, i,
                                                            batch_str))
            # (1, T, C_aux) -> (T, C_aux)
            mel = mel.squeeze(0)
            gen_sample = self.model.generate(
                mel, self.config.inference.gen_batched,
                self.config.inference.target, self.config.inference.overlap,
                self.config.mu_law)
            sf.write(
                gen_save_path, gen_sample.numpy(), samplerate=self.config.fs)

    def __call__(self, trainer=None):
        '''
        Evaluate, report every summary entry, and generate samples when the
        current step is a multiple of `gen_eval_samples_interval_steps`.
        '''
        summary = self.evaluate()
        for k, v in summary.items():
            report(k, v)
        # gen samples at the end of evaluate
        self.iteration = ITERATION
        if self.iteration % self.config.gen_eval_samples_interval_steps == 0:
            self.gen_valid_samples()
--- a/paddlespeech/t2s/modules/losses.py
+++ b/paddlespeech/t2s/modules/losses.py
@ -14,6 +14,7 @@
 import math
 import librosa
 import numpy as np
 import paddle
 from paddle import nn
 from paddle.fluid.layers import sequence_mask
@ -23,6 +24,145 @@ from scipy import signal
 from paddlespeech.t2s.modules.nets_utils import make_non_pad_mask
 # Losses for WaveRNN
 def log_sum_exp(x):
     """Numerically stable log-sum-exp over the last axis of ``x``.

     The per-row maximum is subtracted before exponentiating so that
     ``exp`` cannot overflow, and is added back after taking the log.
     The last axis is reduced away in the result.
     """
     # TF ordering: the reduced axis is the trailing one
     last_axis = len(x.shape) - 1
     peak = paddle.max(x, axis=last_axis)
     # same maximum, kept broadcastable against x
     peak_keepdim = paddle.max(x, axis=last_axis, keepdim=True)
     shifted_sum = paddle.sum(paddle.exp(x - peak_keepdim), axis=last_axis)
     return peak + paddle.log(shifted_sum)
 # It is adapted from https://github.com/r9y9/wavenet_vocoder/blob/master/wavenet_vocoder/mixture.py
 def discretized_mix_logistic_loss(y_hat,
                                   y,
                                   num_classes=65536,
                                   log_scale_min=None,
                                   reduce=True):
     """Negative log-likelihood under a discretized mixture of logistics.

     Adapted from r9y9/wavenet_vocoder (wavenet_vocoder/mixture.py).

     Parameters
     ----------
     y_hat : Tensor
         Predicted mixture parameters, (B, T, 3 * num_mixtures). The last
         axis holds, in order, the mixture logits, the means and the
         log-scales.
     y : Tensor
         Target, (B, T, 1); it is expanded to the shape of the means.
         NOTE(review): the +/-0.999 thresholds below suggest values in
         [-1, 1] -- confirm against the caller.
     num_classes : int
         Number of quantization levels; sets the half-width of one bin.
     log_scale_min : float
         Lower clip for the log-scales. Defaults to log(1e-14).
     reduce : bool
         If true, return the mean loss; otherwise the per-step loss.

     Returns
     ----------
     Tensor
         Mean negative log-likelihood when ``reduce`` is true, otherwise
         the per-step negative log-likelihood with a trailing axis of
         size 1.
     """
     if log_scale_min is None:
         log_scale_min = float(np.log(1e-14))

     # check rank and channel count on a (B, C, T) view ...
     channel_first = y_hat.transpose([0, 2, 1])
     assert channel_first.dim() == 3
     assert channel_first.shape[1] % 3 == 0
     nr_mix = channel_first.shape[1] // 3
     # ... then go back to (B x T x C)
     y_hat = channel_first.transpose([0, 2, 1])

     # unpack parameters. (B, T, num_mixtures) x 3
     logit_probs = y_hat[:, :, :nr_mix]
     means = y_hat[:, :, nr_mix:2 * nr_mix]
     raw_log_scales = y_hat[:, :, 2 * nr_mix:3 * nr_mix]
     log_scales = paddle.clip(raw_log_scales, min=log_scale_min)

     # B x T x 1 -> B x T x num_mixtures
     y = y.expand_as(means)
     float_dtype = paddle.get_default_dtype()

     centered_y = paddle.cast(y, dtype=float_dtype) - means
     inv_stdv = paddle.exp(-log_scales)
     # half of one quantization bin
     half_bin = 1. / (num_classes - 1)
     plus_in = inv_stdv * (centered_y + half_bin)
     min_in = inv_stdv * (centered_y - half_bin)
     mid_in = inv_stdv * centered_y

     # probability mass of the bin around y (all non-edge cases)
     cdf_delta = F.sigmoid(plus_in) - F.sigmoid(min_in)

     # log CDF at the upper bin edge, used for the lowest level;
     # log(sigmoid(x)) = x - softplus(x), with softplus(x) = log(1 + e^x)
     log_cdf_plus = plus_in - F.softplus(plus_in)
     # log(1 - CDF) at the lower bin edge, used for the highest level;
     # log(1 - sigmoid(x)) = -softplus(x)
     log_one_minus_cdf_min = -F.softplus(min_in)
     # log density at the bin center, the fallback when the bin mass is
     # too small to take its log reliably
     log_pdf_mid = mid_in - log_scales - 2. * F.softplus(mid_in)

     # TODO: cdf_delta <= 1e-5 actually can happen. How can we choose the
     # value for num_classes=65536 case? 1e-7? not sure..
     has_mass = paddle.cast(cdf_delta > 1e-5, dtype=float_dtype)
     log_bin_mass = paddle.log(paddle.clip(cdf_delta, min=1e-12))
     log_mid_approx = log_pdf_mid - np.log((num_classes - 1) / 2)
     interior = has_mass * log_bin_mass + (1. - has_mass) * log_mid_approx

     # float masks blend the three cases: y > 0.999, y < -0.999, interior
     at_upper = paddle.cast(y > 0.999, dtype=float_dtype)
     upper_or_interior = at_upper * log_one_minus_cdf_min + (
         1. - at_upper) * interior
     at_lower = paddle.cast(y < -0.999, dtype=float_dtype)
     log_probs = at_lower * log_cdf_plus + (1. - at_lower
                                            ) * upper_or_interior

     # weight each component by its log mixture probability
     log_probs = log_probs + F.log_softmax(logit_probs, -1)

     mixture_ll = log_sum_exp(log_probs)
     if not reduce:
         return -mixture_ll.unsqueeze(-1)
     return -paddle.mean(mixture_ll)
 def sample_from_discretized_mix_logistic(y, log_scale_min=None):
     """
     Sample from discretized mixture of logistic distributions

     Parameters
     ----------
     y : Tensor
         (B, C, T), where C must be divisible by 3. After transposing, the
         channel axis holds, in order, the mixture logits, the means and
         the log-scales.
     log_scale_min : float
         Log scale minimum value. Defaults to log(1e-14).

     Returns
     ----------
     Tensor
         (B, T) sample in range of [-1, 1].
     """
     if log_scale_min is None:
         log_scale_min = float(np.log(1e-14))

     assert y.shape[1] % 3 == 0
     nr_mix = y.shape[1] // 3

     # (B, T, C)
     y = y.transpose([0, 2, 1])
     logit_probs = y[:, :, :nr_mix]

     # sample mixture indicator from softmax (Gumbel-max trick); the
     # uniform noise is kept away from 0 and 1 so both logs stay finite
     temp = paddle.uniform(
         logit_probs.shape, dtype=logit_probs.dtype, min=1e-5, max=1.0 - 1e-5)
     temp = logit_probs - paddle.log(-paddle.log(temp))
     argmax = paddle.argmax(temp, axis=-1)

     # (B, T) -> (B, T, nr_mix)
     one_hot = F.one_hot(argmax, nr_mix)
     one_hot = paddle.cast(one_hot, dtype=paddle.get_default_dtype())

     # select logistic parameters of the chosen component
     means = paddle.sum(y[:, :, nr_mix:2 * nr_mix] * one_hot, axis=-1)
     log_scales = paddle.clip(
         paddle.sum(y[:, :, 2 * nr_mix:3 * nr_mix] * one_hot, axis=-1),
         min=log_scale_min)

     # sample from logistic (inverse CDF) & clip to interval
     # we don't actually round to the nearest quantization level when sampling
     u = paddle.uniform(means.shape, min=1e-5, max=1.0 - 1e-5)
     x = means + paddle.exp(log_scales) * (paddle.log(u) - paddle.log(1. - u))

     # Clip to [-1, 1]. The upper bound used to be -1., which forced every
     # sample to exactly -1.
     x = paddle.clip(x, min=-1., max=1.)
     return x
 # Loss for new Tacotron2
 class GuidedAttentionLoss(nn.Layer):
    """Guided attention loss function module.