PaddleSpeech/paddlespeech/t2s/models/wavernn/wavernn.py

# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Modified from https://github.com/fatchord/WaveRNN
import sys
import time
from typing import List

import numpy as np
import paddle
from paddle import nn
from paddle.nn import functional as F

from paddlespeech.t2s.audio.codec import decode_mu_law
from paddlespeech.t2s.modules.losses import sample_from_discretized_mix_logistic
from paddlespeech.t2s.modules.nets_utils import initialize
from paddlespeech.t2s.modules.upsample import Stretch2D


class ResBlock(nn.Layer):
    '''Residual block made of two bias-free, kernel-size-1 1-D convolutions,
    each followed by batch normalization.'''

    def __init__(self, dims):
        '''
        Args:
            dims (int): Channel count shared by the input, the output and
                every sub-layer of the block.
        '''
        super().__init__()
        pointwise = dict(kernel_size=1, bias_attr=False)
        self.conv1 = nn.Conv1D(dims, dims, **pointwise)
        self.conv2 = nn.Conv1D(dims, dims, **pointwise)
        self.batch_norm1 = nn.BatchNorm1D(dims)
        self.batch_norm2 = nn.BatchNorm1D(dims)

    def forward(self, x):
        '''
        Run conv -> bn -> relu -> conv -> bn and add the input back.

        Args:
            x (Tensor): Input tensor; the kernel-size-1 convolutions keep its
                shape, so it can be added to the result directly.
        Returns:
            Tensor: Transformed features plus the untouched input.
        '''
        hidden = F.relu(self.batch_norm1(self.conv1(x)))
        hidden = self.batch_norm2(self.conv2(hidden))
        return hidden + x


class MelResNet(nn.Layer):
    '''Stack of residual blocks applied to the auxiliary input features.'''

    def __init__(self,
                 res_blocks: int=10,
                 compute_dims: int=128,
                 res_out_dims: int=128,
                 aux_channels: int=80,
                 aux_context_window: int=0):
        '''
        Args:
            res_blocks (int, optional): Number of ResBlock layers.
            compute_dims (int, optional): Channel count used inside the network.
            res_out_dims (int, optional): Channel count of the output.
            aux_channels (int, optional): Channel count of the input.
            aux_context_window (int, optional): Context frames on each side seen
                by the first convolution.
        '''
        super().__init__()
        # The first convolution is not padded, so its output is
        # 2 * aux_context_window frames shorter than its input.
        kernel_width = 2 * aux_context_window + 1
        self.conv_in = nn.Conv1D(
            aux_channels,
            compute_dims,
            kernel_size=kernel_width,
            bias_attr=False)
        self.batch_norm = nn.BatchNorm1D(compute_dims)
        self.layers = nn.LayerList(
            [ResBlock(compute_dims) for _ in range(res_blocks)])
        self.conv_out = nn.Conv1D(compute_dims, res_out_dims, kernel_size=1)

    def forward(self, x):
        '''
        Args:
            x (Tensor): Input tensor (B, in_dims, T).
        Returns:
            Tensor: Output tensor (B, res_out_dims, T - 2 * aux_context_window).
        '''
        hidden = F.relu(self.batch_norm(self.conv_in(x)))
        for block in self.layers:
            hidden = block(hidden)
        return self.conv_out(hidden)


class UpsampleNetwork(nn.Layer):
    '''Upsample the conditioning features along time and compute the
    auxiliary features produced by MelResNet at the same resolution.'''

    def __init__(self,
                 aux_channels: int=80,
                 upsample_scales: List[int]=[4, 5, 3, 5],
                 compute_dims: int=128,
                 res_blocks: int=10,
                 res_out_dims: int=128,
                 aux_context_window: int=2):
        '''
        Args:
            aux_channels (int, optional): Channel count of the input features.
            upsample_scales (List[int], optional): Per-stage upsampling factors;
                their product is the total upsampling factor.
            compute_dims (int, optional): Channel count inside MelResNet.
            res_blocks (int, optional): Number of residual blocks in MelResNet.
            res_out_dims (int, optional): Channel count of the MelResNet output.
            aux_context_window (int, optional): Context frames on each side
                consumed by the first MelResNet convolution.
        '''
        super().__init__()
        # total_scale is the total upsampling multiple
        total_scale = np.prod(upsample_scales)
        # np.prod returns a numpy integer; cast so the slice bounds used in
        # forward() are plain Python ints.
        self.indent = int(aux_context_window * total_scale)
        self.resnet = MelResNet(
            res_blocks=res_blocks,
            aux_channels=aux_channels,
            compute_dims=compute_dims,
            res_out_dims=res_out_dims,
            aux_context_window=aux_context_window)
        self.resnet_stretch = Stretch2D(total_scale, 1)
        self.up_layers = nn.LayerList()
        for scale in upsample_scales:
            k_size = (1, scale * 2 + 1)
            padding = (0, scale)
            stretch = Stretch2D(scale, 1)

            conv = nn.Conv2D(
                1, 1, kernel_size=k_size, padding=padding, bias_attr=False)
            # Every tap starts at 1 / kernel_width, i.e. a moving average.
            weight_ = paddle.full_like(conv.weight, 1. / k_size[1])
            conv.weight.set_value(weight_)
            self.up_layers.append(stretch)
            self.up_layers.append(conv)

    def forward(self, m):
        '''
        Args:
            m (Tensor): Input tensor (B, C_aux, T).
        Returns:
            Tensor: Output tensor (B, (T - 2 * pad) * prod(upsample_scales), C_aux).
            Tensor: Output tensor (B, (T - 2 * pad) * prod(upsample_scales), res_out_dims).
        '''
        # aux: [B, C_aux, T]
        # -> [B, res_out_dims, T - 2 * aux_context_window]
        # -> [B, 1, res_out_dims, T - 2 * aux_context_window]
        aux = self.resnet(m).unsqueeze(1)
        # aux: [B, 1, res_out_dims, T - 2 * aux_context_window]
        # -> [B, 1, res_out_dims, (T - 2 * pad) * prod(upsample_scales)]
        aux = self.resnet_stretch(aux)
        # -> [B, res_out_dims, (T - 2 * pad) * prod(upsample_scales)]
        aux = aux.squeeze(1)
        # m: [B, C_aux, T] -> [B, 1, C_aux, T]
        m = m.unsqueeze(1)
        for f in self.up_layers:
            m = f(m)
        # m: [B, 1, C_aux, T * prod(upsample_scales)]
        # -> [B, C_aux, T * prod(upsample_scales)]
        m = m.squeeze(1)
        # Trim the context frames from both ends so m lines up with aux.
        # With indent == 0 the slice `[0:-0]` would be empty, so only slice
        # when there is something to trim.
        if self.indent > 0:
            # -> [B, C_aux, (T - 2 * pad) * prod(upsample_scales)]
            m = m[:, :, self.indent:-self.indent]
        # m: [B, (T - 2 * pad) * prod(upsample_scales), C_aux]
        # aux: [B, (T - 2 * pad) * prod(upsample_scales), res_out_dims]
        return m.transpose([0, 2, 1]), aux.transpose([0, 2, 1])


class WaveRNN(nn.Layer):
    def __init__(
            self,
            rnn_dims: int=512,
            fc_dims: int=512,
            bits: int=9,
            aux_context_window: int=2,
            upsample_scales: List[int]=[4, 5, 3, 5],
            aux_channels: int=80,
            compute_dims: int=128,
            res_out_dims: int=128,
            res_blocks: int=10,
            hop_length: int=300,
            sample_rate: int=24000,
            mode='RAW',
            init_type: str="xavier_uniform", ):
        '''
        Args:
            rnn_dims (int, optional): Hidden dims of RNN Layers.
            fc_dims (int, optional): Dims of FC Layers.
            bits (int, optional): bit depth of signal.
            aux_context_window (int, optional): The context window size of the first convolution applied to the
                auxiliary input, by default 2
            upsample_scales (List[int], optional): Upsample scales of the upsample network.
            aux_channels (int, optional): Auxiliary channel of the residual blocks.
            compute_dims (int, optional): Dims of Conv1D in MelResNet.
            res_out_dims (int, optional): Dims of output in MelResNet.
            res_blocks (int, optional): Number of residual blocks.
            hop_length (int, optional): Stored on the instance; generate() uses it to compute the
                output length and the final fade-out length.
            sample_rate (int, optional): Stored on the instance.
            mode (str, optional): Output mode of the WaveRNN vocoder.
                `MOL` for Mixture of Logistic Distribution, and `RAW` for quantized bits as the model's output.
            init_type (str): How to initialize parameters.
        Raises:
            RuntimeError: If `mode` is neither 'RAW' nor 'MOL'.
        '''
        super().__init__()
        self.mode = mode
        self.aux_context_window = aux_context_window
        if self.mode == 'RAW':
            self.n_classes = 2**bits
        elif self.mode == 'MOL':
            self.n_classes = 10 * 3
        else:
            # Fail here instead of later with an unrelated AttributeError on
            # the missing `n_classes`.
            raise RuntimeError('Unknown model mode value - ', self.mode)

        # List of rnns to call 'flatten_parameters()' on
        self._to_flatten = []

        self.rnn_dims = rnn_dims
        # The MelResNet output is consumed in four slices of this width.
        self.aux_dims = res_out_dims // 4
        self.hop_length = hop_length
        self.sample_rate = sample_rate

        # initialize parameters
        # NOTE(review): presumably this installs a global initializer that the
        # layers created below pick up (it is reset at the end of __init__) -
        # confirm against nets_utils.initialize.
        initialize(self, init_type)

        self.upsample = UpsampleNetwork(
            aux_channels=aux_channels,
            upsample_scales=upsample_scales,
            compute_dims=compute_dims,
            res_blocks=res_blocks,
            res_out_dims=res_out_dims,
            aux_context_window=aux_context_window)
        # Input of I: one previous sample + conditioning features + first aux slice.
        self.I = nn.Linear(aux_channels + self.aux_dims + 1, rnn_dims)

        self.rnn1 = nn.GRU(rnn_dims, rnn_dims)
        self.rnn2 = nn.GRU(rnn_dims + self.aux_dims, rnn_dims)

        self._to_flatten += [self.rnn1, self.rnn2]

        self.fc1 = nn.Linear(rnn_dims + self.aux_dims, fc_dims)
        self.fc2 = nn.Linear(fc_dims + self.aux_dims, fc_dims)
        self.fc3 = nn.Linear(fc_dims, self.n_classes)

        # Avoid fragmentation of RNN parameters and associated warning
        self._flatten_parameters()

        nn.initializer.set_global_initializer(None)

    def forward(self, x, c):
        '''
        Teacher-forced pass: predict the output distribution parameters for
        every sample of `x` given the conditioning features `c`.

        Args:
            x (Tensor): wav sequence, [B, T]
            c (Tensor): mel spectrogram [B, C_aux, T']

            T = (T' - 2 * aux_context_window ) * hop_length
        Returns:
            Tensor: [B, T, n_classes]
        '''
        # Parameters were flattened in __init__, but replication (e.g. under
        # DataParallel) may leave the weights non-contiguous again, so the
        # flattening is repeated on every call.
        self._flatten_parameters()

        batch = paddle.shape(x)[0]
        init_h1 = paddle.zeros([1, batch, self.rnn_dims])
        init_h2 = paddle.zeros([1, batch, self.rnn_dims])
        # c: [B, T, C_aux]
        # aux: [B, T, res_out_dims]
        c, aux = self.upsample(c)

        # Four consecutive aux_dims-wide slices of aux, one per stage below.
        d = self.aux_dims
        a1, a2, a3, a4 = [aux[:, :, d * k:d * (k + 1)] for k in range(4)]

        # Stage 1: input projection, then first GRU with a residual connection.
        x = self.I(paddle.concat([x.unsqueeze(-1), c, a1], axis=2))
        rnn1_out, _ = self.rnn1(x, init_h1)
        x = rnn1_out + x

        # Stage 2: second GRU (also fed a2) with a residual connection.
        rnn2_out, _ = self.rnn2(paddle.concat([x, a2], axis=2), init_h2)
        x = rnn2_out + x

        # Stages 3 and 4: two ReLU fully connected layers fed a3 and a4.
        x = F.relu(self.fc1(paddle.concat([x, a3], axis=2)))
        x = F.relu(self.fc2(paddle.concat([x, a4], axis=2)))

        return self.fc3(x)

    @paddle.no_grad()
    def generate(self,
                 c,
                 batched: bool=True,
                 target: int=12000,
                 overlap: int=600,
                 mu_law: bool=True,
                 gen_display: bool=False):
        """
        Autoregressively synthesize a waveform from conditioning features,
        one sample per loop iteration.

        The layer is switched to eval mode for the duration of the call and
        put back into training mode afterwards only if it was training before.

        Args:
            c(Tensor): input mels, (T', C_aux)
            batched(bool): generate in batch or not
            target(int): target number of samples to be generated in each batch entry
            overlap(int): number of samples for crossfading between batches
            mu_law(bool): decode the output with decode_mu_law; only honoured in
                'RAW' mode, forced to False otherwise
            gen_display(bool): write a progress line to stdout every 1000 steps
        Returns:
            Tensor: wav sequence truncated to at most T' * hop_length samples,
                with a trailing C_out axis of size 1.
        Raises:
            RuntimeError: If self.mode is neither 'MOL' nor 'RAW'.
        """

        # Remember the current mode so that calling generate() on a model that
        # is already in eval mode does not leave it in training mode.
        was_training = self.training
        self.eval()

        mu_law = mu_law if self.mode == 'RAW' else False

        output = []
        start = time.time()

        # pseudo batch
        # (T, C_aux) -> (1, C_aux, T)
        c = paddle.transpose(c, [1, 0]).unsqueeze(0)
        T = paddle.shape(c)[-1]
        wave_len = T * self.hop_length
        # TODO remove two transpose op by modifying function pad_tensor
        c = self.pad_tensor(
            c.transpose([0, 2, 1]), pad=self.aux_context_window,
            side='both').transpose([0, 2, 1])

        c, aux = self.upsample(c)

        if batched:
            # (num_folds, target + 2 * overlap, features)
            c = self.fold_with_overlap(c, target, overlap)
            aux = self.fold_with_overlap(aux, target, overlap)

        # for dygraph to static graph, if use seq_len of `b_size, seq_len, _ = paddle.shape(c)` in for
        # will not get TensorArray
        # see https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/04_dygraph_to_static/case_analysis_cn.html#list-lodtensorarray
        # b_size, seq_len, _ = paddle.shape(c)
        b_size = paddle.shape(c)[0]
        seq_len = paddle.shape(c)[1]

        h1 = paddle.zeros([b_size, self.rnn_dims])
        h2 = paddle.zeros([b_size, self.rnn_dims])
        # Previous sample fed back into the network; starts at zero.
        x = paddle.zeros([b_size, 1])

        d = self.aux_dims
        aux_split = [aux[:, :, d * i:d * (i + 1)] for i in range(4)]

        for i in range(seq_len):
            m_t = c[:, i, :]
            # for dygraph to static graph
            # a1_t, a2_t, a3_t, a4_t = (a[:, i, :] for a in aux_split)
            a1_t = aux_split[0][:, i, :]
            a2_t = aux_split[1][:, i, :]
            a3_t = aux_split[2][:, i, :]
            a4_t = aux_split[3][:, i, :]
            x = paddle.concat([x, m_t, a1_t], axis=1)
            x = self.I(x)
            # use GRUCell here
            h1, _ = self.rnn1[0].cell(x, h1)
            x = x + h1
            inp = paddle.concat([x, a2_t], axis=1)
            # use GRUCell here
            h2, _ = self.rnn2[0].cell(inp, h2)

            x = x + h2
            x = paddle.concat([x, a3_t], axis=1)
            x = F.relu(self.fc1(x))

            x = paddle.concat([x, a4_t], axis=1)
            x = F.relu(self.fc2(x))

            logits = self.fc3(x)

            if self.mode == 'MOL':
                sample = sample_from_discretized_mix_logistic(
                    logits.unsqueeze(0).transpose([0, 2, 1]))
                output.append(sample.reshape([-1]))
                x = sample.transpose([1, 0, 2])

            elif self.mode == 'RAW':
                # fix bug for paddle 2.3, see https://github.com/PaddlePaddle/Paddle/commit/01f606b4f1ca3e184a59111084ed460ee0798a5a
                # posterior = F.softmax(logits, axis=1)
                posterior = logits
                distrib = paddle.distribution.Categorical(posterior)
                # corresponding operate [np.floor((fx + 1) / 2 * mu + 0.5)] in encode_mu_law
                # distrib.sample([1])[0].cast('float32'): [0, 2**bits-1]
                # sample: [-1, 1]
                sample = 2 * distrib.sample([1])[0].cast('float32') / (
                    self.n_classes - 1.) - 1.
                output.append(sample)
                x = sample.unsqueeze(-1)
            else:
                raise RuntimeError('Unknown model mode value - ', self.mode)

            if gen_display:
                if i % 1000 == 0:
                    self.gen_display(i, int(seq_len), int(b_size), start)

        # [seq_len, b_size] -> [b_size, seq_len]
        output = paddle.stack(output).transpose([1, 0])

        if mu_law:
            output = decode_mu_law(output, self.n_classes, False)

        if batched:
            output = self.xfade_and_unfold(output, target, overlap)
        else:
            output = output[0]

        # Fade-out at the end to avoid signal cutting out suddenly
        # NOTE(review): this assumes the output holds at least
        # 10 * hop_length samples; shorter inputs look like they would hit a
        # shape mismatch here - confirm.
        fade_out = paddle.linspace(1, 0, 10 * self.hop_length)
        output = output[:wave_len]
        output[-10 * self.hop_length:] *= fade_out

        if was_training:
            self.train()

        # add the C_out dimension
        return output.unsqueeze(-1)

    def _flatten_parameters(self):
        [m.flatten_parameters() for m in self._to_flatten]

    def pad_tensor(self, x, pad, side='both'):
        '''
        Zero-pad a 3-D tensor along its second axis.

        Args:
            x(Tensor): mel, [1, n_frames, 80]
            pad(int): number of zero frames added on each padded side
            side(str, optional): 'before', 'after' or 'both'. (Default value = 'both')

        Returns:
            Tensor: [B, n_frames + 2 * pad, C] for 'both',
                [B, n_frames + pad, C] for 'before' and 'after'

        Raises:
            ValueError: If `side` is not 'before', 'after' or 'both'.
        '''
        # An unknown side used to fall through both branches below and return
        # an all-zero tensor without any warning.
        if side not in ('before', 'after', 'both'):
            raise ValueError(f"Unknown side value - {side}")
        b, t, _ = paddle.shape(x)
        # for dygraph to static graph
        c = x.shape[-1]
        total = t + 2 * pad if side == 'both' else t + pad
        padded = paddle.zeros([b, total, c])
        if side == 'before' or side == 'both':
            # Leave `pad` zero frames in front of the data.
            padded[:, pad:pad + t, :] = x
        elif side == 'after':
            # Data first, zero frames at the end.
            padded[:, :t, :] = x
        return padded

    def fold_with_overlap(self, x, target, overlap):
        '''
        Fold the tensor with overlap for quick batched inference.
        Overlap will be used for crossfading in xfade_and_unfold()

        Args:
            x(Tensor): Upsampled conditioning features. mels or aux
                shape=(1, T, features)
                mels: [1, T, 80]
                aux: [1, T, 128]
            target(int): Target timesteps for each index of batch
            overlap(int): Timesteps for both xfade and rnn warmup

        Returns:
            Tensor:
                shape=(num_folds, target + 2 * overlap, features)
                num_folds = (T - overlap) // (target + overlap),
                plus one extra fold when T does not divide evenly
                mel: [num_folds, target + 2 * overlap, 80]
                aux: [num_folds, target + 2 * overlap, 128]

        Details:
            x = [[h1, h2, ... hn]]
            Where each h is a vector of conditioning features
            Eg: target=2, overlap=1 with x.size(1)=10

            folded = [[h1, h2, h3, h4],
                    [h4, h5, h6, h7],
                    [h7, h8, h9, h10]]
        '''

        _, total_len, features = paddle.shape(x)

        # Calculate variables needed
        # extended_len is the length covered by the whole folds;
        # remaining is what is left over after them.
        num_folds = (total_len - overlap) // (target + overlap)
        extended_len = num_folds * (overlap + target) + overlap
        remaining = total_len - extended_len

        # Pad with zeros at the end if some time steps are left over, so that
        # they fill one extra fold
        if remaining != 0:
            num_folds += 1
            padding = target + 2 * overlap - remaining
            x = self.pad_tensor(x, padding, side='after')

        folded = paddle.zeros([num_folds, target + 2 * overlap, features])

        # Get the values for the folded tensor
        # Consecutive folds start (target + overlap) steps apart, so each fold
        # shares `overlap` steps with the next one.
        for i in range(num_folds):
            start = i * (target + overlap)
            end = start + target + 2 * overlap
            folded[i] = x[0][start:end, :]
        return folded

    def xfade_and_unfold(self, y, target: int=12000, overlap: int=600):
        ''' Applies a crossfade and unfolds into a 1d array.

        Args:
            y (Tensor):
                Batched sequences of audio samples
                shape=(num_folds, target + 2 * overlap)
                dtype=paddle.float32
                The overlap regions of `y` are scaled in place.
            target (int): Overwritten inside the function by
                `length - 2 * overlap`, so the passed value is not used.
            overlap (int): Timesteps for both xfade and rnn warmup

        Returns:
            Tensor
                audio samples in a 1d array
                shape=(total_len)
                dtype=paddle.float32

        Details:
            y = [[seq1],
                [seq2],
                [seq3]]

            Apply a gain envelope at both ends of the sequences

            y = [[seq1_in, seq1_target, seq1_out],
                [seq2_in, seq2_target, seq2_out],
                [seq3_in, seq3_target, seq3_out]]

            Stagger and add up the groups of samples:

            [seq1_in, seq1_target, (seq1_out + seq2_in), seq2_target, ...]

        '''
        # num_folds = (total_len - overlap) // (target + overlap)
        num_folds, length = paddle.shape(y)
        # target is recomputed from the actual fold length
        target = length - 2 * overlap
        total_len = num_folds * (target + overlap) + overlap

        # Need some silence for the run warmup
        # The silence length is 0 here, so the fade spans the whole overlap
        # and the two tensors below are empty.
        slience_len = 0
        linear_len = slience_len
        fade_len = overlap - slience_len
        slience = paddle.zeros([slience_len], dtype=paddle.float32)
        linear = paddle.ones([linear_len], dtype=paddle.float32)

        # Sigmoid-shaped crossfade
        # fade_in increases from 0 to 1, fade_out decreases from 1 to 0,
        # and the two gains always sum to 1
        sigmoid_scale = 2.3
        t = paddle.linspace(
            -sigmoid_scale, sigmoid_scale, fade_len, dtype=paddle.float32)
        # a sigmoid curve should work better
        fade_in = paddle.nn.functional.sigmoid(t)
        fade_out = 1 - paddle.nn.functional.sigmoid(t)
        # Concat the silence to the fades
        fade_out = paddle.concat([linear, fade_out])
        fade_in = paddle.concat([slience, fade_in])

        # Apply the gain to the overlap samples
        y[:, :overlap] *= fade_in
        y[:, -overlap:] *= fade_out

        unfolded = paddle.zeros([total_len], dtype=paddle.float32)

        # Loop to add up all the samples
        # Folds are placed (target + overlap) samples apart, so the faded tail
        # of one fold is summed with the faded head of the next.
        for i in range(num_folds):
            start = i * (target + overlap)
            end = start + target + 2 * overlap
            unfolded[start:end] += y[i]

        return unfolded

    def gen_display(self, i, seq_len, b_size, start):
        '''
        Write a one-line progress report to stdout, overwriting the current line.

        Args:
            i (int): Index of the current generation step.
            seq_len (int): Total number of generation steps.
            b_size (int): Number of sequences generated in parallel.
            start (float): `time.time()` value taken when generation started.
        '''
        elapsed = time.time() - start
        # Samples generated per second across the whole batch, in kHz.
        gen_rate = (i + 1) / elapsed * b_size / 1000
        pbar = self.progbar(i, seq_len)
        msg = f'| {pbar} {i*b_size}/{seq_len*b_size} | Batch Size: {b_size} | Gen Rate: {gen_rate:.1f}kHz | '
        # The leading carriage return moves the cursor back to the line start.
        sys.stdout.write(f"\r{msg}")

    def progbar(self, i, n, size=16):
        '''
        Render a textual progress bar.

        Args:
            i (int): Current step.
            n (int): Total number of steps.
            size (int, optional): Number of characters in the bar.

        Returns:
            str: `size` characters; positions up to and including
                `int(i * size) // n` are filled, the rest are empty.
        '''
        last_filled = int(i * size) // n
        return ''.join('█' if pos <= last_filled else '░'
                       for pos in range(size))


class WaveRNNInference(nn.Layer):
    '''Inference wrapper: normalize the input features, then run
    `wavernn.generate` on them.'''

    def __init__(self, normalizer, wavernn):
        '''
        Args:
            normalizer: Callable applied to the input features before generation.
            wavernn: Object whose `generate` method produces the waveform.
        '''
        super().__init__()
        self.normalizer = normalizer
        self.wavernn = wavernn

    def forward(self,
                logmel,
                batched: bool=True,
                target: int=12000,
                overlap: int=600,
                mu_law: bool=True,
                gen_display: bool=False):
        '''
        Args:
            logmel: Input features handed to the normalizer.
            batched, target, overlap, mu_law, gen_display: Accepted but not
                forwarded; `generate` always runs with its own defaults.
                NOTE(review): looks deliberate (the forwarding was commented
                out) - confirm before wiring these through.
        Returns:
            Whatever `wavernn.generate` returns for the normalized input.
        '''
        mel = self.normalizer(logmel)
        return self.wavernn.generate(mel)
add wavernn, test=tts 3 years ago			`# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.`
			`#`
			`# Licensed under the Apache License, Version 2.0 (the "License");`
			`# you may not use this file except in compliance with the License.`
			`# You may obtain a copy of the License at`
			`#`
			`# http://www.apache.org/licenses/LICENSE-2.0`
			`#`
			`# Unless required by applicable law or agreed to in writing, software`
			`# distributed under the License is distributed on an "AS IS" BASIS,`
			`# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`# See the License for the specific language governing permissions and`
			`# limitations under the License.`
add license and reference for some models, test=doc 3 years ago			`# Modified from https://github.com/fatchord/WaveRNN`
add wavernn, test=tts 3 years ago			`import sys`
			`import time`
			`from typing import List`

			`import numpy as np`
			`import paddle`
			`from paddle import nn`
			`from paddle.nn import functional as F`

update wavernn, test=tts 3 years ago			`from paddlespeech.t2s.audio.codec import decode_mu_law`
add wavernn, test=tts 3 years ago			`from paddlespeech.t2s.modules.losses import sample_from_discretized_mix_logistic`
			`from paddlespeech.t2s.modules.nets_utils import initialize`
			`from paddlespeech.t2s.modules.upsample import Stretch2D`


			`class ResBlock(nn.Layer):`
			`def __init__(self, dims):`
update wavernn, test=tts 3 years ago			`super().__init__()`
add wavernn, test=tts 3 years ago			`self.conv1 = nn.Conv1D(dims, dims, kernel_size=1, bias_attr=False)`
			`self.conv2 = nn.Conv1D(dims, dims, kernel_size=1, bias_attr=False)`
			`self.batch_norm1 = nn.BatchNorm1D(dims)`
			`self.batch_norm2 = nn.BatchNorm1D(dims)`

			`def forward(self, x):`
			`'''`
			`conv -> bn -> relu -> conv -> bn + residual connection`
			`'''`
			`residual = x`
			`x = self.conv1(x)`
			`x = self.batch_norm1(x)`
			`x = F.relu(x)`
			`x = self.conv2(x)`
			`x = self.batch_norm2(x)`
			`return x + residual`


			`class MelResNet(nn.Layer):`
			`def __init__(self,`
			`res_blocks: int=10,`
			`compute_dims: int=128,`
			`res_out_dims: int=128,`
			`aux_channels: int=80,`
			`aux_context_window: int=0):`
			`super().__init__()`
			`k_size = aux_context_window * 2 + 1`
			`# pay attention here, the dim reduces aux_context_window * 2`
			`self.conv_in = nn.Conv1D(`
			`aux_channels, compute_dims, kernel_size=k_size, bias_attr=False)`
			`self.batch_norm = nn.BatchNorm1D(compute_dims)`
			`self.layers = nn.LayerList()`
			`for _ in range(res_blocks):`
			`self.layers.append(ResBlock(compute_dims))`
			`self.conv_out = nn.Conv1D(compute_dims, res_out_dims, kernel_size=1)`

			`def forward(self, x):`
			`'''`
change the docstring style from numpydoc to google, test=tts 3 years ago			`Args:`
			`x (Tensor): Input tensor (B, in_dims, T).`
			`Returns:`
			`Tensor: Output tensor (B, res_out_dims, T).`
add wavernn, test=tts 3 years ago			`'''`
fix wavernn dygraph to static , test=tts 3 years ago
add wavernn, test=tts 3 years ago			`x = self.conv_in(x)`
			`x = self.batch_norm(x)`
			`x = F.relu(x)`
			`for f in self.layers:`
			`x = f(x)`
			`x = self.conv_out(x)`
			`return x`


			`class UpsampleNetwork(nn.Layer):`
			`def __init__(self,`
			`aux_channels: int=80,`
			`upsample_scales: List[int]=[4, 5, 3, 5],`
			`compute_dims: int=128,`
			`res_blocks: int=10,`
			`res_out_dims: int=128,`
			`aux_context_window: int=2):`
			`super().__init__()`
			`# total_scale is the total Up sampling multiple`
			`total_scale = np.prod(upsample_scales)`
			`# TODO pad*total_scale is numpy.int64`
			`self.indent = int(aux_context_window * total_scale)`
			`self.resnet = MelResNet(`
			`res_blocks=res_blocks,`
			`aux_channels=aux_channels,`
			`compute_dims=compute_dims,`
			`res_out_dims=res_out_dims,`
			`aux_context_window=aux_context_window)`
			`self.resnet_stretch = Stretch2D(total_scale, 1)`
			`self.up_layers = nn.LayerList()`
			`for scale in upsample_scales:`
			`k_size = (1, scale * 2 + 1)`
			`padding = (0, scale)`
			`stretch = Stretch2D(scale, 1)`

			`conv = nn.Conv2D(`
			`1, 1, kernel_size=k_size, padding=padding, bias_attr=False)`
			`weight_ = paddle.full_like(conv.weight, 1. / k_size[1])`
			`conv.weight.set_value(weight_)`
			`self.up_layers.append(stretch)`
			`self.up_layers.append(conv)`

			`def forward(self, m):`
			`'''`
change the docstring style from numpydoc to google, test=tts 3 years ago			`Args:`
			`c (Tensor): Input tensor (B, C_aux, T).`
			`Returns:`
			`Tensor: Output tensor (B, (T - 2 * pad) * prob(upsample_scales), C_aux).`
			`Tensor: Output tensor (B, (T - 2 * pad) * prob(upsample_scales), res_out_dims).`
add wavernn, test=tts 3 years ago			`'''`
			`# aux: [B, C_aux, T]`
			`# -> [B, res_out_dims, T - 2 * aux_context_window]`
			`# -> [B, 1, res_out_dims, T - 2 * aux_context_window]`
			`aux = self.resnet(m).unsqueeze(1)`
			`# aux: [B, 1, res_out_dims, T - 2 * aux_context_window]`
			`# -> [B, 1, res_out_dims, (T - 2 * pad) * prob(upsample_scales)]`
			`aux = self.resnet_stretch(aux)`
			`# aux: [B, 1, res_out_dims, T * prob(upsample_scales)]`
			`# -> [B, res_out_dims, T * prob(upsample_scales)]`
			`aux = aux.squeeze(1)`
			`# m: [B, C_aux, T] -> [B, 1, C_aux, T]`
			`m = m.unsqueeze(1)`
			`for f in self.up_layers:`
			`m = f(m)`
			`# m: [B, 1, C_aux, T*prob(upsample_scales)]`
			`# -> [B, C_aux, T * prob(upsample_scales)]`
			`# -> [B, C_aux, (T - 2 * pad) * prob(upsample_scales)]`
			`m = m.squeeze(1)[:, :, self.indent:-self.indent]`
			`# m: [B, (T - 2 * pad) * prob(upsample_scales), C_aux]`
			`# aux: [B, (T - 2 * pad) * prob(upsample_scales), res_out_dims]`
			`return m.transpose([0, 2, 1]), aux.transpose([0, 2, 1])`


			`class WaveRNN(nn.Layer):`
			`def __init__(`
			`self,`
			`rnn_dims: int=512,`
			`fc_dims: int=512,`
			`bits: int=9,`
			`aux_context_window: int=2,`
			`upsample_scales: List[int]=[4, 5, 3, 5],`
			`aux_channels: int=80,`
			`compute_dims: int=128,`
			`res_out_dims: int=128,`
			`res_blocks: int=10,`
			`hop_length: int=300,`
			`sample_rate: int=24000,`
			`mode='RAW',`
			`init_type: str="xavier_uniform", ):`
			`'''`
change the docstring style from numpydoc to google, test=tts 3 years ago			`Args:`
			`rnn_dims (int, optional): Hidden dims of RNN Layers.`
			`fc_dims (int, optional): Dims of FC Layers.`
			`bits (int, optional): bit depth of signal.`
			`aux_context_window (int, optional): The context window size of the first convolution applied to the`
			`auxiliary input, by default 2`
			`upsample_scales (List[int], optional): Upsample scales of the upsample network.`
			`aux_channels (int, optional): Auxiliary channel of the residual blocks.`
			`compute_dims (int, optional): Dims of Conv1D in MelResNet.`
			`res_out_dims (int, optional): Dims of output in MelResNet.`
			`res_blocks (int, optional): Number of residual blocks.`
			`mode (str, optional): Output mode of the WaveRNN vocoder.`
			`MOL` for Mixture of Logistic Distribution, and `RAW` for quantized bits as the model's output.
			`init_type (str): How to initialize parameters.`
add wavernn, test=tts 3 years ago			`'''`
			`super().__init__()`
			`self.mode = mode`
			`self.aux_context_window = aux_context_window`
			`if self.mode == 'RAW':`
			`self.n_classes = 2**bits`
			`elif self.mode == 'MOL':`
update wavernn, test=tts 3 years ago			`self.n_classes = 10 * 3`
add wavernn, test=tts 3 years ago			`else:`
			`RuntimeError('Unknown model mode value - ', self.mode)`

			`# List of rnns to call 'flatten_parameters()' on`
			`self._to_flatten = []`

			`self.rnn_dims = rnn_dims`
			`self.aux_dims = res_out_dims // 4`
			`self.hop_length = hop_length`
			`self.sample_rate = sample_rate`

			`# initialize parameters`
			`initialize(self, init_type)`

			`self.upsample = UpsampleNetwork(`
			`aux_channels=aux_channels,`
			`upsample_scales=upsample_scales,`
			`compute_dims=compute_dims,`
			`res_blocks=res_blocks,`
			`res_out_dims=res_out_dims,`
			`aux_context_window=aux_context_window)`
			`self.I = nn.Linear(aux_channels + self.aux_dims + 1, rnn_dims)`

			`self.rnn1 = nn.GRU(rnn_dims, rnn_dims)`
			`self.rnn2 = nn.GRU(rnn_dims + self.aux_dims, rnn_dims)`
fix wavernn dygraph to static , test=tts 3 years ago
add wavernn, test=tts 3 years ago			`self._to_flatten += [self.rnn1, self.rnn2]`

			`self.fc1 = nn.Linear(rnn_dims + self.aux_dims, fc_dims)`
			`self.fc2 = nn.Linear(fc_dims + self.aux_dims, fc_dims)`
			`self.fc3 = nn.Linear(fc_dims, self.n_classes)`

			`# Avoid fragmentation of RNN parameters and associated warning`
			`self._flatten_parameters()`

			`nn.initializer.set_global_initializer(None)`

			`def forward(self, x, c):`
			`'''`
change the docstring style from numpydoc to google, test=tts 3 years ago			`Args:`
			`x (Tensor): wav sequence, [B, T]`
			`c (Tensor): mel spectrogram [B, C_aux, T']`

			`T = (T' - 2 * aux_context_window ) * hop_length`
			`Returns:`
			`Tensor: [B, T, n_classes]`
add wavernn, test=tts 3 years ago			`'''`
			# Although we `_flatten_parameters()` on init, when using DataParallel
			`# the model gets replicated, making it no longer guaranteed that the`
			`# weights are contiguous in GPU memory. Hence, we must call it again`
			`self._flatten_parameters()`

			`bsize = paddle.shape(x)[0]`
			`h1 = paddle.zeros([1, bsize, self.rnn_dims])`
			`h2 = paddle.zeros([1, bsize, self.rnn_dims])`
			`# c: [B, T, C_aux]`
			`# aux: [B, T, res_out_dims]`
			`c, aux = self.upsample(c)`

			`aux_idx = [self.aux_dims * i for i in range(5)]`
			`a1 = aux[:, :, aux_idx[0]:aux_idx[1]]`
			`a2 = aux[:, :, aux_idx[1]:aux_idx[2]]`
			`a3 = aux[:, :, aux_idx[2]:aux_idx[3]]`
			`a4 = aux[:, :, aux_idx[3]:aux_idx[4]]`

			`x = paddle.concat([x.unsqueeze(-1), c, a1], axis=2)`
			`x = self.I(x)`
			`res = x`
			`x, _ = self.rnn1(x, h1)`

			`x = x + res`
			`res = x`
			`x = paddle.concat([x, a2], axis=2)`
			`x, _ = self.rnn2(x, h2)`

			`x = x + res`
			`x = paddle.concat([x, a3], axis=2)`
			`x = F.relu(self.fc1(x))`

			`x = paddle.concat([x, a4], axis=2)`
			`x = F.relu(self.fc2(x))`

			`return self.fc3(x)`

			@paddle.no_grad()
			def generate(self,
			             c,
			             batched: bool=True,
			             target: int=12000,
			             overlap: int=600,
			             mu_law: bool=True,
			             gen_display: bool=False):
			    """Generate a waveform sample by sample from a mel spectrogram.

			    The layer is switched to eval mode for the duration of the call and
			    is switched to train mode before returning.

			    Args:
			        c(Tensor): input mels, (T', C_aux)
			        batched(bool): generate in batch or not. When True the upsampled
			            features are folded into overlapping segments that are
			            generated together and crossfaded back into one sequence.
			        target(int): target number of samples to be generated in each batch entry
			        overlap(int): number of samples for crossfading between batches
			        mu_law(bool): mu-law decode the generated samples. Only honoured
			            when ``self.mode == 'RAW'``; it is forced to False otherwise.
			        gen_display(bool): write a progress line to stdout every 1000 steps.

			    Returns:
			        Tensor: generated waveform, truncated to T' * hop_length samples,
			            with a trailing C_out axis of size 1.

			    Raises:
			        RuntimeError: if ``self.mode`` is neither 'MOL' nor 'RAW'
			            (raised from inside the sampling loop).
			    """

			    self.eval()

			    mu_law = mu_law if self.mode == 'RAW' else False

			    output = []
			    start = time.time()

			    # pseudo batch
			    # (T, C_aux) -> (1, C_aux, T)
			    c = paddle.transpose(c, [1, 0]).unsqueeze(0)
			    T = paddle.shape(c)[-1]
			    wave_len = T * self.hop_length
			    # TODO remove two transpose op by modifying function pad_tensor
			    c = self.pad_tensor(
			        c.transpose([0, 2, 1]), pad=self.aux_context_window,
			        side='both').transpose([0, 2, 1])

			    c, aux = self.upsample(c)

			    if batched:
			        # (num_folds, target + 2 * overlap, features)
			        c = self.fold_with_overlap(c, target, overlap)
			        aux = self.fold_with_overlap(aux, target, overlap)

			    # for dygraph to static graph: unpacking
			    # `b_size, seq_len, _ = paddle.shape(c)` and using seq_len in the
			    # for loop will not get TensorArray, so index the shape instead.
			    # see https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/04_dygraph_to_static/case_analysis_cn.html#list-lodtensorarray
			    b_size = paddle.shape(c)[0]
			    seq_len = paddle.shape(c)[1]

			    h1 = paddle.zeros([b_size, self.rnn_dims])
			    h2 = paddle.zeros([b_size, self.rnn_dims])
			    x = paddle.zeros([b_size, 1])

			    # Split the auxiliary features into four equal slices, one per
			    # point where they are concatenated into the network below.
			    d = self.aux_dims
			    aux_split = [aux[:, :, d * i:d * (i + 1)] for i in range(4)]

			    for i in range(seq_len):
			        m_t = c[:, i, :]
			        # for dygraph to static graph: index each slice explicitly
			        # instead of unpacking a generator over aux_split.
			        a1_t = aux_split[0][:, i, :]
			        a2_t = aux_split[1][:, i, :]
			        a3_t = aux_split[2][:, i, :]
			        a4_t = aux_split[3][:, i, :]
			        x = paddle.concat([x, m_t, a1_t], axis=1)
			        x = self.I(x)
			        # use GRUCell here
			        h1, _ = self.rnn1[0].cell(x, h1)
			        x = x + h1
			        inp = paddle.concat([x, a2_t], axis=1)
			        # use GRUCell here
			        h2, _ = self.rnn2[0].cell(inp, h2)

			        x = x + h2
			        x = paddle.concat([x, a3_t], axis=1)
			        x = F.relu(self.fc1(x))

			        x = paddle.concat([x, a4_t], axis=1)
			        x = F.relu(self.fc2(x))

			        logits = self.fc3(x)

			        if self.mode == 'MOL':
			            sample = sample_from_discretized_mix_logistic(
			                logits.unsqueeze(0).transpose([0, 2, 1]))
			            output.append(sample.reshape([-1]))
			            x = sample.transpose([1, 0, 2])

			        elif self.mode == 'RAW':
			            # The raw logits (not softmax(logits, axis=1)) are handed to
			            # Categorical: fix for paddle 2.3, see
			            # https://github.com/PaddlePaddle/Paddle/commit/01f606b4f1ca3e184a59111084ed460ee0798a5a
			            posterior = logits
			            distrib = paddle.distribution.Categorical(posterior)
			            # corresponding operate [np.floor((fx + 1) / 2 * mu + 0.5)] in encode_mu_law
			            # distrib.sample([1])[0].cast('float32'): [0, 2**bits-1]
			            # sample: [-1, 1]
			            sample = 2 * distrib.sample([1])[0].cast('float32') / (
			                self.n_classes - 1.) - 1.
			            output.append(sample)
			            x = sample.unsqueeze(-1)
			        else:
			            # One formatted message; passing the mode as a second
			            # positional argument made str(exc) render as a tuple.
			            raise RuntimeError(
			                f'Unknown model mode value - {self.mode}')

			        if gen_display:
			            if i % 1000 == 0:
			                self.gen_display(i, int(seq_len), int(b_size), start)

			    # list of per-step samples -> (b_size, seq_len)
			    output = paddle.stack(output).transpose([1, 0])

			    if mu_law:
			        output = decode_mu_law(output, self.n_classes, False)

			    if batched:
			        output = self.xfade_and_unfold(output, target, overlap)
			    else:
			        output = output[0]

			    # Fade-out at the end to avoid signal cutting out suddenly
			    # NOTE(review): the fade spans 10 * hop_length samples; an input
			    # shorter than that looks like it would make the slice below
			    # shorter than fade_out - confirm behaviour for very short mels.
			    fade_out = paddle.linspace(1, 0, 10 * self.hop_length)
			    output = output[:wave_len]
			    output[-10 * self.hop_length:] *= fade_out

			    # NOTE(review): train mode is enabled unconditionally, even if the
			    # layer was in eval mode when generate() was called.
			    self.train()

			    # add the C_out dimension
			    return output.unsqueeze(-1)

			def _flatten_parameters(self):
			    """Call ``flatten_parameters()`` on every entry of ``self._to_flatten``."""
			    for layer in self._to_flatten:
			        layer.flatten_parameters()

			def pad_tensor(self, x, pad, side='both'):
			    '''Zero-pad ``x`` along its second axis.

			    Args:
			        x(Tensor): mel, [1, n_frames, 80]
			        pad(int): number of zero frames added on each padded side
			        side(str, optional): 'before', 'after' or 'both'.
			            (Default value = 'both')

			    Returns:
			        Tensor: new tensor holding ``x`` surrounded by zeros; its second
			            axis has n_frames + 2 * pad entries for 'both' and
			            n_frames + pad otherwise. For any other ``side`` value
			            nothing is copied in and the result is all zeros.
			    '''
			    batch, n_frames, _ = paddle.shape(x)
			    # for dygraph to static graph
			    feat_dim = x.shape[-1]
			    if side == 'both':
			        padded_len = n_frames + 2 * pad
			    else:
			        padded_len = n_frames + pad
			    out = paddle.zeros([batch, padded_len, feat_dim])
			    if side in ('before', 'both'):
			        # leave `pad` zero frames in front of the data
			        out[:, pad:pad + n_frames, :] = x
			    elif side == 'after':
			        # data first, zeros behind it
			        out[:, :n_frames, :] = x
			    return out

			def fold_with_overlap(self, x, target, overlap):
			    '''
			    Fold the tensor with overlap for quick batched inference.
			    Overlap will be used for crossfading in xfade_and_unfold()

			    Args:
			        x(Tensor): Upsampled conditioning features. mels or aux
			            shape=(1, T, features)
			            mels: [1, T, 80]
			            aux: [1, T, 128]
			        target(int): Target timesteps for each index of batch
			        overlap(int): Timesteps for both xfade and rnn warmup

			    Returns:
			        Tensor:
			            shape=(num_folds, target + 2 * overlap, features)
			            num_folds = (time_seq - overlap) // (target + overlap),
			            plus one extra fold when time steps are left over
			            mel: [num_folds, target + 2 * overlap, 80]
			            aux: [num_folds, target + 2 * overlap, 128]

			    Details:
			        x = [[h1, h2, ... hn]]
			        Where each h is a vector of conditioning features
			        Eg: target=2, overlap=1 with x.size(1)=10

			        folded = [[h1, h2, h3, h4],
			                  [h4, h5, h6, h7],
			                  [h7, h8, h9, h10]]
			    '''

			    _, total_len, features = paddle.shape(x)

			    # Consecutive folds start `stride` steps apart and share `overlap` steps
			    stride = target + overlap
			    fold_len = target + 2 * overlap

			    num_folds = (total_len - overlap) // stride
			    extended_len = num_folds * stride + overlap
			    remaining = total_len - extended_len

			    # Leftover time steps get one more fold; zero-pad the tail of x
			    # so that this last fold is full length
			    if remaining != 0:
			        num_folds += 1
			        padding = fold_len - remaining
			        x = self.pad_tensor(x, padding, side='after')

			    folded = paddle.zeros([num_folds, fold_len, features])

			    # Copy each overlapping window of x into its own row
			    for i in range(num_folds):
			        start = i * stride
			        end = start + fold_len
			        folded[i] = x[0][start:end, :]
			    return folded

			def xfade_and_unfold(self, y, target: int=12000, overlap: int=600):
			    ''' Applies a crossfade and unfolds into a 1d array.

			    Args:
			        y (Tensor):
			            Batched sequences of audio samples
			            shape=(num_folds, target + 2 * overlap)
			            dtype=paddle.float32
			            The overlap regions of ``y`` are scaled in place.
			        target (int): not read; it is recomputed from the width of
			            ``y`` as ``length - 2 * overlap``.
			        overlap (int): Timesteps for both xfade and rnn warmup

			    Returns:
			        Tensor
			            audio samples in a 1d array
			            shape=(total_len)
			            dtype=paddle.float32

			    Details:
			        y = [[seq1],
			             [seq2],
			             [seq3]]

			        Apply a gain envelope at both ends of the sequences

			        y = [[seq1_in, seq1_target, seq1_out],
			             [seq2_in, seq2_target, seq2_out],
			             [seq3_in, seq3_target, seq3_out]]

			        Stagger and add up the groups of samples:

			        [seq1_in, seq1_target, (seq1_out + seq2_in), seq2_target, ...]

			    '''
			    # num_folds = (total_len - overlap) // (target + overlap)
			    num_folds, length = paddle.shape(y)
			    target = length - 2 * overlap
			    stride = target + overlap
			    total_len = num_folds * stride + overlap

			    # Need some silence for the run warmup
			    # (currently zero samples, so the whole overlap is used for the fade)
			    silence_len = 0
			    linear_len = silence_len
			    fade_len = overlap - silence_len
			    silence = paddle.zeros([silence_len], dtype=paddle.float32)
			    linear = paddle.ones([linear_len], dtype=paddle.float32)

			    # Complementary sigmoid gains:
			    # fade_in increase from 0 to 1, fade_out reduces from 1 to 0
			    sigmoid_scale = 2.3
			    ramp = paddle.linspace(
			        -sigmoid_scale, sigmoid_scale, fade_len, dtype=paddle.float32)
			    # a sigmoid curve should work better
			    sigmoid = paddle.nn.functional.sigmoid
			    rising = sigmoid(ramp)
			    falling = 1 - sigmoid(ramp)
			    # Concat the silence to the fades
			    fade_out = paddle.concat([linear, falling])
			    fade_in = paddle.concat([silence, rising])

			    # Apply the gain to the overlap samples
			    y[:, :overlap] *= fade_in
			    y[:, -overlap:] *= fade_out

			    unfolded = paddle.zeros([total_len], dtype=paddle.float32)

			    # Loop to add up all the samples
			    for i in range(num_folds):
			        start = i * stride
			        end = start + target + 2 * overlap
			        unfolded[start:end] += y[i]

			    return unfolded

			def gen_display(self, i, seq_len, b_size, start):
			    """Write a one-line generation progress report to stdout.

			    The line starts with a carriage return so that successive calls
			    overwrite each other on a terminal.

			    Args:
			        i(int): index of the current generation step
			        seq_len(int): total number of generation steps
			        b_size(int): number of sequences generated in parallel
			        start(float): ``time.time()`` value taken when generation began
			    """
			    # Guard against a zero elapsed time on coarse clocks.
			    elapsed = max(time.time() - start, 1e-9)
			    # samples produced so far per second, in kHz
			    gen_rate = (i + 1) / elapsed * b_size / 1000
			    pbar = self.progbar(i, seq_len)
			    msg = f'| {pbar} {i * b_size}/{seq_len * b_size} | Batch Size: {b_size} | Gen Rate: {gen_rate:.1f}kHz | '
			    sys.stdout.write(f"\r{msg}")

			def progbar(self, i, n, size=16):
			    """Render a text progress bar of ``size`` characters.

			    Args:
			        i(int): current step
			        n(int): total number of steps
			        size(int, optional): bar width in characters. (Default value = 16)

			    Returns:
			        str: '█' for every position up to and including
			            ``int(i * size) // n``, '░' for the rest.
			    """
			    filled_upto = int(i * size) // n
			    return ''.join('█' if pos <= filled_upto else '░'
			                   for pos in range(size))
add wavernn in synthesize_e2e, test=tts 3 years ago

			class WaveRNNInference(nn.Layer):
			    """Inference wrapper that normalizes a log-mel input and runs WaveRNN."""

			    def __init__(self, normalizer, wavernn):
			        """
			        Args:
			            normalizer: callable applied to the log-mel input before generation
			            wavernn: model whose ``generate()`` produces the waveform
			        """
			        super().__init__()
			        self.normalizer = normalizer
			        self.wavernn = wavernn

			    def forward(self,
			                logmel,
			                batched: bool=True,
			                target: int=12000,
			                overlap: int=600,
			                mu_law: bool=True,
			                gen_display: bool=False):
			        """Normalize ``logmel`` and generate a waveform from it.

			        Args:
			            logmel: log-mel features passed through ``self.normalizer``
			            batched(bool): accepted but not forwarded
			            target(int): accepted but not forwarded
			            overlap(int): accepted but not forwarded
			            mu_law(bool): accepted but not forwarded
			            gen_display(bool): accepted but not forwarded

			        Returns:
			            the result of ``self.wavernn.generate()`` on the normalized mel
			        """
			        # Only the normalized mel is handed to generate(), which therefore
			        # runs with its own defaults for every other option.
			        # NOTE(review): presumably kept this way for dygraph-to-static
			        # export - confirm before forwarding the remaining arguments.
			        return self.wavernn.generate(self.normalizer(logmel))