PaddleSpeech/paddlespeech/t2s/modules/tacotron2/decoder.py

# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Modified from espnet(https://github.com/espnet/espnet)
"""Tacotron2 decoder related modules."""
import paddle.nn.functional as F
import six
from paddle import nn


class Prenet(nn.Layer):
    """Prenet module for decoder of Spectrogram prediction network.

    This is a module of Prenet in the decoder of Spectrogram prediction network,
    which described in `Natural TTS
    Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions`_.
    The Prenet preforms nonlinear conversion
    of inputs before input to auto-regressive lstm,
    which helps to learn diagonal attentions.

    Notes
    ----------
    This module alway applies dropout even in evaluation.
    See the detail in `Natural TTS Synthesis by
    Conditioning WaveNet on Mel Spectrogram Predictions`_.

    .. _`Natural TTS Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions`:
       https://arxiv.org/abs/1712.05884

    """

    def __init__(self, idim, n_layers=2, n_units=256, dropout_rate=0.5):
        """Initialize prenet module.

        Parameters
        ----------
        idim : int
            Dimension of the inputs.
        odim : int
            Dimension of the outputs.
        n_layers : int, optional
            The number of prenet layers.
        n_units : int, optional
            The number of prenet units.
        """
        super().__init__()
        self.dropout_rate = dropout_rate
        self.prenet = nn.LayerList()
        for layer in six.moves.range(n_layers):
            n_inputs = idim if layer == 0 else n_units
            self.prenet.append(
                nn.Sequential(nn.Linear(n_inputs, n_units), nn.ReLU()))

    def forward(self, x):
        """Calculate forward propagation.

        Parameters
        ----------
        x : Tensor
            Batch of input tensors (B, ..., idim).

        Returns
        ----------
        Tensor
            Batch of output tensors (B, ..., odim).

        """
        for i in six.moves.range(len(self.prenet)):
            # F.dropout 引入了随机, tacotron2 的 dropout 是不能去掉的
            x = F.dropout(self.prenet[i](x))
        return x


class Postnet(nn.Layer):
    """Postnet module for Spectrogram prediction network.

    This is a module of Postnet in Spectrogram prediction network,
    which described in `Natural TTS Synthesis by
    Conditioning WaveNet on Mel Spectrogram Predictions`_.
    The Postnet predicts refines the predicted
    Mel-filterbank of the decoder,
    which helps to compensate the detail sturcture of spectrogram.

    .. _`Natural TTS Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions`:
       https://arxiv.org/abs/1712.05884

    """

    def __init__(
            self,
            idim,
            odim,
            n_layers=5,
            n_chans=512,
            n_filts=5,
            dropout_rate=0.5,
            use_batch_norm=True, ):
        """Initialize postnet module.

        Parameters
        ----------
        idim : int
            Dimension of the inputs.
        odim : int
            Dimension of the outputs.
        n_layers : int, optional
            The number of layers.
        n_filts : int, optional
            The number of filter size.
        n_units : int, optional
            The number of filter channels.
        use_batch_norm : bool, optional
            Whether to use batch normalization..
        dropout_rate : float, optional
            Dropout rate..
        """
        super().__init__()
        self.postnet = nn.LayerList()
        for layer in six.moves.range(n_layers - 1):
            ichans = odim if layer == 0 else n_chans
            ochans = odim if layer == n_layers - 1 else n_chans
            if use_batch_norm:
                self.postnet.append(
                    nn.Sequential(
                        nn.Conv1D(
                            ichans,
                            ochans,
                            n_filts,
                            stride=1,
                            padding=(n_filts - 1) // 2,
                            bias_attr=False, ),
                        nn.BatchNorm1D(ochans),
                        nn.Tanh(),
                        nn.Dropout(dropout_rate), ))
            else:
                self.postnet.append(
                    nn.Sequential(
                        nn.Conv1D(
                            ichans,
                            ochans,
                            n_filts,
                            stride=1,
                            padding=(n_filts - 1) // 2,
                            bias_attr=False, ),
                        nn.Tanh(),
                        nn.Dropout(dropout_rate), ))
        ichans = n_chans if n_layers != 1 else odim
        if use_batch_norm:
            self.postnet.append(
                nn.Sequential(
                    nn.Conv1D(
                        ichans,
                        odim,
                        n_filts,
                        stride=1,
                        padding=(n_filts - 1) // 2,
                        bias_attr=False, ),
                    nn.BatchNorm1D(odim),
                    nn.Dropout(dropout_rate), ))
        else:
            self.postnet.append(
                nn.Sequential(
                    nn.Conv1D(
                        ichans,
                        odim,
                        n_filts,
                        stride=1,
                        padding=(n_filts - 1) // 2,
                        bias_attr=False, ),
                    nn.Dropout(dropout_rate), ))

    def forward(self, xs):
        """Calculate forward propagation.

        Parameters
        ----------
        xs : Tensor
            Batch of the sequences of padded input tensors (B, idim, Tmax).

        Returns
        ----------
        Tensor
            Batch of padded output tensor. (B, odim, Tmax).

        """
        for i in six.moves.range(len(self.postnet)):
            xs = self.postnet[i](xs)
        return xs
merge parakeet repo into deepspeech 3 years ago			`# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.`
			`#`
			`# Licensed under the Apache License, Version 2.0 (the "License");`
			`# you may not use this file except in compliance with the License.`
			`# You may obtain a copy of the License at`
			`#`
			`# http://www.apache.org/licenses/LICENSE-2.0`
			`#`
			`# Unless required by applicable law or agreed to in writing, software`
			`# distributed under the License is distributed on an "AS IS" BASIS,`
			`# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`# See the License for the specific language governing permissions and`
			`# limitations under the License.`
add reference 3 years ago			`# Modified from espnet(https://github.com/espnet/espnet)`
merge parakeet repo into deepspeech 3 years ago			`"""Tacotron2 decoder related modules."""`
			`import paddle.nn.functional as F`
refactor parakeet examples 3 years ago			`import six`
merge parakeet repo into deepspeech 3 years ago			`from paddle import nn`


			`class Prenet(nn.Layer):`
			`"""Prenet module for decoder of Spectrogram prediction network.`

			`This is a module of Prenet in the decoder of Spectrogram prediction network,`
			which described in `Natural TTS
			Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions`_.
			`The Prenet preforms nonlinear conversion`
			`of inputs before input to auto-regressive lstm,`
			`which helps to learn diagonal attentions.`

			`Notes`
			`----------`
			`This module alway applies dropout even in evaluation.`
			See the detail in `Natural TTS Synthesis by
			Conditioning WaveNet on Mel Spectrogram Predictions`_.

			.. _`Natural TTS Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions`:
			`https://arxiv.org/abs/1712.05884`

			`"""`

			`def __init__(self, idim, n_layers=2, n_units=256, dropout_rate=0.5):`
			`"""Initialize prenet module.`

			`Parameters`
			`----------`
			`idim : int`
			`Dimension of the inputs.`
			`odim : int`
			`Dimension of the outputs.`
			`n_layers : int, optional`
			`The number of prenet layers.`
			`n_units : int, optional`
			`The number of prenet units.`
			`"""`
			`super().__init__()`
			`self.dropout_rate = dropout_rate`
			`self.prenet = nn.LayerList()`
			`for layer in six.moves.range(n_layers):`
			`n_inputs = idim if layer == 0 else n_units`
			`self.prenet.append(`
			`nn.Sequential(nn.Linear(n_inputs, n_units), nn.ReLU()))`

			`def forward(self, x):`
			`"""Calculate forward propagation.`

			`Parameters`
			`----------`
			`x : Tensor`
			`Batch of input tensors (B, ..., idim).`

			`Returns`
			`----------`
			`Tensor`
			`Batch of output tensors (B, ..., odim).`

			`"""`
			`for i in six.moves.range(len(self.prenet)):`
			`# F.dropout 引入了随机, tacotron2 的 dropout 是不能去掉的`
			`x = F.dropout(self.prenet[i](x))`
			`return x`


			`class Postnet(nn.Layer):`
			`"""Postnet module for Spectrogram prediction network.`

			`This is a module of Postnet in Spectrogram prediction network,`
			which described in `Natural TTS Synthesis by
			Conditioning WaveNet on Mel Spectrogram Predictions`_.
			`The Postnet predicts refines the predicted`
			`Mel-filterbank of the decoder,`
			`which helps to compensate the detail sturcture of spectrogram.`

			.. _`Natural TTS Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions`:
			`https://arxiv.org/abs/1712.05884`

			`"""`

			`def __init__(`
			`self,`
			`idim,`
			`odim,`
			`n_layers=5,`
			`n_chans=512,`
			`n_filts=5,`
			`dropout_rate=0.5,`
			`use_batch_norm=True, ):`
			`"""Initialize postnet module.`

			`Parameters`
			`----------`
			`idim : int`
			`Dimension of the inputs.`
			`odim : int`
			`Dimension of the outputs.`
			`n_layers : int, optional`
			`The number of layers.`
			`n_filts : int, optional`
			`The number of filter size.`
			`n_units : int, optional`
			`The number of filter channels.`
			`use_batch_norm : bool, optional`
			`Whether to use batch normalization..`
			`dropout_rate : float, optional`
			`Dropout rate..`
			`"""`
			`super().__init__()`
			`self.postnet = nn.LayerList()`
			`for layer in six.moves.range(n_layers - 1):`
			`ichans = odim if layer == 0 else n_chans`
			`ochans = odim if layer == n_layers - 1 else n_chans`
			`if use_batch_norm:`
			`self.postnet.append(`
			`nn.Sequential(`
			`nn.Conv1D(`
			`ichans,`
			`ochans,`
			`n_filts,`
			`stride=1,`
			`padding=(n_filts - 1) // 2,`
			`bias_attr=False, ),`
			`nn.BatchNorm1D(ochans),`
			`nn.Tanh(),`
			`nn.Dropout(dropout_rate), ))`
			`else:`
			`self.postnet.append(`
			`nn.Sequential(`
			`nn.Conv1D(`
			`ichans,`
			`ochans,`
			`n_filts,`
			`stride=1,`
			`padding=(n_filts - 1) // 2,`
			`bias_attr=False, ),`
			`nn.Tanh(),`
			`nn.Dropout(dropout_rate), ))`
			`ichans = n_chans if n_layers != 1 else odim`
			`if use_batch_norm:`
			`self.postnet.append(`
			`nn.Sequential(`
			`nn.Conv1D(`
			`ichans,`
			`odim,`
			`n_filts,`
			`stride=1,`
			`padding=(n_filts - 1) // 2,`
			`bias_attr=False, ),`
			`nn.BatchNorm1D(odim),`
			`nn.Dropout(dropout_rate), ))`
			`else:`
			`self.postnet.append(`
			`nn.Sequential(`
			`nn.Conv1D(`
			`ichans,`
			`odim,`
			`n_filts,`
			`stride=1,`
			`padding=(n_filts - 1) // 2,`
			`bias_attr=False, ),`
			`nn.Dropout(dropout_rate), ))`

			`def forward(self, xs):`
			`"""Calculate forward propagation.`

			`Parameters`
			`----------`
			`xs : Tensor`
			`Batch of the sequences of padded input tensors (B, idim, Tmax).`

			`Returns`
			`----------`
			`Tensor`
			`Batch of padded output tensor. (B, odim, Tmax).`

			`"""`
			`for i in six.moves.range(len(self.postnet)):`
			`xs = self.postnet[i](xs)`
			`return xs`