Merge pull request #2115 from yt605155624/add_api

[doc]format tts doc string for read the docs
3 years ago · 8817bf8636
parent 9c4763ecce 9e63b5947e
commit 8817bf8636
48 changed files with 2300 additions and 1178 deletions
--- a/paddlespeech/t2s/models/fastspeech2/fastspeech2.py
+++ b/paddlespeech/t2s/models/fastspeech2/fastspeech2.py
@ -141,71 +141,133 @@ class FastSpeech2(nn.Layer):
            init_dec_alpha: float=1.0, ):
        """Initialize FastSpeech2 module.
        Args:
-            idim (int): Dimension of the inputs.
+            idim (int): 
-            odim (int): Dimension of the outputs.
+                Dimension of the inputs.
-            adim (int): Attention dimension.
+            odim (int): 
-            aheads (int): Number of attention heads.
+                Dimension of the outputs.
-            elayers (int): Number of encoder layers.
+            adim (int): 
-            eunits (int): Number of encoder hidden units.
+                Attention dimension.
-            dlayers (int): Number of decoder layers.
+            aheads (int): 
-            dunits (int): Number of decoder hidden units.
+                Number of attention heads.
-            postnet_layers (int): Number of postnet layers.
+            elayers (int): 
-            postnet_chans (int): Number of postnet channels.
+                Number of encoder layers.
-            postnet_filts (int): Kernel size of postnet.
+            eunits (int): 
-            postnet_dropout_rate (float): Dropout rate in postnet.
+                Number of encoder hidden units.
-            use_scaled_pos_enc (bool): Whether to use trainable scaled pos encoding.
+            dlayers (int): 
-            use_batch_norm (bool): Whether to use batch normalization in encoder prenet.
+                Number of decoder layers.
-            encoder_normalize_before (bool): Whether to apply layernorm layer before encoder block.
+            dunits (int): 
-            decoder_normalize_before (bool): Whether to apply layernorm layer before decoder block.
+                Number of decoder hidden units.
-            encoder_concat_after (bool): Whether to concatenate attention layer's input and output in encoder.
+            postnet_layers (int): 
-            decoder_concat_after (bool): Whether to concatenate attention layer's input  and output in decoder.
+                Number of postnet layers.
-            reduction_factor (int): Reduction factor.
+            postnet_chans (int): 
-            encoder_type (str): Encoder type ("transformer" or "conformer").
+                Number of postnet channels.
-            decoder_type (str): Decoder type ("transformer" or "conformer").
+            postnet_filts (int): 
-            transformer_enc_dropout_rate (float): Dropout rate in encoder except attention and positional encoding.
+                Kernel size of postnet.
-            transformer_enc_positional_dropout_rate (float): Dropout rate after encoder positional encoding.
+            postnet_dropout_rate (float): 
-            transformer_enc_attn_dropout_rate (float): Dropout rate in encoder self-attention module.
+                Dropout rate in postnet.
-            transformer_dec_dropout_rate (float): Dropout rate in decoder except attention & positional encoding.
+            use_scaled_pos_enc (bool): 
-            transformer_dec_positional_dropout_rate (float): Dropout rate after decoder positional encoding.
+                Whether to use trainable scaled pos encoding.
-            transformer_dec_attn_dropout_rate (float): Dropout rate in decoder self-attention module.
+            use_batch_norm (bool): 
-            conformer_pos_enc_layer_type (str): Pos encoding layer type in conformer.
+                Whether to use batch normalization in encoder prenet.
-            conformer_self_attn_layer_type (str): Self-attention layer type in conformer
+            encoder_normalize_before (bool): 
-            conformer_activation_type (str): Activation function type in conformer.
+                Whether to apply layernorm layer before encoder block.
-            use_macaron_style_in_conformer (bool): Whether to use macaron style FFN.
+            decoder_normalize_before (bool): 
-            use_cnn_in_conformer (bool): Whether to use CNN in conformer.
+                Whether to apply layernorm layer before decoder block.
-            zero_triu (bool): Whether to use zero triu in relative self-attention module.
+            encoder_concat_after (bool): 
-            conformer_enc_kernel_size (int): Kernel size of encoder conformer.
+                Whether to concatenate attention layer's input and output in encoder.
-            conformer_dec_kernel_size (int): Kernel size of decoder conformer.
+            decoder_concat_after (bool): 
-            duration_predictor_layers (int): Number of duration predictor layers.
+                Whether to concatenate attention layer's input and output in decoder.
-            duration_predictor_chans (int): Number of duration predictor channels.
+            reduction_factor (int): 
-            duration_predictor_kernel_size (int): Kernel size of duration predictor.
+                Reduction factor.
-            duration_predictor_dropout_rate (float): Dropout rate in duration predictor.
+            encoder_type (str): 
-            pitch_predictor_layers (int): Number of pitch predictor layers.
+                Encoder type ("transformer" or "conformer").
-            pitch_predictor_chans (int): Number of pitch predictor channels.
+            decoder_type (str): 
-            pitch_predictor_kernel_size (int): Kernel size of pitch predictor.
+                Decoder type ("transformer" or "conformer").
-            pitch_predictor_dropout_rate (float): Dropout rate in pitch predictor.
+            transformer_enc_dropout_rate (float): 
-            pitch_embed_kernel_size (float): Kernel size of pitch embedding.
+                Dropout rate in encoder except attention and positional encoding.
-            pitch_embed_dropout_rate (float): Dropout rate for pitch embedding.
+            transformer_enc_positional_dropout_rate (float): 
-            stop_gradient_from_pitch_predictor (bool): Whether to stop gradient from pitch predictor to encoder.
+                Dropout rate after encoder positional encoding.
-            energy_predictor_layers (int): Number of energy predictor layers.
+            transformer_enc_attn_dropout_rate (float): 
-            energy_predictor_chans (int): Number of energy predictor channels.
+                Dropout rate in encoder self-attention module.
-            energy_predictor_kernel_size (int): Kernel size of energy predictor.
+            transformer_dec_dropout_rate (float): 
-            energy_predictor_dropout_rate (float): Dropout rate in energy predictor.
+                Dropout rate in decoder except attention & positional encoding.
-            energy_embed_kernel_size (float): Kernel size of energy embedding.
+            transformer_dec_positional_dropout_rate (float):
-            energy_embed_dropout_rate (float): Dropout rate for energy embedding.
+                Dropout rate after decoder positional encoding.
-            stop_gradient_from_energy_predictor（bool): Whether to stop gradient from energy predictor to encoder.
+            transformer_dec_attn_dropout_rate (float): 
-            spk_num (Optional[int]): Number of speakers. If not None, assume that the spk_embed_dim is not None,
+                Dropout rate in decoder self-attention module.
            conformer_pos_enc_layer_type (str): 
                Pos encoding layer type in conformer.
            conformer_self_attn_layer_type (str): 
                Self-attention layer type in conformer.
            conformer_activation_type (str): 
                Activation function type in conformer.
            use_macaron_style_in_conformer (bool): 
                Whether to use macaron style FFN.
            use_cnn_in_conformer (bool): 
                Whether to use CNN in conformer.
            zero_triu (bool): 
                Whether to use zero triu in relative self-attention module.
            conformer_enc_kernel_size (int): 
                Kernel size of encoder conformer.
            conformer_dec_kernel_size (int): 
                Kernel size of decoder conformer.
            duration_predictor_layers (int): 
                Number of duration predictor layers.
            duration_predictor_chans (int): 
                Number of duration predictor channels.
            duration_predictor_kernel_size (int): 
                Kernel size of duration predictor.
            duration_predictor_dropout_rate (float): 
                Dropout rate in duration predictor.
            pitch_predictor_layers (int): 
                Number of pitch predictor layers.
            pitch_predictor_chans (int):
                Number of pitch predictor channels.
            pitch_predictor_kernel_size (int): 
                Kernel size of pitch predictor.
            pitch_predictor_dropout_rate (float): 
                Dropout rate in pitch predictor.
            pitch_embed_kernel_size (float): 
                Kernel size of pitch embedding.
            pitch_embed_dropout_rate (float): 
                Dropout rate for pitch embedding.
            stop_gradient_from_pitch_predictor (bool): 
                Whether to stop gradient from pitch predictor to encoder.
            energy_predictor_layers (int): 
                Number of energy predictor layers.
            energy_predictor_chans (int): 
                Number of energy predictor channels.
            energy_predictor_kernel_size (int): 
                Kernel size of energy predictor.
            energy_predictor_dropout_rate (float): 
                Dropout rate in energy predictor.
            energy_embed_kernel_size (float): 
                Kernel size of energy embedding.
            energy_embed_dropout_rate (float): 
                Dropout rate for energy embedding.
            stop_gradient_from_energy_predictor (bool): 
                Whether to stop gradient from energy predictor to encoder.
            spk_num (Optional[int]): 
                Number of speakers. If not None, assume that the spk_embed_dim is not None,
                spk_ids will be provided as the input and use spk_embedding_table.
-            spk_embed_dim (Optional[int]): Speaker embedding dimension. If not None, 
+            spk_embed_dim (Optional[int]): 
                Speaker embedding dimension. If not None, 
                assume that spk_emb will be provided as the input or spk_num is not None.
-            spk_embed_integration_type (str): How to integrate speaker embedding.
+            spk_embed_integration_type (str): 
-            tone_num (Optional[int]): Number of tones. If not None, assume that the
+                How to integrate speaker embedding.
            tone_num (Optional[int]): 
                Number of tones. If not None, assume that the
                tone_ids will be provided as the input and use tone_embedding_table.
-            tone_embed_dim (Optional[int]): Tone embedding dimension. If not None, assume that tone_num is not None.
+            tone_embed_dim (Optional[int]):
-            tone_embed_integration_type (str): How to integrate tone embedding.
+                Tone embedding dimension. If not None, assume that tone_num is not None.
-            init_type (str): How to initialize transformer parameters.
+            tone_embed_integration_type (str): 
-            init_enc_alpha （float): Initial value of alpha in scaled pos encoding of the encoder.
+                How to integrate tone embedding.
-            init_dec_alpha (float): Initial value of alpha in scaled pos encoding of the decoder.
+            init_type (str): 
                How to initialize transformer parameters.
            init_enc_alpha (float): 
                Initial value of alpha in scaled pos encoding of the encoder.
            init_dec_alpha (float): 
                Initial value of alpha in scaled pos encoding of the decoder.
        """
        assert check_argument_types()
@ -449,20 +511,29 @@ class FastSpeech2(nn.Layer):
        """Calculate forward propagation.
        Args:
-            text(Tensor(int64)): Batch of padded token ids (B, Tmax).
+            text(Tensor(int64)): 
-            text_lengths(Tensor(int64)): Batch of lengths of each input (B,).
+                Batch of padded token ids (B, Tmax).
-            speech(Tensor): Batch of padded target features (B, Lmax, odim).
+            text_lengths(Tensor(int64)): 
-            speech_lengths(Tensor(int64)): Batch of the lengths of each target (B,).
+                Batch of lengths of each input (B,).
-            durations(Tensor(int64)): Batch of padded durations (B, Tmax).
+            speech(Tensor): 
-            pitch(Tensor): Batch of padded token-averaged pitch (B, Tmax, 1).
+                Batch of padded target features (B, Lmax, odim).
-            energy(Tensor): Batch of padded token-averaged energy (B, Tmax, 1).
+            speech_lengths(Tensor(int64)): 
-            tone_id(Tensor, optional(int64)): Batch of padded tone ids  (B, Tmax).
+                Batch of the lengths of each target (B,).
-            spk_emb(Tensor, optional): Batch of speaker embeddings (B, spk_embed_dim).
+            durations(Tensor(int64)): 
-            spk_id(Tnesor, optional(int64)): Batch of speaker ids (B,)
+                Batch of padded durations (B, Tmax).
            pitch(Tensor): 
                Batch of padded token-averaged pitch (B, Tmax, 1).
            energy(Tensor): 
                Batch of padded token-averaged energy (B, Tmax, 1).
            tone_id(Tensor, optional(int64)): 
                Batch of padded tone ids (B, Tmax).
            spk_emb(Tensor, optional): 
                Batch of speaker embeddings (B, spk_embed_dim).
            spk_id(Tensor, optional(int64)): 
                Batch of speaker ids (B,).
        Returns:
        """
        # input of embedding must be int64
@ -658,20 +729,28 @@ class FastSpeech2(nn.Layer):
        """Generate the sequence of features given the sequences of characters.
        Args:
-            text(Tensor(int64)): Input sequence of characters (T,).
+            text(Tensor(int64)): 
-            durations(Tensor, optional (int64)): Groundtruth of duration (T,).
+                Input sequence of characters (T,).
-            pitch(Tensor, optional): Groundtruth of token-averaged pitch (T, 1).
+            durations(Tensor, optional (int64)): 
-            energy(Tensor, optional): Groundtruth of token-averaged energy (T, 1).
+                Groundtruth of duration (T,).
-            alpha(float, optional): Alpha to control the speed.
+            pitch(Tensor, optional): 
-            use_teacher_forcing(bool, optional): Whether to use teacher forcing.
+                Groundtruth of token-averaged pitch (T, 1).
            energy(Tensor, optional): 
                Groundtruth of token-averaged energy (T, 1).
            alpha(float, optional): 
                Alpha to control the speed.
            use_teacher_forcing(bool, optional): 
                Whether to use teacher forcing.
                If true, groundtruth of duration, pitch and energy will be used.
-            spk_emb(Tensor, optional, optional): peaker embedding vector (spk_embed_dim,). (Default value = None)
+            spk_emb(Tensor, optional, optional): 
-            spk_id(Tensor, optional(int64), optional): spk ids (1,). (Default value = None)
+                Speaker embedding vector (spk_embed_dim,). (Default value = None)
-            tone_id(Tensor, optional(int64), optional): tone ids (T,). (Default value = None)
+            spk_id(Tensor, optional(int64), optional): 
                spk ids (1,). (Default value = None)
            tone_id(Tensor, optional(int64), optional): 
                tone ids (T,). (Default value = None)
        Returns:
        """
        # input of embedding must be int64
        x = paddle.cast(text, 'int64')
@ -720,8 +799,10 @@ class FastSpeech2(nn.Layer):
        """Integrate speaker embedding with hidden states.
        Args:
-            hs(Tensor): Batch of hidden state sequences (B, Tmax, adim).
+            hs(Tensor): 
-            spk_emb(Tensor): Batch of speaker embeddings (B, spk_embed_dim).
+                Batch of hidden state sequences (B, Tmax, adim).
            spk_emb(Tensor): 
                Batch of speaker embeddings (B, spk_embed_dim).
        Returns:
@ -745,8 +826,10 @@ class FastSpeech2(nn.Layer):
        """Integrate tone embedding with hidden states.
        Args:
-            hs(Tensor): Batch of hidden state sequences (B, Tmax, adim).
+            hs(Tensor): 
-            tone_embs(Tensor): Batch of speaker embeddings (B, Tmax, tone_embed_dim).
+                Batch of hidden state sequences (B, Tmax, adim).
            tone_embs(Tensor): 
                Batch of tone embeddings (B, Tmax, tone_embed_dim).
        Returns:
@ -769,10 +852,12 @@ class FastSpeech2(nn.Layer):
        """Make masks for self-attention.
        Args:
-            ilens(Tensor): Batch of lengths (B,).
+            ilens(Tensor): 
                Batch of lengths (B,).
        Returns:
-            Tensor: Mask tensor for self-attention. dtype=paddle.bool
+            Tensor: 
                Mask tensor for self-attention. dtype=paddle.bool
        Examples:
            >>> ilens = [5, 3]
@ -854,19 +939,32 @@ class StyleFastSpeech2Inference(FastSpeech2Inference):
        """
        Args:
-            text(Tensor(int64)): Input sequence of characters (T,).
+            text(Tensor(int64)): 
-            durations(paddle.Tensor/np.ndarray, optional (int64)): Groundtruth of duration (T,), this will overwrite the set of durations_scale and durations_bias
+                Input sequence of characters (T,).
            durations(paddle.Tensor/np.ndarray, optional (int64)): 
                Groundtruth of duration (T,), this will overwrite the set of durations_scale and durations_bias
            durations_scale(int/float, optional): 
            durations_bias(int/float, optional): 
-            pitch(paddle.Tensor/np.ndarray, optional): Groundtruth of token-averaged pitch (T, 1), this will overwrite the set of pitch_scale and pitch_bias
+
-            pitch_scale(int/float, optional): In denormed HZ domain.
+            pitch(paddle.Tensor/np.ndarray, optional): 
-            pitch_bias(int/float, optional): In denormed HZ domain.
+                Groundtruth of token-averaged pitch (T, 1), this will overwrite the set of pitch_scale and pitch_bias
-            energy(paddle.Tensor/np.ndarray, optional): Groundtruth of token-averaged energy (T, 1), this will overwrite the set of energy_scale and energy_bias
+            pitch_scale(int/float, optional): 
-            energy_scale(int/float, optional): In denormed domain.
+                In denormed HZ domain.
-            energy_bias(int/float, optional): In denormed domain.
+            pitch_bias(int/float, optional): 
-            robot: bool:  (Default value = False)
+                In denormed HZ domain.
-            spk_emb: (Default value = None)
+            energy(paddle.Tensor/np.ndarray, optional): 
-            spk_id: (Default value = None)
+                Groundtruth of token-averaged energy (T, 1), this will overwrite the set of energy_scale and energy_bias
            energy_scale(int/float, optional): 
                In denormed domain.
            energy_bias(int/float, optional): 
                In denormed domain.
            robot(bool) (Default value = False):
            spk_emb(Default value = None):
            spk_id(Default value = None):
        Returns:
            Tensor: logmel
@ -945,8 +1043,10 @@ class FastSpeech2Loss(nn.Layer):
                 use_weighted_masking: bool=False):
        """Initialize feed-forward Transformer loss module.
        Args:
-            use_masking (bool): Whether to apply masking for padded part in loss calculation.
+            use_masking (bool): 
-            use_weighted_masking (bool): Whether to weighted masking in loss calculation.
+                Whether to apply masking for padded part in loss calculation.
            use_weighted_masking (bool): 
                Whether to apply weighted masking in loss calculation.
        """
        assert check_argument_types()
        super().__init__()
@ -978,17 +1078,28 @@ class FastSpeech2Loss(nn.Layer):
        """Calculate forward propagation.
        Args:
-            after_outs(Tensor): Batch of outputs after postnets (B, Lmax, odim).
+            after_outs(Tensor):  
-            before_outs(Tensor): Batch of outputs before postnets (B, Lmax, odim).
+                Batch of outputs after postnets (B, Lmax, odim).
-            d_outs(Tensor): Batch of outputs of duration predictor (B, Tmax).
+            before_outs(Tensor): 
-            p_outs(Tensor): Batch of outputs of pitch predictor (B, Tmax, 1).
+                Batch of outputs before postnets (B, Lmax, odim).
-            e_outs(Tensor): Batch of outputs of energy predictor (B, Tmax, 1).
+            d_outs(Tensor): 
-            ys(Tensor): Batch of target features (B, Lmax, odim).
+                Batch of outputs of duration predictor (B, Tmax).
-            ds(Tensor): Batch of durations (B, Tmax).
+            p_outs(Tensor): 
-            ps(Tensor): Batch of target token-averaged pitch (B, Tmax, 1).
+                Batch of outputs of pitch predictor (B, Tmax, 1).
-            es(Tensor): Batch of target token-averaged energy (B, Tmax, 1).
+            e_outs(Tensor): 
-            ilens(Tensor): Batch of the lengths of each input (B,).
+                Batch of outputs of energy predictor (B, Tmax, 1).
-            olens(Tensor): Batch of the lengths of each target (B,).
+            ys(Tensor): 
                Batch of target features (B, Lmax, odim).
            ds(Tensor): 
                Batch of durations (B, Tmax).
            ps(Tensor): 
                Batch of target token-averaged pitch (B, Tmax, 1).
            es(Tensor): 
                Batch of target token-averaged energy (B, Tmax, 1).
            ilens(Tensor): 
                Batch of the lengths of each input (B,).
            olens(Tensor): 
                Batch of the lengths of each target (B,).
        Returns:
--- a/paddlespeech/t2s/models/hifigan/hifigan.py
+++ b/paddlespeech/t2s/models/hifigan/hifigan.py
@ -50,20 +50,34 @@ class HiFiGANGenerator(nn.Layer):
            init_type: str="xavier_uniform", ):
        """Initialize HiFiGANGenerator module.
        Args:
-            in_channels (int): Number of input channels.
+            in_channels (int): 
-            out_channels (int): Number of output channels.
+                Number of input channels.
-            channels (int): Number of hidden representation channels.
+            out_channels (int): 
-            global_channels (int): Number of global conditioning channels.
+                Number of output channels.
-            kernel_size (int): Kernel size of initial and final conv layer.
+            channels (int): 
-            upsample_scales (list): List of upsampling scales.
+                Number of hidden representation channels.
-            upsample_kernel_sizes (list): List of kernel sizes for upsampling layers.
+            global_channels (int): 
-            resblock_kernel_sizes (list): List of kernel sizes for residual blocks.
+                Number of global conditioning channels.
-            resblock_dilations (list): List of dilation list for residual blocks.
+            kernel_size (int): 
-            use_additional_convs (bool): Whether to use additional conv layers in residual blocks.
+                Kernel size of initial and final conv layer.
-            bias (bool): Whether to add bias parameter in convolution layers.
+            upsample_scales (list): 
-            nonlinear_activation (str): Activation function module name.
+                List of upsampling scales.
-            nonlinear_activation_params (dict): Hyperparameters for activation function.
+            upsample_kernel_sizes (list): 
-            use_weight_norm (bool): Whether to use weight norm.
+                List of kernel sizes for upsampling layers.
            resblock_kernel_sizes (list): 
                List of kernel sizes for residual blocks.
            resblock_dilations (list): 
                List of dilation lists for residual blocks.
            use_additional_convs (bool): 
                Whether to use additional conv layers in residual blocks.
            bias (bool): 
                Whether to add bias parameter in convolution layers.
            nonlinear_activation (str): 
                Activation function module name.
            nonlinear_activation_params (dict): 
                Hyperparameters for activation function.
            use_weight_norm (bool): 
                Whether to use weight norm.
                If set to true, it will be applied to all of the conv layers.
        """
        super().__init__()
@ -199,9 +213,10 @@ class HiFiGANGenerator(nn.Layer):
    def inference(self, c, g: Optional[paddle.Tensor]=None):
        """Perform inference.
        Args:
-            c (Tensor): Input tensor (T, in_channels).
+            c (Tensor): 
-                normalize_before (bool): Whether to perform normalization.
+                Input tensor (T, in_channels).
-            g (Optional[Tensor]): Global conditioning tensor (global_channels, 1).
+            g (Optional[Tensor]): 
                Global conditioning tensor (global_channels, 1).
        Returns:
            Tensor:
                Output tensor (T * prod(upsample_scales), out_channels).
@ -233,20 +248,33 @@ class HiFiGANPeriodDiscriminator(nn.Layer):
        """Initialize HiFiGANPeriodDiscriminator module.
        Args:
-            in_channels (int): Number of input channels.
+            in_channels (int): 
-            out_channels (int): Number of output channels.
+                Number of input channels.
-            period (int): Period.
+            out_channels (int): 
-            kernel_sizes (list): Kernel sizes of initial conv layers and the final conv layer.
+                Number of output channels.
-            channels (int): Number of initial channels.
+            period (int): 
-            downsample_scales (list): List of downsampling scales.
+                Period.
-            max_downsample_channels (int): Number of maximum downsampling channels.
+            kernel_sizes (list): 
-            use_additional_convs (bool): Whether to use additional conv layers in residual blocks.
+                Kernel sizes of initial conv layers and the final conv layer.
-            bias (bool): Whether to add bias parameter in convolution layers.
+            channels (int): 
-            nonlinear_activation (str): Activation function module name.
+                Number of initial channels.
-            nonlinear_activation_params (dict): Hyperparameters for activation function.
+            downsample_scales (list): 
-            use_weight_norm (bool): Whether to use weight norm.
+                List of downsampling scales.
            max_downsample_channels (int): 
                Number of maximum downsampling channels.
            use_additional_convs (bool): 
                Whether to use additional conv layers in residual blocks.
            bias (bool): 
                Whether to add bias parameter in convolution layers.
            nonlinear_activation (str): 
                Activation function module name.
            nonlinear_activation_params (dict): 
                Hyperparameters for activation function.
            use_weight_norm (bool): 
                Whether to use weight norm.
                If set to true, it will be applied to all of the conv layers.
-            use_spectral_norm (bool): Whether to use spectral norm.
+            use_spectral_norm (bool): 
                Whether to use spectral norm.
                If set to true, it will be applied to all of the conv layers.
        """
        super().__init__()
@ -298,7 +326,8 @@ class HiFiGANPeriodDiscriminator(nn.Layer):
        """Calculate forward propagation.
        Args:
-            c (Tensor): Input tensor (B, in_channels, T).
+            c (Tensor): 
                Input tensor (B, in_channels, T).
        Returns:
            list: List of each layer's tensors.
        """
@ -367,8 +396,10 @@ class HiFiGANMultiPeriodDiscriminator(nn.Layer):
        """Initialize HiFiGANMultiPeriodDiscriminator module.
        Args:
-            periods (list): List of periods.
+            periods (list): 
-            discriminator_params (dict): Parameters for hifi-gan period discriminator module.
+                List of periods.
            discriminator_params (dict): 
                Parameters for hifi-gan period discriminator module.
                The period parameter will be overwritten.
        """
        super().__init__()
@ -385,7 +416,8 @@ class HiFiGANMultiPeriodDiscriminator(nn.Layer):
        """Calculate forward propagation.
        Args:
-            x (Tensor): Input noise signal (B, 1, T).
+            x (Tensor): 
                Input noise signal (B, 1, T).
        Returns:
            List: List of list of each discriminator outputs, which consists of each layer output tensors.
        """
@ -417,16 +449,25 @@ class HiFiGANScaleDiscriminator(nn.Layer):
        """Initialize HiFiGAN scale discriminator module.
        Args:
-            in_channels (int): Number of input channels.
+            in_channels (int): 
-            out_channels (int): Number of output channels.
+                Number of input channels.
-            kernel_sizes (list): List of four kernel sizes. The first will be used for the first conv layer,
+            out_channels (int): 
                Number of output channels.
            kernel_sizes (list): 
                List of four kernel sizes. The first will be used for the first conv layer,
                and the second is for downsampling part, and the remaining two are for output layers.
-            channels (int): Initial number of channels for conv layer.
+            channels (int): 
-            max_downsample_channels (int): Maximum number of channels for downsampling layers.
+                Initial number of channels for conv layer.
-            bias (bool): Whether to add bias parameter in convolution layers.
+            max_downsample_channels (int): 
-            downsample_scales (list): List of downsampling scales.
+                Maximum number of channels for downsampling layers.
-            nonlinear_activation (str): Activation function module name.
+            bias (bool): 
-            nonlinear_activation_params (dict): Hyperparameters for activation function.
+                Whether to add bias parameter in convolution layers.
            downsample_scales (list): 
                List of downsampling scales.
            nonlinear_activation (str): 
                Activation function module name.
            nonlinear_activation_params (dict): 
                Hyperparameters for activation function.
            use_weight_norm (bool): Whether to use weight norm.
                If set to true, it will be applied to all of the conv layers.
            use_spectral_norm (bool): Whether to use spectral norm.
@ -614,7 +655,8 @@ class HiFiGANMultiScaleDiscriminator(nn.Layer):
        """Calculate forward propagation.
        Args:
-            x (Tensor): Input noise signal (B, 1, T).
+            x (Tensor): 
                Input noise signal (B, 1, T).
        Returns:
            List: List of list of each discriminator outputs, which consists of each layer output tensors.
        """
@ -675,14 +717,21 @@ class HiFiGANMultiScaleMultiPeriodDiscriminator(nn.Layer):
        """Initialize HiFiGAN multi-scale + multi-period discriminator module.
        Args:
-            scales (int): Number of multi-scales.
+            scales (int): 
-            scale_downsample_pooling (str): Pooling module name for downsampling of the inputs.
+                Number of multi-scales.
-            scale_downsample_pooling_params (dict): Parameters for the above pooling module.
+            scale_downsample_pooling (str): 
-            scale_discriminator_params (dict): Parameters for hifi-gan scale discriminator module.
+                Pooling module name for downsampling of the inputs.
-            follow_official_norm （bool): Whether to follow the norm setting of the official implementaion. 
+            scale_downsample_pooling_params (dict): 
                Parameters for the above pooling module.
            scale_discriminator_params (dict): 
                Parameters for hifi-gan scale discriminator module.
            follow_official_norm (bool): 
                Whether to follow the norm setting of the official implementation. 
                The first discriminator uses spectral norm and the other discriminators use weight norm.
-            periods (list): List of periods.
+            periods (list): 
-            period_discriminator_params (dict): Parameters for hifi-gan period discriminator module.
+                List of periods.
            period_discriminator_params (dict): 
                Parameters for hifi-gan period discriminator module.
                The period parameter will be overwritten.
        """
        super().__init__()
@ -704,7 +753,8 @@ class HiFiGANMultiScaleMultiPeriodDiscriminator(nn.Layer):
        """Calculate forward propagation.
        Args:
-            x (Tensor): Input noise signal (B, 1, T).
+            x (Tensor): 
                Input noise signal (B, 1, T).
        Returns:
            List:
                List of list of each discriminator outputs,
--- a/paddlespeech/t2s/models/melgan/melgan.py
+++ b/paddlespeech/t2s/models/melgan/melgan.py
@ -53,24 +53,38 @@ class MelGANGenerator(nn.Layer):
        """Initialize MelGANGenerator module.
        Args:
-            in_channels (int): Number of input channels.
+            in_channels (int): 
-            out_channels (int): Number of output channels,
+                Number of input channels.
            out_channels (int): 
                Number of output channels,
                the number of sub-band is out_channels in multi-band melgan.
-            kernel_size (int): Kernel size of initial and final conv layer.
+            kernel_size (int): 
-            channels (int): Initial number of channels for conv layer.
+                Kernel size of initial and final conv layer.
-            bias (bool): Whether to add bias parameter in convolution layers.
+            channels (int): 
-            upsample_scales (List[int]): List of upsampling scales.
+                Initial number of channels for conv layer.
-            stack_kernel_size (int): Kernel size of dilated conv layers in residual stack.
+            bias (bool): 
-            stacks (int): Number of stacks in a single residual stack.
+                Whether to add bias parameter in convolution layers.
-            nonlinear_activation (Optional[str], optional): Non linear activation in upsample network, by default None
+            upsample_scales (List[int]): 
-            nonlinear_activation_params (Dict[str, Any], optional): Parameters passed to the linear activation in the upsample network, 
+                List of upsampling scales.
-                by default {}
+            stack_kernel_size (int): 
-            pad (str): Padding function module name before dilated convolution layer.
+                Kernel size of dilated conv layers in residual stack.
-            pad_params (dict): Hyperparameters for padding function.
+            stacks (int): 
-            use_final_nonlinear_activation (nn.Layer): Activation function for the final layer.
+                Number of stacks in a single residual stack.
-            use_weight_norm (bool): Whether to use weight norm.
+            nonlinear_activation (Optional[str], optional): 
                Non linear activation in upsample network, by default None
            nonlinear_activation_params (Dict[str, Any], optional): 
                Parameters passed to the nonlinear activation in the upsample network, by default {}
            pad (str): 
                Padding function module name before dilated convolution layer.
            pad_params (dict): 
                Hyperparameters for padding function.
            use_final_nonlinear_activation (nn.Layer): 
                Activation function for the final layer.
            use_weight_norm (bool): 
                Whether to use weight norm.
                If set to true, it will be applied to all of the conv layers.
-            use_causal_conv (bool): Whether to use causal convolution.
+            use_causal_conv (bool):
                Whether to use causal convolution.
        """
        super().__init__()
@ -194,7 +208,8 @@ class MelGANGenerator(nn.Layer):
        """Calculate forward propagation.
        Args:
-            c (Tensor): Input tensor (B, in_channels, T).
+            c (Tensor): 
                Input tensor (B, in_channels, T).
        Returns:
            Tensor: Output tensor (B, out_channels, T ** prod(upsample_scales)).
        """
@ -244,7 +259,8 @@ class MelGANGenerator(nn.Layer):
        """Perform inference.
        Args:
-            c (Union[Tensor, ndarray]): Input tensor (T, in_channels).
+            c (Union[Tensor, ndarray]): 
                Input tensor (T, in_channels).
        Returns:
            Tensor: Output tensor (out_channels*T ** prod(upsample_scales), 1).
        """
@ -279,20 +295,30 @@ class MelGANDiscriminator(nn.Layer):
        """Initialize MelGAN discriminator module.
        Args:
-            in_channels (int): Number of input channels.
+            in_channels (int): 
-            out_channels (int): Number of output channels.
+                Number of input channels.
            out_channels (int): 
                Number of output channels.
            kernel_sizes (List[int]): List of two kernel sizes. The prod will be used for the first conv layer,
                and the first and the second kernel sizes will be used for the last two layers.
                For example if kernel_sizes = [5, 3], the first layer kernel size will be 5 * 3 = 15,
                the last two layers' kernel size will be 5 and 3, respectively.
-            channels (int): Initial number of channels for conv layer.
+            channels (int): 
-            max_downsample_channels (int): Maximum number of channels for downsampling layers.
+                Initial number of channels for conv layer.
-            bias (bool): Whether to add bias parameter in convolution layers.
+            max_downsample_channels (int): 
-            downsample_scales (List[int]): List of downsampling scales.
+                Maximum number of channels for downsampling layers.
-            nonlinear_activation (str): Activation function module name.
+            bias (bool): 
-            nonlinear_activation_params (dict): Hyperparameters for activation function.
+                Whether to add bias parameter in convolution layers.
-            pad (str): Padding function module name before dilated convolution layer.
+            downsample_scales (List[int]): 
-            pad_params (dict): Hyperparameters for padding function.
+                List of downsampling scales.
            nonlinear_activation (str): 
                Activation function module name.
            nonlinear_activation_params (dict): 
                Hyperparameters for activation function.
            pad (str): 
                Padding function module name before dilated convolution layer.
            pad_params (dict): 
                Hyperparameters for padding function.
        """
        super().__init__()
@ -364,7 +390,8 @@ class MelGANDiscriminator(nn.Layer):
    def forward(self, x):
        """Calculate forward propagation.
        Args:
-            x (Tensor): Input noise signal (B, 1, T).
+            x (Tensor): 
                Input noise signal (B, 1, T).
        Returns:
            List: List of output tensors of each layer (for feat_match_loss).
        """
@ -406,22 +433,37 @@ class MelGANMultiScaleDiscriminator(nn.Layer):
        """Initialize MelGAN multi-scale discriminator module.
        Args:
-            in_channels (int): Number of input channels.
+            in_channels (int): 
-            out_channels (int): Number of output channels.
+                Number of input channels.
-            scales (int): Number of multi-scales.
+            out_channels (int): 
-            downsample_pooling (str): Pooling module name for downsampling of the inputs.
+                Number of output channels.
-            downsample_pooling_params (dict): Parameters for the above pooling module.
+            scales (int): 
-            kernel_sizes (List[int]): List of two kernel sizes. The sum will be used for the first conv layer,
+                Number of multi-scales.
            downsample_pooling (str): 
                Pooling module name for downsampling of the inputs.
            downsample_pooling_params (dict): 
                Parameters for the above pooling module.
            kernel_sizes (List[int]): 
                List of two kernel sizes. The sum will be used for the first conv layer,
                and the first and the second kernel sizes will be used for the last two layers.
-            channels (int): Initial number of channels for conv layer.
+            channels (int): 
-            max_downsample_channels (int): Maximum number of channels for downsampling layers.
+                Initial number of channels for conv layer.
-            bias (bool): Whether to add bias parameter in convolution layers.
+            max_downsample_channels (int): 
-            downsample_scales (List[int]): List of downsampling scales.
+                Maximum number of channels for downsampling layers.
-            nonlinear_activation (str): Activation function module name.
+            bias (bool): 
-            nonlinear_activation_params (dict): Hyperparameters for activation function.
+                Whether to add bias parameter in convolution layers.
-            pad (str): Padding function module name before dilated convolution layer.
+            downsample_scales (List[int]): 
-            pad_params (dict): Hyperparameters for padding function.
+                List of downsampling scales.
-            use_causal_conv (bool): Whether to use causal convolution.
+            nonlinear_activation (str): 
                Activation function module name.
            nonlinear_activation_params (dict): 
                Hyperparameters for activation function.
            pad (str): 
                Padding function module name before dilated convolution layer.
            pad_params (dict): 
                Hyperparameters for padding function.
            use_causal_conv (bool): 
                Whether to use causal convolution.
        """
        super().__init__()
@ -464,7 +506,8 @@ class MelGANMultiScaleDiscriminator(nn.Layer):
    def forward(self, x):
        """Calculate forward propagation.
        Args:
-            x (Tensor): Input noise signal (B, 1, T).
+            x (Tensor):
                Input noise signal (B, 1, T).
        Returns:
            List: List of list of each discriminator outputs, which consists of each layer output tensors.
        """
--- a/paddlespeech/t2s/models/melgan/style_melgan.py
+++ b/paddlespeech/t2s/models/melgan/style_melgan.py
@ -54,20 +54,34 @@ class StyleMelGANGenerator(nn.Layer):
        """Initialize Style MelGAN generator.
        Args:
-            in_channels (int): Number of input noise channels.
+            in_channels (int): 
-            aux_channels (int): Number of auxiliary input channels.
+                Number of input noise channels.
-            channels (int): Number of channels for conv layer.
+            aux_channels (int): 
-            out_channels (int): Number of output channels.
+                Number of auxiliary input channels.
-            kernel_size (int): Kernel size of conv layers.
+            channels (int): 
-            dilation (int): Dilation factor for conv layers.
+                Number of channels for conv layer.
-            bias (bool): Whether to add bias parameter in convolution layers.
+            out_channels (int): 
-            noise_upsample_scales (list): List of noise upsampling scales.
+                Number of output channels.
-            noise_upsample_activation (str): Activation function module name for noise upsampling.
+            kernel_size (int): 
-            noise_upsample_activation_params (dict): Hyperparameters for the above activation function.
+                Kernel size of conv layers.
-            upsample_scales (list): List of upsampling scales.
+            dilation (int): 
-            upsample_mode (str): Upsampling mode in TADE layer.
+                Dilation factor for conv layers.
-            gated_function (str): Gated function in TADEResBlock ("softmax" or "sigmoid").
+            bias (bool): 
-            use_weight_norm (bool): Whether to use weight norm.
+                Whether to add bias parameter in convolution layers.
            noise_upsample_scales (list): 
                List of noise upsampling scales.
            noise_upsample_activation (str): 
                Activation function module name for noise upsampling.
            noise_upsample_activation_params (dict): 
                Hyperparameters for the above activation function.
            upsample_scales (list): 
                List of upsampling scales.
            upsample_mode (str): 
                Upsampling mode in TADE layer.
            gated_function (str): 
                Gated function in TADEResBlock ("softmax" or "sigmoid").
            use_weight_norm (bool): 
                Whether to use weight norm.
                If set to true, it will be applied to all of the conv layers.
        """
        super().__init__()
@ -194,7 +208,8 @@ class StyleMelGANGenerator(nn.Layer):
    def inference(self, c):
        """Perform inference.
        Args:
-            c (Tensor): Input tensor (T, in_channels).
+            c (Tensor): 
                Input tensor (T, in_channels).
        Returns:
            Tensor: Output tensor (T ** prod(upsample_scales), out_channels).
        """
@ -258,11 +273,16 @@ class StyleMelGANDiscriminator(nn.Layer):
        """Initialize Style MelGAN discriminator.
        Args:
-            repeats (int): Number of repititons to apply RWD.
+            repeats (int): 
-            window_sizes (list): List of random window sizes.
+                Number of repetitions to apply RWD.
-            pqmf_params (list): List of list of Parameters for PQMF modules
+            window_sizes (list): 
-            discriminator_params (dict): Parameters for base discriminator module.
+                List of random window sizes.
-            use_weight_nom (bool): Whether to apply weight normalization.
+            pqmf_params (list): 
                List of list of Parameters for PQMF modules
            discriminator_params (dict): 
                Parameters for base discriminator module.
            use_weight_nom (bool): 
                Whether to apply weight normalization.
        """
        super().__init__()
@ -299,7 +319,8 @@ class StyleMelGANDiscriminator(nn.Layer):
    def forward(self, x):
        """Calculate forward propagation.
        Args:
-            x (Tensor): Input tensor (B, 1, T).
+            x (Tensor): 
                Input tensor (B, 1, T).
        Returns:
            List: List of discriminator outputs, #items in the list will be
                equal to repeats * #discriminators.
--- a/paddlespeech/t2s/models/parallel_wavegan/parallel_wavegan.py
+++ b/paddlespeech/t2s/models/parallel_wavegan/parallel_wavegan.py
@ -32,29 +32,45 @@ class PWGGenerator(nn.Layer):
    """Wave Generator for Parallel WaveGAN
    Args:
-        in_channels (int, optional): Number of channels of the input waveform, by default 1
+        in_channels (int, optional): 
-        out_channels (int, optional): Number of channels of the output waveform, by default 1
+            Number of channels of the input waveform, by default 1
-        kernel_size (int, optional): Kernel size of the residual blocks inside, by default 3
+        out_channels (int, optional): 
-        layers (int, optional): Number of residual blocks inside, by default 30
+            Number of channels of the output waveform, by default 1
-        stacks (int, optional): The number of groups to split the residual blocks into, by default 3
+        kernel_size (int, optional): 
            Kernel size of the residual blocks inside, by default 3
        layers (int, optional): 
            Number of residual blocks inside, by default 30
        stacks (int, optional):
            The number of groups to split the residual blocks into, by default 3
            Within each group, the dilation of the residual block grows exponentially.
-        residual_channels (int, optional): Residual channel of the residual blocks, by default 64
+        residual_channels (int, optional): 
-        gate_channels (int, optional): Gate channel of the residual blocks, by default 128
+            Residual channel of the residual blocks, by default 64
-        skip_channels (int, optional): Skip channel of the residual blocks, by default 64
+        gate_channels (int, optional): 
-        aux_channels (int, optional): Auxiliary channel of the residual blocks, by default 80
+            Gate channel of the residual blocks, by default 128
-        aux_context_window (int, optional): The context window size of the first convolution applied to the 
+        skip_channels (int, optional): 
-            auxiliary input, by default 2
+            Skip channel of the residual blocks, by default 64
-        dropout (float, optional): Dropout of the residual blocks, by default 0.
+        aux_channels (int, optional): 
-        bias (bool, optional): Whether to use bias in residual blocks, by default True
+            Auxiliary channel of the residual blocks, by default 80
-        use_weight_norm (bool, optional): Whether to use weight norm in all convolutions, by default True
+        aux_context_window (int, optional): 
-        use_causal_conv (bool, optional): Whether to use causal padding in the upsample network and residual 
+            The context window size of the first convolution applied to the auxiliary input, by default 2
-            blocks, by default False
+        dropout (float, optional): 
-        upsample_scales (List[int], optional): Upsample scales of the upsample network, by default [4, 4, 4, 4]
+            Dropout of the residual blocks, by default 0.
-        nonlinear_activation (Optional[str], optional): Non linear activation in upsample network, by default None
+        bias (bool, optional): 
-        nonlinear_activation_params (Dict[str, Any], optional): Parameters passed to the linear activation in the upsample network, 
+            Whether to use bias in residual blocks, by default True
-            by default {}
+        use_weight_norm (bool, optional): 
-        interpolate_mode (str, optional): Interpolation mode of the upsample network, by default "nearest"
+            Whether to use weight norm in all convolutions, by default True
-        freq_axis_kernel_size (int, optional): Kernel size along the frequency axis of the upsample network, by default 1
+        use_causal_conv (bool, optional): 
            Whether to use causal padding in the upsample network and residual blocks, by default False
        upsample_scales (List[int], optional): 
            Upsample scales of the upsample network, by default [4, 4, 4, 4]
        nonlinear_activation (Optional[str], optional): 
            Non linear activation in upsample network, by default None
        nonlinear_activation_params (Dict[str, Any], optional): 
            Parameters passed to the nonlinear activation in the upsample network, by default {}
        interpolate_mode (str, optional): 
            Interpolation mode of the upsample network, by default "nearest"
        freq_axis_kernel_size (int, optional): 
            Kernel size along the frequency axis of the upsample network, by default 1
    """
    def __init__(
@ -147,9 +163,11 @@ class PWGGenerator(nn.Layer):
        """Generate waveform.
        Args:
-            x(Tensor): Shape (N, C_in, T), The input waveform.
+            x(Tensor): 
-            c(Tensor): Shape (N, C_aux, T'). The auxiliary input (e.g. spectrogram). It
+                Shape (N, C_in, T), The input waveform.
-            is upsampled to match the time resolution of the input.
+            c(Tensor): 
                Shape (N, C_aux, T'). The auxiliary input (e.g. spectrogram). 
                It is upsampled to match the time resolution of the input.
        Returns:
            Tensor: Shape (N, C_out, T), the generated waveform.
@ -195,8 +213,10 @@ class PWGGenerator(nn.Layer):
        """Waveform generation. This function is used for single instance inference.
        Args:
-            c(Tensor, optional, optional): Shape (T', C_aux), the auxiliary input, by default None
+            c(Tensor, optional): 
-            x(Tensor, optional): Shape (T, C_in), the noise waveform, by default None
+                Shape (T', C_aux), the auxiliary input, by default None
            x(Tensor, optional): 
                Shape (T, C_in), the noise waveform, by default None
        Returns:
            Tensor: Shape (T, C_out), the generated waveform
@ -214,20 +234,28 @@ class PWGDiscriminator(nn.Layer):
    """A convolutional discriminator for audio.
    Args:
-        in_channels (int, optional): Number of channels of the input audio, by default 1
+        in_channels (int, optional): 
-        out_channels (int, optional): Output feature size, by default 1
+            Number of channels of the input audio, by default 1
-        kernel_size (int, optional): Kernel size of convolutional sublayers, by default 3
+        out_channels (int, optional): 
-        layers (int, optional): Number of layers, by default 10
+            Output feature size, by default 1
-        conv_channels (int, optional): Feature size of the convolutional sublayers, by default 64
+        kernel_size (int, optional): 
-        dilation_factor (int, optional): The factor with which dilation of each convolutional sublayers grows 
+            Kernel size of convolutional sublayers, by default 3
        layers (int, optional): 
            Number of layers, by default 10
        conv_channels (int, optional): 
            Feature size of the convolutional sublayers, by default 64
        dilation_factor (int, optional): 
            The factor with which dilation of each convolutional sublayers grows 
            exponentially if it is greater than 1, else the dilation of each convolutional sublayers grows linearly, 
            by default 1
-        nonlinear_activation (str, optional): The activation after each convolutional sublayer, by default "leakyrelu"
+        nonlinear_activation (str, optional): 
-        nonlinear_activation_params (Dict[str, Any], optional): The parameters passed to the activation's initializer, by default 
+            The activation after each convolutional sublayer, by default "leakyrelu"
-            {"negative_slope": 0.2}
+        nonlinear_activation_params (Dict[str, Any], optional): 
-        bias (bool, optional): Whether to use bias in convolutional sublayers, by default True
+            The parameters passed to the activation's initializer, by default {"negative_slope": 0.2}
-        use_weight_norm (bool, optional): Whether to use weight normalization at all convolutional sublayers, 
+        bias (bool, optional): 
-            by default True
+            Whether to use bias in convolutional sublayers, by default True
        use_weight_norm (bool, optional): 
            Whether to use weight normalization at all convolutional sublayers, by default True
    """
    def __init__(
@ -290,7 +318,8 @@ class PWGDiscriminator(nn.Layer):
        """
        Args:
-            x (Tensor): Shape (N, in_channels, num_samples), the input audio.
+            x (Tensor): 
                Shape (N, in_channels, num_samples), the input audio.
        Returns:
            Tensor: Shape (N, out_channels, num_samples), the predicted logits.
@ -318,24 +347,35 @@ class ResidualPWGDiscriminator(nn.Layer):
    """A wavenet-style discriminator for audio.
    Args:
-        in_channels (int, optional): Number of channels of the input audio, by default 1
+        in_channels (int, optional): 
-        out_channels (int, optional): Output feature size, by default 1
+            Number of channels of the input audio, by default 1
-        kernel_size (int, optional): Kernel size of residual blocks, by default 3
+        out_channels (int, optional): 
-        layers (int, optional): Number of residual blocks, by default 30
+            Output feature size, by default 1
-        stacks (int, optional): Number of groups of residual blocks, within which the dilation 
+        kernel_size (int, optional): 
            Kernel size of residual blocks, by default 3
        layers (int, optional): 
            Number of residual blocks, by default 30
        stacks (int, optional): 
            Number of groups of residual blocks, within which the dilation 
            of each residual blocks grows exponentially, by default 3
-        residual_channels (int, optional): Residual channels of residual blocks, by default 64
+        residual_channels (int, optional): 
-        gate_channels (int, optional): Gate channels of residual blocks, by default 128
+            Residual channels of residual blocks, by default 64
-        skip_channels (int, optional): Skip channels of residual blocks, by default 64
+        gate_channels (int, optional): 
-        dropout (float, optional): Dropout probability of residual blocks, by default 0.
+            Gate channels of residual blocks, by default 128
-        bias (bool, optional): Whether to use bias in residual blocks, by default True
+        skip_channels (int, optional): 
-        use_weight_norm (bool, optional): Whether to use weight normalization in all convolutional layers, 
+            Skip channels of residual blocks, by default 64
-            by default True
+        dropout (float, optional): 
-        use_causal_conv (bool, optional): Whether to use causal convolution in residual blocks, by default False
+            Dropout probability of residual blocks, by default 0.
-        nonlinear_activation (str, optional): Activation after convolutions other than those in residual blocks, 
+        bias (bool, optional): 
-            by default "leakyrelu"
+            Whether to use bias in residual blocks, by default True
-        nonlinear_activation_params (Dict[str, Any], optional): Parameters to pass to the activation, 
+        use_weight_norm (bool, optional): 
-            by default {"negative_slope": 0.2}
+            Whether to use weight normalization in all convolutional layers, by default True
        use_causal_conv (bool, optional): 
            Whether to use causal convolution in residual blocks, by default False
        nonlinear_activation (str, optional): 
            Activation after convolutions other than those in residual blocks, by default "leakyrelu"
        nonlinear_activation_params (Dict[str, Any], optional): 
            Parameters to pass to the activation, by default {"negative_slope": 0.2}
    """
    def __init__(
@ -405,7 +445,8 @@ class ResidualPWGDiscriminator(nn.Layer):
    def forward(self, x):
        """
        Args:
-            x(Tensor): Shape (N, in_channels, num_samples), the input audio.↩
+            x(Tensor): 
                Shape (N, in_channels, num_samples), the input audio.
        Returns:
            Tensor: Shape (N, out_channels, num_samples), the predicted logits.
--- a/paddlespeech/t2s/models/speedyspeech/speedyspeech.py
+++ b/paddlespeech/t2s/models/speedyspeech/speedyspeech.py
@ -29,10 +29,14 @@ class ResidualBlock(nn.Layer):
                 n: int=2):
        """SpeedySpeech encoder module.
        Args:
-            channels (int, optional): Feature size of the residual output(and also the input).
+            channels (int, optional): 
-            kernel_size (int, optional): Kernel size of the 1D convolution.
+                Feature size of the residual output(and also the input).
-            dilation (int, optional): Dilation of the 1D convolution.
+            kernel_size (int, optional): 
-            n (int): Number of blocks.
+                Kernel size of the 1D convolution.
            dilation (int, optional): 
                Dilation of the 1D convolution.
            n (int): 
                Number of blocks.
        """
        super().__init__()
@ -57,7 +61,8 @@ class ResidualBlock(nn.Layer):
    def forward(self, x: paddle.Tensor):
        """Calculate forward propagation.
        Args:
-            x(Tensor): Batch of input sequences (B, hidden_size, Tmax).
+            x(Tensor): 
                Batch of input sequences (B, hidden_size, Tmax).
        Returns:
            Tensor: The residual output (B, hidden_size, Tmax).
        """
@ -89,8 +94,10 @@ class TextEmbedding(nn.Layer):
    def forward(self, text: paddle.Tensor, tone: paddle.Tensor=None):
        """Calculate forward propagation.
        Args:
-            text(Tensor(int64)): Batch of padded token ids (B, Tmax).
+            text(Tensor(int64)): 
-            tones(Tensor, optional(int64)): Batch of padded tone ids (B, Tmax).
+                Batch of padded token ids (B, Tmax).
            tone(Tensor, optional(int64)): 
                Batch of padded tone ids (B, Tmax).
        Returns:
            Tensor: The residual output (B, Tmax, embedding_size).
        """
@ -109,12 +116,18 @@ class TextEmbedding(nn.Layer):
 class SpeedySpeechEncoder(nn.Layer):
    """SpeedySpeech encoder module.
    Args:
-        vocab_size (int): Dimension of the inputs.
+        vocab_size (int): 
-        tone_size (Optional[int]): Number of tones.
+            Dimension of the inputs.
-        hidden_size (int): Number of encoder hidden units.
+        tone_size (Optional[int]): 
-        kernel_size (int): Kernel size of encoder.
+            Number of tones.
-        dilations (List[int]): Dilations of encoder.
+        hidden_size (int): 
-        spk_num (Optional[int]): Number of speakers. 
+            Number of encoder hidden units.
        kernel_size (int): 
            Kernel size of encoder.
        dilations (List[int]): 
            Dilations of encoder.
        spk_num (Optional[int]): 
            Number of speakers. 
    """
    def __init__(self,
@ -161,9 +174,12 @@ class SpeedySpeechEncoder(nn.Layer):
                spk_id: paddle.Tensor=None):
        """Encoder input sequence.
        Args:
-            text(Tensor(int64)): Batch of padded token ids (B, Tmax).
+            text(Tensor(int64)): 
-            tones(Tensor, optional(int64)): Batch of padded tone ids (B, Tmax).
+                Batch of padded token ids (B, Tmax).
-            spk_id(Tnesor, optional(int64)): Batch of speaker ids (B,)
+            tones(Tensor, optional(int64)): 
                Batch of padded tone ids (B, Tmax).
            spk_id(Tensor, optional(int64)): 
                Batch of speaker ids (B,)
        Returns:
            Tensor: Output tensor (B, Tmax, hidden_size).
@ -192,7 +208,8 @@ class DurationPredictor(nn.Layer):
    def forward(self, x: paddle.Tensor):
        """Calculate forward propagation.
        Args:
-            x(Tensor): Batch of input sequences (B, Tmax, hidden_size).
+            x(Tensor): 
                Batch of input sequences (B, Tmax, hidden_size).
        Returns:
            Tensor: Batch of predicted durations in log domain (B, Tmax).
@ -212,10 +229,14 @@ class SpeedySpeechDecoder(nn.Layer):
                 ]):
        """SpeedySpeech decoder module.
        Args:
-            hidden_size (int): Number of decoder hidden units.
+            hidden_size (int): 
-            kernel_size (int): Kernel size of decoder.
+                Number of decoder hidden units.
-            output_size (int): Dimension of the outputs.
+            kernel_size (int): 
-            dilations (List[int]): Dilations of decoder.
+                Kernel size of decoder.
            output_size (int): 
                Dimension of the outputs.
            dilations (List[int]): 
                Dilations of decoder.
        """
        super().__init__()
        res_blocks = [
@ -230,7 +251,8 @@ class SpeedySpeechDecoder(nn.Layer):
    def forward(self, x):
        """Decoder input sequence.
        Args:
-            x(Tensor): Input tensor (B, time, hidden_size).
+            x(Tensor): 
                Input tensor (B, time, hidden_size).
        Returns:
            Tensor: Output tensor (B, time, output_size).
@ -261,18 +283,30 @@ class SpeedySpeech(nn.Layer):
            positional_dropout_rate: int=0.1):
        """Initialize SpeedySpeech module.
        Args:
-            vocab_size (int): Dimension of the inputs.
+            vocab_size (int): 
-            encoder_hidden_size (int): Number of encoder hidden units.
+                Dimension of the inputs.
-            encoder_kernel_size (int): Kernel size of encoder.
+            encoder_hidden_size (int): 
-            encoder_dilations (List[int]): Dilations of encoder.
+                Number of encoder hidden units.
-            duration_predictor_hidden_size (int): Number of duration predictor hidden units.
+            encoder_kernel_size (int): 
-            decoder_hidden_size (int): Number of decoder hidden units.
+                Kernel size of encoder.
-            decoder_kernel_size (int): Kernel size of decoder.
+            encoder_dilations (List[int]): 
-            decoder_dilations (List[int]): Dilations of decoder.
+                Dilations of encoder.
-            decoder_output_size (int): Dimension of the outputs.
+            duration_predictor_hidden_size (int):
-            tone_size (Optional[int]): Number of tones.
+                Number of duration predictor hidden units.
-            spk_num (Optional[int]): Number of speakers. 
+            decoder_hidden_size (int): 
-            init_type (str): How to initialize transformer parameters.
+                Number of decoder hidden units.
            decoder_kernel_size (int): 
                Kernel size of decoder.
            decoder_dilations (List[int]): 
                Dilations of decoder.
            decoder_output_size (int): 
                Dimension of the outputs.
            tone_size (Optional[int]): 
                Number of tones.
            spk_num (Optional[int]): 
                Number of speakers. 
            init_type (str): 
                How to initialize transformer parameters.
        """
        super().__init__()
@ -304,14 +338,20 @@ class SpeedySpeech(nn.Layer):
                spk_id: paddle.Tensor=None):
        """Calculate forward propagation.
        Args:
-            text(Tensor(int64)): Batch of padded token ids (B, Tmax).
+            text(Tensor(int64)): 
-            durations(Tensor(int64)): Batch of padded durations (B, Tmax).
+                Batch of padded token ids (B, Tmax).
-            tones(Tensor, optional(int64)): Batch of padded tone ids  (B, Tmax).
+            durations(Tensor(int64)): 
-            spk_id(Tnesor, optional(int64)): Batch of speaker ids (B,)
+                Batch of padded durations (B, Tmax).
            tones(Tensor, optional(int64)): 
                Batch of padded tone ids  (B, Tmax).
            spk_id(Tensor, optional(int64)): 
                Batch of speaker ids (B,)
        Returns:
-            Tensor: Output tensor (B, T_frames, decoder_output_size).
+            Tensor: 
-            Tensor: Predicted durations (B, Tmax).
+                Output tensor (B, T_frames, decoder_output_size).
            Tensor: 
                Predicted durations (B, Tmax).
        """
        # input of embedding must be int64
        text = paddle.cast(text, 'int64')
@ -336,10 +376,14 @@ class SpeedySpeech(nn.Layer):
                  spk_id: paddle.Tensor=None):
        """Generate the sequence of features given the sequences of characters.
        Args:
-            text(Tensor(int64)): Input sequence of characters (T,).
+            text(Tensor(int64)): 
-            tones(Tensor, optional(int64)): Batch of padded tone ids (T, ).
+                Input sequence of characters (T,).
-            durations(Tensor, optional (int64)): Groundtruth of duration (T,).
+            tones(Tensor, optional(int64)): 
-            spk_id(Tensor, optional(int64), optional): spk ids (1,). (Default value = None)
+                Batch of padded tone ids (T, ).
            durations(Tensor, optional (int64)): 
                Groundtruth of duration (T,).
            spk_id(Tensor, optional(int64)): 
                spk ids (1,). (Default value = None)
        Returns:
            Tensor: logmel (T, decoder_output_size).
--- a/paddlespeech/t2s/models/tacotron2/tacotron2.py
+++ b/paddlespeech/t2s/models/tacotron2/tacotron2.py
@ -83,38 +83,67 @@ class Tacotron2(nn.Layer):
            init_type: str="xavier_uniform", ):
        """Initialize Tacotron2 module.
        Args:
-            idim (int): Dimension of the inputs.
+            idim (int): 
-            odim (int): Dimension of the outputs.
+                Dimension of the inputs.
-            embed_dim (int): Dimension of the token embedding.
+            odim (int): 
-            elayers (int): Number of encoder blstm layers.
+                Dimension of the outputs.
-            eunits (int): Number of encoder blstm units.
+            embed_dim (int): 
-            econv_layers (int): Number of encoder conv layers.
+                Dimension of the token embedding.
-            econv_filts (int): Number of encoder conv filter size.
+            elayers (int): 
-            econv_chans (int): Number of encoder conv filter channels.
+                Number of encoder blstm layers.
-            dlayers (int): Number of decoder lstm layers.
+            eunits (int): 
-            dunits (int): Number of decoder lstm units.
+                Number of encoder blstm units.
-            prenet_layers (int): Number of prenet layers.
+            econv_layers (int): 
-            prenet_units (int): Number of prenet units.
+                Number of encoder conv layers.
-            postnet_layers (int): Number of postnet layers.
+            econv_filts (int): 
-            postnet_filts (int): Number of postnet filter size.
+                Number of encoder conv filter size.
-            postnet_chans (int): Number of postnet filter channels.
+            econv_chans (int): 
-            output_activation (str): Name of activation function for outputs.
+                Number of encoder conv filter channels.
-            adim (int): Number of dimension of mlp in attention.
+            dlayers (int): 
-            aconv_chans (int): Number of attention conv filter channels.
+                Number of decoder lstm layers.
-            aconv_filts (int): Number of attention conv filter size.
+            dunits (int): 
-            cumulate_att_w (bool): Whether to cumulate previous attention weight.
+                Number of decoder lstm units.
-            use_batch_norm (bool): Whether to use batch normalization.
+            prenet_layers (int): 
-            use_concate (bool): Whether to concat enc outputs w/ dec lstm outputs.
+                Number of prenet layers.
-            reduction_factor (int): Reduction factor.
+            prenet_units (int): 
-            spk_num (Optional[int]): Number of speakers. If set to > 1, assume that the
+                Number of prenet units.
            postnet_layers (int): 
                Number of postnet layers.
            postnet_filts (int): 
                Number of postnet filter size.
            postnet_chans (int): 
                Number of postnet filter channels.
            output_activation (str): 
                Name of activation function for outputs.
            adim (int): 
                Number of dimension of mlp in attention.
            aconv_chans (int): 
                Number of attention conv filter channels.
            aconv_filts (int): 
                Number of attention conv filter size.
            cumulate_att_w (bool): 
                Whether to cumulate previous attention weight.
            use_batch_norm (bool): 
                Whether to use batch normalization.
            use_concate (bool): 
                Whether to concat enc outputs w/ dec lstm outputs.
            reduction_factor (int): 
                Reduction factor.
            spk_num (Optional[int]): 
                Number of speakers. If set to > 1, assume that the
                sids will be provided as the input and use sid embedding layer.
-            lang_num (Optional[int]): Number of languages. If set to > 1, assume that the
+            lang_num (Optional[int]): 
                Number of languages. If set to > 1, assume that the
                lids will be provided as the input and use lid embedding layer.
-            spk_embed_dim (Optional[int]): Speaker embedding dimension. If set to > 0,
+            spk_embed_dim (Optional[int]): 
                Speaker embedding dimension. If set to > 0,
                assume that spk_emb will be provided as the input.
-            spk_embed_integration_type (str): How to integrate speaker embedding.
+            spk_embed_integration_type (str): 
-            dropout_rate (float): Dropout rate.
+                How to integrate speaker embedding.
-            zoneout_rate (float): Zoneout rate.
+            dropout_rate (float): 
                Dropout rate.
            zoneout_rate (float): 
                Zoneout rate.
        """
        assert check_argument_types()
        super().__init__()
@ -230,18 +259,28 @@ class Tacotron2(nn.Layer):
        """Calculate forward propagation.
        Args:
-            text (Tensor(int64)): Batch of padded character ids (B, T_text).
+            text (Tensor(int64)):   
-            text_lengths (Tensor(int64)): Batch of lengths of each input batch (B,).
+                Batch of padded character ids (B, T_text).
-            speech (Tensor): Batch of padded target features (B, T_feats, odim).
+            text_lengths (Tensor(int64)): 
-            speech_lengths (Tensor(int64)): Batch of the lengths of each target (B,).
+                Batch of lengths of each input batch (B,).
-            spk_emb (Optional[Tensor]): Batch of speaker embeddings (B, spk_embed_dim).
+            speech (Tensor):
-            spk_id (Optional[Tensor]): Batch of speaker IDs (B, 1).
+                Batch of padded target features (B, T_feats, odim).
-            lang_id (Optional[Tensor]): Batch of language IDs (B, 1).
+            speech_lengths (Tensor(int64)): 
                Batch of the lengths of each target (B,).
            spk_emb (Optional[Tensor]): 
                Batch of speaker embeddings (B, spk_embed_dim).
            spk_id (Optional[Tensor]): 
                Batch of speaker IDs (B, 1).
            lang_id (Optional[Tensor]): 
                Batch of language IDs (B, 1).
        Returns:
-            Tensor: Loss scalar value.
+            Tensor: 
-            Dict: Statistics to be monitored.
+                Loss scalar value.
-            Tensor: Weight value if not joint training else model outputs.
+            Dict: 
                Statistics to be monitored.
            Tensor: 
                Weight value if not joint training else model outputs.
        """
        text = text[:, :text_lengths.max()]
@ -329,18 +368,30 @@ class Tacotron2(nn.Layer):
        """Generate the sequence of features given the sequences of characters.
        Args:
-            text (Tensor(int64)): Input sequence of characters (T_text,).
+            text (Tensor(int64)): 
-            speech (Optional[Tensor]): Feature sequence to extract style (N, idim).
+                Input sequence of characters (T_text,).
-            spk_emb (ptional[Tensor]): Speaker embedding (spk_embed_dim,).
+            speech (Optional[Tensor]): 
-            spk_id (Optional[Tensor]): Speaker ID (1,).
+                Feature sequence to extract style (N, idim).
-            lang_id (Optional[Tensor]): Language ID (1,).
+            spk_emb (Optional[Tensor]): 
-            threshold (float): Threshold in inference.
+                Speaker embedding (spk_embed_dim,).
-            minlenratio (float): Minimum length ratio in inference.
+            spk_id (Optional[Tensor]): 
-            maxlenratio (float): Maximum length ratio in inference.
+                Speaker ID (1,).
-            use_att_constraint (bool): Whether to apply attention constraint.
+            lang_id (Optional[Tensor]): 
-            backward_window (int): Backward window in attention constraint.
+                Language ID (1,).
-            forward_window (int): Forward window in attention constraint.
+            threshold (float): 
-            use_teacher_forcing (bool): Whether to use teacher forcing.
+                Threshold in inference.
            minlenratio (float): 
                Minimum length ratio in inference.
            maxlenratio (float): 
                Maximum length ratio in inference.
            use_att_constraint (bool): 
                Whether to apply attention constraint.
            backward_window (int): 
                Backward window in attention constraint.
            forward_window (int): 
                Forward window in attention constraint.
            use_teacher_forcing (bool): 
                Whether to use teacher forcing.
        Returns:
            Dict[str, Tensor]
--- a/paddlespeech/t2s/models/transformer_tts/transformer_tts.py
+++ b/paddlespeech/t2s/models/transformer_tts/transformer_tts.py
@ -49,66 +49,124 @@ class TransformerTTS(nn.Layer):
        https://arxiv.org/pdf/1809.08895.pdf
    Args:
-        idim (int): Dimension of the inputs.
+        idim (int): 
-        odim (int): Dimension of the outputs.
+            Dimension of the inputs.
-        embed_dim (int, optional): Dimension of character embedding.
+        odim (int): 
-        eprenet_conv_layers (int, optional): Number of encoder prenet convolution layers.
+            Dimension of the outputs.
-        eprenet_conv_chans (int, optional): Number of encoder prenet convolution channels.
+        embed_dim (int, optional): 
-        eprenet_conv_filts (int, optional): Filter size of encoder prenet convolution.
+            Dimension of character embedding.
-        dprenet_layers (int, optional): Number of decoder prenet layers.
+        eprenet_conv_layers (int, optional): 
-        dprenet_units (int, optional): Number of decoder prenet hidden units.
+            Number of encoder prenet convolution layers.
-        elayers (int, optional): Number of encoder layers.
+        eprenet_conv_chans (int, optional): 
-        eunits (int, optional): Number of encoder hidden units.
+            Number of encoder prenet convolution channels.
-        adim (int, optional): Number of attention transformation dimensions.
+        eprenet_conv_filts (int, optional): 
-        aheads (int, optional): Number of heads for multi head attention.
+            Filter size of encoder prenet convolution.
-        dlayers (int, optional): Number of decoder layers.
+        dprenet_layers (int, optional): 
-        dunits (int, optional): Number of decoder hidden units.
+            Number of decoder prenet layers.
-        postnet_layers (int, optional): Number of postnet layers.
+        dprenet_units (int, optional): 
-        postnet_chans (int, optional): Number of postnet channels.
+            Number of decoder prenet hidden units.
-        postnet_filts (int, optional): Filter size of postnet.
+        elayers (int, optional): 
-        use_scaled_pos_enc (pool, optional): Whether to use trainable scaled positional encoding.
+            Number of encoder layers.
-        use_batch_norm (bool, optional): Whether to use batch normalization in encoder prenet.
+        eunits (int, optional): 
-        encoder_normalize_before (bool, optional): Whether to perform layer normalization before encoder block.
+            Number of encoder hidden units.
-        decoder_normalize_before (bool, optional): Whether to perform layer normalization before decoder block.
+        adim (int, optional): 
-        encoder_concat_after (bool, optional): Whether to concatenate attention layer's input and output in encoder.
+            Number of attention transformation dimensions.
-        decoder_concat_after (bool, optional): Whether to concatenate attention layer's input and output in decoder.
+        aheads (int, optional): 
-        positionwise_layer_type (str, optional): Position-wise operation type.
+            Number of heads for multi head attention.
-        positionwise_conv_kernel_size (int, optional): Kernel size in position wise conv 1d.
+        dlayers (int, optional): 
-        reduction_factor (int, optional): Reduction factor.
+            Number of decoder layers.
-        spk_embed_dim (int, optional): Number of speaker embedding dimenstions.
+        dunits (int, optional): 
-        spk_embed_integration_type (str, optional): How to integrate speaker embedding.
+            Number of decoder hidden units.
-        use_gst (str, optional): Whether to use global style token.
+        postnet_layers (int, optional): 
-        gst_tokens (int, optional): The number of GST embeddings.
+            Number of postnet layers.
-        gst_heads (int, optional): The number of heads in GST multihead attention.
+        postnet_chans (int, optional): 
-        gst_conv_layers (int, optional): The number of conv layers in GST.
+            Number of postnet channels.
-        gst_conv_chans_list (Sequence[int], optional): List of the number of channels of conv layers in GST.
+        postnet_filts (int, optional): 
-        gst_conv_kernel_size (int, optional): Kernal size of conv layers in GST.
+            Filter size of postnet.
-        gst_conv_stride (int, optional): Stride size of conv layers in GST.
+        use_scaled_pos_enc (bool, optional): 
-        gst_gru_layers (int, optional): The number of GRU layers in GST.
+            Whether to use trainable scaled positional encoding.
-        gst_gru_units (int, optional): The number of GRU units in GST.
+        use_batch_norm (bool, optional): 
-        transformer_lr (float, optional): Initial value of learning rate.
+            Whether to use batch normalization in encoder prenet.
-        transformer_warmup_steps (int, optional): Optimizer warmup steps.
+        encoder_normalize_before (bool, optional): 
-        transformer_enc_dropout_rate (float, optional): Dropout rate in encoder except attention and positional encoding.
+            Whether to perform layer normalization before encoder block.
-        transformer_enc_positional_dropout_rate (float, optional): Dropout rate after encoder positional encoding.
+        decoder_normalize_before (bool, optional): 
-        transformer_enc_attn_dropout_rate （float, optional): Dropout rate in encoder self-attention module.
+            Whether to perform layer normalization before decoder block.
-        transformer_dec_dropout_rate (float, optional): Dropout rate in decoder except attention & positional encoding.
+        encoder_concat_after (bool, optional): 
-        transformer_dec_positional_dropout_rate (float, optional): Dropout rate after decoder positional encoding.
+            Whether to concatenate attention layer's input and output in encoder.
-        transformer_dec_attn_dropout_rate （float, optional): Dropout rate in deocoder self-attention module.
+        decoder_concat_after (bool, optional): 
-        transformer_enc_dec_attn_dropout_rate (float, optional): Dropout rate in encoder-deocoder attention module.
+            Whether to concatenate attention layer's input and output in decoder.
-        init_type (str, optional): How to initialize transformer parameters.
+        positionwise_layer_type (str, optional): 
-        init_enc_alpha （float, optional）: Initial value of alpha in scaled pos encoding of the encoder.
+            Position-wise operation type.
-        init_dec_alpha (float, optional): Initial value of alpha in scaled pos encoding of the decoder.
+        positionwise_conv_kernel_size (int, optional): 
-        eprenet_dropout_rate (float, optional): Dropout rate in encoder prenet.
+            Kernel size in position wise conv 1d.
-        dprenet_dropout_rate (float, optional): Dropout rate in decoder prenet.
+        reduction_factor (int, optional): 
-        postnet_dropout_rate (float, optional): Dropout rate in postnet.
+            Reduction factor.
-        use_masking (bool, optional): Whether to apply masking for padded part in loss calculation.
+        spk_embed_dim (int, optional): 
-        use_weighted_masking (bool, optional): Whether to apply weighted masking in loss calculation.
+            Number of speaker embedding dimensions.
-        bce_pos_weight (float, optional): Positive sample weight in bce calculation (only for use_masking=true).
+        spk_embed_integration_type (str, optional): 
-        loss_type (str, optional): How to calculate loss.
+            How to integrate speaker embedding.
-        use_guided_attn_loss (bool, optional): Whether to use guided attention loss.
+        use_gst (str, optional): 
-        num_heads_applied_guided_attn (int, optional): Number of heads in each layer to apply guided attention loss.
+            Whether to use global style token.
-        num_layers_applied_guided_attn (int, optional): Number of layers to apply guided attention loss.
+        gst_tokens (int, optional): 
-            List of module names to apply guided attention loss.
+            The number of GST embeddings.
        gst_heads (int, optional): 
            The number of heads in GST multihead attention.
        gst_conv_layers (int, optional): 
            The number of conv layers in GST.
        gst_conv_chans_list (Sequence[int], optional): 
            List of the number of channels of conv layers in GST.
        gst_conv_kernel_size (int, optional): 
            Kernel size of conv layers in GST.
        gst_conv_stride (int, optional): 
            Stride size of conv layers in GST.
        gst_gru_layers (int, optional): 
            The number of GRU layers in GST.
        gst_gru_units (int, optional): 
            The number of GRU units in GST.
        transformer_lr (float, optional): 
            Initial value of learning rate.
        transformer_warmup_steps (int, optional): 
            Optimizer warmup steps.
        transformer_enc_dropout_rate (float, optional): 
            Dropout rate in encoder except attention and positional encoding.
        transformer_enc_positional_dropout_rate (float, optional): 
            Dropout rate after encoder positional encoding.
        transformer_enc_attn_dropout_rate (float, optional): 
            Dropout rate in encoder self-attention module.
        transformer_dec_dropout_rate (float, optional): 
            Dropout rate in decoder except attention & positional encoding.
        transformer_dec_positional_dropout_rate (float, optional): 
            Dropout rate after decoder positional encoding.
        transformer_dec_attn_dropout_rate (float, optional): 
            Dropout rate in decoder self-attention module.
        transformer_enc_dec_attn_dropout_rate (float, optional): 
            Dropout rate in encoder-decoder attention module.
        init_type (str, optional): 
            How to initialize transformer parameters.
        init_enc_alpha (float, optional): 
            Initial value of alpha in scaled pos encoding of the encoder.
        init_dec_alpha (float, optional): 
            Initial value of alpha in scaled pos encoding of the decoder.
        eprenet_dropout_rate (float, optional): 
            Dropout rate in encoder prenet.
        dprenet_dropout_rate (float, optional): 
            Dropout rate in decoder prenet.
        postnet_dropout_rate (float, optional): 
            Dropout rate in postnet.
        use_masking (bool, optional): 
            Whether to apply masking for padded part in loss calculation.
        use_weighted_masking (bool, optional): 
            Whether to apply weighted masking in loss calculation.
        bce_pos_weight (float, optional): 
            Positive sample weight in bce calculation (only for use_masking=true).
        loss_type (str, optional): 
            How to calculate loss.
        use_guided_attn_loss (bool, optional): 
            Whether to use guided attention loss.
        num_heads_applied_guided_attn (int, optional):
            Number of heads in each layer to apply guided attention loss.
        num_layers_applied_guided_attn (int, optional): 
            Number of layers to apply guided attention loss.
    """
    def __init__(
--- a/paddlespeech/t2s/models/waveflow.py
+++ b/paddlespeech/t2s/models/waveflow.py
@ -33,8 +33,10 @@ def fold(x, n_group):
    """Fold audio or spectrogram's temporal dimension in to groups.
    Args:
-        x(Tensor): The input tensor. shape=(*, time_steps)
+        x(Tensor): 
-        n_group(int): The size of a group.
+            The input tensor. shape=(*, time_steps)
        n_group(int): 
            The size of a group.
    Returns:
        Tensor: Folded tensor. shape=(*, time_steps // n_group, group)
@ -53,7 +55,8 @@ class UpsampleNet(nn.LayerList):
    on mel and time dimension.
    Args:
-        upscale_factors(List[int], optional): Time upsampling factors for each Conv2DTranspose Layer.
+        upscale_factors(List[int], optional): 
            Time upsampling factors for each Conv2DTranspose Layer.
            The ``UpsampleNet`` contains ``len(upscale_factor)`` Conv2DTranspose
            Layers. Each upscale_factor is used as the ``stride`` for the
            corresponding Conv2DTranspose. Defaults to [16, 16], this the default
@ -94,8 +97,10 @@ class UpsampleNet(nn.LayerList):
        """Forward pass of the ``UpsampleNet``
        Args:
-            x(Tensor): The input spectrogram. shape=(batch_size, input_channels, time_steps)
+            x(Tensor): 
-            trim_conv_artifact(bool, optional, optional): Trim deconvolution artifact at each layer. Defaults to False.
+                The input spectrogram. shape=(batch_size, input_channels, time_steps)
            trim_conv_artifact(bool, optional): 
                Trim deconvolution artifact at each layer. Defaults to False.
        Returns:
           Tensor: The upsampled spectrogram. shape=(batch_size, input_channels, time_steps * upsample_factor)
@ -123,10 +128,14 @@ class ResidualBlock(nn.Layer):
    and output.
    Args:
-        channels (int): Feature size of the input.
+        channels (int): 
-        cond_channels (int): Featuer size of the condition.
+            Feature size of the input.
-        kernel_size (Tuple[int]): Kernel size of the Convolution2d applied to the input.
+        cond_channels (int): 
-        dilations (int): Dilations of the Convolution2d applied to the input.
+            Feature size of the condition.
        kernel_size (Tuple[int]): 
            Kernel size of the Convolution2d applied to the input.
        dilations (int): 
            Dilations of the Convolution2d applied to the input.
    """
    def __init__(self, channels, cond_channels, kernel_size, dilations):
@ -173,12 +182,16 @@ class ResidualBlock(nn.Layer):
        """Compute output for a whole folded sequence.
        Args:
-            x (Tensor): The input. [shape=(batch_size, channel, height, width)]
+            x (Tensor): 
-            condition (Tensor [shape=(batch_size, condition_channel, height, width)]): The local condition.
+                The input. [shape=(batch_size, channel, height, width)]
            condition (Tensor [shape=(batch_size, condition_channel, height, width)]): 
                The local condition.
        Returns: 
-            res (Tensor): The residual output. [shape=(batch_size, channel, height, width)]
+            res (Tensor): 
-            skip (Tensor): The skip output. [shape=(batch_size, channel, height, width)]
+                The residual output. [shape=(batch_size, channel, height, width)]
            skip (Tensor): 
                The skip output. [shape=(batch_size, channel, height, width)]
        """
        x_in = x
        x = self.conv(x)
@ -216,12 +229,16 @@ class ResidualBlock(nn.Layer):
        """Compute the output for a row and update the buffer.
        Args:
-            x_row (Tensor): A row of the input. shape=(batch_size, channel, 1, width)
+            x_row (Tensor): 
-            condition_row (Tensor): A row of the condition. shape=(batch_size, condition_channel, 1, width)
+                A row of the input. shape=(batch_size, channel, 1, width)
            condition_row (Tensor): 
                A row of the condition. shape=(batch_size, condition_channel, 1, width)
        Returns:
-            res (Tensor): A row of the the residual output. shape=(batch_size, channel, 1, width)
+            res (Tensor): 
-            skip (Tensor): A row of the skip output. shape=(batch_size, channel, 1, width)
+                A row of the residual output. shape=(batch_size, channel, 1, width)
            skip (Tensor): 
                A row of the skip output. shape=(batch_size, channel, 1, width)
        """
        x_row_in = x_row
@ -258,11 +275,16 @@ class ResidualNet(nn.LayerList):
    """A stack of several ResidualBlocks. It merges condition at each layer.
    Args:
-        n_layer (int): Number of ResidualBlocks in the ResidualNet.
+        n_layer (int): 
-        residual_channels (int): Feature size of each ResidualBlocks.
+            Number of ResidualBlocks in the ResidualNet.
-        condition_channels (int): Feature size of the condition.
+        residual_channels (int): 
-        kernel_size (Tuple[int]): Kernel size of each ResidualBlock.
+            Feature size of each ResidualBlocks.
-        dilations_h (List[int]): Dilation in height dimension of every ResidualBlock.
+        condition_channels (int): 
            Feature size of the condition.
        kernel_size (Tuple[int]): 
            Kernel size of each ResidualBlock.
        dilations_h (List[int]): 
            Dilation in height dimension of every ResidualBlock.
    Raises:
        ValueError: If the length of dilations_h does not equal n_layer.
@ -288,11 +310,13 @@ class ResidualNet(nn.LayerList):
        """Comput the output of given the input and the condition.
        Args:
-            x (Tensor): The input. shape=(batch_size, channel, height, width)
+            x (Tensor): 
-            condition (Tensor): The local condition. shape=(batch_size, condition_channel, height, width)
+                The input. shape=(batch_size, channel, height, width)
            condition (Tensor): 
                The local condition. shape=(batch_size, condition_channel, height, width)
        Returns: 
-            Tensor : The output, which is an aggregation of all the skip outputs. shape=(batch_size, channel, height, width)
+            Tensor: The output, which is an aggregation of all the skip outputs. shape=(batch_size, channel, height, width)
        """
        skip_connections = []
@ -312,12 +336,16 @@ class ResidualNet(nn.LayerList):
        """Compute the output for a row and update the buffers.
        Args:
-            x_row (Tensor): A row of the input. shape=(batch_size, channel, 1, width)
+            x_row (Tensor): 
-            condition_row (Tensor):  A row of the condition. shape=(batch_size, condition_channel, 1, width)
+                A row of the input. shape=(batch_size, channel, 1, width)
            condition_row (Tensor):  
                A row of the condition. shape=(batch_size, condition_channel, 1, width)
        Returns:
-            res (Tensor): A row of the the residual output. shape=(batch_size, channel, 1, width) 
+            res (Tensor): 
-            skip (Tensor): A row of the skip output. shape=(batch_size, channel, 1, width)
+                A row of the residual output. shape=(batch_size, channel, 1, width)
            skip (Tensor): 
                A row of the skip output. shape=(batch_size, channel, 1, width)
        """
        skip_connections = []
@ -337,11 +365,16 @@ class Flow(nn.Layer):
    sampling.
    Args:
-        n_layers (int): Number of ResidualBlocks in the Flow.
+        n_layers (int): 
-        channels (int): Feature size of the ResidualBlocks.
+            Number of ResidualBlocks in the Flow.
-        mel_bands (int): Feature size of the mel spectrogram (mel bands).
+        channels (int): 
-        kernel_size (Tuple[int]): Kernel size of each ResisualBlocks in the Flow.
+            Feature size of the ResidualBlocks.
-        n_group (int): Number of timesteps to the folded into a group.
+        mel_bands (int): 
            Feature size of the mel spectrogram (mel bands).
        kernel_size (Tuple[int]): 
            Kernel size of each ResidualBlock in the Flow.
        n_group (int): 
            Number of timesteps to be folded into a group.
    """
    dilations_dict = {
        8: [1, 1, 1, 1, 1, 1, 1, 1],
@ -393,11 +426,14 @@ class Flow(nn.Layer):
        a sample from p(X) into a sample from p(Z).
        Args:
-            x (Tensor): A input sample of the distribution p(X). shape=(batch, 1, height, width)
+            x (Tensor): 
-            condition (Tensor): The local condition. shape=(batch, condition_channel, height, width)
+                An input sample of the distribution p(X). shape=(batch, 1, height, width)
            condition (Tensor): 
                The local condition. shape=(batch, condition_channel, height, width)
        Returns:
-            z (Tensor): shape(batch, 1, height, width), the transformed sample.
+            z (Tensor): 
                shape(batch, 1, height, width), the transformed sample.
            Tuple[Tensor, Tensor]:
                The parameter of the transformation.
                logs (Tensor): shape(batch, 1, height - 1, width), the log scale of the transformation from x to z.
@ -433,8 +469,10 @@ class Flow(nn.Layer):
        p(Z) and transform the sample. It is an autoregressive transformation.
        Args:
-            z(Tensor): A sample of the distribution p(Z). shape=(batch, 1, time_steps
+            z(Tensor): 
-            condition(Tensor): The local condition. shape=(batch, condition_channel, time_steps)
+                A sample of the distribution p(Z). shape=(batch, 1, time_steps)
            condition(Tensor): 
                The local condition. shape=(batch, condition_channel, time_steps)
        Returns:
            Tensor:
                The transformed sample. shape=(batch, 1, height, width)
@ -462,12 +500,18 @@ class WaveFlow(nn.LayerList):
    flows.
    Args:
-        n_flows (int): Number of flows in the WaveFlow model.
+        n_flows (int): 
-        n_layers (int): Number of ResidualBlocks in each Flow.
+            Number of flows in the WaveFlow model.
-        n_group (int): Number of timesteps to fold as a group.
+        n_layers (int): 
-        channels (int): Feature size of each ResidualBlock.
+            Number of ResidualBlocks in each Flow.
-        mel_bands (int): Feature size of mel spectrogram (mel bands).
+        n_group (int): 
-        kernel_size (Union[int, List[int]]): Kernel size of the convolution layer in each ResidualBlock.
+            Number of timesteps to fold as a group.
        channels (int): 
            Feature size of each ResidualBlock.
        mel_bands (int): 
            Feature size of mel spectrogram (mel bands).
        kernel_size (Union[int, List[int]]): 
            Kernel size of the convolution layer in each ResidualBlock.
    """
    def __init__(self, n_flows, n_layers, n_group, channels, mel_bands,
@ -518,12 +562,16 @@ class WaveFlow(nn.LayerList):
        condition.
        Args:
-            x (Tensor): The audio. shape=(batch_size, time_steps)
+            x (Tensor): 
-            condition (Tensor): The local condition (mel spectrogram here). shape=(batch_size, condition channel, time_steps)
+                The audio. shape=(batch_size, time_steps)
            condition (Tensor): 
                The local condition (mel spectrogram here). shape=(batch_size, condition channel, time_steps)
        Returns:
-            Tensor: The transformed random variable. shape=(batch_size, time_steps)
+            Tensor: 
-            Tensor: The log determinant of the jacobian of the transformation from x to z. shape=(1,)
+                The transformed random variable. shape=(batch_size, time_steps)
            Tensor: 
                The log determinant of the jacobian of the transformation from x to z. shape=(1,)
        """
        # x: (B, T)
        # condition: (B, C, T) upsampled condition
@ -559,12 +607,13 @@ class WaveFlow(nn.LayerList):
        autoregressive manner.
        Args:
-            z (Tensor): A sample of the distribution p(Z). shape=(batch, 1, time_steps
+            z (Tensor): 
-            condition (Tensor): The local condition. shape=(batch, condition_channel, time_steps)    
+                A sample of the distribution p(Z). shape=(batch, 1, time_steps)
            condition (Tensor): 
                The local condition. shape=(batch, condition_channel, time_steps)    
        Returns: 
            Tensor: The transformed sample (audio here). shape=(batch_size, time_steps)
        """
        z, condition = self._trim(z, condition)
@ -590,13 +639,20 @@ class ConditionalWaveFlow(nn.LayerList):
    """ConditionalWaveFlow, an UpsampleNet with a WaveFlow model.
    Args:
-        upsample_factors (List[int]): Upsample factors for the upsample net.
+        upsample_factors (List[int]): 
-        n_flows (int): Number of flows in the WaveFlow model.
+            Upsample factors for the upsample net.
-        n_layers (int): Number of ResidualBlocks in each Flow.
+        n_flows (int): 
-        n_group (int): Number of timesteps to fold as a group.
+            Number of flows in the WaveFlow model.
-        channels (int): Feature size of each ResidualBlock.
+        n_layers (int): 
-        n_mels (int): Feature size of mel spectrogram (mel bands).
+            Number of ResidualBlocks in each Flow.
-        kernel_size (Union[int, List[int]]): Kernel size of the convolution layer in each ResidualBlock.
+        n_group (int): 
            Number of timesteps to fold as a group.
        channels (int): 
            Feature size of each ResidualBlock.
        n_mels (int): 
            Feature size of mel spectrogram (mel bands).
        kernel_size (Union[int, List[int]]): 
            Kernel size of the convolution layer in each ResidualBlock.
        """
    def __init__(self,
@ -622,12 +678,16 @@ class ConditionalWaveFlow(nn.LayerList):
        the determinant of the jacobian of the transformation from x to z.
        Args:
-            audio(Tensor): The audio. shape=(B, T)
+            audio(Tensor): 
-            mel(Tensor): The mel spectrogram. shape=(B, C_mel, T_mel)
+                The audio. shape=(B, T)
            mel(Tensor): 
                The mel spectrogram. shape=(B, C_mel, T_mel)
        Returns:
-            Tensor: The inversely transformed random variable z (x to z). shape=(B, T)
+            Tensor: 
-            Tensor: the log of the determinant of the jacobian of the transformation from x to z. shape=(1,)
+                The inversely transformed random variable z (x to z). shape=(B, T)
            Tensor: 
                the log of the determinant of the jacobian of the transformation from x to z. shape=(1,)
        """
        condition = self.encoder(mel)
        z, log_det_jacobian = self.decoder(audio, condition)
@ -638,10 +698,12 @@ class ConditionalWaveFlow(nn.LayerList):
        """Generate raw audio given mel spectrogram.
        Args:
-            mel(np.ndarray): Mel spectrogram of an utterance(in log-magnitude). shape=(C_mel, T_mel)
+            mel(np.ndarray): 
                Mel spectrogram of an utterance(in log-magnitude). shape=(C_mel, T_mel)
        Returns:
-            Tensor: The synthesized audio, where``T <= T_mel * upsample_factors``. shape=(B, T)
+            Tensor: 
                The synthesized audio, where ``T <= T_mel * upsample_factors``. shape=(B, T)
        """
        start = time.time()
        condition = self.encoder(mel, trim_conv_artifact=True)  # (B, C, T)
@ -657,7 +719,8 @@ class ConditionalWaveFlow(nn.LayerList):
        """Generate raw audio given mel spectrogram.
        Args:
-            mel(np.ndarray): Mel spectrogram of an utterance(in log-magnitude). shape=(C_mel, T_mel)
+            mel(np.ndarray): 
                Mel spectrogram of an utterance(in log-magnitude). shape=(C_mel, T_mel)
        Returns:
            np.ndarray: The synthesized audio. shape=(T,)
@ -673,8 +736,10 @@ class ConditionalWaveFlow(nn.LayerList):
        """Build a ConditionalWaveFlow model from a pretrained model.
        Args:
-            config(yacs.config.CfgNode): model configs
+            config(yacs.config.CfgNode): 
-            checkpoint_path(Path or str): the path of pretrained model checkpoint, without extension name
+                model configs
            checkpoint_path(Path or str): 
                the path of pretrained model checkpoint, without extension name
        Returns:
            ConditionalWaveFlow: The model built from the pretrained result.
@ -694,8 +759,8 @@ class WaveFlowLoss(nn.Layer):
    """Criterion of a WaveFlow model.
    Args:
-        sigma (float): The standard deviation of the gaussian noise used in WaveFlow, 
+        sigma (float): 
-            by default 1.0.
+            The standard deviation of the gaussian noise used in WaveFlow, by default 1.0.
    """
    def __init__(self, sigma=1.0):
@ -708,8 +773,10 @@ class WaveFlowLoss(nn.Layer):
        log_det_jacobian of transformation from x to z.
        Args:
-            z(Tensor): The transformed random variable (x to z). shape=(B, T)
+            z(Tensor): 
-            log_det_jacobian(Tensor): The log of the determinant of the jacobian matrix of the
+                The transformed random variable (x to z). shape=(B, T)
            log_det_jacobian(Tensor): 
                The log of the determinant of the jacobian matrix of the
                transformation from x to z.  shape=(1,)
        Returns:
@ -726,7 +793,8 @@ class ConditionalWaveFlow2Infer(ConditionalWaveFlow):
        """Generate raw audio given mel spectrogram.
        Args:
-            mel (np.ndarray): Mel spectrogram of an utterance(in log-magnitude). shape=(C_mel, T_mel)
+            mel (np.ndarray): 
                Mel spectrogram of an utterance(in log-magnitude). shape=(C_mel, T_mel)
        Returns:
            np.ndarray: The synthesized audio. shape=(T,)
--- a/paddlespeech/t2s/models/wavernn/wavernn.py
+++ b/paddlespeech/t2s/models/wavernn/wavernn.py
@ -165,19 +165,29 @@ class WaveRNN(nn.Layer):
            init_type: str="xavier_uniform", ):
        '''
        Args:
-            rnn_dims (int, optional): Hidden dims of RNN Layers.
+            rnn_dims (int, optional): 
-            fc_dims (int, optional): Dims of FC Layers.
+                Hidden dims of RNN Layers.
-            bits (int, optional): bit depth of signal.
+            fc_dims (int, optional): 
-            aux_context_window (int, optional): The context window size of the first convolution applied to the 
+                Dims of FC Layers.
-                auxiliary input, by default 2
+            bits (int, optional): 
-            upsample_scales (List[int], optional): Upsample scales of the upsample network.
+                bit depth of signal.
-            aux_channels (int, optional): Auxiliary channel of the residual blocks.
+            aux_context_window (int, optional): 
-            compute_dims (int, optional): Dims of Conv1D in MelResNet.
+                The context window size of the first convolution applied to the auxiliary input, by default 2
-            res_out_dims (int, optional): Dims of output in MelResNet.
+            upsample_scales (List[int], optional): 
-            res_blocks (int, optional): Number of residual blocks.
+                Upsample scales of the upsample network.
-            mode (str, optional): Output mode of the WaveRNN vocoder. 
+            aux_channels (int, optional): 
                Auxiliary channel of the residual blocks.
            compute_dims (int, optional): 
                Dims of Conv1D in MelResNet.
            res_out_dims (int, optional): 
                Dims of output in MelResNet.
            res_blocks (int, optional): 
                Number of residual blocks.
            mode (str, optional): 
                Output mode of the WaveRNN vocoder. 
                `MOL` for Mixture of Logistic Distribution, and `RAW` for quantized bits as the model's output.
-            init_type (str): How to initialize parameters.
+            init_type (str): 
                How to initialize parameters.
        '''
        super().__init__()
        self.mode = mode
@ -226,8 +236,10 @@ class WaveRNN(nn.Layer):
    def forward(self, x, c):
        '''
        Args:
-            x (Tensor): wav sequence, [B, T]
+            x (Tensor): 
-            c (Tensor): mel spectrogram [B, C_aux, T']
+                wav sequence, [B, T]
            c (Tensor): 
                mel spectrogram [B, C_aux, T']
            T = (T' - 2 * aux_context_window ) * hop_length
        Returns:
@ -280,10 +292,14 @@ class WaveRNN(nn.Layer):
                 gen_display: bool=False):
        """
        Args:
-            c(Tensor): input mels, (T', C_aux)
+            c(Tensor): 
-            batched(bool): generate in batch or not
+                input mels, (T', C_aux)
-            target(int): target number of samples to be generated in each batch entry
+            batched(bool): 
-            overlap(int): number of samples for crossfading between batches
+                generate in batch or not
            target(int): 
                target number of samples to be generated in each batch entry
            overlap(int): 
                number of samples for crossfading between batches
            mu_law(bool)
        Returns: 
            wav sequence: Output (T' * prod(upsample_scales), out_channels, C_out).
@ -404,7 +420,8 @@ class WaveRNN(nn.Layer):
    def pad_tensor(self, x, pad, side='both'):
        '''
        Args:
-            x(Tensor): mel, [1, n_frames, 80]
+            x(Tensor): 
                mel, [1, n_frames, 80]
            pad(int): 
            side(str, optional):  (Default value = 'both')
@ -428,12 +445,15 @@ class WaveRNN(nn.Layer):
        Overlap will be used for crossfading in xfade_and_unfold()
        Args:
-            x(Tensor): Upsampled conditioning features. mels or aux
+            x(Tensor): 
                Upsampled conditioning features. mels or aux
                shape=(1, T, features)
                mels: [1, T, 80]
                aux: [1, T, 128]
-            target(int): Target timesteps for each index of batch
+            target(int): 
-            overlap(int): Timesteps for both xfade and rnn warmup
+                Target timesteps for each index of batch
            overlap(int): 
                Timesteps for both xfade and rnn warmup
        Returns:
            Tensor: 
--- a/paddlespeech/t2s/modules/causal_conv.py
+++ b/paddlespeech/t2s/modules/causal_conv.py
@ -42,7 +42,8 @@ class CausalConv1D(nn.Layer):
    def forward(self, x):
        """Calculate forward propagation.
        Args:
-            x (Tensor): Input tensor (B, in_channels, T).
+            x (Tensor): 
                Input tensor (B, in_channels, T).
        Returns: 
            Tensor: Output tensor (B, out_channels, T).
        """
@ -67,7 +68,8 @@ class CausalConv1DTranspose(nn.Layer):
    def forward(self, x):
        """Calculate forward propagation.
        Args:
-            x (Tensor): Input tensor (B, in_channels, T_in).
+            x (Tensor): 
                Input tensor (B, in_channels, T_in).
        Returns:
            Tensor: Output tensor (B, out_channels, T_out).
        """
--- a/paddlespeech/t2s/modules/conformer/convolution.py
+++ b/paddlespeech/t2s/modules/conformer/convolution.py
@ -20,8 +20,10 @@ class ConvolutionModule(nn.Layer):
    """ConvolutionModule in Conformer model.
    Args:
-        channels (int): The number of channels of conv layers.
+        channels (int): 
-        kernel_size (int): Kernerl size of conv layers.
+            The number of channels of conv layers.
        kernel_size (int): 
            Kernel size of conv layers.
    """
    def __init__(self, channels, kernel_size, activation=nn.ReLU(), bias=True):
@ -59,7 +61,8 @@ class ConvolutionModule(nn.Layer):
        """Compute convolution module.
        Args:
-            x (Tensor): Input tensor (#batch, time, channels).
+            x (Tensor): 
                Input tensor (#batch, time, channels).
        Returns:
            Tensor: Output tensor (#batch, time, channels).
        """
--- a/paddlespeech/t2s/modules/conformer/encoder_layer.py
+++ b/paddlespeech/t2s/modules/conformer/encoder_layer.py
@ -23,25 +23,34 @@ class EncoderLayer(nn.Layer):
    """Encoder layer module.
    Args:
-        size (int): Input dimension.
+        size (int): 
-        self_attn (nn.Layer): Self-attention module instance.
+            Input dimension.
        self_attn (nn.Layer): 
            Self-attention module instance.
            `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` instance
            can be used as the argument.
-        feed_forward (nn.Layer): Feed-forward module instance.
+        feed_forward (nn.Layer): 
            Feed-forward module instance.
            `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance
            can be used as the argument.
-        feed_forward_macaron (nn.Layer): Additional feed-forward module instance.
+        feed_forward_macaron (nn.Layer): 
            Additional feed-forward module instance.
            `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance
            can be used as the argument.
-        conv_module (nn.Layer): Convolution module instance.
+        conv_module (nn.Layer): 
            Convolution module instance.
            `ConvolutionModule` instance can be used as the argument.
-        dropout_rate (float): Dropout rate.
+        dropout_rate (float): 
-        normalize_before (bool): Whether to use layer_norm before the first block.
+            Dropout rate.
-        concat_after (bool): Whether to concat attention layer's input and output.
+        normalize_before (bool): 
            Whether to use layer_norm before the first block.
        concat_after (bool): 
            Whether to concat attention layer's input and output.
            if True, additional linear will be applied.
            i.e. x -> x + linear(concat(x, att(x)))
            if False, no additional linear will be applied. i.e. x -> x + att(x)
-        stochastic_depth_rate (float): Proability to skip this layer.
+        stochastic_depth_rate (float): 
            Probability to skip this layer.
            During training, the layer may skip residual computation and return input
            as-is with given probability.
    """
@ -86,15 +95,19 @@ class EncoderLayer(nn.Layer):
        """Compute encoded features.
        Args:
-            x_input(Union[Tuple, Tensor]): Input tensor w/ or w/o pos emb.
+            x_input(Union[Tuple, Tensor]): 
                Input tensor w/ or w/o pos emb.
                - w/ pos emb: Tuple of tensors [(#batch, time, size), (1, time, size)].
                - w/o pos emb: Tensor (#batch, time, size).
-            mask(Tensor): Mask tensor for the input (#batch, time).
+            mask(Tensor): 
                Mask tensor for the input (#batch, time).
            cache (Tensor): 
        Returns:
-            Tensor: Output tensor (#batch, time, size).
+            Tensor: 
-            Tensor: Mask tensor (#batch, time).
+                Output tensor (#batch, time, size).
            Tensor: 
                Mask tensor (#batch, time).
        """
        if isinstance(x_input, tuple):
            x, pos_emb = x_input[0], x_input[1]
--- a/paddlespeech/t2s/modules/conv.py
+++ b/paddlespeech/t2s/modules/conv.py
@ -42,13 +42,19 @@ class Conv1dCell(nn.Conv1D):
    class.
    Args:
-        in_channels (int): The feature size of the input.
+        in_channels (int): 
-        out_channels (int): The feature size of the output.
+            The feature size of the input.
-        kernel_size (int or Tuple[int]): The size of the kernel.
+        out_channels (int): 
-        dilation (int or Tuple[int]): The dilation of the convolution, by default 1
+            The feature size of the output.
-        weight_attr (ParamAttr, Initializer, str or bool, optional) : The parameter attribute of the convolution kernel, 
+        kernel_size (int or Tuple[int]): 
            The size of the kernel.
        dilation (int or Tuple[int]): 
            The dilation of the convolution, by default 1
        weight_attr (ParamAttr, Initializer, str or bool, optional): 
            The parameter attribute of the convolution kernel, 
            by default None.
-        bias_attr (ParamAttr, Initializer, str or bool, optional):The parameter attribute of the bias. 
+        bias_attr (ParamAttr, Initializer, str or bool, optional):
            The parameter attribute of the bias. 
            If ``False``, this layer does not have a bias, by default None.
    Examples: 
@ -122,7 +128,8 @@ class Conv1dCell(nn.Conv1D):
        """Initialize the buffer for the step input.
        Args:
-            x_t (Tensor): The step input. shape=(batch_size, in_channels)
+            x_t (Tensor): 
                The step input. shape=(batch_size, in_channels)
        """
        batch_size, _ = x_t.shape
@ -134,7 +141,8 @@ class Conv1dCell(nn.Conv1D):
        """Shift the buffer by one step.
        Args:
-            x_t (Tensor): The step input. shape=(batch_size, in_channels)
+            x_t (Tensor):
                The step input. shape=(batch_size, in_channels)
        """
        self._buffer = paddle.concat(
@ -144,10 +152,12 @@ class Conv1dCell(nn.Conv1D):
        """Add step input and compute step output.
        Args:
-            x_t (Tensor): The step input. shape=(batch_size, in_channels)
+            x_t (Tensor): 
                The step input. shape=(batch_size, in_channels)
        Returns: 
-            y_t (Tensor): The step output. shape=(batch_size, out_channels)
+            y_t (Tensor): 
                The step output. shape=(batch_size, out_channels)
        """
        batch_size = x_t.shape[0]
@ -173,10 +183,14 @@ class Conv1dBatchNorm(nn.Layer):
    """A Conv1D Layer followed by a BatchNorm1D.
    Args:
-        in_channels (int): The feature size of the input.
+        in_channels (int): 
-        out_channels (int): The feature size of the output.
+            The feature size of the input.
-        kernel_size (int): The size of the convolution kernel.
+        out_channels (int): 
-        stride (int, optional): The stride of the convolution, by default 1.
+            The feature size of the output.
        kernel_size (int): 
            The size of the convolution kernel.
        stride (int, optional): 
            The stride of the convolution, by default 1.
        padding (int, str or Tuple[int], optional):
            The padding of the convolution.
            If int, a symmetrical padding is applied before convolution;
@ -189,9 +203,12 @@ class Conv1dBatchNorm(nn.Layer):
        bias_attr (ParamAttr, Initializer, str or bool, optional):
            The parameter attribute of the bias of the convolution,
            by default None.
-        data_format (str ["NCL" or "NLC"], optional): The data layout of the input, by default "NCL"
+        data_format (str ["NCL" or "NLC"], optional): 
-        momentum (float, optional): The momentum of the BatchNorm1D layer, by default 0.9
+            The data layout of the input, by default "NCL"
-        epsilon (float, optional): The epsilon of the BatchNorm1D layer, by default 1e-05
+        momentum (float, optional): 
            The momentum of the BatchNorm1D layer, by default 0.9
        epsilon (float, optional): 
            The epsilon of the BatchNorm1D layer, by default 1e-05
    """
    def __init__(self,
@ -225,12 +242,13 @@ class Conv1dBatchNorm(nn.Layer):
        """Forward pass of the Conv1dBatchNorm layer.
        Args:
-            x (Tensor): The input tensor. Its data layout depends on ``data_format``. 
+            x (Tensor): 
-            shape=(B, C_in, T_in) or (B, T_in, C_in)
+                The input tensor. Its data layout depends on ``data_format``. 
                shape=(B, C_in, T_in) or (B, T_in, C_in)
        Returns:
-            Tensor: The output tensor. 
+            Tensor: 
-                shape=(B, C_out, T_out) or (B, T_out, C_out)
+                The output tensor. shape=(B, C_out, T_out) or (B, T_out, C_out)
        """
        x = self.conv(x)
--- a/paddlespeech/t2s/modules/geometry.py
+++ b/paddlespeech/t2s/modules/geometry.py
@ -19,8 +19,10 @@ def shuffle_dim(x, axis, perm=None):
    """Permute input tensor along axis given the permutation or randomly.
    Args:
-        x (Tensor): The input tensor.
+        x (Tensor): 
-        axis (int): The axis to shuffle.
+            The input tensor.
        axis (int): 
            The axis to shuffle.
        perm (List[int], ndarray, optional): 
            The order to reorder the tensor along the ``axis``-th dimension.
            It is a permutation of ``[0, d)``, where d is the size of the
--- a/paddlespeech/t2s/modules/layer_norm.py
+++ b/paddlespeech/t2s/modules/layer_norm.py
@ -19,8 +19,10 @@ from paddle import nn
 class LayerNorm(nn.LayerNorm):
    """Layer normalization module.
    Args:
-        nout (int): Output dim size.
+        nout (int): 
-        dim (int): Dimension to be normalized.
+            Output dim size.
        dim (int): 
            Dimension to be normalized.
    """
    def __init__(self, nout, dim=-1):
@ -32,7 +34,8 @@ class LayerNorm(nn.LayerNorm):
        """Apply layer normalization.
        Args:
-            x (Tensor):Input tensor.
+            x (Tensor):
                Input tensor.
        Returns: 
            Tensor: Normalized tensor.
--- a/paddlespeech/t2s/modules/losses.py
+++ b/paddlespeech/t2s/modules/losses.py
@ -269,8 +269,10 @@ class GuidedAttentionLoss(nn.Layer):
        """Make masks indicating non-padded part.
        Args:
-            ilens(Tensor(int64) or List): Batch of lengths (B,).
+            ilens(Tensor(int64) or List): 
-            olens(Tensor(int64) or List): Batch of lengths (B,).
+                Batch of lengths (B,).
            olens(Tensor(int64) or List): 
                Batch of lengths (B,).
        Returns:
            Tensor: Mask tensor indicating non-padded part.
@ -322,9 +324,12 @@ class GuidedMultiHeadAttentionLoss(GuidedAttentionLoss):
        """Calculate forward propagation.
        Args:
-            att_ws(Tensor): Batch of multi head attention weights (B, H, T_max_out, T_max_in).
+            att_ws(Tensor): 
-            ilens(Tensor): Batch of input lenghts (B,).
+                Batch of multi head attention weights (B, H, T_max_out, T_max_in).
-            olens(Tensor): Batch of output lenghts (B,).
+            ilens(Tensor): 
                Batch of input lengths (B,).
            olens(Tensor): 
                Batch of output lengths (B,).
        Returns:
            Tensor: Guided attention loss value.
@ -354,9 +359,12 @@ class Tacotron2Loss(nn.Layer):
        """Initialize Tacotron2 loss module.
        Args:
-            use_masking (bool): Whether to apply masking for padded part in loss calculation.
+            use_masking (bool): 
-            use_weighted_masking (bool): Whether to apply weighted masking in loss calculation.
+                Whether to apply masking for padded part in loss calculation.
-            bce_pos_weight (float): Weight of positive sample of stop token.
+            use_weighted_masking (bool): 
                Whether to apply weighted masking in loss calculation.
            bce_pos_weight (float): 
                Weight of positive sample of stop token.
        """
        super().__init__()
        assert (use_masking != use_weighted_masking) or not use_masking
@ -374,17 +382,25 @@ class Tacotron2Loss(nn.Layer):
        """Calculate forward propagation.
        Args:
-            after_outs(Tensor): Batch of outputs after postnets (B, Lmax, odim).
+            after_outs(Tensor): 
-            before_outs(Tensor): Batch of outputs before postnets (B, Lmax, odim).
+                Batch of outputs after postnets (B, Lmax, odim).
-            logits(Tensor): Batch of stop logits (B, Lmax).
+            before_outs(Tensor): 
-            ys(Tensor): Batch of padded target features (B, Lmax, odim).
+                Batch of outputs before postnets (B, Lmax, odim).
-            stop_labels(Tensor(int64)): Batch of the sequences of stop token labels (B, Lmax).
+            logits(Tensor): 
                Batch of stop logits (B, Lmax).
            ys(Tensor): 
                Batch of padded target features (B, Lmax, odim).
            stop_labels(Tensor(int64)): 
                Batch of the sequences of stop token labels (B, Lmax).
            olens(Tensor(int64)): 
        Returns:
-            Tensor: L1 loss value.
+            Tensor: 
-            Tensor: Mean square error loss value.
+                L1 loss value.
-            Tensor: Binary cross entropy loss value.
+            Tensor: 
                Mean square error loss value.
            Tensor: 
                Binary cross entropy loss value.
        """
        # make mask and apply it
        if self.use_masking:
@ -437,16 +453,24 @@ def stft(x,
         pad_mode='reflect'):
    """Perform STFT and convert to magnitude spectrogram.
    Args:
-        x(Tensor): Input signal tensor (B, T).
+        x(Tensor): 
-        fft_size(int): FFT size.
+            Input signal tensor (B, T).
-        hop_size(int): Hop size.
+        fft_size(int): 
-        win_length(int, optional): window : str, optional (Default value = None)
+            FFT size.
-        window(str, optional): Name of window function, see `scipy.signal.get_window` for more
+        hop_size(int): 
-            details. Defaults to "hann".
+            Hop size.
-        center(bool, optional, optional): center (bool, optional): Whether to pad `x` to make that the
+        win_length(int, optional): 
        window (str, optional):
            (Default value = None)
        window(str, optional): 
            Name of window function, see `scipy.signal.get_window` for more details. Defaults to "hann".
        center(bool, optional):
            Whether to pad `x` to make that the
            :math:`t \times hop\\_length` at the center of :math:`t`-th frame. Default: `True`.
-        pad_mode(str, optional, optional):  (Default value = 'reflect')
+        pad_mode(str, optional):
-        hop_length:  (Default value = None)
+            (Default value = 'reflect')
        hop_length:  
            (Default value = None)
    Returns:
        Tensor: Magnitude spectrogram (B, #frames, fft_size // 2 + 1).
@ -480,8 +504,10 @@ class SpectralConvergenceLoss(nn.Layer):
    def forward(self, x_mag, y_mag):
        """Calculate forward propagation.
        Args: 
-            x_mag (Tensor): Magnitude spectrogram of predicted signal (B, #frames, #freq_bins).
+            x_mag (Tensor):
-            y_mag (Tensor): Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins).
+                 Magnitude spectrogram of predicted signal (B, #frames, #freq_bins).
            y_mag (Tensor): 
                Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins).
        Returns:
            Tensor: Spectral convergence loss value.
        """
@ -501,8 +527,10 @@ class LogSTFTMagnitudeLoss(nn.Layer):
    def forward(self, x_mag, y_mag):
        """Calculate forward propagation.
        Args:
-            x_mag (Tensor): Magnitude spectrogram of predicted signal (B, #frames, #freq_bins).
+            x_mag (Tensor): 
-            y_mag (Tensor): Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins).
+                Magnitude spectrogram of predicted signal (B, #frames, #freq_bins).
            y_mag (Tensor):
                Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins).
        Returns:
            Tensor: Log STFT magnitude loss value.
        """
@ -531,11 +559,15 @@ class STFTLoss(nn.Layer):
    def forward(self, x, y):
        """Calculate forward propagation.
        Args:
-            x (Tensor): Predicted signal (B, T).
+            x (Tensor): 
-            y (Tensor): Groundtruth signal (B, T).
+                Predicted signal (B, T).
            y (Tensor): 
                Groundtruth signal (B, T).
        Returns:
-            Tensor: Spectral convergence loss value.
+            Tensor: 
-            Tensor: Log STFT magnitude loss value.
+                Spectral convergence loss value.
            Tensor: 
                Log STFT magnitude loss value.
        """
        x_mag = stft(x, self.fft_size, self.shift_size, self.win_length,
                     self.window)
@ -558,10 +590,14 @@ class MultiResolutionSTFTLoss(nn.Layer):
            window="hann", ):
        """Initialize Multi resolution STFT loss module.
        Args:
-            fft_sizes (list): List of FFT sizes.
+            fft_sizes (list): 
-            hop_sizes (list): List of hop sizes.
+                List of FFT sizes.
-            win_lengths (list): List of window lengths.
+            hop_sizes (list): 
-            window (str): Window function type.
+                List of hop sizes.
            win_lengths (list): 
                List of window lengths.
            window (str): 
                Window function type.
        """
        super().__init__()
        assert len(fft_sizes) == len(hop_sizes) == len(win_lengths)
@ -573,11 +609,15 @@ class MultiResolutionSTFTLoss(nn.Layer):
        """Calculate forward propagation.
        Args:
-            x (Tensor): Predicted signal (B, T) or (B, #subband, T).
+            x (Tensor): 
-            y (Tensor): Groundtruth signal (B, T) or (B, #subband, T).
+                Predicted signal (B, T) or (B, #subband, T).
            y (Tensor): 
                Groundtruth signal (B, T) or (B, #subband, T).
        Returns:
-            Tensor: Multi resolution spectral convergence loss value.
+            Tensor: 
-            Tensor: Multi resolution log STFT magnitude loss value.
+                Multi resolution spectral convergence loss value.
            Tensor: 
                Multi resolution log STFT magnitude loss value.
        """
        if len(x.shape) == 3:
            # (B, C, T) -> (B x C, T)
@ -615,9 +655,11 @@ class GeneratorAdversarialLoss(nn.Layer):
    def forward(self, outputs):
        """Calculate generator adversarial loss.
        Args:
-            outputs (Tensor or List): Discriminator outputs or list of discriminator outputs.
+            outputs (Tensor or List): 
                Discriminator outputs or list of discriminator outputs.
        Returns:
-            Tensor: Generator adversarial loss value.
+            Tensor: 
                Generator adversarial loss value.
        """
        if isinstance(outputs, (tuple, list)):
            adv_loss = 0.0
@ -659,13 +701,15 @@ class DiscriminatorAdversarialLoss(nn.Layer):
        """Calculate discriminator adversarial loss.
        Args:
-            outputs_hat (Tensor or list): Discriminator outputs or list of
+            outputs_hat (Tensor or list): 
-                discriminator outputs calculated from generator outputs.
+                Discriminator outputs or list of discriminator outputs calculated from generator outputs.
-            outputs (Tensor or list): Discriminator outputs or list of
+            outputs (Tensor or list): 
-                discriminator outputs calculated from groundtruth.
+                Discriminator outputs or list of discriminator outputs calculated from groundtruth.
        Returns:
-            Tensor: Discriminator real loss value.
+            Tensor: 
-            Tensor: Discriminator fake loss value.
+                Discriminator real loss value.
            Tensor: 
                Discriminator fake loss value.
        """
        if isinstance(outputs, (tuple, list)):
            real_loss = 0.0
@ -766,9 +810,12 @@ def masked_l1_loss(prediction, target, mask):
    """Compute masked L1 loss.
    Args:
-        prediction(Tensor): The prediction.
+        prediction(Tensor): 
-        target(Tensor): The target. The shape should be broadcastable to ``prediction``.
+            The prediction.
-        mask(Tensor): The mask. The shape should be broadcatable to the broadcasted shape of
+        target(Tensor): 
            The target. The shape should be broadcastable to ``prediction``.
        mask(Tensor): 
            The mask. The shape should be broadcastable to the broadcasted shape of
            ``prediction`` and ``target``.
    Returns:
@ -916,8 +963,10 @@ class MelSpectrogramLoss(nn.Layer):
    def forward(self, y_hat, y):
        """Calculate Mel-spectrogram loss.
        Args:
-            y_hat(Tensor): Generated single tensor (B, 1, T).
+            y_hat(Tensor): 
-            y(Tensor): Groundtruth single tensor (B, 1, T).
+                Generated single tensor (B, 1, T).
            y(Tensor): 
                Groundtruth single tensor (B, 1, T).
        Returns:
            Tensor: Mel-spectrogram loss value.
@ -947,9 +996,11 @@ class FeatureMatchLoss(nn.Layer):
        """Calculate feature matching loss.
        Args:
-            feats_hat(list): List of list of discriminator outputs
+            feats_hat(list): 
                List of list of discriminator outputs
                calculated from generator outputs.
-            feats(list): List of list of discriminator outputs
+            feats(list): 
                List of list of discriminator outputs
        Returns:
            Tensor: Feature matching loss value.
@ -986,11 +1037,16 @@ class KLDivergenceLoss(nn.Layer):
        """Calculate KL divergence loss.
        Args:
-            z_p (Tensor): Flow hidden representation (B, H, T_feats).
+            z_p (Tensor): 
-            logs_q (Tensor): Posterior encoder projected scale (B, H, T_feats).
+                Flow hidden representation (B, H, T_feats).
-            m_p (Tensor): Expanded text encoder projected mean (B, H, T_feats).
+            logs_q (Tensor): 
-            logs_p (Tensor): Expanded text encoder projected scale (B, H, T_feats).
+                Posterior encoder projected scale (B, H, T_feats).
-            z_mask (Tensor): Mask tensor (B, 1, T_feats).
+            m_p (Tensor): 
                Expanded text encoder projected mean (B, H, T_feats).
            logs_p (Tensor): 
                Expanded text encoder projected scale (B, H, T_feats).
            z_mask (Tensor): 
                Mask tensor (B, 1, T_feats).
        Returns:
            Tensor: KL divergence loss.
--- a/paddlespeech/t2s/modules/nets_utils.py
+++ b/paddlespeech/t2s/modules/nets_utils.py
@ -25,8 +25,10 @@ def pad_list(xs, pad_value):
    """Perform padding for the list of tensors.
    Args:
-        xs (List[Tensor]): List of Tensors [(T_1, `*`), (T_2, `*`), ..., (T_B, `*`)].
+        xs (List[Tensor]): 
-        pad_value (float): Value for padding.
+            List of Tensors [(T_1, `*`), (T_2, `*`), ..., (T_B, `*`)].
        pad_value (float): 
            Value for padding.
    Returns:
        Tensor: Padded tensor (B, Tmax, `*`).
@ -55,10 +57,13 @@ def make_pad_mask(lengths, xs=None, length_dim=-1):
    """Make mask tensor containing indices of padded part.
    Args:
-        lengths (Tensor(int64)): Batch of lengths (B,).
+        lengths (Tensor(int64)): 
-        xs (Tensor, optional): The reference tensor.
+            Batch of lengths (B,).
        xs (Tensor, optional): 
            The reference tensor.
            If set, masks will be the same shape as this tensor.
-        length_dim (int, optional): Dimension indicator of the above tensor.
+        length_dim (int, optional): 
            Dimension indicator of the above tensor.
            See the example.
    Returns:
@ -166,14 +171,18 @@ def make_non_pad_mask(lengths, xs=None, length_dim=-1):
    """Make mask tensor containing indices of non-padded part.
    Args:
-        lengths (Tensor(int64) or List): Batch of lengths (B,).
+        lengths (Tensor(int64) or List): 
-        xs (Tensor, optional): The reference tensor.
+            Batch of lengths (B,).
        xs (Tensor, optional): 
            The reference tensor.
            If set, masks will be the same shape as this tensor.
-        length_dim (int, optional): Dimension indicator of the above tensor.
+        length_dim (int, optional): 
            Dimension indicator of the above tensor.
            See the example.
    Returns:
-        Tensor(bool): mask tensor containing indices of padded part bool.
+        Tensor(bool): 
            Mask tensor containing indices of non-padded part.
    Examples:
        With only lengths.
@ -257,8 +266,10 @@ def initialize(model: nn.Layer, init: str):
    Custom initialization routines can be implemented into submodules
    Args:
-        model (nn.Layer): Target.
+        model (nn.Layer): 
-        init (str): Method of initialization.
+            Target.
        init (str):
            Method of initialization.
    """
    assert check_argument_types()
@ -285,12 +296,17 @@ def get_random_segments(
        segment_size: int, ) -> Tuple[paddle.Tensor, paddle.Tensor]:
    """Get random segments.
    Args:
-        x (Tensor): Input tensor (B, C, T).
+        x (Tensor): 
-        x_lengths (Tensor): Length tensor (B,).
+            Input tensor (B, C, T).
-        segment_size (int): Segment size.
+        x_lengths (Tensor): 
            Length tensor (B,).
        segment_size (int): 
            Segment size.
    Returns:
-        Tensor: Segmented tensor (B, C, segment_size).
+        Tensor: 
-        Tensor: Start index tensor (B,).
+            Segmented tensor (B, C, segment_size).
        Tensor: 
            Start index tensor (B,).
    """
    b, c, t = paddle.shape(x)
    max_start_idx = x_lengths - segment_size
@ -306,9 +322,12 @@ def get_segments(
        segment_size: int, ) -> paddle.Tensor:
    """Get segments.
    Args:
-        x (Tensor): Input tensor (B, C, T).
+        x (Tensor): 
-        start_idxs (Tensor): Start index tensor (B,).
+            Input tensor (B, C, T).
-        segment_size (int): Segment size.
+        start_idxs (Tensor): 
            Start index tensor (B,).
        segment_size (int): 
            Segment size.
    Returns:
        Tensor: Segmented tensor (B, C, segment_size).
    """
@ -353,14 +372,20 @@ def phones_masking(xs_pad: paddle.Tensor,
                   span_bdy: paddle.Tensor=None):
    '''
    Args:
-        xs_pad (paddle.Tensor): input speech (B, Tmax, D).
+        xs_pad (paddle.Tensor): 
-        src_mask (paddle.Tensor): mask of speech (B, 1, Tmax).
+            input speech (B, Tmax, D).
-        align_start (paddle.Tensor): frame level phone alignment start (B, Tmax2).
+        src_mask (paddle.Tensor): 
-        align_end (paddle.Tensor): frame level phone alignment end (B, Tmax2).
+            mask of speech (B, 1, Tmax).
-        align_start_lens (paddle.Tensor): length of align_start (B, ).
+        align_start (paddle.Tensor): 
            frame level phone alignment start (B, Tmax2).
        align_end (paddle.Tensor): 
            frame level phone alignment end (B, Tmax2).
        align_start_lens (paddle.Tensor): 
            length of align_start (B, ).
        mlm_prob (float):
        mean_phn_span (int):
-        span_bdy (paddle.Tensor): masked mel boundary of input speech (B, 2).
+        span_bdy (paddle.Tensor): 
            masked mel boundary of input speech (B, 2).
    Returns:
        paddle.Tensor[bool]: masked position of input speech (B, Tmax).
    '''
@ -416,19 +441,29 @@ def phones_text_masking(xs_pad: paddle.Tensor,
                        span_bdy: paddle.Tensor=None):
    '''
    Args:
-        xs_pad (paddle.Tensor): input speech (B, Tmax, D).
+        xs_pad (paddle.Tensor): 
-        src_mask (paddle.Tensor): mask of speech (B, 1, Tmax).
+            input speech (B, Tmax, D).
-        text_pad (paddle.Tensor): input text (B, Tmax2).
+        src_mask (paddle.Tensor): 
-        text_mask (paddle.Tensor): mask of text (B, 1, Tmax2).
+            mask of speech (B, 1, Tmax).
-        align_start (paddle.Tensor): frame level phone alignment start (B, Tmax2).
+        text_pad (paddle.Tensor): 
-        align_end (paddle.Tensor): frame level phone alignment end (B, Tmax2).
+            input text (B, Tmax2).
-        align_start_lens (paddle.Tensor): length of align_start (B, ).
+        text_mask (paddle.Tensor):
            mask of text (B, 1, Tmax2).
        align_start (paddle.Tensor): 
            frame level phone alignment start (B, Tmax2).
        align_end (paddle.Tensor): 
            frame level phone alignment end (B, Tmax2).
        align_start_lens (paddle.Tensor): 
            length of align_start (B, ).
        mlm_prob (float):
        mean_phn_span (int):
-        span_bdy (paddle.Tensor): masked mel boundary of input speech (B, 2).
+        span_bdy (paddle.Tensor): 
            masked mel boundary of input speech (B, 2).
    Returns:
-        paddle.Tensor[bool]: masked position of input speech (B, Tmax).
+        paddle.Tensor[bool]: 
-        paddle.Tensor[bool]: masked position of input text (B, Tmax2).
+            masked position of input speech (B, Tmax).
        paddle.Tensor[bool]: 
            masked position of input text (B, Tmax2).
    '''
    bz, sent_len, _ = paddle.shape(xs_pad)
    masked_pos = paddle.zeros((bz, sent_len))
@ -488,12 +523,18 @@ def get_seg_pos(speech_pad: paddle.Tensor,
                seg_emb: bool=False):
    '''
    Args:
-        speech_pad (paddle.Tensor): input speech (B, Tmax, D).
+        speech_pad (paddle.Tensor): 
-        text_pad (paddle.Tensor): input text (B, Tmax2).
+            input speech (B, Tmax, D).
-        align_start (paddle.Tensor): frame level phone alignment start (B, Tmax2).
+        text_pad (paddle.Tensor): 
-        align_end (paddle.Tensor): frame level phone alignment end (B, Tmax2).
+            input text (B, Tmax2).
-        align_start_lens (paddle.Tensor): length of align_start (B, ).
+        align_start (paddle.Tensor): 
-        seg_emb (bool): whether to use segment embedding.
+            frame level phone alignment start (B, Tmax2).
        align_end (paddle.Tensor): 
            frame level phone alignment end (B, Tmax2).
        align_start_lens (paddle.Tensor): 
            length of align_start (B, ).
        seg_emb (bool): 
            whether to use segment embedding.
    Returns:
        paddle.Tensor[int]: n-th phone of each mel, 0<=n<=Tmax2 (B, Tmax).
            eg: 
@ -579,8 +620,10 @@ def random_spans_noise_mask(length: int,
    def _random_seg(num_items, num_segs):
        """Partition a sequence of items randomly into non-empty segments.
        Args:
-            num_items: an integer scalar > 0
+            num_items: 
-            num_segs: an integer scalar in [1, num_items]
+                an integer scalar > 0
            num_segs: 
                an integer scalar in [1, num_items]
        Returns:
            a Tensor with shape [num_segs] containing positive integers that add
            up to num_items
--- a/paddlespeech/t2s/modules/pqmf.py
+++ b/paddlespeech/t2s/modules/pqmf.py
@ -26,9 +26,12 @@ def design_prototype_filter(taps=62, cutoff_ratio=0.142, beta=9.0):
    filters of cosine modulated filterbanks`_.
    Args:
-        taps (int): The number of filter taps.
+        taps (int): 
-        cutoff_ratio (float): Cut-off frequency ratio.
+            The number of filter taps.
-        beta (float): Beta coefficient for kaiser window.
+        cutoff_ratio (float): 
            Cut-off frequency ratio.
        beta (float): 
            Beta coefficient for kaiser window.
    Returns:
        ndarray:
            Impulse response of prototype filter (taps + 1,).
@ -66,10 +69,14 @@ class PQMF(nn.Layer):
        See discussion in https://github.com/kan-bayashi/ParallelWaveGAN/issues/195.
        Args:
-            subbands (int): The number of subbands.
+            subbands (int): 
-            taps (int): The number of filter taps.
+                The number of subbands.
-            cutoff_ratio (float): Cut-off frequency ratio.
+            taps (int): 
-            beta (float): Beta coefficient for kaiser window.
+                The number of filter taps.
            cutoff_ratio (float): 
                Cut-off frequency ratio.
            beta (float): 
                Beta coefficient for kaiser window.
        """
        super().__init__()
@ -103,7 +110,8 @@ class PQMF(nn.Layer):
    def analysis(self, x):
        """Analysis with PQMF.
        Args:
-            x (Tensor): Input tensor (B, 1, T).
+            x (Tensor): 
                Input tensor (B, 1, T).
        Returns:
            Tensor: Output tensor (B, subbands, T // subbands).
        """
@ -113,7 +121,8 @@ class PQMF(nn.Layer):
    def synthesis(self, x):
        """Synthesis with PQMF.
        Args:
-            x (Tensor): Input tensor (B, subbands, T // subbands).
+            x (Tensor): 
                Input tensor (B, subbands, T // subbands).
        Returns:
            Tensor: Output tensor (B, 1, T).
        """
--- a/paddlespeech/t2s/modules/predictor/duration_predictor.py
+++ b/paddlespeech/t2s/modules/predictor/duration_predictor.py
@ -50,12 +50,18 @@ class DurationPredictor(nn.Layer):
        """Initialize duration predictor module.
        Args:
-            idim (int):Input dimension.
+            idim (int):
-            n_layers (int, optional): Number of convolutional layers.
+                Input dimension.
-            n_chans (int, optional): Number of channels of convolutional layers.
+            n_layers (int, optional): 
-            kernel_size (int, optional): Kernel size of convolutional layers.
+                Number of convolutional layers.
-            dropout_rate (float, optional): Dropout rate.
+            n_chans (int, optional): 
-            offset (float, optional): Offset value to avoid nan in log domain.
+                Number of channels of convolutional layers.
            kernel_size (int, optional): 
                Kernel size of convolutional layers.
            dropout_rate (float, optional): 
                Dropout rate.
            offset (float, optional): 
                Offset value to avoid nan in log domain.
        """
        super().__init__()
@ -99,8 +105,10 @@ class DurationPredictor(nn.Layer):
    def forward(self, xs, x_masks=None):
        """Calculate forward propagation.
        Args:
-            xs(Tensor): Batch of input sequences (B, Tmax, idim).
+            xs(Tensor): 
-            x_masks(ByteTensor, optional, optional): Batch of masks indicating padded part (B, Tmax). (Default value = None)
+                Batch of input sequences (B, Tmax, idim).
            x_masks(ByteTensor, optional): 
                Batch of masks indicating padded part (B, Tmax). (Default value = None)
        Returns:
            Tensor: Batch of predicted durations in log domain (B, Tmax).
@ -110,8 +118,10 @@ class DurationPredictor(nn.Layer):
    def inference(self, xs, x_masks=None):
        """Inference duration.
        Args:
-            xs(Tensor): Batch of input sequences (B, Tmax, idim).
+            xs(Tensor): 
-            x_masks(Tensor(bool), optional, optional): Batch of masks indicating padded part (B, Tmax). (Default value = None)
+                Batch of input sequences (B, Tmax, idim).
            x_masks(Tensor(bool), optional): 
                Batch of masks indicating padded part (B, Tmax). (Default value = None)
        Returns:
            Tensor: Batch of predicted durations in linear domain int64 (B, Tmax).
@ -140,8 +150,10 @@ class DurationPredictorLoss(nn.Layer):
        """Calculate forward propagation.
        Args:
-            outputs(Tensor): Batch of prediction durations in log domain (B, T)
+            outputs(Tensor): 
-            targets(Tensor): Batch of groundtruth durations in linear domain (B, T)
+                Batch of prediction durations in log domain (B, T)
            targets(Tensor): 
                Batch of groundtruth durations in linear domain (B, T)
        Returns: 
            Tensor: Mean squared error loss value.
--- a/paddlespeech/t2s/modules/predictor/length_regulator.py
+++ b/paddlespeech/t2s/modules/predictor/length_regulator.py
@ -36,7 +36,8 @@ class LengthRegulator(nn.Layer):
        """Initialize length regulator module.
        Args:
-            pad_value (float, optional): Value used for padding.
+            pad_value (float, optional): 
                Value used for padding.
        """
        super().__init__()
@ -97,9 +98,12 @@ class LengthRegulator(nn.Layer):
        """Calculate forward propagation.
        Args:
-            xs (Tensor): Batch of sequences of char or phoneme embeddings (B, Tmax, D).
+            xs (Tensor): 
-            ds (Tensor(int64)): Batch of durations of each frame (B, T).
+                Batch of sequences of char or phoneme embeddings (B, Tmax, D).
-            alpha (float, optional): Alpha value to control speed of speech.
+            ds (Tensor(int64)): 
                Batch of durations of each frame (B, T).
            alpha (float, optional): 
                Alpha value to control speed of speech.
        Returns:
            Tensor: replicated input tensor based on durations (B, T*, D).
--- a/paddlespeech/t2s/modules/predictor/variance_predictor.py
+++ b/paddlespeech/t2s/modules/predictor/variance_predictor.py
@ -43,11 +43,16 @@ class VariancePredictor(nn.Layer):
        """Initialize variance predictor module.
        Args:
-            idim (int): Input dimension.
+            idim (int): 
-            n_layers (int, optional): Number of convolutional layers.
+                Input dimension.
-            n_chans (int, optional): Number of channels of convolutional layers.
+            n_layers (int, optional): 
-            kernel_size (int, optional): Kernel size of convolutional layers.
+                Number of convolutional layers.
-            dropout_rate (float, optional): Dropout rate.
+            n_chans (int, optional): 
                Number of channels of convolutional layers.
            kernel_size (int, optional): 
                Kernel size of convolutional layers.
            dropout_rate (float, optional): 
                Dropout rate.
        """
        assert check_argument_types()
        super().__init__()
@ -74,11 +79,14 @@ class VariancePredictor(nn.Layer):
        """Calculate forward propagation.
        Args:
-            xs (Tensor): Batch of input sequences (B, Tmax, idim).
+            xs (Tensor): 
-            x_masks (Tensor(bool), optional): Batch of masks indicating padded part (B, Tmax, 1).
+                Batch of input sequences (B, Tmax, idim).
            x_masks (Tensor(bool), optional): 
                Batch of masks indicating padded part (B, Tmax, 1).
        Returns:
-            Tensor: Batch of predicted sequences (B, Tmax, 1).
+            Tensor: 
                Batch of predicted sequences (B, Tmax, 1).
        """
        # (B, idim, Tmax)
        xs = xs.transpose([0, 2, 1])
--- a/paddlespeech/t2s/modules/residual_block.py
+++ b/paddlespeech/t2s/modules/residual_block.py
@ -29,15 +29,24 @@ class WaveNetResidualBlock(nn.Layer):
    refer to `WaveNet: A Generative Model for Raw Audio <https://arxiv.org/abs/1609.03499>`_.
    Args:
-        kernel_size (int, optional): Kernel size of the 1D convolution, by default 3
+        kernel_size (int, optional): 
-        residual_channels (int, optional): Feature size of the residual output(and also the input), by default 64
+            Kernel size of the 1D convolution, by default 3
-        gate_channels (int, optional): Output feature size of the 1D convolution, by default 128
+        residual_channels (int, optional): 
-        skip_channels (int, optional): Feature size of the skip output, by default 64
+            Feature size of the residual output(and also the input), by default 64
-        aux_channels (int, optional): Feature size of the auxiliary input (e.g. spectrogram), by default 80
+        gate_channels (int, optional): 
-        dropout (float, optional): Probability of the dropout before the 1D convolution, by default 0.
+            Output feature size of the 1D convolution, by default 128
-        dilation (int, optional): Dilation of the 1D convolution, by default 1
+        skip_channels (int, optional): 
-        bias (bool, optional): Whether to use bias in the 1D convolution, by default True
+            Feature size of the skip output, by default 64
-        use_causal_conv (bool, optional): Whether to use causal padding for the 1D convolution, by default False
+        aux_channels (int, optional): 
            Feature size of the auxiliary input (e.g. spectrogram), by default 80
        dropout (float, optional): 
            Probability of the dropout before the 1D convolution, by default 0.
        dilation (int, optional): 
            Dilation of the 1D convolution, by default 1
        bias (bool, optional): 
            Whether to use bias in the 1D convolution, by default True
        use_causal_conv (bool, optional): 
            Whether to use causal padding for the 1D convolution, by default False
    """
    def __init__(self,
@ -81,13 +90,17 @@ class WaveNetResidualBlock(nn.Layer):
    def forward(self, x, c):
        """
        Args:
-            x (Tensor): the input features. Shape (N, C_res, T)
+            x (Tensor): 
-            c (Tensor): the auxiliary input. Shape (N, C_aux, T)
+                the input features. Shape (N, C_res, T)
            c (Tensor):
                the auxiliary input. Shape (N, C_aux, T)
        Returns:
-            res (Tensor): Shape (N, C_res, T), the residual output, which is used as the 
+            res (Tensor): 
                Shape (N, C_res, T), the residual output, which is used as the 
                input of the next ResidualBlock in a stack of ResidualBlocks.
-            skip (Tensor): Shape (N, C_skip, T), the skip output, which is collected among
+            skip (Tensor): 
                Shape (N, C_skip, T), the skip output, which is collected among
                each layer in a stack of ResidualBlocks.
        """
        x_input = x
@ -121,13 +134,20 @@ class HiFiGANResidualBlock(nn.Layer):
    ):
        """Initialize HiFiGANResidualBlock module.
        Args:
-            kernel_size (int): Kernel size of dilation convolution layer.
+            kernel_size (int): 
-            channels (int): Number of channels for convolution layer.
+                Kernel size of dilation convolution layer.
-            dilations (List[int]): List of dilation factors.
+            channels (int): 
-            use_additional_convs (bool): Whether to use additional convolution layers.
+                Number of channels for convolution layer.
-            bias (bool): Whether to add bias parameter in convolution layers.
+            dilations (List[int]): 
-            nonlinear_activation (str): Activation function module name.
+                List of dilation factors.
-            nonlinear_activation_params (dict): Hyperparameters for activation function.
+            use_additional_convs (bool): 
                Whether to use additional convolution layers.
            bias (bool): 
                Whether to add bias parameter in convolution layers.
            nonlinear_activation (str): 
                Activation function module name.
            nonlinear_activation_params (dict): 
                Hyperparameters for activation function.
        """
        super().__init__()
@ -167,7 +187,8 @@ class HiFiGANResidualBlock(nn.Layer):
    def forward(self, x):
        """Calculate forward propagation.
        Args:
-            x (Tensor): Input tensor (B, channels, T).
+            x (Tensor): 
                Input tensor (B, channels, T).
        Returns:
            Tensor: Output tensor (B, channels, T).
        """
--- a/paddlespeech/t2s/modules/residual_stack.py
+++ b/paddlespeech/t2s/modules/residual_stack.py
@ -39,15 +39,24 @@ class ResidualStack(nn.Layer):
        """Initialize ResidualStack module.
        Args:
-            kernel_size (int): Kernel size of dilation convolution layer.
+            kernel_size (int): 
-            channels (int): Number of channels of convolution layers.
+                Kernel size of dilation convolution layer.
-            dilation (int): Dilation factor.
+            channels (int): 
-            bias (bool): Whether to add bias parameter in convolution layers.
+                Number of channels of convolution layers.
-            nonlinear_activation (str): Activation function module name.
+            dilation (int): 
-            nonlinear_activation_params (Dict[str,Any]): Hyperparameters for activation function.
+                Dilation factor.
-            pad (str): Padding function module name before dilated convolution layer.
+            bias (bool): 
-            pad_params (Dict[str, Any]): Hyperparameters for padding function.
+                Whether to add bias parameter in convolution layers.
-            use_causal_conv (bool): Whether to use causal convolution.
+            nonlinear_activation (str): 
                Activation function module name.
            nonlinear_activation_params (Dict[str,Any]): 
                Hyperparameters for activation function.
            pad (str): 
                Padding function module name before dilated convolution layer.
            pad_params (Dict[str, Any]): 
                Hyperparameters for padding function.
            use_causal_conv (bool): 
                Whether to use causal convolution.
        """
        super().__init__()
        # for compatibility
@ -95,7 +104,8 @@ class ResidualStack(nn.Layer):
        """Calculate forward propagation.
        Args:
-            c (Tensor): Input tensor (B, channels, T).
+            c (Tensor): 
                Input tensor (B, channels, T).
        Returns:     
            Tensor: Output tensor (B, channels, T).
        """
--- a/paddlespeech/t2s/modules/style_encoder.py
+++ b/paddlespeech/t2s/modules/style_encoder.py
@ -32,16 +32,26 @@ class StyleEncoder(nn.Layer):
        Speech Synthesis`: https://arxiv.org/abs/1803.09017
    Args:
-        idim (int, optional): Dimension of the input mel-spectrogram.
+        idim (int, optional): 
-        gst_tokens (int, optional): The number of GST embeddings.
+            Dimension of the input mel-spectrogram.
-        gst_token_dim (int, optional): Dimension of each GST embedding.
+        gst_tokens (int, optional): 
-        gst_heads (int, optional): The number of heads in GST multihead attention.
+            The number of GST embeddings.
-        conv_layers (int, optional): The number of conv layers in the reference encoder.
+        gst_token_dim (int, optional): 
-        conv_chans_list (Sequence[int], optional): List of the number of channels of conv layers in the referece encoder.
+            Dimension of each GST embedding.
-        conv_kernel_size (int, optional): Kernal size of conv layers in the reference encoder.
+        gst_heads (int, optional): 
-        conv_stride (int, optional): Stride size of conv layers in the reference encoder.
+            The number of heads in GST multihead attention.
-        gru_layers (int, optional): The number of GRU layers in the reference encoder.
+        conv_layers (int, optional): 
-        gru_units (int, optional):The number of GRU units in the reference encoder.
+            The number of conv layers in the reference encoder.
        conv_chans_list (Sequence[int], optional): 
            List of the number of channels of conv layers in the reference encoder.
        conv_kernel_size (int, optional): 
            Kernel size of conv layers in the reference encoder.
        conv_stride (int, optional): 
            Stride size of conv layers in the reference encoder.
        gru_layers (int, optional): 
            The number of GRU layers in the reference encoder.
        gru_units (int, optional):
            The number of GRU units in the reference encoder.
    Todo:
        * Support manual weight specification in inference.
@ -82,7 +92,8 @@ class StyleEncoder(nn.Layer):
        """Calculate forward propagation.
        Args:
-            speech (Tensor): Batch of padded target features (B, Lmax, odim).
+            speech (Tensor): 
                Batch of padded target features (B, Lmax, odim).
        Returns: 
            Tensor: Style token embeddings (B, token_dim).
@ -104,13 +115,20 @@ class ReferenceEncoder(nn.Layer):
        Speech Synthesis`: https://arxiv.org/abs/1803.09017
    Args:
-        idim (int, optional): Dimension of the input mel-spectrogram.
+        idim (int, optional): 
-        conv_layers (int, optional): The number of conv layers in the reference encoder.
+            Dimension of the input mel-spectrogram.
-        conv_chans_list: (Sequence[int], optional): List of the number of channels of conv layers in the referece encoder.
+        conv_layers (int, optional): 
-        conv_kernel_size (int, optional): Kernal size of conv layers in the reference encoder.
+            The number of conv layers in the reference encoder.
-        conv_stride (int, optional): Stride size of conv layers in the reference encoder.
+        conv_chans_list (Sequence[int], optional): 
-        gru_layers (int, optional): The number of GRU layers in the reference encoder.
+            List of the number of channels of conv layers in the reference encoder.
-        gru_units (int, optional): The number of GRU units in the reference encoder.
+        conv_kernel_size (int, optional): 
            Kernel size of conv layers in the reference encoder.
        conv_stride (int, optional): 
            Stride size of conv layers in the reference encoder.
        gru_layers (int, optional): 
            The number of GRU layers in the reference encoder.
        gru_units (int, optional): 
            The number of GRU units in the reference encoder.
    """
@ -168,7 +186,8 @@ class ReferenceEncoder(nn.Layer):
    def forward(self, speech: paddle.Tensor) -> paddle.Tensor:
        """Calculate forward propagation.
        Args:
-            speech (Tensor): Batch of padded target features (B, Lmax, idim).
+            speech (Tensor): 
                Batch of padded target features (B, Lmax, idim).
        Returns:
            Tensor: Reference embedding (B, gru_units)
@ -200,11 +219,16 @@ class StyleTokenLayer(nn.Layer):
    .. _`Style Tokens: Unsupervised Style Modeling, Control and Transfer in End-to-End
        Speech Synthesis`: https://arxiv.org/abs/1803.09017
    Args:
-        ref_embed_dim (int, optional): Dimension of the input reference embedding.
+        ref_embed_dim (int, optional): 
-        gst_tokens (int, optional): The number of GST embeddings.
+            Dimension of the input reference embedding.
-        gst_token_dim (int, optional): Dimension of each GST embedding.
+        gst_tokens (int, optional): 
-        gst_heads (int, optional): The number of heads in GST multihead attention.
+            The number of GST embeddings.
-        dropout_rate (float, optional): Dropout rate in multi-head attention.
+        gst_token_dim (int, optional): 
            Dimension of each GST embedding.
        gst_heads (int, optional): 
            The number of heads in GST multihead attention.
        dropout_rate (float, optional): 
            Dropout rate in multi-head attention.
    """
@ -236,7 +260,8 @@ class StyleTokenLayer(nn.Layer):
        """Calculate forward propagation.
        Args:
-            ref_embs (Tensor): Reference embeddings (B, ref_embed_dim).
+            ref_embs (Tensor):
                Reference embeddings (B, ref_embed_dim).
        Returns: 
            Tensor: Style token embeddings (B, gst_token_dim).
--- a/paddlespeech/t2s/modules/tacotron2/attentions.py
+++ b/paddlespeech/t2s/modules/tacotron2/attentions.py
@ -31,10 +31,14 @@ def _apply_attention_constraint(e,
    Text-to-Speech with Convolutional Sequence Learning`_.
    Args:
-        e(Tensor): Attention energy before applying softmax (1, T).
+        e(Tensor): 
-       last_attended_idx(int): The index of the inputs of the last attended [0, T].
+            Attention energy before applying softmax (1, T).
-       backward_window(int, optional, optional): Backward window size in attention constraint. (Default value = 1)
+        last_attended_idx(int): 
-       forward_window(int, optional, optional): Forward window size in attetion constraint. (Default value = 3)
+            The index of the inputs of the last attended [0, T].
        backward_window(int, optional): 
            Backward window size in attention constraint. (Default value = 1)
        forward_window(int, optional): 
            Forward window size in attention constraint. (Default value = 3)
    Returns:
        Tensor: Monotonic constrained attention energy (1, T).
@ -62,12 +66,18 @@ class AttLoc(nn.Layer):
        (https://arxiv.org/pdf/1506.07503.pdf)
    Args:
-        eprojs (int): projection-units of encoder
+        eprojs (int): 
-        dunits (int): units of decoder
+            projection-units of encoder
-        att_dim (int): attention dimension
+        dunits (int): 
-        aconv_chans (int): channels of attention convolution
+            units of decoder
-        aconv_filts (int): filter size of attention convolution
+        att_dim (int): 
-        han_mode (bool): flag to swith on mode of hierarchical attention and not store pre_compute_enc_h
+            attention dimension
        aconv_chans (int): 
            channels of attention convolution
        aconv_filts (int): 
            filter size of attention convolution
        han_mode (bool): 
            flag to switch on mode of hierarchical attention and not store pre_compute_enc_h
    """
    def __init__(self,
@ -117,18 +127,29 @@ class AttLoc(nn.Layer):
            forward_window=3, ):
        """Calculate AttLoc forward propagation.
        Args:
-            enc_hs_pad(Tensor): padded encoder hidden state (B, T_max, D_enc)
+            enc_hs_pad(Tensor): 
-            enc_hs_len(Tensor): padded encoder hidden state length (B)
+                padded encoder hidden state (B, T_max, D_enc)
-            dec_z(Tensor dec_z): decoder hidden state (B, D_dec)
+            enc_hs_len(Tensor): 
-            att_prev(Tensor): previous attention weight (B, T_max)
+                padded encoder hidden state length (B)
-            scaling(float, optional): scaling parameter before applying softmax (Default value = 2.0)
+            dec_z(Tensor dec_z): 
-            forward_window(Tensor, optional): forward window size when constraining attention (Default value = 3)
+                decoder hidden state (B, D_dec)
-            last_attended_idx(int, optional): index of the inputs of the last attended (Default value = None)
+            att_prev(Tensor): 
-            backward_window(int, optional): backward window size in attention constraint (Default value = 1)
+                previous attention weight (B, T_max)
-            forward_window(int, optional): forward window size in attetion constraint (Default value = 3)
+            scaling(float, optional): 
                scaling parameter before applying softmax (Default value = 2.0)
            forward_window(Tensor, optional): 
                    forward window size when constraining attention (Default value = 3)
            last_attended_idx(int, optional): 
                index of the inputs of the last attended (Default value = None)
            backward_window(int, optional): 
                backward window size in attention constraint (Default value = 1)
            forward_window(int, optional): 
                forward window size in attention constraint (Default value = 3)
        Returns:
-            Tensor: attention weighted encoder state (B, D_enc)
+            Tensor: 
-            Tensor: previous attention weights (B, T_max)
+                attention weighted encoder state (B, D_enc)
            Tensor: 
                previous attention weights (B, T_max)
        """
        batch = paddle.shape(enc_hs_pad)[0]
        # pre-compute all h outside the decoder loop
@ -192,11 +213,16 @@ class AttForward(nn.Layer):
        (https://arxiv.org/pdf/1807.06736.pdf)
    Args:
-        eprojs (int): projection-units of encoder
+        eprojs (int): 
-        dunits (int): units of decoder
+            projection-units of encoder
-        att_dim (int): attention dimension
+        dunits (int): 
-        aconv_chans (int): channels of attention convolution
+            units of decoder
-        aconv_filts (int): filter size of attention convolution
+        att_dim (int): 
            attention dimension
        aconv_chans (int): 
            channels of attention convolution
        aconv_filts (int): 
            filter size of attention convolution
    """
    def __init__(self, eprojs, dunits, att_dim, aconv_chans, aconv_filts):
@ -239,18 +265,28 @@ class AttForward(nn.Layer):
        """Calculate AttForward forward propagation.
        Args:
-            enc_hs_pad(Tensor): padded encoder hidden state (B, T_max, D_enc)
+            enc_hs_pad(Tensor): 
-            enc_hs_len(list): padded encoder hidden state length (B,)
+                padded encoder hidden state (B, T_max, D_enc)
-            dec_z(Tensor): decoder hidden state (B, D_dec)
+            enc_hs_len(list): 
-            att_prev(Tensor): attention weights of previous step (B, T_max)
+                padded encoder hidden state length (B,)
-            scaling(float, optional): scaling parameter before applying softmax (Default value = 1.0)
+            dec_z(Tensor): 
-            last_attended_idx(int, optional): index of the inputs of the last attended (Default value = None)
+                decoder hidden state (B, D_dec)
-            backward_window(int, optional): backward window size in attention constraint (Default value = 1)
+            att_prev(Tensor): 
-            forward_window(int, optional):  (Default value = 3)
+                attention weights of previous step (B, T_max)
            scaling(float, optional): 
                scaling parameter before applying softmax (Default value = 1.0)
            last_attended_idx(int, optional): 
                index of the inputs of the last attended (Default value = None)
            backward_window(int, optional): 
                backward window size in attention constraint (Default value = 1)
            forward_window(int, optional):  
                forward window size in attention constraint (Default value = 3)
        Returns:
-            Tensor: attention weighted encoder state (B, D_enc)
+            Tensor: 
-            Tensor: previous attention weights (B, T_max)
+                attention weighted encoder state (B, D_enc)
            Tensor: 
                previous attention weights (B, T_max)
        """
        batch = len(enc_hs_pad)
        # pre-compute all h outside the decoder loop
@ -321,12 +357,18 @@ class AttForwardTA(nn.Layer):
            (https://arxiv.org/pdf/1807.06736.pdf)
    Args:
-        eunits (int): units of encoder
+        eunits (int): 
-        dunits (int): units of decoder
+            units of encoder
-        att_dim (int): attention dimension
+        dunits (int): 
-        aconv_chans (int): channels of attention convolution
+            units of decoder
-        aconv_filts (int): filter size of attention convolution
+        att_dim (int): 
-        odim (int): output dimension
+            attention dimension
        aconv_chans (int):  
            channels of attention convolution
        aconv_filts (int): 
            filter size of attention convolution
        odim (int): 
            output dimension
    """
    def __init__(self, eunits, dunits, att_dim, aconv_chans, aconv_filts, odim):
@ -372,19 +414,30 @@ class AttForwardTA(nn.Layer):
        """Calculate AttForwardTA forward propagation.
        Args:
-            enc_hs_pad(Tensor): padded encoder hidden state (B, Tmax, eunits)
+            enc_hs_pad(Tensor): 
-            enc_hs_len(list Tensor): padded encoder hidden state length (B,)
+                padded encoder hidden state (B, Tmax, eunits)
-            dec_z(Tensor): decoder hidden state (B, dunits)
+            enc_hs_len(list Tensor): 
-            att_prev(Tensor): attention weights of previous step (B, T_max)
+                padded encoder hidden state length (B,)
-            out_prev(Tensor): decoder outputs of previous step (B, odim)
+            dec_z(Tensor): 
-            scaling(float, optional): scaling parameter before applying softmax (Default value = 1.0)
+                decoder hidden state (B, dunits)
-            last_attended_idx(int, optional): index of the inputs of the last attended (Default value = None)
+            att_prev(Tensor): 
-            backward_window(int, optional): backward window size in attention constraint (Default value = 1)
+                attention weights of previous step (B, T_max)
-            forward_window(int, optional):  (Default value = 3)
+            out_prev(Tensor): 
                decoder outputs of previous step (B, odim)
            scaling(float, optional): 
                scaling parameter before applying softmax (Default value = 1.0)
            last_attended_idx(int, optional): 
                index of the inputs of the last attended (Default value = None)
            backward_window(int, optional): 
                backward window size in attention constraint (Default value = 1)
            forward_window(int, optional):  
                forward window size in attention constraint (Default value = 3)
        Returns:
-            Tensor: attention weighted encoder state (B, dunits)
+            Tensor: 
-            Tensor: previous attention weights (B, Tmax)
+                attention weighted encoder state (B, dunits)
            Tensor: 
                previous attention weights (B, Tmax)
        """
        batch = len(enc_hs_pad)
        # pre-compute all h outside the decoder loop
--- a/paddlespeech/t2s/modules/tacotron2/decoder.py
+++ b/paddlespeech/t2s/modules/tacotron2/decoder.py
@ -45,10 +45,14 @@ class Prenet(nn.Layer):
        """Initialize prenet module.
        Args:
-            idim (int): Dimension of the inputs.
+            idim (int): 
-            odim (int): Dimension of the outputs.
+                Dimension of the inputs.
-            n_layers (int, optional): The number of prenet layers.
+            odim (int): 
-            n_units (int, optional): The number of prenet units.
+                Dimension of the outputs.
            n_layers (int, optional): 
                The number of prenet layers.
            n_units (int, optional): 
                The number of prenet units.
        """
        super().__init__()
        self.dropout_rate = dropout_rate
@ -62,7 +66,8 @@ class Prenet(nn.Layer):
        """Calculate forward propagation.
        Args:
-            x (Tensor): Batch of input tensors (B, ..., idim).
+            x (Tensor): 
                Batch of input tensors (B, ..., idim).
        Returns: 
            Tensor: Batch of output tensors (B, ..., odim).
@ -212,7 +217,8 @@ class ZoneOutCell(nn.Layer):
        """Calculate forward propagation.
        Args:
-            inputs (Tensor): Batch of input tensor (B, input_size).
+            inputs (Tensor): 
                Batch of input tensor (B, input_size).
            hidden (tuple):
                - Tensor: Batch of initial hidden states (B, hidden_size).
                - Tensor: Batch of initial cell states (B, hidden_size).
@ -277,26 +283,39 @@ class Decoder(nn.Layer):
        """Initialize Tacotron2 decoder module.
        Args:
-            idim (int): Dimension of the inputs.
+            idim (int): 
-            odim (int): Dimension of the outputs.
+                Dimension of the inputs.
-            att (nn.Layer): Instance of attention class.
+            odim (int): 
-            dlayers (int, optional): The number of decoder lstm layers.
+                Dimension of the outputs.
-            dunits (int, optional): The number of decoder lstm units.
+            att (nn.Layer): 
-            prenet_layers (int, optional): The number of prenet layers.
+                Instance of attention class.
-            prenet_units (int, optional): The number of prenet units.
+            dlayers (int, optional): 
-            postnet_layers (int, optional): The number of postnet layers.
+                The number of decoder lstm layers.
-            postnet_filts (int, optional): The number of postnet filter size.
+            dunits (int, optional): 
-            postnet_chans (int, optional): The number of postnet filter channels.
+                The number of decoder lstm units.
-            output_activation_fn (nn.Layer, optional): Activation function for outputs.
+            prenet_layers (int, optional): 
-            cumulate_att_w (bool, optional): Whether to cumulate previous attention weight.
+                The number of prenet layers.
-            use_batch_norm (bool, optional): Whether to use batch normalization.
+            prenet_units (int, optional): 
-            use_concate : bool, optional
+                The number of prenet units.
            postnet_layers (int, optional): 
                The number of postnet layers.
            postnet_filts (int, optional): 
                The filter (kernel) size of postnet.
            postnet_chans (int, optional): 
                The number of postnet filter channels.
            output_activation_fn (nn.Layer, optional): 
                Activation function for outputs.
            cumulate_att_w (bool, optional): 
                Whether to cumulate previous attention weight.
            use_batch_norm (bool, optional): 
                Whether to use batch normalization.
            use_concate (bool, optional):
                Whether to concatenate encoder embedding with decoder lstm outputs.
-            dropout_rate : float, optional
+            dropout_rate (float, optional):
                Dropout rate.
-            zoneout_rate : float, optional
+            zoneout_rate (float, optional):
                Zoneout rate.
-            reduction_factor : int, optional
+            reduction_factor (int, optional):
                Reduction factor.
        """
        super().__init__()
@ -363,15 +382,22 @@ class Decoder(nn.Layer):
        """Calculate forward propagation.
        Args:
-            hs (Tensor): Batch of the sequences of padded hidden states (B, Tmax, idim).
+            hs (Tensor): 
-            hlens (Tensor(int64) padded): Batch of lengths of each input batch (B,).
+                Batch of the sequences of padded hidden states (B, Tmax, idim).
-            ys (Tensor): Batch of the sequences of padded target features (B, Lmax, odim).
+            hlens (Tensor(int64) padded): 
                Batch of lengths of each input batch (B,).
            ys (Tensor): 
                Batch of the sequences of padded target features (B, Lmax, odim).
        Returns:
-            Tensor: Batch of output tensors after postnet (B, Lmax, odim).
+            Tensor: 
-            Tensor: Batch of output tensors before postnet (B, Lmax, odim).
+                Batch of output tensors after postnet (B, Lmax, odim).
-            Tensor: Batch of logits of stop prediction (B, Lmax).
+            Tensor: 
-            Tensor: Batch of attention weights (B, Lmax, Tmax).
+                Batch of output tensors before postnet (B, Lmax, odim).
            Tensor: 
                Batch of logits of stop prediction (B, Lmax).
            Tensor: 
                Batch of attention weights (B, Lmax, Tmax).
        Note: 
            This computation is performed in teacher-forcing manner.
@ -471,20 +497,30 @@ class Decoder(nn.Layer):
            forward_window=None, ):
        """Generate the sequence of features given the sequences of characters.
        Args:
-            h(Tensor): Input sequence of encoder hidden states (T, C).
+            h(Tensor): 
-            threshold(float, optional, optional): Threshold to stop generation. (Default value = 0.5)
+                Input sequence of encoder hidden states (T, C).
-            minlenratio(float, optional, optional): Minimum length ratio. If set to 1.0 and the length of input is 10,
+            threshold(float, optional, optional): 
                Threshold to stop generation. (Default value = 0.5)
            minlenratio(float, optional, optional): 
                Minimum length ratio. If set to 1.0 and the length of input is 10,
                the minimum length of outputs will be 10 * 1 = 10. (Default value = 0.0)
-            maxlenratio(float, optional, optional): Minimum length ratio. If set to 10 and the length of input is 10,
+            maxlenratio(float, optional):
                Maximum length ratio. If set to 10 and the length of input is 10,
                the maximum length of outputs will be 10 * 10 = 100. (Default value = 0.0)
-            use_att_constraint(bool, optional): Whether to apply attention constraint introduced in `Deep Voice 3`_. (Default value = False)
+            use_att_constraint(bool, optional): 
-            backward_window(int, optional): Backward window size in attention constraint. (Default value = None)
+                Whether to apply attention constraint introduced in `Deep Voice 3`_. (Default value = False)
-            forward_window(int, optional):  (Default value = None)
+            backward_window(int, optional): 
                Backward window size in attention constraint. (Default value = None)
            forward_window(int, optional):  
                Forward window size in attention constraint. (Default value = None)
        Returns:
-            Tensor: Output sequence of features (L, odim).
+            Tensor: 
-            Tensor: Output sequence of stop probabilities (L,).
+                Output sequence of features (L, odim).
-            Tensor: Attention weights (L, T).
+            Tensor: 
                Output sequence of stop probabilities (L,).
            Tensor: 
                Attention weights (L, T).
        Note: 
            This computation is performed in auto-regressive manner.
@ -625,9 +661,12 @@ class Decoder(nn.Layer):
        """Calculate all of the attention weights.
        Args:
-            hs (Tensor): Batch of the sequences of padded hidden states (B, Tmax, idim).
+            hs (Tensor): 
-            hlens (Tensor(int64)): Batch of lengths of each input batch (B,).
+                Batch of the sequences of padded hidden states (B, Tmax, idim).
-            ys (Tensor): Batch of the sequences of padded target features (B, Lmax, odim).
+            hlens (Tensor(int64)): 
                Batch of lengths of each input batch (B,).
            ys (Tensor): 
                Batch of the sequences of padded target features (B, Lmax, odim).
        Returns:
            numpy.ndarray:
--- a/paddlespeech/t2s/modules/tacotron2/encoder.py
+++ b/paddlespeech/t2s/modules/tacotron2/encoder.py
@ -46,17 +46,28 @@ class Encoder(nn.Layer):
            padding_idx=0, ):
        """Initialize Tacotron2 encoder module.
        Args:
-            idim (int): Dimension of the inputs.
+            idim (int): 
-            input_layer (str): Input layer type.
+                Dimension of the inputs.
-            embed_dim (int, optional): Dimension of character embedding.
+            input_layer (str): 
-            elayers (int, optional): The number of encoder blstm layers.
+                Input layer type.
-            eunits (int, optional): The number of encoder blstm units.
+            embed_dim (int, optional): 
-            econv_layers (int, optional): The number of encoder conv layers.
+                Dimension of character embedding.
-            econv_filts (int, optional): The number of encoder conv filter size.
+            elayers (int, optional): 
-            econv_chans (int, optional): The number of encoder conv filter channels.
+                The number of encoder blstm layers.
-            use_batch_norm (bool, optional): Whether to use batch normalization.
+            eunits (int, optional): 
-            use_residual (bool, optional): Whether to use residual connection.
+                The number of encoder blstm units.
-            dropout_rate (float, optional): Dropout rate.
+            econv_layers (int, optional): 
                The number of encoder conv layers.
            econv_filts (int, optional): 
                The filter (kernel) size of encoder conv layers.
            econv_chans (int, optional): 
                The number of encoder conv filter channels.
            use_batch_norm (bool, optional): 
                Whether to use batch normalization.
            use_residual (bool, optional): 
                Whether to use residual connection.
            dropout_rate (float, optional): 
                Dropout rate.
        """
        super().__init__()
@ -127,14 +138,18 @@ class Encoder(nn.Layer):
        """Calculate forward propagation.
        Args:
-            xs (Tensor): Batch of the padded sequence. Either character ids (B, Tmax)
+            xs (Tensor): 
                Batch of the padded sequence. Either character ids (B, Tmax)
                or acoustic feature (B, Tmax, idim * encoder_reduction_factor). 
                Padded value should be 0.
-            ilens (Tensor(int64)): Batch of lengths of each input batch (B,).
+            ilens (Tensor(int64)): 
                Batch of lengths of each input batch (B,).
        Returns:
-            Tensor: Batch of the sequences of encoder states(B, Tmax, eunits).
+            Tensor: 
-            Tensor(int64): Batch of lengths of each sequence (B,)
+                Batch of the sequences of encoder states(B, Tmax, eunits).
            Tensor(int64): 
                Batch of lengths of each sequence (B,)
        """
        xs = self.embed(xs).transpose([0, 2, 1])
        if self.convs is not None:
@ -161,8 +176,8 @@ class Encoder(nn.Layer):
        """Inference.
        Args:
-            x (Tensor): The sequeunce of character ids (T,) 
+            x (Tensor): 
-                or acoustic feature (T, idim * encoder_reduction_factor).
+                The sequence of character ids (T,) or acoustic feature (T, idim * encoder_reduction_factor).
        Returns:
            Tensor: The sequences of encoder states(T, eunits).
--- a/paddlespeech/t2s/modules/tade_res_block.py
+++ b/paddlespeech/t2s/modules/tade_res_block.py
@ -60,11 +60,15 @@ class TADELayer(nn.Layer):
    def forward(self, x, c):
        """Calculate forward propagation.
        Args:
-            x (Tensor): Input tensor (B, in_channels, T).
+            x (Tensor): 
-            c (Tensor): Auxiliary input tensor (B, aux_channels, T).
+                Input tensor (B, in_channels, T).
            c (Tensor): 
                Auxiliary input tensor (B, aux_channels, T).
        Returns:
-            Tensor: Output tensor (B, in_channels, T * upsample_factor).
+            Tensor: 
-            Tensor: Upsampled aux tensor (B, in_channels, T * upsample_factor).
+                Output tensor (B, in_channels, T * upsample_factor).
            Tensor:
                Upsampled aux tensor (B, in_channels, T * upsample_factor).
        """
        x = self.norm(x)
@ -138,11 +142,15 @@ class TADEResBlock(nn.Layer):
        """Calculate forward propagation.
        Args:
-            x (Tensor): Input tensor (B, in_channels, T).
+            x (Tensor): 
-            c (Tensor): Auxiliary input tensor (B, aux_channels, T).
+                Input tensor (B, in_channels, T).
            c (Tensor): 
                Auxiliary input tensor (B, aux_channels, T).
        Returns:
-            Tensor: Output tensor (B, in_channels, T * upsample_factor).
+            Tensor: 
-            Tensor: Upsampled auxirialy tensor (B, in_channels, T * upsample_factor).
+                Output tensor (B, in_channels, T * upsample_factor).
            Tensor: 
                Upsampled auxiliary tensor (B, in_channels, T * upsample_factor).
        """
        residual = x
        x, c = self.tade1(x, c)
--- a/paddlespeech/t2s/modules/transformer/attention.py
+++ b/paddlespeech/t2s/modules/transformer/attention.py
@ -25,9 +25,12 @@ from paddlespeech.t2s.modules.masked_fill import masked_fill
 class MultiHeadedAttention(nn.Layer):
    """Multi-Head Attention layer.
    Args:
-        n_head (int): The number of heads.
+        n_head (int): 
-        n_feat (int): The number of features.
+            The number of heads.
-        dropout_rate (float): Dropout rate.
+        n_feat (int): 
            The number of features.
        dropout_rate (float): 
            Dropout rate.
    """
    def __init__(self, n_head, n_feat, dropout_rate):
@ -48,14 +51,20 @@ class MultiHeadedAttention(nn.Layer):
        """Transform query, key and value.
        Args:
-            query(Tensor): query tensor (#batch, time1, size).
+            query(Tensor): 
-            key(Tensor): Key tensor (#batch, time2, size).
+                query tensor (#batch, time1, size).
-            value(Tensor): Value tensor (#batch, time2, size).
+            key(Tensor): 
                Key tensor (#batch, time2, size).
            value(Tensor): 
                Value tensor (#batch, time2, size).
        Returns:
-            Tensor: Transformed query tensor (#batch, n_head, time1, d_k).
+            Tensor: 
-            Tensor: Transformed key tensor (#batch, n_head, time2, d_k).
+                Transformed query tensor (#batch, n_head, time1, d_k).
-            Tensor: Transformed value tensor (#batch, n_head, time2, d_k).
+            Tensor: 
                Transformed key tensor (#batch, n_head, time2, d_k).
            Tensor: 
                Transformed value tensor (#batch, n_head, time2, d_k).
        """
        n_batch = paddle.shape(query)[0]
@ -77,9 +86,12 @@ class MultiHeadedAttention(nn.Layer):
        """Compute attention context vector.
        Args:
-            value(Tensor): Transformed value (#batch, n_head, time2, d_k).
+            value(Tensor): 
-            scores(Tensor): Attention score (#batch, n_head, time1, time2).
+                Transformed value (#batch, n_head, time2, d_k).
-            mask(Tensor, optional): Mask (#batch, 1, time2) or (#batch, time1, time2). (Default value = None)
+            scores(Tensor): 
                Attention score (#batch, n_head, time1, time2).
            mask(Tensor, optional): 
                Mask (#batch, 1, time2) or (#batch, time1, time2). (Default value = None)
        Returns:
            Tensor: Transformed value (#batch, time1, d_model) weighted by the attention score (#batch, time1, time2).
@ -113,10 +125,14 @@ class MultiHeadedAttention(nn.Layer):
        """Compute scaled dot product attention.
        Args:
-            query(Tensor): Query tensor (#batch, time1, size).
+            query(Tensor): 
-            key(Tensor): Key tensor (#batch, time2, size).
+                Query tensor (#batch, time1, size).
-            value(Tensor): Value tensor (#batch, time2, size).
+            key(Tensor): 
-            mask(Tensor, optional): Mask tensor (#batch, 1, time2) or (#batch, time1, time2). (Default value = None)
+                Key tensor (#batch, time2, size).
            value(Tensor): 
                Value tensor (#batch, time2, size).
            mask(Tensor, optional): 
                Mask tensor (#batch, 1, time2) or (#batch, time1, time2). (Default value = None)
        Returns:
            Tensor: Output tensor (#batch, time1, d_model).
@ -134,10 +150,14 @@ class RelPositionMultiHeadedAttention(MultiHeadedAttention):
    Paper: https://arxiv.org/abs/1901.02860
    Args:
-        n_head (int): The number of heads.
+        n_head (int): 
-        n_feat (int): The number of features.
+            The number of heads.
-        dropout_rate (float): Dropout rate.
+        n_feat (int): 
-        zero_triu (bool): Whether to zero the upper triangular part of attention matrix.
+            The number of features.
        dropout_rate (float): 
            Dropout rate.
        zero_triu (bool): 
            Whether to zero the upper triangular part of attention matrix.
    """
    def __init__(self, n_head, n_feat, dropout_rate, zero_triu=False):
@ -161,10 +181,11 @@ class RelPositionMultiHeadedAttention(MultiHeadedAttention):
    def rel_shift(self, x):
        """Compute relative positional encoding.
        Args:
-            x(Tensor): Input tensor (batch, head, time1, 2*time1-1).
+            x(Tensor): 
                Input tensor (batch, head, time1, 2*time1-1).
        Returns:
-            Tensor:Output tensor.
+            Tensor: Output tensor.
        """
        b, h, t1, t2 = paddle.shape(x)
        zero_pad = paddle.zeros((b, h, t1, 1))
@ -183,11 +204,16 @@ class RelPositionMultiHeadedAttention(MultiHeadedAttention):
        """Compute 'Scaled Dot Product Attention' with rel. positional encoding.
        Args:
-            query(Tensor): Query tensor (#batch, time1, size).
+            query(Tensor): 
-            key(Tensor): Key tensor (#batch, time2, size).
+                Query tensor (#batch, time1, size).
-            value(Tensor): Value tensor (#batch, time2, size).
+            key(Tensor): 
-            pos_emb(Tensor): Positional embedding tensor (#batch, 2*time1-1, size).
+                Key tensor (#batch, time2, size).
-            mask(Tensor): Mask tensor (#batch, 1, time2) or (#batch, time1, time2).
+            value(Tensor): 
                Value tensor (#batch, time2, size).
            pos_emb(Tensor): 
                Positional embedding tensor (#batch, 2*time1-1, size).
            mask(Tensor): 
                Mask tensor (#batch, 1, time2) or (#batch, time1, time2).
        Returns:
            Tensor: Output tensor (#batch, time1, d_model).
@ -228,10 +254,14 @@ class LegacyRelPositionMultiHeadedAttention(MultiHeadedAttention):
    Paper: https://arxiv.org/abs/1901.02860
    Args:
-        n_head (int): The number of heads.
+        n_head (int): 
-        n_feat (int): The number of features.
+            The number of heads.
-        dropout_rate (float): Dropout rate.
+        n_feat (int): 
-        zero_triu (bool): Whether to zero the upper triangular part of attention matrix.
+            The number of features.
        dropout_rate (float): 
            Dropout rate.
        zero_triu (bool): 
            Whether to zero the upper triangular part of attention matrix.
    """
    def __init__(self, n_head, n_feat, dropout_rate, zero_triu=False):
@ -255,8 +285,8 @@ class LegacyRelPositionMultiHeadedAttention(MultiHeadedAttention):
    def rel_shift(self, x):
        """Compute relative positional encoding.
        Args:
-            x(Tensor): Input tensor (batch, head, time1, time2).
+            x(Tensor): 
-
+                Input tensor (batch, head, time1, time2).
        Returns:
            Tensor: Output tensor.
        """
--- a/paddlespeech/t2s/modules/transformer/decoder.py
+++ b/paddlespeech/t2s/modules/transformer/decoder.py
@ -37,28 +37,46 @@ class Decoder(nn.Layer):
    """Transformer decoder module.
    Args:
-        odim (int): Output diminsion.
+        odim (int): 
-        self_attention_layer_type (str): Self-attention layer type.
+            Output dimension.
-        attention_dim (int): Dimention of attention.
+        self_attention_layer_type (str): 
-        attention_heads (int): The number of heads of multi head attention.
+            Self-attention layer type.
-        conv_wshare (int): The number of kernel of convolution. Only used in
+        attention_dim (int): 
            Dimension of attention.
        attention_heads (int): 
            The number of heads of multi head attention.
        conv_wshare (int):
            The number of kernel of convolution. Only used in
            self_attention_layer_type == "lightconv*" or "dynamiconv*".
-        conv_kernel_length (Union[int, str]):Kernel size str of convolution
+        conv_kernel_length (Union[int, str]):
            Kernel size str of convolution
            (e.g. 71_71_71_71_71_71). Only used in self_attention_layer_type == "lightconv*" or "dynamiconv*".
-        conv_usebias (bool): Whether to use bias in convolution. Only used in
+        conv_usebias (bool): 
            Whether to use bias in convolution. Only used in
            self_attention_layer_type == "lightconv*" or "dynamiconv*".
-        linear_units(int): The number of units of position-wise feed forward.
+        linear_units(int): 
-        num_blocks (int): The number of decoder blocks.
+            The number of units of position-wise feed forward.
-        dropout_rate (float): Dropout rate.
+        num_blocks (int): 
-        positional_dropout_rate (float): Dropout rate after adding positional encoding.
+            The number of decoder blocks.
-        self_attention_dropout_rate (float): Dropout rate in self-attention.
+        dropout_rate (float): 
-        src_attention_dropout_rate (float): Dropout rate in source-attention.
+            Dropout rate.
-        input_layer (Union[str, nn.Layer]): Input layer type.
+        positional_dropout_rate (float): 
-        use_output_layer (bool): Whether to use output layer.
+            Dropout rate after adding positional encoding.
-        pos_enc_class (nn.Layer): Positional encoding module class.
+        self_attention_dropout_rate (float): 
            Dropout rate in self-attention.
        src_attention_dropout_rate (float): 
            Dropout rate in source-attention.
        input_layer (Union[str, nn.Layer]): 
            Input layer type.
        use_output_layer (bool): 
            Whether to use output layer.
        pos_enc_class (nn.Layer): 
            Positional encoding module class.
            `PositionalEncoding` or `ScaledPositionalEncoding`
-        normalize_before (bool): Whether to use layer_norm before the first block.
+        normalize_before (bool): 
-        concat_after (bool): Whether to concat attention layer's input and output.
+            Whether to use layer_norm before the first block.
        concat_after (bool): 
            Whether to concat attention layer's input and output.
            if True, additional linear will be applied.
            i.e. x -> x + linear(concat(x, att(x)))
            if False, no additional linear will be applied. i.e. x -> x + att(x)
@ -143,17 +161,22 @@ class Decoder(nn.Layer):
    def forward(self, tgt, tgt_mask, memory, memory_mask):
        """Forward decoder.
        Args:
-            tgt(Tensor): Input token ids, int64 (#batch, maxlen_out) if input_layer == "embed".
+            tgt(Tensor): 
                Input token ids, int64 (#batch, maxlen_out) if input_layer == "embed".
                In the other case, input tensor (#batch, maxlen_out, odim).
-            tgt_mask(Tensor): Input token mask (#batch, maxlen_out).
+            tgt_mask(Tensor): 
-            memory(Tensor): Encoded memory, float32 (#batch, maxlen_in, feat).
+                Input token mask (#batch, maxlen_out).
-            memory_mask(Tensor): Encoded memory mask (#batch, maxlen_in).
+            memory(Tensor): 
                Encoded memory, float32 (#batch, maxlen_in, feat).
            memory_mask(Tensor): 
                Encoded memory mask (#batch, maxlen_in).
        Returns:
            Tensor:
                Decoded token score before softmax (#batch, maxlen_out, odim) if use_output_layer is True. 
                In the other case, final block outputs (#batch, maxlen_out, attention_dim).
-            Tensor: Score mask before softmax (#batch, maxlen_out).
+            Tensor: 
                Score mask before softmax (#batch, maxlen_out).
        """
        x = self.embed(tgt)
@ -169,14 +192,20 @@ class Decoder(nn.Layer):
        """Forward one step.
        Args:
-            tgt(Tensor): Input token ids, int64 (#batch, maxlen_out).
+            tgt(Tensor): 
-            tgt_mask(Tensor): Input token mask (#batch, maxlen_out).
+                Input token ids, int64 (#batch, maxlen_out).
-            memory(Tensor): Encoded memory, float32 (#batch, maxlen_in, feat).
+            tgt_mask(Tensor): 
-            cache((List[Tensor]), optional): List of cached tensors. (Default value = None)
+                Input token mask (#batch, maxlen_out).
            memory(Tensor): 
                Encoded memory, float32 (#batch, maxlen_in, feat).
            cache((List[Tensor]), optional): 
                List of cached tensors. (Default value = None)
        Returns:
-            Tensor: Output tensor (batch, maxlen_out, odim).
+            Tensor: 
-            List[Tensor]: List of cache tensors of each decoder layer.
+                Output tensor (batch, maxlen_out, odim).
            List[Tensor]: 
                List of cache tensors of each decoder layer.
        """
        x = self.embed(tgt)
@ -219,9 +248,12 @@ class Decoder(nn.Layer):
        """Score new token batch (required).
        Args:
-            ys(Tensor): paddle.int64 prefix tokens (n_batch, ylen).
+            ys(Tensor): 
-            states(List[Any]): Scorer states for prefix tokens.
+                paddle.int64 prefix tokens (n_batch, ylen).
-            xs(Tensor): The encoder feature that generates ys (n_batch, xlen, n_feat).
+            states(List[Any]): 
                Scorer states for prefix tokens.
            xs(Tensor): 
                The encoder feature that generates ys (n_batch, xlen, n_feat).
        Returns:
            tuple[Tensor, List[Any]]:
--- a/paddlespeech/t2s/modules/transformer/decoder_layer.py
+++ b/paddlespeech/t2s/modules/transformer/decoder_layer.py
@ -24,16 +24,23 @@ class DecoderLayer(nn.Layer):
    Args:
-        size (int): Input dimension.
+        size (int): 
-        self_attn (nn.Layer): Self-attention module instance.
+            Input dimension.
        self_attn (nn.Layer): 
            Self-attention module instance.
            `MultiHeadedAttention` instance can be used as the argument.
-        src_attn (nn.Layer): Self-attention module instance.
+        src_attn (nn.Layer): 
            Self-attention module instance.
            `MultiHeadedAttention` instance can be used as the argument.
-        feed_forward (nn.Layer): Feed-forward module instance.
+        feed_forward (nn.Layer): 
            Feed-forward module instance.
            `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance can be used as the argument.
-        dropout_rate (float): Dropout rate.
+        dropout_rate (float):
-        normalize_before (bool): Whether to use layer_norm before the first block.
+            Dropout rate.
-        concat_after (bool): Whether to concat attention layer's input and output.
+        normalize_before (bool):
            Whether to use layer_norm before the first block.
        concat_after (bool): 
            Whether to concat attention layer's input and output.
            if True, additional linear will be applied.
            i.e. x -> x + linear(concat(x, att(x)))
            if False, no additional linear will be applied. i.e. x -> x + att(x)
@ -69,11 +76,16 @@ class DecoderLayer(nn.Layer):
        """Compute decoded features.
        Args:
-            tgt(Tensor): Input tensor (#batch, maxlen_out, size).
+            tgt(Tensor): 
-            tgt_mask(Tensor): Mask for input tensor (#batch, maxlen_out).
+                Input tensor (#batch, maxlen_out, size).
-            memory(Tensor): Encoded memory, float32 (#batch, maxlen_in, size).
+            tgt_mask(Tensor): 
-            memory_mask(Tensor): Encoded memory mask (#batch, maxlen_in).
+                Mask for input tensor (#batch, maxlen_out).
-            cache(List[Tensor], optional): List of cached tensors.
+            memory(Tensor): 
                Encoded memory, float32 (#batch, maxlen_in, size).
            memory_mask(Tensor): 
                Encoded memory mask (#batch, maxlen_in).
            cache(List[Tensor], optional): 
                List of cached tensors.
                Each tensor shape should be (#batch, maxlen_out - 1, size). (Default value = None)
        Returns:
            Tensor
--- a/paddlespeech/t2s/modules/transformer/embedding.py
+++ b/paddlespeech/t2s/modules/transformer/embedding.py
@ -23,11 +23,16 @@ class PositionalEncoding(nn.Layer):
    """Positional encoding.
    Args:
-        d_model (int):  Embedding dimension.
+        d_model (int):
-        dropout_rate (float): Dropout rate.
+            Embedding dimension.
-        max_len (int): Maximum input length.
+        dropout_rate (float): 
-        reverse (bool): Whether to reverse the input position.
+            Dropout rate.
-        type (str): dtype of param
+        max_len (int): 
            Maximum input length.
        reverse (bool): 
            Whether to reverse the input position.
        type (str): 
            dtype of param
    """
    def __init__(self,
@ -68,7 +73,8 @@ class PositionalEncoding(nn.Layer):
        """Add positional encoding.
        Args:
-            x (Tensor): Input tensor (batch, time, `*`).
+            x (Tensor): 
                Input tensor (batch, time, `*`).
        Returns:
            Tensor: Encoded tensor (batch, time, `*`).
@ -84,10 +90,14 @@ class ScaledPositionalEncoding(PositionalEncoding):
    See Sec. 3.2  https://arxiv.org/abs/1809.08895
    Args:
-        d_model (int): Embedding dimension.
+        d_model (int): 
-        dropout_rate (float): Dropout rate.
+            Embedding dimension.
-        max_len (int): Maximum input length.
+        dropout_rate (float): 
-        dtype (str): dtype of param
+            Dropout rate.
        max_len (int): 
            Maximum input length.
        dtype (str): 
            dtype of param
    """
    def __init__(self, d_model, dropout_rate, max_len=5000, dtype="float32"):
@ -111,7 +121,8 @@ class ScaledPositionalEncoding(PositionalEncoding):
        """Add positional encoding.
        Args:
-            x (Tensor): Input tensor (batch, time, `*`).
+            x (Tensor): 
                Input tensor (batch, time, `*`).
        Returns:
            Tensor: Encoded tensor (batch, time, `*`).
        """
@ -127,9 +138,12 @@ class RelPositionalEncoding(nn.Layer):
    See : Appendix B in https://arxiv.org/abs/1901.02860
    Args:
-        d_model (int): Embedding dimension.
+        d_model (int): 
-        dropout_rate (float): Dropout rate.
+            Embedding dimension.
-        max_len (int): Maximum input length.
+        dropout_rate (float): 
            Dropout rate.
        max_len (int): 
            Maximum input length.
    """
    def __init__(self, d_model, dropout_rate, max_len=5000, dtype="float32"):
@ -175,7 +189,8 @@ class RelPositionalEncoding(nn.Layer):
    def forward(self, x: paddle.Tensor):
        """Add positional encoding.
        Args:
-            x (Tensor):Input tensor (batch, time, `*`).
+            x (Tensor):
                Input tensor (batch, time, `*`).
        Returns:
            Tensor: Encoded tensor (batch, time, `*`).
        """
@ -195,18 +210,24 @@ class LegacyRelPositionalEncoding(PositionalEncoding):
    See : Appendix B in https://arxiv.org/abs/1901.02860
    Args:
-        d_model (int): Embedding dimension.
+        d_model (int): 
-        dropout_rate (float): Dropout rate.
+            Embedding dimension.
-        max_len (int): Maximum input length.
+        dropout_rate (float): 
            Dropout rate.
        max_len (int): 
            Maximum input length.
    """
    def __init__(self, d_model: int, dropout_rate: float, max_len: int=5000):
        """
        Args:
-            d_model (int): Embedding dimension.
+            d_model (int): 
-            dropout_rate (float): Dropout rate.
+                Embedding dimension.
-            max_len (int, optional): [Maximum input length.]. Defaults to 5000.
+            dropout_rate (float): 
                Dropout rate.
            max_len (int, optional): 
                Maximum input length. Defaults to 5000.
        """
        super().__init__(d_model, dropout_rate, max_len, reverse=True)
@ -234,10 +255,13 @@ class LegacyRelPositionalEncoding(PositionalEncoding):
    def forward(self, x: paddle.Tensor):
        """Compute positional encoding.
        Args:
-            x (paddle.Tensor): Input tensor (batch, time, `*`).
+            x (Tensor): 
                Input tensor (batch, time, `*`).
        Returns:
-            paddle.Tensor: Encoded tensor (batch, time, `*`).
+            Tensor: 
-            paddle.Tensor: Positional embedding tensor (1, time, `*`).
+                Encoded tensor (batch, time, `*`).
            Tensor: 
                Positional embedding tensor (1, time, `*`).
        """
        self.extend_pe(x)
        x = x * self.xscale
--- a/paddlespeech/t2s/modules/transformer/encoder.py
+++ b/paddlespeech/t2s/modules/transformer/encoder.py
@ -38,32 +38,55 @@ class BaseEncoder(nn.Layer):
    """Base Encoder module.
    Args:
-        idim (int): Input dimension.
+        idim (int): 
-        attention_dim (int): Dimention of attention.
+            Input dimension.
-        attention_heads (int): The number of heads of multi head attention.
+        attention_dim (int): 
-        linear_units (int): The number of units of position-wise feed forward.
+            Dimension of attention.
-        num_blocks (int): The number of decoder blocks.
+        attention_heads (int): 
-        dropout_rate (float): Dropout rate.
+            The number of heads of multi head attention.
-        positional_dropout_rate (float): Dropout rate after adding positional encoding.
+        linear_units (int): 
-        attention_dropout_rate (float): Dropout rate in attention.
+            The number of units of position-wise feed forward.
-        input_layer (Union[str, nn.Layer]): Input layer type.
+        num_blocks (int): 
-        normalize_before (bool): Whether to use layer_norm before the first block.
+            The number of encoder blocks.
-        concat_after (bool): Whether to concat attention layer's input and output.
+        dropout_rate (float): 
            Dropout rate.
        positional_dropout_rate (float): 
            Dropout rate after adding positional encoding.
        attention_dropout_rate (float): 
            Dropout rate in attention.
        input_layer (Union[str, nn.Layer]): 
            Input layer type.
        normalize_before (bool): 
            Whether to use layer_norm before the first block.
        concat_after (bool): 
            Whether to concat attention layer's input and output.
            if True, additional linear will be applied.
            i.e. x -> x + linear(concat(x, att(x)))
            if False, no additional linear will be applied. i.e. x -> x + att(x)
-        positionwise_layer_type (str): "linear", "conv1d", or "conv1d-linear".
+        positionwise_layer_type (str): 
-        positionwise_conv_kernel_size (int): Kernel size of positionwise conv1d layer.
+            "linear", "conv1d", or "conv1d-linear".
-        macaron_style (bool): Whether to use macaron style for positionwise layer.
+        positionwise_conv_kernel_size (int): 
-        pos_enc_layer_type (str): Encoder positional encoding layer type.
+            Kernel size of positionwise conv1d layer.
-        selfattention_layer_type (str): Encoder attention layer type.
+        macaron_style (bool): 
-        activation_type (str): Encoder activation function type.
+            Whether to use macaron style for positionwise layer.
-        use_cnn_module (bool): Whether to use convolution module.
+        pos_enc_layer_type (str): 
-        zero_triu (bool): Whether to zero the upper triangular part of attention matrix.
+            Encoder positional encoding layer type.
-        cnn_module_kernel (int): Kernerl size of convolution module.
+        selfattention_layer_type (str): 
-        padding_idx (int): Padding idx for input_layer=embed.
+            Encoder attention layer type.
-        stochastic_depth_rate (float): Maximum probability to skip the encoder layer.
+        activation_type (str): 
-        intermediate_layers (Union[List[int], None]): indices of intermediate CTC layer.
+            Encoder activation function type.
        use_cnn_module (bool): 
            Whether to use convolution module.
        zero_triu (bool): 
            Whether to zero the upper triangular part of attention matrix.
        cnn_module_kernel (int): 
            Kernel size of convolution module.
        padding_idx (int): 
            Padding idx for input_layer=embed.
        stochastic_depth_rate (float): 
            Maximum probability to skip the encoder layer.
        intermediate_layers (Union[List[int], None]): 
            indices of intermediate CTC layer.
            indices start from 1.
            if not None, intermediate outputs are returned (which changes return type
            signature.)
@ -266,12 +289,16 @@ class BaseEncoder(nn.Layer):
        """Encode input sequence.
        Args:
-            xs (Tensor): Input tensor (#batch, time, idim).
+            xs (Tensor): 
-            masks (Tensor): Mask tensor (#batch, 1, time).
+                Input tensor (#batch, time, idim).
            masks (Tensor): 
                Mask tensor (#batch, 1, time).
        Returns: 
-            Tensor: Output tensor (#batch, time, attention_dim).
+            Tensor: 
-            Tensor: Mask tensor (#batch, 1, time).
+                Output tensor (#batch, time, attention_dim).
            Tensor: 
                Mask tensor (#batch, 1, time).
        """
        xs = self.embed(xs)
        xs, masks = self.encoders(xs, masks)
@ -284,26 +311,43 @@ class TransformerEncoder(BaseEncoder):
    """Transformer encoder module.
    Args:
-        idim (int): Input dimension.
+        idim (int): 
-        attention_dim (int): Dimention of attention.
+            Input dimension.
-        attention_heads (int): The number of heads of multi head attention.
+        attention_dim (int): 
-        linear_units (int): The number of units of position-wise feed forward.
+            Dimension of attention.
-        num_blocks (int): The number of decoder blocks.
+        attention_heads (int): 
-        dropout_rate (float): Dropout rate.
+            The number of heads of multi head attention.
-        positional_dropout_rate (float): Dropout rate after adding positional encoding.
+        linear_units (int): 
-        attention_dropout_rate (float): Dropout rate in attention.
+            The number of units of position-wise feed forward.
-        input_layer (Union[str, paddle.nn.Layer]): Input layer type.
+        num_blocks (int): 
-        pos_enc_layer_type (str): Encoder positional encoding layer type.
+            The number of encoder blocks.
-        normalize_before (bool): Whether to use layer_norm before the first block.
+        dropout_rate (float): 
-        concat_after (bool): Whether to concat attention layer's input and output.
+            Dropout rate.
        positional_dropout_rate (float): 
            Dropout rate after adding positional encoding.
        attention_dropout_rate (float): 
            Dropout rate in attention.
        input_layer (Union[str, paddle.nn.Layer]): 
            Input layer type.
        pos_enc_layer_type (str): 
            Encoder positional encoding layer type.
        normalize_before (bool): 
            Whether to use layer_norm before the first block.
        concat_after (bool): 
            Whether to concat attention layer's input and output.
            if True, additional linear will be applied.
            i.e. x -> x + linear(concat(x, att(x)))
            if False, no additional linear will be applied. i.e. x -> x + att(x)
-        positionwise_layer_type (str): "linear", "conv1d", or "conv1d-linear".
+        positionwise_layer_type (str): 
-        positionwise_conv_kernel_size (int): Kernel size of positionwise conv1d layer.
+            "linear", "conv1d", or "conv1d-linear".
-        selfattention_layer_type (str): Encoder attention layer type.
+        positionwise_conv_kernel_size (int): 
-        activation_type (str): Encoder activation function type.
+            Kernel size of positionwise conv1d layer.
-        padding_idx (int): Padding idx for input_layer=embed.
+        selfattention_layer_type (str): 
            Encoder attention layer type.
        activation_type (str): 
            Encoder activation function type.
        padding_idx (int): 
            Padding idx for input_layer=embed.
    """
    def __init__(
@ -350,12 +394,16 @@ class TransformerEncoder(BaseEncoder):
        """Encode input sequence.
        Args:
-            xs(Tensor): Input tensor (#batch, time, idim).
+            xs(Tensor): 
-            masks(Tensor): Mask tensor (#batch, 1, time).
+                Input tensor (#batch, time, idim).
            masks(Tensor): 
                Mask tensor (#batch, 1, time).
        Returns:
-            Tensor: Output tensor (#batch, time, attention_dim).
+            Tensor: 
-            Tensor: Mask tensor (#batch, 1, time).
+                Output tensor (#batch, time, attention_dim).
            Tensor: 
                Mask tensor (#batch, 1, time).
        """
        xs = self.embed(xs)
        xs, masks = self.encoders(xs, masks)
@ -367,14 +415,20 @@ class TransformerEncoder(BaseEncoder):
        """Encode input frame.
        Args:
-            xs (Tensor): Input tensor.
+            xs (Tensor): 
-            masks (Tensor): Mask tensor.
+                Input tensor.
-            cache (List[Tensor]): List of cache tensors.
+            masks (Tensor): 
                Mask tensor.
            cache (List[Tensor]): 
                List of cache tensors.
        Returns:
-            Tensor: Output tensor.
+            Tensor:
-            Tensor: Mask tensor.
+                 Output tensor.
-            List[Tensor]: List of new cache tensors.
+            Tensor:
                 Mask tensor.
            List[Tensor]: 
                List of new cache tensors.
        """
        xs = self.embed(xs)
@ -393,32 +447,55 @@ class ConformerEncoder(BaseEncoder):
    """Conformer encoder module.
    Args:
-        idim (int): Input dimension.
+        idim (int): 
-        attention_dim (int): Dimention of attention.
+            Input dimension.
-        attention_heads (int): The number of heads of multi head attention.
+        attention_dim (int): 
-        linear_units (int): The number of units of position-wise feed forward.
+            Dimension of attention.
-        num_blocks (int): The number of decoder blocks.
+        attention_heads (int): 
-        dropout_rate (float): Dropout rate.
+            The number of heads of multi head attention.
-        positional_dropout_rate (float): Dropout rate after adding positional encoding.
+        linear_units (int): 
-        attention_dropout_rate (float): Dropout rate in attention.
+            The number of units of position-wise feed forward.
-        input_layer (Union[str, nn.Layer]): Input layer type.
+        num_blocks (int): 
-        normalize_before (bool): Whether to use layer_norm before the first block.
+            The number of encoder blocks.
-        concat_after (bool):Whether to concat attention layer's input and output.
+        dropout_rate (float): 
            Dropout rate.
        positional_dropout_rate (float): 
            Dropout rate after adding positional encoding.
        attention_dropout_rate (float): 
            Dropout rate in attention.
        input_layer (Union[str, nn.Layer]): 
            Input layer type.
        normalize_before (bool): 
            Whether to use layer_norm before the first block.
        concat_after (bool):
            Whether to concat attention layer's input and output.
            if True, additional linear will be applied.
            i.e. x -> x + linear(concat(x, att(x)))
            if False, no additional linear will be applied. i.e. x -> x + att(x)
-        positionwise_layer_type (str): "linear", "conv1d", or "conv1d-linear".
+        positionwise_layer_type (str): 
-        positionwise_conv_kernel_size (int): Kernel size of positionwise conv1d layer.
+            "linear", "conv1d", or "conv1d-linear".
-        macaron_style (bool): Whether to use macaron style for positionwise layer.
+        positionwise_conv_kernel_size (int): 
-        pos_enc_layer_type (str): Encoder positional encoding layer type.
+            Kernel size of positionwise conv1d layer.
-        selfattention_layer_type (str): Encoder attention layer type.
+        macaron_style (bool): 
-        activation_type (str): Encoder activation function type.
+            Whether to use macaron style for positionwise layer.
-        use_cnn_module (bool): Whether to use convolution module.
+        pos_enc_layer_type (str): 
-        zero_triu (bool): Whether to zero the upper triangular part of attention matrix.
+            Encoder positional encoding layer type.
-        cnn_module_kernel (int): Kernerl size of convolution module.
+        selfattention_layer_type (str): 
-        padding_idx (int): Padding idx for input_layer=embed.
+            Encoder attention layer type.
-        stochastic_depth_rate (float): Maximum probability to skip the encoder layer.
+        activation_type (str): 
-        intermediate_layers (Union[List[int], None]):indices of intermediate CTC layer. indices start from 1.
+            Encoder activation function type.
        use_cnn_module (bool): 
            Whether to use convolution module.
        zero_triu (bool): 
            Whether to zero the upper triangular part of attention matrix.
        cnn_module_kernel (int): 
            Kernel size of convolution module.
        padding_idx (int): 
            Padding idx for input_layer=embed.
        stochastic_depth_rate (float): 
            Maximum probability to skip the encoder layer.
        intermediate_layers (Union[List[int], None]):
            indices of intermediate CTC layer. indices start from 1.
            if not None, intermediate outputs are returned (which changes return type signature.)
    """
@ -478,11 +555,15 @@ class ConformerEncoder(BaseEncoder):
        """Encode input sequence.
        Args:
-            xs (Tensor): Input tensor (#batch, time, idim).
+            xs (Tensor): 
-            masks (Tensor): Mask tensor (#batch, 1, time).
+                Input tensor (#batch, time, idim).
            masks (Tensor): 
                Mask tensor (#batch, 1, time).
        Returns:
-            Tensor: Output tensor (#batch, time, attention_dim).
+            Tensor: 
-            Tensor: Mask tensor (#batch, 1, time).
+                Output tensor (#batch, time, attention_dim).
            Tensor: 
                Mask tensor (#batch, 1, time).
        """
        if isinstance(self.embed, (Conv2dSubsampling)):
            xs, masks = self.embed(xs, masks)
@ -539,7 +620,8 @@ class Conv1dResidualBlock(nn.Layer):
    def forward(self, xs):
        """Encode input sequence.
        Args:
-            xs (Tensor): Input tensor (#batch, idim, T).
+            xs (Tensor): 
                Input tensor (#batch, idim, T).
        Returns:
            Tensor: Output tensor (#batch, odim, T).
        """
@ -582,8 +664,10 @@ class CNNDecoder(nn.Layer):
    def forward(self, xs, masks=None):
        """Encode input sequence.
        Args:
-            xs (Tensor): Input tensor (#batch, time, idim).
+            xs (Tensor): 
-            masks (Tensor): Mask tensor (#batch, 1, time).
+                Input tensor (#batch, time, idim).
            masks (Tensor): 
                Mask tensor (#batch, 1, time).
        Returns:
            Tensor: Output tensor (#batch, time, odim).
        """
@ -629,8 +713,10 @@ class CNNPostnet(nn.Layer):
    def forward(self, xs, masks=None):
        """Encode input sequence.
        Args:
-            xs (Tensor): Input tensor (#batch, odim, time).
+            xs (Tensor): 
-            masks (Tensor): Mask tensor (#batch, 1, time).
+                Input tensor (#batch, odim, time).
            masks (Tensor): 
                Mask tensor (#batch, 1, time).
        Returns:
            Tensor: Output tensor (#batch, odim, time).
        """
--- a/paddlespeech/t2s/modules/transformer/encoder_layer.py
+++ b/paddlespeech/t2s/modules/transformer/encoder_layer.py
@ -21,14 +21,20 @@ class EncoderLayer(nn.Layer):
    """Encoder layer module.
    Args:
-        size (int): Input dimension.
+        size (int): 
-        self_attn (nn.Layer): Self-attention module instance.
+            Input dimension.
        self_attn (nn.Layer): 
            Self-attention module instance.
            `MultiHeadedAttention`  instance can be used as the argument.
-        feed_forward (nn.Layer): Feed-forward module instance.
+        feed_forward (nn.Layer): 
            Feed-forward module instance.
            `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance can be used as the argument.
-        dropout_rate (float): Dropout rate.
+        dropout_rate (float): 
-        normalize_before (bool): Whether to use layer_norm before the first block.
+            Dropout rate.
-        concat_after (bool): Whether to concat attention layer's input and output.
+        normalize_before (bool): 
            Whether to use layer_norm before the first block.
        concat_after (bool): 
            Whether to concat attention layer's input and output.
            if True, additional linear will be applied.
            i.e. x -> x + linear(concat(x, att(x)))
            if False, no additional linear will be applied. i.e. x -> x + att(x)
@ -59,13 +65,18 @@ class EncoderLayer(nn.Layer):
        """Compute encoded features.
        Args:
-            x(Tensor): Input tensor (#batch, time, size).
+            x(Tensor): 
-            mask(Tensor): Mask tensor for the input (#batch, time).
+                Input tensor (#batch, time, size).
-            cache(Tensor, optional): Cache tensor of the input (#batch, time - 1, size). 
+            mask(Tensor): 
                Mask tensor for the input (#batch, time).
            cache(Tensor, optional): 
                Cache tensor of the input (#batch, time - 1, size). 
        Returns:
-            Tensor: Output tensor (#batch, time, size).
+            Tensor: 
-            Tensor: Mask tensor (#batch, time).
+                Output tensor (#batch, time, size).
            Tensor: 
                Mask tensor (#batch, time).
        """
        residual = x
        if self.normalize_before:
--- a/paddlespeech/t2s/modules/transformer/lightconv.py
+++ b/paddlespeech/t2s/modules/transformer/lightconv.py
@ -31,12 +31,18 @@ class LightweightConvolution(nn.Layer):
    https://github.com/pytorch/fairseq/tree/master/fairseq
    Args:
-        wshare (int): the number of kernel of convolution
+        wshare (int): 
-        n_feat (int): the number of features
+            the number of kernel of convolution
-        dropout_rate (float): dropout_rate
+        n_feat (int): 
-        kernel_size (int): kernel size (length)
+            the number of features
-        use_kernel_mask (bool): Use causal mask or not for convolution kernel
+        dropout_rate (float): 
-        use_bias (bool): Use bias term or not.
+            dropout_rate
        kernel_size (int): 
            kernel size (length)
        use_kernel_mask (bool): 
            Use causal mask or not for convolution kernel
        use_bias (bool): 
            Use bias term or not.
    """
@ -94,10 +100,14 @@ class LightweightConvolution(nn.Layer):
        This is just for compatibility with self-attention layer (attention.py)
        Args:
-            query (Tensor): input tensor. (batch, time1, d_model)
+            query (Tensor): 
-            key (Tensor): NOT USED. (batch, time2, d_model)  
+                input tensor. (batch, time1, d_model)
-            value (Tensor): NOT USED. (batch, time2, d_model) 
+            key (Tensor): 
-            mask : (Tensor): (batch, time1, time2) mask
+                NOT USED. (batch, time2, d_model)  
            value (Tensor): 
                NOT USED. (batch, time2, d_model) 
            mask : (Tensor):
                (batch, time1, time2) mask
        Return:
            Tensor: output. (batch, time1, d_model)
--- a/paddlespeech/t2s/modules/transformer/mask.py
+++ b/paddlespeech/t2s/modules/transformer/mask.py
@ -19,8 +19,10 @@ def subsequent_mask(size, dtype=paddle.bool):
    """Create mask for subsequent steps (size, size).
    Args:
-        size (int): size of mask
+        size (int): 
-        dtype (paddle.dtype): result dtype
+            size of mask
        dtype (paddle.dtype): 
            result dtype
    Return:
        Tensor:
            >>> subsequent_mask(3)
@ -36,9 +38,12 @@ def target_mask(ys_in_pad, ignore_id, dtype=paddle.bool):
    """Create mask for decoder self-attention.
    Args:
-        ys_pad (Tensor): batch of padded target sequences (B, Lmax)
+        ys_pad (Tensor): 
-        ignore_id (int): index of padding
+            batch of padded target sequences (B, Lmax)
-        dtype (paddle.dtype): result dtype
+        ignore_id (int): 
            index of padding
        dtype (paddle.dtype): 
            result dtype
    Return: 
        Tensor: (B, Lmax, Lmax)
    """
--- a/paddlespeech/t2s/modules/transformer/multi_layer_conv.py
+++ b/paddlespeech/t2s/modules/transformer/multi_layer_conv.py
@ -32,10 +32,14 @@ class MultiLayeredConv1d(nn.Layer):
        """Initialize MultiLayeredConv1d module.
        Args: 
-            in_chans (int): Number of input channels.
+            in_chans (int): 
-            hidden_chans (int): Number of hidden channels.
+                Number of input channels.
-            kernel_size (int): Kernel size of conv1d.
+            hidden_chans (int): 
-            dropout_rate (float): Dropout rate.
+                Number of hidden channels.
            kernel_size (int): 
                Kernel size of conv1d.
            dropout_rate (float): 
                Dropout rate.
        """
        super().__init__()
@ -58,7 +62,8 @@ class MultiLayeredConv1d(nn.Layer):
        """Calculate forward propagation.
        Args:
-            x (Tensor): Batch of input tensors (B, T, in_chans).
+            x (Tensor): 
                Batch of input tensors (B, T, in_chans).
        Returns: 
            Tensor: Batch of output tensors (B, T, in_chans).
@ -79,10 +84,14 @@ class Conv1dLinear(nn.Layer):
        """Initialize Conv1dLinear module.
        Args:
-            in_chans (int): Number of input channels.
+            in_chans (int): 
-            hidden_chans (int): Number of hidden channels.
+                Number of input channels.
-            kernel_size (int): Kernel size of conv1d.
+            hidden_chans (int): 
-            dropout_rate (float): Dropout rate.
+                Number of hidden channels.
            kernel_size (int): 
                Kernel size of conv1d.
            dropout_rate (float):
                Dropout rate.
        """
        super().__init__()
        self.w_1 = nn.Conv1D(
@ -99,7 +108,8 @@ class Conv1dLinear(nn.Layer):
        """Calculate forward propagation.
        Args:
-            x (Tensor): Batch of input tensors (B, T, in_chans).
+            x (Tensor): 
                Batch of input tensors (B, T, in_chans).
        Returns:
            Tensor: Batch of output tensors (B, T, in_chans).
--- a/paddlespeech/t2s/modules/transformer/positionwise_feed_forward.py
+++ b/paddlespeech/t2s/modules/transformer/positionwise_feed_forward.py
@ -21,9 +21,12 @@ class PositionwiseFeedForward(nn.Layer):
    """Positionwise feed forward layer.
    Args:
-        idim (int): Input dimenstion.
+        idim (int): 
-        hidden_units (int): The number of hidden units.
+            Input dimenstion.
-        dropout_rate (float): Dropout rate.
+        hidden_units (int): 
            The number of hidden units.
        dropout_rate (float): 
            Dropout rate.
    """
    def __init__(self,
--- a/paddlespeech/t2s/modules/transformer/repeat.py
+++ b/paddlespeech/t2s/modules/transformer/repeat.py
@ -30,8 +30,10 @@ def repeat(N, fn):
    """Repeat module N times.
    Args:
-        N (int): Number of repeat time.
+        N (int): 
-        fn (Callable): Function to generate module.
+            Number of repeat time.
        fn (Callable): 
            Function to generate module.
    Returns:
        MultiSequential: Repeated model instance.
--- a/paddlespeech/t2s/modules/transformer/subsampling.py
+++ b/paddlespeech/t2s/modules/transformer/subsampling.py
@ -23,10 +23,14 @@ class Conv2dSubsampling(nn.Layer):
    """Convolutional 2D subsampling (to 1/4 length).
    Args:
-        idim (int): Input dimension.
+        idim (int): 
-        odim (int): Output dimension.
+            Input dimension.
-        dropout_rate (float): Dropout rate.
+        odim (int): 
-        pos_enc (nn.Layer): Custom position encoding layer.
+            Output dimension.
        dropout_rate (float): 
            Dropout rate.
        pos_enc (nn.Layer): 
            Custom position encoding layer.
    """
    def __init__(self, idim, odim, dropout_rate, pos_enc=None):
@ -45,11 +49,15 @@ class Conv2dSubsampling(nn.Layer):
    def forward(self, x, x_mask):
        """Subsample x.
        Args:
-            x (Tensor): Input tensor (#batch, time, idim).
+            x (Tensor): 
-            x_mask (Tensor): Input mask (#batch, 1, time).
+                Input tensor (#batch, time, idim).
            x_mask (Tensor): 
                Input mask (#batch, 1, time).
        Returns:
-            Tensor: Subsampled tensor (#batch, time', odim), where time' = time // 4.
+            Tensor: 
-            Tensor: Subsampled mask (#batch, 1, time'), where time' = time // 4.
+                Subsampled tensor (#batch, time', odim), where time' = time // 4.
            Tensor: 
                Subsampled mask (#batch, 1, time'), where time' = time // 4.
        """
        # (b, c, t, f)
        x = x.unsqueeze(1)
--- a/paddlespeech/t2s/modules/upsample.py
+++ b/paddlespeech/t2s/modules/upsample.py
@ -28,9 +28,12 @@ class Stretch2D(nn.Layer):
        """Stretch an image (or image-like object) with some interpolation.
        Args:
-            w_scale (int): Scalar of width.
+            w_scale (int): 
-            h_scale (int): Scalar of the height.
+                Scalar of width.
-            mode (str, optional): Interpolation mode, modes suppored are "nearest", "bilinear", 
+            h_scale (int): 
                Scalar of the height.
            mode (str, optional): 
                Interpolation mode, modes supported are "nearest", "bilinear", 
                "trilinear", "bicubic", "linear" and "area", by default "nearest"
        For more details about interpolation, see 
            `paddle.nn.functional.interpolate <https://www.paddlepaddle.org.cn/documentation/docs/en/api/paddle/nn/functional/interpolate_en.html>`_.
@ -44,11 +47,12 @@ class Stretch2D(nn.Layer):
        """
        Args: 
-            x (Tensor): Shape (N, C, H, W)
+            x (Tensor): 
                Shape (N, C, H, W)
        Returns:
-            Tensor: The stretched image.
+            Tensor: 
-                Shape (N, C, H', W'), where ``H'=h_scale * H``, ``W'=w_scale * W``.
+                The stretched image. Shape (N, C, H', W'), where ``H'=h_scale * H``, ``W'=w_scale * W``.
        """
        out = F.interpolate(
@ -61,12 +65,18 @@ class UpsampleNet(nn.Layer):
    convolutions.
    Args:
-        upsample_scales (List[int]): Upsampling factors for each strech.
+        upsample_scales (List[int]): 
-        nonlinear_activation (Optional[str], optional): Activation after each convolution, by default None
+            Upsampling factors for each strech.
-        nonlinear_activation_params (Dict[str, Any], optional): Parameters passed to construct the activation, by default {}
+        nonlinear_activation (Optional[str], optional): 
-        interpolate_mode (str, optional): Interpolation mode of the strech, by default "nearest"
+            Activation after each convolution, by default None
-        freq_axis_kernel_size (int, optional): Convolution kernel size along the frequency axis, by default 1
+        nonlinear_activation_params (Dict[str, Any], optional): 
-        use_causal_conv (bool, optional): Whether to use causal padding before convolution, by default False
+            Parameters passed to construct the activation, by default {}
        interpolate_mode (str, optional): 
            Interpolation mode of the stretch, by default "nearest"
        freq_axis_kernel_size (int, optional): 
            Convolution kernel size along the frequency axis, by default 1
        use_causal_conv (bool, optional): 
            Whether to use causal padding before convolution, by default False
            If True, Causal padding is used along the time axis, 
            i.e. padding amount is ``receptive field - 1`` and 0 for before and after, respectively.
            If False, "same" padding is used along the time axis.
@ -106,7 +116,8 @@ class UpsampleNet(nn.Layer):
    def forward(self, c):
        """
        Args:
-            c (Tensor): spectrogram. Shape (N, F, T)
+            c (Tensor): 
                spectrogram. Shape (N, F, T)
        Returns: 
            Tensor: upsampled spectrogram.
@ -126,17 +137,25 @@ class ConvInUpsampleNet(nn.Layer):
    UpsampleNet.
    Args:
-        upsample_scales (List[int]): Upsampling factors for each strech.
+        upsample_scales (List[int]): 
-        nonlinear_activation (Optional[str], optional): Activation after each convolution, by default None
+            Upsampling factors for each strech.
-        nonlinear_activation_params (Dict[str, Any], optional): Parameters passed to construct the activation, by default {}
+        nonlinear_activation (Optional[str], optional): 
-        interpolate_mode (str, optional): Interpolation mode of the strech, by default "nearest"
+            Activation after each convolution, by default None
-        freq_axis_kernel_size (int, optional): Convolution kernel size along the frequency axis, by default 1
+        nonlinear_activation_params (Dict[str, Any], optional): 
-        aux_channels (int, optional): Feature size of the input, by default 80
+            Parameters passed to construct the activation, by default {}
-        aux_context_window (int, optional): Context window of the first 1D convolution applied to the input. It 
+        interpolate_mode (str, optional): 
            Interpolation mode of the stretch, by default "nearest"
        freq_axis_kernel_size (int, optional): 
            Convolution kernel size along the frequency axis, by default 1
        aux_channels (int, optional): 
            Feature size of the input, by default 80
        aux_context_window (int, optional): 
            Context window of the first 1D convolution applied to the input. It is
            related to the kernel size of the convolution, by default 0
            If use causal convolution, the kernel size is ``window + 1``, 
            else the kernel size is ``2 * window + 1``.
-        use_causal_conv (bool, optional): Whether to use causal padding before convolution, by default False
+        use_causal_conv (bool, optional):
            Whether to use causal padding before convolution, by default False
            If True, Causal padding is used along the time axis, i.e. padding 
            amount is ``receptive field - 1`` and 0 for before and after, respectively.
            If False, "same" padding is used along the time axis.
@ -171,7 +190,8 @@ class ConvInUpsampleNet(nn.Layer):
    def forward(self, c):
        """
        Args:
-            c (Tensor): spectrogram. Shape (N, F, T)
+            c (Tensor): 
                spectrogram. Shape (N, F, T)
        Returns:
            Tensors: upsampled spectrogram. Shape (N, F, T'), where ``T' = upsample_factor * T``, 
--- a/paddlespeech/t2s/training/experiment.py
+++ b/paddlespeech/t2s/training/experiment.py
@ -58,8 +58,10 @@ class ExperimentBase(object):
    need.
    Args:
-        config (yacs.config.CfgNode): The configuration used for the experiment.
+        config (yacs.config.CfgNode): 
-        args (argparse.Namespace): The parsed command line arguments.
+            The configuration used for the experiment.
        args (argparse.Namespace): 
            The parsed command line arguments.
    Examples:
        >>> def main_sp(config, args):
--- a/paddlespeech/t2s/utils/checkpoint.py
+++ b/paddlespeech/t2s/utils/checkpoint.py
@ -25,7 +25,8 @@ def _load_latest_checkpoint(checkpoint_dir: str) -> int:
    """Get the iteration number corresponding to the latest saved checkpoint.
    Args:
-        checkpoint_dir (str): the directory where checkpoint is saved.
+        checkpoint_dir (str):
            the directory where checkpoint is saved.
    Returns:
        int: the latest iteration number.
@ -46,8 +47,10 @@ def _save_checkpoint(checkpoint_dir: str, iteration: int):
    """Save the iteration number of the latest model to be checkpointed.
    Args:
-        checkpoint_dir (str): the directory where checkpoint is saved.
+        checkpoint_dir (str): 
-        iteration (int): the latest iteration number.
+            the directory where checkpoint is saved.
        iteration (int): 
            the latest iteration number.
    Returns:
        None
@ -65,11 +68,14 @@ def load_parameters(model,
    """Load a specific model checkpoint from disk.
    Args:
-        model (Layer): model to load parameters.
+        model (Layer): 
-        optimizer (Optimizer, optional): optimizer to load states if needed.
+            model to load parameters.
-            Defaults to None.
+        optimizer (Optimizer, optional): 
-        checkpoint_dir (str, optional): the directory where checkpoint is saved.
+            optimizer to load states if needed. Defaults to None.
-        checkpoint_path (str, optional): if specified, load the checkpoint
+        checkpoint_dir (str, optional): 
            the directory where checkpoint is saved.
        checkpoint_path (str, optional): 
            if specified, load the checkpoint
            stored in the checkpoint_path and the argument 'checkpoint_dir' will
            be ignored. Defaults to None.
@ -113,11 +119,14 @@ def save_parameters(checkpoint_dir, iteration, model, optimizer=None):
    """Checkpoint the latest trained model parameters.
    Args:
-        checkpoint_dir (str): the directory where checkpoint is saved.
+        checkpoint_dir (str): 
-        iteration (int): the latest iteration number.
+            the directory where checkpoint is saved.
-        model (Layer): model to be checkpointed.
+        iteration (int): 
-        optimizer (Optimizer, optional): optimizer to be checkpointed.
+            the latest iteration number.
-            Defaults to None.
+        model (Layer): 
            model to be checkpointed.
        optimizer (Optimizer, optional): 
            optimizer to be checkpointed. Defaults to None.
    Returns:
        None
--- a/paddlespeech/t2s/utils/error_rate.py
+++ b/paddlespeech/t2s/utils/error_rate.py
@ -71,10 +71,14 @@ def word_errors(reference, hypothesis, ignore_case=False, delimiter=' '):
    hypothesis sequence in word-level.
    Args:
-        reference (str): The reference sentence.
+        reference (str): 
-        hypothesis (str): The hypothesis sentence.
+            The reference sentence.
-        ignore_case (bool): Whether case-sensitive or not.
+        hypothesis (str): 
-        delimiter (char(str)): Delimiter of input sentences.
+            The hypothesis sentence.
        ignore_case (bool): 
            Whether to ignore case when comparing the sentences.
        delimiter (char(str)): 
            Delimiter of input sentences.
    Returns:
        list: Levenshtein distance and word number of reference sentence.
--- a/paddlespeech/t2s/utils/h5_utils.py
+++ b/paddlespeech/t2s/utils/h5_utils.py
@ -24,8 +24,10 @@ import numpy as np
 def read_hdf5(filename: Union[Path, str], dataset_name: str) -> Any:
    """Read a dataset from a HDF5 file.
    Args:
-        filename (Union[Path, str]): Path of the HDF5 file.
+        filename (Union[Path, str]): 
-        dataset_name (str): Name of the dataset to read.
+            Path of the HDF5 file.
        dataset_name (str): 
            Name of the dataset to read.
    Returns:
        Any: The retrieved dataset.
--- a/paddlespeech/t2s/utils/internals.py
+++ b/paddlespeech/t2s/utils/internals.py
@ -22,7 +22,8 @@ def convert_dtype_to_np_dtype_(dtype):
    Convert paddle's data type to corresponding numpy data type.
    Args:
-        dtype(np.dtype): the data type in paddle.
+        dtype(np.dtype): 
            the data type in paddle.
    Returns:
        type: the data type in numpy.
--- a/setup.py
+++ b/setup.py
@ -76,7 +76,7 @@ server = [
    "fastapi",
    "uvicorn",
    "pattern_singleton",
-    "websockets",
+    "websockets" 
 ]
 requirements = {