@@ -30,20 +30,14 @@ __all__ = ["WaveFlow", "ConditionalWaveFlow", "WaveFlowLoss"]
def fold(x, n_group):
r"""Fold audio or spectrogram's temporal dimension into groups.
"""Fold audio or spectrogram's temporal dimension into groups.
Parameters
----------
x : Tensor [shape=(\*, time_steps)]
The input tensor.
Args:
x(Tensor): The input tensor. shape=(\*, time_steps)
n_group(int): The size of a group.
n_group : int
The size of a group.
Returns
---------
Tensor : [shape=(\*, time_steps // n_group, n_group)]
Folded tensor.
Returns:
Tensor: Folded tensor. shape=(\*, time_steps // n_group, n_group)
"""
spatial_shape = list(x.shape[:-1])
time_steps = paddle.shape(x)[-1]
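# Shape sketch (hedged): fold() groups the last (time) axis, turning
# (batch, time_steps) into (batch, time_steps // n_group, n_group); the
# reshape below mirrors the snippet above with illustrative sizes.
import paddle

x = paddle.randn([4, 6400])                      # (batch, time_steps)
folded = paddle.reshape(x, [4, 6400 // 16, 16])  # n_group = 16
print(folded.shape)                              # [4, 400, 16]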
@@ -58,27 +52,23 @@ class UpsampleNet(nn.LayerList):
It consists of several Conv2DTranspose layers which perform deconvolution
on the mel and time dimensions.
Parameters
----------
upscale_factors : List[int], optional
Time upsampling factors for each Conv2DTranspose Layer.
The ``UpsampleNet`` contains ``len(upscale_factor)`` Conv2DTranspose
Layers. Each upscale_factor is used as the ``stride`` for the
corresponding Conv2DTranspose. Defaults to [16, 16], so the default
upsampling factor is 256.
Args:
upscale_factors(List[int], optional): Time upsampling factors for each Conv2DTranspose Layer.
The ``UpsampleNet`` contains ``len(upscale_factor)`` Conv2DTranspose
Layers. Each upscale_factor is used as the ``stride`` for the
corresponding Conv2DTranspose. Defaults to [16, 16], so the default
upsampling factor is 256.
Notes
------
``np.prod(upscale_factors)`` should equal the ``hop_length`` of the stft
transformation used to extract spectrogram features from audio.
Notes:
``np.prod(upscale_factors)`` should equal the ``hop_length`` of the stft
transformation used to extract spectrogram features from audio.
For example, ``16 * 16 = 256``, then the spectrogram extracted with a stft
transformation whose ``hop_length`` equals 256 is suitable.
For example, ``16 * 16 = 256``, then the spectrogram extracted with a stft
transformation whose ``hop_length`` equals 256 is suitable.
See Also
---------
``librosa.core.stft``
See Also
``librosa.core.stft``
"""
def __init__(self, upsample_factors):
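# Consistency check sketch (hedged): the product of the upsampling factors
# is the total time upsampling, which should equal the STFT hop length
# used when the mel spectrograms were extracted.
import numpy as np

upscale_factors = [16, 16]   # the default factors mentioned above
hop_length = 256             # hop length assumed for feature extraction
assert int(np.prod(upscale_factors)) == hop_length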
@@ -101,25 +91,18 @@ class UpsampleNet(nn.LayerList):
self.upsample_factors = upsample_factors
def forward(self, x, trim_conv_artifact=False):
r"""Forward pass of the ``UpsampleNet``.
"""Forward pass of the ``UpsampleNet``.
Parameters
-----------
x : Tensor [shape=(batch_size, input_channels, time_steps)]
The input spectrogram.
Args:
x(Tensor): The input spectrogram. shape=(batch_size, input_channels, time_steps)
trim_conv_artifact(bool, optional): Trim deconvolution artifact at each layer. Defaults to False.
trim_conv_artifact : bool, optional
Trim deconvolution artifact at each layer. Defaults to False.
Returns:
Tensor: The upsampled spectrogram. shape=(batch_size, input_channels, time_steps \* upsample_factor)
Returns
--------
Tensor: [shape=(batch_size, input_channels, time_steps \* upsample_factor)]
The upsampled spectrogram.
Notes
--------
If trim_conv_artifact is ``True``, the number of output time steps is less
than ``time_steps \* upsample_factors``.
Notes:
If trim_conv_artifact is ``True``, the number of output time steps is less
than ``time_steps \* upsample_factors``.
"""
x = paddle.unsqueeze(x, 1) # (B, C, T) -> (B, 1, C, T)
for layer in self:
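# Usage sketch (hedged): UpsampleNet is the class defined in this module;
# with the default [16, 16] factors the time axis grows by a factor of 256.
import paddle

net = UpsampleNet([16, 16])
mel = paddle.randn([1, 80, 10])   # (batch, mel_bands, frames)
out = net(mel)                    # roughly (1, 80, 10 * 256)
print(out.shape)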
@@ -139,19 +122,11 @@ class ResidualBlock(nn.Layer):
same padding in width dimension. It also has projection for the condition
and output.
Parameters
----------
channels : int
Feature size of the input.
cond_channels : int
Feature size of the condition.
kernel_size : Tuple[int]
Kernel size of the Convolution2d applied to the input.
dilations : int
Dilations of the Convolution2d applied to the input.
Args:
channels (int): Feature size of the input.
cond_channels (int): Feature size of the condition.
kernel_size (Tuple[int]): Kernel size of the Convolution2d applied to the input.
dilations (int): Dilations of the Convolution2d applied to the input.
"""
def __init__(self, channels, cond_channels, kernel_size, dilations):
@@ -197,21 +172,13 @@ class ResidualBlock(nn.Layer):
def forward(self, x, condition):
"""Compute output for a whole folded sequence.
Parameters
----------
x : Tensor [shape=(batch_size, channel, height, width)]
The input.
condition : Tensor [shape=(batch_size, condition_channel, height, width)]
The local condition.
Args:
x (Tensor): The input. shape=(batch_size, channel, height, width)
condition (Tensor): The local condition. shape=(batch_size, condition_channel, height, width)
Returns
-------
res : Tensor [shape=(batch_size, channel, height, width)]
The residual output.
skip : Tensor [shape=(batch_size, channel, height, width)]
The skip output.
Returns:
res (Tensor): The residual output. shape=(batch_size, channel, height, width)
skip (Tensor): The skip output. shape=(batch_size, channel, height, width)
"""
x_in = x
x = self.conv(x)
@@ -248,21 +215,14 @@ class ResidualBlock(nn.Layer):
def add_input(self, x_row, condition_row):
"""Compute the output for a row and update the buffer.
Parameters
----------
x_row : Tensor [shape=(batch_size, channel, 1, width)]
A row of the input.
condition_row : Tensor [shape=(batch_size, condition_channel, 1, width)]
A row of the condition.
Args:
x_row (Tensor): A row of the input. shape=(batch_size, channel, 1, width)
condition_row (Tensor): A row of the condition. shape=(batch_size, condition_channel, 1, width)
Returns
-------
res : Tensor [shape=(batch_size, channel, 1, width)]
A row of the residual output.
Returns:
res (Tensor): A row of the residual output. shape=(batch_size, channel, 1, width)
skip (Tensor): A row of the skip output. shape=(batch_size, channel, 1, width)
skip : Tensor [shape=(batch_size, channel, 1, width)]
A row of the skip output.
"""
x_row_in = x_row
if len(paddle.shape(self._conv_buffer)) == 1:
@@ -297,27 +257,15 @@ class ResidualBlock(nn.Layer):
class ResidualNet(nn.LayerList):
"""A stack of several ResidualBlocks. It merges condition at each layer.
Parameters
----------
n_layer : int
Number of ResidualBlocks in the ResidualNet.
residual_channels : int
Feature size of each ResidualBlock.
condition_channels : int
Feature size of the condition.
Args:
n_layer (int): Number of ResidualBlocks in the ResidualNet.
residual_channels (int): Feature size of each ResidualBlock.
condition_channels (int): Feature size of the condition.
kernel_size (Tuple[int]): Kernel size of each ResidualBlock.
dilations_h (List[int]): Dilation in height dimension of every ResidualBlock.
kernel_size : Tuple[int]
Kernel size of each ResidualBlock.
dilations_h : List[int]
Dilation in height dimension of every ResidualBlock.
Raises
------
ValueError
If the length of dilations_h does not equal n_layer.
Raises:
ValueError: If the length of dilations_h does not equal n_layer.
"""
def __init__(self,
@@ -339,18 +287,13 @@ class ResidualNet(nn.LayerList):
def forward(self, x, condition):
"""Compute the output given the input and the condition.
Parameters
-----------
x : Tensor [shape=(batch_size, channel, height, width)]
The input.
condition : Tensor [shape=(batch_size, condition_channel, height, width)]
The local condition.
Returns
--------
Tensor : [shape=(batch_size, channel, height, width)]
The output, which is an aggregation of all the skip outputs.
Args:
x (Tensor): The input. shape=(batch_size, channel, height, width)
condition (Tensor): The local condition. shape=(batch_size, condition_channel, height, width)
Returns:
Tensor: The output, which is an aggregation of all the skip outputs. shape=(batch_size, channel, height, width)
"""
skip_connections = []
for layer in self:
@@ -368,21 +311,14 @@ class ResidualNet(nn.LayerList):
def add_input(self, x_row, condition_row):
"""Compute the output for a row and update the buffers.
Parameters
----------
x_row : Tensor [shape=(batch_size, channel, 1, width)]
A row of the input.
condition_row : Tensor [shape=(batch_size, condition_channel, 1, width)]
A row of the condition.
Returns
-------
res : Tensor [shape=(batch_size, channel, 1, width)]
A row of the residual output.
skip : Tensor [shape=(batch_size, channel, 1, width)]
A row of the skip output.
Args:
x_row (Tensor): A row of the input. shape=(batch_size, channel, 1, width)
condition_row (Tensor): A row of the condition. shape=(batch_size, condition_channel, 1, width)
Returns:
res (Tensor): A row of the residual output. shape=(batch_size, channel, 1, width)
skip (Tensor): A row of the skip output. shape=(batch_size, channel, 1, width)
"""
skip_connections = []
for layer in self:
@@ -400,22 +336,12 @@ class Flow(nn.Layer):
probability density estimation. The ``inverse`` method implements the
sampling.
Parameters
----------
n_layers : int
Number of ResidualBlocks in the Flow.
channels : int
Feature size of the ResidualBlocks.
mel_bands : int
Feature size of the mel spectrogram (mel bands).
kernel_size : Tuple[int]
Kernel size of each ResidualBlock in the Flow.
n_group : int
Number of timesteps to be folded into a group.
Args:
n_layers (int): Number of ResidualBlocks in the Flow.
channels (int): Feature size of the ResidualBlocks.
mel_bands (int): Feature size of the mel spectrogram (mel bands).
kernel_size (Tuple[int]): Kernel size of each ResidualBlock in the Flow.
n_group (int): Number of timesteps to be folded into a group.
"""
dilations_dict = {
8: [1, 1, 1, 1, 1, 1, 1, 1],
@@ -466,26 +392,16 @@ class Flow(nn.Layer):
"""Probability density estimation. It is done by inversely transforming
a sample from p(X) into a sample from p(Z).
Parameters
-----------
x : Tensor [shape=(batch, 1, height, width)]
An input sample of the distribution p(X).
condition : Tensor [shape=(batch, condition_channel, height, width)]
The local condition.
Returns
--------
z (Tensor): shape(batch, 1, height, width), the transformed sample.
Tuple[Tensor, Tensor]
The parameters of the transformation.
logs (Tensor): shape(batch, 1, height - 1, width), the log scale
of the transformation from x to z.
b (Tensor): shape(batch, 1, height - 1, width), the shift of the
transformation from x to z.
Args:
x (Tensor): An input sample of the distribution p(X). shape=(batch, 1, height, width)
condition (Tensor): The local condition. shape=(batch, condition_channel, height, width)
Returns:
z (Tensor): shape(batch, 1, height, width), the transformed sample.
Tuple[Tensor, Tensor]:
The parameters of the transformation.
logs (Tensor): shape(batch, 1, height - 1, width), the log scale of the transformation from x to z.
b (Tensor): shape(batch, 1, height - 1, width), the shift of the transformation from x to z.
"""
# (B, C, H-1, W)
logs, b = self._predict_parameters(x[:, :, :-1, :],
@@ -516,27 +432,12 @@ class Flow(nn.Layer):
"""Sampling from the distribution p(X). It is done by sampling from
p(Z) and transforming the sample. It is an autoregressive transformation.
Parameters
-----------
z : Tensor [shape=(batch, 1, height, width)]
A sample of the distribution p(Z).
condition : Tensor [shape=(batch, condition_channel, height, width)]
The local condition.
Returns
---------
x : Tensor [shape=(batch, 1, height, width)]
The transformed sample.
Tuple[Tensor, Tensor]
The parameters of the transformation.
logs (Tensor): shape(batch, 1, height - 1, width), the log scale
of the transformation from x to z.
b (Tensor): shape(batch, 1, height - 1, width), the shift of the
transformation from x to z.
Args:
z(Tensor): A sample of the distribution p(Z). shape=(batch, 1, height, width)
condition(Tensor): The local condition. shape=(batch, condition_channel, height, width)
Returns:
Tensor:
The transformed sample. shape=(batch, 1, height, width)
"""
z_0 = z[:, :, :1, :]
x = paddle.zeros_like(z)
@@ -560,25 +461,13 @@ class WaveFlow(nn.LayerList):
"""A deep reversible layer that is composed of several autoregressive
flows.
Parameters
-----------
n_flows : int
Number of flows in the WaveFlow model.
n_layers : int
Number of ResidualBlocks in each Flow.
n_group : int
Number of timesteps to fold as a group.
channels : int
Feature size of each ResidualBlock.
mel_bands : int
Feature size of mel spectrogram (mel bands).
kernel_size : Union[int, List[int]]
Kernel size of the convolution layer in each ResidualBlock.
Args:
n_flows (int): Number of flows in the WaveFlow model.
n_layers (int): Number of ResidualBlocks in each Flow.
n_group (int): Number of timesteps to fold as a group.
channels (int): Feature size of each ResidualBlock.
mel_bands (int): Feature size of mel spectrogram (mel bands).
kernel_size (Union[int, List[int]]): Kernel size of the convolution layer in each ResidualBlock.
"""
def __init__(self, n_flows, n_layers, n_group, channels, mel_bands,
@@ -628,22 +517,13 @@ class WaveFlow(nn.LayerList):
"""Probability density estimation of random variable x given the
condition.
Parameters
-----------
x : Tensor [shape=(batch_size, time_steps)]
The audio.
condition : Tensor [shape=(batch_size, condition_channel, time_steps)]
The local condition (mel spectrogram here).
Returns
--------
z : Tensor [shape=(batch_size, time_steps)]
The transformed random variable.
log_det_jacobian: Tensor [shape=(1,)]
The log determinant of the jacobian of the transformation from x
to z.
Args:
x (Tensor): The audio. shape=(batch_size, time_steps)
condition (Tensor): The local condition (mel spectrogram here). shape=(batch_size, condition_channel, time_steps)
Returns:
Tensor: The transformed random variable. shape=(batch_size, time_steps)
Tensor: The log determinant of the jacobian of the transformation from x to z. shape=(1,)
"""
# x: (B, T)
# condition: (B, C, T) upsampled condition
@@ -678,18 +558,13 @@ class WaveFlow(nn.LayerList):
Each Flow transforms :math:`z_{i-1}` to :math:`z_{i}` in an
autoregressive manner.
Parameters
----------
z : Tensor [shape=(batch, 1, time_steps)]
A sample of the distribution p(Z).
condition : Tensor [shape=(batch, condition_channel, time_steps)]
The local condition.
Args:
z (Tensor): A sample of the distribution p(Z). shape=(batch, 1, time_steps)
condition (Tensor): The local condition. shape=(batch, condition_channel, time_steps)
Returns
--------
x : Tensor [shape=(batch_size, time_steps)]
The transformed sample (audio here).
Returns:
Tensor: The transformed sample (audio here). shape=(batch_size, time_steps)
"""
z, condition = self._trim(z, condition)
@@ -714,29 +589,15 @@ class WaveFlow(nn.LayerList):
class ConditionalWaveFlow(nn.LayerList):
"""ConditionalWaveFlow, an UpsampleNet with a WaveFlow model.
Parameters
----------
upsample_factors : List[int]
Upsample factors for the upsample net.
n_flows : int
Number of flows in the WaveFlow model.
n_layers : int
Number of ResidualBlocks in each Flow.
n_group : int
Number of timesteps to fold as a group.
channels : int
Feature size of each ResidualBlock.
n_mels : int
Feature size of mel spectrogram (mel bands).
kernel_size : Union[int, List[int]]
Kernel size of the convolution layer in each ResidualBlock.
"""
Args:
upsample_factors (List[int]): Upsample factors for the upsample net.
n_flows (int): Number of flows in the WaveFlow model.
n_layers (int): Number of ResidualBlocks in each Flow.
n_group (int): Number of timesteps to fold as a group.
channels (int): Feature size of each ResidualBlock.
n_mels (int): Feature size of mel spectrogram (mel bands).
kernel_size (Union[int, List[int]]): Kernel size of the convolution layer in each ResidualBlock.
"""
def __init__(self,
upsample_factors: List[int],
@@ -760,22 +621,13 @@ class ConditionalWaveFlow(nn.LayerList):
"""Compute the transformed random variable z (x to z) and the log of
the determinant of the jacobian of the transformation from x to z.
Parameters
----------
audio : Tensor [shape=(B, T)]
The audio.
Args:
audio(Tensor): The audio. shape=(B, T)
mel(Tensor): The mel spectrogram. shape=(B, C_mel, T_mel)
mel : Tensor [shape=(B, C_mel, T_mel)]
The mel spectrogram.
Returns
-------
z : Tensor [shape=(B, T)]
The inversely transformed random variable z (x to z)
log_det_jacobian: Tensor [shape=(1,)]
the log of the determinant of the jacobian of the transformation
from x to z.
Returns:
Tensor: The inversely transformed random variable z (x to z). shape=(B, T)
Tensor: The log of the determinant of the jacobian of the transformation from x to z. shape=(1,)
"""
condition = self.encoder(mel)
z, log_det_jacobian = self.decoder(audio, condition)
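# Training-step sketch (hedged): hyper-parameters and shapes below are
# illustrative only; ConditionalWaveFlow and WaveFlowLoss are the classes
# defined in this module.
import paddle

model = ConditionalWaveFlow(upsample_factors=[16, 16], n_flows=8, n_layers=8,
                            n_group=16, channels=64, n_mels=80, kernel_size=[3, 3])
criterion = WaveFlowLoss(sigma=1.0)

audio = paddle.randn([2, 25600])          # (B, T)
mel = paddle.randn([2, 80, 100])          # (B, C_mel, T_mel), T == T_mel * 256
z, log_det_jacobian = model(audio, mel)   # z: (B, T), log_det_jacobian: (1,)
loss = criterion(z, log_det_jacobian)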
@@ -783,17 +635,13 @@ class ConditionalWaveFlow(nn.LayerList):
@paddle.no_grad()
def infer(self, mel):
r"""Generate raw audio given mel spectrogram.
"""Generate raw audio given mel spectrogram.
Parameters
----------
mel : Tensor [shape=(B, C_mel, T_mel)]
Mel spectrogram (in log-magnitude).
Args:
mel(Tensor): Mel spectrogram (in log-magnitude). shape=(B, C_mel, T_mel)
Returns
-------
Tensor : [shape=(B, T)]
The synthesized audio, where ``T <= T_mel \* upsample_factors``.
Returns:
Tensor: The synthesized audio, where ``T <= T_mel \* upsample_factors``. shape=(B, T)
"""
start = time.time()
condition = self.encoder(mel, trim_conv_artifact=True) # (B, C, T)
@@ -808,15 +656,11 @@ class ConditionalWaveFlow(nn.LayerList):
def predict(self, mel):
"""Generate raw audio given mel spectrogram.
Parameters
----------
mel : np.ndarray [shape=(C_mel, T_mel)]
Mel spectrogram of an utterance (in log-magnitude).
Args:
mel(np.ndarray): Mel spectrogram of an utterance (in log-magnitude). shape=(C_mel, T_mel)
Returns
-------
np.ndarray [shape=(T,)]
The synthesized audio.
Returns:
np.ndarray: The synthesized audio. shape=(T,)
"""
mel = paddle.to_tensor(mel)
mel = paddle.unsqueeze(mel, 0)
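# Inference sketch (hedged): reusing a trained ``model`` such as the one from
# the training sketch above; the random array stands in for a real
# log-magnitude mel spectrogram.
import numpy as np

mel = np.random.randn(80, 100).astype("float32")   # (C_mel, T_mel)
audio = model.predict(mel)                          # np.ndarray, shape (T,)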
@@ -828,18 +672,12 @@ class ConditionalWaveFlow(nn.LayerList):
def from_pretrained(cls, config, checkpoint_path):
"""Build a ConditionalWaveFlow model from a pretrained model.
Parameters
----------
config: yacs.config.CfgNode
model configs
Args:
config(yacs.config.CfgNode): model configs
checkpoint_path(Path or str): the path of pretrained model checkpoint, without extension name
checkpoint_path: Path or str
the path of pretrained model checkpoint, without extension name
Returns
-------
ConditionalWaveFlow
The model built from pretrained result.
Returns:
ConditionalWaveFlow: The model built from the pretrained result.
"""
model = cls(upsample_factors=config.model.upsample_factors,
n_flows=config.model.n_flows,
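# Checkpoint-loading sketch (hedged): the yaml file and checkpoint prefix are
# placeholders, not paths shipped with the repository.
from yacs.config import CfgNode

config = CfgNode(new_allowed=True)
config.merge_from_file("waveflow_config.yaml")   # hypothetical config file
model = ConditionalWaveFlow.from_pretrained(config, "checkpoints/step-2000000")
model.eval()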
@@ -855,11 +693,9 @@ class ConditionalWaveFlow(nn.LayerList):
class WaveFlowLoss(nn.Layer):
"""Criterion of a WaveFlow model.
Parameters
----------
sigma : float
The standard deviation of the gaussian noise used in WaveFlow, by
default 1.0.
Args:
sigma (float): The standard deviation of the gaussian noise used in WaveFlow,
by default 1.0.
"""
def __init__(self, sigma=1.0):
@@ -871,19 +707,13 @@ class WaveFlowLoss(nn.Layer):
"""Compute the loss given the transformed random variable z and the
log_det_jacobian of the transformation from x to z.
Parameters
----------
z : Tensor [shape=(B, T)]
The transformed random variable (x to z).
log_det_jacobian : Tensor [shape=(1,)]
The log of the determinant of the jacobian matrix of the
transformation from x to z.
Args:
z(Tensor): The transformed random variable (x to z). shape=(B, T)
log_det_jacobian(Tensor): The log of the determinant of the jacobian matrix of the
transformation from x to z. shape=(1,)
Returns
-------
Tensor [shape=(1,)]
The loss.
Returns:
Tensor: The loss. shape=(1,)
"""
loss = paddle.sum(z * z) / (2 * self.sigma * self.sigma
) - log_det_jacobian
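# Numerical sketch (hedged): the same expression as the snippet above,
# evaluated on toy tensors to show the scale of each term.
import paddle

sigma = 1.0
z = paddle.randn([2, 6400])                    # transformed variable, (B, T)
log_det_jacobian = paddle.to_tensor([123.4])   # toy value, shape (1,)
loss = paddle.sum(z * z) / (2 * sigma * sigma) - log_det_jacobian
print(float(loss))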
@@ -895,15 +725,12 @@ class ConditionalWaveFlow2Infer(ConditionalWaveFlow):
def forward(self, mel):
"""Generate raw audio given mel spectrogram.
Parameters
----------
mel : np.ndarray [shape=(C_mel, T_mel)]
Mel spectrogram of an utterance (in log-magnitude).
Returns
-------
np.ndarray [shape=(T,)]
The synthesized audio.
Args:
mel (np.ndarray): Mel spectrogram of an utterance (in log-magnitude). shape=(C_mel, T_mel)
Returns:
np.ndarray: The synthesized audio. shape=(T,)
"""
audio = self.predict(mel)
return audio