change the docstring style from numpydoc to google, test=tts

4 years ago · 9699c00769
parent 683679bec7
commit 9699c00769
57 changed files with 2350 additions and 4150 deletions
--- a/paddlespeech/t2s/datasets/data_table.py
+++ b/paddlespeech/t2s/datasets/data_table.py
@ -22,26 +22,17 @@ from paddle.io import Dataset
 class DataTable(Dataset):
    """Dataset to load and convert data for general purpose.
-
+    Args:
-    Parameters
+        data (List[Dict[str, Any]]): Metadata, a list of meta datum, each of which is composed of several fields
-    ----------
+        fields (List[str], optional): Fields to use, if not specified, all the fields in the data are used, by default None
-    data : List[Dict[str, Any]]
+        converters (Dict[str, Callable], optional): Converters used to process each field, by default None
-        Metadata, a list of meta datum, each of which is composed of 
+        use_cache (bool, optional): Whether to use cache, by default False
-        several fields
+
-    fields : List[str], optional
+    Raises:
-        Fields to use, if not specified, all the fields in the data are 
+        ValueError:
-        used, by default None
+            If there is some field that does not exist in data. 
-    converters : Dict[str, Callable], optional
+        ValueError:
-        Converters used to process each field, by default None
+            If there is some field in converters that does not exist in fields.
    use_cache : bool, optional
        Whether to use cache, by default False
    Raises
    ------
    ValueError
        If there is some field that does not exist in data. 
    ValueError
        If there is some field in converters that does not exist in fields.
    """
    def __init__(self,
@ -95,15 +86,11 @@ class DataTable(Dataset):
        """Convert a meta datum to an example by applying the corresponding 
        converters to each fields requested.
-        Parameters
+        Args:
-        ----------
+            meta_datum (Dict[str, Any]): Meta datum
        meta_datum : Dict[str, Any]
            Meta datum
-        Returns
+        Returns:
-        -------
+            Dict[str, Any]: Converted example
        Dict[str, Any]
            Converted example
        """
        example = {}
        for field in self.fields:
@ -118,16 +105,11 @@ class DataTable(Dataset):
    def __getitem__(self, idx: int) -> Dict[str, Any]:
        """Get an example given an index.
        Args:
            idx (int): Index of the example to get
-        Parameters
+        Returns:
-        ----------
+            Dict[str, Any]: A converted example
        idx : int
            Index of the example to get
        Returns
        -------
        Dict[str, Any]
            A converted example
        """
        if self.use_cache and self.caches[idx] is not None:
            return self.caches[idx]
--- a/paddlespeech/t2s/datasets/preprocess_utils.py
+++ b/paddlespeech/t2s/datasets/preprocess_utils.py
@ -18,14 +18,10 @@ import re
 def get_phn_dur(file_name):
    '''
    read MFA duration.txt
-    Parameters
+    Args:
-    ----------
+        file_name (str or Path): path of gen_duration_from_textgrid.py's result
-    file_name : str or Path
+    Returns: 
-        path of gen_duration_from_textgrid.py's result
+        Dict: sentence: {'utt': ([char], [int])}
    Returns
    ----------
    Dict
        sentence: {'utt': ([char], [int])}
    '''
    f = open(file_name, 'r')
    sentence = {}
@ -48,10 +44,8 @@ def get_phn_dur(file_name):
 def merge_silence(sentence):
    '''
    merge silences
-    Parameters
+    Args:
-    ----------
+        sentence (Dict): sentence: {'utt': (([char], [int]), str)}
    sentence : Dict
        sentence: {'utt': (([char], [int]), str)}
    '''
    for utt in sentence:
        cur_phn, cur_dur, speaker = sentence[utt]
@ -81,12 +75,9 @@ def merge_silence(sentence):
 def get_input_token(sentence, output_path, dataset="baker"):
    '''
    get phone set from training data and save it
-    Parameters
+    Args:
-    ----------
+        sentence (Dict): sentence: {'utt': ([char], [int])}
-    sentence : Dict
+        output_path (str or path): path to save phone_id_map
        sentence: {'utt': ([char], [int])}
    output_path : str or path
        path to save phone_id_map
    '''
    phn_token = set()
    for utt in sentence:
@ -112,14 +103,10 @@ def get_phones_tones(sentence,
                     dataset="baker"):
    '''
    get phone set and tone set from training data and save it
-    Parameters
+    Args:
-    ----------
+        sentence (Dict): sentence: {'utt': ([char], [int])}
-    sentence : Dict
+        phones_output_path (str or path): path to save phone_id_map
-        sentence: {'utt': ([char], [int])}
+        tones_output_path (str or path): path to save tone_id_map
    phones_output_path : str or path
        path to save phone_id_map
    tones_output_path : str or path
        path to save tone_id_map
    '''
    phn_token = set()
    tone_token = set()
@ -162,14 +149,10 @@ def get_spk_id_map(speaker_set, output_path):
 def compare_duration_and_mel_length(sentences, utt, mel):
    '''
    check duration error, correct sentences[utt] if possible, else pop sentences[utt]
-    Parameters
+    Args:
-    ----------
+        sentences (Dict): sentences[utt] = [phones_list ,durations_list]
-    sentences : Dict
+        utt (str): utt_id
-        sentences[utt] = [phones_list ,durations_list]
+        mel (np.ndarray): features (num_frames, n_mels)
    utt : str
        utt_id
    mel : np.ndarry
        features (num_frames, n_mels)
    '''
    if utt in sentences:
--- a/paddlespeech/t2s/datasets/vocoder_batch_fn.py
+++ b/paddlespeech/t2s/datasets/vocoder_batch_fn.py
@ -29,15 +29,11 @@ class Clip(object):
            hop_size=256,
            aux_context_window=0, ):
        """Initialize customized collater for DataLoader.
        Args:
-        Parameters
+            batch_max_steps (int): The maximum length of input signal in batch.
-        ----------
+            hop_size (int): Hop size of auxiliary features.
-        batch_max_steps : int
+            aux_context_window (int): Context window size for auxiliary feature conv.
            The maximum length of input signal in batch.
        hop_size : int
            Hop size of auxiliary features.
        aux_context_window : int
            Context window size for auxiliary feature conv.
        """
        if batch_max_steps % hop_size != 0:
@ -56,18 +52,15 @@ class Clip(object):
    def __call__(self, batch):
        """Convert into batch tensors.
-        Parameters
+        Args:
-        ----------
+            batch (list): list of tuple of the pair of audio and features. Audio shape (T, ), features shape(T', C).
        batch : list
            list of tuple of the pair of audio and features. Audio shape (T, ), features shape(T', C).
-        Returns
+        Returns: 
-        ----------
+            Tensor:
-        Tensor
+                Auxiliary feature batch (B, C, T'), where
-            Auxiliary feature batch (B, C, T'), where
+                T = (T' - 2 * aux_context_window) * hop_size.
-            T = (T' - 2 * aux_context_window) * hop_size.
+            Tensor:
-        Tensor
+                Target signal batch (B, 1, T).
            Target signal batch (B, 1, T).
        """
        # check length
@ -104,11 +97,10 @@ class Clip(object):
    def _adjust_length(self, x, c):
        """Adjust the audio and feature lengths.
-        Note
+        Note:
-        -------
+            Basically we assume that the length of x and c are adjusted
-        Basically we assume that the length of x and c are adjusted
+            through preprocessing stage, but if we use other library processed
-        through preprocessing stage, but if we use other library processed
+            features, this process will be needed.
        features, this process will be needed.
        """
        if len(x) < c.shape[0] * self.hop_size:
@ -162,22 +154,14 @@ class WaveRNNClip(Clip):
        # voc_pad = 2  this will pad the input so that the resnet can 'see' wider than input length
        # max_offsets = n_frames - 2 - (mel_win + 2 * hp.voc_pad) = n_frames - 15
        """Convert into batch tensors.
-
+        Args:
-        Parameters
+            batch (list): list of tuple of the pair of audio and features. Audio shape (T, ), features shape(T', C).
-        ----------
+
-        batch : list
+        Returns:
-            list of tuple of the pair of audio and features. 
+            Tensor: Input signal batch (B, 1, T).
-            Audio shape (T, ), features shape(T', C).
+            Tensor: Target signal batch (B, 1, T).
-
+            Tensor: Auxiliary feature batch (B, C, T'), 
-        Returns
+                where T = (T' - 2 * aux_context_window) * hop_size.
        ----------
        Tensor
            Input signal batch (B, 1, T).
        Tensor
            Target signal batch (B, 1, T).
        Tensor
            Auxiliary feature batch (B, C, T'), where
            T = (T' - 2 * aux_context_window) * hop_size.
        """
        # check length
--- a/paddlespeech/t2s/exps/transformer_tts/preprocess.py
+++ b/paddlespeech/t2s/exps/transformer_tts/preprocess.py
@ -31,15 +31,12 @@ from paddlespeech.t2s.frontend import English
 def get_lj_sentences(file_name, frontend):
-    '''
+    '''read MFA duration.txt
-    read MFA duration.txt
+
-    Parameters
+    Args:
-    ----------
+        file_name (str or Path)
-    file_name : str or Path
+    Returns:
-    Returns
+        Dict: sentence: {'utt': ([char], [int])}
    ----------
    Dict
        sentence: {'utt': ([char], [int])}
    '''
    f = open(file_name, 'r')
    sentence = {}
@ -59,14 +56,11 @@ def get_lj_sentences(file_name, frontend):
 def get_input_token(sentence, output_path):
-    '''
+    '''get phone set from training data and save it
-    get phone set from training data and save it
+    
-    Parameters
+    Args:
-    ----------
+        sentence (Dict): sentence: {'utt': ([char], str)}
-    sentence : Dict
+        output_path (str or path): path to save phone_id_map
        sentence: {'utt': ([char], str)}
    output_path : str or path
        path to save phone_id_map
    '''
    phn_token = set()
    for utt in sentence:
--- a/paddlespeech/t2s/frontend/arpabet.py
+++ b/paddlespeech/t2s/frontend/arpabet.py
@ -133,16 +133,11 @@ class ARPABET(Phonetics):
    def phoneticize(self, sentence, add_start_end=False):
        """ Normalize the input text sequence and convert it into pronunciation sequence.
        Args:
            sentence (str): The input text sequence.
-        Parameters
+        Returns:
-        -----------
+            List[str]: The list of pronunciation sequence.
        sentence: str
            The input text sequence.
        Returns
        ----------
        List[str]
            The list of pronunciation sequence.
        """
        phonemes = [
            self._remove_vowels(item) for item in self.backend(sentence)
@ -156,16 +151,12 @@ class ARPABET(Phonetics):
    def numericalize(self, phonemes):
        """ Convert pronunciation sequence into pronunciation id sequence.
-        
+
-        Parameters
+        Args:
-        -----------
+            phonemes (List[str]): The list of pronunciation sequence.
        phonemes: List[str]
            The list of pronunciation sequence.
-        Returns
+        Returns:
-        ----------
+            List[int]: The list of pronunciation id sequence.
        List[int]
            The list of pronunciation id sequence.
        """
        ids = [self.vocab.lookup(item) for item in phonemes]
        return ids
@ -173,30 +164,23 @@ class ARPABET(Phonetics):
    def reverse(self, ids):
        """ Reverse the list of pronunciation id sequence to a list of pronunciation sequence.
-        Parameters
+        Args:
-        -----------
+            ids (List[int]): The list of pronunciation id sequence.
        ids: List[int]
            The list of pronunciation id sequence.
-        Returns
+        Returns: 
-        ----------
+            List[str]: 
-        List[str]
+                The list of pronunciation sequence.
            The list of pronunciation sequence.
        """
        return [self.vocab.reverse(i) for i in ids]
    def __call__(self, sentence, add_start_end=False):
        """ Convert the input text sequence into pronunciation id sequence.
-        Parameters
+        Args:
-        -----------
+            sentence (str): The input text sequence.
        sentence: str
            The input text sequence.
-        Returns
+        Returns:
-        ----------
+            List[int]: The list of pronunciation id sequence.
        List[str]
            The list of pronunciation id sequence.
        """
        return self.numericalize(
            self.phoneticize(sentence, add_start_end=add_start_end))
@ -229,15 +213,11 @@ class ARPABETWithStress(Phonetics):
    def phoneticize(self, sentence, add_start_end=False):
        """ Normalize the input text sequence and convert it into pronunciation sequence.
-        Parameters
+        Args: 
-        -----------
+            sentence (str): The input text sequence.
        sentence: str
            The input text sequence.
-        Returns
+        Returns: 
-        ----------
+            List[str]: The list of pronunciation sequence.
        List[str]
            The list of pronunciation sequence.
        """
        phonemes = self.backend(sentence)
        if add_start_end:
@ -249,47 +229,33 @@ class ARPABETWithStress(Phonetics):
    def numericalize(self, phonemes):
        """ Convert pronunciation sequence into pronunciation id sequence.
-        
+
-        Parameters
+        Args:
-        -----------
+            phonemes (List[str]): The list of pronunciation sequence.
        phonemes: List[str]
            The list of pronunciation sequence.
-        Returns
+        Returns:
-        ----------
+            List[int]: The list of pronunciation id sequence.
        List[int]
            The list of pronunciation id sequence.
        """
        ids = [self.vocab.lookup(item) for item in phonemes]
        return ids
    def reverse(self, ids):
        """ Reverse the list of pronunciation id sequence to a list of pronunciation sequence.
-        
+        Args:
-        Parameters
+            ids (List[int]): The list of pronunciation id sequence.
        -----------
        ids: List[int]
            The list of pronunciation id sequence.
-        Returns
+        Returns: 
-        ----------
+            List[str]: The list of pronunciation sequence.
        List[str]
            The list of pronunciation sequence.
        """
        return [self.vocab.reverse(i) for i in ids]
    def __call__(self, sentence, add_start_end=False):
        """ Convert the input text sequence into pronunciation id sequence.
        Args:
            sentence (str): The input text sequence.
-        Parameters
+        Returns: 
-        -----------
+            List[int]: The list of pronunciation id sequence.
        sentence: str
            The input text sequence.
        Returns
        ----------
        List[str]
            The list of pronunciation id sequence.
        """
        return self.numericalize(
            self.phoneticize(sentence, add_start_end=add_start_end))
--- a/paddlespeech/t2s/frontend/phonectic.py
+++ b/paddlespeech/t2s/frontend/phonectic.py
@ -65,14 +65,10 @@ class English(Phonetics):
    def phoneticize(self, sentence):
        """ Normalize the input text sequence and convert it into pronunciation sequence.
-        Parameters
+        Args:
-        -----------
+            sentence (str): The input text sequence.
-        sentence: str
+        Returns: 
-            The input text sequence.
+            List[str]: The list of pronunciation sequence.
        Returns
        ----------
        List[str]
            The list of pronunciation sequence.
        """
        start = self.vocab.start_symbol
        end = self.vocab.end_symbol
@ -123,14 +119,10 @@ class English(Phonetics):
    def numericalize(self, phonemes):
        """ Convert pronunciation sequence into pronunciation id sequence.
-        Parameters
+        Args:
-        -----------
+            phonemes (List[str]): The list of pronunciation sequence.
-        phonemes: List[str]
+        Returns: 
-            The list of pronunciation sequence.
+            List[int]: The list of pronunciation id sequence.
        Returns
        ----------
        List[int]
            The list of pronunciation id sequence.
        """
        ids = [
            self.vocab.lookup(item) for item in phonemes
@ -140,27 +132,19 @@ class English(Phonetics):
    def reverse(self, ids):
        """ Reverse the list of pronunciation id sequence to a list of pronunciation sequence.
-        Parameters
+        Args:
-        -----------
+            ids (List[int]): The list of pronunciation id sequence.
-        ids: List[int]
+        Returns: 
-            The list of pronunciation id sequence.
+            List[str]: The list of pronunciation sequence.
        Returns
        ----------
        List[str]
            The list of pronunciation sequence.
        """
        return [self.vocab.reverse(i) for i in ids]
    def __call__(self, sentence):
        """ Convert the input text sequence into pronunciation id sequence.
-        Parameters
+        Args:
-        -----------
+            sentence (str): The input text sequence.
-        sentence: str
+        Returns: 
-            The input text sequence.
+            List[int]: The list of pronunciation id sequence.
        Returns
        ----------
        List[str]
            The list of pronunciation id sequence.
        """
        return self.numericalize(self.phoneticize(sentence))
@ -183,28 +167,21 @@ class EnglishCharacter(Phonetics):
    def phoneticize(self, sentence):
        """ Normalize the input text sequence.
-        Parameters
+        Args:
-        -----------
+            sentence (str): The input text sequence.
-        sentence: str
+        Returns:
-            The input text sequence.
+            str: A text sequence after normalize.
        Returns
        ----------
        str
            A text sequence after normalize.
        """
        words = normalize(sentence)
        return words
    def numericalize(self, sentence):
        """ Convert a text sequence into ids.
-        Parameters
+        Args:
-        -----------
+            sentence (str): The input text sequence.
-        sentence: str
+        Returns:
-            The input text sequence.
+            List[int]:
-        Returns
+                List of a character id sequence.
        ----------
        List[int]
            List of a character id sequence.
        """
        ids = [
            self.vocab.lookup(item) for item in sentence
@ -214,27 +191,19 @@ class EnglishCharacter(Phonetics):
    def reverse(self, ids):
        """ Convert a character id sequence into text.
-        Parameters
+        Args:
-        -----------
+            ids (List[int]): List of a character id sequence.
-        ids: List[int]
+        Returns:
-            List of a character id sequence.
+            str: The input text sequence.
        Returns
        ----------
        str
            The input text sequence.
        """
        return [self.vocab.reverse(i) for i in ids]
    def __call__(self, sentence):
        """ Normalize the input text sequence and convert it into character id sequence.
-        Parameters
+        Args:
-        -----------
+            sentence (str): The input text sequence.
-        sentence: str
+        Returns: 
-            The input text sequence.
+            List[int]: List of a character id sequence.
        Returns
        ----------
        List[int]
            List of a character id sequence.
        """
        return self.numericalize(self.phoneticize(sentence))
@ -264,14 +233,10 @@ class Chinese(Phonetics):
    def phoneticize(self, sentence):
        """ Normalize the input text sequence and convert it into pronunciation sequence.
-        Parameters
+        Args:
-        -----------
+            sentence (str): The input text sequence.
-        sentence: str
+        Returns: 
-            The input text sequence.
+            List[str]: The list of pronunciation sequence.
        Returns
        ----------
        List[str]
            The list of pronunciation sequence.
        """
        # simplified = self.opencc_backend.convert(sentence)
        simplified = sentence
@ -296,28 +261,20 @@ class Chinese(Phonetics):
    def numericalize(self, phonemes):
        """ Convert pronunciation sequence into pronunciation id sequence.
-        Parameters
+        Args:
-        -----------
+            phonemes (List[str]): The list of pronunciation sequence.
-        phonemes: List[str]
+        Returns:
-            The list of pronunciation sequence.
+            List[int]: The list of pronunciation id sequence.
        Returns
        ----------
        List[int]
            The list of pronunciation id sequence.
        """
        ids = [self.vocab.lookup(item) for item in phonemes]
        return ids
    def __call__(self, sentence):
        """ Convert the input text sequence into pronunciation id sequence.
-        Parameters
+        Args:
-        -----------
+            sentence (str): The input text sequence.
-        sentence: str
+        Returns:
-            The input text sequence.
+            List[int]: The list of pronunciation id sequence.
        Returns
        ----------
        List[str]
            The list of pronunciation id sequence.
        """
        return self.numericalize(self.phoneticize(sentence))
@ -329,13 +286,9 @@ class Chinese(Phonetics):
    def reverse(self, ids):
        """ Reverse the list of pronunciation id sequence to a list of pronunciation sequence.
-        Parameters
+        Args:
-        -----------
+            ids (List[int]): The list of pronunciation id sequence.
-        ids: List[int]
+        Returns: 
-            The list of pronunciation id sequence.
+            List[str]: The list of pronunciation sequence.
        Returns
        ----------
        List[str]
            The list of pronunciation sequence.
        """
        return [self.vocab.reverse(i) for i in ids]
--- a/paddlespeech/t2s/frontend/vocab.py
+++ b/paddlespeech/t2s/frontend/vocab.py
@ -20,22 +20,12 @@ __all__ = ["Vocab"]
 class Vocab(object):
    """  Vocabulary.
-    Parameters
+    Args:
-    -----------
+        symbols (Iterable[str]): Common symbols.
-    symbols: Iterable[str]
+        padding_symbol (str, optional): Symbol for pad. Defaults to "<pad>".
-        Common symbols.
+        unk_symbol (str, optional): Symbol for unknown. Defaults to "<unk>"
-
+        start_symbol (str, optional): Symbol for start. Defaults to "<s>"
-    padding_symbol: str, optional
+        end_symbol (str, optional): Symbol for end. Defaults to "</s>"
        Symbol for pad. Defaults to "<pad>".
    unk_symbol: str, optional
        Symbol for unknow. Defaults to "<unk>"
    start_symbol: str, optional
        Symbol for start. Defaults to "<s>"
    end_symbol: str, optional
        Symbol for end. Defaults to "</s>"
    """
    def __init__(self,
--- a/paddlespeech/t2s/frontend/zh_normalization/chronology.py
+++ b/paddlespeech/t2s/frontend/zh_normalization/chronology.py
@ -44,12 +44,10 @@ RE_TIME_RANGE = re.compile(r'([0-1]?[0-9]|2[0-3])'
 def replace_time(match) -> str:
    """
-    Parameters
+    Args:
-    ----------
+        match (re.Match)
-    match : re.Match
+    Returns:
-    Returns
+        str
    ----------
    str
    """
    is_range = len(match.groups()) > 5
@ -87,12 +85,10 @@ RE_DATE = re.compile(r'(\d{4}|\d{2})年'
 def replace_date(match) -> str:
    """
-    Parameters
+    Args:
-    ----------
+        match (re.Match)
-    match : re.Match
+    Returns:
-    Returns
+        str
    ----------
    str
    """
    year = match.group(1)
    month = match.group(3)
@ -114,12 +110,10 @@ RE_DATE2 = re.compile(
 def replace_date2(match) -> str:
    """
-    Parameters
+    Args:
-    ----------
+        match (re.Match)
-    match : re.Match
+    Returns:
-    Returns
+        str
    ----------
    str
    """
    year = match.group(1)
    month = match.group(3)
--- a/paddlespeech/t2s/frontend/zh_normalization/num.py
+++ b/paddlespeech/t2s/frontend/zh_normalization/num.py
@ -36,12 +36,10 @@ RE_FRAC = re.compile(r'(-?)(\d+)/(\d+)')
 def replace_frac(match) -> str:
    """
-    Parameters
+    Args:
-    ----------
+        match (re.Match)
-    match : re.Match
+    Returns:
-    Returns
+        str
    ----------
    str
    """
    sign = match.group(1)
    nominator = match.group(2)
@ -59,12 +57,10 @@ RE_PERCENTAGE = re.compile(r'(-?)(\d+(\.\d+)?)%')
 def replace_percentage(match) -> str:
    """
-    Parameters
+    Args:
-    ----------
+        match (re.Match)
-    match : re.Match
+    Returns:
-    Returns
+        str
    ----------
    str
    """
    sign = match.group(1)
    percent = match.group(2)
@ -81,12 +77,10 @@ RE_INTEGER = re.compile(r'(-)' r'(\d+)')
 def replace_negative_num(match) -> str:
    """
-    Parameters
+    Args:
-    ----------
+        match (re.Match)
-    match : re.Match
+    Returns:
-    Returns
+        str
    ----------
    str
    """
    sign = match.group(1)
    number = match.group(2)
@ -103,12 +97,10 @@ RE_DEFAULT_NUM = re.compile(r'\d{3}\d*')
 def replace_default_num(match):
    """
-    Parameters
+    Args:
-    ----------
+        match (re.Match)
-    match : re.Match
+    Returns:
-    Returns
+        str
    ----------
    str
    """
    number = match.group(0)
    return verbalize_digit(number)
@ -124,12 +116,10 @@ RE_NUMBER = re.compile(r'(-?)((\d+)(\.\d+)?)' r'|(\.(\d+))')
 def replace_positive_quantifier(match) -> str:
    """
-    Parameters
+    Args:
-    ----------
+        match (re.Match)
-    match : re.Match
+    Returns:
-    Returns
+        str
    ----------
    str
    """
    number = match.group(1)
    match_2 = match.group(2)
@ -142,12 +132,10 @@ def replace_positive_quantifier(match) -> str:
 def replace_number(match) -> str:
    """
-    Parameters
+    Args:
-    ----------
+        match (re.Match)
-    match : re.Match
+    Returns:
-    Returns
+        str
    ----------
    str
    """
    sign = match.group(1)
    number = match.group(2)
@ -169,12 +157,10 @@ RE_RANGE = re.compile(
 def replace_range(match) -> str:
    """
-    Parameters
+    Args:
-    ----------
+        match (re.Match)
-    match : re.Match
+    Returns:
-    Returns
+        str
    ----------
    str
    """
    first, second = match.group(1), match.group(8)
    first = RE_NUMBER.sub(replace_number, first)
--- a/paddlespeech/t2s/frontend/zh_normalization/phonecode.py
+++ b/paddlespeech/t2s/frontend/zh_normalization/phonecode.py
@ -45,23 +45,19 @@ def phone2str(phone_string: str, mobile=True) -> str:
 def replace_phone(match) -> str:
    """
-    Parameters
+    Args:
-    ----------
+        match (re.Match)
-    match : re.Match
+    Returns:
-    Returns
+        str
    ----------
    str
    """
    return phone2str(match.group(0), mobile=False)
 def replace_mobile(match) -> str:
    """
-    Parameters
+    Args:
-    ----------
+        match (re.Match)
-    match : re.Match
+    Returns:
-    Returns
+        str
    ----------
    str
    """
    return phone2str(match.group(0))
--- a/paddlespeech/t2s/frontend/zh_normalization/quantifier.py
+++ b/paddlespeech/t2s/frontend/zh_normalization/quantifier.py
@ -22,12 +22,10 @@ RE_TEMPERATURE = re.compile(r'(-?)(\d+(\.\d+)?)(°C|℃|度|摄氏度)')
 def replace_temperature(match) -> str:
    """
-    Parameters
+    Args:
-    ----------
+        match (re.Match)
-    match : re.Match
+    Returns:
-    Returns
+        str
    ----------
    str
    """
    sign = match.group(1)
    temperature = match.group(2)
--- a/paddlespeech/t2s/frontend/zh_normalization/text_normlization.py
+++ b/paddlespeech/t2s/frontend/zh_normalization/text_normlization.py
@ -55,14 +55,10 @@ class TextNormalizer():
    def _split(self, text: str, lang="zh") -> List[str]:
        """Split long text into sentences with sentence-splitting punctuations.
-        Parameters
+        Args:
-        ----------
+            text (str): The input text.
-        text : str
+        Returns:
-            The input text.
+            List[str]: Sentences.
        Returns
        -------
        List[str]
            Sentences.
        """
        # Only for pure Chinese here
        if lang == "zh":
--- a/paddlespeech/t2s/models/fastspeech2/fastspeech2.py
+++ b/paddlespeech/t2s/models/fastspeech2/fastspeech2.py
@ -38,17 +38,21 @@ from paddlespeech.t2s.modules.transformer.encoder import TransformerEncoder
 class FastSpeech2(nn.Layer):
    """FastSpeech2 module.
-
+    
    This is a module of FastSpeech2 described in `FastSpeech 2: Fast and
    High-Quality End-to-End Text to Speech`_. Instead of quantized pitch and
    energy, we use token-averaged value introduced in `FastPitch: Parallel
    Text-to-speech with Pitch Prediction`_.
-
+    
    .. _`FastSpeech 2: Fast and High-Quality End-to-End Text to Speech`:
        https://arxiv.org/abs/2006.04558
    .. _`FastPitch: Parallel Text-to-speech with Pitch Prediction`:
        https://arxiv.org/abs/2006.06873
    Args:
    Returns:
    """
    def __init__(
@ -127,136 +131,72 @@ class FastSpeech2(nn.Layer):
            init_enc_alpha: float=1.0,
            init_dec_alpha: float=1.0, ):
        """Initialize FastSpeech2 module.
-        Parameters
+        Args:
-        ----------
+            idim (int): Dimension of the inputs.
-        idim : int
+            odim (int): Dimension of the outputs.
-            Dimension of the inputs.
+            adim (int): Attention dimension.
-        odim : int
+            aheads (int): Number of attention heads.
-            Dimension of the outputs.
+            elayers (int): Number of encoder layers.
-        adim : int
+            eunits (int): Number of encoder hidden units.
-            Attention dimension.
+            dlayers (int): Number of decoder layers.
-        aheads : int
+            dunits (int): Number of decoder hidden units.
-            Number of attention heads.
+            postnet_layers (int): Number of postnet layers.
-        elayers : int
+            postnet_chans (int): Number of postnet channels.
-            Number of encoder layers.
+            postnet_filts (int): Kernel size of postnet.
-        eunits : int
+            postnet_dropout_rate (float): Dropout rate in postnet.
-            Number of encoder hidden units.
+            use_scaled_pos_enc (bool): Whether to use trainable scaled pos encoding.
-        dlayers : int
+            use_batch_norm (bool): Whether to use batch normalization in encoder prenet.
-            Number of decoder layers.
+            encoder_normalize_before (bool): Whether to apply layernorm layer before encoder block.
-        dunits : int
+            decoder_normalize_before (bool): Whether to apply layernorm layer before decoder block.
-            Number of decoder hidden units.
+            encoder_concat_after (bool): Whether to concatenate attention layer's input and output in encoder.
-        postnet_layers : int
+            decoder_concat_after (bool): Whether to concatenate attention layer's input  and output in decoder.
-            Number of postnet layers.
+            reduction_factor (int): Reduction factor.
-        postnet_chans : int
+            encoder_type (str): Encoder type ("transformer" or "conformer").
-            Number of postnet channels.
+            decoder_type (str): Decoder type ("transformer" or "conformer").
-        postnet_filts : int
+            transformer_enc_dropout_rate (float): Dropout rate in encoder except attention and positional encoding.
-            Kernel size of postnet.
+            transformer_enc_positional_dropout_rate (float): Dropout rate after encoder positional encoding.
-        postnet_dropout_rate : float
+            transformer_enc_attn_dropout_rate (float): Dropout rate in encoder self-attention module.
-            Dropout rate in postnet.
+            transformer_dec_dropout_rate (float): Dropout rate in decoder except attention & positional encoding.
-        use_scaled_pos_enc : bool
+            transformer_dec_positional_dropout_rate (float): Dropout rate after decoder positional encoding.
-            Whether to use trainable scaled pos encoding.
+            transformer_dec_attn_dropout_rate (float): Dropout rate in decoder self-attention module.
-        use_batch_norm : bool
+            conformer_pos_enc_layer_type (str): Pos encoding layer type in conformer.
-            Whether to use batch normalization in encoder prenet.
+            conformer_self_attn_layer_type (str): Self-attention layer type in conformer
-        encoder_normalize_before : bool
+            conformer_activation_type (str): Activation function type in conformer.
-            Whether to apply layernorm layer before encoder block.
+            use_macaron_style_in_conformer (bool): Whether to use macaron style FFN.
-        decoder_normalize_before : bool
+            use_cnn_in_conformer (bool): Whether to use CNN in conformer.
-            Whether to apply layernorm layer before
+            zero_triu (bool): Whether to use zero triu in relative self-attention module.
-            decoder block.
+            conformer_enc_kernel_size (int): Kernel size of encoder conformer.
-        encoder_concat_after : bool
+            conformer_dec_kernel_size (int): Kernel size of decoder conformer.
-            Whether to concatenate attention layer's input and output in encoder.
+            duration_predictor_layers (int): Number of duration predictor layers.
-        decoder_concat_after : bool
+            duration_predictor_chans (int): Number of duration predictor channels.
-            Whether to concatenate attention layer's input  and output in decoder.
+            duration_predictor_kernel_size (int): Kernel size of duration predictor.
-        reduction_factor : int
+            duration_predictor_dropout_rate (float): Dropout rate in duration predictor.
-            Reduction factor.
+            pitch_predictor_layers (int): Number of pitch predictor layers.
-        encoder_type : str
+            pitch_predictor_chans (int): Number of pitch predictor channels.
-            Encoder type ("transformer" or "conformer").
+            pitch_predictor_kernel_size (int): Kernel size of pitch predictor.
-        decoder_type : str
+            pitch_predictor_dropout_rate (float): Dropout rate in pitch predictor.
-            Decoder type ("transformer" or "conformer").
+            pitch_embed_kernel_size (float): Kernel size of pitch embedding.
-        transformer_enc_dropout_rate : float
+            pitch_embed_dropout_rate (float): Dropout rate for pitch embedding.
-            Dropout rate in encoder except attention and positional encoding.
+            stop_gradient_from_pitch_predictor (bool): Whether to stop gradient from pitch predictor to encoder.
-        transformer_enc_positional_dropout_rate (float): Dropout rate after encoder
+            energy_predictor_layers (int): Number of energy predictor layers.
-            positional encoding.
+            energy_predictor_chans (int): Number of energy predictor channels.
-        transformer_enc_attn_dropout_rate (float): Dropout rate in encoder
+            energy_predictor_kernel_size (int): Kernel size of energy predictor.
-            self-attention module.
+            energy_predictor_dropout_rate (float): Dropout rate in energy predictor.
-        transformer_dec_dropout_rate (float): Dropout rate in decoder except
+            energy_embed_kernel_size (float): Kernel size of energy embedding.
-            attention & positional encoding.
+            energy_embed_dropout_rate (float): Dropout rate for energy embedding.
-        transformer_dec_positional_dropout_rate (float): Dropout rate after decoder
+            stop_gradient_from_energy_predictor (bool): Whether to stop gradient from energy predictor to encoder.
-            positional encoding.
+            spk_num (Optional[int]): Number of speakers. If not None, assume that the spk_embed_dim is not None,
-        transformer_dec_attn_dropout_rate (float): Dropout rate in decoder
+                spk_ids will be provided as the input and use spk_embedding_table.
-            self-attention module.
+            spk_embed_dim (Optional[int]): Speaker embedding dimension. If not None, 
-        conformer_pos_enc_layer_type : str
+                assume that spk_emb will be provided as the input or spk_num is not None.
-            Pos encoding layer type in conformer.
+            spk_embed_integration_type (str): How to integrate speaker embedding.
-        conformer_self_attn_layer_type : str
+            tone_num (Optional[int]): Number of tones. If not None, assume that the
-            Self-attention layer type in conformer
+                tone_ids will be provided as the input and use tone_embedding_table.
-        conformer_activation_type : str
+            tone_embed_dim (Optional[int]): Tone embedding dimension. If not None, assume that tone_num is not None.
-            Activation function type in conformer.
+            tone_embed_integration_type (str): How to integrate tone embedding.
-        use_macaron_style_in_conformer : bool
+            init_type (str): How to initialize transformer parameters.
-            Whether to use macaron style FFN.
+            init_enc_alpha (float): Initial value of alpha in scaled pos encoding of the encoder.
-        use_cnn_in_conformer : bool
+            init_dec_alpha (float): Initial value of alpha in scaled pos encoding of the decoder.
            Whether to use CNN in conformer.
        zero_triu : bool
            Whether to use zero triu in relative self-attention module.
        conformer_enc_kernel_size : int
            Kernel size of encoder conformer.
        conformer_dec_kernel_size : int
            Kernel size of decoder conformer.
        duration_predictor_layers : int
            Number of duration predictor layers.
        duration_predictor_chans : int
            Number of duration predictor channels.
        duration_predictor_kernel_size : int
            Kernel size of duration predictor.
        duration_predictor_dropout_rate : float
            Dropout rate in duration predictor.
        pitch_predictor_layers : int
            Number of pitch predictor layers.
        pitch_predictor_chans : int
            Number of pitch predictor channels.
        pitch_predictor_kernel_size : int
            Kernel size of pitch predictor.
        pitch_predictor_dropout_rate : float
            Dropout rate in pitch predictor.
        pitch_embed_kernel_size : float
            Kernel size of pitch embedding.
        pitch_embed_dropout_rate : float
            Dropout rate for pitch embedding.
        stop_gradient_from_pitch_predictor : bool
            Whether to stop gradient from pitch predictor to encoder.
        energy_predictor_layers : int
            Number of energy predictor layers.
        energy_predictor_chans : int
            Number of energy predictor channels.
        energy_predictor_kernel_size : int
            Kernel size of energy predictor.
        energy_predictor_dropout_rate : float
            Dropout rate in energy predictor.
        energy_embed_kernel_size : float
            Kernel size of energy embedding.
        energy_embed_dropout_rate : float
            Dropout rate for energy embedding.
        stop_gradient_from_energy_predictor : bool 
            Whether to stop gradient from energy predictor to encoder.
        spk_num : Optional[int]
            Number of speakers. If not None, assume that the spk_embed_dim is not None,
            spk_ids will be provided as the input and use spk_embedding_table.
        spk_embed_dim : Optional[int]
            Speaker embedding dimension. If not None, 
            assume that spk_emb will be provided as the input or spk_num is not None.
        spk_embed_integration_type : str
            How to integrate speaker embedding.
        tone_num : Optional[int]
            Number of tones. If not None, assume that the
            tone_ids will be provided as the input and use tone_embedding_table.
        tone_embed_dim : Optional[int]
            Tone embedding dimension. If not None, assume that tone_num is not None.
        tone_embed_integration_type : str
            How to integrate tone embedding.
        init_type : str
            How to initialize transformer parameters.
        init_enc_alpha : float
            Initial value of alpha in scaled pos encoding of the encoder.
        init_dec_alpha : float
            Initial value of alpha in scaled pos encoding of the decoder.
        """
        assert check_argument_types()
@ -489,45 +429,21 @@ class FastSpeech2(nn.Layer):
    ) -> Tuple[paddle.Tensor, Dict[str, paddle.Tensor], paddle.Tensor]:
        """Calculate forward propagation.
-        Parameters
+        Args:
-        ----------
+            text(Tensor(int64)): Batch of padded token ids (B, Tmax).
-        text : Tensor(int64)
+            text_lengths(Tensor(int64)): Batch of lengths of each input (B,).
-            Batch of padded token ids (B, Tmax).
+            speech(Tensor): Batch of padded target features (B, Lmax, odim).
-        text_lengths : Tensor(int64)
+            speech_lengths(Tensor(int64)): Batch of the lengths of each target (B,).
-            Batch of lengths of each input (B,).
+            durations(Tensor(int64)): Batch of padded durations (B, Tmax).
-        speech : Tensor
+            pitch(Tensor): Batch of padded token-averaged pitch (B, Tmax, 1).
-            Batch of padded target features (B, Lmax, odim).
+            energy(Tensor): Batch of padded token-averaged energy (B, Tmax, 1).
-        speech_lengths : Tensor(int64)
+            tone_id(Tensor, optional(int64)): Batch of padded tone ids  (B, Tmax).
-            Batch of the lengths of each target (B,).
+            spk_emb(Tensor, optional): Batch of speaker embeddings (B, spk_embed_dim).
-        durations : Tensor(int64)
+            spk_id(Tensor, optional(int64)): Batch of speaker ids (B,).
-            Batch of padded durations (B, Tmax).
+
-        pitch : Tensor
+        Returns:
-            Batch of padded token-averaged pitch (B, Tmax, 1).
+
-        energy : Tensor
+        
            Batch of padded token-averaged energy (B, Tmax, 1).
        tone_id : Tensor, optional(int64)
                Batch of padded tone ids  (B, Tmax).
        spk_emb : Tensor, optional
            Batch of speaker embeddings (B, spk_embed_dim).
        spk_id : Tensor, optional(int64)
            Batch of speaker ids (B,)
        Returns
        ----------
        Tensor
            mel outs before postnet
        Tensor
            mel outs after postnet
        Tensor
            duration predictor's output
        Tensor
            pitch predictor's output
        Tensor
            energy predictor's output
        Tensor
            speech
        Tensor
            speech_lengths, modified if reduction_factor > 1
        """
        # input of embedding must be int64
@ -680,34 +596,22 @@ class FastSpeech2(nn.Layer):
    ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]:
        """Generate the sequence of features given the sequences of characters.
-        Parameters
+        Args:
-        ----------
+            text(Tensor(int64)): Input sequence of characters (T,).
-        text : Tensor(int64)
+            speech(Tensor, optional): Feature sequence to extract style (N, idim).
-            Input sequence of characters (T,).
+            durations(Tensor, optional (int64)): Groundtruth of duration (T,).
-        speech : Tensor, optional
+            pitch(Tensor, optional): Groundtruth of token-averaged pitch (T, 1).
-            Feature sequence to extract style (N, idim).
+            energy(Tensor, optional): Groundtruth of token-averaged energy (T, 1).
-        durations : Tensor, optional (int64)
+            alpha(float, optional): Alpha to control the speed.
-            Groundtruth of duration (T,).
+            use_teacher_forcing(bool, optional): Whether to use teacher forcing.
-        pitch : Tensor, optional
+                If true, groundtruth of duration, pitch and energy will be used.
-            Groundtruth of token-averaged pitch (T, 1).
+            spk_emb(Tensor, optional): Speaker embedding vector (spk_embed_dim,). (Default value = None)
-        energy : Tensor, optional
+            spk_id(Tensor, optional(int64), optional): Batch of padded spk ids  (1,). (Default value = None)
-            Groundtruth of token-averaged energy (T, 1).
+            tone_id(Tensor, optional(int64), optional): Batch of padded tone ids  (T,). (Default value = None)
-        alpha : float, optional
+
-            Alpha to control the speed.
+        Returns:
-        use_teacher_forcing : bool, optional
+
-            Whether to use teacher forcing.
+        
            If true, groundtruth of duration, pitch and energy will be used.
        spk_emb : Tensor, optional
            Speaker embedding vector (spk_embed_dim,).
        spk_id : Tensor, optional(int64)
            Batch of padded spk ids  (1,).
        tone_id : Tensor, optional(int64)
            Batch of padded tone ids  (T,).
        Returns
        ----------
        Tensor
            Output sequence of features (L, odim).
        """
        # input of embedding must be int64
        x = paddle.cast(text, 'int64')
@ -761,17 +665,13 @@ class FastSpeech2(nn.Layer):
    def _integrate_with_spk_embed(self, hs, spk_emb):
        """Integrate speaker embedding with hidden states.
-        Parameters
+        Args:
-        ----------
+            hs(Tensor): Batch of hidden state sequences (B, Tmax, adim).
-        hs : Tensor
+            spk_emb(Tensor): Batch of speaker embeddings (B, spk_embed_dim).
-            Batch of hidden state sequences (B, Tmax, adim).
+
-        spk_emb : Tensor
+        Returns:
-            Batch of speaker embeddings (B, spk_embed_dim).
+
-
+        
        Returns
        ----------
        Tensor
            Batch of integrated hidden state sequences (B, Tmax, adim)
        """
        if self.spk_embed_integration_type == "add":
            # apply projection and then add to hidden states
@ -790,17 +690,13 @@ class FastSpeech2(nn.Layer):
    def _integrate_with_tone_embed(self, hs, tone_embs):
        """Integrate tone embedding with hidden states.
-        Parameters
+        Args:
-        ----------
+            hs(Tensor): Batch of hidden state sequences (B, Tmax, adim).
-        hs : Tensor
+            tone_embs(Tensor): Batch of tone embeddings (B, Tmax, tone_embed_dim).
-            Batch of hidden state sequences (B, Tmax, adim).
+
-        tone_embs : Tensor
+        Returns:
-            Batch of speaker embeddings (B, Tmax, tone_embed_dim).
+
-
+        
        Returns
        ----------
        Tensor
            Batch of integrated hidden state sequences (B, Tmax, adim)
        """
        if self.tone_embed_integration_type == "add":
            # apply projection and then add to hidden states
@ -819,24 +715,17 @@ class FastSpeech2(nn.Layer):
    def _source_mask(self, ilens: paddle.Tensor) -> paddle.Tensor:
        """Make masks for self-attention.
-        Parameters
+        Args:
-        ----------
+            ilens(Tensor): Batch of lengths (B,).
        ilens : Tensor
            Batch of lengths (B,).
-        Returns
+        Returns:
-        -------
+            Tensor: Mask tensor for self-attention. dtype=paddle.bool
        Tensor
            Mask tensor for self-attention.
            dtype=paddle.bool
        Examples
        -------
        >>> ilens = [5, 3]
        >>> self._source_mask(ilens)
        tensor([[[1, 1, 1, 1, 1],
                    [1, 1, 1, 0, 0]]]) bool
        Examples:
            >>> ilens = [5, 3]
            >>> self._source_mask(ilens)
            tensor([[[1, 1, 1, 1, 1],
                        [1, 1, 1, 0, 0]]]) bool
        """
        x_masks = make_non_pad_mask(ilens)
        return x_masks.unsqueeze(-2)
@ -910,34 +799,26 @@ class StyleFastSpeech2Inference(FastSpeech2Inference):
                spk_emb=None,
                spk_id=None):
        """
-        Parameters
+
-        ----------
+        Args:
-        text : Tensor(int64)
+            text(Tensor(int64)): Input sequence of characters (T,).
-            Input sequence of characters (T,).
+            speech(Tensor, optional): Feature sequence to extract style (N, idim).
-        speech : Tensor, optional
+            durations(paddle.Tensor/np.ndarray, optional (int64)): Groundtruth of duration (T,), this will overwrite the set of durations_scale and durations_bias
-            Feature sequence to extract style (N, idim).
+            durations_scale(int/float, optional): 
-        durations : paddle.Tensor/np.ndarray, optional (int64)
+            durations_bias(int/float, optional): 
-            Groundtruth of duration (T,), this will overwrite the set of durations_scale and durations_bias
+            pitch(paddle.Tensor/np.ndarray, optional): Groundtruth of token-averaged pitch (T, 1), this will overwrite the set of pitch_scale and pitch_bias
-        durations_scale: int/float, optional
+            pitch_scale(int/float, optional): In denormed HZ domain.
-        durations_bias: int/float, optional
+            pitch_bias(int/float, optional): In denormed HZ domain.
-        pitch : paddle.Tensor/np.ndarray, optional
+            energy(paddle.Tensor/np.ndarray, optional): Groundtruth of token-averaged energy (T, 1), this will overwrite the set of energy_scale and energy_bias
-            Groundtruth of token-averaged pitch (T, 1), this will overwrite the set of pitch_scale and pitch_bias
+            energy_scale(int/float, optional): In denormed domain.
-        pitch_scale: int/float, optional
+            energy_bias(int/float, optional): In denormed domain.
-            In denormed HZ domain.
+            robot (bool, optional): Whether to output robot style. (Default value = False)
-        pitch_bias: int/float, optional
+            spk_emb: (Default value = None)
-            In denormed HZ domain.
+            spk_id: (Default value = None)
-        energy : paddle.Tensor/np.ndarray, optional
+
-            Groundtruth of token-averaged energy (T, 1), this will overwrite the set of energy_scale and energy_bias
+        Returns:
-        energy_scale: int/float, optional
+            Tensor: logmel
-            In denormed domain.
+
        energy_bias: int/float, optional
            In denormed domain.
        robot : bool, optional
            Whether to output robot style
        Returns
        ----------
        Tensor
            Output sequence of features (L, odim).
        """
        normalized_mel, d_outs, p_outs, e_outs = self.acoustic_model.inference(
            text,
@ -1011,13 +892,9 @@ class FastSpeech2Loss(nn.Layer):
    def __init__(self, use_masking: bool=True,
                 use_weighted_masking: bool=False):
        """Initialize feed-forward Transformer loss module.
-
+        Args:
-        Parameters
+            use_masking (bool): Whether to apply masking for padded part in loss calculation.
-        ----------
+            use_weighted_masking (bool): Whether to apply weighted masking in loss calculation.
        use_masking : bool
            Whether to apply masking for padded part in loss calculation.
        use_weighted_masking : bool
            Whether to apply weighted masking in loss calculation.
        """
        assert check_argument_types()
        super().__init__()
@ -1048,42 +925,22 @@ class FastSpeech2Loss(nn.Layer):
    ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor, paddle.Tensor]:
        """Calculate forward propagation.
-        Parameters
+        Args:
-        ----------
+            after_outs(Tensor): Batch of outputs after postnets (B, Lmax, odim).
-        after_outs : Tensor
+            before_outs(Tensor): Batch of outputs before postnets (B, Lmax, odim).
-            Batch of outputs after postnets (B, Lmax, odim).
+            d_outs(Tensor): Batch of outputs of duration predictor (B, Tmax).
-        before_outs : Tensor
+            p_outs(Tensor): Batch of outputs of pitch predictor (B, Tmax, 1).
-            Batch of outputs before postnets (B, Lmax, odim).
+            e_outs(Tensor): Batch of outputs of energy predictor (B, Tmax, 1).
-        d_outs : Tensor
+            ys(Tensor): Batch of target features (B, Lmax, odim).
-                Batch of outputs of duration predictor (B, Tmax).
+            ds(Tensor): Batch of durations (B, Tmax).
-        p_outs : Tensor
+            ps(Tensor): Batch of target token-averaged pitch (B, Tmax, 1).
-            Batch of outputs of pitch predictor (B, Tmax, 1).
+            es(Tensor): Batch of target token-averaged energy (B, Tmax, 1).
-        e_outs : Tensor
+            ilens(Tensor): Batch of the lengths of each input (B,).
-            Batch of outputs of energy predictor (B, Tmax, 1).
+            olens(Tensor): Batch of the lengths of each target (B,).
-        ys : Tensor
+
-            Batch of target features (B, Lmax, odim).
+        Returns:
-        ds : Tensor
+
-            Batch of durations (B, Tmax).
+        
        ps : Tensor
            Batch of target token-averaged pitch (B, Tmax, 1).
        es : Tensor
            Batch of target token-averaged energy (B, Tmax, 1).
        ilens : Tensor
            Batch of the lengths of each input (B,).
        olens : Tensor
            Batch of the lengths of each target (B,).
        Returns
        ----------
        Tensor
            L1 loss value.
        Tensor
            Duration predictor loss value.
        Tensor
            Pitch predictor loss value.
        Tensor
            Energy predictor loss value.
        """
        # apply mask to remove padded part
        if self.use_masking:
--- a/paddlespeech/t2s/models/hifigan/hifigan.py
+++ b/paddlespeech/t2s/models/hifigan/hifigan.py
@ -37,35 +37,21 @@ class HiFiGANGenerator(nn.Layer):
            use_weight_norm: bool=True,
            init_type: str="xavier_uniform", ):
        """Initialize HiFiGANGenerator module.
-        Parameters
+        Args:
-        ----------
+            in_channels (int): Number of input channels.
-        in_channels : int
+            out_channels (int): Number of output channels.
-            Number of input channels.
+            channels (int): Number of hidden representation channels.
-        out_channels : int
+            kernel_size (int): Kernel size of initial and final conv layer.
-            Number of output channels.
+            upsample_scales (list): List of upsampling scales.
-        channels : int
+            upsample_kernel_sizes (list): List of kernel sizes for upsampling layers.
-            Number of hidden representation channels.
+            resblock_kernel_sizes (list): List of kernel sizes for residual blocks.
-        kernel_size : int
+            resblock_dilations (list): List of dilation list for residual blocks.
-            Kernel size of initial and final conv layer.
+            use_additional_convs (bool): Whether to use additional conv layers in residual blocks.
-        upsample_scales : list
+            bias (bool): Whether to add bias parameter in convolution layers.
-            List of upsampling scales.
+            nonlinear_activation (str): Activation function module name.
-        upsample_kernel_sizes : list
+            nonlinear_activation_params (dict): Hyperparameters for activation function.
-            List of kernel sizes for upsampling layers.
+            use_weight_norm (bool): Whether to use weight norm.
-        resblock_kernel_sizes : list
+                If set to true, it will be applied to all of the conv layers.
            List of kernel sizes for residual blocks.
        resblock_dilations : list
            List of dilation list for residual blocks.
        use_additional_convs : bool
            Whether to use additional conv layers in residual blocks.
        bias : bool
            Whether to add bias parameter in convolution layers.
        nonlinear_activation : str
            Activation function module name.
        nonlinear_activation_params : dict
            Hyperparameters for activation function.
        use_weight_norm : bool
            Whether to use weight norm.
            If set to true, it will be applied to all of the conv layers.
        """
        super().__init__()
@ -134,14 +120,11 @@ class HiFiGANGenerator(nn.Layer):
    def forward(self, c):
        """Calculate forward propagation.
-        Parameters
+        
-        ----------
+        Args:
-        c : Tensor
+            c (Tensor): Input tensor (B, in_channels, T).
-            Input tensor (B, in_channels, T).
+        Returns:
-        Returns
+            Tensor: Output tensor (B, out_channels, T).
        ----------
        Tensor
            Output tensor (B, out_channels, T).
        """
        c = self.input_conv(c)
        for i in range(self.num_upsamples):
@ -196,15 +179,12 @@ class HiFiGANGenerator(nn.Layer):
    def inference(self, c):
        """Perform inference.
-        Parameters
+        Args:
-        ----------
+            c (Tensor): Input tensor (T, in_channels).
-        c : Tensor 
+                normalize_before (bool): Whether to perform normalization.
-            Input tensor (T, in_channels).
+        Returns:
-            normalize_before (bool): Whether to perform normalization.
+            Tensor:
-        Returns
+                Output tensor (T * prod(upsample_scales), out_channels).
        ----------
        Tensor
            Output tensor (T * prod(upsample_scales), out_channels).
        """
        c = self.forward(c.transpose([1, 0]).unsqueeze(0))
        return c.squeeze(0).transpose([1, 0])
@ -229,36 +209,23 @@ class HiFiGANPeriodDiscriminator(nn.Layer):
            use_spectral_norm: bool=False,
            init_type: str="xavier_uniform", ):
        """Initialize HiFiGANPeriodDiscriminator module.
-        Parameters
+
-        ----------
+        Args:
-        in_channels : int
+            in_channels (int): Number of input channels.
-            Number of input channels.
+            out_channels (int): Number of output channels.
-        out_channels : int
+            period (int): Period.
-            Number of output channels.
+            kernel_sizes (list): Kernel sizes of initial conv layers and the final conv layer.
-        period : int
+            channels (int): Number of initial channels.
-            Period.
+            downsample_scales (list): List of downsampling scales.
-        kernel_sizes : list
+            max_downsample_channels (int): Number of maximum downsampling channels.
-            Kernel sizes of initial conv layers and the final conv layer.
+            use_additional_convs (bool): Whether to use additional conv layers in residual blocks.
-        channels : int
+            bias (bool): Whether to add bias parameter in convolution layers.
-            Number of initial channels.
+            nonlinear_activation (str): Activation function module name.
-        downsample_scales : list
+            nonlinear_activation_params (dict): Hyperparameters for activation function.
-            List of downsampling scales.
+            use_weight_norm (bool): Whether to use weight norm.
-        max_downsample_channels : int
+                If set to true, it will be applied to all of the conv layers.
-            Number of maximum downsampling channels.
+            use_spectral_norm (bool): Whether to use spectral norm.
-        use_additional_convs : bool
+                If set to true, it will be applied to all of the conv layers.
            Whether to use additional conv layers in residual blocks.
        bias : bool
            Whether to add bias parameter in convolution layers.
        nonlinear_activation : str
            Activation function module name.
        nonlinear_activation_params : dict
            Hyperparameters for activation function.
        use_weight_norm : bool
            Whether to use weight norm.
            If set to true, it will be applied to all of the conv layers.
        use_spectral_norm : bool
            Whether to use spectral norm.
            If set to true, it will be applied to all of the conv layers.
        """
        super().__init__()
@ -307,14 +274,11 @@ class HiFiGANPeriodDiscriminator(nn.Layer):
    def forward(self, x):
        """Calculate forward propagation.
-        Parameters
+
-        ----------
+        Args:
-        c : Tensor
+            x (Tensor): Input tensor (B, in_channels, T).
-            Input tensor (B, in_channels, T).
+        Returns:
-        Returns
+            list: List of each layer's tensors.
        ----------
        list
            List of each layer's tensors.
        """
        # transform 1d to 2d -> (B, C, T/P, P)
        b, c, t = paddle.shape(x)
@ -379,13 +343,11 @@ class HiFiGANMultiPeriodDiscriminator(nn.Layer):
            },
            init_type: str="xavier_uniform", ):
        """Initialize HiFiGANMultiPeriodDiscriminator module.
-        Parameters
+
-        ----------
+        Args:
-        periods : list
+            periods (list): List of periods.
-            List of periods.
+            discriminator_params (dict): Parameters for hifi-gan period discriminator module.
-        discriminator_params : dict
+                The period parameter will be overwritten.
            Parameters for hifi-gan period discriminator module.
            The period parameter will be overwritten.
        """
        super().__init__()
        # initialize parameters
@ -399,14 +361,11 @@ class HiFiGANMultiPeriodDiscriminator(nn.Layer):
    def forward(self, x):
        """Calculate forward propagation.
-        Parameters
+
-        ----------
+        Args:
-        x : Tensor
+            x (Tensor): Input noise signal (B, 1, T).
-            Input noise signal (B, 1, T).
+        Returns:
-        Returns
+            List: List of list of each discriminator outputs, which consists of each layer output tensors.
        ----------
        List
            List of list of each discriminator outputs, which consists of each layer output tensors.
        """
        outs = []
        for f in self.discriminators:
@ -434,33 +393,22 @@ class HiFiGANScaleDiscriminator(nn.Layer):
            use_spectral_norm: bool=False,
            init_type: str="xavier_uniform", ):
        """Initialize HiFiGAN scale discriminator module.
-        Parameters
+
-        ----------
+        Args:
-        in_channels : int
+            in_channels (int): Number of input channels.
-            Number of input channels.
+            out_channels (int): Number of output channels.
-        out_channels : int
+            kernel_sizes (list): List of four kernel sizes. The first will be used for the first conv layer,
-            Number of output channels.
+                and the second is for downsampling part, and the remaining two are for output layers.
-        kernel_sizes : list
+            channels (int): Initial number of channels for conv layer.
-            List of four kernel sizes. The first will be used for the first conv layer,
+            max_downsample_channels (int): Maximum number of channels for downsampling layers.
-            and the second is for downsampling part, and the remaining two are for output layers.
+            bias (bool): Whether to add bias parameter in convolution layers.
-        channels : int
+            downsample_scales (list): List of downsampling scales.
-            Initial number of channels for conv layer.
+            nonlinear_activation (str): Activation function module name.
-        max_downsample_channels : int
+            nonlinear_activation_params (dict): Hyperparameters for activation function.
-            Maximum number of channels for downsampling layers.
+            use_weight_norm (bool): Whether to use weight norm.
-        bias : bool
+                If set to true, it will be applied to all of the conv layers.
-            Whether to add bias parameter in convolution layers.
+            use_spectral_norm (bool): Whether to use spectral norm.
-        downsample_scales : list
+                If set to true, it will be applied to all of the conv layers.
            List of downsampling scales.
        nonlinear_activation : str
            Activation function module name.
        nonlinear_activation_params : dict
            Hyperparameters for activation function.
        use_weight_norm : bool
            Whether to use weight norm.
            If set to true, it will be applied to all of the conv layers.
        use_spectral_norm : bool
            Whether to use spectral norm.
            If set to true, it will be applied to all of the conv layers.
        """
        super().__init__()
@ -546,14 +494,11 @@ class HiFiGANScaleDiscriminator(nn.Layer):
    def forward(self, x):
        """Calculate forward propagation.
-        Parameters
+
-        ----------
+        Args:
-        x : Tensor
+            x (Tensor): Input noise signal (B, 1, T).
-            Input noise signal (B, 1, T).
+        Returns:
-        Returns
+            List: List of output tensors of each layer.
        ----------
        List
            List of output tensors of each layer.
        """
        outs = []
        for f in self.layers:
@ -613,20 +558,14 @@ class HiFiGANMultiScaleDiscriminator(nn.Layer):
            follow_official_norm: bool=False,
            init_type: str="xavier_uniform", ):
        """Initialize HiFiGAN multi-scale discriminator module.
-        Parameters
+   
-        ----------
+        Args:
-        scales : int
+            scales (int): Number of multi-scales.
-            Number of multi-scales.
+            downsample_pooling (str): Pooling module name for downsampling of the inputs.
-        downsample_pooling : str
+            downsample_pooling_params (dict): Parameters for the above pooling module.
-            Pooling module name for downsampling of the inputs.
+            discriminator_params (dict): Parameters for hifi-gan scale discriminator module.
-        downsample_pooling_params : dict
+            follow_official_norm (bool): Whether to follow the norm setting of the official
-            Parameters for the above pooling module.
+                implementation. The first discriminator uses spectral norm and the other discriminators use weight norm.
        discriminator_params : dict
            Parameters for hifi-gan scale discriminator module.
        follow_official_norm : bool
            Whether to follow the norm setting of the official
            implementaion. The first discriminator uses spectral norm and the other
            discriminators use weight norm.
        """
        super().__init__()
@ -651,14 +590,11 @@ class HiFiGANMultiScaleDiscriminator(nn.Layer):
    def forward(self, x):
        """Calculate forward propagation.
-        Parameters
+
-        ----------
+        Args:
-        x : Tensor
+            x (Tensor): Input noise signal (B, 1, T).
-            Input noise signal (B, 1, T).
+        Returns:
-        Returns
+            List: List of list of each discriminator outputs, which consists of each layer output tensors.
        ----------
        List
            List of list of each discriminator outputs, which consists of each layer output tensors.
        """
        outs = []
        for f in self.discriminators:
@ -715,24 +651,17 @@ class HiFiGANMultiScaleMultiPeriodDiscriminator(nn.Layer):
            },
            init_type: str="xavier_uniform", ):
        """Initialize HiFiGAN multi-scale + multi-period discriminator module.
-        Parameters
+
-        ----------
+        Args:
-        scales : int
+            scales (int): Number of multi-scales.
-            Number of multi-scales.
+            scale_downsample_pooling (str): Pooling module name for downsampling of the inputs.
-        scale_downsample_pooling : str
+            scale_downsample_pooling_params (dict): Parameters for the above pooling module.
-            Pooling module name for downsampling of the inputs.
+            scale_discriminator_params (dict): Parameters for hifi-gan scale discriminator module.
-        scale_downsample_pooling_params : dict
+            follow_official_norm (bool): Whether to follow the norm setting of the official implementation.
-            Parameters for the above pooling module.
+                The first discriminator uses spectral norm and the other discriminators use weight norm.
-        scale_discriminator_params : dict
+            periods (list): List of periods.
-            Parameters for hifi-gan scale discriminator module.
+            period_discriminator_params (dict): Parameters for hifi-gan period discriminator module.
-        follow_official_norm : bool): Whether to follow the norm setting of the official
+                The period parameter will be overwritten.
            implementaion. The first discriminator uses spectral norm and the other
            discriminators use weight norm.
        periods : list
            List of periods.
        period_discriminator_params : dict
            Parameters for hifi-gan period discriminator module.
            The period parameter will be overwritten.
        """
        super().__init__()
@ -751,16 +680,14 @@ class HiFiGANMultiScaleMultiPeriodDiscriminator(nn.Layer):
    def forward(self, x):
        """Calculate forward propagation.
-        Parameters
+
-        ----------
+        Args:
-        x : Tensor
+            x (Tensor): Input noise signal (B, 1, T).
-            Input noise signal (B, 1, T).
+        Returns:
-        Returns
+            List:
-        ----------
+                List of list of each discriminator outputs,
-        List:
+                which consists of each layer output tensors.
-            List of list of each discriminator outputs,
+                Multi scale and multi period ones are concatenated.
            which consists of each layer output tensors.
            Multi scale and multi period ones are concatenated.
        """
        msd_outs = self.msd(x)
        mpd_outs = self.mpd(x)
--- a/paddlespeech/t2s/models/melgan/melgan.py
+++ b/paddlespeech/t2s/models/melgan/melgan.py
@ -51,41 +51,26 @@ class MelGANGenerator(nn.Layer):
            use_causal_conv: bool=False,
            init_type: str="xavier_uniform", ):
        """Initialize MelGANGenerator module.
-        Parameters
+
-        ----------
+        Args:
-        in_channels : int
+            in_channels (int): Number of input channels.
-            Number of input channels.
+            out_channels (int): Number of output channels,
-        out_channels : int
+                the number of sub-band is out_channels in multi-band melgan.
-            Number of output channels,
+            kernel_size (int): Kernel size of initial and final conv layer.
-            the number of sub-band is out_channels in multi-band melgan.
+            channels (int): Initial number of channels for conv layer.
-        kernel_size : int
+            bias (bool): Whether to add bias parameter in convolution layers.
-            Kernel size of initial and final conv layer.
+            upsample_scales (List[int]): List of upsampling scales.
-        channels : int
+            stack_kernel_size (int): Kernel size of dilated conv layers in residual stack.
-            Initial number of channels for conv layer.
+            stacks (int): Number of stacks in a single residual stack.
-        bias : bool
+            nonlinear_activation (Optional[str], optional): Non linear activation in upsample network, by default None
-            Whether to add bias parameter in convolution layers.
+            nonlinear_activation_params (Dict[str, Any], optional): Parameters passed to the nonlinear activation in the upsample network,
-        upsample_scales : List[int]
+                by default {}
-            List of upsampling scales.
+            pad (str): Padding function module name before dilated convolution layer.
-        stack_kernel_size : int
+            pad_params (dict): Hyperparameters for padding function.
-            Kernel size of dilated conv layers in residual stack.
+            use_final_nonlinear_activation (nn.Layer): Activation function for the final layer.
-        stacks : int
+            use_weight_norm (bool): Whether to use weight norm.
-            Number of stacks in a single residual stack.
+                If set to true, it will be applied to all of the conv layers.
-        nonlinear_activation : Optional[str], optional
+            use_causal_conv (bool): Whether to use causal convolution.
            Non linear activation in upsample network, by default None
        nonlinear_activation_params : Dict[str, Any], optional
            Parameters passed to the linear activation in the upsample network, 
            by default {}
        pad : str
            Padding function module name before dilated convolution layer.
        pad_params : dict
            Hyperparameters for padding function.
        use_final_nonlinear_activation : nn.Layer
            Activation function for the final layer.
        use_weight_norm : bool
            Whether to use weight norm.
            If set to true, it will be applied to all of the conv layers.
        use_causal_conv : bool
            Whether to use causal convolution.
        """
        super().__init__()
@ -207,14 +192,11 @@ class MelGANGenerator(nn.Layer):
    def forward(self, c):
        """Calculate forward propagation.
-        Parameters
+
-        ----------
+        Args:
-        c : Tensor
+            c (Tensor): Input tensor (B, in_channels, T).
-            Input tensor (B, in_channels, T).
+        Returns:
-        Returns
+            Tensor: Output tensor (B, out_channels, T * prod(upsample_scales)).
        ----------
        Tensor
            Output tensor (B, out_channels, T ** prod(upsample_scales)).
        """
        out = self.melgan(c)
        return out
@ -260,14 +242,11 @@ class MelGANGenerator(nn.Layer):
    def inference(self, c):
        """Perform inference.
-        Parameters
+
-        ----------
+        Args:
-        c : Union[Tensor, ndarray]
+            c (Union[Tensor, ndarray]): Input tensor (T, in_channels).
-            Input tensor (T, in_channels).
+        Returns:
-        Returns
+            Tensor: Output tensor (out_channels * T * prod(upsample_scales), 1).
        ----------
        Tensor
            Output tensor (out_channels*T ** prod(upsample_scales), 1).
        """
        # pseudo batch
        c = c.transpose([1, 0]).unsqueeze(0)
@ -298,33 +277,22 @@ class MelGANDiscriminator(nn.Layer):
            pad_params: Dict[str, Any]={"mode": "reflect"},
            init_type: str="xavier_uniform", ):
        """Initialize MelGAN discriminator module.
-        Parameters
+
-        ----------
+        Args:
-        in_channels : int
+            in_channels (int): Number of input channels.
-            Number of input channels.
+            out_channels (int): Number of output channels.
-        out_channels : int
+            kernel_sizes (List[int]): List of two kernel sizes. The prod will be used for the first conv layer,
-            Number of output channels.
+                and the first and the second kernel sizes will be used for the last two layers.
-        kernel_sizes : List[int]
+                For example if kernel_sizes = [5, 3], the first layer kernel size will be 5 * 3 = 15,
-            List of two kernel sizes. The prod will be used for the first conv layer,
+                the last two layers' kernel size will be 5 and 3, respectively.
-            and the first and the second kernel sizes will be used for the last two layers.
+            channels (int): Initial number of channels for conv layer.
-            For example if kernel_sizes = [5, 3], the first layer kernel size will be 5 * 3 = 15,
+            max_downsample_channels (int): Maximum number of channels for downsampling layers.
-            the last two layers' kernel size will be 5 and 3, respectively.
+            bias (bool): Whether to add bias parameter in convolution layers.
-        channels : int
+            downsample_scales (List[int]): List of downsampling scales.
-            Initial number of channels for conv layer.
+            nonlinear_activation (str): Activation function module name.
-        max_downsample_channels : int
+            nonlinear_activation_params (dict): Hyperparameters for activation function.
-            Maximum number of channels for downsampling layers.
+            pad (str): Padding function module name before dilated convolution layer.
-        bias : bool
+            pad_params (dict): Hyperparameters for padding function.
            Whether to add bias parameter in convolution layers.
        downsample_scales : List[int]
            List of downsampling scales.
        nonlinear_activation : str
            Activation function module name.
        nonlinear_activation_params : dict
            Hyperparameters for activation function.
        pad : str
            Padding function module name before dilated convolution layer.
        pad_params : dict
            Hyperparameters for padding function.
        """
        super().__init__()
@ -395,14 +363,10 @@ class MelGANDiscriminator(nn.Layer):
    def forward(self, x):
        """Calculate forward propagation.
-        Parameters
+        Args:
-        ----------
+            x (Tensor): Input noise signal (B, 1, T).
-        x : Tensor
+        Returns:
-            Input noise signal (B, 1, T).
+            List: List of output tensors of each layer (for feat_match_loss).
        Returns
        ----------
        List
            List of output tensors of each layer (for feat_match_loss).
        """
        outs = []
        for f in self.layers:
@ -440,39 +404,24 @@ class MelGANMultiScaleDiscriminator(nn.Layer):
            use_weight_norm: bool=True,
            init_type: str="xavier_uniform", ):
        """Initialize MelGAN multi-scale discriminator module.
-        Parameters
+
-        ----------
+        Args:
-        in_channels : int
+            in_channels (int): Number of input channels.
-            Number of input channels.
+            out_channels (int): Number of output channels.
-        out_channels : int
+            scales (int): Number of multi-scales.
-            Number of output channels.
+            downsample_pooling (str): Pooling module name for downsampling of the inputs.
-        scales : int
+            downsample_pooling_params (dict): Parameters for the above pooling module.
-            Number of multi-scales.
+            kernel_sizes (List[int]): List of two kernel sizes. The sum will be used for the first conv layer,
-        downsample_pooling : str
+                and the first and the second kernel sizes will be used for the last two layers.
-            Pooling module name for downsampling of the inputs.
+            channels (int): Initial number of channels for conv layer.
-        downsample_pooling_params : dict
+            max_downsample_channels (int): Maximum number of channels for downsampling layers.
-            Parameters for the above pooling module.
+            bias (bool): Whether to add bias parameter in convolution layers.
-        kernel_sizes : List[int]
+            downsample_scales (List[int]): List of downsampling scales.
-            List of two kernel sizes. The sum will be used for the first conv layer,
+            nonlinear_activation (str): Activation function module name.
-            and the first and the second kernel sizes will be used for the last two layers.
+            nonlinear_activation_params (dict): Hyperparameters for activation function.
-        channels : int
+            pad (str): Padding function module name before dilated convolution layer.
-            Initial number of channels for conv layer.
+            pad_params (dict): Hyperparameters for padding function.
-        max_downsample_channels : int
+            use_causal_conv (bool): Whether to use causal convolution.
            Maximum number of channels for downsampling layers.
        bias : bool
            Whether to add bias parameter in convolution layers.
        downsample_scales : List[int]
            List of downsampling scales.
        nonlinear_activation : str
            Activation function module name.
        nonlinear_activation_params : dict
            Hyperparameters for activation function.
        pad : str
            Padding function module name before dilated convolution layer.
        pad_params : dict
            Hyperparameters for padding function.
        use_causal_conv : bool
            Whether to use causal convolution.
        """
        super().__init__()
@ -514,14 +463,10 @@ class MelGANMultiScaleDiscriminator(nn.Layer):
    def forward(self, x):
        """Calculate forward propagation.
-        Parameters
+        Args:
-        ----------
+            x (Tensor): Input noise signal (B, 1, T).
-        x : Tensor
+        Returns:
-            Input noise signal (B, 1, T).
+            List: List of list of each discriminator outputs, which consists of each layer output tensors.
        Returns
        ----------
        List
            List of list of each discriminator outputs, which consists of each layer output tensors.
        """
        outs = []
        for f in self.discriminators:
--- a/paddlespeech/t2s/models/melgan/style_melgan.py
+++ b/paddlespeech/t2s/models/melgan/style_melgan.py
@ -52,37 +52,23 @@ class StyleMelGANGenerator(nn.Layer):
            use_weight_norm: bool=True,
            init_type: str="xavier_uniform", ):
        """Initialize Style MelGAN generator.
-        Parameters
+
-        ----------
+        Args:
-        in_channels : int
+            in_channels (int): Number of input noise channels.
-            Number of input noise channels.
+            aux_channels (int): Number of auxiliary input channels.
-        aux_channels : int
+            channels (int): Number of channels for conv layer.
-            Number of auxiliary input channels.
+            out_channels (int): Number of output channels.
-        channels : int
+            kernel_size (int): Kernel size of conv layers.
-            Number of channels for conv layer.
+            dilation (int): Dilation factor for conv layers.
-        out_channels : int
+            bias (bool): Whether to add bias parameter in convolution layers.
-            Number of output channels.
+            noise_upsample_scales (list): List of noise upsampling scales.
-        kernel_size : int
+            noise_upsample_activation (str): Activation function module name for noise upsampling.
-            Kernel size of conv layers.
+            noise_upsample_activation_params (dict): Hyperparameters for the above activation function.
-        dilation : int
+            upsample_scales (list): List of upsampling scales.
-            Dilation factor for conv layers.
+            upsample_mode (str): Upsampling mode in TADE layer.
-        bias : bool
+            gated_function (str): Gated function in TADEResBlock ("softmax" or "sigmoid").
-            Whether to add bias parameter in convolution layers.
+            use_weight_norm (bool): Whether to use weight norm.
-        noise_upsample_scales : list
+                If set to true, it will be applied to all of the conv layers.
            List of noise upsampling scales.
        noise_upsample_activation : str
            Activation function module name for noise upsampling.
        noise_upsample_activation_params : dict
            Hyperparameters for the above activation function.
        upsample_scales : list
            List of upsampling scales.
        upsample_mode : str
            Upsampling mode in TADE layer.
        gated_function : str
            Gated function in TADEResBlock ("softmax" or "sigmoid").
        use_weight_norm : bool
            Whether to use weight norm.
            If set to true, it will be applied to all of the conv layers.
        """
        super().__init__()
@ -147,16 +133,12 @@ class StyleMelGANGenerator(nn.Layer):
    def forward(self, c, z=None):
        """Calculate forward propagation.
-        Parameters
+
-        ----------
+        Args:
-        c : Tensor
+            c (Tensor): Auxiliary input tensor (B, channels, T).
-            Auxiliary input tensor (B, channels, T).
+            z (Tensor): Input noise tensor (B, in_channels, 1).
-        z : Tensor
+        Returns:
-            Input noise tensor (B, in_channels, 1).
+            Tensor: Output tensor (B, out_channels, T * prod(upsample_scales)).
        Returns
        ----------
        Tensor
            Output tensor (B, out_channels, T ** prod(upsample_scales)).
        """
        # batch_max_steps(24000) == noise_upsample_factor(80) * upsample_factor(300)
        if z is None:
@ -211,14 +193,10 @@ class StyleMelGANGenerator(nn.Layer):
    def inference(self, c):
        """Perform inference.
-        Parameters
+        Args:
-        ----------
+            c (Tensor): Input tensor (T, in_channels).
-        c : Tensor
+        Returns:
-            Input tensor (T, in_channels).
+            Tensor: Output tensor (T * prod(upsample_scales), out_channels).
        Returns
        ----------
        Tensor
            Output tensor (T ** prod(upsample_scales), out_channels).
        """
        # (1, in_channels, T)
        c = c.transpose([1, 0]).unsqueeze(0)
@ -278,18 +256,13 @@ class StyleMelGANDiscriminator(nn.Layer):
            use_weight_norm: bool=True,
            init_type: str="xavier_uniform", ):
        """Initialize Style MelGAN discriminator.
-        Parameters
+
-        ----------
+        Args:
-        repeats : int
+            repeats (int): Number of repetitions to apply RWD.
-            Number of repititons to apply RWD.
+            window_sizes (list): List of random window sizes.
-        window_sizes : list
+            pqmf_params (list): List of list of Parameters for PQMF modules
-            List of random window sizes.
+            discriminator_params (dict): Parameters for base discriminator module.
-        pqmf_params : list
+            use_weight_norm (bool): Whether to apply weight normalization.
            List of list of Parameters for PQMF modules
        discriminator_params : dict
            Parameters for base discriminator module.
        use_weight_nom : bool
            Whether to apply weight normalization.
        """
        super().__init__()
@ -325,15 +298,11 @@ class StyleMelGANDiscriminator(nn.Layer):
    def forward(self, x):
        """Calculate forward propagation.
-        Parameters
+        Args:
-        ----------
+            x (Tensor): Input tensor (B, 1, T).
-        x : Tensor
+        Returns:
-            Input tensor (B, 1, T).
+            List: List of discriminator outputs, #items in the list will be
-        Returns
+                equal to repeats * #discriminators.
        ----------
        List
            List of discriminator outputs, #items in the list will be
            equal to repeats * #discriminators.
        """
        outs = []
        for _ in range(self.repeats):
--- a/paddlespeech/t2s/models/parallel_wavegan/parallel_wavegan.py
+++ b/paddlespeech/t2s/models/parallel_wavegan/parallel_wavegan.py
@ -31,51 +31,30 @@ from paddlespeech.t2s.modules.upsample import ConvInUpsampleNet
 class PWGGenerator(nn.Layer):
    """Wave Generator for Parallel WaveGAN
-    Parameters
+    Args:
-    ----------
+        in_channels (int, optional): Number of channels of the input waveform, by default 1
-    in_channels : int, optional
+        out_channels (int, optional): Number of channels of the output waveform, by default 1
-        Number of channels of the input waveform, by default 1
+        kernel_size (int, optional): Kernel size of the residual blocks inside, by default 3
-    out_channels : int, optional
+        layers (int, optional): Number of residual blocks inside, by default 30
-        Number of channels of the output waveform, by default 1
+        stacks (int, optional): The number of groups to split the residual blocks into, by default 3
-    kernel_size : int, optional
+            Within each group, the dilation of the residual block grows exponentially.
-        Kernel size of the residual blocks inside, by default 3
+        residual_channels (int, optional): Residual channel of the residual blocks, by default 64
-    layers : int, optional
+        gate_channels (int, optional): Gate channel of the residual blocks, by default 128
-        Number of residual blocks inside, by default 30
+        skip_channels (int, optional): Skip channel of the residual blocks, by default 64
-    stacks : int, optional
+        aux_channels (int, optional): Auxiliary channel of the residual blocks, by default 80
-        The number of groups to split the residual blocks into, by default 3
+        aux_context_window (int, optional): The context window size of the first convolution applied to the 
-        Within each group, the dilation of the residual block grows 
+            auxiliary input, by default 2
-        exponentially.
+        dropout (float, optional): Dropout of the residual blocks, by default 0.
-    residual_channels : int, optional
+        bias (bool, optional): Whether to use bias in residual blocks, by default True
-        Residual channel of the residual blocks, by default 64
+        use_weight_norm (bool, optional): Whether to use weight norm in all convolutions, by default True
-    gate_channels : int, optional
+        use_causal_conv (bool, optional): Whether to use causal padding in the upsample network and residual 
-        Gate channel of the residual blocks, by default 128
+            blocks, by default False
-    skip_channels : int, optional
+        upsample_scales (List[int], optional): Upsample scales of the upsample network, by default [4, 4, 4, 4]
-        Skip channel of the residual blocks, by default 64
+        nonlinear_activation (Optional[str], optional): Non linear activation in upsample network, by default None
-    aux_channels : int, optional
+        nonlinear_activation_params (Dict[str, Any], optional): Parameters passed to the nonlinear activation in the upsample network,
-        Auxiliary channel of the residual blocks, by default 80
+            by default {}
-    aux_context_window : int, optional
+        interpolate_mode (str, optional): Interpolation mode of the upsample network, by default "nearest"
-        The context window size of the first convolution applied to the 
+        freq_axis_kernel_size (int, optional): Kernel size along the frequency axis of the upsample network, by default 1
        auxiliary input, by default 2
    dropout : float, optional
        Dropout of the residual blocks, by default 0.
    bias : bool, optional
        Whether to use bias in residual blocks, by default True
    use_weight_norm : bool, optional
        Whether to use weight norm in all convolutions, by default True
    use_causal_conv : bool, optional
        Whether to use causal padding in the upsample network and residual 
        blocks, by default False
    upsample_scales : List[int], optional
        Upsample scales of the upsample network, by default [4, 4, 4, 4]
    nonlinear_activation : Optional[str], optional
        Non linear activation in upsample network, by default None
    nonlinear_activation_params : Dict[str, Any], optional
        Parameters passed to the linear activation in the upsample network, 
        by default {}
    interpolate_mode : str, optional
        Interpolation mode of the upsample network, by default "nearest"
    freq_axis_kernel_size : int, optional
        Kernel size along the frequency axis of the upsample network, by default 1
    """
    def __init__(
@ -167,18 +146,13 @@ class PWGGenerator(nn.Layer):
    def forward(self, x, c):
        """Generate waveform.
-        Parameters
+        Args:
-        ----------
+            x(Tensor): Shape (N, C_in, T), The input waveform.
-        x : Tensor
+            c(Tensor): Shape (N, C_aux, T'). The auxiliary input (e.g. spectrogram). It
            Shape (N, C_in, T), The input waveform.
        c : Tensor
            Shape (N, C_aux, T'). The auxiliary input (e.g. spectrogram). It 
            is upsampled to match the time resolution of the input.
-        Returns
+        Returns:
-        -------
+            Tensor: Shape (N, C_out, T), the generated waveform.
        Tensor
            Shape (N, C_out, T), the generated waveform.
        """
        c = self.upsample_net(c)
        assert c.shape[-1] == x.shape[-1]
@ -218,19 +192,14 @@ class PWGGenerator(nn.Layer):
        self.apply(_remove_weight_norm)
    def inference(self, c=None):
-        """Waveform generation. This function is used for single instance 
+        """Waveform generation. This function is used for single instance inference.
-        inference.
+
-        Parameters
+        Args:
-        ----------
+            c(Tensor, optional): Shape (T', C_aux), the auxiliary input, by default None
-        c : Tensor, optional
+            x(Tensor, optional): Shape (T, C_in), the noise waveform, by default None
-            Shape (T', C_aux), the auxiliary input, by default None
+
-        x : Tensor, optional
+        Returns:
-            Shape (T, C_in), the noise waveform, by default None
+            Tensor: Shape (T, C_out), the generated waveform
            If not provided, a sample is drawn from a gaussian distribution.
        Returns
        -------
        Tensor
            Shape (T, C_out), the generated waveform
        """
        # when to static, can not input x, see https://github.com/PaddlePaddle/Parakeet/pull/132/files
        x = paddle.randn(
@ -244,32 +213,21 @@ class PWGGenerator(nn.Layer):
 class PWGDiscriminator(nn.Layer):
    """A convolutional discriminator for audio.
-    Parameters
+    Args:
-    ----------
+        in_channels (int, optional): Number of channels of the input audio, by default 1
-    in_channels : int, optional
+        out_channels (int, optional): Output feature size, by default 1
-        Number of channels of the input audio, by default 1
+        kernel_size (int, optional): Kernel size of convolutional sublayers, by default 3
-    out_channels : int, optional
+        layers (int, optional): Number of layers, by default 10
-        Output feature size, by default 1
+        conv_channels (int, optional): Feature size of the convolutional sublayers, by default 64
-    kernel_size : int, optional
+        dilation_factor (int, optional): The factor with which dilation of each convolutional sublayers grows 
-        Kernel size of convolutional sublayers, by default 3
+            exponentially if it is greater than 1, else the dilation of each convolutional sublayers grows linearly, 
-    layers : int, optional
+            by default 1
-        Number of layers, by default 10
+        nonlinear_activation (str, optional): The activation after each convolutional sublayer, by default "leakyrelu"
-    conv_channels : int, optional
+        nonlinear_activation_params (Dict[str, Any], optional): The parameters passed to the activation's initializer, by default 
-        Feature size of the convolutional sublayers, by default 64
+            {"negative_slope": 0.2}
-    dilation_factor : int, optional
+        bias (bool, optional): Whether to use bias in convolutional sublayers, by default True
-        The factor with which dilation of each convolutional sublayers grows 
+        use_weight_norm (bool, optional): Whether to use weight normalization at all convolutional sublayers, 
-        exponentially if it is greater than 1, else the dilation of each 
+            by default True
        convolutional sublayers grows linearly, by default 1
    nonlinear_activation : str, optional
        The activation after each convolutional sublayer, by default "leakyrelu"
    nonlinear_activation_params : Dict[str, Any], optional
        The parameters passed to the activation's initializer, by default 
        {"negative_slope": 0.2}
    bias : bool, optional
        Whether to use bias in convolutional sublayers, by default True
    use_weight_norm : bool, optional
        Whether to use weight normalization at all convolutional sublayers, 
        by default True
    """
    def __init__(
@ -330,15 +288,12 @@ class PWGDiscriminator(nn.Layer):
    def forward(self, x):
        """
-        Parameters
+
-        ----------
+        Args:
-        x : Tensor
+            x (Tensor): Shape (N, in_channels, num_samples), the input audio.
-            Shape (N, in_channels, num_samples), the input audio.
+
-
+        Returns:
-        Returns
+            Tensor: Shape (N, out_channels, num_samples), the predicted logits.
        -------
        Tensor
            Shape (N, out_channels, num_samples), the predicted logits.
        """
        return self.conv_layers(x)
@ -362,39 +317,25 @@ class PWGDiscriminator(nn.Layer):
 class ResidualPWGDiscriminator(nn.Layer):
    """A wavenet-style discriminator for audio.
-    Parameters
+    Args:
-    ----------
+        in_channels (int, optional): Number of channels of the input audio, by default 1
-    in_channels : int, optional
+        out_channels (int, optional): Output feature size, by default 1
-        Number of channels of the input audio, by default 1
+        kernel_size (int, optional): Kernel size of residual blocks, by default 3
-    out_channels : int, optional
+        layers (int, optional): Number of residual blocks, by default 30
-        Output feature size, by default 1
+        stacks (int, optional): Number of groups of residual blocks, within which the dilation 
-    kernel_size : int, optional
+            of each residual blocks grows exponentially, by default 3
-        Kernel size of residual blocks, by default 3
+        residual_channels (int, optional): Residual channels of residual blocks, by default 64
-    layers : int, optional
+        gate_channels (int, optional): Gate channels of residual blocks, by default 128
-        Number of residual blocks, by default 30
+        skip_channels (int, optional): Skip channels of residual blocks, by default 64
-    stacks : int, optional
+        dropout (float, optional): Dropout probability of residual blocks, by default 0.
-        Number of groups of residual blocks, within which the dilation 
+        bias (bool, optional): Whether to use bias in residual blocks, by default True
-        of each residual blocks grows exponentially, by default 3
+        use_weight_norm (bool, optional): Whether to use weight normalization in all convolutional layers, 
-    residual_channels : int, optional
+            by default True
-        Residual channels of residual blocks, by default 64
+        use_causal_conv (bool, optional): Whether to use causal convolution in residual blocks, by default False
-    gate_channels : int, optional
+        nonlinear_activation (str, optional): Activation after convolutions other than those in residual blocks, 
-        Gate channels of residual blocks, by default 128
+            by default "leakyrelu"
-    skip_channels : int, optional
+        nonlinear_activation_params (Dict[str, Any], optional): Parameters to pass to the activation, 
-        Skip channels of residual blocks, by default 64
+            by default {"negative_slope": 0.2}
    dropout : float, optional
        Dropout probability of residual blocks, by default 0.
    bias : bool, optional
        Whether to use bias in residual blocks, by default True
    use_weight_norm : bool, optional
        Whether to use weight normalization in all convolutional layers, 
        by default True
    use_causal_conv : bool, optional
        Whether to use causal convolution in residual blocks, by default False
    nonlinear_activation : str, optional
        Activation after convolutions other than those in residual blocks, 
        by default "leakyrelu"
    nonlinear_activation_params : Dict[str, Any], optional
        Parameters to pass to the activation, by default {"negative_slope": 0.2}
    """
    def __init__(
@ -463,15 +404,11 @@ class ResidualPWGDiscriminator(nn.Layer):
    def forward(self, x):
        """
-        Parameters
+        Args:
-        ----------
+            x(Tensor): Shape (N, in_channels, num_samples), the input audio.
-        x : Tensor
+
-            Shape (N, in_channels, num_samples), the input audio.
+        Returns:
-
+            Tensor: Shape (N, out_channels, num_samples), the predicted logits.
        Returns
        -------
        Tensor
            Shape (N, out_channels, num_samples), the predicted logits.
        """
        x = self.first_conv(x)
        skip = 0
--- a/paddlespeech/t2s/models/tacotron2/tacotron2.py
+++ b/paddlespeech/t2s/models/tacotron2/tacotron2.py
@ -81,69 +81,39 @@ class Tacotron2(nn.Layer):
            # training related
            init_type: str="xavier_uniform", ):
        """Initialize Tacotron2 module.
-        Parameters
+        Args:
-        ----------
+            idim (int): Dimension of the inputs.
-        idim : int
+            odim (int): Dimension of the outputs.
-            Dimension of the inputs.
+            embed_dim (int): Dimension of the token embedding.
-        odim : int
+            elayers (int): Number of encoder blstm layers.
-            Dimension of the outputs.
+            eunits (int): Number of encoder blstm units.
-        embed_dim : int
+            econv_layers (int): Number of encoder conv layers.
-            Dimension of the token embedding.
+            econv_filts (int): Number of encoder conv filter size.
-        elayers : int
+            econv_chans (int): Number of encoder conv filter channels.
-            Number of encoder blstm layers.
+            dlayers (int): Number of decoder lstm layers.
-        eunits : int
+            dunits (int): Number of decoder lstm units.
-            Number of encoder blstm units.
+            prenet_layers (int): Number of prenet layers.
-        econv_layers : int
+            prenet_units (int): Number of prenet units.
-            Number of encoder conv layers.
+            postnet_layers (int): Number of postnet layers.
-        econv_filts : int
+            postnet_filts (int): Number of postnet filter size.
-            Number of encoder conv filter size.
+            postnet_chans (int): Number of postnet filter channels.
-        econv_chans : int
+            output_activation (str): Name of activation function for outputs.
-            Number of encoder conv filter channels.
+            adim (int): Number of dimension of mlp in attention.
-        dlayers : int
+            aconv_chans (int): Number of attention conv filter channels.
-            Number of decoder lstm layers.
+            aconv_filts (int): Number of attention conv filter size.
-        dunits : int
+            cumulate_att_w (bool): Whether to cumulate previous attention weight.
-            Number of decoder lstm units.
+            use_batch_norm (bool): Whether to use batch normalization.
-        prenet_layers : int
+            use_concate (bool): Whether to concat enc outputs w/ dec lstm outputs.
-            Number of prenet layers.
+            reduction_factor (int): Reduction factor.
-        prenet_units : int
+            spk_num (Optional[int]): Number of speakers. If set to > 1, assume that the
-            Number of prenet units.
+                sids will be provided as the input and use sid embedding layer.
-        postnet_layers : int
+            lang_num (Optional[int]): Number of languages. If set to > 1, assume that the
-            Number of postnet layers.
+                lids will be provided as the input and use lid embedding layer.
-        postnet_filts : int
+            spk_embed_dim (Optional[int]): Speaker embedding dimension. If set to > 0,
-            Number of postnet filter size.
+                assume that spk_emb will be provided as the input.
-        postnet_chans : int
+            spk_embed_integration_type (str): How to integrate speaker embedding.
-            Number of postnet filter channels.
+            dropout_rate (float): Dropout rate.
-        output_activation : str
+            zoneout_rate (float): Zoneout rate.
            Name of activation function for outputs.
        adim : int
            Number of dimension of mlp in attention.
        aconv_chans : int
            Number of attention conv filter channels.
        aconv_filts : int
            Number of attention conv filter size.
        cumulate_att_w : bool
            Whether to cumulate previous attention weight.
        use_batch_norm : bool
            Whether to use batch normalization.
        use_concate : bool
            Whether to concat enc outputs w/ dec lstm outputs.
        reduction_factor : int
            Reduction factor.
        spk_num : Optional[int]
            Number of speakers. If set to > 1, assume that the
            sids will be provided as the input and use sid embedding layer.
        lang_num : Optional[int]
            Number of languages. If set to > 1, assume that the
            lids will be provided as the input and use sid embedding layer.
        spk_embed_dim : Optional[int]
            Speaker embedding dimension. If set to > 0,
            assume that spk_emb will be provided as the input.
        spk_embed_integration_type : str
            How to integrate speaker embedding.
        dropout_rate : float
            Dropout rate.
        zoneout_rate : float
            Zoneout rate.
        """
        assert check_argument_types()
        super().__init__()
@ -258,31 +228,19 @@ class Tacotron2(nn.Layer):
    ) -> Tuple[paddle.Tensor, Dict[str, paddle.Tensor], paddle.Tensor]:
        """Calculate forward propagation.
-        Parameters
+        Args:
-        ----------
+            text (Tensor(int64)): Batch of padded character ids (B, T_text).
-        text : Tensor(int64)
+            text_lengths (Tensor(int64)): Batch of lengths of each input batch (B,).
-            Batch of padded character ids (B, T_text).
+            speech (Tensor): Batch of padded target features (B, T_feats, odim).
-        text_lengths : Tensor(int64)
+            speech_lengths (Tensor(int64)): Batch of the lengths of each target (B,).
-            Batch of lengths of each input batch (B,).
+            spk_emb (Optional[Tensor]): Batch of speaker embeddings (B, spk_embed_dim).
-        speech : Tensor
+            spk_id (Optional[Tensor]): Batch of speaker IDs (B, 1).
-            Batch of padded target features (B, T_feats, odim).
+            lang_id (Optional[Tensor]): Batch of language IDs (B, 1).
-        speech_lengths : Tensor(int64)
+
-            Batch of the lengths of each target (B,).
+        Returns:
-        spk_emb : Optional[Tensor]
+            Tensor: Loss scalar value.
-            Batch of speaker embeddings (B, spk_embed_dim).
+            Dict: Statistics to be monitored.
-        spk_id : Optional[Tensor]
+            Tensor: Weight value if not joint training else model outputs.
            Batch of speaker IDs (B, 1).
        lang_id : Optional[Tensor]
            Batch of language IDs (B, 1).
        Returns
        ----------
        Tensor
            Loss scalar value.
        Dict
            Statistics to be monitored.
        Tensor
            Weight value if not joint training else model outputs.
        """
        text = text[:, :text_lengths.max()]
@ -369,40 +327,26 @@ class Tacotron2(nn.Layer):
            use_teacher_forcing: bool=False, ) -> Dict[str, paddle.Tensor]:
        """Generate the sequence of features given the sequences of characters.
-        Parameters
+        Args:
-        ----------
+            text (Tensor(int64)): Input sequence of characters (T_text,).
-        text Tensor(int64)
+            speech (Optional[Tensor]): Feature sequence to extract style (N, idim).
-            Input sequence of characters (T_text,).
+            spk_emb (Optional[Tensor]): Speaker embedding (spk_embed_dim,).
-        speech : Optional[Tensor]
+            spk_id (Optional[Tensor]): Speaker ID (1,).
-            Feature sequence to extract style (N, idim).
+            lang_id (Optional[Tensor]): Language ID (1,).
-        spk_emb : ptional[Tensor]
+            threshold (float): Threshold in inference.
-            Speaker embedding (spk_embed_dim,).
+            minlenratio (float): Minimum length ratio in inference.
-        spk_id : Optional[Tensor]
+            maxlenratio (float): Maximum length ratio in inference.
-            Speaker ID (1,).
+            use_att_constraint (bool): Whether to apply attention constraint.
-        lang_id : Optional[Tensor]
+            backward_window (int): Backward window in attention constraint.
-            Language ID (1,).
+            forward_window (int): Forward window in attention constraint.
-        threshold : float
+            use_teacher_forcing (bool): Whether to use teacher forcing.
-            Threshold in inference.
+
-        minlenratio : float
+        Returns:
-            Minimum length ratio in inference.
+            Dict[str, Tensor]
-        maxlenratio : float
+            Output dict including the following items:
-            Maximum length ratio in inference.
+                * feat_gen (Tensor): Output sequence of features (T_feats, odim).
-        use_att_constraint : bool
+                * prob (Tensor): Output sequence of stop probabilities (T_feats,).
-            Whether to apply attention constraint.
+                * att_w (Tensor): Attention weights (T_feats, T).
        backward_window : int
            Backward window in attention constraint.
        forward_window : int
            Forward window in attention constraint.
        use_teacher_forcing : bool
            Whether to use teacher forcing.
        Return
        ----------
        Dict[str, Tensor]
        Output dict including the following items:
            * feat_gen (Tensor): Output sequence of features (T_feats, odim).
            * prob (Tensor): Output sequence of stop probabilities (T_feats,).
            * att_w (Tensor): Attention weights (T_feats, T).
        """
        x = text
@ -458,18 +402,13 @@ class Tacotron2(nn.Layer):
                                  spk_emb: paddle.Tensor) -> paddle.Tensor:
        """Integrate speaker embedding with hidden states.
-        Parameters
+        Args:
-        ----------
+            hs (Tensor): Batch of hidden state sequences (B, Tmax, eunits).
-         hs : Tensor
+            spk_emb (Tensor): Batch of speaker embeddings (B, spk_embed_dim).
-            Batch of hidden state sequences (B, Tmax, eunits).
+
-         spk_emb : Tensor
+        Returns:
-            Batch of speaker embeddings (B, spk_embed_dim).
+            Tensor: Batch of integrated hidden state sequences (B, Tmax, eunits) if
-
+                integration_type is "add" else (B, Tmax, eunits + spk_embed_dim).
        Returns
        ----------
         Tensor
            Batch of integrated hidden state sequences (B, Tmax, eunits) if
            integration_type is "add" else (B, Tmax, eunits + spk_embed_dim).
        """
        if self.spk_embed_integration_type == "add":
--- a/paddlespeech/t2s/models/transformer_tts/transformer_tts.py
+++ b/paddlespeech/t2s/models/transformer_tts/transformer_tts.py
@ -48,127 +48,67 @@ class TransformerTTS(nn.Layer):
    .. _`Neural Speech Synthesis with Transformer Network`:
        https://arxiv.org/pdf/1809.08895.pdf
-    Parameters
+    Args:
-    ----------
+        idim (int): Dimension of the inputs.
-    idim : int
+        odim (int): Dimension of the outputs.
-        Dimension of the inputs.
+        embed_dim (int, optional): Dimension of character embedding.
-    odim : int
+        eprenet_conv_layers (int, optional): Number of encoder prenet convolution layers.
-        Dimension of the outputs.
+        eprenet_conv_chans (int, optional): Number of encoder prenet convolution channels.
-    embed_dim : int, optional
+        eprenet_conv_filts (int, optional): Filter size of encoder prenet convolution.
-        Dimension of character embedding.
+        dprenet_layers (int, optional): Number of decoder prenet layers.
-    eprenet_conv_layers : int, optional
+        dprenet_units (int, optional): Number of decoder prenet hidden units.
-        Number of encoder prenet convolution layers.
+        elayers (int, optional): Number of encoder layers.
-    eprenet_conv_chans : int, optional
+        eunits (int, optional): Number of encoder hidden units.
-        Number of encoder prenet convolution channels.
+        adim (int, optional): Number of attention transformation dimensions.
-    eprenet_conv_filts : int, optional
+        aheads (int, optional): Number of heads for multi head attention.
-        Filter size of encoder prenet convolution.
+        dlayers (int, optional): Number of decoder layers.
-    dprenet_layers : int, optional
+        dunits (int, optional): Number of decoder hidden units.
-        Number of decoder prenet layers.
+        postnet_layers (int, optional): Number of postnet layers.
-    dprenet_units : int, optional
+        postnet_chans (int, optional): Number of postnet channels.
-        Number of decoder prenet hidden units.
+        postnet_filts (int, optional): Filter size of postnet.
-    elayers : int, optional
+        use_scaled_pos_enc (bool, optional): Whether to use trainable scaled positional encoding.
-        Number of encoder layers.
+        use_batch_norm (bool, optional): Whether to use batch normalization in encoder prenet.
-    eunits : int, optional
+        encoder_normalize_before (bool, optional): Whether to perform layer normalization before encoder block.
-        Number of encoder hidden units.
+        decoder_normalize_before (bool, optional): Whether to perform layer normalization before decoder block.
-    adim : int, optional
+        encoder_concat_after (bool, optional): Whether to concatenate attention layer's input and output in encoder.
-        Number of attention transformation dimensions.
+        decoder_concat_after (bool, optional): Whether to concatenate attention layer's input and output in decoder.
-    aheads : int, optional
+        positionwise_layer_type (str, optional): Position-wise operation type.
-        Number of heads for multi head attention.
+        positionwise_conv_kernel_size (int, optional): Kernel size in position wise conv 1d.
-    dlayers : int, optional
+        reduction_factor (int, optional): Reduction factor.
-        Number of decoder layers.
+        spk_embed_dim (int, optional): Number of speaker embedding dimensions.
-    dunits : int, optional
+        spk_embed_integration_type (str, optional): How to integrate speaker embedding.
-        Number of decoder hidden units.
+        use_gst (bool, optional): Whether to use global style token.
-    postnet_layers : int, optional
+        gst_tokens (int, optional): The number of GST embeddings.
-        Number of postnet layers.
+        gst_heads (int, optional): The number of heads in GST multihead attention.
-    postnet_chans : int, optional
+        gst_conv_layers (int, optional): The number of conv layers in GST.
-        Number of postnet channels.
+        gst_conv_chans_list (Sequence[int], optional): List of the number of channels of conv layers in GST.
-    postnet_filts : int, optional
+        gst_conv_kernel_size (int, optional): Kernel size of conv layers in GST.
-        Filter size of postnet.
+        gst_conv_stride (int, optional): Stride size of conv layers in GST.
-    use_scaled_pos_enc : pool, optional
+        gst_gru_layers (int, optional): The number of GRU layers in GST.
-        Whether to use trainable scaled positional encoding.
+        gst_gru_units (int, optional): The number of GRU units in GST.
-    use_batch_norm : bool, optional
+        transformer_lr (float, optional): Initial value of learning rate.
-        Whether to use batch normalization in encoder prenet.
+        transformer_warmup_steps (int, optional): Optimizer warmup steps.
-    encoder_normalize_before : bool, optional
+        transformer_enc_dropout_rate (float, optional): Dropout rate in encoder except attention and positional encoding.
-        Whether to perform layer normalization before encoder block.
+        transformer_enc_positional_dropout_rate (float, optional): Dropout rate after encoder positional encoding.
-    decoder_normalize_before : bool, optional
+        transformer_enc_attn_dropout_rate (float, optional): Dropout rate in encoder self-attention module.
-        Whether to perform layer normalization before decoder block.
+        transformer_dec_dropout_rate (float, optional): Dropout rate in decoder except attention & positional encoding.
-    encoder_concat_after : bool, optional
+        transformer_dec_positional_dropout_rate (float, optional): Dropout rate after decoder positional encoding.
-        Whether to concatenate attention layer's input and output in encoder.
+        transformer_dec_attn_dropout_rate (float, optional): Dropout rate in decoder self-attention module.
-    decoder_concat_after : bool, optional
+        transformer_enc_dec_attn_dropout_rate (float, optional): Dropout rate in encoder-decoder attention module.
-        Whether to concatenate attention layer's input and output in decoder.
+        init_type (str, optional): How to initialize transformer parameters.
-    positionwise_layer_type : str, optional
+        init_enc_alpha (float, optional): Initial value of alpha in scaled pos encoding of the encoder.
-        Position-wise operation type.
+        init_dec_alpha (float, optional): Initial value of alpha in scaled pos encoding of the decoder.
-    positionwise_conv_kernel_size : int, optional
+        eprenet_dropout_rate (float, optional): Dropout rate in encoder prenet.
-        Kernel size in position wise conv 1d.
+        dprenet_dropout_rate (float, optional): Dropout rate in decoder prenet.
-    reduction_factor : int, optional
+        postnet_dropout_rate (float, optional): Dropout rate in postnet.
-        Reduction factor.
+        use_masking (bool, optional): Whether to apply masking for padded part in loss calculation.
-    spk_embed_dim : int, optional
+        use_weighted_masking (bool, optional): Whether to apply weighted masking in loss calculation.
-        Number of speaker embedding dimenstions.
+        bce_pos_weight (float, optional): Positive sample weight in bce calculation (only for use_masking=true).
-    spk_embed_integration_type : str, optional
+        loss_type (str, optional): How to calculate loss.
-        How to integrate speaker embedding.
+        use_guided_attn_loss (bool, optional): Whether to use guided attention loss.
-    use_gst : str, optional
+        num_heads_applied_guided_attn (int, optional): Number of heads in each layer to apply guided attention loss.
-        Whether to use global style token.
+        num_layers_applied_guided_attn (int, optional): Number of layers to apply guided attention loss.
-    gst_tokens : int, optional
+        modules_applied_guided_attn (Sequence[str], optional): List of module names to apply guided attention loss.
        The number of GST embeddings.
    gst_heads : int, optional
        The number of heads in GST multihead attention.
    gst_conv_layers : int, optional
        The number of conv layers in GST.
    gst_conv_chans_list : Sequence[int], optional
            List of the number of channels of conv layers in GST.
    gst_conv_kernel_size : int, optional
        Kernal size of conv layers in GST.
    gst_conv_stride : int, optional
        Stride size of conv layers in GST.
    gst_gru_layers : int, optional
        The number of GRU layers in GST.
    gst_gru_units : int, optional
        The number of GRU units in GST.
    transformer_lr : float, optional
        Initial value of learning rate.
    transformer_warmup_steps : int, optional
        Optimizer warmup steps.
    transformer_enc_dropout_rate : float, optional
        Dropout rate in encoder except attention and positional encoding.
    transformer_enc_positional_dropout_rate : float, optional
        Dropout rate after encoder positional encoding.
    transformer_enc_attn_dropout_rate : float, optional
        Dropout rate in encoder self-attention module.
    transformer_dec_dropout_rate : float, optional
        Dropout rate in decoder except attention & positional encoding.
    transformer_dec_positional_dropout_rate : float, optional
        Dropout rate after decoder positional encoding.
    transformer_dec_attn_dropout_rate : float, optional
        Dropout rate in deocoder self-attention module.
    transformer_enc_dec_attn_dropout_rate : float, optional
        Dropout rate in encoder-deocoder attention module.
    init_type : str, optional
        How to initialize transformer parameters.
    init_enc_alpha : float, optional
        Initial value of alpha in scaled pos encoding of the encoder.
    init_dec_alpha : float, optional
        Initial value of alpha in scaled pos encoding of the decoder.
    eprenet_dropout_rate : float, optional
        Dropout rate in encoder prenet.
    dprenet_dropout_rate : float, optional
        Dropout rate in decoder prenet.
    postnet_dropout_rate : float, optional
        Dropout rate in postnet.
    use_masking : bool, optional
        Whether to apply masking for padded part in loss calculation.
    use_weighted_masking : bool, optional
        Whether to apply weighted masking in loss calculation.
    bce_pos_weight : float, optional
        Positive sample weight in bce calculation (only for use_masking=true).
    loss_type : str, optional
        How to calculate loss.
    use_guided_attn_loss : bool, optional
        Whether to use guided attention loss.
    num_heads_applied_guided_attn : int, optional
        Number of heads in each layer to apply guided attention loss.
    num_layers_applied_guided_attn : int, optional
        Number of layers to apply guided attention loss.
        List of module names to apply guided attention loss.
    """
    def __init__(
@ -398,25 +338,16 @@ class TransformerTTS(nn.Layer):
    ) -> Tuple[paddle.Tensor, Dict[str, paddle.Tensor], paddle.Tensor]:
        """Calculate forward propagation.
-        Parameters
+        Args:
-        ----------
+            text(Tensor(int64)): Batch of padded character ids (B, Tmax).
-        text : Tensor(int64)
+            text_lengths(Tensor(int64)): Batch of lengths of each input batch (B,).
-            Batch of padded character ids (B, Tmax).
+            speech(Tensor): Batch of padded target features (B, Lmax, odim).
-        text_lengths : Tensor(int64)
+            speech_lengths(Tensor(int64)): Batch of the lengths of each target (B,).
-            Batch of lengths of each input batch (B,).
+            spk_emb(Tensor, optional): Batch of speaker embeddings (B, spk_embed_dim).
-        speech : Tensor
+
-            Batch of padded target features (B, Lmax, odim).
+        Returns:
-        speech_lengths : Tensor(int64)
+            Tensor: Loss scalar value.
-            Batch of the lengths of each target (B,).
+            Dict: Statistics to be monitored.
        spk_emb : Tensor, optional
            Batch of speaker embeddings (B, spk_embed_dim).
        Returns
        ----------
        Tensor
            Loss scalar value.
        Dict
            Statistics to be monitored.
        """
        # input of embedding must be int64
@ -525,31 +456,19 @@ class TransformerTTS(nn.Layer):
    ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]:
        """Generate the sequence of features given the sequences of characters.
-        Parameters
+        Args:
-        ----------
+            text(Tensor(int64)): Input sequence of characters (T,).
-        text : Tensor(int64)
+            speech(Tensor, optional): Feature sequence to extract style (N, idim).
-            Input sequence of characters (T,).
+            spk_emb(Tensor, optional): Speaker embedding vector (spk_embed_dim,).
-        speech : Tensor, optional
+            threshold(float, optional): Threshold in inference.
-            Feature sequence to extract style (N, idim).
+            minlenratio(float, optional): Minimum length ratio in inference.
-        spk_emb : Tensor, optional
+            maxlenratio(float, optional): Maximum length ratio in inference.
-            Speaker embedding vector (spk_embed_dim,).
+            use_teacher_forcing(bool, optional): Whether to use teacher forcing.
-        threshold : float, optional
+
-            Threshold in inference.
+        Returns:
-        minlenratio : float, optional
+            Tensor: Output sequence of features (L, odim).
-            Minimum length ratio in inference.
+            Tensor: Output sequence of stop probabilities (L,).
-        maxlenratio : float, optional
+            Tensor: Encoder-decoder (source) attention weights (#layers, #heads, L, T).
            Maximum length ratio in inference.
        use_teacher_forcing : bool, optional
            Whether to use teacher forcing.
        Returns
        ----------
        Tensor
            Output sequence of features (L, odim).
        Tensor
            Output sequence of stop probabilities (L,).
        Tensor
            Encoder-decoder (source) attention weights (#layers, #heads, L, T).
        """
        # input of embedding must be int64
@ -671,23 +590,17 @@ class TransformerTTS(nn.Layer):
    def _source_mask(self, ilens: paddle.Tensor) -> paddle.Tensor:
        """Make masks for self-attention.
-        Parameters
+        Args:
-        ----------
+            ilens(Tensor): Batch of lengths (B,).
        ilens : Tensor
            Batch of lengths (B,).
-        Returns
+        Returns:
-        -------
+            Tensor: Mask tensor for self-attention. dtype=paddle.bool
        Tensor
            Mask tensor for self-attention.
            dtype=paddle.bool
-        Examples
+        Examples:
-        -------
+            >>> ilens = [5, 3]
-        >>> ilens = [5, 3]
+            >>> self._source_mask(ilens)
-        >>> self._source_mask(ilens)
+            tensor([[[1, 1, 1, 1, 1],
-        tensor([[[1, 1, 1, 1, 1],
+                        [1, 1, 1, 0, 0]]]) bool
                    [1, 1, 1, 0, 0]]]) bool
        """
        x_masks = make_non_pad_mask(ilens)
@ -696,30 +609,25 @@ class TransformerTTS(nn.Layer):
    def _target_mask(self, olens: paddle.Tensor) -> paddle.Tensor:
        """Make masks for masked self-attention.
-        Parameters
+        Args:
-        ----------
+            olens (Tensor(int64)): Batch of lengths (B,).
-            olens : LongTensor
+
-                Batch of lengths (B,).
+        Returns:
-
+            Tensor: Mask tensor for masked self-attention.
-        Returns
+
-        ----------
+        Examples:
-        Tensor
+            >>> olens = [5, 3]
-            Mask tensor for masked self-attention.
+            >>> self._target_mask(olens)
-
+            tensor([[[1, 0, 0, 0, 0],
-        Examples
+                        [1, 1, 0, 0, 0],
-        ----------
+                        [1, 1, 1, 0, 0],
-        >>> olens = [5, 3]
+                        [1, 1, 1, 1, 0],
-        >>> self._target_mask(olens)
+                        [1, 1, 1, 1, 1]],
-        tensor([[[1, 0, 0, 0, 0],
+                    [[1, 0, 0, 0, 0],
-                    [1, 1, 0, 0, 0],
+                        [1, 1, 0, 0, 0],
-                    [1, 1, 1, 0, 0],
+                        [1, 1, 1, 0, 0],
-                    [1, 1, 1, 1, 0],
+                        [1, 1, 1, 0, 0],
-                    [1, 1, 1, 1, 1]],
+                        [1, 1, 1, 0, 0]]], dtype=paddle.uint8)
                [[1, 0, 0, 0, 0],
                    [1, 1, 0, 0, 0],
                    [1, 1, 1, 0, 0],
                    [1, 1, 1, 0, 0],
                    [1, 1, 1, 0, 0]]], dtype=paddle.uint8)
        """
        y_masks = make_non_pad_mask(olens)
@ -731,17 +639,12 @@ class TransformerTTS(nn.Layer):
                                  spk_emb: paddle.Tensor) -> paddle.Tensor:
        """Integrate speaker embedding with hidden states.
-        Parameters
+        Args:
-        ----------
+            hs(Tensor): Batch of hidden state sequences (B, Tmax, adim).
-        hs : Tensor
+            spk_emb(Tensor): Batch of speaker embeddings (B, spk_embed_dim).
-            Batch of hidden state sequences (B, Tmax, adim).
+
-        spk_emb : Tensor
+        Returns:
-            Batch of speaker embeddings (B, spk_embed_dim).
+            Tensor: Batch of integrated hidden state sequences (B, Tmax, adim).
        Returns
        ----------
        Tensor
            Batch of integrated hidden state sequences (B, Tmax, adim).
        """
        if self.spk_embed_integration_type == "add":
--- a/paddlespeech/t2s/models/waveflow.py
+++ b/paddlespeech/t2s/models/waveflow.py
@ -30,20 +30,14 @@ __all__ = ["WaveFlow", "ConditionalWaveFlow", "WaveFlowLoss"]
 def fold(x, n_group):
-    r"""Fold audio or spectrogram's temporal dimension in to groups.
+    r"""Fold audio or spectrogram's temporal dimension into groups.
-    Parameters
+    Args:
-    ----------
+        x(Tensor): The input tensor. shape=(\*, time_steps)
-    x : Tensor [shape=(\*, time_steps)
+        n_group(int): The size of a group.
        The input tensor.
-    n_group : int
+    Returns:
-        The size of a group.
+        Tensor: Folded tensor. shape=(\*, time_steps // n_group, group)
    Returns
    ---------
    Tensor : [shape=(\*, time_steps // n_group, group)]
        Folded tensor.
    """
    spatial_shape = list(x.shape[:-1])
    time_steps = paddle.shape(x)[-1]
@ -58,27 +52,23 @@ class UpsampleNet(nn.LayerList):
    It consists of several conv2dtranspose layers which perform deconvolution
    on mel and time dimension.
-    Parameters
+    Args:
-    ----------
+        upsample_factors(List[int], optional): Time upsampling factors for each Conv2DTranspose Layer.
-    upscale_factors : List[int], optional
+            The ``UpsampleNet`` contains ``len(upsample_factors)`` Conv2DTranspose
-        Time upsampling factors for each Conv2DTranspose Layer.
+            Layers. Each factor is used as the ``stride`` for the
-
+            corresponding Conv2DTranspose. Defaults to [16, 16], thus the default
-        The ``UpsampleNet`` contains ``len(upscale_factor)`` Conv2DTranspose
+            upsampling factor is 256.
        Layers. Each upscale_factor is used as the ``stride`` for the
        corresponding Conv2DTranspose. Defaults to [16, 16], this the default
        upsampling factor is 256.
-    Notes
+    Notes:
-    ------
+        ``np.prod(upsample_factors)`` should equal the ``hop_length`` of the stft
-    ``np.prod(upscale_factors)`` should equals the ``hop_length`` of the stft
+        transformation used to extract spectrogram features from audio.
    transformation used to extract spectrogram features from audio.
-    For example, ``16 * 16 = 256``, then the spectrogram extracted with a stft
+        For example, ``16 * 16 = 256``, then the spectrogram extracted with a stft
-    transformation whose ``hop_length`` equals 256 is suitable.
+        transformation whose ``hop_length`` equals 256 is suitable.
-    See Also
+    See Also:
-    ---------
+    
-    ``librosa.core.stft``
+        ``librosa.core.stft``
    """
    def __init__(self, upsample_factors):
@ -101,25 +91,18 @@ class UpsampleNet(nn.LayerList):
        self.upsample_factors = upsample_factors
    def forward(self, x, trim_conv_artifact=False):
-        r"""Forward pass of the ``UpsampleNet``.
+        r"""Forward pass of the ``UpsampleNet``.
-        Parameters
+        Args:
-        -----------
+            x(Tensor): The input spectrogram. shape=(batch_size, input_channels, time_steps)
-        x : Tensor [shape=(batch_size, input_channels, time_steps)]
+            trim_conv_artifact(bool, optional): Trim deconvolution artifact at each layer. Defaults to False.
            The input spectrogram.
-        trim_conv_artifact : bool, optional
+        Returns:
-            Trim deconvolution artifact at each layer. Defaults to False.
+           Tensor: The upsampled spectrogram. shape=(batch_size, input_channels, time_steps \* upsample_factor)
-        Returns
+        Notes:
-        --------
+            If trim_conv_artifact is ``True``, the output time steps is less
-        Tensor: [shape=(batch_size, input_channels, time_steps \* upsample_factor)]
+            than ``time_steps \* upsample_factors``.
            The upsampled spectrogram.
        Notes
        --------
        If trim_conv_artifact is ``True``, the output time steps is less
        than ``time_steps \* upsample_factors``.
        """
        x = paddle.unsqueeze(x, 1)  # (B, C, T) -> (B, 1, C, T)
        for layer in self:
@ -139,19 +122,11 @@ class ResidualBlock(nn.Layer):
    same padding in width dimension. It also has projection for the condition
    and output.
-    Parameters
+    Args:
-    ----------
+        channels (int): Feature size of the input.
-    channels : int
+        cond_channels (int): Feature size of the condition.
-        Feature size of the input.
+        kernel_size (Tuple[int]): Kernel size of the Convolution2d applied to the input.
-
+        dilations (int): Dilations of the Convolution2d applied to the input.
    cond_channels : int
        Featuer size of the condition.
    kernel_size : Tuple[int]
        Kernel size of the Convolution2d applied to the input.
    dilations : int
        Dilations of the Convolution2d applied to the input.
    """
    def __init__(self, channels, cond_channels, kernel_size, dilations):
@ -197,21 +172,13 @@ class ResidualBlock(nn.Layer):
    def forward(self, x, condition):
        """Compute output for a whole folded sequence.
-        Parameters
+        Args:
-        ----------
+            x (Tensor): The input. [shape=(batch_size, channel, height, width)]
-        x : Tensor [shape=(batch_size, channel, height, width)]
+            condition (Tensor): The local condition. [shape=(batch_size, condition_channel, height, width)]
            The input.
        condition : Tensor [shape=(batch_size, condition_channel, height, width)]
            The local condition.
-        Returns
+        Returns: 
-        -------
+            res (Tensor): The residual output. [shape=(batch_size, channel, height, width)]
-        res : Tensor [shape=(batch_size, channel, height, width)]
+            skip (Tensor): The skip output. [shape=(batch_size, channel, height, width)]
            The residual output.
        skip : Tensor [shape=(batch_size, channel, height, width)]
            The skip output.
        """
        x_in = x
        x = self.conv(x)
@ -248,21 +215,14 @@ class ResidualBlock(nn.Layer):
    def add_input(self, x_row, condition_row):
        """Compute the output for a row and update the buffer.
-        Parameters
+        Args:
-        ----------
+            x_row (Tensor): A row of the input. shape=(batch_size, channel, 1, width)
-        x_row : Tensor [shape=(batch_size, channel, 1, width)]
+            condition_row (Tensor): A row of the condition. shape=(batch_size, condition_channel, 1, width)
            A row of the input.
        condition_row : Tensor [shape=(batch_size, condition_channel, 1, width)]
            A row of the condition.
-        Returns
+        Returns:
-        -------
+            res (Tensor): A row of the residual output. shape=(batch_size, channel, 1, width)
-        res : Tensor [shape=(batch_size, channel, 1, width)]
+            skip (Tensor): A row of the skip output. shape=(batch_size, channel, 1, width)
            A row of the the residual output.
        skip : Tensor [shape=(batch_size, channel, 1, width)]
            A row of the skip output.
        """
        x_row_in = x_row
        if len(paddle.shape(self._conv_buffer)) == 1:
@ -297,27 +257,15 @@ class ResidualBlock(nn.Layer):
 class ResidualNet(nn.LayerList):
    """A stack of several ResidualBlocks. It merges condition at each layer.
-    Parameters
+    Args:
-    ----------
+        n_layer (int): Number of ResidualBlocks in the ResidualNet.
-    n_layer : int
+        residual_channels (int): Feature size of each ResidualBlocks.
-        Number of ResidualBlocks in the ResidualNet.
+        condition_channels (int): Feature size of the condition.
-
+        kernel_size (Tuple[int]): Kernel size of each ResidualBlock.
-    residual_channels : int
+        dilations_h (List[int]): Dilation in height dimension of every ResidualBlock.
        Feature size of each ResidualBlocks.
    condition_channels : int
        Feature size of the condition.
-    kernel_size : Tuple[int]
+    Raises:
-        Kernel size of each ResidualBlock.
+        ValueError: If the length of dilations_h does not equal n_layer.
    dilations_h : List[int]
        Dilation in height dimension of every ResidualBlock.
    Raises
    ------
    ValueError
        If the length of dilations_h does not equals n_layers.
    """
    def __init__(self,
@ -339,18 +287,13 @@ class ResidualNet(nn.LayerList):
    def forward(self, x, condition):
        """Compute the output given the input and the condition.
-        Parameters
+        Args:
-        -----------
+            x (Tensor): The input. shape=(batch_size, channel, height, width)
-        x : Tensor [shape=(batch_size, channel, height, width)]
+            condition (Tensor): The local condition. shape=(batch_size, condition_channel, height, width)
-            The input.
+            
-
+        Returns: 
-        condition : Tensor [shape=(batch_size, condition_channel, height, width)]
+            Tensor : The output, which is an aggregation of all the skip outputs. shape=(batch_size, channel, height, width)
-            The local condition.
+            
        Returns
        --------
        Tensor : [shape=(batch_size, channel, height, width)]
            The output, which is an aggregation of all the skip outputs.
        """
        skip_connections = []
        for layer in self:
@ -368,21 +311,14 @@ class ResidualNet(nn.LayerList):
    def add_input(self, x_row, condition_row):
        """Compute the output for a row and update the buffers.
-        Parameters
+        Args:
-        ----------
+            x_row (Tensor): A row of the input. shape=(batch_size, channel, 1, width)
-        x_row : Tensor [shape=(batch_size, channel, 1, width)]
+            condition_row (Tensor):  A row of the condition. shape=(batch_size, condition_channel, 1, width)
-            A row of the input.
+            
-
+        Returns:
-        condition_row : Tensor [shape=(batch_size, condition_channel, 1, width)]
+            res (Tensor): A row of the residual output. shape=(batch_size, channel, 1, width)
-            A row of the condition.
+            skip (Tensor): A row of the skip output. shape=(batch_size, channel, 1, width)
-
+                
        Returns
        -------
        res : Tensor [shape=(batch_size, channel, 1, width)]
            A row of the the residual output.
        skip : Tensor [shape=(batch_size, channel, 1, width)]
            A row of the skip output.
        """
        skip_connections = []
        for layer in self:
@ -400,22 +336,12 @@ class Flow(nn.Layer):
    probability density estimation. The ``inverse`` method implements the
    sampling.
-    Parameters
+    Args:
-    ----------
+        n_layers (int): Number of ResidualBlocks in the Flow.
-    n_layers : int
+        channels (int): Feature size of the ResidualBlocks.
-        Number of ResidualBlocks in the Flow.
+        mel_bands (int): Feature size of the mel spectrogram (mel bands).
-
+        kernel_size (Tuple[int]): Kernel size of each ResidualBlock in the Flow.
-    channels : int
+        n_group (int): Number of timesteps to be folded into a group.
        Feature size of the ResidualBlocks.
    mel_bands : int
        Feature size of the mel spectrogram (mel bands).
    kernel_size : Tuple[int]
        Kernel size of each ResisualBlocks in the Flow.
    n_group : int
        Number of timesteps to the folded into a group.
    """
    dilations_dict = {
        8: [1, 1, 1, 1, 1, 1, 1, 1],
@ -466,26 +392,16 @@ class Flow(nn.Layer):
        """Probability density estimation. It is done by inversely transform
        a sample from p(X) into a sample from p(Z).
-        Parameters
+        Args:
-        -----------
+            x (Tensor): A input sample of the distribution p(X). shape=(batch, 1, height, width)
-        x : Tensor [shape=(batch, 1, height, width)]
+            condition (Tensor): The local condition. shape=(batch, condition_channel, height, width)
-            A input sample of the distribution p(X).
+            
-
+        Returns:
-        condition : Tensor [shape=(batch, condition_channel, height, width)]
+            z (Tensor): shape(batch, 1, height, width), the transformed sample.
-            The local condition.
+            Tuple[Tensor, Tensor]:
-
+                The parameter of the transformation.
-        Returns
+                logs (Tensor): shape(batch, 1, height - 1, width), the log scale of the transformation from x to z.
-        --------
+                b (Tensor): shape(batch, 1, height - 1, width), the shift of the transformation from x to z.
        z (Tensor): shape(batch, 1, height, width), the transformed sample.
        Tuple[Tensor, Tensor]
            The parameter of the transformation.
            logs (Tensor): shape(batch, 1, height - 1, width), the log scale
            of the transformation from x to z.
            b (Tensor): shape(batch, 1, height - 1, width), the shift of the
            transformation from x to z.
        """
        # (B, C, H-1, W)
        logs, b = self._predict_parameters(x[:, :, :-1, :],
@ -516,27 +432,12 @@ class Flow(nn.Layer):
        """Sampling from the distribution p(X). It is done by sampling from
        p(Z) and transforming the sample. It is an autoregressive transformation.
-        Parameters
+        Args:
-        -----------
+            z(Tensor): A sample of the distribution p(Z). shape=(batch, 1, height, width)
-        z : Tensor [shape=(batch, 1, height, width)]
+            condition(Tensor): The local condition. shape=(batch, condition_channel, height, width)
-            A sample of the distribution p(Z).
+        Returns:
-
+            Tensor:
-        condition : Tensor [shape=(batch, condition_channel, height, width)]
+                The transformed sample. shape=(batch, 1, height, width)
            The local condition.
        Returns
        ---------
        x : Tensor [shape=(batch, 1, height, width)]
            The transformed sample.
        Tuple[Tensor, Tensor]
            The parameter of the transformation.
            logs (Tensor): shape(batch, 1, height - 1, width), the log scale
            of the transformation from x to z.
            b (Tensor): shape(batch, 1, height - 1, width), the shift of the
            transformation from x to z.
        """
        z_0 = z[:, :, :1, :]
        x = paddle.zeros_like(z)
@ -560,25 +461,13 @@ class WaveFlow(nn.LayerList):
    """A deep reversible layer that is composed of several autoregressive
    flows.
-    Parameters
+    Args:
-    -----------
+        n_flows (int): Number of flows in the WaveFlow model.
-    n_flows : int
+        n_layers (int): Number of ResidualBlocks in each Flow.
-        Number of flows in the WaveFlow model.
+        n_group (int): Number of timesteps to fold as a group.
-
+        channels (int): Feature size of each ResidualBlock.
-    n_layers : int
+        mel_bands (int): Feature size of mel spectrogram (mel bands).
-        Number of ResidualBlocks in each Flow.
+        kernel_size (Union[int, List[int]]): Kernel size of the convolution layer in each ResidualBlock.
    n_group : int
        Number of timesteps to fold as a group.
    channels : int
        Feature size of each ResidualBlock.
    mel_bands : int
        Feature size of mel spectrogram (mel bands).
    kernel_size : Union[int, List[int]]
        Kernel size of the convolution layer in each ResidualBlock.
    """
    def __init__(self, n_flows, n_layers, n_group, channels, mel_bands,
@ -628,22 +517,13 @@ class WaveFlow(nn.LayerList):
        """Probability density estimation of random variable x given the
        condition.
-        Parameters
+        Args:
-        -----------
+            x (Tensor): The audio. shape=(batch_size, time_steps)
-        x : Tensor [shape=(batch_size, time_steps)]
+            condition (Tensor): The local condition (mel spectrogram here). shape=(batch_size, condition channel, time_steps)
-            The audio.
+                
-
+        Returns:
-        condition : Tensor [shape=(batch_size, condition channel, time_steps)]
+            Tensor: The transformed random variable. shape=(batch_size, time_steps)
-            The local condition (mel spectrogram here).
+            Tensor: The log determinant of the jacobian of the transformation from x to z. shape=(1,)
        Returns
        --------
        z : Tensor [shape=(batch_size, time_steps)]
            The transformed random variable.
        log_det_jacobian: Tensor [shape=(1,)]
            The log determinant of the jacobian of the transformation from x
            to z.
        """
        # x: (B, T)
        # condition: (B, C, T) upsampled condition
@ -678,18 +558,13 @@ class WaveFlow(nn.LayerList):
        Each Flow transforms :math:`z_{i-1}` to :math:`z_{i}` in an
        autoregressive manner.
-        Parameters
+        Args:
-        ----------
+            z (Tensor): A sample of the distribution p(Z). shape=(batch, 1, time_steps)
-        z : Tensor [shape=(batch, 1, time_steps]
+            condition (Tensor): The local condition. shape=(batch, condition_channel, time_steps)    
            A sample of the distribution p(Z).
        condition : Tensor [shape=(batch, condition_channel, time_steps)]
            The local condition.
-        Returns
+        Returns: 
-        --------
+            Tensor: The transformed sample (audio here). shape=(batch_size, time_steps)
-        x : Tensor [shape=(batch_size, time_steps)]
+            
            The transformed sample (audio here).
        """
        z, condition = self._trim(z, condition)
@ -714,29 +589,15 @@ class WaveFlow(nn.LayerList):
 class ConditionalWaveFlow(nn.LayerList):
    """ConditionalWaveFlow, a UpsampleNet with a WaveFlow model.
-    Parameters
+    Args:
-    ----------
+        upsample_factors (List[int]): Upsample factors for the upsample net.
-    upsample_factors : List[int]
+        n_flows (int): Number of flows in the WaveFlow model.
-        Upsample factors for the upsample net.
+        n_layers (int): Number of ResidualBlocks in each Flow.
-
+        n_group (int): Number of timesteps to fold as a group.
-    n_flows : int
+        channels (int): Feature size of each ResidualBlock.
-        Number of flows in the WaveFlow model.
+        n_mels (int): Feature size of mel spectrogram (mel bands).
-
+        kernel_size (Union[int, List[int]]): Kernel size of the convolution layer in each ResidualBlock.
-    n_layers : int
+        """
        Number of ResidualBlocks in each Flow.
    n_group : int
        Number of timesteps to fold as a group.
    channels : int
        Feature size of each ResidualBlock.
    n_mels : int
        Feature size of mel spectrogram (mel bands).
    kernel_size : Union[int, List[int]]
        Kernel size of the convolution layer in each ResidualBlock.
    """
    def __init__(self,
                 upsample_factors: List[int],
@ -760,22 +621,13 @@ class ConditionalWaveFlow(nn.LayerList):
        """Compute the transformed random variable z (x to z) and the log of
        the determinant of the jacobian of the transformation from x to z.
-        Parameters
+        Args:
-        ----------
+            audio(Tensor): The audio. shape=(B, T)
-        audio : Tensor [shape=(B, T)]
+            mel(Tensor): The mel spectrogram. shape=(B, C_mel, T_mel)
            The audio.
-        mel : Tensor [shape=(B, C_mel, T_mel)]
+        Returns:
-            The mel spectrogram.
+            Tensor: The inversely transformed random variable z (x to z). shape=(B, T)
-
+            Tensor: the log of the determinant of the jacobian of the transformation from x to z. shape=(1,)
        Returns
        -------
        z : Tensor [shape=(B, T)]
            The inversely transformed random variable z (x to z)
        log_det_jacobian: Tensor [shape=(1,)]
            the log of the determinant of the jacobian of the transformation
            from x to z.
        """
        condition = self.encoder(mel)
        z, log_det_jacobian = self.decoder(audio, condition)
@ -783,17 +635,13 @@ class ConditionalWaveFlow(nn.LayerList):
    @paddle.no_grad()
    def infer(self, mel):
-        r"""Generate raw audio given mel spectrogram.
+        r"""Generate raw audio given mel spectrogram.
-        Parameters
+        Args:
-        ----------
+            mel(Tensor): Mel spectrogram (in log-magnitude). shape=(B, C_mel, T_mel)
        mel : Tensor [shape=(B, C_mel, T_mel)]
            Mel spectrogram (in log-magnitude).
-        Returns
+        Returns:
-        -------
+            Tensor: The synthesized audio, where ``T <= T_mel \* upsample_factors``. shape=(B, T)
        Tensor : [shape=(B, T)]
            The synthesized audio, where``T <= T_mel \* upsample_factors``.
        """
        start = time.time()
        condition = self.encoder(mel, trim_conv_artifact=True)  # (B, C, T)
@ -808,15 +656,11 @@ class ConditionalWaveFlow(nn.LayerList):
    def predict(self, mel):
        """Generate raw audio given mel spectrogram.
-        Parameters
+        Args:
-        ----------
+            mel(np.ndarray): Mel spectrogram of an utterance(in log-magnitude). shape=(C_mel, T_mel)
        mel : np.ndarray [shape=(C_mel, T_mel)]
            Mel spectrogram of an utterance(in log-magnitude).
-        Returns
+        Returns:
-        -------
+            np.ndarray: The synthesized audio. shape=(T,)
        np.ndarray [shape=(T,)]
            The synthesized audio.
        """
        mel = paddle.to_tensor(mel)
        mel = paddle.unsqueeze(mel, 0)
@ -828,18 +672,12 @@ class ConditionalWaveFlow(nn.LayerList):
    def from_pretrained(cls, config, checkpoint_path):
        """Build a ConditionalWaveFlow model from a pretrained model.
-        Parameters
+        Args:
-        ----------
+            config(yacs.config.CfgNode): model configs
-        config: yacs.config.CfgNode
+            checkpoint_path(Path or str): the path of pretrained model checkpoint, without extension name
            model configs
-        checkpoint_path: Path or str
+        Returns:
-            the path of pretrained model checkpoint, without extension name
+            ConditionalWaveFlow: The model built from pretrained result.
        Returns
        -------
        ConditionalWaveFlow
            The model built from pretrained result.
        """
        model = cls(upsample_factors=config.model.upsample_factors,
                    n_flows=config.model.n_flows,
@ -855,11 +693,9 @@ class ConditionalWaveFlow(nn.LayerList):
 class WaveFlowLoss(nn.Layer):
    """Criterion of a WaveFlow model.
-    Parameters
+    Args:
-    ----------
+        sigma (float): The standard deviation of the gaussian noise used in WaveFlow, 
-    sigma : float
+            by default 1.0.
        The standard deviation of the gaussian noise used in WaveFlow, by
        default 1.0.
    """
    def __init__(self, sigma=1.0):
@ -871,19 +707,13 @@ class WaveFlowLoss(nn.Layer):
        """Compute the loss given the transformed random variable z and the
        log_det_jacobian of transformation from x to z.
-        Parameters
+        Args:
-        ----------
+            z(Tensor): The transformed random variable (x to z). shape=(B, T)
-        z : Tensor [shape=(B, T)]
+            log_det_jacobian(Tensor): The log of the determinant of the jacobian matrix of the
-            The transformed random variable (x to z).
+                transformation from x to z.  shape=(1,)
        log_det_jacobian : Tensor [shape=(1,)]
            The log of the determinant of the jacobian matrix of the
            transformation from x to z.
-        Returns
+        Returns:
-        -------
+            Tensor: The loss. shape=(1,)
        Tensor [shape=(1,)]
            The loss.
        """
        loss = paddle.sum(z * z) / (2 * self.sigma * self.sigma
                                    ) - log_det_jacobian
@ -895,15 +725,12 @@ class ConditionalWaveFlow2Infer(ConditionalWaveFlow):
    def forward(self, mel):
        """Generate raw audio given mel spectrogram.
-        Parameters
+        Args:
-        ----------
+            mel (np.ndarray): Mel spectrogram of an utterance(in log-magnitude). shape=(C_mel, T_mel)
-        mel : np.ndarray [shape=(C_mel, T_mel)]
+            
-            Mel spectrogram of an utterance(in log-magnitude).
+        Returns:
-
+            np.ndarray: The synthesized audio. shape=(T,)
-        Returns
+            
        -------
        np.ndarray [shape=(T,)]
            The synthesized audio.
        """
        audio = self.predict(mel)
        return audio
--- a/paddlespeech/t2s/models/wavernn/wavernn.py
+++ b/paddlespeech/t2s/models/wavernn/wavernn.py
@ -67,14 +67,10 @@ class MelResNet(nn.Layer):
    def forward(self, x):
        '''
-        Parameters
+        Args:
-        ----------
+            x (Tensor): Input tensor (B, in_dims, T).
-        x : Tensor
+        Returns:
-            Input tensor (B, in_dims, T).
+            Tensor: Output tensor (B, res_out_dims, T).
        Returns
        ----------
        Tensor
            Output tensor (B, res_out_dims, T).
        '''
        x = self.conv_in(x)
@ -121,16 +117,11 @@ class UpsampleNetwork(nn.Layer):
    def forward(self, m):
        '''
-        Parameters
+        Args:
-        ----------
+            m (Tensor): Input tensor (B, C_aux, T).
-        c : Tensor
+        Returns:
-            Input tensor (B, C_aux, T).
+            Tensor: Output tensor (B, (T - 2 * pad) * prod(upsample_scales), C_aux).
-        Returns
+            Tensor: Output tensor (B, (T - 2 * pad) * prod(upsample_scales), res_out_dims).
        ----------
        Tensor
            Output tensor (B, (T - 2 * pad) *  prob(upsample_scales), C_aux).
        Tensor
            Output tensor (B, (T - 2 * pad) *  prob(upsample_scales), res_out_dims).
        '''
        # aux: [B, C_aux, T] 
        # -> [B, res_out_dims, T - 2 * aux_context_window]
@ -172,32 +163,20 @@ class WaveRNN(nn.Layer):
            mode='RAW',
            init_type: str="xavier_uniform", ):
        '''
-        Parameters
+        Args:
-        ----------
+            rnn_dims (int, optional): Hidden dims of RNN Layers.
-        rnn_dims : int, optional
+            fc_dims (int, optional): Dims of FC Layers.
-            Hidden dims of RNN Layers.
+            bits (int, optional): bit depth of signal.
-        fc_dims : int, optional
+            aux_context_window (int, optional): The context window size of the first convolution applied to the 
-             Dims of FC Layers.
+                auxiliary input, by default 2
-        bits : int, optional
+            upsample_scales (List[int], optional): Upsample scales of the upsample network.
-            bit depth of signal.
+            aux_channels (int, optional): Auxiliary channel of the residual blocks.
-        aux_context_window : int, optional
+            compute_dims (int, optional): Dims of Conv1D in MelResNet.
-            The context window size of the first convolution applied to the 
+            res_out_dims (int, optional): Dims of output in MelResNet.
-            auxiliary input, by default 2
+            res_blocks (int, optional): Number of residual blocks.
-        upsample_scales : List[int], optional
+            mode (str, optional): Output mode of the WaveRNN vocoder. 
-            Upsample scales of the upsample network.
+                `MOL` for Mixture of Logistic Distribution, and `RAW` for quantized bits as the model's output.
-        aux_channels : int, optional
+            init_type (str): How to initialize parameters.
            Auxiliary channel of the residual blocks.
        compute_dims : int, optional
            Dims of Conv1D in MelResNet.
        res_out_dims : int, optional
            Dims of output in MelResNet.
        res_blocks : int, optional
            Number of residual blocks.
        mode : str, optional
            Output mode of the WaveRNN vocoder. `MOL` for Mixture of Logistic Distribution,
            and `RAW` for quantized bits as the model's output.
        init_type : str
            How to initialize parameters.
        '''
        super().__init__()
        self.mode = mode
@ -245,18 +224,13 @@ class WaveRNN(nn.Layer):
    def forward(self, x, c):
        '''
-        Parameters
+        Args:
-        ----------
+            x (Tensor): wav sequence, [B, T]
-        x : Tensor
+            c (Tensor): mel spectrogram [B, C_aux, T']
-            wav sequence, [B, T]
+
-        c : Tensor
+            T = (T' - 2 * aux_context_window ) * hop_length
-            mel spectrogram [B, C_aux, T']
+        Returns:
-        
+            Tensor: [B, T, n_classes]
        T = (T' - 2 * aux_context_window ) * hop_length
        Returns
        ----------
        Tensor
            [B, T, n_classes]
        '''
        # Although we `_flatten_parameters()` on init, when using DataParallel
        # the model gets replicated, making it no longer guaranteed that the
@ -304,22 +278,14 @@ class WaveRNN(nn.Layer):
                 mu_law: bool=True,
                 gen_display: bool=False):
        """
-        Parameters
+        Args:
-        ----------
+            c(Tensor): input mels, (T', C_aux)
-        c : Tensor
+            batched(bool): generate in batch or not
-            input mels, (T', C_aux)
+            target(int): target number of samples to be generated in each batch entry
-        batched : bool
+            overlap(int): number of samples for crossfading between batches
-            generate in batch or not
+            mu_law(bool): use mu law or not
-        target : int
+        Returns: 
-            target number of samples to be generated in each batch entry
+            wav sequence: Output (T' * prod(upsample_scales), out_channels, C_out).
        overlap : int
            number of samples for crossfading between batches
        mu_law : bool
            use mu law or not
        Returns
        ----------
        wav sequence
            Output (T' * prod(upsample_scales), out_channels, C_out).
        """
        self.eval()
@ -434,16 +400,13 @@ class WaveRNN(nn.Layer):
    def pad_tensor(self, x, pad, side='both'):
        '''
-        Parameters
+        Args:
-        ----------
+            x(Tensor): mel, [1, n_frames, 80]
-        x : Tensor
+            pad(int): 
-            mel, [1, n_frames, 80]
+            side(str, optional): 'both', 'before' or 'after'. (Default value = 'both')
-        pad : int
+
-        side : str 
+        Returns:
-            'both', 'before' or 'after'
+            Tensor
        Returns
        ----------
        Tensor
        '''
        b, t, _ = paddle.shape(x)
        # for dygraph to static graph
@ -461,38 +424,29 @@ class WaveRNN(nn.Layer):
        Fold the tensor with overlap for quick batched inference.
        Overlap will be used for crossfading in xfade_and_unfold()
-        Parameters
+        Args:
-        ----------
+            x(Tensor): Upsampled conditioning features. mels or aux
-        x : Tensor
+                shape=(1, T, features)
-            Upsampled conditioning features. mels or aux
+                mels: [1, T, 80]
-            shape=(1, T, features)
+                aux: [1, T, 128]
-            mels: [1, T, 80]
+            target(int): Target timesteps for each index of batch
-            aux: [1, T, 128]
+            overlap(int): Timesteps for both xfade and rnn warmup
-        target : int
+
-            Target timesteps for each index of batch
+        Returns:
-        overlap : int
+            Tensor: 
-            Timesteps for both xfade and rnn warmup
+                shape=(num_folds, target + 2 * overlap, features)
-            overlap = hop_length * 2
+                num_folds = (time_seq - overlap) // (target + overlap)
-
+                mel: [num_folds, target + 2 * overlap, 80]
-        Returns
+                aux: [num_folds, target + 2 * overlap, 128]
-        ----------
+
-        Tensor 
+        Details:
-            shape=(num_folds, target + 2 * overlap, features)
+            x = [[h1, h2, ... hn]]
-            num_flods = (time_seq - overlap) // (target + overlap)
+            Where each h is a vector of conditioning features
-            mel: [num_folds, target + 2 * overlap, 80]
+            Eg: target=2, overlap=1 with x.size(1)=10
-            aux: [num_folds, target + 2 * overlap, 128]
+
-
+            folded = [[h1, h2, h3, h4],
-        Details
+                    [h4, h5, h6, h7],
-        ----------
+                    [h7, h8, h9, h10]]
        x = [[h1, h2, ... hn]]
        Where each h is a vector of conditioning features
        Eg: target=2, overlap=1 with x.size(1)=10
        folded = [[h1, h2, h3, h4],
                  [h4, h5, h6, h7],
                  [h7, h8, h9, h10]]
        '''
        _, total_len, features = paddle.shape(x)
@ -520,37 +474,33 @@ class WaveRNN(nn.Layer):
    def xfade_and_unfold(self, y, target: int=12000, overlap: int=600):
        ''' Applies a crossfade and unfolds into a 1d array.
-        Parameters
+        Args:
-        ----------
+            y (Tensor): 
-        y : Tensor
+                Batched sequences of audio samples
-            Batched sequences of audio samples
+                shape=(num_folds, target + 2 * overlap)
-            shape=(num_folds, target + 2 * overlap)
+                dtype=paddle.float32
-            dtype=paddle.float32
+            overlap (int): Timesteps for both xfade and rnn warmup
-        overlap : int
+
-            Timesteps for both xfade and rnn warmup
+        Returns:
-
+            Tensor
-        Returns
+                audio samples in a 1d array
-        ----------
+                shape=(total_len)
-        Tensor
+                dtype=paddle.float32
-            audio samples in a 1d array
+
-            shape=(total_len)
+        Details:
-            dtype=paddle.float32
+            y = [[seq1],
-
+                [seq2],
-        Details
+                [seq3]]
-        ----------
+
-        y = [[seq1],
+            Apply a gain envelope at both ends of the sequences
-            [seq2],
+
-            [seq3]]
+            y = [[seq1_in, seq1_target, seq1_out],
-
+                [seq2_in, seq2_target, seq2_out],
-        Apply a gain envelope at both ends of the sequences
+                [seq3_in, seq3_target, seq3_out]]
-
+
-        y = [[seq1_in, seq1_target, seq1_out],
+            Stagger and add up the groups of samples:
-            [seq2_in, seq2_target, seq2_out],
+
-            [seq3_in, seq3_target, seq3_out]]
+            [seq1_in, seq1_target, (seq1_out + seq2_in), seq2_target, ...]
        Stagger and add up the groups of samples:
        [seq1_in, seq1_target, (seq1_out + seq2_in), seq2_target, ...]
        '''
        # num_folds = (total_len - overlap) // (target + overlap)
--- a/paddlespeech/t2s/modules/causal_conv.py
+++ b/paddlespeech/t2s/modules/causal_conv.py
@ -41,14 +41,10 @@ class CausalConv1D(nn.Layer):
    def forward(self, x):
        """Calculate forward propagation.
-        Parameters
+        Args:
-        ----------
+            x (Tensor): Input tensor (B, in_channels, T).
-        x : Tensor
+        Returns: 
-            Input tensor (B, in_channels, T).
+            Tensor: Output tensor (B, out_channels, T).
        Returns
        ----------
        Tensor
            Output tensor (B, out_channels, T).
        """
        return self.conv(self.pad(x))[:, :, :x.shape[2]]
@ -70,13 +66,9 @@ class CausalConv1DTranspose(nn.Layer):
    def forward(self, x):
        """Calculate forward propagation.
-        Parameters
+        Args:
-        ----------
+            x (Tensor): Input tensor (B, in_channels, T_in).
-        x : Tensor
+        Returns:
-            Input tensor (B, in_channels, T_in).
+            Tensor: Output tensor (B, out_channels, T_out).
        Returns
        ----------
        Tensor
            Output tensor (B, out_channels, T_out).
        """
        return self.deconv(x)[:, :, :-self.stride]
--- a/paddlespeech/t2s/modules/conformer/convolution.py
+++ b/paddlespeech/t2s/modules/conformer/convolution.py
@ -18,12 +18,10 @@ from paddle import nn
 class ConvolutionModule(nn.Layer):
    """ConvolutionModule in Conformer model.
-    Parameters
+
-    ----------
+    Args:
-    channels : int
+        channels (int): The number of channels of conv layers.
-        The number of channels of conv layers.
+        kernel_size (int): Kernel size of conv layers.
    kernel_size : int
        Kernerl size of conv layers.
    """
    def __init__(self, channels, kernel_size, activation=nn.ReLU(), bias=True):
@ -59,14 +57,11 @@ class ConvolutionModule(nn.Layer):
    def forward(self, x):
        """Compute convolution module.
-        Parameters
+
-        ----------
+        Args:
-        x : paddle.Tensor
+            x (Tensor): Input tensor (#batch, time, channels).
-            Input tensor (#batch, time, channels).
+        Returns:
-        Returns
+            Tensor: Output tensor (#batch, time, channels).
        ----------
        paddle.Tensor
            Output tensor (#batch, time, channels).
        """
        # exchange the temporal dimension and the feature dimension
        x = x.transpose([0, 2, 1])
--- a/paddlespeech/t2s/modules/conformer/encoder_layer.py
+++ b/paddlespeech/t2s/modules/conformer/encoder_layer.py
@ -21,38 +21,29 @@ from paddlespeech.t2s.modules.layer_norm import LayerNorm
 class EncoderLayer(nn.Layer):
    """Encoder layer module.
-    Parameters
+    
-    ----------
+    Args:
-    size : int
+        size (int): Input dimension.
-        Input dimension.
+        self_attn (nn.Layer): Self-attention module instance.
-    self_attn : nn.Layer
+            `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` instance
-        Self-attention module instance.
+            can be used as the argument.
-        `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` instance
+        feed_forward (nn.Layer): Feed-forward module instance.
-        can be used as the argument.
+            `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance
-    feed_forward : nn.Layer
+            can be used as the argument.
-        Feed-forward module instance.
+        feed_forward_macaron (nn.Layer): Additional feed-forward module instance.
-        `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance
+            `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance
-        can be used as the argument.
+            can be used as the argument.
-    feed_forward_macaron : nn.Layer
+        conv_module (nn.Layer): Convolution module instance.
-        Additional feed-forward module instance.
+            `ConvolutionModule` instance can be used as the argument.
-        `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance
+        dropout_rate (float): Dropout rate.
-        can be used as the argument.
+        normalize_before (bool): Whether to use layer_norm before the first block.
-    conv_module : nn.Layer
+        concat_after (bool): Whether to concat attention layer's input and output.
-        Convolution module instance.
+            if True, additional linear will be applied.
-        `ConvlutionModule` instance can be used as the argument.
+            i.e. x -> x + linear(concat(x, att(x)))
-    dropout_rate : float
+            if False, no additional linear will be applied. i.e. x -> x + att(x)
-        Dropout rate.
+        stochastic_depth_rate (float): Probability to skip this layer.
-    normalize_before : bool
+            During training, the layer may skip residual computation and return input
-        Whether to use layer_norm before the first block.
+            as-is with given probability.
    concat_after : bool
        Whether to concat attention layer's input and output.
        if True, additional linear will be applied.
        i.e. x -> x + linear(concat(x, att(x)))
        if False, no additional linear will be applied. i.e. x -> x + att(x)
    stochastic_depth_rate : float
        Proability to skip this layer.
        During training, the layer may skip residual computation and return input
        as-is with given probability.
    """
    def __init__(
@ -93,22 +84,17 @@ class EncoderLayer(nn.Layer):
    def forward(self, x_input, mask, cache=None):
        """Compute encoded features.
-        Parameters
+
-        ----------
+        Args:
-        x_input : Union[Tuple, paddle.Tensor]
+            x_input(Union[Tuple, Tensor]): Input tensor w/ or w/o pos emb.
-            Input tensor w/ or w/o pos emb.
+                - w/ pos emb: Tuple of tensors [(#batch, time, size), (1, time, size)].
-            - w/ pos emb: Tuple of tensors [(#batch, time, size), (1, time, size)].
+                - w/o pos emb: Tensor (#batch, time, size).
-            - w/o pos emb: Tensor (#batch, time, size).
+            mask(Tensor): Mask tensor for the input (#batch, time).
-        mask : paddle.Tensor
+            cache (Tensor): Cache tensor of the input (#batch, time - 1, size).
-            Mask tensor for the input (#batch, time).
+
-        cache paddle.Tensor
+        Returns:
-            Cache tensor of the input (#batch, time - 1, size).
+            Tensor: Output tensor (#batch, time, size).
-        Returns
+            Tensor: Mask tensor (#batch, time).
        ----------
        paddle.Tensor
            Output tensor (#batch, time, size).
        paddle.Tensor
            Mask tensor (#batch, time).
        """
        if isinstance(x_input, tuple):
            x, pos_emb = x_input[0], x_input[1]
--- a/paddlespeech/t2s/modules/conv.py
+++ b/paddlespeech/t2s/modules/conv.py
@ -40,36 +40,29 @@ class Conv1dCell(nn.Conv1D):
    2. padding must be a causal padding (receptive_field - 1, 0).
    Thus, these arguments are removed from the ``__init__`` method of this
    class.
-    
+
-    Parameters
+    Args:
-    ----------
+        in_channels (int): The feature size of the input.
-    in_channels: int
+        out_channels (int): The feature size of the output.
-        The feature size of the input.
+        kernel_size (int or Tuple[int]): The size of the kernel.
-    out_channels: int
+        dilation (int or Tuple[int]): The dilation of the convolution, by default 1
-        The feature size of the output.
+        weight_attr (ParamAttr, Initializer, str or bool, optional): The parameter attribute of the convolution kernel, 
-    kernel_size: int or Tuple[int]
+            by default None.
-        The size of the kernel.
+        bias_attr (ParamAttr, Initializer, str or bool, optional): The parameter attribute of the bias. 
-    dilation: int or Tuple[int]
+            If ``False``, this layer does not have a bias, by default None.
-        The dilation of the convolution, by default 1
+            
-    weight_attr: ParamAttr, Initializer, str or bool, optional
+    Examples: 
-        The parameter attribute of the convolution kernel, by default None.
+        >>> cell = Conv1dCell(3, 4, kernel_size=5)
-    bias_attr: ParamAttr, Initializer, str or bool, optional
+        >>> inputs = [paddle.randn([4, 3]) for _ in range(16)]
-        The parameter attribute of the bias. If ``False``, this layer does not
+        >>> outputs = []
-        have a bias, by default None.
+        >>> cell.eval()
-        
+        >>> cell.start_sequence()
-    Examples
+        >>> for xt in inputs:
-    --------
+        >>>     outputs.append(cell.add_input(xt))
-    >>> cell = Conv1dCell(3, 4, kernel_size=5)
+        >>> len(outputs)
-    >>> inputs = [paddle.randn([4, 3]) for _ in range(16)]
+        16
-    >>> outputs = []
+        >>> outputs[0].shape
-    >>> cell.eval()
+        [4, 4]
    >>> cell.start_sequence()
    >>> for xt in inputs:
    >>>     outputs.append(cell.add_input(xt))
    >>> len(outputs))
    16
    >>> outputs[0].shape
    [4, 4]
    """
    def __init__(self,
@ -103,15 +96,13 @@ class Conv1dCell(nn.Conv1D):
    def start_sequence(self):
        """Prepare the layer for a series of incremental forward.
-        Warnings
+        Warnings:
-        ---------
+            This method should be called before a sequence of calls to
-        This method should be called before a sequence of calls to
+            ``add_input``.
        ``add_input``.
-        Raises
+        Raises:
-        ------
+            Exception
-        Exception
+                If this method is called when the layer is in training mode.
            If this method is called when the layer is in training mode.
        """
        if self.training:
            raise Exception("only use start_sequence in evaluation")
@ -130,10 +121,9 @@ class Conv1dCell(nn.Conv1D):
    def initialize_buffer(self, x_t):
        """Initialize the buffer for the step input.
-        Parameters
+        Args:
-        ----------
+            x_t (Tensor): The step input. shape=(batch_size, in_channels)
-        x_t : Tensor [shape=(batch_size, in_channels)]
+            
            The step input.
        """
        batch_size, _ = x_t.shape
        self._buffer = paddle.zeros(
@ -143,26 +133,22 @@ class Conv1dCell(nn.Conv1D):
    def update_buffer(self, x_t):
        """Shift the buffer by one step.
-        Parameters
+        Args:
-        ----------
+            x_t (Tensor): The step input. shape=(batch_size, in_channels)
-        x_t : Tensor [shape=(batch_size, in_channels)]
+            
            The step input.
        """
        self._buffer = paddle.concat(
            [self._buffer[:, :, 1:], paddle.unsqueeze(x_t, -1)], -1)
    def add_input(self, x_t):
        """Add step input and compute step output.
-        
+
-        Parameters
+        Args:
-        -----------
+            x_t (Tensor): The step input. shape=(batch_size, in_channels)
-        x_t : Tensor [shape=(batch_size, in_channels)]
+          
-            The step input.
+        Returns: 
-            
+            y_t (Tensor): The step output. shape=(batch_size, out_channels)
-        Returns
+
        -------
        y_t :Tensor [shape=(batch_size, out_channels)]
            The step output.
        """
        batch_size = x_t.shape[0]
        if self.receptive_field > 1:
@ -186,33 +172,26 @@ class Conv1dCell(nn.Conv1D):
 class Conv1dBatchNorm(nn.Layer):
    """A Conv1D Layer followed by a BatchNorm1D.
-    Parameters
+    Args:
-    ----------
+        in_channels (int): The feature size of the input.
-    in_channels : int
+        out_channels (int): The feature size of the output.
-        The feature size of the input.
+        kernel_size (int): The size of the convolution kernel.
-    out_channels : int
+        stride (int, optional): The stride of the convolution, by default 1.
-        The feature size of the output.
+        padding (int, str or Tuple[int], optional):
-    kernel_size : int
+            The padding of the convolution.
-        The size of the convolution kernel.
+            If int, a symmetrical padding is applied before convolution;
-    stride : int, optional
+            If str, it should be "same" or "valid";
-        The stride of the convolution, by default 1.
+            If Tuple[int], its length should be 2, meaning
-    padding : int, str or Tuple[int], optional
+            ``(pad_before, pad_after)``, by default 0.
-        The padding of the convolution.
+        weight_attr (ParamAttr, Initializer, str or bool, optional):
-        If int, a symmetrical padding is applied before convolution;
+            The parameter attribute of the convolution kernel,
-        If str, it should be "same" or "valid";
+            by default None.
-        If Tuple[int], its length should be 2, meaning
+        bias_attr (ParamAttr, Initializer, str or bool, optional):
-        ``(pad_before, pad_after)``, by default 0.
+            The parameter attribute of the bias of the convolution,
-    weight_attr : ParamAttr, Initializer, str or bool, optional
+            by default None.
-        The parameter attribute of the convolution kernel, by default None.
+        data_format (str ["NCL" or "NLC"], optional): The data layout of the input, by default "NCL"
-    bias_attr : ParamAttr, Initializer, str or bool, optional
+        momentum (float, optional): The momentum of the BatchNorm1D layer, by default 0.9
-        The parameter attribute of the bias of the convolution, by default
+        epsilon (float, optional): The epsilon of the BatchNorm1D layer, by default 1e-05
        None.
    data_format : str ["NCL" or "NLC"], optional
        The data layout of the input, by default "NCL"
    momentum : float, optional
        The momentum of the BatchNorm1D layer, by default 0.9
    epsilon : [type], optional
        The epsilon of the BatchNorm1D layer, by default 1e-05
    """
    def __init__(self,
@ -244,16 +223,15 @@ class Conv1dBatchNorm(nn.Layer):
    def forward(self, x):
        """Forward pass of the Conv1dBatchNorm layer.
-
+        
-        Parameters
+        Args:
-        ----------
+            x (Tensor): The input tensor. Its data layout depends on ``data_format``. 
-        x : Tensor [shape=(B, C_in, T_in) or (B, T_in, C_in)]
+            shape=(B, C_in, T_in) or (B, T_in, C_in)
-            The input tensor. Its data layout depends on ``data_format``.
+    
-
+        Returns:
-        Returns
+            Tensor: The output tensor. 
-        -------
+                shape=(B, C_out, T_out) or (B, T_out, C_out)
-        Tensor [shape=(B, C_out, T_out) or (B, T_out, C_out)]
+                
            The output tensor. 
        """
        x = self.conv(x)
        x = self.bn(x)
--- a/paddlespeech/t2s/modules/geometry.py
+++ b/paddlespeech/t2s/modules/geometry.py
@ -17,24 +17,18 @@ import paddle
 def shuffle_dim(x, axis, perm=None):
    """Permute input tensor along axis given the permutation or randomly.
    Args:
        x (Tensor): The input tensor.
        axis (int): The axis to shuffle.
        perm (List[int], ndarray, optional): 
            The order to reorder the tensor along the ``axis``-th dimension.
            It is a permutation of ``[0, d)``, where d is the size of the
            ``axis``-th dimension of the input tensor. If not provided,
            a random permutation is used. Defaults to None.
-    Parameters
+    Returns:
-    ----------
+        Tensor: The shuffled tensor, which has the same shape as x does.
    x : Tensor
        The input tensor.
    axis : int
        The axis to shuffle.
    perm : List[int], ndarray, optional
        The order to reorder the tensor along the ``axis``-th dimension.
        It is a permutation of ``[0, d)``, where d is the size of the
        ``axis``-th dimension of the input tensor. If not provided,
        a random permutation is used. Defaults to None.
    Returns
    ---------
    Tensor
        The shuffled tensor, which has the same shape as x does.
    """
    size = x.shape[axis]
    if perm is not None and len(perm) != size:
--- a/paddlespeech/t2s/modules/layer_norm.py
+++ b/paddlespeech/t2s/modules/layer_norm.py
@ -18,13 +18,9 @@ from paddle import nn
 class LayerNorm(nn.LayerNorm):
    """Layer normalization module.
-
+    Args:
-    Parameters
+        nout (int): Output dim size.
-    ----------
+        dim (int): Dimension to be normalized.
    nout : int
        Output dim size.
    dim : int
        Dimension to be normalized.
    """
    def __init__(self, nout, dim=-1):
@ -35,15 +31,11 @@ class LayerNorm(nn.LayerNorm):
    def forward(self, x):
        """Apply layer normalization.
-        Parameters
+        Args:
-        ----------
+            x (Tensor): Input tensor.
        x : paddle.Tensor
            Input tensor.
-        Returns
+        Returns: 
-        ----------
+            Tensor: Normalized tensor.
        paddle.Tensor
            Normalized tensor.
        """
        if self.dim == -1:
--- a/paddlespeech/t2s/modules/losses.py
+++ b/paddlespeech/t2s/modules/losses.py
@ -118,16 +118,13 @@ def discretized_mix_logistic_loss(y_hat,
 def sample_from_discretized_mix_logistic(y, log_scale_min=None):
    """
    Sample from discretized mixture of logistic distributions
-    Parameters
+
-    ----------
+    Args:
-    y : Tensor 
+        y(Tensor): (B, C, T)
-        (B, C, T)
+        log_scale_min(float, optional): Log scale minimum value. (Default value = None)
-    log_scale_min : float
+
-        Log scale minimum value
+    Returns:
-    Returns
+        Tensor: sample in range of [-1, 1].
    ----------
    Tensor
        sample in range of [-1, 1].
    """
    if log_scale_min is None:
        log_scale_min = float(np.log(1e-14))
@ -181,14 +178,10 @@ class GuidedAttentionLoss(nn.Layer):
    def __init__(self, sigma=0.4, alpha=1.0, reset_always=True):
        """Initialize guided attention loss module.
-        Parameters
+        Args:
-        ----------
+            sigma (float, optional): Standard deviation to control how close attention to a diagonal.
-        sigma : float, optional
+            alpha (float, optional): Scaling coefficient (lambda).
-            Standard deviation to control how close attention to a diagonal.
+            reset_always (bool, optional): Whether to always reset masks.
        alpha : float, optional
            Scaling coefficient (lambda).
        reset_always : bool, optional
            Whether to always reset masks.
        """
        super().__init__()
@ -205,19 +198,13 @@ class GuidedAttentionLoss(nn.Layer):
    def forward(self, att_ws, ilens, olens):
        """Calculate forward propagation.
-        Parameters
+        Args:
-        ----------
+            att_ws(Tensor): Batch of attention weights (B, T_max_out, T_max_in).
-        att_ws : Tensor
+            ilens(Tensor(int64)): Batch of input lengths (B,).
-            Batch of attention weights (B, T_max_out, T_max_in).
+            olens(Tensor(int64)): Batch of output lengths (B,).
-        ilens : Tensor(int64)
+
-            Batch of input lenghts (B,).
+        Returns:
-        olens : Tensor(int64)
+            Tensor: Guided attention loss value.
            Batch of output lenghts (B,).
        Returns
        ----------
        Tensor
            Guided attention loss value.
        """
        if self.guided_attn_masks is None:
@ -282,39 +269,33 @@ class GuidedAttentionLoss(nn.Layer):
    def _make_masks(ilens, olens):
        """Make masks indicating non-padded part.
-        Parameters
+        Args:
-        ----------
+            ilens(Tensor(int64) or List): Batch of lengths (B,).
-        ilens : Tensor(int64) or List
+            olens(Tensor(int64) or List): Batch of lengths (B,).
-            Batch of lengths (B,).
+
-        olens : Tensor(int64) or List
+        Returns:
-            Batch of lengths (B,).
+            Tensor: Mask tensor indicating non-padded part.
-
+
-        Returns
+        Examples:
-        ----------
+            >>> ilens, olens = [5, 2], [8, 5]
-        Tensor
+            >>> _make_masks(ilens, olens)
-            Mask tensor indicating non-padded part.
+            tensor([[[1, 1, 1, 1, 1],
-
+                    [1, 1, 1, 1, 1],
-        Examples
+                    [1, 1, 1, 1, 1],
-        ----------
+                    [1, 1, 1, 1, 1],
-        >>> ilens, olens = [5, 2], [8, 5]
+                    [1, 1, 1, 1, 1],
-        >>> _make_mask(ilens, olens)
+                    [1, 1, 1, 1, 1],
-        tensor([[[1, 1, 1, 1, 1],
+                    [1, 1, 1, 1, 1],
-                [1, 1, 1, 1, 1],
+                    [1, 1, 1, 1, 1]],
-                [1, 1, 1, 1, 1],
+
-                [1, 1, 1, 1, 1],
+                    [[1, 1, 0, 0, 0],
-                [1, 1, 1, 1, 1],
+                    [1, 1, 0, 0, 0],
-                [1, 1, 1, 1, 1],
+                    [1, 1, 0, 0, 0],
-                [1, 1, 1, 1, 1],
+                    [1, 1, 0, 0, 0],
-                [1, 1, 1, 1, 1]],
+                    [1, 1, 0, 0, 0],
-
+                    [0, 0, 0, 0, 0],
-                [[1, 1, 0, 0, 0],
+                    [0, 0, 0, 0, 0],
-                [1, 1, 0, 0, 0],
+                    [0, 0, 0, 0, 0]]], dtype=paddle.uint8)
                [1, 1, 0, 0, 0],
                [1, 1, 0, 0, 0],
                [1, 1, 0, 0, 0],
                [0, 0, 0, 0, 0],
                [0, 0, 0, 0, 0],
                [0, 0, 0, 0, 0]]], dtype=paddle.uint8)
        """
        # (B, T_in)
@ -330,34 +311,24 @@ class GuidedAttentionLoss(nn.Layer):
 class GuidedMultiHeadAttentionLoss(GuidedAttentionLoss):
    """Guided attention loss function module for multi head attention.
-    Parameters
+    Args:
-    ----------
+        sigma (float, optional): Standard deviation to control
-    sigma : float, optional
+            how close attention to a diagonal.
-        Standard deviation to controlGuidedAttentionLoss
+        alpha (float, optional): Scaling coefficient (lambda).
-        how close attention to a diagonal.
+        reset_always (bool, optional): Whether to always reset masks.
    alpha : float, optional
        Scaling coefficient (lambda).
    reset_always : bool, optional
        Whether to always reset masks.
    """
    def forward(self, att_ws, ilens, olens):
        """Calculate forward propagation.
-        Parameters
+        Args:
-        ----------
+            att_ws(Tensor): Batch of multi head attention weights (B, H, T_max_out, T_max_in).
-        att_ws : Tensor
+            ilens(Tensor): Batch of input lengths (B,).
-            Batch of multi head attention weights (B, H, T_max_out, T_max_in).
+            olens(Tensor): Batch of output lengths (B,).
-        ilens : Tensor
+
-            Batch of input lenghts (B,).
+        Returns:
-        olens : Tensor
+            Tensor: Guided attention loss value.
            Batch of output lenghts (B,).
        Returns
        ----------
        Tensor
            Guided attention loss value.
        """
        if self.guided_attn_masks is None:
@ -382,14 +353,11 @@ class Tacotron2Loss(nn.Layer):
                 use_weighted_masking=False,
                 bce_pos_weight=20.0):
        """Initialize Tacotron2 loss module.
-        Parameters
+
-        ----------
+        Args:
-        use_masking : bool
+            use_masking (bool): Whether to apply masking for padded part in loss calculation.
-            Whether to apply masking for padded part in loss calculation.
+            use_weighted_masking (bool): Whether to apply weighted masking in loss calculation.
-        use_weighted_masking : bool
+            bce_pos_weight (float): Weight of positive sample of stop token.
            Whether to apply weighted masking in loss calculation.
        bce_pos_weight : float
            Weight of positive sample of stop token.
        """
        super().__init__()
        assert (use_masking != use_weighted_masking) or not use_masking
@ -405,28 +373,19 @@ class Tacotron2Loss(nn.Layer):
    def forward(self, after_outs, before_outs, logits, ys, stop_labels, olens):
        """Calculate forward propagation.
-        Parameters
+
-        ----------
+        Args:
-        after_outs : Tensor
+            after_outs(Tensor): Batch of outputs after postnets (B, Lmax, odim).
-            Batch of outputs after postnets (B, Lmax, odim).
+            before_outs(Tensor): Batch of outputs before postnets (B, Lmax, odim).
-        before_outs : Tensor
+            logits(Tensor): Batch of stop logits (B, Lmax).
-            Batch of outputs before postnets (B, Lmax, odim).
+            ys(Tensor): Batch of padded target features (B, Lmax, odim).
-        logits : Tensor
+            stop_labels(Tensor(int64)): Batch of the sequences of stop token labels (B, Lmax).
-            Batch of stop logits (B, Lmax).
+            olens(Tensor(int64)): Batch of the lengths of each target (B,).
-        ys : Tensor
+
-            Batch of padded target features (B, Lmax, odim).
+        Returns:
-        stop_labels : Tensor(int64)
+            Tensor: L1 loss value.
-            Batch of the sequences of stop token labels (B, Lmax).
+            Tensor: Mean square error loss value.
-        olens : Tensor(int64)
+            Tensor: Binary cross entropy loss value.
            Batch of the lengths of each target (B,).
        Returns
        ----------
        Tensor
            L1 loss value.
        Tensor
            Mean square error loss value.
        Tensor
            Binary cross entropy loss value.
        """
        # make mask and apply it
        if self.use_masking:
@ -513,28 +472,20 @@ def stft(x,
         center=True,
         pad_mode='reflect'):
    """Perform STFT and convert to magnitude spectrogram.
-    Parameters
+    Args:
-    ----------
+        x(Tensor): Input signal tensor (B, T).
-    x : Tensor
+        fft_size(int): FFT size.
-        Input signal tensor (B, T).
+        hop_size(int): Hop size.
-    fft_size : int
+        win_length(int, optional): Window length. (Default value = None)
-        FFT size.
+        window(str, optional): Name of window function, see `scipy.signal.get_window` for more
-    hop_size : int
+            details. Defaults to "hann".
-        Hop size.
+        center(bool, optional): Whether to pad `x` to make that the
-    win_length : int
+            :math:`t \times hop\\_length` at the center of :math:`t`-th frame. Default: `True`.
-        window : str, optional
+        pad_mode(str, optional): Choose padding pattern when `center` is `True`. (Default value = 'reflect')
-    window : str
+        hop_length:  (Default value = None)
-        Name of window function, see `scipy.signal.get_window` for more
+
-        details. Defaults to "hann".
+    Returns:
-    center : bool, optional
+        Tensor: Magnitude spectrogram (B, #frames, fft_size // 2 + 1).
        center (bool, optional): Whether to pad `x` to make that the
        :math:`t \times hop\\_length` at the center of :math:`t`-th frame. Default: `True`.
    pad_mode : str, optional
        Choose padding pattern when `center` is `True`.
    Returns
    ----------
    Tensor:
        Magnitude spectrogram (B, #frames, fft_size // 2 + 1).
    """
    # calculate window
    window = signal.get_window(window, win_length, fftbins=True)
@ -564,16 +515,11 @@ class SpectralConvergenceLoss(nn.Layer):
    def forward(self, x_mag, y_mag):
        """Calculate forward propagation.
-        Parameters
+        Args: 
-        ----------
+            x_mag (Tensor): Magnitude spectrogram of predicted signal (B, #frames, #freq_bins).
-        x_mag : Tensor
+            y_mag (Tensor): Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins).
-            Magnitude spectrogram of predicted signal (B, #frames, #freq_bins).
+        Returns:
-        y_mag : Tensor)
+            Tensor: Spectral convergence loss value.
            Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins).
        Returns
        ----------
        Tensor
            Spectral convergence loss value.
        """
        return paddle.norm(
            y_mag - x_mag, p="fro") / paddle.clip(
@ -590,16 +536,11 @@ class LogSTFTMagnitudeLoss(nn.Layer):
    def forward(self, x_mag, y_mag):
        """Calculate forward propagation.
-        Parameters
+        Args:
-        ----------
+            x_mag (Tensor): Magnitude spectrogram of predicted signal (B, #frames, #freq_bins).
-        x_mag : Tensor
+            y_mag (Tensor): Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins).
-            Magnitude spectrogram of predicted signal (B, #frames, #freq_bins).
+        Returns:
-        y_mag : Tensor
+            Tensor: Log STFT magnitude loss value.
            Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins).
        Returns
        ----------
        Tensor
            Log STFT magnitude loss value.
        """
        return F.l1_loss(
            paddle.log(paddle.clip(y_mag, min=self.epsilon)),
@ -625,18 +566,12 @@ class STFTLoss(nn.Layer):
    def forward(self, x, y):
        """Calculate forward propagation.
-        Parameters
+        Args:
-        ----------
+            x (Tensor): Predicted signal (B, T).
-        x : Tensor
+            y (Tensor): Groundtruth signal (B, T).
-            Predicted signal (B, T).
+        Returns:
-        y : Tensor
+            Tensor: Spectral convergence loss value.
-            Groundtruth signal (B, T).
+            Tensor: Log STFT magnitude loss value.
        Returns
        ----------
        Tensor
            Spectral convergence loss value.
        Tensor
            Log STFT magnitude loss value.
        """
        x_mag = stft(x, self.fft_size, self.shift_size, self.win_length,
                     self.window)
@ -658,16 +593,11 @@ class MultiResolutionSTFTLoss(nn.Layer):
            win_lengths=[600, 1200, 240],
            window="hann", ):
        """Initialize Multi resolution STFT loss module.
-        Parameters
+        Args:
-        ----------
+            fft_sizes (list): List of FFT sizes.
-        fft_sizes : list
+            hop_sizes (list): List of hop sizes.
-            List of FFT sizes.
+            win_lengths (list): List of window lengths.
-        hop_sizes : list
+            window (str): Window function type.
            List of hop sizes.
        win_lengths : list
            List of window lengths.
        window : str
            Window function type.
        """
        super().__init__()
        assert len(fft_sizes) == len(hop_sizes) == len(win_lengths)
@ -677,18 +607,13 @@ class MultiResolutionSTFTLoss(nn.Layer):
    def forward(self, x, y):
        """Calculate forward propagation.
-        Parameters
+        
-        ----------
+        Args:
-        x : Tensor
+            x (Tensor): Predicted signal (B, T) or (B, #subband, T).
-            Predicted signal (B, T) or (B, #subband, T).
+            y (Tensor): Groundtruth signal (B, T) or (B, #subband, T).
-        y : Tensor
+        Returns:
-            Groundtruth signal (B, T) or (B, #subband, T).
+            Tensor: Multi resolution spectral convergence loss value.
-        Returns
+            Tensor: Multi resolution log STFT magnitude loss value.
        ----------
        Tensor
            Multi resolution spectral convergence loss value.
        Tensor
            Multi resolution log STFT magnitude loss value.
        """
        if len(x.shape) == 3:
            # (B, C, T) -> (B x C, T)
@ -725,14 +650,10 @@ class GeneratorAdversarialLoss(nn.Layer):
    def forward(self, outputs):
        """Calculate generator adversarial loss.
-        Parameters
+        Args:
-        ----------
+            outputs (Tensor or List): Discriminator outputs or list of discriminator outputs.
-        outputs: Tensor or List
+        Returns:
-        Discriminator outputs or list of discriminator outputs.
+            Tensor: Generator adversarial loss value.
        Returns
        ----------
        Tensor
            Generator adversarial loss value.
        """
        if isinstance(outputs, (tuple, list)):
            adv_loss = 0.0
@ -772,20 +693,15 @@ class DiscriminatorAdversarialLoss(nn.Layer):
    def forward(self, outputs_hat, outputs):
        """Calculate discriminator adversarial loss.
-        Parameters
+
-        ----------
+        Args:
-        outputs_hat : Tensor or list
+            outputs_hat (Tensor or list): Discriminator outputs or list of
-            Discriminator outputs or list of
+                discriminator outputs calculated from generator outputs.
-            discriminator outputs calculated from generator outputs.
+            outputs (Tensor or list): Discriminator outputs or list of
-        outputs : Tensor or list
+                discriminator outputs calculated from groundtruth.
-            Discriminator outputs or list of
+        Returns:
-            discriminator outputs calculated from groundtruth.
+            Tensor: Discriminator real loss value.
-        Returns
+            Tensor: Discriminator fake loss value.
        ----------
        Tensor
            Discriminator real loss value.
        Tensor
            Discriminator fake loss value.
        """
        if isinstance(outputs, (tuple, list)):
            real_loss = 0.0
@ -868,17 +784,13 @@ def ssim(img1, img2, window_size=11, size_average=True):
 def weighted_mean(input, weight):
    """Weighted mean. It can also be used as masked mean.
-    Parameters
+    Args:
-    -----------
+        input(Tensor): The input tensor.
-    input : Tensor 
+        weight(Tensor): The weight tensor with broadcastable shape with the input.
-        The input tensor.
+
-    weight : Tensor
+    Returns:
-        The weight tensor with broadcastable shape with the input.
+        Tensor: Weighted mean tensor with the same dtype as input. shape=(1,)
-
+            
    Returns
    ----------
    Tensor [shape=(1,)]
        Weighted mean tensor with the same dtype as input.
    """
    weight = paddle.cast(weight, input.dtype)
    # paddle.Tensor.size is different with torch.size() and has been overrided in s2t.__init__
@ -889,20 +801,15 @@ def weighted_mean(input, weight):
 def masked_l1_loss(prediction, target, mask):
    """Compute masked L1 loss.
-    Parameters
+    Args:
-    ----------
+        prediction(Tensor): The prediction.
-    prediction : Tensor
+        target(Tensor): The target. The shape should be broadcastable to ``prediction``.
-        The prediction.
+        mask(Tensor): The mask. The shape should be broadcastable to the broadcasted shape of
-    target : Tensor
+            ``prediction`` and ``target``.
-        The target. The shape should be broadcastable to ``prediction``.
+
-    mask : Tensor
+    Returns:
-        The mask. The shape should be broadcatable to the broadcasted shape of
+        Tensor: The masked L1 loss. shape=(1,)
-        ``prediction`` and ``target``.
+        
    Returns
    -------
    Tensor [shape=(1,)]
        The masked L1 loss.
    """
    abs_error = F.l1_loss(prediction, target, reduction='none')
    loss = weighted_mean(abs_error, mask)
@ -975,14 +882,11 @@ class MelSpectrogram(nn.Layer):
    def forward(self, x):
        """Calculate Mel-spectrogram.
-        Parameters
+        Args:
-        ----------
+        
-        x : Tensor
+            x (Tensor): Input waveform tensor (B, T) or (B, 1, T).
-            Input waveform tensor (B, T) or (B, 1, T).
+        Returns:
-        Returns
+            Tensor: Mel-spectrogram (B, #mels, #frames).
        ----------
        Tensor
            Mel-spectrogram (B, #mels, #frames).
        """
        if len(x.shape) == 3:
            # (B, C, T) -> (B*C, T)
@ -1047,16 +951,12 @@ class MelSpectrogramLoss(nn.Layer):
    def forward(self, y_hat, y):
        """Calculate Mel-spectrogram loss.
-        Parameters
+        Args:
-        ----------
+            y_hat(Tensor): Generated single tensor (B, 1, T).
-        y_hat : Tensor
+            y(Tensor): Groundtruth single tensor (B, 1, T).
-            Generated single tensor (B, 1, T).
+
-        y : Tensor
+        Returns:
-            Groundtruth single tensor (B, 1, T).
+            Tensor: Mel-spectrogram loss value.
        Returns
        ----------
        Tensor
            Mel-spectrogram loss value.
        """
        mel_hat = self.mel_spectrogram(y_hat)
        mel = self.mel_spectrogram(y)
@ -1081,18 +981,14 @@ class FeatureMatchLoss(nn.Layer):
    def forward(self, feats_hat, feats):
        """Calculate feature matching loss.
-        Parameters
+
-        ----------
+        Args:
-        feats_hat : list
+            feats_hat(list): List of list of discriminator outputs
-            List of list of discriminator outputs
+                calculated from generator outputs.
-            calcuated from generater outputs.
+            feats(list): List of list of discriminator outputs calculated from groundtruth.
-        feats : list
+
-            List of list of discriminator outputs
+        Returns:
-            calcuated from groundtruth.
+            Tensor: Feature matching loss value.
        Returns
        ----------
        Tensor
            Feature matching loss value.
        """
        feat_match_loss = 0.0
--- a/paddlespeech/t2s/modules/nets_utils.py
+++ b/paddlespeech/t2s/modules/nets_utils.py
@ -20,27 +20,21 @@ from typeguard import check_argument_types
 def pad_list(xs, pad_value):
    """Perform padding for the list of tensors.
-    Parameters
+    Args:
-    ----------
+        xs (List[Tensor]): List of Tensors [(T_1, `*`), (T_2, `*`), ..., (T_B, `*`)].
-    xs : List[Tensor]
+        pad_value (float): Value for padding.
-        List of Tensors [(T_1, `*`), (T_2, `*`), ..., (T_B, `*`)].
+
-    pad_value : float)
+    Returns:
-        Value for padding.
+        Tensor: Padded tensor (B, Tmax, `*`).
-
+
-    Returns
+    Examples:
-    ----------
+        >>> x = [paddle.ones([4]), paddle.ones([2]), paddle.ones([1])]
-    Tensor
+        >>> x
-        Padded tensor (B, Tmax, `*`).
+        [tensor([1., 1., 1., 1.]), tensor([1., 1.]), tensor([1.])]
-
+        >>> pad_list(x, 0)
-    Examples
+        tensor([[1., 1., 1., 1.],
-    ----------
+                [1., 1., 0., 0.],
-    >>> x = [paddle.ones([4]), paddle.ones([2]), paddle.ones([1])]
+                [1., 0., 0., 0.]])
    >>> x
    [tensor([1., 1., 1., 1.]), tensor([1., 1.]), tensor([1.])]
    >>> pad_list(x, 0)
    tensor([[1., 1., 1., 1.],
            [1., 1., 0., 0.],
            [1., 0., 0., 0.]])
    """
    n_batch = len(xs)
    max_len = max(x.shape[0] for x in xs)
@ -55,25 +49,20 @@ def pad_list(xs, pad_value):
 def make_pad_mask(lengths, length_dim=-1):
    """Make mask tensor containing indices of padded part.
-    Parameters
+    Args:
-    ----------
+        lengths (Tensor(int64)): Batch of lengths (B,).
-    lengths : LongTensor
+
-            Batch of lengths (B,).
+    Returns: 
-
+        Tensor(bool): Mask tensor containing indices of padded part bool.
-    Returns
+
-    ----------
+    Examples:
-    Tensor(bool)
+        With only lengths.
-        Mask tensor containing indices of padded part bool.
+
-
+        >>> lengths = [5, 3, 2]
-    Examples
+        >>> make_pad_mask(lengths)
-    ----------
+        masks = [[0, 0, 0, 0 ,0],
-    With only lengths.
+                    [0, 0, 0, 1, 1],
-
+                    [0, 0, 1, 1, 1]]
    >>> lengths = [5, 3, 2]
    >>> make_non_pad_mask(lengths)
    masks = [[0, 0, 0, 0 ,0],
                [0, 0, 0, 1, 1],
                [0, 0, 1, 1, 1]]
    """
    if length_dim == 0:
        raise ValueError("length_dim cannot be 0: {}".format(length_dim))
@ -91,31 +80,24 @@ def make_pad_mask(lengths, length_dim=-1):
 def make_non_pad_mask(lengths, length_dim=-1):
    """Make mask tensor containing indices of non-padded part.
-    Parameters
+    Args:
-    ----------
+        lengths (Tensor(int64) or List): Batch of lengths (B,).
-    lengths : LongTensor or List
+        xs (Tensor, optional): The reference tensor.
-            Batch of lengths (B,).
+            If set, masks will be the same shape as this tensor.
-    xs : Tensor, optional
+        length_dim (int, optional): Dimension indicator of the above tensor.
-        The reference tensor.
+            See the example.
-        If set, masks will be the same shape as this tensor.
+
-    length_dim : int, optional
+    Returns:
-        Dimension indicator of the above tensor.
+        Tensor(bool): Mask tensor containing indices of non-padded part.
-        See the example.
+
-
+    Examples: 
-    Returns
+        With only lengths.
-    ----------
+
-    Tensor(bool)
+        >>> lengths = [5, 3, 2]
-        mask tensor containing indices of padded part bool.
+        >>> make_non_pad_mask(lengths)
-
+        masks = [[1, 1, 1, 1 ,1],
-    Examples
+                    [1, 1, 1, 0, 0],
-    ----------
+                    [1, 1, 0, 0, 0]]
    With only lengths.
    >>> lengths = [5, 3, 2]
    >>> make_non_pad_mask(lengths)
    masks = [[1, 1, 1, 1 ,1],
                [1, 1, 1, 0, 0],
                [1, 1, 0, 0, 0]]
    """
    return paddle.logical_not(make_pad_mask(lengths, length_dim))
@ -127,12 +109,9 @@ def initialize(model: nn.Layer, init: str):
    Custom initialization routines can be implemented into submodules
-    Parameters
+    Args:
-    ----------
+        model (nn.Layer): Target.
-    model : nn.Layer
+        init (str): Method of initialization.
        Target.
    init : str
        Method of initialization.
    """
    assert check_argument_types()
--- a/paddlespeech/t2s/modules/pqmf.py
+++ b/paddlespeech/t2s/modules/pqmf.py
@ -24,20 +24,16 @@ def design_prototype_filter(taps=62, cutoff_ratio=0.142, beta=9.0):
    """Design prototype filter for PQMF.
    This method is based on `A Kaiser window approach for the design of prototype
    filters of cosine modulated filterbanks`_.
-    Parameters
+
-    ----------
+    Args:
-    taps : int
+        taps (int): The number of filter taps.
-        The number of filter taps.
+        cutoff_ratio (float): Cut-off frequency ratio.
-    cutoff_ratio : float
+        beta (float): Beta coefficient for kaiser window.
-        Cut-off frequency ratio.
+    Returns:
-    beta : float
+        ndarray:
-        Beta coefficient for kaiser window.
+            Impulse response of prototype filter (taps + 1,).
-    Returns
+        .. _`A Kaiser window approach for the design of prototype filters of cosine modulated filterbanks`:
-    ----------
+            https://ieeexplore.ieee.org/abstract/document/681427
    ndarray
        Impluse response of prototype filter (taps + 1,).
    .. _`A Kaiser window approach for the design of prototype filters of cosine modulated filterbanks`:
        https://ieeexplore.ieee.org/abstract/document/681427
    """
    # check the arguments are valid
    assert taps % 2 == 0, "The number of taps mush be even number."
@ -68,16 +64,12 @@ class PQMF(nn.Layer):
        """Initialize PQMF module.
        The cutoff_ratio and beta parameters are optimized for #subbands = 4.
        See discussion in https://github.com/kan-bayashi/ParallelWaveGAN/issues/195.
-        Parameters
+
-        ----------
+        Args:
-        subbands : int
+            subbands (int): The number of subbands.
-            The number of subbands.
+            taps (int): The number of filter taps.
-        taps : int
+            cutoff_ratio (float): Cut-off frequency ratio.
-            The number of filter taps.
+            beta (float): Beta coefficient for kaiser window.
        cutoff_ratio : float
            Cut-off frequency ratio.
        beta : float
            Beta coefficient for kaiser window.
        """
        super().__init__()
@ -110,28 +102,20 @@ class PQMF(nn.Layer):
    def analysis(self, x):
        """Analysis with PQMF.
-        Parameters
+        Args:
-        ----------
+            x (Tensor): Input tensor (B, 1, T).
-        x : Tensor
+        Returns:
-            Input tensor (B, 1, T).
+            Tensor: Output tensor (B, subbands, T // subbands).
        Returns
        ----------
        Tensor
            Output tensor (B, subbands, T // subbands).
        """
        x = F.conv1d(self.pad_fn(x), self.analysis_filter)
        return F.conv1d(x, self.updown_filter, stride=self.subbands)
    def synthesis(self, x):
        """Synthesis with PQMF.
-        Parameters
+        Args:
-        ----------
+            x (Tensor): Input tensor (B, subbands, T // subbands).
-        x : Tensor
+        Returns:
-            Input tensor (B, subbands, T // subbands).
+            Tensor: Output tensor (B, 1, T).
        Returns
        ----------
        Tensor
            Output tensor (B, 1, T).
        """
        x = F.conv1d_transpose(
            x, self.updown_filter * self.subbands, stride=self.subbands)
--- a/paddlespeech/t2s/modules/predictor/duration_predictor.py
+++ b/paddlespeech/t2s/modules/predictor/duration_predictor.py
@ -49,20 +49,13 @@ class DurationPredictor(nn.Layer):
                 offset=1.0):
        """Initialize duration predictor module.
-        Parameters
+        Args:
-        ----------
+            idim (int): Input dimension.
-        idim : int
+            n_layers (int, optional): Number of convolutional layers.
-            Input dimension.
+            n_chans (int, optional): Number of channels of convolutional layers.
-        n_layers : int, optional
+            kernel_size (int, optional): Kernel size of convolutional layers.
-                Number of convolutional layers.
+            dropout_rate (float, optional): Dropout rate.
-        n_chans : int, optional
+            offset (float, optional): Offset value to avoid nan in log domain.
            Number of channels of convolutional layers.
        kernel_size : int, optional
            Kernel size of convolutional layers.
        dropout_rate : float, optional
                Dropout rate.
        offset : float, optional
            Offset value to avoid nan in log domain.
        """
        super().__init__()
@ -105,35 +98,23 @@ class DurationPredictor(nn.Layer):
    def forward(self, xs, x_masks=None):
        """Calculate forward propagation.
        Args:
            xs(Tensor): Batch of input sequences (B, Tmax, idim).
            x_masks(ByteTensor, optional): Batch of masks indicating padded part (B, Tmax). (Default value = None)
-        Parameters
+        Returns:
-        ----------
+            Tensor: Batch of predicted durations in log domain (B, Tmax).
        xs : Tensor
            Batch of input sequences (B, Tmax, idim).
        x_masks : ByteTensor, optional
            Batch of masks indicating padded part (B, Tmax).
        Returns
        ----------
        Tensor
            Batch of predicted durations in log domain (B, Tmax).
        """
        return self._forward(xs, x_masks, False)
    def inference(self, xs, x_masks=None):
        """Inference duration.
        Args:
            xs(Tensor): Batch of input sequences (B, Tmax, idim).
            x_masks(Tensor(bool), optional): Batch of masks indicating padded part (B, Tmax). (Default value = None)
-        Parameters
+        Returns:
-        ----------
+            Tensor: Batch of predicted durations in linear domain int64 (B, Tmax).
        xs : Tensor
            Batch of input sequences (B, Tmax, idim).
        x_masks : Tensor(bool), optional
            Batch of masks indicating padded part (B, Tmax).
        Returns
        ----------
        Tensor
            Batch of predicted durations in linear domain int64 (B, Tmax).
        """
        return self._forward(xs, x_masks, True)
@ -147,13 +128,9 @@ class DurationPredictorLoss(nn.Layer):
    def __init__(self, offset=1.0, reduction="mean"):
        """Initialize duration predictor loss module.
-
+        Args:
-        Parameters
+            offset (float, optional): Offset value to avoid nan in log domain.
-        ----------
+            reduction (str): Reduction type in loss calculation.
        offset : float, optional
            Offset value to avoid nan in log domain.
        reduction : str
            Reduction type in loss calculation.
        """
        super().__init__()
        self.criterion = nn.MSELoss(reduction=reduction)
@ -162,21 +139,15 @@ class DurationPredictorLoss(nn.Layer):
    def forward(self, outputs, targets):
        """Calculate forward propagation.
-        Parameters
+        Args:
-        ----------
+            outputs(Tensor): Batch of prediction durations in log domain (B, T)
-        outputs : Tensor
+            targets(Tensor): Batch of groundtruth durations in linear domain (B, T)
-            Batch of prediction durations in log domain (B, T)
+
-        targets : Tensor
+        Returns: 
-            Batch of groundtruth durations in linear domain (B, T)
+            Tensor: Mean squared error loss value.
-
+
-        Returns
+        Note: 
-        ----------
+            `outputs` is in log domain but `targets` is in linear domain.
        Tensor
            Mean squared error loss value.
        Note
        ----------
        `outputs` is in log domain but `targets` is in linear domain.
        """
        # NOTE: outputs is in log domain while targets in linear
        targets = paddle.log(targets.cast(dtype='float32') + self.offset)
--- a/paddlespeech/t2s/modules/predictor/length_regulator.py
+++ b/paddlespeech/t2s/modules/predictor/length_regulator.py
@ -35,10 +35,8 @@ class LengthRegulator(nn.Layer):
    def __init__(self, pad_value=0.0):
        """Initialize length regulator module.
-        Parameters
+        Args:
-        ----------
+            pad_value (float, optional): Value used for padding.
        pad_value : float, optional
            Value used for padding.
        """
        super().__init__()
@ -90,19 +88,13 @@ class LengthRegulator(nn.Layer):
    def forward(self, xs, ds, alpha=1.0, is_inference=False):
        """Calculate forward propagation.
-        Parameters
+        Args:
-        ----------
+            xs (Tensor): Batch of sequences of char or phoneme embeddings (B, Tmax, D).
-        xs : Tensor
+            ds (Tensor(int64)): Batch of durations of each frame (B, T).
-            Batch of sequences of char or phoneme embeddings (B, Tmax, D).
+            alpha (float, optional): Alpha value to control speed of speech.
        ds : Tensor(int64)
            Batch of durations of each frame (B, T).
        alpha : float, optional
            Alpha value to control speed of speech.
-        Returns
+        Returns:
-        ----------
+            Tensor: replicated input tensor based on durations (B, T*, D).
        Tensor
            replicated input tensor based on durations (B, T*, D).
        """
        if alpha != 1.0:
--- a/paddlespeech/t2s/modules/predictor/variance_predictor.py
+++ b/paddlespeech/t2s/modules/predictor/variance_predictor.py
@ -42,18 +42,12 @@ class VariancePredictor(nn.Layer):
            dropout_rate: float=0.5, ):
        """Initialize variance predictor module.
-        Parameters
+        Args:
-        ----------
+            idim (int): Input dimension.
-        idim : int
+            n_layers (int, optional): Number of convolutional layers.
-            Input dimension.
+            n_chans (int, optional): Number of channels of convolutional layers.
-        n_layers : int, optional
+            kernel_size (int, optional): Kernel size of convolutional layers.
-            Number of convolutional layers.
+            dropout_rate (float, optional): Dropout rate.
        n_chans : int, optional
            Number of channels of convolutional layers.
        kernel_size : int, optional
            Kernel size of convolutional layers.
        dropout_rate : float, optional
            Dropout rate.
        """
        assert check_argument_types()
        super().__init__()
@ -79,17 +73,12 @@ class VariancePredictor(nn.Layer):
                x_masks: paddle.Tensor=None) -> paddle.Tensor:
        """Calculate forward propagation.
-        Parameters
+        Args:
-        ----------
+            xs (Tensor): Batch of input sequences (B, Tmax, idim).
-            xs : Tensor
+            x_masks (Tensor(bool), optional): Batch of masks indicating padded part (B, Tmax, 1).
                Batch of input sequences (B, Tmax, idim).
            x_masks : Tensor(bool), optional
                Batch of masks indicating padded part (B, Tmax, 1).
-        Returns
+        Returns:
-        ----------
+            Tensor: Batch of predicted sequences (B, Tmax, 1).
            Tensor
                Batch of predicted sequences (B, Tmax, 1).
        """
        # (B, idim, Tmax)
        xs = xs.transpose([0, 2, 1])
--- a/paddlespeech/t2s/modules/residual_block.py
+++ b/paddlespeech/t2s/modules/residual_block.py
@ -28,26 +28,16 @@ class WaveNetResidualBlock(nn.Layer):
    unit and parametric residual and skip connections. For more details, 
    refer to `WaveNet: A Generative Model for Raw Audio <https://arxiv.org/abs/1609.03499>`_.
-    Parameters
+    Args:
-    ----------
+        kernel_size (int, optional): Kernel size of the 1D convolution, by default 3
-    kernel_size : int, optional
+        residual_channels (int, optional): Feature size of the residual output(and also the input), by default 64
-        Kernel size of the 1D convolution, by default 3
+        gate_channels (int, optional): Output feature size of the 1D convolution, by default 128
-    residual_channels : int, optional
+        skip_channels (int, optional): Feature size of the skip output, by default 64
-        Feature size of the resiaudl output(and also the input), by default 64
+        aux_channels (int, optional): Feature size of the auxiliary input (e.g. spectrogram), by default 80
-    gate_channels : int, optional
+        dropout (float, optional): Probability of the dropout before the 1D convolution, by default 0.
-        Output feature size of the 1D convolution, by default 128
+        dilation (int, optional): Dilation of the 1D convolution, by default 1
-    skip_channels : int, optional
+        bias (bool, optional): Whether to use bias in the 1D convolution, by default True
-        Feature size of the skip output, by default 64
+        use_causal_conv (bool, optional): Whether to use causal padding for the 1D convolution, by default False
    aux_channels : int, optional
        Feature size of the auxiliary input (e.g. spectrogram), by default 80
    dropout : float, optional
        Probability of the dropout before the 1D convolution, by default 0.
    dilation : int, optional
        Dilation of the 1D convolution, by default 1
    bias : bool, optional
        Whether to use bias in the 1D convolution, by default True
    use_causal_conv : bool, optional
        Whether to use causal padding for the 1D convolution, by default False
    """
    def __init__(self,
@ -90,21 +80,15 @@ class WaveNetResidualBlock(nn.Layer):
    def forward(self, x, c):
        """
-        Parameters
+        Args:
-        ----------
+            x (Tensor): the input features. Shape (N, C_res, T)
-        x : Tensor
+            c (Tensor): the auxiliary input. Shape (N, C_aux, T)
-            Shape (N, C_res, T), the input features.
+
-        c : Tensor
+        Returns:
-            Shape (N, C_aux, T), the auxiliary input.
+            res (Tensor): Shape (N, C_res, T), the residual output, which is used as the 
-
+                input of the next ResidualBlock in a stack of ResidualBlocks.
-        Returns
+            skip (Tensor): Shape (N, C_skip, T), the skip output, which is collected among
-        -------
+                each layer in a stack of ResidualBlocks.
        res : Tensor
            Shape (N, C_res, T), the residual output, which is used as the 
            input of the next ResidualBlock in a stack of ResidualBlocks.
        skip : Tensor
            Shape (N, C_skip, T), the skip output, which is collected among
            each layer in a stack of ResidualBlocks.
        """
        x_input = x
        x = F.dropout(x, self.dropout, training=self.training)
@ -136,22 +120,14 @@ class HiFiGANResidualBlock(nn.Layer):
            nonlinear_activation_params: Dict[str, Any]={"negative_slope": 0.1},
    ):
        """Initialize HiFiGANResidualBlock module.
-        Parameters
+        Args:
-        ----------
+            kernel_size (int): Kernel size of dilation convolution layer.
-        kernel_size : int
+            channels (int): Number of channels for convolution layer.
-            Kernel size of dilation convolution layer.
+            dilations (List[int]): List of dilation factors.
-        channels : int
+            use_additional_convs (bool): Whether to use additional convolution layers.
-            Number of channels for convolution layer.
+            bias (bool): Whether to add bias parameter in convolution layers.
-        dilations : List[int]
+            nonlinear_activation (str): Activation function module name.
-            List of dilation factors.
+            nonlinear_activation_params (dict): Hyperparameters for activation function.
        use_additional_convs : bool
            Whether to use additional convolution layers.
        bias : bool
            Whether to add bias parameter in convolution layers.
        nonlinear_activation : str
            Activation function module name.
        nonlinear_activation_params : dict
            Hyperparameters for activation function.
        """
        super().__init__()
@ -190,14 +166,10 @@ class HiFiGANResidualBlock(nn.Layer):
    def forward(self, x):
        """Calculate forward propagation.
-        Parameters
+        Args:
-        ----------
+            x (Tensor): Input tensor (B, channels, T).
-        x : Tensor
+        Returns:
-            Input tensor (B, channels, T).
+            Tensor: Output tensor (B, channels, T).
        Returns
        ----------
        Tensor
            Output tensor (B, channels, T).
        """
        for idx in range(len(self.convs1)):
            xt = self.convs1[idx](x)
--- a/paddlespeech/t2s/modules/residual_stack.py
+++ b/paddlespeech/t2s/modules/residual_stack.py
@ -37,26 +37,17 @@ class ResidualStack(nn.Layer):
            pad_params: Dict[str, Any]={"mode": "reflect"},
            use_causal_conv: bool=False, ):
        """Initialize ResidualStack module.
-        Parameters
+
-        ----------
+        Args:
-        kernel_size : int
+            kernel_size (int): Kernel size of dilation convolution layer.
-            Kernel size of dilation convolution layer.
+            channels (int): Number of channels of convolution layers.
-        channels : int
+            dilation (int): Dilation factor.
-            Number of channels of convolution layers.
+            bias (bool): Whether to add bias parameter in convolution layers.
-        dilation : int
+            nonlinear_activation (str): Activation function module name.
-            Dilation factor.
+            nonlinear_activation_params (Dict[str,Any]): Hyperparameters for activation function.
-        bias : bool
+            pad (str): Padding function module name before dilated convolution layer.
-            Whether to add bias parameter in convolution layers.
+            pad_params (Dict[str, Any]): Hyperparameters for padding function.
-        nonlinear_activation : str
+            use_causal_conv (bool): Whether to use causal convolution.
            Activation function module name.
        nonlinear_activation_params : Dict[str,Any]
            Hyperparameters for activation function.
        pad : str
            Padding function module name before dilated convolution layer.
        pad_params : Dict[str, Any]
            Hyperparameters for padding function.
        use_causal_conv : bool
            Whether to use causal convolution.
        """
        super().__init__()
        # for compatibility
@ -102,13 +93,10 @@ class ResidualStack(nn.Layer):
    def forward(self, c):
        """Calculate forward propagation.
-        Parameters
+
-        ----------
+        Args:
-        c : Tensor
+            c (Tensor): Input tensor (B, channels, T).
-            Input tensor (B, channels, T).
+        Returns:     
-        Returns
+            Tensor: Output tensor (B, channels, T).
        ----------
        Tensor
            Output tensor (B, chennels, T).
        """
        return self.stack(c) + self.skip_layer(c)
--- a/paddlespeech/t2s/modules/style_encoder.py
+++ b/paddlespeech/t2s/modules/style_encoder.py
@ -30,33 +30,21 @@ class StyleEncoder(nn.Layer):
    .. _`Style Tokens: Unsupervised Style Modeling, Control and Transfer in End-to-End
        Speech Synthesis`: https://arxiv.org/abs/1803.09017
-
+    
-    Parameters
+    Args:
-    ----------
+        idim (int, optional): Dimension of the input mel-spectrogram.
-    idim : int, optional
+        gst_tokens (int, optional): The number of GST embeddings.
-        Dimension of the input mel-spectrogram.
+        gst_token_dim (int, optional): Dimension of each GST embedding.
-    gst_tokens : int, optional
+        gst_heads (int, optional): The number of heads in GST multihead attention.
-        The number of GST embeddings.
+        conv_layers (int, optional): The number of conv layers in the reference encoder.
-    gst_token_dim : int, optional
+        conv_chans_list (Sequence[int], optional): List of the number of channels of conv layers in the reference encoder.
-        Dimension of each GST embedding.
+        conv_kernel_size (int, optional): Kernel size of conv layers in the reference encoder.
-    gst_heads : int, optional
+        conv_stride (int, optional): Stride size of conv layers in the reference encoder.
-        The number of heads in GST multihead attention.
+        gru_layers (int, optional): The number of GRU layers in the reference encoder.
-    conv_layers : int, optional
+        gru_units (int, optional): The number of GRU units in the reference encoder.
-        The number of conv layers in the reference encoder.
+
-    conv_chans_list : Sequence[int], optional
+    Todo:
-        List of the number of channels of conv layers in the referece encoder.
+        * Support manual weight specification in inference.
    conv_kernel_size : int, optional
        Kernal size of conv layers in the reference encoder.
    conv_stride : int, optional
        Stride size of conv layers in the reference encoder.
    gru_layers : int, optional
        The number of GRU layers in the reference encoder.
    gru_units : int, optional
        The number of GRU units in the reference encoder.
    Todo
    ----------
    * Support manual weight specification in inference.
    """
@ -93,15 +81,11 @@ class StyleEncoder(nn.Layer):
    def forward(self, speech: paddle.Tensor) -> paddle.Tensor:
        """Calculate forward propagation.
-        Parameters
+        Args:
-        ----------
+            speech (Tensor): Batch of padded target features (B, Lmax, odim).
        speech : Tensor
            Batch of padded target features (B, Lmax, odim).
-        Returns
+        Returns: 
-        ----------
+            Tensor: Style token embeddings (B, token_dim).
        Tensor:
            Style token embeddings (B, token_dim).
        """
        ref_embs = self.ref_enc(speech)
@ -118,23 +102,15 @@ class ReferenceEncoder(nn.Layer):
    .. _`Style Tokens: Unsupervised Style Modeling, Control and Transfer in End-to-End
        Speech Synthesis`: https://arxiv.org/abs/1803.09017
-
+    
-    Parameters
+    Args:
-    ----------
+        idim (int, optional): Dimension of the input mel-spectrogram.
-    idim : int, optional
+        conv_layers (int, optional): The number of conv layers in the reference encoder.
-        Dimension of the input mel-spectrogram.
+        conv_chans_list (Sequence[int], optional): List of the number of channels of conv layers in the reference encoder.
-    conv_layers : int, optional
+        conv_kernel_size (int, optional): Kernel size of conv layers in the reference encoder.
-        The number of conv layers in the reference encoder.
+        conv_stride (int, optional): Stride size of conv layers in the reference encoder.
-    conv_chans_list: : Sequence[int], optional
+        gru_layers (int, optional): The number of GRU layers in the reference encoder.
-        List of the number of channels of conv layers in the referece encoder.
+        gru_units (int, optional): The number of GRU units in the reference encoder.
    conv_kernel_size : int, optional
        Kernal size of conv layers in the reference encoder.
    conv_stride : int, optional
        Stride size of conv layers in the reference encoder.
    gru_layers : int, optional
        The number of GRU layers in the reference encoder.
    gru_units : int, optional
        The number of GRU units in the reference encoder.
    """
@ -191,16 +167,11 @@ class ReferenceEncoder(nn.Layer):
    def forward(self, speech: paddle.Tensor) -> paddle.Tensor:
        """Calculate forward propagation.
        Args:
            speech (Tensor): Batch of padded target features (B, Lmax, idim).
-        Parameters
+        Returns:
-        ----------
+            Tensor: Reference embedding (B, gru_units)
        speech : Tensor
            Batch of padded target features (B, Lmax, idim).
        Return
        ----------
        Tensor
            Reference embedding (B, gru_units)
        """
        batch_size = speech.shape[0]
@ -228,19 +199,12 @@ class StyleTokenLayer(nn.Layer):
    .. _`Style Tokens: Unsupervised Style Modeling, Control and Transfer in End-to-End
        Speech Synthesis`: https://arxiv.org/abs/1803.09017
-
+    Args:
-    Parameters
+        ref_embed_dim (int, optional): Dimension of the input reference embedding.
-    ----------
+        gst_tokens (int, optional): The number of GST embeddings.
-    ref_embed_dim : int, optional
+        gst_token_dim (int, optional): Dimension of each GST embedding.
-        Dimension of the input reference embedding.
+        gst_heads (int, optional): The number of heads in GST multihead attention.
-    gst_tokens : int, optional
+        dropout_rate (float, optional): Dropout rate in multi-head attention.
        The number of GST embeddings.
    gst_token_dim : int, optional
        Dimension of each GST embedding.
    gst_heads : int, optional
        The number of heads in GST multihead attention.
    dropout_rate : float, optional
        Dropout rate in multi-head attention.
    """
@ -271,15 +235,11 @@ class StyleTokenLayer(nn.Layer):
    def forward(self, ref_embs: paddle.Tensor) -> paddle.Tensor:
        """Calculate forward propagation.
-        Parameters
+        Args:
-        ----------
+            ref_embs (Tensor): Reference embeddings (B, ref_embed_dim).
        ref_embs : Tensor
            Reference embeddings (B, ref_embed_dim).
-        Returns
+        Returns: 
-        ----------
+            Tensor: Style token embeddings (B, gst_token_dim).
        Tensor
            Style token embeddings (B, gst_token_dim).
        """
        batch_size = ref_embs.shape[0]
--- a/paddlespeech/t2s/modules/tacotron2/attentions.py
+++ b/paddlespeech/t2s/modules/tacotron2/attentions.py
@ -30,21 +30,14 @@ def _apply_attention_constraint(e,
    introduced in `Deep Voice 3: Scaling
    Text-to-Speech with Convolutional Sequence Learning`_.
-    Parameters
+    Args:
-    ----------
+        e(Tensor): Attention energy before applying softmax (1, T).
-    e : Tensor
+        last_attended_idx(int): The index of the inputs of the last attended [0, T].
-        Attention energy before applying softmax (1, T).
+        backward_window(int, optional): Backward window size in attention constraint. (Default value = 1)
-    last_attended_idx : int
+        forward_window(int, optional): Forward window size in attention constraint. (Default value = 3)
-        The index of the inputs of the last attended [0, T].
+
-    backward_window : int, optional
+    Returns:
-        Backward window size in attention constraint.
+        Tensor: Monotonic constrained attention energy (1, T).
    forward_window : int, optional
        Forward window size in attetion constraint.
    Returns
    ----------
    Tensor
        Monotonic constrained attention energy (1, T).
    .. _`Deep Voice 3: Scaling Text-to-Speech with Convolutional Sequence Learning`:
        https://arxiv.org/abs/1710.07654
@ -67,20 +60,14 @@ class AttLoc(nn.Layer):
    Reference: Attention-Based Models for Speech Recognition
        (https://arxiv.org/pdf/1506.07503.pdf)
-    Parameters
+
-    ----------
+    Args:
-    eprojs : int
+        eprojs (int): projection-units of encoder
-        projection-units of encoder
+        dunits (int): units of decoder
-    dunits : int
+        att_dim (int): attention dimension
-        units of decoder
+        aconv_chans (int): channels of attention convolution
-    att_dim :  int
+        aconv_filts (int): filter size of attention convolution
-        att_dim: attention dimension
+        han_mode (bool): flag to switch on mode of hierarchical attention and not store pre_compute_enc_h
    aconv_chans : int
        channels of attention convolution
    aconv_filts : int
        filter size of attention convolution
    han_mode : bool
        flag to swith on mode of hierarchical attention and not store pre_compute_enc_h
    """
    def __init__(self,
@ -129,33 +116,19 @@ class AttLoc(nn.Layer):
            backward_window=1,
            forward_window=3, ):
        """Calculate AttLoc forward propagation.
-        Parameters
+        Args:
-        ----------
+            enc_hs_pad(Tensor): padded encoder hidden state (B, T_max, D_enc)
-        enc_hs_pad : paddle.Tensor
+            enc_hs_len(Tensor): padded encoder hidden state length (B)
-            padded encoder hidden state (B, T_max, D_enc)
+            dec_z(Tensor): decoder hidden state (B, D_dec)
-        enc_hs_len : paddle.Tensor
+            att_prev(Tensor): previous attention weight (B, T_max)
-            padded encoder hidden state length (B)
+            scaling(float, optional): scaling parameter before applying softmax (Default value = 2.0)
-        dec_z : paddle.Tensor dec_z
+            forward_window(Tensor, optional): forward window size when constraining attention (Default value = 3)
-            decoder hidden state (B, D_dec)
+            last_attended_idx(int, optional): index of the inputs of the last attended (Default value = None)
-        att_prev : paddle.Tensor
+            backward_window(int, optional): backward window size in attention constraint (Default value = 1)
-            previous attention weight (B, T_max)
+            forward_window(int, optional): forward window size in attention constraint (Default value = 3)
-        scaling : float
+        Returns:
-            scaling parameter before applying softmax
+            Tensor: attention weighted encoder state (B, D_enc)
-        forward_window : paddle.Tensor
+            Tensor: previous attention weights (B, T_max)
            forward window size when constraining attention
        last_attended_idx : int
            index of the inputs of the last attended
        backward_window : int
            backward window size in attention constraint
        forward_window : int
            forward window size in attetion constraint
        Returns
        ----------
        paddle.Tensor
            attention weighted encoder state (B, D_enc)
        paddle.Tensor  
            previous attention weights (B, T_max)
        """
        batch = paddle.shape(enc_hs_pad)[0]
        # pre-compute all h outside the decoder loop
@ -217,19 +190,13 @@ class AttForward(nn.Layer):
    ----------
    Forward attention in sequence-to-sequence acoustic modeling for speech synthesis
        (https://arxiv.org/pdf/1807.06736.pdf)
-    
+
-    Parameters
+    Args:
-    ----------
+        eprojs (int): projection-units of encoder
-    eprojs : int
+        dunits (int): units of decoder
-        projection-units of encoder
+        att_dim (int): attention dimension
-    dunits : int
+        aconv_chans (int): channels of attention convolution
-        units of decoder
+        aconv_filts (int): filter size of attention convolution
    att_dim : int
        attention dimension
    aconv_chans : int
        channels of attention convolution
    aconv_filts : int 
        filter size of attention convolution
    """
    def __init__(self, eprojs, dunits, att_dim, aconv_chans, aconv_filts):
@ -270,30 +237,20 @@ class AttForward(nn.Layer):
            backward_window=1,
            forward_window=3, ):
        """Calculate AttForward forward propagation.
-        Parameters
+
-        ----------
+        Args:
-        enc_hs_pad : paddle.Tensor
+            enc_hs_pad(Tensor): padded encoder hidden state (B, T_max, D_enc)
-            padded encoder hidden state (B, T_max, D_enc)
+            enc_hs_len(list): padded encoder hidden state length (B,)
-        enc_hs_len : list
+            dec_z(Tensor): decoder hidden state (B, D_dec)
-            padded encoder hidden state length (B,)
+            att_prev(Tensor): attention weights of previous step (B, T_max)
-        dec_z : paddle.Tensor
+            scaling(float, optional): scaling parameter before applying softmax (Default value = 1.0)
-            decoder hidden state (B, D_dec)
+            last_attended_idx(int, optional): index of the inputs of the last attended (Default value = None)
-        att_prev : paddle.Tensor
+            backward_window(int, optional): backward window size in attention constraint (Default value = 1)
-            attention weights of previous step (B, T_max)
+            forward_window(int, optional): forward window size in attention constraint (Default value = 3)
-        scaling : float
+
-            scaling parameter before applying softmax
+        Returns:
-        last_attended_idx : int
+            Tensor: attention weighted encoder state (B, D_enc)
-            index of the inputs of the last attended
+            Tensor: previous attention weights (B, T_max)
        backward_window : int
            backward window size in attention constraint
        forward_window : int
            forward window size in attetion constraint
        Returns
        ----------
        paddle.Tensor
            attention weighted encoder state (B, D_enc)
        paddle.Tensor
            previous attention weights (B, T_max)
        """
        batch = len(enc_hs_pad)
        # pre-compute all h outside the decoder loop
@ -359,24 +316,17 @@ class AttForward(nn.Layer):
 class AttForwardTA(nn.Layer):
    """Forward attention with transition agent module.
-    Reference
+    Reference:
-    ----------
+        Forward attention in sequence-to-sequence acoustic modeling for speech synthesis
-    Forward attention in sequence-to-sequence acoustic modeling for speech synthesis
+            (https://arxiv.org/pdf/1807.06736.pdf)
-        (https://arxiv.org/pdf/1807.06736.pdf)
+
-    Parameters
+    Args:
-    ----------
+        eunits (int): units of encoder
-    eunits : int
+        dunits (int): units of decoder
-        units of encoder
+        att_dim (int): attention dimension
-    dunits : int
+        aconv_chans (int): channels of attention convolution
-        units of decoder
+        aconv_filts (int): filter size of attention convolution
-    att_dim : int
+        odim (int): output dimension
        attention dimension
    aconv_chans : int
        channels of attention convolution
    aconv_filts : int
        filter size of attention convolution
    odim : int
        output dimension
    """
    def __init__(self, eunits, dunits, att_dim, aconv_chans, aconv_filts, odim):
@ -420,32 +370,21 @@ class AttForwardTA(nn.Layer):
            backward_window=1,
            forward_window=3, ):
        """Calculate AttForwardTA forward propagation.
-        Parameters
+
-        ----------
+        Args:
-        enc_hs_pad : paddle.Tensor
+            enc_hs_pad(Tensor): padded encoder hidden state (B, Tmax, eunits)
-            padded encoder hidden state (B, Tmax, eunits)
+            enc_hs_len(list Tensor): padded encoder hidden state length (B,)
-        enc_hs_len : list paddle.Tensor
+            dec_z(Tensor): decoder hidden state (B, dunits)
-            padded encoder hidden state length (B,)
+            att_prev(Tensor): attention weights of previous step (B, T_max)
-        dec_z : paddle.Tensor
+            out_prev(Tensor): decoder outputs of previous step (B, odim)
-            decoder hidden state (B, dunits)
+            scaling(float, optional): scaling parameter before applying softmax (Default value = 1.0)
-        att_prev : paddle.Tensor
+            last_attended_idx(int, optional): index of the inputs of the last attended (Default value = None)
-            attention weights of previous step (B, T_max)
+            backward_window(int, optional): backward window size in attention constraint (Default value = 1)
-        out_prev : paddle.Tensor
+            forward_window(int, optional): forward window size in attention constraint (Default value = 3)
-            decoder outputs of previous step (B, odim)
+
-        scaling : float
+        Returns:
-            scaling parameter before applying softmax
+            Tensor: attention weighted encoder state (B, dunits)
-        last_attended_idx : int
+            Tensor: previous attention weights (B, Tmax)
            index of the inputs of the last attended
        backward_window : int
            backward window size in attention constraint
        forward_window : int
            forward window size in attetion constraint
        Returns
        ----------
        paddle.Tensor
            attention weighted encoder state (B, dunits)
        paddle.Tensor
            previous attention weights (B, Tmax)
        """
        batch = len(enc_hs_pad)
        # pre-compute all h outside the decoder loop
--- a/paddlespeech/t2s/modules/tacotron2/decoder.py
+++ b/paddlespeech/t2s/modules/tacotron2/decoder.py
@ -44,16 +44,11 @@ class Prenet(nn.Layer):
    def __init__(self, idim, n_layers=2, n_units=256, dropout_rate=0.5):
        """Initialize prenet module.
-        Parameters
+        Args:
-        ----------
+            idim (int): Dimension of the inputs.
-        idim : int
+            odim (int): Dimension of the outputs.
-            Dimension of the inputs.
+            n_layers (int, optional): The number of prenet layers.
-        odim : int
+            n_units (int, optional): The number of prenet units.
            Dimension of the outputs.
        n_layers : int, optional
            The number of prenet layers.
        n_units : int, optional
            The number of prenet units.
        """
        super().__init__()
        self.dropout_rate = dropout_rate
@ -66,15 +61,11 @@ class Prenet(nn.Layer):
    def forward(self, x):
        """Calculate forward propagation.
-        Parameters
+        Args:
-        ----------
+            x (Tensor): Batch of input tensors (B, ..., idim).
        x : Tensor
            Batch of input tensors (B, ..., idim).
-        Returns
+        Returns: 
-        ----------
+            Tensor: Batch of output tensors (B, ..., odim).
        Tensor
            Batch of output tensors (B, ..., odim).
        """
        for i in range(len(self.prenet)):
@ -109,22 +100,14 @@ class Postnet(nn.Layer):
            use_batch_norm=True, ):
        """Initialize postnet module.
-        Parameters
+        Args:
-        ----------
+            idim (int): Dimension of the inputs.
-        idim : int
+            odim (int): Dimension of the outputs.
-            Dimension of the inputs.
+            n_layers (int, optional): The number of layers.
-        odim : int
+            n_filts (int, optional): The number of filter size.
-            Dimension of the outputs.
+            n_units (int, optional): The number of filter channels.
-        n_layers : int, optional
+            use_batch_norm (bool, optional): Whether to use batch normalization.
-            The number of layers.
+            dropout_rate (float, optional): Dropout rate.
        n_filts : int, optional
            The number of filter size.
        n_units : int, optional
            The number of filter channels.
        use_batch_norm : bool, optional
            Whether to use batch normalization..
        dropout_rate : float, optional
            Dropout rate..
        """
        super().__init__()
        self.postnet = nn.LayerList()
@ -184,16 +167,10 @@ class Postnet(nn.Layer):
    def forward(self, xs):
        """Calculate forward propagation.
-        Parameters
+        Args:
-        ----------
+            xs (Tensor): Batch of the sequences of padded input tensors (B, idim, Tmax).
-        xs : Tensor
+        Returns:
-            Batch of the sequences of padded input tensors (B, idim, Tmax).
+            Tensor: Batch of padded output tensor. (B, odim, Tmax).
        Returns
        ----------
        Tensor
            Batch of padded output tensor. (B, odim, Tmax).
        """
        for i in range(len(self.postnet)):
            xs = self.postnet[i](xs)
@ -217,13 +194,11 @@ class ZoneOutCell(nn.Layer):
    def __init__(self, cell, zoneout_rate=0.1):
        """Initialize zone out cell module.
-        Parameters
+
-        ----------
+        Args:
-        cell : nn.Layer:
+            cell (nn.Layer): Paddle recurrent cell module
-            Paddle recurrent cell module
+                e.g. `paddle.nn.LSTMCell`.
-            e.g. `paddle.nn.LSTMCell`.
+            zoneout_rate (float, optional): Probability of zoneout from 0.0 to 1.0.
        zoneout_rate : float, optional
            Probability of zoneout from 0.0 to 1.0.
        """
        super().__init__()
        self.cell = cell
@ -235,20 +210,18 @@ class ZoneOutCell(nn.Layer):
    def forward(self, inputs, hidden):
        """Calculate forward propagation.
-        Parameters
+
-        ----------
+        Args:
-        inputs : Tensor
+            inputs (Tensor): Batch of input tensor (B, input_size).
-            Batch of input tensor (B, input_size).
+            hidden (tuple):
-        hidden : tuple
+                - Tensor: Batch of initial hidden states (B, hidden_size).
-            - Tensor: Batch of initial hidden states (B, hidden_size).
+                - Tensor: Batch of initial cell states (B, hidden_size).
-            - Tensor: Batch of initial cell states (B, hidden_size).
+        Returns:
-        Returns
+            Tensor:
-        ----------
+                Batch of next hidden states (B, hidden_size).
-        Tensor
+            tuple:
-            Batch of next hidden states (B, hidden_size).
+                - Tensor: Batch of next hidden states (B, hidden_size).
-        tuple:
+                - Tensor: Batch of next cell states (B, hidden_size).
            - Tensor: Batch of next hidden states (B, hidden_size).
            - Tensor: Batch of next cell states (B, hidden_size).
        """
        # we only use the second output of LSTMCell in paddle
        _, next_hidden = self.cell(inputs, hidden)
@ -302,42 +275,29 @@ class Decoder(nn.Layer):
            zoneout_rate=0.1,
            reduction_factor=1, ):
        """Initialize Tacotron2 decoder module.
-        Parameters
+
-        ----------
+        Args:
-        idim : int
+            idim (int): Dimension of the inputs.
-            Dimension of the inputs.
+            odim (int): Dimension of the outputs.
-        odim : int
+            att (nn.Layer): Instance of attention class.
-            Dimension of the outputs.
+            dlayers (int, optional): The number of decoder lstm layers.
-        att nn.Layer
+            dunits (int, optional): The number of decoder lstm units.
-            Instance of attention class.
+            prenet_layers (int, optional): The number of prenet layers.
-        dlayers int, optional
+            prenet_units (int, optional): The number of prenet units.
-            The number of decoder lstm layers.
+            postnet_layers (int, optional): The number of postnet layers.
-        dunits : int, optional
+            postnet_filts (int, optional): The number of postnet filter size.
-            The number of decoder lstm units.
+            postnet_chans (int, optional): The number of postnet filter channels.
-        prenet_layers : int, optional
+            output_activation_fn (nn.Layer, optional): Activation function for outputs.
-            The number of prenet layers.
+            cumulate_att_w (bool, optional): Whether to cumulate previous attention weight.
-        prenet_units : int, optional
+            use_batch_norm (bool, optional): Whether to use batch normalization.
-            The number of prenet units.
+            use_concate : bool, optional
-        postnet_layers : int, optional
+                Whether to concatenate encoder embedding with decoder lstm outputs.
-            The number of postnet layers.
+            dropout_rate : float, optional
-        postnet_filts : int, optional
+                Dropout rate.
-            The number of postnet filter size.
+            zoneout_rate : float, optional
-        postnet_chans : int, optional
+                Zoneout rate.
-            The number of postnet filter channels.
+            reduction_factor : int, optional
-        output_activation_fn : nn.Layer, optional
+                Reduction factor.
            Activation function for outputs.
        cumulate_att_w : bool, optional
            Whether to cumulate previous attention weight.
        use_batch_norm : bool, optional
            Whether to use batch normalization.
        use_concate : bool, optional
            Whether to concatenate encoder embedding with decoder lstm outputs.
        dropout_rate : float, optional
            Dropout rate.
        zoneout_rate : float, optional
            Zoneout rate.
        reduction_factor : int, optional
            Reduction factor.
        """
        super().__init__()
@ -401,26 +361,19 @@ class Decoder(nn.Layer):
    def forward(self, hs, hlens, ys):
        """Calculate forward propagation.
-        Parameters
+
-        ----------
+        Args:
-        hs : Tensor
+            hs (Tensor): Batch of the sequences of padded hidden states (B, Tmax, idim).
-            Batch of the sequences of padded hidden states (B, Tmax, idim).
+            hlens (Tensor(int64) padded): Batch of lengths of each input batch (B,).
-        hlens : Tensor(int64) padded
+            ys (Tensor): Batch of the sequences of padded target features (B, Lmax, odim).
-            Batch of lengths of each input batch (B,).
+
-        ys : Tensor
+        Returns:
-            Batch of the sequences of padded target features (B, Lmax, odim).
+            Tensor: Batch of output tensors after postnet (B, Lmax, odim).
-        Returns
+            Tensor: Batch of output tensors before postnet (B, Lmax, odim).
-        ----------
+            Tensor: Batch of logits of stop prediction (B, Lmax).
-        Tensor
+            Tensor: Batch of attention weights (B, Lmax, Tmax).
-            Batch of output tensors after postnet (B, Lmax, odim).
+            
-        Tensor
+        Note: 
            Batch of output tensors before postnet (B, Lmax, odim).
        Tensor
            Batch of logits of stop prediction (B, Lmax).
        Tensor
            Batch of attention weights (B, Lmax, Tmax).
        Note
        ----------
            This computation is performed in teacher-forcing manner.
        """
        # thin out frames (B, Lmax, odim) ->  (B, Lmax/r, odim)
@ -517,37 +470,24 @@ class Decoder(nn.Layer):
            backward_window=None,
            forward_window=None, ):
        """Generate the sequence of features given the sequences of characters.
-        Parameters
+        Args:
-        ----------
+            h(Tensor): Input sequence of encoder hidden states (T, C).
-        h : Tensor
+            threshold(float, optional): Threshold to stop generation. (Default value = 0.5)
-            Input sequence of encoder hidden states (T, C).
+            minlenratio(float, optional): Minimum length ratio. If set to 1.0 and the length of input is 10,
-        threshold : float, optional
+                the minimum length of outputs will be 10 * 1 = 10. (Default value = 0.0)
-            Threshold to stop generation.
+            maxlenratio(float, optional): Maximum length ratio. If set to 10 and the length of input is 10,
-        minlenratio : float, optional
+                the maximum length of outputs will be 10 * 10 = 100. (Default value = 0.0)
-            Minimum length ratio.
+            use_att_constraint(bool, optional): Whether to apply attention constraint introduced in `Deep Voice 3`_. (Default value = False)
-            If set to 1.0 and the length of input is 10,
+            backward_window(int, optional): Backward window size in attention constraint. (Default value = None)
-            the minimum length of outputs will be 10 * 1 = 10.
+            forward_window(int, optional): Forward window size in attention constraint. (Default value = None)
-        minlenratio : float, optional
+
-            Minimum length ratio.
+        Returns:
-            If set to 10 and the length of input is 10,
+            Tensor: Output sequence of features (L, odim).
-            the maximum length of outputs will be 10 * 10 = 100.
+            Tensor: Output sequence of stop probabilities (L,).
-        use_att_constraint : bool
+            Tensor: Attention weights (L, T).
-            Whether to apply attention constraint introduced in `Deep Voice 3`_.
+
-        backward_window : int
+        Note: 
-            Backward window size in attention constraint.
+            This computation is performed in auto-regressive manner.
        forward_window : int
            Forward window size in attention constraint.
        Returns
        ----------
        Tensor
            Output sequence of features (L, odim).
        Tensor
            Output sequence of stop probabilities (L,).
        Tensor
            Attention weights (L, T).
        Note
        ----------
        This computation is performed in auto-regressive manner.
    .. _`Deep Voice 3`: https://arxiv.org/abs/1710.07654
        """
        # setup
@ -683,21 +623,18 @@ class Decoder(nn.Layer):
    def calculate_all_attentions(self, hs, hlens, ys):
        """Calculate all of the attention weights.
-        Parameters
+
-        ----------
+        Args:
-        hs : Tensor
+            hs (Tensor): Batch of the sequences of padded hidden states (B, Tmax, idim).
-            Batch of the sequences of padded hidden states (B, Tmax, idim).
+            hlens (Tensor(int64)): Batch of lengths of each input batch (B,).
-        hlens : Tensor(int64)
+            ys (Tensor): Batch of the sequences of padded target features (B, Lmax, odim).
-            Batch of lengths of each input batch (B,).
+
-        ys : Tensor
+        Returns:
-            Batch of the sequences of padded target features (B, Lmax, odim).
+            numpy.ndarray:
-        Returns
+                Batch of attention weights (B, Lmax, Tmax).
-        ----------
+    
-        numpy.ndarray
+        Note:
-            Batch of attention weights (B, Lmax, Tmax).
+            This computation is performed in teacher-forcing manner.
        Note
        ----------
        This computation is performed in teacher-forcing manner.
        """
        # thin out frames (B, Lmax, odim) ->  (B, Lmax/r, odim)
        if self.reduction_factor > 1:
--- a/paddlespeech/t2s/modules/tacotron2/encoder.py
+++ b/paddlespeech/t2s/modules/tacotron2/encoder.py
@ -45,31 +45,18 @@ class Encoder(nn.Layer):
            dropout_rate=0.5,
            padding_idx=0, ):
        """Initialize Tacotron2 encoder module.
-
+        Args:
-        Parameters
+            idim (int): Dimension of the inputs.
-        ----------
+            input_layer (str): Input layer type.
-        idim : int
+            embed_dim (int, optional): Dimension of character embedding.
-            Dimension of the inputs.
+            elayers (int, optional): The number of encoder blstm layers.
-        input_layer : str
+            eunits (int, optional): The number of encoder blstm units.
-            Input layer type.
+            econv_layers (int, optional): The number of encoder conv layers.
-        embed_dim : int, optional
+            econv_filts (int, optional): The number of encoder conv filter size.
-            Dimension of character embedding.
+            econv_chans (int, optional): The number of encoder conv filter channels.
-        elayers : int, optional
+            use_batch_norm (bool, optional): Whether to use batch normalization.
-            The number of encoder blstm layers.
+            use_residual (bool, optional): Whether to use residual connection.
-        eunits : int, optional
+            dropout_rate (float, optional): Dropout rate.
            The number of encoder blstm units.
        econv_layers : int, optional
            The number of encoder conv layers.
        econv_filts : int, optional
            The number of encoder conv filter size.
        econv_chans : int, optional
            The number of encoder conv filter channels.
        use_batch_norm : bool, optional
            Whether to use batch normalization.
        use_residual : bool, optional
            Whether to use residual connection.
        dropout_rate : float, optional
            Dropout rate.
        """
        super().__init__()
@ -139,21 +126,15 @@ class Encoder(nn.Layer):
    def forward(self, xs, ilens=None):
        """Calculate forward propagation.
-        Parameters
+        Args:
-        ----------
+            xs (Tensor): Batch of the padded sequence. Either character ids (B, Tmax)
-        xs : Tensor
+                or acoustic feature (B, Tmax, idim * encoder_reduction_factor). 
-            Batch of the padded sequence. Either character ids (B, Tmax)
+                Padded value should be 0.
-            or acoustic feature (B, Tmax, idim * encoder_reduction_factor). 
+            ilens (Tensor(int64)): Batch of lengths of each input batch (B,).
-            Padded value should be 0.
+
-        ilens : Tensor(int64)
+        Returns:
-            Batch of lengths of each input batch (B,).
+            Tensor: Batch of the sequences of encoder states(B, Tmax, eunits).
-
+            Tensor(int64): Batch of lengths of each sequence (B,)
        Returns
        ----------
        Tensor
            Batch of the sequences of encoder states(B, Tmax, eunits).
        Tensor(int64)
            Batch of lengths of each sequence (B,)
        """
        xs = self.embed(xs).transpose([0, 2, 1])
        if self.convs is not None:
@ -179,16 +160,12 @@ class Encoder(nn.Layer):
    def inference(self, x):
        """Inference.
-        Parameters
+        Args:
-        ----------
+            x (Tensor): The sequence of character ids (T,)
-        x : Tensor
+                or acoustic feature (T, idim * encoder_reduction_factor).
            The sequeunce of character ids (T,) 
            or acoustic feature (T, idim * encoder_reduction_factor).
-        Returns
+        Returns:
-        ----------
+            Tensor: The sequences of encoder states(T, eunits).
        Tensor
            The sequences of encoder states(T, eunits).
        """
        xs = x.unsqueeze(0)
--- a/paddlespeech/t2s/modules/tade_res_block.py
+++ b/paddlespeech/t2s/modules/tade_res_block.py
@ -59,18 +59,12 @@ class TADELayer(nn.Layer):
    def forward(self, x, c):
        """Calculate forward propagation.
-        Parameters
+        Args:
-        ----------
+            x (Tensor): Input tensor (B, in_channels, T).
-        x : Tensor
+            c (Tensor): Auxiliary input tensor (B, aux_channels, T).
-            Input tensor (B, in_channels, T).
+        Returns:
-        c : Tensor
+            Tensor: Output tensor (B, in_channels, T * upsample_factor).
-            Auxiliary input tensor (B, aux_channels, T).
+            Tensor: Upsampled aux tensor (B, in_channels, T * upsample_factor).
        Returns
        ----------
        Tensor
            Output tensor (B, in_channels, T * upsample_factor).
        Tensor
            Upsampled aux tensor (B, in_channels, T * upsample_factor).
        """
        x = self.norm(x)
@ -142,18 +136,13 @@ class TADEResBlock(nn.Layer):
    def forward(self, x, c):
        """Calculate forward propagation.
-        Parameters
+        Args:
-        ----------
+
-        x : Tensor
+            x (Tensor): Input tensor (B, in_channels, T).
-            Input tensor (B, in_channels, T).
+            c (Tensor): Auxiliary input tensor (B, aux_channels, T).
-        c : Tensor
+        Returns:
-            Auxiliary input tensor (B, aux_channels, T).
+            Tensor: Output tensor (B, in_channels, T * upsample_factor).
-        Returns
+            Tensor: Upsampled auxiliary tensor (B, in_channels, T * upsample_factor).
        ----------
        Tensor
            Output tensor (B, in_channels, T * upsample_factor).
        Tensor
            Upsampled auxirialy tensor (B, in_channels, T * upsample_factor).
        """
        residual = x
        x, c = self.tade1(x, c)
--- a/paddlespeech/t2s/modules/transformer/attention.py
+++ b/paddlespeech/t2s/modules/transformer/attention.py
@ -24,15 +24,10 @@ from paddlespeech.t2s.modules.masked_fill import masked_fill
 class MultiHeadedAttention(nn.Layer):
    """Multi-Head Attention layer.
-
+    Args:
-    Parameters
+        n_head (int): The number of heads.
-    ----------
+        n_feat (int): The number of features.
-    n_head : int
+        dropout_rate (float): Dropout rate.
        The number of heads.
    n_feat : int
        The number of features.
    dropout_rate : float
        Dropout rate.
    """
    def __init__(self, n_head, n_feat, dropout_rate):
@ -52,23 +47,15 @@ class MultiHeadedAttention(nn.Layer):
    def forward_qkv(self, query, key, value):
        """Transform query, key and value.
-        Parameters
+        Args:
-        ----------
+            query(Tensor): query tensor (#batch, time1, size).
-        query : paddle.Tensor
+            key(Tensor): Key tensor (#batch, time2, size).
-            query tensor (#batch, time1, size).
+            value(Tensor): Value tensor (#batch, time2, size).
-        key : paddle.Tensor
+
-            Key tensor (#batch, time2, size).
+        Returns:
-        value : paddle.Tensor
+            Tensor: Transformed query tensor (#batch, n_head, time1, d_k).
-            Value tensor (#batch, time2, size).
+            Tensor: Transformed key tensor (#batch, n_head, time2, d_k).
-
+            Tensor: Transformed value tensor (#batch, n_head, time2, d_k).
        Returns
        ----------
        paddle.Tensor
            Transformed query tensor (#batch, n_head, time1, d_k).
        paddle.Tensor
            Transformed key tensor (#batch, n_head, time2, d_k).
        paddle.Tensor
            Transformed value tensor (#batch, n_head, time2, d_k).
        """
        n_batch = paddle.shape(query)[0]
@ -89,20 +76,13 @@ class MultiHeadedAttention(nn.Layer):
    def forward_attention(self, value, scores, mask=None):
        """Compute attention context vector.
-        Parameters
+        Args:
-        ----------
+            value(Tensor): Transformed value (#batch, n_head, time2, d_k).
-        value : paddle.Tensor
+            scores(Tensor): Attention score (#batch, n_head, time1, time2).
-            Transformed value (#batch, n_head, time2, d_k).
+            mask(Tensor, optional): Mask (#batch, 1, time2) or (#batch, time1, time2). (Default value = None)
-        scores : paddle.Tensor
+
-            Attention score (#batch, n_head, time1, time2).
+        Returns:
-        mask :  paddle.Tensor
+            Tensor: Transformed value (#batch, time1, d_model) weighted by the attention score (#batch, time1, time2).
            Mask (#batch, 1, time2) or (#batch, time1, time2).
        Returns
        ----------
        paddle.Tensor:
            Transformed value (#batch, time1, d_model)
            weighted by the attention score (#batch, time1, time2).
        """
        n_batch = paddle.shape(value)[0]
        softmax = paddle.nn.Softmax(axis=-1)
@ -132,21 +112,14 @@ class MultiHeadedAttention(nn.Layer):
    def forward(self, query, key, value, mask=None):
        """Compute scaled dot product attention.
-        Parameters
+        Args:
-        ----------
+            query(Tensor): Query tensor (#batch, time1, size).
-        query : paddle.Tensor
+            key(Tensor): Key tensor (#batch, time2, size).
-            Query tensor (#batch, time1, size).
+            value(Tensor): Value tensor (#batch, time2, size).
-        key : paddle.Tensor
+            mask(Tensor, optional): Mask tensor (#batch, 1, time2) or (#batch, time1, time2). (Default value = None)
-            Key tensor (#batch, time2, size).
+
-        value : paddle.Tensor
+        Returns:
-            Value tensor (#batch, time2, size).
+            Tensor: Output tensor (#batch, time1, d_model).
        mask : paddle.Tensor
            Mask tensor (#batch, 1, time2) or (#batch, time1, time2).
        Returns
        ----------
        paddle.Tensor
            Output tensor (#batch, time1, d_model).
        """
        q, k, v = self.forward_qkv(query, key, value)
        scores = paddle.matmul(q, k.transpose(
@ -159,16 +132,12 @@ class RelPositionMultiHeadedAttention(MultiHeadedAttention):
    """Multi-Head Attention layer with relative position encoding (new implementation).
    Details can be found in https://github.com/espnet/espnet/pull/2816.
    Paper: https://arxiv.org/abs/1901.02860
-    Parameters
+
-    ----------
+    Args:
-    n_head : int
+        n_head (int): The number of heads.
-        The number of heads.
+        n_feat (int): The number of features.
-    n_feat : int
+        dropout_rate (float): Dropout rate.
-        The number of features.
+        zero_triu (bool): Whether to zero the upper triangular part of attention matrix.
    dropout_rate : float
        Dropout rate.
    zero_triu : bool
        Whether to zero the upper triangular part of attention matrix.
    """
    def __init__(self, n_head, n_feat, dropout_rate, zero_triu=False):
@ -191,15 +160,11 @@ class RelPositionMultiHeadedAttention(MultiHeadedAttention):
    def rel_shift(self, x):
        """Compute relative positional encoding.
-        Parameters
+        Args:
-        ----------
+            x(Tensor): Input tensor (batch, head, time1, 2*time1-1).
-        x : paddle.Tensor
+
-            Input tensor (batch, head, time1, 2*time1-1).
+        Returns:
-            time1 means the length of query vector.
+            Tensor: Output tensor.
        Returns
        ----------
        paddle.Tensor
            Output tensor.
        """
        b, h, t1, t2 = paddle.shape(x)
        zero_pad = paddle.zeros((b, h, t1, 1))
@ -216,24 +181,16 @@ class RelPositionMultiHeadedAttention(MultiHeadedAttention):
    def forward(self, query, key, value, pos_emb, mask):
        """Compute 'Scaled Dot Product Attention' with rel. positional encoding.
-        Parameters
+
-        ----------
+        Args:
-        query : paddle.Tensor 
+            query(Tensor): Query tensor (#batch, time1, size).
-            Query tensor (#batch, time1, size).
+            key(Tensor): Key tensor (#batch, time2, size).
-        key : paddle.Tensor
+            value(Tensor): Value tensor (#batch, time2, size).
-            Key tensor (#batch, time2, size).
+            pos_emb(Tensor): Positional embedding tensor (#batch, 2*time1-1, size).
-        value : paddle.Tensor
+            mask(Tensor): Mask tensor (#batch, 1, time2) or (#batch, time1, time2).
-            Value tensor (#batch, time2, size).
+
-        pos_emb : paddle.Tensor
+        Returns:
-            Positional embedding tensor
+            Tensor: Output tensor (#batch, time1, d_model).
            (#batch, 2*time1-1, size).
        mask : paddle.Tensor
            Mask tensor (#batch, 1, time2) or
            (#batch, time1, time2).
        Returns
        ----------
        paddle.Tensor
            Output tensor (#batch, time1, d_model).
        """
        q, k, v = self.forward_qkv(query, key, value)
        # (batch, time1, head, d_k)
--- a/paddlespeech/t2s/modules/transformer/decoder.py
+++ b/paddlespeech/t2s/modules/transformer/decoder.py
@ -36,51 +36,32 @@ from paddlespeech.t2s.modules.transformer.repeat import repeat
 class Decoder(nn.Layer):
    """Transfomer decoder module.
-    Parameters
+    Args:
-    ----------
+        odim (int): Output dimension.
-    odim : int
+        self_attention_layer_type (str): Self-attention layer type.
-        Output diminsion.
+        attention_dim (int): Dimension of attention.
-    self_attention_layer_type : str
+        attention_heads (int): The number of heads of multi head attention.
-        Self-attention layer type.
+        conv_wshare (int): The number of kernel of convolution. Only used in
-    attention_dim : int
+            self_attention_layer_type == "lightconv*" or "dynamiconv*".
-        Dimention of attention.
+        conv_kernel_length (Union[int, str]): Kernel size str of convolution
-    attention_heads : int
+            (e.g. 71_71_71_71_71_71). Only used in self_attention_layer_type == "lightconv*" or "dynamiconv*".
-        The number of heads of multi head attention.
+        conv_usebias (bool): Whether to use bias in convolution. Only used in
-    conv_wshare : int
+            self_attention_layer_type == "lightconv*" or "dynamiconv*".
-        The number of kernel of convolution. Only used in
+        linear_units(int): The number of units of position-wise feed forward.
-        self_attention_layer_type == "lightconv*" or "dynamiconv*".
+        num_blocks (int): The number of decoder blocks.
-    conv_kernel_length : Union[int, str])
+        dropout_rate (float): Dropout rate.
-        Kernel size str of convolution
+        positional_dropout_rate (float): Dropout rate after adding positional encoding.
-        (e.g. 71_71_71_71_71_71). Only used in self_attention_layer_type == "lightconv*" or "dynamiconv*".
+        self_attention_dropout_rate (float): Dropout rate in self-attention.
-    conv_usebias : bool
+        src_attention_dropout_rate (float): Dropout rate in source-attention.
-        Whether to use bias in convolution. Only used in
+        input_layer (Union[str, nn.Layer]): Input layer type.
-        self_attention_layer_type == "lightconv*" or "dynamiconv*".
+        use_output_layer (bool): Whether to use output layer.
-    linear_units : int
+        pos_enc_class (nn.Layer): Positional encoding module class.
-        The number of units of position-wise feed forward.
+            `PositionalEncoding` or `ScaledPositionalEncoding`
-    num_blocks : int
+        normalize_before (bool): Whether to use layer_norm before the first block.
-        The number of decoder blocks.
+        concat_after (bool): Whether to concat attention layer's input and output.
-    dropout_rate : float
+            if True, additional linear will be applied.
-        Dropout rate.
+            i.e. x -> x + linear(concat(x, att(x)))
-    positional_dropout_rate : float
+            if False, no additional linear will be applied. i.e. x -> x + att(x)
        Dropout rate after adding positional encoding.
    self_attention_dropout_rate : float
        Dropout rate in self-attention.
    src_attention_dropout_rate : float
        Dropout rate in source-attention.
    input_layer : (Union[str, nn.Layer])
        Input layer type.
    use_output_layer : bool
        Whether to use output layer.
    pos_enc_class : nn.Layer
        Positional encoding module class.
        `PositionalEncoding `or `ScaledPositionalEncoding`
    normalize_before : bool
        Whether to use layer_norm before the first block.
    concat_after : bool
        Whether to concat attention layer's input and output.
        if True, additional linear will be applied.
        i.e. x -> x + linear(concat(x, att(x)))
        if False, no additional linear will be applied. i.e. x -> x + att(x)
    """
@ -161,27 +142,18 @@ class Decoder(nn.Layer):
    def forward(self, tgt, tgt_mask, memory, memory_mask):
        """Forward decoder.
-
+        Args:
-        Parameters
+            tgt(Tensor): Input token ids, int64 (#batch, maxlen_out) if input_layer == "embed".
-        ----------
+                In the other case, input tensor (#batch, maxlen_out, odim).
-        tgt : paddle.Tensor
+            tgt_mask(Tensor): Input token mask (#batch, maxlen_out).
-            Input token ids, int64 (#batch, maxlen_out) if input_layer == "embed". 
+            memory(Tensor): Encoded memory, float32 (#batch, maxlen_in, feat).
-            In the other case, input tensor (#batch, maxlen_out, odim).
+            memory_mask(Tensor): Encoded memory mask (#batch, maxlen_in).
-        tgt_mask : paddle.Tensor
+
-            Input token mask (#batch, maxlen_out).
+        Returns:
-        memory : paddle.Tensor
+            Tensor:
-            Encoded memory, float32 (#batch, maxlen_in, feat).
+                Decoded token score before softmax (#batch, maxlen_out, odim) if use_output_layer is True. 
-        memory_mask : paddle.Tensor
+                In the other case, final block outputs (#batch, maxlen_out, attention_dim).
-            Encoded memory mask (#batch, maxlen_in).
+            Tensor: Score mask before softmax (#batch, maxlen_out).
        Returns
        ----------
        paddle.Tensor
            Decoded token score before softmax (#batch, maxlen_out, odim)
            if use_output_layer is True. In the other case,final block outputs
            (#batch, maxlen_out, attention_dim).
        paddle.Tensor
            Score mask before softmax (#batch, maxlen_out).
        """
        x = self.embed(tgt)
@ -196,23 +168,15 @@ class Decoder(nn.Layer):
    def forward_one_step(self, tgt, tgt_mask, memory, cache=None):
        """Forward one step.
-        Parameters
+        Args:
-        ----------
+            tgt(Tensor): Input token ids, int64 (#batch, maxlen_out).
-        tgt : paddle.Tensor
+            tgt_mask(Tensor): Input token mask (#batch, maxlen_out).
-            Input token ids, int64 (#batch, maxlen_out).
+            memory(Tensor): Encoded memory, float32 (#batch, maxlen_in, feat).
-        tgt_mask : paddle.Tensor
+            cache(List[Tensor], optional): List of cached tensors. (Default value = None)
-            Input token mask (#batch, maxlen_out).
+
-        memory : paddle.Tensor
+        Returns:
-            Encoded memory, float32 (#batch, maxlen_in, feat).
+            Tensor: Output tensor (batch, maxlen_out, odim).
-        cache : (List[paddle.Tensor])
+            List[Tensor]: List of cache tensors of each decoder layer.
            List of cached tensors.
            Each tensor shape should be (#batch, maxlen_out - 1, size).
        Returns
        ----------
        paddle.Tensor
            Output tensor (batch, maxlen_out, odim).
        List[paddle.Tensor]
            List of cache tensors of each decoder layer.
        """
        x = self.embed(tgt)
@ -254,20 +218,14 @@ class Decoder(nn.Layer):
                    xs: paddle.Tensor) -> Tuple[paddle.Tensor, List[Any]]:
        """Score new token batch (required).
-        Parameters
+        Args:
-        ----------
+            ys(Tensor): paddle.int64 prefix tokens (n_batch, ylen).
-        ys : paddle.Tensor
+            states(List[Any]): Scorer states for prefix tokens.
-            paddle.int64 prefix tokens (n_batch, ylen).
+            xs(Tensor): The encoder feature that generates ys (n_batch, xlen, n_feat).
        states : List[Any]
            Scorer states for prefix tokens.
        xs : paddle.Tensor
            The encoder feature that generates ys (n_batch, xlen, n_feat).
-        Returns
+        Returns:
-        ----------
+            tuple[Tensor, List[Any]]:
-        tuple[paddle.Tensor, List[Any]]
+                Tuple of batchified scores for next token with shape of `(n_batch, n_vocab)` and next state list for ys.
        Tuple ofbatchfied scores for next token with shape of `(n_batch, n_vocab)`
        and next state list for ys.
        """
        # merge states
--- a/paddlespeech/t2s/modules/transformer/decoder_layer.py
+++ b/paddlespeech/t2s/modules/transformer/decoder_layer.py
@ -22,28 +22,21 @@ from paddlespeech.t2s.modules.layer_norm import LayerNorm
 class DecoderLayer(nn.Layer):
    """Single decoder layer module.
-    Parameters
+ 
-    ----------
+    Args:
-    size : int
+        size (int): Input dimension.
-        Input dimension.
+        self_attn (nn.Layer): Self-attention module instance.
-    self_attn : nn.Layer
+            `MultiHeadedAttention` instance can be used as the argument.
-        Self-attention module instance.
+        src_attn (nn.Layer): Self-attention module instance.
-        `MultiHeadedAttention` instance can be used as the argument.
+            `MultiHeadedAttention` instance can be used as the argument.
-    src_attn : nn.Layer
+        feed_forward (nn.Layer): Feed-forward module instance.
-        Self-attention module instance.
+            `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance can be used as the argument.
-        `MultiHeadedAttention` instance can be used as the argument.
+        dropout_rate (float): Dropout rate.
-    feed_forward : nn.Layer
+        normalize_before (bool): Whether to use layer_norm before the first block.
-        Feed-forward module instance.
+        concat_after (bool): Whether to concat attention layer's input and output.
-        `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance can be used as the argument.
+            if True, additional linear will be applied.
-    dropout_rate : float
+            i.e. x -> x + linear(concat(x, att(x)))
-        Dropout rate.
+            if False, no additional linear will be applied. i.e. x -> x + att(x)
    normalize_before : bool
        Whether to use layer_norm before the first block.
    concat_after : bool
        Whether to concat attention layer's input and output.
        if True, additional linear will be applied.
        i.e. x -> x + linear(concat(x, att(x)))
        if False, no additional linear will be applied. i.e. x -> x + att(x)
    """
@ -75,30 +68,22 @@ class DecoderLayer(nn.Layer):
    def forward(self, tgt, tgt_mask, memory, memory_mask, cache=None):
        """Compute decoded features.
-        Parameters
+        Args:
-        ----------
+            tgt(Tensor): Input tensor (#batch, maxlen_out, size).
-        tgt : paddle.Tensor
+            tgt_mask(Tensor): Mask for input tensor (#batch, maxlen_out).
-            Input tensor (#batch, maxlen_out, size).
+            memory(Tensor): Encoded memory, float32 (#batch, maxlen_in, size).
-        tgt_mask : paddle.Tensor
+            memory_mask(Tensor): Encoded memory mask (#batch, maxlen_in).
-            Mask for input tensor (#batch, maxlen_out).
+            cache(List[Tensor], optional): List of cached tensors.
-        memory : paddle.Tensor
+                Each tensor shape should be (#batch, maxlen_out - 1, size). (Default value = None)
-            Encoded memory, float32 (#batch, maxlen_in, size).
+        Returns:
-        memory_mask : paddle.Tensor
+            Tensor
-            Encoded memory mask (#batch, maxlen_in).
+                Output tensor(#batch, maxlen_out, size).
-        cache : List[paddle.Tensor]
+            Tensor
-            List of cached tensors.
+                Mask for output tensor (#batch, maxlen_out).
-            Each tensor shape should be (#batch, maxlen_out - 1, size).
+            Tensor
-
+                Encoded memory (#batch, maxlen_in, size).
-        Returns
+            Tensor
-        ----------
+                Encoded memory mask (#batch, maxlen_in).
        paddle.Tensor
            Output tensor(#batch, maxlen_out, size).
        paddle.Tensor
            Mask for output tensor (#batch, maxlen_out).
        paddle.Tensor
            Encoded memory (#batch, maxlen_in, size).
        paddle.Tensor
            Encoded memory mask (#batch, maxlen_in).
        """
        residual = tgt
--- a/paddlespeech/t2s/modules/transformer/embedding.py
+++ b/paddlespeech/t2s/modules/transformer/embedding.py
@ -22,18 +22,12 @@ from paddle import nn
 class PositionalEncoding(nn.Layer):
    """Positional encoding.
-    Parameters
+    Args:
-    ----------
+        d_model (int):  Embedding dimension.
-    d_model : int
+        dropout_rate (float): Dropout rate.
-        Embedding dimension.
+        max_len (int): Maximum input length.
-    dropout_rate : float
+        reverse (bool): Whether to reverse the input position.
-        Dropout rate.
+        type (str): dtype of param
    max_len : int
        Maximum input length.
    reverse : bool
        Whether to reverse the input position.
    type : str
        dtype of param
    """
    def __init__(self,
@ -73,15 +67,11 @@ class PositionalEncoding(nn.Layer):
    def forward(self, x: paddle.Tensor):
        """Add positional encoding.
-        Parameters
+        Args:
-        ----------
+            x (Tensor): Input tensor (batch, time, `*`).
        x : paddle.Tensor
            Input tensor (batch, time, `*`).
-        Returns
+        Returns:
-        ----------
+            Tensor: Encoded tensor (batch, time, `*`).
        paddle.Tensor
            Encoded tensor (batch, time, `*`).
        """
        self.extend_pe(x)
        T = paddle.shape(x)[1]
@ -91,19 +81,13 @@ class PositionalEncoding(nn.Layer):
 class ScaledPositionalEncoding(PositionalEncoding):
    """Scaled positional encoding module.
    See Sec. 3.2  https://arxiv.org/abs/1809.08895
-    Parameters
+    Args:
-    ----------
+        d_model (int): Embedding dimension.
-    d_model : int
+        dropout_rate (float): Dropout rate.
-        Embedding dimension.
+        max_len (int): Maximum input length.
-    dropout_rate : float
+        dtype (str): dtype of param
        Dropout rate.
    max_len : int
        Maximum input length.
    dtype : str
        dtype of param
    """
    def __init__(self, d_model, dropout_rate, max_len=5000, dtype="float32"):
@ -126,14 +110,10 @@ class ScaledPositionalEncoding(PositionalEncoding):
    def forward(self, x):
        """Add positional encoding.
-        Parameters
+        Args:
-        ----------
+            x (Tensor): Input tensor (batch, time, `*`).
-        x : paddle.Tensor
+        Returns:
-            Input tensor (batch, time, `*`).
+            Tensor: Encoded tensor (batch, time, `*`).
        Returns
        ----------
        paddle.Tensor
            Encoded tensor (batch, time, `*`).
        """
        self.extend_pe(x)
        T = paddle.shape(x)[1]
@ -145,14 +125,11 @@ class RelPositionalEncoding(nn.Layer):
    """Relative positional encoding module (new implementation).
    Details can be found in https://github.com/espnet/espnet/pull/2816.
    See : Appendix B in https://arxiv.org/abs/1901.02860
-    Parameters
+
-    ----------
+    Args:
-    d_model : int
+        d_model (int): Embedding dimension.
-        Embedding dimension.
+        dropout_rate (float): Dropout rate.
-    dropout_rate : float
+        max_len (int): Maximum input length.
        Dropout rate.
    max_len : int
        Maximum input length.
    """
    def __init__(self, d_model, dropout_rate, max_len=5000, dtype="float32"):
@ -197,14 +174,10 @@ class RelPositionalEncoding(nn.Layer):
    def forward(self, x: paddle.Tensor):
        """Add positional encoding.
-        Parameters
+        Args:
-        ----------
+            x (Tensor): Input tensor (batch, time, `*`).
-        x : paddle.Tensor
+        Returns:
-            Input tensor (batch, time, `*`).
+            Tensor: Encoded tensor (batch, time, `*`).
        Returns
        ----------
        paddle.Tensor
            Encoded tensor (batch, time, `*`).
        """
        self.extend_pe(x)
        x = x * self.xscale
--- a/paddlespeech/t2s/modules/transformer/encoder.py
+++ b/paddlespeech/t2s/modules/transformer/encoder.py
@ -37,62 +37,37 @@ from paddlespeech.t2s.modules.transformer.subsampling import Conv2dSubsampling
 class BaseEncoder(nn.Layer):
    """Base Encoder module.
-    Parameters
+    Args:
-    ----------
+        idim (int): Input dimension.
-    idim : int
+        attention_dim (int): Dimension of attention.
-        Input dimension.
+        attention_heads (int): The number of heads of multi head attention.
-    attention_dim : int
+        linear_units (int): The number of units of position-wise feed forward.
-        Dimention of attention.
+        num_blocks (int): The number of decoder blocks.
-    attention_heads : int
+        dropout_rate (float): Dropout rate.
-        The number of heads of multi head attention.
+        positional_dropout_rate (float): Dropout rate after adding positional encoding.
-    linear_units : int
+        attention_dropout_rate (float): Dropout rate in attention.
-        The number of units of position-wise feed forward.
+        input_layer (Union[str, nn.Layer]): Input layer type.
-    num_blocks : int
+        normalize_before (bool): Whether to use layer_norm before the first block.
-        The number of decoder blocks.
+        concat_after (bool): Whether to concat attention layer's input and output.
-    dropout_rate : float
+            if True, additional linear will be applied.
-        Dropout rate.
+            i.e. x -> x + linear(concat(x, att(x)))
-    positional_dropout_rate : float
+            if False, no additional linear will be applied. i.e. x -> x + att(x)
-        Dropout rate after adding positional encoding.
+        positionwise_layer_type (str): "linear", "conv1d", or "conv1d-linear".
-    attention_dropout_rate : float
+        positionwise_conv_kernel_size (int): Kernel size of positionwise conv1d layer.
-        Dropout rate in attention.
+        macaron_style (bool): Whether to use macaron style for positionwise layer.
-    input_layer : Union[str, nn.Layer]
+        pos_enc_layer_type (str): Encoder positional encoding layer type.
-        Input layer type.
+        selfattention_layer_type (str): Encoder attention layer type.
-    normalize_before : bool
+        activation_type (str): Encoder activation function type.
-        Whether to use layer_norm before the first block.
+        use_cnn_module (bool): Whether to use convolution module.
-    concat_after : bool
+        zero_triu (bool): Whether to zero the upper triangular part of attention matrix.
-        Whether to concat attention layer's input and output.
+        cnn_module_kernel (int): Kernel size of convolution module.
-        if True, additional linear will be applied.
+        padding_idx (int): Padding idx for input_layer=embed.
-        i.e. x -> x + linear(concat(x, att(x)))
+        stochastic_depth_rate (float): Maximum probability to skip the encoder layer.
-        if False, no additional linear will be applied. i.e. x -> x + att(x)
+        intermediate_layers (Union[List[int], None]): indices of intermediate CTC layer.
-    positionwise_layer_type : str
+            indices start from 1.
-        "linear", "conv1d", or "conv1d-linear".
+            if not None, intermediate outputs are returned (which changes return type
-    positionwise_conv_kernel_size : int
+            signature.)
-        Kernel size of positionwise conv1d layer.
+        encoder_type (str): "transformer", or "conformer".
    macaron_style : bool
        Whether to use macaron style for positionwise layer.
    pos_enc_layer_type : str
        Encoder positional encoding layer type.
    selfattention_layer_type : str
        Encoder attention layer type.
    activation_type : str
        Encoder activation function type.
    use_cnn_module : bool
        Whether to use convolution module.
    zero_triu : bool
        Whether to zero the upper triangular part of attention matrix.
    cnn_module_kernel : int
        Kernerl size of convolution module.
    padding_idx : int
        Padding idx for input_layer=embed.
    stochastic_depth_rate : float
        Maximum probability to skip the encoder layer.
    intermediate_layers : Union[List[int], None]
        indices of intermediate CTC layer.
        indices start from 1.
        if not None, intermediate outputs are returned (which changes return type
        signature.)
    encoder_type: str
         "transformer", or "conformer".
    """
    def __init__(self,
@ -290,19 +265,13 @@ class BaseEncoder(nn.Layer):
    def forward(self, xs, masks):
        """Encode input sequence.
-        Parameters
+        Args:
-        ----------
+            xs (Tensor): Input tensor (#batch, time, idim).
-        xs : paddle.Tensor
+            masks (Tensor): Mask tensor (#batch, 1, time).
-            Input tensor (#batch, time, idim).
+
-        masks : paddle.Tensor
+        Returns: 
-            Mask tensor (#batch, 1, time).
+            Tensor: Output tensor (#batch, time, attention_dim).
-
+            Tensor: Mask tensor (#batch, 1, time).
        Returns
        ----------
        paddle.Tensor
            Output tensor (#batch, time, attention_dim).
        paddle.Tensor
            Mask tensor (#batch, 1, time).
        """
        xs = self.embed(xs)
        xs, masks = self.encoders(xs, masks)
@ -313,45 +282,28 @@ class BaseEncoder(nn.Layer):
 class TransformerEncoder(BaseEncoder):
    """Transformer encoder module.
-    Parameters
+
-    ----------
+    Args:
-    idim : int
+        idim (int): Input dimension.
-        Input dimension.
+        attention_dim (int): Dimension of attention.
-    attention_dim : int
+        attention_heads (int): The number of heads of multi head attention.
-        Dimention of attention.
+        linear_units (int): The number of units of position-wise feed forward.
-    attention_heads : int
+        num_blocks (int): The number of decoder blocks.
-        The number of heads of multi head attention.
+        dropout_rate (float): Dropout rate.
-    linear_units : int
+        positional_dropout_rate (float): Dropout rate after adding positional encoding.
-        The number of units of position-wise feed forward.
+        attention_dropout_rate (float): Dropout rate in attention.
-    num_blocks : int
+        input_layer (Union[str, paddle.nn.Layer]): Input layer type.
-        The number of decoder blocks.
+        pos_enc_layer_type (str): Encoder positional encoding layer type.
-    dropout_rate : float
+        normalize_before (bool): Whether to use layer_norm before the first block.
-        Dropout rate.
+        concat_after (bool): Whether to concat attention layer's input and output.
-    positional_dropout_rate : float
+            if True, additional linear will be applied.
-        Dropout rate after adding positional encoding.
+            i.e. x -> x + linear(concat(x, att(x)))
-    attention_dropout_rate : float
+            if False, no additional linear will be applied. i.e. x -> x + att(x)
-        Dropout rate in attention.
+        positionwise_layer_type (str): "linear", "conv1d", or "conv1d-linear".
-    input_layer : Union[str, paddle.nn.Layer]
+        positionwise_conv_kernel_size (int): Kernel size of positionwise conv1d layer.
-        Input layer type.
+        selfattention_layer_type (str): Encoder attention layer type.
-    pos_enc_layer_type : str
+        activation_type (str): Encoder activation function type.
-        Encoder positional encoding layer type.
+        padding_idx (int): Padding idx for input_layer=embed.
    normalize_before : bool
        Whether to use layer_norm before the first block.
    concat_after : bool
        Whether to concat attention layer's input and output.
        if True, additional linear will be applied.
        i.e. x -> x + linear(concat(x, att(x)))
        if False, no additional linear will be applied. i.e. x -> x + att(x)
    positionwise_layer_type : str
        "linear", "conv1d", or "conv1d-linear".
    positionwise_conv_kernel_size : int
        Kernel size of positionwise conv1d layer.
    selfattention_layer_type : str
        Encoder attention layer type.
    activation_type : str
        Encoder activation function type.
    padding_idx : int
        Padding idx for input_layer=embed.
    """
    def __init__(
@ -397,19 +349,13 @@ class TransformerEncoder(BaseEncoder):
    def forward(self, xs, masks):
        """Encode input sequence.
-        Parameters
+        Args:
-        ----------
+            xs (Tensor): Input tensor (#batch, time, idim).
-        xs : paddle.Tensor
+            masks (Tensor): Mask tensor (#batch, 1, time).
-            Input tensor (#batch, time, idim).
+
-        masks : paddle.Tensor
+        Returns:
-            Mask tensor (#batch, 1, time).
+            Tensor: Output tensor (#batch, time, attention_dim).
-
+            Tensor: Mask tensor (#batch, 1, time).
        Returns
        ----------
        paddle.Tensor
            Output tensor (#batch, time, attention_dim).
        paddle.Tensor
            Mask tensor (#batch, 1, time).
        """
        xs = self.embed(xs)
        xs, masks = self.encoders(xs, masks)
@ -420,23 +366,15 @@ class TransformerEncoder(BaseEncoder):
    def forward_one_step(self, xs, masks, cache=None):
        """Encode input frame.
-        Parameters
+        Args:
-        ----------
+            xs (Tensor): Input tensor.
-        xs : paddle.Tensor
+            masks (Tensor): Mask tensor.
-            Input tensor.
+            cache (List[Tensor]): List of cache tensors.
-        masks : paddle.Tensor
+
-            Mask tensor.
+        Returns:
-        cache : List[paddle.Tensor]
+            Tensor: Output tensor.
-            List of cache tensors.
+            Tensor: Mask tensor.
-
+            List[Tensor]: List of new cache tensors.
        Returns
        ----------
        paddle.Tensor
            Output tensor.
        paddle.Tensor
            Mask tensor.
        List[paddle.Tensor]
            List of new cache tensors.
        """
        xs = self.embed(xs)
@ -453,60 +391,35 @@ class TransformerEncoder(BaseEncoder):
 class ConformerEncoder(BaseEncoder):
    """Conformer encoder module.
-    Parameters
+
-    ----------
+    Args:
-    idim : int
+        idim (int): Input dimension.
-        Input dimension.
+        attention_dim (int): Dimension of attention.
-    attention_dim : int
+        attention_heads (int): The number of heads of multi head attention.
-        Dimention of attention.
+        linear_units (int): The number of units of position-wise feed forward.
-    attention_heads : int
+        num_blocks (int): The number of decoder blocks.
-        The number of heads of multi head attention.
+        dropout_rate (float): Dropout rate.
-    linear_units : int
+        positional_dropout_rate (float): Dropout rate after adding positional encoding.
-        The number of units of position-wise feed forward.
+        attention_dropout_rate (float): Dropout rate in attention.
-    num_blocks : int
+        input_layer (Union[str, nn.Layer]): Input layer type.
-        The number of decoder blocks.
+        normalize_before (bool): Whether to use layer_norm before the first block.
-    dropout_rate : float
+        concat_after (bool): Whether to concat attention layer's input and output.
-        Dropout rate.
+            if True, additional linear will be applied.
-    positional_dropout_rate : float
+            i.e. x -> x + linear(concat(x, att(x)))
-        Dropout rate after adding positional encoding.
+            if False, no additional linear will be applied. i.e. x -> x + att(x)
-    attention_dropout_rate : float
+        positionwise_layer_type (str): "linear", "conv1d", or "conv1d-linear".
-        Dropout rate in attention.
+        positionwise_conv_kernel_size (int): Kernel size of positionwise conv1d layer.
-    input_layer : Union[str, nn.Layer]
+        macaron_style (bool): Whether to use macaron style for positionwise layer.
-        Input layer type.
+        pos_enc_layer_type (str): Encoder positional encoding layer type.
-    normalize_before : bool
+        selfattention_layer_type (str): Encoder attention layer type.
-        Whether to use layer_norm before the first block.
+        activation_type (str): Encoder activation function type.
-    concat_after : bool
+        use_cnn_module (bool): Whether to use convolution module.
-        Whether to concat attention layer's input and output.
+        zero_triu (bool): Whether to zero the upper triangular part of attention matrix.
-        if True, additional linear will be applied.
+        cnn_module_kernel (int): Kernel size of convolution module.
-        i.e. x -> x + linear(concat(x, att(x)))
+        padding_idx (int): Padding idx for input_layer=embed.
-        if False, no additional linear will be applied. i.e. x -> x + att(x)
+        stochastic_depth_rate (float): Maximum probability to skip the encoder layer.
-    positionwise_layer_type : str
+        intermediate_layers (Union[List[int], None]): indices of intermediate CTC layer. indices start from 1.
-        "linear", "conv1d", or "conv1d-linear".
+            if not None, intermediate outputs are returned (which changes return type signature.)
    positionwise_conv_kernel_size : int
        Kernel size of positionwise conv1d layer.
    macaron_style : bool
        Whether to use macaron style for positionwise layer.
    pos_enc_layer_type : str
        Encoder positional encoding layer type.
    selfattention_layer_type : str
        Encoder attention layer type.
    activation_type : str
        Encoder activation function type.
    use_cnn_module : bool
        Whether to use convolution module.
    zero_triu : bool
        Whether to zero the upper triangular part of attention matrix.
    cnn_module_kernel : int
        Kernerl size of convolution module.
    padding_idx : int
        Padding idx for input_layer=embed.
    stochastic_depth_rate : float
        Maximum probability to skip the encoder layer.
    intermediate_layers : Union[List[int], None]
        indices of intermediate CTC layer.
        indices start from 1.
        if not None, intermediate outputs are returned (which changes return type
        signature.)
    """
    def __init__(
@ -563,18 +476,13 @@ class ConformerEncoder(BaseEncoder):
    def forward(self, xs, masks):
        """Encode input sequence.
-        Parameters
+
-        ----------
+        Args:
-        xs : paddle.Tensor
+            xs (Tensor): Input tensor (#batch, time, idim).
-            Input tensor (#batch, time, idim).
+            masks (Tensor): Mask tensor (#batch, 1, time).
-        masks : paddle.Tensor
+        Returns:
-            Mask tensor (#batch, 1, time).
+            Tensor: Output tensor (#batch, time, attention_dim).
-        Returns
+            Tensor: Mask tensor (#batch, 1, time).
        ----------
        paddle.Tensor
            Output tensor (#batch, time, attention_dim).
        paddle.Tensor
            Mask tensor (#batch, 1, time).
        """
        if isinstance(self.embed, (Conv2dSubsampling)):
            xs, masks = self.embed(xs, masks)
--- a/paddlespeech/t2s/modules/transformer/encoder_layer.py
+++ b/paddlespeech/t2s/modules/transformer/encoder_layer.py
@ -20,25 +20,18 @@ from paddle import nn
 class EncoderLayer(nn.Layer):
    """Encoder layer module.
-    Parameters
+    Args:
-    ----------
+        size (int): Input dimension.
-    size : int
+        self_attn (nn.Layer): Self-attention module instance.
-        Input dimension.
+            `MultiHeadedAttention`  instance can be used as the argument.
-    self_attn : nn.Layer
+        feed_forward (nn.Layer): Feed-forward module instance.
-        Self-attention module instance.
+            `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance can be used as the argument.
-        `MultiHeadedAttention`  instance can be used as the argument.
+        dropout_rate (float): Dropout rate.
-    feed_forward : nn.Layer
+        normalize_before (bool): Whether to use layer_norm before the first block.
-        Feed-forward module instance.
+        concat_after (bool): Whether to concat attention layer's input and output.
-        `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance can be used as the argument.
+            if True, additional linear will be applied.
-    dropout_rate : float
+            i.e. x -> x + linear(concat(x, att(x)))
-        Dropout rate.
+            if False, no additional linear will be applied. i.e. x -> x + att(x)
    normalize_before : bool
        Whether to use layer_norm before the first block.
    concat_after : bool
        Whether to concat attention layer's input and output.
        if True, additional linear will be applied.
        i.e. x -> x + linear(concat(x, att(x)))
        if False, no additional linear will be applied. i.e. x -> x + att(x)
    """
    def __init__(
@ -65,21 +58,14 @@ class EncoderLayer(nn.Layer):
    def forward(self, x, mask, cache=None):
        """Compute encoded features.
-        Parameters
+        Args:
-        ----------
+            x (Tensor): Input tensor (#batch, time, size).
-        x_input : paddle.Tensor
+            mask (Tensor): Mask tensor for the input (#batch, time).
-            Input tensor (#batch, time, size).
+            cache (Tensor, optional): Cache tensor of the input (#batch, time - 1, size).
        mask : paddle.Tensor
            Mask tensor for the input (#batch, time).
        cache : paddle.Tensor
                Cache tensor of the input (#batch, time - 1, size).
-        Returns
+        Returns:
-        ----------
+            Tensor: Output tensor (#batch, time, size).
-        paddle.Tensor
+            Tensor: Mask tensor (#batch, time).
            Output tensor (#batch, time, size).
        paddle.Tensor
            Mask tensor (#batch, time).
        """
        residual = x
        if self.normalize_before:
--- a/paddlespeech/t2s/modules/transformer/lightconv.py
+++ b/paddlespeech/t2s/modules/transformer/lightconv.py
@ -30,20 +30,13 @@ class LightweightConvolution(nn.Layer):
    This implementation is based on
    https://github.com/pytorch/fairseq/tree/master/fairseq
-    Parameters
+    Args:
-    ----------
+        wshare (int): the number of kernel of convolution
-    wshare : int
+        n_feat (int): the number of features
-        the number of kernel of convolution
+        dropout_rate (float): dropout_rate
-    n_feat : int
+        kernel_size (int): kernel size (length)
-        the number of features
+        use_kernel_mask (bool): Use causal mask or not for convolution kernel
-    dropout_rate : float
+        use_bias (bool): Use bias term or not.
        dropout_rate
    kernel_size : int
        kernel size (length)
    use_kernel_mask : bool
        Use causal mask or not for convolution kernel
    use_bias : bool
        Use bias term or not.
    """
@ -100,21 +93,14 @@ class LightweightConvolution(nn.Layer):
        This function takes query, key and value but uses only query.
        This is just for compatibility with self-attention layer (attention.py)
-        Parameters
+        Args:
-        ----------
+            query (Tensor): input tensor. (batch, time1, d_model)
-        query : paddle.Tensor
+            key (Tensor): NOT USED. (batch, time2, d_model)  
-            (batch, time1, d_model) input tensor
+            value (Tensor): NOT USED. (batch, time2, d_model) 
-        key : paddle.Tensor
+            mask (Tensor): (batch, time1, time2) mask
-            (batch, time2, d_model) NOT USED
+
-        value : paddle.Tensor
+        Return:
-            (batch, time2, d_model) NOT USED
+            Tensor: output. (batch, time1, d_model)
        mask : paddle.Tensor
            (batch, time1, time2) mask
        Return
        ----------
        x : paddle.Tensor
            (batch, time1, d_model) ouput
        """
        # linear -> GLU -> lightconv -> linear
--- a/paddlespeech/t2s/modules/transformer/mask.py
+++ b/paddlespeech/t2s/modules/transformer/mask.py
@ -17,19 +17,16 @@ import paddle
 def subsequent_mask(size, dtype=paddle.bool):
    """Create mask for subsequent steps (size, size).
-    Parameters
+
-    ----------
+    Args:
-    size : int
+        size (int): size of mask
-        size of mask
+        dtype (paddle.dtype): result dtype
-    dtype : paddle.dtype
+    Return:
-        result dtype
+        Tensor:
-    Return
+            >>> subsequent_mask(3)
-    ----------
+            [[1, 0, 0],
-    paddle.Tensor
+            [1, 1, 0],
-    >>> subsequent_mask(3)
+            [1, 1, 1]]
    [[1, 0, 0],
     [1, 1, 0],
     [1, 1, 1]]
    """
    ret = paddle.ones([size, size], dtype=dtype)
    return paddle.tril(ret)
@ -37,19 +34,13 @@ def subsequent_mask(size, dtype=paddle.bool):
 def target_mask(ys_in_pad, ignore_id, dtype=paddle.bool):
    """Create mask for decoder self-attention.
    Parameters
    ----------
-    ys_pad : paddle.Tensor
+    Args:
-        batch of padded target sequences (B, Lmax)
+        ys_in_pad (Tensor): batch of padded target sequences (B, Lmax)
-    ignore_id : int
+        ignore_id (int): index of padding
-        index of padding
+        dtype (paddle.dtype): result dtype
-    dtype : torch.dtype
+    Return: 
-        result dtype
+        Tensor: (B, Lmax, Lmax)
    Return
    ----------
    paddle.Tensor 
        (B, Lmax, Lmax)
    """
    ys_mask = ys_in_pad != ignore_id
    m = subsequent_mask(ys_mask.shape[-1]).unsqueeze(0)
--- a/paddlespeech/t2s/modules/transformer/multi_layer_conv.py
+++ b/paddlespeech/t2s/modules/transformer/multi_layer_conv.py
@ -31,16 +31,11 @@ class MultiLayeredConv1d(nn.Layer):
    def __init__(self, in_chans, hidden_chans, kernel_size, dropout_rate):
        """Initialize MultiLayeredConv1d module.
-        Parameters
+        Args: 
-        ----------
+            in_chans (int): Number of input channels.
-        in_chans : int
+            hidden_chans (int): Number of hidden channels.
-            Number of input channels.
+            kernel_size (int): Kernel size of conv1d.
-        hidden_chans : int
+            dropout_rate (float): Dropout rate.
            Number of hidden channels.
        kernel_size : int
            Kernel size of conv1d.
        dropout_rate : float
            Dropout rate.
        """
        super().__init__()
@ -62,15 +57,11 @@ class MultiLayeredConv1d(nn.Layer):
    def forward(self, x):
        """Calculate forward propagation.
-        Parameters
+        Args:
-        ----------
+            x (Tensor): Batch of input tensors (B, T, in_chans).
        x : paddle.Tensor
            Batch of input tensors (B, T, in_chans).
-        Returns
+        Returns: 
-        ----------
+            Tensor: Batch of output tensors (B, T, in_chans).
        paddle.Tensor
            Batch of output tensors (B, T, in_chans).
        """
        x = self.relu(self.w_1(x.transpose([0, 2, 1]))).transpose([0, 2, 1])
        return self.w_2(self.dropout(x).transpose([0, 2, 1])).transpose(
@ -87,16 +78,11 @@ class Conv1dLinear(nn.Layer):
    def __init__(self, in_chans, hidden_chans, kernel_size, dropout_rate):
        """Initialize Conv1dLinear module.
-        Parameters
+        Args:
-        ----------
+            in_chans (int): Number of input channels.
-        in_chans : int
+            hidden_chans (int): Number of hidden channels.
-            Number of input channels.
+            kernel_size (int): Kernel size of conv1d.
-        hidden_chans : int
+            dropout_rate (float): Dropout rate.
            Number of hidden channels.
        kernel_size : int
            Kernel size of conv1d.
        dropout_rate : float
            Dropout rate.
        """
        super().__init__()
        self.w_1 = nn.Conv1D(
@ -112,15 +98,11 @@ class Conv1dLinear(nn.Layer):
    def forward(self, x):
        """Calculate forward propagation.
-        Parameters
+        Args:
-        ----------
+            x (Tensor): Batch of input tensors (B, T, in_chans).
        x : paddle.Tensor
        Batch of input tensors (B, T, in_chans).
-        Returns
+        Returns:
-        ----------
+            Tensor: Batch of output tensors (B, T, in_chans).
        paddle.Tensor
            Batch of output tensors (B, T, in_chans).
        """
        x = self.relu(self.w_1(x.transpose([0, 2, 1]))).transpose([0, 2, 1])
--- a/paddlespeech/t2s/modules/transformer/positionwise_feed_forward.py
+++ b/paddlespeech/t2s/modules/transformer/positionwise_feed_forward.py
@ -20,14 +20,10 @@ from paddle import nn
 class PositionwiseFeedForward(nn.Layer):
    """Positionwise feed forward layer.
-    Parameters
+    Args:
-    ----------
+        idim (int): Input dimension.
-    idim : int
+        hidden_units (int): The number of hidden units.
-        Input dimenstion.
+        dropout_rate (float): Dropout rate.
    hidden_units : int
        The number of hidden units.
    dropout_rate : float
        Dropout rate.
    """
    def __init__(self,
--- a/paddlespeech/t2s/modules/transformer/repeat.py
+++ b/paddlespeech/t2s/modules/transformer/repeat.py
@ -29,16 +29,11 @@ class MultiSequential(paddle.nn.Sequential):
 def repeat(N, fn):
    """Repeat module N times.
-    Parameters
+    Args:
-    ----------
+        N (int): Number of repeat time.
-    N : int
+        fn (Callable): Function to generate module.
        Number of repeat time.
    fn : Callable
        Function to generate module.
-    Returns
+    Returns:
-    ----------
+        MultiSequential: Repeated model instance.
    MultiSequential
        Repeated model instance.
    """
    return MultiSequential(*[fn(n) for n in range(N)])
--- a/paddlespeech/t2s/modules/transformer/subsampling.py
+++ b/paddlespeech/t2s/modules/transformer/subsampling.py
@ -21,16 +21,12 @@ from paddlespeech.t2s.modules.transformer.embedding import PositionalEncoding
 class Conv2dSubsampling(nn.Layer):
    """Convolutional 2D subsampling (to 1/4 length).
-    Parameters
+
-    ----------
+    Args:
-    idim : int
+        idim (int): Input dimension.
-        Input dimension.
+        odim (int): Output dimension.
-    odim : int
+        dropout_rate (float): Dropout rate.
-        Output dimension.
+        pos_enc (nn.Layer): Custom position encoding layer.
    dropout_rate : float
        Dropout rate.
    pos_enc : nn.Layer
        Custom position encoding layer.
    """
    def __init__(self, idim, odim, dropout_rate, pos_enc=None):
@ -48,20 +44,12 @@ class Conv2dSubsampling(nn.Layer):
    def forward(self, x, x_mask):
        """Subsample x.
-        Parameters
+        Args:
-        ----------
+            x (Tensor): Input tensor (#batch, time, idim).
-        x : paddle.Tensor
+            x_mask (Tensor): Input mask (#batch, 1, time).
-            Input tensor (#batch, time, idim).
+        Returns:
-        x_mask : paddle.Tensor
+            Tensor: Subsampled tensor (#batch, time', odim), where time' = time // 4.
-            Input mask (#batch, 1, time).
+            Tensor: Subsampled mask (#batch, 1, time'), where time' = time // 4.
        Returns
        ----------
        paddle.Tensor
            Subsampled tensor (#batch, time', odim),
            where time' = time // 4.
        paddle.Tensor
            Subsampled mask (#batch, 1, time'),
            where time' = time // 4.
        """
        # (b, c, t, f)
        x = x.unsqueeze(1)
--- a/paddlespeech/t2s/modules/upsample.py
+++ b/paddlespeech/t2s/modules/upsample.py
@ -27,17 +27,12 @@ class Stretch2D(nn.Layer):
    def __init__(self, w_scale: int, h_scale: int, mode: str="nearest"):
        """Strech an image (or image-like object) with some interpolation.
-        Parameters
+        Args:
-        ----------
+            w_scale (int): Scalar of width.
-        w_scale : int
+            h_scale (int): Scalar of the height.
-            Scalar of width.
+            mode (str, optional): Interpolation mode, modes supported are "nearest", "bilinear",
-        h_scale : int
+                "trilinear", "bicubic", "linear" and "area", by default "nearest"
-            Scalar of the height.
+        For more details about interpolation, see 
        mode : str, optional
            Interpolation mode, modes suppored are "nearest", "bilinear", 
            "trilinear", "bicubic", "linear" and "area",by default "nearest"
            For more details about interpolation, see 
            `paddle.nn.functional.interpolate <https://www.paddlepaddle.org.cn/documentation/docs/en/api/paddle/nn/functional/interpolate_en.html>`_.
        """
        super().__init__()
@ -47,16 +42,14 @@ class Stretch2D(nn.Layer):
    def forward(self, x):
        """
-        Parameters
+
-        ----------
+        Args: 
-        x : Tensor
+            x (Tensor): Shape (N, C, H, W)
-            Shape (N, C, H, W)
+
-
+        Returns:
-        Returns
+            Tensor: The stretched image.
-        -------
+                Shape (N, C, H', W'), where ``H'=h_scale * H``, ``W'=w_scale * W``.
-        Tensor
+            
            Shape (N, C, H', W'), where ``H'=h_scale * H``, ``W'=w_scale * W``.
            The stretched image.
        """
        out = F.interpolate(
            x, scale_factor=(self.h_scale, self.w_scale), mode=self.mode)
@ -67,26 +60,16 @@ class UpsampleNet(nn.Layer):
    """A Layer to upsample spectrogram by applying consecutive stretch and
    convolutions.
-    Parameters
+    Args:
-    ----------
+        upsample_scales (List[int]): Upsampling factors for each stretch.
-    upsample_scales : List[int]
+        nonlinear_activation (Optional[str], optional): Activation after each convolution, by default None
-        Upsampling factors for each strech.
+        nonlinear_activation_params (Dict[str, Any], optional): Parameters passed to construct the activation, by default {}
-    nonlinear_activation : Optional[str], optional
+        interpolate_mode (str, optional): Interpolation mode of the stretch, by default "nearest"
-        Activation after each convolution, by default None
+        freq_axis_kernel_size (int, optional): Convolution kernel size along the frequency axis, by default 1
-    nonlinear_activation_params : Dict[str, Any], optional
+        use_causal_conv (bool, optional): Whether to use causal padding before convolution, by default False
-        Parameters passed to construct the activation, by default {}
+            If True, Causal padding is used along the time axis, 
-    interpolate_mode : str, optional
+            i.e. padding amount is ``receptive field - 1`` and 0 for before and after, respectively.
-        Interpolation mode of the strech, by default "nearest"
+            If False, "same" padding is used along the time axis.
    freq_axis_kernel_size : int, optional
        Convolution kernel size along the frequency axis, by default 1
    use_causal_conv : bool, optional
        Whether to use causal padding before convolution, by default False
        If True, Causal padding is used along the time axis, i.e. padding
        amount is ``receptive field - 1`` and 0 for before and after,
        respectively.
        If False, "same" padding is used along the time axis.
    """
    def __init__(self,
@ -122,16 +105,12 @@ class UpsampleNet(nn.Layer):
    def forward(self, c):
        """
-        Parameters
+        Args:
-        ----------
+            c (Tensor): spectrogram. Shape (N, F, T)
-        c : Tensor
+
-            Shape (N, F, T), spectrogram
+        Returns: 
-
+            Tensor: upsampled spectrogram.
-        Returns
+                Shape (N, F, T'), where ``T' = upsample_factor * T``.
        -------
        Tensor
            Shape (N, F, T'), where ``T' = upsample_factor * T``, upsampled 
            spectrogram
        """
        c = c.unsqueeze(1)
        for f in self.up_layers:
@ -145,35 +124,22 @@ class UpsampleNet(nn.Layer):
 class ConvInUpsampleNet(nn.Layer):
    """A Layer to upsample spectrogram composed of a convolution and an 
    UpsampleNet.
-
+    
-    Parameters
+    Args:
-    ----------
+        upsample_scales (List[int]): Upsampling factors for each stretch.
-    upsample_scales : List[int]
+        nonlinear_activation (Optional[str], optional): Activation after each convolution, by default None
-        Upsampling factors for each strech.
+        nonlinear_activation_params (Dict[str, Any], optional): Parameters passed to construct the activation, by default {}
-    nonlinear_activation : Optional[str], optional
+        interpolate_mode (str, optional): Interpolation mode of the stretch, by default "nearest"
-        Activation after each convolution, by default None
+        freq_axis_kernel_size (int, optional): Convolution kernel size along the frequency axis, by default 1
-    nonlinear_activation_params : Dict[str, Any], optional
+        aux_channels (int, optional): Feature size of the input, by default 80
-        Parameters passed to construct the activation, by default {}
+        aux_context_window (int, optional): Context window of the first 1D convolution applied to the input. It 
-    interpolate_mode : str, optional
+            is related to the kernel size of the convolution, by default 0
-        Interpolation mode of the strech, by default "nearest"
+            If causal convolution is used, the kernel size is ``window + 1``,
-    freq_axis_kernel_size : int, optional
+            else the kernel size is ``2 * window + 1``.
-        Convolution kernel size along the frequency axis, by default 1
+        use_causal_conv (bool, optional): Whether to use causal padding before convolution, by default False
-    aux_channels : int, optional
+            If True, Causal padding is used along the time axis, i.e. padding 
-        Feature size of the input, by default 80
+            amount is ``receptive field - 1`` and 0 for before and after, respectively.
-    aux_context_window : int, optional
+            If False, "same" padding is used along the time axis.
        Context window of the first 1D convolution applied to the input. It 
        related to the kernel size of the convolution, by default 0
        If use causal convolution, the kernel size is ``window + 1``, else
        the kernel size is ``2 * window + 1``.
    use_causal_conv : bool, optional
        Whether to use causal padding before convolution, by default False
        If True, Causal padding is used along the time axis, i.e. padding 
        amount is ``receptive field - 1`` and 0 for before and after, 
        respectively.
        If False, "same" padding is used along the time axis.
    """
    def __init__(self,
@ -204,16 +170,11 @@ class ConvInUpsampleNet(nn.Layer):
    def forward(self, c):
        """
-        Parameters
+        Args:
-        ----------
+            c (Tensor): spectrogram. Shape (N, F, T)
-        c : Tensor
+
-            Shape (N, F, T), spectrogram
+        Returns:
-
+            Tensor: upsampled spectrogram. Shape (N, F, T'), where ``T' = upsample_factor * T``.
        Returns
        -------
        Tensors
            Shape (N, F, T'), where ``T' = upsample_factor * T``, upsampled 
            spectrogram
        """
        c_ = self.conv_in(c)
        c = c_[:, :, :-self.aux_context_window] if self.use_causal_conv else c_
--- a/paddlespeech/t2s/training/experiment.py
+++ b/paddlespeech/t2s/training/experiment.py
@ -57,35 +57,30 @@ class ExperimentBase(object):
    Feel free to add/overwrite other methods and standalone functions if you
    need.
-    Parameters
+    Args:
-    ----------
+        config (yacs.config.CfgNode): The configuration used for the experiment.
-    config: yacs.config.CfgNode
+        args (argparse.Namespace): The parsed command line arguments.
-        The configuration used for the experiment.
+
-
+    Examples:
-    args: argparse.Namespace
+        >>> def main_sp(config, args):
-        The parsed command line arguments.
+        >>>     exp = Experiment(config, args)
-
+        >>>     exp.setup()
-    Examples
+        >>>     exp.resume_or_load()
-    --------
+        >>>     exp.run()
-    >>> def main_sp(config, args):
+        >>>
-    >>>     exp = Experiment(config, args)
+        >>> config = get_cfg_defaults()
-    >>>     exp.setup()
+        >>> parser = default_argument_parser()
-    >>>     exe.resume_or_load()
+        >>> args = parser.parse_args()
-    >>>     exp.run()
+        >>> if args.config:
-    >>>
+        >>>     config.merge_from_file(args.config)
-    >>> config = get_cfg_defaults()
+        >>> if args.opts:
-    >>> parser = default_argument_parser()
+        >>>     config.merge_from_list(args.opts)
-    >>> args = parser.parse_args()
+        >>> config.freeze()
-    >>> if args.config:
+        >>>
-    >>>     config.merge_from_file(args.config)
+        >>> if args.ngpu > 1:
-    >>> if args.opts:
+        >>>     dist.spawn(main_sp, args=(config, args), nprocs=args.ngpu)
-    >>>     config.merge_from_list(args.opts)
+        >>> else:
-    >>> config.freeze()
+        >>>     main_sp(config, args)
    >>>
    >>> if args.ngpu > 1:
    >>>     dist.spawn(main_sp, args=(config, args), nprocs=args.ngpu)
    >>> else:
    >>>     main_sp(config, args)
    """
    def __init__(self, config, args):
--- a/paddlespeech/t2s/training/extensions/snapshot.py
+++ b/paddlespeech/t2s/training/extensions/snapshot.py
@ -43,10 +43,8 @@ class Snapshot(extension.Extension):
    parameters and optimizer states. If the updater inside the trainer
    subclasses StandardUpdater, everything is good to go.
-    Parameters
+    Args:
-    ----------
+        checkpoint_dir (Union[str, Path]): The directory to save checkpoints into.
    checkpoint_dir : Union[str, Path]
        The directory to save checkpoints into.
    """
    trigger = (1, 'epoch')
--- a/paddlespeech/t2s/utils/error_rate.py
+++ b/paddlespeech/t2s/utils/error_rate.py
@ -70,21 +70,14 @@ def word_errors(reference, hypothesis, ignore_case=False, delimiter=' '):
    """Compute the levenshtein distance between reference sequence and
    hypothesis sequence in word-level.
-    Parameters
+    Args:
-    ----------
+        reference (str): The reference sentence.
-    reference : str
+        hypothesis (str): The hypothesis sentence.
-        The reference sentence.
+        ignore_case (bool): Whether to ignore case, i.e. compare case-insensitively.
-    hypothesis : str
+        delimiter (str): Delimiter of input sentences.
-        The hypothesis sentence.
+
-    ignore_case : bool
+    Returns:
-        Whether case-sensitive or not.
+        list: Levenshtein distance and word number of reference sentence.
    delimiter : char(str)
        Delimiter of input sentences.
    Returns
    ----------
    list
        Levenshtein distance and word number of reference sentence.
    """
    if ignore_case:
        reference = reference.lower()
@ -101,21 +94,14 @@ def char_errors(reference, hypothesis, ignore_case=False, remove_space=False):
    """Compute the levenshtein distance between reference sequence and
    hypothesis sequence in char-level.
-    Parameters
+    Args:
-    ----------
+        reference (str): The reference sentence.
-    reference: str
+        hypothesis (str): The hypothesis sentence.
-        The reference sentence.
+        ignore_case (bool): Whether to ignore case, i.e. compare case-insensitively.
-    hypothesis: str
+        remove_space (bool): Whether to remove internal space characters.
-        The hypothesis sentence.
+
-    ignore_case: bool
+    Returns:
-        Whether case-sensitive or not.
+        list: Levenshtein distance and length of reference sentence.
    remove_space: bool
        Whether remove internal space characters
    Returns
    ----------
    list
        Levenshtein distance and length of reference sentence.
    """
    if ignore_case:
        reference = reference.lower()
@ -146,27 +132,17 @@ def wer(reference, hypothesis, ignore_case=False, delimiter=' '):
    We can use levenshtein distance to calculate WER. Please draw an attention
    that empty items will be removed when splitting sentences by delimiter.
-    Parameters
+    Args:
-    ----------
+        reference (str): The reference sentence.
-    reference: str
+        hypothesis (str): The hypothesis sentence.
-        The reference sentence.
+        ignore_case (bool): Whether to ignore case, i.e. compare case-insensitively.
-
+        delimiter (str): Delimiter of input sentences.
-    hypothesis: str
+
-        The hypothesis sentence.
+    Returns: 
-    ignore_case: bool
+        float: Word error rate.
-        Whether case-sensitive or not.
+
-    delimiter: char
+    Raises:
-        Delimiter of input sentences.
+        ValueError: If word number of reference is zero.
    Returns
    ----------
    float
         Word error rate.
    Raises
    ----------
    ValueError
        If word number of reference is zero.
    """
    edit_distance, ref_len = word_errors(reference, hypothesis, ignore_case,
                                         delimiter)
@ -194,26 +170,17 @@ def cer(reference, hypothesis, ignore_case=False, remove_space=False):
    space characters will be truncated and multiple consecutive space
    characters in a sentence will be replaced by one space character.
-    Parameters
+    Args:
-    ----------
+        reference (str): The reference sentence.
-    reference: str
+        hypothesis (str): The hypothesis sentence.
-        The reference sentence.
+        ignore_case (bool): Whether to ignore case, i.e. compare case-insensitively.
-    hypothesis: str
+        remove_space (bool): Whether to remove internal space characters.
-        The hypothesis sentence.
+
-    ignore_case: bool
+    Returns: 
-        Whether case-sensitive or not.
+        float: Character error rate.
-    remove_space: bool
+
-        Whether remove internal space characters
+    Raises: 
-
+        ValueError: If the reference length is zero.
    Returns
    ----------
    float
        Character error rate.
    Raises
    ----------
    ValueError
        If the reference length is zero.
    """
    edit_distance, ref_len = char_errors(reference, hypothesis, ignore_case,
                                         remove_space)
--- a/paddlespeech/t2s/utils/h5_utils.py
+++ b/paddlespeech/t2s/utils/h5_utils.py
@ -23,18 +23,12 @@ import numpy as np
 def read_hdf5(filename: Union[Path, str], dataset_name: str) -> Any:
    """Read a dataset from a HDF5 file.
    Args:
        filename (Union[Path, str]): Path of the HDF5 file.
        dataset_name (str): Name of the dataset to read.
-    Parameters
+    Returns:
-    ----------
+        Any: The retrieved dataset.
    filename : Union[Path, str]
        Path of the HDF5 file.
    dataset_name : str
        Name of the dataset to read.
    Returns
    -------
    Any
        The retrieved dataset.
    """
    filename = Path(filename)
@ -60,17 +54,11 @@ def write_hdf5(filename: Union[Path, str],
               write_data: np.ndarray,
               is_overwrite: bool=True) -> None:
    """Write dataset to HDF5 file.
-
+    Args:
-    Parameters
+        filename (Union[Path, str]): Path of the HDF5 file.
-    ----------
+        dataset_name (str): Name of the dataset to write to.
-    filename : Union[Path, str]
+        write_data (np.ndarray): The data to write.
-        Path of the HDF5 file.
+        is_overwrite (bool, optional): Whether to overwrite, by default True
    dataset_name : str
        Name of the dataset to write to.
    write_data : np.ndarrays
        The data to write.
    is_overwrite : bool, optional
        Whether to overwrite, by default True
    """
    # convert to numpy array
    filename = Path(filename)