From 683679bec72009517f6352395b6a133018cc92dd Mon Sep 17 00:00:00 2001 From: TianYuan Date: Thu, 10 Feb 2022 12:41:24 +0000 Subject: [PATCH 1/2] merge data and datasets, test=tts --- paddlespeech/t2s/__init__.py | 1 - paddlespeech/t2s/data/__init__.py | 17 -- paddlespeech/t2s/data/dataset.py | 261 ------------------ paddlespeech/t2s/datasets/__init__.py | 1 - paddlespeech/t2s/datasets/am_batch_fn.py | 2 +- paddlespeech/t2s/{data => datasets}/batch.py | 0 paddlespeech/t2s/datasets/common.py | 92 ------ .../t2s/{data => datasets}/get_feats.py | 0 .../t2s/exps/fastspeech2/preprocess.py | 6 +- .../parallelwave_gan/synthesize_from_wav.py | 2 +- .../t2s/exps/gan_vocoder/preprocess.py | 2 +- .../t2s/exps/speedyspeech/preprocess.py | 2 +- paddlespeech/t2s/exps/tacotron2/preprocess.py | 2 +- .../t2s/exps/transformer_tts/preprocess.py | 2 +- paddlespeech/t2s/exps/waveflow/ljspeech.py | 4 +- 15 files changed, 11 insertions(+), 383 deletions(-) delete mode 100644 paddlespeech/t2s/data/__init__.py delete mode 100644 paddlespeech/t2s/data/dataset.py rename paddlespeech/t2s/{data => datasets}/batch.py (100%) delete mode 100644 paddlespeech/t2s/datasets/common.py rename paddlespeech/t2s/{data => datasets}/get_feats.py (100%) diff --git a/paddlespeech/t2s/__init__.py b/paddlespeech/t2s/__init__.py index 8a0acc48..7d93c026 100644 --- a/paddlespeech/t2s/__init__.py +++ b/paddlespeech/t2s/__init__.py @@ -13,7 +13,6 @@ # limitations under the License. import logging -from . import data from . import datasets from . import exps from . import frontend diff --git a/paddlespeech/t2s/data/__init__.py b/paddlespeech/t2s/data/__init__.py deleted file mode 100644 index c605205d..00000000 --- a/paddlespeech/t2s/data/__init__.py +++ /dev/null @@ -1,17 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""t2s's infrastructure for data processing. -""" -from .batch import * -from .dataset import * diff --git a/paddlespeech/t2s/data/dataset.py b/paddlespeech/t2s/data/dataset.py deleted file mode 100644 index 2d6c03cb..00000000 --- a/paddlespeech/t2s/data/dataset.py +++ /dev/null @@ -1,261 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-import six -from paddle.io import Dataset - -__all__ = [ - "split", - "TransformDataset", - "CacheDataset", - "TupleDataset", - "DictDataset", - "SliceDataset", - "SubsetDataset", - "FilterDataset", - "ChainDataset", -] - - -def split(dataset, first_size): - """A utility function to split a dataset into two datasets.""" - first = SliceDataset(dataset, 0, first_size) - second = SliceDataset(dataset, first_size, len(dataset)) - return first, second - - -class TransformDataset(Dataset): - def __init__(self, dataset, transform): - """Dataset which is transformed from another with a transform. - - Args: - dataset (Dataset): the base dataset. - transform (callable): the transform which takes an example of the base dataset as parameter and return a new example. - """ - self._dataset = dataset - self._transform = transform - - def __len__(self): - return len(self._dataset) - - def __getitem__(self, i): - in_data = self._dataset[i] - return self._transform(in_data) - - -class CacheDataset(Dataset): - def __init__(self, dataset): - """A lazy cache of the base dataset. - - Args: - dataset (Dataset): the base dataset to cache. - """ - self._dataset = dataset - self._cache = dict() - - def __len__(self): - return len(self._dataset) - - def __getitem__(self, i): - if i not in self._cache: - self._cache[i] = self._dataset[i] - return self._cache[i] - - -class TupleDataset(Dataset): - def __init__(self, *datasets): - """A compound dataset made from several datasets of the same length. An example of the `TupleDataset` is a tuple of examples from the constituent datasets. - - Args: - datasets: tuple[Dataset], the constituent datasets. - """ - if not datasets: - raise ValueError("no datasets are given") - length = len(datasets[0]) - for i, dataset in enumerate(datasets): - if len(dataset) != length: - raise ValueError("all the datasets should have the same length." - "dataset {} has a different length".format(i)) - self._datasets = datasets - self._length = length - - def __getitem__(self, index): - # SOA - batches = [dataset[index] for dataset in self._datasets] - if isinstance(index, slice): - length = len(batches[0]) - # AOS - return [ - tuple([batch[i] for batch in batches]) - for i in six.moves.range(length) - ] - else: - return tuple(batches) - - def __len__(self): - return self._length - - -class DictDataset(Dataset): - def __init__(self, **datasets): - """ - A compound dataset made from several datasets of the same length. An - example of the `DictDataset` is a dict of examples from the constituent - datasets. - - WARNING: paddle does not have a good support for DictDataset, because - every batch yield from a DataLoader is a list, but it cannot be a dict. - So you have to provide a collate function because you cannot use the - default one. - - Args: - datasets: Dict[Dataset], the constituent datasets. - """ - if not datasets: - raise ValueError("no datasets are given") - length = None - for key, dataset in six.iteritems(datasets): - if length is None: - length = len(dataset) - elif len(dataset) != length: - raise ValueError( - "all the datasets should have the same length." 
- "dataset {} has a different length".format(key)) - self._datasets = datasets - self._length = length - - def __getitem__(self, index): - batches = { - key: dataset[index] - for key, dataset in six.iteritems(self._datasets) - } - if isinstance(index, slice): - length = len(six.next(six.itervalues(batches))) - return [{key: batch[i] - for key, batch in six.iteritems(batches)} - for i in six.moves.range(length)] - else: - return batches - - def __len__(self): - return self._length - - -class SliceDataset(Dataset): - def __init__(self, dataset, start, finish, order=None): - """A Dataset which is a slice of the base dataset. - - Args: - dataset (Dataset): the base dataset. - start (int): the start of the slice. - finish (int): the end of the slice, not inclusive. - order (List[int], optional): the order, it is a permutation of the valid example ids of the base dataset. If `order` is provided, the slice is taken in `order`. Defaults to None. - """ - if start < 0 or finish > len(dataset): - raise ValueError("subset overruns the dataset.") - self._dataset = dataset - self._start = start - self._finish = finish - self._size = finish - start - - if order is not None and len(order) != len(dataset): - raise ValueError( - "order should have the same length as the dataset" - "len(order) = {} which does not euqals len(dataset) = {} ". - format(len(order), len(dataset))) - self._order = order - - def __len__(self): - return self._size - - def __getitem__(self, i): - if i >= 0: - if i >= self._size: - raise IndexError('dataset index out of range') - index = self._start + i - else: - if i < -self._size: - raise IndexError('dataset index out of range') - index = self._finish + i - - if self._order is not None: - index = self._order[index] - return self._dataset[index] - - -class SubsetDataset(Dataset): - def __init__(self, dataset, indices): - """A Dataset which is a subset of the base dataset. - - Args: - dataset (Dataset): the base dataset. - indices (Iterable[int]): the indices of the examples to pick. - """ - self._dataset = dataset - if len(indices) > len(dataset): - raise ValueError("subset's size larger that dataset's size!") - self._indices = indices - self._size = len(indices) - - def __len__(self): - return self._size - - def __getitem__(self, i): - index = self._indices[i] - return self._dataset[index] - - -class FilterDataset(Dataset): - def __init__(self, dataset, filter_fn): - """A filtered dataset. - - Args: - dataset (Dataset): the base dataset. - filter_fn (callable): a callable which takes an example of the base dataset and return a boolean. - """ - self._dataset = dataset - self._indices = [ - i for i in range(len(dataset)) if filter_fn(dataset[i]) - ] - self._size = len(self._indices) - - def __len__(self): - return self._size - - def __getitem__(self, i): - index = self._indices[i] - return self._dataset[index] - - -class ChainDataset(Dataset): - def __init__(self, *datasets): - """A concatenation of the several datasets which the same structure. - - Args: - datasets (Iterable[Dataset]): datasets to concat. 
- """ - self._datasets = datasets - - def __len__(self): - return sum(len(dataset) for dataset in self._datasets) - - def __getitem__(self, i): - if i < 0: - raise IndexError("ChainDataset doesnot support negative indexing.") - - for dataset in self._datasets: - if i < len(dataset): - return dataset[i] - i -= len(dataset) - - raise IndexError("dataset index out of range") diff --git a/paddlespeech/t2s/datasets/__init__.py b/paddlespeech/t2s/datasets/__init__.py index fc64a82f..caf20aac 100644 --- a/paddlespeech/t2s/datasets/__init__.py +++ b/paddlespeech/t2s/datasets/__init__.py @@ -11,5 +11,4 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from .common import * from .ljspeech import * diff --git a/paddlespeech/t2s/datasets/am_batch_fn.py b/paddlespeech/t2s/datasets/am_batch_fn.py index 655e06e3..4e3ad3c1 100644 --- a/paddlespeech/t2s/datasets/am_batch_fn.py +++ b/paddlespeech/t2s/datasets/am_batch_fn.py @@ -14,7 +14,7 @@ import numpy as np import paddle -from paddlespeech.t2s.data.batch import batch_sequences +from paddlespeech.t2s.datasets.batch import batch_sequences def tacotron2_single_spk_batch_fn(examples): diff --git a/paddlespeech/t2s/data/batch.py b/paddlespeech/t2s/datasets/batch.py similarity index 100% rename from paddlespeech/t2s/data/batch.py rename to paddlespeech/t2s/datasets/batch.py diff --git a/paddlespeech/t2s/datasets/common.py b/paddlespeech/t2s/datasets/common.py deleted file mode 100644 index 122a35ae..00000000 --- a/paddlespeech/t2s/datasets/common.py +++ /dev/null @@ -1,92 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from pathlib import Path -from typing import List - -import librosa -import numpy as np -from paddle.io import Dataset - -__all__ = ["AudioSegmentDataset", "AudioDataset", "AudioFolderDataset"] - - -class AudioSegmentDataset(Dataset): - """A simple dataset adaptor for audio files to train vocoders. - Read -> trim silence -> normalize -> extract a segment - """ - - def __init__(self, - file_paths: List[Path], - sample_rate: int, - length: int, - top_db: float): - self.file_paths = file_paths - self.sr = sample_rate - self.top_db = top_db - self.length = length # samples in the clip - - def __getitem__(self, i): - fpath = self.file_paths[i] - y, sr = librosa.load(fpath, sr=self.sr) - y, _ = librosa.effects.trim(y, top_db=self.top_db) - y = librosa.util.normalize(y) - y = y.astype(np.float32) - - # pad or trim - if y.size <= self.length: - y = np.pad(y, [0, self.length - len(y)], mode='constant') - else: - start = np.random.randint(0, 1 + len(y) - self.length) - y = y[start:start + self.length] - return y - - def __len__(self): - return len(self.file_paths) - - -class AudioDataset(Dataset): - """A simple dataset adaptor for the audio files. 
- Read -> trim silence -> normalize - """ - - def __init__(self, - file_paths: List[Path], - sample_rate: int, - top_db: float=60): - self.file_paths = file_paths - self.sr = sample_rate - self.top_db = top_db - - def __getitem__(self, i): - fpath = self.file_paths[i] - y, sr = librosa.load(fpath, sr=self.sr) - y, _ = librosa.effects.trim(y, top_db=self.top_db) - y = librosa.util.normalize(y) - y = y.astype(np.float32) - return y - - def __len__(self): - return len(self.file_paths) - - -class AudioFolderDataset(AudioDataset): - def __init__( - self, - root, - sample_rate, - top_db=60, - extension=".wav", ): - root = Path(root).expanduser() - file_paths = sorted(list(root.rglob("*{}".format(extension)))) - super().__init__(file_paths, sample_rate, top_db) diff --git a/paddlespeech/t2s/data/get_feats.py b/paddlespeech/t2s/datasets/get_feats.py similarity index 100% rename from paddlespeech/t2s/data/get_feats.py rename to paddlespeech/t2s/datasets/get_feats.py diff --git a/paddlespeech/t2s/exps/fastspeech2/preprocess.py b/paddlespeech/t2s/exps/fastspeech2/preprocess.py index fd6da2cb..5bda7545 100644 --- a/paddlespeech/t2s/exps/fastspeech2/preprocess.py +++ b/paddlespeech/t2s/exps/fastspeech2/preprocess.py @@ -27,9 +27,9 @@ import tqdm import yaml from yacs.config import CfgNode -from paddlespeech.t2s.data.get_feats import Energy -from paddlespeech.t2s.data.get_feats import LogMelFBank -from paddlespeech.t2s.data.get_feats import Pitch +from paddlespeech.t2s.datasets.get_feats import Energy +from paddlespeech.t2s.datasets.get_feats import LogMelFBank +from paddlespeech.t2s.datasets.get_feats import Pitch from paddlespeech.t2s.datasets.preprocess_utils import compare_duration_and_mel_length from paddlespeech.t2s.datasets.preprocess_utils import get_input_token from paddlespeech.t2s.datasets.preprocess_utils import get_phn_dur diff --git a/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/synthesize_from_wav.py b/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/synthesize_from_wav.py index f5affb50..def30e67 100644 --- a/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/synthesize_from_wav.py +++ b/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/synthesize_from_wav.py @@ -23,7 +23,7 @@ import soundfile as sf import yaml from yacs.config import CfgNode -from paddlespeech.t2s.data.get_feats import LogMelFBank +from paddlespeech.t2s.datasets.get_feats import LogMelFBank from paddlespeech.t2s.models.parallel_wavegan import PWGGenerator from paddlespeech.t2s.models.parallel_wavegan import PWGInference from paddlespeech.t2s.modules.normalizer import ZScore diff --git a/paddlespeech/t2s/exps/gan_vocoder/preprocess.py b/paddlespeech/t2s/exps/gan_vocoder/preprocess.py index 47d0a292..4871bca7 100644 --- a/paddlespeech/t2s/exps/gan_vocoder/preprocess.py +++ b/paddlespeech/t2s/exps/gan_vocoder/preprocess.py @@ -27,7 +27,7 @@ import tqdm import yaml from yacs.config import CfgNode -from paddlespeech.t2s.data.get_feats import LogMelFBank +from paddlespeech.t2s.datasets.get_feats import LogMelFBank from paddlespeech.t2s.datasets.preprocess_utils import get_phn_dur from paddlespeech.t2s.datasets.preprocess_utils import merge_silence from paddlespeech.t2s.utils import str2bool diff --git a/paddlespeech/t2s/exps/speedyspeech/preprocess.py b/paddlespeech/t2s/exps/speedyspeech/preprocess.py index db888fba..3f81c4e1 100644 --- a/paddlespeech/t2s/exps/speedyspeech/preprocess.py +++ b/paddlespeech/t2s/exps/speedyspeech/preprocess.py @@ -27,7 +27,7 @@ import tqdm import yaml from yacs.config import CfgNode 
-from paddlespeech.t2s.data.get_feats import LogMelFBank +from paddlespeech.t2s.datasets.get_feats import LogMelFBank from paddlespeech.t2s.datasets.preprocess_utils import compare_duration_and_mel_length from paddlespeech.t2s.datasets.preprocess_utils import get_phn_dur from paddlespeech.t2s.datasets.preprocess_utils import get_phones_tones diff --git a/paddlespeech/t2s/exps/tacotron2/preprocess.py b/paddlespeech/t2s/exps/tacotron2/preprocess.py index ffbeaad9..7f41089e 100644 --- a/paddlespeech/t2s/exps/tacotron2/preprocess.py +++ b/paddlespeech/t2s/exps/tacotron2/preprocess.py @@ -27,7 +27,7 @@ import tqdm import yaml from yacs.config import CfgNode -from paddlespeech.t2s.data.get_feats import LogMelFBank +from paddlespeech.t2s.datasets.get_feats import LogMelFBank from paddlespeech.t2s.datasets.preprocess_utils import compare_duration_and_mel_length from paddlespeech.t2s.datasets.preprocess_utils import get_input_token from paddlespeech.t2s.datasets.preprocess_utils import get_phn_dur diff --git a/paddlespeech/t2s/exps/transformer_tts/preprocess.py b/paddlespeech/t2s/exps/transformer_tts/preprocess.py index 93158b67..7cfa91b9 100644 --- a/paddlespeech/t2s/exps/transformer_tts/preprocess.py +++ b/paddlespeech/t2s/exps/transformer_tts/preprocess.py @@ -26,7 +26,7 @@ import tqdm import yaml from yacs.config import CfgNode as Configuration -from paddlespeech.t2s.data.get_feats import LogMelFBank +from paddlespeech.t2s.datasets.get_feats import LogMelFBank from paddlespeech.t2s.frontend import English diff --git a/paddlespeech/t2s/exps/waveflow/ljspeech.py b/paddlespeech/t2s/exps/waveflow/ljspeech.py index 655b63da..a6efa9ec 100644 --- a/paddlespeech/t2s/exps/waveflow/ljspeech.py +++ b/paddlespeech/t2s/exps/waveflow/ljspeech.py @@ -17,8 +17,8 @@ import numpy as np import pandas from paddle.io import Dataset -from paddlespeech.t2s.data.batch import batch_spec -from paddlespeech.t2s.data.batch import batch_wav +from paddlespeech.t2s.datasets.batch import batch_spec +from paddlespeech.t2s.datasets.batch import batch_wav class LJSpeech(Dataset): From 9699c00769e90fcfcd297240e87b12adb21e8caf Mon Sep 17 00:00:00 2001 From: TianYuan Date: Fri, 11 Feb 2022 14:11:40 +0000 Subject: [PATCH 2/2] change the docstring style from numpydoc to google, test=tts --- paddlespeech/t2s/datasets/data_table.py | 56 +- paddlespeech/t2s/datasets/preprocess_utils.py | 51 +- paddlespeech/t2s/datasets/vocoder_batch_fn.py | 64 +-- .../t2s/exps/transformer_tts/preprocess.py | 28 +- paddlespeech/t2s/frontend/arpabet.py | 104 ++-- paddlespeech/t2s/frontend/phonectic.py | 145 ++---- paddlespeech/t2s/frontend/vocab.py | 22 +- .../frontend/zh_normalization/chronology.py | 30 +- .../t2s/frontend/zh_normalization/num.py | 70 +-- .../frontend/zh_normalization/phonecode.py | 20 +- .../frontend/zh_normalization/quantifier.py | 10 +- .../zh_normalization/text_normlization.py | 12 +- .../t2s/models/fastspeech2/fastspeech2.py | 473 ++++++----------- paddlespeech/t2s/models/hifigan/hifigan.py | 295 ++++------- paddlespeech/t2s/models/melgan/melgan.py | 199 +++----- .../t2s/models/melgan/style_melgan.py | 109 ++-- .../parallel_wavegan/parallel_wavegan.py | 227 +++----- .../t2s/models/tacotron2/tacotron2.py | 207 +++----- .../models/transformer_tts/transformer_tts.py | 333 +++++------- paddlespeech/t2s/models/waveflow.py | 483 ++++++------------ paddlespeech/t2s/models/wavernn/wavernn.py | 240 ++++----- paddlespeech/t2s/modules/causal_conv.py | 24 +- .../t2s/modules/conformer/convolution.py | 23 +- 
.../t2s/modules/conformer/encoder_layer.py | 82 ++- paddlespeech/t2s/modules/conv.py | 164 +++--- paddlespeech/t2s/modules/geometry.py | 28 +- paddlespeech/t2s/modules/layer_norm.py | 22 +- paddlespeech/t2s/modules/losses.py | 434 ++++++---------- paddlespeech/t2s/modules/nets_utils.py | 121 ++--- paddlespeech/t2s/modules/pqmf.py | 64 +-- .../modules/predictor/duration_predictor.py | 87 ++-- .../t2s/modules/predictor/length_regulator.py | 24 +- .../modules/predictor/variance_predictor.py | 33 +- paddlespeech/t2s/modules/residual_block.py | 90 ++-- paddlespeech/t2s/modules/residual_stack.py | 44 +- paddlespeech/t2s/modules/style_encoder.py | 124 ++--- .../t2s/modules/tacotron2/attentions.py | 213 +++----- paddlespeech/t2s/modules/tacotron2/decoder.py | 271 ++++------ paddlespeech/t2s/modules/tacotron2/encoder.py | 75 +-- paddlespeech/t2s/modules/tade_res_block.py | 37 +- .../t2s/modules/transformer/attention.py | 141 ++--- .../t2s/modules/transformer/decoder.py | 150 ++---- .../t2s/modules/transformer/decoder_layer.py | 77 ++- .../t2s/modules/transformer/embedding.py | 83 +-- .../t2s/modules/transformer/encoder.py | 316 ++++-------- .../t2s/modules/transformer/encoder_layer.py | 52 +- .../t2s/modules/transformer/lightconv.py | 44 +- paddlespeech/t2s/modules/transformer/mask.py | 41 +- .../modules/transformer/multi_layer_conv.py | 54 +- .../transformer/positionwise_feed_forward.py | 12 +- .../t2s/modules/transformer/repeat.py | 15 +- .../t2s/modules/transformer/subsampling.py | 36 +- paddlespeech/t2s/modules/upsample.py | 141 ++--- paddlespeech/t2s/training/experiment.py | 53 +- .../t2s/training/extensions/snapshot.py | 6 +- paddlespeech/t2s/utils/error_rate.py | 109 ++-- paddlespeech/t2s/utils/h5_utils.py | 32 +- 57 files changed, 2350 insertions(+), 4150 deletions(-) diff --git a/paddlespeech/t2s/datasets/data_table.py b/paddlespeech/t2s/datasets/data_table.py index b0e4c891..c9815af2 100644 --- a/paddlespeech/t2s/datasets/data_table.py +++ b/paddlespeech/t2s/datasets/data_table.py @@ -22,26 +22,17 @@ from paddle.io import Dataset class DataTable(Dataset): """Dataset to load and convert data for general purpose. - - Parameters - ---------- - data : List[Dict[str, Any]] - Metadata, a list of meta datum, each of which is composed of - several fields - fields : List[str], optional - Fields to use, if not specified, all the fields in the data are - used, by default None - converters : Dict[str, Callable], optional - Converters used to process each field, by default None - use_cache : bool, optional - Whether to use cache, by default False - - Raises - ------ - ValueError - If there is some field that does not exist in data. - ValueError - If there is some field in converters that does not exist in fields. + Args: + data (List[Dict[str, Any]]): Metadata, a list of meta datum, each of which is composed of several fields + fields (List[str], optional): Fields to use, if not specified, all the fields in the data are used, by default None + converters (Dict[str, Callable], optional): Converters used to process each field, by default None + use_cache (bool, optional): Whether to use cache, by default False + + Raises: + ValueError: + If there is some field that does not exist in data. + ValueError: + If there is some field in converters that does not exist in fields. """ def __init__(self, @@ -95,15 +86,11 @@ class DataTable(Dataset): """Convert a meta datum to an example by applying the corresponding converters to each fields requested. 
- Parameters
- ----------
- meta_datum : Dict[str, Any]
- Meta datum
+ Args:
+ meta_datum (Dict[str, Any]): Meta datum

- Returns
- -------
- Dict[str, Any]
- Converted example
+ Returns:
+ Dict[str, Any]: Converted example
 """
 example = {}
 for field in self.fields:
@@ -118,16 +105,11 @@ class DataTable(Dataset):
 def __getitem__(self, idx: int) -> Dict[str, Any]:
 """Get an example given an index.
+ Args:
+ idx (int): Index of the example to get

- Parameters
- ----------
- idx : int
- Index of the example to get
-
- Returns
- -------
- Dict[str, Any]
- A converted example
+ Returns:
+ Dict[str, Any]: A converted example
 """
 if self.use_cache and self.caches[idx] is not None:
 return self.caches[idx]
diff --git a/paddlespeech/t2s/datasets/preprocess_utils.py b/paddlespeech/t2s/datasets/preprocess_utils.py
index 8b01f6c3..445b69bd 100644
--- a/paddlespeech/t2s/datasets/preprocess_utils.py
+++ b/paddlespeech/t2s/datasets/preprocess_utils.py
@@ -18,14 +18,10 @@ import re
 def get_phn_dur(file_name):
 '''
 read MFA duration.txt
- Parameters
- ----------
- file_name : str or Path
- path of gen_duration_from_textgrid.py's result
- Returns
- ----------
- Dict
- sentence: {'utt': ([char], [int])}
+ Args:
+ file_name (str or Path): path of gen_duration_from_textgrid.py's result
+ Returns:
+ Dict: sentence: {'utt': ([char], [int])}
 '''
 f = open(file_name, 'r')
 sentence = {}
@@ -48,10 +44,8 @@ def get_phn_dur(file_name):
 def merge_silence(sentence):
 ''' merge silences
- Parameters
- ----------
- sentence : Dict
- sentence: {'utt': (([char], [int]), str)}
+ Args:
+ sentence (Dict): sentence: {'utt': (([char], [int]), str)}
 '''
 for utt in sentence:
 cur_phn, cur_dur, speaker = sentence[utt]
@@ -81,12 +75,9 @@ def merge_silence(sentence):
 def get_input_token(sentence, output_path, dataset="baker"):
 '''
 get phone set from training data and save it
- Parameters
- ----------
- sentence : Dict
- sentence: {'utt': ([char], [int])}
- output_path : str or path
- path to save phone_id_map
+ Args:
+ sentence (Dict): sentence: {'utt': ([char], [int])}
+ output_path (str or path): path to save phone_id_map
 '''
 phn_token = set()
 for utt in sentence:
@@ -112,14 +103,10 @@ def get_phones_tones(sentence,
 dataset="baker"):
 '''
 get phone set and tone set from training data and save it
- Parameters
- ----------
- sentence : Dict
- sentence: {'utt': ([char], [int])}
- phones_output_path : str or path
- path to save phone_id_map
- tones_output_path : str or path
- path to save tone_id_map
+ Args:
+ sentence (Dict): sentence: {'utt': ([char], [int])}
+ phones_output_path (str or path): path to save phone_id_map
+ tones_output_path (str or path): path to save tone_id_map
 '''
 phn_token = set()
 tone_token = set()
@@ -162,14 +149,10 @@ def get_spk_id_map(speaker_set, output_path):
 def compare_duration_and_mel_length(sentences, utt, mel):
 '''
 check duration error, correct sentences[utt] if possible, else pop sentences[utt]
- Parameters
- ----------
- sentences : Dict
- sentences[utt] = [phones_list ,durations_list]
- utt : str
- utt_id
- mel : np.ndarry
- features (num_frames, n_mels)
+ Args:
+ sentences (Dict): sentences[utt] = [phones_list ,durations_list]
+ utt (str): utt_id
+ mel (np.ndarray): features (num_frames, n_mels)
 '''
 if utt in sentences:
diff --git a/paddlespeech/t2s/datasets/vocoder_batch_fn.py b/paddlespeech/t2s/datasets/vocoder_batch_fn.py
index d969a1d3..08748de0 100644
--- a/paddlespeech/t2s/datasets/vocoder_batch_fn.py
+++ b/paddlespeech/t2s/datasets/vocoder_batch_fn.py
@@ -29,15 +29,11 @@ class Clip(object):
hop_size=256, aux_context_window=0, ): """Initialize customized collater for DataLoader. + Args: - Parameters - ---------- - batch_max_steps : int - The maximum length of input signal in batch. - hop_size : int - Hop size of auxiliary features. - aux_context_window : int - Context window size for auxiliary feature conv. + batch_max_steps (int): The maximum length of input signal in batch. + hop_size (int): Hop size of auxiliary features. + aux_context_window (int): Context window size for auxiliary feature conv. """ if batch_max_steps % hop_size != 0: @@ -56,18 +52,15 @@ class Clip(object): def __call__(self, batch): """Convert into batch tensors. - Parameters - ---------- - batch : list - list of tuple of the pair of audio and features. Audio shape (T, ), features shape(T', C). + Args: + batch (list): list of tuple of the pair of audio and features. Audio shape (T, ), features shape(T', C). - Returns - ---------- - Tensor - Auxiliary feature batch (B, C, T'), where - T = (T' - 2 * aux_context_window) * hop_size. - Tensor - Target signal batch (B, 1, T). + Returns: + Tensor: + Auxiliary feature batch (B, C, T'), where + T = (T' - 2 * aux_context_window) * hop_size. + Tensor: + Target signal batch (B, 1, T). """ # check length @@ -104,11 +97,10 @@ class Clip(object): def _adjust_length(self, x, c): """Adjust the audio and feature lengths. - Note - ------- - Basically we assume that the length of x and c are adjusted - through preprocessing stage, but if we use other library processed - features, this process will be needed. + Note: + Basically we assume that the length of x and c are adjusted + through preprocessing stage, but if we use other library processed + features, this process will be needed. """ if len(x) < c.shape[0] * self.hop_size: @@ -162,22 +154,14 @@ class WaveRNNClip(Clip): # voc_pad = 2 this will pad the input so that the resnet can 'see' wider than input length # max_offsets = n_frames - 2 - (mel_win + 2 * hp.voc_pad) = n_frames - 15 """Convert into batch tensors. - - Parameters - ---------- - batch : list - list of tuple of the pair of audio and features. - Audio shape (T, ), features shape(T', C). - - Returns - ---------- - Tensor - Input signal batch (B, 1, T). - Tensor - Target signal batch (B, 1, T). - Tensor - Auxiliary feature batch (B, C, T'), where - T = (T' - 2 * aux_context_window) * hop_size. + Args: + batch (list): list of tuple of the pair of audio and features. Audio shape (T, ), features shape(T', C). + + Returns: + Tensor: Input signal batch (B, 1, T). + Tensor: Target signal batch (B, 1, T). + Tensor: Auxiliary feature batch (B, C, T'), + where T = (T' - 2 * aux_context_window) * hop_size. 
""" # check length diff --git a/paddlespeech/t2s/exps/transformer_tts/preprocess.py b/paddlespeech/t2s/exps/transformer_tts/preprocess.py index 7cfa91b9..9aa87e91 100644 --- a/paddlespeech/t2s/exps/transformer_tts/preprocess.py +++ b/paddlespeech/t2s/exps/transformer_tts/preprocess.py @@ -31,15 +31,12 @@ from paddlespeech.t2s.frontend import English def get_lj_sentences(file_name, frontend): - ''' - read MFA duration.txt - Parameters - ---------- - file_name : str or Path - Returns - ---------- - Dict - sentence: {'utt': ([char], [int])} + '''read MFA duration.txt + + Args: + file_name (str or Path) + Returns: + Dict: sentence: {'utt': ([char], [int])} ''' f = open(file_name, 'r') sentence = {} @@ -59,14 +56,11 @@ def get_lj_sentences(file_name, frontend): def get_input_token(sentence, output_path): - ''' - get phone set from training data and save it - Parameters - ---------- - sentence : Dict - sentence: {'utt': ([char], str)} - output_path : str or path - path to save phone_id_map + '''get phone set from training data and save it + + Args: + sentence (Dict): sentence: {'utt': ([char], str)} + output_path (str or path): path to save phone_id_map ''' phn_token = set() for utt in sentence: diff --git a/paddlespeech/t2s/frontend/arpabet.py b/paddlespeech/t2s/frontend/arpabet.py index 094a2bfa..7a81b645 100644 --- a/paddlespeech/t2s/frontend/arpabet.py +++ b/paddlespeech/t2s/frontend/arpabet.py @@ -133,16 +133,11 @@ class ARPABET(Phonetics): def phoneticize(self, sentence, add_start_end=False): """ Normalize the input text sequence and convert it into pronunciation sequence. + Args: + sentence (str): The input text sequence. - Parameters - ----------- - sentence: str - The input text sequence. - - Returns - ---------- - List[str] - The list of pronunciation sequence. + Returns: + List[str]: The list of pronunciation sequence. """ phonemes = [ self._remove_vowels(item) for item in self.backend(sentence) @@ -156,16 +151,12 @@ class ARPABET(Phonetics): def numericalize(self, phonemes): """ Convert pronunciation sequence into pronunciation id sequence. - - Parameters - ----------- - phonemes: List[str] - The list of pronunciation sequence. + + Args: + phonemes (List[str]): The list of pronunciation sequence. - Returns - ---------- - List[int] - The list of pronunciation id sequence. + Returns: + List[int]: The list of pronunciation id sequence. """ ids = [self.vocab.lookup(item) for item in phonemes] return ids @@ -173,30 +164,23 @@ class ARPABET(Phonetics): def reverse(self, ids): """ Reverse the list of pronunciation id sequence to a list of pronunciation sequence. - Parameters - ----------- - ids: List[int] - The list of pronunciation id sequence. + Args: + ids( List[int]): The list of pronunciation id sequence. - Returns - ---------- - List[str] - The list of pronunciation sequence. + Returns: + List[str]: + The list of pronunciation sequence. """ return [self.vocab.reverse(i) for i in ids] def __call__(self, sentence, add_start_end=False): """ Convert the input text sequence into pronunciation id sequence. - Parameters - ----------- - sentence: str - The input text sequence. + Args: + sentence (str): The input text sequence. - Returns - ---------- - List[str] - The list of pronunciation id sequence. + Returns: + List[str]: The list of pronunciation id sequence. 
""" return self.numericalize( self.phoneticize(sentence, add_start_end=add_start_end)) @@ -229,15 +213,11 @@ class ARPABETWithStress(Phonetics): def phoneticize(self, sentence, add_start_end=False): """ Normalize the input text sequence and convert it into pronunciation sequence. - Parameters - ----------- - sentence: str - The input text sequence. + Args: + sentence (str): The input text sequence. - Returns - ---------- - List[str] - The list of pronunciation sequence. + Returns: + List[str]: The list of pronunciation sequence. """ phonemes = self.backend(sentence) if add_start_end: @@ -249,47 +229,33 @@ class ARPABETWithStress(Phonetics): def numericalize(self, phonemes): """ Convert pronunciation sequence into pronunciation id sequence. - - Parameters - ----------- - phonemes: List[str] - The list of pronunciation sequence. + + Args: + phonemes (List[str]): The list of pronunciation sequence. - Returns - ---------- - List[int] - The list of pronunciation id sequence. + Returns: + List[int]: The list of pronunciation id sequence. """ ids = [self.vocab.lookup(item) for item in phonemes] return ids def reverse(self, ids): """ Reverse the list of pronunciation id sequence to a list of pronunciation sequence. - - Parameters - ----------- - ids: List[int] - The list of pronunciation id sequence. + Args: + ids (List[int]): The list of pronunciation id sequence. - Returns - ---------- - List[str] - The list of pronunciation sequence. + Returns: + List[str]: The list of pronunciation sequence. """ return [self.vocab.reverse(i) for i in ids] def __call__(self, sentence, add_start_end=False): """ Convert the input text sequence into pronunciation id sequence. + Args: + sentence (str): The input text sequence. - Parameters - ----------- - sentence: str - The input text sequence. - - Returns - ---------- - List[str] - The list of pronunciation id sequence. + Returns: + List[str]: The list of pronunciation id sequence. """ return self.numericalize( self.phoneticize(sentence, add_start_end=add_start_end)) diff --git a/paddlespeech/t2s/frontend/phonectic.py b/paddlespeech/t2s/frontend/phonectic.py index a488a6fc..8e9f1173 100644 --- a/paddlespeech/t2s/frontend/phonectic.py +++ b/paddlespeech/t2s/frontend/phonectic.py @@ -65,14 +65,10 @@ class English(Phonetics): def phoneticize(self, sentence): """ Normalize the input text sequence and convert it into pronunciation sequence. - Parameters - ----------- - sentence: str - The input text sequence. - Returns - ---------- - List[str] - The list of pronunciation sequence. + Args: + sentence (str): The input text sequence. + Returns: + List[str]: The list of pronunciation sequence. """ start = self.vocab.start_symbol end = self.vocab.end_symbol @@ -123,14 +119,10 @@ class English(Phonetics): def numericalize(self, phonemes): """ Convert pronunciation sequence into pronunciation id sequence. - Parameters - ----------- - phonemes: List[str] - The list of pronunciation sequence. - Returns - ---------- - List[int] - The list of pronunciation id sequence. + Args: + phonemes (List[str]): The list of pronunciation sequence. + Returns: + List[int]: The list of pronunciation id sequence. """ ids = [ self.vocab.lookup(item) for item in phonemes @@ -140,27 +132,19 @@ class English(Phonetics): def reverse(self, ids): """ Reverse the list of pronunciation id sequence to a list of pronunciation sequence. - Parameters - ----------- - ids: List[int] - The list of pronunciation id sequence. - Returns - ---------- - List[str] - The list of pronunciation sequence. 
+ Args: + ids (List[int]): The list of pronunciation id sequence. + Returns: + List[str]: The list of pronunciation sequence. """ return [self.vocab.reverse(i) for i in ids] def __call__(self, sentence): """ Convert the input text sequence into pronunciation id sequence. - Parameters - ----------- - sentence: str - The input text sequence. - Returns - ---------- - List[str] - The list of pronunciation id sequence. + Args: + sentence(str): The input text sequence. + Returns: + List[str]: The list of pronunciation id sequence. """ return self.numericalize(self.phoneticize(sentence)) @@ -183,28 +167,21 @@ class EnglishCharacter(Phonetics): def phoneticize(self, sentence): """ Normalize the input text sequence. - Parameters - ----------- - sentence: str - The input text sequence. - Returns - ---------- - str - A text sequence after normalize. + Args: + sentence(str): The input text sequence. + Returns: + str: A text sequence after normalize. """ words = normalize(sentence) return words def numericalize(self, sentence): """ Convert a text sequence into ids. - Parameters - ----------- - sentence: str - The input text sequence. - Returns - ---------- - List[int] - List of a character id sequence. + Args: + sentence (str): The input text sequence. + Returns: + List[int]: + List of a character id sequence. """ ids = [ self.vocab.lookup(item) for item in sentence @@ -214,27 +191,19 @@ class EnglishCharacter(Phonetics): def reverse(self, ids): """ Convert a character id sequence into text. - Parameters - ----------- - ids: List[int] - List of a character id sequence. - Returns - ---------- - str - The input text sequence. + Args: + ids (List[int]): List of a character id sequence. + Returns: + str: The input text sequence. """ return [self.vocab.reverse(i) for i in ids] def __call__(self, sentence): """ Normalize the input text sequence and convert it into character id sequence. - Parameters - ----------- - sentence: str - The input text sequence. - Returns - ---------- - List[int] - List of a character id sequence. + Args: + sentence (str): The input text sequence. + Returns: + List[int]: List of a character id sequence. """ return self.numericalize(self.phoneticize(sentence)) @@ -264,14 +233,10 @@ class Chinese(Phonetics): def phoneticize(self, sentence): """ Normalize the input text sequence and convert it into pronunciation sequence. - Parameters - ----------- - sentence: str - The input text sequence. - Returns - ---------- - List[str] - The list of pronunciation sequence. + Args: + sentence(str): The input text sequence. + Returns: + List[str]: The list of pronunciation sequence. """ # simplified = self.opencc_backend.convert(sentence) simplified = sentence @@ -296,28 +261,20 @@ class Chinese(Phonetics): def numericalize(self, phonemes): """ Convert pronunciation sequence into pronunciation id sequence. - Parameters - ----------- - phonemes: List[str] - The list of pronunciation sequence. - Returns - ---------- - List[int] - The list of pronunciation id sequence. + Args: + phonemes(List[str]): The list of pronunciation sequence. + Returns: + List[int]: The list of pronunciation id sequence. """ ids = [self.vocab.lookup(item) for item in phonemes] return ids def __call__(self, sentence): """ Convert the input text sequence into pronunciation id sequence. - Parameters - ----------- - sentence: str - The input text sequence. - Returns - ---------- - List[str] - The list of pronunciation id sequence. + Args: + sentence (str): The input text sequence. 
+ Returns:
+ List[str]: The list of pronunciation id sequence.
 """
 return self.numericalize(self.phoneticize(sentence))
@@ -329,13 +286,9 @@ class Chinese(Phonetics):
 def reverse(self, ids):
 """ Reverse the list of pronunciation id sequence to a list of pronunciation sequence.
- Parameters
- -----------
- ids: List[int]
- The list of pronunciation id sequence.
- Returns
- ----------
- List[str]
- The list of pronunciation sequence.
+ Args:
+ ids (List[int]): The list of pronunciation id sequence.
+ Returns:
+ List[str]: The list of pronunciation sequence.
 """
 return [self.vocab.reverse(i) for i in ids]
diff --git a/paddlespeech/t2s/frontend/vocab.py b/paddlespeech/t2s/frontend/vocab.py
index 9ef6b137..76bb3c7b 100644
--- a/paddlespeech/t2s/frontend/vocab.py
+++ b/paddlespeech/t2s/frontend/vocab.py
@@ -20,22 +20,12 @@ __all__ = ["Vocab"]
 class Vocab(object):
 """ Vocabulary.
- Parameters
- -----------
- symbols: Iterable[str]
- Common symbols.
-
- padding_symbol: str, optional
- Symbol for pad. Defaults to "".
-
- unk_symbol: str, optional
- Symbol for unknow. Defaults to ""
-
- start_symbol: str, optional
- Symbol for start. Defaults to ""
-
- end_symbol: str, optional
- Symbol for end. Defaults to ""
+ Args:
+ symbols (Iterable[str]): Common symbols.
+ padding_symbol (str, optional): Symbol for pad. Defaults to "".
+ unk_symbol (str, optional): Symbol for unknown. Defaults to ""
+ start_symbol (str, optional): Symbol for start. Defaults to ""
+ end_symbol (str, optional): Symbol for end. Defaults to ""
 """
 def __init__(self,
diff --git a/paddlespeech/t2s/frontend/zh_normalization/chronology.py b/paddlespeech/t2s/frontend/zh_normalization/chronology.py
index 8801baa0..bfa7d2b1 100644
--- a/paddlespeech/t2s/frontend/zh_normalization/chronology.py
+++ b/paddlespeech/t2s/frontend/zh_normalization/chronology.py
@@ -44,12 +44,10 @@ RE_TIME_RANGE = re.compile(r'([0-1]?[0-9]|2[0-3])'
 def replace_time(match) -> str:
 """
- Parameters
- ----------
- match : re.Match
- Returns
- ----------
- str
+ Args:
+ match (re.Match)
+ Returns:
+ str
 """
 is_range = len(match.groups()) > 5
@@ -87,12 +85,10 @@ RE_DATE = re.compile(r'(\d{4}|\d{2})年'
 def replace_date(match) -> str:
 """
- Parameters
- ----------
- match : re.Match
- Returns
- ----------
- str
+ Args:
+ match (re.Match)
+ Returns:
+ str
 """
 year = match.group(1)
 month = match.group(3)
@@ -114,12 +110,10 @@ RE_DATE2 = re.compile(
 def replace_date2(match) -> str:
 """
- Parameters
- ----------
- match : re.Match
- Returns
- ----------
- str
+ Args:
+ match (re.Match)
+ Returns:
+ str
 """
 year = match.group(1)
 month = match.group(3)
diff --git a/paddlespeech/t2s/frontend/zh_normalization/num.py b/paddlespeech/t2s/frontend/zh_normalization/num.py
index 1e575c08..416edfb1 100644
--- a/paddlespeech/t2s/frontend/zh_normalization/num.py
+++ b/paddlespeech/t2s/frontend/zh_normalization/num.py
@@ -36,12 +36,10 @@ RE_FRAC = re.compile(r'(-?)(\d+)/(\d+)')
 def replace_frac(match) -> str:
 """
- Parameters
- ----------
- match : re.Match
- Returns
- ----------
- str
+ Args:
+ match (re.Match)
+ Returns:
+ str
 """
 sign = match.group(1)
 nominator = match.group(2)
@@ -59,12 +57,10 @@ RE_PERCENTAGE = re.compile(r'(-?)(\d+(\.\d+)?)%')
 def replace_percentage(match) -> str:
 """
- Parameters
- ----------
- match : re.Match
- Returns
- ----------
- str
+ Args:
+ match (re.Match)
+ Returns:
+ str
 """
 sign = match.group(1)
 percent = match.group(2)
@@ -81,12 +77,10 @@ RE_INTEGER = re.compile(r'(-)'
 r'(\d+)')
 def replace_negative_num(match) -> str:
 """
- Parameters
- ----------
- 
match : re.Match - Returns - ---------- - str + Args: + match (re.Match) + Returns: + str """ sign = match.group(1) number = match.group(2) @@ -103,12 +97,10 @@ RE_DEFAULT_NUM = re.compile(r'\d{3}\d*') def replace_default_num(match): """ - Parameters - ---------- - match : re.Match - Returns - ---------- - str + Args: + match (re.Match) + Returns: + str """ number = match.group(0) return verbalize_digit(number) @@ -124,12 +116,10 @@ RE_NUMBER = re.compile(r'(-?)((\d+)(\.\d+)?)' r'|(\.(\d+))') def replace_positive_quantifier(match) -> str: """ - Parameters - ---------- - match : re.Match - Returns - ---------- - str + Args: + match (re.Match) + Returns: + str """ number = match.group(1) match_2 = match.group(2) @@ -142,12 +132,10 @@ def replace_positive_quantifier(match) -> str: def replace_number(match) -> str: """ - Parameters - ---------- - match : re.Match - Returns - ---------- - str + Args: + match (re.Match) + Returns: + str """ sign = match.group(1) number = match.group(2) @@ -169,12 +157,10 @@ RE_RANGE = re.compile( def replace_range(match) -> str: """ - Parameters - ---------- - match : re.Match - Returns - ---------- - str + Args: + match (re.Match) + Returns: + str """ first, second = match.group(1), match.group(8) first = RE_NUMBER.sub(replace_number, first) diff --git a/paddlespeech/t2s/frontend/zh_normalization/phonecode.py b/paddlespeech/t2s/frontend/zh_normalization/phonecode.py index b7b69b41..06b5d41b 100644 --- a/paddlespeech/t2s/frontend/zh_normalization/phonecode.py +++ b/paddlespeech/t2s/frontend/zh_normalization/phonecode.py @@ -45,23 +45,19 @@ def phone2str(phone_string: str, mobile=True) -> str: def replace_phone(match) -> str: """ - Parameters - ---------- - match : re.Match - Returns - ---------- - str + Args: + match (re.Match) + Returns: + str """ return phone2str(match.group(0), mobile=False) def replace_mobile(match) -> str: """ - Parameters - ---------- - match : re.Match - Returns - ---------- - str + Args: + match (re.Match) + Returns: + str """ return phone2str(match.group(0)) diff --git a/paddlespeech/t2s/frontend/zh_normalization/quantifier.py b/paddlespeech/t2s/frontend/zh_normalization/quantifier.py index d3805a32..268d7229 100644 --- a/paddlespeech/t2s/frontend/zh_normalization/quantifier.py +++ b/paddlespeech/t2s/frontend/zh_normalization/quantifier.py @@ -22,12 +22,10 @@ RE_TEMPERATURE = re.compile(r'(-?)(\d+(\.\d+)?)(°C|℃|度|摄氏度)') def replace_temperature(match) -> str: """ - Parameters - ---------- - match : re.Match - Returns - ---------- - str + Args: + match (re.Match) + Returns: + str """ sign = match.group(1) temperature = match.group(2) diff --git a/paddlespeech/t2s/frontend/zh_normalization/text_normlization.py b/paddlespeech/t2s/frontend/zh_normalization/text_normlization.py index 9794a700..f9d1b8cb 100644 --- a/paddlespeech/t2s/frontend/zh_normalization/text_normlization.py +++ b/paddlespeech/t2s/frontend/zh_normalization/text_normlization.py @@ -55,14 +55,10 @@ class TextNormalizer(): def _split(self, text: str, lang="zh") -> List[str]: """Split long text into sentences with sentence-splitting punctuations. - Parameters - ---------- - text : str - The input text. - Returns - ------- - List[str] - Sentences. + Args: + text (str): The input text. + Returns: + List[str]: Sentences. 
""" # Only for pure Chinese here if lang == "zh": diff --git a/paddlespeech/t2s/models/fastspeech2/fastspeech2.py b/paddlespeech/t2s/models/fastspeech2/fastspeech2.py index 3e952c20..73f5498e 100644 --- a/paddlespeech/t2s/models/fastspeech2/fastspeech2.py +++ b/paddlespeech/t2s/models/fastspeech2/fastspeech2.py @@ -38,17 +38,21 @@ from paddlespeech.t2s.modules.transformer.encoder import TransformerEncoder class FastSpeech2(nn.Layer): """FastSpeech2 module. - + This is a module of FastSpeech2 described in `FastSpeech 2: Fast and High-Quality End-to-End Text to Speech`_. Instead of quantized pitch and energy, we use token-averaged value introduced in `FastPitch: Parallel Text-to-speech with Pitch Prediction`_. - + .. _`FastSpeech 2: Fast and High-Quality End-to-End Text to Speech`: https://arxiv.org/abs/2006.04558 .. _`FastPitch: Parallel Text-to-speech with Pitch Prediction`: https://arxiv.org/abs/2006.06873 + Args: + + Returns: + """ def __init__( @@ -127,136 +131,72 @@ class FastSpeech2(nn.Layer): init_enc_alpha: float=1.0, init_dec_alpha: float=1.0, ): """Initialize FastSpeech2 module. - Parameters - ---------- - idim : int - Dimension of the inputs. - odim : int - Dimension of the outputs. - adim : int - Attention dimension. - aheads : int - Number of attention heads. - elayers : int - Number of encoder layers. - eunits : int - Number of encoder hidden units. - dlayers : int - Number of decoder layers. - dunits : int - Number of decoder hidden units. - postnet_layers : int - Number of postnet layers. - postnet_chans : int - Number of postnet channels. - postnet_filts : int - Kernel size of postnet. - postnet_dropout_rate : float - Dropout rate in postnet. - use_scaled_pos_enc : bool - Whether to use trainable scaled pos encoding. - use_batch_norm : bool - Whether to use batch normalization in encoder prenet. - encoder_normalize_before : bool - Whether to apply layernorm layer before encoder block. - decoder_normalize_before : bool - Whether to apply layernorm layer before - decoder block. - encoder_concat_after : bool - Whether to concatenate attention layer's input and output in encoder. - decoder_concat_after : bool - Whether to concatenate attention layer's input and output in decoder. - reduction_factor : int - Reduction factor. - encoder_type : str - Encoder type ("transformer" or "conformer"). - decoder_type : str - Decoder type ("transformer" or "conformer"). - transformer_enc_dropout_rate : float - Dropout rate in encoder except attention and positional encoding. - transformer_enc_positional_dropout_rate (float): Dropout rate after encoder - positional encoding. - transformer_enc_attn_dropout_rate (float): Dropout rate in encoder - self-attention module. - transformer_dec_dropout_rate (float): Dropout rate in decoder except - attention & positional encoding. - transformer_dec_positional_dropout_rate (float): Dropout rate after decoder - positional encoding. - transformer_dec_attn_dropout_rate (float): Dropout rate in decoder - self-attention module. - conformer_pos_enc_layer_type : str - Pos encoding layer type in conformer. - conformer_self_attn_layer_type : str - Self-attention layer type in conformer - conformer_activation_type : str - Activation function type in conformer. - use_macaron_style_in_conformer : bool - Whether to use macaron style FFN. - use_cnn_in_conformer : bool - Whether to use CNN in conformer. - zero_triu : bool - Whether to use zero triu in relative self-attention module. - conformer_enc_kernel_size : int - Kernel size of encoder conformer. 
- conformer_dec_kernel_size : int - Kernel size of decoder conformer. - duration_predictor_layers : int - Number of duration predictor layers. - duration_predictor_chans : int - Number of duration predictor channels. - duration_predictor_kernel_size : int - Kernel size of duration predictor. - duration_predictor_dropout_rate : float - Dropout rate in duration predictor. - pitch_predictor_layers : int - Number of pitch predictor layers. - pitch_predictor_chans : int - Number of pitch predictor channels. - pitch_predictor_kernel_size : int - Kernel size of pitch predictor. - pitch_predictor_dropout_rate : float - Dropout rate in pitch predictor. - pitch_embed_kernel_size : float - Kernel size of pitch embedding. - pitch_embed_dropout_rate : float - Dropout rate for pitch embedding. - stop_gradient_from_pitch_predictor : bool - Whether to stop gradient from pitch predictor to encoder. - energy_predictor_layers : int - Number of energy predictor layers. - energy_predictor_chans : int - Number of energy predictor channels. - energy_predictor_kernel_size : int - Kernel size of energy predictor. - energy_predictor_dropout_rate : float - Dropout rate in energy predictor. - energy_embed_kernel_size : float - Kernel size of energy embedding. - energy_embed_dropout_rate : float - Dropout rate for energy embedding. - stop_gradient_from_energy_predictor : bool - Whether to stop gradient from energy predictor to encoder. - spk_num : Optional[int] - Number of speakers. If not None, assume that the spk_embed_dim is not None, - spk_ids will be provided as the input and use spk_embedding_table. - spk_embed_dim : Optional[int] - Speaker embedding dimension. If not None, - assume that spk_emb will be provided as the input or spk_num is not None. - spk_embed_integration_type : str - How to integrate speaker embedding. - tone_num : Optional[int] - Number of tones. If not None, assume that the - tone_ids will be provided as the input and use tone_embedding_table. - tone_embed_dim : Optional[int] - Tone embedding dimension. If not None, assume that tone_num is not None. - tone_embed_integration_type : str - How to integrate tone embedding. - init_type : str - How to initialize transformer parameters. - init_enc_alpha : float - Initial value of alpha in scaled pos encoding of the encoder. - init_dec_alpha : float - Initial value of alpha in scaled pos encoding of the decoder. + Args: + idim (int): Dimension of the inputs. + odim (int): Dimension of the outputs. + adim (int): Attention dimension. + aheads (int): Number of attention heads. + elayers (int): Number of encoder layers. + eunits (int): Number of encoder hidden units. + dlayers (int): Number of decoder layers. + dunits (int): Number of decoder hidden units. + postnet_layers (int): Number of postnet layers. + postnet_chans (int): Number of postnet channels. + postnet_filts (int): Kernel size of postnet. + postnet_dropout_rate (float): Dropout rate in postnet. + use_scaled_pos_enc (bool): Whether to use trainable scaled pos encoding. + use_batch_norm (bool): Whether to use batch normalization in encoder prenet. + encoder_normalize_before (bool): Whether to apply layernorm layer before encoder block. + decoder_normalize_before (bool): Whether to apply layernorm layer before decoder block. + encoder_concat_after (bool): Whether to concatenate attention layer's input and output in encoder. + decoder_concat_after (bool): Whether to concatenate attention layer's input and output in decoder. + reduction_factor (int): Reduction factor. 
+ encoder_type (str): Encoder type ("transformer" or "conformer"). + decoder_type (str): Decoder type ("transformer" or "conformer"). + transformer_enc_dropout_rate (float): Dropout rate in encoder except attention and positional encoding. + transformer_enc_positional_dropout_rate (float): Dropout rate after encoder positional encoding. + transformer_enc_attn_dropout_rate (float): Dropout rate in encoder self-attention module. + transformer_dec_dropout_rate (float): Dropout rate in decoder except attention & positional encoding. + transformer_dec_positional_dropout_rate (float): Dropout rate after decoder positional encoding. + transformer_dec_attn_dropout_rate (float): Dropout rate in decoder self-attention module. + conformer_pos_enc_layer_type (str): Pos encoding layer type in conformer. + conformer_self_attn_layer_type (str): Self-attention layer type in conformer + conformer_activation_type (str): Activation function type in conformer. + use_macaron_style_in_conformer (bool): Whether to use macaron style FFN. + use_cnn_in_conformer (bool): Whether to use CNN in conformer. + zero_triu (bool): Whether to use zero triu in relative self-attention module. + conformer_enc_kernel_size (int): Kernel size of encoder conformer. + conformer_dec_kernel_size (int): Kernel size of decoder conformer. + duration_predictor_layers (int): Number of duration predictor layers. + duration_predictor_chans (int): Number of duration predictor channels. + duration_predictor_kernel_size (int): Kernel size of duration predictor. + duration_predictor_dropout_rate (float): Dropout rate in duration predictor. + pitch_predictor_layers (int): Number of pitch predictor layers. + pitch_predictor_chans (int): Number of pitch predictor channels. + pitch_predictor_kernel_size (int): Kernel size of pitch predictor. + pitch_predictor_dropout_rate (float): Dropout rate in pitch predictor. + pitch_embed_kernel_size (float): Kernel size of pitch embedding. + pitch_embed_dropout_rate (float): Dropout rate for pitch embedding. + stop_gradient_from_pitch_predictor (bool): Whether to stop gradient from pitch predictor to encoder. + energy_predictor_layers (int): Number of energy predictor layers. + energy_predictor_chans (int): Number of energy predictor channels. + energy_predictor_kernel_size (int): Kernel size of energy predictor. + energy_predictor_dropout_rate (float): Dropout rate in energy predictor. + energy_embed_kernel_size (float): Kernel size of energy embedding. + energy_embed_dropout_rate (float): Dropout rate for energy embedding. + stop_gradient_from_energy_predictor(bool): Whether to stop gradient from energy predictor to encoder. + spk_num (Optional[int]): Number of speakers. If not None, assume that the spk_embed_dim is not None, + spk_ids will be provided as the input and use spk_embedding_table. + spk_embed_dim (Optional[int]): Speaker embedding dimension. If not None, + assume that spk_emb will be provided as the input or spk_num is not None. + spk_embed_integration_type (str): How to integrate speaker embedding. + tone_num (Optional[int]): Number of tones. If not None, assume that the + tone_ids will be provided as the input and use tone_embedding_table. + tone_embed_dim (Optional[int]): Tone embedding dimension. If not None, assume that tone_num is not None. + tone_embed_integration_type (str): How to integrate tone embedding. + init_type (str): How to initialize transformer parameters. + init_enc_alpha (float): Initial value of alpha in scaled pos encoding of the encoder. 
+            init_dec_alpha (float): Initial value of alpha in scaled pos encoding of the decoder.
         """
         assert check_argument_types()
@@ -489,45 +429,21 @@ class FastSpeech2(nn.Layer):
     ) -> Tuple[paddle.Tensor, Dict[str, paddle.Tensor], paddle.Tensor]:
         """Calculate forward propagation.

-        Parameters
-        ----------
-        text : Tensor(int64)
-            Batch of padded token ids (B, Tmax).
-        text_lengths : Tensor(int64)
-            Batch of lengths of each input (B,).
-        speech : Tensor
-            Batch of padded target features (B, Lmax, odim).
-        speech_lengths : Tensor(int64)
-            Batch of the lengths of each target (B,).
-        durations : Tensor(int64)
-            Batch of padded durations (B, Tmax).
-        pitch : Tensor
-            Batch of padded token-averaged pitch (B, Tmax, 1).
-        energy : Tensor
-            Batch of padded token-averaged energy (B, Tmax, 1).
-        tone_id : Tensor, optional(int64)
-            Batch of padded tone ids (B, Tmax).
-        spk_emb : Tensor, optional
-            Batch of speaker embeddings (B, spk_embed_dim).
-        spk_id : Tnesor, optional(int64)
-            Batch of speaker ids (B,)
-
-        Returns
-        ----------
-        Tensor
-            mel outs before postnet
-        Tensor
-            mel outs after postnet
-        Tensor
-            duration predictor's output
-        Tensor
-            pitch predictor's output
-        Tensor
-            energy predictor's output
-        Tensor
-            speech
-        Tensor
-            speech_lengths, modified if reduction_factor > 1
+        Args:
+            text(Tensor(int64)): Batch of padded token ids (B, Tmax).
+            text_lengths(Tensor(int64)): Batch of lengths of each input (B,).
+            speech(Tensor): Batch of padded target features (B, Lmax, odim).
+            speech_lengths(Tensor(int64)): Batch of the lengths of each target (B,).
+            durations(Tensor(int64)): Batch of padded durations (B, Tmax).
+            pitch(Tensor): Batch of padded token-averaged pitch (B, Tmax, 1).
+            energy(Tensor): Batch of padded token-averaged energy (B, Tmax, 1).
+            tone_id(Tensor, optional(int64)): Batch of padded tone ids (B, Tmax).
+            spk_emb(Tensor, optional): Batch of speaker embeddings (B, spk_embed_dim).
+            spk_id(Tensor, optional(int64)): Batch of speaker ids (B,).
+
+        Returns:
+            Tensor: mel outs before postnet.
+            Tensor: mel outs after postnet.
+            Tensor: duration predictor's output.
+            Tensor: pitch predictor's output.
+            Tensor: energy predictor's output.
+            Tensor: speech.
+            Tensor: speech_lengths, modified if reduction_factor > 1.
         """
         # input of embedding must be int64
@@ -680,34 +596,22 @@ class FastSpeech2(nn.Layer):
     ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]:
         """Generate the sequence of features given the sequences of characters.

-        Parameters
-        ----------
-        text : Tensor(int64)
-            Input sequence of characters (T,).
-        speech : Tensor, optional
-            Feature sequence to extract style (N, idim).
-        durations : Tensor, optional (int64)
-            Groundtruth of duration (T,).
-        pitch : Tensor, optional
-            Groundtruth of token-averaged pitch (T, 1).
-        energy : Tensor, optional
-            Groundtruth of token-averaged energy (T, 1).
-        alpha : float, optional
-            Alpha to control the speed.
-        use_teacher_forcing : bool, optional
-            Whether to use teacher forcing.
-            If true, groundtruth of duration, pitch and energy will be used.
-        spk_emb : Tensor, optional
-            peaker embedding vector (spk_embed_dim,).
-        spk_id : Tensor, optional(int64)
-            Batch of padded spk ids (1,).
-        tone_id : Tensor, optional(int64)
-            Batch of padded tone ids (T,).
-
-        Returns
-        ----------
-        Tensor
-            Output sequence of features (L, odim).
+        Args:
+            text(Tensor(int64)): Input sequence of characters (T,).
+            speech(Tensor, optional): Feature sequence to extract style (N, idim).
+            durations(Tensor, optional (int64)): Groundtruth of duration (T,).
+            pitch(Tensor, optional): Groundtruth of token-averaged pitch (T, 1).
+            energy(Tensor, optional): Groundtruth of token-averaged energy (T, 1).
+            alpha(float, optional): Alpha to control the speed.
+            use_teacher_forcing(bool, optional): Whether to use teacher forcing.
+                If true, groundtruth of duration, pitch and energy will be used.
+            spk_emb(Tensor, optional): Speaker embedding vector (spk_embed_dim,). (Default value = None)
+            spk_id(Tensor, optional(int64)): Batch of padded spk ids (1,). (Default value = None)
+            tone_id(Tensor, optional(int64)): Batch of padded tone ids (T,). (Default value = None)
+
+        Returns:
+            Tensor: Output sequence of features (L, odim).
+
         """
         # input of embedding must be int64
         x = paddle.cast(text, 'int64')
@@ -761,17 +665,13 @@ class FastSpeech2(nn.Layer):
     def _integrate_with_spk_embed(self, hs, spk_emb):
         """Integrate speaker embedding with hidden states.

-        Parameters
-        ----------
-        hs : Tensor
-            Batch of hidden state sequences (B, Tmax, adim).
-        spk_emb : Tensor
-            Batch of speaker embeddings (B, spk_embed_dim).
-
-        Returns
-        ----------
-        Tensor
-            Batch of integrated hidden state sequences (B, Tmax, adim)
+        Args:
+            hs(Tensor): Batch of hidden state sequences (B, Tmax, adim).
+            spk_emb(Tensor): Batch of speaker embeddings (B, spk_embed_dim).
+
+        Returns:
+            Tensor: Batch of integrated hidden state sequences (B, Tmax, adim).
+
         """
         if self.spk_embed_integration_type == "add":
             # apply projection and then add to hidden states
@@ -790,17 +690,13 @@ class FastSpeech2(nn.Layer):
     def _integrate_with_tone_embed(self, hs, tone_embs):
         """Integrate speaker embedding with hidden states.

-        Parameters
-        ----------
-        hs : Tensor
-            Batch of hidden state sequences (B, Tmax, adim).
-        tone_embs : Tensor
-            Batch of speaker embeddings (B, Tmax, tone_embed_dim).
-
-        Returns
-        ----------
-        Tensor
-            Batch of integrated hidden state sequences (B, Tmax, adim)
+        Args:
+            hs(Tensor): Batch of hidden state sequences (B, Tmax, adim).
+            tone_embs(Tensor): Batch of tone embeddings (B, Tmax, tone_embed_dim).
+
+        Returns:
+            Tensor: Batch of integrated hidden state sequences (B, Tmax, adim).
+
         """
         if self.tone_embed_integration_type == "add":
             # apply projection and then add to hidden states
@@ -819,24 +715,17 @@ class FastSpeech2(nn.Layer):
     def _source_mask(self, ilens: paddle.Tensor) -> paddle.Tensor:
         """Make masks for self-attention.

-        Parameters
-        ----------
-        ilens : Tensor
-            Batch of lengths (B,).
+        Args:
+            ilens(Tensor): Batch of lengths (B,).

-        Returns
-        -------
-        Tensor
-            Mask tensor for self-attention.
-            dtype=paddle.bool
-
-        Examples
-        -------
-        >>> ilens = [5, 3]
-        >>> self._source_mask(ilens)
-        tensor([[[1, 1, 1, 1, 1],
-                 [1, 1, 1, 0, 0]]]) bool
+        Returns:
+            Tensor: Mask tensor for self-attention. dtype=paddle.bool
+        Examples:
+            >>> ilens = [5, 3]
+            >>> self._source_mask(ilens)
+            tensor([[[1, 1, 1, 1, 1],
+                    [1, 1, 1, 0, 0]]]) bool
         """
         x_masks = make_non_pad_mask(ilens)
         return x_masks.unsqueeze(-2)
@@ -910,34 +799,26 @@ class StyleFastSpeech2Inference(FastSpeech2Inference):
             spk_emb=None,
             spk_id=None):
         """
-        Parameters
-        ----------
-        text : Tensor(int64)
-            Input sequence of characters (T,).
-        speech : Tensor, optional
-            Feature sequence to extract style (N, idim).
-        durations : paddle.Tensor/np.ndarray, optional (int64)
-            Groundtruth of duration (T,), this will overwrite the set of durations_scale and durations_bias
-        durations_scale: int/float, optional
-        durations_bias: int/float, optional
-        pitch : paddle.Tensor/np.ndarray, optional
-            Groundtruth of token-averaged pitch (T, 1), this will overwrite the set of pitch_scale and pitch_bias
-        pitch_scale: int/float, optional
-            In denormed HZ domain.
-        pitch_bias: int/float, optional
-            In denormed HZ domain.
-        energy : paddle.Tensor/np.ndarray, optional
-            Groundtruth of token-averaged energy (T, 1), this will overwrite the set of energy_scale and energy_bias
-        energy_scale: int/float, optional
-            In denormed domain.
-        energy_bias: int/float, optional
-            In denormed domain.
-        robot : bool, optional
-            Weather output robot style
-        Returns
-        ----------
-        Tensor
-            Output sequence of features (L, odim).
+
+        Args:
+            text(Tensor(int64)): Input sequence of characters (T,).
+            speech(Tensor, optional): Feature sequence to extract style (N, idim).
+            durations(paddle.Tensor/np.ndarray, optional (int64)): Groundtruth of duration (T,), this will overwrite the set of durations_scale and durations_bias
+            durations_scale(int/float, optional): Scale factor for durations.
+            durations_bias(int/float, optional): Bias added to durations.
+            pitch(paddle.Tensor/np.ndarray, optional): Groundtruth of token-averaged pitch (T, 1), this will overwrite the set of pitch_scale and pitch_bias
+            pitch_scale(int/float, optional): In denormed HZ domain.
+            pitch_bias(int/float, optional): In denormed HZ domain.
+            energy(paddle.Tensor/np.ndarray, optional): Groundtruth of token-averaged energy (T, 1), this will overwrite the set of energy_scale and energy_bias
+            energy_scale(int/float, optional): In denormed domain.
+            energy_bias(int/float, optional): In denormed domain.
+            robot(bool, optional): Whether to output robot style. (Default value = False)
+            spk_emb(Tensor, optional): Speaker embedding vector (spk_embed_dim,). (Default value = None)
+            spk_id(Tensor, optional(int64)): Batch of padded spk ids (1,). (Default value = None)
+
+        Returns:
+            Tensor: Output sequence of logmel features (L, odim).
+
         """
         normalized_mel, d_outs, p_outs, e_outs = self.acoustic_model.inference(
             text,
@@ -1011,13 +892,9 @@ class FastSpeech2Loss(nn.Layer):
    def __init__(self, use_masking: bool=True, use_weighted_masking: bool=False):
         """Initialize feed-forward Transformer loss module.
-
-        Parameters
-        ----------
-        use_masking : bool
-            Whether to apply masking for padded part in loss calculation.
-        use_weighted_masking : bool
-            Whether to weighted masking in loss calculation.
+        Args:
+            use_masking (bool): Whether to apply masking for padded part in loss calculation.
+            use_weighted_masking (bool): Whether to apply weighted masking in loss calculation.
         """
         assert check_argument_types()
         super().__init__()
@@ -1048,42 +925,22 @@ class FastSpeech2Loss(nn.Layer):
     ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor, paddle.Tensor]:
         """Calculate forward propagation.

-        Parameters
-        ----------
-        after_outs : Tensor
-            Batch of outputs after postnets (B, Lmax, odim).
-        before_outs : Tensor
-            Batch of outputs before postnets (B, Lmax, odim).
-        d_outs : Tensor
-            Batch of outputs of duration predictor (B, Tmax).
-        p_outs : Tensor
-            Batch of outputs of pitch predictor (B, Tmax, 1).
-        e_outs : Tensor
-            Batch of outputs of energy predictor (B, Tmax, 1).
-        ys : Tensor
-            Batch of target features (B, Lmax, odim).
-        ds : Tensor
-            Batch of durations (B, Tmax).
-        ps : Tensor
-            Batch of target token-averaged pitch (B, Tmax, 1).
-        es : Tensor
-            Batch of target token-averaged energy (B, Tmax, 1).
-        ilens : Tensor
-            Batch of the lengths of each input (B,).
-        olens : Tensor
-            Batch of the lengths of each target (B,).
-
-        Returns
-        ----------
-        Tensor
-            L1 loss value.
-        Tensor
-            Duration predictor loss value.
-        Tensor
-            Pitch predictor loss value.
-        Tensor
-            Energy predictor loss value.
-
+        Args:
+            after_outs(Tensor): Batch of outputs after postnets (B, Lmax, odim).
+            before_outs(Tensor): Batch of outputs before postnets (B, Lmax, odim).
+            d_outs(Tensor): Batch of outputs of duration predictor (B, Tmax).
+            p_outs(Tensor): Batch of outputs of pitch predictor (B, Tmax, 1).
+            e_outs(Tensor): Batch of outputs of energy predictor (B, Tmax, 1).
+            ys(Tensor): Batch of target features (B, Lmax, odim).
+            ds(Tensor): Batch of durations (B, Tmax).
+            ps(Tensor): Batch of target token-averaged pitch (B, Tmax, 1).
+            es(Tensor): Batch of target token-averaged energy (B, Tmax, 1).
+            ilens(Tensor): Batch of the lengths of each input (B,).
+            olens(Tensor): Batch of the lengths of each target (B,).
+
+        Returns:
+            Tensor: L1 loss value.
+            Tensor: Duration predictor loss value.
+            Tensor: Pitch predictor loss value.
+            Tensor: Energy predictor loss value.
+
         """
         # apply mask to remove padded part
         if self.use_masking:
diff --git a/paddlespeech/t2s/models/hifigan/hifigan.py b/paddlespeech/t2s/models/hifigan/hifigan.py
index 82dd66c1..116376ec 100644
--- a/paddlespeech/t2s/models/hifigan/hifigan.py
+++ b/paddlespeech/t2s/models/hifigan/hifigan.py
@@ -37,35 +37,21 @@ class HiFiGANGenerator(nn.Layer):
             use_weight_norm: bool=True,
             init_type: str="xavier_uniform", ):
         """Initialize HiFiGANGenerator module.
-        Parameters
-        ----------
-        in_channels : int
-            Number of input channels.
-        out_channels : int
-            Number of output channels.
-        channels : int
-            Number of hidden representation channels.
-        kernel_size : int
-            Kernel size of initial and final conv layer.
-        upsample_scales : list
-            List of upsampling scales.
-        upsample_kernel_sizes : list
-            List of kernel sizes for upsampling layers.
-        resblock_kernel_sizes : list
-            List of kernel sizes for residual blocks.
-        resblock_dilations : list
-            List of dilation list for residual blocks.
-        use_additional_convs : bool
-            Whether to use additional conv layers in residual blocks.
-        bias : bool
-            Whether to add bias parameter in convolution layers.
-        nonlinear_activation : str
-            Activation function module name.
-        nonlinear_activation_params : dict
-            Hyperparameters for activation function.
-        use_weight_norm : bool
-            Whether to use weight norm.
-            If set to true, it will be applied to all of the conv layers.
+        Args:
+            in_channels (int): Number of input channels.
+            out_channels (int): Number of output channels.
+            channels (int): Number of hidden representation channels.
+            kernel_size (int): Kernel size of initial and final conv layer.
+            upsample_scales (list): List of upsampling scales.
+            upsample_kernel_sizes (list): List of kernel sizes for upsampling layers.
+            resblock_kernel_sizes (list): List of kernel sizes for residual blocks.
+            resblock_dilations (list): List of dilation list for residual blocks.
+            use_additional_convs (bool): Whether to use additional conv layers in residual blocks.
+            bias (bool): Whether to add bias parameter in convolution layers.
+            nonlinear_activation (str): Activation function module name.
+            nonlinear_activation_params (dict): Hyperparameters for activation function.
+            use_weight_norm (bool): Whether to use weight norm.
+                If set to true, it will be applied to all of the conv layers.
         """
         super().__init__()
@@ -134,14 +120,11 @@ class HiFiGANGenerator(nn.Layer):
     def forward(self, c):
         """Calculate forward propagation.
-        Parameters
-        ----------
-        c : Tensor
-            Input tensor (B, in_channels, T).
-        Returns
-        ----------
-        Tensor
-            Output tensor (B, out_channels, T).
+
+        Args:
+            c (Tensor): Input tensor (B, in_channels, T).
+        Returns:
+            Tensor: Output tensor (B, out_channels, T).
         """
         c = self.input_conv(c)
         for i in range(self.num_upsamples):
@@ -196,15 +179,12 @@ class HiFiGANGenerator(nn.Layer):
     def inference(self, c):
         """Perform inference.
-        Parameters
-        ----------
-        c : Tensor
-            Input tensor (T, in_channels).
-            normalize_before (bool): Whether to perform normalization.
-        Returns
-        ----------
-        Tensor
-            Output tensor (T ** prod(upsample_scales), out_channels).
+        Args:
+            c (Tensor): Input tensor (T, in_channels).
+        Returns:
+            Tensor:
+                Output tensor (T ** prod(upsample_scales), out_channels).
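+
+        Examples:
+            A minimal usage sketch, not from the original code; the shapes
+            below assume a generator built with the defaults (in_channels=80,
+            out_channels=1), so adjust them to your own config:
+
+            >>> import paddle
+            >>> model = HiFiGANGenerator()
+            >>> c = paddle.randn([100, 80])  # hypothetical (T, in_channels) mel input
+            >>> wav = model.inference(c)  # (T * prod(upsample_scales), out_channels)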
""" c = self.forward(c.transpose([1, 0]).unsqueeze(0)) return c.squeeze(0).transpose([1, 0]) @@ -229,36 +209,23 @@ class HiFiGANPeriodDiscriminator(nn.Layer): use_spectral_norm: bool=False, init_type: str="xavier_uniform", ): """Initialize HiFiGANPeriodDiscriminator module. - Parameters - ---------- - in_channels : int - Number of input channels. - out_channels : int - Number of output channels. - period : int - Period. - kernel_sizes : list - Kernel sizes of initial conv layers and the final conv layer. - channels : int - Number of initial channels. - downsample_scales : list - List of downsampling scales. - max_downsample_channels : int - Number of maximum downsampling channels. - use_additional_convs : bool - Whether to use additional conv layers in residual blocks. - bias : bool - Whether to add bias parameter in convolution layers. - nonlinear_activation : str - Activation function module name. - nonlinear_activation_params : dict - Hyperparameters for activation function. - use_weight_norm : bool - Whether to use weight norm. - If set to true, it will be applied to all of the conv layers. - use_spectral_norm : bool - Whether to use spectral norm. - If set to true, it will be applied to all of the conv layers. + + Args: + in_channels (int): Number of input channels. + out_channels (int): Number of output channels. + period (int): Period. + kernel_sizes (list): Kernel sizes of initial conv layers and the final conv layer. + channels (int): Number of initial channels. + downsample_scales (list): List of downsampling scales. + max_downsample_channels (int): Number of maximum downsampling channels. + use_additional_convs (bool): Whether to use additional conv layers in residual blocks. + bias (bool): Whether to add bias parameter in convolution layers. + nonlinear_activation (str): Activation function module name. + nonlinear_activation_params (dict): Hyperparameters for activation function. + use_weight_norm (bool): Whether to use weight norm. + If set to true, it will be applied to all of the conv layers. + use_spectral_norm (bool): Whether to use spectral norm. + If set to true, it will be applied to all of the conv layers. """ super().__init__() @@ -307,14 +274,11 @@ class HiFiGANPeriodDiscriminator(nn.Layer): def forward(self, x): """Calculate forward propagation. - Parameters - ---------- - c : Tensor - Input tensor (B, in_channels, T). - Returns - ---------- - list - List of each layer's tensors. + + Args: + c (Tensor): Input tensor (B, in_channels, T). + Returns: + list: List of each layer's tensors. """ # transform 1d to 2d -> (B, C, T/P, P) b, c, t = paddle.shape(x) @@ -379,13 +343,11 @@ class HiFiGANMultiPeriodDiscriminator(nn.Layer): }, init_type: str="xavier_uniform", ): """Initialize HiFiGANMultiPeriodDiscriminator module. - Parameters - ---------- - periods : list - List of periods. - discriminator_params : dict - Parameters for hifi-gan period discriminator module. - The period parameter will be overwritten. + + Args: + periods (list): List of periods. + discriminator_params (dict): Parameters for hifi-gan period discriminator module. + The period parameter will be overwritten. """ super().__init__() # initialize parameters @@ -399,14 +361,11 @@ class HiFiGANMultiPeriodDiscriminator(nn.Layer): def forward(self, x): """Calculate forward propagation. - Parameters - ---------- - x : Tensor - Input noise signal (B, 1, T). - Returns - ---------- - List - List of list of each discriminator outputs, which consists of each layer output tensors. 
+ + Args: + x (Tensor): Input noise signal (B, 1, T). + Returns: + List: List of list of each discriminator outputs, which consists of each layer output tensors. """ outs = [] for f in self.discriminators: @@ -434,33 +393,22 @@ class HiFiGANScaleDiscriminator(nn.Layer): use_spectral_norm: bool=False, init_type: str="xavier_uniform", ): """Initilize HiFiGAN scale discriminator module. - Parameters - ---------- - in_channels : int - Number of input channels. - out_channels : int - Number of output channels. - kernel_sizes : list - List of four kernel sizes. The first will be used for the first conv layer, - and the second is for downsampling part, and the remaining two are for output layers. - channels : int - Initial number of channels for conv layer. - max_downsample_channels : int - Maximum number of channels for downsampling layers. - bias : bool - Whether to add bias parameter in convolution layers. - downsample_scales : list - List of downsampling scales. - nonlinear_activation : str - Activation function module name. - nonlinear_activation_params : dict - Hyperparameters for activation function. - use_weight_norm : bool - Whether to use weight norm. - If set to true, it will be applied to all of the conv layers. - use_spectral_norm : bool - Whether to use spectral norm. - If set to true, it will be applied to all of the conv layers. + + Args: + in_channels (int): Number of input channels. + out_channels (int): Number of output channels. + kernel_sizes (list): List of four kernel sizes. The first will be used for the first conv layer, + and the second is for downsampling part, and the remaining two are for output layers. + channels (int): Initial number of channels for conv layer. + max_downsample_channels (int): Maximum number of channels for downsampling layers. + bias (bool): Whether to add bias parameter in convolution layers. + downsample_scales (list): List of downsampling scales. + nonlinear_activation (str): Activation function module name. + nonlinear_activation_params (dict): Hyperparameters for activation function. + use_weight_norm (bool): Whether to use weight norm. + If set to true, it will be applied to all of the conv layers. + use_spectral_norm (bool): Whether to use spectral norm. + If set to true, it will be applied to all of the conv layers. """ super().__init__() @@ -546,14 +494,11 @@ class HiFiGANScaleDiscriminator(nn.Layer): def forward(self, x): """Calculate forward propagation. - Parameters - ---------- - x : Tensor - Input noise signal (B, 1, T). - Returns - ---------- - List - List of output tensors of each layer. + + Args: + x (Tensor): Input noise signal (B, 1, T). + Returns: + List: List of output tensors of each layer. """ outs = [] for f in self.layers: @@ -613,20 +558,14 @@ class HiFiGANMultiScaleDiscriminator(nn.Layer): follow_official_norm: bool=False, init_type: str="xavier_uniform", ): """Initilize HiFiGAN multi-scale discriminator module. - Parameters - ---------- - scales : int - Number of multi-scales. - downsample_pooling : str - Pooling module name for downsampling of the inputs. - downsample_pooling_params : dict - Parameters for the above pooling module. - discriminator_params : dict - Parameters for hifi-gan scale discriminator module. - follow_official_norm : bool - Whether to follow the norm setting of the official - implementaion. The first discriminator uses spectral norm and the other - discriminators use weight norm. + + Args: + scales (int): Number of multi-scales. 
+            downsample_pooling (str): Pooling module name for downsampling of the inputs.
+            downsample_pooling_params (dict): Parameters for the above pooling module.
+            discriminator_params (dict): Parameters for hifi-gan scale discriminator module.
+            follow_official_norm (bool): Whether to follow the norm setting of the official
+                implementation. The first discriminator uses spectral norm and the other discriminators use weight norm.
         """
         super().__init__()
@@ -651,14 +590,11 @@ class HiFiGANMultiScaleDiscriminator(nn.Layer):
     def forward(self, x):
         """Calculate forward propagation.
-        Parameters
-        ----------
-        x : Tensor
-            Input noise signal (B, 1, T).
-        Returns
-        ----------
-        List
-            List of list of each discriminator outputs, which consists of each layer output tensors.
+
+        Args:
+            x (Tensor): Input noise signal (B, 1, T).
+        Returns:
+            List: List of list of each discriminator outputs, which consists of each layer output tensors.
         """
         outs = []
         for f in self.discriminators:
@@ -715,24 +651,17 @@ class HiFiGANMultiScaleMultiPeriodDiscriminator(nn.Layer):
             },
             init_type: str="xavier_uniform", ):
         """Initilize HiFiGAN multi-scale + multi-period discriminator module.
-        Parameters
-        ----------
-        scales : int
-            Number of multi-scales.
-        scale_downsample_pooling : str
-            Pooling module name for downsampling of the inputs.
-        scale_downsample_pooling_params : dict
-            Parameters for the above pooling module.
-        scale_discriminator_params : dict
-            Parameters for hifi-gan scale discriminator module.
-        follow_official_norm : bool): Whether to follow the norm setting of the official
-            implementaion. The first discriminator uses spectral norm and the other
-            discriminators use weight norm.
-        periods : list
-            List of periods.
-        period_discriminator_params : dict
-            Parameters for hifi-gan period discriminator module.
-            The period parameter will be overwritten.
+
+        Args:
+            scales (int): Number of multi-scales.
+            scale_downsample_pooling (str): Pooling module name for downsampling of the inputs.
+            scale_downsample_pooling_params (dict): Parameters for the above pooling module.
+            scale_discriminator_params (dict): Parameters for hifi-gan scale discriminator module.
+            follow_official_norm (bool): Whether to follow the norm setting of the official implementation.
+                The first discriminator uses spectral norm and the other discriminators use weight norm.
+            periods (list): List of periods.
+            period_discriminator_params (dict): Parameters for hifi-gan period discriminator module.
+                The period parameter will be overwritten.
         """
         super().__init__()
@@ -751,16 +680,14 @@ class HiFiGANMultiScaleMultiPeriodDiscriminator(nn.Layer):
     def forward(self, x):
         """Calculate forward propagation.
-        Parameters
-        ----------
-        x : Tensor
-            Input noise signal (B, 1, T).
-        Returns
-        ----------
-        List:
-            List of list of each discriminator outputs,
-            which consists of each layer output tensors.
-            Multi scale and multi period ones are concatenated.
+
+        Args:
+            x (Tensor): Input noise signal (B, 1, T).
+        Returns:
+            List:
+                List of list of each discriminator outputs,
+                which consists of each layer output tensors.
+                Multi scale and multi period ones are concatenated.
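+
+        Examples:
+            A minimal usage sketch, not from the original code (the waveform
+            shape below is an assumption):
+
+            >>> import paddle
+            >>> model = HiFiGANMultiScaleMultiPeriodDiscriminator()
+            >>> x = paddle.randn([1, 1, 6000])  # hypothetical (B, 1, T) signal
+            >>> outs = model(x)  # one list of per-layer tensors per sub-discriminator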
""" msd_outs = self.msd(x) mpd_outs = self.mpd(x) diff --git a/paddlespeech/t2s/models/melgan/melgan.py b/paddlespeech/t2s/models/melgan/melgan.py index 3e90b691..6a139659 100644 --- a/paddlespeech/t2s/models/melgan/melgan.py +++ b/paddlespeech/t2s/models/melgan/melgan.py @@ -51,41 +51,26 @@ class MelGANGenerator(nn.Layer): use_causal_conv: bool=False, init_type: str="xavier_uniform", ): """Initialize MelGANGenerator module. - Parameters - ---------- - in_channels : int - Number of input channels. - out_channels : int - Number of output channels, - the number of sub-band is out_channels in multi-band melgan. - kernel_size : int - Kernel size of initial and final conv layer. - channels : int - Initial number of channels for conv layer. - bias : bool - Whether to add bias parameter in convolution layers. - upsample_scales : List[int] - List of upsampling scales. - stack_kernel_size : int - Kernel size of dilated conv layers in residual stack. - stacks : int - Number of stacks in a single residual stack. - nonlinear_activation : Optional[str], optional - Non linear activation in upsample network, by default None - nonlinear_activation_params : Dict[str, Any], optional - Parameters passed to the linear activation in the upsample network, - by default {} - pad : str - Padding function module name before dilated convolution layer. - pad_params : dict - Hyperparameters for padding function. - use_final_nonlinear_activation : nn.Layer - Activation function for the final layer. - use_weight_norm : bool - Whether to use weight norm. - If set to true, it will be applied to all of the conv layers. - use_causal_conv : bool - Whether to use causal convolution. + + Args: + in_channels (int): Number of input channels. + out_channels (int): Number of output channels, + the number of sub-band is out_channels in multi-band melgan. + kernel_size (int): Kernel size of initial and final conv layer. + channels (int): Initial number of channels for conv layer. + bias (bool): Whether to add bias parameter in convolution layers. + upsample_scales (List[int]): List of upsampling scales. + stack_kernel_size (int): Kernel size of dilated conv layers in residual stack. + stacks (int): Number of stacks in a single residual stack. + nonlinear_activation (Optional[str], optional): Non linear activation in upsample network, by default None + nonlinear_activation_params (Dict[str, Any], optional): Parameters passed to the linear activation in the upsample network, + by default {} + pad (str): Padding function module name before dilated convolution layer. + pad_params (dict): Hyperparameters for padding function. + use_final_nonlinear_activation (nn.Layer): Activation function for the final layer. + use_weight_norm (bool): Whether to use weight norm. + If set to true, it will be applied to all of the conv layers. + use_causal_conv (bool): Whether to use causal convolution. """ super().__init__() @@ -207,14 +192,11 @@ class MelGANGenerator(nn.Layer): def forward(self, c): """Calculate forward propagation. - Parameters - ---------- - c : Tensor - Input tensor (B, in_channels, T). - Returns - ---------- - Tensor - Output tensor (B, out_channels, T ** prod(upsample_scales)). + + Args: + c (Tensor): Input tensor (B, in_channels, T). + Returns: + Tensor: Output tensor (B, out_channels, T ** prod(upsample_scales)). """ out = self.melgan(c) return out @@ -260,14 +242,11 @@ class MelGANGenerator(nn.Layer): def inference(self, c): """Perform inference. 
- Parameters - ---------- - c : Union[Tensor, ndarray] - Input tensor (T, in_channels). - Returns - ---------- - Tensor - Output tensor (out_channels*T ** prod(upsample_scales), 1). + + Args: + c (Union[Tensor, ndarray]): Input tensor (T, in_channels). + Returns: + Tensor: Output tensor (out_channels*T ** prod(upsample_scales), 1). """ # pseudo batch c = c.transpose([1, 0]).unsqueeze(0) @@ -298,33 +277,22 @@ class MelGANDiscriminator(nn.Layer): pad_params: Dict[str, Any]={"mode": "reflect"}, init_type: str="xavier_uniform", ): """Initilize MelGAN discriminator module. - Parameters - ---------- - in_channels : int - Number of input channels. - out_channels : int - Number of output channels. - kernel_sizes : List[int] - List of two kernel sizes. The prod will be used for the first conv layer, - and the first and the second kernel sizes will be used for the last two layers. - For example if kernel_sizes = [5, 3], the first layer kernel size will be 5 * 3 = 15, - the last two layers' kernel size will be 5 and 3, respectively. - channels : int - Initial number of channels for conv layer. - max_downsample_channels : int - Maximum number of channels for downsampling layers. - bias : bool - Whether to add bias parameter in convolution layers. - downsample_scales : List[int] - List of downsampling scales. - nonlinear_activation : str - Activation function module name. - nonlinear_activation_params : dict - Hyperparameters for activation function. - pad : str - Padding function module name before dilated convolution layer. - pad_params : dict - Hyperparameters for padding function. + + Args: + in_channels (int): Number of input channels. + out_channels (int): Number of output channels. + kernel_sizes (List[int]): List of two kernel sizes. The prod will be used for the first conv layer, + and the first and the second kernel sizes will be used for the last two layers. + For example if kernel_sizes = [5, 3], the first layer kernel size will be 5 * 3 = 15, + the last two layers' kernel size will be 5 and 3, respectively. + channels (int): Initial number of channels for conv layer. + max_downsample_channels (int): Maximum number of channels for downsampling layers. + bias (bool): Whether to add bias parameter in convolution layers. + downsample_scales (List[int]): List of downsampling scales. + nonlinear_activation (str): Activation function module name. + nonlinear_activation_params (dict): Hyperparameters for activation function. + pad (str): Padding function module name before dilated convolution layer. + pad_params (dict): Hyperparameters for padding function. """ super().__init__() @@ -395,14 +363,10 @@ class MelGANDiscriminator(nn.Layer): def forward(self, x): """Calculate forward propagation. - Parameters - ---------- - x : Tensor - Input noise signal (B, 1, T). - Returns - ---------- - List - List of output tensors of each layer (for feat_match_loss). + Args: + x (Tensor): Input noise signal (B, 1, T). + Returns: + List: List of output tensors of each layer (for feat_match_loss). """ outs = [] for f in self.layers: @@ -440,39 +404,24 @@ class MelGANMultiScaleDiscriminator(nn.Layer): use_weight_norm: bool=True, init_type: str="xavier_uniform", ): """Initilize MelGAN multi-scale discriminator module. - Parameters - ---------- - in_channels : int - Number of input channels. - out_channels : int - Number of output channels. - scales : int - Number of multi-scales. - downsample_pooling : str - Pooling module name for downsampling of the inputs. 
- downsample_pooling_params : dict - Parameters for the above pooling module. - kernel_sizes : List[int] - List of two kernel sizes. The sum will be used for the first conv layer, - and the first and the second kernel sizes will be used for the last two layers. - channels : int - Initial number of channels for conv layer. - max_downsample_channels : int - Maximum number of channels for downsampling layers. - bias : bool - Whether to add bias parameter in convolution layers. - downsample_scales : List[int] - List of downsampling scales. - nonlinear_activation : str - Activation function module name. - nonlinear_activation_params : dict - Hyperparameters for activation function. - pad : str - Padding function module name before dilated convolution layer. - pad_params : dict - Hyperparameters for padding function. - use_causal_conv : bool - Whether to use causal convolution. + + Args: + in_channels (int): Number of input channels. + out_channels (int): Number of output channels. + scales (int): Number of multi-scales. + downsample_pooling (str): Pooling module name for downsampling of the inputs. + downsample_pooling_params (dict): Parameters for the above pooling module. + kernel_sizes (List[int]): List of two kernel sizes. The sum will be used for the first conv layer, + and the first and the second kernel sizes will be used for the last two layers. + channels (int): Initial number of channels for conv layer. + max_downsample_channels (int): Maximum number of channels for downsampling layers. + bias (bool): Whether to add bias parameter in convolution layers. + downsample_scales (List[int]): List of downsampling scales. + nonlinear_activation (str): Activation function module name. + nonlinear_activation_params (dict): Hyperparameters for activation function. + pad (str): Padding function module name before dilated convolution layer. + pad_params (dict): Hyperparameters for padding function. + use_causal_conv (bool): Whether to use causal convolution. """ super().__init__() @@ -514,14 +463,10 @@ class MelGANMultiScaleDiscriminator(nn.Layer): def forward(self, x): """Calculate forward propagation. - Parameters - ---------- - x : Tensor - Input noise signal (B, 1, T). - Returns - ---------- - List - List of list of each discriminator outputs, which consists of each layer output tensors. + Args: + x (Tensor): Input noise signal (B, 1, T). + Returns: + List: List of list of each discriminator outputs, which consists of each layer output tensors. """ outs = [] for f in self.discriminators: diff --git a/paddlespeech/t2s/models/melgan/style_melgan.py b/paddlespeech/t2s/models/melgan/style_melgan.py index bd451e1f..40a2f100 100644 --- a/paddlespeech/t2s/models/melgan/style_melgan.py +++ b/paddlespeech/t2s/models/melgan/style_melgan.py @@ -52,37 +52,23 @@ class StyleMelGANGenerator(nn.Layer): use_weight_norm: bool=True, init_type: str="xavier_uniform", ): """Initilize Style MelGAN generator. - Parameters - ---------- - in_channels : int - Number of input noise channels. - aux_channels : int - Number of auxiliary input channels. - channels : int - Number of channels for conv layer. - out_channels : int - Number of output channels. - kernel_size : int - Kernel size of conv layers. - dilation : int - Dilation factor for conv layers. - bias : bool - Whether to add bias parameter in convolution layers. - noise_upsample_scales : list - List of noise upsampling scales. - noise_upsample_activation : str - Activation function module name for noise upsampling. 
-        noise_upsample_activation_params : dict
-            Hyperparameters for the above activation function.
-        upsample_scales : list
-            List of upsampling scales.
-        upsample_mode : str
-            Upsampling mode in TADE layer.
-        gated_function : str
-            Gated function in TADEResBlock ("softmax" or "sigmoid").
-        use_weight_norm : bool
-            Whether to use weight norm.
-            If set to true, it will be applied to all of the conv layers.
+
+        Args:
+            in_channels (int): Number of input noise channels.
+            aux_channels (int): Number of auxiliary input channels.
+            channels (int): Number of channels for conv layer.
+            out_channels (int): Number of output channels.
+            kernel_size (int): Kernel size of conv layers.
+            dilation (int): Dilation factor for conv layers.
+            bias (bool): Whether to add bias parameter in convolution layers.
+            noise_upsample_scales (list): List of noise upsampling scales.
+            noise_upsample_activation (str): Activation function module name for noise upsampling.
+            noise_upsample_activation_params (dict): Hyperparameters for the above activation function.
+            upsample_scales (list): List of upsampling scales.
+            upsample_mode (str): Upsampling mode in TADE layer.
+            gated_function (str): Gated function in TADEResBlock ("softmax" or "sigmoid").
+            use_weight_norm (bool): Whether to use weight norm.
+                If set to true, it will be applied to all of the conv layers.
         """
         super().__init__()
@@ -147,16 +133,12 @@ class StyleMelGANGenerator(nn.Layer):
     def forward(self, c, z=None):
         """Calculate forward propagation.
-        Parameters
-        ----------
-        c : Tensor
-            Auxiliary input tensor (B, channels, T).
-        z : Tensor
-            Input noise tensor (B, in_channels, 1).
-        Returns
-        ----------
-        Tensor
-            Output tensor (B, out_channels, T ** prod(upsample_scales)).
+
+        Args:
+            c (Tensor): Auxiliary input tensor (B, channels, T).
+            z (Tensor): Input noise tensor (B, in_channels, 1).
+        Returns:
+            Tensor: Output tensor (B, out_channels, T ** prod(upsample_scales)).
         """
         # batch_max_steps(24000) == noise_upsample_factor(80) * upsample_factor(300)
         if z is None:
@@ -211,14 +193,10 @@ class StyleMelGANGenerator(nn.Layer):
     def inference(self, c):
         """Perform inference.
-        Parameters
-        ----------
-        c : Tensor
-            Input tensor (T, in_channels).
-        Returns
-        ----------
-        Tensor
-            Output tensor (T ** prod(upsample_scales), out_channels).
+        Args:
+            c (Tensor): Input tensor (T, in_channels).
+        Returns:
+            Tensor: Output tensor (T ** prod(upsample_scales), out_channels).
         """
         # (1, in_channels, T)
         c = c.transpose([1, 0]).unsqueeze(0)
@@ -278,18 +256,13 @@ class StyleMelGANDiscriminator(nn.Layer):
             use_weight_norm: bool=True,
             init_type: str="xavier_uniform", ):
         """Initilize Style MelGAN discriminator.
-        Parameters
-        ----------
-        repeats : int
-            Number of repititons to apply RWD.
-        window_sizes : list
-            List of random window sizes.
-        pqmf_params : list
-            List of list of Parameters for PQMF modules
-        discriminator_params : dict
-            Parameters for base discriminator module.
-        use_weight_nom : bool
-            Whether to apply weight normalization.
+
+        Args:
+            repeats (int): Number of repetitions to apply RWD.
+            window_sizes (list): List of random window sizes.
+            pqmf_params (list): List of list of parameters for PQMF modules.
+            discriminator_params (dict): Parameters for base discriminator module.
+            use_weight_norm (bool): Whether to apply weight normalization.
         """
         super().__init__()
@@ -325,15 +298,11 @@ class StyleMelGANDiscriminator(nn.Layer):
     def forward(self, x):
         """Calculate forward propagation.
-        Parameters
-        ----------
-        x : Tensor
-            Input tensor (B, 1, T).
- Returns - ---------- - List - List of discriminator outputs, #items in the list will be - equal to repeats * #discriminators. + Args: + x (Tensor): Input tensor (B, 1, T). + Returns: + List: List of discriminator outputs, #items in the list will be + equal to repeats * #discriminators. """ outs = [] for _ in range(self.repeats): diff --git a/paddlespeech/t2s/models/parallel_wavegan/parallel_wavegan.py b/paddlespeech/t2s/models/parallel_wavegan/parallel_wavegan.py index 9eff4497..cc8460e4 100644 --- a/paddlespeech/t2s/models/parallel_wavegan/parallel_wavegan.py +++ b/paddlespeech/t2s/models/parallel_wavegan/parallel_wavegan.py @@ -31,51 +31,30 @@ from paddlespeech.t2s.modules.upsample import ConvInUpsampleNet class PWGGenerator(nn.Layer): """Wave Generator for Parallel WaveGAN - Parameters - ---------- - in_channels : int, optional - Number of channels of the input waveform, by default 1 - out_channels : int, optional - Number of channels of the output waveform, by default 1 - kernel_size : int, optional - Kernel size of the residual blocks inside, by default 3 - layers : int, optional - Number of residual blocks inside, by default 30 - stacks : int, optional - The number of groups to split the residual blocks into, by default 3 - Within each group, the dilation of the residual block grows - exponentially. - residual_channels : int, optional - Residual channel of the residual blocks, by default 64 - gate_channels : int, optional - Gate channel of the residual blocks, by default 128 - skip_channels : int, optional - Skip channel of the residual blocks, by default 64 - aux_channels : int, optional - Auxiliary channel of the residual blocks, by default 80 - aux_context_window : int, optional - The context window size of the first convolution applied to the - auxiliary input, by default 2 - dropout : float, optional - Dropout of the residual blocks, by default 0. - bias : bool, optional - Whether to use bias in residual blocks, by default True - use_weight_norm : bool, optional - Whether to use weight norm in all convolutions, by default True - use_causal_conv : bool, optional - Whether to use causal padding in the upsample network and residual - blocks, by default False - upsample_scales : List[int], optional - Upsample scales of the upsample network, by default [4, 4, 4, 4] - nonlinear_activation : Optional[str], optional - Non linear activation in upsample network, by default None - nonlinear_activation_params : Dict[str, Any], optional - Parameters passed to the linear activation in the upsample network, - by default {} - interpolate_mode : str, optional - Interpolation mode of the upsample network, by default "nearest" - freq_axis_kernel_size : int, optional - Kernel size along the frequency axis of the upsample network, by default 1 + Args: + in_channels (int, optional): Number of channels of the input waveform, by default 1 + out_channels (int, optional): Number of channels of the output waveform, by default 1 + kernel_size (int, optional): Kernel size of the residual blocks inside, by default 3 + layers (int, optional): Number of residual blocks inside, by default 30 + stacks (int, optional): The number of groups to split the residual blocks into, by default 3 + Within each group, the dilation of the residual block grows exponentially. 
+        residual_channels (int, optional): Residual channel of the residual blocks, by default 64
+        gate_channels (int, optional): Gate channel of the residual blocks, by default 128
+        skip_channels (int, optional): Skip channel of the residual blocks, by default 64
+        aux_channels (int, optional): Auxiliary channel of the residual blocks, by default 80
+        aux_context_window (int, optional): The context window size of the first convolution applied to the
+            auxiliary input, by default 2
+        dropout (float, optional): Dropout of the residual blocks, by default 0.
+        bias (bool, optional): Whether to use bias in residual blocks, by default True
+        use_weight_norm (bool, optional): Whether to use weight norm in all convolutions, by default True
+        use_causal_conv (bool, optional): Whether to use causal padding in the upsample network and residual
+            blocks, by default False
+        upsample_scales (List[int], optional): Upsample scales of the upsample network, by default [4, 4, 4, 4]
+        nonlinear_activation (Optional[str], optional): Non linear activation in upsample network, by default None
+        nonlinear_activation_params (Dict[str, Any], optional): Parameters passed to the linear activation in the upsample network,
+            by default {}
+        interpolate_mode (str, optional): Interpolation mode of the upsample network, by default "nearest"
+        freq_axis_kernel_size (int, optional): Kernel size along the frequency axis of the upsample network, by default 1
     """

     def __init__(
@@ -167,18 +146,13 @@ class PWGGenerator(nn.Layer):
     def forward(self, x, c):
         """Generate waveform.
-        Parameters
-        ----------
-        x : Tensor
-            Shape (N, C_in, T), The input waveform.
-        c : Tensor
-            Shape (N, C_aux, T'). The auxiliary input (e.g. spectrogram). It
+        Args:
+            x(Tensor): Shape (N, C_in, T), the input waveform.
+            c(Tensor): Shape (N, C_aux, T'). The auxiliary input (e.g. spectrogram). It
             is upsampled to match the time resolution of the input.
-        Returns
-        -------
-        Tensor
-            Shape (N, C_out, T), the generated waveform.
+        Returns:
+            Tensor: Shape (N, C_out, T), the generated waveform.
         """
         c = self.upsample_net(c)
         assert c.shape[-1] == x.shape[-1]
@@ -218,19 +192,14 @@ class PWGGenerator(nn.Layer):
         self.apply(_remove_weight_norm)

     def inference(self, c=None):
-        """Waveform generation. This function is used for single instance
-        inference.
-        Parameters
-        ----------
-        c : Tensor, optional
-            Shape (T', C_aux), the auxiliary input, by default None
-        x : Tensor, optional
-            Shape (T, C_in), the noise waveform, by default None
-            If not provided, a sample is drawn from a gaussian distribution.
-        Returns
-        -------
-        Tensor
-            Shape (T, C_out), the generated waveform
+        """Waveform generation. This function is used for single instance inference.
+
+        Args:
+            c(Tensor, optional): Shape (T', C_aux), the auxiliary input, by default None.
+                The noise input is drawn internally from a gaussian distribution.
+
+        Returns:
+            Tensor: Shape (T, C_out), the generated waveform.
         """
         # when to static, can not input x, see https://github.com/PaddlePaddle/Parakeet/pull/132/files
         x = paddle.randn(
@@ -244,32 +213,21 @@ class PWGDiscriminator(nn.Layer):

 class PWGDiscriminator(nn.Layer):
     """A convolutional discriminator for audio.
- Parameters - ---------- - in_channels : int, optional - Number of channels of the input audio, by default 1 - out_channels : int, optional - Output feature size, by default 1 - kernel_size : int, optional - Kernel size of convolutional sublayers, by default 3 - layers : int, optional - Number of layers, by default 10 - conv_channels : int, optional - Feature size of the convolutional sublayers, by default 64 - dilation_factor : int, optional - The factor with which dilation of each convolutional sublayers grows - exponentially if it is greater than 1, else the dilation of each - convolutional sublayers grows linearly, by default 1 - nonlinear_activation : str, optional - The activation after each convolutional sublayer, by default "leakyrelu" - nonlinear_activation_params : Dict[str, Any], optional - The parameters passed to the activation's initializer, by default - {"negative_slope": 0.2} - bias : bool, optional - Whether to use bias in convolutional sublayers, by default True - use_weight_norm : bool, optional - Whether to use weight normalization at all convolutional sublayers, - by default True + Args: + in_channels (int, optional): Number of channels of the input audio, by default 1 + out_channels (int, optional): Output feature size, by default 1 + kernel_size (int, optional): Kernel size of convolutional sublayers, by default 3 + layers (int, optional): Number of layers, by default 10 + conv_channels (int, optional): Feature size of the convolutional sublayers, by default 64 + dilation_factor (int, optional): The factor with which dilation of each convolutional sublayers grows + exponentially if it is greater than 1, else the dilation of each convolutional sublayers grows linearly, + by default 1 + nonlinear_activation (str, optional): The activation after each convolutional sublayer, by default "leakyrelu" + nonlinear_activation_params (Dict[str, Any], optional): The parameters passed to the activation's initializer, by default + {"negative_slope": 0.2} + bias (bool, optional): Whether to use bias in convolutional sublayers, by default True + use_weight_norm (bool, optional): Whether to use weight normalization at all convolutional sublayers, + by default True """ def __init__( @@ -330,15 +288,12 @@ class PWGDiscriminator(nn.Layer): def forward(self, x): """ - Parameters - ---------- - x : Tensor - Shape (N, in_channels, num_samples), the input audio. - - Returns - ------- - Tensor - Shape (N, out_channels, num_samples), the predicted logits. + + Args: + x (Tensor): Shape (N, in_channels, num_samples), the input audio. + + Returns: + Tensor: Shape (N, out_channels, num_samples), the predicted logits. """ return self.conv_layers(x) @@ -362,39 +317,25 @@ class PWGDiscriminator(nn.Layer): class ResidualPWGDiscriminator(nn.Layer): """A wavenet-style discriminator for audio. 
-    Parameters
-    ----------
-    in_channels : int, optional
-        Number of channels of the input audio, by default 1
-    out_channels : int, optional
-        Output feature size, by default 1
-    kernel_size : int, optional
-        Kernel size of residual blocks, by default 3
-    layers : int, optional
-        Number of residual blocks, by default 30
-    stacks : int, optional
-        Number of groups of residual blocks, within which the dilation
-        of each residual blocks grows exponentially, by default 3
-    residual_channels : int, optional
-        Residual channels of residual blocks, by default 64
-    gate_channels : int, optional
-        Gate channels of residual blocks, by default 128
-    skip_channels : int, optional
-        Skip channels of residual blocks, by default 64
-    dropout : float, optional
-        Dropout probability of residual blocks, by default 0.
-    bias : bool, optional
-        Whether to use bias in residual blocks, by default True
-    use_weight_norm : bool, optional
-        Whether to use weight normalization in all convolutional layers,
-        by default True
-    use_causal_conv : bool, optional
-        Whether to use causal convolution in residual blocks, by default False
-    nonlinear_activation : str, optional
-        Activation after convolutions other than those in residual blocks,
-        by default "leakyrelu"
-    nonlinear_activation_params : Dict[str, Any], optional
-        Parameters to pass to the activation, by default {"negative_slope": 0.2}
+    Args:
+        in_channels (int, optional): Number of channels of the input audio, by default 1
+        out_channels (int, optional): Output feature size, by default 1
+        kernel_size (int, optional): Kernel size of residual blocks, by default 3
+        layers (int, optional): Number of residual blocks, by default 30
+        stacks (int, optional): Number of groups of residual blocks, within which the dilation
+            of each residual blocks grows exponentially, by default 3
+        residual_channels (int, optional): Residual channels of residual blocks, by default 64
+        gate_channels (int, optional): Gate channels of residual blocks, by default 128
+        skip_channels (int, optional): Skip channels of residual blocks, by default 64
+        dropout (float, optional): Dropout probability of residual blocks, by default 0.
+        bias (bool, optional): Whether to use bias in residual blocks, by default True
+        use_weight_norm (bool, optional): Whether to use weight normalization in all convolutional layers,
+            by default True
+        use_causal_conv (bool, optional): Whether to use causal convolution in residual blocks, by default False
+        nonlinear_activation (str, optional): Activation after convolutions other than those in residual blocks,
+            by default "leakyrelu"
+        nonlinear_activation_params (Dict[str, Any], optional): Parameters to pass to the activation,
+            by default {"negative_slope": 0.2}
     """

     def __init__(
@@ -463,15 +404,11 @@ class ResidualPWGDiscriminator(nn.Layer):
     def forward(self, x):
         """
-        Parameters
-        ----------
-        x : Tensor
-            Shape (N, in_channels, num_samples), the input audio.
-
-        Returns
-        -------
-        Tensor
-            Shape (N, out_channels, num_samples), the predicted logits.
+        Args:
+            x(Tensor): Shape (N, in_channels, num_samples), the input audio.
+
+        Returns:
+            Tensor: Shape (N, out_channels, num_samples), the predicted logits.
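+
+        Examples:
+            A minimal usage sketch, not from the original code (the input
+            shape is an assumption): with the defaults, in_channels and
+            out_channels are both 1, so the logits keep the input length:
+
+            >>> import paddle
+            >>> model = ResidualPWGDiscriminator()
+            >>> x = paddle.randn([1, 1, 16000])  # hypothetical (N, 1, num_samples) audio
+            >>> logits = model(x)  # (1, 1, 16000)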
""" x = self.first_conv(x) skip = 0 diff --git a/paddlespeech/t2s/models/tacotron2/tacotron2.py b/paddlespeech/t2s/models/tacotron2/tacotron2.py index da71077f..abb691b4 100644 --- a/paddlespeech/t2s/models/tacotron2/tacotron2.py +++ b/paddlespeech/t2s/models/tacotron2/tacotron2.py @@ -81,69 +81,39 @@ class Tacotron2(nn.Layer): # training related init_type: str="xavier_uniform", ): """Initialize Tacotron2 module. - Parameters - ---------- - idim : int - Dimension of the inputs. - odim : int - Dimension of the outputs. - embed_dim : int - Dimension of the token embedding. - elayers : int - Number of encoder blstm layers. - eunits : int - Number of encoder blstm units. - econv_layers : int - Number of encoder conv layers. - econv_filts : int - Number of encoder conv filter size. - econv_chans : int - Number of encoder conv filter channels. - dlayers : int - Number of decoder lstm layers. - dunits : int - Number of decoder lstm units. - prenet_layers : int - Number of prenet layers. - prenet_units : int - Number of prenet units. - postnet_layers : int - Number of postnet layers. - postnet_filts : int - Number of postnet filter size. - postnet_chans : int - Number of postnet filter channels. - output_activation : str - Name of activation function for outputs. - adim : int - Number of dimension of mlp in attention. - aconv_chans : int - Number of attention conv filter channels. - aconv_filts : int - Number of attention conv filter size. - cumulate_att_w : bool - Whether to cumulate previous attention weight. - use_batch_norm : bool - Whether to use batch normalization. - use_concate : bool - Whether to concat enc outputs w/ dec lstm outputs. - reduction_factor : int - Reduction factor. - spk_num : Optional[int] - Number of speakers. If set to > 1, assume that the - sids will be provided as the input and use sid embedding layer. - lang_num : Optional[int] - Number of languages. If set to > 1, assume that the - lids will be provided as the input and use sid embedding layer. - spk_embed_dim : Optional[int] - Speaker embedding dimension. If set to > 0, - assume that spk_emb will be provided as the input. - spk_embed_integration_type : str - How to integrate speaker embedding. - dropout_rate : float - Dropout rate. - zoneout_rate : float - Zoneout rate. + Args: + idim (int): Dimension of the inputs. + odim (int): Dimension of the outputs. + embed_dim (int): Dimension of the token embedding. + elayers (int): Number of encoder blstm layers. + eunits (int): Number of encoder blstm units. + econv_layers (int): Number of encoder conv layers. + econv_filts (int): Number of encoder conv filter size. + econv_chans (int): Number of encoder conv filter channels. + dlayers (int): Number of decoder lstm layers. + dunits (int): Number of decoder lstm units. + prenet_layers (int): Number of prenet layers. + prenet_units (int): Number of prenet units. + postnet_layers (int): Number of postnet layers. + postnet_filts (int): Number of postnet filter size. + postnet_chans (int): Number of postnet filter channels. + output_activation (str): Name of activation function for outputs. + adim (int): Number of dimension of mlp in attention. + aconv_chans (int): Number of attention conv filter channels. + aconv_filts (int): Number of attention conv filter size. + cumulate_att_w (bool): Whether to cumulate previous attention weight. + use_batch_norm (bool): Whether to use batch normalization. + use_concate (bool): Whether to concat enc outputs w/ dec lstm outputs. + reduction_factor (int): Reduction factor. 
+            spk_num (Optional[int]): Number of speakers. If set to > 1, assume that the
+                sids will be provided as the input and use sid embedding layer.
+            lang_num (Optional[int]): Number of languages. If set to > 1, assume that the
+                lids will be provided as the input and use lid embedding layer.
+            spk_embed_dim (Optional[int]): Speaker embedding dimension. If set to > 0,
+                assume that spk_emb will be provided as the input.
+            spk_embed_integration_type (str): How to integrate speaker embedding.
+            dropout_rate (float): Dropout rate.
+            zoneout_rate (float): Zoneout rate.
         """
         assert check_argument_types()
         super().__init__()
@@ -258,31 +228,19 @@ class Tacotron2(nn.Layer):
     ) -> Tuple[paddle.Tensor, Dict[str, paddle.Tensor], paddle.Tensor]:
         """Calculate forward propagation.

-        Parameters
-        ----------
-        text : Tensor(int64)
-            Batch of padded character ids (B, T_text).
-        text_lengths : Tensor(int64)
-            Batch of lengths of each input batch (B,).
-        speech : Tensor
-            Batch of padded target features (B, T_feats, odim).
-        speech_lengths : Tensor(int64)
-            Batch of the lengths of each target (B,).
-        spk_emb : Optional[Tensor]
-            Batch of speaker embeddings (B, spk_embed_dim).
-        spk_id : Optional[Tensor]
-            Batch of speaker IDs (B, 1).
-        lang_id : Optional[Tensor]
-            Batch of language IDs (B, 1).
-
-        Returns
-        ----------
-        Tensor
-            Loss scalar value.
-        Dict
-            Statistics to be monitored.
-        Tensor
-            Weight value if not joint training else model outputs.
+        Args:
+            text (Tensor(int64)): Batch of padded character ids (B, T_text).
+            text_lengths (Tensor(int64)): Batch of lengths of each input batch (B,).
+            speech (Tensor): Batch of padded target features (B, T_feats, odim).
+            speech_lengths (Tensor(int64)): Batch of the lengths of each target (B,).
+            spk_emb (Optional[Tensor]): Batch of speaker embeddings (B, spk_embed_dim).
+            spk_id (Optional[Tensor]): Batch of speaker IDs (B, 1).
+            lang_id (Optional[Tensor]): Batch of language IDs (B, 1).
+
+        Returns:
+            Tensor: Loss scalar value.
+            Dict: Statistics to be monitored.
+            Tensor: Weight value if not joint training else model outputs.
         """
         text = text[:, :text_lengths.max()]
@@ -369,40 +327,26 @@ class Tacotron2(nn.Layer):
             use_teacher_forcing: bool=False, ) -> Dict[str, paddle.Tensor]:
         """Generate the sequence of features given the sequences of characters.
-        Parameters
-        ----------
-        text Tensor(int64)
-            Input sequence of characters (T_text,).
-        speech : Optional[Tensor]
-            Feature sequence to extract style (N, idim).
-        spk_emb : ptional[Tensor]
-            Speaker embedding (spk_embed_dim,).
-        spk_id : Optional[Tensor]
-            Speaker ID (1,).
-        lang_id : Optional[Tensor]
-            Language ID (1,).
-        threshold : float
-            Threshold in inference.
-        minlenratio : float
-            Minimum length ratio in inference.
-        maxlenratio : float
-            Maximum length ratio in inference.
-        use_att_constraint : bool
-            Whether to apply attention constraint.
-        backward_window : int
-            Backward window in attention constraint.
-        forward_window : int
-            Forward window in attention constraint.
-        use_teacher_forcing : bool
-            Whether to use teacher forcing.
-
-        Return
-        ----------
-        Dict[str, Tensor]
-            Output dict including the following items:
-            * feat_gen (Tensor): Output sequence of features (T_feats, odim).
-            * prob (Tensor): Output sequence of stop probabilities (T_feats,).
-            * att_w (Tensor): Attention weights (T_feats, T).
+        Args:
+            text (Tensor(int64)): Input sequence of characters (T_text,).
+            speech (Optional[Tensor]): Feature sequence to extract style (N, idim).
+            spk_emb (Optional[Tensor]): Speaker embedding (spk_embed_dim,).
+ spk_id (Optional[Tensor]): Speaker ID (1,). + lang_id (Optional[Tensor]): Language ID (1,). + threshold (float): Threshold in inference. + minlenratio (float): Minimum length ratio in inference. + maxlenratio (float): Maximum length ratio in inference. + use_att_constraint (bool): Whether to apply attention constraint. + backward_window (int): Backward window in attention constraint. + forward_window (int): Forward window in attention constraint. + use_teacher_forcing (bool): Whether to use teacher forcing. + + Returns: + Dict[str, Tensor] + Output dict including the following items: + * feat_gen (Tensor): Output sequence of features (T_feats, odim). + * prob (Tensor): Output sequence of stop probabilities (T_feats,). + * att_w (Tensor): Attention weights (T_feats, T). """ x = text @@ -458,18 +402,13 @@ class Tacotron2(nn.Layer): spk_emb: paddle.Tensor) -> paddle.Tensor: """Integrate speaker embedding with hidden states. - Parameters - ---------- - hs : Tensor - Batch of hidden state sequences (B, Tmax, eunits). - spk_emb : Tensor - Batch of speaker embeddings (B, spk_embed_dim). - - Returns - ---------- - Tensor - Batch of integrated hidden state sequences (B, Tmax, eunits) if - integration_type is "add" else (B, Tmax, eunits + spk_embed_dim). + Args: + hs (Tensor): Batch of hidden state sequences (B, Tmax, eunits). + spk_emb (Tensor): Batch of speaker embeddings (B, spk_embed_dim). + + Returns: + Tensor: Batch of integrated hidden state sequences (B, Tmax, eunits) if + integration_type is "add" else (B, Tmax, eunits + spk_embed_dim). """ if self.spk_embed_integration_type == "add": diff --git a/paddlespeech/t2s/models/transformer_tts/transformer_tts.py b/paddlespeech/t2s/models/transformer_tts/transformer_tts.py index 4babe283..92754c30 100644 --- a/paddlespeech/t2s/models/transformer_tts/transformer_tts.py +++ b/paddlespeech/t2s/models/transformer_tts/transformer_tts.py @@ -48,127 +48,67 @@ class TransformerTTS(nn.Layer): .. _`Neural Speech Synthesis with Transformer Network`: https://arxiv.org/pdf/1809.08895.pdf - Parameters - ---------- - idim : int - Dimension of the inputs. - odim : int - Dimension of the outputs. - embed_dim : int, optional - Dimension of character embedding. - eprenet_conv_layers : int, optional - Number of encoder prenet convolution layers. - eprenet_conv_chans : int, optional - Number of encoder prenet convolution channels. - eprenet_conv_filts : int, optional - Filter size of encoder prenet convolution. - dprenet_layers : int, optional - Number of decoder prenet layers. - dprenet_units : int, optional - Number of decoder prenet hidden units. - elayers : int, optional - Number of encoder layers. - eunits : int, optional - Number of encoder hidden units. - adim : int, optional - Number of attention transformation dimensions. - aheads : int, optional - Number of heads for multi head attention. - dlayers : int, optional - Number of decoder layers. - dunits : int, optional - Number of decoder hidden units. - postnet_layers : int, optional - Number of postnet layers. - postnet_chans : int, optional - Number of postnet channels. - postnet_filts : int, optional - Filter size of postnet. - use_scaled_pos_enc : pool, optional - Whether to use trainable scaled positional encoding. - use_batch_norm : bool, optional - Whether to use batch normalization in encoder prenet. - encoder_normalize_before : bool, optional - Whether to perform layer normalization before encoder block. 
- decoder_normalize_before : bool, optional - Whether to perform layer normalization before decoder block. - encoder_concat_after : bool, optional - Whether to concatenate attention layer's input and output in encoder. - decoder_concat_after : bool, optional - Whether to concatenate attention layer's input and output in decoder. - positionwise_layer_type : str, optional - Position-wise operation type. - positionwise_conv_kernel_size : int, optional - Kernel size in position wise conv 1d. - reduction_factor : int, optional - Reduction factor. - spk_embed_dim : int, optional - Number of speaker embedding dimenstions. - spk_embed_integration_type : str, optional - How to integrate speaker embedding. - use_gst : str, optional - Whether to use global style token. - gst_tokens : int, optional - The number of GST embeddings. - gst_heads : int, optional - The number of heads in GST multihead attention. - gst_conv_layers : int, optional - The number of conv layers in GST. - gst_conv_chans_list : Sequence[int], optional - List of the number of channels of conv layers in GST. - gst_conv_kernel_size : int, optional - Kernal size of conv layers in GST. - gst_conv_stride : int, optional - Stride size of conv layers in GST. - gst_gru_layers : int, optional - The number of GRU layers in GST. - gst_gru_units : int, optional - The number of GRU units in GST. - transformer_lr : float, optional - Initial value of learning rate. - transformer_warmup_steps : int, optional - Optimizer warmup steps. - transformer_enc_dropout_rate : float, optional - Dropout rate in encoder except attention and positional encoding. - transformer_enc_positional_dropout_rate : float, optional - Dropout rate after encoder positional encoding. - transformer_enc_attn_dropout_rate : float, optional - Dropout rate in encoder self-attention module. - transformer_dec_dropout_rate : float, optional - Dropout rate in decoder except attention & positional encoding. - transformer_dec_positional_dropout_rate : float, optional - Dropout rate after decoder positional encoding. - transformer_dec_attn_dropout_rate : float, optional - Dropout rate in deocoder self-attention module. - transformer_enc_dec_attn_dropout_rate : float, optional - Dropout rate in encoder-deocoder attention module. - init_type : str, optional - How to initialize transformer parameters. - init_enc_alpha : float, optional - Initial value of alpha in scaled pos encoding of the encoder. - init_dec_alpha : float, optional - Initial value of alpha in scaled pos encoding of the decoder. - eprenet_dropout_rate : float, optional - Dropout rate in encoder prenet. - dprenet_dropout_rate : float, optional - Dropout rate in decoder prenet. - postnet_dropout_rate : float, optional - Dropout rate in postnet. - use_masking : bool, optional - Whether to apply masking for padded part in loss calculation. - use_weighted_masking : bool, optional - Whether to apply weighted masking in loss calculation. - bce_pos_weight : float, optional - Positive sample weight in bce calculation (only for use_masking=true). - loss_type : str, optional - How to calculate loss. - use_guided_attn_loss : bool, optional - Whether to use guided attention loss. - num_heads_applied_guided_attn : int, optional - Number of heads in each layer to apply guided attention loss. - num_layers_applied_guided_attn : int, optional - Number of layers to apply guided attention loss. - List of module names to apply guided attention loss. + Args: + idim (int): Dimension of the inputs. + odim (int): Dimension of the outputs. 
+        embed_dim (int, optional): Dimension of character embedding.
+        eprenet_conv_layers (int, optional): Number of encoder prenet convolution layers.
+        eprenet_conv_chans (int, optional): Number of encoder prenet convolution channels.
+        eprenet_conv_filts (int, optional): Filter size of encoder prenet convolution.
+        dprenet_layers (int, optional): Number of decoder prenet layers.
+        dprenet_units (int, optional): Number of decoder prenet hidden units.
+        elayers (int, optional): Number of encoder layers.
+        eunits (int, optional): Number of encoder hidden units.
+        adim (int, optional): Number of attention transformation dimensions.
+        aheads (int, optional): Number of heads for multi head attention.
+        dlayers (int, optional): Number of decoder layers.
+        dunits (int, optional): Number of decoder hidden units.
+        postnet_layers (int, optional): Number of postnet layers.
+        postnet_chans (int, optional): Number of postnet channels.
+        postnet_filts (int, optional): Filter size of postnet.
+        use_scaled_pos_enc (bool, optional): Whether to use trainable scaled positional encoding.
+        use_batch_norm (bool, optional): Whether to use batch normalization in encoder prenet.
+        encoder_normalize_before (bool, optional): Whether to perform layer normalization before encoder block.
+        decoder_normalize_before (bool, optional): Whether to perform layer normalization before decoder block.
+        encoder_concat_after (bool, optional): Whether to concatenate attention layer's input and output in encoder.
+        decoder_concat_after (bool, optional): Whether to concatenate attention layer's input and output in decoder.
+        positionwise_layer_type (str, optional): Position-wise operation type.
+        positionwise_conv_kernel_size (int, optional): Kernel size in position wise conv 1d.
+        reduction_factor (int, optional): Reduction factor.
+        spk_embed_dim (int, optional): Number of speaker embedding dimensions.
+        spk_embed_integration_type (str, optional): How to integrate speaker embedding.
+        use_gst (bool, optional): Whether to use global style token.
+        gst_tokens (int, optional): The number of GST embeddings.
+        gst_heads (int, optional): The number of heads in GST multihead attention.
+        gst_conv_layers (int, optional): The number of conv layers in GST.
+        gst_conv_chans_list (Sequence[int], optional): List of the number of channels of conv layers in GST.
+        gst_conv_kernel_size (int, optional): Kernel size of conv layers in GST.
+        gst_conv_stride (int, optional): Stride size of conv layers in GST.
+        gst_gru_layers (int, optional): The number of GRU layers in GST.
+        gst_gru_units (int, optional): The number of GRU units in GST.
+        transformer_lr (float, optional): Initial value of learning rate.
+        transformer_warmup_steps (int, optional): Optimizer warmup steps.
+        transformer_enc_dropout_rate (float, optional): Dropout rate in encoder except attention and positional encoding.
+        transformer_enc_positional_dropout_rate (float, optional): Dropout rate after encoder positional encoding.
+        transformer_enc_attn_dropout_rate (float, optional): Dropout rate in encoder self-attention module.
+        transformer_dec_dropout_rate (float, optional): Dropout rate in decoder except attention & positional encoding.
+        transformer_dec_positional_dropout_rate (float, optional): Dropout rate after decoder positional encoding.
+        transformer_dec_attn_dropout_rate (float, optional): Dropout rate in decoder self-attention module.
+        transformer_enc_dec_attn_dropout_rate (float, optional): Dropout rate in encoder-decoder attention module.
+ init_type (str, optional): How to initialize transformer parameters. + init_enc_alpha (float, optional): Initial value of alpha in scaled pos encoding of the encoder. + init_dec_alpha (float, optional): Initial value of alpha in scaled pos encoding of the decoder. + eprenet_dropout_rate (float, optional): Dropout rate in encoder prenet. + dprenet_dropout_rate (float, optional): Dropout rate in decoder prenet. + postnet_dropout_rate (float, optional): Dropout rate in postnet. + use_masking (bool, optional): Whether to apply masking for padded part in loss calculation. + use_weighted_masking (bool, optional): Whether to apply weighted masking in loss calculation. + bce_pos_weight (float, optional): Positive sample weight in bce calculation (only for use_masking=true). + loss_type (str, optional): How to calculate loss. + use_guided_attn_loss (bool, optional): Whether to use guided attention loss. + num_heads_applied_guided_attn (int, optional): Number of heads in each layer to apply guided attention loss. + num_layers_applied_guided_attn (int, optional): Number of layers to apply guided attention loss. + List of module names to apply guided attention loss. """ def __init__( @@ -398,25 +338,16 @@ class TransformerTTS(nn.Layer): ) -> Tuple[paddle.Tensor, Dict[str, paddle.Tensor], paddle.Tensor]: """Calculate forward propagation. - Parameters - ---------- - text : Tensor(int64) - Batch of padded character ids (B, Tmax). - text_lengths : Tensor(int64) - Batch of lengths of each input batch (B,). - speech : Tensor - Batch of padded target features (B, Lmax, odim). - speech_lengths : Tensor(int64) - Batch of the lengths of each target (B,). - spk_emb : Tensor, optional - Batch of speaker embeddings (B, spk_embed_dim). - - Returns - ---------- - Tensor - Loss scalar value. - Dict - Statistics to be monitored. + Args: + text(Tensor(int64)): Batch of padded character ids (B, Tmax). + text_lengths(Tensor(int64)): Batch of lengths of each input batch (B,). + speech(Tensor): Batch of padded target features (B, Lmax, odim). + speech_lengths(Tensor(int64)): Batch of the lengths of each target (B,). + spk_emb(Tensor, optional): Batch of speaker embeddings (B, spk_embed_dim). + + Returns: + Tensor: Loss scalar value. + Dict: Statistics to be monitored. """ # input of embedding must be int64 @@ -525,31 +456,19 @@ class TransformerTTS(nn.Layer): ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]: """Generate the sequence of features given the sequences of characters. - Parameters - ---------- - text : Tensor(int64) - Input sequence of characters (T,). - speech : Tensor, optional - Feature sequence to extract style (N, idim). - spk_emb : Tensor, optional - Speaker embedding vector (spk_embed_dim,). - threshold : float, optional - Threshold in inference. - minlenratio : float, optional - Minimum length ratio in inference. - maxlenratio : float, optional - Maximum length ratio in inference. - use_teacher_forcing : bool, optional - Whether to use teacher forcing. - - Returns - ---------- - Tensor - Output sequence of features (L, odim). - Tensor - Output sequence of stop probabilities (L,). - Tensor - Encoder-decoder (source) attention weights (#layers, #heads, L, T). + Args: + text(Tensor(int64)): Input sequence of characters (T,). + speech(Tensor, optional): Feature sequence to extract style (N, idim). + spk_emb(Tensor, optional): Speaker embedding vector (spk_embed_dim,). + threshold(float, optional): Threshold in inference. + minlenratio(float, optional): Minimum length ratio in inference. 
+ maxlenratio(float, optional): Maximum length ratio in inference. + use_teacher_forcing(bool, optional): Whether to use teacher forcing. + + Returns: + Tensor: Output sequence of features (L, odim). + Tensor: Output sequence of stop probabilities (L,). + Tensor: Encoder-decoder (source) attention weights (#layers, #heads, L, T). """ # input of embedding must be int64 @@ -671,23 +590,17 @@ class TransformerTTS(nn.Layer): def _source_mask(self, ilens: paddle.Tensor) -> paddle.Tensor: """Make masks for self-attention. - Parameters - ---------- - ilens : Tensor - Batch of lengths (B,). + Args: + ilens(Tensor): Batch of lengths (B,). - Returns - ------- - Tensor - Mask tensor for self-attention. - dtype=paddle.bool + Returns: + Tensor: Mask tensor for self-attention. dtype=paddle.bool - Examples - ------- - >>> ilens = [5, 3] - >>> self._source_mask(ilens) - tensor([[[1, 1, 1, 1, 1], - [1, 1, 1, 0, 0]]]) bool + Examples: + >>> ilens = [5, 3] + >>> self._source_mask(ilens) + tensor([[[1, 1, 1, 1, 1], + [1, 1, 1, 0, 0]]]) bool """ x_masks = make_non_pad_mask(ilens) @@ -696,30 +609,25 @@ class TransformerTTS(nn.Layer): def _target_mask(self, olens: paddle.Tensor) -> paddle.Tensor: """Make masks for masked self-attention. - Parameters - ---------- - olens : LongTensor - Batch of lengths (B,). - - Returns - ---------- - Tensor - Mask tensor for masked self-attention. - - Examples - ---------- - >>> olens = [5, 3] - >>> self._target_mask(olens) - tensor([[[1, 0, 0, 0, 0], - [1, 1, 0, 0, 0], - [1, 1, 1, 0, 0], - [1, 1, 1, 1, 0], - [1, 1, 1, 1, 1]], - [[1, 0, 0, 0, 0], - [1, 1, 0, 0, 0], - [1, 1, 1, 0, 0], - [1, 1, 1, 0, 0], - [1, 1, 1, 0, 0]]], dtype=paddle.uint8) + Args: + olens (Tensor(int64)): Batch of lengths (B,). + + Returns: + Tensor: Mask tensor for masked self-attention. + + Examples: + >>> olens = [5, 3] + >>> self._target_mask(olens) + tensor([[[1, 0, 0, 0, 0], + [1, 1, 0, 0, 0], + [1, 1, 1, 0, 0], + [1, 1, 1, 1, 0], + [1, 1, 1, 1, 1]], + [[1, 0, 0, 0, 0], + [1, 1, 0, 0, 0], + [1, 1, 1, 0, 0], + [1, 1, 1, 0, 0], + [1, 1, 1, 0, 0]]], dtype=paddle.uint8) """ y_masks = make_non_pad_mask(olens) @@ -731,17 +639,12 @@ class TransformerTTS(nn.Layer): spk_emb: paddle.Tensor) -> paddle.Tensor: """Integrate speaker embedding with hidden states. - Parameters - ---------- - hs : Tensor - Batch of hidden state sequences (B, Tmax, adim). - spk_emb : Tensor - Batch of speaker embeddings (B, spk_embed_dim). - - Returns - ---------- - Tensor - Batch of integrated hidden state sequences (B, Tmax, adim). + Args: + hs(Tensor): Batch of hidden state sequences (B, Tmax, adim). + spk_emb(Tensor): Batch of speaker embeddings (B, spk_embed_dim). + + Returns: + Tensor: Batch of integrated hidden state sequences (B, Tmax, adim). """ if self.spk_embed_integration_type == "add": diff --git a/paddlespeech/t2s/models/waveflow.py b/paddlespeech/t2s/models/waveflow.py index e519e0c5..2c2f7ebb 100644 --- a/paddlespeech/t2s/models/waveflow.py +++ b/paddlespeech/t2s/models/waveflow.py @@ -30,20 +30,14 @@ __all__ = ["WaveFlow", "ConditionalWaveFlow", "WaveFlowLoss"] def fold(x, n_group): - r"""Fold audio or spectrogram's temporal dimension in to groups. + """Fold audio or spectrogram's temporal dimension in to groups. - Parameters - ---------- - x : Tensor [shape=(\*, time_steps) - The input tensor. + Args: + x(Tensor): The input tensor. shape=(\*, time_steps) + n_group(int): The size of a group. - n_group : int - The size of a group. 
-
-        Returns
-        ---------
-        Tensor : [shape=(\*, time_steps // n_group, group)]
-            Folded tensor.
+        Returns:
+            Tensor: Folded tensor. shape=(\*, time_steps // n_group, group)
     """
     spatial_shape = list(x.shape[:-1])
     time_steps = paddle.shape(x)[-1]
@@ -58,27 +52,23 @@ class UpsampleNet(nn.LayerList):
     It consists of several conv2dtranspose layers which perform deconvolution
     on mel and time dimension.
-    Parameters
-    ----------
-    upscale_factors : List[int], optional
-        Time upsampling factors for each Conv2DTranspose Layer.
-
-        The ``UpsampleNet`` contains ``len(upscale_factor)`` Conv2DTranspose
-        Layers. Each upscale_factor is used as the ``stride`` for the
-        corresponding Conv2DTranspose. Defaults to [16, 16], this the default
-        upsampling factor is 256.
+    Args:
+        upscale_factors(List[int], optional): Time upsampling factors for each Conv2DTranspose Layer.
+            The ``UpsampleNet`` contains ``len(upscale_factor)`` Conv2DTranspose
+            Layers. Each upscale_factor is used as the ``stride`` for the
+            corresponding Conv2DTranspose. Defaults to [16, 16], thus the default
+            upsampling factor is 256.
-    Notes
-    ------
-    ``np.prod(upscale_factors)`` should equals the ``hop_length`` of the stft
-    transformation used to extract spectrogram features from audio.
+    Notes:
+        ``np.prod(upscale_factors)`` should equal the ``hop_length`` of the stft
+        transformation used to extract spectrogram features from audio.
-    For example, ``16 * 16 = 256``, then the spectrogram extracted with a stft
-    transformation whose ``hop_length`` equals 256 is suitable.
+        For example, ``16 * 16 = 256``, then the spectrogram extracted with a stft
+        transformation whose ``hop_length`` equals 256 is suitable.
-    See Also
-    ---------
-    ``librosa.core.stft``
+    See Also:
+        ``librosa.core.stft``
     """

    def __init__(self, upsample_factors):
@@ -101,25 +91,18 @@ class UpsampleNet(nn.LayerList):
         self.upsample_factors = upsample_factors

     def forward(self, x, trim_conv_artifact=False):
-        r"""Forward pass of the ``UpsampleNet``.
+        """Forward pass of the ``UpsampleNet``.
-        Parameters
-        -----------
-        x : Tensor [shape=(batch_size, input_channels, time_steps)]
-            The input spectrogram.
+        Args:
+            x(Tensor): The input spectrogram. shape=(batch_size, input_channels, time_steps)
+            trim_conv_artifact(bool, optional): Trim deconvolution artifact at each layer. Defaults to False.
-        trim_conv_artifact : bool, optional
-            Trim deconvolution artifact at each layer. Defaults to False.
+        Returns:
+            Tensor: The upsampled spectrogram. shape=(batch_size, input_channels, time_steps \* upsample_factor)
-        Returns
-        --------
-        Tensor: [shape=(batch_size, input_channels, time_steps \* upsample_factor)]
-            The upsampled spectrogram.
-
-        Notes
-        --------
-        If trim_conv_artifact is ``True``, the output time steps is less
-        than ``time_steps \* upsample_factors``.
+        Notes:
+            If trim_conv_artifact is ``True``, the output time steps are less
+            than ``time_steps \* upsample_factors``.
         """
         x = paddle.unsqueeze(x, 1)  # (B, C, T) -> (B, 1, C, T)
         for layer in self:
@@ -139,19 +122,11 @@ class ResidualBlock(nn.Layer):
     same paddign in width dimension. It also has projection for the condition
     and output.
-    Parameters
-    ----------
-    channels : int
-        Feature size of the input.
-
-    cond_channels : int
-        Featuer size of the condition.
-
-    kernel_size : Tuple[int]
-        Kernel size of the Convolution2d applied to the input.
-
-    dilations : int
-        Dilations of the Convolution2d applied to the input.
+    Args:
+        channels (int): Feature size of the input.
+        cond_channels (int): Feature size of the condition.
+        kernel_size (Tuple[int]): Kernel size of the Convolution2d applied to the input.
+        dilations (int): Dilations of the Convolution2d applied to the input.
     """

     def __init__(self, channels, cond_channels, kernel_size, dilations):
@@ -197,21 +172,13 @@ class ResidualBlock(nn.Layer):
     def forward(self, x, condition):
         """Compute output for a whole folded sequence.
-        Parameters
-        ----------
-        x : Tensor [shape=(batch_size, channel, height, width)]
-            The input.
-
-        condition : Tensor [shape=(batch_size, condition_channel, height, width)]
-            The local condition.
+        Args:
+            x (Tensor): The input. [shape=(batch_size, channel, height, width)]
+            condition (Tensor): The local condition. [shape=(batch_size, condition_channel, height, width)]
-        Returns
-        -------
-        res : Tensor [shape=(batch_size, channel, height, width)]
-            The residual output.
-
-        skip : Tensor [shape=(batch_size, channel, height, width)]
-            The skip output.
+        Returns:
+            res (Tensor): The residual output. [shape=(batch_size, channel, height, width)]
+            skip (Tensor): The skip output. [shape=(batch_size, channel, height, width)]
         """
         x_in = x
         x = self.conv(x)
@@ -248,21 +215,14 @@ class ResidualBlock(nn.Layer):
     def add_input(self, x_row, condition_row):
         """Compute the output for a row and update the buffer.
-        Parameters
-        ----------
-        x_row : Tensor [shape=(batch_size, channel, 1, width)]
-            A row of the input.
-
-        condition_row : Tensor [shape=(batch_size, condition_channel, 1, width)]
-            A row of the condition.
+        Args:
+            x_row (Tensor): A row of the input. shape=(batch_size, channel, 1, width)
+            condition_row (Tensor): A row of the condition. shape=(batch_size, condition_channel, 1, width)
-        Returns
-        -------
-        res : Tensor [shape=(batch_size, channel, 1, width)]
-            A row of the the residual output.
+        Returns:
+            res (Tensor): A row of the residual output. shape=(batch_size, channel, 1, width)
+            skip (Tensor): A row of the skip output. shape=(batch_size, channel, 1, width)
-        skip : Tensor [shape=(batch_size, channel, 1, width)]
-            A row of the skip output.
         """
         x_row_in = x_row
         if len(paddle.shape(self._conv_buffer)) == 1:
@@ -297,27 +257,15 @@ class ResidualBlock(nn.Layer):
 class ResidualNet(nn.LayerList):
     """A stack of several ResidualBlocks. It merges condition at each layer.
-    Parameters
-    ----------
-    n_layer : int
-        Number of ResidualBlocks in the ResidualNet.
-
-    residual_channels : int
-        Feature size of each ResidualBlocks.
-
-    condition_channels : int
-        Feature size of the condition.
+    Args:
+        n_layer (int): Number of ResidualBlocks in the ResidualNet.
+        residual_channels (int): Feature size of each ResidualBlock.
+        condition_channels (int): Feature size of the condition.
+        kernel_size (Tuple[int]): Kernel size of each ResidualBlock.
+        dilations_h (List[int]): Dilation in height dimension of every ResidualBlock.
-    kernel_size : Tuple[int]
-        Kernel size of each ResidualBlock.
-
-    dilations_h : List[int]
-        Dilation in height dimension of every ResidualBlock.
-
-    Raises
-    ------
-    ValueError
-        If the length of dilations_h does not equals n_layers.
+    Raises:
+        ValueError: If the length of dilations_h does not equal n_layers.
     """

     def __init__(self,
@@ -339,18 +287,13 @@ class ResidualNet(nn.LayerList):
     def forward(self, x, condition):
         """Comput the output of given the input and the condition.
-        Parameters
-        -----------
-        x : Tensor [shape=(batch_size, channel, height, width)]
-            The input.
- - condition : Tensor [shape=(batch_size, condition_channel, height, width)] - The local condition. - - Returns - -------- - Tensor : [shape=(batch_size, channel, height, width)] - The output, which is an aggregation of all the skip outputs. + Args: + x (Tensor): The input. shape=(batch_size, channel, height, width) + condition (Tensor): The local condition. shape=(batch_size, condition_channel, height, width) + + Returns: + Tensor : The output, which is an aggregation of all the skip outputs. shape=(batch_size, channel, height, width) + """ skip_connections = [] for layer in self: @@ -368,21 +311,14 @@ class ResidualNet(nn.LayerList): def add_input(self, x_row, condition_row): """Compute the output for a row and update the buffers. - Parameters - ---------- - x_row : Tensor [shape=(batch_size, channel, 1, width)] - A row of the input. - - condition_row : Tensor [shape=(batch_size, condition_channel, 1, width)] - A row of the condition. - - Returns - ------- - res : Tensor [shape=(batch_size, channel, 1, width)] - A row of the the residual output. - - skip : Tensor [shape=(batch_size, channel, 1, width)] - A row of the skip output. + Args: + x_row (Tensor): A row of the input. shape=(batch_size, channel, 1, width) + condition_row (Tensor): A row of the condition. shape=(batch_size, condition_channel, 1, width) + + Returns: + res (Tensor): A row of the the residual output. shape=(batch_size, channel, 1, width) + skip (Tensor): A row of the skip output. shape=(batch_size, channel, 1, width) + """ skip_connections = [] for layer in self: @@ -400,22 +336,12 @@ class Flow(nn.Layer): probability density estimation. The ``inverse`` method implements the sampling. - Parameters - ---------- - n_layers : int - Number of ResidualBlocks in the Flow. - - channels : int - Feature size of the ResidualBlocks. - - mel_bands : int - Feature size of the mel spectrogram (mel bands). - - kernel_size : Tuple[int] - Kernel size of each ResisualBlocks in the Flow. - - n_group : int - Number of timesteps to the folded into a group. + Args: + n_layers (int): Number of ResidualBlocks in the Flow. + channels (int): Feature size of the ResidualBlocks. + mel_bands (int): Feature size of the mel spectrogram (mel bands). + kernel_size (Tuple[int]): Kernel size of each ResisualBlocks in the Flow. + n_group (int): Number of timesteps to the folded into a group. """ dilations_dict = { 8: [1, 1, 1, 1, 1, 1, 1, 1], @@ -466,26 +392,16 @@ class Flow(nn.Layer): """Probability density estimation. It is done by inversely transform a sample from p(X) into a sample from p(Z). - Parameters - ----------- - x : Tensor [shape=(batch, 1, height, width)] - A input sample of the distribution p(X). - - condition : Tensor [shape=(batch, condition_channel, height, width)] - The local condition. - - Returns - -------- - z (Tensor): shape(batch, 1, height, width), the transformed sample. - - Tuple[Tensor, Tensor] - The parameter of the transformation. - - logs (Tensor): shape(batch, 1, height - 1, width), the log scale - of the transformation from x to z. - - b (Tensor): shape(batch, 1, height - 1, width), the shift of the - transformation from x to z. + Args: + x (Tensor): A input sample of the distribution p(X). shape=(batch, 1, height, width) + condition (Tensor): The local condition. shape=(batch, condition_channel, height, width) + + Returns: + z (Tensor): shape(batch, 1, height, width), the transformed sample. + Tuple[Tensor, Tensor]: + The parameter of the transformation. 
+ logs (Tensor): shape(batch, 1, height - 1, width), the log scale of the transformation from x to z. + b (Tensor): shape(batch, 1, height - 1, width), the shift of the transformation from x to z. """ # (B, C, H-1, W) logs, b = self._predict_parameters(x[:, :, :-1, :], @@ -516,27 +432,12 @@ class Flow(nn.Layer): """Sampling from the the distrition p(X). It is done by sample form p(Z) and transform the sample. It is a auto regressive transformation. - Parameters - ----------- - z : Tensor [shape=(batch, 1, height, width)] - A sample of the distribution p(Z). - - condition : Tensor [shape=(batch, condition_channel, height, width)] - The local condition. - - Returns - --------- - x : Tensor [shape=(batch, 1, height, width)] - The transformed sample. - - Tuple[Tensor, Tensor] - The parameter of the transformation. - - logs (Tensor): shape(batch, 1, height - 1, width), the log scale - of the transformation from x to z. - - b (Tensor): shape(batch, 1, height - 1, width), the shift of the - transformation from x to z. + Args: + z(Tensor): A sample of the distribution p(Z). shape=(batch, 1, time_steps + condition(Tensor): The local condition. shape=(batch, condition_channel, time_steps) + Returns: + Tensor: + The transformed sample. shape=(batch, 1, height, width) """ z_0 = z[:, :, :1, :] x = paddle.zeros_like(z) @@ -560,25 +461,13 @@ class WaveFlow(nn.LayerList): """An Deep Reversible layer that is composed of severel auto regressive flows. - Parameters - ----------- - n_flows : int - Number of flows in the WaveFlow model. - - n_layers : int - Number of ResidualBlocks in each Flow. - - n_group : int - Number of timesteps to fold as a group. - - channels : int - Feature size of each ResidualBlock. - - mel_bands : int - Feature size of mel spectrogram (mel bands). - - kernel_size : Union[int, List[int]] - Kernel size of the convolution layer in each ResidualBlock. + Args: + n_flows (int): Number of flows in the WaveFlow model. + n_layers (int): Number of ResidualBlocks in each Flow. + n_group (int): Number of timesteps to fold as a group. + channels (int): Feature size of each ResidualBlock. + mel_bands (int): Feature size of mel spectrogram (mel bands). + kernel_size (Union[int, List[int]]): Kernel size of the convolution layer in each ResidualBlock. """ def __init__(self, n_flows, n_layers, n_group, channels, mel_bands, @@ -628,22 +517,13 @@ class WaveFlow(nn.LayerList): """Probability density estimation of random variable x given the condition. - Parameters - ----------- - x : Tensor [shape=(batch_size, time_steps)] - The audio. - - condition : Tensor [shape=(batch_size, condition channel, time_steps)] - The local condition (mel spectrogram here). - - Returns - -------- - z : Tensor [shape=(batch_size, time_steps)] - The transformed random variable. - - log_det_jacobian: Tensor [shape=(1,)] - The log determinant of the jacobian of the transformation from x - to z. + Args: + x (Tensor): The audio. shape=(batch_size, time_steps) + condition (Tensor): The local condition (mel spectrogram here). shape=(batch_size, condition channel, time_steps) + + Returns: + Tensor: The transformed random variable. shape=(batch_size, time_steps) + Tensor: The log determinant of the jacobian of the transformation from x to z. shape=(1,) """ # x: (B, T) # condition: (B, C, T) upsampled condition @@ -678,18 +558,13 @@ class WaveFlow(nn.LayerList): Each Flow transform .. math:: `z_{i-1}` to .. math:: `z_{i}` in an autoregressive manner. 
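For reference: the `logs`/`b` pair documented in the hunk above parameterizes an elementwise affine map, so its Jacobian is diagonal and the log-determinant that `WaveFlowLoss` later consumes is just a sum of log scales. A minimal numpy sketch, assuming the convention z = (x - b) * exp(-logs); the sign convention in the actual implementation may differ:

import numpy as np

rng = np.random.default_rng(0)
x = rng.normal(size=8)           # stands in for one folded row of audio
logs = 0.1 * rng.normal(size=8)  # log scale predicted by the ResidualNet
b = rng.normal(size=8)           # shift predicted by the ResidualNet

# Assumed affine map from x to z (elementwise).
z = (x - b) * np.exp(-logs)

# dz/dx is diagonal, so log|det J| = sum of the elementwise log scales,
# here -logs.sum(); verify against an explicit diagonal Jacobian.
log_det = np.linalg.slogdet(np.diag(np.exp(-logs)))[1]
assert np.isclose(log_det, -logs.sum())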
-        Parameters
-        ----------
-        z : Tensor [shape=(batch, 1, time_steps]
-            A sample of the distribution p(Z).
-
-        condition : Tensor [shape=(batch, condition_channel, time_steps)]
-            The local condition.
+        Args:
+            z (Tensor): A sample of the distribution p(Z). shape=(batch, 1, time_steps)
+            condition (Tensor): The local condition. shape=(batch, condition_channel, time_steps)
-        Returns
-        --------
-        x : Tensor [shape=(batch_size, time_steps)]
-            The transformed sample (audio here).
+        Returns:
+            Tensor: The transformed sample (audio here). shape=(batch_size, time_steps)
+
         """
         z, condition = self._trim(z, condition)
@@ -714,29 +589,15 @@ class WaveFlow(nn.LayerList):
 class ConditionalWaveFlow(nn.LayerList):
     """ConditionalWaveFlow, a UpsampleNet with a WaveFlow model.
-    Parameters
-    ----------
-    upsample_factors : List[int]
-        Upsample factors for the upsample net.
-
-    n_flows : int
-        Number of flows in the WaveFlow model.
-
-    n_layers : int
-        Number of ResidualBlocks in each Flow.
-
-    n_group : int
-        Number of timesteps to fold as a group.
-
-    channels : int
-        Feature size of each ResidualBlock.
-
-    n_mels : int
-        Feature size of mel spectrogram (mel bands).
-
-    kernel_size : Union[int, List[int]]
-        Kernel size of the convolution layer in each ResidualBlock.
-    """
+    Args:
+        upsample_factors (List[int]): Upsample factors for the upsample net.
+        n_flows (int): Number of flows in the WaveFlow model.
+        n_layers (int): Number of ResidualBlocks in each Flow.
+        n_group (int): Number of timesteps to fold as a group.
+        channels (int): Feature size of each ResidualBlock.
+        n_mels (int): Feature size of mel spectrogram (mel bands).
+        kernel_size (Union[int, List[int]]): Kernel size of the convolution layer in each ResidualBlock.
+    """

     def __init__(self,
                  upsample_factors: List[int],
@@ -760,22 +621,13 @@ class ConditionalWaveFlow(nn.LayerList):
         """Compute the transformed random variable z (x to z) and the log of
         the determinant of the jacobian of the transformation from x to z.
-        Parameters
-        ----------
-        audio : Tensor [shape=(B, T)]
-            The audio.
+        Args:
+            audio(Tensor): The audio. shape=(B, T)
+            mel(Tensor): The mel spectrogram. shape=(B, C_mel, T_mel)
-        mel : Tensor [shape=(B, C_mel, T_mel)]
-            The mel spectrogram.
-
-        Returns
-        -------
-        z : Tensor [shape=(B, T)]
-            The inversely transformed random variable z (x to z)
-
-        log_det_jacobian: Tensor [shape=(1,)]
-            the log of the determinant of the jacobian of the transformation
-            from x to z.
+        Returns:
+            Tensor: The inversely transformed random variable z (x to z). shape=(B, T)
+            Tensor: The log of the determinant of the jacobian of the transformation from x to z. shape=(1,)
         """
         condition = self.encoder(mel)
         z, log_det_jacobian = self.decoder(audio, condition)
@@ -783,17 +635,13 @@ class ConditionalWaveFlow(nn.LayerList):
     @paddle.no_grad()
     def infer(self, mel):
-        r"""Generate raw audio given mel spectrogram.
+        """Generate raw audio given mel spectrogram.
-        Parameters
-        ----------
-        mel : Tensor [shape=(B, C_mel, T_mel)]
-            Mel spectrogram (in log-magnitude).
+        Args:
+            mel(Tensor): Mel spectrogram (in log-magnitude). shape=(B, C_mel, T_mel)
-        Returns
-        -------
-        Tensor : [shape=(B, T)]
-            The synthesized audio, where``T <= T_mel \* upsample_factors``.
+        Returns:
+            Tensor: The synthesized audio, where ``T <= T_mel \* upsample_factors``.
shape=(B, T) """ start = time.time() condition = self.encoder(mel, trim_conv_artifact=True) # (B, C, T) @@ -808,15 +656,11 @@ class ConditionalWaveFlow(nn.LayerList): def predict(self, mel): """Generate raw audio given mel spectrogram. - Parameters - ---------- - mel : np.ndarray [shape=(C_mel, T_mel)] - Mel spectrogram of an utterance(in log-magnitude). + Args: + mel(np.ndarray): Mel spectrogram of an utterance(in log-magnitude). shape=(C_mel, T_mel) - Returns - ------- - np.ndarray [shape=(T,)] - The synthesized audio. + Returns: + np.ndarray: The synthesized audio. shape=(T,) """ mel = paddle.to_tensor(mel) mel = paddle.unsqueeze(mel, 0) @@ -828,18 +672,12 @@ class ConditionalWaveFlow(nn.LayerList): def from_pretrained(cls, config, checkpoint_path): """Build a ConditionalWaveFlow model from a pretrained model. - Parameters - ---------- - config: yacs.config.CfgNode - model configs + Args: + config(yacs.config.CfgNode): model configs + checkpoint_path(Path or str): the path of pretrained model checkpoint, without extension name - checkpoint_path: Path or str - the path of pretrained model checkpoint, without extension name - - Returns - ------- - ConditionalWaveFlow - The model built from pretrained result. + Returns: + ConditionalWaveFlow The model built from pretrained result. """ model = cls(upsample_factors=config.model.upsample_factors, n_flows=config.model.n_flows, @@ -855,11 +693,9 @@ class ConditionalWaveFlow(nn.LayerList): class WaveFlowLoss(nn.Layer): """Criterion of a WaveFlow model. - Parameters - ---------- - sigma : float - The standard deviation of the gaussian noise used in WaveFlow, by - default 1.0. + Args: + sigma (float): The standard deviation of the gaussian noise used in WaveFlow, + by default 1.0. """ def __init__(self, sigma=1.0): @@ -871,19 +707,13 @@ class WaveFlowLoss(nn.Layer): """Compute the loss given the transformed random variable z and the log_det_jacobian of transformation from x to z. - Parameters - ---------- - z : Tensor [shape=(B, T)] - The transformed random variable (x to z). - - log_det_jacobian : Tensor [shape=(1,)] - The log of the determinant of the jacobian matrix of the - transformation from x to z. + Args: + z(Tensor): The transformed random variable (x to z). shape=(B, T) + log_det_jacobian(Tensor): The log of the determinant of the jacobian matrix of the + transformation from x to z. shape=(1,) - Returns - ------- - Tensor [shape=(1,)] - The loss. + Returns: + Tensor: The loss. shape=(1,) """ loss = paddle.sum(z * z) / (2 * self.sigma * self.sigma ) - log_det_jacobian @@ -895,15 +725,12 @@ class ConditionalWaveFlow2Infer(ConditionalWaveFlow): def forward(self, mel): """Generate raw audio given mel spectrogram. - Parameters - ---------- - mel : np.ndarray [shape=(C_mel, T_mel)] - Mel spectrogram of an utterance(in log-magnitude). - - Returns - ------- - np.ndarray [shape=(T,)] - The synthesized audio. + Args: + mel (np.ndarray): Mel spectrogram of an utterance(in log-magnitude). shape=(C_mel, T_mel) + + Returns: + np.ndarray: The synthesized audio. shape=(T,) + """ audio = self.predict(mel) return audio diff --git a/paddlespeech/t2s/models/wavernn/wavernn.py b/paddlespeech/t2s/models/wavernn/wavernn.py index fcf39a48..1320ffa3 100644 --- a/paddlespeech/t2s/models/wavernn/wavernn.py +++ b/paddlespeech/t2s/models/wavernn/wavernn.py @@ -67,14 +67,10 @@ class MelResNet(nn.Layer): def forward(self, x): ''' - Parameters - ---------- - x : Tensor - Input tensor (B, in_dims, T). - Returns - ---------- - Tensor - Output tensor (B, res_out_dims, T). 
+        Args:
+            x (Tensor): Input tensor (B, in_dims, T).
+        Returns:
+            Tensor: Output tensor (B, res_out_dims, T).
         '''
         x = self.conv_in(x)
@@ -121,16 +117,11 @@ class UpsampleNetwork(nn.Layer):
     def forward(self, m):
         '''
-        Parameters
-        ----------
-        c : Tensor
-            Input tensor (B, C_aux, T).
-        Returns
-        ----------
-        Tensor
-            Output tensor (B, (T - 2 * pad) * prob(upsample_scales), C_aux).
-        Tensor
-            Output tensor (B, (T - 2 * pad) * prob(upsample_scales), res_out_dims).
+        Args:
+            c (Tensor): Input tensor (B, C_aux, T).
+        Returns:
+            Tensor: Output tensor (B, (T - 2 * pad) * prod(upsample_scales), C_aux).
+            Tensor: Output tensor (B, (T - 2 * pad) * prod(upsample_scales), res_out_dims).
         '''
         # aux: [B, C_aux, T]
         # -> [B, res_out_dims, T - 2 * aux_context_window]
@@ -172,32 +163,20 @@ class WaveRNN(nn.Layer):
             mode='RAW',
             init_type: str="xavier_uniform", ):
         '''
-        Parameters
-        ----------
-        rnn_dims : int, optional
-            Hidden dims of RNN Layers.
-        fc_dims : int, optional
-            Dims of FC Layers.
-        bits : int, optional
-            bit depth of signal.
-        aux_context_window : int, optional
-            The context window size of the first convolution applied to the
-            auxiliary input, by default 2
-        upsample_scales : List[int], optional
-            Upsample scales of the upsample network.
-        aux_channels : int, optional
-            Auxiliary channel of the residual blocks.
-        compute_dims : int, optional
-            Dims of Conv1D in MelResNet.
-        res_out_dims : int, optional
-            Dims of output in MelResNet.
-        res_blocks : int, optional
-            Number of residual blocks.
-        mode : str, optional
-            Output mode of the WaveRNN vocoder. `MOL` for Mixture of Logistic Distribution,
-            and `RAW` for quantized bits as the model's output.
-        init_type : str
-            How to initialize parameters.
+        Args:
+            rnn_dims (int, optional): Hidden dims of RNN Layers.
+            fc_dims (int, optional): Dims of FC Layers.
+            bits (int, optional): Bit depth of the signal.
+            aux_context_window (int, optional): The context window size of the first convolution applied to the
+                auxiliary input, by default 2
+            upsample_scales (List[int], optional): Upsample scales of the upsample network.
+            aux_channels (int, optional): Auxiliary channel of the residual blocks.
+            compute_dims (int, optional): Dims of Conv1D in MelResNet.
+            res_out_dims (int, optional): Dims of output in MelResNet.
+            res_blocks (int, optional): Number of residual blocks.
+            mode (str, optional): Output mode of the WaveRNN vocoder.
+                `MOL` for Mixture of Logistic Distribution, and `RAW` for quantized bits as the model's output.
+            init_type (str): How to initialize parameters.
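For reference on the `bits` and `mu_law` options documented above: `RAW` mode predicts one of 2**bits quantized classes, usually after mu-law companding. A minimal numpy sketch of the standard mu-law encode/decode pair; this illustrates the usual formulation, not necessarily the exact variant used in this repo:

import numpy as np

def mu_law_encode(x, bits=9):
    # Compand a [-1, 1] waveform, then quantize to 2**bits classes.
    mu = 2**bits - 1
    y = np.sign(x) * np.log1p(mu * np.abs(x)) / np.log1p(mu)
    return np.floor((y + 1) / 2 * mu + 0.5).astype(np.int64)

def mu_law_decode(q, bits=9):
    # Map class indices back to [-1, 1] and expand.
    mu = 2**bits - 1
    y = 2 * q.astype(np.float64) / mu - 1
    return np.sign(y) * ((1 + mu)**np.abs(y) - 1) / mu

wav = 0.8 * np.sin(np.linspace(0, 8 * np.pi, 1000))
# Round trip is lossy only up to the quantization step.
assert np.max(np.abs(mu_law_decode(mu_law_encode(wav)) - wav)) < 0.02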
''' super().__init__() self.mode = mode @@ -245,18 +224,13 @@ class WaveRNN(nn.Layer): def forward(self, x, c): ''' - Parameters - ---------- - x : Tensor - wav sequence, [B, T] - c : Tensor - mel spectrogram [B, C_aux, T'] - - T = (T' - 2 * aux_context_window ) * hop_length - Returns - ---------- - Tensor - [B, T, n_classes] + Args: + x (Tensor): wav sequence, [B, T] + c (Tensor): mel spectrogram [B, C_aux, T'] + + T = (T' - 2 * aux_context_window ) * hop_length + Returns: + Tensor: [B, T, n_classes] ''' # Although we `_flatten_parameters()` on init, when using DataParallel # the model gets replicated, making it no longer guaranteed that the @@ -304,22 +278,14 @@ class WaveRNN(nn.Layer): mu_law: bool=True, gen_display: bool=False): """ - Parameters - ---------- - c : Tensor - input mels, (T', C_aux) - batched : bool - generate in batch or not - target : int - target number of samples to be generated in each batch entry - overlap : int - number of samples for crossfading between batches - mu_law : bool - use mu law or not - Returns - ---------- - wav sequence - Output (T' * prod(upsample_scales), out_channels, C_out). + Args: + c(Tensor): input mels, (T', C_aux) + batched(bool): generate in batch or not + target(int): target number of samples to be generated in each batch entry + overlap(int): number of samples for crossfading between batches + mu_law(bool) + Returns: + wav sequence: Output (T' * prod(upsample_scales), out_channels, C_out). """ self.eval() @@ -434,16 +400,13 @@ class WaveRNN(nn.Layer): def pad_tensor(self, x, pad, side='both'): ''' - Parameters - ---------- - x : Tensor - mel, [1, n_frames, 80] - pad : int - side : str - 'both', 'before' or 'after' - Returns - ---------- - Tensor + Args: + x(Tensor): mel, [1, n_frames, 80] + pad(int): + side(str, optional): (Default value = 'both') + + Returns: + Tensor ''' b, t, _ = paddle.shape(x) # for dygraph to static graph @@ -461,38 +424,29 @@ class WaveRNN(nn.Layer): Fold the tensor with overlap for quick batched inference. Overlap will be used for crossfading in xfade_and_unfold() - Parameters - ---------- - x : Tensor - Upsampled conditioning features. mels or aux - shape=(1, T, features) - mels: [1, T, 80] - aux: [1, T, 128] - target : int - Target timesteps for each index of batch - overlap : int - Timesteps for both xfade and rnn warmup - overlap = hop_length * 2 - - Returns - ---------- - Tensor - shape=(num_folds, target + 2 * overlap, features) - num_flods = (time_seq - overlap) // (target + overlap) - mel: [num_folds, target + 2 * overlap, 80] - aux: [num_folds, target + 2 * overlap, 128] - - Details - ---------- - x = [[h1, h2, ... hn]] - - Where each h is a vector of conditioning features - - Eg: target=2, overlap=1 with x.size(1)=10 - - folded = [[h1, h2, h3, h4], - [h4, h5, h6, h7], - [h7, h8, h9, h10]] + Args: + x(Tensor): Upsampled conditioning features. mels or aux + shape=(1, T, features) + mels: [1, T, 80] + aux: [1, T, 128] + target(int): Target timesteps for each index of batch + overlap(int): Timesteps for both xfade and rnn warmup + + Returns: + Tensor: + shape=(num_folds, target + 2 * overlap, features) + num_flods = (time_seq - overlap) // (target + overlap) + mel: [num_folds, target + 2 * overlap, 80] + aux: [num_folds, target + 2 * overlap, 128] + + Details: + x = [[h1, h2, ... 
hn]]
+            Where each h is a vector of conditioning features
+            Eg: target=2, overlap=1 with x.size(1)=10
+
+            folded = [[h1, h2, h3, h4],
+                      [h4, h5, h6, h7],
+                      [h7, h8, h9, h10]]
         '''
         _, total_len, features = paddle.shape(x)
@@ -520,37 +474,33 @@ class WaveRNN(nn.Layer):
     def xfade_and_unfold(self, y, target: int=12000, overlap: int=600):
         ''' Applies a crossfade and unfolds into a 1d array.
-        Parameters
-        ----------
-        y : Tensor
-            Batched sequences of audio samples
-            shape=(num_folds, target + 2 * overlap)
-            dtype=paddle.float32
-        overlap : int
-            Timesteps for both xfade and rnn warmup
-
-        Returns
-        ----------
-        Tensor
-            audio samples in a 1d array
-            shape=(total_len)
-            dtype=paddle.float32
-
-        Details
-        ----------
-        y = [[seq1],
-             [seq2],
-             [seq3]]
-
-        Apply a gain envelope at both ends of the sequences
-
-        y = [[seq1_in, seq1_target, seq1_out],
-             [seq2_in, seq2_target, seq2_out],
-             [seq3_in, seq3_target, seq3_out]]
-
-        Stagger and add up the groups of samples:
-
-        [seq1_in, seq1_target, (seq1_out + seq2_in), seq2_target, ...]
+        Args:
+            y (Tensor):
+                Batched sequences of audio samples
+                shape=(num_folds, target + 2 * overlap)
+                dtype=paddle.float32
+            overlap (int): Timesteps for both xfade and rnn warmup
+
+        Returns:
+            Tensor:
+                audio samples in a 1d array
+                shape=(total_len)
+                dtype=paddle.float32
+
+        Details:
+            y = [[seq1],
+                 [seq2],
+                 [seq3]]
+
+            Apply a gain envelope at both ends of the sequences
+
+            y = [[seq1_in, seq1_target, seq1_out],
+                 [seq2_in, seq2_target, seq2_out],
+                 [seq3_in, seq3_target, seq3_out]]
+
+            Stagger and add up the groups of samples:
+
+            [seq1_in, seq1_target, (seq1_out + seq2_in), seq2_target, ...]
         '''
         # num_folds = (total_len - overlap) // (target + overlap)
diff --git a/paddlespeech/t2s/modules/causal_conv.py b/paddlespeech/t2s/modules/causal_conv.py
index c0d4f955..3abccc15 100644
--- a/paddlespeech/t2s/modules/causal_conv.py
+++ b/paddlespeech/t2s/modules/causal_conv.py
@@ -41,14 +41,10 @@ class CausalConv1D(nn.Layer):
     def forward(self, x):
         """Calculate forward propagation.
-        Parameters
-        ----------
-        x : Tensor
-            Input tensor (B, in_channels, T).
-        Returns
-        ----------
-        Tensor
-            Output tensor (B, out_channels, T).
+        Args:
+            x (Tensor): Input tensor (B, in_channels, T).
+        Returns:
+            Tensor: Output tensor (B, out_channels, T).
         """
         return self.conv(self.pad(x))[:, :, :x.shape[2]]
@@ -70,13 +66,9 @@ class CausalConv1DTranspose(nn.Layer):
     def forward(self, x):
         """Calculate forward propagation.
-        Parameters
-        ----------
-        x : Tensor
-            Input tensor (B, in_channels, T_in).
-        Returns
-        ----------
-        Tensor
-            Output tensor (B, out_channels, T_out).
+        Args:
+            x (Tensor): Input tensor (B, in_channels, T_in).
+        Returns:
+            Tensor: Output tensor (B, out_channels, T_out).
         """
         return self.deconv(x)[:, :, :-self.stride]
diff --git a/paddlespeech/t2s/modules/conformer/convolution.py b/paddlespeech/t2s/modules/conformer/convolution.py
index e4a6c8c6..185c62fb 100644
--- a/paddlespeech/t2s/modules/conformer/convolution.py
+++ b/paddlespeech/t2s/modules/conformer/convolution.py
@@ -18,12 +18,10 @@ from paddle import nn
 class ConvolutionModule(nn.Layer):
     """ConvolutionModule in Conformer model.
-    Parameters
-    ----------
-    channels : int
-        The number of channels of conv layers.
-    kernel_size : int
-        Kernerl size of conv layers.
+
+    Args:
+        channels (int): The number of channels of conv layers.
+        kernel_size (int): Kernel size of conv layers.
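The CausalConv1D hunk above keeps the pad-then-trim idiom `self.conv(self.pad(x))[:, :, :x.shape[2]]`. A minimal paddle sketch of that pattern, with hypothetical sizes (kernel_size=3, dilation=1, so the left pad is (kernel_size - 1) * dilation = 2):

import paddle
import paddle.nn as nn

pad = nn.Pad1D([2, 0], mode="constant")   # pad only on the left
conv = nn.Conv1D(in_channels=1, out_channels=1, kernel_size=3)

x = paddle.randn([1, 1, 8])
y = conv(pad(x))[:, :, :x.shape[2]]       # same trim as CausalConv1D.forward

# Output length matches input length, and y[t] only depends on x[0..t],
# which is what makes the convolution causal.
assert y.shape == x.shape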
""" def __init__(self, channels, kernel_size, activation=nn.ReLU(), bias=True): @@ -59,14 +57,11 @@ class ConvolutionModule(nn.Layer): def forward(self, x): """Compute convolution module. - Parameters - ---------- - x : paddle.Tensor - Input tensor (#batch, time, channels). - Returns - ---------- - paddle.Tensor - Output tensor (#batch, time, channels). + + Args: + x (Tensor): Input tensor (#batch, time, channels). + Returns: + Tensor: Output tensor (#batch, time, channels). """ # exchange the temporal dimension and the feature dimension x = x.transpose([0, 2, 1]) diff --git a/paddlespeech/t2s/modules/conformer/encoder_layer.py b/paddlespeech/t2s/modules/conformer/encoder_layer.py index 2949dc37..61c32612 100644 --- a/paddlespeech/t2s/modules/conformer/encoder_layer.py +++ b/paddlespeech/t2s/modules/conformer/encoder_layer.py @@ -21,38 +21,29 @@ from paddlespeech.t2s.modules.layer_norm import LayerNorm class EncoderLayer(nn.Layer): """Encoder layer module. - Parameters - ---------- - size : int - Input dimension. - self_attn : nn.Layer - Self-attention module instance. - `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` instance - can be used as the argument. - feed_forward : nn.Layer - Feed-forward module instance. - `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance - can be used as the argument. - feed_forward_macaron : nn.Layer - Additional feed-forward module instance. - `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance - can be used as the argument. - conv_module : nn.Layer - Convolution module instance. - `ConvlutionModule` instance can be used as the argument. - dropout_rate : float - Dropout rate. - normalize_before : bool - Whether to use layer_norm before the first block. - concat_after : bool - Whether to concat attention layer's input and output. - if True, additional linear will be applied. - i.e. x -> x + linear(concat(x, att(x))) - if False, no additional linear will be applied. i.e. x -> x + att(x) - stochastic_depth_rate : float - Proability to skip this layer. - During training, the layer may skip residual computation and return input - as-is with given probability. + + Args: + size (int): Input dimension. + self_attn (nn.Layer): Self-attention module instance. + `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` instance + can be used as the argument. + feed_forward (nn.Layer): Feed-forward module instance. + `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance + can be used as the argument. + feed_forward_macaron (nn.Layer): Additional feed-forward module instance. + `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance + can be used as the argument. + conv_module (nn.Layer): Convolution module instance. + `ConvlutionModule` instance can be used as the argument. + dropout_rate (float): Dropout rate. + normalize_before (bool): Whether to use layer_norm before the first block. + concat_after (bool): Whether to concat attention layer's input and output. + if True, additional linear will be applied. + i.e. x -> x + linear(concat(x, att(x))) + if False, no additional linear will be applied. i.e. x -> x + att(x) + stochastic_depth_rate (float): Proability to skip this layer. + During training, the layer may skip residual computation and return input + as-is with given probability. """ def __init__( @@ -93,22 +84,17 @@ class EncoderLayer(nn.Layer): def forward(self, x_input, mask, cache=None): """Compute encoded features. 
- Parameters - ---------- - x_input : Union[Tuple, paddle.Tensor] - Input tensor w/ or w/o pos emb. - - w/ pos emb: Tuple of tensors [(#batch, time, size), (1, time, size)]. - - w/o pos emb: Tensor (#batch, time, size). - mask : paddle.Tensor - Mask tensor for the input (#batch, time). - cache paddle.Tensor - Cache tensor of the input (#batch, time - 1, size). - Returns - ---------- - paddle.Tensor - Output tensor (#batch, time, size). - paddle.Tensor - Mask tensor (#batch, time). + + Args: + x_input(Union[Tuple, Tensor]): Input tensor w/ or w/o pos emb. + - w/ pos emb: Tuple of tensors [(#batch, time, size), (1, time, size)]. + - w/o pos emb: Tensor (#batch, time, size). + mask(Tensor): Mask tensor for the input (#batch, time). + cache (Tensor): + + Returns: + Tensor: Output tensor (#batch, time, size). + Tensor: Mask tensor (#batch, time). """ if isinstance(x_input, tuple): x, pos_emb = x_input[0], x_input[1] diff --git a/paddlespeech/t2s/modules/conv.py b/paddlespeech/t2s/modules/conv.py index 68766d5e..aa875bd5 100644 --- a/paddlespeech/t2s/modules/conv.py +++ b/paddlespeech/t2s/modules/conv.py @@ -40,36 +40,29 @@ class Conv1dCell(nn.Conv1D): 2. padding must be a causal padding (recpetive_field - 1, 0). Thus, these arguments are removed from the ``__init__`` method of this class. - - Parameters - ---------- - in_channels: int - The feature size of the input. - out_channels: int - The feature size of the output. - kernel_size: int or Tuple[int] - The size of the kernel. - dilation: int or Tuple[int] - The dilation of the convolution, by default 1 - weight_attr: ParamAttr, Initializer, str or bool, optional - The parameter attribute of the convolution kernel, by default None. - bias_attr: ParamAttr, Initializer, str or bool, optional - The parameter attribute of the bias. If ``False``, this layer does not - have a bias, by default None. - - Examples - -------- - >>> cell = Conv1dCell(3, 4, kernel_size=5) - >>> inputs = [paddle.randn([4, 3]) for _ in range(16)] - >>> outputs = [] - >>> cell.eval() - >>> cell.start_sequence() - >>> for xt in inputs: - >>> outputs.append(cell.add_input(xt)) - >>> len(outputs)) - 16 - >>> outputs[0].shape - [4, 4] + + Args: + in_channels (int): The feature size of the input. + out_channels (int): The feature size of the output. + kernel_size (int or Tuple[int]): The size of the kernel. + dilation (int or Tuple[int]): The dilation of the convolution, by default 1 + weight_attr (ParamAttr, Initializer, str or bool, optional) : The parameter attribute of the convolution kernel, + by default None. + bias_attr (ParamAttr, Initializer, str or bool, optional):The parameter attribute of the bias. + If ``False``, this layer does not have a bias, by default None. + + Examples: + >>> cell = Conv1dCell(3, 4, kernel_size=5) + >>> inputs = [paddle.randn([4, 3]) for _ in range(16)] + >>> outputs = [] + >>> cell.eval() + >>> cell.start_sequence() + >>> for xt in inputs: + >>> outputs.append(cell.add_input(xt)) + >>> len(outputs)) + 16 + >>> outputs[0].shape + [4, 4] """ def __init__(self, @@ -103,15 +96,13 @@ class Conv1dCell(nn.Conv1D): def start_sequence(self): """Prepare the layer for a series of incremental forward. - Warnings - --------- - This method should be called before a sequence of calls to - ``add_input``. + Warnings: + This method should be called before a sequence of calls to + ``add_input``. - Raises - ------ - Exception - If this method is called when the layer is in training mode. 
+        Raises:
+            Exception: If this method is called when the layer is in training mode.
         """
         if self.training:
             raise Exception("only use start_sequence in evaluation")
@@ -130,10 +121,9 @@ class Conv1dCell(nn.Conv1D):
     def initialize_buffer(self, x_t):
         """Initialize the buffer for the step input.
-        Parameters
-        ----------
-        x_t : Tensor [shape=(batch_size, in_channels)]
-            The step input.
+        Args:
+            x_t (Tensor): The step input. shape=(batch_size, in_channels)
+
         """
         batch_size, _ = x_t.shape
         self._buffer = paddle.zeros(
@@ -143,26 +133,22 @@ class Conv1dCell(nn.Conv1D):
     def update_buffer(self, x_t):
         """Shift the buffer by one step.
-        Parameters
-        ----------
-        x_t : Tensor [shape=(batch_size, in_channels)]
-            The step input.
+        Args:
+            x_t (Tensor): The step input. shape=(batch_size, in_channels)
+
         """
         self._buffer = paddle.concat(
             [self._buffer[:, :, 1:], paddle.unsqueeze(x_t, -1)], -1)

     def add_input(self, x_t):
         """Add step input and compute step output.
-
-        Parameters
-        -----------
-        x_t : Tensor [shape=(batch_size, in_channels)]
-            The step input.
-
-        Returns
-        -------
-        y_t :Tensor [shape=(batch_size, out_channels)]
-            The step output.
+
+        Args:
+            x_t (Tensor): The step input. shape=(batch_size, in_channels)
+
+        Returns:
+            y_t (Tensor): The step output. shape=(batch_size, out_channels)
+
         """
         batch_size = x_t.shape[0]
         if self.receptive_field > 1:
@@ -186,33 +172,26 @@ class Conv1dCell(nn.Conv1D):
 class Conv1dBatchNorm(nn.Layer):
     """A Conv1D Layer followed by a BatchNorm1D.
-    Parameters
-    ----------
-    in_channels : int
-        The feature size of the input.
-    out_channels : int
-        The feature size of the output.
-    kernel_size : int
-        The size of the convolution kernel.
-    stride : int, optional
-        The stride of the convolution, by default 1.
-    padding : int, str or Tuple[int], optional
-        The padding of the convolution.
-        If int, a symmetrical padding is applied before convolution;
-        If str, it should be "same" or "valid";
-        If Tuple[int], its length should be 2, meaning
-        ``(pad_before, pad_after)``, by default 0.
-    weight_attr : ParamAttr, Initializer, str or bool, optional
-        The parameter attribute of the convolution kernel, by default None.
-    bias_attr : ParamAttr, Initializer, str or bool, optional
-        The parameter attribute of the bias of the convolution, by default
-        None.
-    data_format : str ["NCL" or "NLC"], optional
-        The data layout of the input, by default "NCL"
-    momentum : float, optional
-        The momentum of the BatchNorm1D layer, by default 0.9
-    epsilon : [type], optional
-        The epsilon of the BatchNorm1D layer, by default 1e-05
+    Args:
+        in_channels (int): The feature size of the input.
+        out_channels (int): The feature size of the output.
+        kernel_size (int): The size of the convolution kernel.
+        stride (int, optional): The stride of the convolution, by default 1.
+        padding (int, str or Tuple[int], optional):
+            The padding of the convolution.
+            If int, a symmetrical padding is applied before convolution;
+            If str, it should be "same" or "valid";
+            If Tuple[int], its length should be 2, meaning
+            ``(pad_before, pad_after)``, by default 0.
+        weight_attr (ParamAttr, Initializer, str or bool, optional):
+            The parameter attribute of the convolution kernel,
+            by default None.
+        bias_attr (ParamAttr, Initializer, str or bool, optional):
+            The parameter attribute of the bias of the convolution,
+            by default None.
+        data_format (str ["NCL" or "NLC"], optional): The data layout of the input, by default "NCL".
+        momentum (float, optional): The momentum of the BatchNorm1D layer, by default 0.9.
+        epsilon (float, optional): The epsilon of the BatchNorm1D layer, by default 1e-05.
    """

    def __init__(self,
@@ -244,16 +223,15 @@ class Conv1dBatchNorm(nn.Layer):
    def forward(self, x):
        """Forward pass of the Conv1dBatchNorm layer.
-
-        Parameters
-        ----------
-        x : Tensor [shape=(B, C_in, T_in) or (B, T_in, C_in)]
-            The input tensor. Its data layout depends on ``data_format``.
-
-        Returns
-        -------
-        Tensor [shape=(B, C_out, T_out) or (B, T_out, C_out)]
-            The output tensor.
+
+        Args:
+            x (Tensor): The input tensor. Its data layout depends on ``data_format``.
+                shape=(B, C_in, T_in) or (B, T_in, C_in)
+
+        Returns:
+            Tensor: The output tensor.
+                shape=(B, C_out, T_out) or (B, T_out, C_out)
+
        """
        x = self.conv(x)
        x = self.bn(x)
diff --git a/paddlespeech/t2s/modules/geometry.py b/paddlespeech/t2s/modules/geometry.py
index a3d56f7d..01eb5ad0 100644
--- a/paddlespeech/t2s/modules/geometry.py
+++ b/paddlespeech/t2s/modules/geometry.py
@@ -17,24 +17,18 @@ import paddle

 def shuffle_dim(x, axis, perm=None):
    """Permute input tensor along aixs given the permutation or randomly.
+
+    Args:
+        x (Tensor): The input tensor.
+        axis (int): The axis to shuffle.
+        perm (List[int], ndarray, optional):
+            The order to reorder the tensor along the ``axis``-th dimension.
+            It is a permutation of ``[0, d)``, where d is the size of the
+            ``axis``-th dimension of the input tensor. If not provided,
+            a random permutation is used. Defaults to None.

-    Parameters
-    ----------
-    x : Tensor
-        The input tensor.
-    axis : int
-        The axis to shuffle.
-    perm : List[int], ndarray, optional
-        The order to reorder the tensor along the ``axis``-th dimension.
-
-        It is a permutation of ``[0, d)``, where d is the size of the
-        ``axis``-th dimension of the input tensor. If not provided,
-        a random permutation is used. Defaults to None.
-
-    Returns
-    ---------
-    Tensor
-        The shuffled tensor, which has the same shape as x does.
+    Returns:
+        Tensor: The shuffled tensor, which has the same shape as x does.
    """
    size = x.shape[axis]
    if perm is not None and len(perm) != size:
diff --git a/paddlespeech/t2s/modules/layer_norm.py b/paddlespeech/t2s/modules/layer_norm.py
index 4edd22c9..088b98e0 100644
--- a/paddlespeech/t2s/modules/layer_norm.py
+++ b/paddlespeech/t2s/modules/layer_norm.py
@@ -18,13 +18,9 @@ from paddle import nn

 class LayerNorm(nn.LayerNorm):
    """Layer normalization module.
-
-    Parameters
-    ----------
-    nout : int
-        Output dim size.
-    dim : int
-        Dimension to be normalized.
+    Args:
+        nout (int): Output dim size.
+        dim (int): Dimension to be normalized.
    """

    def __init__(self, nout, dim=-1):
@@ -35,15 +31,11 @@ class LayerNorm(nn.LayerNorm):
    def forward(self, x):
        """Apply layer normalization.

-        Parameters
-        ----------
-        x : paddle.Tensor
-            Input tensor.
+        Args:
+            x (Tensor): Input tensor.

-        Returns
-        ----------
-        paddle.Tensor
-            Normalized tensor.
+        Returns:
+            Tensor: Normalized tensor.
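A minimal sketch of what the LayerNorm subclass above does for ``dim != -1`` (illustrative, assuming only the public paddle API): the target axis is transposed to the end, the built-in last-axis LayerNorm is applied, and the layout is restored.

    import paddle
    from paddle import nn

    x = paddle.randn([2, 80, 100])   # (B, C, T)
    norm = nn.LayerNorm(80)          # paddle's LayerNorm normalizes the last axis
    # normalize axis 1 by transposing it to the end and back
    y = norm(x.transpose([0, 2, 1])).transpose([0, 2, 1])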
""" if self.dim == -1: diff --git a/paddlespeech/t2s/modules/losses.py b/paddlespeech/t2s/modules/losses.py index 618f444a..93644e24 100644 --- a/paddlespeech/t2s/modules/losses.py +++ b/paddlespeech/t2s/modules/losses.py @@ -118,16 +118,13 @@ def discretized_mix_logistic_loss(y_hat, def sample_from_discretized_mix_logistic(y, log_scale_min=None): """ Sample from discretized mixture of logistic distributions - Parameters - ---------- - y : Tensor - (B, C, T) - log_scale_min : float - Log scale minimum value - Returns - ---------- - Tensor - sample in range of [-1, 1]. + + Args: + y(Tensor): (B, C, T) + log_scale_min(float, optional): (Default value = None) + + Returns: + Tensor: sample in range of [-1, 1]. """ if log_scale_min is None: log_scale_min = float(np.log(1e-14)) @@ -181,14 +178,10 @@ class GuidedAttentionLoss(nn.Layer): def __init__(self, sigma=0.4, alpha=1.0, reset_always=True): """Initialize guided attention loss module. - Parameters - ---------- - sigma : float, optional - Standard deviation to control how close attention to a diagonal. - alpha : float, optional - Scaling coefficient (lambda). - reset_always : bool, optional - Whether to always reset masks. + Args: + sigma (float, optional): Standard deviation to control how close attention to a diagonal. + alpha (float, optional): Scaling coefficient (lambda). + reset_always (bool, optional): Whether to always reset masks. """ super().__init__() @@ -205,19 +198,13 @@ class GuidedAttentionLoss(nn.Layer): def forward(self, att_ws, ilens, olens): """Calculate forward propagation. - Parameters - ---------- - att_ws : Tensor - Batch of attention weights (B, T_max_out, T_max_in). - ilens : Tensor(int64) - Batch of input lenghts (B,). - olens : Tensor(int64) - Batch of output lenghts (B,). - - Returns - ---------- - Tensor - Guided attention loss value. + Args: + att_ws(Tensor): Batch of attention weights (B, T_max_out, T_max_in). + ilens(Tensor(int64)): Batch of input lenghts (B,). + olens(Tensor(int64)): Batch of output lenghts (B,). + + Returns: + Tensor: Guided attention loss value. """ if self.guided_attn_masks is None: @@ -282,39 +269,33 @@ class GuidedAttentionLoss(nn.Layer): def _make_masks(ilens, olens): """Make masks indicating non-padded part. - Parameters - ---------- - ilens : Tensor(int64) or List - Batch of lengths (B,). - olens : Tensor(int64) or List - Batch of lengths (B,). - - Returns - ---------- - Tensor - Mask tensor indicating non-padded part. - - Examples - ---------- - >>> ilens, olens = [5, 2], [8, 5] - >>> _make_mask(ilens, olens) - tensor([[[1, 1, 1, 1, 1], - [1, 1, 1, 1, 1], - [1, 1, 1, 1, 1], - [1, 1, 1, 1, 1], - [1, 1, 1, 1, 1], - [1, 1, 1, 1, 1], - [1, 1, 1, 1, 1], - [1, 1, 1, 1, 1]], - - [[1, 1, 0, 0, 0], - [1, 1, 0, 0, 0], - [1, 1, 0, 0, 0], - [1, 1, 0, 0, 0], - [1, 1, 0, 0, 0], - [0, 0, 0, 0, 0], - [0, 0, 0, 0, 0], - [0, 0, 0, 0, 0]]], dtype=paddle.uint8) + Args: + ilens(Tensor(int64) or List): Batch of lengths (B,). + olens(Tensor(int64) or List): Batch of lengths (B,). + + Returns: + Tensor: Mask tensor indicating non-padded part. 
+
+        Examples:
+            >>> ilens, olens = [5, 2], [8, 5]
+            >>> _make_masks(ilens, olens)
+            tensor([[[1, 1, 1, 1, 1],
+                     [1, 1, 1, 1, 1],
+                     [1, 1, 1, 1, 1],
+                     [1, 1, 1, 1, 1],
+                     [1, 1, 1, 1, 1],
+                     [1, 1, 1, 1, 1],
+                     [1, 1, 1, 1, 1],
+                     [1, 1, 1, 1, 1]],
+
+                    [[1, 1, 0, 0, 0],
+                     [1, 1, 0, 0, 0],
+                     [1, 1, 0, 0, 0],
+                     [1, 1, 0, 0, 0],
+                     [1, 1, 0, 0, 0],
+                     [0, 0, 0, 0, 0],
+                     [0, 0, 0, 0, 0],
+                     [0, 0, 0, 0, 0]]], dtype=paddle.uint8)
        """
        # (B, T_in)
@@ -330,34 +311,24 @@ class GuidedAttentionLoss(nn.Layer):
 class GuidedMultiHeadAttentionLoss(GuidedAttentionLoss):
    """Guided attention loss function module for multi head attention.

-    Parameters
-    ----------
-    sigma : float, optional
-        Standard deviation to controlGuidedAttentionLoss
-        how close attention to a diagonal.
-    alpha : float, optional
-        Scaling coefficient (lambda).
-    reset_always : bool, optional
-        Whether to always reset masks.
+    Args:
+        sigma (float, optional): Standard deviation to control
+            how close the attention is to a diagonal.
+        alpha (float, optional): Scaling coefficient (lambda).
+        reset_always (bool, optional): Whether to always reset masks.
    """

    def forward(self, att_ws, ilens, olens):
        """Calculate forward propagation.

-        Parameters
-        ----------
-        att_ws : Tensor
-            Batch of multi head attention weights (B, H, T_max_out, T_max_in).
-        ilens : Tensor
-            Batch of input lenghts (B,).
-        olens : Tensor
-            Batch of output lenghts (B,).
-
-        Returns
-        ----------
-        Tensor
-            Guided attention loss value.
+        Args:
+            att_ws(Tensor): Batch of multi head attention weights (B, H, T_max_out, T_max_in).
+            ilens(Tensor): Batch of input lengths (B,).
+            olens(Tensor): Batch of output lengths (B,).
+
+        Returns:
+            Tensor: Guided attention loss value.
        """
        if self.guided_attn_masks is None:
@@ -382,14 +353,11 @@ class Tacotron2Loss(nn.Layer):
                 use_weighted_masking=False,
                 bce_pos_weight=20.0):
        """Initialize Tactoron2 loss module.
-        Parameters
-        ----------
-        use_masking : bool
-            Whether to apply masking for padded part in loss calculation.
-        use_weighted_masking : bool
-            Whether to apply weighted masking in loss calculation.
-        bce_pos_weight : float
-            Weight of positive sample of stop token.
+
+        Args:
+            use_masking (bool): Whether to apply masking for padded part in loss calculation.
+            use_weighted_masking (bool): Whether to apply weighted masking in loss calculation.
+            bce_pos_weight (float): Weight of positive sample of stop token.
        """
        super().__init__()
        assert (use_masking != use_weighted_masking) or not use_masking
@@ -405,28 +373,19 @@ class Tacotron2Loss(nn.Layer):
    def forward(self, after_outs, before_outs, logits, ys, stop_labels, olens):
        """Calculate forward propagation.

-        Parameters
-        ----------
-        after_outs : Tensor
-            Batch of outputs after postnets (B, Lmax, odim).
-        before_outs : Tensor
-            Batch of outputs before postnets (B, Lmax, odim).
-        logits : Tensor
-            Batch of stop logits (B, Lmax).
-        ys : Tensor
-            Batch of padded target features (B, Lmax, odim).
-        stop_labels : Tensor(int64)
-            Batch of the sequences of stop token labels (B, Lmax).
-        olens : Tensor(int64)
-            Batch of the lengths of each target (B,).
-        Returns
-        ----------
-        Tensor
-            L1 loss value.
-        Tensor
-            Mean square error loss value.
-        Tensor
-            Binary cross entropy loss value.
+
+        Args:
+            after_outs(Tensor): Batch of outputs after postnets (B, Lmax, odim).
+            before_outs(Tensor): Batch of outputs before postnets (B, Lmax, odim).
+            logits(Tensor): Batch of stop logits (B, Lmax).
+            ys(Tensor): Batch of padded target features (B, Lmax, odim).
+            stop_labels(Tensor(int64)): Batch of the sequences of stop token labels (B, Lmax).
+            olens(Tensor(int64)): Batch of the lengths of each target (B,).
+
+        Returns:
+            Tensor: L1 loss value.
+            Tensor: Mean square error loss value.
+            Tensor: Binary cross entropy loss value.
        """
        # make mask and apply it
        if self.use_masking:
@@ -513,28 +472,20 @@ def stft(x,
         center=True,
         pad_mode='reflect'):
    """Perform STFT and convert to magnitude spectrogram.
-    Parameters
-    ----------
-    x : Tensor
-        Input signal tensor (B, T).
-    fft_size : int
-        FFT size.
-    hop_size : int
-        Hop size.
-    win_length : int
-    window : str, optional
-    window : str
-        Name of window function, see `scipy.signal.get_window` for more
-        details. Defaults to "hann".
-    center : bool, optional
-        center (bool, optional): Whether to pad `x` to make that the
-        :math:`t \times hop\\_length` at the center of :math:`t`-th frame. Default: `True`.
-    pad_mode : str, optional
-        Choose padding pattern when `center` is `True`.
-    Returns
-    ----------
-    Tensor:
-        Magnitude spectrogram (B, #frames, fft_size // 2 + 1).
+
+    Args:
+        x(Tensor): Input signal tensor (B, T).
+        fft_size(int): FFT size.
+        hop_size(int): Hop size.
+        win_length(int, optional): Window length. (Default value = None)
+        window(str, optional): Name of window function, see `scipy.signal.get_window` for more
+            details. Defaults to "hann".
+        center(bool, optional): Whether to pad `x` to make that the
+            :math:`t \times hop\\_length` at the center of :math:`t`-th frame. Default: `True`.
+        pad_mode(str, optional): Choose padding pattern when `center` is `True`.
+            (Default value = 'reflect')
+
+    Returns:
+        Tensor: Magnitude spectrogram (B, #frames, fft_size // 2 + 1).
    """
    # calculate window
    window = signal.get_window(window, win_length, fftbins=True)
@@ -564,16 +515,11 @@ class SpectralConvergenceLoss(nn.Layer):
    def forward(self, x_mag, y_mag):
        """Calculate forward propagation.

-        Parameters
-        ----------
-        x_mag : Tensor
-            Magnitude spectrogram of predicted signal (B, #frames, #freq_bins).
-        y_mag : Tensor)
-            Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins).
-        Returns
-        ----------
-        Tensor
-            Spectral convergence loss value.
+        Args:
+            x_mag (Tensor): Magnitude spectrogram of predicted signal (B, #frames, #freq_bins).
+            y_mag (Tensor): Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins).
+        Returns:
+            Tensor: Spectral convergence loss value.
        """
        return paddle.norm(
            y_mag - x_mag, p="fro") / paddle.clip(
@@ -590,16 +536,11 @@ class LogSTFTMagnitudeLoss(nn.Layer):
    def forward(self, x_mag, y_mag):
        """Calculate forward propagation.

-        Parameters
-        ----------
-        x_mag : Tensor
-            Magnitude spectrogram of predicted signal (B, #frames, #freq_bins).
-        y_mag : Tensor
-            Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins).
-        Returns
-        ----------
-        Tensor
-            Log STFT magnitude loss value.
+        Args:
+            x_mag (Tensor): Magnitude spectrogram of predicted signal (B, #frames, #freq_bins).
+            y_mag (Tensor): Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins).
+        Returns:
+            Tensor: Log STFT magnitude loss value.
        """
        return F.l1_loss(
            paddle.log(paddle.clip(y_mag, min=self.epsilon)),
@@ -625,18 +566,12 @@ class STFTLoss(nn.Layer):
    def forward(self, x, y):
        """Calculate forward propagation.

-        Parameters
-        ----------
-        x : Tensor
-            Predicted signal (B, T).
-        y : Tensor
-            Groundtruth signal (B, T).
-        Returns
-        ----------
-        Tensor
-            Spectral convergence loss value.
-        Tensor
-            Log STFT magnitude loss value.
+        Args:
+            x (Tensor): Predicted signal (B, T).
+ y (Tensor): Groundtruth signal (B, T). + Returns: + Tensor: Spectral convergence loss value. + Tensor: Log STFT magnitude loss value. """ x_mag = stft(x, self.fft_size, self.shift_size, self.win_length, self.window) @@ -658,16 +593,11 @@ class MultiResolutionSTFTLoss(nn.Layer): win_lengths=[600, 1200, 240], window="hann", ): """Initialize Multi resolution STFT loss module. - Parameters - ---------- - fft_sizes : list - List of FFT sizes. - hop_sizes : list - List of hop sizes. - win_lengths : list - List of window lengths. - window : str - Window function type. + Args: + fft_sizes (list): List of FFT sizes. + hop_sizes (list): List of hop sizes. + win_lengths (list): List of window lengths. + window (str): Window function type. """ super().__init__() assert len(fft_sizes) == len(hop_sizes) == len(win_lengths) @@ -677,18 +607,13 @@ class MultiResolutionSTFTLoss(nn.Layer): def forward(self, x, y): """Calculate forward propagation. - Parameters - ---------- - x : Tensor - Predicted signal (B, T) or (B, #subband, T). - y : Tensor - Groundtruth signal (B, T) or (B, #subband, T). - Returns - ---------- - Tensor - Multi resolution spectral convergence loss value. - Tensor - Multi resolution log STFT magnitude loss value. + + Args: + x (Tensor): Predicted signal (B, T) or (B, #subband, T). + y (Tensor): Groundtruth signal (B, T) or (B, #subband, T). + Returns: + Tensor: Multi resolution spectral convergence loss value. + Tensor: Multi resolution log STFT magnitude loss value. """ if len(x.shape) == 3: # (B, C, T) -> (B x C, T) @@ -725,14 +650,10 @@ class GeneratorAdversarialLoss(nn.Layer): def forward(self, outputs): """Calcualate generator adversarial loss. - Parameters - ---------- - outputs: Tensor or List - Discriminator outputs or list of discriminator outputs. - Returns - ---------- - Tensor - Generator adversarial loss value. + Args: + outputs (Tensor or List): Discriminator outputs or list of discriminator outputs. + Returns: + Tensor: Generator adversarial loss value. """ if isinstance(outputs, (tuple, list)): adv_loss = 0.0 @@ -772,20 +693,15 @@ class DiscriminatorAdversarialLoss(nn.Layer): def forward(self, outputs_hat, outputs): """Calcualate discriminator adversarial loss. - Parameters - ---------- - outputs_hat : Tensor or list - Discriminator outputs or list of - discriminator outputs calculated from generator outputs. - outputs : Tensor or list - Discriminator outputs or list of - discriminator outputs calculated from groundtruth. - Returns - ---------- - Tensor - Discriminator real loss value. - Tensor - Discriminator fake loss value. + + Args: + outputs_hat (Tensor or list): Discriminator outputs or list of + discriminator outputs calculated from generator outputs. + outputs (Tensor or list): Discriminator outputs or list of + discriminator outputs calculated from groundtruth. + Returns: + Tensor: Discriminator real loss value. + Tensor: Discriminator fake loss value. """ if isinstance(outputs, (tuple, list)): real_loss = 0.0 @@ -868,17 +784,13 @@ def ssim(img1, img2, window_size=11, size_average=True): def weighted_mean(input, weight): """Weighted mean. It can also be used as masked mean. - Parameters - ----------- - input : Tensor - The input tensor. - weight : Tensor - The weight tensor with broadcastable shape with the input. - - Returns - ---------- - Tensor [shape=(1,)] - Weighted mean tensor with the same dtype as input. + Args: + input(Tensor): The input tensor. + weight(Tensor): The weight tensor with broadcastable shape with the input. 
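An illustrative usage sketch of the multi-resolution STFT criterion documented above, assuming the class definitions in this file (a hedged example, not part of the patch):

    import paddle

    criterion = MultiResolutionSTFTLoss()      # default FFT/hop/window sizes
    y_hat = paddle.randn([4, 16000])           # predicted waveforms (B, T)
    y = paddle.randn([4, 16000])               # reference waveforms (B, T)
    sc_loss, mag_loss = criterion(y_hat, y)    # one term per resolution, averaged
    loss = sc_loss + mag_loss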
+
+        Returns:
+            Tensor: Weighted mean tensor with the same dtype as input. shape=(1,)
+
        """
        weight = paddle.cast(weight, input.dtype)
        # paddle.Tensor.size is different with torch.size() and has been overrided in s2t.__init__
@@ -889,20 +801,15 @@ def weighted_mean(input, weight):
 def masked_l1_loss(prediction, target, mask):
    """Compute maksed L1 loss.

-    Parameters
-    ----------
-    prediction : Tensor
-        The prediction.
-    target : Tensor
-        The target. The shape should be broadcastable to ``prediction``.
-    mask : Tensor
-        The mask. The shape should be broadcatable to the broadcasted shape of
-        ``prediction`` and ``target``.
-
-    Returns
-    -------
-    Tensor [shape=(1,)]
-        The masked L1 loss.
+    Args:
+        prediction(Tensor): The prediction.
+        target(Tensor): The target. The shape should be broadcastable to ``prediction``.
+        mask(Tensor): The mask. The shape should be broadcastable to the broadcasted shape of
+            ``prediction`` and ``target``.
+
+    Returns:
+        Tensor: The masked L1 loss. shape=(1,)
+
    """
    abs_error = F.l1_loss(prediction, target, reduction='none')
    loss = weighted_mean(abs_error, mask)
@@ -975,14 +882,11 @@ class MelSpectrogram(nn.Layer):
    def forward(self, x):
        """Calculate Mel-spectrogram.

-        Parameters
-        ----------
-        x : Tensor
-            Input waveform tensor (B, T) or (B, 1, T).
-        Returns
-        ----------
-        Tensor
-            Mel-spectrogram (B, #mels, #frames).
+        Args:
+            x (Tensor): Input waveform tensor (B, T) or (B, 1, T).
+        Returns:
+            Tensor: Mel-spectrogram (B, #mels, #frames).
        """
        if len(x.shape) == 3:
            # (B, C, T) -> (B*C, T)
@@ -1047,16 +951,12 @@ class MelSpectrogramLoss(nn.Layer):
    def forward(self, y_hat, y):
        """Calculate Mel-spectrogram loss.

-        Parameters
-        ----------
-        y_hat : Tensor
-            Generated single tensor (B, 1, T).
-        y : Tensor
-            Groundtruth single tensor (B, 1, T).
-        Returns
-        ----------
-        Tensor
-            Mel-spectrogram loss value.
+        Args:
+            y_hat(Tensor): Generated signal tensor (B, 1, T).
+            y(Tensor): Groundtruth signal tensor (B, 1, T).
+
+        Returns:
+            Tensor: Mel-spectrogram loss value.
        """
        mel_hat = self.mel_spectrogram(y_hat)
        mel = self.mel_spectrogram(y)
@@ -1081,18 +981,14 @@ class FeatureMatchLoss(nn.Layer):
    def forward(self, feats_hat, feats):
        """Calcualate feature matching loss.

-        Parameters
-        ----------
-        feats_hat : list
-            List of list of discriminator outputs
-            calcuated from generater outputs.
-        feats : list
-            List of list of discriminator outputs
-            calcuated from groundtruth.
-        Returns
-        ----------
-        Tensor
-            Feature matching loss value.
+
+        Args:
+            feats_hat(list): List of list of discriminator outputs
+                calculated from generator outputs.
+            feats(list): List of list of discriminator outputs
+                calculated from groundtruth.
+
+        Returns:
+            Tensor: Feature matching loss value.
        """
        feat_match_loss = 0.0
diff --git a/paddlespeech/t2s/modules/nets_utils.py b/paddlespeech/t2s/modules/nets_utils.py
index 3822b33d..4207d316 100644
--- a/paddlespeech/t2s/modules/nets_utils.py
+++ b/paddlespeech/t2s/modules/nets_utils.py
@@ -20,27 +20,21 @@ from typeguard import check_argument_types

 def pad_list(xs, pad_value):
    """Perform padding for the list of tensors.

-    Parameters
-    ----------
-    xs : List[Tensor]
-        List of Tensors [(T_1, `*`), (T_2, `*`), ..., (T_B, `*`)].
-    pad_value : float)
-        Value for padding.
-
-    Returns
-    ----------
-    Tensor
-        Padded tensor (B, Tmax, `*`).
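A short sketch of the masked-mean pattern behind weighted_mean and masked_l1_loss above (illustrative values, assuming only the paddle API): the elementwise L1 error is averaged over unmasked positions only, so padding never contributes.

    import paddle
    import paddle.nn.functional as F

    pred = paddle.randn([2, 5])
    target = paddle.randn([2, 5])
    mask = paddle.to_tensor([[1., 1., 1., 0., 0.],
                             [1., 1., 1., 1., 1.]])
    abs_err = F.l1_loss(pred, target, reduction="none")
    loss = (abs_err * mask).sum() / mask.sum()   # scalar, padded cells excluded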
-
-    Examples
-    ----------
-    >>> x = [paddle.ones([4]), paddle.ones([2]), paddle.ones([1])]
-    >>> x
-    [tensor([1., 1., 1., 1.]), tensor([1., 1.]), tensor([1.])]
-    >>> pad_list(x, 0)
-    tensor([[1., 1., 1., 1.],
-            [1., 1., 0., 0.],
-            [1., 0., 0., 0.]])
+    Args:
+        xs (List[Tensor]): List of Tensors [(T_1, `*`), (T_2, `*`), ..., (T_B, `*`)].
+        pad_value (float): Value for padding.
+
+    Returns:
+        Tensor: Padded tensor (B, Tmax, `*`).
+
+    Examples:
+        >>> x = [paddle.ones([4]), paddle.ones([2]), paddle.ones([1])]
+        >>> x
+        [tensor([1., 1., 1., 1.]), tensor([1., 1.]), tensor([1.])]
+        >>> pad_list(x, 0)
+        tensor([[1., 1., 1., 1.],
+                [1., 1., 0., 0.],
+                [1., 0., 0., 0.]])
    """
    n_batch = len(xs)
    max_len = max(x.shape[0] for x in xs)
@@ -55,25 +49,20 @@ def pad_list(xs, pad_value):
 def make_pad_mask(lengths, length_dim=-1):
    """Make mask tensor containing indices of padded part.

-    Parameters
-    ----------
-    lengths : LongTensor
-        Batch of lengths (B,).
-
-    Returns
-    ----------
-    Tensor(bool)
-        Mask tensor containing indices of padded part bool.
-
-    Examples
-    ----------
-    With only lengths.
-
-    >>> lengths = [5, 3, 2]
-    >>> make_non_pad_mask(lengths)
-    masks = [[0, 0, 0, 0 ,0],
-             [0, 0, 0, 1, 1],
-             [0, 0, 1, 1, 1]]
+    Args:
+        lengths (Tensor(int64)): Batch of lengths (B,).
+
+    Returns:
+        Tensor(bool): Mask tensor containing indices of padded part.
+
+    Examples:
+        With only lengths.
+
+        >>> lengths = [5, 3, 2]
+        >>> make_pad_mask(lengths)
+        masks = [[0, 0, 0, 0, 0],
+                 [0, 0, 0, 1, 1],
+                 [0, 0, 1, 1, 1]]
    """
    if length_dim == 0:
        raise ValueError("length_dim cannot be 0: {}".format(length_dim))
@@ -91,31 +80,24 @@ def make_pad_mask(lengths, length_dim=-1):
 def make_non_pad_mask(lengths, length_dim=-1):
    """Make mask tensor containing indices of non-padded part.

-    Parameters
-    ----------
-    lengths : LongTensor or List
-        Batch of lengths (B,).
-    xs : Tensor, optional
-        The reference tensor.
-        If set, masks will be the same shape as this tensor.
-    length_dim : int, optional
-        Dimension indicator of the above tensor.
-        See the example.
-
-    Returns
-    ----------
-    Tensor(bool)
-        mask tensor containing indices of padded part bool.
-
-    Examples
-    ----------
-    With only lengths.
-
-    >>> lengths = [5, 3, 2]
-    >>> make_non_pad_mask(lengths)
-    masks = [[1, 1, 1, 1 ,1],
-             [1, 1, 1, 0, 0],
-             [1, 1, 0, 0, 0]]
+    Args:
+        lengths (Tensor(int64) or List): Batch of lengths (B,).
+        xs (Tensor, optional): The reference tensor.
+            If set, masks will be the same shape as this tensor.
+        length_dim (int, optional): Dimension indicator of the above tensor.
+            See the example.
+
+    Returns:
+        Tensor(bool): Mask tensor containing indices of non-padded part.
+
+    Examples:
+        With only lengths.
+
+        >>> lengths = [5, 3, 2]
+        >>> make_non_pad_mask(lengths)
+        masks = [[1, 1, 1, 1, 1],
+                 [1, 1, 1, 0, 0],
+                 [1, 1, 0, 0, 0]]
    """
    return paddle.logical_not(make_pad_mask(lengths, length_dim))
@@ -127,12 +109,9 @@ def initialize(model: nn.Layer, init: str):

    Custom initialization routines can be implemented into submodules

-    Parameters
-    ----------
-    model : nn.Layer
-        Target.
-    init : str
-        Method of initialization.
+    Args:
+        model (nn.Layer): Target.
+        init (str): Method of initialization.
    """
    assert check_argument_types()
diff --git a/paddlespeech/t2s/modules/pqmf.py b/paddlespeech/t2s/modules/pqmf.py
index fb850a4d..9860da90 100644
--- a/paddlespeech/t2s/modules/pqmf.py
+++ b/paddlespeech/t2s/modules/pqmf.py
@@ -24,20 +24,16 @@ def design_prototype_filter(taps=62, cutoff_ratio=0.142, beta=9.0):
    """Design prototype filter for PQMF.
    This method is based on `A Kaiser window approach for the design of prototype
    filters of cosine modulated filterbanks`_.
-    Parameters
-    ----------
-    taps : int
-        The number of filter taps.
-    cutoff_ratio : float
-        Cut-off frequency ratio.
-    beta : float
-        Beta coefficient for kaiser window.
-    Returns
-    ----------
-    ndarray
-        Impluse response of prototype filter (taps + 1,).
-    .. _`A Kaiser window approach for the design of prototype filters of cosine modulated filterbanks`:
-        https://ieeexplore.ieee.org/abstract/document/681427
+
+    Args:
+        taps (int): The number of filter taps.
+        cutoff_ratio (float): Cut-off frequency ratio.
+        beta (float): Beta coefficient for kaiser window.
+    Returns:
+        ndarray:
+            Impulse response of prototype filter (taps + 1,).
+    .. _`A Kaiser window approach for the design of prototype filters of cosine modulated filterbanks`:
+        https://ieeexplore.ieee.org/abstract/document/681427
    """
    # check the arguments are valid
    assert taps % 2 == 0, "The number of taps mush be even number."
@@ -68,16 +64,12 @@ class PQMF(nn.Layer):
        """Initilize PQMF module.
        The cutoff_ratio and beta parameters are optimized for #subbands = 4.
        See dicussion in https://github.com/kan-bayashi/ParallelWaveGAN/issues/195.
-        Parameters
-        ----------
-        subbands : int
-            The number of subbands.
-        taps : int
-            The number of filter taps.
-        cutoff_ratio : float
-            Cut-off frequency ratio.
-        beta : float
-            Beta coefficient for kaiser window.
+
+        Args:
+            subbands (int): The number of subbands.
+            taps (int): The number of filter taps.
+            cutoff_ratio (float): Cut-off frequency ratio.
+            beta (float): Beta coefficient for kaiser window.
        """
        super().__init__()
@@ -110,28 +102,20 @@ class PQMF(nn.Layer):
    def analysis(self, x):
        """Analysis with PQMF.

-        Parameters
-        ----------
-        x : Tensor
-            Input tensor (B, 1, T).
-        Returns
-        ----------
-        Tensor
-            Output tensor (B, subbands, T // subbands).
+        Args:
+            x (Tensor): Input tensor (B, 1, T).
+        Returns:
+            Tensor: Output tensor (B, subbands, T // subbands).
        """
        x = F.conv1d(self.pad_fn(x), self.analysis_filter)
        return F.conv1d(x, self.updown_filter, stride=self.subbands)

    def synthesis(self, x):
        """Synthesis with PQMF.
-        Parameters
-        ----------
-        x : Tensor
-            Input tensor (B, subbands, T // subbands).
-        Returns
-        ----------
-        Tensor
-            Output tensor (B, 1, T).
+        Args:
+            x (Tensor): Input tensor (B, subbands, T // subbands).
+        Returns:
+            Tensor: Output tensor (B, 1, T).
        """
        x = F.conv1d_transpose(
            x, self.updown_filter * self.subbands, stride=self.subbands)
diff --git a/paddlespeech/t2s/modules/predictor/duration_predictor.py b/paddlespeech/t2s/modules/predictor/duration_predictor.py
index 6b7c6a6b..33ed575b 100644
--- a/paddlespeech/t2s/modules/predictor/duration_predictor.py
+++ b/paddlespeech/t2s/modules/predictor/duration_predictor.py
@@ -49,20 +49,13 @@ class DurationPredictor(nn.Layer):
                 offset=1.0):
        """Initilize duration predictor module.

-        Parameters
-        ----------
-        idim : int
-            Input dimension.
-        n_layers : int, optional
-            Number of convolutional layers.
-        n_chans : int, optional
-            Number of channels of convolutional layers.
-        kernel_size : int, optional
-            Kernel size of convolutional layers.
-        dropout_rate : float, optional
-            Dropout rate.
-        offset : float, optional
-            Offset value to avoid nan in log domain.
+        Args:
+            idim (int): Input dimension.
+            n_layers (int, optional): Number of convolutional layers.
+            n_chans (int, optional): Number of channels of convolutional layers.
+            kernel_size (int, optional): Kernel size of convolutional layers.
+            dropout_rate (float, optional): Dropout rate.
+            offset (float, optional): Offset value to avoid nan in log domain.
        """
        super().__init__()
@@ -105,35 +98,23 @@ class DurationPredictor(nn.Layer):
    def forward(self, xs, x_masks=None):
        """Calculate forward propagation.

+        Args:
+            xs(Tensor): Batch of input sequences (B, Tmax, idim).
+            x_masks(ByteTensor, optional): Batch of masks indicating padded part (B, Tmax). (Default value = None)

-        Parameters
-        ----------
-        xs : Tensor
-            Batch of input sequences (B, Tmax, idim).
-        x_masks : ByteTensor, optional
-            Batch of masks indicating padded part (B, Tmax).
-
-        Returns
-        ----------
-        Tensor
-            Batch of predicted durations in log domain (B, Tmax).
+        Returns:
+            Tensor: Batch of predicted durations in log domain (B, Tmax).
        """
        return self._forward(xs, x_masks, False)

    def inference(self, xs, x_masks=None):
        """Inference duration.

+        Args:
+            xs(Tensor): Batch of input sequences (B, Tmax, idim).
+            x_masks(Tensor(bool), optional): Batch of masks indicating padded part (B, Tmax). (Default value = None)

-        Parameters
-        ----------
-        xs : Tensor
-            Batch of input sequences (B, Tmax, idim).
-        x_masks : Tensor(bool), optional
-            Batch of masks indicating padded part (B, Tmax).
-
-        Returns
-        ----------
-        Tensor
-            Batch of predicted durations in linear domain int64 (B, Tmax).
+        Returns:
+            Tensor: Batch of predicted durations in linear domain int64 (B, Tmax).
        """
        return self._forward(xs, x_masks, True)
@@ -147,13 +128,9 @@ class DurationPredictorLoss(nn.Layer):
    def __init__(self, offset=1.0, reduction="mean"):
        """Initilize duration predictor loss module.
-
-        Parameters
-        ----------
-        offset : float, optional
-            Offset value to avoid nan in log domain.
-        reduction : str
-            Reduction type in loss calculation.
+        Args:
+            offset (float, optional): Offset value to avoid nan in log domain.
+            reduction (str): Reduction type in loss calculation.
        """
        super().__init__()
        self.criterion = nn.MSELoss(reduction=reduction)
@@ -162,21 +139,15 @@ class DurationPredictorLoss(nn.Layer):
    def forward(self, outputs, targets):
        """Calculate forward propagation.

-        Parameters
-        ----------
-        outputs : Tensor
-            Batch of prediction durations in log domain (B, T)
-        targets : Tensor
-            Batch of groundtruth durations in linear domain (B, T)
-
-        Returns
-        ----------
-        Tensor
-            Mean squared error loss value.
-
-        Note
-        ----------
-        `outputs` is in log domain but `targets` is in linear domain.
+        Args:
+            outputs(Tensor): Batch of prediction durations in log domain (B, T)
+            targets(Tensor): Batch of groundtruth durations in linear domain (B, T)
+
+        Returns:
+            Tensor: Mean squared error loss value.
+
+        Note:
+            `outputs` is in log domain but `targets` is in linear domain.
        """
        # NOTE: outputs is in log domain while targets in linear
        targets = paddle.log(targets.cast(dtype='float32') + self.offset)
diff --git a/paddlespeech/t2s/modules/predictor/length_regulator.py b/paddlespeech/t2s/modules/predictor/length_regulator.py
index 9510dd88..62d707d2 100644
--- a/paddlespeech/t2s/modules/predictor/length_regulator.py
+++ b/paddlespeech/t2s/modules/predictor/length_regulator.py
@@ -35,10 +35,8 @@ class LengthRegulator(nn.Layer):
    def __init__(self, pad_value=0.0):
        """Initilize length regulator module.

-        Parameters
-        ----------
-        pad_value : float, optional
-            Value used for padding.
+        Args:
+            pad_value (float, optional): Value used for padding.
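The log-domain convention noted above is easy to get backwards, so here is a minimal sketch of the transform (illustrative values, assuming only the paddle API): predictions live in log space, ground-truth durations in linear space, and the loss logs the targets with the same offset before the MSE.

    import paddle

    offset = 1.0
    targets = paddle.to_tensor([3.0, 5.0, 2.0])          # linear-domain durations
    log_targets = paddle.log(targets + offset)           # what the MSE compares against
    durations = paddle.round(paddle.exp(log_targets) - offset)  # inference-time inverse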
""" super().__init__() @@ -90,19 +88,13 @@ class LengthRegulator(nn.Layer): def forward(self, xs, ds, alpha=1.0, is_inference=False): """Calculate forward propagation. - Parameters - ---------- - xs : Tensor - Batch of sequences of char or phoneme embeddings (B, Tmax, D). - ds : Tensor(int64) - Batch of durations of each frame (B, T). - alpha : float, optional - Alpha value to control speed of speech. + Args: + xs (Tensor): Batch of sequences of char or phoneme embeddings (B, Tmax, D). + ds (Tensor(int64)): Batch of durations of each frame (B, T). + alpha (float, optional): Alpha value to control speed of speech. - Returns - ---------- - Tensor - replicated input tensor based on durations (B, T*, D). + Returns: + Tensor: replicated input tensor based on durations (B, T*, D). """ if alpha != 1.0: diff --git a/paddlespeech/t2s/modules/predictor/variance_predictor.py b/paddlespeech/t2s/modules/predictor/variance_predictor.py index 417fca82..8afbf257 100644 --- a/paddlespeech/t2s/modules/predictor/variance_predictor.py +++ b/paddlespeech/t2s/modules/predictor/variance_predictor.py @@ -42,18 +42,12 @@ class VariancePredictor(nn.Layer): dropout_rate: float=0.5, ): """Initilize duration predictor module. - Parameters - ---------- - idim : int - Input dimension. - n_layers : int, optional - Number of convolutional layers. - n_chans : int, optional - Number of channels of convolutional layers. - kernel_size : int, optional - Kernel size of convolutional layers. - dropout_rate : float, optional - Dropout rate. + Args: + idim (int): Input dimension. + n_layers (int, optional): Number of convolutional layers. + n_chans (int, optional): Number of channels of convolutional layers. + kernel_size (int, optional): Kernel size of convolutional layers. + dropout_rate (float, optional): Dropout rate. """ assert check_argument_types() super().__init__() @@ -79,17 +73,12 @@ class VariancePredictor(nn.Layer): x_masks: paddle.Tensor=None) -> paddle.Tensor: """Calculate forward propagation. - Parameters - ---------- - xs : Tensor - Batch of input sequences (B, Tmax, idim). - x_masks : Tensor(bool), optional - Batch of masks indicating padded part (B, Tmax, 1). + Args: + xs (Tensor): Batch of input sequences (B, Tmax, idim). + x_masks (Tensor(bool), optional): Batch of masks indicating padded part (B, Tmax, 1). - Returns - ---------- - Tensor - Batch of predicted sequences (B, Tmax, 1). + Returns: + Tensor: Batch of predicted sequences (B, Tmax, 1). """ # (B, idim, Tmax) xs = xs.transpose([0, 2, 1]) diff --git a/paddlespeech/t2s/modules/residual_block.py b/paddlespeech/t2s/modules/residual_block.py index a96a8946..efbfce27 100644 --- a/paddlespeech/t2s/modules/residual_block.py +++ b/paddlespeech/t2s/modules/residual_block.py @@ -28,26 +28,16 @@ class WaveNetResidualBlock(nn.Layer): unit and parametric redidual and skip connections. For more details, refer to `WaveNet: A Generative Model for Raw Audio `_. - Parameters - ---------- - kernel_size : int, optional - Kernel size of the 1D convolution, by default 3 - residual_channels : int, optional - Feature size of the resiaudl output(and also the input), by default 64 - gate_channels : int, optional - Output feature size of the 1D convolution, by default 128 - skip_channels : int, optional - Feature size of the skip output, by default 64 - aux_channels : int, optional - Feature size of the auxiliary input (e.g. spectrogram), by default 80 - dropout : float, optional - Probability of the dropout before the 1D convolution, by default 0. 
-    dilation : int, optional
-        Dilation of the 1D convolution, by default 1
-    bias : bool, optional
-        Whether to use bias in the 1D convolution, by default True
-    use_causal_conv : bool, optional
-        Whether to use causal padding for the 1D convolution, by default False
+    Args:
+        kernel_size (int, optional): Kernel size of the 1D convolution, by default 3
+        residual_channels (int, optional): Feature size of the residual output (and also the input), by default 64
+        gate_channels (int, optional): Output feature size of the 1D convolution, by default 128
+        skip_channels (int, optional): Feature size of the skip output, by default 64
+        aux_channels (int, optional): Feature size of the auxiliary input (e.g. spectrogram), by default 80
+        dropout (float, optional): Probability of the dropout before the 1D convolution, by default 0.
+        dilation (int, optional): Dilation of the 1D convolution, by default 1
+        bias (bool, optional): Whether to use bias in the 1D convolution, by default True
+        use_causal_conv (bool, optional): Whether to use causal padding for the 1D convolution, by default False
    """

    def __init__(self,
@@ -90,21 +80,15 @@ class WaveNetResidualBlock(nn.Layer):
    def forward(self, x, c):
        """
-        Parameters
-        ----------
-        x : Tensor
-            Shape (N, C_res, T), the input features.
-        c : Tensor
-            Shape (N, C_aux, T), the auxiliary input.
-
-        Returns
-        -------
-        res : Tensor
-            Shape (N, C_res, T), the residual output, which is used as the
-            input of the next ResidualBlock in a stack of ResidualBlocks.
-        skip : Tensor
-            Shape (N, C_skip, T), the skip output, which is collected among
-            each layer in a stack of ResidualBlocks.
+        Args:
+            x (Tensor): the input features. Shape (N, C_res, T)
+            c (Tensor): the auxiliary input. Shape (N, C_aux, T)
+
+        Returns:
+            res (Tensor): Shape (N, C_res, T), the residual output, which is used as the
+                input of the next ResidualBlock in a stack of ResidualBlocks.
+            skip (Tensor): Shape (N, C_skip, T), the skip output, which is collected among
+                each layer in a stack of ResidualBlocks.
        """
        x_input = x
        x = F.dropout(x, self.dropout, training=self.training)
@@ -136,22 +120,14 @@ class HiFiGANResidualBlock(nn.Layer):
            nonlinear_activation_params: Dict[str, Any]={"negative_slope": 0.1}, ):
        """Initialize HiFiGANResidualBlock module.
-        Parameters
-        ----------
-        kernel_size : int
-            Kernel size of dilation convolution layer.
-        channels : int
-            Number of channels for convolution layer.
-        dilations : List[int]
-            List of dilation factors.
-        use_additional_convs : bool
-            Whether to use additional convolution layers.
-        bias : bool
-            Whether to add bias parameter in convolution layers.
-        nonlinear_activation : str
-            Activation function module name.
-        nonlinear_activation_params : dict
-            Hyperparameters for activation function.
+        Args:
+            kernel_size (int): Kernel size of dilation convolution layer.
+            channels (int): Number of channels for convolution layer.
+            dilations (List[int]): List of dilation factors.
+            use_additional_convs (bool): Whether to use additional convolution layers.
+            bias (bool): Whether to add bias parameter in convolution layers.
+            nonlinear_activation (str): Activation function module name.
+            nonlinear_activation_params (dict): Hyperparameters for activation function.
        """
        super().__init__()
@@ -190,14 +166,10 @@ class HiFiGANResidualBlock(nn.Layer):
    def forward(self, x):
        """Calculate forward propagation.
-        Parameters
-        ----------
-        x : Tensor
-            Input tensor (B, channels, T).
-        Returns
-        ----------
-        Tensor
-            Output tensor (B, channels, T).
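A short sketch of the gated activation unit inside the WaveNet residual block above (illustrative shapes, assuming only the paddle API): the dilated-conv output is split along channels into two halves and gated elementwise.

    import paddle
    import paddle.nn.functional as F

    conv_out = paddle.randn([2, 128, 100])       # (B, gate_channels, T)
    xa, xb = paddle.split(conv_out, 2, axis=1)   # two (B, 64, T) halves
    z = paddle.tanh(xa) * F.sigmoid(xb)          # gated output fed to the 1x1 convs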
+        Args:
+            x (Tensor): Input tensor (B, channels, T).
+        Returns:
+            Tensor: Output tensor (B, channels, T).
        """
        for idx in range(len(self.convs1)):
            xt = self.convs1[idx](x)
diff --git a/paddlespeech/t2s/modules/residual_stack.py b/paddlespeech/t2s/modules/residual_stack.py
index c885dfe9..0d949b56 100644
--- a/paddlespeech/t2s/modules/residual_stack.py
+++ b/paddlespeech/t2s/modules/residual_stack.py
@@ -37,26 +37,17 @@ class ResidualStack(nn.Layer):
            pad_params: Dict[str, Any]={"mode": "reflect"},
            use_causal_conv: bool=False, ):
        """Initialize ResidualStack module.
-        Parameters
-        ----------
-        kernel_size : int
-            Kernel size of dilation convolution layer.
-        channels : int
-            Number of channels of convolution layers.
-        dilation : int
-            Dilation factor.
-        bias : bool
-            Whether to add bias parameter in convolution layers.
-        nonlinear_activation : str
-            Activation function module name.
-        nonlinear_activation_params : Dict[str,Any]
-            Hyperparameters for activation function.
-        pad : str
-            Padding function module name before dilated convolution layer.
-        pad_params : Dict[str, Any]
-            Hyperparameters for padding function.
-        use_causal_conv : bool
-            Whether to use causal convolution.
+
+        Args:
+            kernel_size (int): Kernel size of dilation convolution layer.
+            channels (int): Number of channels of convolution layers.
+            dilation (int): Dilation factor.
+            bias (bool): Whether to add bias parameter in convolution layers.
+            nonlinear_activation (str): Activation function module name.
+            nonlinear_activation_params (Dict[str,Any]): Hyperparameters for activation function.
+            pad (str): Padding function module name before dilated convolution layer.
+            pad_params (Dict[str, Any]): Hyperparameters for padding function.
+            use_causal_conv (bool): Whether to use causal convolution.
        """
        super().__init__()
        # for compatibility
@@ -102,13 +93,10 @@ class ResidualStack(nn.Layer):
    def forward(self, c):
        """Calculate forward propagation.
-
-        Parameters
-        ----------
-        c : Tensor
-            Input tensor (B, channels, T).
-        Returns
-        ----------
-        Tensor
-            Output tensor (B, chennels, T).
+
+        Args:
+            c (Tensor): Input tensor (B, channels, T).
+        Returns:
+            Tensor: Output tensor (B, channels, T).
        """
        return self.stack(c) + self.skip_layer(c)
diff --git a/paddlespeech/t2s/modules/style_encoder.py b/paddlespeech/t2s/modules/style_encoder.py
index 9d4b83a2..49091eac 100644
--- a/paddlespeech/t2s/modules/style_encoder.py
+++ b/paddlespeech/t2s/modules/style_encoder.py
@@ -30,33 +30,21 @@ class StyleEncoder(nn.Layer):
    .. _`Style Tokens: Unsupervised Style Modeling, Control and Transfer in End-to-End Speech Synthesis`:
        https://arxiv.org/abs/1803.09017
-
-    Parameters
-    ----------
-    idim : int, optional
-        Dimension of the input mel-spectrogram.
-    gst_tokens : int, optional
-        The number of GST embeddings.
-    gst_token_dim : int, optional
-        Dimension of each GST embedding.
-    gst_heads : int, optional
-        The number of heads in GST multihead attention.
-    conv_layers : int, optional
-        The number of conv layers in the reference encoder.
-    conv_chans_list : Sequence[int], optional
-        List of the number of channels of conv layers in the referece encoder.
-    conv_kernel_size : int, optional
-        Kernal size of conv layers in the reference encoder.
-    conv_stride : int, optional
-        Stride size of conv layers in the reference encoder.
-    gru_layers : int, optional
-        The number of GRU layers in the reference encoder.
-    gru_units : int, optional
-        The number of GRU units in the reference encoder.
-
-    Todo
-    ----------
-    * Support manual weight specification in inference.
+
+    Args:
+        idim (int, optional): Dimension of the input mel-spectrogram.
+        gst_tokens (int, optional): The number of GST embeddings.
+        gst_token_dim (int, optional): Dimension of each GST embedding.
+        gst_heads (int, optional): The number of heads in GST multihead attention.
+        conv_layers (int, optional): The number of conv layers in the reference encoder.
+        conv_chans_list (Sequence[int], optional): List of the number of channels of conv layers in the reference encoder.
+        conv_kernel_size (int, optional): Kernel size of conv layers in the reference encoder.
+        conv_stride (int, optional): Stride size of conv layers in the reference encoder.
+        gru_layers (int, optional): The number of GRU layers in the reference encoder.
+        gru_units (int, optional): The number of GRU units in the reference encoder.
+
+    Todo:
+        * Support manual weight specification in inference.
    """

@@ -93,15 +81,11 @@ class StyleEncoder(nn.Layer):
    def forward(self, speech: paddle.Tensor) -> paddle.Tensor:
        """Calculate forward propagation.

-        Parameters
-        ----------
-        speech : Tensor
-            Batch of padded target features (B, Lmax, odim).
+        Args:
+            speech (Tensor): Batch of padded target features (B, Lmax, odim).

-        Returns
-        ----------
-        Tensor:
-            Style token embeddings (B, token_dim).
+        Returns:
+            Tensor: Style token embeddings (B, token_dim).
        """
        ref_embs = self.ref_enc(speech)
@@ -118,23 +102,15 @@ class ReferenceEncoder(nn.Layer):

    .. _`Style Tokens: Unsupervised Style Modeling, Control and Transfer in End-to-End Speech Synthesis`:
        https://arxiv.org/abs/1803.09017
-
-    Parameters
-    ----------
-    idim : int, optional
-        Dimension of the input mel-spectrogram.
-    conv_layers : int, optional
-        The number of conv layers in the reference encoder.
-    conv_chans_list: : Sequence[int], optional
-        List of the number of channels of conv layers in the referece encoder.
-    conv_kernel_size : int, optional
-        Kernal size of conv layers in the reference encoder.
-    conv_stride : int, optional
-        Stride size of conv layers in the reference encoder.
-    gru_layers : int, optional
-        The number of GRU layers in the reference encoder.
-    gru_units : int, optional
-        The number of GRU units in the reference encoder.
+
+    Args:
+        idim (int, optional): Dimension of the input mel-spectrogram.
+        conv_layers (int, optional): The number of conv layers in the reference encoder.
+        conv_chans_list (Sequence[int], optional): List of the number of channels of conv layers in the reference encoder.
+        conv_kernel_size (int, optional): Kernel size of conv layers in the reference encoder.
+        conv_stride (int, optional): Stride size of conv layers in the reference encoder.
+        gru_layers (int, optional): The number of GRU layers in the reference encoder.
+        gru_units (int, optional): The number of GRU units in the reference encoder.
    """

@@ -191,16 +167,11 @@ class ReferenceEncoder(nn.Layer):
    def forward(self, speech: paddle.Tensor) -> paddle.Tensor:
        """Calculate forward propagation.

+        Args:
+            speech (Tensor): Batch of padded target features (B, Lmax, idim).

-        Parameters
-        ----------
-        speech : Tensor
-            Batch of padded target features (B, Lmax, idim).
-
-        Return
-        ----------
-        Tensor
-            Reference embedding (B, gru_units)
+        Returns:
+            Tensor: Reference embedding (B, gru_units).
        """
        batch_size = speech.shape[0]
@@ -228,19 +199,12 @@ class StyleTokenLayer(nn.Layer):

    ..
    _`Style Tokens: Unsupervised Style Modeling, Control and Transfer in End-to-End Speech Synthesis`:
        https://arxiv.org/abs/1803.09017
-
-    Parameters
-    ----------
-    ref_embed_dim : int, optional
-        Dimension of the input reference embedding.
-    gst_tokens : int, optional
-        The number of GST embeddings.
-    gst_token_dim : int, optional
-        Dimension of each GST embedding.
-    gst_heads : int, optional
-        The number of heads in GST multihead attention.
-    dropout_rate : float, optional
-        Dropout rate in multi-head attention.
+    Args:
+        ref_embed_dim (int, optional): Dimension of the input reference embedding.
+        gst_tokens (int, optional): The number of GST embeddings.
+        gst_token_dim (int, optional): Dimension of each GST embedding.
+        gst_heads (int, optional): The number of heads in GST multihead attention.
+        dropout_rate (float, optional): Dropout rate in multi-head attention.
    """

@@ -271,15 +235,11 @@ class StyleTokenLayer(nn.Layer):
    def forward(self, ref_embs: paddle.Tensor) -> paddle.Tensor:
        """Calculate forward propagation.

-        Parameters
-        ----------
-        ref_embs : Tensor
-            Reference embeddings (B, ref_embed_dim).
+        Args:
+            ref_embs (Tensor): Reference embeddings (B, ref_embed_dim).

-        Returns
-        ----------
-        Tensor
-            Style token embeddings (B, gst_token_dim).
+        Returns:
+            Tensor: Style token embeddings (B, gst_token_dim).
        """
        batch_size = ref_embs.shape[0]
diff --git a/paddlespeech/t2s/modules/tacotron2/attentions.py b/paddlespeech/t2s/modules/tacotron2/attentions.py
index af7a94f3..a6fde742 100644
--- a/paddlespeech/t2s/modules/tacotron2/attentions.py
+++ b/paddlespeech/t2s/modules/tacotron2/attentions.py
@@ -30,21 +30,14 @@ def _apply_attention_constraint(e,
    introduced in `Deep Voice 3: Scaling Text-to-Speech with
    Convolutional Sequence Learning`_.

-    Parameters
-    ----------
-    e : Tensor
-        Attention energy before applying softmax (1, T).
-    last_attended_idx : int
-        The index of the inputs of the last attended [0, T].
-    backward_window : int, optional
-        Backward window size in attention constraint.
-    forward_window : int, optional
-        Forward window size in attetion constraint.
-
-    Returns
-    ----------
-    Tensor
-        Monotonic constrained attention energy (1, T).
+    Args:
+        e(Tensor): Attention energy before applying softmax (1, T).
+        last_attended_idx(int): The index of the inputs of the last attended [0, T].
+        backward_window(int, optional): Backward window size in attention constraint. (Default value = 1)
+        forward_window(int, optional): Forward window size in attention constraint. (Default value = 3)
+
+    Returns:
+        Tensor: Monotonic constrained attention energy (1, T).

    ..
    _`Deep Voice 3: Scaling Text-to-Speech with Convolutional Sequence Learning`:
        https://arxiv.org/abs/1710.07654
@@ -67,20 +60,14 @@ class AttLoc(nn.Layer):
    Reference: Attention-Based Models for Speech Recognition
        (https://arxiv.org/pdf/1506.07503.pdf)

-    Parameters
-    ----------
-    eprojs : int
-        projection-units of encoder
-    dunits : int
-        units of decoder
-    att_dim : int
-        att_dim: attention dimension
-    aconv_chans : int
-        channels of attention convolution
-    aconv_filts : int
-        filter size of attention convolution
-    han_mode : bool
-        flag to swith on mode of hierarchical attention and not store pre_compute_enc_h
+
+    Args:
+        eprojs (int): projection-units of encoder
+        dunits (int): units of decoder
+        att_dim (int): attention dimension
+        aconv_chans (int): channels of attention convolution
+        aconv_filts (int): filter size of attention convolution
+        han_mode (bool): flag to switch on mode of hierarchical attention and not store pre_compute_enc_h
    """

    def __init__(self,
@@ -129,33 +116,19 @@ class AttLoc(nn.Layer):
                backward_window=1,
                forward_window=3, ):
        """Calculate AttLoc forward propagation.

-        Parameters
-        ----------
-        enc_hs_pad : paddle.Tensor
-            padded encoder hidden state (B, T_max, D_enc)
-        enc_hs_len : paddle.Tensor
-            padded encoder hidden state length (B)
-        dec_z : paddle.Tensor dec_z
-            decoder hidden state (B, D_dec)
-        att_prev : paddle.Tensor
-            previous attention weight (B, T_max)
-        scaling : float
-            scaling parameter before applying softmax
-        forward_window : paddle.Tensor
-            forward window size when constraining attention
-        last_attended_idx : int
-            index of the inputs of the last attended
-        backward_window : int
-            backward window size in attention constraint
-        forward_window : int
-            forward window size in attetion constraint
-
-        Returns
-        ----------
-        paddle.Tensor
-            attention weighted encoder state (B, D_enc)
-        paddle.Tensor
-            previous attention weights (B, T_max)
+        Args:
+            enc_hs_pad(Tensor): padded encoder hidden state (B, T_max, D_enc)
+            enc_hs_len(Tensor): padded encoder hidden state length (B)
+            dec_z(Tensor): decoder hidden state (B, D_dec)
+            att_prev(Tensor): previous attention weight (B, T_max)
+            scaling(float, optional): scaling parameter before applying softmax (Default value = 2.0)
+            last_attended_idx(int, optional): index of the inputs of the last attended (Default value = None)
+            backward_window(int, optional): backward window size in attention constraint (Default value = 1)
+            forward_window(int, optional): forward window size in attention constraint (Default value = 3)
+        Returns:
+            Tensor: attention weighted encoder state (B, D_enc)
+            Tensor: previous attention weights (B, T_max)
        """
        batch = paddle.shape(enc_hs_pad)[0]
        # pre-compute all h outside the decoder loop
@@ -217,19 +190,13 @@ class AttForward(nn.Layer):
    ----------
    Forward attention in sequence-to-sequence acoustic modeling for speech synthesis
        (https://arxiv.org/pdf/1807.06736.pdf)
-
-    Parameters
-    ----------
-    eprojs : int
-        projection-units of encoder
-    dunits : int
-        units of decoder
-    att_dim : int
-        attention dimension
-    aconv_chans : int
-        channels of attention convolution
-    aconv_filts : int
-        filter size of attention convolution
+
+    Args:
+        eprojs (int): projection-units of encoder
+        dunits (int): units of decoder
+        att_dim (int): attention dimension
+        aconv_chans (int): channels of attention convolution
+        aconv_filts (int): filter size of attention convolution
    """

    def __init__(self, eprojs,
dunits, att_dim, aconv_chans, aconv_filts):
@@ -270,30 +237,20 @@ class AttForward(nn.Layer):
                backward_window=1,
                forward_window=3, ):
        """Calculate AttForward forward propagation.

-        Parameters
-        ----------
-        enc_hs_pad : paddle.Tensor
-            padded encoder hidden state (B, T_max, D_enc)
-        enc_hs_len : list
-            padded encoder hidden state length (B,)
-        dec_z : paddle.Tensor
-            decoder hidden state (B, D_dec)
-        att_prev : paddle.Tensor
-            attention weights of previous step (B, T_max)
-        scaling : float
-            scaling parameter before applying softmax
-        last_attended_idx : int
-            index of the inputs of the last attended
-        backward_window : int
-            backward window size in attention constraint
-        forward_window : int
-            forward window size in attetion constraint
-        Returns
-        ----------
-        paddle.Tensor
-            attention weighted encoder state (B, D_enc)
-        paddle.Tensor
-            previous attention weights (B, T_max)
+
+        Args:
+            enc_hs_pad(Tensor): padded encoder hidden state (B, T_max, D_enc)
+            enc_hs_len(list): padded encoder hidden state length (B,)
+            dec_z(Tensor): decoder hidden state (B, D_dec)
+            att_prev(Tensor): attention weights of previous step (B, T_max)
+            scaling(float, optional): scaling parameter before applying softmax (Default value = 1.0)
+            last_attended_idx(int, optional): index of the inputs of the last attended (Default value = None)
+            backward_window(int, optional): backward window size in attention constraint (Default value = 1)
+            forward_window(int, optional): forward window size in attention constraint (Default value = 3)
+
+        Returns:
+            Tensor: attention weighted encoder state (B, D_enc)
+            Tensor: previous attention weights (B, T_max)
        """
        batch = len(enc_hs_pad)
        # pre-compute all h outside the decoder loop
@@ -359,24 +316,17 @@ class AttForward(nn.Layer):
 class AttForwardTA(nn.Layer):
    """Forward attention with transition agent module.
-    Reference
-    ----------
-    Forward attention in sequence-to-sequence acoustic modeling for speech synthesis
-        (https://arxiv.org/pdf/1807.06736.pdf)
-    Parameters
-    ----------
-    eunits : int
-        units of encoder
-    dunits : int
-        units of decoder
-    att_dim : int
-        attention dimension
-    aconv_chans : int
-        channels of attention convolution
-    aconv_filts : int
-        filter size of attention convolution
-    odim : int
-        output dimension
+    Reference:
+        Forward attention in sequence-to-sequence acoustic modeling for speech synthesis
+            (https://arxiv.org/pdf/1807.06736.pdf)
+
+    Args:
+        eunits (int): units of encoder
+        dunits (int): units of decoder
+        att_dim (int): attention dimension
+        aconv_chans (int): channels of attention convolution
+        aconv_filts (int): filter size of attention convolution
+        odim (int): output dimension
    """

    def __init__(self, eunits, dunits, att_dim, aconv_chans, aconv_filts, odim):
@@ -420,32 +370,21 @@ class AttForwardTA(nn.Layer):
                backward_window=1,
                forward_window=3, ):
        """Calculate AttForwardTA forward propagation.
-        Parameters
-        ----------
-        enc_hs_pad : paddle.Tensor
-            padded encoder hidden state (B, Tmax, eunits)
-        enc_hs_len : list paddle.Tensor
-            padded encoder hidden state length (B,)
-        dec_z : paddle.Tensor
-            decoder hidden state (B, dunits)
-        att_prev : paddle.Tensor
-            attention weights of previous step (B, T_max)
-        out_prev : paddle.Tensor
-            decoder outputs of previous step (B, odim)
-        scaling : float
-            scaling parameter before applying softmax
-        last_attended_idx : int
-            index of the inputs of the last attended
-        backward_window : int
-            backward window size in attention constraint
-        forward_window : int
-            forward window size in attetion constraint
-        Returns
-        ----------
-        paddle.Tensor
-            attention weighted encoder state (B, dunits)
-        paddle.Tensor
-            previous attention weights (B, Tmax)
+
+        Args:
+            enc_hs_pad(Tensor): padded encoder hidden state (B, Tmax, eunits)
+            enc_hs_len(list): padded encoder hidden state length (B,)
+            dec_z(Tensor): decoder hidden state (B, dunits)
+            att_prev(Tensor): attention weights of previous step (B, T_max)
+            out_prev(Tensor): decoder outputs of previous step (B, odim)
+            scaling(float, optional): scaling parameter before applying softmax (Default value = 1.0)
+            last_attended_idx(int, optional): index of the inputs of the last attended (Default value = None)
+            backward_window(int, optional): backward window size in attention constraint (Default value = 1)
+            forward_window(int, optional): forward window size in attention constraint (Default value = 3)
+
+        Returns:
+            Tensor: attention weighted encoder state (B, dunits)
+            Tensor: previous attention weights (B, Tmax)
        """
        batch = len(enc_hs_pad)
        # pre-compute all h outside the decoder loop
diff --git a/paddlespeech/t2s/modules/tacotron2/decoder.py b/paddlespeech/t2s/modules/tacotron2/decoder.py
index 0cfe0b84..ebdfa387 100644
--- a/paddlespeech/t2s/modules/tacotron2/decoder.py
+++ b/paddlespeech/t2s/modules/tacotron2/decoder.py
@@ -44,16 +44,11 @@ class Prenet(nn.Layer):
    def __init__(self, idim, n_layers=2, n_units=256, dropout_rate=0.5):
        """Initialize prenet module.

-        Parameters
-        ----------
-        idim : int
-            Dimension of the inputs.
-        odim : int
-            Dimension of the outputs.
-        n_layers : int, optional
-            The number of prenet layers.
-        n_units : int, optional
-            The number of prenet units.
+        Args:
+            idim (int): Dimension of the inputs.
+            n_layers (int, optional): The number of prenet layers.
+            n_units (int, optional): The number of prenet units.
+            dropout_rate (float, optional): Dropout rate.
        """
        super().__init__()
        self.dropout_rate = dropout_rate
@@ -66,15 +61,11 @@ class Prenet(nn.Layer):
    def forward(self, x):
        """Calculate forward propagation.

-        Parameters
-        ----------
-        x : Tensor
-            Batch of input tensors (B, ..., idim).
+        Args:
+            x (Tensor): Batch of input tensors (B, ..., idim).

-        Returns
-        ----------
-        Tensor
-            Batch of output tensors (B, ..., odim).
+        Returns:
+            Tensor: Batch of output tensors (B, ..., odim).
        """
        for i in range(len(self.prenet)):
@@ -109,22 +100,14 @@ class Postnet(nn.Layer):
            use_batch_norm=True, ):
        """Initialize postnet module.

-        Parameters
-        ----------
-        idim : int
-            Dimension of the inputs.
-        odim : int
-            Dimension of the outputs.
-        n_layers : int, optional
-            The number of layers.
-        n_filts : int, optional
-            The number of filter size.
-        n_units : int, optional
-            The number of filter channels.
-        use_batch_norm : bool, optional
-            Whether to use batch normalization..
-        dropout_rate : float, optional
-            Dropout rate..
+        Args:
+            idim (int): Dimension of the inputs.
+            odim (int): Dimension of the outputs.
+            n_layers (int, optional): The number of layers.
+            n_filts (int, optional): The filter size.
+            n_units (int, optional): The number of filter channels.
+            use_batch_norm (bool, optional): Whether to use batch normalization.
+            dropout_rate (float, optional): Dropout rate.
         """
         super().__init__()
         self.postnet = nn.LayerList()
@@ -184,16 +167,10 @@ class Postnet(nn.Layer):
     def forward(self, xs):
         """Calculate forward propagation.
-        Parameters
-        ----------
-        xs : Tensor
-            Batch of the sequences of padded input tensors (B, idim, Tmax).
-
-        Returns
-        ----------
-        Tensor
-            Batch of padded output tensor. (B, odim, Tmax).
-
+        Args:
+            xs (Tensor): Batch of the sequences of padded input tensors (B, idim, Tmax).
+        Returns:
+            Tensor: Batch of padded output tensor. (B, odim, Tmax).
         """
         for i in range(len(self.postnet)):
             xs = self.postnet[i](xs)
@@ -217,13 +194,11 @@ class ZoneOutCell(nn.Layer):
     def __init__(self, cell, zoneout_rate=0.1):
         """Initialize zone out cell module.
-        Parameters
-        ----------
-        cell : nn.Layer:
-            Paddle recurrent cell module
-            e.g. `paddle.nn.LSTMCell`.
-        zoneout_rate : float, optional
-            Probability of zoneout from 0.0 to 1.0.
+
+        Args:
+            cell (nn.Layer): Paddle recurrent cell module
+                e.g. `paddle.nn.LSTMCell`.
+            zoneout_rate (float, optional): Probability of zoneout from 0.0 to 1.0.
         """
         super().__init__()
         self.cell = cell
@@ -235,20 +210,18 @@ class ZoneOutCell(nn.Layer):
     def forward(self, inputs, hidden):
         """Calculate forward propagation.
-        Parameters
-        ----------
-        inputs : Tensor
-            Batch of input tensor (B, input_size).
-        hidden : tuple
-            - Tensor: Batch of initial hidden states (B, hidden_size).
-            - Tensor: Batch of initial cell states (B, hidden_size).
-        Returns
-        ----------
-        Tensor
-            Batch of next hidden states (B, hidden_size).
-        tuple:
-            - Tensor: Batch of next hidden states (B, hidden_size).
-            - Tensor: Batch of next cell states (B, hidden_size).
+
+        Args:
+            inputs (Tensor): Batch of input tensor (B, input_size).
+            hidden (tuple):
+                - Tensor: Batch of initial hidden states (B, hidden_size).
+                - Tensor: Batch of initial cell states (B, hidden_size).
+        Returns:
+            Tensor:
+                Batch of next hidden states (B, hidden_size).
+            tuple:
+                - Tensor: Batch of next hidden states (B, hidden_size).
+                - Tensor: Batch of next cell states (B, hidden_size).
         """
         # we only use the second output of LSTMCell in paddle
         _, next_hidden = self.cell(inputs, hidden)
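As a usage sketch of the zoneout mechanism documented here (shapes and the 0.1 rate are illustrative; only `paddle.nn.LSTMCell` from the standard API is assumed):

    import paddle
    import paddle.nn as nn

    cell = nn.LSTMCell(16, 32)
    x = paddle.randn([4, 16])
    h, c = paddle.zeros([4, 32]), paddle.zeros([4, 32])

    # paddle's LSTMCell returns (output, (h, c)); only the state tuple is used.
    _, (h_next, c_next) = cell(x, (h, c))

    # Zoneout at train time: each unit keeps its previous value with
    # probability zoneout_rate instead of being updated.
    zoneout_rate = 0.1
    keep = (paddle.rand(h_next.shape) < zoneout_rate).astype(h_next.dtype)
    h_zoned = keep * h + (1.0 - keep) * h_next
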
@@ -302,42 +275,29 @@ class Decoder(nn.Layer):
                  zoneout_rate=0.1,
                  reduction_factor=1, ):
         """Initialize Tacotron2 decoder module.
-        Parameters
-        ----------
-        idim : int
-            Dimension of the inputs.
-        odim : int
-            Dimension of the outputs.
-        att nn.Layer
-            Instance of attention class.
-        dlayers int, optional
-            The number of decoder lstm layers.
-        dunits : int, optional
-            The number of decoder lstm units.
-        prenet_layers : int, optional
-            The number of prenet layers.
-        prenet_units : int, optional
-            The number of prenet units.
-        postnet_layers : int, optional
-            The number of postnet layers.
-        postnet_filts : int, optional
-            The number of postnet filter size.
-        postnet_chans : int, optional
-            The number of postnet filter channels.
-        output_activation_fn : nn.Layer, optional
-            Activation function for outputs.
-        cumulate_att_w : bool, optional
-            Whether to cumulate previous attention weight.
-        use_batch_norm : bool, optional
-            Whether to use batch normalization.
-        use_concate : bool, optional
-            Whether to concatenate encoder embedding with decoder lstm outputs.
-        dropout_rate : float, optional
-            Dropout rate.
-        zoneout_rate : float, optional
-            Zoneout rate.
-        reduction_factor : int, optional
-            Reduction factor.
+
+        Args:
+            idim (int): Dimension of the inputs.
+            odim (int): Dimension of the outputs.
+            att (nn.Layer): Instance of attention class.
+            dlayers (int, optional): The number of decoder lstm layers.
+            dunits (int, optional): The number of decoder lstm units.
+            prenet_layers (int, optional): The number of prenet layers.
+            prenet_units (int, optional): The number of prenet units.
+            postnet_layers (int, optional): The number of postnet layers.
+            postnet_filts (int, optional): The postnet filter size.
+            postnet_chans (int, optional): The number of postnet filter channels.
+            output_activation_fn (nn.Layer, optional): Activation function for outputs.
+            cumulate_att_w (bool, optional): Whether to cumulate previous attention weight.
+            use_batch_norm (bool, optional): Whether to use batch normalization.
+            use_concate (bool, optional): Whether to concatenate encoder embedding with decoder lstm outputs.
+            dropout_rate (float, optional): Dropout rate.
+            zoneout_rate (float, optional): Zoneout rate.
+            reduction_factor (int, optional): Reduction factor.
         """
         super().__init__()
@@ -401,26 +361,19 @@ class Decoder(nn.Layer):
     def forward(self, hs, hlens, ys):
         """Calculate forward propagation.
-        Parameters
-        ----------
-        hs : Tensor
-            Batch of the sequences of padded hidden states (B, Tmax, idim).
-        hlens : Tensor(int64) padded
-            Batch of lengths of each input batch (B,).
-        ys : Tensor
-            Batch of the sequences of padded target features (B, Lmax, odim).
-        Returns
-        ----------
-        Tensor
-            Batch of output tensors after postnet (B, Lmax, odim).
-        Tensor
-            Batch of output tensors before postnet (B, Lmax, odim).
-        Tensor
-            Batch of logits of stop prediction (B, Lmax).
-        Tensor
-            Batch of attention weights (B, Lmax, Tmax).
-        Note
-        ----------
+
+        Args:
+            hs (Tensor): Batch of the sequences of padded hidden states (B, Tmax, idim).
+            hlens (Tensor(int64)): Batch of lengths of each padded input batch (B,).
+            ys (Tensor): Batch of the sequences of padded target features (B, Lmax, odim).
+
+        Returns:
+            Tensor: Batch of output tensors after postnet (B, Lmax, odim).
+            Tensor: Batch of output tensors before postnet (B, Lmax, odim).
+            Tensor: Batch of logits of stop prediction (B, Lmax).
+            Tensor: Batch of attention weights (B, Lmax, Tmax).
+
+        Note:
             This computation is performed in teacher-forcing manner.
         """
         # thin out frames (B, Lmax, odim) -> (B, Lmax/r, odim)
@@ -517,37 +470,24 @@ class Decoder(nn.Layer):
                   backward_window=None,
                   forward_window=None, ):
         """Generate the sequence of features given the sequences of characters.
-        Parameters
-        ----------
-        h : Tensor
-            Input sequence of encoder hidden states (T, C).
-        threshold : float, optional
-            Threshold to stop generation.
-        minlenratio : float, optional
-            Minimum length ratio.
-            If set to 1.0 and the length of input is 10,
-            the minimum length of outputs will be 10 * 1 = 10.
-        minlenratio : float, optional
-            Minimum length ratio.
-            If set to 10 and the length of input is 10,
-            the maximum length of outputs will be 10 * 10 = 100.
-        use_att_constraint : bool
-            Whether to apply attention constraint introduced in `Deep Voice 3`_.
-        backward_window : int
-            Backward window size in attention constraint.
-        forward_window : int
-            Forward window size in attention constraint.
-        Returns
-        ----------
-        Tensor
-            Output sequence of features (L, odim).
-        Tensor
-            Output sequence of stop probabilities (L,).
-        Tensor
-            Attention weights (L, T).
-        Note
-        ----------
-        This computation is performed in auto-regressive manner.
+        Args:
+            h(Tensor): Input sequence of encoder hidden states (T, C).
+            threshold(float, optional): Threshold to stop generation. (Default value = 0.5)
+            minlenratio(float, optional): Minimum length ratio. If set to 1.0 and the length of input is 10,
+                the minimum length of outputs will be 10 * 1 = 10. (Default value = 0.0)
+            maxlenratio(float, optional): Maximum length ratio. If set to 10 and the length of input is 10,
+                the maximum length of outputs will be 10 * 10 = 100. (Default value = 0.0)
+            use_att_constraint(bool, optional): Whether to apply attention constraint introduced in `Deep Voice 3`_. (Default value = False)
+            backward_window(int, optional): Backward window size in attention constraint. (Default value = None)
+            forward_window(int, optional): Forward window size in attention constraint. (Default value = None)
+
+        Returns:
+            Tensor: Output sequence of features (L, odim).
+            Tensor: Output sequence of stop probabilities (L,).
+            Tensor: Attention weights (L, T).
+
+        Note:
+            This computation is performed in auto-regressive manner.
         .. _`Deep Voice 3`: https://arxiv.org/abs/1710.07654
         """
         # setup
@@ -683,21 +623,18 @@ class Decoder(nn.Layer):
     def calculate_all_attentions(self, hs, hlens, ys):
         """Calculate all of the attention weights.
-        Parameters
-        ----------
-        hs : Tensor
-            Batch of the sequences of padded hidden states (B, Tmax, idim).
-        hlens : Tensor(int64)
-            Batch of lengths of each input batch (B,).
-        ys : Tensor
-            Batch of the sequences of padded target features (B, Lmax, odim).
-        Returns
-        ----------
-        numpy.ndarray
-            Batch of attention weights (B, Lmax, Tmax).
-        Note
-        ----------
-        This computation is performed in teacher-forcing manner.
+
+        Args:
+            hs (Tensor): Batch of the sequences of padded hidden states (B, Tmax, idim).
+            hlens (Tensor(int64)): Batch of lengths of each input batch (B,).
+            ys (Tensor): Batch of the sequences of padded target features (B, Lmax, odim).
+
+        Returns:
+            numpy.ndarray:
+                Batch of attention weights (B, Lmax, Tmax).
+
+        Note:
+            This computation is performed in teacher-forcing manner.
         """
         # thin out frames (B, Lmax, odim) -> (B, Lmax/r, odim)
         if self.reduction_factor > 1:
diff --git a/paddlespeech/t2s/modules/tacotron2/encoder.py b/paddlespeech/t2s/modules/tacotron2/encoder.py
index 80c213a1..db102a11 100644
--- a/paddlespeech/t2s/modules/tacotron2/encoder.py
+++ b/paddlespeech/t2s/modules/tacotron2/encoder.py
@@ -45,31 +45,18 @@ class Encoder(nn.Layer):
                  dropout_rate=0.5,
                  padding_idx=0, ):
         """Initialize Tacotron2 encoder module.
-
-        Parameters
-        ----------
-        idim : int
-            Dimension of the inputs.
-        input_layer : str
-            Input layer type.
-        embed_dim : int, optional
-            Dimension of character embedding.
-        elayers : int, optional
-            The number of encoder blstm layers.
-        eunits : int, optional
-            The number of encoder blstm units.
-        econv_layers : int, optional
-            The number of encoder conv layers.
-        econv_filts : int, optional
-            The number of encoder conv filter size.
-        econv_chans : int, optional
-            The number of encoder conv filter channels.
-        use_batch_norm : bool, optional
-            Whether to use batch normalization.
-        use_residual : bool, optional
-            Whether to use residual connection.
-        dropout_rate : float, optional
-            Dropout rate.
+        Args:
+            idim (int): Dimension of the inputs.
+            input_layer (str): Input layer type.
+            embed_dim (int, optional): Dimension of character embedding.
+            elayers (int, optional): The number of encoder blstm layers.
+            eunits (int, optional): The number of encoder blstm units.
+            econv_layers (int, optional): The number of encoder conv layers.
+            econv_filts (int, optional): The encoder conv filter size.
+            econv_chans (int, optional): The number of encoder conv filter channels.
+            use_batch_norm (bool, optional): Whether to use batch normalization.
+            use_residual (bool, optional): Whether to use residual connection.
+            dropout_rate (float, optional): Dropout rate.
         """
         super().__init__()
@@ -139,21 +126,15 @@ class Encoder(nn.Layer):
     def forward(self, xs, ilens=None):
         """Calculate forward propagation.
-        Parameters
-        ----------
-        xs : Tensor
-            Batch of the padded sequence. Either character ids (B, Tmax)
-            or acoustic feature (B, Tmax, idim * encoder_reduction_factor).
-            Padded value should be 0.
-        ilens : Tensor(int64)
-            Batch of lengths of each input batch (B,).
-
-        Returns
-        ----------
-        Tensor
-            Batch of the sequences of encoder states(B, Tmax, eunits).
-        Tensor(int64)
-            Batch of lengths of each sequence (B,)
+        Args:
+            xs (Tensor): Batch of the padded sequence. Either character ids (B, Tmax)
+                or acoustic feature (B, Tmax, idim * encoder_reduction_factor).
+                Padded value should be 0.
+            ilens (Tensor(int64)): Batch of lengths of each input batch (B,).
+
+        Returns:
+            Tensor: Batch of the sequences of encoder states (B, Tmax, eunits).
+            Tensor(int64): Batch of lengths of each sequence (B,)
         """
         xs = self.embed(xs).transpose([0, 2, 1])
         if self.convs is not None:
@@ -179,16 +160,12 @@ class Encoder(nn.Layer):
     def inference(self, x):
         """Inference.
-        Parameters
-        ----------
-        x : Tensor
-            The sequeunce of character ids (T,)
-            or acoustic feature (T, idim * encoder_reduction_factor).
+        Args:
+            x (Tensor): The sequence of character ids (T,)
+                or acoustic feature (T, idim * encoder_reduction_factor).

-        Returns
-        ----------
-        Tensor
-            The sequences of encoder states(T, eunits).
+        Returns:
+            Tensor: The sequences of encoder states (T, eunits).
         """
         xs = x.unsqueeze(0)
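The context lines above keep the encoder's embed-then-convolve pattern visible; a minimal sketch of that frontend (all sizes are illustrative, and the single conv layer stands in for the full econv stack):

    import paddle
    import paddle.nn as nn
    import paddle.nn.functional as F

    embed = nn.Embedding(40, 512, padding_idx=0)
    conv = nn.Conv1D(512, 512, kernel_size=5, padding=2)

    ids = paddle.randint(0, 40, shape=[2, 13])   # (B, Tmax) character ids
    xs = embed(ids).transpose([0, 2, 1])         # (B, embed_dim, Tmax) for Conv1D
    xs = F.relu(conv(xs))                        # (B, econv_chans, Tmax)
    xs = xs.transpose([0, 2, 1])                 # (B, Tmax, C), ready for the BLSTM
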
diff --git a/paddlespeech/t2s/modules/tade_res_block.py b/paddlespeech/t2s/modules/tade_res_block.py
index 1ca4e6d8..b2275e23 100644
--- a/paddlespeech/t2s/modules/tade_res_block.py
+++ b/paddlespeech/t2s/modules/tade_res_block.py
@@ -59,18 +59,12 @@ class TADELayer(nn.Layer):
     def forward(self, x, c):
         """Calculate forward propagation.
-        Parameters
-        ----------
-        x : Tensor
-            Input tensor (B, in_channels, T).
-        c : Tensor
-            Auxiliary input tensor (B, aux_channels, T).
-        Returns
-        ----------
-        Tensor
-            Output tensor (B, in_channels, T * upsample_factor).
-        Tensor
-            Upsampled aux tensor (B, in_channels, T * upsample_factor).
+        Args:
+            x (Tensor): Input tensor (B, in_channels, T).
+            c (Tensor): Auxiliary input tensor (B, aux_channels, T).
+        Returns:
+            Tensor: Output tensor (B, in_channels, T * upsample_factor).
+            Tensor: Upsampled aux tensor (B, in_channels, T * upsample_factor).
         """
         x = self.norm(x)
@@ -142,18 +136,13 @@ class TADEResBlock(nn.Layer):
     def forward(self, x, c):
         """Calculate forward propagation.
-        Parameters
-        ----------
-        x : Tensor
-            Input tensor (B, in_channels, T).
-        c : Tensor
-            Auxiliary input tensor (B, aux_channels, T).
-        Returns
-        ----------
-        Tensor
-            Output tensor (B, in_channels, T * upsample_factor).
-        Tensor
-            Upsampled auxirialy tensor (B, in_channels, T * upsample_factor).
+        Args:
+            x (Tensor): Input tensor (B, in_channels, T).
+            c (Tensor): Auxiliary input tensor (B, aux_channels, T).
+        Returns:
+            Tensor: Output tensor (B, in_channels, T * upsample_factor).
+            Tensor: Upsampled auxiliary tensor (B, in_channels, T * upsample_factor).
         """
         residual = x
         x, c = self.tade1(x, c)
diff --git a/paddlespeech/t2s/modules/transformer/attention.py b/paddlespeech/t2s/modules/transformer/attention.py
index 34386f2a..cdb95b21 100644
--- a/paddlespeech/t2s/modules/transformer/attention.py
+++ b/paddlespeech/t2s/modules/transformer/attention.py
@@ -24,15 +24,10 @@ from paddlespeech.t2s.modules.masked_fill import masked_fill
 class MultiHeadedAttention(nn.Layer):
     """Multi-Head Attention layer.
-
-    Parameters
-    ----------
-    n_head : int
-        The number of heads.
-    n_feat : int
-        The number of features.
-    dropout_rate : float
-        Dropout rate.
+    Args:
+        n_head (int): The number of heads.
+        n_feat (int): The number of features.
+        dropout_rate (float): Dropout rate.
     """

     def __init__(self, n_head, n_feat, dropout_rate):
@@ -52,23 +47,15 @@ class MultiHeadedAttention(nn.Layer):
     def forward_qkv(self, query, key, value):
         """Transform query, key and value.
-        Parameters
-        ----------
-        query : paddle.Tensor
-            query tensor (#batch, time1, size).
-        key : paddle.Tensor
-            Key tensor (#batch, time2, size).
-        value : paddle.Tensor
-            Value tensor (#batch, time2, size).
-
-        Returns
-        ----------
-        paddle.Tensor
-            Transformed query tensor (#batch, n_head, time1, d_k).
-        paddle.Tensor
-            Transformed key tensor (#batch, n_head, time2, d_k).
-        paddle.Tensor
-            Transformed value tensor (#batch, n_head, time2, d_k).
+        Args:
+            query(Tensor): Query tensor (#batch, time1, size).
+            key(Tensor): Key tensor (#batch, time2, size).
+            value(Tensor): Value tensor (#batch, time2, size).
+
+        Returns:
+            Tensor: Transformed query tensor (#batch, n_head, time1, d_k).
+            Tensor: Transformed key tensor (#batch, n_head, time2, d_k).
+            Tensor: Transformed value tensor (#batch, n_head, time2, d_k).
         """
         n_batch = paddle.shape(query)[0]
@@ -89,20 +76,13 @@ class MultiHeadedAttention(nn.Layer):
     def forward_attention(self, value, scores, mask=None):
         """Compute attention context vector.
-        Parameters
-        ----------
-        value : paddle.Tensor
-            Transformed value (#batch, n_head, time2, d_k).
-        scores : paddle.Tensor
-            Attention score (#batch, n_head, time1, time2).
-        mask : paddle.Tensor
-            Mask (#batch, 1, time2) or (#batch, time1, time2).
-
-        Returns
-        ----------
-        paddle.Tensor:
-            Transformed value (#batch, time1, d_model)
-            weighted by the attention score (#batch, time1, time2).
+        Args:
+            value(Tensor): Transformed value (#batch, n_head, time2, d_k).
+            scores(Tensor): Attention score (#batch, n_head, time1, time2).
+            mask(Tensor, optional): Mask (#batch, 1, time2) or (#batch, time1, time2). (Default value = None)
+
+        Returns:
+            Tensor: Transformed value (#batch, time1, d_model) weighted by the attention score (#batch, time1, time2).
         """
         n_batch = paddle.shape(value)[0]
         softmax = paddle.nn.Softmax(axis=-1)
@@ -132,21 +112,14 @@ class MultiHeadedAttention(nn.Layer):
     def forward(self, query, key, value, mask=None):
         """Compute scaled dot product attention.
-        Parameters
-        ----------
-        query : paddle.Tensor
-            Query tensor (#batch, time1, size).
-        key : paddle.Tensor
-            Key tensor (#batch, time2, size).
-        value : paddle.Tensor
-            Value tensor (#batch, time2, size).
-        mask : paddle.Tensor
-            Mask tensor (#batch, 1, time2) or (#batch, time1, time2).
-
-        Returns
-        ----------
-        paddle.Tensor
-            Output tensor (#batch, time1, d_model).
+        Args:
+            query(Tensor): Query tensor (#batch, time1, size).
+            key(Tensor): Key tensor (#batch, time2, size).
+            value(Tensor): Value tensor (#batch, time2, size).
+            mask(Tensor, optional): Mask tensor (#batch, 1, time2) or (#batch, time1, time2). (Default value = None)
+
+        Returns:
+            Tensor: Output tensor (#batch, time1, d_model).
         """
         q, k, v = self.forward_qkv(query, key, value)
         scores = paddle.matmul(q, k.transpose(
@@ -159,16 +132,12 @@ class RelPositionMultiHeadedAttention(MultiHeadedAttention):
     """Multi-Head Attention layer with relative position encoding (new implementation).
     Details can be found in https://github.com/espnet/espnet/pull/2816.
     Paper: https://arxiv.org/abs/1901.02860
-    Parameters
-    ----------
-    n_head : int
-        The number of heads.
-    n_feat : int
-        The number of features.
-    dropout_rate : float
-        Dropout rate.
-    zero_triu : bool
-        Whether to zero the upper triangular part of attention matrix.
+
+    Args:
+        n_head (int): The number of heads.
+        n_feat (int): The number of features.
+        dropout_rate (float): Dropout rate.
+        zero_triu (bool): Whether to zero the upper triangular part of attention matrix.
     """

     def __init__(self, n_head, n_feat, dropout_rate, zero_triu=False):
@@ -191,15 +160,11 @@ class RelPositionMultiHeadedAttention(MultiHeadedAttention):
     def rel_shift(self, x):
         """Compute relative positional encoding.
-        Parameters
-        ----------
-        x : paddle.Tensor
-            Input tensor (batch, head, time1, 2*time1-1).
-            time1 means the length of query vector.
-        Returns
-        ----------
-        paddle.Tensor
-            Output tensor.
+        Args:
+            x(Tensor): Input tensor (batch, head, time1, 2*time1-1).
+                time1 means the length of query vector.
+
+        Returns:
+            Tensor: Output tensor.
         """
         b, h, t1, t2 = paddle.shape(x)
         zero_pad = paddle.zeros((b, h, t1, 1))
@@ -216,24 +181,16 @@ class RelPositionMultiHeadedAttention(MultiHeadedAttention):
     def forward(self, query, key, value, pos_emb, mask):
         """Compute 'Scaled Dot Product Attention' with rel. positional encoding.
-        Parameters
-        ----------
-        query : paddle.Tensor
-            Query tensor (#batch, time1, size).
-        key : paddle.Tensor
-            Key tensor (#batch, time2, size).
-        value : paddle.Tensor
-            Value tensor (#batch, time2, size).
-        pos_emb : paddle.Tensor
-            Positional embedding tensor
-            (#batch, 2*time1-1, size).
-        mask : paddle.Tensor
-            Mask tensor (#batch, 1, time2) or
-            (#batch, time1, time2).
-        Returns
-        ----------
-        paddle.Tensor
-            Output tensor (#batch, time1, d_model).
+
+        Args:
+            query(Tensor): Query tensor (#batch, time1, size).
+            key(Tensor): Key tensor (#batch, time2, size).
+            value(Tensor): Value tensor (#batch, time2, size).
+            pos_emb(Tensor): Positional embedding tensor (#batch, 2*time1-1, size).
+            mask(Tensor): Mask tensor (#batch, 1, time2) or (#batch, time1, time2).
+
+        Returns:
+            Tensor: Output tensor (#batch, time1, d_model).
         """
         q, k, v = self.forward_qkv(query, key, value)
         # (batch, time1, head, d_k)
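A self-contained sketch of the scaled dot-product step that `forward_qkv` and `forward_attention` split between them (head counts and shapes are illustrative):

    import paddle
    import paddle.nn.functional as F

    # (batch, n_head, time, d_k) tensors as forward_qkv would produce them.
    q = paddle.randn([2, 4, 5, 16])
    k = paddle.randn([2, 4, 7, 16])
    v = paddle.randn([2, 4, 7, 16])

    scores = paddle.matmul(q, k.transpose([0, 1, 3, 2])) / (16 ** 0.5)
    att_w = F.softmax(scores, axis=-1)            # (2, 4, 5, 7)
    context = paddle.matmul(att_w, v)             # (2, 4, 5, 16)

    # Merge heads back to (batch, time1, d_model), with d_model = n_head * d_k.
    out = context.transpose([0, 2, 1, 3]).reshape([2, 5, 64])
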
diff --git a/paddlespeech/t2s/modules/transformer/decoder.py b/paddlespeech/t2s/modules/transformer/decoder.py
index fe2949f4..a8db7345 100644
--- a/paddlespeech/t2s/modules/transformer/decoder.py
+++ b/paddlespeech/t2s/modules/transformer/decoder.py
@@ -36,51 +36,32 @@ from paddlespeech.t2s.modules.transformer.repeat import repeat
 class Decoder(nn.Layer):
     """Transfomer decoder module.
-    Parameters
-    ----------
-    odim : int
-        Output diminsion.
-    self_attention_layer_type : str
-        Self-attention layer type.
-    attention_dim : int
-        Dimention of attention.
-    attention_heads : int
-        The number of heads of multi head attention.
-    conv_wshare : int
-        The number of kernel of convolution. Only used in
-        self_attention_layer_type == "lightconv*" or "dynamiconv*".
-    conv_kernel_length : Union[int, str])
-        Kernel size str of convolution
-        (e.g. 71_71_71_71_71_71).
-        Only used in self_attention_layer_type == "lightconv*" or "dynamiconv*".
-    conv_usebias : bool
-        Whether to use bias in convolution. Only used in
-        self_attention_layer_type == "lightconv*" or "dynamiconv*".
-    linear_units : int
-        The number of units of position-wise feed forward.
-    num_blocks : int
-        The number of decoder blocks.
-    dropout_rate : float
-        Dropout rate.
-    positional_dropout_rate : float
-        Dropout rate after adding positional encoding.
-    self_attention_dropout_rate : float
-        Dropout rate in self-attention.
-    src_attention_dropout_rate : float
-        Dropout rate in source-attention.
-    input_layer : (Union[str, nn.Layer])
-        Input layer type.
-    use_output_layer : bool
-        Whether to use output layer.
-    pos_enc_class : nn.Layer
-        Positional encoding module class.
-        `PositionalEncoding `or `ScaledPositionalEncoding`
-    normalize_before : bool
-        Whether to use layer_norm before the first block.
-    concat_after : bool
-        Whether to concat attention layer's input and output.
-        if True, additional linear will be applied.
-        i.e. x -> x + linear(concat(x, att(x)))
-        if False, no additional linear will be applied. i.e. x -> x + att(x)
+    Args:
+        odim (int): Output dimension.
+        self_attention_layer_type (str): Self-attention layer type.
+        attention_dim (int): Dimension of attention.
+        attention_heads (int): The number of heads of multi head attention.
+        conv_wshare (int): The number of kernel of convolution. Only used in
+            self_attention_layer_type == "lightconv*" or "dynamiconv*".
+        conv_kernel_length (Union[int, str]): Kernel size str of convolution
+            (e.g. 71_71_71_71_71_71). Only used in self_attention_layer_type == "lightconv*" or "dynamiconv*".
+        conv_usebias (bool): Whether to use bias in convolution. Only used in
+            self_attention_layer_type == "lightconv*" or "dynamiconv*".
+        linear_units (int): The number of units of position-wise feed forward.
+        num_blocks (int): The number of decoder blocks.
+        dropout_rate (float): Dropout rate.
+        positional_dropout_rate (float): Dropout rate after adding positional encoding.
+        self_attention_dropout_rate (float): Dropout rate in self-attention.
+        src_attention_dropout_rate (float): Dropout rate in source-attention.
+        input_layer (Union[str, nn.Layer]): Input layer type.
+        use_output_layer (bool): Whether to use output layer.
+        pos_enc_class (nn.Layer): Positional encoding module class.
+            `PositionalEncoding` or `ScaledPositionalEncoding`
+        normalize_before (bool): Whether to use layer_norm before the first block.
+        concat_after (bool): Whether to concat attention layer's input and output.
+            if True, additional linear will be applied.
+            i.e. x -> x + linear(concat(x, att(x)))
+            if False, no additional linear will be applied. i.e. x -> x + att(x)
     """

@@ -161,27 +142,18 @@ class Decoder(nn.Layer):
     def forward(self, tgt, tgt_mask, memory, memory_mask):
         """Forward decoder.
-
-        Parameters
-        ----------
-        tgt : paddle.Tensor
-            Input token ids, int64 (#batch, maxlen_out) if input_layer == "embed".
-            In the other case, input tensor (#batch, maxlen_out, odim).
-        tgt_mask : paddle.Tensor
-            Input token mask (#batch, maxlen_out).
-        memory : paddle.Tensor
-            Encoded memory, float32 (#batch, maxlen_in, feat).
-        memory_mask : paddle.Tensor
-            Encoded memory mask (#batch, maxlen_in).
-
-        Returns
-        ----------
-        paddle.Tensor
-            Decoded token score before softmax (#batch, maxlen_out, odim)
-            if use_output_layer is True. In the other case,final block outputs
-            (#batch, maxlen_out, attention_dim).
-        paddle.Tensor
-            Score mask before softmax (#batch, maxlen_out).
+        Args:
+            tgt(Tensor): Input token ids, int64 (#batch, maxlen_out) if input_layer == "embed".
+                In the other case, input tensor (#batch, maxlen_out, odim).
+            tgt_mask(Tensor): Input token mask (#batch, maxlen_out).
+            memory(Tensor): Encoded memory, float32 (#batch, maxlen_in, feat).
+            memory_mask(Tensor): Encoded memory mask (#batch, maxlen_in).
+
+        Returns:
+            Tensor:
+                Decoded token score before softmax (#batch, maxlen_out, odim) if use_output_layer is True.
+                In the other case, final block outputs (#batch, maxlen_out, attention_dim).
+            Tensor: Score mask before softmax (#batch, maxlen_out).
         """
         x = self.embed(tgt)
@@ -196,23 +168,15 @@ class Decoder(nn.Layer):
     def forward_one_step(self, tgt, tgt_mask, memory, cache=None):
         """Forward one step.
-        Parameters
-        ----------
-        tgt : paddle.Tensor
-            Input token ids, int64 (#batch, maxlen_out).
-        tgt_mask : paddle.Tensor
-            Input token mask (#batch, maxlen_out).
-        memory : paddle.Tensor
-            Encoded memory, float32 (#batch, maxlen_in, feat).
-        cache : (List[paddle.Tensor])
-            List of cached tensors.
-            Each tensor shape should be (#batch, maxlen_out - 1, size).
-        Returns
-        ----------
-        paddle.Tensor
-            Output tensor (batch, maxlen_out, odim).
-        List[paddle.Tensor]
-            List of cache tensors of each decoder layer.
+        Args:
+            tgt(Tensor): Input token ids, int64 (#batch, maxlen_out).
+            tgt_mask(Tensor): Input token mask (#batch, maxlen_out).
+            memory(Tensor): Encoded memory, float32 (#batch, maxlen_in, feat).
+            cache(List[Tensor], optional): List of cached tensors.
+                Each tensor shape should be (#batch, maxlen_out - 1, size). (Default value = None)
+
+        Returns:
+            Tensor: Output tensor (batch, maxlen_out, odim).
+            List[Tensor]: List of cache tensors of each decoder layer.
         """
         x = self.embed(tgt)
@@ -254,20 +218,14 @@ class Decoder(nn.Layer):
                   xs: paddle.Tensor) -> Tuple[paddle.Tensor, List[Any]]:
         """Score new token batch (required).
-        Parameters
-        ----------
-        ys : paddle.Tensor
-            paddle.int64 prefix tokens (n_batch, ylen).
-        states : List[Any]
-            Scorer states for prefix tokens.
-        xs : paddle.Tensor
-            The encoder feature that generates ys (n_batch, xlen, n_feat).

-        Returns
-        ----------
-        tuple[paddle.Tensor, List[Any]]
-            Tuple ofbatchfied scores for next token with shape of `(n_batch, n_vocab)`
-            and next state list for ys.
+        Args:
+            ys(Tensor): paddle.int64 prefix tokens (n_batch, ylen).
+            states(List[Any]): Scorer states for prefix tokens.
+            xs(Tensor): The encoder feature that generates ys (n_batch, xlen, n_feat).

+        Returns:
+            tuple[Tensor, List[Any]]:
+                Tuple of batchified scores for next token with shape of `(n_batch, n_vocab)` and next state list for ys.
         """
         # merge states
diff --git a/paddlespeech/t2s/modules/transformer/decoder_layer.py b/paddlespeech/t2s/modules/transformer/decoder_layer.py
index 44978f1e..9a13cd79 100644
--- a/paddlespeech/t2s/modules/transformer/decoder_layer.py
+++ b/paddlespeech/t2s/modules/transformer/decoder_layer.py
@@ -22,28 +22,21 @@ from paddlespeech.t2s.modules.layer_norm import LayerNorm
 class DecoderLayer(nn.Layer):
     """Single decoder layer module.
-    Parameters
-    ----------
-    size : int
-        Input dimension.
-    self_attn : nn.Layer
-        Self-attention module instance.
-        `MultiHeadedAttention` instance can be used as the argument.
-    src_attn : nn.Layer
-        Self-attention module instance.
-        `MultiHeadedAttention` instance can be used as the argument.
-    feed_forward : nn.Layer
-        Feed-forward module instance.
-        `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance can be used as the argument.
-    dropout_rate : float
-        Dropout rate.
-    normalize_before : bool
-        Whether to use layer_norm before the first block.
-    concat_after : bool
-        Whether to concat attention layer's input and output.
-        if True, additional linear will be applied.
-        i.e. x -> x + linear(concat(x, att(x)))
-        if False, no additional linear will be applied. i.e. x -> x + att(x)
+
+    Args:
+        size (int): Input dimension.
+        self_attn (nn.Layer): Self-attention module instance.
+            `MultiHeadedAttention` instance can be used as the argument.
+        src_attn (nn.Layer): Source attention module instance.
+            `MultiHeadedAttention` instance can be used as the argument.
+        feed_forward (nn.Layer): Feed-forward module instance.
+            `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance can be used as the argument.
+        dropout_rate (float): Dropout rate.
+        normalize_before (bool): Whether to use layer_norm before the first block.
+        concat_after (bool): Whether to concat attention layer's input and output.
+            if True, additional linear will be applied.
+            i.e. x -> x + linear(concat(x, att(x)))
+            if False, no additional linear will be applied. i.e. x -> x + att(x)
     """

@@ -75,30 +68,22 @@ class DecoderLayer(nn.Layer):
     def forward(self, tgt, tgt_mask, memory, memory_mask, cache=None):
         """Compute decoded features.
-        Parameters
-        ----------
-        tgt : paddle.Tensor
-            Input tensor (#batch, maxlen_out, size).
-        tgt_mask : paddle.Tensor
-            Mask for input tensor (#batch, maxlen_out).
-        memory : paddle.Tensor
-            Encoded memory, float32 (#batch, maxlen_in, size).
-        memory_mask : paddle.Tensor
-            Encoded memory mask (#batch, maxlen_in).
-        cache : List[paddle.Tensor]
-            List of cached tensors.
-            Each tensor shape should be (#batch, maxlen_out - 1, size).
-
-        Returns
-        ----------
-        paddle.Tensor
-            Output tensor(#batch, maxlen_out, size).
-        paddle.Tensor
-            Mask for output tensor (#batch, maxlen_out).
-        paddle.Tensor
-            Encoded memory (#batch, maxlen_in, size).
-        paddle.Tensor
-            Encoded memory mask (#batch, maxlen_in).
+        Args:
+            tgt(Tensor): Input tensor (#batch, maxlen_out, size).
+            tgt_mask(Tensor): Mask for input tensor (#batch, maxlen_out).
+            memory(Tensor): Encoded memory, float32 (#batch, maxlen_in, size).
+            memory_mask(Tensor): Encoded memory mask (#batch, maxlen_in).
+            cache(List[Tensor], optional): List of cached tensors.
+                Each tensor shape should be (#batch, maxlen_out - 1, size). (Default value = None)
+        Returns:
+            Tensor: Output tensor (#batch, maxlen_out, size).
+            Tensor: Mask for output tensor (#batch, maxlen_out).
+            Tensor: Encoded memory (#batch, maxlen_in, size).
+            Tensor: Encoded memory mask (#batch, maxlen_in).
         """
         residual = tgt
diff --git a/paddlespeech/t2s/modules/transformer/embedding.py b/paddlespeech/t2s/modules/transformer/embedding.py
index 40ab03ee..d9339d20 100644
--- a/paddlespeech/t2s/modules/transformer/embedding.py
+++ b/paddlespeech/t2s/modules/transformer/embedding.py
@@ -22,18 +22,12 @@ from paddle import nn
 class PositionalEncoding(nn.Layer):
     """Positional encoding.
-    Parameters
-    ----------
-    d_model : int
-        Embedding dimension.
-    dropout_rate : float
-        Dropout rate.
-    max_len : int
-        Maximum input length.
-    reverse : bool
-        Whether to reverse the input position.
-    type : str
-        dtype of param
+    Args:
+        d_model (int): Embedding dimension.
+        dropout_rate (float): Dropout rate.
+        max_len (int): Maximum input length.
+        reverse (bool): Whether to reverse the input position.
+        type (str): dtype of param
     """

     def __init__(self,
@@ -73,15 +67,11 @@ class PositionalEncoding(nn.Layer):
     def forward(self, x: paddle.Tensor):
         """Add positional encoding.
-        Parameters
-        ----------
-        x : paddle.Tensor
-            Input tensor (batch, time, `*`).
+        Args:
+            x (Tensor): Input tensor (batch, time, `*`).

-        Returns
-        ----------
-        paddle.Tensor
-            Encoded tensor (batch, time, `*`).
+        Returns:
+            Tensor: Encoded tensor (batch, time, `*`).
         """
         self.extend_pe(x)
         T = paddle.shape(x)[1]
@@ -91,19 +81,13 @@ class PositionalEncoding(nn.Layer):
 class ScaledPositionalEncoding(PositionalEncoding):
     """Scaled positional encoding module.
-    See Sec. 3.2 https://arxiv.org/abs/1809.08895
-    Parameters
-    ----------
-    d_model : int
-        Embedding dimension.
-    dropout_rate : float
-        Dropout rate.
-    max_len : int
-        Maximum input length.
-    dtype : str
-        dtype of param
+    See Sec. 3.2 https://arxiv.org/abs/1809.08895
+
+    Args:
+        d_model (int): Embedding dimension.
+        dropout_rate (float): Dropout rate.
+        max_len (int): Maximum input length.
+        dtype (str): dtype of param
     """

     def __init__(self, d_model, dropout_rate, max_len=5000, dtype="float32"):
@@ -126,14 +110,10 @@ class ScaledPositionalEncoding(PositionalEncoding):
     def forward(self, x):
         """Add positional encoding.
-        Parameters
-        ----------
-        x : paddle.Tensor
-            Input tensor (batch, time, `*`).
-        Returns
-        ----------
-        paddle.Tensor
-            Encoded tensor (batch, time, `*`).
+        Args:
+            x (Tensor): Input tensor (batch, time, `*`).
+        Returns:
+            Tensor: Encoded tensor (batch, time, `*`).
         """
         self.extend_pe(x)
         T = paddle.shape(x)[1]
@@ -145,14 +125,11 @@ class RelPositionalEncoding(nn.Layer):
     """Relative positional encoding module (new implementation).
     Details can be found in https://github.com/espnet/espnet/pull/2816.
     See : Appendix B in https://arxiv.org/abs/1901.02860
-    Parameters
-    ----------
-    d_model : int
-        Embedding dimension.
-    dropout_rate : float
-        Dropout rate.
-    max_len : int
-        Maximum input length.
+
+    Args:
+        d_model (int): Embedding dimension.
+        dropout_rate (float): Dropout rate.
+        max_len (int): Maximum input length.
     """

     def __init__(self, d_model, dropout_rate, max_len=5000, dtype="float32"):
@@ -197,14 +174,10 @@ class RelPositionalEncoding(nn.Layer):
     def forward(self, x: paddle.Tensor):
         """Add positional encoding.
-        Parameters
-        ----------
-        x : paddle.Tensor
-            Input tensor (batch, time, `*`).
-        Returns
-        ----------
-        paddle.Tensor
-            Encoded tensor (batch, time, `*`).
+        Args:
+            x (Tensor): Input tensor (batch, time, `*`).
+        Returns:
+            Tensor: Encoded tensor (batch, time, `*`).
         """
         self.extend_pe(x)
         x = x * self.xscale
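For orientation, a minimal sinusoidal table in the spirit of the `PositionalEncoding` classes above (d_model, max_len and batch shapes are illustrative; the sin/cos halves are concatenated rather than interleaved, a common simplification):

    import math
    import paddle

    d_model, max_len = 8, 100
    position = paddle.arange(max_len, dtype="float32").unsqueeze(1)   # (max_len, 1)
    div_term = paddle.exp(
        paddle.arange(0, d_model, 2, dtype="float32")
        * -(math.log(10000.0) / d_model))                             # (d_model/2,)
    pe = paddle.concat(
        [paddle.sin(position * div_term),
         paddle.cos(position * div_term)], axis=-1)                   # (max_len, d_model)

    x = paddle.randn([2, 10, d_model])                 # (batch, time, d_model)
    x = x * math.sqrt(d_model) + pe[:10].unsqueeze(0)  # scale, then add encoding
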
diff --git a/paddlespeech/t2s/modules/transformer/encoder.py b/paddlespeech/t2s/modules/transformer/encoder.py
index 8bf71b41..2b3ee788 100644
--- a/paddlespeech/t2s/modules/transformer/encoder.py
+++ b/paddlespeech/t2s/modules/transformer/encoder.py
@@ -37,62 +37,37 @@ from paddlespeech.t2s.modules.transformer.subsampling import Conv2dSubsampling
 class BaseEncoder(nn.Layer):
     """Base Encoder module.
-    Parameters
-    ----------
-    idim : int
-        Input dimension.
-    attention_dim : int
-        Dimention of attention.
-    attention_heads : int
-        The number of heads of multi head attention.
-    linear_units : int
-        The number of units of position-wise feed forward.
-    num_blocks : int
-        The number of decoder blocks.
-    dropout_rate : float
-        Dropout rate.
-    positional_dropout_rate : float
-        Dropout rate after adding positional encoding.
-    attention_dropout_rate : float
-        Dropout rate in attention.
-    input_layer : Union[str, nn.Layer]
-        Input layer type.
-    normalize_before : bool
-        Whether to use layer_norm before the first block.
-    concat_after : bool
-        Whether to concat attention layer's input and output.
-        if True, additional linear will be applied.
-        i.e. x -> x + linear(concat(x, att(x)))
-        if False, no additional linear will be applied. i.e. x -> x + att(x)
-    positionwise_layer_type : str
-        "linear", "conv1d", or "conv1d-linear".
-    positionwise_conv_kernel_size : int
-        Kernel size of positionwise conv1d layer.
-    macaron_style : bool
-        Whether to use macaron style for positionwise layer.
-    pos_enc_layer_type : str
-        Encoder positional encoding layer type.
-    selfattention_layer_type : str
-        Encoder attention layer type.
-    activation_type : str
-        Encoder activation function type.
-    use_cnn_module : bool
-        Whether to use convolution module.
-    zero_triu : bool
-        Whether to zero the upper triangular part of attention matrix.
-    cnn_module_kernel : int
-        Kernerl size of convolution module.
-    padding_idx : int
-        Padding idx for input_layer=embed.
-    stochastic_depth_rate : float
-        Maximum probability to skip the encoder layer.
-    intermediate_layers : Union[List[int], None]
-        indices of intermediate CTC layer.
-        indices start from 1.
-        if not None, intermediate outputs are returned (which changes return type
-        signature.)
-    encoder_type: str
-        "transformer", or "conformer".
+
+    Args:
+        idim (int): Input dimension.
+        attention_dim (int): Dimension of attention.
+        attention_heads (int): The number of heads of multi head attention.
+        linear_units (int): The number of units of position-wise feed forward.
+        num_blocks (int): The number of decoder blocks.
+        dropout_rate (float): Dropout rate.
+        positional_dropout_rate (float): Dropout rate after adding positional encoding.
+        attention_dropout_rate (float): Dropout rate in attention.
+        input_layer (Union[str, nn.Layer]): Input layer type.
+        normalize_before (bool): Whether to use layer_norm before the first block.
+        concat_after (bool): Whether to concat attention layer's input and output.
+            if True, additional linear will be applied.
+            i.e. x -> x + linear(concat(x, att(x)))
+            if False, no additional linear will be applied. i.e. x -> x + att(x)
+        positionwise_layer_type (str): "linear", "conv1d", or "conv1d-linear".
+        positionwise_conv_kernel_size (int): Kernel size of positionwise conv1d layer.
+        macaron_style (bool): Whether to use macaron style for positionwise layer.
+        pos_enc_layer_type (str): Encoder positional encoding layer type.
+        selfattention_layer_type (str): Encoder attention layer type.
+        activation_type (str): Encoder activation function type.
+        use_cnn_module (bool): Whether to use convolution module.
+        zero_triu (bool): Whether to zero the upper triangular part of attention matrix.
+        cnn_module_kernel (int): Kernel size of convolution module.
+        padding_idx (int): Padding idx for input_layer=embed.
+        stochastic_depth_rate (float): Maximum probability to skip the encoder layer.
+        intermediate_layers (Union[List[int], None]): indices of intermediate CTC layer.
+            indices start from 1.
+            if not None, intermediate outputs are returned (which changes return type
+            signature.)
+        encoder_type (str): "transformer", or "conformer".
     """

     def __init__(self,
@@ -290,19 +265,13 @@ class BaseEncoder(nn.Layer):
     def forward(self, xs, masks):
         """Encode input sequence.
-        Parameters
-        ----------
-        xs : paddle.Tensor
-            Input tensor (#batch, time, idim).
-        masks : paddle.Tensor
-            Mask tensor (#batch, 1, time).
-
-        Returns
-        ----------
-        paddle.Tensor
-            Output tensor (#batch, time, attention_dim).
-        paddle.Tensor
-            Mask tensor (#batch, 1, time).
+        Args:
+            xs (Tensor): Input tensor (#batch, time, idim).
+            masks (Tensor): Mask tensor (#batch, 1, time).
+
+        Returns:
+            Tensor: Output tensor (#batch, time, attention_dim).
+            Tensor: Mask tensor (#batch, 1, time).
         """
         xs = self.embed(xs)
         xs, masks = self.encoders(xs, masks)
@@ -313,45 +282,28 @@ class BaseEncoder(nn.Layer):
 class TransformerEncoder(BaseEncoder):
     """Transformer encoder module.
-    Parameters
-    ----------
-    idim : int
-        Input dimension.
-    attention_dim : int
-        Dimention of attention.
-    attention_heads : int
-        The number of heads of multi head attention.
-    linear_units : int
-        The number of units of position-wise feed forward.
-    num_blocks : int
-        The number of decoder blocks.
-    dropout_rate : float
-        Dropout rate.
-    positional_dropout_rate : float
-        Dropout rate after adding positional encoding.
-    attention_dropout_rate : float
-        Dropout rate in attention.
-    input_layer : Union[str, paddle.nn.Layer]
-        Input layer type.
-    pos_enc_layer_type : str
-        Encoder positional encoding layer type.
-    normalize_before : bool
-        Whether to use layer_norm before the first block.
-    concat_after : bool
-        Whether to concat attention layer's input and output.
-        if True, additional linear will be applied.
-        i.e. x -> x + linear(concat(x, att(x)))
-        if False, no additional linear will be applied. i.e. x -> x + att(x)
-    positionwise_layer_type : str
-        "linear", "conv1d", or "conv1d-linear".
-    positionwise_conv_kernel_size : int
-        Kernel size of positionwise conv1d layer.
-    selfattention_layer_type : str
-        Encoder attention layer type.
-    activation_type : str
-        Encoder activation function type.
-    padding_idx : int
-        Padding idx for input_layer=embed.
+
+    Args:
+        idim (int): Input dimension.
+        attention_dim (int): Dimension of attention.
+        attention_heads (int): The number of heads of multi head attention.
+        linear_units (int): The number of units of position-wise feed forward.
+        num_blocks (int): The number of decoder blocks.
+        dropout_rate (float): Dropout rate.
+        positional_dropout_rate (float): Dropout rate after adding positional encoding.
+        attention_dropout_rate (float): Dropout rate in attention.
+        input_layer (Union[str, paddle.nn.Layer]): Input layer type.
+        pos_enc_layer_type (str): Encoder positional encoding layer type.
+        normalize_before (bool): Whether to use layer_norm before the first block.
+        concat_after (bool): Whether to concat attention layer's input and output.
+            if True, additional linear will be applied.
+            i.e. x -> x + linear(concat(x, att(x)))
+            if False, no additional linear will be applied. i.e. x -> x + att(x)
+        positionwise_layer_type (str): "linear", "conv1d", or "conv1d-linear".
+        positionwise_conv_kernel_size (int): Kernel size of positionwise conv1d layer.
+        selfattention_layer_type (str): Encoder attention layer type.
+        activation_type (str): Encoder activation function type.
+        padding_idx (int): Padding idx for input_layer=embed.
     """

     def __init__(
@@ -397,19 +349,13 @@ class TransformerEncoder(BaseEncoder):
     def forward(self, xs, masks):
         """Encode input sequence.
-        Parameters
-        ----------
-        xs : paddle.Tensor
-            Input tensor (#batch, time, idim).
-        masks : paddle.Tensor
-            Mask tensor (#batch, 1, time).
-
-        Returns
-        ----------
-        paddle.Tensor
-            Output tensor (#batch, time, attention_dim).
-        paddle.Tensor
-            Mask tensor (#batch, 1, time).
+        Args:
+            xs(Tensor): Input tensor (#batch, time, idim).
+            masks(Tensor): Mask tensor (#batch, 1, time).
+
+        Returns:
+            Tensor: Output tensor (#batch, time, attention_dim).
+            Tensor: Mask tensor (#batch, 1, time).
""" xs = self.embed(xs) xs, masks = self.encoders(xs, masks) @@ -420,23 +366,15 @@ class TransformerEncoder(BaseEncoder): def forward_one_step(self, xs, masks, cache=None): """Encode input frame. - Parameters - ---------- - xs : paddle.Tensor - Input tensor. - masks : paddle.Tensor - Mask tensor. - cache : List[paddle.Tensor] - List of cache tensors. - - Returns - ---------- - paddle.Tensor - Output tensor. - paddle.Tensor - Mask tensor. - List[paddle.Tensor] - List of new cache tensors. + Args: + xs (Tensor): Input tensor. + masks (Tensor): Mask tensor. + cache (List[Tensor]): List of cache tensors. + + Returns: + Tensor: Output tensor. + Tensor: Mask tensor. + List[Tensor]: List of new cache tensors. """ xs = self.embed(xs) @@ -453,60 +391,35 @@ class TransformerEncoder(BaseEncoder): class ConformerEncoder(BaseEncoder): """Conformer encoder module. - Parameters - ---------- - idim : int - Input dimension. - attention_dim : int - Dimention of attention. - attention_heads : int - The number of heads of multi head attention. - linear_units : int - The number of units of position-wise feed forward. - num_blocks : int - The number of decoder blocks. - dropout_rate : float - Dropout rate. - positional_dropout_rate : float - Dropout rate after adding positional encoding. - attention_dropout_rate : float - Dropout rate in attention. - input_layer : Union[str, nn.Layer] - Input layer type. - normalize_before : bool - Whether to use layer_norm before the first block. - concat_after : bool - Whether to concat attention layer's input and output. - if True, additional linear will be applied. - i.e. x -> x + linear(concat(x, att(x))) - if False, no additional linear will be applied. i.e. x -> x + att(x) - positionwise_layer_type : str - "linear", "conv1d", or "conv1d-linear". - positionwise_conv_kernel_size : int - Kernel size of positionwise conv1d layer. - macaron_style : bool - Whether to use macaron style for positionwise layer. - pos_enc_layer_type : str - Encoder positional encoding layer type. - selfattention_layer_type : str - Encoder attention layer type. - activation_type : str - Encoder activation function type. - use_cnn_module : bool - Whether to use convolution module. - zero_triu : bool - Whether to zero the upper triangular part of attention matrix. - cnn_module_kernel : int - Kernerl size of convolution module. - padding_idx : int - Padding idx for input_layer=embed. - stochastic_depth_rate : float - Maximum probability to skip the encoder layer. - intermediate_layers : Union[List[int], None] - indices of intermediate CTC layer. - indices start from 1. - if not None, intermediate outputs are returned (which changes return type - signature.) + + Args: + idim (int): Input dimension. + attention_dim (int): Dimention of attention. + attention_heads (int): The number of heads of multi head attention. + linear_units (int): The number of units of position-wise feed forward. + num_blocks (int): The number of decoder blocks. + dropout_rate (float): Dropout rate. + positional_dropout_rate (float): Dropout rate after adding positional encoding. + attention_dropout_rate (float): Dropout rate in attention. + input_layer (Union[str, nn.Layer]): Input layer type. + normalize_before (bool): Whether to use layer_norm before the first block. + concat_after (bool):Whether to concat attention layer's input and output. + if True, additional linear will be applied. + i.e. x -> x + linear(concat(x, att(x))) + if False, no additional linear will be applied. i.e. 
+        positionwise_layer_type (str): "linear", "conv1d", or "conv1d-linear".
+        positionwise_conv_kernel_size (int): Kernel size of positionwise conv1d layer.
+        macaron_style (bool): Whether to use macaron style for positionwise layer.
+        pos_enc_layer_type (str): Encoder positional encoding layer type.
+        selfattention_layer_type (str): Encoder attention layer type.
+        activation_type (str): Encoder activation function type.
+        use_cnn_module (bool): Whether to use convolution module.
+        zero_triu (bool): Whether to zero the upper triangular part of attention matrix.
+        cnn_module_kernel (int): Kernel size of convolution module.
+        padding_idx (int): Padding idx for input_layer=embed.
+        stochastic_depth_rate (float): Maximum probability to skip the encoder layer.
+        intermediate_layers (Union[List[int], None]): indices of intermediate CTC layer. indices start from 1.
+            if not None, intermediate outputs are returned (which changes return type signature.)
     """

     def __init__(
@@ -563,18 +476,13 @@ class ConformerEncoder(BaseEncoder):
     def forward(self, xs, masks):
         """Encode input sequence.
-        Parameters
-        ----------
-        xs : paddle.Tensor
-            Input tensor (#batch, time, idim).
-        masks : paddle.Tensor
-            Mask tensor (#batch, 1, time).
-        Returns
-        ----------
-        paddle.Tensor
-            Output tensor (#batch, time, attention_dim).
-        paddle.Tensor
-            Mask tensor (#batch, 1, time).
+
+        Args:
+            xs (Tensor): Input tensor (#batch, time, idim).
+            masks (Tensor): Mask tensor (#batch, 1, time).
+        Returns:
+            Tensor: Output tensor (#batch, time, attention_dim).
+            Tensor: Mask tensor (#batch, 1, time).
         """
         if isinstance(self.embed, (Conv2dSubsampling)):
             xs, masks = self.embed(xs, masks)
diff --git a/paddlespeech/t2s/modules/transformer/encoder_layer.py b/paddlespeech/t2s/modules/transformer/encoder_layer.py
index f55ded3d..72372b69 100644
--- a/paddlespeech/t2s/modules/transformer/encoder_layer.py
+++ b/paddlespeech/t2s/modules/transformer/encoder_layer.py
@@ -20,25 +20,18 @@ from paddle import nn
 class EncoderLayer(nn.Layer):
     """Encoder layer module.
-    Parameters
-    ----------
-    size : int
-        Input dimension.
-    self_attn : nn.Layer
-        Self-attention module instance.
-        `MultiHeadedAttention` instance can be used as the argument.
-    feed_forward : nn.Layer
-        Feed-forward module instance.
-        `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance can be used as the argument.
-    dropout_rate : float
-        Dropout rate.
-    normalize_before : bool
-        Whether to use layer_norm before the first block.
-    concat_after : bool
-        Whether to concat attention layer's input and output.
-        if True, additional linear will be applied.
-        i.e. x -> x + linear(concat(x, att(x)))
-        if False, no additional linear will be applied. i.e. x -> x + att(x)
+    Args:
+        size (int): Input dimension.
+        self_attn (nn.Layer): Self-attention module instance.
+            `MultiHeadedAttention` instance can be used as the argument.
+        feed_forward (nn.Layer): Feed-forward module instance.
+            `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance can be used as the argument.
+        dropout_rate (float): Dropout rate.
+        normalize_before (bool): Whether to use layer_norm before the first block.
+        concat_after (bool): Whether to concat attention layer's input and output.
+            if True, additional linear will be applied.
+            i.e. x -> x + linear(concat(x, att(x)))
+            if False, no additional linear will be applied. i.e. x -> x + att(x)
    """

    def __init__(
@@ -65,21 +58,14 @@ class EncoderLayer(nn.Layer):
     def forward(self, x, mask, cache=None):
         """Compute encoded features.
-        Parameters
-        ----------
-        x_input : paddle.Tensor
-            Input tensor (#batch, time, size).
-        mask : paddle.Tensor
-            Mask tensor for the input (#batch, time).
-        cache : paddle.Tensor
-            Cache tensor of the input (#batch, time - 1, size).
+        Args:
+            x(Tensor): Input tensor (#batch, time, size).
+            mask(Tensor): Mask tensor for the input (#batch, time).
+            cache(Tensor, optional): Cache tensor of the input (#batch, time - 1, size). (Default value = None)

-        Returns
-        ----------
-        paddle.Tensor
-            Output tensor (#batch, time, size).
-        paddle.Tensor
-            Mask tensor (#batch, time).
+        Returns:
+            Tensor: Output tensor (#batch, time, size).
+            Tensor: Mask tensor (#batch, time).
         """
         residual = x
         if self.normalize_before:
diff --git a/paddlespeech/t2s/modules/transformer/lightconv.py b/paddlespeech/t2s/modules/transformer/lightconv.py
index ccf84c8a..9bcc1acf 100644
--- a/paddlespeech/t2s/modules/transformer/lightconv.py
+++ b/paddlespeech/t2s/modules/transformer/lightconv.py
@@ -30,20 +30,13 @@ class LightweightConvolution(nn.Layer):
     This implementation is based on
     https://github.com/pytorch/fairseq/tree/master/fairseq
-    Parameters
-    ----------
-    wshare : int
-        the number of kernel of convolution
-    n_feat : int
-        the number of features
-    dropout_rate : float
-        dropout_rate
-    kernel_size : int
-        kernel size (length)
-    use_kernel_mask : bool
-        Use causal mask or not for convolution kernel
-    use_bias : bool
-        Use bias term or not.
+    Args:
+        wshare (int): the number of kernel of convolution
+        n_feat (int): the number of features
+        dropout_rate (float): dropout rate
+        kernel_size (int): kernel size (length)
+        use_kernel_mask (bool): Use causal mask or not for convolution kernel
+        use_bias (bool): Use bias term or not.
     """

@@ -100,21 +93,14 @@ class LightweightConvolution(nn.Layer):
         This function takes query, key and value but uses only query.
         This is just for compatibility with self-attention layer (attention.py)
-        Parameters
-        ----------
-        query : paddle.Tensor
-            (batch, time1, d_model) input tensor
-        key : paddle.Tensor
-            (batch, time2, d_model) NOT USED
-        value : paddle.Tensor
-            (batch, time2, d_model) NOT USED
-        mask : paddle.Tensor
-            (batch, time1, time2) mask
-
-        Return
-        ----------
-        x : paddle.Tensor
-            (batch, time1, d_model) ouput
+        Args:
+            query (Tensor): input tensor. (batch, time1, d_model)
+            key (Tensor): NOT USED. (batch, time2, d_model)
+            value (Tensor): NOT USED. (batch, time2, d_model)
+            mask (Tensor): (batch, time1, time2) mask
+
+        Returns:
+            Tensor: output. (batch, time1, d_model)
         """

         # linear -> GLU -> lightconv -> linear
diff --git a/paddlespeech/t2s/modules/transformer/mask.py b/paddlespeech/t2s/modules/transformer/mask.py
index fd97b004..c10e6add 100644
--- a/paddlespeech/t2s/modules/transformer/mask.py
+++ b/paddlespeech/t2s/modules/transformer/mask.py
@@ -17,19 +17,16 @@ import paddle
 def subsequent_mask(size, dtype=paddle.bool):
     """Create mask for subsequent steps (size, size).
-    Parameters
-    ----------
-    size : int
-        size of mask
-    dtype : paddle.dtype
-        result dtype
-    Return
-    ----------
-    paddle.Tensor
-    >>> subsequent_mask(3)
-    [[1, 0, 0],
-     [1, 1, 0],
-     [1, 1, 1]]
+
+    Args:
+        size (int): size of mask
+        dtype (paddle.dtype): result dtype
+    Returns:
+        Tensor:
+            >>> subsequent_mask(3)
+            [[1, 0, 0],
+             [1, 1, 0],
+             [1, 1, 1]]
     """
     ret = paddle.ones([size, size], dtype=dtype)
     return paddle.tril(ret)
@@ -37,19 +34,13 @@ def subsequent_mask(size, dtype=paddle.bool):
 def target_mask(ys_in_pad, ignore_id, dtype=paddle.bool):
     """Create mask for decoder self-attention.
-    Parameters
-    ----------
-    ys_pad : paddle.Tensor
-        batch of padded target sequences (B, Lmax)
-    ignore_id : int
-        index of padding
-    dtype : torch.dtype
-        result dtype
-    Return
-    ----------
-    paddle.Tensor
-        (B, Lmax, Lmax)
+
+    Args:
+        ys_in_pad (Tensor): batch of padded target sequences (B, Lmax)
+        ignore_id (int): index of padding
+        dtype (paddle.dtype): result dtype
+    Returns:
+        Tensor: (B, Lmax, Lmax)
     """
     ys_mask = ys_in_pad != ignore_id
     m = subsequent_mask(ys_mask.shape[-1]).unsqueeze(0)
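A quick check of the two mask helpers just reformatted (the padding index 0 and the toy ids are illustrative):

    import paddle

    # subsequent_mask: lower-triangular causal mask, exactly as implemented above.
    size = 3
    causal = paddle.tril(paddle.ones([size, size], dtype=paddle.bool))
    # [[1, 0, 0], [1, 1, 0], [1, 1, 1]] after casting to int.

    # target_mask combines it with a padding mask, as the context lines show.
    ys_in_pad = paddle.to_tensor([[5, 7, 0], [2, 0, 0]])          # 0 used as padding here
    pad_mask = (ys_in_pad != 0).unsqueeze(-2)                     # (B, 1, Lmax)
    dec_mask = paddle.logical_and(pad_mask, causal.unsqueeze(0))  # (B, Lmax, Lmax)
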
""" x = self.relu(self.w_1(x.transpose([0, 2, 1]))).transpose([0, 2, 1]) diff --git a/paddlespeech/t2s/modules/transformer/positionwise_feed_forward.py b/paddlespeech/t2s/modules/transformer/positionwise_feed_forward.py index 28ed1c31..92af6851 100644 --- a/paddlespeech/t2s/modules/transformer/positionwise_feed_forward.py +++ b/paddlespeech/t2s/modules/transformer/positionwise_feed_forward.py @@ -20,14 +20,10 @@ from paddle import nn class PositionwiseFeedForward(nn.Layer): """Positionwise feed forward layer. - Parameters - ---------- - idim : int - Input dimenstion. - hidden_units : int - The number of hidden units. - dropout_rate : float - Dropout rate. + Args: + idim (int): Input dimenstion. + hidden_units (int): The number of hidden units. + dropout_rate (float): Dropout rate. """ def __init__(self, diff --git a/paddlespeech/t2s/modules/transformer/repeat.py b/paddlespeech/t2s/modules/transformer/repeat.py index f738b556..2073a78b 100644 --- a/paddlespeech/t2s/modules/transformer/repeat.py +++ b/paddlespeech/t2s/modules/transformer/repeat.py @@ -29,16 +29,11 @@ class MultiSequential(paddle.nn.Sequential): def repeat(N, fn): """Repeat module N times. - Parameters - ---------- - N : int - Number of repeat time. - fn : Callable - Function to generate module. + Args: + N (int): Number of repeat time. + fn (Callable): Function to generate module. - Returns - ---------- - MultiSequential - Repeated model instance. + Returns: + MultiSequential: Repeated model instance. """ return MultiSequential(*[fn(n) for n in range(N)]) diff --git a/paddlespeech/t2s/modules/transformer/subsampling.py b/paddlespeech/t2s/modules/transformer/subsampling.py index cf0fca8a..07439705 100644 --- a/paddlespeech/t2s/modules/transformer/subsampling.py +++ b/paddlespeech/t2s/modules/transformer/subsampling.py @@ -21,16 +21,12 @@ from paddlespeech.t2s.modules.transformer.embedding import PositionalEncoding class Conv2dSubsampling(nn.Layer): """Convolutional 2D subsampling (to 1/4 length). - Parameters - ---------- - idim : int - Input dimension. - odim : int - Output dimension. - dropout_rate : float - Dropout rate. - pos_enc : nn.Layer - Custom position encoding layer. + + Args: + idim (int): Input dimension. + odim (int): Output dimension. + dropout_rate (float): Dropout rate. + pos_enc (nn.Layer): Custom position encoding layer. """ def __init__(self, idim, odim, dropout_rate, pos_enc=None): @@ -48,20 +44,12 @@ class Conv2dSubsampling(nn.Layer): def forward(self, x, x_mask): """Subsample x. - Parameters - ---------- - x : paddle.Tensor - Input tensor (#batch, time, idim). - x_mask : paddle.Tensor - Input mask (#batch, 1, time). - Returns - ---------- - paddle.Tensor - Subsampled tensor (#batch, time', odim), - where time' = time // 4. - paddle.Tensor - Subsampled mask (#batch, 1, time'), - where time' = time // 4. + Args: + x (Tensor): Input tensor (#batch, time, idim). + x_mask (Tensor): Input mask (#batch, 1, time). + Returns: + Tensor: Subsampled tensor (#batch, time', odim), where time' = time // 4. + Tensor: Subsampled mask (#batch, 1, time'), where time' = time // 4. """ # (b, c, t, f) x = x.unsqueeze(1) diff --git a/paddlespeech/t2s/modules/upsample.py b/paddlespeech/t2s/modules/upsample.py index 82e30414..65e78a89 100644 --- a/paddlespeech/t2s/modules/upsample.py +++ b/paddlespeech/t2s/modules/upsample.py @@ -27,17 +27,12 @@ class Stretch2D(nn.Layer): def __init__(self, w_scale: int, h_scale: int, mode: str="nearest"): """Strech an image (or image-like object) with some interpolation. 
         """
         # (b, c, t, f)
         x = x.unsqueeze(1)
diff --git a/paddlespeech/t2s/modules/upsample.py b/paddlespeech/t2s/modules/upsample.py
index 82e30414..65e78a89 100644
--- a/paddlespeech/t2s/modules/upsample.py
+++ b/paddlespeech/t2s/modules/upsample.py
@@ -27,17 +27,12 @@ class Stretch2D(nn.Layer):
     def __init__(self, w_scale: int, h_scale: int, mode: str="nearest"):
         """Strech an image (or image-like object) with some interpolation.

-        Parameters
-        ----------
-        w_scale : int
-            Scalar of width.
-        h_scale : int
-            Scalar of the height.
-        mode : str, optional
-            Interpolation mode, modes suppored are "nearest", "bilinear",
-            "trilinear", "bicubic", "linear" and "area",by default "nearest"
-
-            For more details about interpolation, see
+        Args:
+            w_scale (int): Scale factor for the width.
+            h_scale (int): Scale factor for the height.
+            mode (str, optional): Interpolation mode; supported modes are "nearest", "bilinear",
+                "trilinear", "bicubic", "linear" and "area", by default "nearest".
+            For more details about interpolation, see
             `paddle.nn.functional.interpolate `_.
         """
         super().__init__()
@@ -47,16 +42,14 @@ class Stretch2D(nn.Layer):
     def forward(self, x):
         """
-        Parameters
-        ----------
-        x : Tensor
-            Shape (N, C, H, W)
-
-        Returns
-        -------
-        Tensor
-            Shape (N, C, H', W'), where ``H'=h_scale * H``, ``W'=w_scale * W``.
-            The stretched image.
+
+        Args:
+            x (Tensor): Shape (N, C, H, W).
+
+        Returns:
+            Tensor: The stretched image.
+                Shape (N, C, H', W'), where ``H'=h_scale * H``, ``W'=w_scale * W``.
+
         """
         out = F.interpolate(
             x, scale_factor=(self.h_scale, self.w_scale), mode=self.mode)
@@ -67,26 +60,16 @@ class UpsampleNet(nn.Layer):
     """A Layer to upsample spectrogram by applying consecutive stretch and
     convolutions.

-    Parameters
-    ----------
-    upsample_scales : List[int]
-        Upsampling factors for each strech.
-    nonlinear_activation : Optional[str], optional
-        Activation after each convolution, by default None
-    nonlinear_activation_params : Dict[str, Any], optional
-        Parameters passed to construct the activation, by default {}
-    interpolate_mode : str, optional
-        Interpolation mode of the strech, by default "nearest"
-    freq_axis_kernel_size : int, optional
-        Convolution kernel size along the frequency axis, by default 1
-    use_causal_conv : bool, optional
-        Whether to use causal padding before convolution, by default False
-
-        If True, Causal padding is used along the time axis, i.e. padding
-        amount is ``receptive field - 1`` and 0 for before and after,
-        respectively.
-
-        If False, "same" padding is used along the time axis.
+    Args:
+        upsample_scales (List[int]): Upsampling factors for each stretch.
+        nonlinear_activation (Optional[str], optional): Activation after each convolution, by default None
+        nonlinear_activation_params (Dict[str, Any], optional): Parameters passed to construct the activation, by default {}
+        interpolate_mode (str, optional): Interpolation mode of the stretch, by default "nearest"
+        freq_axis_kernel_size (int, optional): Convolution kernel size along the frequency axis, by default 1
+        use_causal_conv (bool, optional): Whether to use causal padding before convolution, by default False
+            If True, causal padding is used along the time axis,
+            i.e. padding amount is ``receptive field - 1`` and 0 for before and after, respectively.
+            If False, "same" padding is used along the time axis.
     """

     def __init__(self,
@@ -122,16 +105,12 @@ class UpsampleNet(nn.Layer):
     def forward(self, c):
         """
-        Parameters
-        ----------
-        c : Tensor
-            Shape (N, F, T), spectrogram
-
-        Returns
-        -------
-        Tensor
-            Shape (N, F, T'), where ``T' = upsample_factor * T``, upsampled
-            spectrogram
+        Args:
+            c (Tensor): Spectrogram. Shape (N, F, T).
+
+        Returns:
+            Tensor: Upsampled spectrogram.
+                Shape (N, F, T'), where ``T' = upsample_factor * T``.
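+
+        Examples:
+            An illustrative configuration (the remaining constructor arguments
+            keep their defaults); the total upsampling factor here is 256:
+
+            >>> import paddle
+            >>> net = UpsampleNet(upsample_scales=[4, 4, 4, 4])
+            >>> net(paddle.randn([1, 80, 100])).shape   # (N, F, T'), T' = 256 * T
+            [1, 80, 25600]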
         """
         c = c.unsqueeze(1)
         for f in self.up_layers:
@@ -145,35 +124,22 @@ class UpsampleNet(nn.Layer):
 class ConvInUpsampleNet(nn.Layer):
     """A Layer to upsample spectrogram composed of a convolution and an
     UpsampleNet.

-    Parameters
-    ----------
-    upsample_scales : List[int]
-        Upsampling factors for each strech.
-    nonlinear_activation : Optional[str], optional
-        Activation after each convolution, by default None
-    nonlinear_activation_params : Dict[str, Any], optional
-        Parameters passed to construct the activation, by default {}
-    interpolate_mode : str, optional
-        Interpolation mode of the strech, by default "nearest"
-    freq_axis_kernel_size : int, optional
-        Convolution kernel size along the frequency axis, by default 1
-    aux_channels : int, optional
-        Feature size of the input, by default 80
-    aux_context_window : int, optional
-        Context window of the first 1D convolution applied to the input. It
-        related to the kernel size of the convolution, by default 0
-
-        If use causal convolution, the kernel size is ``window + 1``, else
-        the kernel size is ``2 * window + 1``.
-    use_causal_conv : bool, optional
-        Whether to use causal padding before convolution, by default False
-
-        If True, Causal padding is used along the time axis, i.e. padding
-        amount is ``receptive field - 1`` and 0 for before and after,
-        respectively.
-
-        If False, "same" padding is used along the time axis.
+
+    Args:
+        upsample_scales (List[int]): Upsampling factors for each stretch.
+        nonlinear_activation (Optional[str], optional): Activation after each convolution, by default None
+        nonlinear_activation_params (Dict[str, Any], optional): Parameters passed to construct the activation, by default {}
+        interpolate_mode (str, optional): Interpolation mode of the stretch, by default "nearest"
+        freq_axis_kernel_size (int, optional): Convolution kernel size along the frequency axis, by default 1
+        aux_channels (int, optional): Feature size of the input, by default 80
+        aux_context_window (int, optional): Context window of the first 1D convolution applied to the input. It
+            is related to the kernel size of the convolution, by default 0
+            If causal convolution is used, the kernel size is ``window + 1``,
+            else the kernel size is ``2 * window + 1``.
+        use_causal_conv (bool, optional): Whether to use causal padding before convolution, by default False
+            If True, causal padding is used along the time axis, i.e. padding
+            amount is ``receptive field - 1`` and 0 for before and after, respectively.
+            If False, "same" padding is used along the time axis.
     """

     def __init__(self,
@@ -204,16 +170,11 @@ class ConvInUpsampleNet(nn.Layer):
     def forward(self, c):
         """
-        Parameters
-        ----------
-        c : Tensor
-            Shape (N, F, T), spectrogram
-
-        Returns
-        -------
-        Tensors
-            Shape (N, F, T'), where ``T' = upsample_factor * T``, upsampled
-            spectrogram
+        Args:
+            c (Tensor): Spectrogram. Shape (N, F, T).
+
+        Returns:
+            Tensor: Upsampled spectrogram. Shape (N, F, T'), where ``T' = upsample_factor * T``.
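+
+        Examples:
+            Analogous to ``UpsampleNet``; shapes are illustrative and assume
+            the defaults ``aux_channels=80`` and ``aux_context_window=0``:
+
+            >>> import paddle
+            >>> net = ConvInUpsampleNet(upsample_scales=[4, 4, 4, 4])
+            >>> net(paddle.randn([1, 80, 100])).shape   # T' = 256 * T
+            [1, 80, 25600]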
         """
         c_ = self.conv_in(c)
         c = c_[:, :, :-self.aux_context_window] if self.use_causal_conv else c_
diff --git a/paddlespeech/t2s/training/experiment.py b/paddlespeech/t2s/training/experiment.py
index de36db24..05a363ff 100644
--- a/paddlespeech/t2s/training/experiment.py
+++ b/paddlespeech/t2s/training/experiment.py
@@ -57,35 +57,30 @@ class ExperimentBase(object):
     Feel free to add/overwrite other methods and standalone functions if you
     need.

-    Parameters
-    ----------
-    config: yacs.config.CfgNode
-        The configuration used for the experiment.
-
-    args: argparse.Namespace
-        The parsed command line arguments.
-
-    Examples
-    --------
-    >>> def main_sp(config, args):
-    >>>     exp = Experiment(config, args)
-    >>>     exp.setup()
-    >>>     exe.resume_or_load()
-    >>>     exp.run()
-    >>>
-    >>> config = get_cfg_defaults()
-    >>> parser = default_argument_parser()
-    >>> args = parser.parse_args()
-    >>> if args.config:
-    >>>     config.merge_from_file(args.config)
-    >>> if args.opts:
-    >>>     config.merge_from_list(args.opts)
-    >>> config.freeze()
-    >>>
-    >>> if args.ngpu > 1:
-    >>>     dist.spawn(main_sp, args=(config, args), nprocs=args.ngpu)
-    >>> else:
-    >>>     main_sp(config, args)
+    Args:
+        config (yacs.config.CfgNode): The configuration used for the experiment.
+        args (argparse.Namespace): The parsed command line arguments.
+
+    Examples:
+        >>> def main_sp(config, args):
+        >>>     exp = Experiment(config, args)
+        >>>     exp.setup()
+        >>>     exp.resume_or_load()
+        >>>     exp.run()
+        >>>
+        >>> config = get_cfg_defaults()
+        >>> parser = default_argument_parser()
+        >>> args = parser.parse_args()
+        >>> if args.config:
+        >>>     config.merge_from_file(args.config)
+        >>> if args.opts:
+        >>>     config.merge_from_list(args.opts)
+        >>> config.freeze()
+        >>>
+        >>> if args.ngpu > 1:
+        >>>     dist.spawn(main_sp, args=(config, args), nprocs=args.ngpu)
+        >>> else:
+        >>>     main_sp(config, args)
     """

     def __init__(self, config, args):
diff --git a/paddlespeech/t2s/training/extensions/snapshot.py b/paddlespeech/t2s/training/extensions/snapshot.py
index 3a86556b..5f8d3c45 100644
--- a/paddlespeech/t2s/training/extensions/snapshot.py
+++ b/paddlespeech/t2s/training/extensions/snapshot.py
@@ -43,10 +43,8 @@ class Snapshot(extension.Extension):
     parameters and optimizer states. If the updater inside the trainer
     subclasses StandardUpdater, everything is good to go.

-    Parameters
-    ----------
-    checkpoint_dir : Union[str, Path]
-        The directory to save checkpoints into.
+    Args:
+        checkpoint_dir (Union[str, Path]): The directory to save checkpoints into.
     """

     trigger = (1, 'epoch')
diff --git a/paddlespeech/t2s/utils/error_rate.py b/paddlespeech/t2s/utils/error_rate.py
index 7a9fe5ad..41b13b75 100644
--- a/paddlespeech/t2s/utils/error_rate.py
+++ b/paddlespeech/t2s/utils/error_rate.py
@@ -70,21 +70,14 @@ def word_errors(reference, hypothesis, ignore_case=False, delimiter=' '):
     """Compute the levenshtein distance between reference sequence and
     hypothesis sequence in word-level.

-    Parameters
-    ----------
-    reference : str
-        The reference sentence.
-    hypothesis : str
-        The hypothesis sentence.
-    ignore_case : bool
-        Whether case-sensitive or not.
-    delimiter : char(str)
-        Delimiter of input sentences.
-
-    Returns
-    ----------
-    list
-        Levenshtein distance and word number of reference sentence.
+    Args:
+        reference (str): The reference sentence.
+        hypothesis (str): The hypothesis sentence.
+        ignore_case (bool): Whether to ignore case in the comparison.
+        delimiter (str): Delimiter of input sentences.
+
+    Returns:
+        tuple: Levenshtein distance and word number of reference sentence.
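+
+    Examples:
+        An illustrative call (one substituted word out of two reference words):
+
+        >>> dist, ref_words = word_errors("hello world", "hello word")
+        >>> (dist, ref_words)
+        (1.0, 2)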
     """
     if ignore_case:
         reference = reference.lower()
@@ -101,21 +94,14 @@ def char_errors(reference, hypothesis, ignore_case=False, remove_space=False):
     """Compute the levenshtein distance between reference sequence and
     hypothesis sequence in char-level.

-    Parameters
-    ----------
-    reference: str
-        The reference sentence.
-    hypothesis: str
-        The hypothesis sentence.
-    ignore_case: bool
-        Whether case-sensitive or not.
-    remove_space: bool
-        Whether remove internal space characters
-
-    Returns
-    ----------
-    list
-        Levenshtein distance and length of reference sentence.
+    Args:
+        reference (str): The reference sentence.
+        hypothesis (str): The hypothesis sentence.
+        ignore_case (bool): Whether to ignore case in the comparison.
+        remove_space (bool): Whether to remove internal space characters.
+
+    Returns:
+        tuple: Levenshtein distance and length of reference sentence.
     """
     if ignore_case:
         reference = reference.lower()
@@ -146,27 +132,17 @@ def wer(reference, hypothesis, ignore_case=False, delimiter=' '):
     We can use levenshtein distance to calculate WER. Please draw an attention
     that empty items will be removed when splitting sentences by delimiter.

-    Parameters
-    ----------
-    reference: str
-        The reference sentence.
-
-    hypothesis: str
-        The hypothesis sentence.
-    ignore_case: bool
-        Whether case-sensitive or not.
-    delimiter: char
-        Delimiter of input sentences.
-
-    Returns
-    ----------
-    float
-        Word error rate.
-
-    Raises
-    ----------
-    ValueError
-        If word number of reference is zero.
+    Args:
+        reference (str): The reference sentence.
+        hypothesis (str): The hypothesis sentence.
+        ignore_case (bool): Whether to ignore case in the comparison.
+        delimiter (str): Delimiter of input sentences.
+
+    Returns:
+        float: Word error rate.
+
+    Raises:
+        ValueError: If word number of reference is zero.
     """
     edit_distance, ref_len = word_errors(reference, hypothesis, ignore_case,
                                          delimiter)
@@ -194,26 +170,17 @@ def cer(reference, hypothesis, ignore_case=False, remove_space=False):
     space characters will be truncated and multiple consecutive space
     characters in a sentence will be replaced by one space character.

-    Parameters
-    ----------
-    reference: str
-        The reference sentence.
-    hypothesis: str
-        The hypothesis sentence.
-    ignore_case: bool
-        Whether case-sensitive or not.
-    remove_space: bool
-        Whether remove internal space characters
-
-    Returns
-    ----------
-    float
-        Character error rate.
-
-    Raises
-    ----------
-    ValueError
-        If the reference length is zero.
+    Args:
+        reference (str): The reference sentence.
+        hypothesis (str): The hypothesis sentence.
+        ignore_case (bool): Whether to ignore case in the comparison.
+        remove_space (bool): Whether to remove internal space characters.
+
+    Returns:
+        float: Character error rate.
+
+    Raises:
+        ValueError: If the reference length is zero.
     """
     edit_distance, ref_len = char_errors(reference, hypothesis, ignore_case,
                                          remove_space)
diff --git a/paddlespeech/t2s/utils/h5_utils.py b/paddlespeech/t2s/utils/h5_utils.py
index d0e277db..75c2e448 100644
--- a/paddlespeech/t2s/utils/h5_utils.py
+++ b/paddlespeech/t2s/utils/h5_utils.py
@@ -23,18 +23,12 @@ import numpy as np
 def read_hdf5(filename: Union[Path, str], dataset_name: str) -> Any:
     """Read a dataset from a HDF5 file.
+    Args:
+        filename (Union[Path, str]): Path of the HDF5 file.
+        dataset_name (str): Name of the dataset to read.

-    Parameters
-    ----------
-    filename : Union[Path, str]
-        Path of the HDF5 file.
-    dataset_name : str
-        Name of the dataset to read.
-
-    Returns
-    -------
-    Any
-        The retrieved dataset.
+    Returns:
+        Any: The retrieved dataset.
     """
     filename = Path(filename)
@@ -60,17 +54,11 @@ def write_hdf5(filename: Union[Path, str],
                write_data: np.ndarray,
                is_overwrite: bool=True) -> None:
     """Write dataset to HDF5 file.
-
-    Parameters
-    ----------
-    filename : Union[Path, str]
-        Path of the HDF5 file.
-    dataset_name : str
-        Name of the dataset to write to.
-    write_data : np.ndarrays
-        The data to write.
-    is_overwrite : bool, optional
-        Whether to overwrite, by default True
+    Args:
+        filename (Union[Path, str]): Path of the HDF5 file.
+        dataset_name (str): Name of the dataset to write to.
+        write_data (np.ndarray): The data to write.
+        is_overwrite (bool, optional): Whether to overwrite an existing dataset, by default True.
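+
+    Examples:
+        An illustrative round-trip; the path and dataset name are hypothetical:
+
+        >>> data = np.random.randn(10, 80).astype(np.float32)
+        >>> write_hdf5("feats.h5", "log_mel", data)
+        >>> read_hdf5("feats.h5", "log_mel").shape
+        (10, 80)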
     """
     # convert to numpy array
     filename = Path(filename)