diff --git a/paddleaudio/paddleaudio/backends/soundfile_backend.py b/paddleaudio/paddleaudio/backends/soundfile_backend.py
index 2b920284..c1155654 100644
--- a/paddleaudio/paddleaudio/backends/soundfile_backend.py
+++ b/paddleaudio/paddleaudio/backends/soundfile_backend.py
@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import os
 import warnings
 from typing import Optional
 from typing import Tuple
@@ -19,7 +20,6 @@ from typing import Union
 import numpy as np
 import resampy
 import soundfile as sf
-from numpy import ndarray as array
 from scipy.io import wavfile
 
 from ..utils import ParameterError
@@ -38,13 +38,21 @@ RESAMPLE_MODES = ['kaiser_best', 'kaiser_fast']
 EPS = 1e-8
 
 
-def resample(y: array, src_sr: int, target_sr: int,
-             mode: str='kaiser_fast') -> array:
-    """ Audio resampling
-     This function is the same as using resampy.resample().
-     Notes:
-        The default mode is kaiser_fast.  For better audio quality, use mode = 'kaiser_fast'
-     """
+def resample(y: np.ndarray,
+             src_sr: int,
+             target_sr: int,
+             mode: str='kaiser_fast') -> np.ndarray:
+    """Audio resampling.
+
+    Args:
+        y (np.ndarray): Input waveform array in 1D or 2D.
+        src_sr (int): Source sample rate.
+        target_sr (int): Target sample rate.
+        mode (str, optional): The resampling filter to use. Defaults to 'kaiser_fast'.
+
+    Returns:
+        np.ndarray: `y` resampled to `target_sr`
+    """
 
     if mode == 'kaiser_best':
         warnings.warn(
@@ -53,7 +61,7 @@ def resample(y: array, src_sr: int, target_sr: int,
 
     if not isinstance(y, np.ndarray):
         raise ParameterError(
-            'Only support numpy array, but received y in {type(y)}')
+            'Only support numpy np.ndarray, but received y in {type(y)}')
 
     if mode not in RESAMPLE_MODES:
         raise ParameterError(f'resample mode must in {RESAMPLE_MODES}')
@@ -61,9 +69,17 @@ def resample(y: array, src_sr: int, target_sr: int,
     return resampy.resample(y, src_sr, target_sr, filter=mode)
 
 
-def to_mono(y: array, merge_type: str='average') -> array:
-    """ convert sterior audio to mono
+def to_mono(y: np.ndarray, merge_type: str='average') -> np.ndarray:
+    """Convert stereo audio to mono.
+
+    Args:
+        y (np.ndarray): Input waveform array in 1D or 2D.
+        merge_type (str, optional): Merge type to generate mono waveform. Defaults to 'average'.
+
+    Returns:
+        np.ndarray: `y` with mono channel.
     """
+
     if merge_type not in MERGE_TYPES:
         raise ParameterError(
             f'Unsupported merge type {merge_type}, available types are {MERGE_TYPES}'
@@ -101,18 +117,34 @@ def to_mono(y: array, merge_type: str='average') -> array:
     return y_out
 
 
-def _safe_cast(y: array, dtype: Union[type, str]) -> array:
-    """ data type casting in a safe way, i.e., prevent overflow or underflow
-    This function is used internally.
+def _safe_cast(y: np.ndarray, dtype: Union[type, str]) -> np.ndarray:
+    """Data type casting in a safe way, i.e., prevent overflow or underflow.
+
+    Args:
+        y (np.ndarray): Input waveform array in 1D or 2D.
+        dtype (Union[type, str]): Data type of waveform.
+
+    Returns:
+        np.ndarray: `y` after safe casting.
     """
-    return np.clip(y, np.iinfo(dtype).min, np.iinfo(dtype).max).astype(dtype)
+    if 'float' in str(y.dtype):
+        return np.clip(y, np.finfo(dtype).min,
+                       np.finfo(dtype).max).astype(dtype)
+    else:
+        return np.clip(y, np.iinfo(dtype).min,
+                       np.iinfo(dtype).max).astype(dtype)
 
 
-def depth_convert(y: array, dtype: Union[type, str],
-                  dithering: bool=True) -> array:
-    """Convert audio array to target dtype safely
-    This function convert audio waveform to a target dtype, with addition steps of
+def depth_convert(y: np.ndarray, dtype: Union[type, str]) -> np.ndarray:
+    """Convert audio array to target dtype safely. This function converts audio waveform to a target dtype, with additional steps of
     preventing overflow/underflow and preserving audio range.
+
+    Args:
+        y (np.ndarray): Input waveform array in 1D or 2D.
+        dtype (Union[type, str]): Data type of waveform.
+
+    Returns:
+        np.ndarray: `y` after safe casting.
     """
 
     SUPPORT_DTYPE = ['int16', 'int8', 'float32', 'float64']
@@ -157,14 +189,20 @@ def depth_convert(y: array, dtype: Union[type, str],
     return y
 
 
-def sound_file_load(file: str,
+def sound_file_load(file: os.PathLike,
                     offset: Optional[float]=None,
                     dtype: str='int16',
-                    duration: Optional[int]=None) -> Tuple[array, int]:
-    """Load audio using soundfile library
-    This function load audio file using libsndfile.
-    Reference:
-        http://www.mega-nerd.com/libsndfile/#Features
+                    duration: Optional[int]=None) -> Tuple[np.ndarray, int]:
+    """Load audio using soundfile library. This function loads an audio file using libsndfile.
+
+    Args:
+        file (os.PathLike): File of waveform.
+        offset (Optional[float], optional): Offset to the start of waveform. Defaults to None.
+        dtype (str, optional): Data type of waveform. Defaults to 'int16'.
+        duration (Optional[int], optional): Duration of waveform to read. Defaults to None.
+
+    Returns:
+        Tuple[np.ndarray, int]: Waveform in ndarray and its samplerate.
     """
     with sf.SoundFile(file) as sf_desc:
         sr_native = sf_desc.samplerate
@@ -179,9 +217,17 @@ def sound_file_load(file: str,
     return y, sf_desc.samplerate
 
 
-def normalize(y: array, norm_type: str='linear',
-              mul_factor: float=1.0) -> array:
-    """ normalize an input audio with additional multiplier.
+def normalize(y: np.ndarray, norm_type: str='linear',
+              mul_factor: float=1.0) -> np.ndarray:
+    """Normalize an input audio with additional multiplier.
+
+    Args:
+        y (np.ndarray): Input waveform array in 1D or 2D.
+        norm_type (str, optional): Type of normalization. Defaults to 'linear'.
+        mul_factor (float, optional): Scaling factor. Defaults to 1.0.
+
+    Returns:
+        np.ndarray: `y` after normalization.
     """
 
     if norm_type == 'linear':
@@ -199,12 +245,13 @@ def normalize(y: array, norm_type: str='linear',
     return y
 
 
-def save(y: array, sr: int, file: str) -> None:
-    """Save audio file to disk.
-    This function saves audio to disk using scipy.io.wavfile, with additional step
-    to convert input waveform to int16 unless it already is int16
-    Notes:
-        It only support raw wav format.
+def save(y: np.ndarray, sr: int, file: os.PathLike) -> None:
+    """Save audio file to disk. This function saves audio to disk using scipy.io.wavfile, with an additional step to convert the input waveform to int16.
+
+    Args:
+        y (np.ndarray): Input waveform array in 1D or 2D.
+        sr (int): Sample rate.
+        file (os.PathLike): Path of audio file to save.
     """
     if not file.endswith('.wav'):
         raise ParameterError(
@@ -226,7 +273,7 @@ def save(y: array, sr: int, file: str) -> None:
 
 
 def load(
-        file: str,
+        file: os.PathLike,
         sr: Optional[int]=None,
         mono: bool=True,
         merge_type: str='average',  # ch0,ch1,random,average
@@ -236,11 +283,24 @@ def load(
         offset: float=0.0,
         duration: Optional[int]=None,
         dtype: str='float32',
-        resample_mode: str='kaiser_fast') -> Tuple[array, int]:
-    """Load audio file from disk.
-    This function loads audio from disk using using audio beackend.
-    Parameters:
-    Notes:
+        resample_mode: str='kaiser_fast') -> Tuple[np.ndarray, int]:
+    """Load audio file from disk. This function loads audio from disk using the audio backend.
+
+    Args:
+        file (os.PathLike): Path of audio file to load.
+        sr (Optional[int], optional): Sample rate of loaded waveform. Defaults to None.
+        mono (bool, optional): Return waveform with mono channel. Defaults to True.
+        merge_type (str, optional): Merge type of multi-channels waveform. Defaults to 'average'.
+        normal (bool, optional): Waveform normalization. Defaults to True.
+        norm_type (str, optional): Type of normalization. Defaults to 'linear'.
+        norm_mul_factor (float, optional): Scaling factor. Defaults to 1.0.
+        offset (float, optional): Offset to the start of waveform. Defaults to 0.0.
+        duration (Optional[int], optional): Duration of waveform to read. Defaults to None.
+        dtype (str, optional): Data type of waveform. Defaults to 'float32'.
+        resample_mode (str, optional): The resampling filter to use. Defaults to 'kaiser_fast'.
+
+    Returns:
+        Tuple[np.ndarray, int]: Waveform in ndarray and its samplerate.
     """
 
     y, r = sound_file_load(file, offset=offset, dtype=dtype, duration=duration)
diff --git a/paddleaudio/paddleaudio/compliance/kaldi.py b/paddleaudio/paddleaudio/compliance/kaldi.py
index 8cb9b666..538be019 100644
--- a/paddleaudio/paddleaudio/compliance/kaldi.py
+++ b/paddleaudio/paddleaudio/compliance/kaldi.py
@@ -220,7 +220,7 @@ def spectrogram(waveform: Tensor,
     """Compute and return a spectrogram from a waveform. The output is identical to Kaldi's.
 
     Args:
-        waveform (Tensor): A waveform tensor with shape [C, T].
+        waveform (Tensor): A waveform tensor with shape `(C, T)`.
         blackman_coeff (float, optional): Coefficient for Blackman window.. Defaults to 0.42.
         channel (int, optional): Select the channel of waveform. Defaults to -1.
         dither (float, optional): Dithering constant . Defaults to 0.0.
@@ -239,7 +239,7 @@ def spectrogram(waveform: Tensor,
         window_type (str, optional): Choose type of window for FFT computation. Defaults to POVEY.
 
     Returns:
-        Tensor: A spectrogram tensor with shape (m, padded_window_size // 2 + 1) where m is the number of frames
+        Tensor: A spectrogram tensor with shape `(m, padded_window_size // 2 + 1)` where m is the number of frames
             depends on frame_length and frame_shift.
     """
     dtype = waveform.dtype
@@ -422,7 +422,7 @@ def fbank(waveform: Tensor,
     """Compute and return filter banks from a waveform. The output is identical to Kaldi's.
 
     Args:
-        waveform (Tensor): A waveform tensor with shape [C, T].
+        waveform (Tensor): A waveform tensor with shape `(C, T)`.
         blackman_coeff (float, optional): Coefficient for Blackman window.. Defaults to 0.42.
         channel (int, optional): Select the channel of waveform. Defaults to -1.
         dither (float, optional): Dithering constant . Defaults to 0.0.
@@ -451,7 +451,7 @@ def fbank(waveform: Tensor,
         window_type (str, optional): Choose type of window for FFT computation. Defaults to POVEY.
 
     Returns:
-        Tensor: A filter banks tensor with shape (m, n_mels).
+        Tensor: A filter banks tensor with shape `(m, n_mels)`.
     """
     dtype = waveform.dtype
 
@@ -542,7 +542,7 @@ def mfcc(waveform: Tensor,
             identical to Kaldi's.
 
     Args:
-        waveform (Tensor): A waveform tensor with shape [C, T].
+        waveform (Tensor): A waveform tensor with shape `(C, T)`.
         blackman_coeff (float, optional): Coefficient for Blackman window.. Defaults to 0.42.
         cepstral_lifter (float, optional): Scaling of output mfccs. Defaults to 22.0.
         channel (int, optional): Select the channel of waveform. Defaults to -1.
@@ -571,7 +571,7 @@ def mfcc(waveform: Tensor,
         window_type (str, optional): Choose type of window for FFT computation. Defaults to POVEY.
 
     Returns:
-        Tensor: A mel frequency cepstral coefficients tensor with shape (m, n_mfcc).
+        Tensor: A mel frequency cepstral coefficients tensor with shape `(m, n_mfcc)`.
     """
     assert n_mfcc <= n_mels, 'n_mfcc cannot be larger than n_mels: %d vs %d' % (
         n_mfcc, n_mels)
diff --git a/paddleaudio/paddleaudio/compliance/librosa.py b/paddleaudio/paddleaudio/compliance/librosa.py
index 167795c3..740584ca 100644
--- a/paddleaudio/paddleaudio/compliance/librosa.py
+++ b/paddleaudio/paddleaudio/compliance/librosa.py
@@ -19,7 +19,6 @@ from typing import Union
 
 import numpy as np
 import scipy
-from numpy import ndarray as array
 from numpy.lib.stride_tricks import as_strided
 from scipy import signal
 
@@ -32,7 +31,6 @@ __all__ = [
     'mfcc',
     'hz_to_mel',
     'mel_to_hz',
-    'split_frames',
     'mel_frequencies',
     'power_to_db',
     'compute_fbank_matrix',
@@ -49,7 +47,8 @@ __all__ = [
 ]
 
 
-def pad_center(data: array, size: int, axis: int=-1, **kwargs) -> array:
+def _pad_center(data: np.ndarray, size: int, axis: int=-1,
+                **kwargs) -> np.ndarray:
     """Pad an array to a target length along a target axis.
 
     This differs from `np.pad` by centering the data prior to padding,
@@ -69,8 +68,10 @@ def pad_center(data: array, size: int, axis: int=-1, **kwargs) -> array:
     return np.pad(data, lengths, **kwargs)
 
 
-def split_frames(x: array, frame_length: int, hop_length: int,
-                 axis: int=-1) -> array:
+def _split_frames(x: np.ndarray,
+                  frame_length: int,
+                  hop_length: int,
+                  axis: int=-1) -> np.ndarray:
     """Slice a data array into (overlapping) frames.
 
     This function is aligned with librosa.frame
@@ -142,11 +143,16 @@ def _check_audio(y, mono=True) -> bool:
     return True
 
 
-def hz_to_mel(frequencies: Union[float, List[float], array],
-              htk: bool=False) -> array:
-    """Convert Hz to Mels
+def hz_to_mel(frequencies: Union[float, List[float], np.ndarray],
+              htk: bool=False) -> np.ndarray:
+    """Convert Hz to Mels.
 
-    This function is aligned with librosa.
+    Args:
+        frequencies (Union[float, List[float], np.ndarray]): Frequencies in Hz.
+        htk (bool, optional): Use htk scaling. Defaults to False.
+
+    Returns:
+        np.ndarray: Frequency in mels.
     """
     freq = np.asanyarray(frequencies)
 
@@ -177,10 +183,16 @@ def hz_to_mel(frequencies: Union[float, List[float], array],
     return mels
 
 
-def mel_to_hz(mels: Union[float, List[float], array], htk: int=False) -> array:
+def mel_to_hz(mels: Union[float, List[float], np.ndarray],
+              htk: int=False) -> np.ndarray:
     """Convert mel bin numbers to frequencies.
 
-    This function is aligned with librosa.
+    Args:
+        mels (Union[float, List[float], np.ndarray]): Frequency in mels.
+        htk (bool, optional): Use htk scaling. Defaults to False.
+
+    Returns:
+        np.ndarray: Frequencies in Hz.
     """
     mel_array = np.asanyarray(mels)
 
@@ -212,10 +224,17 @@ def mel_to_hz(mels: Union[float, List[float], array], htk: int=False) -> array:
 def mel_frequencies(n_mels: int=128,
                     fmin: float=0.0,
                     fmax: float=11025.0,
-                    htk: bool=False) -> array:
-    """Compute mel frequencies
+                    htk: bool=False) -> np.ndarray:
+    """Compute mel frequencies.
+
+    Args:
+        n_mels (int, optional): Number of mel bins. Defaults to 128.
+        fmin (float, optional): Minimum frequency in Hz. Defaults to 0.0.
+        fmax (float, optional): Maximum frequency in Hz. Defaults to 11025.0.
+        htk (bool, optional): Use htk scaling. Defaults to False.
 
-    This function is aligned with librosa.
+    Returns:
+        np.ndarray: Vector of n_mels frequencies in Hz with shape `(n_mels,)`.
     """
     # 'Center freqs' of mel bands - uniformly spaced between limits
     min_mel = hz_to_mel(fmin, htk=htk)
@@ -226,10 +245,15 @@ def mel_frequencies(n_mels: int=128,
     return mel_to_hz(mels, htk=htk)
 
 
-def fft_frequencies(sr: int, n_fft: int) -> array:
+def fft_frequencies(sr: int, n_fft: int) -> np.ndarray:
     """Compute fourier frequencies.
 
-    This function is aligned with librosa.
+    Args:
+        sr (int): Sample rate.
+        n_fft (int): FFT size.
+
+    Returns:
+        np.ndarray: FFT frequencies in Hz with shape `(n_fft//2 + 1,)`.
     """
     return np.linspace(0, float(sr) / 2, int(1 + n_fft // 2), endpoint=True)
 
@@ -241,10 +265,22 @@ def compute_fbank_matrix(sr: int,
                          fmax: Optional[float]=None,
                          htk: bool=False,
                          norm: str="slaney",
-                         dtype: type=np.float32):
+                         dtype: type=np.float32) -> np.ndarray:
     """Compute fbank matrix.
 
-    This funciton is aligned with librosa.
+    Args:
+        sr (int): Sample rate.
+        n_fft (int): FFT size.
+        n_mels (int, optional): Number of mel bins. Defaults to 128.
+        fmin (float, optional): Minimum frequency in Hz. Defaults to 0.0.
+        fmax (Optional[float], optional): Maximum frequency in Hz. Defaults to None.
+        htk (bool, optional): Use htk scaling. Defaults to False.
+        norm (str, optional): Type of normalization. Defaults to "slaney".
+        dtype (type, optional): Data type. Defaults to np.float32.
+
+
+    Returns:
+        np.ndarray: Mel transform matrix with shape `(n_mels, n_fft//2 + 1)`.
     """
     if norm != "slaney":
         raise ParameterError('norm must set to slaney')
@@ -289,17 +325,28 @@ def compute_fbank_matrix(sr: int,
     return weights
 
 
-def stft(x: array,
+def stft(x: np.ndarray,
          n_fft: int=2048,
          hop_length: Optional[int]=None,
          win_length: Optional[int]=None,
          window: str="hann",
          center: bool=True,
          dtype: type=np.complex64,
-         pad_mode: str="reflect") -> array:
+         pad_mode: str="reflect") -> np.ndarray:
     """Short-time Fourier transform (STFT).
 
-    This function is aligned with librosa.
+    Args:
+        x (np.ndarray): Input waveform in one dimension.
+        n_fft (int, optional): FFT size. Defaults to 2048.
+        hop_length (Optional[int], optional): Number of steps to advance between adjacent windows. Defaults to None.
+        win_length (Optional[int], optional): The size of window. Defaults to None.
+        window (str, optional): A string of window specification. Defaults to "hann".
+        center (bool, optional): Whether to pad `x` so that sample :math:`t \\times hop\\_length` is at the center of the `t`-th frame. Defaults to True.
+        dtype (type, optional): Data type of STFT results. Defaults to np.complex64.
+        pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to "reflect".
+
+    Returns:
+        np.ndarray: The complex STFT output with shape `(n_fft//2 + 1, num_frames)`.
     """
     _check_audio(x)
 
@@ -314,7 +361,7 @@ def stft(x: array,
     fft_window = signal.get_window(window, win_length, fftbins=True)
 
     # Pad the window out to n_fft size
-    fft_window = pad_center(fft_window, n_fft)
+    fft_window = _pad_center(fft_window, n_fft)
 
     # Reshape so that the window can be broadcast
     fft_window = fft_window.reshape((-1, 1))
@@ -333,7 +380,7 @@ def stft(x: array,
         )
 
     # Window the time series.
-    x_frames = split_frames(x, frame_length=n_fft, hop_length=hop_length)
+    x_frames = _split_frames(x, frame_length=n_fft, hop_length=hop_length)
     # Pre-allocate the STFT matrix
     stft_matrix = np.empty(
         (int(1 + n_fft // 2), x_frames.shape[1]), dtype=dtype, order="F")
@@ -352,16 +399,20 @@ def stft(x: array,
     return stft_matrix
 
 
-def power_to_db(spect: array,
+def power_to_db(spect: np.ndarray,
                 ref: float=1.0,
                 amin: float=1e-10,
-                top_db: Optional[float]=80.0) -> array:
-    """Convert a power spectrogram (amplitude squared) to decibel (dB) units
+                top_db: Optional[float]=80.0) -> np.ndarray:
+    """Convert a power spectrogram (amplitude squared) to decibel (dB) units. The function computes the scaling `10 * log10(x / ref)` in a numerically stable way.
 
-    This computes the scaling ``10 * log10(spect / ref)`` in a numerically
-    stable way.
+    Args:
+        spect (np.ndarray): STFT power spectrogram of an input waveform.
+        ref (float, optional): The reference value. If smaller than 1.0, the db level of the signal will be pulled up accordingly. Otherwise, the db level is pushed down. Defaults to 1.0.
+        amin (float, optional): Minimum threshold. Defaults to 1e-10.
+        top_db (Optional[float], optional): Threshold the output at `top_db` below the peak. Defaults to 80.0.
 
-    This function is aligned with librosa.
+    Returns:
+        np.ndarray: Power spectrogram in db scale.
     """
     spect = np.asarray(spect)
 
@@ -394,49 +445,27 @@ def power_to_db(spect: array,
     return log_spec
 
 
-def mfcc(x,
+def mfcc(x: np.ndarray,
          sr: int=16000,
-         spect: Optional[array]=None,
+         spect: Optional[np.ndarray]=None,
          n_mfcc: int=20,
          dct_type: int=2,
          norm: str="ortho",
          lifter: int=0,
-         **kwargs) -> array:
+         **kwargs) -> np.ndarray:
     """Mel-frequency cepstral coefficients (MFCCs)
 
-    This function is NOT strictly aligned with librosa. The following example shows how to get the
-    same result with librosa:
-
-    # mfcc:
-     kwargs = {
-        'window_size':512,
-        'hop_length':320,
-        'mel_bins':64,
-        'fmin':50,
-         'to_db':False}
-    a = mfcc(x,
-        spect=None,
-        n_mfcc=20,
-        dct_type=2,
-        norm='ortho',
-        lifter=0,
-        **kwargs)
-
-    # librosa mfcc:
-    spect = librosa.feature.melspectrogram(y=x,sr=16000,n_fft=512,
-                                              win_length=512,
-                                              hop_length=320,
-                                              n_mels=64, fmin=50)
-    b = librosa.feature.mfcc(y=x,
-        sr=16000,
-        S=spect,
-        n_mfcc=20,
-        dct_type=2,
-        norm='ortho',
-        lifter=0)
-
-    assert np.mean( (a-b)**2) < 1e-8
+    Args:
+        x (np.ndarray): Input waveform in one dimension.
+        sr (int, optional): Sample rate. Defaults to 16000.
+        spect (Optional[np.ndarray], optional): Input log-power Mel spectrogram. Defaults to None.
+        n_mfcc (int, optional): Number of cepstra in MFCC. Defaults to 20.
+        dct_type (int, optional): Discrete cosine transform (DCT) type. Defaults to 2.
+        norm (str, optional): Type of normalization. Defaults to "ortho".
+        lifter (int, optional): Cepstral filtering. Defaults to 0.
 
+    Returns:
+        np.ndarray: Mel frequency cepstral coefficients array with shape `(n_mfcc, num_frames)`.
     """
     if spect is None:
         spect = melspectrogram(x, sr=sr, **kwargs)
@@ -454,12 +483,12 @@ def mfcc(x,
             f"MFCC lifter={lifter} must be a non-negative number")
 
 
-def melspectrogram(x: array,
+def melspectrogram(x: np.ndarray,
                    sr: int=16000,
                    window_size: int=512,
                    hop_length: int=320,
                    n_mels: int=64,
-                   fmin: int=50,
+                   fmin: float=50.0,
                    fmax: Optional[float]=None,
                    window: str='hann',
                    center: bool=True,
@@ -468,27 +497,28 @@ def melspectrogram(x: array,
                    to_db: bool=True,
                    ref: float=1.0,
                    amin: float=1e-10,
-                   top_db: Optional[float]=None) -> array:
+                   top_db: Optional[float]=None) -> np.ndarray:
     """Compute mel-spectrogram.
 
-    Parameters:
-        x: numpy.ndarray
-        The input wavform is a numpy array [shape=(n,)]
-
-        window_size: int, typically 512, 1024, 2048, etc.
-        The window size for framing, also used as n_fft for stft
-
+    Args:
+        x (np.ndarray): Input waveform in one dimension.
+        sr (int, optional): Sample rate. Defaults to 16000.
+        window_size (int, optional): Size of FFT and window length. Defaults to 512.
+        hop_length (int, optional): Number of steps to advance between adjacent windows. Defaults to 320.
+        n_mels (int, optional): Number of mel bins. Defaults to 64.
+        fmin (float, optional): Minimum frequency in Hz. Defaults to 50.0.
+        fmax (Optional[float], optional): Maximum frequency in Hz. Defaults to None.
+        window (str, optional): A string of window specification. Defaults to "hann".
+        center (bool, optional): Whether to pad `x` so that sample :math:`t \\times hop\\_length` is at the center of the `t`-th frame. Defaults to True.
+        pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to "reflect".
+        power (float, optional): Exponent for the magnitude melspectrogram. Defaults to 2.0.
+        to_db (bool, optional): Enable db scale. Defaults to True.
+        ref (float, optional): The reference value. If smaller than 1.0, the db level of the signal will be pulled up accordingly. Otherwise, the db level is pushed down. Defaults to 1.0.
+        amin (float, optional): Minimum threshold. Defaults to 1e-10.
+        top_db (Optional[float], optional): Threshold the output at `top_db` below the peak. Defaults to None.
 
     Returns:
-        The mel-spectrogram in power scale or db scale(default)
-
-
-    Notes:
-    1. sr is default to 16000, which is commonly used in speech/speaker processing.
-    2. when fmax is None, it is set to sr//2.
-    3. this function will convert mel spectgrum to db scale by default. This is different
-    that of librosa.
-
+        np.ndarray: The mel-spectrogram in power scale or db scale with shape `(n_mels, num_frames)`.
     """
     _check_audio(x, mono=True)
     if len(x) <= 0:
@@ -518,18 +548,28 @@ def melspectrogram(x: array,
         return mel_spect
 
 
-def spectrogram(x: array,
+def spectrogram(x: np.ndarray,
                 sr: int=16000,
                 window_size: int=512,
                 hop_length: int=320,
                 window: str='hann',
                 center: bool=True,
                 pad_mode: str='reflect',
-                power: float=2.0) -> array:
-    """Compute spectrogram from an input waveform.
+                power: float=2.0) -> np.ndarray:
+    """Compute spectrogram.
+
+    Args:
+        x (np.ndarray): Input waveform in one dimension.
+        sr (int, optional): Sample rate. Defaults to 16000.
+        window_size (int, optional): Size of FFT and window length. Defaults to 512.
+        hop_length (int, optional): Number of steps to advance between adjacent windows. Defaults to 320.
+        window (str, optional): A string of window specification. Defaults to "hann".
+        center (bool, optional): Whether to pad `x` so that sample :math:`t \\times hop\\_length` is at the center of the `t`-th frame. Defaults to True.
+        pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to "reflect".
+        power (float, optional): Exponent for the magnitude spectrogram. Defaults to 2.0.
 
-    This function is a wrapper for librosa.feature.stft, with addition step to
-    compute the magnitude of the complex spectrogram.
+    Returns:
+        np.ndarray: The STFT spectrogram in power scale with shape `(n_fft//2 + 1, num_frames)`.
     """
 
     s = stft(
@@ -544,18 +584,16 @@ def spectrogram(x: array,
     return np.abs(s)**power
 
 
-def mu_encode(x: array, mu: int=255, quantized: bool=True) -> array:
-    """Mu-law encoding.
-
-    Compute the mu-law decoding given an input code.
-    When quantized is True, the result will be converted to
-    integer in range [0,mu-1]. Otherwise, the resulting signal
-    is in range [-1,1]
-
+def mu_encode(x: np.ndarray, mu: int=255, quantized: bool=True) -> np.ndarray:
+    """Mu-law encoding. Encode waveform based on mu-law companding. When quantized is True, the result will be converted to integer in range `[0,mu-1]`. Otherwise, the resulting waveform is in range `[-1,1]`.
 
-    Reference:
-        https://en.wikipedia.org/wiki/%CE%9C-law_algorithm
+    Args:
+        x (np.ndarray): The input waveform to encode.
+        mu (int, optional): The encoding parameter. Defaults to 255.
+        quantized (bool, optional): If `True`, quantize the encoded values into `1 + mu` distinct integer values. Defaults to True.
 
+    Returns:
+        np.ndarray: The mu-law encoded waveform.
     """
     mu = 255
     y = np.sign(x) * np.log1p(mu * np.abs(x)) / np.log1p(mu)
@@ -564,17 +602,16 @@ def mu_encode(x: array, mu: int=255, quantized: bool=True) -> array:
     return y
 
 
-def mu_decode(y: array, mu: int=255, quantized: bool=True) -> array:
-    """Mu-law decoding.
-
-    Compute the mu-law decoding given an input code.
+def mu_decode(y: np.ndarray, mu: int=255, quantized: bool=True) -> np.ndarray:
+    """Mu-law decoding. Compute the mu-law decoding given an input code. It assumes that the input `y` is in range `[0,mu-1]` when `quantized` is True and `[-1,1]` otherwise.
 
-    it assumes that the input y is in
-    range [0,mu-1] when quantize is True and [-1,1] otherwise
-
-    Reference:
-        https://en.wikipedia.org/wiki/%CE%9C-law_algorithm
+    Args:
+        y (np.ndarray): The encoded waveform.
+        mu (int, optional): The encoding parameter. Defaults to 255.
+        quantized (bool, optional): If `True`, the input is assumed to be quantized to `1 + mu` distinct integer values. Defaults to True.
 
+    Returns:
+        np.ndarray: The mu-law decoded waveform.
     """
     if mu < 1:
         raise ParameterError('mu is typically set as 2**k-1, k=1, 2, 3,...')
@@ -586,7 +623,7 @@ def mu_decode(y: array, mu: int=255, quantized: bool=True) -> array:
     return x
 
 
-def randint(high: int) -> int:
+def _randint(high: int) -> int:
     """Generate one random integer in range [0 high)
 
      This is a helper function for random data augmentaiton
@@ -594,20 +631,18 @@ def randint(high: int) -> int:
     return int(np.random.randint(0, high=high))
 
 
-def rand() -> float:
-    """Generate one floating-point number in range [0 1)
-
-    This is a helper function for random data augmentaiton
-    """
-    return float(np.random.rand(1))
-
-
-def depth_augment(y: array,
+def depth_augment(y: np.ndarray,
                   choices: List=['int8', 'int16'],
-                  probs: List[float]=[0.5, 0.5]) -> array:
-    """ Audio depth augmentation
+                  probs: List[float]=[0.5, 0.5]) -> np.ndarray:
+    """ Audio depth augmentation. Do audio depth augmentation to simulate the distortion brought by quantization.
+
+    Args:
+        y (np.ndarray): Input waveform array in 1D or 2D.
+        choices (List, optional): A list of data types for depth conversion. Defaults to ['int8', 'int16'].
+        probs (List[float], optional): Probabilities of the corresponding entries in `choices`. Defaults to [0.5, 0.5].
 
-    Do audio depth augmentation to simulate the distortion brought by quantization.
+    Returns:
+        np.ndarray: The augmented waveform.
     """
     assert len(probs) == len(
         choices
@@ -621,13 +656,18 @@ def depth_augment(y: array,
     return y2
 
 
-def adaptive_spect_augment(spect: array, tempo_axis: int=0,
-                           level: float=0.1) -> array:
-    """Do adpative spectrogram augmentation
+def adaptive_spect_augment(spect: np.ndarray,
+                           tempo_axis: int=0,
+                           level: float=0.1) -> np.ndarray:
+    """Do adaptive spectrogram augmentation. The level of the augmentation is governed by the parameter `level`, ranging from 0 to 1, where 0 means no augmentation.
 
-    The level of the augmentation is gowern by the paramter level,
-    ranging from 0 to 1, with 0 represents no augmentation。
+    Args:
+        spect (np.ndarray): Input spectrogram.
+        tempo_axis (int, optional): Indicate the tempo axis. Defaults to 0.
+        level (float, optional): The level factor of masking. Defaults to 0.1.
 
+    Returns:
+        np.ndarray: The augmented spectrogram.
     """
     assert spect.ndim == 2., 'only supports 2d tensor or numpy array'
     if tempo_axis == 0:
@@ -643,32 +683,40 @@ def adaptive_spect_augment(spect: array, tempo_axis: int=0,
 
     if tempo_axis == 0:
         for _ in range(num_time_mask):
-            start = randint(nt - time_mask_width)
+            start = _randint(nt - time_mask_width)
             spect[start:start + time_mask_width, :] = 0
         for _ in range(num_freq_mask):
-            start = randint(nf - freq_mask_width)
+            start = _randint(nf - freq_mask_width)
             spect[:, start:start + freq_mask_width] = 0
     else:
         for _ in range(num_time_mask):
-            start = randint(nt - time_mask_width)
+            start = _randint(nt - time_mask_width)
             spect[:, start:start + time_mask_width] = 0
         for _ in range(num_freq_mask):
-            start = randint(nf - freq_mask_width)
+            start = _randint(nf - freq_mask_width)
             spect[start:start + freq_mask_width, :] = 0
 
     return spect
 
 
-def spect_augment(spect: array,
+def spect_augment(spect: np.ndarray,
                   tempo_axis: int=0,
                   max_time_mask: int=3,
                   max_freq_mask: int=3,
                   max_time_mask_width: int=30,
-                  max_freq_mask_width: int=20) -> array:
-    """Do spectrogram augmentation in both time and freq axis
+                  max_freq_mask_width: int=20) -> np.ndarray:
+    """Do spectrogram augmentation in both time and freq axis.
 
-    Reference:
+    Args:
+        spect (np.ndarray): Input spectrogram.
+        tempo_axis (int, optional): Indicate the tempo axis. Defaults to 0.
+        max_time_mask (int, optional): Maximum number of time masking. Defaults to 3.
+        max_freq_mask (int, optional): Maximum number of frequency masking. Defaults to 3.
+        max_time_mask_width (int, optional): Maximum width of time masking. Defaults to 30.
+        max_freq_mask_width (int, optional): Maximum width of frequency masking. Defaults to 20.
 
+    Returns:
+        np.ndarray: The augmented spectrogram.
     """
     assert spect.ndim == 2., 'only supports 2d tensor or numpy array'
     if tempo_axis == 0:
@@ -676,52 +724,64 @@ def spect_augment(spect: array,
     else:
         nf, nt = spect.shape
 
-    num_time_mask = randint(max_time_mask)
-    num_freq_mask = randint(max_freq_mask)
+    num_time_mask = _randint(max_time_mask)
+    num_freq_mask = _randint(max_freq_mask)
 
-    time_mask_width = randint(max_time_mask_width)
-    freq_mask_width = randint(max_freq_mask_width)
+    time_mask_width = _randint(max_time_mask_width)
+    freq_mask_width = _randint(max_freq_mask_width)
 
     if tempo_axis == 0:
         for _ in range(num_time_mask):
-            start = randint(nt - time_mask_width)
+            start = _randint(nt - time_mask_width)
             spect[start:start + time_mask_width, :] = 0
         for _ in range(num_freq_mask):
-            start = randint(nf - freq_mask_width)
+            start = _randint(nf - freq_mask_width)
             spect[:, start:start + freq_mask_width] = 0
     else:
         for _ in range(num_time_mask):
-            start = randint(nt - time_mask_width)
+            start = _randint(nt - time_mask_width)
             spect[:, start:start + time_mask_width] = 0
         for _ in range(num_freq_mask):
-            start = randint(nf - freq_mask_width)
+            start = _randint(nf - freq_mask_width)
             spect[start:start + freq_mask_width, :] = 0
 
     return spect
 
 
-def random_crop1d(y: array, crop_len: int) -> array:
-    """ Do random cropping on 1d input signal
+def random_crop1d(y: np.ndarray, crop_len: int) -> np.ndarray:
+    """ Random cropping on an input waveform.
 
-    The input is a 1d signal, typically a sound waveform
+    Args:
+        y (np.ndarray): Input waveform array in 1D.
+        crop_len (int): Length of waveform to crop.
+
+    Returns:
+        np.ndarray: The cropped waveform.
     """
     if y.ndim != 1:
         'only accept 1d tensor or numpy array'
     n = len(y)
-    idx = randint(n - crop_len)
+    idx = _randint(n - crop_len)
     return y[idx:idx + crop_len]
 
 
-def random_crop2d(s: array, crop_len: int, tempo_axis: int=0) -> array:
-    """ Do random cropping for 2D array, typically a spectrogram.
+def random_crop2d(s: np.ndarray, crop_len: int,
+                  tempo_axis: int=0) -> np.ndarray:
+    """ Random cropping on a spectrogram.
 
-    The cropping is done in temporal direction on the time-freq input signal.
+    Args:
+        s (np.ndarray): Input spectrogram in 2D.
+        crop_len (int): Length of spectrogram to crop.
+        tempo_axis (int, optional): Indicate the tempo axis. Defaults to 0.
+
+    Returns:
+        np.ndarray: The cropped spectrogram.
     """
     if tempo_axis >= s.ndim:
         raise ParameterError('axis out of range')
 
     n = s.shape[tempo_axis]
-    idx = randint(high=n - crop_len)
+    idx = _randint(high=n - crop_len)
     sli = [slice(None) for i in range(s.ndim)]
     sli[tempo_axis] = slice(idx, idx + crop_len)
     out = s[tuple(sli)]
diff --git a/paddleaudio/paddleaudio/features/layers.py b/paddleaudio/paddleaudio/features/layers.py
index 6afd234a..09037255 100644
--- a/paddleaudio/paddleaudio/features/layers.py
+++ b/paddleaudio/paddleaudio/features/layers.py
@@ -17,6 +17,7 @@ from typing import Union
 
 import paddle
 import paddle.nn as nn
+from paddle import Tensor
 
 from ..functional import compute_fbank_matrix
 from ..functional import create_dct
@@ -32,6 +33,20 @@ __all__ = [
 
 
 class Spectrogram(nn.Layer):
+    """Compute spectrogram of given signals, typically audio waveforms.
+    The spectrogram is defined as the complex norm of the short-time Fourier transformation.
+
+    Args:
+        n_fft (int, optional): The number of frequency components of the discrete Fourier transform. Defaults to 512.
+        hop_length (Optional[int], optional): The hop length of the short time FFT. If `None`, it is set to `win_length//4`. Defaults to None.
+        win_length (Optional[int], optional): The window length of the short time FFT. If `None`, it is set to same as `n_fft`. Defaults to None.
+        window (str, optional): The window function applied to the signal before the Fourier transform. Supported window functions: 'hamming', 'hann', 'kaiser', 'gaussian', 'exponential', 'triang', 'bohman', 'blackman', 'cosine', 'tukey', 'taylor'. Defaults to 'hann'.
+        power (float, optional): Exponent for the magnitude spectrogram. Defaults to 2.0.
+        center (bool, optional): Whether to pad `x` so that the `t`-th frame is centered at `t * hop_length`. Defaults to True.
+        pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to 'reflect'.
+        dtype (str, optional): Data type of input and window. Defaults to 'float32'.
+    """
+
     def __init__(self,
                  n_fft: int=512,
                  hop_length: Optional[int]=None,
@@ -40,34 +55,7 @@ class Spectrogram(nn.Layer):
                  power: float=2.0,
                  center: bool=True,
                  pad_mode: str='reflect',
-                 dtype: str=paddle.float32):
-        """Compute spectrogram of a given signal, typically an audio waveform.
-        The spectorgram is defined as the complex norm of the short-time
-        Fourier transformation.
-        Parameters:
-            n_fft (int): the number of frequency components of the discrete Fourier transform.
-                The default value is 2048,
-            hop_length (int|None): the hop length of the short time FFT. If None, it is set to win_length//4.
-                The default value is None.
-            win_length: the window length of the short time FFt. If None, it is set to same as n_fft.
-                The default value is None.
-            window (str): the name of the window function applied to the single before the Fourier transform.
-                The folllowing window names are supported: 'hamming','hann','kaiser','gaussian',
-                'exponential','triang','bohman','blackman','cosine','tukey','taylor'.
-                The default value is 'hann'
-            power (float): Exponent for the magnitude spectrogram. The default value is 2.0.
-            center (bool): if True, the signal is padded so that frame t is centered at x[t * hop_length].
-                If False, frame t begins at x[t * hop_length]
-                The default value is True
-            pad_mode (str): the mode to pad the signal if necessary. The supported modes are 'reflect'
-                and 'constant'. The default value is 'reflect'.
-            dtype (str): the data type of input and window.
-        Notes:
-            The Spectrogram transform relies on STFT transform to compute the spectrogram.
-            By default, the weights are not learnable. To fine-tune the Fourier coefficients,
-            set stop_gradient=False before training.
-            For more information, see STFT().
-        """
+                 dtype: str='float32') -> None:
         super(Spectrogram, self).__init__()
 
         assert power > 0, 'Power of spectrogram must be > 0.'
@@ -88,13 +76,39 @@ class Spectrogram(nn.Layer):
             pad_mode=pad_mode)
         self.register_buffer('fft_window', self.fft_window)
 
-    def forward(self, x):
+    def forward(self, x: Tensor) -> Tensor:
+        """
+        Args:
+            x (Tensor): Tensor of waveforms with shape `(N, T)`
+
+        Returns:
+            Tensor: Spectrograms with shape `(N, n_fft//2 + 1, num_frames)`.
+        """
         stft = self._stft(x)
         spectrogram = paddle.pow(paddle.abs(stft), self.power)
         return spectrogram
 
 
 class MelSpectrogram(nn.Layer):
+    """Compute the melspectrogram of given signals, typically audio waveforms. It is computed by multiplying spectrogram with Mel filter bank matrix.
+
+    Args:
+        sr (int, optional): Sample rate. Defaults to 22050.
+        n_fft (int, optional): The number of frequency components of the discrete Fourier transform. Defaults to 512.
+        hop_length (Optional[int], optional): The hop length of the short time FFT. If `None`, it is set to `win_length//4`. Defaults to None.
+        win_length (Optional[int], optional): The window length of the short time FFT. If `None`, it is set to same as `n_fft`. Defaults to None.
+        window (str, optional): The window function applied to the signal before the Fourier transform. Supported window functions: 'hamming', 'hann', 'kaiser', 'gaussian', 'exponential', 'triang', 'bohman', 'blackman', 'cosine', 'tukey', 'taylor'. Defaults to 'hann'.
+        power (float, optional): Exponent for the magnitude spectrogram. Defaults to 2.0.
+        center (bool, optional): Whether to pad `x` so that the `t`-th frame is centered at `t * hop_length`. Defaults to True.
+        pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to 'reflect'.
+        n_mels (int, optional): Number of mel bins. Defaults to 64.
+        f_min (float, optional): Minimum frequency in Hz. Defaults to 50.0.
+        f_max (Optional[float], optional): Maximum frequency in Hz. Defaults to None.
+        htk (bool, optional): Use HTK formula in computing fbank matrix. Defaults to False.
+        norm (Union[str, float], optional): Type of normalization in computing fbank matrix. Slaney-style is used by default. You can specify norm=1.0/2.0 to use customized p-norm normalization. Defaults to 'slaney'.
+        dtype (str, optional): Data type of input and window. Defaults to 'float32'.
+    """
+
     def __init__(self,
                  sr: int=22050,
                  n_fft: int=512,
@@ -109,39 +123,7 @@ class MelSpectrogram(nn.Layer):
                  f_max: Optional[float]=None,
                  htk: bool=False,
                  norm: Union[str, float]='slaney',
-                 dtype: str=paddle.float32):
-        """Compute the melspectrogram of a given signal, typically an audio waveform.
-        The melspectrogram is also known as filterbank or fbank feature in audio community.
-        It is computed by multiplying spectrogram with Mel filter bank matrix.
-        Parameters:
-            sr(int): the audio sample rate.
-                The default value is 22050.
-            n_fft(int): the number of frequency components of the discrete Fourier transform.
-                The default value is 2048,
-            hop_length(int|None): the hop length of the short time FFT. If None, it is set to win_length//4.
-                The default value is None.
-            win_length: the window length of the short time FFt. If None, it is set to same as n_fft.
-                The default value is None.
-            window(str): the name of the window function applied to the single before the Fourier transform.
-                The folllowing window names are supported: 'hamming','hann','kaiser','gaussian',
-                'exponential','triang','bohman','blackman','cosine','tukey','taylor'.
-                The default value is 'hann'
-            power (float): Exponent for the magnitude spectrogram. The default value is 2.0.
-            center(bool): if True, the signal is padded so that frame t is centered at x[t * hop_length].
-                If False, frame t begins at x[t * hop_length]
-                The default value is True
-            pad_mode(str): the mode to pad the signal if necessary. The supported modes are 'reflect'
-                and 'constant'.
-                The default value is 'reflect'.
-            n_mels(int): the mel bins.
-            f_min(float): the lower cut-off frequency, below which the filter response is zero.
-            f_max(float): the upper cut-off frequency, above which the filter response is zeros.
-            htk(bool): whether to use HTK formula in computing fbank matrix.
-            norm(str|float): the normalization type in computing fbank matrix.  Slaney-style is used by default.
-                You can specify norm=1.0/2.0 to use customized p-norm normalization.
-            dtype(str): the datatype of fbank matrix used in the transform. Use float64 to increase numerical
-                accuracy. Note that the final transform will be conducted in float32 regardless of dtype of fbank matrix.
-        """
+                 dtype: str='float32') -> None:
         super(MelSpectrogram, self).__init__()
 
         self._spectrogram = Spectrogram(
@@ -171,13 +153,42 @@ class MelSpectrogram(nn.Layer):
             dtype=dtype)  # float64 for better numerical results
         self.register_buffer('fbank_matrix', self.fbank_matrix)
 
-    def forward(self, x):
+    def forward(self, x: Tensor) -> Tensor:
+        """
+        Args:
+            x (Tensor): Tensor of waveforms with shape `(N, T)`
+
+        Returns:
+            Tensor: Mel spectrograms with shape `(N, n_mels, num_frames)`.
+        """
         spect_feature = self._spectrogram(x)
         mel_feature = paddle.matmul(self.fbank_matrix, spect_feature)
         return mel_feature
 
 
 class LogMelSpectrogram(nn.Layer):
+    """Compute log-mel-spectrogram feature of given signals, typically audio waveforms.
+
+    Args:
+        sr (int, optional): Sample rate. Defaults to 22050.
+        n_fft (int, optional): The number of frequency components of the discrete Fourier transform. Defaults to 512.
+        hop_length (Optional[int], optional): The hop length of the short time FFT. If `None`, it is set to `win_length//4`. Defaults to None.
+        win_length (Optional[int], optional): The window length of the short time FFT. If `None`, it is set to same as `n_fft`. Defaults to None.
+        window (str, optional): The window function applied to the signal before the Fourier transform. Supported window functions: 'hamming', 'hann', 'kaiser', 'gaussian', 'exponential', 'triang', 'bohman', 'blackman', 'cosine', 'tukey', 'taylor'. Defaults to 'hann'.
+        power (float, optional): Exponent for the magnitude spectrogram. Defaults to 2.0.
+        center (bool, optional): Whether to pad `x` so that the `t`-th frame is centered at `t * hop_length`. Defaults to True.
+        pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to 'reflect'.
+        n_mels (int, optional): Number of mel bins. Defaults to 64.
+        f_min (float, optional): Minimum frequency in Hz. Defaults to 50.0.
+        f_max (Optional[float], optional): Maximum frequency in Hz. Defaults to None.
+        htk (bool, optional): Use HTK formula in computing fbank matrix. Defaults to False.
+        norm (Union[str, float], optional): Type of normalization in computing fbank matrix. Slaney-style is used by default. You can specify norm=1.0/2.0 to use customized p-norm normalization. Defaults to 'slaney'.
+        ref_value (float, optional): The reference value. If smaller than 1.0, the db level of the signal will be pulled up accordingly. Otherwise, the db level is pushed down. Defaults to 1.0.
+        amin (float, optional): The minimum value of input magnitude. Defaults to 1e-10.
+        top_db (Optional[float], optional): The maximum db value of spectrogram. Defaults to None.
+        dtype (str, optional): Data type of input and window. Defaults to 'float32'.
+    """
+
     def __init__(self,
                  sr: int=22050,
                  n_fft: int=512,
@@ -195,41 +206,7 @@ class LogMelSpectrogram(nn.Layer):
                  ref_value: float=1.0,
                  amin: float=1e-10,
                  top_db: Optional[float]=None,
-                 dtype: str=paddle.float32):
-        """Compute log-mel-spectrogram(also known as LogFBank) feature of a given signal,
-        typically an audio waveform.
-        Parameters:
-            sr (int): the audio sample rate.
-                The default value is 22050.
-            n_fft (int): the number of frequency components of the discrete Fourier transform.
-                The default value is 2048,
-            hop_length (int|None): the hop length of the short time FFT. If None, it is set to win_length//4.
-                The default value is None.
-            win_length: the window length of the short time FFt. If None, it is set to same as n_fft.
-                The default value is None.
-            window (str): the name of the window function applied to the single before the Fourier transform.
-                The folllowing window names are supported: 'hamming','hann','kaiser','gaussian',
-                'exponential','triang','bohman','blackman','cosine','tukey','taylor'.
-                The default value is 'hann'
-            center (bool): if True, the signal is padded so that frame t is centered at x[t * hop_length].
-                If False, frame t begins at x[t * hop_length]
-                The default value is True
-            pad_mode (str): the mode to pad the signal if necessary. The supported modes are 'reflect'
-                and 'constant'.
-                The default value is 'reflect'.
-            n_mels (int): the mel bins.
-            f_min (float): the lower cut-off frequency, below which the filter response is zero.
-            f_max (float): the upper cut-off frequency, above which the filter response is zeros.
-            htk (bool): whether to use HTK formula in computing fbank matrix.
-            norm (str|float): the normalization type in computing fbank matrix. Slaney-style is used by default.
-                You can specify norm=1.0/2.0 to use customized p-norm normalization.
-            ref_value (float): the reference value. If smaller than 1.0, the db level of the signal will be pulled up accordingly. Otherwise, the db level is pushed down.
-            amin (float): the minimum value of input magnitude, below which the input magnitude is clipped(to amin).
-            top_db (float): the maximum db value of resulting spectrum, above which the
-                spectrum is clipped(to top_db).
-            dtype (str): the datatype of fbank matrix used in the transform. Use float64 to increase numerical
-                accuracy. Note that the final transform will be conducted in float32 regardless of dtype of fbank matrix.
-        """
+                 dtype: str='float32') -> None:
         super(LogMelSpectrogram, self).__init__()
 
         self._melspectrogram = MelSpectrogram(
@@ -252,7 +229,14 @@ class LogMelSpectrogram(nn.Layer):
         self.amin = amin
         self.top_db = top_db
 
-    def forward(self, x):
+    def forward(self, x: Tensor) -> Tensor:
+        """
+        Args:
+            x (Tensor): Tensor of waveforms with shape `(N, T)`
+
+        Returns:
+            Tensor: Log mel spectrograms with shape `(N, n_mels, num_frames)`.
+        """
         mel_feature = self._melspectrogram(x)
         log_mel_feature = power_to_db(
             mel_feature,
@@ -263,6 +247,29 @@ class LogMelSpectrogram(nn.Layer):
 
 
 class MFCC(nn.Layer):
+    """Compute mel frequency cepstral coefficients(MFCCs) feature of given waveforms.
+
+    Args:
+        sr (int, optional): Sample rate. Defaults to 22050.
+        n_mfcc (int, optional): Number of cepstra in MFCC. Defaults to 40.
+        n_fft (int, optional): The number of frequency components of the discrete Fourier transform. Defaults to 512.
+        hop_length (Optional[int], optional): The hop length of the short time FFT. If `None`, it is set to `win_length//4`. Defaults to None.
+        win_length (Optional[int], optional): The window length of the short time FFT. If `None`, it is set to same as `n_fft`. Defaults to None.
+        window (str, optional): The window function applied to the signal before the Fourier transform. Supported window functions: 'hamming', 'hann', 'kaiser', 'gaussian', 'exponential', 'triang', 'bohman', 'blackman', 'cosine', 'tukey', 'taylor'. Defaults to 'hann'.
+        power (float, optional): Exponent for the magnitude spectrogram. Defaults to 2.0.
+        center (bool, optional): Whether to pad `x` so that the `t`-th frame is centered at `t * hop_length`. Defaults to True.
+        pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to 'reflect'.
+        n_mels (int, optional): Number of mel bins. Defaults to 64.
+        f_min (float, optional): Minimum frequency in Hz. Defaults to 50.0.
+        f_max (Optional[float], optional): Maximum frequency in Hz. Defaults to None.
+        htk (bool, optional): Use HTK formula in computing fbank matrix. Defaults to False.
+        norm (Union[str, float], optional): Type of normalization in computing fbank matrix. Slaney-style is used by default. You can specify norm=1.0/2.0 to use customized p-norm normalization. Defaults to 'slaney'.
+        ref_value (float, optional): The reference value. If smaller than 1.0, the db level of the signal will be pulled up accordingly. Otherwise, the db level is pushed down. Defaults to 1.0.
+        amin (float, optional): The minimum value of input magnitude. Defaults to 1e-10.
+        top_db (Optional[float], optional): The maximum db value of spectrogram. Defaults to None.
+        dtype (str, optional): Data type of input and window. Defaults to 'float32'.
+    """
+
     def __init__(self,
                  sr: int=22050,
                  n_mfcc: int=40,
@@ -281,43 +288,7 @@ class MFCC(nn.Layer):
                  ref_value: float=1.0,
                  amin: float=1e-10,
                  top_db: Optional[float]=None,
-                 dtype: str=paddle.float32):
-        """Compute mel frequency cepstral coefficients(MFCCs) feature of given waveforms.
-
-        Parameters:
-            sr(int): the audio sample rate.
-                The default value is 22050.
-            n_mfcc (int, optional): Number of cepstra in MFCC. Defaults to 40.
-            n_fft (int): the number of frequency components of the discrete Fourier transform.
-                The default value is 2048,
-            hop_length (int|None): the hop length of the short time FFT. If None, it is set to win_length//4.
-                The default value is None.
-            win_length: the window length of the short time FFt. If None, it is set to same as n_fft.
-                The default value is None.
-            window (str): the name of the window function applied to the single before the Fourier transform.
-                The folllowing window names are supported: 'hamming','hann','kaiser','gaussian',
-                'exponential','triang','bohman','blackman','cosine','tukey','taylor'.
-                The default value is 'hann'
-            power (float): Exponent for the magnitude spectrogram. The default value is 2.0.
-            center (bool): if True, the signal is padded so that frame t is centered at x[t * hop_length].
-                If False, frame t begins at x[t * hop_length]
-                The default value is True
-            pad_mode (str): the mode to pad the signal if necessary. The supported modes are 'reflect'
-                and 'constant'.
-                The default value is 'reflect'.
-            n_mels (int): the mel bins.
-            f_min (float): the lower cut-off frequency, below which the filter response is zero.
-            f_max (float): the upper cut-off frequency, above which the filter response is zeros.
-            htk (bool): whether to use HTK formula in computing fbank matrix.
-            norm (str|float): the normalization type in computing fbank matrix. Slaney-style is used by default.
-                You can specify norm=1.0/2.0 to use customized p-norm normalization.
-            ref_value (float): the reference value. If smaller than 1.0, the db level of the signal will be pulled up accordingly. Otherwise, the db level is pushed down.
-            amin (float): the minimum value of input magnitude, below which the input magnitude is clipped(to amin).
-            top_db (float): the maximum db value of resulting spectrum, above which the
-                spectrum is clipped(to top_db).
-            dtype (str): the datatype of fbank matrix used in the transform. Use float64 to increase numerical
-                accuracy. Note that the final transform will be conducted in float32 regardless of dtype of fbank matrix.
-        """
+                 dtype: str=paddle.float32) -> None:
         super(MFCC, self).__init__()
         assert n_mfcc <= n_mels, 'n_mfcc cannot be larger than n_mels: %d vs %d' % (
             n_mfcc, n_mels)
@@ -342,7 +313,14 @@ class MFCC(nn.Layer):
         self.dct_matrix = create_dct(n_mfcc=n_mfcc, n_mels=n_mels, dtype=dtype)
         self.register_buffer('dct_matrix', self.dct_matrix)
 
-    def forward(self, x):
+    def forward(self, x: Tensor) -> Tensor:
+        """
+        Args:
+            x (Tensor): Tensor of waveforms with shape `(N, T)`
+
+        Returns:
+            Tensor: Mel frequency cepstral coefficients with shape `(N, n_mfcc, num_frames)`.
+        """
         log_mel_feature = self._log_melspectrogram(x)
         mfcc = paddle.matmul(
             log_mel_feature.transpose((0, 2, 1)), self.dct_matrix).transpose(
diff --git a/paddleaudio/paddleaudio/functional/functional.py b/paddleaudio/paddleaudio/functional/functional.py
index c5ab3045..19c63a9a 100644
--- a/paddleaudio/paddleaudio/functional/functional.py
+++ b/paddleaudio/paddleaudio/functional/functional.py
@@ -17,6 +17,7 @@ from typing import Optional
 from typing import Union
 
 import paddle
+from paddle import Tensor
 
 __all__ = [
     'hz_to_mel',
@@ -29,19 +30,20 @@ __all__ = [
 ]
 
 
-def hz_to_mel(freq: Union[paddle.Tensor, float],
-              htk: bool=False) -> Union[paddle.Tensor, float]:
+def hz_to_mel(freq: Union[Tensor, float],
+              htk: bool=False) -> Union[Tensor, float]:
     """Convert Hz to Mels.
-    Parameters:
-        freq: the input tensor of arbitrary shape, or a single floating point number.
-        htk: use HTK formula to do the conversion.
-            The default value is False.
+
+    Args:
+        freq (Union[Tensor, float]): The input tensor with arbitrary shape.
+        htk (bool, optional): Use htk scaling. Defaults to False.
+
     Returns:
-        The frequencies represented in Mel-scale.
+        Union[Tensor, float]: Frequency in mels.
     """
 
     if htk:
-        if isinstance(freq, paddle.Tensor):
+        if isinstance(freq, Tensor):
             return 2595.0 * paddle.log10(1.0 + freq / 700.0)
         else:
             return 2595.0 * math.log10(1.0 + freq / 700.0)
@@ -58,7 +60,7 @@ def hz_to_mel(freq: Union[paddle.Tensor, float],
     min_log_mel = (min_log_hz - f_min) / f_sp  # same (Mels)
     logstep = math.log(6.4) / 27.0  # step size for log region
 
-    if isinstance(freq, paddle.Tensor):
+    if isinstance(freq, Tensor):
         target = min_log_mel + paddle.log(
             freq / min_log_hz + 1e-10) / logstep  # prevent nan with 1e-10
         mask = (freq > min_log_hz).astype(freq.dtype)
@@ -71,14 +73,16 @@ def hz_to_mel(freq: Union[paddle.Tensor, float],
     return mels
 
 
-def mel_to_hz(mel: Union[float, paddle.Tensor],
-              htk: bool=False) -> Union[float, paddle.Tensor]:
+def mel_to_hz(mel: Union[float, Tensor],
+              htk: bool=False) -> Union[float, Tensor]:
     """Convert mel bin numbers to frequencies.
-    Parameters:
-        mel: the mel frequency represented as a tensor of arbitrary shape, or a floating point number.
-        htk: use HTK formula to do the conversion.
+
+    Args:
+        mel (Union[float, Tensor]): The mel frequency represented as a tensor with arbitrary shape.
+        htk (bool, optional): Use htk scaling. Defaults to False.
+
     Returns:
-        The frequencies represented in hz.
+        Union[float, Tensor]: Frequencies in Hz.
     """
     if htk:
         return 700.0 * (10.0**(mel / 2595.0) - 1.0)
@@ -90,7 +94,7 @@ def mel_to_hz(mel: Union[float, paddle.Tensor],
     min_log_hz = 1000.0  # beginning of log region (Hz)
     min_log_mel = (min_log_hz - f_min) / f_sp  # same (Mels)
     logstep = math.log(6.4) / 27.0  # step size for log region
-    if isinstance(mel, paddle.Tensor):
+    if isinstance(mel, Tensor):
         target = min_log_hz * paddle.exp(logstep * (mel - min_log_mel))
         mask = (mel > min_log_mel).astype(mel.dtype)
         freqs = target * mask + freqs * (
@@ -106,16 +110,18 @@ def mel_frequencies(n_mels: int=64,
                     f_min: float=0.0,
                     f_max: float=11025.0,
                     htk: bool=False,
-                    dtype: str=paddle.float32):
+                    dtype: str='float32') -> Tensor:
     """Compute mel frequencies.
-    Parameters:
-        n_mels(int): number of Mel bins.
-        f_min(float): the lower cut-off frequency, below which the filter response is zero.
-        f_max(float): the upper cut-off frequency, above which the filter response is zero.
-        htk(bool): whether to use htk formula.
-        dtype(str): the datatype of the return frequencies.
+
+    Args:
+        n_mels (int, optional): Number of mel bins. Defaults to 64.
+        f_min (float, optional): Minimum frequency in Hz. Defaults to 0.0.
+        f_max (float, optional): Maximum frequency in Hz. Defaults to 11025.0.
+        htk (bool, optional): Use htk scaling. Defaults to False.
+        dtype (str, optional): The data type of the return frequencies. Defaults to 'float32'.
+
     Returns:
-        The frequencies represented in Mel-scale
+        Tensor: Tensor of n_mels frequencies in Hz with shape `(n_mels,)`.
     """
     # 'Center freqs' of mel bands - uniformly spaced between limits
     min_mel = hz_to_mel(f_min, htk=htk)
@@ -125,14 +131,16 @@ def mel_frequencies(n_mels: int=64,
     return freqs
 
 
-def fft_frequencies(sr: int, n_fft: int, dtype: str=paddle.float32):
+def fft_frequencies(sr: int, n_fft: int, dtype: str='float32') -> Tensor:
     """Compute fourier frequencies.
-    Parameters:
-        sr(int): the audio sample rate.
-        n_fft(float): the number of fft bins.
-        dtype(str): the datatype of the return frequencies.
+
+    Args:
+        sr (int): Sample rate.
+        n_fft (int): Number of fft bins.
+        dtype (str, optional): The data type of the return frequencies. Defaults to 'float32'.
+
     Returns:
-        The frequencies represented in hz.
+        Tensor: FFT frequencies in Hz with shape `(n_fft//2 + 1,)`.
     """
     return paddle.linspace(0, float(sr) / 2, int(1 + n_fft // 2), dtype=dtype)
 
@@ -144,23 +152,21 @@ def compute_fbank_matrix(sr: int,
                          f_max: Optional[float]=None,
                          htk: bool=False,
                          norm: Union[str, float]='slaney',
-                         dtype: str=paddle.float32):
+                         dtype: str='float32') -> Tensor:
     """Compute fbank matrix.
-    Parameters:
-        sr(int): the audio sample rate.
-        n_fft(int): the number of fft bins.
-        n_mels(int): the number of Mel bins.
-        f_min(float): the lower cut-off frequency, below which the filter response is zero.
-        f_max(float): the upper cut-off frequency, above which the filter response is zero.
-        htk: whether to use htk formula.
-        return_complex(bool): whether to return complex matrix. If True, the matrix will
-            be complex type. Otherwise, the real and image part will be stored in the last
-            axis of returned tensor.
-        dtype(str): the datatype of the returned fbank matrix.
+
+    Args:
+        sr (int): Sample rate.
+        n_fft (int): Number of fft bins.
+        n_mels (int, optional): Number of mel bins. Defaults to 64.
+        f_min (float, optional): Minimum frequency in Hz. Defaults to 0.0.
+        f_max (Optional[float], optional): Maximum frequency in Hz. Defaults to None.
+        htk (bool, optional): Use htk scaling. Defaults to False.
+        norm (Union[str, float], optional): Type of normalization. Defaults to 'slaney'.
+        dtype (str, optional): The data type of the return matrix. Defaults to 'float32'.
+
     Returns:
-        The fbank matrix of shape (n_mels, int(1+n_fft//2)).
-    Shape:
-        output: (n_mels, int(1+n_fft//2))
+        Tensor: Mel transform matrix with shape `(n_mels, n_fft//2 + 1)`.
     """
 
     if f_max is None:
@@ -199,27 +205,20 @@ def compute_fbank_matrix(sr: int,
     return weights
 
 
-def power_to_db(magnitude: paddle.Tensor,
+def power_to_db(spect: Tensor,
                 ref_value: float=1.0,
                 amin: float=1e-10,
-                top_db: Optional[float]=None) -> paddle.Tensor:
-    """Convert a power spectrogram (amplitude squared) to decibel (dB) units.
-    The function computes the scaling ``10 * log10(x / ref)`` in a numerically
-    stable way.
-    Parameters:
-        magnitude(Tensor): the input magnitude tensor of any shape.
-        ref_value(float): the reference value. If smaller than 1.0, the db level
-            of the signal will be pulled up accordingly. Otherwise, the db level
-            is pushed down.
-        amin(float): the minimum value of input magnitude, below which the input
-            magnitude is clipped(to amin).
-        top_db(float): the maximum db value of resulting spectrum, above which the
-            spectrum is clipped(to top_db).
+                top_db: Optional[float]=None) -> Tensor:
+    """Convert a power spectrogram (amplitude squared) to decibel (dB) units. The function computes the scaling `10 * log10(x / ref)` in a numerically stable way.
+
+    Args:
+        spect (Tensor): STFT power spectrogram.
+        ref_value (float, optional): The reference value. If smaller than 1.0, the db level of the signal will be pulled up accordingly. Otherwise, the db level is pushed down. Defaults to 1.0.
+        amin (float, optional): Minimum threshold. Defaults to 1e-10.
+        top_db (Optional[float], optional): Threshold the output at `top_db` below the peak. Defaults to None.
+
     Returns:
-        The spectrogram in log-scale.
-    shape:
-        input: any shape
-        output: same as input
+        Tensor: Power spectrogram in db scale.
     """
     if amin <= 0:
         raise Exception("amin must be strictly positive")
@@ -227,8 +226,8 @@ def power_to_db(magnitude: paddle.Tensor,
     if ref_value <= 0:
         raise Exception("ref_value must be strictly positive")
 
-    ones = paddle.ones_like(magnitude)
-    log_spec = 10.0 * paddle.log10(paddle.maximum(ones * amin, magnitude))
+    ones = paddle.ones_like(spect)
+    log_spec = 10.0 * paddle.log10(paddle.maximum(ones * amin, spect))
     log_spec -= 10.0 * math.log10(max(ref_value, amin))
 
     if top_db is not None:
@@ -242,15 +241,17 @@ def power_to_db(magnitude: paddle.Tensor,
 def create_dct(n_mfcc: int,
                n_mels: int,
                norm: Optional[str]='ortho',
-               dtype: Optional[str]=paddle.float32) -> paddle.Tensor:
+               dtype: str='float32') -> Tensor:
     """Create a discrete cosine transform(DCT) matrix.
 
-    Parameters:
+    Args:
         n_mfcc (int): Number of mel frequency cepstral coefficients. 
         n_mels (int): Number of mel filterbanks.
-        norm (str, optional): Normalizaiton type. Defaults to 'ortho'.
+        norm (Optional[str], optional): Normalization type. Defaults to 'ortho'.
+        dtype (str, optional): The data type of the return matrix. Defaults to 'float32'.
+
     Returns:
-        Tensor: The DCT matrix with shape (n_mels, n_mfcc).
+        Tensor: The DCT matrix with shape `(n_mels, n_mfcc)`.
     """
     n = paddle.arange(n_mels, dtype=dtype)
     k = paddle.arange(n_mfcc, dtype=dtype).unsqueeze(1)
diff --git a/paddleaudio/paddleaudio/functional/window.py b/paddleaudio/paddleaudio/functional/window.py
index f321b38e..c99d5046 100644
--- a/paddleaudio/paddleaudio/functional/window.py
+++ b/paddleaudio/paddleaudio/functional/window.py
@@ -20,24 +20,11 @@ from paddle import Tensor
 
 __all__ = [
     'get_window',
-
-    # windows
-    'taylor',
-    'hamming',
-    'hann',
-    'tukey',
-    'kaiser',
-    'gaussian',
-    'exponential',
-    'triang',
-    'bohman',
-    'blackman',
-    'cosine',
 ]
 
 
-def _cat(a: List[Tensor], data_type: str) -> Tensor:
-    l = [paddle.to_tensor(_a, data_type) for _a in a]
+def _cat(x: List[Tensor], data_type: str) -> Tensor:
+    l = [paddle.to_tensor(_, data_type) for _ in x]
     return paddle.concat(l)
 
 
@@ -48,7 +35,7 @@ def _acosh(x: Union[Tensor, float]) -> Tensor:
 
 
 def _extend(M: int, sym: bool) -> bool:
-    """Extend window by 1 sample if needed for DFT-even symmetry"""
+    """Extend window by 1 sample if needed for DFT-even symmetry."""
     if not sym:
         return M + 1, True
     else:
@@ -56,7 +43,7 @@ def _extend(M: int, sym: bool) -> bool:
 
 
 def _len_guards(M: int) -> bool:
-    """Handle small or incorrect window lengths"""
+    """Handle small or incorrect window lengths."""
     if int(M) != M or M < 0:
         raise ValueError('Window length M must be a non-negative integer')
 
@@ -64,15 +51,15 @@ def _len_guards(M: int) -> bool:
 
 
 def _truncate(w: Tensor, needed: bool) -> Tensor:
-    """Truncate window by 1 sample if needed for DFT-even symmetry"""
+    """Truncate window by 1 sample if needed for DFT-even symmetry."""
     if needed:
         return w[:-1]
     else:
         return w
 
 
-def general_gaussian(M: int, p, sig, sym: bool=True,
-                     dtype: str='float64') -> Tensor:
+def _general_gaussian(M: int, p, sig, sym: bool=True,
+                      dtype: str='float64') -> Tensor:
     """Compute a window with a generalized Gaussian shape.
     This function is consistent with scipy.signal.windows.general_gaussian().
     """
@@ -86,8 +73,8 @@ def general_gaussian(M: int, p, sig, sym: bool=True,
     return _truncate(w, needs_trunc)
 
 
-def general_cosine(M: int, a: float, sym: bool=True,
-                   dtype: str='float64') -> Tensor:
+def _general_cosine(M: int, a: float, sym: bool=True,
+                    dtype: str='float64') -> Tensor:
     """Compute a generic weighted sum of cosine terms window.
     This function is consistent with scipy.signal.windows.general_cosine().
     """
@@ -101,31 +88,23 @@ def general_cosine(M: int, a: float, sym: bool=True,
     return _truncate(w, needs_trunc)
 
 
-def general_hamming(M: int, alpha: float, sym: bool=True,
-                    dtype: str='float64') -> Tensor:
+def _general_hamming(M: int, alpha: float, sym: bool=True,
+                     dtype: str='float64') -> Tensor:
     """Compute a generalized Hamming window.
     This function is consistent with scipy.signal.windows.general_hamming()
     """
-    return general_cosine(M, [alpha, 1. - alpha], sym, dtype=dtype)
+    return _general_cosine(M, [alpha, 1. - alpha], sym, dtype=dtype)
 
 
-def taylor(M: int,
-           nbar=4,
-           sll=30,
-           norm=True,
-           sym: bool=True,
-           dtype: str='float64') -> Tensor:
+def _taylor(M: int,
+            nbar=4,
+            sll=30,
+            norm=True,
+            sym: bool=True,
+            dtype: str='float64') -> Tensor:
     """Compute a Taylor window.
     The Taylor window taper function approximates the Dolph-Chebyshev window's
     constant sidelobe level for a parameterized number of near-in sidelobes.
-    Parameters:
-        M(int): window size
-        nbar, sil, norm: the window-specific parameter.
-        sym(bool)：whether to return symmetric window.
-            The default value is True
-        dtype(str): the datatype of returned tensor.
-    Returns:
-        Tensor: the window tensor
     """
     if _len_guards(M):
         return paddle.ones((M, ), dtype=dtype)
@@ -171,46 +150,25 @@ def taylor(M: int,
     return _truncate(w, needs_trunc)
 
 
-def hamming(M: int, sym: bool=True, dtype: str='float64') -> Tensor:
+def _hamming(M: int, sym: bool=True, dtype: str='float64') -> Tensor:
     """Compute a Hamming window.
     The Hamming window is a taper formed by using a raised cosine with
     non-zero endpoints, optimized to minimize the nearest side lobe.
-    Parameters:
-        M(int): window size
-        sym(bool)：whether to return symmetric window.
-            The default value is True
-        dtype(str): the datatype of returned tensor.
-    Returns:
-        Tensor: the window tensor
     """
-    return general_hamming(M, 0.54, sym, dtype=dtype)
+    return _general_hamming(M, 0.54, sym, dtype=dtype)
 
 
-def hann(M: int, sym: bool=True, dtype: str='float64') -> Tensor:
+def _hann(M: int, sym: bool=True, dtype: str='float64') -> Tensor:
     """Compute a Hann window.
     The Hann window is a taper formed by using a raised cosine or sine-squared
     with ends that touch zero.
-    Parameters:
-        M(int): window size
-        sym(bool)：whether to return symmetric window.
-            The default value is True
-        dtype(str): the datatype of returned tensor.
-    Returns:
-        Tensor: the window tensor
     """
-    return general_hamming(M, 0.5, sym, dtype=dtype)
+    return _general_hamming(M, 0.5, sym, dtype=dtype)
 
 
-def tukey(M: int, alpha=0.5, sym: bool=True, dtype: str='float64') -> Tensor:
+def _tukey(M: int, alpha=0.5, sym: bool=True, dtype: str='float64') -> Tensor:
     """Compute a Tukey window.
     The Tukey window is also known as a tapered cosine window.
-    Parameters:
-        M(int): window size
-        sym(bool)：whether to return symmetric window.
-            The default value is True
-        dtype(str): the datatype of returned tensor.
-    Returns:
-        Tensor: the window tensor
     """
     if _len_guards(M):
         return paddle.ones((M, ), dtype=dtype)
@@ -237,32 +195,18 @@ def tukey(M: int, alpha=0.5, sym: bool=True, dtype: str='float64') -> Tensor:
     return _truncate(w, needs_trunc)
 
 
-def kaiser(M: int, beta: float, sym: bool=True, dtype: str='float64') -> Tensor:
+def _kaiser(M: int, beta: float, sym: bool=True,
+            dtype: str='float64') -> Tensor:
     """Compute a Kaiser window.
     The Kaiser window is a taper formed by using a Bessel function.
-    Parameters:
-        M(int): window size.
-        beta(float): the window-specific parameter.
-        sym(bool)：whether to return symmetric window.
-            The default value is True
-    Returns:
-        Tensor: the window tensor
     """
     raise NotImplementedError()
 
 
-def gaussian(M: int, std: float, sym: bool=True,
-             dtype: str='float64') -> Tensor:
+def _gaussian(M: int, std: float, sym: bool=True,
+              dtype: str='float64') -> Tensor:
     """Compute a Gaussian window.
     The Gaussian widows has a Gaussian shape defined by the standard deviation(std).
-    Parameters:
-        M(int): window size.
-        std(float): the window-specific parameter.
-        sym(bool)：whether to return symmetric window.
-            The default value is True
-        dtype(str): the datatype of returned tensor.
-    Returns:
-        Tensor: the window tensor
     """
     if _len_guards(M):
         return paddle.ones((M, ), dtype=dtype)
@@ -275,21 +219,12 @@ def gaussian(M: int, std: float, sym: bool=True,
     return _truncate(w, needs_trunc)
 
 
-def exponential(M: int,
-                center=None,
-                tau=1.,
-                sym: bool=True,
-                dtype: str='float64') -> Tensor:
-    """Compute an exponential (or Poisson) window.
-    Parameters:
-        M(int): window size.
-        tau(float): the window-specific parameter.
-        sym(bool)：whether to return symmetric window.
-            The default value is True
-        dtype(str): the datatype of returned tensor.
-    Returns:
-        Tensor: the window tensor
-    """
+def _exponential(M: int,
+                 center=None,
+                 tau=1.,
+                 sym: bool=True,
+                 dtype: str='float64') -> Tensor:
+    """Compute an exponential (or Poisson) window."""
     if sym and center is not None:
         raise ValueError("If sym==True, center must be None.")
     if _len_guards(M):
@@ -305,15 +240,8 @@ def exponential(M: int,
     return _truncate(w, needs_trunc)
 
 
-def triang(M: int, sym: bool=True, dtype: str='float64') -> Tensor:
+def _triang(M: int, sym: bool=True, dtype: str='float64') -> Tensor:
     """Compute a triangular window.
-    Parameters:
-        M(int): window size.
-        sym(bool)：whether to return symmetric window.
-            The default value is True
-        dtype(str): the datatype of returned tensor.
-    Returns:
-        Tensor: the window tensor
     """
     if _len_guards(M):
         return paddle.ones((M, ), dtype=dtype)
@@ -330,16 +258,9 @@ def triang(M: int, sym: bool=True, dtype: str='float64') -> Tensor:
     return _truncate(w, needs_trunc)
 
 
-def bohman(M: int, sym: bool=True, dtype: str='float64') -> Tensor:
+def _bohman(M: int, sym: bool=True, dtype: str='float64') -> Tensor:
     """Compute a Bohman window.
     The Bohman window is the autocorrelation of a cosine window.
-    Parameters:
-        M(int): window size.
-        sym(bool)：whether to return symmetric window.
-            The default value is True
-        dtype(str): the datatype of returned tensor.
-    Returns:
-        Tensor: the window tensor
     """
     if _len_guards(M):
         return paddle.ones((M, ), dtype=dtype)
@@ -353,32 +274,18 @@ def bohman(M: int, sym: bool=True, dtype: str='float64') -> Tensor:
     return _truncate(w, needs_trunc)
 
 
-def blackman(M: int, sym: bool=True, dtype: str='float64') -> Tensor:
+def _blackman(M: int, sym: bool=True, dtype: str='float64') -> Tensor:
     """Compute a Blackman window.
     The Blackman window is a taper formed by using the first three terms of
     a summation of cosines. It was designed to have close to the minimal
     leakage possible.  It is close to optimal, only slightly worse than a
     Kaiser window.
-    Parameters:
-        M(int): window size.
-        sym(bool)：whether to return symmetric window.
-            The default value is True
-        dtype(str): the datatype of returned tensor.
-    Returns:
-        Tensor: the window tensor
     """
-    return general_cosine(M, [0.42, 0.50, 0.08], sym, dtype=dtype)
+    return _general_cosine(M, [0.42, 0.50, 0.08], sym, dtype=dtype)
 
 
-def cosine(M: int, sym: bool=True, dtype: str='float64') -> Tensor:
+def _cosine(M: int, sym: bool=True, dtype: str='float64') -> Tensor:
     """Compute a window with a simple cosine shape.
-    Parameters:
-        M(int): window size.
-        sym(bool)：whether to return symmetric window.
-            The default value is True
-        dtype(str): the datatype of returned tensor.
-    Returns:
-        Tensor: the window tensor
     """
     if _len_guards(M):
         return paddle.ones((M, ), dtype=dtype)
@@ -388,19 +295,20 @@ def cosine(M: int, sym: bool=True, dtype: str='float64') -> Tensor:
     return _truncate(w, needs_trunc)
 
 
-## factory function
 def get_window(window: Union[str, Tuple[str, float]],
                win_length: int,
                fftbins: bool=True,
                dtype: str='float64') -> Tensor:
     """Return a window of a given length and type.
-    Parameters:
-        window(str|(str,float)): the type of window to create.
-        win_length(int): the number of samples in the window.
-        fftbins(bool): If True, create a "periodic" window. Otherwise,
-            create a "symmetric" window, for use in filter design.
+
+    Args:
+        window (Union[str, Tuple[str, float]]): The window function applied to the signal before the Fourier transform. Supported window functions: 'hamming', 'hann', 'gaussian', 'exponential', 'triang', 'bohman', 'blackman', 'cosine', 'tukey', 'taylor'. Note that 'kaiser' is declared but its implementation currently raises NotImplementedError.
+        win_length (int): Number of samples.
+        fftbins (bool, optional): If True, create a "periodic" window. Otherwise, create a "symmetric" window, for use in filter design. Defaults to True.
+        dtype (str, optional): The data type of the return window. Defaults to 'float64'.
+
     Returns:
-       The window represented as a tensor.
+        Tensor: The window represented as a tensor.
     """
     sym = not fftbins
 
@@ -420,7 +328,7 @@ def get_window(window: Union[str, Tuple[str, float]],
                          str(type(window)))
 
     try:
-        winfunc = eval(winstr)
+        winfunc = eval('_' + winstr)
     except KeyError as e:
         raise ValueError("Unknown window type.") from e
 
diff --git a/paddleaudio/paddleaudio/metric/dtw.py b/paddleaudio/paddleaudio/metric/dtw.py
index d27f56e2..c4dc7a28 100644
--- a/paddleaudio/paddleaudio/metric/dtw.py
+++ b/paddleaudio/paddleaudio/metric/dtw.py
@@ -20,9 +20,7 @@ __all__ = [
 
 
 def dtw_distance(xs: np.ndarray, ys: np.ndarray) -> float:
-    """dtw distance
-
-    Dynamic Time Warping.
+    """Dynamic Time Warping.
     This function keeps a compact matrix, not the full warping paths matrix.
     Uses dynamic programming to compute:
 
diff --git a/paddlespeech/cli/executor.py b/paddlespeech/cli/executor.py
index d77d27b0..064939a8 100644
--- a/paddlespeech/cli/executor.py
+++ b/paddlespeech/cli/executor.py
@@ -178,7 +178,8 @@ class BaseExecutor(ABC):
         Returns:
             bool: return `True` for job input, `False` otherwise.
         """
-        return input_ and os.path.isfile(input_) and input_.endswith('.job')
+        return input_ and os.path.isfile(input_) and (input_.endswith('.job') or
+                                                      input_.endswith('.txt'))
 
     def _get_job_contents(
             self, job_input: os.PathLike) -> Dict[str, Union[str, os.PathLike]]: