Merge branch 'develop' into patch-12

8 months ago · 0cb86f60af
parent 24591c050a e02a52221a
commit 0cb86f60af
62 changed files with 152 additions and 134 deletions
--- a/paddlespeech/audiotools/README.md
+++ b/paddlespeech/audiotools/README.md
--- a/paddlespeech/audiotools/init.py
+++ b/paddlespeech/audiotools/init.py
--- a/paddlespeech/audiotools/core/init.py
+++ b/paddlespeech/audiotools/core/init.py
--- a/paddlespeech/audiotools/core/_julius.py
+++ b/paddlespeech/audiotools/core/_julius.py
--- a/paddlespeech/audiotools/core/audio_signal.py
+++ b/paddlespeech/audiotools/core/audio_signal.py
--- a/paddlespeech/audiotools/core/display.py
+++ b/paddlespeech/audiotools/core/display.py
--- a/paddlespeech/audiotools/core/dsp.py
+++ b/paddlespeech/audiotools/core/dsp.py
--- a/paddlespeech/audiotools/core/effects.py
+++ b/paddlespeech/audiotools/core/effects.py
--- a/paddlespeech/audiotools/core/ffmpeg.py
+++ b/paddlespeech/audiotools/core/ffmpeg.py
--- a/paddlespeech/audiotools/core/loudness.py
+++ b/paddlespeech/audiotools/core/loudness.py
--- a/paddlespeech/audiotools/core/util.py
+++ b/paddlespeech/audiotools/core/util.py
@ -734,8 +734,8 @@ def default_collate(batch,
        if not all(len(elem) == elem_size for elem in it):
            raise RuntimeError(
                "each element in list of batch should be of equal size")
-        transposed = list(zip(
+        transposed = list(
-            *batch))  # It may be accessed twice, so we use a list.
+            zip(*batch))  # It may be accessed twice, so we use a list.
        if isinstance(elem, tuple):
            return [
--- a/paddlespeech/audiotools/data/init.py
+++ b/paddlespeech/audiotools/data/init.py
--- a/paddlespeech/audiotools/data/datasets.py
+++ b/paddlespeech/audiotools/data/datasets.py
@ -202,9 +202,9 @@ class AudioDataset:
    Examples
    --------
-    >>> from audio.audiotools.data.datasets import AudioLoader
+    >>> from paddlespeech.audiotools.data.datasets import AudioLoader
-    >>> from audio.audiotools.data.datasets import AudioDataset
+    >>> from paddlespeech.audiotools.data.datasets import AudioDataset
-    >>> from audio.audiotools import transforms as tfm
+    >>> from paddlespeech.audiotools import transforms as tfm
    >>> import numpy as np
    >>>
    >>> loaders = [
@ -237,9 +237,9 @@ class AudioDataset:
    Below is an example of how one could load MUSDB multitrack data:
-    >>> from audio import audiotools as at
+    >>> from paddlespeech import audiotools as at
    >>> from pathlib import Path
-    >>> from audio.audiotools import transforms as tfm
+    >>> from paddlespeech.audiotools import transforms as tfm
    >>> import numpy as np
    >>> import torch
    >>>
@ -296,9 +296,9 @@ class AudioDataset:
    Similarly, here's example code for loading Slakh data:
-    >>> from audio import audiotools as at
+    >>> from paddlespeech import audiotools as at
    >>> from pathlib import Path
-    >>> from audio.audiotools import transforms as tfm
+    >>> from paddlespeech.audiotools import transforms as tfm
    >>> import numpy as np
    >>> import torch
    >>> import glob
--- a/paddlespeech/audiotools/data/preprocess.py
+++ b/paddlespeech/audiotools/data/preprocess.py
@ -37,7 +37,7 @@ def create_csv(audio_files: list,
    You can produce a CSV file from a directory of audio files via:
-    >>> from audio import audiotools
+    >>> from paddlespeech import audiotools
    >>> directory = ...
    >>> audio_files = audiotools.util.find_audio(directory)
    >>> output_path = "train.csv"
--- a/paddlespeech/audiotools/data/transforms.py
+++ b/paddlespeech/audiotools/data/transforms.py
--- a/paddlespeech/audiotools/metrics/init.py
+++ b/paddlespeech/audiotools/metrics/init.py
--- a/paddlespeech/audiotools/metrics/quality.py
+++ b/paddlespeech/audiotools/metrics/quality.py
--- a/paddlespeech/audiotools/ml/init.py
+++ b/paddlespeech/audiotools/ml/init.py
--- a/paddlespeech/audiotools/ml/accelerator.py
+++ b/paddlespeech/audiotools/ml/accelerator.py
--- a/paddlespeech/audiotools/ml/basemodel.py
+++ b/paddlespeech/audiotools/ml/basemodel.py
--- a/paddlespeech/audiotools/ml/decorators.py
+++ b/paddlespeech/audiotools/ml/decorators.py
--- a/paddlespeech/audiotools/post.py
+++ b/paddlespeech/audiotools/post.py
@ -6,7 +6,7 @@ import typing
 import paddle
-from audio.audiotools.core import AudioSignal
+from paddlespeech.audiotools.core import AudioSignal
 def audio_table(
--- a/paddlespeech/audiotools/requirements.txt
+++ b/paddlespeech/audiotools/requirements.txt
@ -2,5 +2,4 @@ ffmpeg-python
 ffmpy
 flatten_dict
 pyloudnorm
-pytest
+rich
 rich
--- a/paddlespeech/s2t/modules/conformer_convolution.py
+++ b/paddlespeech/s2t/modules/conformer_convolution.py
@ -19,7 +19,7 @@ from typing import Tuple
 import paddle
 from paddle import nn
 from paddle.nn import initializer as I
-from typeguard import check_argument_types
+from typeguard import typechecked
 from paddlespeech.s2t.modules.align import BatchNorm1D
 from paddlespeech.s2t.modules.align import Conv1D
@ -34,6 +34,7 @@ __all__ = ['ConvolutionModule']
 class ConvolutionModule(nn.Layer):
    """ConvolutionModule in Conformer model."""
    @typechecked
    def __init__(self,
                 channels: int,
                 kernel_size: int=15,
@ -52,7 +53,6 @@ class ConvolutionModule(nn.Layer):
            causal (bool): Whether use causal convolution or not
            bias (bool): Whether Conv with bias or not
        """
        assert check_argument_types()
        super().__init__()
        self.bias = bias
        self.channels = channels
--- a/paddlespeech/s2t/modules/ctc.py
+++ b/paddlespeech/s2t/modules/ctc.py
@ -17,7 +17,7 @@ from typing import Union
 import paddle
 from paddle import nn
 from paddle.nn import functional as F
-from typeguard import check_argument_types
+from typeguard import typechecked
 from paddlespeech.s2t.modules.align import Linear
 from paddlespeech.s2t.modules.loss import CTCLoss
@ -48,6 +48,7 @@ __all__ = ['CTCDecoder']
 class CTCDecoderBase(nn.Layer):
    @typechecked
    def __init__(self,
                 odim,
                 enc_n_units,
@ -66,7 +67,6 @@ class CTCDecoderBase(nn.Layer):
            batch_average (bool): do batch dim wise average.
            grad_norm_type (str): Default, None. one of 'instance', 'batch', 'frame', None.
        """
        assert check_argument_types()
        super().__init__()
        self.blank_id = blank_id
--- a/paddlespeech/s2t/modules/decoder.py
+++ b/paddlespeech/s2t/modules/decoder.py
@ -21,7 +21,7 @@ from typing import Tuple
 import paddle
 from paddle import nn
-from typeguard import check_argument_types
+from typeguard import typechecked
 from paddlespeech.s2t.decoders.scorers.scorer_interface import BatchScorerInterface
 from paddlespeech.s2t.modules.align import Embedding
@ -61,6 +61,7 @@ class TransformerDecoder(BatchScorerInterface, nn.Layer):
            False: x -> x + att(x)
    """
    @typechecked
    def __init__(self,
                 vocab_size: int,
                 encoder_output_size: int,
@ -77,8 +78,6 @@ class TransformerDecoder(BatchScorerInterface, nn.Layer):
                 concat_after: bool=False,
                 max_len: int=5000):
        assert check_argument_types()
        nn.Layer.__init__(self)
        self.selfattention_layer_type = 'selfattn'
        attention_dim = encoder_output_size
@ -276,6 +275,7 @@ class BiTransformerDecoder(BatchScorerInterface, nn.Layer):
            False: x -> x + att(x)
    """
    @typechecked
    def __init__(self,
                 vocab_size: int,
                 encoder_output_size: int,
@ -293,8 +293,6 @@ class BiTransformerDecoder(BatchScorerInterface, nn.Layer):
                 concat_after: bool=False,
                 max_len: int=5000):
        assert check_argument_types()
        nn.Layer.__init__(self)
        self.left_decoder = TransformerDecoder(
            vocab_size, encoder_output_size, attention_heads, linear_units,
--- a/paddlespeech/s2t/modules/encoder.py
+++ b/paddlespeech/s2t/modules/encoder.py
@ -21,7 +21,7 @@ from typing import Union
 import paddle
 from paddle import nn
-from typeguard import check_argument_types
+from typeguard import typechecked
 from paddlespeech.s2t.modules.activation import get_activation
 from paddlespeech.s2t.modules.align import LayerNorm
@ -58,6 +58,7 @@ __all__ = [
 class BaseEncoder(nn.Layer):
    @typechecked
    def __init__(self,
                 input_size: int,
                 output_size: int=256,
@ -73,7 +74,7 @@ class BaseEncoder(nn.Layer):
                 concat_after: bool=False,
                 static_chunk_size: int=0,
                 use_dynamic_chunk: bool=False,
-                 global_cmvn: paddle.nn.Layer=None,
+                 global_cmvn: Optional[nn.Layer]=None,
                 use_dynamic_left_chunk: bool=False,
                 max_len: int=5000):
        """
@ -108,7 +109,6 @@ class BaseEncoder(nn.Layer):
            use_dynamic_left_chunk (bool): whether use dynamic left chunk in
                dynamic chunk training
        """
        assert check_argument_types()
        super().__init__()
        self._output_size = output_size
@ -349,6 +349,7 @@ class BaseEncoder(nn.Layer):
 class TransformerEncoder(BaseEncoder):
    """Transformer encoder module."""
    @typechecked
    def __init__(
            self,
            input_size: int,
@ -365,12 +366,11 @@ class TransformerEncoder(BaseEncoder):
            concat_after: bool=False,
            static_chunk_size: int=0,
            use_dynamic_chunk: bool=False,
-            global_cmvn: nn.Layer=None,
+            global_cmvn: Optional[nn.Layer]=None,
            use_dynamic_left_chunk: bool=False, ):
        """ Construct TransformerEncoder
        See Encoder for the meaning of each parameter.
        """
        assert check_argument_types()
        super().__init__(input_size, output_size, attention_heads, linear_units,
                         num_blocks, dropout_rate, positional_dropout_rate,
                         attention_dropout_rate, input_layer,
@ -424,6 +424,7 @@ class TransformerEncoder(BaseEncoder):
 class ConformerEncoder(BaseEncoder):
    """Conformer encoder module."""
    @typechecked
    def __init__(self,
                 input_size: int,
                 output_size: int=256,
@ -439,7 +440,7 @@ class ConformerEncoder(BaseEncoder):
                 concat_after: bool=False,
                 static_chunk_size: int=0,
                 use_dynamic_chunk: bool=False,
-                 global_cmvn: nn.Layer=None,
+                 global_cmvn: Optional[nn.Layer]=None,
                 use_dynamic_left_chunk: bool=False,
                 positionwise_conv_kernel_size: int=1,
                 macaron_style: bool=True,
@ -466,8 +467,6 @@ class ConformerEncoder(BaseEncoder):
            causal (bool): whether to use causal convolution or not.
            cnn_module_norm (str): cnn conv norm type, Optional['batch_norm','layer_norm']
        """
        assert check_argument_types()
        super().__init__(input_size, output_size, attention_heads, linear_units,
                         num_blocks, dropout_rate, positional_dropout_rate,
                         attention_dropout_rate, input_layer,
@ -519,6 +518,7 @@ class ConformerEncoder(BaseEncoder):
 class SqueezeformerEncoder(nn.Layer):
    @typechecked
    def __init__(self,
                 input_size: int,
                 encoder_dim: int=256,
@ -541,7 +541,7 @@ class SqueezeformerEncoder(nn.Layer):
                 adaptive_scale: bool=True,
                 activation_type: str="swish",
                 init_weights: bool=True,
-                 global_cmvn: paddle.nn.Layer=None,
+                 global_cmvn: Optional[nn.Layer]=None,
                 normalize_before: bool=False,
                 use_dynamic_chunk: bool=False,
                 concat_after: bool=False,
@ -572,7 +572,6 @@ class SqueezeformerEncoder(nn.Layer):
            init_weights (bool): Whether to initialize weights.
            causal (bool): whether to use causal convolution or not.
        """
        assert check_argument_types()
        super().__init__()
        self.global_cmvn = global_cmvn
        self.reduce_idx: Optional[Union[int, List[int]]] = [reduce_idx] \
--- a/paddlespeech/s2t/training/scheduler.py
+++ b/paddlespeech/s2t/training/scheduler.py
@ -19,7 +19,7 @@ from typing import Union
 import paddle
 from paddle.optimizer.lr import LRScheduler
-from typeguard import check_argument_types
+from typeguard import typechecked
 from paddlespeech.s2t.utils.dynamic_import import dynamic_import
 from paddlespeech.s2t.utils.dynamic_import import instance_class
@ -57,13 +57,13 @@ class WarmupLR(LRScheduler):
    Note that the maximum lr equals to optimizer.lr in this scheduler.
    """
    @typechecked
    def __init__(self,
                 warmup_steps: Union[int, float]=25000,
                 learning_rate=1.0,
                 last_epoch=-1,
                 verbose=False,
                 **kwargs):
        assert check_argument_types()
        self.warmup_steps = warmup_steps
        super().__init__(learning_rate, last_epoch, verbose)
--- a/paddlespeech/t2s/models/diffsinger/diffsinger.py
+++ b/paddlespeech/t2s/models/diffsinger/diffsinger.py
@ -20,7 +20,7 @@ from typing import Tuple
 import numpy as np
 import paddle
 from paddle import nn
-from typeguard import check_argument_types
+from typeguard import typechecked
 from paddlespeech.t2s.models.diffsinger.fastspeech2midi import FastSpeech2MIDI
 from paddlespeech.t2s.modules.diffnet import DiffNet
@ -40,6 +40,7 @@ class DiffSinger(nn.Layer):
    """
    @typechecked
    def __init__(
            self,
            # min and max spec for stretching before diffusion
@ -157,7 +158,6 @@ class DiffSinger(nn.Layer):
            denoiser_params (Dict[str, Any]): Parameter dict for dinoiser module.
            diffusion_params (Dict[str, Any]): Parameter dict for diffusion module.
        """
        assert check_argument_types()
        super().__init__()
        self.fs2 = FastSpeech2MIDI(
            idim=idim,
@ -336,6 +336,7 @@ class DiffSingerInference(nn.Layer):
 class DiffusionLoss(nn.Layer):
    """Loss function module for Diffusion module on DiffSinger."""
    @typechecked
    def __init__(self, use_masking: bool=True,
                 use_weighted_masking: bool=False):
        """Initialize feed-forward Transformer loss module.
@ -345,7 +346,6 @@ class DiffusionLoss(nn.Layer):
            use_weighted_masking (bool): 
                Whether to weighted masking in loss calculation.
        """
        assert check_argument_types()
        super().__init__()
        assert (use_masking != use_weighted_masking) or not use_masking
--- a/paddlespeech/t2s/models/diffsinger/fastspeech2midi.py
+++ b/paddlespeech/t2s/models/diffsinger/fastspeech2midi.py
@ -19,7 +19,7 @@ from typing import Tuple
 import paddle
 from paddle import nn
-from typeguard import check_argument_types
+from typeguard import typechecked
 from paddlespeech.t2s.models.fastspeech2 import FastSpeech2
 from paddlespeech.t2s.models.fastspeech2 import FastSpeech2Loss
@ -33,6 +33,7 @@ class FastSpeech2MIDI(FastSpeech2):
    """The Fastspeech2 module of DiffSinger.
    """
    @typechecked
    def __init__(
            self,
            # fastspeech2 network structure related
@ -57,7 +58,6 @@ class FastSpeech2MIDI(FastSpeech2):
                is_slur_ids will be provided as the input
        """
        assert check_argument_types()
        super().__init__(idim=idim, odim=odim, **fastspeech2_params)
        self.use_energy_pred = use_energy_pred
        self.use_postnet = use_postnet
@ -495,6 +495,7 @@ class FastSpeech2MIDI(FastSpeech2):
 class FastSpeech2MIDILoss(FastSpeech2Loss):
    """Loss function module for DiffSinger."""
    @typechecked
    def __init__(self, use_masking: bool=True,
                 use_weighted_masking: bool=False):
        """Initialize feed-forward Transformer loss module.
@ -504,7 +505,6 @@ class FastSpeech2MIDILoss(FastSpeech2Loss):
            use_weighted_masking (bool): 
                Whether to weighted masking in loss calculation.
        """
        assert check_argument_types()
        super().__init__(use_masking, use_weighted_masking)
    def forward(
--- a/paddlespeech/t2s/models/fastspeech2/fastspeech2.py
+++ b/paddlespeech/t2s/models/fastspeech2/fastspeech2.py
@ -15,6 +15,7 @@
 """Fastspeech2 related modules for paddle"""
 from typing import Dict
 from typing import List
 from typing import Optional
 from typing import Sequence
 from typing import Tuple
 from typing import Union
@ -23,7 +24,7 @@ import numpy as np
 import paddle
 import paddle.nn.functional as F
 from paddle import nn
-from typeguard import check_argument_types
+from typeguard import typechecked
 from paddlespeech.t2s.modules.adversarial_loss.gradient_reversal import GradientReversalLayer
 from paddlespeech.t2s.modules.adversarial_loss.speaker_classifier import SpeakerClassifier
@ -60,6 +61,7 @@ class FastSpeech2(nn.Layer):
    """
    @typechecked
    def __init__(
            self,
            # network structure related
@ -131,12 +133,12 @@ class FastSpeech2(nn.Layer):
            pitch_embed_dropout: float=0.5,
            stop_gradient_from_pitch_predictor: bool=False,
            # spk emb
-            spk_num: int=None,
+            spk_num: Optional[int]=None,
-            spk_embed_dim: int=None,
+            spk_embed_dim: Optional[int]=None,
            spk_embed_integration_type: str="add",
            # tone emb
-            tone_num: int=None,
+            tone_num: Optional[int]=None,
-            tone_embed_dim: int=None,
+            tone_embed_dim: Optional[int]=None,
            tone_embed_integration_type: str="add",
            # training related
            init_type: str="xavier_uniform",
@ -282,7 +284,6 @@ class FastSpeech2(nn.Layer):
                The hidden layer dim of speaker classifier
        """
        assert check_argument_types()
        super().__init__()
        # store hyperparameters
@ -1070,6 +1071,7 @@ class StyleFastSpeech2Inference(FastSpeech2Inference):
 class FastSpeech2Loss(nn.Layer):
    """Loss function module for FastSpeech2."""
    @typechecked
    def __init__(self, use_masking: bool=True,
                 use_weighted_masking: bool=False):
        """Initialize feed-forward Transformer loss module.
@ -1079,7 +1081,6 @@ class FastSpeech2Loss(nn.Layer):
            use_weighted_masking (bool): 
                Whether to weighted masking in loss calculation.
        """
        assert check_argument_types()
        super().__init__()
        assert (use_masking != use_weighted_masking) or not use_masking
--- a/paddlespeech/t2s/models/jets/generator.py
+++ b/paddlespeech/t2s/models/jets/generator.py
@ -28,7 +28,6 @@ from typing import Tuple
 import numpy as np
 import paddle
 from paddle import nn
 from typeguard import check_argument_types
 from paddlespeech.t2s.models.hifigan import HiFiGANGenerator
 from paddlespeech.t2s.models.jets.alignments import AlignmentModule
--- a/paddlespeech/t2s/models/jets/jets.py
+++ b/paddlespeech/t2s/models/jets/jets.py
@ -24,7 +24,7 @@ from typing import Optional
 import paddle
 from paddle import nn
-from typeguard import check_argument_types
+from typeguard import typechecked
 from paddlespeech.t2s.models.hifigan import HiFiGANMultiPeriodDiscriminator
 from paddlespeech.t2s.models.hifigan import HiFiGANMultiScaleDiscriminator
@ -64,6 +64,7 @@ class JETS(nn.Layer):
        Text-to-Speech`: https://arxiv.org/abs/2203.16852v1
    """
    @typechecked
    def __init__(
            self,
            # generator related
@ -225,7 +226,6 @@ class JETS(nn.Layer):
            cache_generator_outputs (bool):
                Whether to cache generator outputs.
        """
        assert check_argument_types()
        super().__init__()
        # define modules
@ -279,8 +279,7 @@ class JETS(nn.Layer):
            lids: Optional[paddle.Tensor]=None,
            forward_generator: bool=True,
            use_alignment_module: bool=False,
-            **kwargs,
+            **kwargs, ) -> Dict[str, Any]:
    ) -> Dict[str, Any]:
        """Perform generator forward.
        Args:
            text (Tensor):
--- a/paddlespeech/t2s/models/tacotron2/tacotron2.py
+++ b/paddlespeech/t2s/models/tacotron2/tacotron2.py
@ -21,7 +21,7 @@ from typing import Tuple
 import paddle
 import paddle.nn.functional as F
 from paddle import nn
-from typeguard import check_argument_types
+from typeguard import typechecked
 from paddlespeech.t2s.modules.nets_utils import initialize
 from paddlespeech.t2s.modules.nets_utils import make_pad_mask
@ -44,6 +44,7 @@ class Tacotron2(nn.Layer):
    """
    @typechecked
    def __init__(
            self,
            # network structure related
@ -67,7 +68,7 @@ class Tacotron2(nn.Layer):
            postnet_layers: int=5,
            postnet_chans: int=512,
            postnet_filts: int=5,
-            output_activation: str=None,
+            output_activation: Optional[str]=None,
            use_batch_norm: bool=True,
            use_concate: bool=True,
            use_residual: bool=False,
@ -145,7 +146,6 @@ class Tacotron2(nn.Layer):
            zoneout_rate (float): 
                Zoneout rate.
        """
        assert check_argument_types()
        super().__init__()
        # store hyperparameters
--- a/paddlespeech/t2s/models/transformer_tts/transformer_tts.py
+++ b/paddlespeech/t2s/models/transformer_tts/transformer_tts.py
@ -13,7 +13,9 @@
 # limitations under the License.
 # Modified from espnet(https://github.com/espnet/espnet)
 """Fastspeech2 related modules for paddle"""
 from optparse import Option
 from typing import Dict
 from typing import Optional
 from typing import Sequence
 from typing import Tuple
@ -21,7 +23,7 @@ import numpy
 import paddle
 import paddle.nn.functional as F
 from paddle import nn
-from typeguard import check_argument_types
+from typeguard import typechecked
 from paddlespeech.t2s.modules.nets_utils import initialize
 from paddlespeech.t2s.modules.nets_utils import make_non_pad_mask
@ -169,6 +171,7 @@ class TransformerTTS(nn.Layer):
            Number of layers to apply guided attention loss.
    """
    @typechecked
    def __init__(
            self,
            # network structure related
@ -198,7 +201,7 @@ class TransformerTTS(nn.Layer):
            encoder_concat_after: bool=False,
            decoder_concat_after: bool=False,
            reduction_factor: int=1,
-            spk_embed_dim: int=None,
+            spk_embed_dim: Optional[int]=None,
            spk_embed_integration_type: str="add",
            use_gst: bool=False,
            gst_tokens: int=10,
@ -227,7 +230,7 @@ class TransformerTTS(nn.Layer):
            num_heads_applied_guided_attn: int=2,
            num_layers_applied_guided_attn: int=2, ):
        """Initialize Transformer module."""
-        assert check_argument_types()
+
        super().__init__()
        # store hyperparameters
--- a/paddlespeech/t2s/models/vits/vits.py
+++ b/paddlespeech/t2s/models/vits/vits.py
@ -20,7 +20,7 @@ from typing import Optional
 import paddle
 from paddle import nn
-from typeguard import check_argument_types
+from typeguard import typechecked
 from paddlespeech.t2s.models.hifigan import HiFiGANMultiPeriodDiscriminator
 from paddlespeech.t2s.models.hifigan import HiFiGANMultiScaleDiscriminator
@ -60,6 +60,7 @@ class VITS(nn.Layer):
        Text-to-Speech`: https://arxiv.org/abs/2006.04558
    """
    @typechecked
    def __init__(
            self,
            # generator related
@ -181,7 +182,6 @@ class VITS(nn.Layer):
            cache_generator_outputs (bool):
                Whether to cache generator outputs.
        """
        assert check_argument_types()
        super().__init__()
        # define modules
@ -504,8 +504,9 @@ class VITS(nn.Layer):
    def reset_parameters(self):
        def _reset_parameters(module):
-            if isinstance(module,
+            if isinstance(
-                        (nn.Conv1D, nn.Conv1DTranspose, nn.Conv2D, nn.Conv2DTranspose)):
+                    module,
                (nn.Conv1D, nn.Conv1DTranspose, nn.Conv2D, nn.Conv2DTranspose)):
                kaiming_uniform_(module.weight, a=math.sqrt(5))
                if module.bias is not None:
                    fan_in, _ = _calculate_fan_in_and_fan_out(module.weight)
@ -513,8 +514,9 @@ class VITS(nn.Layer):
                        bound = 1 / math.sqrt(fan_in)
                        uniform_(module.bias, -bound, bound)
-            if isinstance(module,
+            if isinstance(
-                          (nn.BatchNorm1D, nn.BatchNorm2D, nn.GroupNorm, nn.LayerNorm)):
+                    module,
                (nn.BatchNorm1D, nn.BatchNorm2D, nn.GroupNorm, nn.LayerNorm)):
                ones_(module.weight)
                zeros_(module.bias)
@ -533,13 +535,13 @@ class VITS(nn.Layer):
        self.apply(_reset_parameters)
 class VITSInference(nn.Layer):
    def __init__(self, model):
        super().__init__()
        self.acoustic_model = model
    def forward(self, text, sids=None):
-        out = self.acoustic_model.inference(
+        out = self.acoustic_model.inference(text, sids=sids)
            text, sids=sids)
        wav = out['wav']
        return wav
--- a/paddlespeech/t2s/modules/adversarial_loss/speaker_classifier.py
+++ b/paddlespeech/t2s/modules/adversarial_loss/speaker_classifier.py
@ -14,16 +14,16 @@
 # Modified from Cross-Lingual-Voice-Cloning(https://github.com/deterministic-algorithms-lab/Cross-Lingual-Voice-Cloning)
 import paddle
 from paddle import nn
-from typeguard import check_argument_types
+from typeguard import typechecked
 class SpeakerClassifier(nn.Layer):
    @typechecked
    def __init__(
            self,
            idim: int,
            hidden_sc_dim: int,
            spk_num: int, ):
        assert check_argument_types()
        super().__init__()
        # store hyperparameters
        self.idim = idim
--- a/paddlespeech/t2s/modules/losses.py
+++ b/paddlespeech/t2s/modules/losses.py
@ -21,7 +21,7 @@ from paddle import nn
 from paddle.nn import functional as F
 from scipy import signal
 from scipy.stats import betabinom
-from typeguard import check_argument_types
+from typeguard import typechecked
 from paddlespeech.t2s.modules.nets_utils import make_non_pad_mask
 from paddlespeech.t2s.modules.predictor.duration_predictor import (
@ -1137,6 +1137,7 @@ class MLMLoss(nn.Layer):
 class VarianceLoss(nn.Layer):
    @typechecked
    def __init__(self, use_masking: bool=True,
                 use_weighted_masking: bool=False):
        """Initialize JETS variance loss module.
@ -1147,7 +1148,6 @@ class VarianceLoss(nn.Layer):
                calculation.
        """
        assert check_argument_types()
        super().__init__()
        assert (use_masking != use_weighted_masking) or not use_masking
--- a/paddlespeech/t2s/modules/nets_utils.py
+++ b/paddlespeech/t2s/modules/nets_utils.py
@ -18,7 +18,7 @@ from typing import Tuple
 import numpy as np
 import paddle
 from paddle import nn
-from typeguard import check_argument_types
+from typeguard import typechecked
 from paddlespeech.utils.initialize import _calculate_fan_in_and_fan_out
 from paddlespeech.utils.initialize import kaiming_uniform_
@ -301,6 +301,7 @@ def make_non_pad_mask(lengths, xs=None, length_dim=-1):
    return paddle.logical_not(make_pad_mask(lengths, xs, length_dim))
@typechecked
 def initialize(model: nn.Layer, init: str):
    """Initialize weights of a neural network module.
@ -314,8 +315,6 @@ def initialize(model: nn.Layer, init: str):
        init (str):
            Method of initialization.
    """
    assert check_argument_types()
    if init == "xavier_uniform":
        nn.initializer.set_global_initializer(nn.initializer.XavierUniform(),
                                              nn.initializer.Constant())
--- a/paddlespeech/t2s/modules/predictor/variance_predictor.py
+++ b/paddlespeech/t2s/modules/predictor/variance_predictor.py
@ -15,7 +15,7 @@
 """Variance predictor related modules."""
 import paddle
 from paddle import nn
-from typeguard import check_argument_types
+from typeguard import typechecked
 from paddlespeech.t2s.modules.layer_norm import LayerNorm
 from paddlespeech.t2s.modules.masked_fill import masked_fill
@ -32,6 +32,7 @@ class VariancePredictor(nn.Layer):
    """
    @typechecked
    def __init__(
            self,
            idim: int,
@ -54,7 +55,6 @@ class VariancePredictor(nn.Layer):
            dropout_rate (float, optional): 
                Dropout rate.
        """
        assert check_argument_types()
        super().__init__()
        self.conv = nn.LayerList()
        for idx in range(n_layers):
@ -96,7 +96,7 @@ class VariancePredictor(nn.Layer):
            xs = f(xs)
        # (B, Tmax, 1)
        xs = self.linear(xs.transpose([0, 2, 1]))
-    
+
        if x_masks is not None:
            xs = masked_fill(xs, x_masks, 0.0)
        return xs
--- a/paddlespeech/t2s/modules/style_encoder.py
+++ b/paddlespeech/t2s/modules/style_encoder.py
@ -17,7 +17,7 @@ from typing import Sequence
 import paddle
 from paddle import nn
-from typeguard import check_argument_types
+from typeguard import typechecked
 from paddlespeech.t2s.modules.transformer.attention import MultiHeadedAttention as BaseMultiHeadedAttention
@ -58,6 +58,7 @@ class StyleEncoder(nn.Layer):
    """
    @typechecked
    def __init__(
            self,
            idim: int=80,
@ -71,7 +72,6 @@ class StyleEncoder(nn.Layer):
            gru_layers: int=1,
            gru_units: int=128, ):
        """Initilize global style encoder module."""
        assert check_argument_types()
        super().__init__()
        self.ref_enc = ReferenceEncoder(
@ -132,6 +132,7 @@ class ReferenceEncoder(nn.Layer):
    """
    @typechecked
    def __init__(
            self,
            idim=80,
@ -142,7 +143,6 @@ class ReferenceEncoder(nn.Layer):
            gru_layers: int=1,
            gru_units: int=128, ):
        """Initilize reference encoder module."""
        assert check_argument_types()
        super().__init__()
        # check hyperparameters are valid
@ -232,6 +232,7 @@ class StyleTokenLayer(nn.Layer):
    """
    @typechecked
    def __init__(
            self,
            ref_embed_dim: int=128,
@ -240,7 +241,6 @@ class StyleTokenLayer(nn.Layer):
            gst_heads: int=4,
            dropout_rate: float=0.0, ):
        """Initilize style token layer module."""
        assert check_argument_types()
        super().__init__()
        gst_embs = paddle.randn(shape=[gst_tokens, gst_token_dim // gst_heads])
--- a/setup.py
+++ b/setup.py
@ -31,6 +31,26 @@ HERE = Path(os.path.abspath(os.path.dirname(__file__)))
 VERSION = '0.0.0'
 COMMITID = 'none'
 def determine_opencc_version():
    # get gcc version
    gcc_version = None
    try:
        output = sp.check_output(
            ['gcc', '--version'], stderr=sp.STDOUT, text=True)
        for line in output.splitlines():
            if "gcc" in line:
                gcc_version = line.split()[-1]
    except Exception as e:
        gcc_version = None
    # determine opencc version
    if gcc_version:
        if int(gcc_version.split(".")[0]) <= 9:
            return "opencc==1.1.6"  # GCC<=9 need opencc==1.1.6
    return "opencc"  # default
 base = [
    "braceexpand",
    "editdistance",
@ -48,7 +68,7 @@ base = [
    "matplotlib",
    "nara_wpe",
    "onnxruntime>=1.11.0",
-    "opencc==1.1.6",
+    determine_opencc_version(),  # opencc or opencc==1.1.6
    "opencc-python-reimplemented",
    "pandas",
    "paddleaudio>=1.1.0",
@ -69,8 +89,8 @@ base = [
    "soundfile",
    "textgrid",
    "timer",
-    "ToJyutping==0.2.1",
+    "ToJyutping",
-    "typeguard==2.13.3",
+    "typeguard",
    "webrtcvad",
    "yacs~=0.1.8",
    "zhon",
@ -318,9 +338,9 @@ setup_info = dict(
        'License :: OSI Approved :: Apache Software License',
        'Programming Language :: Python',
        'Programming Language :: Python :: 3',
        'Programming Language :: Python :: 3.7',
        'Programming Language :: Python :: 3.8',
        'Programming Language :: Python :: 3.9',
        'Programming Language :: Python :: 3.10',
    ],
    entry_points={
        'console_scripts': [
--- a/audio/tests/audiotools/core/test_audio_signal.py
+++ b/audio/tests/audiotools/core/test_audio_signal.py
@ -12,9 +12,9 @@ import paddle
 import pytest
 import rich
-from audio import audiotools
+from paddlespeech import audiotools
-from audio.audiotools import AudioSignal
+from paddlespeech.audiotools import AudioSignal
-from audio.audiotools import util
+from paddlespeech.audiotools import util
 def test_io():
--- a/audio/tests/audiotools/core/test_bands.py
+++ b/audio/tests/audiotools/core/test_bands.py
@ -8,9 +8,9 @@ import unittest
 import paddle
-from audio.audiotools.core import pure_tone
+from paddlespeech.audiotools.core import pure_tone
-from audio.audiotools.core import split_bands
+from paddlespeech.audiotools.core import split_bands
-from audio.audiotools.core import SplitBands
+from paddlespeech.audiotools.core import SplitBands
 def delta(a, b, ref, fraction=0.9):
--- a/audio/tests/audiotools/core/test_display.py
+++ b/audio/tests/audiotools/core/test_display.py
@ -8,7 +8,7 @@ from pathlib import Path
 import numpy as np
 from visualdl import LogWriter
-from audio.audiotools import AudioSignal
+from paddlespeech.audiotools import AudioSignal
 def test_specshow():
--- a/audio/tests/audiotools/core/test_dsp.py
+++ b/audio/tests/audiotools/core/test_dsp.py
@ -8,8 +8,8 @@ import numpy as np
 import paddle
 import pytest
-from audio.audiotools import AudioSignal
+from paddlespeech.audiotools import AudioSignal
-from audio.audiotools.core.util import sample_from_dist
+from paddlespeech.audiotools.core.util import sample_from_dist
@pytest.mark.parametrize("window_duration", [0.1, 0.25, 0.5, 1.0])
--- a/audio/tests/audiotools/core/test_effects.py
+++ b/audio/tests/audiotools/core/test_effects.py
@ -8,7 +8,7 @@ import numpy as np
 import paddle
 import pytest
-from audio.audiotools import AudioSignal
+from paddlespeech.audiotools import AudioSignal
 def test_normalize():
--- a/audio/tests/audiotools/core/test_fftconv.py
+++ b/audio/tests/audiotools/core/test_fftconv.py
@ -9,8 +9,8 @@ import unittest
 import paddle
 import paddle.nn.functional as F
-from audio.audiotools.core import fft_conv1d
+from paddlespeech.audiotools.core import fft_conv1d
-from audio.audiotools.core import FFTConv1D
+from paddlespeech.audiotools.core import FFTConv1D
 TOLERANCE = 1e-4  # as relative delta in percentage
--- a/audio/tests/audiotools/core/test_grad.py
+++ b/audio/tests/audiotools/core/test_grad.py
@ -9,7 +9,7 @@ import numpy as np
 import paddle
 import pytest
-from audio.audiotools import AudioSignal
+from paddlespeech.audiotools import AudioSignal
 def test_audio_grad():
--- a/audio/tests/audiotools/core/test_highpass.py
+++ b/audio/tests/audiotools/core/test_highpass.py
@ -9,8 +9,8 @@ import unittest
 import paddle
-from audio.audiotools.core import highpass_filter
+from paddlespeech.audiotools.core import highpass_filter
-from audio.audiotools.core import highpass_filters
+from paddlespeech.audiotools.core import highpass_filters
 def pure_tone(freq: float, sr: float=128, dur: float=4, device=None):
--- a/audio/tests/audiotools/core/test_loudness.py
+++ b/audio/tests/audiotools/core/test_loudness.py
@ -8,10 +8,10 @@ import numpy as np
 import pyloudnorm
 import soundfile as sf
-from audio.audiotools import AudioSignal
+from paddlespeech.audiotools import AudioSignal
-from audio.audiotools import datasets
+from paddlespeech.audiotools import datasets
-from audio.audiotools import Meter
+from paddlespeech.audiotools import Meter
-from audio.audiotools import transforms
+from paddlespeech.audiotools import transforms
 ATOL = 1e-1
--- a/audio/tests/audiotools/core/test_lowpass.py
+++ b/audio/tests/audiotools/core/test_lowpass.py
@ -10,10 +10,10 @@ import unittest
 import numpy as np
 import paddle
-from audio.audiotools.core import lowpass_filter
+from paddlespeech.audiotools.core import lowpass_filter
-from audio.audiotools.core import LowPassFilter
+from paddlespeech.audiotools.core import LowPassFilter
-from audio.audiotools.core import LowPassFilters
+from paddlespeech.audiotools.core import LowPassFilters
-from audio.audiotools.core import resample_frac
+from paddlespeech.audiotools.core import resample_frac
 def pure_tone(freq: float, sr: float=128, dur: float=4, device=None):
--- a/audio/tests/audiotools/core/test_util.py
+++ b/audio/tests/audiotools/core/test_util.py
@ -11,8 +11,8 @@ import numpy as np
 import paddle
 import pytest
-from audio.audiotools import util
+from paddlespeech.audiotools import util
-from audio.audiotools.core.audio_signal import AudioSignal
+from paddlespeech.audiotools.core.audio_signal import AudioSignal
 from paddlespeech.vector.training.seeding import seed_everything
--- a/audio/tests/audiotools/data/test_datasets.py
+++ b/audio/tests/audiotools/data/test_datasets.py
@ -10,8 +10,8 @@ import numpy as np
 import paddle
 import pytest
-from audio import audiotools
+from paddlespeech import audiotools
-from audio.audiotools.data import transforms as tfm
+from paddlespeech.audiotools.data import transforms as tfm
 def test_align_lists():
--- a/audio/tests/audiotools/data/test_preprocess.py
+++ b/audio/tests/audiotools/data/test_preprocess.py
@ -8,9 +8,9 @@ from pathlib import Path
 import paddle
-from audio.audiotools.core.util import find_audio
+from paddlespeech.audiotools.core.util import find_audio
-from audio.audiotools.core.util import read_sources
+from paddlespeech.audiotools.core.util import read_sources
-from audio.audiotools.data import preprocess
+from paddlespeech.audiotools.data import preprocess
 def test_create_csv():
--- a/audio/tests/audiotools/data/test_transforms.py
+++ b/audio/tests/audiotools/data/test_transforms.py
@ -11,11 +11,11 @@ import numpy as np
 import paddle
 import pytest
-from audio import audiotools
+from paddlespeech import audiotools
-from audio.audiotools import AudioSignal
+from paddlespeech.audiotools import AudioSignal
-from audio.audiotools import util
+from paddlespeech.audiotools import util
-from audio.audiotools.data import transforms as tfm
+from paddlespeech.audiotools.data import transforms as tfm
-from audio.audiotools.data.datasets import AudioDataset
+from paddlespeech.audiotools.data.datasets import AudioDataset
 from paddlespeech.vector.training.seeding import seed_everything
 non_deterministic_transforms = ["TimeNoise", "FrequencyNoise"]
--- a/audio/tests/audiotools/ml/test_decorators.py
+++ b/audio/tests/audiotools/ml/test_decorators.py
@ -8,10 +8,10 @@ import time
 import paddle
 from visualdl import LogWriter
-from audio.audiotools import util
+from paddlespeech.audiotools import util
-from audio.audiotools.ml.decorators import timer
+from paddlespeech.audiotools.ml.decorators import timer
-from audio.audiotools.ml.decorators import Tracker
+from paddlespeech.audiotools.ml.decorators import Tracker
-from audio.audiotools.ml.decorators import when
+from paddlespeech.audiotools.ml.decorators import when
 def test_all_decorators():
--- a/audio/tests/audiotools/ml/test_model.py
+++ b/audio/tests/audiotools/ml/test_model.py
@ -8,10 +8,10 @@ import tempfile
 import paddle
 from paddle import nn
-from audio.audiotools import ml
+from paddlespeech.audiotools import ml
-from audio.audiotools import util
+from paddlespeech.audiotools import util
 from paddlespeech.vector.training.seeding import seed_everything
-SEED = 0
+SEED = 1024
 def seed_and_run(model, *args, **kwargs):
--- a/audio/tests/audiotools/test_audiotools.sh
+++ b/audio/tests/audiotools/test_audiotools.sh
@ -1,5 +1,4 @@
-python -m pip install -r ../../audiotools/requirements.txt
+python -m pip install -r ../../../paddlespeech/audiotools/requirements.txt
 export PYTHONPATH=$PYTHONPATH:$(realpath ../../..) # this is root path of `PaddleSpeech`
 wget  https://paddlespeech.bj.bcebos.com/PaddleAudio/audio_tools/audio.tar.gz
 wget  https://paddlespeech.bj.bcebos.com/PaddleAudio/audio_tools/regression.tar.gz
 tar -zxvf audio.tar.gz
--- a/audio/tests/audiotools/test_post.py
+++ b/audio/tests/audiotools/test_post.py
@ -5,9 +5,9 @@
 import sys
 from pathlib import Path
-from audio.audiotools import AudioSignal
+from paddlespeech.audiotools import AudioSignal
-from audio.audiotools import post
+from paddlespeech.audiotools import post
-from audio.audiotools import transforms
+from paddlespeech.audiotools import transforms
 def test_audio_table():
--- a/tests/unit/ci.sh
+++ b/tests/unit/ci.sh
@ -34,7 +34,7 @@ function main(){
  echo "End server"
  echo "Start testing audiotools"
-  cd ${speech_ci_path}/../../audio/tests/audiotools
+  cd ${speech_ci_path}/audiotools
  bash test_audiotools.sh
  echo "End testing audiotools"
--- a/tests/unit/tts/test_expansion.py
+++ b/tests/unit/tts/test_expansion.py
@ -13,10 +13,10 @@
 # limitations under the License.
 import paddle
-from paddlespeech.t2s.modules import expansion
+# from paddlespeech.t2s.modules import expansion
-def test_expand():
+def _test_expand():
    x = paddle.randn([2, 4, 3])  # (B, T, C)
    lengths = paddle.to_tensor([[1, 2, 2, 1], [3, 1, 4, 0]])
    y = expansion.expand(x, lengths)