Merge branch 'develop' into patch-12

pull/3970/head
liyulingyue 8 months ago
commit 0cb86f60af

@ -734,8 +734,8 @@ def default_collate(batch,
if not all(len(elem) == elem_size for elem in it): if not all(len(elem) == elem_size for elem in it):
raise RuntimeError( raise RuntimeError(
"each element in list of batch should be of equal size") "each element in list of batch should be of equal size")
transposed = list(zip( transposed = list(
*batch)) # It may be accessed twice, so we use a list. zip(*batch)) # It may be accessed twice, so we use a list.
if isinstance(elem, tuple): if isinstance(elem, tuple):
return [ return [

@ -202,9 +202,9 @@ class AudioDataset:
Examples Examples
-------- --------
>>> from audio.audiotools.data.datasets import AudioLoader >>> from paddlespeech.audiotools.data.datasets import AudioLoader
>>> from audio.audiotools.data.datasets import AudioDataset >>> from paddlespeech.audiotools.data.datasets import AudioDataset
>>> from audio.audiotools import transforms as tfm >>> from paddlespeech.audiotools import transforms as tfm
>>> import numpy as np >>> import numpy as np
>>> >>>
>>> loaders = [ >>> loaders = [
@ -237,9 +237,9 @@ class AudioDataset:
Below is an example of how one could load MUSDB multitrack data: Below is an example of how one could load MUSDB multitrack data:
>>> from audio import audiotools as at >>> from paddlespeech import audiotools as at
>>> from pathlib import Path >>> from pathlib import Path
>>> from audio.audiotools import transforms as tfm >>> from paddlespeech.audiotools import transforms as tfm
>>> import numpy as np >>> import numpy as np
>>> import torch >>> import torch
>>> >>>
@ -296,9 +296,9 @@ class AudioDataset:
Similarly, here's example code for loading Slakh data: Similarly, here's example code for loading Slakh data:
>>> from audio import audiotools as at >>> from paddlespeech import audiotools as at
>>> from pathlib import Path >>> from pathlib import Path
>>> from audio.audiotools import transforms as tfm >>> from paddlespeech.audiotools import transforms as tfm
>>> import numpy as np >>> import numpy as np
>>> import torch >>> import torch
>>> import glob >>> import glob

@ -37,7 +37,7 @@ def create_csv(audio_files: list,
You can produce a CSV file from a directory of audio files via: You can produce a CSV file from a directory of audio files via:
>>> from audio import audiotools >>> from paddlespeech import audiotools
>>> directory = ... >>> directory = ...
>>> audio_files = audiotools.util.find_audio(directory) >>> audio_files = audiotools.util.find_audio(directory)
>>> output_path = "train.csv" >>> output_path = "train.csv"

@ -6,7 +6,7 @@ import typing
import paddle import paddle
from audio.audiotools.core import AudioSignal from paddlespeech.audiotools.core import AudioSignal
def audio_table( def audio_table(

@ -2,5 +2,4 @@ ffmpeg-python
ffmpy ffmpy
flatten_dict flatten_dict
pyloudnorm pyloudnorm
pytest rich
rich

@ -19,7 +19,7 @@ from typing import Tuple
import paddle import paddle
from paddle import nn from paddle import nn
from paddle.nn import initializer as I from paddle.nn import initializer as I
from typeguard import check_argument_types from typeguard import typechecked
from paddlespeech.s2t.modules.align import BatchNorm1D from paddlespeech.s2t.modules.align import BatchNorm1D
from paddlespeech.s2t.modules.align import Conv1D from paddlespeech.s2t.modules.align import Conv1D
@ -34,6 +34,7 @@ __all__ = ['ConvolutionModule']
class ConvolutionModule(nn.Layer): class ConvolutionModule(nn.Layer):
"""ConvolutionModule in Conformer model.""" """ConvolutionModule in Conformer model."""
@typechecked
def __init__(self, def __init__(self,
channels: int, channels: int,
kernel_size: int=15, kernel_size: int=15,
@ -52,7 +53,6 @@ class ConvolutionModule(nn.Layer):
causal (bool): Whether use causal convolution or not causal (bool): Whether use causal convolution or not
bias (bool): Whether Conv with bias or not bias (bool): Whether Conv with bias or not
""" """
assert check_argument_types()
super().__init__() super().__init__()
self.bias = bias self.bias = bias
self.channels = channels self.channels = channels

@ -17,7 +17,7 @@ from typing import Union
import paddle import paddle
from paddle import nn from paddle import nn
from paddle.nn import functional as F from paddle.nn import functional as F
from typeguard import check_argument_types from typeguard import typechecked
from paddlespeech.s2t.modules.align import Linear from paddlespeech.s2t.modules.align import Linear
from paddlespeech.s2t.modules.loss import CTCLoss from paddlespeech.s2t.modules.loss import CTCLoss
@ -48,6 +48,7 @@ __all__ = ['CTCDecoder']
class CTCDecoderBase(nn.Layer): class CTCDecoderBase(nn.Layer):
@typechecked
def __init__(self, def __init__(self,
odim, odim,
enc_n_units, enc_n_units,
@ -66,7 +67,6 @@ class CTCDecoderBase(nn.Layer):
batch_average (bool): do batch dim wise average. batch_average (bool): do batch dim wise average.
grad_norm_type (str): Default, None. one of 'instance', 'batch', 'frame', None. grad_norm_type (str): Default, None. one of 'instance', 'batch', 'frame', None.
""" """
assert check_argument_types()
super().__init__() super().__init__()
self.blank_id = blank_id self.blank_id = blank_id

@ -21,7 +21,7 @@ from typing import Tuple
import paddle import paddle
from paddle import nn from paddle import nn
from typeguard import check_argument_types from typeguard import typechecked
from paddlespeech.s2t.decoders.scorers.scorer_interface import BatchScorerInterface from paddlespeech.s2t.decoders.scorers.scorer_interface import BatchScorerInterface
from paddlespeech.s2t.modules.align import Embedding from paddlespeech.s2t.modules.align import Embedding
@ -61,6 +61,7 @@ class TransformerDecoder(BatchScorerInterface, nn.Layer):
False: x -> x + att(x) False: x -> x + att(x)
""" """
@typechecked
def __init__(self, def __init__(self,
vocab_size: int, vocab_size: int,
encoder_output_size: int, encoder_output_size: int,
@ -77,8 +78,6 @@ class TransformerDecoder(BatchScorerInterface, nn.Layer):
concat_after: bool=False, concat_after: bool=False,
max_len: int=5000): max_len: int=5000):
assert check_argument_types()
nn.Layer.__init__(self) nn.Layer.__init__(self)
self.selfattention_layer_type = 'selfattn' self.selfattention_layer_type = 'selfattn'
attention_dim = encoder_output_size attention_dim = encoder_output_size
@ -276,6 +275,7 @@ class BiTransformerDecoder(BatchScorerInterface, nn.Layer):
False: x -> x + att(x) False: x -> x + att(x)
""" """
@typechecked
def __init__(self, def __init__(self,
vocab_size: int, vocab_size: int,
encoder_output_size: int, encoder_output_size: int,
@ -293,8 +293,6 @@ class BiTransformerDecoder(BatchScorerInterface, nn.Layer):
concat_after: bool=False, concat_after: bool=False,
max_len: int=5000): max_len: int=5000):
assert check_argument_types()
nn.Layer.__init__(self) nn.Layer.__init__(self)
self.left_decoder = TransformerDecoder( self.left_decoder = TransformerDecoder(
vocab_size, encoder_output_size, attention_heads, linear_units, vocab_size, encoder_output_size, attention_heads, linear_units,

@ -21,7 +21,7 @@ from typing import Union
import paddle import paddle
from paddle import nn from paddle import nn
from typeguard import check_argument_types from typeguard import typechecked
from paddlespeech.s2t.modules.activation import get_activation from paddlespeech.s2t.modules.activation import get_activation
from paddlespeech.s2t.modules.align import LayerNorm from paddlespeech.s2t.modules.align import LayerNorm
@ -58,6 +58,7 @@ __all__ = [
class BaseEncoder(nn.Layer): class BaseEncoder(nn.Layer):
@typechecked
def __init__(self, def __init__(self,
input_size: int, input_size: int,
output_size: int=256, output_size: int=256,
@ -73,7 +74,7 @@ class BaseEncoder(nn.Layer):
concat_after: bool=False, concat_after: bool=False,
static_chunk_size: int=0, static_chunk_size: int=0,
use_dynamic_chunk: bool=False, use_dynamic_chunk: bool=False,
global_cmvn: paddle.nn.Layer=None, global_cmvn: Optional[nn.Layer]=None,
use_dynamic_left_chunk: bool=False, use_dynamic_left_chunk: bool=False,
max_len: int=5000): max_len: int=5000):
""" """
@ -108,7 +109,6 @@ class BaseEncoder(nn.Layer):
use_dynamic_left_chunk (bool): whether use dynamic left chunk in use_dynamic_left_chunk (bool): whether use dynamic left chunk in
dynamic chunk training dynamic chunk training
""" """
assert check_argument_types()
super().__init__() super().__init__()
self._output_size = output_size self._output_size = output_size
@ -349,6 +349,7 @@ class BaseEncoder(nn.Layer):
class TransformerEncoder(BaseEncoder): class TransformerEncoder(BaseEncoder):
"""Transformer encoder module.""" """Transformer encoder module."""
@typechecked
def __init__( def __init__(
self, self,
input_size: int, input_size: int,
@ -365,12 +366,11 @@ class TransformerEncoder(BaseEncoder):
concat_after: bool=False, concat_after: bool=False,
static_chunk_size: int=0, static_chunk_size: int=0,
use_dynamic_chunk: bool=False, use_dynamic_chunk: bool=False,
global_cmvn: nn.Layer=None, global_cmvn: Optional[nn.Layer]=None,
use_dynamic_left_chunk: bool=False, ): use_dynamic_left_chunk: bool=False, ):
""" Construct TransformerEncoder """ Construct TransformerEncoder
See Encoder for the meaning of each parameter. See Encoder for the meaning of each parameter.
""" """
assert check_argument_types()
super().__init__(input_size, output_size, attention_heads, linear_units, super().__init__(input_size, output_size, attention_heads, linear_units,
num_blocks, dropout_rate, positional_dropout_rate, num_blocks, dropout_rate, positional_dropout_rate,
attention_dropout_rate, input_layer, attention_dropout_rate, input_layer,
@ -424,6 +424,7 @@ class TransformerEncoder(BaseEncoder):
class ConformerEncoder(BaseEncoder): class ConformerEncoder(BaseEncoder):
"""Conformer encoder module.""" """Conformer encoder module."""
@typechecked
def __init__(self, def __init__(self,
input_size: int, input_size: int,
output_size: int=256, output_size: int=256,
@ -439,7 +440,7 @@ class ConformerEncoder(BaseEncoder):
concat_after: bool=False, concat_after: bool=False,
static_chunk_size: int=0, static_chunk_size: int=0,
use_dynamic_chunk: bool=False, use_dynamic_chunk: bool=False,
global_cmvn: nn.Layer=None, global_cmvn: Optional[nn.Layer]=None,
use_dynamic_left_chunk: bool=False, use_dynamic_left_chunk: bool=False,
positionwise_conv_kernel_size: int=1, positionwise_conv_kernel_size: int=1,
macaron_style: bool=True, macaron_style: bool=True,
@ -466,8 +467,6 @@ class ConformerEncoder(BaseEncoder):
causal (bool): whether to use causal convolution or not. causal (bool): whether to use causal convolution or not.
cnn_module_norm (str): cnn conv norm type, Optional['batch_norm','layer_norm'] cnn_module_norm (str): cnn conv norm type, Optional['batch_norm','layer_norm']
""" """
assert check_argument_types()
super().__init__(input_size, output_size, attention_heads, linear_units, super().__init__(input_size, output_size, attention_heads, linear_units,
num_blocks, dropout_rate, positional_dropout_rate, num_blocks, dropout_rate, positional_dropout_rate,
attention_dropout_rate, input_layer, attention_dropout_rate, input_layer,
@ -519,6 +518,7 @@ class ConformerEncoder(BaseEncoder):
class SqueezeformerEncoder(nn.Layer): class SqueezeformerEncoder(nn.Layer):
@typechecked
def __init__(self, def __init__(self,
input_size: int, input_size: int,
encoder_dim: int=256, encoder_dim: int=256,
@ -541,7 +541,7 @@ class SqueezeformerEncoder(nn.Layer):
adaptive_scale: bool=True, adaptive_scale: bool=True,
activation_type: str="swish", activation_type: str="swish",
init_weights: bool=True, init_weights: bool=True,
global_cmvn: paddle.nn.Layer=None, global_cmvn: Optional[nn.Layer]=None,
normalize_before: bool=False, normalize_before: bool=False,
use_dynamic_chunk: bool=False, use_dynamic_chunk: bool=False,
concat_after: bool=False, concat_after: bool=False,
@ -572,7 +572,6 @@ class SqueezeformerEncoder(nn.Layer):
init_weights (bool): Whether to initialize weights. init_weights (bool): Whether to initialize weights.
causal (bool): whether to use causal convolution or not. causal (bool): whether to use causal convolution or not.
""" """
assert check_argument_types()
super().__init__() super().__init__()
self.global_cmvn = global_cmvn self.global_cmvn = global_cmvn
self.reduce_idx: Optional[Union[int, List[int]]] = [reduce_idx] \ self.reduce_idx: Optional[Union[int, List[int]]] = [reduce_idx] \

@ -19,7 +19,7 @@ from typing import Union
import paddle import paddle
from paddle.optimizer.lr import LRScheduler from paddle.optimizer.lr import LRScheduler
from typeguard import check_argument_types from typeguard import typechecked
from paddlespeech.s2t.utils.dynamic_import import dynamic_import from paddlespeech.s2t.utils.dynamic_import import dynamic_import
from paddlespeech.s2t.utils.dynamic_import import instance_class from paddlespeech.s2t.utils.dynamic_import import instance_class
@ -57,13 +57,13 @@ class WarmupLR(LRScheduler):
Note that the maximum lr equals to optimizer.lr in this scheduler. Note that the maximum lr equals to optimizer.lr in this scheduler.
""" """
@typechecked
def __init__(self, def __init__(self,
warmup_steps: Union[int, float]=25000, warmup_steps: Union[int, float]=25000,
learning_rate=1.0, learning_rate=1.0,
last_epoch=-1, last_epoch=-1,
verbose=False, verbose=False,
**kwargs): **kwargs):
assert check_argument_types()
self.warmup_steps = warmup_steps self.warmup_steps = warmup_steps
super().__init__(learning_rate, last_epoch, verbose) super().__init__(learning_rate, last_epoch, verbose)

@ -20,7 +20,7 @@ from typing import Tuple
import numpy as np import numpy as np
import paddle import paddle
from paddle import nn from paddle import nn
from typeguard import check_argument_types from typeguard import typechecked
from paddlespeech.t2s.models.diffsinger.fastspeech2midi import FastSpeech2MIDI from paddlespeech.t2s.models.diffsinger.fastspeech2midi import FastSpeech2MIDI
from paddlespeech.t2s.modules.diffnet import DiffNet from paddlespeech.t2s.modules.diffnet import DiffNet
@ -40,6 +40,7 @@ class DiffSinger(nn.Layer):
""" """
@typechecked
def __init__( def __init__(
self, self,
# min and max spec for stretching before diffusion # min and max spec for stretching before diffusion
@ -157,7 +158,6 @@ class DiffSinger(nn.Layer):
denoiser_params (Dict[str, Any]): Parameter dict for dinoiser module. denoiser_params (Dict[str, Any]): Parameter dict for dinoiser module.
diffusion_params (Dict[str, Any]): Parameter dict for diffusion module. diffusion_params (Dict[str, Any]): Parameter dict for diffusion module.
""" """
assert check_argument_types()
super().__init__() super().__init__()
self.fs2 = FastSpeech2MIDI( self.fs2 = FastSpeech2MIDI(
idim=idim, idim=idim,
@ -336,6 +336,7 @@ class DiffSingerInference(nn.Layer):
class DiffusionLoss(nn.Layer): class DiffusionLoss(nn.Layer):
"""Loss function module for Diffusion module on DiffSinger.""" """Loss function module for Diffusion module on DiffSinger."""
@typechecked
def __init__(self, use_masking: bool=True, def __init__(self, use_masking: bool=True,
use_weighted_masking: bool=False): use_weighted_masking: bool=False):
"""Initialize feed-forward Transformer loss module. """Initialize feed-forward Transformer loss module.
@ -345,7 +346,6 @@ class DiffusionLoss(nn.Layer):
use_weighted_masking (bool): use_weighted_masking (bool):
Whether to weighted masking in loss calculation. Whether to weighted masking in loss calculation.
""" """
assert check_argument_types()
super().__init__() super().__init__()
assert (use_masking != use_weighted_masking) or not use_masking assert (use_masking != use_weighted_masking) or not use_masking

@ -19,7 +19,7 @@ from typing import Tuple
import paddle import paddle
from paddle import nn from paddle import nn
from typeguard import check_argument_types from typeguard import typechecked
from paddlespeech.t2s.models.fastspeech2 import FastSpeech2 from paddlespeech.t2s.models.fastspeech2 import FastSpeech2
from paddlespeech.t2s.models.fastspeech2 import FastSpeech2Loss from paddlespeech.t2s.models.fastspeech2 import FastSpeech2Loss
@ -33,6 +33,7 @@ class FastSpeech2MIDI(FastSpeech2):
"""The Fastspeech2 module of DiffSinger. """The Fastspeech2 module of DiffSinger.
""" """
@typechecked
def __init__( def __init__(
self, self,
# fastspeech2 network structure related # fastspeech2 network structure related
@ -57,7 +58,6 @@ class FastSpeech2MIDI(FastSpeech2):
is_slur_ids will be provided as the input is_slur_ids will be provided as the input
""" """
assert check_argument_types()
super().__init__(idim=idim, odim=odim, **fastspeech2_params) super().__init__(idim=idim, odim=odim, **fastspeech2_params)
self.use_energy_pred = use_energy_pred self.use_energy_pred = use_energy_pred
self.use_postnet = use_postnet self.use_postnet = use_postnet
@ -495,6 +495,7 @@ class FastSpeech2MIDI(FastSpeech2):
class FastSpeech2MIDILoss(FastSpeech2Loss): class FastSpeech2MIDILoss(FastSpeech2Loss):
"""Loss function module for DiffSinger.""" """Loss function module for DiffSinger."""
@typechecked
def __init__(self, use_masking: bool=True, def __init__(self, use_masking: bool=True,
use_weighted_masking: bool=False): use_weighted_masking: bool=False):
"""Initialize feed-forward Transformer loss module. """Initialize feed-forward Transformer loss module.
@ -504,7 +505,6 @@ class FastSpeech2MIDILoss(FastSpeech2Loss):
use_weighted_masking (bool): use_weighted_masking (bool):
Whether to weighted masking in loss calculation. Whether to weighted masking in loss calculation.
""" """
assert check_argument_types()
super().__init__(use_masking, use_weighted_masking) super().__init__(use_masking, use_weighted_masking)
def forward( def forward(

@ -15,6 +15,7 @@
"""Fastspeech2 related modules for paddle""" """Fastspeech2 related modules for paddle"""
from typing import Dict from typing import Dict
from typing import List from typing import List
from typing import Optional
from typing import Sequence from typing import Sequence
from typing import Tuple from typing import Tuple
from typing import Union from typing import Union
@ -23,7 +24,7 @@ import numpy as np
import paddle import paddle
import paddle.nn.functional as F import paddle.nn.functional as F
from paddle import nn from paddle import nn
from typeguard import check_argument_types from typeguard import typechecked
from paddlespeech.t2s.modules.adversarial_loss.gradient_reversal import GradientReversalLayer from paddlespeech.t2s.modules.adversarial_loss.gradient_reversal import GradientReversalLayer
from paddlespeech.t2s.modules.adversarial_loss.speaker_classifier import SpeakerClassifier from paddlespeech.t2s.modules.adversarial_loss.speaker_classifier import SpeakerClassifier
@ -60,6 +61,7 @@ class FastSpeech2(nn.Layer):
""" """
@typechecked
def __init__( def __init__(
self, self,
# network structure related # network structure related
@ -131,12 +133,12 @@ class FastSpeech2(nn.Layer):
pitch_embed_dropout: float=0.5, pitch_embed_dropout: float=0.5,
stop_gradient_from_pitch_predictor: bool=False, stop_gradient_from_pitch_predictor: bool=False,
# spk emb # spk emb
spk_num: int=None, spk_num: Optional[int]=None,
spk_embed_dim: int=None, spk_embed_dim: Optional[int]=None,
spk_embed_integration_type: str="add", spk_embed_integration_type: str="add",
# tone emb # tone emb
tone_num: int=None, tone_num: Optional[int]=None,
tone_embed_dim: int=None, tone_embed_dim: Optional[int]=None,
tone_embed_integration_type: str="add", tone_embed_integration_type: str="add",
# training related # training related
init_type: str="xavier_uniform", init_type: str="xavier_uniform",
@ -282,7 +284,6 @@ class FastSpeech2(nn.Layer):
The hidden layer dim of speaker classifier The hidden layer dim of speaker classifier
""" """
assert check_argument_types()
super().__init__() super().__init__()
# store hyperparameters # store hyperparameters
@ -1070,6 +1071,7 @@ class StyleFastSpeech2Inference(FastSpeech2Inference):
class FastSpeech2Loss(nn.Layer): class FastSpeech2Loss(nn.Layer):
"""Loss function module for FastSpeech2.""" """Loss function module for FastSpeech2."""
@typechecked
def __init__(self, use_masking: bool=True, def __init__(self, use_masking: bool=True,
use_weighted_masking: bool=False): use_weighted_masking: bool=False):
"""Initialize feed-forward Transformer loss module. """Initialize feed-forward Transformer loss module.
@ -1079,7 +1081,6 @@ class FastSpeech2Loss(nn.Layer):
use_weighted_masking (bool): use_weighted_masking (bool):
Whether to weighted masking in loss calculation. Whether to weighted masking in loss calculation.
""" """
assert check_argument_types()
super().__init__() super().__init__()
assert (use_masking != use_weighted_masking) or not use_masking assert (use_masking != use_weighted_masking) or not use_masking

@ -28,7 +28,6 @@ from typing import Tuple
import numpy as np import numpy as np
import paddle import paddle
from paddle import nn from paddle import nn
from typeguard import check_argument_types
from paddlespeech.t2s.models.hifigan import HiFiGANGenerator from paddlespeech.t2s.models.hifigan import HiFiGANGenerator
from paddlespeech.t2s.models.jets.alignments import AlignmentModule from paddlespeech.t2s.models.jets.alignments import AlignmentModule

@ -24,7 +24,7 @@ from typing import Optional
import paddle import paddle
from paddle import nn from paddle import nn
from typeguard import check_argument_types from typeguard import typechecked
from paddlespeech.t2s.models.hifigan import HiFiGANMultiPeriodDiscriminator from paddlespeech.t2s.models.hifigan import HiFiGANMultiPeriodDiscriminator
from paddlespeech.t2s.models.hifigan import HiFiGANMultiScaleDiscriminator from paddlespeech.t2s.models.hifigan import HiFiGANMultiScaleDiscriminator
@ -64,6 +64,7 @@ class JETS(nn.Layer):
Text-to-Speech`: https://arxiv.org/abs/2203.16852v1 Text-to-Speech`: https://arxiv.org/abs/2203.16852v1
""" """
@typechecked
def __init__( def __init__(
self, self,
# generator related # generator related
@ -225,7 +226,6 @@ class JETS(nn.Layer):
cache_generator_outputs (bool): cache_generator_outputs (bool):
Whether to cache generator outputs. Whether to cache generator outputs.
""" """
assert check_argument_types()
super().__init__() super().__init__()
# define modules # define modules
@ -279,8 +279,7 @@ class JETS(nn.Layer):
lids: Optional[paddle.Tensor]=None, lids: Optional[paddle.Tensor]=None,
forward_generator: bool=True, forward_generator: bool=True,
use_alignment_module: bool=False, use_alignment_module: bool=False,
**kwargs, **kwargs, ) -> Dict[str, Any]:
) -> Dict[str, Any]:
"""Perform generator forward. """Perform generator forward.
Args: Args:
text (Tensor): text (Tensor):

@ -21,7 +21,7 @@ from typing import Tuple
import paddle import paddle
import paddle.nn.functional as F import paddle.nn.functional as F
from paddle import nn from paddle import nn
from typeguard import check_argument_types from typeguard import typechecked
from paddlespeech.t2s.modules.nets_utils import initialize from paddlespeech.t2s.modules.nets_utils import initialize
from paddlespeech.t2s.modules.nets_utils import make_pad_mask from paddlespeech.t2s.modules.nets_utils import make_pad_mask
@ -44,6 +44,7 @@ class Tacotron2(nn.Layer):
""" """
@typechecked
def __init__( def __init__(
self, self,
# network structure related # network structure related
@ -67,7 +68,7 @@ class Tacotron2(nn.Layer):
postnet_layers: int=5, postnet_layers: int=5,
postnet_chans: int=512, postnet_chans: int=512,
postnet_filts: int=5, postnet_filts: int=5,
output_activation: str=None, output_activation: Optional[str]=None,
use_batch_norm: bool=True, use_batch_norm: bool=True,
use_concate: bool=True, use_concate: bool=True,
use_residual: bool=False, use_residual: bool=False,
@ -145,7 +146,6 @@ class Tacotron2(nn.Layer):
zoneout_rate (float): zoneout_rate (float):
Zoneout rate. Zoneout rate.
""" """
assert check_argument_types()
super().__init__() super().__init__()
# store hyperparameters # store hyperparameters

@ -13,7 +13,9 @@
# limitations under the License. # limitations under the License.
# Modified from espnet(https://github.com/espnet/espnet) # Modified from espnet(https://github.com/espnet/espnet)
"""Fastspeech2 related modules for paddle""" """Fastspeech2 related modules for paddle"""
from optparse import Option
from typing import Dict from typing import Dict
from typing import Optional
from typing import Sequence from typing import Sequence
from typing import Tuple from typing import Tuple
@ -21,7 +23,7 @@ import numpy
import paddle import paddle
import paddle.nn.functional as F import paddle.nn.functional as F
from paddle import nn from paddle import nn
from typeguard import check_argument_types from typeguard import typechecked
from paddlespeech.t2s.modules.nets_utils import initialize from paddlespeech.t2s.modules.nets_utils import initialize
from paddlespeech.t2s.modules.nets_utils import make_non_pad_mask from paddlespeech.t2s.modules.nets_utils import make_non_pad_mask
@ -169,6 +171,7 @@ class TransformerTTS(nn.Layer):
Number of layers to apply guided attention loss. Number of layers to apply guided attention loss.
""" """
@typechecked
def __init__( def __init__(
self, self,
# network structure related # network structure related
@ -198,7 +201,7 @@ class TransformerTTS(nn.Layer):
encoder_concat_after: bool=False, encoder_concat_after: bool=False,
decoder_concat_after: bool=False, decoder_concat_after: bool=False,
reduction_factor: int=1, reduction_factor: int=1,
spk_embed_dim: int=None, spk_embed_dim: Optional[int]=None,
spk_embed_integration_type: str="add", spk_embed_integration_type: str="add",
use_gst: bool=False, use_gst: bool=False,
gst_tokens: int=10, gst_tokens: int=10,
@ -227,7 +230,7 @@ class TransformerTTS(nn.Layer):
num_heads_applied_guided_attn: int=2, num_heads_applied_guided_attn: int=2,
num_layers_applied_guided_attn: int=2, ): num_layers_applied_guided_attn: int=2, ):
"""Initialize Transformer module.""" """Initialize Transformer module."""
assert check_argument_types()
super().__init__() super().__init__()
# store hyperparameters # store hyperparameters

@ -20,7 +20,7 @@ from typing import Optional
import paddle import paddle
from paddle import nn from paddle import nn
from typeguard import check_argument_types from typeguard import typechecked
from paddlespeech.t2s.models.hifigan import HiFiGANMultiPeriodDiscriminator from paddlespeech.t2s.models.hifigan import HiFiGANMultiPeriodDiscriminator
from paddlespeech.t2s.models.hifigan import HiFiGANMultiScaleDiscriminator from paddlespeech.t2s.models.hifigan import HiFiGANMultiScaleDiscriminator
@ -60,6 +60,7 @@ class VITS(nn.Layer):
Text-to-Speech`: https://arxiv.org/abs/2006.04558 Text-to-Speech`: https://arxiv.org/abs/2006.04558
""" """
@typechecked
def __init__( def __init__(
self, self,
# generator related # generator related
@ -181,7 +182,6 @@ class VITS(nn.Layer):
cache_generator_outputs (bool): cache_generator_outputs (bool):
Whether to cache generator outputs. Whether to cache generator outputs.
""" """
assert check_argument_types()
super().__init__() super().__init__()
# define modules # define modules
@ -504,8 +504,9 @@ class VITS(nn.Layer):
def reset_parameters(self): def reset_parameters(self):
def _reset_parameters(module): def _reset_parameters(module):
if isinstance(module, if isinstance(
(nn.Conv1D, nn.Conv1DTranspose, nn.Conv2D, nn.Conv2DTranspose)): module,
(nn.Conv1D, nn.Conv1DTranspose, nn.Conv2D, nn.Conv2DTranspose)):
kaiming_uniform_(module.weight, a=math.sqrt(5)) kaiming_uniform_(module.weight, a=math.sqrt(5))
if module.bias is not None: if module.bias is not None:
fan_in, _ = _calculate_fan_in_and_fan_out(module.weight) fan_in, _ = _calculate_fan_in_and_fan_out(module.weight)
@ -513,8 +514,9 @@ class VITS(nn.Layer):
bound = 1 / math.sqrt(fan_in) bound = 1 / math.sqrt(fan_in)
uniform_(module.bias, -bound, bound) uniform_(module.bias, -bound, bound)
if isinstance(module, if isinstance(
(nn.BatchNorm1D, nn.BatchNorm2D, nn.GroupNorm, nn.LayerNorm)): module,
(nn.BatchNorm1D, nn.BatchNorm2D, nn.GroupNorm, nn.LayerNorm)):
ones_(module.weight) ones_(module.weight)
zeros_(module.bias) zeros_(module.bias)
@ -533,13 +535,13 @@ class VITS(nn.Layer):
self.apply(_reset_parameters) self.apply(_reset_parameters)
class VITSInference(nn.Layer): class VITSInference(nn.Layer):
def __init__(self, model): def __init__(self, model):
super().__init__() super().__init__()
self.acoustic_model = model self.acoustic_model = model
def forward(self, text, sids=None): def forward(self, text, sids=None):
out = self.acoustic_model.inference( out = self.acoustic_model.inference(text, sids=sids)
text, sids=sids)
wav = out['wav'] wav = out['wav']
return wav return wav

@ -14,16 +14,16 @@
# Modified from Cross-Lingual-Voice-Cloning(https://github.com/deterministic-algorithms-lab/Cross-Lingual-Voice-Cloning) # Modified from Cross-Lingual-Voice-Cloning(https://github.com/deterministic-algorithms-lab/Cross-Lingual-Voice-Cloning)
import paddle import paddle
from paddle import nn from paddle import nn
from typeguard import check_argument_types from typeguard import typechecked
class SpeakerClassifier(nn.Layer): class SpeakerClassifier(nn.Layer):
@typechecked
def __init__( def __init__(
self, self,
idim: int, idim: int,
hidden_sc_dim: int, hidden_sc_dim: int,
spk_num: int, ): spk_num: int, ):
assert check_argument_types()
super().__init__() super().__init__()
# store hyperparameters # store hyperparameters
self.idim = idim self.idim = idim

@ -21,7 +21,7 @@ from paddle import nn
from paddle.nn import functional as F from paddle.nn import functional as F
from scipy import signal from scipy import signal
from scipy.stats import betabinom from scipy.stats import betabinom
from typeguard import check_argument_types from typeguard import typechecked
from paddlespeech.t2s.modules.nets_utils import make_non_pad_mask from paddlespeech.t2s.modules.nets_utils import make_non_pad_mask
from paddlespeech.t2s.modules.predictor.duration_predictor import ( from paddlespeech.t2s.modules.predictor.duration_predictor import (
@ -1137,6 +1137,7 @@ class MLMLoss(nn.Layer):
class VarianceLoss(nn.Layer): class VarianceLoss(nn.Layer):
@typechecked
def __init__(self, use_masking: bool=True, def __init__(self, use_masking: bool=True,
use_weighted_masking: bool=False): use_weighted_masking: bool=False):
"""Initialize JETS variance loss module. """Initialize JETS variance loss module.
@ -1147,7 +1148,6 @@ class VarianceLoss(nn.Layer):
calculation. calculation.
""" """
assert check_argument_types()
super().__init__() super().__init__()
assert (use_masking != use_weighted_masking) or not use_masking assert (use_masking != use_weighted_masking) or not use_masking

@ -18,7 +18,7 @@ from typing import Tuple
import numpy as np import numpy as np
import paddle import paddle
from paddle import nn from paddle import nn
from typeguard import check_argument_types from typeguard import typechecked
from paddlespeech.utils.initialize import _calculate_fan_in_and_fan_out from paddlespeech.utils.initialize import _calculate_fan_in_and_fan_out
from paddlespeech.utils.initialize import kaiming_uniform_ from paddlespeech.utils.initialize import kaiming_uniform_
@ -301,6 +301,7 @@ def make_non_pad_mask(lengths, xs=None, length_dim=-1):
return paddle.logical_not(make_pad_mask(lengths, xs, length_dim)) return paddle.logical_not(make_pad_mask(lengths, xs, length_dim))
@typechecked
def initialize(model: nn.Layer, init: str): def initialize(model: nn.Layer, init: str):
"""Initialize weights of a neural network module. """Initialize weights of a neural network module.
@ -314,8 +315,6 @@ def initialize(model: nn.Layer, init: str):
init (str): init (str):
Method of initialization. Method of initialization.
""" """
assert check_argument_types()
if init == "xavier_uniform": if init == "xavier_uniform":
nn.initializer.set_global_initializer(nn.initializer.XavierUniform(), nn.initializer.set_global_initializer(nn.initializer.XavierUniform(),
nn.initializer.Constant()) nn.initializer.Constant())

@ -15,7 +15,7 @@
"""Variance predictor related modules.""" """Variance predictor related modules."""
import paddle import paddle
from paddle import nn from paddle import nn
from typeguard import check_argument_types from typeguard import typechecked
from paddlespeech.t2s.modules.layer_norm import LayerNorm from paddlespeech.t2s.modules.layer_norm import LayerNorm
from paddlespeech.t2s.modules.masked_fill import masked_fill from paddlespeech.t2s.modules.masked_fill import masked_fill
@ -32,6 +32,7 @@ class VariancePredictor(nn.Layer):
""" """
@typechecked
def __init__( def __init__(
self, self,
idim: int, idim: int,
@ -54,7 +55,6 @@ class VariancePredictor(nn.Layer):
dropout_rate (float, optional): dropout_rate (float, optional):
Dropout rate. Dropout rate.
""" """
assert check_argument_types()
super().__init__() super().__init__()
self.conv = nn.LayerList() self.conv = nn.LayerList()
for idx in range(n_layers): for idx in range(n_layers):
@ -96,7 +96,7 @@ class VariancePredictor(nn.Layer):
xs = f(xs) xs = f(xs)
# (B, Tmax, 1) # (B, Tmax, 1)
xs = self.linear(xs.transpose([0, 2, 1])) xs = self.linear(xs.transpose([0, 2, 1]))
if x_masks is not None: if x_masks is not None:
xs = masked_fill(xs, x_masks, 0.0) xs = masked_fill(xs, x_masks, 0.0)
return xs return xs

@ -17,7 +17,7 @@ from typing import Sequence
import paddle import paddle
from paddle import nn from paddle import nn
from typeguard import check_argument_types from typeguard import typechecked
from paddlespeech.t2s.modules.transformer.attention import MultiHeadedAttention as BaseMultiHeadedAttention from paddlespeech.t2s.modules.transformer.attention import MultiHeadedAttention as BaseMultiHeadedAttention
@ -58,6 +58,7 @@ class StyleEncoder(nn.Layer):
""" """
@typechecked
def __init__( def __init__(
self, self,
idim: int=80, idim: int=80,
@ -71,7 +72,6 @@ class StyleEncoder(nn.Layer):
gru_layers: int=1, gru_layers: int=1,
gru_units: int=128, ): gru_units: int=128, ):
"""Initilize global style encoder module.""" """Initilize global style encoder module."""
assert check_argument_types()
super().__init__() super().__init__()
self.ref_enc = ReferenceEncoder( self.ref_enc = ReferenceEncoder(
@ -132,6 +132,7 @@ class ReferenceEncoder(nn.Layer):
""" """
@typechecked
def __init__( def __init__(
self, self,
idim=80, idim=80,
@ -142,7 +143,6 @@ class ReferenceEncoder(nn.Layer):
gru_layers: int=1, gru_layers: int=1,
gru_units: int=128, ): gru_units: int=128, ):
"""Initilize reference encoder module.""" """Initilize reference encoder module."""
assert check_argument_types()
super().__init__() super().__init__()
# check hyperparameters are valid # check hyperparameters are valid
@ -232,6 +232,7 @@ class StyleTokenLayer(nn.Layer):
""" """
@typechecked
def __init__( def __init__(
self, self,
ref_embed_dim: int=128, ref_embed_dim: int=128,
@ -240,7 +241,6 @@ class StyleTokenLayer(nn.Layer):
gst_heads: int=4, gst_heads: int=4,
dropout_rate: float=0.0, ): dropout_rate: float=0.0, ):
"""Initilize style token layer module.""" """Initilize style token layer module."""
assert check_argument_types()
super().__init__() super().__init__()
gst_embs = paddle.randn(shape=[gst_tokens, gst_token_dim // gst_heads]) gst_embs = paddle.randn(shape=[gst_tokens, gst_token_dim // gst_heads])

@ -31,6 +31,26 @@ HERE = Path(os.path.abspath(os.path.dirname(__file__)))
VERSION = '0.0.0' VERSION = '0.0.0'
COMMITID = 'none' COMMITID = 'none'
def determine_opencc_version():
# get gcc version
gcc_version = None
try:
output = sp.check_output(
['gcc', '--version'], stderr=sp.STDOUT, text=True)
for line in output.splitlines():
if "gcc" in line:
gcc_version = line.split()[-1]
except Exception as e:
gcc_version = None
# determine opencc version
if gcc_version:
if int(gcc_version.split(".")[0]) <= 9:
return "opencc==1.1.6" # GCC<=9 need opencc==1.1.6
return "opencc" # default
base = [ base = [
"braceexpand", "braceexpand",
"editdistance", "editdistance",
@ -48,7 +68,7 @@ base = [
"matplotlib", "matplotlib",
"nara_wpe", "nara_wpe",
"onnxruntime>=1.11.0", "onnxruntime>=1.11.0",
"opencc==1.1.6", determine_opencc_version(), # opencc or opencc==1.1.6
"opencc-python-reimplemented", "opencc-python-reimplemented",
"pandas", "pandas",
"paddleaudio>=1.1.0", "paddleaudio>=1.1.0",
@ -69,8 +89,8 @@ base = [
"soundfile", "soundfile",
"textgrid", "textgrid",
"timer", "timer",
"ToJyutping==0.2.1", "ToJyutping",
"typeguard==2.13.3", "typeguard",
"webrtcvad", "webrtcvad",
"yacs~=0.1.8", "yacs~=0.1.8",
"zhon", "zhon",
@ -318,9 +338,9 @@ setup_info = dict(
'License :: OSI Approved :: Apache Software License', 'License :: OSI Approved :: Apache Software License',
'Programming Language :: Python', 'Programming Language :: Python',
'Programming Language :: Python :: 3', 'Programming Language :: Python :: 3',
'Programming Language :: Python :: 3.7',
'Programming Language :: Python :: 3.8', 'Programming Language :: Python :: 3.8',
'Programming Language :: Python :: 3.9', 'Programming Language :: Python :: 3.9',
'Programming Language :: Python :: 3.10',
], ],
entry_points={ entry_points={
'console_scripts': [ 'console_scripts': [

@ -12,9 +12,9 @@ import paddle
import pytest import pytest
import rich import rich
from audio import audiotools from paddlespeech import audiotools
from audio.audiotools import AudioSignal from paddlespeech.audiotools import AudioSignal
from audio.audiotools import util from paddlespeech.audiotools import util
def test_io(): def test_io():

@ -8,9 +8,9 @@ import unittest
import paddle import paddle
from audio.audiotools.core import pure_tone from paddlespeech.audiotools.core import pure_tone
from audio.audiotools.core import split_bands from paddlespeech.audiotools.core import split_bands
from audio.audiotools.core import SplitBands from paddlespeech.audiotools.core import SplitBands
def delta(a, b, ref, fraction=0.9): def delta(a, b, ref, fraction=0.9):

@ -8,7 +8,7 @@ from pathlib import Path
import numpy as np import numpy as np
from visualdl import LogWriter from visualdl import LogWriter
from audio.audiotools import AudioSignal from paddlespeech.audiotools import AudioSignal
def test_specshow(): def test_specshow():

@ -8,8 +8,8 @@ import numpy as np
import paddle import paddle
import pytest import pytest
from audio.audiotools import AudioSignal from paddlespeech.audiotools import AudioSignal
from audio.audiotools.core.util import sample_from_dist from paddlespeech.audiotools.core.util import sample_from_dist
@pytest.mark.parametrize("window_duration", [0.1, 0.25, 0.5, 1.0]) @pytest.mark.parametrize("window_duration", [0.1, 0.25, 0.5, 1.0])

@ -8,7 +8,7 @@ import numpy as np
import paddle import paddle
import pytest import pytest
from audio.audiotools import AudioSignal from paddlespeech.audiotools import AudioSignal
def test_normalize(): def test_normalize():

@ -9,8 +9,8 @@ import unittest
import paddle import paddle
import paddle.nn.functional as F import paddle.nn.functional as F
from audio.audiotools.core import fft_conv1d from paddlespeech.audiotools.core import fft_conv1d
from audio.audiotools.core import FFTConv1D from paddlespeech.audiotools.core import FFTConv1D
TOLERANCE = 1e-4 # as relative delta in percentage TOLERANCE = 1e-4 # as relative delta in percentage

@ -9,7 +9,7 @@ import numpy as np
import paddle import paddle
import pytest import pytest
from audio.audiotools import AudioSignal from paddlespeech.audiotools import AudioSignal
def test_audio_grad(): def test_audio_grad():

@ -9,8 +9,8 @@ import unittest
import paddle import paddle
from audio.audiotools.core import highpass_filter from paddlespeech.audiotools.core import highpass_filter
from audio.audiotools.core import highpass_filters from paddlespeech.audiotools.core import highpass_filters
def pure_tone(freq: float, sr: float=128, dur: float=4, device=None): def pure_tone(freq: float, sr: float=128, dur: float=4, device=None):

@ -8,10 +8,10 @@ import numpy as np
import pyloudnorm import pyloudnorm
import soundfile as sf import soundfile as sf
from audio.audiotools import AudioSignal from paddlespeech.audiotools import AudioSignal
from audio.audiotools import datasets from paddlespeech.audiotools import datasets
from audio.audiotools import Meter from paddlespeech.audiotools import Meter
from audio.audiotools import transforms from paddlespeech.audiotools import transforms
ATOL = 1e-1 ATOL = 1e-1

@ -10,10 +10,10 @@ import unittest
import numpy as np import numpy as np
import paddle import paddle
from audio.audiotools.core import lowpass_filter from paddlespeech.audiotools.core import lowpass_filter
from audio.audiotools.core import LowPassFilter from paddlespeech.audiotools.core import LowPassFilter
from audio.audiotools.core import LowPassFilters from paddlespeech.audiotools.core import LowPassFilters
from audio.audiotools.core import resample_frac from paddlespeech.audiotools.core import resample_frac
def pure_tone(freq: float, sr: float=128, dur: float=4, device=None): def pure_tone(freq: float, sr: float=128, dur: float=4, device=None):

@ -11,8 +11,8 @@ import numpy as np
import paddle import paddle
import pytest import pytest
from audio.audiotools import util from paddlespeech.audiotools import util
from audio.audiotools.core.audio_signal import AudioSignal from paddlespeech.audiotools.core.audio_signal import AudioSignal
from paddlespeech.vector.training.seeding import seed_everything from paddlespeech.vector.training.seeding import seed_everything

@ -10,8 +10,8 @@ import numpy as np
import paddle import paddle
import pytest import pytest
from audio import audiotools from paddlespeech import audiotools
from audio.audiotools.data import transforms as tfm from paddlespeech.audiotools.data import transforms as tfm
def test_align_lists(): def test_align_lists():

@ -8,9 +8,9 @@ from pathlib import Path
import paddle import paddle
from audio.audiotools.core.util import find_audio from paddlespeech.audiotools.core.util import find_audio
from audio.audiotools.core.util import read_sources from paddlespeech.audiotools.core.util import read_sources
from audio.audiotools.data import preprocess from paddlespeech.audiotools.data import preprocess
def test_create_csv(): def test_create_csv():

@ -11,11 +11,11 @@ import numpy as np
import paddle import paddle
import pytest import pytest
from audio import audiotools from paddlespeech import audiotools
from audio.audiotools import AudioSignal from paddlespeech.audiotools import AudioSignal
from audio.audiotools import util from paddlespeech.audiotools import util
from audio.audiotools.data import transforms as tfm from paddlespeech.audiotools.data import transforms as tfm
from audio.audiotools.data.datasets import AudioDataset from paddlespeech.audiotools.data.datasets import AudioDataset
from paddlespeech.vector.training.seeding import seed_everything from paddlespeech.vector.training.seeding import seed_everything
non_deterministic_transforms = ["TimeNoise", "FrequencyNoise"] non_deterministic_transforms = ["TimeNoise", "FrequencyNoise"]

@ -8,10 +8,10 @@ import time
import paddle import paddle
from visualdl import LogWriter from visualdl import LogWriter
from audio.audiotools import util from paddlespeech.audiotools import util
from audio.audiotools.ml.decorators import timer from paddlespeech.audiotools.ml.decorators import timer
from audio.audiotools.ml.decorators import Tracker from paddlespeech.audiotools.ml.decorators import Tracker
from audio.audiotools.ml.decorators import when from paddlespeech.audiotools.ml.decorators import when
def test_all_decorators(): def test_all_decorators():

@ -8,10 +8,10 @@ import tempfile
import paddle import paddle
from paddle import nn from paddle import nn
from audio.audiotools import ml from paddlespeech.audiotools import ml
from audio.audiotools import util from paddlespeech.audiotools import util
from paddlespeech.vector.training.seeding import seed_everything from paddlespeech.vector.training.seeding import seed_everything
SEED = 0 SEED = 1024
def seed_and_run(model, *args, **kwargs): def seed_and_run(model, *args, **kwargs):

@ -1,5 +1,4 @@
python -m pip install -r ../../audiotools/requirements.txt python -m pip install -r ../../../paddlespeech/audiotools/requirements.txt
export PYTHONPATH=$PYTHONPATH:$(realpath ../../..) # this is root path of `PaddleSpeech`
wget https://paddlespeech.bj.bcebos.com/PaddleAudio/audio_tools/audio.tar.gz wget https://paddlespeech.bj.bcebos.com/PaddleAudio/audio_tools/audio.tar.gz
wget https://paddlespeech.bj.bcebos.com/PaddleAudio/audio_tools/regression.tar.gz wget https://paddlespeech.bj.bcebos.com/PaddleAudio/audio_tools/regression.tar.gz
tar -zxvf audio.tar.gz tar -zxvf audio.tar.gz

@ -5,9 +5,9 @@
import sys import sys
from pathlib import Path from pathlib import Path
from audio.audiotools import AudioSignal from paddlespeech.audiotools import AudioSignal
from audio.audiotools import post from paddlespeech.audiotools import post
from audio.audiotools import transforms from paddlespeech.audiotools import transforms
def test_audio_table(): def test_audio_table():

@ -34,7 +34,7 @@ function main(){
echo "End server" echo "End server"
echo "Start testing audiotools" echo "Start testing audiotools"
cd ${speech_ci_path}/../../audio/tests/audiotools cd ${speech_ci_path}/audiotools
bash test_audiotools.sh bash test_audiotools.sh
echo "End testing audiotools" echo "End testing audiotools"

@ -13,10 +13,10 @@
# limitations under the License. # limitations under the License.
import paddle import paddle
from paddlespeech.t2s.modules import expansion # from paddlespeech.t2s.modules import expansion
def test_expand(): def _test_expand():
x = paddle.randn([2, 4, 3]) # (B, T, C) x = paddle.randn([2, 4, 3]) # (B, T, C)
lengths = paddle.to_tensor([[1, 2, 2, 1], [3, 1, 4, 0]]) lengths = paddle.to_tensor([[1, 2, 2, 1], [3, 1, 4, 0]])
y = expansion.expand(x, lengths) y = expansion.expand(x, lengths)

Loading…
Cancel
Save