Merge pull request #2115 from yt605155624/add_api

[doc]format tts doc string for read the docs
pull/2120/head
TianYuan 3 years ago committed by GitHub
commit 8817bf8636
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -141,71 +141,133 @@ class FastSpeech2(nn.Layer):
init_dec_alpha: float=1.0, ):
"""Initialize FastSpeech2 module.
Args:
idim (int): Dimension of the inputs.
odim (int): Dimension of the outputs.
adim (int): Attention dimension.
aheads (int): Number of attention heads.
elayers (int): Number of encoder layers.
eunits (int): Number of encoder hidden units.
dlayers (int): Number of decoder layers.
dunits (int): Number of decoder hidden units.
postnet_layers (int): Number of postnet layers.
postnet_chans (int): Number of postnet channels.
postnet_filts (int): Kernel size of postnet.
postnet_dropout_rate (float): Dropout rate in postnet.
use_scaled_pos_enc (bool): Whether to use trainable scaled pos encoding.
use_batch_norm (bool): Whether to use batch normalization in encoder prenet.
encoder_normalize_before (bool): Whether to apply layernorm layer before encoder block.
decoder_normalize_before (bool): Whether to apply layernorm layer before decoder block.
encoder_concat_after (bool): Whether to concatenate attention layer's input and output in encoder.
decoder_concat_after (bool): Whether to concatenate attention layer's input and output in decoder.
reduction_factor (int): Reduction factor.
encoder_type (str): Encoder type ("transformer" or "conformer").
decoder_type (str): Decoder type ("transformer" or "conformer").
transformer_enc_dropout_rate (float): Dropout rate in encoder except attention and positional encoding.
transformer_enc_positional_dropout_rate (float): Dropout rate after encoder positional encoding.
transformer_enc_attn_dropout_rate (float): Dropout rate in encoder self-attention module.
transformer_dec_dropout_rate (float): Dropout rate in decoder except attention & positional encoding.
transformer_dec_positional_dropout_rate (float): Dropout rate after decoder positional encoding.
transformer_dec_attn_dropout_rate (float): Dropout rate in decoder self-attention module.
conformer_pos_enc_layer_type (str): Pos encoding layer type in conformer.
conformer_self_attn_layer_type (str): Self-attention layer type in conformer
conformer_activation_type (str): Activation function type in conformer.
use_macaron_style_in_conformer (bool): Whether to use macaron style FFN.
use_cnn_in_conformer (bool): Whether to use CNN in conformer.
zero_triu (bool): Whether to use zero triu in relative self-attention module.
conformer_enc_kernel_size (int): Kernel size of encoder conformer.
conformer_dec_kernel_size (int): Kernel size of decoder conformer.
duration_predictor_layers (int): Number of duration predictor layers.
duration_predictor_chans (int): Number of duration predictor channels.
duration_predictor_kernel_size (int): Kernel size of duration predictor.
duration_predictor_dropout_rate (float): Dropout rate in duration predictor.
pitch_predictor_layers (int): Number of pitch predictor layers.
pitch_predictor_chans (int): Number of pitch predictor channels.
pitch_predictor_kernel_size (int): Kernel size of pitch predictor.
pitch_predictor_dropout_rate (float): Dropout rate in pitch predictor.
pitch_embed_kernel_size (float): Kernel size of pitch embedding.
pitch_embed_dropout_rate (float): Dropout rate for pitch embedding.
stop_gradient_from_pitch_predictor (bool): Whether to stop gradient from pitch predictor to encoder.
energy_predictor_layers (int): Number of energy predictor layers.
energy_predictor_chans (int): Number of energy predictor channels.
energy_predictor_kernel_size (int): Kernel size of energy predictor.
energy_predictor_dropout_rate (float): Dropout rate in energy predictor.
energy_embed_kernel_size (float): Kernel size of energy embedding.
energy_embed_dropout_rate (float): Dropout rate for energy embedding.
stop_gradient_from_energy_predictorbool): Whether to stop gradient from energy predictor to encoder.
spk_num (Optional[int]): Number of speakers. If not None, assume that the spk_embed_dim is not None,
idim (int):
Dimension of the inputs.
odim (int):
Dimension of the outputs.
adim (int):
Attention dimension.
aheads (int):
Number of attention heads.
elayers (int):
Number of encoder layers.
eunits (int):
Number of encoder hidden units.
dlayers (int):
Number of decoder layers.
dunits (int):
Number of decoder hidden units.
postnet_layers (int):
Number of postnet layers.
postnet_chans (int):
Number of postnet channels.
postnet_filts (int):
Kernel size of postnet.
postnet_dropout_rate (float):
Dropout rate in postnet.
use_scaled_pos_enc (bool):
Whether to use trainable scaled pos encoding.
use_batch_norm (bool):
Whether to use batch normalization in encoder prenet.
encoder_normalize_before (bool):
Whether to apply layernorm layer before encoder block.
decoder_normalize_before (bool):
Whether to apply layernorm layer before decoder block.
encoder_concat_after (bool):
Whether to concatenate attention layer's input and output in encoder.
decoder_concat_after (bool):
Whether to concatenate attention layer's input and output in decoder.
reduction_factor (int):
Reduction factor.
encoder_type (str):
Encoder type ("transformer" or "conformer").
decoder_type (str):
Decoder type ("transformer" or "conformer").
transformer_enc_dropout_rate (float):
Dropout rate in encoder except attention and positional encoding.
transformer_enc_positional_dropout_rate (float):
Dropout rate after encoder positional encoding.
transformer_enc_attn_dropout_rate (float):
Dropout rate in encoder self-attention module.
transformer_dec_dropout_rate (float):
Dropout rate in decoder except attention & positional encoding.
transformer_dec_positional_dropout_rate (float):
Dropout rate after decoder positional encoding.
transformer_dec_attn_dropout_rate (float):
Dropout rate in decoder self-attention module.
conformer_pos_enc_layer_type (str):
Pos encoding layer type in conformer.
conformer_self_attn_layer_type (str):
Self-attention layer type in conformer.
conformer_activation_type (str):
Activation function type in conformer.
use_macaron_style_in_conformer (bool):
Whether to use macaron style FFN.
use_cnn_in_conformer (bool):
Whether to use CNN in conformer.
zero_triu (bool):
Whether to use zero triu in relative self-attention module.
conformer_enc_kernel_size (int):
Kernel size of encoder conformer.
conformer_dec_kernel_size (int):
Kernel size of decoder conformer.
duration_predictor_layers (int):
Number of duration predictor layers.
duration_predictor_chans (int):
Number of duration predictor channels.
duration_predictor_kernel_size (int):
Kernel size of duration predictor.
duration_predictor_dropout_rate (float):
Dropout rate in duration predictor.
pitch_predictor_layers (int):
Number of pitch predictor layers.
pitch_predictor_chans (int):
Number of pitch predictor channels.
pitch_predictor_kernel_size (int):
Kernel size of pitch predictor.
pitch_predictor_dropout_rate (float):
Dropout rate in pitch predictor.
pitch_embed_kernel_size (float):
Kernel size of pitch embedding.
pitch_embed_dropout_rate (float):
Dropout rate for pitch embedding.
stop_gradient_from_pitch_predictor (bool):
Whether to stop gradient from pitch predictor to encoder.
energy_predictor_layers (int):
Number of energy predictor layers.
energy_predictor_chans (int):
Number of energy predictor channels.
energy_predictor_kernel_size (int):
Kernel size of energy predictor.
energy_predictor_dropout_rate (float):
Dropout rate in energy predictor.
energy_embed_kernel_size (float):
Kernel size of energy embedding.
energy_embed_dropout_rate (float):
Dropout rate for energy embedding.
stop_gradient_from_energy_predictor (bool):
Whether to stop gradient from energy predictor to encoder.
spk_num (Optional[int]):
Number of speakers. If not None, assume that the spk_embed_dim is not None,
spk_ids will be provided as the input and use spk_embedding_table.
spk_embed_dim (Optional[int]): Speaker embedding dimension. If not None,
spk_embed_dim (Optional[int]):
Speaker embedding dimension. If not None,
assume that spk_emb will be provided as the input or spk_num is not None.
spk_embed_integration_type (str): How to integrate speaker embedding.
tone_num (Optional[int]): Number of tones. If not None, assume that the
spk_embed_integration_type (str):
How to integrate speaker embedding.
tone_num (Optional[int]):
Number of tones. If not None, assume that the
tone_ids will be provided as the input and use tone_embedding_table.
tone_embed_dim (Optional[int]): Tone embedding dimension. If not None, assume that tone_num is not None.
tone_embed_integration_type (str): How to integrate tone embedding.
init_type (str): How to initialize transformer parameters.
init_enc_alpha float): Initial value of alpha in scaled pos encoding of the encoder.
init_dec_alpha (float): Initial value of alpha in scaled pos encoding of the decoder.
tone_embed_dim (Optional[int]):
Tone embedding dimension. If not None, assume that tone_num is not None.
tone_embed_integration_type (str):
How to integrate tone embedding.
init_type (str):
How to initialize transformer parameters.
init_enc_alpha (float):
Initial value of alpha in scaled pos encoding of the encoder.
init_dec_alpha (float):
Initial value of alpha in scaled pos encoding of the decoder.
"""
assert check_argument_types()
@ -449,20 +511,29 @@ class FastSpeech2(nn.Layer):
"""Calculate forward propagation.
Args:
text(Tensor(int64)): Batch of padded token ids (B, Tmax).
text_lengths(Tensor(int64)): Batch of lengths of each input (B,).
speech(Tensor): Batch of padded target features (B, Lmax, odim).
speech_lengths(Tensor(int64)): Batch of the lengths of each target (B,).
durations(Tensor(int64)): Batch of padded durations (B, Tmax).
pitch(Tensor): Batch of padded token-averaged pitch (B, Tmax, 1).
energy(Tensor): Batch of padded token-averaged energy (B, Tmax, 1).
tone_id(Tensor, optional(int64)): Batch of padded tone ids (B, Tmax).
spk_emb(Tensor, optional): Batch of speaker embeddings (B, spk_embed_dim).
spk_id(Tnesor, optional(int64)): Batch of speaker ids (B,)
text(Tensor(int64)):
Batch of padded token ids (B, Tmax).
text_lengths(Tensor(int64)):
Batch of lengths of each input (B,).
speech(Tensor):
Batch of padded target features (B, Lmax, odim).
speech_lengths(Tensor(int64)):
Batch of the lengths of each target (B,).
durations(Tensor(int64)):
Batch of padded durations (B, Tmax).
pitch(Tensor):
Batch of padded token-averaged pitch (B, Tmax, 1).
energy(Tensor):
Batch of padded token-averaged energy (B, Tmax, 1).
tone_id(Tensor, optional(int64)):
Batch of padded tone ids (B, Tmax).
spk_emb(Tensor, optional):
Batch of speaker embeddings (B, spk_embed_dim).
spk_id(Tensor, optional(int64)):
Batch of speaker ids (B,)
Returns:
"""
# input of embedding must be int64
@ -658,20 +729,28 @@ class FastSpeech2(nn.Layer):
"""Generate the sequence of features given the sequences of characters.
Args:
text(Tensor(int64)): Input sequence of characters (T,).
durations(Tensor, optional (int64)): Groundtruth of duration (T,).
pitch(Tensor, optional): Groundtruth of token-averaged pitch (T, 1).
energy(Tensor, optional): Groundtruth of token-averaged energy (T, 1).
alpha(float, optional): Alpha to control the speed.
use_teacher_forcing(bool, optional): Whether to use teacher forcing.
text(Tensor(int64)):
Input sequence of characters (T,).
durations(Tensor, optional (int64)):
Groundtruth of duration (T,).
pitch(Tensor, optional):
Groundtruth of token-averaged pitch (T, 1).
energy(Tensor, optional):
Groundtruth of token-averaged energy (T, 1).
alpha(float, optional):
Alpha to control the speed.
use_teacher_forcing(bool, optional):
Whether to use teacher forcing.
If true, groundtruth of duration, pitch and energy will be used.
spk_emb(Tensor, optional, optional): peaker embedding vector (spk_embed_dim,). (Default value = None)
spk_id(Tensor, optional(int64), optional): spk ids (1,). (Default value = None)
tone_id(Tensor, optional(int64), optional): tone ids (T,). (Default value = None)
spk_emb(Tensor, optional, optional):
Speaker embedding vector (spk_embed_dim,). (Default value = None)
spk_id(Tensor, optional(int64), optional):
spk ids (1,). (Default value = None)
tone_id(Tensor, optional(int64), optional):
tone ids (T,). (Default value = None)
Returns:
"""
# input of embedding must be int64
x = paddle.cast(text, 'int64')
@ -720,8 +799,10 @@ class FastSpeech2(nn.Layer):
"""Integrate speaker embedding with hidden states.
Args:
hs(Tensor): Batch of hidden state sequences (B, Tmax, adim).
spk_emb(Tensor): Batch of speaker embeddings (B, spk_embed_dim).
hs(Tensor):
Batch of hidden state sequences (B, Tmax, adim).
spk_emb(Tensor):
Batch of speaker embeddings (B, spk_embed_dim).
Returns:
@ -745,8 +826,10 @@ class FastSpeech2(nn.Layer):
"""Integrate speaker embedding with hidden states.
Args:
hs(Tensor): Batch of hidden state sequences (B, Tmax, adim).
tone_embs(Tensor): Batch of speaker embeddings (B, Tmax, tone_embed_dim).
hs(Tensor):
Batch of hidden state sequences (B, Tmax, adim).
tone_embs(Tensor):
Batch of tone embeddings (B, Tmax, tone_embed_dim).
Returns:
@ -769,10 +852,12 @@ class FastSpeech2(nn.Layer):
"""Make masks for self-attention.
Args:
ilens(Tensor): Batch of lengths (B,).
ilens(Tensor):
Batch of lengths (B,).
Returns:
Tensor: Mask tensor for self-attention. dtype=paddle.bool
Tensor:
Mask tensor for self-attention. dtype=paddle.bool
Examples:
>>> ilens = [5, 3]
@ -854,19 +939,32 @@ class StyleFastSpeech2Inference(FastSpeech2Inference):
"""
Args:
text(Tensor(int64)): Input sequence of characters (T,).
durations(paddle.Tensor/np.ndarray, optional (int64)): Groundtruth of duration (T,), this will overwrite the set of durations_scale and durations_bias
text(Tensor(int64)):
Input sequence of characters (T,).
durations(paddle.Tensor/np.ndarray, optional (int64)):
Groundtruth of duration (T,), this will overwrite the set of durations_scale and durations_bias
durations_scale(int/float, optional):
durations_bias(int/float, optional):
pitch(paddle.Tensor/np.ndarray, optional): Groundtruth of token-averaged pitch (T, 1), this will overwrite the set of pitch_scale and pitch_bias
pitch_scale(int/float, optional): In denormed HZ domain.
pitch_bias(int/float, optional): In denormed HZ domain.
energy(paddle.Tensor/np.ndarray, optional): Groundtruth of token-averaged energy (T, 1), this will overwrite the set of energy_scale and energy_bias
energy_scale(int/float, optional): In denormed domain.
energy_bias(int/float, optional): In denormed domain.
robot: bool: (Default value = False)
spk_emb: (Default value = None)
spk_id: (Default value = None)
pitch(paddle.Tensor/np.ndarray, optional):
Groundtruth of token-averaged pitch (T, 1), this will overwrite the set of pitch_scale and pitch_bias
pitch_scale(int/float, optional):
In denormed HZ domain.
pitch_bias(int/float, optional):
In denormed HZ domain.
energy(paddle.Tensor/np.ndarray, optional):
Groundtruth of token-averaged energy (T, 1), this will overwrite the set of energy_scale and energy_bias
energy_scale(int/float, optional):
In denormed domain.
energy_bias(int/float, optional):
In denormed domain.
robot(bool, optional): (Default value = False)
spk_emb(Tensor, optional): (Default value = None)
spk_id(Tensor(int64), optional): (Default value = None)
Returns:
Tensor: logmel
@ -945,8 +1043,10 @@ class FastSpeech2Loss(nn.Layer):
use_weighted_masking: bool=False):
"""Initialize feed-forward Transformer loss module.
Args:
use_masking (bool): Whether to apply masking for padded part in loss calculation.
use_weighted_masking (bool): Whether to weighted masking in loss calculation.
use_masking (bool):
Whether to apply masking for padded part in loss calculation.
use_weighted_masking (bool):
Whether to weighted masking in loss calculation.
"""
assert check_argument_types()
super().__init__()
@ -978,17 +1078,28 @@ class FastSpeech2Loss(nn.Layer):
"""Calculate forward propagation.
Args:
after_outs(Tensor): Batch of outputs after postnets (B, Lmax, odim).
before_outs(Tensor): Batch of outputs before postnets (B, Lmax, odim).
d_outs(Tensor): Batch of outputs of duration predictor (B, Tmax).
p_outs(Tensor): Batch of outputs of pitch predictor (B, Tmax, 1).
e_outs(Tensor): Batch of outputs of energy predictor (B, Tmax, 1).
ys(Tensor): Batch of target features (B, Lmax, odim).
ds(Tensor): Batch of durations (B, Tmax).
ps(Tensor): Batch of target token-averaged pitch (B, Tmax, 1).
es(Tensor): Batch of target token-averaged energy (B, Tmax, 1).
ilens(Tensor): Batch of the lengths of each input (B,).
olens(Tensor): Batch of the lengths of each target (B,).
after_outs(Tensor):
Batch of outputs after postnets (B, Lmax, odim).
before_outs(Tensor):
Batch of outputs before postnets (B, Lmax, odim).
d_outs(Tensor):
Batch of outputs of duration predictor (B, Tmax).
p_outs(Tensor):
Batch of outputs of pitch predictor (B, Tmax, 1).
e_outs(Tensor):
Batch of outputs of energy predictor (B, Tmax, 1).
ys(Tensor):
Batch of target features (B, Lmax, odim).
ds(Tensor):
Batch of durations (B, Tmax).
ps(Tensor):
Batch of target token-averaged pitch (B, Tmax, 1).
es(Tensor):
Batch of target token-averaged energy (B, Tmax, 1).
ilens(Tensor):
Batch of the lengths of each input (B,).
olens(Tensor):
Batch of the lengths of each target (B,).
Returns:

@ -50,20 +50,34 @@ class HiFiGANGenerator(nn.Layer):
init_type: str="xavier_uniform", ):
"""Initialize HiFiGANGenerator module.
Args:
in_channels (int): Number of input channels.
out_channels (int): Number of output channels.
channels (int): Number of hidden representation channels.
global_channels (int): Number of global conditioning channels.
kernel_size (int): Kernel size of initial and final conv layer.
upsample_scales (list): List of upsampling scales.
upsample_kernel_sizes (list): List of kernel sizes for upsampling layers.
resblock_kernel_sizes (list): List of kernel sizes for residual blocks.
resblock_dilations (list): List of dilation list for residual blocks.
use_additional_convs (bool): Whether to use additional conv layers in residual blocks.
bias (bool): Whether to add bias parameter in convolution layers.
nonlinear_activation (str): Activation function module name.
nonlinear_activation_params (dict): Hyperparameters for activation function.
use_weight_norm (bool): Whether to use weight norm.
in_channels (int):
Number of input channels.
out_channels (int):
Number of output channels.
channels (int):
Number of hidden representation channels.
global_channels (int):
Number of global conditioning channels.
kernel_size (int):
Kernel size of initial and final conv layer.
upsample_scales (list):
List of upsampling scales.
upsample_kernel_sizes (list):
List of kernel sizes for upsampling layers.
resblock_kernel_sizes (list):
List of kernel sizes for residual blocks.
resblock_dilations (list):
List of dilation list for residual blocks.
use_additional_convs (bool):
Whether to use additional conv layers in residual blocks.
bias (bool):
Whether to add bias parameter in convolution layers.
nonlinear_activation (str):
Activation function module name.
nonlinear_activation_params (dict):
Hyperparameters for activation function.
use_weight_norm (bool):
Whether to use weight norm.
If set to true, it will be applied to all of the conv layers.
"""
super().__init__()
@ -199,9 +213,10 @@ class HiFiGANGenerator(nn.Layer):
def inference(self, c, g: Optional[paddle.Tensor]=None):
"""Perform inference.
Args:
c (Tensor): Input tensor (T, in_channels).
normalize_before (bool): Whether to perform normalization.
g (Optional[Tensor]): Global conditioning tensor (global_channels, 1).
c (Tensor):
Input tensor (T, in_channels).
g (Optional[Tensor]):
Global conditioning tensor (global_channels, 1).
Returns:
Tensor:
Output tensor (T ** prod(upsample_scales), out_channels).
@ -233,20 +248,33 @@ class HiFiGANPeriodDiscriminator(nn.Layer):
"""Initialize HiFiGANPeriodDiscriminator module.
Args:
in_channels (int): Number of input channels.
out_channels (int): Number of output channels.
period (int): Period.
kernel_sizes (list): Kernel sizes of initial conv layers and the final conv layer.
channels (int): Number of initial channels.
downsample_scales (list): List of downsampling scales.
max_downsample_channels (int): Number of maximum downsampling channels.
use_additional_convs (bool): Whether to use additional conv layers in residual blocks.
bias (bool): Whether to add bias parameter in convolution layers.
nonlinear_activation (str): Activation function module name.
nonlinear_activation_params (dict): Hyperparameters for activation function.
use_weight_norm (bool): Whether to use weight norm.
in_channels (int):
Number of input channels.
out_channels (int):
Number of output channels.
period (int):
Period.
kernel_sizes (list):
Kernel sizes of initial conv layers and the final conv layer.
channels (int):
Number of initial channels.
downsample_scales (list):
List of downsampling scales.
max_downsample_channels (int):
Number of maximum downsampling channels.
use_additional_convs (bool):
Whether to use additional conv layers in residual blocks.
bias (bool):
Whether to add bias parameter in convolution layers.
nonlinear_activation (str):
Activation function module name.
nonlinear_activation_params (dict):
Hyperparameters for activation function.
use_weight_norm (bool):
Whether to use weight norm.
If set to true, it will be applied to all of the conv layers.
use_spectral_norm (bool): Whether to use spectral norm.
use_spectral_norm (bool):
Whether to use spectral norm.
If set to true, it will be applied to all of the conv layers.
"""
super().__init__()
@ -298,7 +326,8 @@ class HiFiGANPeriodDiscriminator(nn.Layer):
"""Calculate forward propagation.
Args:
c (Tensor): Input tensor (B, in_channels, T).
c (Tensor):
Input tensor (B, in_channels, T).
Returns:
list: List of each layer's tensors.
"""
@ -367,8 +396,10 @@ class HiFiGANMultiPeriodDiscriminator(nn.Layer):
"""Initialize HiFiGANMultiPeriodDiscriminator module.
Args:
periods (list): List of periods.
discriminator_params (dict): Parameters for hifi-gan period discriminator module.
periods (list):
List of periods.
discriminator_params (dict):
Parameters for hifi-gan period discriminator module.
The period parameter will be overwritten.
"""
super().__init__()
@ -385,7 +416,8 @@ class HiFiGANMultiPeriodDiscriminator(nn.Layer):
"""Calculate forward propagation.
Args:
x (Tensor): Input noise signal (B, 1, T).
x (Tensor):
Input noise signal (B, 1, T).
Returns:
List: List of list of each discriminator outputs, which consists of each layer output tensors.
"""
@ -417,16 +449,25 @@ class HiFiGANScaleDiscriminator(nn.Layer):
"""Initilize HiFiGAN scale discriminator module.
Args:
in_channels (int): Number of input channels.
out_channels (int): Number of output channels.
kernel_sizes (list): List of four kernel sizes. The first will be used for the first conv layer,
in_channels (int):
Number of input channels.
out_channels (int):
Number of output channels.
kernel_sizes (list):
List of four kernel sizes. The first will be used for the first conv layer,
and the second is for downsampling part, and the remaining two are for output layers.
channels (int): Initial number of channels for conv layer.
max_downsample_channels (int): Maximum number of channels for downsampling layers.
bias (bool): Whether to add bias parameter in convolution layers.
downsample_scales (list): List of downsampling scales.
nonlinear_activation (str): Activation function module name.
nonlinear_activation_params (dict): Hyperparameters for activation function.
channels (int):
Initial number of channels for conv layer.
max_downsample_channels (int):
Maximum number of channels for downsampling layers.
bias (bool):
Whether to add bias parameter in convolution layers.
downsample_scales (list):
List of downsampling scales.
nonlinear_activation (str):
Activation function module name.
nonlinear_activation_params (dict):
Hyperparameters for activation function.
use_weight_norm (bool): Whether to use weight norm.
If set to true, it will be applied to all of the conv layers.
use_spectral_norm (bool): Whether to use spectral norm.
@ -614,7 +655,8 @@ class HiFiGANMultiScaleDiscriminator(nn.Layer):
"""Calculate forward propagation.
Args:
x (Tensor): Input noise signal (B, 1, T).
x (Tensor):
Input noise signal (B, 1, T).
Returns:
List: List of list of each discriminator outputs, which consists of each layer output tensors.
"""
@ -675,14 +717,21 @@ class HiFiGANMultiScaleMultiPeriodDiscriminator(nn.Layer):
"""Initilize HiFiGAN multi-scale + multi-period discriminator module.
Args:
scales (int): Number of multi-scales.
scale_downsample_pooling (str): Pooling module name for downsampling of the inputs.
scale_downsample_pooling_params (dict): Parameters for the above pooling module.
scale_discriminator_params (dict): Parameters for hifi-gan scale discriminator module.
follow_official_norm bool): Whether to follow the norm setting of the official implementaion.
scales (int):
Number of multi-scales.
scale_downsample_pooling (str):
Pooling module name for downsampling of the inputs.
scale_downsample_pooling_params (dict):
Parameters for the above pooling module.
scale_discriminator_params (dict):
Parameters for hifi-gan scale discriminator module.
follow_official_norm (bool):
Whether to follow the norm setting of the official implementation.
The first discriminator uses spectral norm and the other discriminators use weight norm.
periods (list): List of periods.
period_discriminator_params (dict): Parameters for hifi-gan period discriminator module.
periods (list):
List of periods.
period_discriminator_params (dict):
Parameters for hifi-gan period discriminator module.
The period parameter will be overwritten.
"""
super().__init__()
@ -704,7 +753,8 @@ class HiFiGANMultiScaleMultiPeriodDiscriminator(nn.Layer):
"""Calculate forward propagation.
Args:
x (Tensor): Input noise signal (B, 1, T).
x (Tensor):
Input noise signal (B, 1, T).
Returns:
List:
List of list of each discriminator outputs,

@ -53,24 +53,38 @@ class MelGANGenerator(nn.Layer):
"""Initialize MelGANGenerator module.
Args:
in_channels (int): Number of input channels.
out_channels (int): Number of output channels,
in_channels (int):
Number of input channels.
out_channels (int):
Number of output channels,
the number of sub-band is out_channels in multi-band melgan.
kernel_size (int): Kernel size of initial and final conv layer.
channels (int): Initial number of channels for conv layer.
bias (bool): Whether to add bias parameter in convolution layers.
upsample_scales (List[int]): List of upsampling scales.
stack_kernel_size (int): Kernel size of dilated conv layers in residual stack.
stacks (int): Number of stacks in a single residual stack.
nonlinear_activation (Optional[str], optional): Non linear activation in upsample network, by default None
nonlinear_activation_params (Dict[str, Any], optional): Parameters passed to the linear activation in the upsample network,
by default {}
pad (str): Padding function module name before dilated convolution layer.
pad_params (dict): Hyperparameters for padding function.
use_final_nonlinear_activation (nn.Layer): Activation function for the final layer.
use_weight_norm (bool): Whether to use weight norm.
kernel_size (int):
Kernel size of initial and final conv layer.
channels (int):
Initial number of channels for conv layer.
bias (bool):
Whether to add bias parameter in convolution layers.
upsample_scales (List[int]):
List of upsampling scales.
stack_kernel_size (int):
Kernel size of dilated conv layers in residual stack.
stacks (int):
Number of stacks in a single residual stack.
nonlinear_activation (Optional[str], optional):
Non linear activation in upsample network, by default None
nonlinear_activation_params (Dict[str, Any], optional):
Parameters passed to the linear activation in the upsample network, by default {}
pad (str):
Padding function module name before dilated convolution layer.
pad_params (dict):
Hyperparameters for padding function.
use_final_nonlinear_activation (nn.Layer):
Activation function for the final layer.
use_weight_norm (bool):
Whether to use weight norm.
If set to true, it will be applied to all of the conv layers.
use_causal_conv (bool): Whether to use causal convolution.
use_causal_conv (bool):
Whether to use causal convolution.
"""
super().__init__()
@ -194,7 +208,8 @@ class MelGANGenerator(nn.Layer):
"""Calculate forward propagation.
Args:
c (Tensor): Input tensor (B, in_channels, T).
c (Tensor):
Input tensor (B, in_channels, T).
Returns:
Tensor: Output tensor (B, out_channels, T ** prod(upsample_scales)).
"""
@ -244,7 +259,8 @@ class MelGANGenerator(nn.Layer):
"""Perform inference.
Args:
c (Union[Tensor, ndarray]): Input tensor (T, in_channels).
c (Union[Tensor, ndarray]):
Input tensor (T, in_channels).
Returns:
Tensor: Output tensor (out_channels*T ** prod(upsample_scales), 1).
"""
@ -279,20 +295,30 @@ class MelGANDiscriminator(nn.Layer):
"""Initilize MelGAN discriminator module.
Args:
in_channels (int): Number of input channels.
out_channels (int): Number of output channels.
in_channels (int):
Number of input channels.
out_channels (int):
Number of output channels.
kernel_sizes (List[int]): List of two kernel sizes. The prod will be used for the first conv layer,
and the first and the second kernel sizes will be used for the last two layers.
For example if kernel_sizes = [5, 3], the first layer kernel size will be 5 * 3 = 15,
the last two layers' kernel size will be 5 and 3, respectively.
channels (int): Initial number of channels for conv layer.
max_downsample_channels (int): Maximum number of channels for downsampling layers.
bias (bool): Whether to add bias parameter in convolution layers.
downsample_scales (List[int]): List of downsampling scales.
nonlinear_activation (str): Activation function module name.
nonlinear_activation_params (dict): Hyperparameters for activation function.
pad (str): Padding function module name before dilated convolution layer.
pad_params (dict): Hyperparameters for padding function.
channels (int):
Initial number of channels for conv layer.
max_downsample_channels (int):
Maximum number of channels for downsampling layers.
bias (bool):
Whether to add bias parameter in convolution layers.
downsample_scales (List[int]):
List of downsampling scales.
nonlinear_activation (str):
Activation function module name.
nonlinear_activation_params (dict):
Hyperparameters for activation function.
pad (str):
Padding function module name before dilated convolution layer.
pad_params (dict):
Hyperparameters for padding function.
"""
super().__init__()
@ -364,7 +390,8 @@ class MelGANDiscriminator(nn.Layer):
def forward(self, x):
"""Calculate forward propagation.
Args:
x (Tensor): Input noise signal (B, 1, T).
x (Tensor):
Input noise signal (B, 1, T).
Returns:
List: List of output tensors of each layer (for feat_match_loss).
"""
@ -406,22 +433,37 @@ class MelGANMultiScaleDiscriminator(nn.Layer):
"""Initilize MelGAN multi-scale discriminator module.
Args:
in_channels (int): Number of input channels.
out_channels (int): Number of output channels.
scales (int): Number of multi-scales.
downsample_pooling (str): Pooling module name for downsampling of the inputs.
downsample_pooling_params (dict): Parameters for the above pooling module.
kernel_sizes (List[int]): List of two kernel sizes. The sum will be used for the first conv layer,
in_channels (int):
Number of input channels.
out_channels (int):
Number of output channels.
scales (int):
Number of multi-scales.
downsample_pooling (str):
Pooling module name for downsampling of the inputs.
downsample_pooling_params (dict):
Parameters for the above pooling module.
kernel_sizes (List[int]):
List of two kernel sizes. The sum will be used for the first conv layer,
and the first and the second kernel sizes will be used for the last two layers.
channels (int): Initial number of channels for conv layer.
max_downsample_channels (int): Maximum number of channels for downsampling layers.
bias (bool): Whether to add bias parameter in convolution layers.
downsample_scales (List[int]): List of downsampling scales.
nonlinear_activation (str): Activation function module name.
nonlinear_activation_params (dict): Hyperparameters for activation function.
pad (str): Padding function module name before dilated convolution layer.
pad_params (dict): Hyperparameters for padding function.
use_causal_conv (bool): Whether to use causal convolution.
channels (int):
Initial number of channels for conv layer.
max_downsample_channels (int):
Maximum number of channels for downsampling layers.
bias (bool):
Whether to add bias parameter in convolution layers.
downsample_scales (List[int]):
List of downsampling scales.
nonlinear_activation (str):
Activation function module name.
nonlinear_activation_params (dict):
Hyperparameters for activation function.
pad (str):
Padding function module name before dilated convolution layer.
pad_params (dict):
Hyperparameters for padding function.
use_causal_conv (bool):
Whether to use causal convolution.
"""
super().__init__()
@ -464,7 +506,8 @@ class MelGANMultiScaleDiscriminator(nn.Layer):
def forward(self, x):
"""Calculate forward propagation.
Args:
x (Tensor): Input noise signal (B, 1, T).
x (Tensor):
Input noise signal (B, 1, T).
Returns:
List: List of list of each discriminator outputs, which consists of each layer output tensors.
"""

@ -54,20 +54,34 @@ class StyleMelGANGenerator(nn.Layer):
"""Initilize Style MelGAN generator.
Args:
in_channels (int): Number of input noise channels.
aux_channels (int): Number of auxiliary input channels.
channels (int): Number of channels for conv layer.
out_channels (int): Number of output channels.
kernel_size (int): Kernel size of conv layers.
dilation (int): Dilation factor for conv layers.
bias (bool): Whether to add bias parameter in convolution layers.
noise_upsample_scales (list): List of noise upsampling scales.
noise_upsample_activation (str): Activation function module name for noise upsampling.
noise_upsample_activation_params (dict): Hyperparameters for the above activation function.
upsample_scales (list): List of upsampling scales.
upsample_mode (str): Upsampling mode in TADE layer.
gated_function (str): Gated function in TADEResBlock ("softmax" or "sigmoid").
use_weight_norm (bool): Whether to use weight norm.
in_channels (int):
Number of input noise channels.
aux_channels (int):
Number of auxiliary input channels.
channels (int):
Number of channels for conv layer.
out_channels (int):
Number of output channels.
kernel_size (int):
Kernel size of conv layers.
dilation (int):
Dilation factor for conv layers.
bias (bool):
Whether to add bias parameter in convolution layers.
noise_upsample_scales (list):
List of noise upsampling scales.
noise_upsample_activation (str):
Activation function module name for noise upsampling.
noise_upsample_activation_params (dict):
Hyperparameters for the above activation function.
upsample_scales (list):
List of upsampling scales.
upsample_mode (str):
Upsampling mode in TADE layer.
gated_function (str):
Gated function in TADEResBlock ("softmax" or "sigmoid").
use_weight_norm (bool):
Whether to use weight norm.
If set to true, it will be applied to all of the conv layers.
"""
super().__init__()
@ -194,7 +208,8 @@ class StyleMelGANGenerator(nn.Layer):
def inference(self, c):
"""Perform inference.
Args:
c (Tensor): Input tensor (T, in_channels).
c (Tensor):
Input tensor (T, in_channels).
Returns:
Tensor: Output tensor (T ** prod(upsample_scales), out_channels).
"""
@ -258,11 +273,16 @@ class StyleMelGANDiscriminator(nn.Layer):
"""Initilize Style MelGAN discriminator.
Args:
repeats (int): Number of repititons to apply RWD.
window_sizes (list): List of random window sizes.
pqmf_params (list): List of list of Parameters for PQMF modules
discriminator_params (dict): Parameters for base discriminator module.
use_weight_nom (bool): Whether to apply weight normalization.
repeats (int):
Number of repetitions to apply RWD.
window_sizes (list):
List of random window sizes.
pqmf_params (list):
List of lists of parameters for PQMF modules.
discriminator_params (dict):
Parameters for base discriminator module.
use_weight_norm (bool):
Whether to apply weight normalization.
"""
super().__init__()
@ -299,7 +319,8 @@ class StyleMelGANDiscriminator(nn.Layer):
def forward(self, x):
"""Calculate forward propagation.
Args:
x (Tensor): Input tensor (B, 1, T).
x (Tensor):
Input tensor (B, 1, T).
Returns:
List: List of discriminator outputs, #items in the list will be
equal to repeats * #discriminators.

@ -32,29 +32,45 @@ class PWGGenerator(nn.Layer):
"""Wave Generator for Parallel WaveGAN
Args:
in_channels (int, optional): Number of channels of the input waveform, by default 1
out_channels (int, optional): Number of channels of the output waveform, by default 1
kernel_size (int, optional): Kernel size of the residual blocks inside, by default 3
layers (int, optional): Number of residual blocks inside, by default 30
stacks (int, optional): The number of groups to split the residual blocks into, by default 3
in_channels (int, optional):
Number of channels of the input waveform, by default 1
out_channels (int, optional):
Number of channels of the output waveform, by default 1
kernel_size (int, optional):
Kernel size of the residual blocks inside, by default 3
layers (int, optional):
Number of residual blocks inside, by default 30
stacks (int, optional):
The number of groups to split the residual blocks into, by default 3
Within each group, the dilation of the residual block grows exponentially.
residual_channels (int, optional): Residual channel of the residual blocks, by default 64
gate_channels (int, optional): Gate channel of the residual blocks, by default 128
skip_channels (int, optional): Skip channel of the residual blocks, by default 64
aux_channels (int, optional): Auxiliary channel of the residual blocks, by default 80
aux_context_window (int, optional): The context window size of the first convolution applied to the
auxiliary input, by default 2
dropout (float, optional): Dropout of the residual blocks, by default 0.
bias (bool, optional): Whether to use bias in residual blocks, by default True
use_weight_norm (bool, optional): Whether to use weight norm in all convolutions, by default True
use_causal_conv (bool, optional): Whether to use causal padding in the upsample network and residual
blocks, by default False
upsample_scales (List[int], optional): Upsample scales of the upsample network, by default [4, 4, 4, 4]
nonlinear_activation (Optional[str], optional): Non linear activation in upsample network, by default None
nonlinear_activation_params (Dict[str, Any], optional): Parameters passed to the linear activation in the upsample network,
by default {}
interpolate_mode (str, optional): Interpolation mode of the upsample network, by default "nearest"
freq_axis_kernel_size (int, optional): Kernel size along the frequency axis of the upsample network, by default 1
residual_channels (int, optional):
Residual channel of the residual blocks, by default 64
gate_channels (int, optional):
Gate channel of the residual blocks, by default 128
skip_channels (int, optional):
Skip channel of the residual blocks, by default 64
aux_channels (int, optional):
Auxiliary channel of the residual blocks, by default 80
aux_context_window (int, optional):
The context window size of the first convolution applied to the auxiliary input, by default 2
dropout (float, optional):
Dropout of the residual blocks, by default 0.
bias (bool, optional):
Whether to use bias in residual blocks, by default True
use_weight_norm (bool, optional):
Whether to use weight norm in all convolutions, by default True
use_causal_conv (bool, optional):
Whether to use causal padding in the upsample network and residual blocks, by default False
upsample_scales (List[int], optional):
Upsample scales of the upsample network, by default [4, 4, 4, 4]
nonlinear_activation (Optional[str], optional):
Non linear activation in upsample network, by default None
nonlinear_activation_params (Dict[str, Any], optional):
Parameters passed to the linear activation in the upsample network, by default {}
interpolate_mode (str, optional):
Interpolation mode of the upsample network, by default "nearest"
freq_axis_kernel_size (int, optional):
Kernel size along the frequency axis of the upsample network, by default 1
"""
def __init__(
@ -147,9 +163,11 @@ class PWGGenerator(nn.Layer):
"""Generate waveform.
Args:
x(Tensor): Shape (N, C_in, T), The input waveform.
c(Tensor): Shape (N, C_aux, T'). The auxiliary input (e.g. spectrogram). It
is upsampled to match the time resolution of the input.
x(Tensor):
Shape (N, C_in, T), The input waveform.
c(Tensor):
Shape (N, C_aux, T'). The auxiliary input (e.g. spectrogram).
It is upsampled to match the time resolution of the input.
Returns:
Tensor: Shape (N, C_out, T), the generated waveform.
@ -195,8 +213,10 @@ class PWGGenerator(nn.Layer):
"""Waveform generation. This function is used for single instance inference.
Args:
c(Tensor, optional, optional): Shape (T', C_aux), the auxiliary input, by default None
x(Tensor, optional): Shape (T, C_in), the noise waveform, by default None
c(Tensor, optional):
Shape (T', C_aux), the auxiliary input, by default None
x(Tensor, optional):
Shape (T, C_in), the noise waveform, by default None
Returns:
Tensor: Shape (T, C_out), the generated waveform
@ -214,20 +234,28 @@ class PWGDiscriminator(nn.Layer):
"""A convolutional discriminator for audio.
Args:
in_channels (int, optional): Number of channels of the input audio, by default 1
out_channels (int, optional): Output feature size, by default 1
kernel_size (int, optional): Kernel size of convolutional sublayers, by default 3
layers (int, optional): Number of layers, by default 10
conv_channels (int, optional): Feature size of the convolutional sublayers, by default 64
dilation_factor (int, optional): The factor with which dilation of each convolutional sublayers grows
in_channels (int, optional):
Number of channels of the input audio, by default 1
out_channels (int, optional):
Output feature size, by default 1
kernel_size (int, optional):
Kernel size of convolutional sublayers, by default 3
layers (int, optional):
Number of layers, by default 10
conv_channels (int, optional):
Feature size of the convolutional sublayers, by default 64
dilation_factor (int, optional):
The factor with which dilation of each convolutional sublayers grows
exponentially if it is greater than 1, else the dilation of each convolutional sublayers grows linearly,
by default 1
nonlinear_activation (str, optional): The activation after each convolutional sublayer, by default "leakyrelu"
nonlinear_activation_params (Dict[str, Any], optional): The parameters passed to the activation's initializer, by default
{"negative_slope": 0.2}
bias (bool, optional): Whether to use bias in convolutional sublayers, by default True
use_weight_norm (bool, optional): Whether to use weight normalization at all convolutional sublayers,
by default True
nonlinear_activation (str, optional):
The activation after each convolutional sublayer, by default "leakyrelu"
nonlinear_activation_params (Dict[str, Any], optional):
The parameters passed to the activation's initializer, by default {"negative_slope": 0.2}
bias (bool, optional):
Whether to use bias in convolutional sublayers, by default True
use_weight_norm (bool, optional):
Whether to use weight normalization at all convolutional sublayers, by default True
"""
def __init__(
@ -290,7 +318,8 @@ class PWGDiscriminator(nn.Layer):
"""
Args:
x (Tensor): Shape (N, in_channels, num_samples), the input audio.
x (Tensor):
Shape (N, in_channels, num_samples), the input audio.
Returns:
Tensor: Shape (N, out_channels, num_samples), the predicted logits.
@ -318,24 +347,35 @@ class ResidualPWGDiscriminator(nn.Layer):
"""A wavenet-style discriminator for audio.
Args:
in_channels (int, optional): Number of channels of the input audio, by default 1
out_channels (int, optional): Output feature size, by default 1
kernel_size (int, optional): Kernel size of residual blocks, by default 3
layers (int, optional): Number of residual blocks, by default 30
stacks (int, optional): Number of groups of residual blocks, within which the dilation
in_channels (int, optional):
Number of channels of the input audio, by default 1
out_channels (int, optional):
Output feature size, by default 1
kernel_size (int, optional):
Kernel size of residual blocks, by default 3
layers (int, optional):
Number of residual blocks, by default 30
stacks (int, optional):
Number of groups of residual blocks, within which the dilation
of each residual blocks grows exponentially, by default 3
residual_channels (int, optional): Residual channels of residual blocks, by default 64
gate_channels (int, optional): Gate channels of residual blocks, by default 128
skip_channels (int, optional): Skip channels of residual blocks, by default 64
dropout (float, optional): Dropout probability of residual blocks, by default 0.
bias (bool, optional): Whether to use bias in residual blocks, by default True
use_weight_norm (bool, optional): Whether to use weight normalization in all convolutional layers,
by default True
use_causal_conv (bool, optional): Whether to use causal convolution in residual blocks, by default False
nonlinear_activation (str, optional): Activation after convolutions other than those in residual blocks,
by default "leakyrelu"
nonlinear_activation_params (Dict[str, Any], optional): Parameters to pass to the activation,
by default {"negative_slope": 0.2}
residual_channels (int, optional):
Residual channels of residual blocks, by default 64
gate_channels (int, optional):
Gate channels of residual blocks, by default 128
skip_channels (int, optional):
Skip channels of residual blocks, by default 64
dropout (float, optional):
Dropout probability of residual blocks, by default 0.
bias (bool, optional):
Whether to use bias in residual blocks, by default True
use_weight_norm (bool, optional):
Whether to use weight normalization in all convolutional layers, by default True
use_causal_conv (bool, optional):
Whether to use causal convolution in residual blocks, by default False
nonlinear_activation (str, optional):
Activation after convolutions other than those in residual blocks, by default "leakyrelu"
nonlinear_activation_params (Dict[str, Any], optional):
Parameters to pass to the activation, by default {"negative_slope": 0.2}
"""
def __init__(
@ -405,7 +445,8 @@ class ResidualPWGDiscriminator(nn.Layer):
def forward(self, x):
"""
Args:
x(Tensor): Shape (N, in_channels, num_samples), the input audio.
x(Tensor):
Shape (N, in_channels, num_samples), the input audio.
Returns:
Tensor: Shape (N, out_channels, num_samples), the predicted logits.

@ -29,10 +29,14 @@ class ResidualBlock(nn.Layer):
n: int=2):
"""SpeedySpeech encoder module.
Args:
channels (int, optional): Feature size of the residual output(and also the input).
kernel_size (int, optional): Kernel size of the 1D convolution.
dilation (int, optional): Dilation of the 1D convolution.
n (int): Number of blocks.
channels (int, optional):
Feature size of the residual output (and also the input).
kernel_size (int, optional):
Kernel size of the 1D convolution.
dilation (int, optional):
Dilation of the 1D convolution.
n (int):
Number of blocks.
"""
super().__init__()
@ -57,7 +61,8 @@ class ResidualBlock(nn.Layer):
def forward(self, x: paddle.Tensor):
"""Calculate forward propagation.
Args:
x(Tensor): Batch of input sequences (B, hidden_size, Tmax).
x(Tensor):
Batch of input sequences (B, hidden_size, Tmax).
Returns:
Tensor: The residual output (B, hidden_size, Tmax).
"""
@ -89,8 +94,10 @@ class TextEmbedding(nn.Layer):
def forward(self, text: paddle.Tensor, tone: paddle.Tensor=None):
"""Calculate forward propagation.
Args:
text(Tensor(int64)): Batch of padded token ids (B, Tmax).
tones(Tensor, optional(int64)): Batch of padded tone ids (B, Tmax).
text(Tensor(int64)):
Batch of padded token ids (B, Tmax).
tones(Tensor, optional(int64)):
Batch of padded tone ids (B, Tmax).
Returns:
Tensor: The residual output (B, Tmax, embedding_size).
"""
@ -109,12 +116,18 @@ class TextEmbedding(nn.Layer):
class SpeedySpeechEncoder(nn.Layer):
"""SpeedySpeech encoder module.
Args:
vocab_size (int): Dimension of the inputs.
tone_size (Optional[int]): Number of tones.
hidden_size (int): Number of encoder hidden units.
kernel_size (int): Kernel size of encoder.
dilations (List[int]): Dilations of encoder.
spk_num (Optional[int]): Number of speakers.
vocab_size (int):
Dimension of the inputs.
tone_size (Optional[int]):
Number of tones.
hidden_size (int):
Number of encoder hidden units.
kernel_size (int):
Kernel size of encoder.
dilations (List[int]):
Dilations of encoder.
spk_num (Optional[int]):
Number of speakers.
"""
def __init__(self,
@ -161,9 +174,12 @@ class SpeedySpeechEncoder(nn.Layer):
spk_id: paddle.Tensor=None):
"""Encoder input sequence.
Args:
text(Tensor(int64)): Batch of padded token ids (B, Tmax).
tones(Tensor, optional(int64)): Batch of padded tone ids (B, Tmax).
spk_id(Tnesor, optional(int64)): Batch of speaker ids (B,)
text(Tensor(int64)):
Batch of padded token ids (B, Tmax).
tones(Tensor, optional(int64)):
Batch of padded tone ids (B, Tmax).
spk_id(Tensor, optional(int64)):
Batch of speaker ids (B,)
Returns:
Tensor: Output tensor (B, Tmax, hidden_size).
@ -192,7 +208,8 @@ class DurationPredictor(nn.Layer):
def forward(self, x: paddle.Tensor):
"""Calculate forward propagation.
Args:
x(Tensor): Batch of input sequences (B, Tmax, hidden_size).
x(Tensor):
Batch of input sequences (B, Tmax, hidden_size).
Returns:
Tensor: Batch of predicted durations in log domain (B, Tmax).
@ -212,10 +229,14 @@ class SpeedySpeechDecoder(nn.Layer):
]):
"""SpeedySpeech decoder module.
Args:
hidden_size (int): Number of decoder hidden units.
kernel_size (int): Kernel size of decoder.
output_size (int): Dimension of the outputs.
dilations (List[int]): Dilations of decoder.
hidden_size (int):
Number of decoder hidden units.
kernel_size (int):
Kernel size of decoder.
output_size (int):
Dimension of the outputs.
dilations (List[int]):
Dilations of decoder.
"""
super().__init__()
res_blocks = [
@ -230,7 +251,8 @@ class SpeedySpeechDecoder(nn.Layer):
def forward(self, x):
"""Decoder input sequence.
Args:
x(Tensor): Input tensor (B, time, hidden_size).
x(Tensor):
Input tensor (B, time, hidden_size).
Returns:
Tensor: Output tensor (B, time, output_size).
@ -261,18 +283,30 @@ class SpeedySpeech(nn.Layer):
positional_dropout_rate: int=0.1):
"""Initialize SpeedySpeech module.
Args:
vocab_size (int): Dimension of the inputs.
encoder_hidden_size (int): Number of encoder hidden units.
encoder_kernel_size (int): Kernel size of encoder.
encoder_dilations (List[int]): Dilations of encoder.
duration_predictor_hidden_size (int): Number of duration predictor hidden units.
decoder_hidden_size (int): Number of decoder hidden units.
decoder_kernel_size (int): Kernel size of decoder.
decoder_dilations (List[int]): Dilations of decoder.
decoder_output_size (int): Dimension of the outputs.
tone_size (Optional[int]): Number of tones.
spk_num (Optional[int]): Number of speakers.
init_type (str): How to initialize transformer parameters.
vocab_size (int):
Dimension of the inputs.
encoder_hidden_size (int):
Number of encoder hidden units.
encoder_kernel_size (int):
Kernel size of encoder.
encoder_dilations (List[int]):
Dilations of encoder.
duration_predictor_hidden_size (int):
Number of duration predictor hidden units.
decoder_hidden_size (int):
Number of decoder hidden units.
decoder_kernel_size (int):
Kernel size of decoder.
decoder_dilations (List[int]):
Dilations of decoder.
decoder_output_size (int):
Dimension of the outputs.
tone_size (Optional[int]):
Number of tones.
spk_num (Optional[int]):
Number of speakers.
init_type (str):
How to initialize transformer parameters.
"""
super().__init__()
@ -304,14 +338,20 @@ class SpeedySpeech(nn.Layer):
spk_id: paddle.Tensor=None):
"""Calculate forward propagation.
Args:
text(Tensor(int64)): Batch of padded token ids (B, Tmax).
durations(Tensor(int64)): Batch of padded durations (B, Tmax).
tones(Tensor, optional(int64)): Batch of padded tone ids (B, Tmax).
spk_id(Tnesor, optional(int64)): Batch of speaker ids (B,)
text(Tensor(int64)):
Batch of padded token ids (B, Tmax).
durations(Tensor(int64)):
Batch of padded durations (B, Tmax).
tones(Tensor, optional(int64)):
Batch of padded tone ids (B, Tmax).
spk_id(Tensor, optional(int64)):
Batch of speaker ids (B,)
Returns:
Tensor: Output tensor (B, T_frames, decoder_output_size).
Tensor: Predicted durations (B, Tmax).
Tensor:
Output tensor (B, T_frames, decoder_output_size).
Tensor:
Predicted durations (B, Tmax).
"""
# input of embedding must be int64
text = paddle.cast(text, 'int64')
@ -336,10 +376,14 @@ class SpeedySpeech(nn.Layer):
spk_id: paddle.Tensor=None):
"""Generate the sequence of features given the sequences of characters.
Args:
text(Tensor(int64)): Input sequence of characters (T,).
tones(Tensor, optional(int64)): Batch of padded tone ids (T, ).
durations(Tensor, optional (int64)): Groundtruth of duration (T,).
spk_id(Tensor, optional(int64), optional): spk ids (1,). (Default value = None)
text(Tensor(int64)):
Input sequence of characters (T,).
tones(Tensor, optional(int64)):
Batch of padded tone ids (T, ).
durations(Tensor, optional (int64)):
Groundtruth of duration (T,).
spk_id(Tensor, optional(int64)):
spk ids (1,). (Default value = None)
Returns:
Tensor: logmel (T, decoder_output_size).

@ -83,38 +83,67 @@ class Tacotron2(nn.Layer):
init_type: str="xavier_uniform", ):
"""Initialize Tacotron2 module.
Args:
idim (int): Dimension of the inputs.
odim (int): Dimension of the outputs.
embed_dim (int): Dimension of the token embedding.
elayers (int): Number of encoder blstm layers.
eunits (int): Number of encoder blstm units.
econv_layers (int): Number of encoder conv layers.
econv_filts (int): Number of encoder conv filter size.
econv_chans (int): Number of encoder conv filter channels.
dlayers (int): Number of decoder lstm layers.
dunits (int): Number of decoder lstm units.
prenet_layers (int): Number of prenet layers.
prenet_units (int): Number of prenet units.
postnet_layers (int): Number of postnet layers.
postnet_filts (int): Number of postnet filter size.
postnet_chans (int): Number of postnet filter channels.
output_activation (str): Name of activation function for outputs.
adim (int): Number of dimension of mlp in attention.
aconv_chans (int): Number of attention conv filter channels.
aconv_filts (int): Number of attention conv filter size.
cumulate_att_w (bool): Whether to cumulate previous attention weight.
use_batch_norm (bool): Whether to use batch normalization.
use_concate (bool): Whether to concat enc outputs w/ dec lstm outputs.
reduction_factor (int): Reduction factor.
spk_num (Optional[int]): Number of speakers. If set to > 1, assume that the
idim (int):
Dimension of the inputs.
odim (int):
Dimension of the outputs.
embed_dim (int):
Dimension of the token embedding.
elayers (int):
Number of encoder blstm layers.
eunits (int):
Number of encoder blstm units.
econv_layers (int):
Number of encoder conv layers.
econv_filts (int):
Number of encoder conv filter size.
econv_chans (int):
Number of encoder conv filter channels.
dlayers (int):
Number of decoder lstm layers.
dunits (int):
Number of decoder lstm units.
prenet_layers (int):
Number of prenet layers.
prenet_units (int):
Number of prenet units.
postnet_layers (int):
Number of postnet layers.
postnet_filts (int):
Number of postnet filter size.
postnet_chans (int):
Number of postnet filter channels.
output_activation (str):
Name of activation function for outputs.
adim (int):
Number of dimension of mlp in attention.
aconv_chans (int):
Number of attention conv filter channels.
aconv_filts (int):
Number of attention conv filter size.
cumulate_att_w (bool):
Whether to cumulate previous attention weight.
use_batch_norm (bool):
Whether to use batch normalization.
use_concate (bool):
Whether to concat enc outputs w/ dec lstm outputs.
reduction_factor (int):
Reduction factor.
spk_num (Optional[int]):
Number of speakers. If set to > 1, assume that the
sids will be provided as the input and use sid embedding layer.
lang_num (Optional[int]): Number of languages. If set to > 1, assume that the
lang_num (Optional[int]):
Number of languages. If set to > 1, assume that the
lids will be provided as the input and use sid embedding layer.
spk_embed_dim (Optional[int]): Speaker embedding dimension. If set to > 0,
spk_embed_dim (Optional[int]):
Speaker embedding dimension. If set to > 0,
assume that spk_emb will be provided as the input.
spk_embed_integration_type (str): How to integrate speaker embedding.
dropout_rate (float): Dropout rate.
zoneout_rate (float): Zoneout rate.
spk_embed_integration_type (str):
How to integrate speaker embedding.
dropout_rate (float):
Dropout rate.
zoneout_rate (float):
Zoneout rate.
"""
assert check_argument_types()
super().__init__()
@ -230,18 +259,28 @@ class Tacotron2(nn.Layer):
"""Calculate forward propagation.
Args:
text (Tensor(int64)): Batch of padded character ids (B, T_text).
text_lengths (Tensor(int64)): Batch of lengths of each input batch (B,).
speech (Tensor): Batch of padded target features (B, T_feats, odim).
speech_lengths (Tensor(int64)): Batch of the lengths of each target (B,).
spk_emb (Optional[Tensor]): Batch of speaker embeddings (B, spk_embed_dim).
spk_id (Optional[Tensor]): Batch of speaker IDs (B, 1).
lang_id (Optional[Tensor]): Batch of language IDs (B, 1).
text (Tensor(int64)):
Batch of padded character ids (B, T_text).
text_lengths (Tensor(int64)):
Batch of lengths of each input batch (B,).
speech (Tensor):
Batch of padded target features (B, T_feats, odim).
speech_lengths (Tensor(int64)):
Batch of the lengths of each target (B,).
spk_emb (Optional[Tensor]):
Batch of speaker embeddings (B, spk_embed_dim).
spk_id (Optional[Tensor]):
Batch of speaker IDs (B, 1).
lang_id (Optional[Tensor]):
Batch of language IDs (B, 1).
Returns:
Tensor: Loss scalar value.
Dict: Statistics to be monitored.
Tensor: Weight value if not joint training else model outputs.
Tensor:
Loss scalar value.
Dict:
Statistics to be monitored.
Tensor:
Weight value if not joint training else model outputs.
"""
text = text[:, :text_lengths.max()]
@ -329,18 +368,30 @@ class Tacotron2(nn.Layer):
"""Generate the sequence of features given the sequences of characters.
Args:
text (Tensor(int64)): Input sequence of characters (T_text,).
speech (Optional[Tensor]): Feature sequence to extract style (N, idim).
spk_emb (ptional[Tensor]): Speaker embedding (spk_embed_dim,).
spk_id (Optional[Tensor]): Speaker ID (1,).
lang_id (Optional[Tensor]): Language ID (1,).
threshold (float): Threshold in inference.
minlenratio (float): Minimum length ratio in inference.
maxlenratio (float): Maximum length ratio in inference.
use_att_constraint (bool): Whether to apply attention constraint.
backward_window (int): Backward window in attention constraint.
forward_window (int): Forward window in attention constraint.
use_teacher_forcing (bool): Whether to use teacher forcing.
text (Tensor(int64)):
Input sequence of characters (T_text,).
speech (Optional[Tensor]):
Feature sequence to extract style (N, idim).
spk_emb (Optional[Tensor]):
Speaker embedding (spk_embed_dim,).
spk_id (Optional[Tensor]):
Speaker ID (1,).
lang_id (Optional[Tensor]):
Language ID (1,).
threshold (float):
Threshold in inference.
minlenratio (float):
Minimum length ratio in inference.
maxlenratio (float):
Maximum length ratio in inference.
use_att_constraint (bool):
Whether to apply attention constraint.
backward_window (int):
Backward window in attention constraint.
forward_window (int):
Forward window in attention constraint.
use_teacher_forcing (bool):
Whether to use teacher forcing.
Returns:
Dict[str, Tensor]

@ -49,66 +49,124 @@ class TransformerTTS(nn.Layer):
https://arxiv.org/pdf/1809.08895.pdf
Args:
idim (int): Dimension of the inputs.
odim (int): Dimension of the outputs.
embed_dim (int, optional): Dimension of character embedding.
eprenet_conv_layers (int, optional): Number of encoder prenet convolution layers.
eprenet_conv_chans (int, optional): Number of encoder prenet convolution channels.
eprenet_conv_filts (int, optional): Filter size of encoder prenet convolution.
dprenet_layers (int, optional): Number of decoder prenet layers.
dprenet_units (int, optional): Number of decoder prenet hidden units.
elayers (int, optional): Number of encoder layers.
eunits (int, optional): Number of encoder hidden units.
adim (int, optional): Number of attention transformation dimensions.
aheads (int, optional): Number of heads for multi head attention.
dlayers (int, optional): Number of decoder layers.
dunits (int, optional): Number of decoder hidden units.
postnet_layers (int, optional): Number of postnet layers.
postnet_chans (int, optional): Number of postnet channels.
postnet_filts (int, optional): Filter size of postnet.
use_scaled_pos_enc (pool, optional): Whether to use trainable scaled positional encoding.
use_batch_norm (bool, optional): Whether to use batch normalization in encoder prenet.
encoder_normalize_before (bool, optional): Whether to perform layer normalization before encoder block.
decoder_normalize_before (bool, optional): Whether to perform layer normalization before decoder block.
encoder_concat_after (bool, optional): Whether to concatenate attention layer's input and output in encoder.
decoder_concat_after (bool, optional): Whether to concatenate attention layer's input and output in decoder.
positionwise_layer_type (str, optional): Position-wise operation type.
positionwise_conv_kernel_size (int, optional): Kernel size in position wise conv 1d.
reduction_factor (int, optional): Reduction factor.
spk_embed_dim (int, optional): Number of speaker embedding dimenstions.
spk_embed_integration_type (str, optional): How to integrate speaker embedding.
use_gst (str, optional): Whether to use global style token.
gst_tokens (int, optional): The number of GST embeddings.
gst_heads (int, optional): The number of heads in GST multihead attention.
gst_conv_layers (int, optional): The number of conv layers in GST.
gst_conv_chans_list (Sequence[int], optional): List of the number of channels of conv layers in GST.
gst_conv_kernel_size (int, optional): Kernal size of conv layers in GST.
gst_conv_stride (int, optional): Stride size of conv layers in GST.
gst_gru_layers (int, optional): The number of GRU layers in GST.
gst_gru_units (int, optional): The number of GRU units in GST.
transformer_lr (float, optional): Initial value of learning rate.
transformer_warmup_steps (int, optional): Optimizer warmup steps.
transformer_enc_dropout_rate (float, optional): Dropout rate in encoder except attention and positional encoding.
transformer_enc_positional_dropout_rate (float, optional): Dropout rate after encoder positional encoding.
transformer_enc_attn_dropout_rate float, optional): Dropout rate in encoder self-attention module.
transformer_dec_dropout_rate (float, optional): Dropout rate in decoder except attention & positional encoding.
transformer_dec_positional_dropout_rate (float, optional): Dropout rate after decoder positional encoding.
transformer_dec_attn_dropout_rate float, optional): Dropout rate in deocoder self-attention module.
transformer_enc_dec_attn_dropout_rate (float, optional): Dropout rate in encoder-deocoder attention module.
init_type (str, optional): How to initialize transformer parameters.
init_enc_alpha float, optional: Initial value of alpha in scaled pos encoding of the encoder.
init_dec_alpha (float, optional): Initial value of alpha in scaled pos encoding of the decoder.
eprenet_dropout_rate (float, optional): Dropout rate in encoder prenet.
dprenet_dropout_rate (float, optional): Dropout rate in decoder prenet.
postnet_dropout_rate (float, optional): Dropout rate in postnet.
use_masking (bool, optional): Whether to apply masking for padded part in loss calculation.
use_weighted_masking (bool, optional): Whether to apply weighted masking in loss calculation.
bce_pos_weight (float, optional): Positive sample weight in bce calculation (only for use_masking=true).
loss_type (str, optional): How to calculate loss.
use_guided_attn_loss (bool, optional): Whether to use guided attention loss.
num_heads_applied_guided_attn (int, optional): Number of heads in each layer to apply guided attention loss.
num_layers_applied_guided_attn (int, optional): Number of layers to apply guided attention loss.
List of module names to apply guided attention loss.
idim (int):
Dimension of the inputs.
odim (int):
Dimension of the outputs.
embed_dim (int, optional):
Dimension of character embedding.
eprenet_conv_layers (int, optional):
Number of encoder prenet convolution layers.
eprenet_conv_chans (int, optional):
Number of encoder prenet convolution channels.
eprenet_conv_filts (int, optional):
Filter size of encoder prenet convolution.
dprenet_layers (int, optional):
Number of decoder prenet layers.
dprenet_units (int, optional):
Number of decoder prenet hidden units.
elayers (int, optional):
Number of encoder layers.
eunits (int, optional):
Number of encoder hidden units.
adim (int, optional):
Number of attention transformation dimensions.
aheads (int, optional):
Number of heads for multi head attention.
dlayers (int, optional):
Number of decoder layers.
dunits (int, optional):
Number of decoder hidden units.
postnet_layers (int, optional):
Number of postnet layers.
postnet_chans (int, optional):
Number of postnet channels.
postnet_filts (int, optional):
Filter size of postnet.
use_scaled_pos_enc (bool, optional):
Whether to use trainable scaled positional encoding.
use_batch_norm (bool, optional):
Whether to use batch normalization in encoder prenet.
encoder_normalize_before (bool, optional):
Whether to perform layer normalization before encoder block.
decoder_normalize_before (bool, optional):
Whether to perform layer normalization before decoder block.
encoder_concat_after (bool, optional):
Whether to concatenate attention layer's input and output in encoder.
decoder_concat_after (bool, optional):
Whether to concatenate attention layer's input and output in decoder.
positionwise_layer_type (str, optional):
Position-wise operation type.
positionwise_conv_kernel_size (int, optional):
Kernel size in position wise conv 1d.
reduction_factor (int, optional):
Reduction factor.
spk_embed_dim (int, optional):
Number of speaker embedding dimensions.
spk_embed_integration_type (str, optional):
How to integrate speaker embedding.
use_gst (str, optional):
Whether to use global style token.
gst_tokens (int, optional):
The number of GST embeddings.
gst_heads (int, optional):
The number of heads in GST multihead attention.
gst_conv_layers (int, optional):
The number of conv layers in GST.
gst_conv_chans_list (Sequence[int], optional):
List of the number of channels of conv layers in GST.
gst_conv_kernel_size (int, optional):
Kernel size of conv layers in GST.
gst_conv_stride (int, optional):
Stride size of conv layers in GST.
gst_gru_layers (int, optional):
The number of GRU layers in GST.
gst_gru_units (int, optional):
The number of GRU units in GST.
transformer_lr (float, optional):
Initial value of learning rate.
transformer_warmup_steps (int, optional):
Optimizer warmup steps.
transformer_enc_dropout_rate (float, optional):
Dropout rate in encoder except attention and positional encoding.
transformer_enc_positional_dropout_rate (float, optional):
Dropout rate after encoder positional encoding.
transformer_enc_attn_dropout_rate (float, optional):
Dropout rate in encoder self-attention module.
transformer_dec_dropout_rate (float, optional):
Dropout rate in decoder except attention & positional encoding.
transformer_dec_positional_dropout_rate (float, optional):
Dropout rate after decoder positional encoding.
transformer_dec_attn_dropout_rate (float, optional):
    Dropout rate in decoder self-attention module.
transformer_enc_dec_attn_dropout_rate (float, optional):
Dropout rate in encoder-decoder attention module.
init_type (str, optional):
How to initialize transformer parameters.
init_enc_alpha (float, optional):
Initial value of alpha in scaled pos encoding of the encoder.
init_dec_alpha (float, optional):
Initial value of alpha in scaled pos encoding of the decoder.
eprenet_dropout_rate (float, optional):
Dropout rate in encoder prenet.
dprenet_dropout_rate (float, optional):
Dropout rate in decoder prenet.
postnet_dropout_rate (float, optional):
Dropout rate in postnet.
use_masking (bool, optional):
Whether to apply masking for padded part in loss calculation.
use_weighted_masking (bool, optional):
Whether to apply weighted masking in loss calculation.
bce_pos_weight (float, optional):
Positive sample weight in bce calculation (only for use_masking=true).
loss_type (str, optional):
How to calculate loss.
use_guided_attn_loss (bool, optional):
Whether to use guided attention loss.
num_heads_applied_guided_attn (int, optional):
Number of heads in each layer to apply guided attention loss.
num_layers_applied_guided_attn (int, optional):
Number of layers to apply guided attention loss.
"""
def __init__(

@ -33,8 +33,10 @@ def fold(x, n_group):
"""Fold audio or spectrogram's temporal dimension in to groups.
Args:
x(Tensor): The input tensor. shape=(*, time_steps)
n_group(int): The size of a group.
x(Tensor):
The input tensor. shape=(*, time_steps)
n_group(int):
The size of a group.
Returns:
Tensor: Folded tensor. shape=(*, time_steps // n_group, group)
@ -53,7 +55,8 @@ class UpsampleNet(nn.LayerList):
on mel and time dimension.
Args:
upscale_factors(List[int], optional): Time upsampling factors for each Conv2DTranspose Layer.
upscale_factors(List[int], optional):
Time upsampling factors for each Conv2DTranspose Layer.
The ``UpsampleNet`` contains ``len(upscale_factor)`` Conv2DTranspose
Layers. Each upscale_factor is used as the ``stride`` for the
corresponding Conv2DTranspose. Defaults to [16, 16], this the default
@ -94,8 +97,10 @@ class UpsampleNet(nn.LayerList):
"""Forward pass of the ``UpsampleNet``
Args:
x(Tensor): The input spectrogram. shape=(batch_size, input_channels, time_steps)
trim_conv_artifact(bool, optional, optional): Trim deconvolution artifact at each layer. Defaults to False.
x(Tensor):
The input spectrogram. shape=(batch_size, input_channels, time_steps)
trim_conv_artifact(bool, optional, optional):
Trim deconvolution artifact at each layer. Defaults to False.
Returns:
Tensor: The upsampled spectrogram. shape=(batch_size, input_channels, time_steps * upsample_factor)
@ -123,10 +128,14 @@ class ResidualBlock(nn.Layer):
and output.
Args:
channels (int): Feature size of the input.
cond_channels (int): Featuer size of the condition.
kernel_size (Tuple[int]): Kernel size of the Convolution2d applied to the input.
dilations (int): Dilations of the Convolution2d applied to the input.
channels (int):
Feature size of the input.
cond_channels (int):
Feature size of the condition.
kernel_size (Tuple[int]):
Kernel size of the Convolution2d applied to the input.
dilations (int):
Dilations of the Convolution2d applied to the input.
"""
def __init__(self, channels, cond_channels, kernel_size, dilations):
@ -173,12 +182,16 @@ class ResidualBlock(nn.Layer):
"""Compute output for a whole folded sequence.
Args:
x (Tensor): The input. [shape=(batch_size, channel, height, width)]
condition (Tensor [shape=(batch_size, condition_channel, height, width)]): The local condition.
x (Tensor):
The input. [shape=(batch_size, channel, height, width)]
condition (Tensor [shape=(batch_size, condition_channel, height, width)]):
The local condition.
Returns:
res (Tensor): The residual output. [shape=(batch_size, channel, height, width)]
skip (Tensor): The skip output. [shape=(batch_size, channel, height, width)]
res (Tensor):
The residual output. [shape=(batch_size, channel, height, width)]
skip (Tensor):
The skip output. [shape=(batch_size, channel, height, width)]
"""
x_in = x
x = self.conv(x)
@ -216,12 +229,16 @@ class ResidualBlock(nn.Layer):
"""Compute the output for a row and update the buffer.
Args:
x_row (Tensor): A row of the input. shape=(batch_size, channel, 1, width)
condition_row (Tensor): A row of the condition. shape=(batch_size, condition_channel, 1, width)
x_row (Tensor):
A row of the input. shape=(batch_size, channel, 1, width)
condition_row (Tensor):
A row of the condition. shape=(batch_size, condition_channel, 1, width)
Returns:
res (Tensor): A row of the the residual output. shape=(batch_size, channel, 1, width)
skip (Tensor): A row of the skip output. shape=(batch_size, channel, 1, width)
res (Tensor):
A row of the residual output. shape=(batch_size, channel, 1, width)
skip (Tensor):
A row of the skip output. shape=(batch_size, channel, 1, width)
"""
x_row_in = x_row
@ -258,11 +275,16 @@ class ResidualNet(nn.LayerList):
"""A stack of several ResidualBlocks. It merges condition at each layer.
Args:
n_layer (int): Number of ResidualBlocks in the ResidualNet.
residual_channels (int): Feature size of each ResidualBlocks.
condition_channels (int): Feature size of the condition.
kernel_size (Tuple[int]): Kernel size of each ResidualBlock.
dilations_h (List[int]): Dilation in height dimension of every ResidualBlock.
n_layer (int):
Number of ResidualBlocks in the ResidualNet.
residual_channels (int):
Feature size of each ResidualBlocks.
condition_channels (int):
Feature size of the condition.
kernel_size (Tuple[int]):
Kernel size of each ResidualBlock.
dilations_h (List[int]):
Dilation in height dimension of every ResidualBlock.
Raises:
ValueError: If the length of dilations_h does not equals n_layers.
@ -288,11 +310,13 @@ class ResidualNet(nn.LayerList):
"""Compute the output given the input and the condition.
Args:
x (Tensor): The input. shape=(batch_size, channel, height, width)
condition (Tensor): The local condition. shape=(batch_size, condition_channel, height, width)
x (Tensor):
The input. shape=(batch_size, channel, height, width)
condition (Tensor):
The local condition. shape=(batch_size, condition_channel, height, width)
Returns:
Tensor : The output, which is an aggregation of all the skip outputs. shape=(batch_size, channel, height, width)
Tensor: The output, which is an aggregation of all the skip outputs. shape=(batch_size, channel, height, width)
"""
skip_connections = []
@ -312,12 +336,16 @@ class ResidualNet(nn.LayerList):
"""Compute the output for a row and update the buffers.
Args:
x_row (Tensor): A row of the input. shape=(batch_size, channel, 1, width)
condition_row (Tensor): A row of the condition. shape=(batch_size, condition_channel, 1, width)
x_row (Tensor):
A row of the input. shape=(batch_size, channel, 1, width)
condition_row (Tensor):
A row of the condition. shape=(batch_size, condition_channel, 1, width)
Returns:
res (Tensor): A row of the the residual output. shape=(batch_size, channel, 1, width)
skip (Tensor): A row of the skip output. shape=(batch_size, channel, 1, width)
res (Tensor):
A row of the residual output. shape=(batch_size, channel, 1, width)
skip (Tensor):
A row of the skip output. shape=(batch_size, channel, 1, width)
"""
skip_connections = []
@ -337,11 +365,16 @@ class Flow(nn.Layer):
sampling.
Args:
n_layers (int): Number of ResidualBlocks in the Flow.
channels (int): Feature size of the ResidualBlocks.
mel_bands (int): Feature size of the mel spectrogram (mel bands).
kernel_size (Tuple[int]): Kernel size of each ResisualBlocks in the Flow.
n_group (int): Number of timesteps to the folded into a group.
n_layers (int):
Number of ResidualBlocks in the Flow.
channels (int):
Feature size of the ResidualBlocks.
mel_bands (int):
Feature size of the mel spectrogram (mel bands).
kernel_size (Tuple[int]):
Kernel size of each ResidualBlock in the Flow.
n_group (int):
Number of timesteps to be folded into a group.
"""
dilations_dict = {
8: [1, 1, 1, 1, 1, 1, 1, 1],
@ -393,11 +426,14 @@ class Flow(nn.Layer):
a sample from p(X) into a sample from p(Z).
Args:
x (Tensor): A input sample of the distribution p(X). shape=(batch, 1, height, width)
condition (Tensor): The local condition. shape=(batch, condition_channel, height, width)
x (Tensor):
An input sample of the distribution p(X). shape=(batch, 1, height, width)
condition (Tensor):
The local condition. shape=(batch, condition_channel, height, width)
Returns:
z (Tensor): shape(batch, 1, height, width), the transformed sample.
z (Tensor):
shape(batch, 1, height, width), the transformed sample.
Tuple[Tensor, Tensor]:
The parameter of the transformation.
logs (Tensor): shape(batch, 1, height - 1, width), the log scale of the transformation from x to z.
@ -433,8 +469,10 @@ class Flow(nn.Layer):
p(Z) and transform the sample. It is a auto regressive transformation.
Args:
z(Tensor): A sample of the distribution p(Z). shape=(batch, 1, time_steps
condition(Tensor): The local condition. shape=(batch, condition_channel, time_steps)
z(Tensor):
A sample of the distribution p(Z). shape=(batch, 1, time_steps)
condition(Tensor):
The local condition. shape=(batch, condition_channel, time_steps)
Returns:
Tensor:
The transformed sample. shape=(batch, 1, height, width)
@ -462,12 +500,18 @@ class WaveFlow(nn.LayerList):
flows.
Args:
n_flows (int): Number of flows in the WaveFlow model.
n_layers (int): Number of ResidualBlocks in each Flow.
n_group (int): Number of timesteps to fold as a group.
channels (int): Feature size of each ResidualBlock.
mel_bands (int): Feature size of mel spectrogram (mel bands).
kernel_size (Union[int, List[int]]): Kernel size of the convolution layer in each ResidualBlock.
n_flows (int):
Number of flows in the WaveFlow model.
n_layers (int):
Number of ResidualBlocks in each Flow.
n_group (int):
Number of timesteps to fold as a group.
channels (int):
Feature size of each ResidualBlock.
mel_bands (int):
Feature size of mel spectrogram (mel bands).
kernel_size (Union[int, List[int]]):
Kernel size of the convolution layer in each ResidualBlock.
"""
def __init__(self, n_flows, n_layers, n_group, channels, mel_bands,
@ -518,12 +562,16 @@ class WaveFlow(nn.LayerList):
condition.
Args:
x (Tensor): The audio. shape=(batch_size, time_steps)
condition (Tensor): The local condition (mel spectrogram here). shape=(batch_size, condition channel, time_steps)
x (Tensor):
The audio. shape=(batch_size, time_steps)
condition (Tensor):
The local condition (mel spectrogram here). shape=(batch_size, condition channel, time_steps)
Returns:
Tensor: The transformed random variable. shape=(batch_size, time_steps)
Tensor: The log determinant of the jacobian of the transformation from x to z. shape=(1,)
Tensor:
The transformed random variable. shape=(batch_size, time_steps)
Tensor:
The log determinant of the jacobian of the transformation from x to z. shape=(1,)
"""
# x: (B, T)
# condition: (B, C, T) upsampled condition
@ -559,12 +607,13 @@ class WaveFlow(nn.LayerList):
autoregressive manner.
Args:
z (Tensor): A sample of the distribution p(Z). shape=(batch, 1, time_steps
condition (Tensor): The local condition. shape=(batch, condition_channel, time_steps)
z (Tensor):
A sample of the distribution p(Z). shape=(batch, 1, time_steps)
condition (Tensor):
The local condition. shape=(batch, condition_channel, time_steps)
Returns:
Tensor: The transformed sample (audio here). shape=(batch_size, time_steps)
"""
z, condition = self._trim(z, condition)
@ -590,13 +639,20 @@ class ConditionalWaveFlow(nn.LayerList):
"""ConditionalWaveFlow, a UpsampleNet with a WaveFlow model.
Args:
upsample_factors (List[int]): Upsample factors for the upsample net.
n_flows (int): Number of flows in the WaveFlow model.
n_layers (int): Number of ResidualBlocks in each Flow.
n_group (int): Number of timesteps to fold as a group.
channels (int): Feature size of each ResidualBlock.
n_mels (int): Feature size of mel spectrogram (mel bands).
kernel_size (Union[int, List[int]]): Kernel size of the convolution layer in each ResidualBlock.
upsample_factors (List[int]):
Upsample factors for the upsample net.
n_flows (int):
Number of flows in the WaveFlow model.
n_layers (int):
Number of ResidualBlocks in each Flow.
n_group (int):
Number of timesteps to fold as a group.
channels (int):
Feature size of each ResidualBlock.
n_mels (int):
Feature size of mel spectrogram (mel bands).
kernel_size (Union[int, List[int]]):
Kernel size of the convolution layer in each ResidualBlock.
"""
def __init__(self,
@ -622,12 +678,16 @@ class ConditionalWaveFlow(nn.LayerList):
the determinant of the jacobian of the transformation from x to z.
Args:
audio(Tensor): The audio. shape=(B, T)
mel(Tensor): The mel spectrogram. shape=(B, C_mel, T_mel)
audio(Tensor):
The audio. shape=(B, T)
mel(Tensor):
The mel spectrogram. shape=(B, C_mel, T_mel)
Returns:
Tensor: The inversely transformed random variable z (x to z). shape=(B, T)
Tensor: the log of the determinant of the jacobian of the transformation from x to z. shape=(1,)
Tensor:
The inversely transformed random variable z (x to z). shape=(B, T)
Tensor:
The log of the determinant of the jacobian of the transformation from x to z. shape=(1,)
"""
condition = self.encoder(mel)
z, log_det_jacobian = self.decoder(audio, condition)
@ -638,10 +698,12 @@ class ConditionalWaveFlow(nn.LayerList):
"""Generate raw audio given mel spectrogram.
Args:
mel(np.ndarray): Mel spectrogram of an utterance(in log-magnitude). shape=(C_mel, T_mel)
mel(np.ndarray):
Mel spectrogram of an utterance(in log-magnitude). shape=(C_mel, T_mel)
Returns:
Tensor: The synthesized audio, where``T <= T_mel * upsample_factors``. shape=(B, T)
Tensor:
The synthesized audio, where ``T <= T_mel * upsample_factors``. shape=(B, T)
"""
start = time.time()
condition = self.encoder(mel, trim_conv_artifact=True) # (B, C, T)
@ -657,7 +719,8 @@ class ConditionalWaveFlow(nn.LayerList):
"""Generate raw audio given mel spectrogram.
Args:
mel(np.ndarray): Mel spectrogram of an utterance(in log-magnitude). shape=(C_mel, T_mel)
mel(np.ndarray):
Mel spectrogram of an utterance(in log-magnitude). shape=(C_mel, T_mel)
Returns:
np.ndarray: The synthesized audio. shape=(T,)
@ -673,8 +736,10 @@ class ConditionalWaveFlow(nn.LayerList):
"""Build a ConditionalWaveFlow model from a pretrained model.
Args:
config(yacs.config.CfgNode): model configs
checkpoint_path(Path or str): the path of pretrained model checkpoint, without extension name
config(yacs.config.CfgNode):
model configs
checkpoint_path(Path or str):
the path of pretrained model checkpoint, without extension name
Returns:
ConditionalWaveFlow: The model built from the pretrained result.
@ -694,8 +759,8 @@ class WaveFlowLoss(nn.Layer):
"""Criterion of a WaveFlow model.
Args:
sigma (float): The standard deviation of the gaussian noise used in WaveFlow,
by default 1.0.
sigma (float):
The standard deviation of the gaussian noise used in WaveFlow, by default 1.0.
"""
def __init__(self, sigma=1.0):
@ -708,8 +773,10 @@ class WaveFlowLoss(nn.Layer):
log_det_jacobian of transformation from x to z.
Args:
z(Tensor): The transformed random variable (x to z). shape=(B, T)
log_det_jacobian(Tensor): The log of the determinant of the jacobian matrix of the
z(Tensor):
The transformed random variable (x to z). shape=(B, T)
log_det_jacobian(Tensor):
The log of the determinant of the jacobian matrix of the
transformation from x to z. shape=(1,)
Returns:
@ -726,7 +793,8 @@ class ConditionalWaveFlow2Infer(ConditionalWaveFlow):
"""Generate raw audio given mel spectrogram.
Args:
mel (np.ndarray): Mel spectrogram of an utterance(in log-magnitude). shape=(C_mel, T_mel)
mel (np.ndarray):
Mel spectrogram of an utterance(in log-magnitude). shape=(C_mel, T_mel)
Returns:
np.ndarray: The synthesized audio. shape=(T,)

@ -165,19 +165,29 @@ class WaveRNN(nn.Layer):
init_type: str="xavier_uniform", ):
'''
Args:
rnn_dims (int, optional): Hidden dims of RNN Layers.
fc_dims (int, optional): Dims of FC Layers.
bits (int, optional): bit depth of signal.
aux_context_window (int, optional): The context window size of the first convolution applied to the
auxiliary input, by default 2
upsample_scales (List[int], optional): Upsample scales of the upsample network.
aux_channels (int, optional): Auxiliary channel of the residual blocks.
compute_dims (int, optional): Dims of Conv1D in MelResNet.
res_out_dims (int, optional): Dims of output in MelResNet.
res_blocks (int, optional): Number of residual blocks.
mode (str, optional): Output mode of the WaveRNN vocoder.
rnn_dims (int, optional):
Hidden dims of RNN Layers.
fc_dims (int, optional):
Dims of FC Layers.
bits (int, optional):
bit depth of signal.
aux_context_window (int, optional):
The context window size of the first convolution applied to the auxiliary input, by default 2
upsample_scales (List[int], optional):
Upsample scales of the upsample network.
aux_channels (int, optional):
Auxiliary channel of the residual blocks.
compute_dims (int, optional):
Dims of Conv1D in MelResNet.
res_out_dims (int, optional):
Dims of output in MelResNet.
res_blocks (int, optional):
Number of residual blocks.
mode (str, optional):
Output mode of the WaveRNN vocoder.
`MOL` for Mixture of Logistic Distribution, and `RAW` for quantized bits as the model's output.
init_type (str): How to initialize parameters.
init_type (str):
How to initialize parameters.
'''
super().__init__()
self.mode = mode
@ -226,8 +236,10 @@ class WaveRNN(nn.Layer):
def forward(self, x, c):
'''
Args:
x (Tensor): wav sequence, [B, T]
c (Tensor): mel spectrogram [B, C_aux, T']
x (Tensor):
wav sequence, [B, T]
c (Tensor):
mel spectrogram [B, C_aux, T']
T = (T' - 2 * aux_context_window ) * hop_length
Returns:
@ -280,10 +292,14 @@ class WaveRNN(nn.Layer):
gen_display: bool=False):
"""
Args:
c(Tensor): input mels, (T', C_aux)
batched(bool): generate in batch or not
target(int): target number of samples to be generated in each batch entry
overlap(int): number of samples for crossfading between batches
c(Tensor):
input mels, (T', C_aux)
batched(bool):
generate in batch or not
target(int):
target number of samples to be generated in each batch entry
overlap(int):
number of samples for crossfading between batches
mu_law(bool)
Returns:
wav sequence: Output (T' * prod(upsample_scales), out_channels, C_out).
@ -404,7 +420,8 @@ class WaveRNN(nn.Layer):
def pad_tensor(self, x, pad, side='both'):
'''
Args:
x(Tensor): mel, [1, n_frames, 80]
x(Tensor):
mel, [1, n_frames, 80]
pad(int):
side(str, optional): (Default value = 'both')
@ -428,12 +445,15 @@ class WaveRNN(nn.Layer):
Overlap will be used for crossfading in xfade_and_unfold()
Args:
x(Tensor): Upsampled conditioning features. mels or aux
x(Tensor):
Upsampled conditioning features. mels or aux
shape=(1, T, features)
mels: [1, T, 80]
aux: [1, T, 128]
target(int): Target timesteps for each index of batch
overlap(int): Timesteps for both xfade and rnn warmup
target(int):
Target timesteps for each index of batch
overlap(int):
Timesteps for both xfade and rnn warmup
Returns:
Tensor:

@ -42,7 +42,8 @@ class CausalConv1D(nn.Layer):
def forward(self, x):
"""Calculate forward propagation.
Args:
x (Tensor): Input tensor (B, in_channels, T).
x (Tensor):
Input tensor (B, in_channels, T).
Returns:
Tensor: Output tensor (B, out_channels, T).
"""
@ -67,7 +68,8 @@ class CausalConv1DTranspose(nn.Layer):
def forward(self, x):
"""Calculate forward propagation.
Args:
x (Tensor): Input tensor (B, in_channels, T_in).
x (Tensor):
Input tensor (B, in_channels, T_in).
Returns:
Tensor: Output tensor (B, out_channels, T_out).
"""

@ -20,8 +20,10 @@ class ConvolutionModule(nn.Layer):
"""ConvolutionModule in Conformer model.
Args:
channels (int): The number of channels of conv layers.
kernel_size (int): Kernerl size of conv layers.
channels (int):
The number of channels of conv layers.
kernel_size (int):
Kernel size of conv layers.
"""
def __init__(self, channels, kernel_size, activation=nn.ReLU(), bias=True):
@ -59,7 +61,8 @@ class ConvolutionModule(nn.Layer):
"""Compute convolution module.
Args:
x (Tensor): Input tensor (#batch, time, channels).
x (Tensor):
Input tensor (#batch, time, channels).
Returns:
Tensor: Output tensor (#batch, time, channels).
"""

@ -23,25 +23,34 @@ class EncoderLayer(nn.Layer):
"""Encoder layer module.
Args:
size (int): Input dimension.
self_attn (nn.Layer): Self-attention module instance.
size (int):
Input dimension.
self_attn (nn.Layer):
Self-attention module instance.
`MultiHeadedAttention` or `RelPositionMultiHeadedAttention` instance
can be used as the argument.
feed_forward (nn.Layer): Feed-forward module instance.
feed_forward (nn.Layer):
Feed-forward module instance.
`PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance
can be used as the argument.
feed_forward_macaron (nn.Layer): Additional feed-forward module instance.
feed_forward_macaron (nn.Layer):
Additional feed-forward module instance.
`PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance
can be used as the argument.
conv_module (nn.Layer): Convolution module instance.
conv_module (nn.Layer):
Convolution module instance.
`ConvlutionModule` instance can be used as the argument.
dropout_rate (float): Dropout rate.
normalize_before (bool): Whether to use layer_norm before the first block.
concat_after (bool): Whether to concat attention layer's input and output.
dropout_rate (float):
Dropout rate.
normalize_before (bool):
Whether to use layer_norm before the first block.
concat_after (bool):
Whether to concat attention layer's input and output.
if True, additional linear will be applied.
i.e. x -> x + linear(concat(x, att(x)))
if False, no additional linear will be applied. i.e. x -> x + att(x)
stochastic_depth_rate (float): Proability to skip this layer.
stochastic_depth_rate (float):
Probability to skip this layer.
During training, the layer may skip residual computation and return input
as-is with given probability.
"""
@ -86,15 +95,19 @@ class EncoderLayer(nn.Layer):
"""Compute encoded features.
Args:
x_input(Union[Tuple, Tensor]): Input tensor w/ or w/o pos emb.
x_input(Union[Tuple, Tensor]):
Input tensor w/ or w/o pos emb.
- w/ pos emb: Tuple of tensors [(#batch, time, size), (1, time, size)].
- w/o pos emb: Tensor (#batch, time, size).
mask(Tensor): Mask tensor for the input (#batch, time).
mask(Tensor):
Mask tensor for the input (#batch, time).
cache (Tensor):
Returns:
Tensor: Output tensor (#batch, time, size).
Tensor: Mask tensor (#batch, time).
Tensor:
Output tensor (#batch, time, size).
Tensor:
Mask tensor (#batch, time).
"""
if isinstance(x_input, tuple):
x, pos_emb = x_input[0], x_input[1]

@ -42,13 +42,19 @@ class Conv1dCell(nn.Conv1D):
class.
Args:
in_channels (int): The feature size of the input.
out_channels (int): The feature size of the output.
kernel_size (int or Tuple[int]): The size of the kernel.
dilation (int or Tuple[int]): The dilation of the convolution, by default 1
weight_attr (ParamAttr, Initializer, str or bool, optional) : The parameter attribute of the convolution kernel,
in_channels (int):
The feature size of the input.
out_channels (int):
The feature size of the output.
kernel_size (int or Tuple[int]):
The size of the kernel.
dilation (int or Tuple[int]):
The dilation of the convolution, by default 1
weight_attr (ParamAttr, Initializer, str or bool, optional):
The parameter attribute of the convolution kernel,
by default None.
bias_attr (ParamAttr, Initializer, str or bool, optional):The parameter attribute of the bias.
bias_attr (ParamAttr, Initializer, str or bool, optional):
The parameter attribute of the bias.
If ``False``, this layer does not have a bias, by default None.
Examples:
@ -122,7 +128,8 @@ class Conv1dCell(nn.Conv1D):
"""Initialize the buffer for the step input.
Args:
x_t (Tensor): The step input. shape=(batch_size, in_channels)
x_t (Tensor):
The step input. shape=(batch_size, in_channels)
"""
batch_size, _ = x_t.shape
@ -134,7 +141,8 @@ class Conv1dCell(nn.Conv1D):
"""Shift the buffer by one step.
Args:
x_t (Tensor): The step input. shape=(batch_size, in_channels)
x_t (Tensor):
The step input. shape=(batch_size, in_channels)
"""
self._buffer = paddle.concat(
@ -144,10 +152,12 @@ class Conv1dCell(nn.Conv1D):
"""Add step input and compute step output.
Args:
x_t (Tensor): The step input. shape=(batch_size, in_channels)
x_t (Tensor):
The step input. shape=(batch_size, in_channels)
Returns:
y_t (Tensor): The step output. shape=(batch_size, out_channels)
y_t (Tensor):
The step output. shape=(batch_size, out_channels)
"""
batch_size = x_t.shape[0]
@ -173,10 +183,14 @@ class Conv1dBatchNorm(nn.Layer):
"""A Conv1D Layer followed by a BatchNorm1D.
Args:
in_channels (int): The feature size of the input.
out_channels (int): The feature size of the output.
kernel_size (int): The size of the convolution kernel.
stride (int, optional): The stride of the convolution, by default 1.
in_channels (int):
The feature size of the input.
out_channels (int):
The feature size of the output.
kernel_size (int):
The size of the convolution kernel.
stride (int, optional):
The stride of the convolution, by default 1.
padding (int, str or Tuple[int], optional):
The padding of the convolution.
If int, a symmetrical padding is applied before convolution;
@ -189,9 +203,12 @@ class Conv1dBatchNorm(nn.Layer):
bias_attr (ParamAttr, Initializer, str or bool, optional):
The parameter attribute of the bias of the convolution,
by default None.
data_format (str ["NCL" or "NLC"], optional): The data layout of the input, by default "NCL"
momentum (float, optional): The momentum of the BatchNorm1D layer, by default 0.9
epsilon (float, optional): The epsilon of the BatchNorm1D layer, by default 1e-05
data_format (str ["NCL" or "NLC"], optional):
The data layout of the input, by default "NCL"
momentum (float, optional):
The momentum of the BatchNorm1D layer, by default 0.9
epsilon (float, optional):
The epsilon of the BatchNorm1D layer, by default 1e-05
"""
def __init__(self,
@ -225,12 +242,13 @@ class Conv1dBatchNorm(nn.Layer):
"""Forward pass of the Conv1dBatchNorm layer.
Args:
x (Tensor): The input tensor. Its data layout depends on ``data_format``.
shape=(B, C_in, T_in) or (B, T_in, C_in)
x (Tensor):
The input tensor. Its data layout depends on ``data_format``.
shape=(B, C_in, T_in) or (B, T_in, C_in)
Returns:
Tensor: The output tensor.
shape=(B, C_out, T_out) or (B, T_out, C_out)
Tensor:
The output tensor. shape=(B, C_out, T_out) or (B, T_out, C_out)
"""
x = self.conv(x)

@ -19,8 +19,10 @@ def shuffle_dim(x, axis, perm=None):
"""Permute input tensor along axis given the permutation or randomly.
Args:
x (Tensor): The input tensor.
axis (int): The axis to shuffle.
x (Tensor):
The input tensor.
axis (int):
The axis to shuffle.
perm (List[int], ndarray, optional):
The order to reorder the tensor along the ``axis``-th dimension.
It is a permutation of ``[0, d)``, where d is the size of the

@ -19,8 +19,10 @@ from paddle import nn
class LayerNorm(nn.LayerNorm):
"""Layer normalization module.
Args:
nout (int): Output dim size.
dim (int): Dimension to be normalized.
nout (int):
Output dim size.
dim (int):
Dimension to be normalized.
"""
def __init__(self, nout, dim=-1):
@ -32,7 +34,8 @@ class LayerNorm(nn.LayerNorm):
"""Apply layer normalization.
Args:
x (Tensor):Input tensor.
x (Tensor):
Input tensor.
Returns:
Tensor: Normalized tensor.

@ -269,8 +269,10 @@ class GuidedAttentionLoss(nn.Layer):
"""Make masks indicating non-padded part.
Args:
ilens(Tensor(int64) or List): Batch of lengths (B,).
olens(Tensor(int64) or List): Batch of lengths (B,).
ilens(Tensor(int64) or List):
Batch of lengths (B,).
olens(Tensor(int64) or List):
Batch of lengths (B,).
Returns:
Tensor: Mask tensor indicating non-padded part.
@ -322,9 +324,12 @@ class GuidedMultiHeadAttentionLoss(GuidedAttentionLoss):
"""Calculate forward propagation.
Args:
att_ws(Tensor): Batch of multi head attention weights (B, H, T_max_out, T_max_in).
ilens(Tensor): Batch of input lenghts (B,).
olens(Tensor): Batch of output lenghts (B,).
att_ws(Tensor):
Batch of multi head attention weights (B, H, T_max_out, T_max_in).
ilens(Tensor):
Batch of input lengths (B,).
olens(Tensor):
Batch of output lengths (B,).
Returns:
Tensor: Guided attention loss value.
@ -354,9 +359,12 @@ class Tacotron2Loss(nn.Layer):
"""Initialize Tacotron2 loss module.
Args:
use_masking (bool): Whether to apply masking for padded part in loss calculation.
use_weighted_masking (bool): Whether to apply weighted masking in loss calculation.
bce_pos_weight (float): Weight of positive sample of stop token.
use_masking (bool):
Whether to apply masking for padded part in loss calculation.
use_weighted_masking (bool):
Whether to apply weighted masking in loss calculation.
bce_pos_weight (float):
Weight of positive sample of stop token.
"""
super().__init__()
assert (use_masking != use_weighted_masking) or not use_masking
@ -374,17 +382,25 @@ class Tacotron2Loss(nn.Layer):
"""Calculate forward propagation.
Args:
after_outs(Tensor): Batch of outputs after postnets (B, Lmax, odim).
before_outs(Tensor): Batch of outputs before postnets (B, Lmax, odim).
logits(Tensor): Batch of stop logits (B, Lmax).
ys(Tensor): Batch of padded target features (B, Lmax, odim).
stop_labels(Tensor(int64)): Batch of the sequences of stop token labels (B, Lmax).
after_outs(Tensor):
Batch of outputs after postnets (B, Lmax, odim).
before_outs(Tensor):
Batch of outputs before postnets (B, Lmax, odim).
logits(Tensor):
Batch of stop logits (B, Lmax).
ys(Tensor):
Batch of padded target features (B, Lmax, odim).
stop_labels(Tensor(int64)):
Batch of the sequences of stop token labels (B, Lmax).
olens(Tensor(int64)):
Returns:
Tensor: L1 loss value.
Tensor: Mean square error loss value.
Tensor: Binary cross entropy loss value.
Tensor:
L1 loss value.
Tensor:
Mean square error loss value.
Tensor:
Binary cross entropy loss value.
"""
# make mask and apply it
if self.use_masking:
@ -437,16 +453,24 @@ def stft(x,
pad_mode='reflect'):
"""Perform STFT and convert to magnitude spectrogram.
Args:
x(Tensor): Input signal tensor (B, T).
fft_size(int): FFT size.
hop_size(int): Hop size.
win_length(int, optional): window : str, optional (Default value = None)
window(str, optional): Name of window function, see `scipy.signal.get_window` for more
details. Defaults to "hann".
center(bool, optional, optional): center (bool, optional): Whether to pad `x` to make that the
x(Tensor):
Input signal tensor (B, T).
fft_size(int):
FFT size.
hop_size(int):
Hop size.
win_length(int, optional):
Window length. (Default value = None)
window(str, optional):
Name of window function, see `scipy.signal.get_window` for more details. Defaults to "hann".
center(bool, optional):
Whether to pad `x` to make that the
:math:`t \times hop\\_length` at the center of :math:`t`-th frame. Default: `True`.
pad_mode(str, optional, optional): (Default value = 'reflect')
hop_length: (Default value = None)
pad_mode(str, optional, optional):
(Default value = 'reflect')
hop_length:
(Default value = None)
Returns:
Tensor: Magnitude spectrogram (B, #frames, fft_size // 2 + 1).
@ -480,8 +504,10 @@ class SpectralConvergenceLoss(nn.Layer):
def forward(self, x_mag, y_mag):
"""Calculate forward propagation.
Args:
x_mag (Tensor): Magnitude spectrogram of predicted signal (B, #frames, #freq_bins).
y_mag (Tensor): Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins).
x_mag (Tensor):
Magnitude spectrogram of predicted signal (B, #frames, #freq_bins).
y_mag (Tensor):
Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins).
Returns:
Tensor: Spectral convergence loss value.
"""
@ -501,8 +527,10 @@ class LogSTFTMagnitudeLoss(nn.Layer):
def forward(self, x_mag, y_mag):
"""Calculate forward propagation.
Args:
x_mag (Tensor): Magnitude spectrogram of predicted signal (B, #frames, #freq_bins).
y_mag (Tensor): Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins).
x_mag (Tensor):
Magnitude spectrogram of predicted signal (B, #frames, #freq_bins).
y_mag (Tensor):
Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins).
Returns:
Tensor: Log STFT magnitude loss value.
"""
@ -531,11 +559,15 @@ class STFTLoss(nn.Layer):
def forward(self, x, y):
"""Calculate forward propagation.
Args:
x (Tensor): Predicted signal (B, T).
y (Tensor): Groundtruth signal (B, T).
x (Tensor):
Predicted signal (B, T).
y (Tensor):
Groundtruth signal (B, T).
Returns:
Tensor: Spectral convergence loss value.
Tensor: Log STFT magnitude loss value.
Tensor:
Spectral convergence loss value.
Tensor:
Log STFT magnitude loss value.
"""
x_mag = stft(x, self.fft_size, self.shift_size, self.win_length,
self.window)
@ -558,10 +590,14 @@ class MultiResolutionSTFTLoss(nn.Layer):
window="hann", ):
"""Initialize Multi resolution STFT loss module.
Args:
fft_sizes (list): List of FFT sizes.
hop_sizes (list): List of hop sizes.
win_lengths (list): List of window lengths.
window (str): Window function type.
fft_sizes (list):
List of FFT sizes.
hop_sizes (list):
List of hop sizes.
win_lengths (list):
List of window lengths.
window (str):
Window function type.
"""
super().__init__()
assert len(fft_sizes) == len(hop_sizes) == len(win_lengths)
@ -573,11 +609,15 @@ class MultiResolutionSTFTLoss(nn.Layer):
"""Calculate forward propagation.
Args:
x (Tensor): Predicted signal (B, T) or (B, #subband, T).
y (Tensor): Groundtruth signal (B, T) or (B, #subband, T).
x (Tensor):
Predicted signal (B, T) or (B, #subband, T).
y (Tensor):
Groundtruth signal (B, T) or (B, #subband, T).
Returns:
Tensor: Multi resolution spectral convergence loss value.
Tensor: Multi resolution log STFT magnitude loss value.
Tensor:
Multi resolution spectral convergence loss value.
Tensor:
Multi resolution log STFT magnitude loss value.
"""
if len(x.shape) == 3:
# (B, C, T) -> (B x C, T)
@ -615,9 +655,11 @@ class GeneratorAdversarialLoss(nn.Layer):
def forward(self, outputs):
"""Calculate generator adversarial loss.
Args:
outputs (Tensor or List): Discriminator outputs or list of discriminator outputs.
outputs (Tensor or List):
Discriminator outputs or list of discriminator outputs.
Returns:
Tensor: Generator adversarial loss value.
Tensor:
Generator adversarial loss value.
"""
if isinstance(outputs, (tuple, list)):
adv_loss = 0.0
@ -659,13 +701,15 @@ class DiscriminatorAdversarialLoss(nn.Layer):
"""Calculate discriminator adversarial loss.
Args:
outputs_hat (Tensor or list): Discriminator outputs or list of
discriminator outputs calculated from generator outputs.
outputs (Tensor or list): Discriminator outputs or list of
discriminator outputs calculated from groundtruth.
outputs_hat (Tensor or list):
Discriminator outputs or list of discriminator outputs calculated from generator outputs.
outputs (Tensor or list):
Discriminator outputs or list of discriminator outputs calculated from groundtruth.
Returns:
Tensor: Discriminator real loss value.
Tensor: Discriminator fake loss value.
Tensor:
Discriminator real loss value.
Tensor:
Discriminator fake loss value.
"""
if isinstance(outputs, (tuple, list)):
real_loss = 0.0
@ -766,9 +810,12 @@ def masked_l1_loss(prediction, target, mask):
"""Compute masked L1 loss.
Args:
prediction(Tensor): The prediction.
target(Tensor): The target. The shape should be broadcastable to ``prediction``.
mask(Tensor): The mask. The shape should be broadcatable to the broadcasted shape of
prediction(Tensor):
The prediction.
target(Tensor):
The target. The shape should be broadcastable to ``prediction``.
mask(Tensor):
The mask. The shape should be broadcastable to the broadcasted shape of
``prediction`` and ``target``.
Returns:
@ -916,8 +963,10 @@ class MelSpectrogramLoss(nn.Layer):
def forward(self, y_hat, y):
"""Calculate Mel-spectrogram loss.
Args:
y_hat(Tensor): Generated single tensor (B, 1, T).
y(Tensor): Groundtruth single tensor (B, 1, T).
y_hat(Tensor):
Generated single tensor (B, 1, T).
y(Tensor):
Groundtruth single tensor (B, 1, T).
Returns:
Tensor: Mel-spectrogram loss value.
@ -947,9 +996,11 @@ class FeatureMatchLoss(nn.Layer):
"""Calculate feature matching loss.
Args:
feats_hat(list): List of list of discriminator outputs
feats_hat(list):
List of list of discriminator outputs
calculated from generator outputs.
feats(list): List of list of discriminator outputs
feats(list):
List of list of discriminator outputs calculated from groundtruth.
Returns:
Tensor: Feature matching loss value.
@ -986,11 +1037,16 @@ class KLDivergenceLoss(nn.Layer):
"""Calculate KL divergence loss.
Args:
z_p (Tensor): Flow hidden representation (B, H, T_feats).
logs_q (Tensor): Posterior encoder projected scale (B, H, T_feats).
m_p (Tensor): Expanded text encoder projected mean (B, H, T_feats).
logs_p (Tensor): Expanded text encoder projected scale (B, H, T_feats).
z_mask (Tensor): Mask tensor (B, 1, T_feats).
z_p (Tensor):
Flow hidden representation (B, H, T_feats).
logs_q (Tensor):
Posterior encoder projected scale (B, H, T_feats).
m_p (Tensor):
Expanded text encoder projected mean (B, H, T_feats).
logs_p (Tensor):
Expanded text encoder projected scale (B, H, T_feats).
z_mask (Tensor):
Mask tensor (B, 1, T_feats).
Returns:
Tensor: KL divergence loss.

@ -25,8 +25,10 @@ def pad_list(xs, pad_value):
"""Perform padding for the list of tensors.
Args:
xs (List[Tensor]): List of Tensors [(T_1, `*`), (T_2, `*`), ..., (T_B, `*`)].
pad_value (float): Value for padding.
xs (List[Tensor]):
List of Tensors [(T_1, `*`), (T_2, `*`), ..., (T_B, `*`)].
pad_value (float):
Value for padding.
Returns:
Tensor: Padded tensor (B, Tmax, `*`).
@ -55,10 +57,13 @@ def make_pad_mask(lengths, xs=None, length_dim=-1):
"""Make mask tensor containing indices of padded part.
Args:
lengths (Tensor(int64)): Batch of lengths (B,).
xs (Tensor, optional): The reference tensor.
lengths (Tensor(int64)):
Batch of lengths (B,).
xs (Tensor, optional):
The reference tensor.
If set, masks will be the same shape as this tensor.
length_dim (int, optional): Dimension indicator of the above tensor.
length_dim (int, optional):
Dimension indicator of the above tensor.
See the example.
Returns:
@ -166,14 +171,18 @@ def make_non_pad_mask(lengths, xs=None, length_dim=-1):
"""Make mask tensor containing indices of non-padded part.
Args:
lengths (Tensor(int64) or List): Batch of lengths (B,).
xs (Tensor, optional): The reference tensor.
lengths (Tensor(int64) or List):
Batch of lengths (B,).
xs (Tensor, optional):
The reference tensor.
If set, masks will be the same shape as this tensor.
length_dim (int, optional): Dimension indicator of the above tensor.
length_dim (int, optional):
Dimension indicator of the above tensor.
See the example.
Returns:
Tensor(bool): mask tensor containing indices of padded part bool.
Tensor(bool):
mask tensor containing indices of padded part bool.
Examples:
With only lengths.
@ -257,8 +266,10 @@ def initialize(model: nn.Layer, init: str):
Custom initialization routines can be implemented into submodules
Args:
model (nn.Layer): Target.
init (str): Method of initialization.
model (nn.Layer):
Target.
init (str):
Method of initialization.
"""
assert check_argument_types()
@ -285,12 +296,17 @@ def get_random_segments(
segment_size: int, ) -> Tuple[paddle.Tensor, paddle.Tensor]:
"""Get random segments.
Args:
x (Tensor): Input tensor (B, C, T).
x_lengths (Tensor): Length tensor (B,).
segment_size (int): Segment size.
x (Tensor):
Input tensor (B, C, T).
x_lengths (Tensor):
Length tensor (B,).
segment_size (int):
Segment size.
Returns:
Tensor: Segmented tensor (B, C, segment_size).
Tensor: Start index tensor (B,).
Tensor:
Segmented tensor (B, C, segment_size).
Tensor:
Start index tensor (B,).
"""
b, c, t = paddle.shape(x)
max_start_idx = x_lengths - segment_size
@ -306,9 +322,12 @@ def get_segments(
segment_size: int, ) -> paddle.Tensor:
"""Get segments.
Args:
x (Tensor): Input tensor (B, C, T).
start_idxs (Tensor): Start index tensor (B,).
segment_size (int): Segment size.
x (Tensor):
Input tensor (B, C, T).
start_idxs (Tensor):
Start index tensor (B,).
segment_size (int):
Segment size.
Returns:
Tensor: Segmented tensor (B, C, segment_size).
"""
@ -353,14 +372,20 @@ def phones_masking(xs_pad: paddle.Tensor,
span_bdy: paddle.Tensor=None):
'''
Args:
xs_pad (paddle.Tensor): input speech (B, Tmax, D).
src_mask (paddle.Tensor): mask of speech (B, 1, Tmax).
align_start (paddle.Tensor): frame level phone alignment start (B, Tmax2).
align_end (paddle.Tensor): frame level phone alignment end (B, Tmax2).
align_start_lens (paddle.Tensor): length of align_start (B, ).
xs_pad (paddle.Tensor):
input speech (B, Tmax, D).
src_mask (paddle.Tensor):
mask of speech (B, 1, Tmax).
align_start (paddle.Tensor):
frame level phone alignment start (B, Tmax2).
align_end (paddle.Tensor):
frame level phone alignment end (B, Tmax2).
align_start_lens (paddle.Tensor):
length of align_start (B, ).
mlm_prob (float):
mean_phn_span (int):
span_bdy (paddle.Tensor): masked mel boundary of input speech (B, 2).
span_bdy (paddle.Tensor):
masked mel boundary of input speech (B, 2).
Returns:
paddle.Tensor[bool]: masked position of input speech (B, Tmax).
'''
@ -416,19 +441,29 @@ def phones_text_masking(xs_pad: paddle.Tensor,
span_bdy: paddle.Tensor=None):
'''
Args:
xs_pad (paddle.Tensor): input speech (B, Tmax, D).
src_mask (paddle.Tensor): mask of speech (B, 1, Tmax).
text_pad (paddle.Tensor): input text (B, Tmax2).
text_mask (paddle.Tensor): mask of text (B, 1, Tmax2).
align_start (paddle.Tensor): frame level phone alignment start (B, Tmax2).
align_end (paddle.Tensor): frame level phone alignment end (B, Tmax2).
align_start_lens (paddle.Tensor): length of align_start (B, ).
xs_pad (paddle.Tensor):
input speech (B, Tmax, D).
src_mask (paddle.Tensor):
mask of speech (B, 1, Tmax).
text_pad (paddle.Tensor):
input text (B, Tmax2).
text_mask (paddle.Tensor):
mask of text (B, 1, Tmax2).
align_start (paddle.Tensor):
frame level phone alignment start (B, Tmax2).
align_end (paddle.Tensor):
frame level phone alignment end (B, Tmax2).
align_start_lens (paddle.Tensor):
length of align_start (B, ).
mlm_prob (float):
mean_phn_span (int):
span_bdy (paddle.Tensor): masked mel boundary of input speech (B, 2).
span_bdy (paddle.Tensor):
masked mel boundary of input speech (B, 2).
Returns:
paddle.Tensor[bool]: masked position of input speech (B, Tmax).
paddle.Tensor[bool]: masked position of input text (B, Tmax2).
paddle.Tensor[bool]:
masked position of input speech (B, Tmax).
paddle.Tensor[bool]:
masked position of input text (B, Tmax2).
'''
bz, sent_len, _ = paddle.shape(xs_pad)
masked_pos = paddle.zeros((bz, sent_len))
@ -488,12 +523,18 @@ def get_seg_pos(speech_pad: paddle.Tensor,
seg_emb: bool=False):
'''
Args:
speech_pad (paddle.Tensor): input speech (B, Tmax, D).
text_pad (paddle.Tensor): input text (B, Tmax2).
align_start (paddle.Tensor): frame level phone alignment start (B, Tmax2).
align_end (paddle.Tensor): frame level phone alignment end (B, Tmax2).
align_start_lens (paddle.Tensor): length of align_start (B, ).
seg_emb (bool): whether to use segment embedding.
speech_pad (paddle.Tensor):
input speech (B, Tmax, D).
text_pad (paddle.Tensor):
input text (B, Tmax2).
align_start (paddle.Tensor):
frame level phone alignment start (B, Tmax2).
align_end (paddle.Tensor):
frame level phone alignment end (B, Tmax2).
align_start_lens (paddle.Tensor):
length of align_start (B, ).
seg_emb (bool):
whether to use segment embedding.
Returns:
paddle.Tensor[int]: n-th phone of each mel, 0<=n<=Tmax2 (B, Tmax).
eg:
@ -579,8 +620,10 @@ def random_spans_noise_mask(length: int,
def _random_seg(num_items, num_segs):
"""Partition a sequence of items randomly into non-empty segments.
Args:
num_items: an integer scalar > 0
num_segs: an integer scalar in [1, num_items]
num_items:
an integer scalar > 0
num_segs:
an integer scalar in [1, num_items]
Returns:
a Tensor with shape [num_segs] containing positive integers that add
up to num_items

@ -26,9 +26,12 @@ def design_prototype_filter(taps=62, cutoff_ratio=0.142, beta=9.0):
filters of cosine modulated filterbanks`_.
Args:
taps (int): The number of filter taps.
cutoff_ratio (float): Cut-off frequency ratio.
beta (float): Beta coefficient for kaiser window.
taps (int):
The number of filter taps.
cutoff_ratio (float):
Cut-off frequency ratio.
beta (float):
Beta coefficient for kaiser window.
Returns:
ndarray:
Impulse response of prototype filter (taps + 1,).
@ -66,10 +69,14 @@ class PQMF(nn.Layer):
See discussion in https://github.com/kan-bayashi/ParallelWaveGAN/issues/195.
Args:
subbands (int): The number of subbands.
taps (int): The number of filter taps.
cutoff_ratio (float): Cut-off frequency ratio.
beta (float): Beta coefficient for kaiser window.
subbands (int):
The number of subbands.
taps (int):
The number of filter taps.
cutoff_ratio (float):
Cut-off frequency ratio.
beta (float):
Beta coefficient for kaiser window.
"""
super().__init__()
@ -103,7 +110,8 @@ class PQMF(nn.Layer):
def analysis(self, x):
"""Analysis with PQMF.
Args:
x (Tensor): Input tensor (B, 1, T).
x (Tensor):
Input tensor (B, 1, T).
Returns:
Tensor: Output tensor (B, subbands, T // subbands).
"""
@ -113,7 +121,8 @@ class PQMF(nn.Layer):
def synthesis(self, x):
"""Synthesis with PQMF.
Args:
x (Tensor): Input tensor (B, subbands, T // subbands).
x (Tensor):
Input tensor (B, subbands, T // subbands).
Returns:
Tensor: Output tensor (B, 1, T).
"""

@ -50,12 +50,18 @@ class DurationPredictor(nn.Layer):
"""Initialize duration predictor module.
Args:
idim (int):Input dimension.
n_layers (int, optional): Number of convolutional layers.
n_chans (int, optional): Number of channels of convolutional layers.
kernel_size (int, optional): Kernel size of convolutional layers.
dropout_rate (float, optional): Dropout rate.
offset (float, optional): Offset value to avoid nan in log domain.
idim (int):
Input dimension.
n_layers (int, optional):
Number of convolutional layers.
n_chans (int, optional):
Number of channels of convolutional layers.
kernel_size (int, optional):
Kernel size of convolutional layers.
dropout_rate (float, optional):
Dropout rate.
offset (float, optional):
Offset value to avoid nan in log domain.
"""
super().__init__()
@ -99,8 +105,10 @@ class DurationPredictor(nn.Layer):
def forward(self, xs, x_masks=None):
"""Calculate forward propagation.
Args:
xs(Tensor): Batch of input sequences (B, Tmax, idim).
x_masks(ByteTensor, optional, optional): Batch of masks indicating padded part (B, Tmax). (Default value = None)
xs(Tensor):
Batch of input sequences (B, Tmax, idim).
x_masks(ByteTensor, optional, optional):
Batch of masks indicating padded part (B, Tmax). (Default value = None)
Returns:
Tensor: Batch of predicted durations in log domain (B, Tmax).
@ -110,8 +118,10 @@ class DurationPredictor(nn.Layer):
def inference(self, xs, x_masks=None):
"""Inference duration.
Args:
xs(Tensor): Batch of input sequences (B, Tmax, idim).
x_masks(Tensor(bool), optional, optional): Batch of masks indicating padded part (B, Tmax). (Default value = None)
xs(Tensor):
Batch of input sequences (B, Tmax, idim).
x_masks(Tensor(bool), optional, optional):
Batch of masks indicating padded part (B, Tmax). (Default value = None)
Returns:
Tensor: Batch of predicted durations in linear domain int64 (B, Tmax).
@ -140,8 +150,10 @@ class DurationPredictorLoss(nn.Layer):
"""Calculate forward propagation.
Args:
outputs(Tensor): Batch of prediction durations in log domain (B, T)
targets(Tensor): Batch of groundtruth durations in linear domain (B, T)
outputs(Tensor):
Batch of prediction durations in log domain (B, T)
targets(Tensor):
Batch of groundtruth durations in linear domain (B, T)
Returns:
Tensor: Mean squared error loss value.

@ -36,7 +36,8 @@ class LengthRegulator(nn.Layer):
"""Initialize length regulator module.
Args:
pad_value (float, optional): Value used for padding.
pad_value (float, optional):
Value used for padding.
"""
super().__init__()
@ -97,9 +98,12 @@ class LengthRegulator(nn.Layer):
"""Calculate forward propagation.
Args:
xs (Tensor): Batch of sequences of char or phoneme embeddings (B, Tmax, D).
ds (Tensor(int64)): Batch of durations of each frame (B, T).
alpha (float, optional): Alpha value to control speed of speech.
xs (Tensor):
Batch of sequences of char or phoneme embeddings (B, Tmax, D).
ds (Tensor(int64)):
Batch of durations of each frame (B, T).
alpha (float, optional):
Alpha value to control speed of speech.
Returns:
Tensor: replicated input tensor based on durations (B, T*, D).

@ -43,11 +43,16 @@ class VariancePredictor(nn.Layer):
"""Initialize variance predictor module.
Args:
idim (int): Input dimension.
n_layers (int, optional): Number of convolutional layers.
n_chans (int, optional): Number of channels of convolutional layers.
kernel_size (int, optional): Kernel size of convolutional layers.
dropout_rate (float, optional): Dropout rate.
idim (int):
Input dimension.
n_layers (int, optional):
Number of convolutional layers.
n_chans (int, optional):
Number of channels of convolutional layers.
kernel_size (int, optional):
Kernel size of convolutional layers.
dropout_rate (float, optional):
Dropout rate.
"""
assert check_argument_types()
super().__init__()
@ -74,11 +79,14 @@ class VariancePredictor(nn.Layer):
"""Calculate forward propagation.
Args:
xs (Tensor): Batch of input sequences (B, Tmax, idim).
x_masks (Tensor(bool), optional): Batch of masks indicating padded part (B, Tmax, 1).
xs (Tensor):
Batch of input sequences (B, Tmax, idim).
x_masks (Tensor(bool), optional):
Batch of masks indicating padded part (B, Tmax, 1).
Returns:
Tensor: Batch of predicted sequences (B, Tmax, 1).
Tensor:
Batch of predicted sequences (B, Tmax, 1).
"""
# (B, idim, Tmax)
xs = xs.transpose([0, 2, 1])

@ -29,15 +29,24 @@ class WaveNetResidualBlock(nn.Layer):
refer to `WaveNet: A Generative Model for Raw Audio <https://arxiv.org/abs/1609.03499>`_.
Args:
kernel_size (int, optional): Kernel size of the 1D convolution, by default 3
residual_channels (int, optional): Feature size of the residual output(and also the input), by default 64
gate_channels (int, optional): Output feature size of the 1D convolution, by default 128
skip_channels (int, optional): Feature size of the skip output, by default 64
aux_channels (int, optional): Feature size of the auxiliary input (e.g. spectrogram), by default 80
dropout (float, optional): Probability of the dropout before the 1D convolution, by default 0.
dilation (int, optional): Dilation of the 1D convolution, by default 1
bias (bool, optional): Whether to use bias in the 1D convolution, by default True
use_causal_conv (bool, optional): Whether to use causal padding for the 1D convolution, by default False
kernel_size (int, optional):
Kernel size of the 1D convolution, by default 3
residual_channels (int, optional):
Feature size of the residual output (and also the input), by default 64
gate_channels (int, optional):
Output feature size of the 1D convolution, by default 128
skip_channels (int, optional):
Feature size of the skip output, by default 64
aux_channels (int, optional):
Feature size of the auxiliary input (e.g. spectrogram), by default 80
dropout (float, optional):
Probability of the dropout before the 1D convolution, by default 0.
dilation (int, optional):
Dilation of the 1D convolution, by default 1
bias (bool, optional):
Whether to use bias in the 1D convolution, by default True
use_causal_conv (bool, optional):
Whether to use causal padding for the 1D convolution, by default False
"""
def __init__(self,
@ -81,13 +90,17 @@ class WaveNetResidualBlock(nn.Layer):
def forward(self, x, c):
"""
Args:
x (Tensor): the input features. Shape (N, C_res, T)
c (Tensor): the auxiliary input. Shape (N, C_aux, T)
x (Tensor):
the input features. Shape (N, C_res, T)
c (Tensor):
the auxiliary input. Shape (N, C_aux, T)
Returns:
res (Tensor): Shape (N, C_res, T), the residual output, which is used as the
res (Tensor):
Shape (N, C_res, T), the residual output, which is used as the
input of the next ResidualBlock in a stack of ResidualBlocks.
skip (Tensor): Shape (N, C_skip, T), the skip output, which is collected among
skip (Tensor):
Shape (N, C_skip, T), the skip output, which is collected among
each layer in a stack of ResidualBlocks.
"""
x_input = x
@ -121,13 +134,20 @@ class HiFiGANResidualBlock(nn.Layer):
):
"""Initialize HiFiGANResidualBlock module.
Args:
kernel_size (int): Kernel size of dilation convolution layer.
channels (int): Number of channels for convolution layer.
dilations (List[int]): List of dilation factors.
use_additional_convs (bool): Whether to use additional convolution layers.
bias (bool): Whether to add bias parameter in convolution layers.
nonlinear_activation (str): Activation function module name.
nonlinear_activation_params (dict): Hyperparameters for activation function.
kernel_size (int):
Kernel size of dilation convolution layer.
channels (int):
Number of channels for convolution layer.
dilations (List[int]):
List of dilation factors.
use_additional_convs (bool):
Whether to use additional convolution layers.
bias (bool):
Whether to add bias parameter in convolution layers.
nonlinear_activation (str):
Activation function module name.
nonlinear_activation_params (dict):
Hyperparameters for activation function.
"""
super().__init__()
@ -167,7 +187,8 @@ class HiFiGANResidualBlock(nn.Layer):
def forward(self, x):
"""Calculate forward propagation.
Args:
x (Tensor): Input tensor (B, channels, T).
x (Tensor):
Input tensor (B, channels, T).
Returns:
Tensor: Output tensor (B, channels, T).
"""

@ -39,15 +39,24 @@ class ResidualStack(nn.Layer):
"""Initialize ResidualStack module.
Args:
kernel_size (int): Kernel size of dilation convolution layer.
channels (int): Number of channels of convolution layers.
dilation (int): Dilation factor.
bias (bool): Whether to add bias parameter in convolution layers.
nonlinear_activation (str): Activation function module name.
nonlinear_activation_params (Dict[str,Any]): Hyperparameters for activation function.
pad (str): Padding function module name before dilated convolution layer.
pad_params (Dict[str, Any]): Hyperparameters for padding function.
use_causal_conv (bool): Whether to use causal convolution.
kernel_size (int):
Kernel size of dilation convolution layer.
channels (int):
Number of channels of convolution layers.
dilation (int):
Dilation factor.
bias (bool):
Whether to add bias parameter in convolution layers.
nonlinear_activation (str):
Activation function module name.
nonlinear_activation_params (Dict[str,Any]):
Hyperparameters for activation function.
pad (str):
Padding function module name before dilated convolution layer.
pad_params (Dict[str, Any]):
Hyperparameters for padding function.
use_causal_conv (bool):
Whether to use causal convolution.
"""
super().__init__()
# for compatibility
@ -95,7 +104,8 @@ class ResidualStack(nn.Layer):
"""Calculate forward propagation.
Args:
c (Tensor): Input tensor (B, channels, T).
c (Tensor):
Input tensor (B, channels, T).
Returns:
Tensor: Output tensor (B, channels, T).
"""

@ -32,16 +32,26 @@ class StyleEncoder(nn.Layer):
Speech Synthesis`: https://arxiv.org/abs/1803.09017
Args:
idim (int, optional): Dimension of the input mel-spectrogram.
gst_tokens (int, optional): The number of GST embeddings.
gst_token_dim (int, optional): Dimension of each GST embedding.
gst_heads (int, optional): The number of heads in GST multihead attention.
conv_layers (int, optional): The number of conv layers in the reference encoder.
conv_chans_list (Sequence[int], optional): List of the number of channels of conv layers in the referece encoder.
conv_kernel_size (int, optional): Kernal size of conv layers in the reference encoder.
conv_stride (int, optional): Stride size of conv layers in the reference encoder.
gru_layers (int, optional): The number of GRU layers in the reference encoder.
gru_units (int, optional):The number of GRU units in the reference encoder.
idim (int, optional):
Dimension of the input mel-spectrogram.
gst_tokens (int, optional):
The number of GST embeddings.
gst_token_dim (int, optional):
Dimension of each GST embedding.
gst_heads (int, optional):
The number of heads in GST multihead attention.
conv_layers (int, optional):
The number of conv layers in the reference encoder.
conv_chans_list (Sequence[int], optional):
List of the number of channels of conv layers in the reference encoder.
conv_kernel_size (int, optional):
Kernel size of conv layers in the reference encoder.
conv_stride (int, optional):
Stride size of conv layers in the reference encoder.
gru_layers (int, optional):
The number of GRU layers in the reference encoder.
gru_units (int, optional):
The number of GRU units in the reference encoder.
Todo:
* Support manual weight specification in inference.
@ -82,7 +92,8 @@ class StyleEncoder(nn.Layer):
"""Calculate forward propagation.
Args:
speech (Tensor): Batch of padded target features (B, Lmax, odim).
speech (Tensor):
Batch of padded target features (B, Lmax, odim).
Returns:
Tensor: Style token embeddings (B, token_dim).
@ -104,13 +115,20 @@ class ReferenceEncoder(nn.Layer):
Speech Synthesis`: https://arxiv.org/abs/1803.09017
Args:
idim (int, optional): Dimension of the input mel-spectrogram.
conv_layers (int, optional): The number of conv layers in the reference encoder.
conv_chans_list: (Sequence[int], optional): List of the number of channels of conv layers in the referece encoder.
conv_kernel_size (int, optional): Kernal size of conv layers in the reference encoder.
conv_stride (int, optional): Stride size of conv layers in the reference encoder.
gru_layers (int, optional): The number of GRU layers in the reference encoder.
gru_units (int, optional): The number of GRU units in the reference encoder.
idim (int, optional):
Dimension of the input mel-spectrogram.
conv_layers (int, optional):
The number of conv layers in the reference encoder.
conv_chans_list (Sequence[int], optional):
List of the number of channels of conv layers in the reference encoder.
conv_kernel_size (int, optional):
Kernel size of conv layers in the reference encoder.
conv_stride (int, optional):
Stride size of conv layers in the reference encoder.
gru_layers (int, optional):
The number of GRU layers in the reference encoder.
gru_units (int, optional):
The number of GRU units in the reference encoder.
"""
@ -168,7 +186,8 @@ class ReferenceEncoder(nn.Layer):
def forward(self, speech: paddle.Tensor) -> paddle.Tensor:
"""Calculate forward propagation.
Args:
speech (Tensor): Batch of padded target features (B, Lmax, idim).
speech (Tensor):
Batch of padded target features (B, Lmax, idim).
Returns:
Tensor: Reference embedding (B, gru_units)
@ -200,11 +219,16 @@ class StyleTokenLayer(nn.Layer):
.. _`Style Tokens: Unsupervised Style Modeling, Control and Transfer in End-to-End
Speech Synthesis`: https://arxiv.org/abs/1803.09017
Args:
ref_embed_dim (int, optional): Dimension of the input reference embedding.
gst_tokens (int, optional): The number of GST embeddings.
gst_token_dim (int, optional): Dimension of each GST embedding.
gst_heads (int, optional): The number of heads in GST multihead attention.
dropout_rate (float, optional): Dropout rate in multi-head attention.
ref_embed_dim (int, optional):
Dimension of the input reference embedding.
gst_tokens (int, optional):
The number of GST embeddings.
gst_token_dim (int, optional):
Dimension of each GST embedding.
gst_heads (int, optional):
The number of heads in GST multihead attention.
dropout_rate (float, optional):
Dropout rate in multi-head attention.
"""
@ -236,7 +260,8 @@ class StyleTokenLayer(nn.Layer):
"""Calculate forward propagation.
Args:
ref_embs (Tensor): Reference embeddings (B, ref_embed_dim).
ref_embs (Tensor):
Reference embeddings (B, ref_embed_dim).
Returns:
Tensor: Style token embeddings (B, gst_token_dim).

@ -31,10 +31,14 @@ def _apply_attention_constraint(e,
Text-to-Speech with Convolutional Sequence Learning`_.
Args:
e(Tensor): Attention energy before applying softmax (1, T).
last_attended_idx(int): The index of the inputs of the last attended [0, T].
backward_window(int, optional, optional): Backward window size in attention constraint. (Default value = 1)
forward_window(int, optional, optional): Forward window size in attetion constraint. (Default value = 3)
e(Tensor):
Attention energy before applying softmax (1, T).
last_attended_idx(int):
The index of the inputs of the last attended [0, T].
backward_window(int, optional, optional):
Backward window size in attention constraint. (Default value = 1)
forward_window(int, optional, optional):
Forward window size in attention constraint. (Default value = 3)
Returns:
Tensor: Monotonic constrained attention energy (1, T).
@ -62,12 +66,18 @@ class AttLoc(nn.Layer):
(https://arxiv.org/pdf/1506.07503.pdf)
Args:
eprojs (int): projection-units of encoder
dunits (int): units of decoder
att_dim (int): attention dimension
aconv_chans (int): channels of attention convolution
aconv_filts (int): filter size of attention convolution
han_mode (bool): flag to swith on mode of hierarchical attention and not store pre_compute_enc_h
eprojs (int):
projection-units of encoder
dunits (int):
units of decoder
att_dim (int):
attention dimension
aconv_chans (int):
channels of attention convolution
aconv_filts (int):
filter size of attention convolution
han_mode (bool):
flag to switch on mode of hierarchical attention and not store pre_compute_enc_h
"""
def __init__(self,
@ -117,18 +127,29 @@ class AttLoc(nn.Layer):
forward_window=3, ):
"""Calculate AttLoc forward propagation.
Args:
enc_hs_pad(Tensor): padded encoder hidden state (B, T_max, D_enc)
enc_hs_len(Tensor): padded encoder hidden state length (B)
dec_z(Tensor dec_z): decoder hidden state (B, D_dec)
att_prev(Tensor): previous attention weight (B, T_max)
scaling(float, optional): scaling parameter before applying softmax (Default value = 2.0)
forward_window(Tensor, optional): forward window size when constraining attention (Default value = 3)
last_attended_idx(int, optional): index of the inputs of the last attended (Default value = None)
backward_window(int, optional): backward window size in attention constraint (Default value = 1)
forward_window(int, optional): forward window size in attetion constraint (Default value = 3)
enc_hs_pad(Tensor):
padded encoder hidden state (B, T_max, D_enc)
enc_hs_len(Tensor):
padded encoder hidden state length (B)
dec_z(Tensor):
decoder hidden state (B, D_dec)
att_prev(Tensor):
previous attention weight (B, T_max)
scaling(float, optional):
scaling parameter before applying softmax (Default value = 2.0)
forward_window(Tensor, optional):
forward window size when constraining attention (Default value = 3)
last_attended_idx(int, optional):
index of the inputs of the last attended (Default value = None)
backward_window(int, optional):
backward window size in attention constraint (Default value = 1)
forward_window(int, optional):
forward window size in attention constraint (Default value = 3)
Returns:
Tensor: attention weighted encoder state (B, D_enc)
Tensor: previous attention weights (B, T_max)
Tensor:
attention weighted encoder state (B, D_enc)
Tensor:
previous attention weights (B, T_max)
"""
batch = paddle.shape(enc_hs_pad)[0]
# pre-compute all h outside the decoder loop
@ -192,11 +213,16 @@ class AttForward(nn.Layer):
(https://arxiv.org/pdf/1807.06736.pdf)
Args:
eprojs (int): projection-units of encoder
dunits (int): units of decoder
att_dim (int): attention dimension
aconv_chans (int): channels of attention convolution
aconv_filts (int): filter size of attention convolution
eprojs (int):
projection-units of encoder
dunits (int):
units of decoder
att_dim (int):
attention dimension
aconv_chans (int):
channels of attention convolution
aconv_filts (int):
filter size of attention convolution
"""
def __init__(self, eprojs, dunits, att_dim, aconv_chans, aconv_filts):
@ -239,18 +265,28 @@ class AttForward(nn.Layer):
"""Calculate AttForward forward propagation.
Args:
enc_hs_pad(Tensor): padded encoder hidden state (B, T_max, D_enc)
enc_hs_len(list): padded encoder hidden state length (B,)
dec_z(Tensor): decoder hidden state (B, D_dec)
att_prev(Tensor): attention weights of previous step (B, T_max)
scaling(float, optional): scaling parameter before applying softmax (Default value = 1.0)
last_attended_idx(int, optional): index of the inputs of the last attended (Default value = None)
backward_window(int, optional): backward window size in attention constraint (Default value = 1)
forward_window(int, optional): (Default value = 3)
enc_hs_pad(Tensor):
padded encoder hidden state (B, T_max, D_enc)
enc_hs_len(list):
padded encoder hidden state length (B,)
dec_z(Tensor):
decoder hidden state (B, D_dec)
att_prev(Tensor):
attention weights of previous step (B, T_max)
scaling(float, optional):
scaling parameter before applying softmax (Default value = 1.0)
last_attended_idx(int, optional):
index of the inputs of the last attended (Default value = None)
backward_window(int, optional):
backward window size in attention constraint (Default value = 1)
forward_window(int, optional):
forward window size in attention constraint (Default value = 3)
Returns:
Tensor: attention weighted encoder state (B, D_enc)
Tensor: previous attention weights (B, T_max)
Tensor:
attention weighted encoder state (B, D_enc)
Tensor:
previous attention weights (B, T_max)
"""
batch = len(enc_hs_pad)
# pre-compute all h outside the decoder loop
@ -321,12 +357,18 @@ class AttForwardTA(nn.Layer):
(https://arxiv.org/pdf/1807.06736.pdf)
Args:
eunits (int): units of encoder
dunits (int): units of decoder
att_dim (int): attention dimension
aconv_chans (int): channels of attention convolution
aconv_filts (int): filter size of attention convolution
odim (int): output dimension
eunits (int):
units of encoder
dunits (int):
units of decoder
att_dim (int):
attention dimension
aconv_chans (int):
channels of attention convolution
aconv_filts (int):
filter size of attention convolution
odim (int):
output dimension
"""
def __init__(self, eunits, dunits, att_dim, aconv_chans, aconv_filts, odim):
@ -372,19 +414,30 @@ class AttForwardTA(nn.Layer):
"""Calculate AttForwardTA forward propagation.
Args:
enc_hs_pad(Tensor): padded encoder hidden state (B, Tmax, eunits)
enc_hs_len(list Tensor): padded encoder hidden state length (B,)
dec_z(Tensor): decoder hidden state (B, dunits)
att_prev(Tensor): attention weights of previous step (B, T_max)
out_prev(Tensor): decoder outputs of previous step (B, odim)
scaling(float, optional): scaling parameter before applying softmax (Default value = 1.0)
last_attended_idx(int, optional): index of the inputs of the last attended (Default value = None)
backward_window(int, optional): backward window size in attention constraint (Default value = 1)
forward_window(int, optional): (Default value = 3)
enc_hs_pad(Tensor):
padded encoder hidden state (B, Tmax, eunits)
enc_hs_len(list Tensor):
padded encoder hidden state length (B,)
dec_z(Tensor):
decoder hidden state (B, dunits)
att_prev(Tensor):
attention weights of previous step (B, T_max)
out_prev(Tensor):
decoder outputs of previous step (B, odim)
scaling(float, optional):
scaling parameter before applying softmax (Default value = 1.0)
last_attended_idx(int, optional):
index of the inputs of the last attended (Default value = None)
backward_window(int, optional):
backward window size in attention constraint (Default value = 1)
forward_window(int, optional):
forward window size in attention constraint (Default value = 3)
Returns:
Tensor: attention weighted encoder state (B, dunits)
Tensor: previous attention weights (B, Tmax)
Tensor:
attention weighted encoder state (B, dunits)
Tensor:
previous attention weights (B, Tmax)
"""
batch = len(enc_hs_pad)
# pre-compute all h outside the decoder loop

@ -45,10 +45,14 @@ class Prenet(nn.Layer):
"""Initialize prenet module.
Args:
idim (int): Dimension of the inputs.
odim (int): Dimension of the outputs.
n_layers (int, optional): The number of prenet layers.
n_units (int, optional): The number of prenet units.
idim (int):
Dimension of the inputs.
odim (int):
Dimension of the outputs.
n_layers (int, optional):
The number of prenet layers.
n_units (int, optional):
The number of prenet units.
"""
super().__init__()
self.dropout_rate = dropout_rate
@ -62,7 +66,8 @@ class Prenet(nn.Layer):
"""Calculate forward propagation.
Args:
x (Tensor): Batch of input tensors (B, ..., idim).
x (Tensor):
Batch of input tensors (B, ..., idim).
Returns:
Tensor: Batch of output tensors (B, ..., odim).
@ -212,7 +217,8 @@ class ZoneOutCell(nn.Layer):
"""Calculate forward propagation.
Args:
inputs (Tensor): Batch of input tensor (B, input_size).
inputs (Tensor):
Batch of input tensor (B, input_size).
hidden (tuple):
- Tensor: Batch of initial hidden states (B, hidden_size).
- Tensor: Batch of initial cell states (B, hidden_size).
@ -277,26 +283,39 @@ class Decoder(nn.Layer):
"""Initialize Tacotron2 decoder module.
Args:
idim (int): Dimension of the inputs.
odim (int): Dimension of the outputs.
att (nn.Layer): Instance of attention class.
dlayers (int, optional): The number of decoder lstm layers.
dunits (int, optional): The number of decoder lstm units.
prenet_layers (int, optional): The number of prenet layers.
prenet_units (int, optional): The number of prenet units.
postnet_layers (int, optional): The number of postnet layers.
postnet_filts (int, optional): The number of postnet filter size.
postnet_chans (int, optional): The number of postnet filter channels.
output_activation_fn (nn.Layer, optional): Activation function for outputs.
cumulate_att_w (bool, optional): Whether to cumulate previous attention weight.
use_batch_norm (bool, optional): Whether to use batch normalization.
use_concate : bool, optional
idim (int):
Dimension of the inputs.
odim (int):
Dimension of the outputs.
att (nn.Layer):
Instance of attention class.
dlayers (int, optional):
The number of decoder lstm layers.
dunits (int, optional):
The number of decoder lstm units.
prenet_layers (int, optional):
The number of prenet layers.
prenet_units (int, optional):
The number of prenet units.
postnet_layers (int, optional):
The number of postnet layers.
postnet_filts (int, optional):
The number of postnet filter size.
postnet_chans (int, optional):
The number of postnet filter channels.
output_activation_fn (nn.Layer, optional):
Activation function for outputs.
cumulate_att_w (bool, optional):
Whether to cumulate previous attention weight.
use_batch_norm (bool, optional):
Whether to use batch normalization.
use_concate (bool, optional):
Whether to concatenate encoder embedding with decoder lstm outputs.
dropout_rate : float, optional
dropout_rate (float, optional):
Dropout rate.
zoneout_rate : float, optional
zoneout_rate (float, optional):
Zoneout rate.
reduction_factor : int, optional
reduction_factor (int, optional):
Reduction factor.
"""
super().__init__()
@ -363,15 +382,22 @@ class Decoder(nn.Layer):
"""Calculate forward propagation.
Args:
hs (Tensor): Batch of the sequences of padded hidden states (B, Tmax, idim).
hlens (Tensor(int64) padded): Batch of lengths of each input batch (B,).
ys (Tensor): Batch of the sequences of padded target features (B, Lmax, odim).
hs (Tensor):
Batch of the sequences of padded hidden states (B, Tmax, idim).
hlens (Tensor(int64)):
Batch of lengths of each input batch (B,).
ys (Tensor):
Batch of the sequences of padded target features (B, Lmax, odim).
Returns:
Tensor: Batch of output tensors after postnet (B, Lmax, odim).
Tensor: Batch of output tensors before postnet (B, Lmax, odim).
Tensor: Batch of logits of stop prediction (B, Lmax).
Tensor: Batch of attention weights (B, Lmax, Tmax).
Tensor:
Batch of output tensors after postnet (B, Lmax, odim).
Tensor:
Batch of output tensors before postnet (B, Lmax, odim).
Tensor:
Batch of logits of stop prediction (B, Lmax).
Tensor:
Batch of attention weights (B, Lmax, Tmax).
Note:
This computation is performed in teacher-forcing manner.
@ -471,20 +497,30 @@ class Decoder(nn.Layer):
forward_window=None, ):
"""Generate the sequence of features given the sequences of characters.
Args:
h(Tensor): Input sequence of encoder hidden states (T, C).
threshold(float, optional, optional): Threshold to stop generation. (Default value = 0.5)
minlenratio(float, optional, optional): Minimum length ratio. If set to 1.0 and the length of input is 10,
h(Tensor):
Input sequence of encoder hidden states (T, C).
threshold(float, optional):
Threshold to stop generation. (Default value = 0.5)
minlenratio(float, optional):
Minimum length ratio. If set to 1.0 and the length of input is 10,
the minimum length of outputs will be 10 * 1 = 10. (Default value = 0.0)
maxlenratio(float, optional, optional): Minimum length ratio. If set to 10 and the length of input is 10,
maxlenratio(float, optional):
Maximum length ratio. If set to 10 and the length of input is 10,
the maximum length of outputs will be 10 * 10 = 100. (Default value = 0.0)
use_att_constraint(bool, optional): Whether to apply attention constraint introduced in `Deep Voice 3`_. (Default value = False)
backward_window(int, optional): Backward window size in attention constraint. (Default value = None)
forward_window(int, optional): (Default value = None)
use_att_constraint(bool, optional):
Whether to apply attention constraint introduced in `Deep Voice 3`_. (Default value = False)
backward_window(int, optional):
Backward window size in attention constraint. (Default value = None)
forward_window(int, optional):
Forward window size in attention constraint. (Default value = None)
Returns:
Tensor: Output sequence of features (L, odim).
Tensor: Output sequence of stop probabilities (L,).
Tensor: Attention weights (L, T).
Tensor:
Output sequence of features (L, odim).
Tensor:
Output sequence of stop probabilities (L,).
Tensor:
Attention weights (L, T).
Note:
This computation is performed in auto-regressive manner.
@ -625,9 +661,12 @@ class Decoder(nn.Layer):
"""Calculate all of the attention weights.
Args:
hs (Tensor): Batch of the sequences of padded hidden states (B, Tmax, idim).
hlens (Tensor(int64)): Batch of lengths of each input batch (B,).
ys (Tensor): Batch of the sequences of padded target features (B, Lmax, odim).
hs (Tensor):
Batch of the sequences of padded hidden states (B, Tmax, idim).
hlens (Tensor(int64)):
Batch of lengths of each input batch (B,).
ys (Tensor):
Batch of the sequences of padded target features (B, Lmax, odim).
Returns:
numpy.ndarray:

@ -46,17 +46,28 @@ class Encoder(nn.Layer):
padding_idx=0, ):
"""Initialize Tacotron2 encoder module.
Args:
idim (int): Dimension of the inputs.
input_layer (str): Input layer type.
embed_dim (int, optional): Dimension of character embedding.
elayers (int, optional): The number of encoder blstm layers.
eunits (int, optional): The number of encoder blstm units.
econv_layers (int, optional): The number of encoder conv layers.
econv_filts (int, optional): The number of encoder conv filter size.
econv_chans (int, optional): The number of encoder conv filter channels.
use_batch_norm (bool, optional): Whether to use batch normalization.
use_residual (bool, optional): Whether to use residual connection.
dropout_rate (float, optional): Dropout rate.
idim (int):
Dimension of the inputs.
input_layer (str):
Input layer type.
embed_dim (int, optional):
Dimension of character embedding.
elayers (int, optional):
The number of encoder blstm layers.
eunits (int, optional):
The number of encoder blstm units.
econv_layers (int, optional):
The number of encoder conv layers.
econv_filts (int, optional):
The number of encoder conv filter size.
econv_chans (int, optional):
The number of encoder conv filter channels.
use_batch_norm (bool, optional):
Whether to use batch normalization.
use_residual (bool, optional):
Whether to use residual connection.
dropout_rate (float, optional):
Dropout rate.
"""
super().__init__()
@ -127,14 +138,18 @@ class Encoder(nn.Layer):
"""Calculate forward propagation.
Args:
xs (Tensor): Batch of the padded sequence. Either character ids (B, Tmax)
xs (Tensor):
Batch of the padded sequence. Either character ids (B, Tmax)
or acoustic feature (B, Tmax, idim * encoder_reduction_factor).
Padded value should be 0.
ilens (Tensor(int64)): Batch of lengths of each input batch (B,).
ilens (Tensor(int64)):
Batch of lengths of each input batch (B,).
Returns:
Tensor: Batch of the sequences of encoder states(B, Tmax, eunits).
Tensor(int64): Batch of lengths of each sequence (B,)
Tensor:
Batch of the sequences of encoder states(B, Tmax, eunits).
Tensor(int64):
Batch of lengths of each sequence (B,)
"""
xs = self.embed(xs).transpose([0, 2, 1])
if self.convs is not None:
@ -161,8 +176,8 @@ class Encoder(nn.Layer):
"""Inference.
Args:
x (Tensor): The sequeunce of character ids (T,)
or acoustic feature (T, idim * encoder_reduction_factor).
x (Tensor):
The sequence of character ids (T,) or acoustic feature (T, idim * encoder_reduction_factor).
Returns:
Tensor: The sequences of encoder states(T, eunits).

@ -60,11 +60,15 @@ class TADELayer(nn.Layer):
def forward(self, x, c):
"""Calculate forward propagation.
Args:
x (Tensor): Input tensor (B, in_channels, T).
c (Tensor): Auxiliary input tensor (B, aux_channels, T).
x (Tensor):
Input tensor (B, in_channels, T).
c (Tensor):
Auxiliary input tensor (B, aux_channels, T).
Returns:
Tensor: Output tensor (B, in_channels, T * upsample_factor).
Tensor: Upsampled aux tensor (B, in_channels, T * upsample_factor).
Tensor:
Output tensor (B, in_channels, T * upsample_factor).
Tensor:
Upsampled aux tensor (B, in_channels, T * upsample_factor).
"""
x = self.norm(x)
@ -138,11 +142,15 @@ class TADEResBlock(nn.Layer):
"""Calculate forward propagation.
Args:
x (Tensor): Input tensor (B, in_channels, T).
c (Tensor): Auxiliary input tensor (B, aux_channels, T).
x (Tensor):
Input tensor (B, in_channels, T).
c (Tensor):
Auxiliary input tensor (B, aux_channels, T).
Returns:
Tensor: Output tensor (B, in_channels, T * upsample_factor).
Tensor: Upsampled auxirialy tensor (B, in_channels, T * upsample_factor).
Tensor:
Output tensor (B, in_channels, T * upsample_factor).
Tensor:
Upsampled auxiliary tensor (B, in_channels, T * upsample_factor).
"""
residual = x
x, c = self.tade1(x, c)

@ -25,9 +25,12 @@ from paddlespeech.t2s.modules.masked_fill import masked_fill
class MultiHeadedAttention(nn.Layer):
"""Multi-Head Attention layer.
Args:
n_head (int): The number of heads.
n_feat (int): The number of features.
dropout_rate (float): Dropout rate.
n_head (int):
The number of heads.
n_feat (int):
The number of features.
dropout_rate (float):
Dropout rate.
"""
def __init__(self, n_head, n_feat, dropout_rate):
@ -48,14 +51,20 @@ class MultiHeadedAttention(nn.Layer):
"""Transform query, key and value.
Args:
query(Tensor): query tensor (#batch, time1, size).
key(Tensor): Key tensor (#batch, time2, size).
value(Tensor): Value tensor (#batch, time2, size).
query(Tensor):
query tensor (#batch, time1, size).
key(Tensor):
Key tensor (#batch, time2, size).
value(Tensor):
Value tensor (#batch, time2, size).
Returns:
Tensor: Transformed query tensor (#batch, n_head, time1, d_k).
Tensor: Transformed key tensor (#batch, n_head, time2, d_k).
Tensor: Transformed value tensor (#batch, n_head, time2, d_k).
Tensor:
Transformed query tensor (#batch, n_head, time1, d_k).
Tensor:
Transformed key tensor (#batch, n_head, time2, d_k).
Tensor:
Transformed value tensor (#batch, n_head, time2, d_k).
"""
n_batch = paddle.shape(query)[0]
@ -77,9 +86,12 @@ class MultiHeadedAttention(nn.Layer):
"""Compute attention context vector.
Args:
value(Tensor): Transformed value (#batch, n_head, time2, d_k).
scores(Tensor): Attention score (#batch, n_head, time1, time2).
mask(Tensor, optional): Mask (#batch, 1, time2) or (#batch, time1, time2). (Default value = None)
value(Tensor):
Transformed value (#batch, n_head, time2, d_k).
scores(Tensor):
Attention score (#batch, n_head, time1, time2).
mask(Tensor, optional):
Mask (#batch, 1, time2) or (#batch, time1, time2). (Default value = None)
Returns:
Tensor: Transformed value (#batch, time1, d_model) weighted by the attention score (#batch, time1, time2).
@ -113,10 +125,14 @@ class MultiHeadedAttention(nn.Layer):
"""Compute scaled dot product attention.
Args:
query(Tensor): Query tensor (#batch, time1, size).
key(Tensor): Key tensor (#batch, time2, size).
value(Tensor): Value tensor (#batch, time2, size).
mask(Tensor, optional): Mask tensor (#batch, 1, time2) or (#batch, time1, time2). (Default value = None)
query(Tensor):
Query tensor (#batch, time1, size).
key(Tensor):
Key tensor (#batch, time2, size).
value(Tensor):
Value tensor (#batch, time2, size).
mask(Tensor, optional):
Mask tensor (#batch, 1, time2) or (#batch, time1, time2). (Default value = None)
Returns:
Tensor: Output tensor (#batch, time1, d_model).
@ -134,10 +150,14 @@ class RelPositionMultiHeadedAttention(MultiHeadedAttention):
Paper: https://arxiv.org/abs/1901.02860
Args:
n_head (int): The number of heads.
n_feat (int): The number of features.
dropout_rate (float): Dropout rate.
zero_triu (bool): Whether to zero the upper triangular part of attention matrix.
n_head (int):
The number of heads.
n_feat (int):
The number of features.
dropout_rate (float):
Dropout rate.
zero_triu (bool):
Whether to zero the upper triangular part of attention matrix.
"""
def __init__(self, n_head, n_feat, dropout_rate, zero_triu=False):
@ -161,10 +181,11 @@ class RelPositionMultiHeadedAttention(MultiHeadedAttention):
def rel_shift(self, x):
"""Compute relative positional encoding.
Args:
x(Tensor): Input tensor (batch, head, time1, 2*time1-1).
x(Tensor):
Input tensor (batch, head, time1, 2*time1-1).
Returns:
Tensor:Output tensor.
Tensor: Output tensor.
"""
b, h, t1, t2 = paddle.shape(x)
zero_pad = paddle.zeros((b, h, t1, 1))
@ -183,11 +204,16 @@ class RelPositionMultiHeadedAttention(MultiHeadedAttention):
"""Compute 'Scaled Dot Product Attention' with rel. positional encoding.
Args:
query(Tensor): Query tensor (#batch, time1, size).
key(Tensor): Key tensor (#batch, time2, size).
value(Tensor): Value tensor (#batch, time2, size).
pos_emb(Tensor): Positional embedding tensor (#batch, 2*time1-1, size).
mask(Tensor): Mask tensor (#batch, 1, time2) or (#batch, time1, time2).
query(Tensor):
Query tensor (#batch, time1, size).
key(Tensor):
Key tensor (#batch, time2, size).
value(Tensor):
Value tensor (#batch, time2, size).
pos_emb(Tensor):
Positional embedding tensor (#batch, 2*time1-1, size).
mask(Tensor):
Mask tensor (#batch, 1, time2) or (#batch, time1, time2).
Returns:
Tensor: Output tensor (#batch, time1, d_model).
@ -228,10 +254,14 @@ class LegacyRelPositionMultiHeadedAttention(MultiHeadedAttention):
Paper: https://arxiv.org/abs/1901.02860
Args:
n_head (int): The number of heads.
n_feat (int): The number of features.
dropout_rate (float): Dropout rate.
zero_triu (bool): Whether to zero the upper triangular part of attention matrix.
n_head (int):
The number of heads.
n_feat (int):
The number of features.
dropout_rate (float):
Dropout rate.
zero_triu (bool):
Whether to zero the upper triangular part of attention matrix.
"""
def __init__(self, n_head, n_feat, dropout_rate, zero_triu=False):
@ -255,8 +285,8 @@ class LegacyRelPositionMultiHeadedAttention(MultiHeadedAttention):
def rel_shift(self, x):
"""Compute relative positional encoding.
Args:
x(Tensor): Input tensor (batch, head, time1, time2).
x(Tensor):
Input tensor (batch, head, time1, time2).
Returns:
Tensor: Output tensor.
"""

@ -37,28 +37,46 @@ class Decoder(nn.Layer):
"""Transfomer decoder module.
Args:
odim (int): Output diminsion.
self_attention_layer_type (str): Self-attention layer type.
attention_dim (int): Dimention of attention.
attention_heads (int): The number of heads of multi head attention.
conv_wshare (int): The number of kernel of convolution. Only used in
odim (int):
Output dimension.
self_attention_layer_type (str):
Self-attention layer type.
attention_dim (int):
Dimension of attention.
attention_heads (int):
The number of heads of multi head attention.
conv_wshare (int):
The number of kernel of convolution. Only used in
self_attention_layer_type == "lightconv*" or "dynamiconv*".
conv_kernel_length (Union[int, str]):Kernel size str of convolution
conv_kernel_length (Union[int, str]):
Kernel size str of convolution
(e.g. 71_71_71_71_71_71). Only used in self_attention_layer_type == "lightconv*" or "dynamiconv*".
conv_usebias (bool): Whether to use bias in convolution. Only used in
conv_usebias (bool):
Whether to use bias in convolution. Only used in
self_attention_layer_type == "lightconv*" or "dynamiconv*".
linear_units(int): The number of units of position-wise feed forward.
num_blocks (int): The number of decoder blocks.
dropout_rate (float): Dropout rate.
positional_dropout_rate (float): Dropout rate after adding positional encoding.
self_attention_dropout_rate (float): Dropout rate in self-attention.
src_attention_dropout_rate (float): Dropout rate in source-attention.
input_layer (Union[str, nn.Layer]): Input layer type.
use_output_layer (bool): Whether to use output layer.
pos_enc_class (nn.Layer): Positional encoding module class.
linear_units(int):
The number of units of position-wise feed forward.
num_blocks (int):
The number of decoder blocks.
dropout_rate (float):
Dropout rate.
positional_dropout_rate (float):
Dropout rate after adding positional encoding.
self_attention_dropout_rate (float):
Dropout rate in self-attention.
src_attention_dropout_rate (float):
Dropout rate in source-attention.
input_layer (Union[str, nn.Layer]):
Input layer type.
use_output_layer (bool):
Whether to use output layer.
pos_enc_class (nn.Layer):
Positional encoding module class.
`PositionalEncoding` or `ScaledPositionalEncoding`
normalize_before (bool): Whether to use layer_norm before the first block.
concat_after (bool): Whether to concat attention layer's input and output.
normalize_before (bool):
Whether to use layer_norm before the first block.
concat_after (bool):
Whether to concat attention layer's input and output.
if True, additional linear will be applied.
i.e. x -> x + linear(concat(x, att(x)))
if False, no additional linear will be applied. i.e. x -> x + att(x)
@ -143,17 +161,22 @@ class Decoder(nn.Layer):
def forward(self, tgt, tgt_mask, memory, memory_mask):
"""Forward decoder.
Args:
tgt(Tensor): Input token ids, int64 (#batch, maxlen_out) if input_layer == "embed".
tgt(Tensor):
Input token ids, int64 (#batch, maxlen_out) if input_layer == "embed".
In the other case, input tensor (#batch, maxlen_out, odim).
tgt_mask(Tensor): Input token mask (#batch, maxlen_out).
memory(Tensor): Encoded memory, float32 (#batch, maxlen_in, feat).
memory_mask(Tensor): Encoded memory mask (#batch, maxlen_in).
tgt_mask(Tensor):
Input token mask (#batch, maxlen_out).
memory(Tensor):
Encoded memory, float32 (#batch, maxlen_in, feat).
memory_mask(Tensor):
Encoded memory mask (#batch, maxlen_in).
Returns:
Tensor:
Decoded token score before softmax (#batch, maxlen_out, odim) if use_output_layer is True.
In the other case, final block outputs (#batch, maxlen_out, attention_dim).
Tensor: Score mask before softmax (#batch, maxlen_out).
Tensor:
Score mask before softmax (#batch, maxlen_out).
"""
x = self.embed(tgt)
@ -169,14 +192,20 @@ class Decoder(nn.Layer):
"""Forward one step.
Args:
tgt(Tensor): Input token ids, int64 (#batch, maxlen_out).
tgt_mask(Tensor): Input token mask (#batch, maxlen_out).
memory(Tensor): Encoded memory, float32 (#batch, maxlen_in, feat).
cache((List[Tensor]), optional): List of cached tensors. (Default value = None)
tgt(Tensor):
Input token ids, int64 (#batch, maxlen_out).
tgt_mask(Tensor):
Input token mask (#batch, maxlen_out).
memory(Tensor):
Encoded memory, float32 (#batch, maxlen_in, feat).
cache((List[Tensor]), optional):
List of cached tensors. (Default value = None)
Returns:
Tensor: Output tensor (batch, maxlen_out, odim).
List[Tensor]: List of cache tensors of each decoder layer.
Tensor:
Output tensor (batch, maxlen_out, odim).
List[Tensor]:
List of cache tensors of each decoder layer.
"""
x = self.embed(tgt)
@ -219,9 +248,12 @@ class Decoder(nn.Layer):
"""Score new token batch (required).
Args:
ys(Tensor): paddle.int64 prefix tokens (n_batch, ylen).
states(List[Any]): Scorer states for prefix tokens.
xs(Tensor): The encoder feature that generates ys (n_batch, xlen, n_feat).
ys(Tensor):
paddle.int64 prefix tokens (n_batch, ylen).
states(List[Any]):
Scorer states for prefix tokens.
xs(Tensor):
The encoder feature that generates ys (n_batch, xlen, n_feat).
Returns:
tuple[Tensor, List[Any]]:

@ -24,16 +24,23 @@ class DecoderLayer(nn.Layer):
Args:
size (int): Input dimension.
self_attn (nn.Layer): Self-attention module instance.
size (int):
Input dimension.
self_attn (nn.Layer):
Self-attention module instance.
`MultiHeadedAttention` instance can be used as the argument.
src_attn (nn.Layer): Self-attention module instance.
src_attn (nn.Layer):
Self-attention module instance.
`MultiHeadedAttention` instance can be used as the argument.
feed_forward (nn.Layer): Feed-forward module instance.
feed_forward (nn.Layer):
Feed-forward module instance.
`PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance can be used as the argument.
dropout_rate (float): Dropout rate.
normalize_before (bool): Whether to use layer_norm before the first block.
concat_after (bool): Whether to concat attention layer's input and output.
dropout_rate (float):
Dropout rate.
normalize_before (bool):
Whether to use layer_norm before the first block.
concat_after (bool):
Whether to concat attention layer's input and output.
if True, additional linear will be applied.
i.e. x -> x + linear(concat(x, att(x)))
if False, no additional linear will be applied. i.e. x -> x + att(x)
@ -69,11 +76,16 @@ class DecoderLayer(nn.Layer):
"""Compute decoded features.
Args:
tgt(Tensor): Input tensor (#batch, maxlen_out, size).
tgt_mask(Tensor): Mask for input tensor (#batch, maxlen_out).
memory(Tensor): Encoded memory, float32 (#batch, maxlen_in, size).
memory_mask(Tensor): Encoded memory mask (#batch, maxlen_in).
cache(List[Tensor], optional): List of cached tensors.
tgt(Tensor):
Input tensor (#batch, maxlen_out, size).
tgt_mask(Tensor):
Mask for input tensor (#batch, maxlen_out).
memory(Tensor):
Encoded memory, float32 (#batch, maxlen_in, size).
memory_mask(Tensor):
Encoded memory mask (#batch, maxlen_in).
cache(List[Tensor], optional):
List of cached tensors.
Each tensor shape should be (#batch, maxlen_out - 1, size). (Default value = None)
Returns:
Tensor

@ -23,11 +23,16 @@ class PositionalEncoding(nn.Layer):
"""Positional encoding.
Args:
d_model (int): Embedding dimension.
dropout_rate (float): Dropout rate.
max_len (int): Maximum input length.
reverse (bool): Whether to reverse the input position.
type (str): dtype of param
d_model (int):
Embedding dimension.
dropout_rate (float):
Dropout rate.
max_len (int):
Maximum input length.
reverse (bool):
Whether to reverse the input position.
type (str):
dtype of param
"""
def __init__(self,
@ -68,7 +73,8 @@ class PositionalEncoding(nn.Layer):
"""Add positional encoding.
Args:
x (Tensor): Input tensor (batch, time, `*`).
x (Tensor):
Input tensor (batch, time, `*`).
Returns:
Tensor: Encoded tensor (batch, time, `*`).
@ -84,10 +90,14 @@ class ScaledPositionalEncoding(PositionalEncoding):
See Sec. 3.2 https://arxiv.org/abs/1809.08895
Args:
d_model (int): Embedding dimension.
dropout_rate (float): Dropout rate.
max_len (int): Maximum input length.
dtype (str): dtype of param
d_model (int):
Embedding dimension.
dropout_rate (float):
Dropout rate.
max_len (int):
Maximum input length.
dtype (str):
dtype of param
"""
def __init__(self, d_model, dropout_rate, max_len=5000, dtype="float32"):
@ -111,7 +121,8 @@ class ScaledPositionalEncoding(PositionalEncoding):
"""Add positional encoding.
Args:
x (Tensor): Input tensor (batch, time, `*`).
x (Tensor):
Input tensor (batch, time, `*`).
Returns:
Tensor: Encoded tensor (batch, time, `*`).
"""
@ -127,9 +138,12 @@ class RelPositionalEncoding(nn.Layer):
See : Appendix B in https://arxiv.org/abs/1901.02860
Args:
d_model (int): Embedding dimension.
dropout_rate (float): Dropout rate.
max_len (int): Maximum input length.
d_model (int):
Embedding dimension.
dropout_rate (float):
Dropout rate.
max_len (int):
Maximum input length.
"""
def __init__(self, d_model, dropout_rate, max_len=5000, dtype="float32"):
@ -175,7 +189,8 @@ class RelPositionalEncoding(nn.Layer):
def forward(self, x: paddle.Tensor):
"""Add positional encoding.
Args:
x (Tensor):Input tensor (batch, time, `*`).
x (Tensor):
Input tensor (batch, time, `*`).
Returns:
Tensor: Encoded tensor (batch, time, `*`).
"""
@ -195,18 +210,24 @@ class LegacyRelPositionalEncoding(PositionalEncoding):
See : Appendix B in https://arxiv.org/abs/1901.02860
Args:
d_model (int): Embedding dimension.
dropout_rate (float): Dropout rate.
max_len (int): Maximum input length.
d_model (int):
Embedding dimension.
dropout_rate (float):
Dropout rate.
max_len (int):
Maximum input length.
"""
def __init__(self, d_model: int, dropout_rate: float, max_len: int=5000):
"""
Args:
d_model (int): Embedding dimension.
dropout_rate (float): Dropout rate.
max_len (int, optional): [Maximum input length.]. Defaults to 5000.
d_model (int):
Embedding dimension.
dropout_rate (float):
Dropout rate.
max_len (int, optional):
[Maximum input length.]. Defaults to 5000.
"""
super().__init__(d_model, dropout_rate, max_len, reverse=True)
@ -234,10 +255,13 @@ class LegacyRelPositionalEncoding(PositionalEncoding):
def forward(self, x: paddle.Tensor):
"""Compute positional encoding.
Args:
x (paddle.Tensor): Input tensor (batch, time, `*`).
x (Tensor):
Input tensor (batch, time, `*`).
Returns:
paddle.Tensor: Encoded tensor (batch, time, `*`).
paddle.Tensor: Positional embedding tensor (1, time, `*`).
Tensor:
Encoded tensor (batch, time, `*`).
Tensor:
Positional embedding tensor (1, time, `*`).
"""
self.extend_pe(x)
x = x * self.xscale

@ -38,32 +38,55 @@ class BaseEncoder(nn.Layer):
"""Base Encoder module.
Args:
idim (int): Input dimension.
attention_dim (int): Dimention of attention.
attention_heads (int): The number of heads of multi head attention.
linear_units (int): The number of units of position-wise feed forward.
num_blocks (int): The number of decoder blocks.
dropout_rate (float): Dropout rate.
positional_dropout_rate (float): Dropout rate after adding positional encoding.
attention_dropout_rate (float): Dropout rate in attention.
input_layer (Union[str, nn.Layer]): Input layer type.
normalize_before (bool): Whether to use layer_norm before the first block.
concat_after (bool): Whether to concat attention layer's input and output.
idim (int):
Input dimension.
attention_dim (int):
Dimension of attention.
attention_heads (int):
The number of heads of multi head attention.
linear_units (int):
The number of units of position-wise feed forward.
num_blocks (int):
The number of decoder blocks.
dropout_rate (float):
Dropout rate.
positional_dropout_rate (float):
Dropout rate after adding positional encoding.
attention_dropout_rate (float):
Dropout rate in attention.
input_layer (Union[str, nn.Layer]):
Input layer type.
normalize_before (bool):
Whether to use layer_norm before the first block.
concat_after (bool):
Whether to concat attention layer's input and output.
if True, additional linear will be applied.
i.e. x -> x + linear(concat(x, att(x)))
if False, no additional linear will be applied. i.e. x -> x + att(x)
positionwise_layer_type (str): "linear", "conv1d", or "conv1d-linear".
positionwise_conv_kernel_size (int): Kernel size of positionwise conv1d layer.
macaron_style (bool): Whether to use macaron style for positionwise layer.
pos_enc_layer_type (str): Encoder positional encoding layer type.
selfattention_layer_type (str): Encoder attention layer type.
activation_type (str): Encoder activation function type.
use_cnn_module (bool): Whether to use convolution module.
zero_triu (bool): Whether to zero the upper triangular part of attention matrix.
cnn_module_kernel (int): Kernerl size of convolution module.
padding_idx (int): Padding idx for input_layer=embed.
stochastic_depth_rate (float): Maximum probability to skip the encoder layer.
intermediate_layers (Union[List[int], None]): indices of intermediate CTC layer.
positionwise_layer_type (str):
"linear", "conv1d", or "conv1d-linear".
positionwise_conv_kernel_size (int):
Kernel size of positionwise conv1d layer.
macaron_style (bool):
Whether to use macaron style for positionwise layer.
pos_enc_layer_type (str):
Encoder positional encoding layer type.
selfattention_layer_type (str):
Encoder attention layer type.
activation_type (str):
Encoder activation function type.
use_cnn_module (bool):
Whether to use convolution module.
zero_triu (bool):
Whether to zero the upper triangular part of attention matrix.
cnn_module_kernel (int):
Kernel size of convolution module.
padding_idx (int):
Padding idx for input_layer=embed.
stochastic_depth_rate (float):
Maximum probability to skip the encoder layer.
intermediate_layers (Union[List[int], None]):
indices of intermediate CTC layer.
indices start from 1.
if not None, intermediate outputs are returned (which changes return type
signature.)
@ -266,12 +289,16 @@ class BaseEncoder(nn.Layer):
"""Encode input sequence.
Args:
xs (Tensor): Input tensor (#batch, time, idim).
masks (Tensor): Mask tensor (#batch, 1, time).
xs (Tensor):
Input tensor (#batch, time, idim).
masks (Tensor):
Mask tensor (#batch, 1, time).
Returns:
Tensor: Output tensor (#batch, time, attention_dim).
Tensor: Mask tensor (#batch, 1, time).
Tensor:
Output tensor (#batch, time, attention_dim).
Tensor:
Mask tensor (#batch, 1, time).
"""
xs = self.embed(xs)
xs, masks = self.encoders(xs, masks)
@ -284,26 +311,43 @@ class TransformerEncoder(BaseEncoder):
"""Transformer encoder module.
Args:
idim (int): Input dimension.
attention_dim (int): Dimention of attention.
attention_heads (int): The number of heads of multi head attention.
linear_units (int): The number of units of position-wise feed forward.
num_blocks (int): The number of decoder blocks.
dropout_rate (float): Dropout rate.
positional_dropout_rate (float): Dropout rate after adding positional encoding.
attention_dropout_rate (float): Dropout rate in attention.
input_layer (Union[str, paddle.nn.Layer]): Input layer type.
pos_enc_layer_type (str): Encoder positional encoding layer type.
normalize_before (bool): Whether to use layer_norm before the first block.
concat_after (bool): Whether to concat attention layer's input and output.
idim (int):
Input dimension.
attention_dim (int):
Dimension of attention.
attention_heads (int):
The number of heads of multi head attention.
linear_units (int):
The number of units of position-wise feed forward.
num_blocks (int):
The number of decoder blocks.
dropout_rate (float):
Dropout rate.
positional_dropout_rate (float):
Dropout rate after adding positional encoding.
attention_dropout_rate (float):
Dropout rate in attention.
input_layer (Union[str, paddle.nn.Layer]):
Input layer type.
pos_enc_layer_type (str):
Encoder positional encoding layer type.
normalize_before (bool):
Whether to use layer_norm before the first block.
concat_after (bool):
Whether to concat attention layer's input and output.
if True, additional linear will be applied.
i.e. x -> x + linear(concat(x, att(x)))
if False, no additional linear will be applied. i.e. x -> x + att(x)
positionwise_layer_type (str): "linear", "conv1d", or "conv1d-linear".
positionwise_conv_kernel_size (int): Kernel size of positionwise conv1d layer.
selfattention_layer_type (str): Encoder attention layer type.
activation_type (str): Encoder activation function type.
padding_idx (int): Padding idx for input_layer=embed.
positionwise_layer_type (str):
"linear", "conv1d", or "conv1d-linear".
positionwise_conv_kernel_size (int):
Kernel size of positionwise conv1d layer.
selfattention_layer_type (str):
Encoder attention layer type.
activation_type (str):
Encoder activation function type.
padding_idx (int):
Padding idx for input_layer=embed.
"""
def __init__(
@ -350,12 +394,16 @@ class TransformerEncoder(BaseEncoder):
"""Encoder input sequence.
Args:
xs(Tensor): Input tensor (#batch, time, idim).
masks(Tensor): Mask tensor (#batch, 1, time).
xs(Tensor):
Input tensor (#batch, time, idim).
masks(Tensor):
Mask tensor (#batch, 1, time).
Returns:
Tensor: Output tensor (#batch, time, attention_dim).
Tensor: Mask tensor (#batch, 1, time).
Tensor:
Output tensor (#batch, time, attention_dim).
Tensor:
Mask tensor (#batch, 1, time).
"""
xs = self.embed(xs)
xs, masks = self.encoders(xs, masks)
@ -367,14 +415,20 @@ class TransformerEncoder(BaseEncoder):
"""Encode input frame.
Args:
xs (Tensor): Input tensor.
masks (Tensor): Mask tensor.
cache (List[Tensor]): List of cache tensors.
xs (Tensor):
Input tensor.
masks (Tensor):
Mask tensor.
cache (List[Tensor]):
List of cache tensors.
Returns:
Tensor: Output tensor.
Tensor: Mask tensor.
List[Tensor]: List of new cache tensors.
Tensor:
Output tensor.
Tensor:
Mask tensor.
List[Tensor]:
List of new cache tensors.
"""
xs = self.embed(xs)
@ -393,32 +447,55 @@ class ConformerEncoder(BaseEncoder):
"""Conformer encoder module.
Args:
idim (int): Input dimension.
attention_dim (int): Dimention of attention.
attention_heads (int): The number of heads of multi head attention.
linear_units (int): The number of units of position-wise feed forward.
num_blocks (int): The number of decoder blocks.
dropout_rate (float): Dropout rate.
positional_dropout_rate (float): Dropout rate after adding positional encoding.
attention_dropout_rate (float): Dropout rate in attention.
input_layer (Union[str, nn.Layer]): Input layer type.
normalize_before (bool): Whether to use layer_norm before the first block.
concat_after (bool):Whether to concat attention layer's input and output.
idim (int):
Input dimension.
attention_dim (int):
Dimension of attention.
attention_heads (int):
The number of heads of multi head attention.
linear_units (int):
The number of units of position-wise feed forward.
num_blocks (int):
The number of decoder blocks.
dropout_rate (float):
Dropout rate.
positional_dropout_rate (float):
Dropout rate after adding positional encoding.
attention_dropout_rate (float):
Dropout rate in attention.
input_layer (Union[str, nn.Layer]):
Input layer type.
normalize_before (bool):
Whether to use layer_norm before the first block.
concat_after (bool):
Whether to concat attention layer's input and output.
if True, additional linear will be applied.
i.e. x -> x + linear(concat(x, att(x)))
if False, no additional linear will be applied. i.e. x -> x + att(x)
positionwise_layer_type (str): "linear", "conv1d", or "conv1d-linear".
positionwise_conv_kernel_size (int): Kernel size of positionwise conv1d layer.
macaron_style (bool): Whether to use macaron style for positionwise layer.
pos_enc_layer_type (str): Encoder positional encoding layer type.
selfattention_layer_type (str): Encoder attention layer type.
activation_type (str): Encoder activation function type.
use_cnn_module (bool): Whether to use convolution module.
zero_triu (bool): Whether to zero the upper triangular part of attention matrix.
cnn_module_kernel (int): Kernerl size of convolution module.
padding_idx (int): Padding idx for input_layer=embed.
stochastic_depth_rate (float): Maximum probability to skip the encoder layer.
intermediate_layers (Union[List[int], None]):indices of intermediate CTC layer. indices start from 1.
positionwise_layer_type (str):
"linear", "conv1d", or "conv1d-linear".
positionwise_conv_kernel_size (int):
Kernel size of positionwise conv1d layer.
macaron_style (bool):
Whether to use macaron style for positionwise layer.
pos_enc_layer_type (str):
Encoder positional encoding layer type.
selfattention_layer_type (str):
Encoder attention layer type.
activation_type (str):
Encoder activation function type.
use_cnn_module (bool):
Whether to use convolution module.
zero_triu (bool):
Whether to zero the upper triangular part of attention matrix.
cnn_module_kernel (int):
Kernel size of convolution module.
padding_idx (int):
Padding idx for input_layer=embed.
stochastic_depth_rate (float):
Maximum probability to skip the encoder layer.
intermediate_layers (Union[List[int], None]):
indices of intermediate CTC layer. indices start from 1.
if not None, intermediate outputs are returned (which changes return type signature.)
"""
@ -478,11 +555,15 @@ class ConformerEncoder(BaseEncoder):
"""Encode input sequence.
Args:
xs (Tensor): Input tensor (#batch, time, idim).
masks (Tensor): Mask tensor (#batch, 1, time).
xs (Tensor):
Input tensor (#batch, time, idim).
masks (Tensor):
Mask tensor (#batch, 1, time).
Returns:
Tensor: Output tensor (#batch, time, attention_dim).
Tensor: Mask tensor (#batch, 1, time).
Tensor:
Output tensor (#batch, time, attention_dim).
Tensor:
Mask tensor (#batch, 1, time).
"""
if isinstance(self.embed, (Conv2dSubsampling)):
xs, masks = self.embed(xs, masks)
@ -539,7 +620,8 @@ class Conv1dResidualBlock(nn.Layer):
def forward(self, xs):
"""Encode input sequence.
Args:
xs (Tensor): Input tensor (#batch, idim, T).
xs (Tensor):
Input tensor (#batch, idim, T).
Returns:
Tensor: Output tensor (#batch, odim, T).
"""
@ -582,8 +664,10 @@ class CNNDecoder(nn.Layer):
def forward(self, xs, masks=None):
"""Encode input sequence.
Args:
xs (Tensor): Input tensor (#batch, time, idim).
masks (Tensor): Mask tensor (#batch, 1, time).
xs (Tensor):
Input tensor (#batch, time, idim).
masks (Tensor):
Mask tensor (#batch, 1, time).
Returns:
Tensor: Output tensor (#batch, time, odim).
"""
@ -629,8 +713,10 @@ class CNNPostnet(nn.Layer):
def forward(self, xs, masks=None):
"""Encode input sequence.
Args:
xs (Tensor): Input tensor (#batch, odim, time).
masks (Tensor): Mask tensor (#batch, 1, time).
xs (Tensor):
Input tensor (#batch, odim, time).
masks (Tensor):
Mask tensor (#batch, 1, time).
Returns:
Tensor: Output tensor (#batch, odim, time).
"""

@ -21,14 +21,20 @@ class EncoderLayer(nn.Layer):
"""Encoder layer module.
Args:
size (int): Input dimension.
self_attn (nn.Layer): Self-attention module instance.
size (int):
Input dimension.
self_attn (nn.Layer):
Self-attention module instance.
`MultiHeadedAttention` instance can be used as the argument.
feed_forward (nn.Layer): Feed-forward module instance.
feed_forward (nn.Layer):
Feed-forward module instance.
`PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance can be used as the argument.
dropout_rate (float): Dropout rate.
normalize_before (bool): Whether to use layer_norm before the first block.
concat_after (bool): Whether to concat attention layer's input and output.
dropout_rate (float):
Dropout rate.
normalize_before (bool):
Whether to use layer_norm before the first block.
concat_after (bool):
Whether to concat attention layer's input and output.
if True, additional linear will be applied.
i.e. x -> x + linear(concat(x, att(x)))
if False, no additional linear will be applied. i.e. x -> x + att(x)
@ -59,13 +65,18 @@ class EncoderLayer(nn.Layer):
"""Compute encoded features.
Args:
x(Tensor): Input tensor (#batch, time, size).
mask(Tensor): Mask tensor for the input (#batch, time).
cache(Tensor, optional): Cache tensor of the input (#batch, time - 1, size).
x(Tensor):
Input tensor (#batch, time, size).
mask(Tensor):
Mask tensor for the input (#batch, time).
cache(Tensor, optional):
Cache tensor of the input (#batch, time - 1, size).
Returns:
Tensor: Output tensor (#batch, time, size).
Tensor: Mask tensor (#batch, time).
Tensor:
Output tensor (#batch, time, size).
Tensor:
Mask tensor (#batch, time).
"""
residual = x
if self.normalize_before:

@ -31,12 +31,18 @@ class LightweightConvolution(nn.Layer):
https://github.com/pytorch/fairseq/tree/master/fairseq
Args:
wshare (int): the number of kernel of convolution
n_feat (int): the number of features
dropout_rate (float): dropout_rate
kernel_size (int): kernel size (length)
use_kernel_mask (bool): Use causal mask or not for convolution kernel
use_bias (bool): Use bias term or not.
wshare (int):
the number of kernel of convolution
n_feat (int):
the number of features
dropout_rate (float):
dropout_rate
kernel_size (int):
kernel size (length)
use_kernel_mask (bool):
Use causal mask or not for convolution kernel
use_bias (bool):
Use bias term or not.
"""
@ -94,10 +100,14 @@ class LightweightConvolution(nn.Layer):
This is just for compatibility with self-attention layer (attention.py)
Args:
query (Tensor): input tensor. (batch, time1, d_model)
key (Tensor): NOT USED. (batch, time2, d_model)
value (Tensor): NOT USED. (batch, time2, d_model)
mask : (Tensor): (batch, time1, time2) mask
query (Tensor):
input tensor. (batch, time1, d_model)
key (Tensor):
NOT USED. (batch, time2, d_model)
value (Tensor):
NOT USED. (batch, time2, d_model)
mask (Tensor):
(batch, time1, time2) mask
Return:
Tensor: output. (batch, time1, d_model)

@ -19,8 +19,10 @@ def subsequent_mask(size, dtype=paddle.bool):
"""Create mask for subsequent steps (size, size).
Args:
size (int): size of mask
dtype (paddle.dtype): result dtype
size (int):
size of mask
dtype (paddle.dtype):
result dtype
Return:
Tensor:
>>> subsequent_mask(3)
@ -36,9 +38,12 @@ def target_mask(ys_in_pad, ignore_id, dtype=paddle.bool):
"""Create mask for decoder self-attention.
Args:
ys_pad (Tensor): batch of padded target sequences (B, Lmax)
ignore_id (int): index of padding
dtype (paddle.dtype): result dtype
ys_pad (Tensor):
batch of padded target sequences (B, Lmax)
ignore_id (int):
index of padding
dtype (paddle.dtype):
result dtype
Return:
Tensor: (B, Lmax, Lmax)
"""

@ -32,10 +32,14 @@ class MultiLayeredConv1d(nn.Layer):
"""Initialize MultiLayeredConv1d module.
Args:
in_chans (int): Number of input channels.
hidden_chans (int): Number of hidden channels.
kernel_size (int): Kernel size of conv1d.
dropout_rate (float): Dropout rate.
in_chans (int):
Number of input channels.
hidden_chans (int):
Number of hidden channels.
kernel_size (int):
Kernel size of conv1d.
dropout_rate (float):
Dropout rate.
"""
super().__init__()
@ -58,7 +62,8 @@ class MultiLayeredConv1d(nn.Layer):
"""Calculate forward propagation.
Args:
x (Tensor): Batch of input tensors (B, T, in_chans).
x (Tensor):
Batch of input tensors (B, T, in_chans).
Returns:
Tensor: Batch of output tensors (B, T, in_chans).
@ -79,10 +84,14 @@ class Conv1dLinear(nn.Layer):
"""Initialize Conv1dLinear module.
Args:
in_chans (int): Number of input channels.
hidden_chans (int): Number of hidden channels.
kernel_size (int): Kernel size of conv1d.
dropout_rate (float): Dropout rate.
in_chans (int):
Number of input channels.
hidden_chans (int):
Number of hidden channels.
kernel_size (int):
Kernel size of conv1d.
dropout_rate (float):
Dropout rate.
"""
super().__init__()
self.w_1 = nn.Conv1D(
@ -99,7 +108,8 @@ class Conv1dLinear(nn.Layer):
"""Calculate forward propagation.
Args:
x (Tensor): Batch of input tensors (B, T, in_chans).
x (Tensor):
Batch of input tensors (B, T, in_chans).
Returns:
Tensor: Batch of output tensors (B, T, in_chans).

@ -21,9 +21,12 @@ class PositionwiseFeedForward(nn.Layer):
"""Positionwise feed forward layer.
Args:
idim (int): Input dimenstion.
hidden_units (int): The number of hidden units.
dropout_rate (float): Dropout rate.
idim (int):
Input dimension.
hidden_units (int):
The number of hidden units.
dropout_rate (float):
Dropout rate.
"""
def __init__(self,

@ -30,8 +30,10 @@ def repeat(N, fn):
"""Repeat module N times.
Args:
N (int): Number of repeat time.
fn (Callable): Function to generate module.
N (int):
Number of repeat time.
fn (Callable):
Function to generate module.
Returns:
MultiSequential: Repeated model instance.

@ -23,10 +23,14 @@ class Conv2dSubsampling(nn.Layer):
"""Convolutional 2D subsampling (to 1/4 length).
Args:
idim (int): Input dimension.
odim (int): Output dimension.
dropout_rate (float): Dropout rate.
pos_enc (nn.Layer): Custom position encoding layer.
idim (int):
Input dimension.
odim (int):
Output dimension.
dropout_rate (float):
Dropout rate.
pos_enc (nn.Layer):
Custom position encoding layer.
"""
def __init__(self, idim, odim, dropout_rate, pos_enc=None):
@ -45,11 +49,15 @@ class Conv2dSubsampling(nn.Layer):
def forward(self, x, x_mask):
"""Subsample x.
Args:
x (Tensor): Input tensor (#batch, time, idim).
x_mask (Tensor): Input mask (#batch, 1, time).
x (Tensor):
Input tensor (#batch, time, idim).
x_mask (Tensor):
Input mask (#batch, 1, time).
Returns:
Tensor: Subsampled tensor (#batch, time', odim), where time' = time // 4.
Tensor: Subsampled mask (#batch, 1, time'), where time' = time // 4.
Tensor:
Subsampled tensor (#batch, time', odim), where time' = time // 4.
Tensor:
Subsampled mask (#batch, 1, time'), where time' = time // 4.
"""
# (b, c, t, f)
x = x.unsqueeze(1)

@ -28,9 +28,12 @@ class Stretch2D(nn.Layer):
"""Stretch an image (or image-like object) with some interpolation.
Args:
w_scale (int): Scalar of width.
h_scale (int): Scalar of the height.
mode (str, optional): Interpolation mode, modes suppored are "nearest", "bilinear",
w_scale (int):
Scalar of width.
h_scale (int):
Scalar of the height.
mode (str, optional):
Interpolation mode, modes supported are "nearest", "bilinear",
"trilinear", "bicubic", "linear" and "area", by default "nearest"
For more details about interpolation, see
`paddle.nn.functional.interpolate <https://www.paddlepaddle.org.cn/documentation/docs/en/api/paddle/nn/functional/interpolate_en.html>`_.
@ -44,11 +47,12 @@ class Stretch2D(nn.Layer):
"""
Args:
x (Tensor): Shape (N, C, H, W)
x (Tensor):
Shape (N, C, H, W)
Returns:
Tensor: The stretched image.
Shape (N, C, H', W'), where ``H'=h_scale * H``, ``W'=w_scale * W``.
Tensor:
The stretched image. Shape (N, C, H', W'), where ``H'=h_scale * H``, ``W'=w_scale * W``.
"""
out = F.interpolate(
@ -61,12 +65,18 @@ class UpsampleNet(nn.Layer):
convolutions.
Args:
upsample_scales (List[int]): Upsampling factors for each strech.
nonlinear_activation (Optional[str], optional): Activation after each convolution, by default None
nonlinear_activation_params (Dict[str, Any], optional): Parameters passed to construct the activation, by default {}
interpolate_mode (str, optional): Interpolation mode of the strech, by default "nearest"
freq_axis_kernel_size (int, optional): Convolution kernel size along the frequency axis, by default 1
use_causal_conv (bool, optional): Whether to use causal padding before convolution, by default False
upsample_scales (List[int]):
Upsampling factors for each stretch.
nonlinear_activation (Optional[str], optional):
Activation after each convolution, by default None
nonlinear_activation_params (Dict[str, Any], optional):
Parameters passed to construct the activation, by default {}
interpolate_mode (str, optional):
Interpolation mode of the stretch, by default "nearest"
freq_axis_kernel_size (int, optional):
Convolution kernel size along the frequency axis, by default 1
use_causal_conv (bool, optional):
Whether to use causal padding before convolution, by default False
If True, Causal padding is used along the time axis,
i.e. padding amount is ``receptive field - 1`` and 0 for before and after, respectively.
If False, "same" padding is used along the time axis.
@ -106,7 +116,8 @@ class UpsampleNet(nn.Layer):
def forward(self, c):
"""
Args:
c (Tensor): spectrogram. Shape (N, F, T)
c (Tensor):
spectrogram. Shape (N, F, T)
Returns:
Tensor: upsampled spectrogram.
@ -126,17 +137,25 @@ class ConvInUpsampleNet(nn.Layer):
UpsampleNet.
Args:
upsample_scales (List[int]): Upsampling factors for each strech.
nonlinear_activation (Optional[str], optional): Activation after each convolution, by default None
nonlinear_activation_params (Dict[str, Any], optional): Parameters passed to construct the activation, by default {}
interpolate_mode (str, optional): Interpolation mode of the strech, by default "nearest"
freq_axis_kernel_size (int, optional): Convolution kernel size along the frequency axis, by default 1
aux_channels (int, optional): Feature size of the input, by default 80
aux_context_window (int, optional): Context window of the first 1D convolution applied to the input. It
upsample_scales (List[int]):
Upsampling factors for each stretch.
nonlinear_activation (Optional[str], optional):
Activation after each convolution, by default None
nonlinear_activation_params (Dict[str, Any], optional):
Parameters passed to construct the activation, by default {}
interpolate_mode (str, optional):
Interpolation mode of the stretch, by default "nearest"
freq_axis_kernel_size (int, optional):
Convolution kernel size along the frequency axis, by default 1
aux_channels (int, optional):
Feature size of the input, by default 80
aux_context_window (int, optional):
Context window of the first 1D convolution applied to the input. It
related to the kernel size of the convolution, by default 0
If use causal convolution, the kernel size is ``window + 1``,
else the kernel size is ``2 * window + 1``.
use_causal_conv (bool, optional): Whether to use causal padding before convolution, by default False
use_causal_conv (bool, optional):
Whether to use causal padding before convolution, by default False
If True, Causal padding is used along the time axis, i.e. padding
amount is ``receptive field - 1`` and 0 for before and after, respectively.
If False, "same" padding is used along the time axis.
@ -171,7 +190,8 @@ class ConvInUpsampleNet(nn.Layer):
def forward(self, c):
"""
Args:
c (Tensor): spectrogram. Shape (N, F, T)
c (Tensor):
spectrogram. Shape (N, F, T)
Returns:
Tensors: upsampled spectrogram. Shape (N, F, T'), where ``T' = upsample_factor * T``,

@ -58,8 +58,10 @@ class ExperimentBase(object):
need.
Args:
config (yacs.config.CfgNode): The configuration used for the experiment.
args (argparse.Namespace): The parsed command line arguments.
config (yacs.config.CfgNode):
The configuration used for the experiment.
args (argparse.Namespace):
The parsed command line arguments.
Examples:
>>> def main_sp(config, args):

@ -25,7 +25,8 @@ def _load_latest_checkpoint(checkpoint_dir: str) -> int:
"""Get the iteration number corresponding to the latest saved checkpoint.
Args:
checkpoint_dir (str): the directory where checkpoint is saved.
checkpoint_dir (str):
the directory where checkpoint is saved.
Returns:
int: the latest iteration number.
@ -46,8 +47,10 @@ def _save_checkpoint(checkpoint_dir: str, iteration: int):
"""Save the iteration number of the latest model to be checkpointed.
Args:
checkpoint_dir (str): the directory where checkpoint is saved.
iteration (int): the latest iteration number.
checkpoint_dir (str):
the directory where checkpoint is saved.
iteration (int):
the latest iteration number.
Returns:
None
@ -65,11 +68,14 @@ def load_parameters(model,
"""Load a specific model checkpoint from disk.
Args:
model (Layer): model to load parameters.
optimizer (Optimizer, optional): optimizer to load states if needed.
Defaults to None.
checkpoint_dir (str, optional): the directory where checkpoint is saved.
checkpoint_path (str, optional): if specified, load the checkpoint
model (Layer):
model to load parameters.
optimizer (Optimizer, optional):
optimizer to load states if needed. Defaults to None.
checkpoint_dir (str, optional):
the directory where checkpoint is saved.
checkpoint_path (str, optional):
if specified, load the checkpoint
stored in the checkpoint_path and the argument 'checkpoint_dir' will
be ignored. Defaults to None.
@ -113,11 +119,14 @@ def save_parameters(checkpoint_dir, iteration, model, optimizer=None):
"""Checkpoint the latest trained model parameters.
Args:
checkpoint_dir (str): the directory where checkpoint is saved.
iteration (int): the latest iteration number.
model (Layer): model to be checkpointed.
optimizer (Optimizer, optional): optimizer to be checkpointed.
Defaults to None.
checkpoint_dir (str):
the directory where checkpoint is saved.
iteration (int):
the latest iteration number.
model (Layer):
model to be checkpointed.
optimizer (Optimizer, optional):
optimizer to be checkpointed. Defaults to None.
Returns:
None

@ -71,10 +71,14 @@ def word_errors(reference, hypothesis, ignore_case=False, delimiter=' '):
hypothesis sequence in word-level.
Args:
reference (str): The reference sentence.
hypothesis (str): The hypothesis sentence.
ignore_case (bool): Whether case-sensitive or not.
delimiter (char(str)): Delimiter of input sentences.
reference (str):
The reference sentence.
hypothesis (str):
The hypothesis sentence.
ignore_case (bool):
Whether case-sensitive or not.
delimiter (char(str)):
Delimiter of input sentences.
Returns:
list: Levenshtein distance and word number of reference sentence.

@ -24,8 +24,10 @@ import numpy as np
def read_hdf5(filename: Union[Path, str], dataset_name: str) -> Any:
"""Read a dataset from a HDF5 file.
Args:
filename (Union[Path, str]): Path of the HDF5 file.
dataset_name (str): Name of the dataset to read.
filename (Union[Path, str]):
Path of the HDF5 file.
dataset_name (str):
Name of the dataset to read.
Returns:
Any: The retrieved dataset.

@ -22,7 +22,8 @@ def convert_dtype_to_np_dtype_(dtype):
Convert paddle's data type to corrsponding numpy data type.
Args:
dtype(np.dtype): the data type in paddle.
dtype(np.dtype):
the data type in paddle.
Returns:
type: the data type in numpy.

@ -76,7 +76,7 @@ server = [
"fastapi",
"uvicorn",
"pattern_singleton",
"websockets",
"websockets"
]
requirements = {

Loading…
Cancel
Save