|
|
|
@ -97,81 +97,104 @@ class VITSGenerator(nn.Layer):
|
|
|
|
|
stochastic_duration_predictor_dds_conv_layers: int=3, ):
|
|
|
|
|
"""Initialize VITS generator module.
|
|
|
|
|
Args:
|
|
|
|
|
vocabs (int): Input vocabulary size.
|
|
|
|
|
aux_channels (int): Number of acoustic feature channels.
|
|
|
|
|
hidden_channels (int): Number of hidden channels.
|
|
|
|
|
spks (Optional[int]): Number of speakers. If set to > 1, assume that the
|
|
|
|
|
vocabs (int):
|
|
|
|
|
Input vocabulary size.
|
|
|
|
|
aux_channels (int):
|
|
|
|
|
Number of acoustic feature channels.
|
|
|
|
|
hidden_channels (int):
|
|
|
|
|
Number of hidden channels.
|
|
|
|
|
spks (Optional[int]):
|
|
|
|
|
Number of speakers. If set to > 1, assume that the
|
|
|
|
|
sids will be provided as the input and use sid embedding layer.
|
|
|
|
|
langs (Optional[int]): Number of languages. If set to > 1, assume that the
|
|
|
|
|
langs (Optional[int]):
|
|
|
|
|
Number of languages. If set to > 1, assume that the
|
|
|
|
|
lids will be provided as the input and use lid embedding layer.
|
|
|
|
|
spk_embed_dim (Optional[int]): Speaker embedding dimension. If set to > 0,
|
|
|
|
|
spk_embed_dim (Optional[int]):
|
|
|
|
|
Speaker embedding dimension. If set to > 0,
|
|
|
|
|
assume that spembs will be provided as the input.
|
|
|
|
|
global_channels (int): Number of global conditioning channels.
|
|
|
|
|
segment_size (int): Segment size for decoder.
|
|
|
|
|
text_encoder_attention_heads (int): Number of heads in conformer block
|
|
|
|
|
of text encoder.
|
|
|
|
|
text_encoder_ffn_expand (int): Expansion ratio of FFN in conformer block
|
|
|
|
|
of text encoder.
|
|
|
|
|
text_encoder_blocks (int): Number of conformer blocks in text encoder.
|
|
|
|
|
text_encoder_positionwise_layer_type (str): Position-wise layer type in
|
|
|
|
|
conformer block of text encoder.
|
|
|
|
|
text_encoder_positionwise_conv_kernel_size (int): Position-wise convolution
|
|
|
|
|
kernel size in conformer block of text encoder. Only used when the
|
|
|
|
|
above layer type is conv1d or conv1d-linear.
|
|
|
|
|
text_encoder_positional_encoding_layer_type (str): Positional encoding layer
|
|
|
|
|
type in conformer block of text encoder.
|
|
|
|
|
text_encoder_self_attention_layer_type (str): Self-attention layer type in
|
|
|
|
|
conformer block of text encoder.
|
|
|
|
|
text_encoder_activation_type (str): Activation function type in conformer
|
|
|
|
|
block of text encoder.
|
|
|
|
|
text_encoder_normalize_before (bool): Whether to apply layer norm before
|
|
|
|
|
self-attention in conformer block of text encoder.
|
|
|
|
|
text_encoder_dropout_rate (float): Dropout rate in conformer block of
|
|
|
|
|
text encoder.
|
|
|
|
|
text_encoder_positional_dropout_rate (float): Dropout rate for positional
|
|
|
|
|
encoding in conformer block of text encoder.
|
|
|
|
|
text_encoder_attention_dropout_rate (float): Dropout rate for attention in
|
|
|
|
|
conformer block of text encoder.
|
|
|
|
|
text_encoder_conformer_kernel_size (int): Conformer conv kernel size. It
|
|
|
|
|
will be used only when use_conformer_conv_in_text_encoder = True.
|
|
|
|
|
use_macaron_style_in_text_encoder (bool): Whether to use macaron style FFN
|
|
|
|
|
in conformer block of text encoder.
|
|
|
|
|
use_conformer_conv_in_text_encoder (bool): Whether to use convolution in
|
|
|
|
|
conformer block of text encoder.
|
|
|
|
|
decoder_kernel_size (int): Decoder kernel size.
|
|
|
|
|
decoder_channels (int): Number of decoder initial channels.
|
|
|
|
|
decoder_upsample_scales (List[int]): List of upsampling scales in decoder.
|
|
|
|
|
decoder_upsample_kernel_sizes (List[int]): List of kernel size for
|
|
|
|
|
upsampling layers in decoder.
|
|
|
|
|
decoder_resblock_kernel_sizes (List[int]): List of kernel size for resblocks
|
|
|
|
|
in decoder.
|
|
|
|
|
decoder_resblock_dilations (List[List[int]]): List of list of dilations for
|
|
|
|
|
resblocks in decoder.
|
|
|
|
|
use_weight_norm_in_decoder (bool): Whether to apply weight normalization in
|
|
|
|
|
decoder.
|
|
|
|
|
posterior_encoder_kernel_size (int): Posterior encoder kernel size.
|
|
|
|
|
posterior_encoder_layers (int): Number of layers of posterior encoder.
|
|
|
|
|
posterior_encoder_stacks (int): Number of stacks of posterior encoder.
|
|
|
|
|
posterior_encoder_base_dilation (int): Base dilation of posterior encoder.
|
|
|
|
|
posterior_encoder_dropout_rate (float): Dropout rate for posterior encoder.
|
|
|
|
|
use_weight_norm_in_posterior_encoder (bool): Whether to apply weight
|
|
|
|
|
normalization in posterior encoder.
|
|
|
|
|
flow_flows (int): Number of flows in flow.
|
|
|
|
|
flow_kernel_size (int): Kernel size in flow.
|
|
|
|
|
flow_base_dilation (int): Base dilation in flow.
|
|
|
|
|
flow_layers (int): Number of layers in flow.
|
|
|
|
|
flow_dropout_rate (float): Dropout rate in flow.
|
|
|
|
|
use_weight_norm_in_flow (bool): Whether to apply weight normalization in
|
|
|
|
|
flow.
|
|
|
|
|
use_only_mean_in_flow (bool): Whether to use only mean in flow.
|
|
|
|
|
stochastic_duration_predictor_kernel_size (int): Kernel size in stochastic
|
|
|
|
|
duration predictor.
|
|
|
|
|
stochastic_duration_predictor_dropout_rate (float): Dropout rate in
|
|
|
|
|
stochastic duration predictor.
|
|
|
|
|
stochastic_duration_predictor_flows (int): Number of flows in stochastic
|
|
|
|
|
duration predictor.
|
|
|
|
|
stochastic_duration_predictor_dds_conv_layers (int): Number of DDS conv
|
|
|
|
|
layers in stochastic duration predictor.
|
|
|
|
|
global_channels (int):
|
|
|
|
|
Number of global conditioning channels.
|
|
|
|
|
segment_size (int):
|
|
|
|
|
Segment size for decoder.
|
|
|
|
|
text_encoder_attention_heads (int):
|
|
|
|
|
Number of heads in conformer block of text encoder.
|
|
|
|
|
text_encoder_ffn_expand (int):
|
|
|
|
|
Expansion ratio of FFN in conformer block of text encoder.
|
|
|
|
|
text_encoder_blocks (int):
|
|
|
|
|
Number of conformer blocks in text encoder.
|
|
|
|
|
text_encoder_positionwise_layer_type (str):
|
|
|
|
|
Position-wise layer type in conformer block of text encoder.
|
|
|
|
|
text_encoder_positionwise_conv_kernel_size (int):
|
|
|
|
|
Position-wise convolution kernel size in conformer block of text encoder.
|
|
|
|
|
Only used when the above layer type is conv1d or conv1d-linear.
|
|
|
|
|
text_encoder_positional_encoding_layer_type (str):
|
|
|
|
|
Positional encoding layer type in conformer block of text encoder.
|
|
|
|
|
text_encoder_self_attention_layer_type (str):
|
|
|
|
|
Self-attention layer type in conformer block of text encoder.
|
|
|
|
|
text_encoder_activation_type (str):
|
|
|
|
|
Activation function type in conformer block of text encoder.
|
|
|
|
|
text_encoder_normalize_before (bool):
|
|
|
|
|
Whether to apply layer norm before self-attention in conformer block of text encoder.
|
|
|
|
|
text_encoder_dropout_rate (float):
|
|
|
|
|
Dropout rate in conformer block of text encoder.
|
|
|
|
|
text_encoder_positional_dropout_rate (float):
|
|
|
|
|
Dropout rate for positional encoding in conformer block of text encoder.
|
|
|
|
|
text_encoder_attention_dropout_rate (float):
|
|
|
|
|
Dropout rate for attention in conformer block of text encoder.
|
|
|
|
|
text_encoder_conformer_kernel_size (int):
|
|
|
|
|
Conformer conv kernel size. It will be used only when use_conformer_conv_in_text_encoder = True.
|
|
|
|
|
use_macaron_style_in_text_encoder (bool):
|
|
|
|
|
Whether to use macaron style FFN in conformer block of text encoder.
|
|
|
|
|
use_conformer_conv_in_text_encoder (bool):
|
|
|
|
|
Whether to use convolution in conformer block of text encoder.
|
|
|
|
|
decoder_kernel_size (int):
|
|
|
|
|
Decoder kernel size.
|
|
|
|
|
decoder_channels (int):
|
|
|
|
|
Number of decoder initial channels.
|
|
|
|
|
decoder_upsample_scales (List[int]):
|
|
|
|
|
List of upsampling scales in decoder.
|
|
|
|
|
decoder_upsample_kernel_sizes (List[int]):
|
|
|
|
|
List of kernel size for upsampling layers in decoder.
|
|
|
|
|
decoder_resblock_kernel_sizes (List[int]):
|
|
|
|
|
List of kernel size for resblocks in decoder.
|
|
|
|
|
decoder_resblock_dilations (List[List[int]]):
|
|
|
|
|
List of list of dilations for resblocks in decoder.
|
|
|
|
|
use_weight_norm_in_decoder (bool):
|
|
|
|
|
Whether to apply weight normalization in decoder.
|
|
|
|
|
posterior_encoder_kernel_size (int):
|
|
|
|
|
Posterior encoder kernel size.
|
|
|
|
|
posterior_encoder_layers (int):
|
|
|
|
|
Number of layers of posterior encoder.
|
|
|
|
|
posterior_encoder_stacks (int):
|
|
|
|
|
Number of stacks of posterior encoder.
|
|
|
|
|
posterior_encoder_base_dilation (int):
|
|
|
|
|
Base dilation of posterior encoder.
|
|
|
|
|
posterior_encoder_dropout_rate (float):
|
|
|
|
|
Dropout rate for posterior encoder.
|
|
|
|
|
use_weight_norm_in_posterior_encoder (bool):
|
|
|
|
|
Whether to apply weight normalization in posterior encoder.
|
|
|
|
|
flow_flows (int):
|
|
|
|
|
Number of flows in flow.
|
|
|
|
|
flow_kernel_size (int):
|
|
|
|
|
Kernel size in flow.
|
|
|
|
|
flow_base_dilation (int):
|
|
|
|
|
Base dilation in flow.
|
|
|
|
|
flow_layers (int):
|
|
|
|
|
Number of layers in flow.
|
|
|
|
|
flow_dropout_rate (float):
|
|
|
|
|
Dropout rate in flow.
|
|
|
|
|
use_weight_norm_in_flow (bool):
|
|
|
|
|
Whether to apply weight normalization in flow.
|
|
|
|
|
use_only_mean_in_flow (bool):
|
|
|
|
|
Whether to use only mean in flow.
|
|
|
|
|
stochastic_duration_predictor_kernel_size (int):
|
|
|
|
|
Kernel size in stochastic duration predictor.
|
|
|
|
|
stochastic_duration_predictor_dropout_rate (float):
|
|
|
|
|
Dropout rate in stochastic duration predictor.
|
|
|
|
|
stochastic_duration_predictor_flows (int):
|
|
|
|
|
Number of flows in stochastic duration predictor.
|
|
|
|
|
stochastic_duration_predictor_dds_conv_layers (int):
|
|
|
|
|
Number of DDS conv layers in stochastic duration predictor.
|
|
|
|
|
"""
|
|
|
|
|
super().__init__()
|
|
|
|
|
self.segment_size = segment_size
|
|
|
|
@ -272,20 +295,33 @@ class VITSGenerator(nn.Layer):
|
|
|
|
|
paddle.Tensor, paddle.Tensor, ], ]:
|
|
|
|
|
"""Calculate forward propagation.
|
|
|
|
|
Args:
|
|
|
|
|
text (Tensor): Text index tensor (B, T_text).
|
|
|
|
|
text_lengths (Tensor): Text length tensor (B,).
|
|
|
|
|
feats (Tensor): Feature tensor (B, aux_channels, T_feats).
|
|
|
|
|
feats_lengths (Tensor): Feature length tensor (B,).
|
|
|
|
|
sids (Optional[Tensor]): Speaker index tensor (B,) or (B, 1).
|
|
|
|
|
spembs (Optional[Tensor]): Speaker embedding tensor (B, spk_embed_dim).
|
|
|
|
|
lids (Optional[Tensor]): Language index tensor (B,) or (B, 1).
|
|
|
|
|
text (Tensor):
|
|
|
|
|
Text index tensor (B, T_text).
|
|
|
|
|
text_lengths (Tensor):
|
|
|
|
|
Text length tensor (B,).
|
|
|
|
|
feats (Tensor):
|
|
|
|
|
Feature tensor (B, aux_channels, T_feats).
|
|
|
|
|
feats_lengths (Tensor):
|
|
|
|
|
Feature length tensor (B,).
|
|
|
|
|
sids (Optional[Tensor]):
|
|
|
|
|
Speaker index tensor (B,) or (B, 1).
|
|
|
|
|
spembs (Optional[Tensor]):
|
|
|
|
|
Speaker embedding tensor (B, spk_embed_dim).
|
|
|
|
|
lids (Optional[Tensor]):
|
|
|
|
|
Language index tensor (B,) or (B, 1).
|
|
|
|
|
Returns:
|
|
|
|
|
Tensor: Waveform tensor (B, 1, segment_size * upsample_factor).
|
|
|
|
|
Tensor: Duration negative log-likelihood (NLL) tensor (B,).
|
|
|
|
|
Tensor: Monotonic attention weight tensor (B, 1, T_feats, T_text).
|
|
|
|
|
Tensor: Segments start index tensor (B,).
|
|
|
|
|
Tensor: Text mask tensor (B, 1, T_text).
|
|
|
|
|
Tensor: Feature mask tensor (B, 1, T_feats).
|
|
|
|
|
Tensor:
|
|
|
|
|
Waveform tensor (B, 1, segment_size * upsample_factor).
|
|
|
|
|
Tensor:
|
|
|
|
|
Duration negative log-likelihood (NLL) tensor (B,).
|
|
|
|
|
Tensor:
|
|
|
|
|
Monotonic attention weight tensor (B, 1, T_feats, T_text).
|
|
|
|
|
Tensor:
|
|
|
|
|
Segments start index tensor (B,).
|
|
|
|
|
Tensor:
|
|
|
|
|
Text mask tensor (B, 1, T_text).
|
|
|
|
|
Tensor:
|
|
|
|
|
Feature mask tensor (B, 1, T_feats).
|
|
|
|
|
tuple[Tensor, Tensor, Tensor, Tensor, Tensor, Tensor]:
|
|
|
|
|
- Tensor: Posterior encoder hidden representation (B, H, T_feats).
|
|
|
|
|
- Tensor: Flow hidden representation (B, H, T_feats).
|
|
|
|
@ -402,24 +438,40 @@ class VITSGenerator(nn.Layer):
|
|
|
|
|
) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]:
|
|
|
|
|
"""Run inference.
|
|
|
|
|
Args:
|
|
|
|
|
text (Tensor): Input text index tensor (B, T_text,).
|
|
|
|
|
text_lengths (Tensor): Text length tensor (B,).
|
|
|
|
|
feats (Tensor): Feature tensor (B, aux_channels, T_feats,).
|
|
|
|
|
feats_lengths (Tensor): Feature length tensor (B,).
|
|
|
|
|
sids (Optional[Tensor]): Speaker index tensor (B,) or (B, 1).
|
|
|
|
|
spembs (Optional[Tensor]): Speaker embedding tensor (B, spk_embed_dim).
|
|
|
|
|
lids (Optional[Tensor]): Language index tensor (B,) or (B, 1).
|
|
|
|
|
dur (Optional[Tensor]): Ground-truth duration (B, T_text,). If provided,
|
|
|
|
|
text (Tensor):
|
|
|
|
|
Input text index tensor (B, T_text,).
|
|
|
|
|
text_lengths (Tensor):
|
|
|
|
|
Text length tensor (B,).
|
|
|
|
|
feats (Tensor):
|
|
|
|
|
Feature tensor (B, aux_channels, T_feats,).
|
|
|
|
|
feats_lengths (Tensor):
|
|
|
|
|
Feature length tensor (B,).
|
|
|
|
|
sids (Optional[Tensor]):
|
|
|
|
|
Speaker index tensor (B,) or (B, 1).
|
|
|
|
|
spembs (Optional[Tensor]):
|
|
|
|
|
Speaker embedding tensor (B, spk_embed_dim).
|
|
|
|
|
lids (Optional[Tensor]):
|
|
|
|
|
Language index tensor (B,) or (B, 1).
|
|
|
|
|
dur (Optional[Tensor]):
|
|
|
|
|
Ground-truth duration (B, T_text,). If provided,
|
|
|
|
|
skip the prediction of durations (i.e., teacher forcing).
|
|
|
|
|
noise_scale (float): Noise scale parameter for flow.
|
|
|
|
|
noise_scale_dur (float): Noise scale parameter for duration predictor.
|
|
|
|
|
alpha (float): Alpha parameter to control the speed of generated speech.
|
|
|
|
|
max_len (Optional[int]): Maximum length of acoustic feature sequence.
|
|
|
|
|
use_teacher_forcing (bool): Whether to use teacher forcing.
|
|
|
|
|
noise_scale (float):
|
|
|
|
|
Noise scale parameter for flow.
|
|
|
|
|
noise_scale_dur (float):
|
|
|
|
|
Noise scale parameter for duration predictor.
|
|
|
|
|
alpha (float):
|
|
|
|
|
Alpha parameter to control the speed of generated speech.
|
|
|
|
|
max_len (Optional[int]):
|
|
|
|
|
Maximum length of acoustic feature sequence.
|
|
|
|
|
use_teacher_forcing (bool):
|
|
|
|
|
Whether to use teacher forcing.
|
|
|
|
|
Returns:
|
|
|
|
|
Tensor: Generated waveform tensor (B, T_wav).
|
|
|
|
|
Tensor: Monotonic attention weight tensor (B, T_feats, T_text).
|
|
|
|
|
Tensor: Duration tensor (B, T_text).
|
|
|
|
|
Tensor:
|
|
|
|
|
Generated waveform tensor (B, T_wav).
|
|
|
|
|
Tensor:
|
|
|
|
|
Monotonic attention weight tensor (B, T_feats, T_text).
|
|
|
|
|
Tensor:
|
|
|
|
|
Duration tensor (B, T_text).
|
|
|
|
|
"""
|
|
|
|
|
# encoder
|
|
|
|
|
x, m_p, logs_p, x_mask = self.text_encoder(text, text_lengths)
|
|
|
|
@ -533,15 +585,23 @@ class VITSGenerator(nn.Layer):
|
|
|
|
|
lids: Optional[paddle.Tensor]=None, ) -> paddle.Tensor:
|
|
|
|
|
"""Run voice conversion.
|
|
|
|
|
Args:
|
|
|
|
|
feats (Tensor): Feature tensor (B, aux_channels, T_feats,).
|
|
|
|
|
feats_lengths (Tensor): Feature length tensor (B,).
|
|
|
|
|
sids_src (Optional[Tensor]): Speaker index tensor of source feature (B,) or (B, 1).
|
|
|
|
|
sids_tgt (Optional[Tensor]): Speaker index tensor of target feature (B,) or (B, 1).
|
|
|
|
|
spembs_src (Optional[Tensor]): Speaker embedding tensor of source feature (B, spk_embed_dim).
|
|
|
|
|
spembs_tgt (Optional[Tensor]): Speaker embedding tensor of target feature (B, spk_embed_dim).
|
|
|
|
|
lids (Optional[Tensor]): Language index tensor (B,) or (B, 1).
|
|
|
|
|
feats (Tensor):
|
|
|
|
|
Feature tensor (B, aux_channels, T_feats,).
|
|
|
|
|
feats_lengths (Tensor):
|
|
|
|
|
Feature length tensor (B,).
|
|
|
|
|
sids_src (Optional[Tensor]):
|
|
|
|
|
Speaker index tensor of source feature (B,) or (B, 1).
|
|
|
|
|
sids_tgt (Optional[Tensor]):
|
|
|
|
|
Speaker index tensor of target feature (B,) or (B, 1).
|
|
|
|
|
spembs_src (Optional[Tensor]):
|
|
|
|
|
Speaker embedding tensor of source feature (B, spk_embed_dim).
|
|
|
|
|
spembs_tgt (Optional[Tensor]):
|
|
|
|
|
Speaker embedding tensor of target feature (B, spk_embed_dim).
|
|
|
|
|
lids (Optional[Tensor]):
|
|
|
|
|
Language index tensor (B,) or (B, 1).
|
|
|
|
|
Returns:
|
|
|
|
|
Tensor: Generated waveform tensor (B, T_wav).
|
|
|
|
|
Tensor:
|
|
|
|
|
Generated waveform tensor (B, T_wav).
|
|
|
|
|
"""
|
|
|
|
|
# encoder
|
|
|
|
|
g_src = None
|
|
|
|
@ -602,10 +662,13 @@ class VITSGenerator(nn.Layer):
|
|
|
|
|
mask: paddle.Tensor) -> paddle.Tensor:
|
|
|
|
|
"""Generate path a.k.a. monotonic attention.
|
|
|
|
|
Args:
|
|
|
|
|
dur (Tensor): Duration tensor (B, 1, T_text).
|
|
|
|
|
mask (Tensor): Attention mask tensor (B, 1, T_feats, T_text).
|
|
|
|
|
dur (Tensor):
|
|
|
|
|
Duration tensor (B, 1, T_text).
|
|
|
|
|
mask (Tensor):
|
|
|
|
|
Attention mask tensor (B, 1, T_feats, T_text).
|
|
|
|
|
Returns:
|
|
|
|
|
Tensor: Path tensor (B, 1, T_feats, T_text).
|
|
|
|
|
Tensor:
|
|
|
|
|
Path tensor (B, 1, T_feats, T_text).
|
|
|
|
|
"""
|
|
|
|
|
b, _, t_y, t_x = paddle.shape(mask)
|
|
|
|
|
cum_dur = paddle.cumsum(dur, -1)
|
|
|
|
|