You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
551 lines
26 KiB
551 lines
26 KiB
3 years ago
|
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
||
|
#
|
||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||
|
# you may not use this file except in compliance with the License.
|
||
|
# You may obtain a copy of the License at
|
||
|
#
|
||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||
|
#
|
||
|
# Unless required by applicable law or agreed to in writing, software
|
||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||
|
# See the License for the specific language governing permissions and
|
||
|
# limitations under the License.
|
||
|
"""Generator module in VITS.
|
||
|
|
||
|
This code is based on https://github.com/jaywalnut310/vits.
|
||
|
|
||
|
"""
|
||
|
import math
|
||
|
from typing import List
|
||
|
from typing import Optional
|
||
|
from typing import Tuple
|
||
|
|
||
|
import numpy as np
|
||
|
import paddle
|
||
|
import paddle.nn.functional as F
|
||
|
from paddle import nn
|
||
|
|
||
|
from paddlespeech.t2s.models.hifigan import HiFiGANGenerator
|
||
|
from paddlespeech.t2s.models.vits.duration_predictor import StochasticDurationPredictor
|
||
|
from paddlespeech.t2s.models.vits.posterior_encoder import PosteriorEncoder
|
||
|
from paddlespeech.t2s.models.vits.residual_coupling import ResidualAffineCouplingBlock
|
||
|
from paddlespeech.t2s.models.vits.text_encoder import TextEncoder
|
||
|
from paddlespeech.t2s.modules.nets_utils import get_random_segments
|
||
|
from paddlespeech.t2s.modules.nets_utils import make_non_pad_mask
|
||
|
|
||
|
|
||
|
class VITSGenerator(nn.Layer):
|
||
|
"""Generator module in VITS.
|
||
|
This is a module of VITS described in `Conditional Variational Autoencoder
|
||
|
with Adversarial Learning for End-to-End Text-to-Speech`_.
|
||
|
As text encoder, we use conformer architecture instead of the relative positional
|
||
|
Transformer, which contains additional convolution layers.
|
||
|
.. _`Conditional Variational Autoencoder with Adversarial Learning for End-to-End
|
||
|
Text-to-Speech`: https://arxiv.org/abs/2006.04558
|
||
|
"""
|
||
|
|
||
|
def __init__(
|
||
|
self,
|
||
|
vocabs: int,
|
||
|
aux_channels: int=513,
|
||
|
hidden_channels: int=192,
|
||
|
spks: Optional[int]=None,
|
||
|
langs: Optional[int]=None,
|
||
|
spk_embed_dim: Optional[int]=None,
|
||
|
global_channels: int=-1,
|
||
|
segment_size: int=32,
|
||
|
text_encoder_attention_heads: int=2,
|
||
|
text_encoder_ffn_expand: int=4,
|
||
|
text_encoder_blocks: int=6,
|
||
|
text_encoder_positionwise_layer_type: str="conv1d",
|
||
|
text_encoder_positionwise_conv_kernel_size: int=1,
|
||
|
text_encoder_positional_encoding_layer_type: str="rel_pos",
|
||
|
text_encoder_self_attention_layer_type: str="rel_selfattn",
|
||
|
text_encoder_activation_type: str="swish",
|
||
|
text_encoder_normalize_before: bool=True,
|
||
|
text_encoder_dropout_rate: float=0.1,
|
||
|
text_encoder_positional_dropout_rate: float=0.0,
|
||
|
text_encoder_attention_dropout_rate: float=0.0,
|
||
|
text_encoder_conformer_kernel_size: int=7,
|
||
|
use_macaron_style_in_text_encoder: bool=True,
|
||
|
use_conformer_conv_in_text_encoder: bool=True,
|
||
|
decoder_kernel_size: int=7,
|
||
|
decoder_channels: int=512,
|
||
|
decoder_upsample_scales: List[int]=[8, 8, 2, 2],
|
||
|
decoder_upsample_kernel_sizes: List[int]=[16, 16, 4, 4],
|
||
|
decoder_resblock_kernel_sizes: List[int]=[3, 7, 11],
|
||
|
decoder_resblock_dilations: List[List[int]]=[[1, 3, 5], [1, 3, 5],
|
||
|
[1, 3, 5]],
|
||
|
use_weight_norm_in_decoder: bool=True,
|
||
|
posterior_encoder_kernel_size: int=5,
|
||
|
posterior_encoder_layers: int=16,
|
||
|
posterior_encoder_stacks: int=1,
|
||
|
posterior_encoder_base_dilation: int=1,
|
||
|
posterior_encoder_dropout_rate: float=0.0,
|
||
|
use_weight_norm_in_posterior_encoder: bool=True,
|
||
|
flow_flows: int=4,
|
||
|
flow_kernel_size: int=5,
|
||
|
flow_base_dilation: int=1,
|
||
|
flow_layers: int=4,
|
||
|
flow_dropout_rate: float=0.0,
|
||
|
use_weight_norm_in_flow: bool=True,
|
||
|
use_only_mean_in_flow: bool=True,
|
||
|
stochastic_duration_predictor_kernel_size: int=3,
|
||
|
stochastic_duration_predictor_dropout_rate: float=0.5,
|
||
|
stochastic_duration_predictor_flows: int=4,
|
||
|
stochastic_duration_predictor_dds_conv_layers: int=3, ):
|
||
|
"""Initialize VITS generator module.
|
||
|
Args:
|
||
|
vocabs (int): Input vocabulary size.
|
||
|
aux_channels (int): Number of acoustic feature channels.
|
||
|
hidden_channels (int): Number of hidden channels.
|
||
|
spks (Optional[int]): Number of speakers. If set to > 1, assume that the
|
||
|
sids will be provided as the input and use sid embedding layer.
|
||
|
langs (Optional[int]): Number of languages. If set to > 1, assume that the
|
||
|
lids will be provided as the input and use sid embedding layer.
|
||
|
spk_embed_dim (Optional[int]): Speaker embedding dimension. If set to > 0,
|
||
|
assume that spembs will be provided as the input.
|
||
|
global_channels (int): Number of global conditioning channels.
|
||
|
segment_size (int): Segment size for decoder.
|
||
|
text_encoder_attention_heads (int): Number of heads in conformer block
|
||
|
of text encoder.
|
||
|
text_encoder_ffn_expand (int): Expansion ratio of FFN in conformer block
|
||
|
of text encoder.
|
||
|
text_encoder_blocks (int): Number of conformer blocks in text encoder.
|
||
|
text_encoder_positionwise_layer_type (str): Position-wise layer type in
|
||
|
conformer block of text encoder.
|
||
|
text_encoder_positionwise_conv_kernel_size (int): Position-wise convolution
|
||
|
kernel size in conformer block of text encoder. Only used when the
|
||
|
above layer type is conv1d or conv1d-linear.
|
||
|
text_encoder_positional_encoding_layer_type (str): Positional encoding layer
|
||
|
type in conformer block of text encoder.
|
||
|
text_encoder_self_attention_layer_type (str): Self-attention layer type in
|
||
|
conformer block of text encoder.
|
||
|
text_encoder_activation_type (str): Activation function type in conformer
|
||
|
block of text encoder.
|
||
|
text_encoder_normalize_before (bool): Whether to apply layer norm before
|
||
|
self-attention in conformer block of text encoder.
|
||
|
text_encoder_dropout_rate (float): Dropout rate in conformer block of
|
||
|
text encoder.
|
||
|
text_encoder_positional_dropout_rate (float): Dropout rate for positional
|
||
|
encoding in conformer block of text encoder.
|
||
|
text_encoder_attention_dropout_rate (float): Dropout rate for attention in
|
||
|
conformer block of text encoder.
|
||
|
text_encoder_conformer_kernel_size (int): Conformer conv kernel size. It
|
||
|
will be used when only use_conformer_conv_in_text_encoder = True.
|
||
|
use_macaron_style_in_text_encoder (bool): Whether to use macaron style FFN
|
||
|
in conformer block of text encoder.
|
||
|
use_conformer_conv_in_text_encoder (bool): Whether to use covolution in
|
||
|
conformer block of text encoder.
|
||
|
decoder_kernel_size (int): Decoder kernel size.
|
||
|
decoder_channels (int): Number of decoder initial channels.
|
||
|
decoder_upsample_scales (List[int]): List of upsampling scales in decoder.
|
||
|
decoder_upsample_kernel_sizes (List[int]): List of kernel size for
|
||
|
upsampling layers in decoder.
|
||
|
decoder_resblock_kernel_sizes (List[int]): List of kernel size for resblocks
|
||
|
in decoder.
|
||
|
decoder_resblock_dilations (List[List[int]]): List of list of dilations for
|
||
|
resblocks in decoder.
|
||
|
use_weight_norm_in_decoder (bool): Whether to apply weight normalization in
|
||
|
decoder.
|
||
|
posterior_encoder_kernel_size (int): Posterior encoder kernel size.
|
||
|
posterior_encoder_layers (int): Number of layers of posterior encoder.
|
||
|
posterior_encoder_stacks (int): Number of stacks of posterior encoder.
|
||
|
posterior_encoder_base_dilation (int): Base dilation of posterior encoder.
|
||
|
posterior_encoder_dropout_rate (float): Dropout rate for posterior encoder.
|
||
|
use_weight_norm_in_posterior_encoder (bool): Whether to apply weight
|
||
|
normalization in posterior encoder.
|
||
|
flow_flows (int): Number of flows in flow.
|
||
|
flow_kernel_size (int): Kernel size in flow.
|
||
|
flow_base_dilation (int): Base dilation in flow.
|
||
|
flow_layers (int): Number of layers in flow.
|
||
|
flow_dropout_rate (float): Dropout rate in flow
|
||
|
use_weight_norm_in_flow (bool): Whether to apply weight normalization in
|
||
|
flow.
|
||
|
use_only_mean_in_flow (bool): Whether to use only mean in flow.
|
||
|
stochastic_duration_predictor_kernel_size (int): Kernel size in stochastic
|
||
|
duration predictor.
|
||
|
stochastic_duration_predictor_dropout_rate (float): Dropout rate in
|
||
|
stochastic duration predictor.
|
||
|
stochastic_duration_predictor_flows (int): Number of flows in stochastic
|
||
|
duration predictor.
|
||
|
stochastic_duration_predictor_dds_conv_layers (int): Number of DDS conv
|
||
|
layers in stochastic duration predictor.
|
||
|
"""
|
||
|
super().__init__()
|
||
|
self.segment_size = segment_size
|
||
|
self.text_encoder = TextEncoder(
|
||
|
vocabs=vocabs,
|
||
|
attention_dim=hidden_channels,
|
||
|
attention_heads=text_encoder_attention_heads,
|
||
|
linear_units=hidden_channels * text_encoder_ffn_expand,
|
||
|
blocks=text_encoder_blocks,
|
||
|
positionwise_layer_type=text_encoder_positionwise_layer_type,
|
||
|
positionwise_conv_kernel_size=text_encoder_positionwise_conv_kernel_size,
|
||
|
positional_encoding_layer_type=text_encoder_positional_encoding_layer_type,
|
||
|
self_attention_layer_type=text_encoder_self_attention_layer_type,
|
||
|
activation_type=text_encoder_activation_type,
|
||
|
normalize_before=text_encoder_normalize_before,
|
||
|
dropout_rate=text_encoder_dropout_rate,
|
||
|
positional_dropout_rate=text_encoder_positional_dropout_rate,
|
||
|
attention_dropout_rate=text_encoder_attention_dropout_rate,
|
||
|
conformer_kernel_size=text_encoder_conformer_kernel_size,
|
||
|
use_macaron_style=use_macaron_style_in_text_encoder,
|
||
|
use_conformer_conv=use_conformer_conv_in_text_encoder, )
|
||
|
self.decoder = HiFiGANGenerator(
|
||
|
in_channels=hidden_channels,
|
||
|
out_channels=1,
|
||
|
channels=decoder_channels,
|
||
|
global_channels=global_channels,
|
||
|
kernel_size=decoder_kernel_size,
|
||
|
upsample_scales=decoder_upsample_scales,
|
||
|
upsample_kernel_sizes=decoder_upsample_kernel_sizes,
|
||
|
resblock_kernel_sizes=decoder_resblock_kernel_sizes,
|
||
|
resblock_dilations=decoder_resblock_dilations,
|
||
|
use_weight_norm=use_weight_norm_in_decoder, )
|
||
|
self.posterior_encoder = PosteriorEncoder(
|
||
|
in_channels=aux_channels,
|
||
|
out_channels=hidden_channels,
|
||
|
hidden_channels=hidden_channels,
|
||
|
kernel_size=posterior_encoder_kernel_size,
|
||
|
layers=posterior_encoder_layers,
|
||
|
stacks=posterior_encoder_stacks,
|
||
|
base_dilation=posterior_encoder_base_dilation,
|
||
|
global_channels=global_channels,
|
||
|
dropout_rate=posterior_encoder_dropout_rate,
|
||
|
use_weight_norm=use_weight_norm_in_posterior_encoder, )
|
||
|
self.flow = ResidualAffineCouplingBlock(
|
||
|
in_channels=hidden_channels,
|
||
|
hidden_channels=hidden_channels,
|
||
|
flows=flow_flows,
|
||
|
kernel_size=flow_kernel_size,
|
||
|
base_dilation=flow_base_dilation,
|
||
|
layers=flow_layers,
|
||
|
global_channels=global_channels,
|
||
|
dropout_rate=flow_dropout_rate,
|
||
|
use_weight_norm=use_weight_norm_in_flow,
|
||
|
use_only_mean=use_only_mean_in_flow, )
|
||
|
# TODO: Add deterministic version as an option
|
||
|
self.duration_predictor = StochasticDurationPredictor(
|
||
|
channels=hidden_channels,
|
||
|
kernel_size=stochastic_duration_predictor_kernel_size,
|
||
|
dropout_rate=stochastic_duration_predictor_dropout_rate,
|
||
|
flows=stochastic_duration_predictor_flows,
|
||
|
dds_conv_layers=stochastic_duration_predictor_dds_conv_layers,
|
||
|
global_channels=global_channels, )
|
||
|
|
||
|
self.upsample_factor = int(np.prod(decoder_upsample_scales))
|
||
|
self.spks = None
|
||
|
if spks is not None and spks > 1:
|
||
|
assert global_channels > 0
|
||
|
self.spks = spks
|
||
|
self.global_emb = nn.Embedding(spks, global_channels)
|
||
|
self.spk_embed_dim = None
|
||
|
if spk_embed_dim is not None and spk_embed_dim > 0:
|
||
|
assert global_channels > 0
|
||
|
self.spk_embed_dim = spk_embed_dim
|
||
|
self.spemb_proj = nn.Linear(spk_embed_dim, global_channels)
|
||
|
self.langs = None
|
||
|
if langs is not None and langs > 1:
|
||
|
assert global_channels > 0
|
||
|
self.langs = langs
|
||
|
self.lang_emb = nn.Embedding(langs, global_channels)
|
||
|
|
||
|
# delayed import
|
||
|
from paddlespeech.t2s.models.vits.monotonic_align import maximum_path
|
||
|
|
||
|
self.maximum_path = maximum_path
|
||
|
|
||
|
def forward(
|
||
|
self,
|
||
|
text: paddle.Tensor,
|
||
|
text_lengths: paddle.Tensor,
|
||
|
feats: paddle.Tensor,
|
||
|
feats_lengths: paddle.Tensor,
|
||
|
sids: Optional[paddle.Tensor]=None,
|
||
|
spembs: Optional[paddle.Tensor]=None,
|
||
|
lids: Optional[paddle.Tensor]=None,
|
||
|
) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor, paddle.Tensor,
|
||
|
paddle.Tensor, paddle.Tensor,
|
||
|
Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor, paddle.Tensor,
|
||
|
paddle.Tensor, paddle.Tensor, ], ]:
|
||
|
"""Calculate forward propagation.
|
||
|
Args:
|
||
|
text (Tensor): Text index tensor (B, T_text).
|
||
|
text_lengths (Tensor): Text length tensor (B,).
|
||
|
feats (Tensor): Feature tensor (B, aux_channels, T_feats).
|
||
|
feats_lengths (Tensor): Feature length tensor (B,).
|
||
|
sids (Optional[Tensor]): Speaker index tensor (B,) or (B, 1).
|
||
|
spembs (Optional[Tensor]): Speaker embedding tensor (B, spk_embed_dim).
|
||
|
lids (Optional[Tensor]): Language index tensor (B,) or (B, 1).
|
||
|
Returns:
|
||
|
Tensor: Waveform tensor (B, 1, segment_size * upsample_factor).
|
||
|
Tensor: Duration negative log-likelihood (NLL) tensor (B,).
|
||
|
Tensor: Monotonic attention weight tensor (B, 1, T_feats, T_text).
|
||
|
Tensor: Segments start index tensor (B,).
|
||
|
Tensor: Text mask tensor (B, 1, T_text).
|
||
|
Tensor: Feature mask tensor (B, 1, T_feats).
|
||
|
tuple[Tensor, Tensor, Tensor, Tensor, Tensor, Tensor]:
|
||
|
- Tensor: Posterior encoder hidden representation (B, H, T_feats).
|
||
|
- Tensor: Flow hidden representation (B, H, T_feats).
|
||
|
- Tensor: Expanded text encoder projected mean (B, H, T_feats).
|
||
|
- Tensor: Expanded text encoder projected scale (B, H, T_feats).
|
||
|
- Tensor: Posterior encoder projected mean (B, H, T_feats).
|
||
|
- Tensor: Posterior encoder projected scale (B, H, T_feats).
|
||
|
"""
|
||
|
# forward text encoder
|
||
|
x, m_p, logs_p, x_mask = self.text_encoder(text, text_lengths)
|
||
|
|
||
|
# calculate global conditioning
|
||
|
g = None
|
||
|
if self.spks is not None:
|
||
|
# speaker one-hot vector embedding: (B, global_channels, 1)
|
||
|
g = self.global_emb(paddle.reshape(sids, [-1])).unsqueeze(-1)
|
||
|
if self.spk_embed_dim is not None:
|
||
|
# pretreined speaker embedding, e.g., X-vector (B, global_channels, 1)
|
||
|
g_ = self.spemb_proj(F.normalize(spembs)).unsqueeze(-1)
|
||
|
if g is None:
|
||
|
g = g_
|
||
|
else:
|
||
|
g = g + g_
|
||
|
if self.langs is not None:
|
||
|
# language one-hot vector embedding: (B, global_channels, 1)
|
||
|
g_ = self.lang_emb(paddle.reshape(lids, [-1])).unsqueeze(-1)
|
||
|
if g is None:
|
||
|
g = g_
|
||
|
else:
|
||
|
g = g + g_
|
||
|
|
||
|
# forward posterior encoder
|
||
|
z, m_q, logs_q, y_mask = self.posterior_encoder(
|
||
|
feats, feats_lengths, g=g)
|
||
|
|
||
|
# forward flow
|
||
|
# (B, H, T_feats)
|
||
|
z_p = self.flow(z, y_mask, g=g)
|
||
|
|
||
|
# monotonic alignment search
|
||
|
with paddle.no_grad():
|
||
|
# negative cross-entropy
|
||
|
# (B, H, T_text)
|
||
|
s_p_sq_r = paddle.exp(-2 * logs_p)
|
||
|
# (B, 1, T_text)
|
||
|
neg_x_ent_1 = paddle.sum(
|
||
|
-0.5 * math.log(2 * math.pi) - logs_p,
|
||
|
[1],
|
||
|
keepdim=True, )
|
||
|
# (B, T_feats, H) x (B, H, T_text) = (B, T_feats, T_text)
|
||
|
neg_x_ent_2 = paddle.matmul(
|
||
|
-0.5 * (z_p**2).transpose([0, 2, 1]),
|
||
|
s_p_sq_r, )
|
||
|
# (B, T_feats, H) x (B, H, T_text) = (B, T_feats, T_text)
|
||
|
neg_x_ent_3 = paddle.matmul(
|
||
|
z_p.transpose([0, 2, 1]),
|
||
|
(m_p * s_p_sq_r), )
|
||
|
# (B, 1, T_text)
|
||
|
neg_x_ent_4 = paddle.sum(
|
||
|
-0.5 * (m_p**2) * s_p_sq_r,
|
||
|
[1],
|
||
|
keepdim=True, )
|
||
|
# (B, T_feats, T_text)
|
||
|
neg_x_ent = neg_x_ent_1 + neg_x_ent_2 + neg_x_ent_3 + neg_x_ent_4
|
||
|
# (B, 1, T_feats, T_text)
|
||
|
attn_mask = paddle.unsqueeze(x_mask, 2) * paddle.unsqueeze(y_mask,
|
||
|
-1)
|
||
|
# monotonic attention weight: (B, 1, T_feats, T_text)
|
||
|
attn = (self.maximum_path(
|
||
|
neg_x_ent,
|
||
|
attn_mask.squeeze(1), ).unsqueeze(1).detach())
|
||
|
|
||
|
# forward duration predictor
|
||
|
# (B, 1, T_text)
|
||
|
w = attn.sum(2)
|
||
|
dur_nll = self.duration_predictor(x, x_mask, w=w, g=g)
|
||
|
dur_nll = dur_nll / paddle.sum(x_mask)
|
||
|
|
||
|
# expand the length to match with the feature sequence
|
||
|
# (B, T_feats, T_text) x (B, T_text, H) -> (B, H, T_feats)
|
||
|
m_p = paddle.matmul(attn.squeeze(1),
|
||
|
m_p.transpose([0, 2, 1])).transpose([0, 2, 1])
|
||
|
# (B, T_feats, T_text) x (B, T_text, H) -> (B, H, T_feats)
|
||
|
logs_p = paddle.matmul(attn.squeeze(1),
|
||
|
logs_p.transpose([0, 2, 1])).transpose([0, 2, 1])
|
||
|
|
||
|
# get random segments
|
||
|
z_segments, z_start_idxs = get_random_segments(
|
||
|
z,
|
||
|
feats_lengths,
|
||
|
self.segment_size, )
|
||
|
|
||
|
# forward decoder with random segments
|
||
|
wav = self.decoder(z_segments, g=g)
|
||
|
|
||
|
return (wav, dur_nll, attn, z_start_idxs, x_mask, y_mask,
|
||
|
(z, z_p, m_p, logs_p, m_q, logs_q), )
|
||
|
|
||
|
def inference(
|
||
|
self,
|
||
|
text: paddle.Tensor,
|
||
|
text_lengths: paddle.Tensor,
|
||
|
feats: Optional[paddle.Tensor]=None,
|
||
|
feats_lengths: Optional[paddle.Tensor]=None,
|
||
|
sids: Optional[paddle.Tensor]=None,
|
||
|
spembs: Optional[paddle.Tensor]=None,
|
||
|
lids: Optional[paddle.Tensor]=None,
|
||
|
dur: Optional[paddle.Tensor]=None,
|
||
|
noise_scale: float=0.667,
|
||
|
noise_scale_dur: float=0.8,
|
||
|
alpha: float=1.0,
|
||
|
max_len: Optional[int]=None,
|
||
|
use_teacher_forcing: bool=False,
|
||
|
) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]:
|
||
|
"""Run inference.
|
||
|
Args:
|
||
|
text (Tensor): Input text index tensor (B, T_text,).
|
||
|
text_lengths (Tensor): Text length tensor (B,).
|
||
|
feats (Tensor): Feature tensor (B, aux_channels, T_feats,).
|
||
|
feats_lengths (Tensor): Feature length tensor (B,).
|
||
|
sids (Optional[Tensor]): Speaker index tensor (B,) or (B, 1).
|
||
|
spembs (Optional[Tensor]): Speaker embedding tensor (B, spk_embed_dim).
|
||
|
lids (Optional[Tensor]): Language index tensor (B,) or (B, 1).
|
||
|
dur (Optional[Tensor]): Ground-truth duration (B, T_text,). If provided,
|
||
|
skip the prediction of durations (i.e., teacher forcing).
|
||
|
noise_scale (float): Noise scale parameter for flow.
|
||
|
noise_scale_dur (float): Noise scale parameter for duration predictor.
|
||
|
alpha (float): Alpha parameter to control the speed of generated speech.
|
||
|
max_len (Optional[int]): Maximum length of acoustic feature sequence.
|
||
|
use_teacher_forcing (bool): Whether to use teacher forcing.
|
||
|
Returns:
|
||
|
Tensor: Generated waveform tensor (B, T_wav).
|
||
|
Tensor: Monotonic attention weight tensor (B, T_feats, T_text).
|
||
|
Tensor: Duration tensor (B, T_text).
|
||
|
"""
|
||
|
# encoder
|
||
|
x, m_p, logs_p, x_mask = self.text_encoder(text, text_lengths)
|
||
|
g = None
|
||
|
if self.spks is not None:
|
||
|
# (B, global_channels, 1)
|
||
|
g = self.global_emb(paddle.reshape(sids, [-1])).unsqueeze(-1)
|
||
|
if self.spk_embed_dim is not None:
|
||
|
# (B, global_channels, 1)
|
||
|
g_ = self.spemb_proj(F.normalize(spembs.unsqueeze(0))).unsqueeze(-1)
|
||
|
if g is None:
|
||
|
g = g_
|
||
|
else:
|
||
|
g = g + g_
|
||
|
if self.langs is not None:
|
||
|
# (B, global_channels, 1)
|
||
|
g_ = self.lang_emb(paddle.reshape(lids, [-1])).unsqueeze(-1)
|
||
|
if g is None:
|
||
|
g = g_
|
||
|
else:
|
||
|
g = g + g_
|
||
|
|
||
|
if use_teacher_forcing:
|
||
|
# forward posterior encoder
|
||
|
z, m_q, logs_q, y_mask = self.posterior_encoder(
|
||
|
feats, feats_lengths, g=g)
|
||
|
|
||
|
# forward flow
|
||
|
# (B, H, T_feats)
|
||
|
z_p = self.flow(z, y_mask, g=g)
|
||
|
|
||
|
# monotonic alignment search
|
||
|
# (B, H, T_text)
|
||
|
s_p_sq_r = paddle.exp(-2 * logs_p)
|
||
|
# (B, 1, T_text)
|
||
|
neg_x_ent_1 = paddle.sum(
|
||
|
-0.5 * math.log(2 * math.pi) - logs_p,
|
||
|
[1],
|
||
|
keepdim=True, )
|
||
|
# (B, T_feats, H) x (B, H, T_text) = (B, T_feats, T_text)
|
||
|
neg_x_ent_2 = paddle.matmul(
|
||
|
-0.5 * (z_p**2).transpose([0, 2, 1]),
|
||
|
s_p_sq_r, )
|
||
|
# (B, T_feats, H) x (B, H, T_text) = (B, T_feats, T_text)
|
||
|
neg_x_ent_3 = paddle.matmul(
|
||
|
z_p.transpose([0, 2, 1]),
|
||
|
(m_p * s_p_sq_r), )
|
||
|
# (B, 1, T_text)
|
||
|
neg_x_ent_4 = paddle.sum(
|
||
|
-0.5 * (m_p**2) * s_p_sq_r,
|
||
|
[1],
|
||
|
keepdim=True, )
|
||
|
# (B, T_feats, T_text)
|
||
|
neg_x_ent = neg_x_ent_1 + neg_x_ent_2 + neg_x_ent_3 + neg_x_ent_4
|
||
|
# (B, 1, T_feats, T_text)
|
||
|
attn_mask = paddle.unsqueeze(x_mask, 2) * paddle.unsqueeze(y_mask,
|
||
|
-1)
|
||
|
# monotonic attention weight: (B, 1, T_feats, T_text)
|
||
|
attn = self.maximum_path(
|
||
|
neg_x_ent,
|
||
|
attn_mask.squeeze(1), ).unsqueeze(1)
|
||
|
# (B, 1, T_text)
|
||
|
dur = attn.sum(2)
|
||
|
|
||
|
# forward decoder with random segments
|
||
|
wav = self.decoder(z * y_mask, g=g)
|
||
|
else:
|
||
|
# duration
|
||
|
if dur is None:
|
||
|
logw = self.duration_predictor(
|
||
|
x,
|
||
|
x_mask,
|
||
|
g=g,
|
||
|
inverse=True,
|
||
|
noise_scale=noise_scale_dur, )
|
||
|
w = paddle.exp(logw) * x_mask * alpha
|
||
|
dur = paddle.ceil(w)
|
||
|
y_lengths = paddle.cast(
|
||
|
paddle.clip(paddle.sum(dur, [1, 2]), min=1), dtype='int64')
|
||
|
y_mask = make_non_pad_mask(y_lengths).unsqueeze(1)
|
||
|
attn_mask = paddle.unsqueeze(x_mask, 2) * paddle.unsqueeze(y_mask,
|
||
|
-1)
|
||
|
attn = self._generate_path(dur, attn_mask)
|
||
|
|
||
|
# expand the length to match with the feature sequence
|
||
|
# (B, T_feats, T_text) x (B, T_text, H) -> (B, H, T_feats)
|
||
|
m_p = paddle.matmul(
|
||
|
attn.squeeze(1),
|
||
|
m_p.transpose([0, 2, 1]), ).transpose([0, 2, 1])
|
||
|
# (B, T_feats, T_text) x (B, T_text, H) -> (B, H, T_feats)
|
||
|
logs_p = paddle.matmul(
|
||
|
attn.squeeze(1),
|
||
|
logs_p.transpose([0, 2, 1]), ).transpose([0, 2, 1])
|
||
|
|
||
|
# decoder
|
||
|
z_p = m_p + paddle.randn(
|
||
|
paddle.shape(m_p)) * paddle.exp(logs_p) * noise_scale
|
||
|
z = self.flow(z_p, y_mask, g=g, inverse=True)
|
||
|
wav = self.decoder((z * y_mask)[:, :, :max_len], g=g)
|
||
|
|
||
|
return wav.squeeze(1), attn.squeeze(1), dur.squeeze(1)
|
||
|
|
||
|
def _generate_path(self, dur: paddle.Tensor,
|
||
|
mask: paddle.Tensor) -> paddle.Tensor:
|
||
|
"""Generate path a.k.a. monotonic attention.
|
||
|
Args:
|
||
|
dur (Tensor): Duration tensor (B, 1, T_text).
|
||
|
mask (Tensor): Attention mask tensor (B, 1, T_feats, T_text).
|
||
|
Returns:
|
||
|
Tensor: Path tensor (B, 1, T_feats, T_text).
|
||
|
"""
|
||
|
b, _, t_y, t_x = paddle.shape(mask)
|
||
|
cum_dur = paddle.cumsum(dur, -1)
|
||
|
cum_dur_flat = paddle.reshape(cum_dur, [b * t_x])
|
||
|
|
||
|
path = paddle.arange(t_y, dtype=dur.dtype)
|
||
|
path = path.unsqueeze(0) < cum_dur_flat.unsqueeze(1)
|
||
|
path = paddle.reshape(path, [b, t_x, t_y])
|
||
|
'''
|
||
|
path will be like (t_x = 3, t_y = 5):
|
||
|
[[[1., 1., 0., 0., 0.], [[[1., 1., 0., 0., 0.],
|
||
|
[1., 1., 1., 1., 0.], --> [0., 0., 1., 1., 0.],
|
||
|
[1., 1., 1., 1., 1.]]] [0., 0., 0., 0., 1.]]]
|
||
|
'''
|
||
|
|
||
|
path = paddle.cast(path, dtype='float32')
|
||
|
path = path - F.pad(path, [0, 0, 1, 0, 0, 0])[:, :-1]
|
||
|
return path.unsqueeze(1).transpose([0, 1, 3, 2]) * mask
|