You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
PaddleSpeech/paddlespeech/t2s/models/speedyspeech/speedyspeech.py

381 lines
14 KiB

# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List
import paddle
from paddle import nn
from paddlespeech.t2s.modules.nets_utils import initialize
from paddlespeech.t2s.modules.predictor.length_regulator import LengthRegulator
from paddlespeech.t2s.modules.transformer.embedding import ScaledPositionalEncoding
class ResidualBlock(nn.Layer):
def __init__(self,
channels: int=128,
kernel_size: int=3,
dilation: int=3,
n: int=2):
"""SpeedySpeech encoder module.
Args:
channels (int, optional): Feature size of the residual output(and also the input).
kernel_size (int, optional): Kernel size of the 1D convolution.
dilation (int, optional): Dilation of the 1D convolution.
n (int): Number of blocks.
"""
super().__init__()
total_pad = (dilation * (kernel_size - 1))
begin = total_pad // 2
end = total_pad - begin
# remove padding='same' here, cause onnx don't support dilation + 'same' padding
blocks = [
nn.Sequential(
nn.Conv1D(
channels,
channels,
kernel_size,
dilation=dilation,
# make sure output T == input T
padding=((0, 0), (0, 0), (begin, end))),
nn.ReLU(),
nn.BatchNorm1D(channels), ) for _ in range(n)
]
self.blocks = nn.Sequential(*blocks)
def forward(self, x: paddle.Tensor):
"""Calculate forward propagation.
Args:
x(Tensor): Batch of input sequences (B, hidden_size, Tmax).
Returns:
Tensor: The residual output (B, hidden_size, Tmax).
"""
return x + self.blocks(x)
class TextEmbedding(nn.Layer):
def __init__(self,
vocab_size: int,
embedding_size: int,
tone_vocab_size: int=None,
tone_embedding_size: int=None,
padding_idx: int=None,
tone_padding_idx: int=None,
concat: bool=False):
super().__init__()
self.text_embedding = nn.Embedding(vocab_size, embedding_size,
padding_idx)
if tone_vocab_size:
tone_embedding_size = tone_embedding_size or embedding_size
if tone_embedding_size != embedding_size and not concat:
raise ValueError(
"embedding size != tone_embedding size, only conat is avaiable."
)
self.tone_embedding = nn.Embedding(
tone_vocab_size, tone_embedding_size, tone_padding_idx)
self.concat = concat
def forward(self, text: paddle.Tensor, tone: paddle.Tensor=None):
"""Calculate forward propagation.
Args:
text(Tensor(int64)): Batch of padded token ids (B, Tmax).
tones(Tensor, optional(int64)): Batch of padded tone ids (B, Tmax).
Returns:
Tensor: The residual output (B, Tmax, embedding_size).
"""
text_embed = self.text_embedding(text)
if tone is None:
return text_embed
tone_embed = self.tone_embedding(tone)
if self.concat:
embed = paddle.concat([text_embed, tone_embed], -1)
else:
embed = text_embed + tone_embed
return embed
class SpeedySpeechEncoder(nn.Layer):
"""SpeedySpeech encoder module.
Args:
vocab_size (int): Dimension of the inputs.
tone_size (Optional[int]): Number of tones.
hidden_size (int): Number of encoder hidden units.
kernel_size (int): Kernel size of encoder.
dilations (List[int]): Dilations of encoder.
spk_num (Optional[int]): Number of speakers.
"""
def __init__(self,
vocab_size: int,
tone_size: int,
hidden_size: int=128,
kernel_size: int=3,
dilations: List[int]=[1, 3, 9, 27, 1, 3, 9, 27, 1, 1],
spk_num=None):
super().__init__()
self.embedding = TextEmbedding(
vocab_size,
hidden_size,
tone_size,
padding_idx=0,
tone_padding_idx=0)
if spk_num:
self.spk_emb = nn.Embedding(
num_embeddings=spk_num,
embedding_dim=hidden_size,
padding_idx=0)
else:
self.spk_emb = None
self.prenet = nn.Sequential(
nn.Linear(hidden_size, hidden_size),
nn.ReLU(), )
res_blocks = [
ResidualBlock(hidden_size, kernel_size, d, n=2) for d in dilations
]
self.res_blocks = nn.Sequential(*res_blocks)
self.postnet1 = nn.Sequential(nn.Linear(hidden_size, hidden_size))
self.postnet2 = nn.Sequential(
nn.ReLU(),
nn.BatchNorm1D(hidden_size), )
self.linear = nn.Linear(hidden_size, hidden_size)
def forward(self,
text: paddle.Tensor,
tones: paddle.Tensor,
spk_id: paddle.Tensor=None):
"""Encoder input sequence.
Args:
text(Tensor(int64)): Batch of padded token ids (B, Tmax).
tones(Tensor, optional(int64)): Batch of padded tone ids (B, Tmax).
spk_id(Tnesor, optional(int64)): Batch of speaker ids (B,)
Returns:
Tensor: Output tensor (B, Tmax, hidden_size).
"""
embedding = self.embedding(text, tones)
if self.spk_emb:
embedding += self.spk_emb(spk_id).unsqueeze(1)
embedding = self.prenet(embedding)
x = self.res_blocks(embedding.transpose([0, 2, 1])).transpose([0, 2, 1])
# (B, T, dim)
x = embedding + self.postnet1(x)
x = self.postnet2(x.transpose([0, 2, 1])).transpose([0, 2, 1])
x = self.linear(x)
return x
class DurationPredictor(nn.Layer):
def __init__(self, hidden_size: int=128):
super().__init__()
self.layers = nn.Sequential(
ResidualBlock(hidden_size, 4, 1, n=1),
ResidualBlock(hidden_size, 3, 1, n=1),
ResidualBlock(hidden_size, 1, 1, n=1), )
self.linear = nn.Linear(hidden_size, 1)
def forward(self, x: paddle.Tensor):
"""Calculate forward propagation.
Args:
x(Tensor): Batch of input sequences (B, Tmax, hidden_size).
Returns:
Tensor: Batch of predicted durations in log domain (B, Tmax).
"""
x = self.layers(x.transpose([0, 2, 1])).transpose([0, 2, 1])
x = self.linear(x)
return paddle.squeeze(x, -1)
class SpeedySpeechDecoder(nn.Layer):
def __init__(self,
hidden_size: int=128,
output_size: int=80,
kernel_size: int=3,
dilations: List[int]=[
1, 3, 9, 27, 1, 3, 9, 27, 1, 3, 9, 27, 1, 3, 9, 27, 1, 1
]):
"""SpeedySpeech decoder module.
Args:
hidden_size (int): Number of decoder hidden units.
kernel_size (int): Kernel size of decoder.
output_size (int): Dimension of the outputs.
dilations (List[int]): Dilations of decoder.
"""
super().__init__()
res_blocks = [
ResidualBlock(hidden_size, kernel_size, d, n=2) for d in dilations
]
self.res_blocks = nn.Sequential(*res_blocks)
self.postnet1 = nn.Sequential(nn.Linear(hidden_size, hidden_size))
self.postnet2 = ResidualBlock(hidden_size, kernel_size, 1, n=2)
self.linear = nn.Linear(hidden_size, output_size)
def forward(self, x):
"""Decoder input sequence.
Args:
x(Tensor): Input tensor (B, time, hidden_size).
Returns:
Tensor: Output tensor (B, time, output_size).
"""
xx = self.res_blocks(x.transpose([0, 2, 1])).transpose([0, 2, 1])
x = x + self.postnet1(xx)
x = self.postnet2(x.transpose([0, 2, 1])).transpose([0, 2, 1])
x = self.linear(x)
return x
class SpeedySpeech(nn.Layer):
def __init__(
self,
vocab_size,
encoder_hidden_size: int=128,
encoder_kernel_size: int=3,
encoder_dilations: List[int]=[1, 3, 9, 27, 1, 3, 9, 27, 1, 1],
duration_predictor_hidden_size: int=128,
decoder_hidden_size: int=128,
decoder_output_size: int=80,
decoder_kernel_size: int=3,
decoder_dilations: List[
int]=[1, 3, 9, 27, 1, 3, 9, 27, 1, 3, 9, 27, 1, 3, 9, 27, 1, 1],
tone_size: int=None,
spk_num: int=None,
init_type: str="xavier_uniform",
positional_dropout_rate: int=0.1):
"""Initialize SpeedySpeech module.
Args:
vocab_size (int): Dimension of the inputs.
encoder_hidden_size (int): Number of encoder hidden units.
encoder_kernel_size (int): Kernel size of encoder.
encoder_dilations (List[int]): Dilations of encoder.
duration_predictor_hidden_size (int): Number of duration predictor hidden units.
decoder_hidden_size (int): Number of decoder hidden units.
decoder_kernel_size (int): Kernel size of decoder.
decoder_dilations (List[int]): Dilations of decoder.
decoder_output_size (int): Dimension of the outputs.
tone_size (Optional[int]): Number of tones.
spk_num (Optional[int]): Number of speakers.
init_type (str): How to initialize transformer parameters.
"""
super().__init__()
# initialize parameters
initialize(self, init_type)
encoder = SpeedySpeechEncoder(vocab_size, tone_size,
encoder_hidden_size, encoder_kernel_size,
encoder_dilations, spk_num)
duration_predictor = DurationPredictor(duration_predictor_hidden_size)
decoder = SpeedySpeechDecoder(decoder_hidden_size, decoder_output_size,
decoder_kernel_size, decoder_dilations)
self.position_enc = ScaledPositionalEncoding(encoder_hidden_size,
positional_dropout_rate)
self.encoder = encoder
self.duration_predictor = duration_predictor
self.decoder = decoder
# define length regulator
self.length_regulator = LengthRegulator()
nn.initializer.set_global_initializer(None)
def forward(self,
text: paddle.Tensor,
tones: paddle.Tensor,
durations: paddle.Tensor,
spk_id: paddle.Tensor=None):
"""Calculate forward propagation.
Args:
text(Tensor(int64)): Batch of padded token ids (B, Tmax).
durations(Tensor(int64)): Batch of padded durations (B, Tmax).
tones(Tensor, optional(int64)): Batch of padded tone ids (B, Tmax).
spk_id(Tnesor, optional(int64)): Batch of speaker ids (B,)
Returns:
Tensor: Output tensor (B, T_frames, decoder_output_size).
Tensor: Predicted durations (B, Tmax).
"""
# input of embedding must be int64
text = paddle.cast(text, 'int64')
tones = paddle.cast(tones, 'int64')
if spk_id is not None:
spk_id = paddle.cast(spk_id, 'int64')
durations = paddle.cast(durations, 'int64')
encodings = self.encoder(text, tones, spk_id)
pred_durations = self.duration_predictor(encodings.detach())
# expand encodings
durations_to_expand = durations
encodings = self.length_regulator(encodings, durations_to_expand)
encodings = self.position_enc(encodings)
# decode
decoded = self.decoder(encodings)
return decoded, pred_durations
def inference(self,
text: paddle.Tensor,
tones: paddle.Tensor=None,
durations: paddle.Tensor=None,
spk_id: paddle.Tensor=None):
"""Generate the sequence of features given the sequences of characters.
Args:
text(Tensor(int64)): Input sequence of characters (T,).
tones(Tensor, optional(int64)): Batch of padded tone ids (T, ).
durations(Tensor, optional (int64)): Groundtruth of duration (T,).
spk_id(Tensor, optional(int64), optional): spk ids (1,). (Default value = None)
Returns:
Tensor: logmel (T, decoder_output_size).
"""
# input of embedding must be int64
text = paddle.cast(text, 'int64')
text = text.unsqueeze(0)
if tones is not None:
tones = paddle.cast(tones, 'int64')
tones = tones.unsqueeze(0)
encodings = self.encoder(text, tones, spk_id)
if durations is None:
# (1, T)
pred_durations = self.duration_predictor(encodings)
durations_to_expand = paddle.round(pred_durations.exp())
durations_to_expand = durations_to_expand.astype(paddle.int64)
else:
durations_to_expand = durations
encodings = self.length_regulator(
encodings, durations_to_expand, is_inference=True)
encodings = self.position_enc(encodings)
decoded = self.decoder(encodings)
return decoded[0]
class SpeedySpeechInference(nn.Layer):
def __init__(self, normalizer, speedyspeech_model):
super().__init__()
self.normalizer = normalizer
self.acoustic_model = speedyspeech_model
def forward(self, phones, tones, spk_id=None, durations=None):
normalized_mel = self.acoustic_model.inference(
phones, tones, durations=durations, spk_id=spk_id)
logmel = self.normalizer.inverse(normalized_mel)
return logmel