change speaker embedding position

Move the speaker embedding integration into the encoder.
pull/1259/head
Jerryuhoo 3 years ago
parent 11991b6d35
commit f191d0b022
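In short: the speaker embedding table and the add-style integration move from `SpeedySpeech` into `SpeedySpeechEncoder`, and the separate `_integrate_with_spk_embed` projection path is dropped. The calling convention changes roughly like this (a sketch distilled from the hunks below, not verbatim repo code):

```python
# Before: encode first, then look up and fuse the speaker embedding.
encodings = self.encoder(text, tones)
spk_emb = self.spk_embedding_table(spk_id)
encodings = self._integrate_with_spk_embed(encodings, spk_emb)

# After: the encoder adds the speaker embedding to the text embedding itself.
encodings = self.encoder(text, tones, spk_id)
```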

@@ -1,52 +0,0 @@
-###########################################################
-#                FEATURE EXTRACTION SETTING               #
-###########################################################
-fs: 24000          # Sampling rate.
-n_fft: 2048        # FFT size (samples).
-n_shift: 300       # Hop size (samples). 12.5ms
-win_length: 1200   # Window length (samples). 50ms
-                   # If set to null, it will be the same as fft_size.
-window: "hann"     # Window function.
-n_mels: 80         # Number of mel basis.
-fmin: 80           # Minimum freq in mel basis calculation.
-fmax: 7600         # Maximum frequency in mel basis calculation.
-###########################################################
-#                       DATA SETTING                      #
-###########################################################
-batch_size: 32
-num_workers: 4
-###########################################################
-#                      MODEL SETTING                      #
-###########################################################
-model:
-  encoder_hidden_size: 128
-  encoder_kernel_size: 3
-  encoder_dilations: [1, 3, 9, 27, 1, 3, 9, 27, 1, 1]
-  duration_predictor_hidden_size: 128
-  decoder_hidden_size: 128
-  decoder_output_size: 80
-  decoder_kernel_size: 3
-  decoder_dilations: [1, 3, 9, 27, 1, 3, 9, 27, 1, 3, 9, 27, 1, 3, 9, 27, 1, 1]
-  spk_embed_dim: 256
-  spk_embed_integration_type: add   # speaker embedding integration type
-###########################################################
-#                    OPTIMIZER SETTING                    #
-###########################################################
-optimizer:
-  optim: adam            # optimizer type
-  learning_rate: 0.002   # learning rate
-  max_grad_norm: 1
-###########################################################
-#                     TRAINING SETTING                    #
-###########################################################
-max_epoch: 100
-num_snapshots: 5
-###########################################################
-#                      OTHER SETTING                      #
-###########################################################
-seed: 10086
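For orientation, a minimal sketch of how a YAML config like the removed one is typically consumed. The file name, the plain-dict access, and the `nn.Linear` stand-in for the model are illustrative assumptions, not the repo's actual training pipeline:

```python
import yaml
import paddle
import paddle.nn as nn

# Parse the YAML into a plain dict (illustrative file name).
with open("default.yaml") as f:
    config = yaml.safe_load(f)

model_conf = config["model"]      # e.g. model_conf["encoder_hidden_size"] == 128
optim_conf = config["optimizer"]

# Stand-in for a constructed SpeedySpeech instance.
model = nn.Linear(8, 8)

# Adam with global-norm gradient clipping, matching the
# optim / learning_rate / max_grad_norm keys above.
optimizer = paddle.optimizer.Adam(
    learning_rate=optim_conf["learning_rate"],
    grad_clip=nn.ClipGradByGlobalNorm(optim_conf["max_grad_norm"]),
    parameters=model.parameters())
```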

@@ -96,7 +96,7 @@ class TextEmbedding(nn.Layer):

 class SpeedySpeechEncoder(nn.Layer):
     def __init__(self, vocab_size, tone_size, hidden_size, kernel_size,
-                 dilations):
+                 dilations, spk_num=None):
         super().__init__()
         self.embedding = TextEmbedding(
             vocab_size,
@@ -104,6 +104,15 @@ class SpeedySpeechEncoder(nn.Layer):
             tone_size,
             padding_idx=0,
             tone_padding_idx=0)
+        if spk_num:
+            self.spk_emb = nn.Embedding(
+                num_embeddings=spk_num,
+                embedding_dim=hidden_size,
+                padding_idx=0)
+        else:
+            self.spk_emb = None
         self.prenet = nn.Sequential(
             nn.Linear(hidden_size, hidden_size),
             nn.ReLU(), )
@@ -118,8 +127,10 @@ class SpeedySpeechEncoder(nn.Layer):
             nn.BatchNorm1D(hidden_size, data_format="NLC"),
             nn.Linear(hidden_size, hidden_size), )

-    def forward(self, text, tones):
+    def forward(self, text, tones, spk_id=None):
         embedding = self.embedding(text, tones)
+        if self.spk_emb:
+            embedding += self.spk_emb(spk_id).unsqueeze(1)
         embedding = self.prenet(embedding)
         x = self.res_blocks(embedding)
         x = embedding + self.postnet1(x)
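The integration itself is a broadcast add: the speaker table uses `embedding_dim=hidden_size`, so the looked-up vector matches the text embedding width and needs no projection. `unsqueeze(1)` turns the (B, hidden_size) lookup into (B, 1, hidden_size), which broadcasts over the time axis of the (B, T, hidden_size) text embedding. A standalone sketch of just this step, with illustrative sizes:

```python
import paddle
import paddle.nn as nn

B, T, hidden_size, spk_num = 2, 50, 128, 10

spk_emb = nn.Embedding(
    num_embeddings=spk_num,
    embedding_dim=hidden_size,  # same width as the text embedding
    padding_idx=0)              # speaker id 0 maps to an all-zero vector

embedding = paddle.randn([B, T, hidden_size])    # stand-in for self.embedding(text, tones)
spk_id = paddle.to_tensor([3, 7], dtype='int64')

# (B, hidden_size) -> (B, 1, hidden_size), then broadcast over the T axis.
embedding = embedding + spk_emb(spk_id).unsqueeze(1)
print(embedding.shape)  # [2, 50, 128]
```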
@@ -172,14 +183,11 @@ class SpeedySpeech(nn.Layer):
             decoder_kernel_size,
             decoder_dilations,
             tone_size=None,
-            spk_num: int=None,
-            spk_embed_dim: int=None,
-            spk_embed_integration_type: str="add",
-    ):
+            spk_num=None):
         super().__init__()
         encoder = SpeedySpeechEncoder(vocab_size, tone_size,
                                       encoder_hidden_size, encoder_kernel_size,
-                                      encoder_dilations)
+                                      encoder_dilations, spk_num)
         duration_predictor = DurationPredictor(duration_predictor_hidden_size)
         decoder = SpeedySpeechDecoder(decoder_hidden_size, decoder_output_size,
                                       decoder_kernel_size, decoder_dilations)
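With the integration moved into the encoder, multi-speaker setup reduces to passing `spk_num` through the constructor. A hedged construction sketch; the hyperparameter values come from the removed config above, while `vocab_size`, `tone_size`, and `spk_num` values are illustrative, and the keyword names are inferred from the config keys and the encoder call in this hunk:

```python
model = SpeedySpeech(
    vocab_size=150,           # illustrative; size of the phone vocabulary
    encoder_hidden_size=128,
    encoder_kernel_size=3,
    encoder_dilations=[1, 3, 9, 27, 1, 3, 9, 27, 1, 1],
    duration_predictor_hidden_size=128,
    decoder_hidden_size=128,
    decoder_output_size=80,   # number of mel bins
    decoder_kernel_size=3,
    decoder_dilations=[1, 3, 9, 27, 1, 3, 9, 27, 1, 3, 9, 27, 1, 3, 9, 27, 1, 1],
    tone_size=25,             # illustrative
    spk_num=100)              # enables the encoder-side nn.Embedding
```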
@@ -187,27 +195,6 @@ class SpeedySpeech(nn.Layer):
         self.encoder = encoder
         self.duration_predictor = duration_predictor
         self.decoder = decoder
-        self.spk_embed_dim = spk_embed_dim
-        # use idx 0 as padding idx
-        self.padding_idx = 0
-        if self.spk_embed_dim is not None:
-            self.spk_embed_integration_type = spk_embed_integration_type
-        if spk_num and self.spk_embed_dim:
-            self.spk_embedding_table = nn.Embedding(
-                num_embeddings=spk_num,
-                embedding_dim=self.spk_embed_dim,
-                padding_idx=self.padding_idx)
-        self.encoder_hidden_size = encoder_hidden_size
-        # define additional projection for speaker embedding
-        if self.spk_embed_dim is not None:
-            print("spk_embed_integration_type------------", spk_embed_integration_type)
-            if self.spk_embed_integration_type == "add":
-                self.spk_projection = nn.Linear(self.spk_embed_dim, self.encoder_hidden_size)
-            else:
-                self.spk_projection = nn.Linear(self.encoder_hidden_size + self.spk_embed_dim, self.encoder_hidden_size)

     def forward(self, text, tones, durations, spk_id: paddle.Tensor=None):
         # input of embedding must be int64
@@ -216,13 +203,7 @@ class SpeedySpeech(nn.Layer):
         if spk_id is not None:
             spk_id = paddle.cast(spk_id, 'int64')
         durations = paddle.cast(durations, 'int64')
-        encodings = self.encoder(text, tones)
-        # (B, T)
-        if self.spk_embed_dim is not None:
-            if spk_id is not None:
-                spk_emb = self.spk_embedding_table(spk_id)
-                encodings = self._integrate_with_spk_embed(encodings, spk_emb)
+        encodings = self.encoder(text, tones, spk_id)

         pred_durations = self.duration_predictor(encodings.detach())
@@ -237,7 +218,7 @@ class SpeedySpeech(nn.Layer):
         decoded = self.decoder(encodings)
         return decoded, pred_durations

-    def inference(self, text, tones=None, spk_id=None,):
+    def inference(self, text, tones=None, spk_id=None):
         # text: [T]
         # tones: [T]
         # input of embedding must be int64
@@ -247,11 +228,7 @@ class SpeedySpeech(nn.Layer):
             tones = paddle.cast(tones, 'int64')
             tones = tones.unsqueeze(0)
-        encodings = self.encoder(text, tones)
-        if self.spk_embed_dim is not None:
-            if spk_id is not None:
-                spk_emb = self.spk_embedding_table(spk_id)
-                encodings = self._integrate_with_spk_embed(encodings, spk_emb)
+        encodings = self.encoder(text, tones, spk_id)

         pred_durations = self.duration_predictor(encodings)  # (1, T)
         durations_to_expand = paddle.round(pred_durations.exp())
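Inference follows the same path: `spk_id` is forwarded to the encoder, which does the lookup before the prenet. A hedged usage sketch, assuming `model` was built as in the earlier construction example; the id values are illustrative:

```python
import paddle

text = paddle.to_tensor([5, 12, 7, 33], dtype='int64')  # [T] phone ids
tones = paddle.to_tensor([1, 2, 0, 4], dtype='int64')   # [T] tone ids
spk_id = paddle.to_tensor([3], dtype='int64')           # speaker to synthesize

mel = model.inference(text, tones, spk_id)              # (T_frames, decoder_output_size)
```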
@@ -278,35 +255,6 @@ class SpeedySpeech(nn.Layer):
         decoded = self.decoder(encodings)
         return decoded[0]

-    def _integrate_with_spk_embed(self, hs, spk_emb):
-        """Integrate speaker embedding with hidden states.
-
-        Parameters
-        ----------
-        hs : Tensor
-            Batch of hidden state sequences (B, Tmax, adim).
-        spk_emb : Tensor
-            Batch of speaker embeddings (B, spk_embed_dim).
-
-        Returns
-        ----------
-        Tensor
-            Batch of integrated hidden state sequences (B, Tmax, adim)
-        """
-        if self.spk_embed_integration_type == "add":
-            # apply projection and then add to hidden states
-            spk_emb = self.spk_projection(F.normalize(spk_emb))
-            hs = hs + spk_emb.unsqueeze(1)
-        elif self.spk_embed_integration_type == "concat":
-            # concat hidden states with spk embeds and then apply projection
-            spk_emb = F.normalize(spk_emb).unsqueeze(1).expand(
-                shape=[-1, hs.shape[1], -1])
-            hs = self.spk_projection(paddle.concat([hs, spk_emb], axis=-1))
-        else:
-            raise NotImplementedError("support only add or concat.")
-        return hs

 class SpeedySpeechInference(nn.Layer):
     def __init__(self, normalizer, speedyspeech_model):
         super().__init__()
