From f191d0b0224e5565364a5564a31e35ce9abe06b8 Mon Sep 17 00:00:00 2001 From: Jerryuhoo Date: Tue, 4 Jan 2022 14:21:12 +0800 Subject: [PATCH] change speaker embedding position Move the speaker embedding from the SpeedySpeech model into SpeedySpeechEncoder, where it is added to the text embedding before the prenet; the separate spk_embed_dim and spk_embed_integration_type options are removed. --- examples/aishell3/tts2/default_multi.yaml | 52 ----------- .../t2s/models/speedyspeech/speedyspeech.py | 88 ++++--------------- 2 files changed, 18 insertions(+), 122 deletions(-) delete mode 100644 examples/aishell3/tts2/default_multi.yaml diff --git a/examples/aishell3/tts2/default_multi.yaml b/examples/aishell3/tts2/default_multi.yaml deleted file mode 100644 index dee78f2c..00000000 --- a/examples/aishell3/tts2/default_multi.yaml +++ /dev/null @@ -1,52 +0,0 @@ -########################################################### -# FEATURE EXTRACTION SETTING # -########################################################### -fs: 24000 # Sampling rate. -n_fft: 2048 # FFT size (samples). -n_shift: 300 # Hop size (samples). 12.5ms -win_length: 1200 # Window length (samples). 50ms - # If set to null, it will be the same as fft_size. -window: "hann" # Window function. -n_mels: 80 # Number of mel basis. -fmin: 80 # Minimum freq in mel basis calculation. -fmax: 7600 # Maximum frequency in mel basis calculation. 
- -########################################################### -# DATA SETTING # -########################################################### -batch_size: 32 -num_workers: 4 - -########################################################### -# MODEL SETTING # -########################################################### -model: - encoder_hidden_size: 128 - encoder_kernel_size: 3 - encoder_dilations: [1, 3, 9, 27, 1, 3, 9, 27, 1, 1] - duration_predictor_hidden_size: 128 - decoder_hidden_size: 128 - decoder_output_size: 80 - decoder_kernel_size: 3 - decoder_dilations: [1, 3, 9, 27, 1, 3, 9, 27, 1, 3, 9, 27, 1, 3, 9, 27, 1, 1] - spk_embed_dim: 256 - spk_embed_integration_type: add # speaker embedding integration type - -########################################################### -# OPTIMIZER SETTING # -########################################################### -optimizer: - optim: adam # optimizer type - learning_rate: 0.002 # learning rate - max_grad_norm: 1 - -########################################################### -# TRAINING SETTING # -########################################################### -max_epoch: 100 -num_snapshots: 5 - -########################################################### -# OTHER SETTING # -########################################################### -seed: 10086 \ No newline at end of file diff --git a/paddlespeech/t2s/models/speedyspeech/speedyspeech.py b/paddlespeech/t2s/models/speedyspeech/speedyspeech.py index b03fbb58..ed085dfd 100644 --- a/paddlespeech/t2s/models/speedyspeech/speedyspeech.py +++ b/paddlespeech/t2s/models/speedyspeech/speedyspeech.py @@ -96,7 +96,7 @@ class TextEmbedding(nn.Layer): class SpeedySpeechEncoder(nn.Layer): def __init__(self, vocab_size, tone_size, hidden_size, kernel_size, - dilations): + dilations, spk_num=None): super().__init__() self.embedding = TextEmbedding( vocab_size, @@ -104,6 +104,15 @@ class SpeedySpeechEncoder(nn.Layer): tone_size, padding_idx=0, tone_padding_idx=0) + + if spk_num: + self.spk_emb = 
nn.Embedding( + num_embeddings=spk_num, + embedding_dim=hidden_size, + padding_idx=0) + else: + self.spk_emb = None + self.prenet = nn.Sequential( nn.Linear(hidden_size, hidden_size), nn.ReLU(), ) @@ -118,8 +127,10 @@ class SpeedySpeechEncoder(nn.Layer): nn.BatchNorm1D(hidden_size, data_format="NLC"), nn.Linear(hidden_size, hidden_size), ) - def forward(self, text, tones): + def forward(self, text, tones, spk_id=None): embedding = self.embedding(text, tones) + if self.spk_emb: + embedding += self.spk_emb(spk_id).unsqueeze(1) embedding = self.prenet(embedding) x = self.res_blocks(embedding) x = embedding + self.postnet1(x) @@ -172,14 +183,11 @@ class SpeedySpeech(nn.Layer): decoder_kernel_size, decoder_dilations, tone_size=None, - spk_num: int=None, - spk_embed_dim: int=None, - spk_embed_integration_type: str="add", - ): + spk_num=None): super().__init__() encoder = SpeedySpeechEncoder(vocab_size, tone_size, encoder_hidden_size, encoder_kernel_size, - encoder_dilations) + encoder_dilations, spk_num) duration_predictor = DurationPredictor(duration_predictor_hidden_size) decoder = SpeedySpeechDecoder(decoder_hidden_size, decoder_output_size, decoder_kernel_size, decoder_dilations) @@ -187,27 +195,6 @@ class SpeedySpeech(nn.Layer): self.encoder = encoder self.duration_predictor = duration_predictor self.decoder = decoder - self.spk_embed_dim = spk_embed_dim - # use idx 0 as padding idx - self.padding_idx = 0 - - if self.spk_embed_dim is not None: - self.spk_embed_integration_type = spk_embed_integration_type - if spk_num and self.spk_embed_dim: - self.spk_embedding_table = nn.Embedding( - num_embeddings=spk_num, - embedding_dim=self.spk_embed_dim, - padding_idx=self.padding_idx) - self.encoder_hidden_size = encoder_hidden_size - # define additional projection for speaker embedding - - if self.spk_embed_dim is not None: - print("spk_embed_integration_type------------", spk_embed_integration_type) - if self.spk_embed_integration_type == "add": - self.spk_projection = 
nn.Linear(self.spk_embed_dim, self.encoder_hidden_size) - else: - self.spk_projection = nn.Linear(self.encoder_hidden_size + self.spk_embed_dim, self.encoder_hidden_size) - def forward(self, text, tones, durations, spk_id: paddle.Tensor=None): # input of embedding must be int64 @@ -216,13 +203,7 @@ class SpeedySpeech(nn.Layer): if spk_id is not None: spk_id = paddle.cast(spk_id, 'int64') durations = paddle.cast(durations, 'int64') - encodings = self.encoder(text, tones) - # (B, T) - - if self.spk_embed_dim is not None: - if spk_id is not None: - spk_emb = self.spk_embedding_table(spk_id) - encodings = self._integrate_with_spk_embed(encodings, spk_emb) + encodings = self.encoder(text, tones, spk_id) pred_durations = self.duration_predictor(encodings.detach()) @@ -237,7 +218,7 @@ class SpeedySpeech(nn.Layer): decoded = self.decoder(encodings) return decoded, pred_durations - def inference(self, text, tones=None, spk_id=None,): + def inference(self, text, tones=None, spk_id=None): # text: [T] # tones: [T] # input of embedding must be int64 @@ -247,11 +228,7 @@ class SpeedySpeech(nn.Layer): tones = paddle.cast(tones, 'int64') tones = tones.unsqueeze(0) - encodings = self.encoder(text, tones) - if self.spk_embed_dim is not None: - if spk_id is not None: - spk_emb = self.spk_embedding_table(spk_id) - encodings = self._integrate_with_spk_embed(encodings, spk_emb) + encodings = self.encoder(text, tones, spk_id) pred_durations = self.duration_predictor(encodings) # (1, T) durations_to_expand = paddle.round(pred_durations.exp()) @@ -278,35 +255,6 @@ class SpeedySpeech(nn.Layer): decoded = self.decoder(encodings) return decoded[0] - def _integrate_with_spk_embed(self, hs, spk_emb): - """Integrate speaker embedding with hidden states. - - Parameters - ---------- - hs : Tensor - Batch of hidden state sequences (B, Tmax, adim). - spk_emb : Tensor - Batch of speaker embeddings (B, spk_embed_dim). 
- - Returns - ---------- - Tensor - Batch of integrated hidden state sequences (B, Tmax, adim) - """ - if self.spk_embed_integration_type == "add": - # apply projection and then add to hidden states - spk_emb = self.spk_projection(F.normalize(spk_emb)) - hs = hs + spk_emb.unsqueeze(1) - elif self.spk_embed_integration_type == "concat": - # concat hidden states with spk embeds and then apply projection - spk_emb = F.normalize(spk_emb).unsqueeze(1).expand( - shape=[-1, hs.shape[1], -1]) - hs = self.spk_projection(paddle.concat([hs, spk_emb], axis=-1)) - else: - raise NotImplementedError("support only add or concat.") - - return hs - class SpeedySpeechInference(nn.Layer): def __init__(self, normalizer, speedyspeech_model): super().__init__()