# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Modified from espnet(https://github.com/espnet/espnet)
""" Tacotron2 encoder related modules. """
import paddle
from paddle import nn
class Encoder ( nn . Layer ) :
""" Encoder module of Spectrogram prediction network.
This is a module of encoder of Spectrogram prediction network in Tacotron2 ,
which described in ` Natural TTS Synthesis by Conditioning WaveNet on Mel
Spectrogram Predictions ` _ . This is the encoder which converts either a sequence
of characters or acoustic features into the sequence of hidden states .
. . _ ` Natural TTS Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions ` :
https : / / arxiv . org / abs / 1712.05884
"""
def __init__ (
self ,
idim ,
input_layer = " embed " ,
embed_dim = 512 ,
elayers = 1 ,
eunits = 512 ,
econv_layers = 3 ,
econv_chans = 512 ,
econv_filts = 5 ,
use_batch_norm = True ,
use_residual = False ,
dropout_rate = 0.5 ,
padding_idx = 0 , ) :
""" Initialize Tacotron2 encoder module.
Args :
idim ( int ) :
Dimension of the inputs .
input_layer ( str ) :
Input layer type .
embed_dim ( int , optional ) :
Dimension of character embedding .
elayers ( int , optional ) :
The number of encoder blstm layers .
eunits ( int , optional ) :
The number of encoder blstm units .
econv_layers ( int , optional ) :
The number of encoder conv layers .
econv_filts ( int , optional ) :
The number of encoder conv filter size .
econv_chans ( int , optional ) :
The number of encoder conv filter channels .
use_batch_norm ( bool , optional ) :
Whether to use batch normalization .
use_residual ( bool , optional ) :
Whether to use residual connection .
dropout_rate ( float , optional ) :
Dropout rate .
"""
super ( ) . __init__ ( )
# store the hyperparameters
self . idim = idim
self . use_residual = use_residual
# define network layer modules
if input_layer == " linear " :
self . embed = nn . Linear ( idim , econv_chans )
elif input_layer == " embed " :
self . embed = nn . Embedding ( idim , embed_dim , padding_idx = padding_idx )
else :
raise ValueError ( " unknown input_layer: " + input_layer )
if econv_layers > 0 :
self . convs = nn . LayerList ( )
for layer in range ( econv_layers ) :
ichans = ( embed_dim if layer == 0 and input_layer == " embed "
else econv_chans )
if use_batch_norm :
self . convs . append (
nn . Sequential (
nn . Conv1D (
ichans ,
econv_chans ,
econv_filts ,
stride = 1 ,
padding = ( econv_filts - 1 ) / / 2 ,
bias_attr = False , ) ,
nn . BatchNorm1D ( econv_chans ) ,
nn . ReLU ( ) ,
nn . Dropout ( dropout_rate ) , ) )
else :
self . convs + = [
nn . Sequential (
nn . Conv1D (
ichans ,
econv_chans ,
econv_filts ,
stride = 1 ,
padding = ( econv_filts - 1 ) / / 2 ,
bias_attr = False , ) ,
nn . ReLU ( ) ,
nn . Dropout ( dropout_rate ) , )
]
else :
self . convs = None
if elayers > 0 :
iunits = econv_chans if econv_layers != 0 else embed_dim
# batch_first=True, bidirectional=True
self . blstm = nn . LSTM (
iunits ,
eunits / / 2 ,
elayers ,
time_major = False ,
direction = ' bidirectional ' ,
bias_ih_attr = True ,
bias_hh_attr = True )
self . blstm . flatten_parameters ( )
else :
self . blstm = None
# # initialize
# self.apply(encoder_init)
def forward ( self , xs , ilens = None ) :
""" Calculate forward propagation.
Args :
xs ( Tensor ) :
Batch of the padded sequence . Either character ids ( B , Tmax )
or acoustic feature ( B , Tmax , idim * encoder_reduction_factor ) .
Padded value should be 0.
ilens ( Tensor ( int64 ) ) :
Batch of lengths of each input batch ( B , ) .
Returns :
Tensor :
Batch of the sequences of encoder states ( B , Tmax , eunits ) .
Tensor ( int64 ) :
Batch of lengths of each sequence ( B , )
"""
xs = self . embed ( xs ) . transpose ( [ 0 , 2 , 1 ] )
if self . convs is not None :
for i in range ( len ( self . convs ) ) :
if self . use_residual :
xs + = self . convs [ i ] ( xs )
else :
xs = self . convs [ i ] ( xs )
if self . blstm is None :
return xs . transpose ( [ 0 , 2 , 1 ] )
if not isinstance ( ilens , paddle . Tensor ) :
ilens = paddle . to_tensor ( ilens )
xs = xs . transpose ( [ 0 , 2 , 1 ] )
# for dygraph to static graph
# self.blstm.flatten_parameters()
# (B, Tmax, C)
# see https://www.paddlepaddle.org.cn/documentation/docs/zh/faq/train_cn.html#paddletorch-nn-utils-rnn-pack-padded-sequencetorch-nn-utils-rnn-pad-packed-sequenceapi
xs , _ = self . blstm ( xs , sequence_length = ilens )
hlens = ilens
return xs , hlens
def inference ( self , x ) :
""" Inference.
Args :
x ( Tensor ) :
The sequeunce of character ids ( T , ) or acoustic feature ( T , idim * encoder_reduction_factor ) .
Returns :
Tensor : The sequences of encoder states ( T , eunits ) .
"""
xs = x . unsqueeze ( 0 )
ilens = paddle . shape ( x ) [ 0 ]
return self . forward ( xs , ilens ) [ 0 ] [ 0 ]