commit bc0dd51149
@@ -0,0 +1,109 @@
###########################################################
#                FEATURE EXTRACTION SETTING               #
###########################################################

fs: 24000            # Sampling rate.
n_fft: 2048          # FFT size.
n_shift: 300         # Hop size.
win_length: 1200     # Window length.
                     # If set to null, it will be the same as n_fft.
window: "hann"       # Window function.

# Only used for feats_type != raw
fmin: 80             # Minimum frequency of Mel basis.
fmax: 7600           # Maximum frequency of Mel basis.
n_mels: 80           # The number of Mel basis.

# Only used for models that use pitch features (e.g. FastSpeech2)
f0min: 80            # Minimum f0 for pitch extraction.
f0max: 400           # Maximum f0 for pitch extraction.


###########################################################
#                       DATA SETTING                      #
###########################################################
batch_size: 64
num_workers: 4


###########################################################
#                       MODEL SETTING                     #
###########################################################
model:
    adim: 384                                    # attention dimension
    aheads: 2                                    # number of attention heads
    elayers: 4                                   # number of encoder layers
    eunits: 1536                                 # number of encoder ff units
    dlayers: 4                                   # number of decoder layers
    dunits: 1536                                 # number of decoder ff units
    positionwise_layer_type: conv1d              # type of position-wise layer
    positionwise_conv_kernel_size: 3             # kernel size of position-wise conv layer
    duration_predictor_layers: 2                 # number of layers of duration predictor
    duration_predictor_chans: 256                # number of channels of duration predictor
    duration_predictor_kernel_size: 3            # filter size of duration predictor
    postnet_layers: 5                            # number of layers of postnet
    postnet_filts: 5                             # filter size of conv layers in postnet
    postnet_chans: 256                           # number of channels of conv layers in postnet
    encoder_normalize_before: True               # whether to perform layer normalization before encoder blocks
    decoder_normalize_before: True               # whether to perform layer normalization before decoder blocks
    reduction_factor: 1                          # reduction factor
    encoder_type: conformer                      # encoder type
    decoder_type: conformer                      # decoder type
    conformer_pos_enc_layer_type: rel_pos        # conformer positional encoding type
    conformer_self_attn_layer_type: rel_selfattn # conformer self-attention type
    conformer_activation_type: swish             # conformer activation type
    use_macaron_style_in_conformer: true         # whether to use macaron style in conformer
    use_cnn_in_conformer: true                   # whether to use CNN in conformer
    conformer_enc_kernel_size: 7                 # kernel size in CNN module of conformer-based encoder
    conformer_dec_kernel_size: 31                # kernel size in CNN module of conformer-based decoder
    init_type: xavier_uniform                    # initialization type
    transformer_enc_dropout_rate: 0.2            # dropout rate for transformer encoder layer
    transformer_enc_positional_dropout_rate: 0.2 # dropout rate for transformer encoder positional encoding
    transformer_enc_attn_dropout_rate: 0.2       # dropout rate for transformer encoder attention layer
    transformer_dec_dropout_rate: 0.2            # dropout rate for transformer decoder layer
    transformer_dec_positional_dropout_rate: 0.2 # dropout rate for transformer decoder positional encoding
    transformer_dec_attn_dropout_rate: 0.2       # dropout rate for transformer decoder attention layer
    pitch_predictor_layers: 5                    # number of conv layers in pitch predictor
    pitch_predictor_chans: 256                   # number of channels of conv layers in pitch predictor
    pitch_predictor_kernel_size: 5               # kernel size of conv layers in pitch predictor
    pitch_predictor_dropout: 0.5                 # dropout rate in pitch predictor
    pitch_embed_kernel_size: 1                   # kernel size of conv embedding layer for pitch
    pitch_embed_dropout: 0.0                     # dropout rate after conv embedding layer for pitch
    stop_gradient_from_pitch_predictor: true     # whether to stop the gradient from pitch predictor to encoder
    energy_predictor_layers: 2                   # number of conv layers in energy predictor
    energy_predictor_chans: 256                  # number of channels of conv layers in energy predictor
    energy_predictor_kernel_size: 3              # kernel size of conv layers in energy predictor
    energy_predictor_dropout: 0.5                # dropout rate in energy predictor
    energy_embed_kernel_size: 1                  # kernel size of conv embedding layer for energy
    energy_embed_dropout: 0.0                    # dropout rate after conv embedding layer for energy
    stop_gradient_from_energy_predictor: false   # whether to stop the gradient from energy predictor to encoder


###########################################################
#                      UPDATER SETTING                    #
###########################################################
updater:
    use_masking: True    # whether to apply masking for padded part in loss calculation


###########################################################
#                     OPTIMIZER SETTING                   #
###########################################################
optimizer:
    optim: adam          # optimizer type
    learning_rate: 0.001 # learning rate


###########################################################
#                     TRAINING SETTING                    #
###########################################################
max_epoch: 1000
num_snapshots: 5


###########################################################
#                       OTHER SETTING                     #
###########################################################
seed: 10086
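As a sanity check, a config like this can be loaded with plain PyYAML. A minimal sketch; the filename and the assertions are illustrative, not part of this commit:

import yaml

with open("default.yaml") as f:
    config = yaml.safe_load(f)

# Quick consistency checks derived from the values above.
assert config["win_length"] <= config["n_fft"]
assert config["f0min"] < config["f0max"]

# A hop size of 300 at 24 kHz gives 24000 / 300 = 80 frames per second.
print(config["fs"] / config["n_shift"])  # 80.0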
@@ -1,38 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import random


def cycle(iterable):
    # cycle('ABCD') --> A B C D A B C D A B C D ...
    saved = []
    for element in iterable:
        yield element
        saved.append(element)
    while saved:
        for element in saved:
            yield element


def random_cycle(iterable):
    # random_cycle('ABCD') --> A B C D  B C D A  A D B C ...
    saved = []
    for element in iterable:
        yield element
        saved.append(element)
    random.shuffle(saved)
    while saved:
        for element in saved:
            yield element
        random.shuffle(saved)
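A quick illustration of the two generators above (snippet for illustration, not part of the commit). Both cycle endlessly, so they must be sliced:

from itertools import islice

from paddlespeech.t2s.exps.ge2e.random_cycle import cycle, random_cycle

# cycle repeats in the original order; random_cycle reshuffles each pass
# after the first one.
print(list(islice(cycle("ABCD"), 8)))         # A B C D A B C D
print(list(islice(random_cycle("ABCD"), 8)))  # A B C D, then a shuffled pass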
@@ -1,131 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import random
from pathlib import Path

import numpy as np
from paddle.io import BatchSampler
from paddle.io import Dataset

from paddlespeech.t2s.exps.ge2e.random_cycle import random_cycle


class MultiSpeakerMelDataset(Dataset):
    """A two-level directory that contains mel spectrograms in *.npy format.

    An example file structure is shown below. We prefer to preprocess raw
    datasets and organize them like this.

    dataset_root/
        speaker1/
            utterance1.npy
            utterance2.npy
            utterance3.npy
        speaker2/
            utterance1.npy
            utterance2.npy
            utterance3.npy
    """

    def __init__(self, dataset_root: Path):
        self.root = Path(dataset_root).expanduser()
        speaker_dirs = [f for f in self.root.glob("*") if f.is_dir()]

        speaker_utterances = {
            speaker_dir: list(speaker_dir.glob("*.npy"))
            for speaker_dir in speaker_dirs
        }

        self.speaker_dirs = speaker_dirs
        self.speaker_to_utterances = speaker_utterances

        # meta data
        self.num_speakers = len(self.speaker_dirs)
        self.num_utterances = sum(
            len(utterances)
            for utterances in self.speaker_to_utterances.values())

    def get_example_by_index(self, speaker_index, utterance_index):
        speaker_dir = self.speaker_dirs[speaker_index]
        fpath = self.speaker_to_utterances[speaker_dir][utterance_index]
        return self[fpath]

    def __getitem__(self, fpath):
        return np.load(fpath)

    def __len__(self):
        return int(self.num_utterances)


class MultiSpeakerSampler(BatchSampler):
    """A stratified sampler designed for the speaker verification task.

    First, N speakers are sampled randomly from all speakers. Then, for each
    speaker, M utterances are sampled randomly from its utterances.
    """

    def __init__(self,
                 dataset: MultiSpeakerMelDataset,
                 speakers_per_batch: int,
                 utterances_per_speaker: int):
        self._speakers = list(dataset.speaker_dirs)
        self._speaker_to_utterances = dataset.speaker_to_utterances

        self.speakers_per_batch = speakers_per_batch
        self.utterances_per_speaker = utterances_per_speaker

    def __iter__(self):
        # yield lists of Paths
        speaker_generator = iter(random_cycle(self._speakers))
        speaker_utterances_generator = {
            s: iter(random_cycle(us))
            for s, us in self._speaker_to_utterances.items()
        }

        while True:
            speakers = []
            for _ in range(self.speakers_per_batch):
                speakers.append(next(speaker_generator))

            utterances = []
            for s in speakers:
                us = speaker_utterances_generator[s]
                for _ in range(self.utterances_per_speaker):
                    utterances.append(next(us))
            yield utterances


class RandomClip(object):
    def __init__(self, frames):
        self.frames = frames

    def __call__(self, spec):
        # spec: [T, C]; assumes T >= self.frames
        T = spec.shape[0]
        start = random.randint(0, T - self.frames)
        return spec[start:start + self.frames, :]


class Collate(object):
    def __init__(self, num_frames):
        self.random_crop = RandomClip(num_frames)

    def __call__(self, examples):
        frame_clips = [self.random_crop(mel) for mel in examples]
        batched_clips = np.stack(frame_clips)
        return batched_clips


if __name__ == "__main__":
    mydataset = MultiSpeakerMelDataset(
        Path("/home/chenfeiyu/datasets/SV2TTS/encoder"))
    print(mydataset.get_example_by_index(0, 10))
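To make the batch layout concrete, a small usage sketch. The path and the sizes (4 speakers per batch, 5 utterances per speaker, 160-frame clips) are illustrative; the 80 mel channels follow the feature config above:

from pathlib import Path

from paddle.io import DataLoader

dataset = MultiSpeakerMelDataset(Path("~/datasets/SV2TTS/encoder"))
sampler = MultiSpeakerSampler(
    dataset, speakers_per_batch=4, utterances_per_speaker=5)
loader = DataLoader(
    dataset, batch_sampler=sampler, collate_fn=Collate(num_frames=160))

# Each batch stacks 4 * 5 = 20 random clips: shape [20, 160, 80].
batch = next(iter(loader))
print(batch.shape)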
@@ -1,123 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import time

from paddle import DataParallel
from paddle import distributed as dist
from paddle.io import DataLoader
from paddle.nn.clip import ClipGradByGlobalNorm
from paddle.optimizer import Adam

from paddlespeech.t2s.exps.ge2e.config import get_cfg_defaults
from paddlespeech.t2s.exps.ge2e.speaker_verification_dataset import Collate
from paddlespeech.t2s.exps.ge2e.speaker_verification_dataset import MultiSpeakerMelDataset
from paddlespeech.t2s.exps.ge2e.speaker_verification_dataset import MultiSpeakerSampler
from paddlespeech.t2s.models.lstm_speaker_encoder import LSTMSpeakerEncoder
from paddlespeech.t2s.training import default_argument_parser
from paddlespeech.t2s.training import ExperimentBase


class Ge2eExperiment(ExperimentBase):
    def setup_model(self):
        config = self.config
        model = LSTMSpeakerEncoder(config.data.n_mels, config.model.num_layers,
                                   config.model.hidden_size,
                                   config.model.embedding_size)
        optimizer = Adam(
            config.training.learning_rate_init,
            parameters=model.parameters(),
            grad_clip=ClipGradByGlobalNorm(3))
        self.model = DataParallel(model) if self.parallel else model
        self.model_core = model
        self.optimizer = optimizer

    def setup_dataloader(self):
        config = self.config
        train_dataset = MultiSpeakerMelDataset(self.args.data)
        sampler = MultiSpeakerSampler(train_dataset,
                                      config.training.speakers_per_batch,
                                      config.training.utterances_per_speaker)
        train_loader = DataLoader(
            train_dataset,
            batch_sampler=sampler,
            collate_fn=Collate(config.data.partial_n_frames),
            num_workers=16)

        self.train_dataset = train_dataset
        self.train_loader = train_loader

    def train_batch(self):
        start = time.time()
        batch = self.read_batch()
        data_loader_time = time.time() - start

        self.optimizer.clear_grad()
        self.model.train()
        specs = batch
        loss, eer = self.model(specs, self.config.training.speakers_per_batch)
        loss.backward()
        self.model_core.do_gradient_ops()
        self.optimizer.step()
        iteration_time = time.time() - start

        # logging
        loss_value = float(loss)
        msg = "Rank: {}, ".format(dist.get_rank())
        msg += "step: {}, ".format(self.iteration)
        msg += "time: {:>.3f}s/{:>.3f}s, ".format(data_loader_time,
                                                  iteration_time)
        msg += 'loss: {:>.6f} err: {:>.6f}'.format(loss_value, eer)
        self.logger.info(msg)

        if dist.get_rank() == 0:
            self.visualizer.add_scalar("train/loss", loss_value, self.iteration)
            self.visualizer.add_scalar("train/eer", eer, self.iteration)
            self.visualizer.add_scalar("param/w",
                                       float(self.model_core.similarity_weight),
                                       self.iteration)
            self.visualizer.add_scalar("param/b",
                                       float(self.model_core.similarity_bias),
                                       self.iteration)

    def valid(self):
        pass


def main_sp(config, args):
    exp = Ge2eExperiment(config, args)
    exp.setup()
    exp.resume_or_load()
    exp.run()


def main(config, args):
    if args.ngpu > 1:
        dist.spawn(main_sp, args=(config, args), nprocs=args.ngpu)
    else:
        main_sp(config, args)


if __name__ == "__main__":
    config = get_cfg_defaults()
    parser = default_argument_parser()
    args = parser.parse_args()
    if args.config:
        config.merge_from_file(args.config)
    if args.opts:
        config.merge_from_list(args.opts)
    config.freeze()
    print(config)
    print(args)

    main(config, args)
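The __main__ block above follows the usual yacs pattern (defaults, then file, then CLI overrides, then freeze), judging by the merge_from_file / merge_from_list / freeze calls. A self-contained demo of that pattern, with illustrative keys rather than the real defaults from get_cfg_defaults:

from yacs.config import CfgNode

# A tiny default config (stand-in for get_cfg_defaults()).
_C = CfgNode()
_C.training = CfgNode()
_C.training.learning_rate_init = 1e-3
_C.training.speakers_per_batch = 64

config = _C.clone()
# Equivalent of `config.merge_from_list(args.opts)` above.
config.merge_from_list(["training.speakers_per_batch", "32"])
config.freeze()
print(config.training.speakers_per_batch)  # 32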
@@ -1,274 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Modified from espnet(https://github.com/espnet/espnet)
"""Encoder definition."""
import logging

import paddle

from paddlespeech.t2s.modules.conformer.convolution import ConvolutionModule
from paddlespeech.t2s.modules.conformer.encoder_layer import EncoderLayer
from paddlespeech.t2s.modules.layer_norm import LayerNorm
from paddlespeech.t2s.modules.nets_utils import get_activation
from paddlespeech.t2s.modules.transformer.attention import LegacyRelPositionMultiHeadedAttention
from paddlespeech.t2s.modules.transformer.attention import MultiHeadedAttention
from paddlespeech.t2s.modules.transformer.attention import RelPositionMultiHeadedAttention
from paddlespeech.t2s.modules.transformer.embedding import LegacyRelPositionalEncoding
from paddlespeech.t2s.modules.transformer.embedding import PositionalEncoding
from paddlespeech.t2s.modules.transformer.embedding import RelPositionalEncoding
from paddlespeech.t2s.modules.transformer.embedding import ScaledPositionalEncoding
from paddlespeech.t2s.modules.transformer.multi_layer_conv import Conv1dLinear
from paddlespeech.t2s.modules.transformer.multi_layer_conv import MultiLayeredConv1d
from paddlespeech.t2s.modules.transformer.positionwise_feed_forward import PositionwiseFeedForward
from paddlespeech.t2s.modules.transformer.repeat import repeat
from paddlespeech.t2s.modules.transformer.subsampling import Conv2dSubsampling


class Encoder(paddle.nn.Layer):
    """Conformer encoder module.

    Parameters
    ----------
    idim : int
        Input dimension.
    attention_dim : int
        Dimension of attention.
    attention_heads : int
        The number of heads of multi head attention.
    linear_units : int
        The number of units of position-wise feed forward.
    num_blocks : int
        The number of encoder blocks.
    dropout_rate : float
        Dropout rate.
    positional_dropout_rate : float
        Dropout rate after adding positional encoding.
    attention_dropout_rate : float
        Dropout rate in attention.
    input_layer : Union[str, paddle.nn.Layer]
        Input layer type.
    normalize_before : bool
        Whether to use layer_norm before the first block.
    concat_after : bool
        Whether to concat attention layer's input and output.
        If True, an additional linear layer is applied,
        i.e. x -> x + linear(concat(x, att(x)));
        if False, no additional linear layer is applied, i.e. x -> x + att(x).
    positionwise_layer_type : str
        "linear", "conv1d", or "conv1d-linear".
    positionwise_conv_kernel_size : int
        Kernel size of positionwise conv1d layer.
    macaron_style : bool
        Whether to use macaron style for positionwise layer.
    pos_enc_layer_type : str
        Encoder positional encoding layer type.
    selfattention_layer_type : str
        Encoder attention layer type.
    activation_type : str
        Encoder activation function type.
    use_cnn_module : bool
        Whether to use convolution module.
    zero_triu : bool
        Whether to zero the upper triangular part of the attention matrix.
    cnn_module_kernel : int
        Kernel size of convolution module.
    padding_idx : int
        Padding idx for input_layer=embed.
    stochastic_depth_rate : float
        Maximum probability to skip the encoder layer.
    intermediate_layers : Union[List[int], None]
        Indices of intermediate CTC layers. Indices start from 1.
        If not None, intermediate outputs are returned (which changes
        the return type signature).
    """

    def __init__(
            self,
            idim,
            attention_dim=256,
            attention_heads=4,
            linear_units=2048,
            num_blocks=6,
            dropout_rate=0.1,
            positional_dropout_rate=0.1,
            attention_dropout_rate=0.0,
            input_layer="conv2d",
            normalize_before=True,
            concat_after=False,
            positionwise_layer_type="linear",
            positionwise_conv_kernel_size=1,
            macaron_style=False,
            pos_enc_layer_type="abs_pos",
            selfattention_layer_type="selfattn",
            activation_type="swish",
            use_cnn_module=False,
            zero_triu=False,
            cnn_module_kernel=31,
            padding_idx=-1,
            stochastic_depth_rate=0.0,
            intermediate_layers=None, ):
        """Construct an Encoder object."""
        super(Encoder, self).__init__()

        activation = get_activation(activation_type)
        if pos_enc_layer_type == "abs_pos":
            pos_enc_class = PositionalEncoding
        elif pos_enc_layer_type == "scaled_abs_pos":
            pos_enc_class = ScaledPositionalEncoding
        elif pos_enc_layer_type == "rel_pos":
            assert selfattention_layer_type == "rel_selfattn"
            pos_enc_class = RelPositionalEncoding
        elif pos_enc_layer_type == "legacy_rel_pos":
            pos_enc_class = LegacyRelPositionalEncoding
            assert selfattention_layer_type == "legacy_rel_selfattn"
        else:
            raise ValueError("unknown pos_enc_layer: " + pos_enc_layer_type)

        self.conv_subsampling_factor = 1
        if input_layer == "linear":
            self.embed = paddle.nn.Sequential(
                paddle.nn.Linear(idim, attention_dim),
                paddle.nn.LayerNorm(attention_dim),
                paddle.nn.Dropout(dropout_rate),
                pos_enc_class(attention_dim, positional_dropout_rate), )
        elif input_layer == "conv2d":
            self.embed = Conv2dSubsampling(
                idim,
                attention_dim,
                dropout_rate,
                pos_enc_class(attention_dim, positional_dropout_rate), )
            self.conv_subsampling_factor = 4
        elif input_layer == "embed":
            self.embed = paddle.nn.Sequential(
                paddle.nn.Embedding(
                    idim, attention_dim, padding_idx=padding_idx),
                pos_enc_class(attention_dim, positional_dropout_rate), )
        elif isinstance(input_layer, paddle.nn.Layer):
            self.embed = paddle.nn.Sequential(
                input_layer,
                pos_enc_class(attention_dim, positional_dropout_rate), )
        elif input_layer is None:
            self.embed = paddle.nn.Sequential(
                pos_enc_class(attention_dim, positional_dropout_rate))
        else:
            raise ValueError("unknown input_layer: " + input_layer)
        self.normalize_before = normalize_before

        # self-attention module definition
        if selfattention_layer_type == "selfattn":
            logging.info("encoder self-attention layer type = self-attention")
            encoder_selfattn_layer = MultiHeadedAttention
            encoder_selfattn_layer_args = (attention_heads, attention_dim,
                                           attention_dropout_rate, )
        elif selfattention_layer_type == "legacy_rel_selfattn":
            assert pos_enc_layer_type == "legacy_rel_pos"
            encoder_selfattn_layer = LegacyRelPositionMultiHeadedAttention
            encoder_selfattn_layer_args = (attention_heads, attention_dim,
                                           attention_dropout_rate, )
        elif selfattention_layer_type == "rel_selfattn":
            logging.info(
                "encoder self-attention layer type = relative self-attention")
            assert pos_enc_layer_type == "rel_pos"
            encoder_selfattn_layer = RelPositionMultiHeadedAttention
            encoder_selfattn_layer_args = (attention_heads, attention_dim,
                                           attention_dropout_rate, zero_triu, )
        else:
            raise ValueError("unknown encoder_attn_layer: " +
                             selfattention_layer_type)

        # feed-forward module definition
        if positionwise_layer_type == "linear":
            positionwise_layer = PositionwiseFeedForward
            positionwise_layer_args = (attention_dim, linear_units,
                                       dropout_rate, activation, )
        elif positionwise_layer_type == "conv1d":
            positionwise_layer = MultiLayeredConv1d
            positionwise_layer_args = (attention_dim, linear_units,
                                       positionwise_conv_kernel_size,
                                       dropout_rate, )
        elif positionwise_layer_type == "conv1d-linear":
            positionwise_layer = Conv1dLinear
            positionwise_layer_args = (attention_dim, linear_units,
                                       positionwise_conv_kernel_size,
                                       dropout_rate, )
        else:
            raise NotImplementedError("Support only linear or conv1d.")

        # convolution module definition
        convolution_layer = ConvolutionModule
        convolution_layer_args = (attention_dim, cnn_module_kernel, activation)

        self.encoders = repeat(
            num_blocks,
            lambda lnum: EncoderLayer(
                attention_dim,
                encoder_selfattn_layer(*encoder_selfattn_layer_args),
                positionwise_layer(*positionwise_layer_args),
                positionwise_layer(*positionwise_layer_args) if macaron_style else None,
                convolution_layer(*convolution_layer_args) if use_cnn_module else None,
                dropout_rate,
                normalize_before,
                concat_after,
                stochastic_depth_rate * float(1 + lnum) / num_blocks, ), )
        if self.normalize_before:
            self.after_norm = LayerNorm(attention_dim)

        self.intermediate_layers = intermediate_layers

    def forward(self, xs, masks):
        """Encode input sequence.

        Parameters
        ----------
        xs : paddle.Tensor
            Input tensor (#batch, time, idim).
        masks : paddle.Tensor
            Mask tensor (#batch, 1, time).

        Returns
        ----------
        paddle.Tensor
            Output tensor (#batch, time, attention_dim).
        paddle.Tensor
            Mask tensor (#batch, time).
        """
        if isinstance(self.embed, Conv2dSubsampling):
            xs, masks = self.embed(xs, masks)
        else:
            xs = self.embed(xs)

        if self.intermediate_layers is None:
            xs, masks = self.encoders(xs, masks)
        else:
            intermediate_outputs = []
            for layer_idx, encoder_layer in enumerate(self.encoders):
                xs, masks = encoder_layer(xs, masks)

                if (self.intermediate_layers is not None and
                        layer_idx + 1 in self.intermediate_layers):
                    # intermediate branches also require normalization.
                    encoder_output = xs
                    if isinstance(encoder_output, tuple):
                        encoder_output = encoder_output[0]
                    if self.normalize_before:
                        encoder_output = self.after_norm(encoder_output)
                    intermediate_outputs.append(encoder_output)

        if isinstance(xs, tuple):
            xs = xs[0]

        if self.normalize_before:
            xs = self.after_norm(xs)

        if self.intermediate_layers is not None:
            return xs, masks, intermediate_outputs
        return xs, masks
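For orientation, a hedged forward-pass sketch. The import path is assumed from the sibling imports in this file, and the hyperparameters and shapes are illustrative, not from this commit:

import paddle

# Assumed module path, inferred from the conformer imports above.
from paddlespeech.t2s.modules.conformer.encoder import Encoder

# 80-dim input features; the "linear" input layer applies no time subsampling.
encoder = Encoder(
    idim=80,
    attention_dim=256,
    num_blocks=6,
    input_layer="linear",
    macaron_style=True,
    use_cnn_module=True,
    cnn_module_kernel=7)

xs = paddle.randn([2, 100, 80])                 # (#batch, time, idim)
masks = paddle.ones([2, 1, 100], dtype="bool")  # (#batch, 1, time)
out, out_masks = encoder(xs, masks)
print(out.shape)  # [2, 100, 256]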