pre-commit format

pull/3088/head
th.zhang 2 years ago
parent 35c75fe052
commit d036c2eba5

@ -25,9 +25,6 @@ import librosa
import numpy as np import numpy as np
import paddle import paddle
import soundfile import soundfile
from paddlespeech.audio.transform.transformation import Transformation
from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
from paddlespeech.s2t.utils.utility import UpdateConfig
from yacs.config import CfgNode from yacs.config import CfgNode
from ...utils.env import MODEL_HOME from ...utils.env import MODEL_HOME
@ -37,6 +34,9 @@ from ..log import logger
from ..utils import CLI_TIMER from ..utils import CLI_TIMER
from ..utils import stats_wrapper from ..utils import stats_wrapper
from ..utils import timer_register from ..utils import timer_register
from paddlespeech.audio.transform.transformation import Transformation
from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
from paddlespeech.s2t.utils.utility import UpdateConfig
__all__ = ['ASRExecutor'] __all__ = ['ASRExecutor']

@ -87,22 +87,22 @@ ssl_dynamic_pretrained_models = {
'chinese-wav2vec2-large.pdparams', 'chinese-wav2vec2-large.pdparams',
}, },
}, },
"wav2vec2ASR_aishell1-zh-16k": { # "wav2vec2ASR_aishell1-zh-16k": {
'1.3': { # '1.3': {
'url': # 'url':
'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr3/wav2vec2ASR-large-aishell1_ckpt_1.3.0.model.tar.gz', # 'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr3/wav2vec2ASR-large-aishell1_ckpt_1.3.0.model.tar.gz',
'md5': # 'md5':
'ac8fa0a6345e6a7535f6fabb5e59e218', # 'ac8fa0a6345e6a7535f6fabb5e59e218',
'cfg_path': # 'cfg_path':
'model.yaml', # 'model.yaml',
'ckpt_path': # 'ckpt_path':
'exp/wav2vec2ASR/checkpoints/avg_1', # 'exp/wav2vec2ASR/checkpoints/avg_1',
'model': # 'model':
'exp/wav2vec2ASR/checkpoints/avg_1.pdparams', # 'exp/wav2vec2ASR/checkpoints/avg_1.pdparams',
'params': # 'params':
'exp/wav2vec2ASR/checkpoints/avg_1.pdparams', # 'exp/wav2vec2ASR/checkpoints/avg_1.pdparams',
}, # },
}, # },
"wav2vec2ASR_aishell1-zh-16k": { "wav2vec2ASR_aishell1-zh-16k": {
'1.4': { '1.4': {
'url': 'url':

@ -33,8 +33,8 @@ from paddlespeech.s2t.io.speechbrain import data_pipeline
from paddlespeech.s2t.io.speechbrain import dataio from paddlespeech.s2t.io.speechbrain import dataio
from paddlespeech.s2t.io.speechbrain import dataset from paddlespeech.s2t.io.speechbrain import dataset
from paddlespeech.s2t.io.speechbrain.dataloader import make_dataloader from paddlespeech.s2t.io.speechbrain.dataloader import make_dataloader
from paddlespeech.s2t.models.wav2vec2.processing.speech_augmentation import TimeDomainSpecAugment
from paddlespeech.s2t.models.hubert.hubert_ASR import HubertASR from paddlespeech.s2t.models.hubert.hubert_ASR import HubertASR
from paddlespeech.s2t.models.wav2vec2.processing.speech_augmentation import TimeDomainSpecAugment
from paddlespeech.s2t.training.optimizer import OptimizerFactory from paddlespeech.s2t.training.optimizer import OptimizerFactory
from paddlespeech.s2t.training.reporter import ObsScope from paddlespeech.s2t.training.reporter import ObsScope
from paddlespeech.s2t.training.reporter import report from paddlespeech.s2t.training.reporter import report
@ -49,6 +49,7 @@ from paddlespeech.s2t.utils.utility import UpdateConfig
logger = Log(__name__).getlog() logger = Log(__name__).getlog()
# Todo: change this when paddle supports this api # Todo: change this when paddle supports this api
def clip_grad_norm_( def clip_grad_norm_(
parameters, parameters,
@ -428,8 +429,7 @@ class HubertASRTrainer(Trainer):
report("epoch", self.epoch) report("epoch", self.epoch)
report('step', self.iteration) report('step', self.iteration)
report("model_lr", self.model_optimizer.get_lr()) report("model_lr", self.model_optimizer.get_lr())
report("hubert_lr", report("hubert_lr", self.hubert_optimizer.get_lr())
self.hubert_optimizer.get_lr())
self.train_batch(batch_index, batch, msg) self.train_batch(batch_index, batch, msg)
self.after_train_batch() self.after_train_batch()
report('iter', batch_index + 1) report('iter', batch_index + 1)
@ -532,6 +532,7 @@ class HubertASRTrainer(Trainer):
# Defining tokenizer and loading it # Defining tokenizer and loading it
tokenizer = AutoTokenizer.from_pretrained('bert-base-chinese') tokenizer = AutoTokenizer.from_pretrained('bert-base-chinese')
self.tokenizer = tokenizer self.tokenizer = tokenizer
# 2. Define audio pipeline: # 2. Define audio pipeline:
@data_pipeline.takes("wav") @data_pipeline.takes("wav")
@data_pipeline.provides("sig") @data_pipeline.provides("sig")
@ -680,8 +681,7 @@ class HubertASRTrainer(Trainer):
logger.info("optim_model:{},{}", model_optim_type, model_optim_conf) logger.info("optim_model:{},{}", model_optim_type, model_optim_conf)
hubert_optim_type = train_config.hubert_optim hubert_optim_type = train_config.hubert_optim
hubert_optim_conf = train_config.hubert_optim_conf hubert_optim_conf = train_config.hubert_optim_conf
logger.info("optim_model:{},{}", hubert_optim_type, logger.info("optim_model:{},{}", hubert_optim_type, hubert_optim_conf)
hubert_optim_conf)
model_scheduler_type = train_config.model_scheduler model_scheduler_type = train_config.model_scheduler
model_scheduler_conf = train_config.model_scheduler_conf model_scheduler_conf = train_config.model_scheduler_conf
@ -739,7 +739,7 @@ class HubertASRTrainer(Trainer):
model_optimizer = OptimizerFactory.from_args(model_optim_type, model_optimizer = OptimizerFactory.from_args(model_optim_type,
model_optimizer_args) model_optimizer_args)
hubert_optimizer = OptimizerFactory.from_args(hubert_optim_type, hubert_optimizer = OptimizerFactory.from_args(hubert_optim_type,
hubert_optimizer_args) hubert_optimizer_args)
self.model_optimizer = model_optimizer self.model_optimizer = model_optimizer
self.hubert_optimizer = hubert_optimizer self.hubert_optimizer = hubert_optimizer

@ -11,31 +11,33 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
"""HubertASR model.""" """HubertASR model."""
from collections import defaultdict from collections import defaultdict
from typing import Dict, List, Tuple, Any
from dataclasses import dataclass, field, is_dataclass
from copy import deepcopy from copy import deepcopy
from dataclasses import dataclass
from dataclasses import is_dataclass
from typing import Dict
from typing import List
from typing import Tuple
import paddle import paddle
import paddle.nn as nn import paddle.nn as nn
import paddle.nn.functional as F import paddle.nn.functional as F
from paddlespeech.s2t.models.hubert.modules.hubert_model import HubertConfig, HubertModel, HubertPretrainingConfig
from paddlespeech.s2t.models.wav2vec2.modules.modeling_wav2vec2 import Wav2Vec2ConfigPure from paddlespeech.s2t.models.hubert.modules.hubert_model import HubertConfig
from paddlespeech.s2t.models.wav2vec2.modules.modeling_wav2vec2 import Wav2Vec2Model from paddlespeech.s2t.models.hubert.modules.hubert_model import HubertModel
from paddlespeech.s2t.models.hubert.modules.hubert_model import HubertPretrainingConfig
from paddlespeech.s2t.models.wav2vec2.modules.VanillaNN import VanillaNN from paddlespeech.s2t.models.wav2vec2.modules.VanillaNN import VanillaNN
from paddlespeech.s2t.models.wav2vec2.processing.speech_augmentation import SpecAugment from paddlespeech.s2t.models.wav2vec2.processing.speech_augmentation import SpecAugment
from paddlespeech.s2t.modules.ctc import CTCDecoderBase as CTC from paddlespeech.s2t.modules.ctc import CTCDecoderBase as CTC
from paddlespeech.s2t.modules.initializer import DefaultInitializerContext from paddlespeech.s2t.modules.initializer import DefaultInitializerContext
from paddlespeech.s2t.utils.ctc_utils import remove_duplicates_and_blank from paddlespeech.s2t.utils.ctc_utils import remove_duplicates_and_blank
from paddlespeech.s2t.utils.utility import log_add
from paddlespeech.s2t.utils.log import Log from paddlespeech.s2t.utils.log import Log
from paddlespeech.s2t.utils.utility import log_add
logger = Log(__name__).getlog() logger = Log(__name__).getlog()
class HubertASR(nn.Layer): class HubertASR(nn.Layer):
def __init__(self, config: dict): def __init__(self, config: dict):
super().__init__() super().__init__()
@ -44,8 +46,10 @@ class HubertASR(nn.Layer):
self.config = config self.config = config
with open(config.vocab_filepath) as f: with open(config.vocab_filepath) as f:
dicts = [symbol.strip() for symbol in f.readlines()] dicts = [symbol.strip() for symbol in f.readlines()]
task_cfg = self.merge_with_parent(HubertPretrainingConfig, dict(self.config.task_cfg)) task_cfg = self.merge_with_parent(HubertPretrainingConfig,
model_cfg = self.merge_with_parent(HubertConfig, dict(self.config.model_cfg)) dict(self.config.task_cfg))
model_cfg = self.merge_with_parent(HubertConfig,
dict(self.config.model_cfg))
hubert = HubertModel(model_cfg, task_cfg, dicts) hubert = HubertModel(model_cfg, task_cfg, dicts)
self.normalize_wav = config.normalize_wav self.normalize_wav = config.normalize_wav
@ -326,11 +330,13 @@ class HubertBase(nn.Layer):
def __init__(self, config: dict): def __init__(self, config: dict):
super().__init__() super().__init__()
with open(config.vocab_filepath) as f: with open(config.vocab_filepath) as f:
dicts = [symbol.strip() for symbol in f.readlines()] dicts = [symbol.strip() for symbol in f.readlines()]
task_cfg = self.merge_with_parent(HubertPretrainingConfig, dict(self.config.task_cfg)) task_cfg = self.merge_with_parent(HubertPretrainingConfig,
model_cfg = self.merge_with_parent(HubertConfig, dict(self.config.model_cfg)) dict(self.config.task_cfg))
model_cfg = self.merge_with_parent(HubertConfig,
dict(self.config.model_cfg))
hubert = HubertModel(model_cfg, task_cfg, dicts) hubert = HubertModel(model_cfg, task_cfg, dicts)
self.hubert= hubert self.hubert = hubert
@classmethod @classmethod
def from_config(cls, configs: dict): def from_config(cls, configs: dict):

@ -13,69 +13,67 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
""" Paddle Hubert model.""" """ Paddle Hubert model."""
from dataclasses import dataclass
from dataclasses import dataclass, field from dataclasses import field
from typing import Any, Dict, List, Optional, Tuple from typing import Any
from typing import Dict
from typing import List
from typing import Optional
from typing import Tuple
import numpy as np import numpy as np
import paddle import paddle
import paddle.nn as nn import paddle.nn as nn
from paddlespeech.s2t.modules.align import Linear
from paddlespeech.s2t.models.wav2vec2.modules.wav2vec2_model import ChoiceEnum
from paddlespeech.s2t.models.wav2vec2.modules.wav2vec2_model import compute_mask_indices
from paddlespeech.s2t.models.wav2vec2.modules.wav2vec2_model import ConvFeatureExtractionModel
from paddlespeech.s2t.models.wav2vec2.modules.wav2vec2_model import EXTRACTOR_MODE_CHOICES
from paddlespeech.s2t.models.wav2vec2.modules.wav2vec2_model import get_available_activation_fns
from paddlespeech.s2t.models.wav2vec2.modules.wav2vec2_model import GLU
from paddlespeech.s2t.models.wav2vec2.modules.wav2vec2_model import GradMultiply
from paddlespeech.s2t.models.wav2vec2.modules.wav2vec2_model import LAYER_TYPE_CHOICES
from paddlespeech.s2t.models.wav2vec2.modules.wav2vec2_model import MASKING_DISTRIBUTION_CHOICES
from paddlespeech.s2t.models.wav2vec2.modules.wav2vec2_model import TransformerEncoder
from paddlespeech.s2t.modules.align import LayerNorm from paddlespeech.s2t.modules.align import LayerNorm
from paddlespeech.s2t.models.wav2vec2.modules.wav2vec2_model import ( from paddlespeech.s2t.modules.align import Linear
EXTRACTOR_MODE_CHOICES,
LAYER_TYPE_CHOICES,
MASKING_DISTRIBUTION_CHOICES,
ChoiceEnum,
ConvFeatureExtractionModel,
GradMultiply,
LayerNorm,
TransformerEncoder,
compute_mask_indices,
get_available_activation_fns,
GLU,
)
from paddlespeech.s2t.utils.log import Log from paddlespeech.s2t.utils.log import Log
logger = Log(__name__).getlog() logger = Log(__name__).getlog()
@dataclass @dataclass
class HubertPretrainingConfig: class HubertPretrainingConfig:
label_rate: float = field( label_rate: float = field(
default=-1.0, default=-1.0,
metadata={"help": "label frame rate. -1.0 for sequence label"}, metadata={"help": "label frame rate. -1.0 for sequence label"}, )
)
sample_rate: int = field( sample_rate: int = field(
default=16_000, default=16_000,
metadata={ metadata={
"help": "target sample rate. audio files will be up/down " "help":
"target sample rate. audio files will be up/down "
"sampled to this rate" "sampled to this rate"
}, }, )
)
normalize: bool = field( normalize: bool = field(
default=False, default=False,
metadata={"help": "if set, normalizes input to have 0 mean and unit variance"}, metadata={
) "help": "if set, normalizes input to have 0 mean and unit variance"
}, )
enable_padding: bool = field( enable_padding: bool = field(
default=False, default=False,
metadata={"help": "pad shorter samples instead of cropping"}, metadata={"help": "pad shorter samples instead of cropping"}, )
)
max_keep_size: Optional[int] = field( max_keep_size: Optional[int] = field(
default=None, default=None,
metadata={"help": "exclude sample longer than this"}, metadata={"help": "exclude sample longer than this"}, )
)
max_sample_size: Optional[int] = field( max_sample_size: Optional[int] = field(
default=None, default=None,
metadata={"help": "max sample size to crop to for batching"}, metadata={"help": "max sample size to crop to for batching"}, )
)
min_sample_size: Optional[int] = field( min_sample_size: Optional[int] = field(
default=None, default=None,
metadata={"help": "min sample size to crop to for batching"}, metadata={"help": "min sample size to crop to for batching"}, )
)
random_crop: Optional[bool] = field( random_crop: Optional[bool] = field(
default=True, default=True,
metadata={"help": "always crop from the beginning if false"}, metadata={"help": "always crop from the beginning if false"}, )
)
pad_audio: Optional[bool] = field( pad_audio: Optional[bool] = field(
default=False, default=False,
metadata={"help": "pad audio to the longest one in the batch if true"}, metadata={"help": "pad audio to the longest one in the batch if true"},
@ -89,51 +87,40 @@ class HubertConfig:
extractor_mode: EXTRACTOR_MODE_CHOICES = field( extractor_mode: EXTRACTOR_MODE_CHOICES = field(
default="default", default="default",
metadata={ metadata={
"help": "mode for feature extractor. default has a single group " "help":
"mode for feature extractor. default has a single group "
"norm with d groups in the first conv block, whereas layer_norm " "norm with d groups in the first conv block, whereas layer_norm "
"has layer norms in every block (meant to use with normalize=True)" "has layer norms in every block (meant to use with normalize=True)"
}, }, )
)
encoder_layers: int = field( encoder_layers: int = field(
default=12, metadata={"help": "num encoder layers in the transformer"} default=12, metadata={"help": "num encoder layers in the transformer"})
)
encoder_embed_dim: int = field( encoder_embed_dim: int = field(
default=768, metadata={"help": "encoder embedding dimension"} default=768, metadata={"help": "encoder embedding dimension"})
)
encoder_ffn_embed_dim: int = field( encoder_ffn_embed_dim: int = field(
default=3072, metadata={"help": "encoder embedding dimension for FFN"} default=3072, metadata={"help": "encoder embedding dimension for FFN"})
)
encoder_attention_heads: int = field( encoder_attention_heads: int = field(
default=12, metadata={"help": "num encoder attention heads"} default=12, metadata={"help": "num encoder attention heads"})
)
activation_fn: ChoiceEnum(get_available_activation_fns()) = field( activation_fn: ChoiceEnum(get_available_activation_fns()) = field(
default="gelu", metadata={"help": "activation function to use"} default="gelu", metadata={"help": "activation function to use"})
)
layer_type: LAYER_TYPE_CHOICES = field( layer_type: LAYER_TYPE_CHOICES = field(
default="transformer", metadata={"help": "layer type in encoder"} default="transformer", metadata={"help": "layer type in encoder"})
)
# dropouts # dropouts
dropout: float = field( dropout: float = field(
default=0.1, default=0.1,
metadata={"help": "dropout probability for the transformer"}, metadata={"help": "dropout probability for the transformer"}, )
)
attention_dropout: float = field( attention_dropout: float = field(
default=0.1, default=0.1,
metadata={"help": "dropout probability for attention weights"}, metadata={"help": "dropout probability for attention weights"}, )
)
activation_dropout: float = field( activation_dropout: float = field(
default=0.0, default=0.0,
metadata={"help": "dropout probability after activation in FFN"}, metadata={"help": "dropout probability after activation in FFN"}, )
)
encoder_layerdrop: float = field( encoder_layerdrop: float = field(
default=0.0, default=0.0,
metadata={"help": "probability of dropping a tarnsformer layer"}, metadata={"help": "probability of dropping a tarnsformer layer"}, )
)
dropout_input: float = field( dropout_input: float = field(
default=0.0, default=0.0,
metadata={"help": "dropout to apply to the input (after feat extr)"}, metadata={"help": "dropout to apply to the input (after feat extr)"}, )
)
dropout_features: float = field( dropout_features: float = field(
default=0.0, default=0.0,
metadata={"help": "dropout to apply to the features (after feat extr)"}, metadata={"help": "dropout to apply to the features (after feat extr)"},
@ -142,60 +129,51 @@ class HubertConfig:
final_dim: int = field( final_dim: int = field(
default=0, default=0,
metadata={ metadata={
"help": "project final representations and targets to this many " "help":
"project final representations and targets to this many "
"dimensions. set to encoder_embed_dim is <= 0" "dimensions. set to encoder_embed_dim is <= 0"
}, }, )
)
untie_final_proj: bool = field( untie_final_proj: bool = field(
default=False, default=False,
metadata={"help": "use separate projection for each target"}, metadata={"help": "use separate projection for each target"}, )
)
layer_norm_first: bool = field( layer_norm_first: bool = field(
default=False, default=False,
metadata={"help": "apply layernorm first in the transformer"}, metadata={"help": "apply layernorm first in the transformer"}, )
)
conv_feature_layers: str = field( conv_feature_layers: str = field(
default="[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2", default="[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2",
metadata={ metadata={
"help": "string describing convolutional feature extraction " "help":
"string describing convolutional feature extraction "
"layers in form of a python list that contains " "layers in form of a python list that contains "
"[(dim, kernel_size, stride), ...]" "[(dim, kernel_size, stride), ...]"
}, }, )
)
conv_bias: bool = field( conv_bias: bool = field(
default=False, metadata={"help": "include bias in conv encoder"} default=False, metadata={"help": "include bias in conv encoder"})
)
logit_temp: float = field( logit_temp: float = field(
default=0.1, metadata={"help": "temperature to divide logits by"} default=0.1, metadata={"help": "temperature to divide logits by"})
)
target_glu: bool = field( target_glu: bool = field(
default=False, metadata={"help": "adds projection + glu to targets"} default=False, metadata={"help": "adds projection + glu to targets"})
)
feature_grad_mult: float = field( feature_grad_mult: float = field(
default=1.0, default=1.0,
metadata={"help": "multiply feature extractor var grads by this"}, metadata={"help": "multiply feature extractor var grads by this"}, )
)
# masking # masking
mask_length: int = field(default=10, metadata={"help": "mask length"}) mask_length: int = field(default=10, metadata={"help": "mask length"})
mask_prob: float = field( mask_prob: float = field(
default=0.65, default=0.65,
metadata={"help": "probability of replacing a token with mask"}, metadata={"help": "probability of replacing a token with mask"}, )
)
mask_selection: MASKING_DISTRIBUTION_CHOICES = field( mask_selection: MASKING_DISTRIBUTION_CHOICES = field(
default="static", metadata={"help": "how to choose mask length"} default="static", metadata={"help": "how to choose mask length"})
)
mask_other: float = field( mask_other: float = field(
default=0, default=0,
metadata={ metadata={
"help": "secondary mask argument " "help":
"secondary mask argument "
"(used for more complex distributions), " "(used for more complex distributions), "
"see help in compute_mask_indicesh" "see help in compute_mask_indicesh"
}, }, )
)
no_mask_overlap: bool = field( no_mask_overlap: bool = field(
default=False, metadata={"help": "whether to allow masks to overlap"} default=False, metadata={"help": "whether to allow masks to overlap"})
)
mask_min_space: int = field( mask_min_space: int = field(
default=1, default=1,
metadata={"help": "min space between spans (if no overlap is enabled)"}, metadata={"help": "min space between spans (if no overlap is enabled)"},
@ -204,28 +182,24 @@ class HubertConfig:
# channel masking # channel masking
mask_channel_length: int = field( mask_channel_length: int = field(
default=10, default=10,
metadata={"help": "length of the mask for features (channels)"}, metadata={"help": "length of the mask for features (channels)"}, )
)
mask_channel_prob: float = field( mask_channel_prob: float = field(
default=0.0, default=0.0,
metadata={"help": "probability of replacing a feature with 0"}, metadata={"help": "probability of replacing a feature with 0"}, )
)
mask_channel_selection: MASKING_DISTRIBUTION_CHOICES = field( mask_channel_selection: MASKING_DISTRIBUTION_CHOICES = field(
default="static", default="static",
metadata={"help": "how to choose mask length for channel masking"}, metadata={"help": "how to choose mask length for channel masking"}, )
)
mask_channel_other: float = field( mask_channel_other: float = field(
default=0, default=0,
metadata={ metadata={
"help": "secondary mask argument " "help":
"secondary mask argument "
"(used for more complex distributions), " "(used for more complex distributions), "
"see help in compute_mask_indicesh" "see help in compute_mask_indicesh"
}, }, )
)
no_mask_channel_overlap: bool = field( no_mask_channel_overlap: bool = field(
default=False, default=False,
metadata={"help": "whether to allow channel masks to overlap"}, metadata={"help": "whether to allow channel masks to overlap"}, )
)
mask_channel_min_space: int = field( mask_channel_min_space: int = field(
default=1, default=1,
metadata={"help": "min space between spans (if no overlap is enabled)"}, metadata={"help": "min space between spans (if no overlap is enabled)"},
@ -234,66 +208,64 @@ class HubertConfig:
# positional embeddings # positional embeddings
conv_pos: int = field( conv_pos: int = field(
default=128, default=128,
metadata={"help": "number of filters for convolutional positional embeddings"}, metadata={
) "help": "number of filters for convolutional positional embeddings"
}, )
conv_pos_groups: int = field( conv_pos_groups: int = field(
default=16, default=16,
metadata={"help": "number of groups for convolutional positional embedding"}, metadata={
) "help": "number of groups for convolutional positional embedding"
}, )
latent_temp: Tuple[float, float, float] = field( latent_temp: Tuple[float, float, float] = field(
default=(2, 0.5, 0.999995), default=(2, 0.5, 0.999995),
metadata={"help": "legacy (to be removed)"}, metadata={"help": "legacy (to be removed)"}, )
)
# loss computation # loss computation
skip_masked: bool = field( skip_masked: bool = field(
default=False, default=False,
metadata={"help": "skip computing losses over masked frames"}, metadata={"help": "skip computing losses over masked frames"}, )
)
skip_nomask: bool = field( skip_nomask: bool = field(
default=False, default=False,
metadata={"help": "skip computing losses over unmasked frames"}, metadata={"help": "skip computing losses over unmasked frames"}, )
)
checkpoint_activations: bool = field( checkpoint_activations: bool = field(
default=False, default=False,
metadata={"help": "recompute activations and save memory for extra compute"}, metadata={
) "help": "recompute activations and save memory for extra compute"
}, )
# FP16 optimization # FP16 optimization
required_seq_len_multiple: int = field( required_seq_len_multiple: int = field(
default=2, default=2,
metadata={ metadata={
"help": "pad the input to encoder such that the sequence length is divisible by multiple" "help":
}, "pad the input to encoder such that the sequence length is divisible by multiple"
) }, )
# Conformer # Conformer
depthwise_conv_kernel_size: int = field( depthwise_conv_kernel_size: int = field(
default=31, default=31,
metadata={ metadata={
"help": "depthwise-conv-kernel-size for convolution in conformer layer" "help":
}, "depthwise-conv-kernel-size for convolution in conformer layer"
) }, )
attn_type: str = field( attn_type: str = field(
default="", default="",
metadata={"help": "if espnet use ESPNET MHA"}, metadata={"help": "if espnet use ESPNET MHA"}, )
)
pos_enc_type: str = field( pos_enc_type: str = field(
default="abs", default="abs",
metadata={"help": "Positional encoding type to use in conformer"}, metadata={"help": "Positional encoding type to use in conformer"}, )
) fp16: bool = field(
fp16: bool = field(default=False, metadata={"help": "If fp16 is being used"}) default=False, metadata={"help": "If fp16 is being used"})
class HubertModel(nn.Layer): class HubertModel(nn.Layer):
def __init__( def __init__(
self, self,
cfg: HubertConfig, cfg: HubertConfig,
task_cfg: HubertPretrainingConfig, task_cfg: HubertPretrainingConfig,
dictionaries: List[Any], dictionaries: List[Any], ) -> None:
) -> None:
super().__init__() super().__init__()
logger.info(f"HubertModel Config: {cfg}") logger.info(f"HubertModel Config: {cfg}")
@ -304,16 +276,12 @@ class HubertModel(nn.Layer):
conv_layers=feature_enc_layers, conv_layers=feature_enc_layers,
dropout=0.0, dropout=0.0,
mode=cfg.extractor_mode, mode=cfg.extractor_mode,
conv_bias=cfg.conv_bias, conv_bias=cfg.conv_bias, )
)
feature_ds_rate = np.prod([s for _, _, s in feature_enc_layers]) feature_ds_rate = np.prod([s for _, _, s in feature_enc_layers])
self.feat2tar_ratio = cfg.label_rate * feature_ds_rate / task_cfg.sample_rate self.feat2tar_ratio = cfg.label_rate * feature_ds_rate / task_cfg.sample_rate
self.post_extract_proj = ( self.post_extract_proj = (Linear(self.embed, cfg.encoder_embed_dim) if
Linear(self.embed, cfg.encoder_embed_dim) self.embed != cfg.encoder_embed_dim else None)
if self.embed != cfg.encoder_embed_dim
else None
)
self.mask_prob = cfg.mask_prob self.mask_prob = cfg.mask_prob
self.mask_selection = cfg.mask_selection self.mask_selection = cfg.mask_selection
@ -342,8 +310,7 @@ class HubertModel(nn.Layer):
self.mask_emb = paddle.create_parameter( self.mask_emb = paddle.create_parameter(
shape=[cfg.encoder_embed_dim], shape=[cfg.encoder_embed_dim],
dtype='float32', dtype='float32',
default_initializer=paddle.nn.initializer.Uniform(low=0), default_initializer=paddle.nn.initializer.Uniform(low=0), )
)
self.encoder = TransformerEncoder(cfg) self.encoder = TransformerEncoder(cfg)
self.layer_norm = LayerNorm(self.embed) self.layer_norm = LayerNorm(self.embed)
@ -351,27 +318,25 @@ class HubertModel(nn.Layer):
self.target_glu = None self.target_glu = None
if cfg.target_glu: if cfg.target_glu:
self.target_glu = nn.Sequential( self.target_glu = nn.Sequential(
Linear(final_dim, final_dim * 2), GLU() Linear(final_dim, final_dim * 2), GLU())
)
self.untie_final_proj = cfg.untie_final_proj self.untie_final_proj = cfg.untie_final_proj
if self.untie_final_proj: if self.untie_final_proj:
self.final_proj = Linear( self.final_proj = Linear(cfg.encoder_embed_dim,
cfg.encoder_embed_dim, final_dim * len(dictionaries) final_dim * len(dictionaries))
)
else: else:
self.final_proj = Linear(cfg.encoder_embed_dim, final_dim) self.final_proj = Linear(cfg.encoder_embed_dim, final_dim)
# modules below are not needed during fine-tuning # modules below are not needed during fine-tuning
if any([d is None for d in dictionaries]): if any([d is None for d in dictionaries]):
logger.info("cannot find dictionary. assume will be used for fine-tuning") logger.info(
"cannot find dictionary. assume will be used for fine-tuning")
else: else:
self.num_classes = [len(d) for d in dictionaries] self.num_classes = [len(d) for d in dictionaries]
self.label_embs_concat = paddle.create_parameter( self.label_embs_concat = paddle.create_parameter(
shape=[sum(self.num_classes), final_dim], shape=[sum(self.num_classes), final_dim],
dtype='float32', dtype='float32',
default_initializer=paddle.nn.initializer.Uniform(low=0), default_initializer=paddle.nn.initializer.Uniform(low=0), )
)
@classmethod @classmethod
def build_model(cls, cfg: HubertConfig, task): def build_model(cls, cfg: HubertConfig, task):
@ -392,10 +357,10 @@ class HubertModel(nn.Layer):
self.mask_other, self.mask_other,
min_masks=2, min_masks=2,
no_overlap=self.no_mask_overlap, no_overlap=self.no_mask_overlap,
min_space=self.mask_min_space, min_space=self.mask_min_space, )
)
mask_indices = paddle.to_tensor(
mask_indices = paddle.to_tensor(mask_indices, dtype='int64', place=x.place) mask_indices, dtype='int64', place=x.place)
x[mask_indices] = self.mask_emb x[mask_indices] = self.mask_emb
else: else:
mask_indices = None mask_indices = None
@ -409,13 +374,10 @@ class HubertModel(nn.Layer):
self.mask_channel_selection, self.mask_channel_selection,
self.mask_channel_other, self.mask_channel_other,
no_overlap=self.no_mask_channel_overlap, no_overlap=self.no_mask_channel_overlap,
min_space=self.mask_channel_min_space, min_space=self.mask_channel_min_space, )
) mask_channel_indices = (paddle.to_tensor(
mask_channel_indices = ( mask_channel_indices, dtype='int64', place=x.place).unsqueeze(1)
paddle.to_tensor(mask_channel_indices, dtype='int64', place=x.place) .expand(-1, T, -1))
.unsqueeze(1)
.expand(-1, T, -1)
)
x[mask_channel_indices] = 0 x[mask_channel_indices] = 0
return x, mask_indices return x, mask_indices
@ -425,7 +387,8 @@ class HubertModel(nn.Layer):
pos = pos.unsqueeze(0) pos = pos.unsqueeze(0)
targets = paddle.concat([pos, negs], axis=0) targets = paddle.concat([pos, negs], axis=0)
logits = paddle.nn.functional.cosine_similarity(x.astype('float32'), targets.astype('float32'), axis=-1) logits = paddle.nn.functional.cosine_similarity(
x.astype('float32'), targets.astype('float32'), axis=-1)
logits /= self.logit_temp logits /= self.logit_temp
if paddle.any(neg_is_pos): if paddle.any(neg_is_pos):
logits[1:][neg_is_pos] = float("-inf") logits[1:][neg_is_pos] = float("-inf")
@ -443,9 +406,9 @@ class HubertModel(nn.Layer):
return features return features
def forward_targets( def forward_targets(
self, self,
features: paddle.Tensor, features: paddle.Tensor,
target_list: List[paddle.Tensor], target_list: List[paddle.Tensor],
) -> Tuple[paddle.Tensor, paddle.Tensor]: ) -> Tuple[paddle.Tensor, paddle.Tensor]:
# Trim features to ensure labels exist and then get aligned labels # Trim features to ensure labels exist and then get aligned labels
feat_tsz = features.shape[2] feat_tsz = features.shape[2]
@ -453,31 +416,31 @@ class HubertModel(nn.Layer):
if self.feat2tar_ratio * feat_tsz > targ_tsz: if self.feat2tar_ratio * feat_tsz > targ_tsz:
feat_tsz = int(targ_tsz / self.feat2tar_ratio) feat_tsz = int(targ_tsz / self.feat2tar_ratio)
features = features[:, :, :feat_tsz] features = features[:, :, :feat_tsz]
target_inds = paddle.arange(feat_tsz).astype('float32') * self.feat2tar_ratio target_inds = paddle.arange(feat_tsz).astype(
'float32') * self.feat2tar_ratio
target_list = [t[:, target_inds.astype('int64')] for t in target_list] target_list = [t[:, target_inds.astype('int64')] for t in target_list]
return features, target_list return features, target_list
def forward_padding_mask( def forward_padding_mask(
self, self,
features: paddle.Tensor, features: paddle.Tensor,
padding_mask: paddle.Tensor, padding_mask: paddle.Tensor, ) -> paddle.Tensor:
) -> paddle.Tensor:
extra = padding_mask.shape[1] % features.shape[1] extra = padding_mask.shape[1] % features.shape[1]
if extra > 0: if extra > 0:
padding_mask = padding_mask[:, :-extra] padding_mask = padding_mask[:, :-extra]
padding_mask = paddle.reshape(padding_mask, [padding_mask.shape[0], features.shape[1], -1]) padding_mask = paddle.reshape(
padding_mask, [padding_mask.shape[0], features.shape[1], -1])
padding_mask = paddle.all(padding_mask, axis=-1) padding_mask = paddle.all(padding_mask, axis=-1)
return padding_mask return padding_mask
def forward( def forward(
self, self,
source: paddle.Tensor, source: paddle.Tensor,
target_list: Optional[List[paddle.Tensor]] = None, target_list: Optional[List[paddle.Tensor]]=None,
padding_mask: Optional[paddle.Tensor] = None, padding_mask: Optional[paddle.Tensor]=None,
mask: bool = True, mask: bool=True,
features_only: bool = False, features_only: bool=False,
output_layer: Optional[int] = None, output_layer: Optional[int]=None, ) -> Dict[str, paddle.Tensor]:
) -> Dict[str, paddle.Tensor]:
"""output layer is 1-based""" """output layer is 1-based"""
features = self.forward_features(source) features = self.forward_features(source)
if target_list is not None: if target_list is not None:
@ -499,7 +462,8 @@ class HubertModel(nn.Layer):
unmasked_features = self.dropout_features(unmasked_features) unmasked_features = self.dropout_features(unmasked_features)
if mask: if mask:
x, mask_indices = self.apply_mask(features, padding_mask, target_list) x, mask_indices = self.apply_mask(features, padding_mask,
target_list)
else: else:
x = features x = features
mask_indices = None mask_indices = None
@ -512,16 +476,18 @@ class HubertModel(nn.Layer):
x, _ = self.encoder( x, _ = self.encoder(
x, x,
padding_mask=padding_mask, padding_mask=padding_mask,
layer=None if output_layer is None else output_layer - 1, layer=None if output_layer is None else output_layer - 1, )
)
if features_only: if features_only:
return {"x": x, "padding_mask": padding_mask, "features": features} return {"x": x, "padding_mask": padding_mask, "features": features}
def compute_pred(self, proj_x, target, label_embs): def compute_pred(self, proj_x, target, label_embs):
# compute logits for the i-th label set # compute logits for the i-th label set
y = paddle.index_select(label_embs, index=target.astype('int64'), axis=0) y = paddle.index_select(
negs = paddle.expand(label_embs.unsqueeze(1), [label_embs.shape[0], proj_x.shape[0], label_embs.shape[-1]]) label_embs, index=target.astype('int64'), axis=0)
negs = paddle.expand(
label_embs.unsqueeze(1),
[label_embs.shape[0], proj_x.shape[0], label_embs.shape[-1]])
if self.target_glu: if self.target_glu:
y = self.target_glu(y) y = self.target_glu(y)
negs = self.target_glu(negs) negs = self.target_glu(negs)
@ -541,7 +507,8 @@ class HubertModel(nn.Layer):
proj_x_m_list = [proj_x_m for _ in range(len(target_list))] proj_x_m_list = [proj_x_m for _ in range(len(target_list))]
logit_m_list = [ logit_m_list = [
compute_pred(proj_x_m, t[masked_indices], label_embs_list[i]) compute_pred(proj_x_m, t[masked_indices], label_embs_list[i])
for i, (proj_x_m, t) in enumerate(zip(proj_x_m_list, target_list)) for i, (proj_x_m, t
) in enumerate(zip(proj_x_m_list, target_list))
] ]
else: else:
logit_m_list = [None for _ in target_list] logit_m_list = [None for _ in target_list]
@ -556,7 +523,8 @@ class HubertModel(nn.Layer):
logit_u_list = [ logit_u_list = [
compute_pred(proj_x_u, t[nomask_indices], label_embs_list[i]) compute_pred(proj_x_u, t[nomask_indices], label_embs_list[i])
for i, (proj_x_u, t) in enumerate(zip(proj_x_u_list, target_list)) for i, (proj_x_u, t
) in enumerate(zip(proj_x_u_list, target_list))
] ]
else: else:
logit_u_list = [None for _ in target_list] logit_u_list = [None for _ in target_list]
@ -570,20 +538,19 @@ class HubertModel(nn.Layer):
return result return result
def extract_features( def extract_features(
self, self,
source: paddle.Tensor, source: paddle.Tensor,
padding_mask: Optional[paddle.Tensor] = None, padding_mask: Optional[paddle.Tensor]=None,
mask: bool = False, mask: bool=False,
ret_conv: bool = False, ret_conv: bool=False,
output_layer: Optional[int] = None, output_layer: Optional[int]=None,
) -> Tuple[paddle.Tensor, paddle.Tensor]: ) -> Tuple[paddle.Tensor, paddle.Tensor]:
res = self.forward( res = self.forward(
source, source,
padding_mask=padding_mask, padding_mask=padding_mask,
mask=mask, mask=mask,
features_only=True, features_only=True,
output_layer=output_layer, output_layer=output_layer, )
)
feature = res["features"] if ret_conv else res["x"] feature = res["features"] if ret_conv else res["x"]
return feature, res["padding_mask"] return feature, res["padding_mask"]
@ -592,12 +559,16 @@ class HubertModel(nn.Layer):
logits_list = net_output["logit_m_list"] logits_list = net_output["logit_m_list"]
else: else:
logits_list = net_output["logit_u_list"] logits_list = net_output["logit_u_list"]
logits_list = [paddle.cast(x, 'float32') for x in logits_list if x is not None] logits_list = [
paddle.cast(x, 'float32') for x in logits_list if x is not None
]
return logits_list return logits_list
def get_targets(self, net_output, is_masked=True): def get_targets(self, net_output, is_masked=True):
logits_list = self.get_logits(net_output, is_masked) logits_list = self.get_logits(net_output, is_masked)
targets_list = [paddle.zeros_like(x, dtype='int64') for x in logits_list] targets_list = [
paddle.zeros_like(x, dtype='int64') for x in logits_list
]
return targets_list return targets_list
def get_extra_losses(self, net_output): def get_extra_losses(self, net_output):

@ -13,7 +13,6 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
""" Paddle Wav2Vec2 model.""" """ Paddle Wav2Vec2 model."""
import math import math
import uuid import uuid
from dataclasses import dataclass from dataclasses import dataclass
@ -25,20 +24,23 @@ from typing import Dict
from typing import List from typing import List
from typing import Optional from typing import Optional
from typing import Tuple from typing import Tuple
import numpy as np import numpy as np
import paddle import paddle
import paddle.nn as nn import paddle.nn as nn
import paddle.nn.functional as F import paddle.nn.functional as F
from paddle import Tensor from paddle import Tensor
from paddlespeech.s2t.modules.align import Linear
from paddlespeech.s2t.modules.align import LayerNorm
from paddlespeech.s2t.modules.align import Conv1D from paddlespeech.s2t.modules.align import Conv1D
from paddlespeech.s2t.modules.align import Conv2D from paddlespeech.s2t.modules.align import Conv2D
from paddlespeech.s2t.modules.align import Embedding from paddlespeech.s2t.modules.align import Embedding
from paddlespeech.s2t.modules.align import LayerNorm
from paddlespeech.s2t.modules.align import Linear
from paddlespeech.s2t.utils.log import Log from paddlespeech.s2t.utils.log import Log
logger = Log(__name__).getlog() logger = Log(__name__).getlog()
class GLU(nn.Layer): class GLU(nn.Layer):
r"""Applies the gated linear unit function r"""Applies the gated linear unit function
:math:`{GLU}(a, b)= a \otimes \sigma(b)` where :math:`a` is the first half :math:`{GLU}(a, b)= a \otimes \sigma(b)` where :math:`a` is the first half
@ -226,7 +228,7 @@ def quant_noise(module, p, block_size):
mask = paddle.zeros( mask = paddle.zeros(
[in_channels // block_size * out_channels], [in_channels // block_size * out_channels],
dtype=paddle.bool) dtype=paddle.bool)
# the implementation of bernoulli_, p=0.5 # the implementation of bernoulli_, p=0.5
mask = paddle.ones_like(mask) * 0.5 mask = paddle.ones_like(mask) * 0.5
mask = paddle.bernoulli(mask) mask = paddle.bernoulli(mask)
@ -310,7 +312,7 @@ class MultiheadAttention(nn.Layer):
assert not self.self_attention or self.qkv_same_dim, ( assert not self.self_attention or self.qkv_same_dim, (
"Self-attention requires query, key and " "Self-attention requires query, key and "
"value to be of the same size") "value to be of the same size")
# Todo scaled initialization # Todo scaled initialization
# Empirically observed the convergence to be much better with # Empirically observed the convergence to be much better with
# the scaled initialization # the scaled initialization
@ -319,19 +321,31 @@ class MultiheadAttention(nn.Layer):
out_proj_bias_attr = nn.initializer.Constant(0) out_proj_bias_attr = nn.initializer.Constant(0)
self.k_proj = quant_noise( self.k_proj = quant_noise(
nn.Linear(self.kdim, embed_dim, weight_attr=weight_attr, bias_attr=bias if not bias else kv_proj_bias_attr), q_noise, qn_block_size nn.Linear(
) self.kdim,
embed_dim,
weight_attr=weight_attr,
bias_attr=bias
if not bias else kv_proj_bias_attr), q_noise, qn_block_size)
self.v_proj = quant_noise( self.v_proj = quant_noise(
nn.Linear(self.vdim, embed_dim, weight_attr=weight_attr, bias_attr=bias if not bias else kv_proj_bias_attr), q_noise, qn_block_size nn.Linear(
) self.vdim,
embed_dim,
weight_attr=weight_attr,
bias_attr=bias
if not bias else kv_proj_bias_attr), q_noise, qn_block_size)
self.q_proj = quant_noise( self.q_proj = quant_noise(
nn.Linear(embed_dim, embed_dim, weight_attr=weight_attr, bias_attr=bias), q_noise, qn_block_size nn.Linear(
) embed_dim, embed_dim, weight_attr=weight_attr, bias_attr=bias),
q_noise, qn_block_size)
self.out_proj = quant_noise( self.out_proj = quant_noise(
nn.Linear(embed_dim, embed_dim, weight_attr=weight_attr, bias_attr=bias if not bias else out_proj_bias_attr), q_noise, qn_block_size nn.Linear(
) embed_dim,
embed_dim,
weight_attr=weight_attr,
bias_attr=bias
if not bias else out_proj_bias_attr), q_noise, qn_block_size)
# nn.initializer.XavierUniform(self.k_proj.weight, gain=1 / math.sqrt(2)) # nn.initializer.XavierUniform(self.k_proj.weight, gain=1 / math.sqrt(2))
# nn.initializer.XavierUniform(self.v_proj.weight, gain=1 / math.sqrt(2)) # nn.initializer.XavierUniform(self.v_proj.weight, gain=1 / math.sqrt(2))
@ -384,9 +398,12 @@ class MultiheadAttention(nn.Layer):
if self.qkv_same_dim: if self.qkv_same_dim:
# Empirically observed the convergence to be much better with # Empirically observed the convergence to be much better with
# the scaled initialization # the scaled initialization
nn.initializer.XavierUniform(self.k_proj.weight, gain=1 / math.sqrt(2)) nn.initializer.XavierUniform(
nn.initializer.XavierUniform(self.v_proj.weight, gain=1 / math.sqrt(2)) self.k_proj.weight, gain=1 / math.sqrt(2))
nn.initializer.XavierUniform(self.q_proj.weight, gain=1 / math.sqrt(2)) nn.initializer.XavierUniform(
self.v_proj.weight, gain=1 / math.sqrt(2))
nn.initializer.XavierUniform(
self.q_proj.weight, gain=1 / math.sqrt(2))
else: else:
self.k_proj.weight = paddle.ParamAttr() self.k_proj.weight = paddle.ParamAttr()
nn.initializer.XavierUniform(self.k_proj.weight) nn.initializer.XavierUniform(self.k_proj.weight)
@ -410,15 +427,18 @@ class MultiheadAttention(nn.Layer):
start_idx = i * self.head_dim start_idx = i * self.head_dim
end_idx = (i + 1) * self.head_dim end_idx = (i + 1) * self.head_dim
k_proj_heads_norm.append( k_proj_heads_norm.append(
paddle.sum(paddle.abs(self.k_proj.weight[:, start_idx:end_idx])) paddle.sum(
paddle.abs(self.k_proj.weight[:, start_idx:end_idx]))
.tolist() + paddle.sum( .tolist() + paddle.sum(
paddle.abs(self.k_proj.bias[start_idx:end_idx])).tolist()) paddle.abs(self.k_proj.bias[start_idx:end_idx])).tolist())
q_proj_heads_norm.append( q_proj_heads_norm.append(
paddle.sum(paddle.abs(self.q_proj.weight[:, start_idx:end_idx])) paddle.sum(
paddle.abs(self.q_proj.weight[:, start_idx:end_idx]))
.tolist() + paddle.sum( .tolist() + paddle.sum(
paddle.abs(self.q_proj.bias[start_idx:end_idx])).tolist()) paddle.abs(self.q_proj.bias[start_idx:end_idx])).tolist())
v_proj_heads_norm.append( v_proj_heads_norm.append(
paddle.sum(paddle.abs(self.v_proj.weight[:, start_idx:end_idx])) paddle.sum(
paddle.abs(self.v_proj.weight[:, start_idx:end_idx]))
.tolist() + paddle.sum( .tolist() + paddle.sum(
paddle.abs(self.v_proj.bias[start_idx:end_idx])).tolist()) paddle.abs(self.v_proj.bias[start_idx:end_idx])).tolist())
@ -464,8 +484,7 @@ class MultiheadAttention(nn.Layer):
new_q_weight = paddle.concat(new_q_weight, axis=-1).detach() new_q_weight = paddle.concat(new_q_weight, axis=-1).detach()
new_k_weight = paddle.concat(new_k_weight, axis=-1).detach() new_k_weight = paddle.concat(new_k_weight, axis=-1).detach()
new_v_weight = paddle.concat(new_v_weight, axis=-1).detach() new_v_weight = paddle.concat(new_v_weight, axis=-1).detach()
new_out_proj_weight = paddle.concat( new_out_proj_weight = paddle.concat(new_out_proj_weight).detach()
new_out_proj_weight).detach()
new_q_weight.stop_gradient = False new_q_weight.stop_gradient = False
new_k_weight.stop_gradient = False new_k_weight.stop_gradient = False
new_v_weight.stop_gradient = False new_v_weight.stop_gradient = False
@ -898,10 +917,12 @@ class MultiheadAttention(nn.Layer):
if prev_key_padding_mask is not None and static_kv: if prev_key_padding_mask is not None and static_kv:
new_key_padding_mask = prev_key_padding_mask new_key_padding_mask = prev_key_padding_mask
elif prev_key_padding_mask is not None and key_padding_mask is not None: elif prev_key_padding_mask is not None and key_padding_mask is not None:
new_key_padding_mask = paddle.concat([ new_key_padding_mask = paddle.concat(
paddle.cast(prev_key_padding_mask, 'float32'), [
paddle.cast(key_padding_mask, 'float32') paddle.cast(prev_key_padding_mask, 'float32'),
], axis = 1) paddle.cast(key_padding_mask, 'float32')
],
axis=1)
# During incremental decoding, as the padding token enters and # During incremental decoding, as the padding token enters and
# leaves the frame, there will be a time when prev or current # leaves the frame, there will be a time when prev or current
# is None # is None
@ -909,20 +930,24 @@ class MultiheadAttention(nn.Layer):
if src_len > prev_key_padding_mask.shape[1]: if src_len > prev_key_padding_mask.shape[1]:
filler = paddle.zeros( filler = paddle.zeros(
[batch_size, src_len - prev_key_padding_mask.shape[1]], ) [batch_size, src_len - prev_key_padding_mask.shape[1]], )
new_key_padding_mask = paddle.concat([ new_key_padding_mask = paddle.concat(
paddle.cast(prev_key_padding_mask, 'float32'), [
paddle.cast(filler, 'float32') paddle.cast(prev_key_padding_mask, 'float32'),
], axis = 1) paddle.cast(filler, 'float32')
],
axis=1)
else: else:
new_key_padding_mask = prev_key_padding_mask new_key_padding_mask = prev_key_padding_mask
elif key_padding_mask is not None: elif key_padding_mask is not None:
if src_len > key_padding_mask.shape[1]: if src_len > key_padding_mask.shape[1]:
filler = paddle.zeros( filler = paddle.zeros(
[batch_size, src_len - key_padding_mask.shape[1]], ) [batch_size, src_len - key_padding_mask.shape[1]], )
new_key_padding_mask = paddle.concat([ new_key_padding_mask = paddle.concat(
paddle.cast(filler, 'float32'), [
paddle.cast(key_padding_mask, 'float32') paddle.cast(filler, 'float32'),
], axis = 1) paddle.cast(key_padding_mask, 'float32')
],
axis=1)
else: else:
new_key_padding_mask = paddle.cast(key_padding_mask, 'float32') new_key_padding_mask = paddle.cast(key_padding_mask, 'float32')
else: else:
@ -1074,8 +1099,7 @@ class GumbelVectorQuantizer(nn.Layer):
if weight_proj_depth > 1: if weight_proj_depth > 1:
def block(input_dim, output_dim): def block(input_dim, output_dim):
return nn.Sequential( return nn.Sequential(Linear(input_dim, output_dim), activation)
Linear(input_dim, output_dim), activation)
inner_dim = self.input_dim * weight_proj_factor inner_dim = self.input_dim * weight_proj_factor
self.weight_proj = nn.Sequential( self.weight_proj = nn.Sequential(
@ -1085,7 +1109,11 @@ class GumbelVectorQuantizer(nn.Layer):
], ],
Linear(inner_dim, groups * num_vars), ) Linear(inner_dim, groups * num_vars), )
else: else:
self.weight_proj = Linear(self.input_dim, groups * num_vars, weight_attr=nn.initializer.Normal(mean=0, std=1), bias_attr=nn.initializer.Zero()) self.weight_proj = Linear(
self.input_dim,
groups * num_vars,
weight_attr=nn.initializer.Normal(mean=0, std=1),
bias_attr=nn.initializer.Zero())
if isinstance(temp, str): if isinstance(temp, str):
import ast import ast
@ -1243,6 +1271,7 @@ class TransposeLast(nn.Layer):
trans_dim[-1], trans_dim[-2] = trans_dim[-2], trans_dim[-1] trans_dim[-1], trans_dim[-2] = trans_dim[-2], trans_dim[-1]
return x.transpose(trans_dim) return x.transpose(trans_dim)
class Fp32LayerNorm(LayerNorm): class Fp32LayerNorm(LayerNorm):
def __init__(self, *args, **kwargs): def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs) super().__init__(*args, **kwargs)
@ -1256,13 +1285,22 @@ class Fp32LayerNorm(LayerNorm):
self._epsilon, ) self._epsilon, )
return output.astype(input.dtype) return output.astype(input.dtype)
# Todo: change this when paddle supports F.group_norm # Todo: change this when paddle supports F.group_norm
class Fp32GroupNorm(nn.Layer): class Fp32GroupNorm(nn.Layer):
def __init__(self, *args, **kwargs): def __init__(self, *args, **kwargs):
super().__init__() super().__init__()
self.group_norm = paddle.nn.GroupNorm(*args, **kwargs) self.group_norm = paddle.nn.GroupNorm(*args, **kwargs)
fp32_weight = paddle.create_parameter(shape=self.group_norm.weight.shape, dtype='float32', default_initializer=paddle.nn.initializer.Assign(self.group_norm.weight)) fp32_weight = paddle.create_parameter(
fp32_bias = paddle.create_parameter(shape=self.group_norm.bias.shape, dtype='float32', default_initializer=paddle.nn.initializer.Assign(self.group_norm.bias)) shape=self.group_norm.weight.shape,
dtype='float32',
default_initializer=paddle.nn.initializer.Assign(
self.group_norm.weight))
fp32_bias = paddle.create_parameter(
shape=self.group_norm.bias.shape,
dtype='float32',
default_initializer=paddle.nn.initializer.Assign(
self.group_norm.bias))
self.group_norm.weight = fp32_weight self.group_norm.weight = fp32_weight
self.group_norm.bias = fp32_bias self.group_norm.bias = fp32_bias
@ -2299,7 +2337,7 @@ def make_conv_pos(e, k, g):
e, e,
kernel_size=k, kernel_size=k,
padding=k // 2, padding=k // 2,
groups=g, groups=g,
weight_attr=nn.initializer.Normal(mean=0, std=std), weight_attr=nn.initializer.Normal(mean=0, std=std),
bias_attr=nn.initializer.Constant(0)) bias_attr=nn.initializer.Constant(0))
pos_conv = nn.utils.weight_norm(pos_conv, name="weight", dim=2) pos_conv = nn.utils.weight_norm(pos_conv, name="weight", dim=2)

Loading…
Cancel
Save