From d036c2eba511089ac03c15aecf673a80ad2bceae Mon Sep 17 00:00:00 2001 From: "th.zhang" <15600919271@163.com> Date: Thu, 20 Apr 2023 01:16:47 +0800 Subject: [PATCH] pre-commit format --- paddlespeech/cli/asr/infer.py | 6 +- paddlespeech/resource/pretrained_models.py | 32 +- paddlespeech/s2t/exps/hubert/model.py | 12 +- paddlespeech/s2t/models/hubert/hubert_ASR.py | 36 +- .../s2t/models/hubert/modules/hubert_model.py | 351 ++++++++---------- .../models/wav2vec2/modules/wav2vec2_model.py | 118 ++++-- 6 files changed, 285 insertions(+), 270 deletions(-) diff --git a/paddlespeech/cli/asr/infer.py b/paddlespeech/cli/asr/infer.py index 7a7aef8b0..525eb9cb3 100644 --- a/paddlespeech/cli/asr/infer.py +++ b/paddlespeech/cli/asr/infer.py @@ -25,9 +25,6 @@ import librosa import numpy as np import paddle import soundfile -from paddlespeech.audio.transform.transformation import Transformation -from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer -from paddlespeech.s2t.utils.utility import UpdateConfig from yacs.config import CfgNode from ...utils.env import MODEL_HOME @@ -37,6 +34,9 @@ from ..log import logger from ..utils import CLI_TIMER from ..utils import stats_wrapper from ..utils import timer_register +from paddlespeech.audio.transform.transformation import Transformation +from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer +from paddlespeech.s2t.utils.utility import UpdateConfig __all__ = ['ASRExecutor'] diff --git a/paddlespeech/resource/pretrained_models.py b/paddlespeech/resource/pretrained_models.py index a553b520f..6d26a69db 100644 --- a/paddlespeech/resource/pretrained_models.py +++ b/paddlespeech/resource/pretrained_models.py @@ -87,22 +87,22 @@ ssl_dynamic_pretrained_models = { 'chinese-wav2vec2-large.pdparams', }, }, - "wav2vec2ASR_aishell1-zh-16k": { - '1.3': { - 'url': - 'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr3/wav2vec2ASR-large-aishell1_ckpt_1.3.0.model.tar.gz', - 'md5': - 'ac8fa0a6345e6a7535f6fabb5e59e218', - 'cfg_path': - 'model.yaml', - 'ckpt_path': - 'exp/wav2vec2ASR/checkpoints/avg_1', - 'model': - 'exp/wav2vec2ASR/checkpoints/avg_1.pdparams', - 'params': - 'exp/wav2vec2ASR/checkpoints/avg_1.pdparams', - }, - }, + # "wav2vec2ASR_aishell1-zh-16k": { + # '1.3': { + # 'url': + # 'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr3/wav2vec2ASR-large-aishell1_ckpt_1.3.0.model.tar.gz', + # 'md5': + # 'ac8fa0a6345e6a7535f6fabb5e59e218', + # 'cfg_path': + # 'model.yaml', + # 'ckpt_path': + # 'exp/wav2vec2ASR/checkpoints/avg_1', + # 'model': + # 'exp/wav2vec2ASR/checkpoints/avg_1.pdparams', + # 'params': + # 'exp/wav2vec2ASR/checkpoints/avg_1.pdparams', + # }, + # }, "wav2vec2ASR_aishell1-zh-16k": { '1.4': { 'url': diff --git a/paddlespeech/s2t/exps/hubert/model.py b/paddlespeech/s2t/exps/hubert/model.py index d4f90fe52..bc05921dd 100644 --- a/paddlespeech/s2t/exps/hubert/model.py +++ b/paddlespeech/s2t/exps/hubert/model.py @@ -33,8 +33,8 @@ from paddlespeech.s2t.io.speechbrain import data_pipeline from paddlespeech.s2t.io.speechbrain import dataio from paddlespeech.s2t.io.speechbrain import dataset from paddlespeech.s2t.io.speechbrain.dataloader import make_dataloader -from paddlespeech.s2t.models.wav2vec2.processing.speech_augmentation import TimeDomainSpecAugment from paddlespeech.s2t.models.hubert.hubert_ASR import HubertASR +from paddlespeech.s2t.models.wav2vec2.processing.speech_augmentation import TimeDomainSpecAugment from paddlespeech.s2t.training.optimizer import OptimizerFactory from paddlespeech.s2t.training.reporter import ObsScope from paddlespeech.s2t.training.reporter import report @@ -49,6 +49,7 @@ from paddlespeech.s2t.utils.utility import UpdateConfig logger = Log(__name__).getlog() + # Todo: change this when paddle supports this api def clip_grad_norm_( parameters, @@ -428,8 +429,7 @@ class HubertASRTrainer(Trainer): report("epoch", self.epoch) report('step', self.iteration) report("model_lr", self.model_optimizer.get_lr()) - report("hubert_lr", - self.hubert_optimizer.get_lr()) + report("hubert_lr", self.hubert_optimizer.get_lr()) self.train_batch(batch_index, batch, msg) self.after_train_batch() report('iter', batch_index + 1) @@ -532,6 +532,7 @@ class HubertASRTrainer(Trainer): # Defining tokenizer and loading it tokenizer = AutoTokenizer.from_pretrained('bert-base-chinese') self.tokenizer = tokenizer + # 2. Define audio pipeline: @data_pipeline.takes("wav") @data_pipeline.provides("sig") @@ -680,8 +681,7 @@ class HubertASRTrainer(Trainer): logger.info("optim_model:{},{}", model_optim_type, model_optim_conf) hubert_optim_type = train_config.hubert_optim hubert_optim_conf = train_config.hubert_optim_conf - logger.info("optim_model:{},{}", hubert_optim_type, - hubert_optim_conf) + logger.info("optim_model:{},{}", hubert_optim_type, hubert_optim_conf) model_scheduler_type = train_config.model_scheduler model_scheduler_conf = train_config.model_scheduler_conf @@ -739,7 +739,7 @@ class HubertASRTrainer(Trainer): model_optimizer = OptimizerFactory.from_args(model_optim_type, model_optimizer_args) hubert_optimizer = OptimizerFactory.from_args(hubert_optim_type, - hubert_optimizer_args) + hubert_optimizer_args) self.model_optimizer = model_optimizer self.hubert_optimizer = hubert_optimizer diff --git a/paddlespeech/s2t/models/hubert/hubert_ASR.py b/paddlespeech/s2t/models/hubert/hubert_ASR.py index 375d5e6ae..00411029a 100644 --- a/paddlespeech/s2t/models/hubert/hubert_ASR.py +++ b/paddlespeech/s2t/models/hubert/hubert_ASR.py @@ -11,31 +11,33 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - """HubertASR model.""" - - +"""HubertASR model.""" from collections import defaultdict -from typing import Dict, List, Tuple, Any -from dataclasses import dataclass, field, is_dataclass from copy import deepcopy +from dataclasses import dataclass +from dataclasses import is_dataclass +from typing import Dict +from typing import List +from typing import Tuple import paddle import paddle.nn as nn import paddle.nn.functional as F -from paddlespeech.s2t.models.hubert.modules.hubert_model import HubertConfig, HubertModel, HubertPretrainingConfig -from paddlespeech.s2t.models.wav2vec2.modules.modeling_wav2vec2 import Wav2Vec2ConfigPure -from paddlespeech.s2t.models.wav2vec2.modules.modeling_wav2vec2 import Wav2Vec2Model +from paddlespeech.s2t.models.hubert.modules.hubert_model import HubertConfig +from paddlespeech.s2t.models.hubert.modules.hubert_model import HubertModel +from paddlespeech.s2t.models.hubert.modules.hubert_model import HubertPretrainingConfig from paddlespeech.s2t.models.wav2vec2.modules.VanillaNN import VanillaNN from paddlespeech.s2t.models.wav2vec2.processing.speech_augmentation import SpecAugment from paddlespeech.s2t.modules.ctc import CTCDecoderBase as CTC from paddlespeech.s2t.modules.initializer import DefaultInitializerContext from paddlespeech.s2t.utils.ctc_utils import remove_duplicates_and_blank -from paddlespeech.s2t.utils.utility import log_add from paddlespeech.s2t.utils.log import Log +from paddlespeech.s2t.utils.utility import log_add logger = Log(__name__).getlog() + class HubertASR(nn.Layer): def __init__(self, config: dict): super().__init__() @@ -44,8 +46,10 @@ class HubertASR(nn.Layer): self.config = config with open(config.vocab_filepath) as f: dicts = [symbol.strip() for symbol in f.readlines()] - task_cfg = self.merge_with_parent(HubertPretrainingConfig, dict(self.config.task_cfg)) - model_cfg = self.merge_with_parent(HubertConfig, dict(self.config.model_cfg)) + task_cfg = self.merge_with_parent(HubertPretrainingConfig, + dict(self.config.task_cfg)) + model_cfg = self.merge_with_parent(HubertConfig, + dict(self.config.model_cfg)) hubert = HubertModel(model_cfg, task_cfg, dicts) self.normalize_wav = config.normalize_wav @@ -326,11 +330,13 @@ class HubertBase(nn.Layer): def __init__(self, config: dict): super().__init__() with open(config.vocab_filepath) as f: - dicts = [symbol.strip() for symbol in f.readlines()] - task_cfg = self.merge_with_parent(HubertPretrainingConfig, dict(self.config.task_cfg)) - model_cfg = self.merge_with_parent(HubertConfig, dict(self.config.model_cfg)) + dicts = [symbol.strip() for symbol in f.readlines()] + task_cfg = self.merge_with_parent(HubertPretrainingConfig, + dict(self.config.task_cfg)) + model_cfg = self.merge_with_parent(HubertConfig, + dict(self.config.model_cfg)) hubert = HubertModel(model_cfg, task_cfg, dicts) - self.hubert= hubert + self.hubert = hubert @classmethod def from_config(cls, configs: dict): diff --git a/paddlespeech/s2t/models/hubert/modules/hubert_model.py b/paddlespeech/s2t/models/hubert/modules/hubert_model.py index 1331be9f6..dc30d9ee6 100644 --- a/paddlespeech/s2t/models/hubert/modules/hubert_model.py +++ b/paddlespeech/s2t/models/hubert/modules/hubert_model.py @@ -13,69 +13,67 @@ # See the License for the specific language governing permissions and # limitations under the License. """ Paddle Hubert model.""" - -from dataclasses import dataclass, field -from typing import Any, Dict, List, Optional, Tuple +from dataclasses import dataclass +from dataclasses import field +from typing import Any +from typing import Dict +from typing import List +from typing import Optional +from typing import Tuple import numpy as np import paddle import paddle.nn as nn -from paddlespeech.s2t.modules.align import Linear + +from paddlespeech.s2t.models.wav2vec2.modules.wav2vec2_model import ChoiceEnum +from paddlespeech.s2t.models.wav2vec2.modules.wav2vec2_model import compute_mask_indices +from paddlespeech.s2t.models.wav2vec2.modules.wav2vec2_model import ConvFeatureExtractionModel +from paddlespeech.s2t.models.wav2vec2.modules.wav2vec2_model import EXTRACTOR_MODE_CHOICES +from paddlespeech.s2t.models.wav2vec2.modules.wav2vec2_model import get_available_activation_fns +from paddlespeech.s2t.models.wav2vec2.modules.wav2vec2_model import GLU +from paddlespeech.s2t.models.wav2vec2.modules.wav2vec2_model import GradMultiply +from paddlespeech.s2t.models.wav2vec2.modules.wav2vec2_model import LAYER_TYPE_CHOICES +from paddlespeech.s2t.models.wav2vec2.modules.wav2vec2_model import MASKING_DISTRIBUTION_CHOICES +from paddlespeech.s2t.models.wav2vec2.modules.wav2vec2_model import TransformerEncoder from paddlespeech.s2t.modules.align import LayerNorm -from paddlespeech.s2t.models.wav2vec2.modules.wav2vec2_model import ( - EXTRACTOR_MODE_CHOICES, - LAYER_TYPE_CHOICES, - MASKING_DISTRIBUTION_CHOICES, - ChoiceEnum, - ConvFeatureExtractionModel, - GradMultiply, - LayerNorm, - TransformerEncoder, - compute_mask_indices, - get_available_activation_fns, - GLU, -) +from paddlespeech.s2t.modules.align import Linear from paddlespeech.s2t.utils.log import Log logger = Log(__name__).getlog() + @dataclass class HubertPretrainingConfig: label_rate: float = field( default=-1.0, - metadata={"help": "label frame rate. -1.0 for sequence label"}, - ) + metadata={"help": "label frame rate. -1.0 for sequence label"}, ) sample_rate: int = field( default=16_000, metadata={ - "help": "target sample rate. audio files will be up/down " + "help": + "target sample rate. audio files will be up/down " "sampled to this rate" - }, - ) + }, ) normalize: bool = field( default=False, - metadata={"help": "if set, normalizes input to have 0 mean and unit variance"}, - ) + metadata={ + "help": "if set, normalizes input to have 0 mean and unit variance" + }, ) enable_padding: bool = field( default=False, - metadata={"help": "pad shorter samples instead of cropping"}, - ) + metadata={"help": "pad shorter samples instead of cropping"}, ) max_keep_size: Optional[int] = field( default=None, - metadata={"help": "exclude sample longer than this"}, - ) + metadata={"help": "exclude sample longer than this"}, ) max_sample_size: Optional[int] = field( default=None, - metadata={"help": "max sample size to crop to for batching"}, - ) + metadata={"help": "max sample size to crop to for batching"}, ) min_sample_size: Optional[int] = field( default=None, - metadata={"help": "min sample size to crop to for batching"}, - ) + metadata={"help": "min sample size to crop to for batching"}, ) random_crop: Optional[bool] = field( default=True, - metadata={"help": "always crop from the beginning if false"}, - ) + metadata={"help": "always crop from the beginning if false"}, ) pad_audio: Optional[bool] = field( default=False, metadata={"help": "pad audio to the longest one in the batch if true"}, @@ -89,51 +87,40 @@ class HubertConfig: extractor_mode: EXTRACTOR_MODE_CHOICES = field( default="default", metadata={ - "help": "mode for feature extractor. default has a single group " + "help": + "mode for feature extractor. default has a single group " "norm with d groups in the first conv block, whereas layer_norm " "has layer norms in every block (meant to use with normalize=True)" - }, - ) + }, ) encoder_layers: int = field( - default=12, metadata={"help": "num encoder layers in the transformer"} - ) + default=12, metadata={"help": "num encoder layers in the transformer"}) encoder_embed_dim: int = field( - default=768, metadata={"help": "encoder embedding dimension"} - ) + default=768, metadata={"help": "encoder embedding dimension"}) encoder_ffn_embed_dim: int = field( - default=3072, metadata={"help": "encoder embedding dimension for FFN"} - ) + default=3072, metadata={"help": "encoder embedding dimension for FFN"}) encoder_attention_heads: int = field( - default=12, metadata={"help": "num encoder attention heads"} - ) + default=12, metadata={"help": "num encoder attention heads"}) activation_fn: ChoiceEnum(get_available_activation_fns()) = field( - default="gelu", metadata={"help": "activation function to use"} - ) + default="gelu", metadata={"help": "activation function to use"}) layer_type: LAYER_TYPE_CHOICES = field( - default="transformer", metadata={"help": "layer type in encoder"} - ) + default="transformer", metadata={"help": "layer type in encoder"}) # dropouts dropout: float = field( default=0.1, - metadata={"help": "dropout probability for the transformer"}, - ) + metadata={"help": "dropout probability for the transformer"}, ) attention_dropout: float = field( default=0.1, - metadata={"help": "dropout probability for attention weights"}, - ) + metadata={"help": "dropout probability for attention weights"}, ) activation_dropout: float = field( default=0.0, - metadata={"help": "dropout probability after activation in FFN"}, - ) + metadata={"help": "dropout probability after activation in FFN"}, ) encoder_layerdrop: float = field( default=0.0, - metadata={"help": "probability of dropping a tarnsformer layer"}, - ) + metadata={"help": "probability of dropping a tarnsformer layer"}, ) dropout_input: float = field( default=0.0, - metadata={"help": "dropout to apply to the input (after feat extr)"}, - ) + metadata={"help": "dropout to apply to the input (after feat extr)"}, ) dropout_features: float = field( default=0.0, metadata={"help": "dropout to apply to the features (after feat extr)"}, @@ -142,60 +129,51 @@ class HubertConfig: final_dim: int = field( default=0, metadata={ - "help": "project final representations and targets to this many " + "help": + "project final representations and targets to this many " "dimensions. set to encoder_embed_dim is <= 0" - }, - ) + }, ) untie_final_proj: bool = field( default=False, - metadata={"help": "use separate projection for each target"}, - ) + metadata={"help": "use separate projection for each target"}, ) layer_norm_first: bool = field( default=False, - metadata={"help": "apply layernorm first in the transformer"}, - ) + metadata={"help": "apply layernorm first in the transformer"}, ) conv_feature_layers: str = field( default="[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2", metadata={ - "help": "string describing convolutional feature extraction " + "help": + "string describing convolutional feature extraction " "layers in form of a python list that contains " "[(dim, kernel_size, stride), ...]" - }, - ) + }, ) conv_bias: bool = field( - default=False, metadata={"help": "include bias in conv encoder"} - ) + default=False, metadata={"help": "include bias in conv encoder"}) logit_temp: float = field( - default=0.1, metadata={"help": "temperature to divide logits by"} - ) + default=0.1, metadata={"help": "temperature to divide logits by"}) target_glu: bool = field( - default=False, metadata={"help": "adds projection + glu to targets"} - ) + default=False, metadata={"help": "adds projection + glu to targets"}) feature_grad_mult: float = field( default=1.0, - metadata={"help": "multiply feature extractor var grads by this"}, - ) + metadata={"help": "multiply feature extractor var grads by this"}, ) # masking mask_length: int = field(default=10, metadata={"help": "mask length"}) mask_prob: float = field( default=0.65, - metadata={"help": "probability of replacing a token with mask"}, - ) + metadata={"help": "probability of replacing a token with mask"}, ) mask_selection: MASKING_DISTRIBUTION_CHOICES = field( - default="static", metadata={"help": "how to choose mask length"} - ) + default="static", metadata={"help": "how to choose mask length"}) mask_other: float = field( default=0, metadata={ - "help": "secondary mask argument " + "help": + "secondary mask argument " "(used for more complex distributions), " "see help in compute_mask_indicesh" - }, - ) + }, ) no_mask_overlap: bool = field( - default=False, metadata={"help": "whether to allow masks to overlap"} - ) + default=False, metadata={"help": "whether to allow masks to overlap"}) mask_min_space: int = field( default=1, metadata={"help": "min space between spans (if no overlap is enabled)"}, @@ -204,28 +182,24 @@ class HubertConfig: # channel masking mask_channel_length: int = field( default=10, - metadata={"help": "length of the mask for features (channels)"}, - ) + metadata={"help": "length of the mask for features (channels)"}, ) mask_channel_prob: float = field( default=0.0, - metadata={"help": "probability of replacing a feature with 0"}, - ) + metadata={"help": "probability of replacing a feature with 0"}, ) mask_channel_selection: MASKING_DISTRIBUTION_CHOICES = field( default="static", - metadata={"help": "how to choose mask length for channel masking"}, - ) + metadata={"help": "how to choose mask length for channel masking"}, ) mask_channel_other: float = field( default=0, metadata={ - "help": "secondary mask argument " + "help": + "secondary mask argument " "(used for more complex distributions), " "see help in compute_mask_indicesh" - }, - ) + }, ) no_mask_channel_overlap: bool = field( default=False, - metadata={"help": "whether to allow channel masks to overlap"}, - ) + metadata={"help": "whether to allow channel masks to overlap"}, ) mask_channel_min_space: int = field( default=1, metadata={"help": "min space between spans (if no overlap is enabled)"}, @@ -234,66 +208,64 @@ class HubertConfig: # positional embeddings conv_pos: int = field( default=128, - metadata={"help": "number of filters for convolutional positional embeddings"}, - ) + metadata={ + "help": "number of filters for convolutional positional embeddings" + }, ) conv_pos_groups: int = field( default=16, - metadata={"help": "number of groups for convolutional positional embedding"}, - ) + metadata={ + "help": "number of groups for convolutional positional embedding" + }, ) latent_temp: Tuple[float, float, float] = field( default=(2, 0.5, 0.999995), - metadata={"help": "legacy (to be removed)"}, - ) + metadata={"help": "legacy (to be removed)"}, ) # loss computation skip_masked: bool = field( default=False, - metadata={"help": "skip computing losses over masked frames"}, - ) + metadata={"help": "skip computing losses over masked frames"}, ) skip_nomask: bool = field( default=False, - metadata={"help": "skip computing losses over unmasked frames"}, - ) + metadata={"help": "skip computing losses over unmasked frames"}, ) checkpoint_activations: bool = field( default=False, - metadata={"help": "recompute activations and save memory for extra compute"}, - ) + metadata={ + "help": "recompute activations and save memory for extra compute" + }, ) # FP16 optimization required_seq_len_multiple: int = field( default=2, metadata={ - "help": "pad the input to encoder such that the sequence length is divisible by multiple" - }, - ) + "help": + "pad the input to encoder such that the sequence length is divisible by multiple" + }, ) # Conformer depthwise_conv_kernel_size: int = field( default=31, metadata={ - "help": "depthwise-conv-kernel-size for convolution in conformer layer" - }, - ) + "help": + "depthwise-conv-kernel-size for convolution in conformer layer" + }, ) attn_type: str = field( default="", - metadata={"help": "if espnet use ESPNET MHA"}, - ) + metadata={"help": "if espnet use ESPNET MHA"}, ) pos_enc_type: str = field( default="abs", - metadata={"help": "Positional encoding type to use in conformer"}, - ) - fp16: bool = field(default=False, metadata={"help": "If fp16 is being used"}) + metadata={"help": "Positional encoding type to use in conformer"}, ) + fp16: bool = field( + default=False, metadata={"help": "If fp16 is being used"}) class HubertModel(nn.Layer): def __init__( - self, - cfg: HubertConfig, - task_cfg: HubertPretrainingConfig, - dictionaries: List[Any], - ) -> None: + self, + cfg: HubertConfig, + task_cfg: HubertPretrainingConfig, + dictionaries: List[Any], ) -> None: super().__init__() logger.info(f"HubertModel Config: {cfg}") @@ -304,16 +276,12 @@ class HubertModel(nn.Layer): conv_layers=feature_enc_layers, dropout=0.0, mode=cfg.extractor_mode, - conv_bias=cfg.conv_bias, - ) + conv_bias=cfg.conv_bias, ) feature_ds_rate = np.prod([s for _, _, s in feature_enc_layers]) self.feat2tar_ratio = cfg.label_rate * feature_ds_rate / task_cfg.sample_rate - self.post_extract_proj = ( - Linear(self.embed, cfg.encoder_embed_dim) - if self.embed != cfg.encoder_embed_dim - else None - ) + self.post_extract_proj = (Linear(self.embed, cfg.encoder_embed_dim) if + self.embed != cfg.encoder_embed_dim else None) self.mask_prob = cfg.mask_prob self.mask_selection = cfg.mask_selection @@ -342,8 +310,7 @@ class HubertModel(nn.Layer): self.mask_emb = paddle.create_parameter( shape=[cfg.encoder_embed_dim], dtype='float32', - default_initializer=paddle.nn.initializer.Uniform(low=0), - ) + default_initializer=paddle.nn.initializer.Uniform(low=0), ) self.encoder = TransformerEncoder(cfg) self.layer_norm = LayerNorm(self.embed) @@ -351,27 +318,25 @@ class HubertModel(nn.Layer): self.target_glu = None if cfg.target_glu: self.target_glu = nn.Sequential( - Linear(final_dim, final_dim * 2), GLU() - ) + Linear(final_dim, final_dim * 2), GLU()) self.untie_final_proj = cfg.untie_final_proj if self.untie_final_proj: - self.final_proj = Linear( - cfg.encoder_embed_dim, final_dim * len(dictionaries) - ) + self.final_proj = Linear(cfg.encoder_embed_dim, + final_dim * len(dictionaries)) else: self.final_proj = Linear(cfg.encoder_embed_dim, final_dim) # modules below are not needed during fine-tuning if any([d is None for d in dictionaries]): - logger.info("cannot find dictionary. assume will be used for fine-tuning") + logger.info( + "cannot find dictionary. assume will be used for fine-tuning") else: self.num_classes = [len(d) for d in dictionaries] self.label_embs_concat = paddle.create_parameter( - shape=[sum(self.num_classes), final_dim], - dtype='float32', - default_initializer=paddle.nn.initializer.Uniform(low=0), - ) + shape=[sum(self.num_classes), final_dim], + dtype='float32', + default_initializer=paddle.nn.initializer.Uniform(low=0), ) @classmethod def build_model(cls, cfg: HubertConfig, task): @@ -392,10 +357,10 @@ class HubertModel(nn.Layer): self.mask_other, min_masks=2, no_overlap=self.no_mask_overlap, - min_space=self.mask_min_space, - ) - - mask_indices = paddle.to_tensor(mask_indices, dtype='int64', place=x.place) + min_space=self.mask_min_space, ) + + mask_indices = paddle.to_tensor( + mask_indices, dtype='int64', place=x.place) x[mask_indices] = self.mask_emb else: mask_indices = None @@ -409,13 +374,10 @@ class HubertModel(nn.Layer): self.mask_channel_selection, self.mask_channel_other, no_overlap=self.no_mask_channel_overlap, - min_space=self.mask_channel_min_space, - ) - mask_channel_indices = ( - paddle.to_tensor(mask_channel_indices, dtype='int64', place=x.place) - .unsqueeze(1) - .expand(-1, T, -1) - ) + min_space=self.mask_channel_min_space, ) + mask_channel_indices = (paddle.to_tensor( + mask_channel_indices, dtype='int64', place=x.place).unsqueeze(1) + .expand(-1, T, -1)) x[mask_channel_indices] = 0 return x, mask_indices @@ -425,7 +387,8 @@ class HubertModel(nn.Layer): pos = pos.unsqueeze(0) targets = paddle.concat([pos, negs], axis=0) - logits = paddle.nn.functional.cosine_similarity(x.astype('float32'), targets.astype('float32'), axis=-1) + logits = paddle.nn.functional.cosine_similarity( + x.astype('float32'), targets.astype('float32'), axis=-1) logits /= self.logit_temp if paddle.any(neg_is_pos): logits[1:][neg_is_pos] = float("-inf") @@ -443,9 +406,9 @@ class HubertModel(nn.Layer): return features def forward_targets( - self, - features: paddle.Tensor, - target_list: List[paddle.Tensor], + self, + features: paddle.Tensor, + target_list: List[paddle.Tensor], ) -> Tuple[paddle.Tensor, paddle.Tensor]: # Trim features to ensure labels exist and then get aligned labels feat_tsz = features.shape[2] @@ -453,31 +416,31 @@ class HubertModel(nn.Layer): if self.feat2tar_ratio * feat_tsz > targ_tsz: feat_tsz = int(targ_tsz / self.feat2tar_ratio) features = features[:, :, :feat_tsz] - target_inds = paddle.arange(feat_tsz).astype('float32') * self.feat2tar_ratio + target_inds = paddle.arange(feat_tsz).astype( + 'float32') * self.feat2tar_ratio target_list = [t[:, target_inds.astype('int64')] for t in target_list] return features, target_list def forward_padding_mask( - self, - features: paddle.Tensor, - padding_mask: paddle.Tensor, - ) -> paddle.Tensor: + self, + features: paddle.Tensor, + padding_mask: paddle.Tensor, ) -> paddle.Tensor: extra = padding_mask.shape[1] % features.shape[1] if extra > 0: padding_mask = padding_mask[:, :-extra] - padding_mask = paddle.reshape(padding_mask, [padding_mask.shape[0], features.shape[1], -1]) + padding_mask = paddle.reshape( + padding_mask, [padding_mask.shape[0], features.shape[1], -1]) padding_mask = paddle.all(padding_mask, axis=-1) return padding_mask def forward( - self, - source: paddle.Tensor, - target_list: Optional[List[paddle.Tensor]] = None, - padding_mask: Optional[paddle.Tensor] = None, - mask: bool = True, - features_only: bool = False, - output_layer: Optional[int] = None, - ) -> Dict[str, paddle.Tensor]: + self, + source: paddle.Tensor, + target_list: Optional[List[paddle.Tensor]]=None, + padding_mask: Optional[paddle.Tensor]=None, + mask: bool=True, + features_only: bool=False, + output_layer: Optional[int]=None, ) -> Dict[str, paddle.Tensor]: """output layer is 1-based""" features = self.forward_features(source) if target_list is not None: @@ -499,7 +462,8 @@ class HubertModel(nn.Layer): unmasked_features = self.dropout_features(unmasked_features) if mask: - x, mask_indices = self.apply_mask(features, padding_mask, target_list) + x, mask_indices = self.apply_mask(features, padding_mask, + target_list) else: x = features mask_indices = None @@ -512,16 +476,18 @@ class HubertModel(nn.Layer): x, _ = self.encoder( x, padding_mask=padding_mask, - layer=None if output_layer is None else output_layer - 1, - ) + layer=None if output_layer is None else output_layer - 1, ) if features_only: return {"x": x, "padding_mask": padding_mask, "features": features} def compute_pred(self, proj_x, target, label_embs): # compute logits for the i-th label set - y = paddle.index_select(label_embs, index=target.astype('int64'), axis=0) - negs = paddle.expand(label_embs.unsqueeze(1), [label_embs.shape[0], proj_x.shape[0], label_embs.shape[-1]]) + y = paddle.index_select( + label_embs, index=target.astype('int64'), axis=0) + negs = paddle.expand( + label_embs.unsqueeze(1), + [label_embs.shape[0], proj_x.shape[0], label_embs.shape[-1]]) if self.target_glu: y = self.target_glu(y) negs = self.target_glu(negs) @@ -541,7 +507,8 @@ class HubertModel(nn.Layer): proj_x_m_list = [proj_x_m for _ in range(len(target_list))] logit_m_list = [ compute_pred(proj_x_m, t[masked_indices], label_embs_list[i]) - for i, (proj_x_m, t) in enumerate(zip(proj_x_m_list, target_list)) + for i, (proj_x_m, t + ) in enumerate(zip(proj_x_m_list, target_list)) ] else: logit_m_list = [None for _ in target_list] @@ -556,7 +523,8 @@ class HubertModel(nn.Layer): logit_u_list = [ compute_pred(proj_x_u, t[nomask_indices], label_embs_list[i]) - for i, (proj_x_u, t) in enumerate(zip(proj_x_u_list, target_list)) + for i, (proj_x_u, t + ) in enumerate(zip(proj_x_u_list, target_list)) ] else: logit_u_list = [None for _ in target_list] @@ -570,20 +538,19 @@ class HubertModel(nn.Layer): return result def extract_features( - self, - source: paddle.Tensor, - padding_mask: Optional[paddle.Tensor] = None, - mask: bool = False, - ret_conv: bool = False, - output_layer: Optional[int] = None, + self, + source: paddle.Tensor, + padding_mask: Optional[paddle.Tensor]=None, + mask: bool=False, + ret_conv: bool=False, + output_layer: Optional[int]=None, ) -> Tuple[paddle.Tensor, paddle.Tensor]: res = self.forward( source, padding_mask=padding_mask, mask=mask, features_only=True, - output_layer=output_layer, - ) + output_layer=output_layer, ) feature = res["features"] if ret_conv else res["x"] return feature, res["padding_mask"] @@ -592,12 +559,16 @@ class HubertModel(nn.Layer): logits_list = net_output["logit_m_list"] else: logits_list = net_output["logit_u_list"] - logits_list = [paddle.cast(x, 'float32') for x in logits_list if x is not None] + logits_list = [ + paddle.cast(x, 'float32') for x in logits_list if x is not None + ] return logits_list def get_targets(self, net_output, is_masked=True): logits_list = self.get_logits(net_output, is_masked) - targets_list = [paddle.zeros_like(x, dtype='int64') for x in logits_list] + targets_list = [ + paddle.zeros_like(x, dtype='int64') for x in logits_list + ] return targets_list def get_extra_losses(self, net_output): diff --git a/paddlespeech/s2t/models/wav2vec2/modules/wav2vec2_model.py b/paddlespeech/s2t/models/wav2vec2/modules/wav2vec2_model.py index 766864ef4..3fbb9426b 100644 --- a/paddlespeech/s2t/models/wav2vec2/modules/wav2vec2_model.py +++ b/paddlespeech/s2t/models/wav2vec2/modules/wav2vec2_model.py @@ -13,7 +13,6 @@ # See the License for the specific language governing permissions and # limitations under the License. """ Paddle Wav2Vec2 model.""" - import math import uuid from dataclasses import dataclass @@ -25,20 +24,23 @@ from typing import Dict from typing import List from typing import Optional from typing import Tuple + import numpy as np import paddle import paddle.nn as nn import paddle.nn.functional as F from paddle import Tensor -from paddlespeech.s2t.modules.align import Linear -from paddlespeech.s2t.modules.align import LayerNorm + from paddlespeech.s2t.modules.align import Conv1D from paddlespeech.s2t.modules.align import Conv2D from paddlespeech.s2t.modules.align import Embedding +from paddlespeech.s2t.modules.align import LayerNorm +from paddlespeech.s2t.modules.align import Linear from paddlespeech.s2t.utils.log import Log logger = Log(__name__).getlog() + class GLU(nn.Layer): r"""Applies the gated linear unit function :math:`{GLU}(a, b)= a \otimes \sigma(b)` where :math:`a` is the first half @@ -226,7 +228,7 @@ def quant_noise(module, p, block_size): mask = paddle.zeros( [in_channels // block_size * out_channels], dtype=paddle.bool) - + # the implementation of bernoulli_, p=0.5 mask = paddle.ones_like(mask) * 0.5 mask = paddle.bernoulli(mask) @@ -310,7 +312,7 @@ class MultiheadAttention(nn.Layer): assert not self.self_attention or self.qkv_same_dim, ( "Self-attention requires query, key and " "value to be of the same size") - + # Todo scaled initialization # Empirically observed the convergence to be much better with # the scaled initialization @@ -319,19 +321,31 @@ class MultiheadAttention(nn.Layer): out_proj_bias_attr = nn.initializer.Constant(0) self.k_proj = quant_noise( - nn.Linear(self.kdim, embed_dim, weight_attr=weight_attr, bias_attr=bias if not bias else kv_proj_bias_attr), q_noise, qn_block_size - ) + nn.Linear( + self.kdim, + embed_dim, + weight_attr=weight_attr, + bias_attr=bias + if not bias else kv_proj_bias_attr), q_noise, qn_block_size) self.v_proj = quant_noise( - nn.Linear(self.vdim, embed_dim, weight_attr=weight_attr, bias_attr=bias if not bias else kv_proj_bias_attr), q_noise, qn_block_size - ) + nn.Linear( + self.vdim, + embed_dim, + weight_attr=weight_attr, + bias_attr=bias + if not bias else kv_proj_bias_attr), q_noise, qn_block_size) self.q_proj = quant_noise( - nn.Linear(embed_dim, embed_dim, weight_attr=weight_attr, bias_attr=bias), q_noise, qn_block_size - ) + nn.Linear( + embed_dim, embed_dim, weight_attr=weight_attr, bias_attr=bias), + q_noise, qn_block_size) self.out_proj = quant_noise( - nn.Linear(embed_dim, embed_dim, weight_attr=weight_attr, bias_attr=bias if not bias else out_proj_bias_attr), q_noise, qn_block_size - ) - + nn.Linear( + embed_dim, + embed_dim, + weight_attr=weight_attr, + bias_attr=bias + if not bias else out_proj_bias_attr), q_noise, qn_block_size) # nn.initializer.XavierUniform(self.k_proj.weight, gain=1 / math.sqrt(2)) # nn.initializer.XavierUniform(self.v_proj.weight, gain=1 / math.sqrt(2)) @@ -384,9 +398,12 @@ class MultiheadAttention(nn.Layer): if self.qkv_same_dim: # Empirically observed the convergence to be much better with # the scaled initialization - nn.initializer.XavierUniform(self.k_proj.weight, gain=1 / math.sqrt(2)) - nn.initializer.XavierUniform(self.v_proj.weight, gain=1 / math.sqrt(2)) - nn.initializer.XavierUniform(self.q_proj.weight, gain=1 / math.sqrt(2)) + nn.initializer.XavierUniform( + self.k_proj.weight, gain=1 / math.sqrt(2)) + nn.initializer.XavierUniform( + self.v_proj.weight, gain=1 / math.sqrt(2)) + nn.initializer.XavierUniform( + self.q_proj.weight, gain=1 / math.sqrt(2)) else: self.k_proj.weight = paddle.ParamAttr() nn.initializer.XavierUniform(self.k_proj.weight) @@ -410,15 +427,18 @@ class MultiheadAttention(nn.Layer): start_idx = i * self.head_dim end_idx = (i + 1) * self.head_dim k_proj_heads_norm.append( - paddle.sum(paddle.abs(self.k_proj.weight[:, start_idx:end_idx])) + paddle.sum( + paddle.abs(self.k_proj.weight[:, start_idx:end_idx])) .tolist() + paddle.sum( paddle.abs(self.k_proj.bias[start_idx:end_idx])).tolist()) q_proj_heads_norm.append( - paddle.sum(paddle.abs(self.q_proj.weight[:, start_idx:end_idx])) + paddle.sum( + paddle.abs(self.q_proj.weight[:, start_idx:end_idx])) .tolist() + paddle.sum( paddle.abs(self.q_proj.bias[start_idx:end_idx])).tolist()) v_proj_heads_norm.append( - paddle.sum(paddle.abs(self.v_proj.weight[:, start_idx:end_idx])) + paddle.sum( + paddle.abs(self.v_proj.weight[:, start_idx:end_idx])) .tolist() + paddle.sum( paddle.abs(self.v_proj.bias[start_idx:end_idx])).tolist()) @@ -464,8 +484,7 @@ class MultiheadAttention(nn.Layer): new_q_weight = paddle.concat(new_q_weight, axis=-1).detach() new_k_weight = paddle.concat(new_k_weight, axis=-1).detach() new_v_weight = paddle.concat(new_v_weight, axis=-1).detach() - new_out_proj_weight = paddle.concat( - new_out_proj_weight).detach() + new_out_proj_weight = paddle.concat(new_out_proj_weight).detach() new_q_weight.stop_gradient = False new_k_weight.stop_gradient = False new_v_weight.stop_gradient = False @@ -898,10 +917,12 @@ class MultiheadAttention(nn.Layer): if prev_key_padding_mask is not None and static_kv: new_key_padding_mask = prev_key_padding_mask elif prev_key_padding_mask is not None and key_padding_mask is not None: - new_key_padding_mask = paddle.concat([ - paddle.cast(prev_key_padding_mask, 'float32'), - paddle.cast(key_padding_mask, 'float32') - ], axis = 1) + new_key_padding_mask = paddle.concat( + [ + paddle.cast(prev_key_padding_mask, 'float32'), + paddle.cast(key_padding_mask, 'float32') + ], + axis=1) # During incremental decoding, as the padding token enters and # leaves the frame, there will be a time when prev or current # is None @@ -909,20 +930,24 @@ class MultiheadAttention(nn.Layer): if src_len > prev_key_padding_mask.shape[1]: filler = paddle.zeros( [batch_size, src_len - prev_key_padding_mask.shape[1]], ) - new_key_padding_mask = paddle.concat([ - paddle.cast(prev_key_padding_mask, 'float32'), - paddle.cast(filler, 'float32') - ], axis = 1) + new_key_padding_mask = paddle.concat( + [ + paddle.cast(prev_key_padding_mask, 'float32'), + paddle.cast(filler, 'float32') + ], + axis=1) else: new_key_padding_mask = prev_key_padding_mask elif key_padding_mask is not None: if src_len > key_padding_mask.shape[1]: filler = paddle.zeros( [batch_size, src_len - key_padding_mask.shape[1]], ) - new_key_padding_mask = paddle.concat([ - paddle.cast(filler, 'float32'), - paddle.cast(key_padding_mask, 'float32') - ], axis = 1) + new_key_padding_mask = paddle.concat( + [ + paddle.cast(filler, 'float32'), + paddle.cast(key_padding_mask, 'float32') + ], + axis=1) else: new_key_padding_mask = paddle.cast(key_padding_mask, 'float32') else: @@ -1074,8 +1099,7 @@ class GumbelVectorQuantizer(nn.Layer): if weight_proj_depth > 1: def block(input_dim, output_dim): - return nn.Sequential( - Linear(input_dim, output_dim), activation) + return nn.Sequential(Linear(input_dim, output_dim), activation) inner_dim = self.input_dim * weight_proj_factor self.weight_proj = nn.Sequential( @@ -1085,7 +1109,11 @@ class GumbelVectorQuantizer(nn.Layer): ], Linear(inner_dim, groups * num_vars), ) else: - self.weight_proj = Linear(self.input_dim, groups * num_vars, weight_attr=nn.initializer.Normal(mean=0, std=1), bias_attr=nn.initializer.Zero()) + self.weight_proj = Linear( + self.input_dim, + groups * num_vars, + weight_attr=nn.initializer.Normal(mean=0, std=1), + bias_attr=nn.initializer.Zero()) if isinstance(temp, str): import ast @@ -1243,6 +1271,7 @@ class TransposeLast(nn.Layer): trans_dim[-1], trans_dim[-2] = trans_dim[-2], trans_dim[-1] return x.transpose(trans_dim) + class Fp32LayerNorm(LayerNorm): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -1256,13 +1285,22 @@ class Fp32LayerNorm(LayerNorm): self._epsilon, ) return output.astype(input.dtype) + # Todo: change this when paddle supports F.group_norm class Fp32GroupNorm(nn.Layer): def __init__(self, *args, **kwargs): super().__init__() self.group_norm = paddle.nn.GroupNorm(*args, **kwargs) - fp32_weight = paddle.create_parameter(shape=self.group_norm.weight.shape, dtype='float32', default_initializer=paddle.nn.initializer.Assign(self.group_norm.weight)) - fp32_bias = paddle.create_parameter(shape=self.group_norm.bias.shape, dtype='float32', default_initializer=paddle.nn.initializer.Assign(self.group_norm.bias)) + fp32_weight = paddle.create_parameter( + shape=self.group_norm.weight.shape, + dtype='float32', + default_initializer=paddle.nn.initializer.Assign( + self.group_norm.weight)) + fp32_bias = paddle.create_parameter( + shape=self.group_norm.bias.shape, + dtype='float32', + default_initializer=paddle.nn.initializer.Assign( + self.group_norm.bias)) self.group_norm.weight = fp32_weight self.group_norm.bias = fp32_bias @@ -2299,7 +2337,7 @@ def make_conv_pos(e, k, g): e, kernel_size=k, padding=k // 2, - groups=g, + groups=g, weight_attr=nn.initializer.Normal(mean=0, std=std), bias_attr=nn.initializer.Constant(0)) pos_conv = nn.utils.weight_norm(pos_conv, name="weight", dim=2)