PaddleSpeech/parakeet/modules/attention.py

# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math

import numpy as np
import paddle
from paddle import nn
from paddle.nn import functional as F


def scaled_dot_product_attention(q, k, v, mask=None, dropout=0.0,
                                 training=True):
    r"""Scaled dot product attention with masking.

    Assume that q, k, v all have the same leading dimensions (denoted as * in
    descriptions below). Dropout is applied to attention weights before
    weighted sum of values.

    Parameters
    -----------
    q : Tensor [shape=(\*, T_q, d)]
        the query tensor.
    k : Tensor [shape=(\*, T_k, d)]
        the key tensor.
    v : Tensor [shape=(\*, T_k, d_v)]
        the value tensor.
    mask : Tensor, [shape=(\*, T_q, T_k) or broadcastable shape], optional
        the mask tensor, zeros correspond to paddings. It is cast to the
        dtype of the attention logits, so boolean or integer masks are
        accepted as well. Defaults to None.
    dropout : float, optional
        dropout probability applied to the attention weights. Defaults to
        0.0.
    training : bool, optional
        whether dropout is active; forwarded to ``F.dropout``. Defaults to
        True.

    Returns
    ----------
    out : Tensor [shape=(\*, T_q, d_v)]
        the context vector.
    attn_weights : Tensor [shape=(\*, T_q, T_k)]
        the attention weights (after dropout).
    """
    d = q.shape[-1]  # we only support imperative execution
    qk = paddle.matmul(q, k, transpose_y=True)
    scaled_logit = paddle.scale(qk, 1.0 / math.sqrt(d))

    if mask is not None:
        # Cast first so that ``1.0 - mask`` is valid for bool/int masks and
        # no mixed-dtype addition happens below.
        mask = paddle.cast(mask, scaled_logit.dtype)
        # Positions where mask == 0 receive a large negative bias so that
        # softmax assigns them (numerically) zero weight. The constant is
        # hard coded. Out-of-place add keeps the scaled logits untouched.
        scaled_logit = scaled_logit + paddle.scale(1.0 - mask, -1e9)

    attn_weights = F.softmax(scaled_logit, axis=-1)
    attn_weights = F.dropout(attn_weights, dropout, training=training)
    out = paddle.matmul(attn_weights, v)
    return out, attn_weights


def drop_head(x, drop_n_heads, training=True):
    """Drop n context vectors from multiple ones.

    For every example in the batch, ``drop_n_heads`` heads are picked at
    random and their context vectors are zeroed. The remaining heads are
    rescaled by ``num_heads / (num_heads - drop_n_heads)``.

    Parameters
    ----------
    x : Tensor [shape=(batch_size, num_heads, time_steps, channels)]
        The input, multiple context vectors.
    drop_n_heads : int [0<= drop_n_heads <= num_heads]
        Number of vectors to drop.
    training : bool
        A flag indicating whether it is in training. If `False`, no dropout is
        applied.

    Returns
    -------
    Tensor
        The output. It is ``x`` itself when nothing is dropped, and an
        all-zero tensor when every head is dropped.

    Raises
    ------
    ValueError
        If ``drop_n_heads`` is negative or larger than ``num_heads``.
    """
    if not training or (drop_n_heads == 0):
        return x

    batch_size, num_heads, _, _ = x.shape
    if not 0 <= drop_n_heads <= num_heads:
        # An out-of-range count would silently produce a wrong mask/scale.
        raise ValueError(
            "drop_n_heads must be in [0, num_heads], got {} with {} heads".
            format(drop_n_heads, num_heads))

    # drop all heads
    if num_heads == drop_n_heads:
        return paddle.zeros_like(x)

    # Start with the first ``drop_n_heads`` heads masked in every row, then
    # shuffle each row independently so the dropped heads differ per example.
    mask = np.ones([batch_size, num_heads])
    mask[:, :drop_n_heads] = 0
    for row in mask:
        np.random.shuffle(row)
    scale = float(num_heads) / (num_heads - drop_n_heads)
    mask = scale * np.reshape(mask, [batch_size, num_heads, 1, 1])
    # The NumPy mask is float64; build the tensor in x's dtype so the
    # multiplication does not mix dtypes.
    return x * paddle.to_tensor(mask, dtype=x.dtype)


def _split_heads(x, num_heads):
    """Split the last axis of a rank-3 tensor into ``num_heads`` heads.

    The input is reshaped to (batch, time, num_heads, -1) and the time and
    head axes are then swapped, giving (batch, num_heads, time, -1).
    """
    n_batch, n_steps, _ = x.shape
    per_head = paddle.reshape(x, [n_batch, n_steps, num_heads, -1])
    return paddle.transpose(per_head, [0, 2, 1, 3])


def _concat_heads(x):
    """Merge the head axis of a rank-4 tensor back into the last axis.

    Axes 1 and 2 are swapped and the two trailing axes are then flattened,
    so the result has shape (x.shape[0], x.shape[2], -1).
    """
    n_batch, _, n_steps, _ = x.shape
    time_major = paddle.transpose(x, [0, 2, 1, 3])
    return paddle.reshape(time_major, [n_batch, n_steps, -1])


# Standard implementations of Monohead Attention & Multihead Attention
class MonoheadAttention(nn.Layer):
    """Monohead Attention module.

    Queries, keys and values are linearly projected, combined with a single
    scaled dot product attention, and the resulting context is projected
    back to ``model_dim``.

    Parameters
    ----------
    model_dim : int
        Feature size of the query.
    dropout : float, optional
        Dropout probability applied to the attention weights of the scaled
        dot product attention. Defaults to 0.0.
    k_dim : int, optional
        Feature size of the projected query and key. If not provided, it is
        set to ``model_dim``. Defaults to None.
    v_dim : int, optional
        Feature size of the projected value. If not provided, it is set to
        ``model_dim``. Defaults to None.
    """

    def __init__(self,
                 model_dim: int,
                 dropout: float=0.0,
                 k_dim: int=None,
                 v_dim: int=None):
        super().__init__()
        # ``or`` means both None and 0 fall back to model_dim.
        k_dim = k_dim or model_dim
        v_dim = v_dim or model_dim
        self.affine_q = nn.Linear(model_dim, k_dim)
        self.affine_k = nn.Linear(model_dim, k_dim)
        self.affine_v = nn.Linear(model_dim, v_dim)
        self.affine_o = nn.Linear(v_dim, model_dim)

        self.model_dim = model_dim
        self.dropout = dropout

    def forward(self, q, k, v, mask=None):
        """Compute context vector and attention weights.

        Parameters
        -----------
        q : Tensor [shape=(batch_size, time_steps_q, model_dim)]
            The queries.
        k : Tensor [shape=(batch_size, time_steps_k, model_dim)]
            The keys.
        v : Tensor [shape=(batch_size, time_steps_k, model_dim)]
            The values.
        mask : Tensor [shape=(batch_size, times_steps_q, time_steps_k)] or broadcastable shape, optional
            The mask, zeros mark positions that must not be attended to.
            Defaults to None (no masking).

        Returns
        ----------
        out : Tensor [shape=(batch_size, time_steps_q, model_dim)]
            The context vector.
        attention_weights : Tensor [shape=(batch_size, times_steps_q, time_steps_k)]
            The attention weights.
        """
        q = self.affine_q(q)  # (B, T, C)
        k = self.affine_k(k)
        v = self.affine_v(v)

        # Dropout is only active while the layer is in training mode.
        context_vectors, attention_weights = scaled_dot_product_attention(
            q, k, v, mask, self.dropout, self.training)

        out = self.affine_o(context_vectors)
        return out, attention_weights


class MultiheadAttention(nn.Layer):
    """Multihead Attention module.

    Queries, keys and values are linearly projected, split into
    ``num_heads`` heads, attended independently with scaled dot product
    attention, concatenated, and projected back to ``model_dim``.

    Parameters
    -----------
    model_dim: int
        The feature size of query.
    num_heads : int
        The number of attention heads.
    dropout : float, optional
        Dropout probability applied to the attention weights of the scaled
        dot product attention. Defaults to 0.0.
    k_dim : int, optional
        Per-head feature size of the projected query and key. If not
        provided, it is set to ``model_dim // num_heads``. Defaults to None.
    v_dim : int, optional
        Per-head feature size of the projected value. If not provided, it
        is set to ``model_dim // num_heads``. Defaults to None.

    Raises
    ---------
    ValueError
        If ``model_dim`` is not divisible by ``num_heads``.
    """

    def __init__(self,
                 model_dim: int,
                 num_heads: int,
                 dropout: float=0.0,
                 k_dim: int=None,
                 v_dim: int=None):
        super().__init__()
        if model_dim % num_heads != 0:
            raise ValueError("model_dim must be divisible by num_heads")
        depth = model_dim // num_heads
        # ``or`` means both None and 0 fall back to the per-head depth.
        k_dim = k_dim or depth
        v_dim = v_dim or depth
        self.affine_q = nn.Linear(model_dim, num_heads * k_dim)
        self.affine_k = nn.Linear(model_dim, num_heads * k_dim)
        self.affine_v = nn.Linear(model_dim, num_heads * v_dim)
        self.affine_o = nn.Linear(num_heads * v_dim, model_dim)

        self.num_heads = num_heads
        self.model_dim = model_dim
        self.dropout = dropout

    def forward(self, q, k, v, mask=None):
        """Compute context vector and attention weights.

        Parameters
        -----------
        q : Tensor [shape=(batch_size, time_steps_q, model_dim)]
            The queries.
        k : Tensor [shape=(batch_size, time_steps_k, model_dim)]
            The keys.
        v : Tensor [shape=(batch_size, time_steps_k, model_dim)]
            The values.
        mask : Tensor [shape=(batch_size, times_steps_q, time_steps_k)] or broadcastable shape, optional
            The mask, zeros mark positions that must not be attended to.
            The same mask is shared by all heads. Defaults to None (no
            masking).

        Returns
        ----------
        out : Tensor [shape=(batch_size, time_steps_q, model_dim)]
            The context vector.
        attention_weights : Tensor [shape=(batch_size, num_heads, times_steps_q, time_steps_k)]
            The attention weights of every head.
        """
        q = _split_heads(self.affine_q(q), self.num_heads)  # (B, h, T, C)
        k = _split_heads(self.affine_k(k), self.num_heads)
        v = _split_heads(self.affine_v(v), self.num_heads)
        if mask is not None:
            # Insert the head axis so the mask broadcasts over all heads.
            mask = paddle.unsqueeze(mask, 1)

        context_vectors, attention_weights = scaled_dot_product_attention(
            q, k, v, mask, self.dropout, self.training)
        # NOTE: there is more sophisticated implementation: Scheduled DropHead
        context_vectors = _concat_heads(context_vectors)  # (B, T, h*C)
        out = self.affine_o(context_vectors)
        return out, attention_weights


class LocationSensitiveAttention(nn.Layer):
    """Location Sensitive Attention module.

    Reference: `Attention-Based Models for Speech Recognition <https://arxiv.org/pdf/1506.07503.pdf>`_

    Parameters
    -----------
    d_query: int
        The feature size of query.
    d_key : int
        The feature size of key.
    d_attention : int
        The size of the hidden space in which the projected query, the
        projected key and the location features are summed.
    location_filters : int
        Number of output channels of the attention convolution.
    location_kernel_size : int
        Kernel size of the attention convolution. The padding is
        ``(location_kernel_size - 1) // 2``; presumably an odd size is
        expected so the convolution keeps the number of time steps.
    """

    def __init__(self,
                 d_query: int,
                 d_key: int,
                 d_attention: int,
                 location_filters: int,
                 location_kernel_size: int):
        super().__init__()

        self.query_layer = nn.Linear(d_query, d_attention, bias_attr=False)
        # NOTE(review): key_layer is not called in forward(); presumably the
        # caller uses it to produce ``processed_key`` -- confirm.
        self.key_layer = nn.Linear(d_key, d_attention, bias_attr=False)
        self.value = nn.Linear(d_attention, 1, bias_attr=False)

        # Location Layer
        self.location_conv = nn.Conv1D(
            2,
            location_filters,
            kernel_size=location_kernel_size,
            padding=(location_kernel_size - 1) // 2,
            bias_attr=False,
            data_format='NLC')
        self.location_layer = nn.Linear(
            location_filters, d_attention, bias_attr=False)

    def forward(self,
                query,
                processed_key,
                value,
                attention_weights_cat,
                mask=None):
        """Compute context vector and attention weights.

        Parameters
        -----------
        query : Tensor [shape=(batch_size, d_query)]
            The queries.
        processed_key : Tensor [shape=(batch_size, time_steps_k, d_attention)]
            The keys after linear layer.
        value : Tensor [shape=(batch_size, time_steps_k, d_key)]
            The values.
        attention_weights_cat : Tensor [shape=(batch_size, time_step_k, 2)]
            Attention weights concat.
        mask : Tensor, optional
            The mask. Shape should be (batch_size, times_steps_k, 1); zeros
            mark positions that must not be attended to. It is cast to the
            dtype of the alignment scores. Defaults to None.

        Returns
        ----------
        attention_context : Tensor [shape=(batch_size, d_key)]
            The context vector, a weighted sum of ``value`` over time.
        attention_weights : Tensor [shape=(batch_size, time_steps_k)]
            The attention weights.
        """

        # (B, 1, d_attention): add a time axis so the query broadcasts over
        # every key position.
        processed_query = self.query_layer(paddle.unsqueeze(query, axis=[1]))
        processed_attention_weights = self.location_layer(
            self.location_conv(attention_weights_cat))
        # (B, T_enc, 1)
        alignment = self.value(
            paddle.tanh(processed_attention_weights + processed_key +
                        processed_query))

        if mask is not None:
            # Cast so that bool/int masks work; masked positions get a large
            # negative bias and thus (numerically) zero softmax weight.
            mask = paddle.cast(mask, alignment.dtype)
            alignment = alignment + (1.0 - mask) * -1e9

        # Normalize over the time axis.
        attention_weights = F.softmax(alignment, axis=1)
        # (B, 1, T_enc) x (B, T_enc, d_key) -> (B, 1, d_key)
        attention_context = paddle.matmul(
            attention_weights, value, transpose_x=True)

        attention_weights = paddle.squeeze(attention_weights, axis=-1)
        attention_context = paddle.squeeze(attention_context, axis=1)

        return attention_context, attention_weights
merge parakeet repo into deepspeech 3 years ago			`# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.`
			`#`
			`# Licensed under the Apache License, Version 2.0 (the "License");`
			`# you may not use this file except in compliance with the License.`
			`# You may obtain a copy of the License at`
			`#`
			`# http://www.apache.org/licenses/LICENSE-2.0`
			`#`
			`# Unless required by applicable law or agreed to in writing, software`
			`# distributed under the License is distributed on an "AS IS" BASIS,`
			`# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`# See the License for the specific language governing permissions and`
			`# limitations under the License.`
			`import math`

			`import numpy as np`
			`import paddle`
			`from paddle import nn`
			`from paddle.nn import functional as F`


			`def scaled_dot_product_attention(q, k, v, mask=None, dropout=0.0,`
			`training=True):`
			`r"""Scaled dot product attention with masking.`

			`Assume that q, k, v all have the same leading dimensions (denoted as * in`
			`descriptions below). Dropout is applied to attention weights before`
			`weighted sum of values.`

			`Parameters`
			`-----------`
			`q : Tensor [shape=(\*, T_q, d)]`
			`the query tensor.`
			`k : Tensor [shape=(\*, T_k, d)]`
			`the key tensor.`
			`v : Tensor [shape=(\*, T_k, d_v)]`
			`the value tensor.`
			`mask : Tensor, [shape=(\*, T_q, T_k) or broadcastable shape], optional`
			`the mask tensor, zeros correspond to paddings. Defaults to None.`

			`Returns`
			`----------`
			`out : Tensor [shape=(\*, T_q, d_v)]`
			`the context vector.`
			`attn_weights : Tensor [shape=(\*, T_q, T_k)]`
			`the attention weights.`
			`"""`
			`d = q.shape[-1] # we only support imperative execution`
			`qk = paddle.matmul(q, k, transpose_y=True)`
			`scaled_logit = paddle.scale(qk, 1.0 / math.sqrt(d))`

			`if mask is not None:`
			`scaled_logit += paddle.scale((1.0 - mask), -1e9) # hard coded here`

			`attn_weights = F.softmax(scaled_logit, axis=-1)`
			`attn_weights = F.dropout(attn_weights, dropout, training=training)`
			`out = paddle.matmul(attn_weights, v)`
			`return out, attn_weights`


			`def drop_head(x, drop_n_heads, training=True):`
			`"""Drop n context vectors from multiple ones.`

			`Parameters`
			`----------`
			`x : Tensor [shape=(batch_size, num_heads, time_steps, channels)]`
			`The input, multiple context vectors.`
			`drop_n_heads : int [0<= drop_n_heads <= num_heads]`
			`Number of vectors to drop.`
			`training : bool`
			A flag indicating whether it is in training. If `False`, no dropout is
			`applied.`

			`Returns`
			`-------`
			`Tensor`
			`The output.`
			`"""`
			`if not training or (drop_n_heads == 0):`
			`return x`

			`batch_size, num_heads, _, _ = x.shape`
			`# drop all heads`
			`if num_heads == drop_n_heads:`
			`return paddle.zeros_like(x)`

			`mask = np.ones([batch_size, num_heads])`
			`mask[:, :drop_n_heads] = 0`
			`for subarray in mask:`
			`np.random.shuffle(subarray)`
			`scale = float(num_heads) / (num_heads - drop_n_heads)`
			`mask = scale * np.reshape(mask, [batch_size, num_heads, 1, 1])`
			`out = x * paddle.to_tensor(mask)`
			`return out`


			`def _split_heads(x, num_heads):`
			`batch_size, time_steps, _ = x.shape`
			`x = paddle.reshape(x, [batch_size, time_steps, num_heads, -1])`
			`x = paddle.transpose(x, [0, 2, 1, 3])`
			`return x`


			`def _concat_heads(x):`
			`batch_size, _, time_steps, _ = x.shape`
			`x = paddle.transpose(x, [0, 2, 1, 3])`
			`x = paddle.reshape(x, [batch_size, time_steps, -1])`
			`return x`


			`# Standard implementations of Monohead Attention & Multihead Attention`
			`class MonoheadAttention(nn.Layer):`
			`"""Monohead Attention module.`

			`Parameters`
			`----------`
			`model_dim : int`
			`Feature size of the query.`
			`dropout : float, optional`
			`Dropout probability of scaled dot product attention and final context`
			`vector. Defaults to 0.0.`
			`k_dim : int, optional`
			`Feature size of the key of each scaled dot product attention. If not`
			provided, it is set to `model_dim / num_heads`. Defaults to None.
			`v_dim : int, optional`
			`Feature size of the key of each scaled dot product attention. If not`
			provided, it is set to `model_dim / num_heads`. Defaults to None.
			`"""`

			`def __init__(self,`
			`model_dim: int,`
			`dropout: float=0.0,`
			`k_dim: int=None,`
			`v_dim: int=None):`
			`super(MonoheadAttention, self).__init__()`
			`k_dim = k_dim or model_dim`
			`v_dim = v_dim or model_dim`
			`self.affine_q = nn.Linear(model_dim, k_dim)`
			`self.affine_k = nn.Linear(model_dim, k_dim)`
			`self.affine_v = nn.Linear(model_dim, v_dim)`
			`self.affine_o = nn.Linear(v_dim, model_dim)`

			`self.model_dim = model_dim`
			`self.dropout = dropout`

			`def forward(self, q, k, v, mask):`
			`"""Compute context vector and attention weights.`

			`Parameters`
			`-----------`
			`q : Tensor [shape=(batch_size, time_steps_q, model_dim)]`
			`The queries.`
			`k : Tensor [shape=(batch_size, time_steps_k, model_dim)]`
			`The keys.`
			`v : Tensor [shape=(batch_size, time_steps_k, model_dim)]`
			`The values.`
			`mask : Tensor [shape=(batch_size, times_steps_q, time_steps_k] or broadcastable shape`
			`The mask.`

			`Returns`
			`----------`
			`out : Tensor [shape=(batch_size, time_steps_q, model_dim)]`
			`The context vector.`
			`attention_weights : Tensor [shape=(batch_size, times_steps_q, time_steps_k)]`
			`The attention weights.`
			`"""`
			`q = self.affine_q(q) # (B, T, C)`
			`k = self.affine_k(k)`
			`v = self.affine_v(v)`

			`context_vectors, attention_weights = scaled_dot_product_attention(`
			`q, k, v, mask, self.dropout, self.training)`

			`out = self.affine_o(context_vectors)`
			`return out, attention_weights`


			`class MultiheadAttention(nn.Layer):`
			`"""Multihead Attention module.`

			`Parameters`
			`-----------`
			`model_dim: int`
			`The feature size of query.`
			`num_heads : int`
			`The number of attention heads.`
			`dropout : float, optional`
			`Dropout probability of scaled dot product attention and final context`
			`vector. Defaults to 0.0.`
			`k_dim : int, optional`
			`Feature size of the key of each scaled dot product attention. If not`
			provided, it is set to ``model_dim / num_heads``. Defaults to None.
			`v_dim : int, optional`
			`Feature size of the key of each scaled dot product attention. If not`
			provided, it is set to ``model_dim / num_heads``. Defaults to None.

			`Raises`
			`---------`
			`ValueError`
			If ``model_dim`` is not divisible by ``num_heads``.
			`"""`

			`def __init__(self,`
			`model_dim: int,`
			`num_heads: int,`
			`dropout: float=0.0,`
			`k_dim: int=None,`
			`v_dim: int=None):`
			`super(MultiheadAttention, self).__init__()`
			`if model_dim % num_heads != 0:`
			`raise ValueError("model_dim must be divisible by num_heads")`
			`depth = model_dim // num_heads`
			`k_dim = k_dim or depth`
			`v_dim = v_dim or depth`
			`self.affine_q = nn.Linear(model_dim, num_heads * k_dim)`
			`self.affine_k = nn.Linear(model_dim, num_heads * k_dim)`
			`self.affine_v = nn.Linear(model_dim, num_heads * v_dim)`
			`self.affine_o = nn.Linear(num_heads * v_dim, model_dim)`

			`self.num_heads = num_heads`
			`self.model_dim = model_dim`
			`self.dropout = dropout`

			`def forward(self, q, k, v, mask):`
			`"""Compute context vector and attention weights.`

			`Parameters`
			`-----------`
			`q : Tensor [shape=(batch_size, time_steps_q, model_dim)]`
			`The queries.`
			`k : Tensor [shape=(batch_size, time_steps_k, model_dim)]`
			`The keys.`
			`v : Tensor [shape=(batch_size, time_steps_k, model_dim)]`
			`The values.`
			`mask : Tensor [shape=(batch_size, times_steps_q, time_steps_k] or broadcastable shape`
			`The mask.`

			`Returns`
			`----------`
			`out : Tensor [shape=(batch_size, time_steps_q, model_dim)]`
			`The context vector.`
			`attention_weights : Tensor [shape=(batch_size, times_steps_q, time_steps_k)]`
			`The attention weights.`
			`"""`
			`q = _split_heads(self.affine_q(q), self.num_heads) # (B, h, T, C)`
			`k = _split_heads(self.affine_k(k), self.num_heads)`
			`v = _split_heads(self.affine_v(v), self.num_heads)`
			`mask = paddle.unsqueeze(mask, 1) # unsqueeze for the h dim`

			`context_vectors, attention_weights = scaled_dot_product_attention(`
			`q, k, v, mask, self.dropout, self.training)`
			`# NOTE: there is more sophisticated implementation: Scheduled DropHead`
			`context_vectors = _concat_heads(context_vectors) # (B, T, h*C)`
			`out = self.affine_o(context_vectors)`
			`return out, attention_weights`


			`class LocationSensitiveAttention(nn.Layer):`
			`"""Location Sensitive Attention module.`

			Reference: `Attention-Based Models for Speech Recognition <https://arxiv.org/pdf/1506.07503.pdf>`_

			`Parameters`
			`-----------`
			`d_query: int`
			`The feature size of query.`
			`d_key : int`
			`The feature size of key.`
			`d_attention : int`
			`The feature size of dimension.`
			`location_filters : int`
			`Filter size of attention convolution.`
			`location_kernel_size : int`
			`Kernel size of attention convolution.`
			`"""`

			`def __init__(self,`
			`d_query: int,`
			`d_key: int,`
			`d_attention: int,`
			`location_filters: int,`
			`location_kernel_size: int):`
			`super().__init__()`

			`self.query_layer = nn.Linear(d_query, d_attention, bias_attr=False)`
			`self.key_layer = nn.Linear(d_key, d_attention, bias_attr=False)`
			`self.value = nn.Linear(d_attention, 1, bias_attr=False)`

			`# Location Layer`
			`self.location_conv = nn.Conv1D(`
			`2,`
			`location_filters,`
			`kernel_size=location_kernel_size,`
			`padding=int((location_kernel_size - 1) / 2),`
			`bias_attr=False,`
			`data_format='NLC')`
			`self.location_layer = nn.Linear(`
			`location_filters, d_attention, bias_attr=False)`

			`def forward(self,`
			`query,`
			`processed_key,`
			`value,`
			`attention_weights_cat,`
			`mask=None):`
			`"""Compute context vector and attention weights.`

			`Parameters`
			`-----------`
			`query : Tensor [shape=(batch_size, d_query)]`
			`The queries.`
			`processed_key : Tensor [shape=(batch_size, time_steps_k, d_attention)]`
			`The keys after linear layer.`
			`value : Tensor [shape=(batch_size, time_steps_k, d_key)]`
			`The values.`
			`attention_weights_cat : Tensor [shape=(batch_size, time_step_k, 2)]`
			`Attention weights concat.`
			`mask : Tensor, optional`
			`The mask. Shape should be (batch_size, times_steps_k, 1).`
			`Defaults to None.`

			`Returns`
			`----------`
			`attention_context : Tensor [shape=(batch_size, d_attention)]`
			`The context vector.`
			`attention_weights : Tensor [shape=(batch_size, time_steps_k)]`
			`The attention weights.`
			`"""`

			`processed_query = self.query_layer(paddle.unsqueeze(query, axis=[1]))`
			`processed_attention_weights = self.location_layer(`
			`self.location_conv(attention_weights_cat))`
			`# (B, T_enc, 1)`
			`alignment = self.value(`
			`paddle.tanh(processed_attention_weights + processed_key +`
			`processed_query))`

			`if mask is not None:`
			`alignment = alignment + (1.0 - mask) * -1e9`

			`attention_weights = F.softmax(alignment, axis=1)`
			`attention_context = paddle.matmul(`
			`attention_weights, value, transpose_x=True)`

			`attention_weights = paddle.squeeze(attention_weights, axis=-1)`
			`attention_context = paddle.squeeze(attention_context, axis=1)`

			`return attention_context, attention_weights`