# Transformer里的原始方法

In [99]:
import numpy as np
import torch

def get_positional_encoding(max_seq_len, d_model):
    """Build a fixed sinusoidal positional-encoding table.

    For every position ``pos >= 1`` and column ``j`` the angle is
    ``pos / 10000 ** (2 * (j // 2) / d_model)``; even columns hold the sine of
    that angle and odd columns hold the cosine.

    Row 0 is left as all zeros rather than sin(0)/cos(0), so odd columns of
    row 0 are 0 and not 1.
    NOTE(review): presumably position 0 is reserved (e.g. for padding) --
    confirm with the caller before relying on it.

    Args:
        max_seq_len: Number of positions (rows). ``0`` yields an empty table
            instead of raising.
        d_model: Number of columns.

    Returns:
        A ``torch.FloatTensor`` of shape ``(max_seq_len, d_model)``.
    """
    # Column vector of positions broadcast against a row vector of exponents:
    # replaces the per-element Python loops with a single array expression.
    positions = np.arange(max_seq_len, dtype=np.float64)[:, np.newaxis]
    exponents = 2 * (np.arange(d_model) // 2) / d_model
    angles = positions / np.power(10000, exponents)

    # Start from zeros so row 0 stays untouched, then fill rows 1..n-1.
    position_enc = np.zeros((max_seq_len, d_model), dtype=np.float64)
    position_enc[1:, 0::2] = np.sin(angles[1:, 0::2])  # dim 2i
    position_enc[1:, 1::2] = np.cos(angles[1:, 1::2])  # dim 2i+1
    return torch.from_numpy(position_enc).type(torch.FloatTensor)

# Example setup: a model dimension of 768 and a maximum sequence length of 1024.
max_seq_len, d_model = 1024, 768
positional_encoding = get_positional_encoding(max_seq_len=max_seq_len, d_model=d_model)
print(positional_encoding)

tensor([[ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00],
        [ 8.4147e-01,  5.4030e-01,  8.2843e-01,  ...,  1.0000e+00,
          1.0243e-04,  1.0000e+00],
        [ 9.0930e-01, -4.1615e-01,  9.2799e-01,  ...,  1.0000e+00,
          2.0486e-04,  1.0000e+00],
        ...,
        [ 1.7612e-02, -9.9984e-01, -7.9410e-01,  ...,  9.9427e-01,
          1.0439e-01,  9.9454e-01],
        [-8.3182e-01, -5.5504e-01, -9.4828e-01,  ...,  9.9426e-01,
          1.0449e-01,  9.9453e-01],
        [-9.1649e-01,  4.0007e-01, -2.6814e-01,  ...,  9.9425e-01,
          1.0459e-01,  9.9452e-01]])


In [101]:
# Show the first 10 columns of the encoding rows for positions 13, 14 and 11,
# one tensor per line.
print(*(positional_encoding[pos][:10] for pos in (13, 14, 11)), sep="\n")

tensor([ 0.4202,  0.9074,  0.1252,  0.9921, -0.1744,  0.9847, -0.4519,  0.8921,
        -0.6858,  0.7278])
tensor([0.9906, 0.1367, 0.8920, 0.4520, 0.7018, 0.7124, 0.4454, 0.8953, 0.1523,
        0.9883])
tensor([-1.0000,  0.0044, -0.9673, -0.2535, -0.8724, -0.4889, -0.7253, -0.6884,
        -0.5387, -0.8425])


# GPT-2的位置编码方法

In [102]:
import tensorflow as tf

class HParams:
    """Plain attribute bag: every keyword argument becomes an instance attribute."""

    def __init__(self, **kwargs):
        # Write straight into the instance dictionary, one entry per keyword.
        attrs = vars(self)
        for key, value in kwargs.items():
            attrs[key] = value

def positions_for(tokens, past_length):
    """Return position ids for each token, offset by ``past_length``.

    The first two entries of ``tf.shape(tokens)`` are read as batch size and
    number of steps. The result repeats ``past_length + [0, 1, ..., nsteps-1]``
    once per batch row.
    """
    token_shape = tf.shape(tokens)
    batch_size, nsteps = token_shape[0], token_shape[1]
    # One row of offsets with a leading axis, then tiled along that axis.
    single_row = tf.expand_dims(past_length + tf.range(nsteps), 0)
    return tf.tile(single_row, [batch_size, 1])

def position_embedding(hparams, position_ids, wpe=None):
    """Look up position embeddings for ``position_ids``.

    Args:
        hparams: Object exposing ``n_ctx`` and ``n_embd``; only read when
            ``wpe`` is not supplied.
        position_ids: Integer indices handed to ``tf.gather``.
        wpe: Optional existing embedding table to gather from. Pass the same
            table on every call to get consistent embeddings for a position.
            When omitted, a new table of shape ``[n_ctx, n_embd]`` is drawn
            from a normal distribution with ``stddev=0.01``.

    Returns:
        The result of ``tf.gather(wpe, position_ids)``.
    """
    if wpe is None:
        # A fresh random table is created on each call that does not pass
        # ``wpe``, so two such calls return different embeddings for the same
        # position ids.
        wpe = tf.Variable(
            tf.random.normal([hparams.n_ctx, hparams.n_embd], stddev=0.01),
            name='wpe',
        )
    return tf.gather(wpe, position_ids)

# Hyperparameters for the model
hparams = HParams(n_vocab=0, n_ctx=1024, n_embd=768, n_head=12, n_layer=12)

# One batch row of four token ids, with no past context.
past_length = tf.constant(0)
input_tokens = tf.constant([[0, 1, 2, 3]], dtype=tf.int32)

position_ids = positions_for(tokens=input_tokens, past_length=past_length)
position_embeddings = position_embedding(hparams=hparams, position_ids=position_ids)
print(position_embeddings)

tf.Tensor(
[[[ 0.01702908  0.00268412  0.01296544 ...  0.00706888  0.00186165
    0.01521429]
  [ 0.00431    -0.01150406  0.01421692 ... -0.00568195  0.00935402
    0.01863918]
  [-0.00091886 -0.00914316 -0.0180154  ...  0.00033014  0.00344726
    0.01064758]
  [ 0.00253335 -0.01882706  0.00029727 ...  0.0026667  -0.00202818
   -0.00463023]]], shape=(1, 4, 768), dtype=float32)


# 获取位置编码，并与词嵌入向量相加（同一位置逐元素相加）

In [86]:
import torch
from transformers import GPT2Tokenizer, GPT2Model

# Load the pretrained GPT-2 tokenizer and model.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2Model.from_pretrained('gpt2')

# Text to process.
text = "LLM with me"

# Tokenize and convert to index tensors.
inputs = tokenizer(text, return_tensors="pt")
input_ids = inputs["input_ids"]

# Fetch the model's input embedding layer and map the indices to vectors.
embeddings = model.get_input_embeddings()
input_embeddings = embeddings(input_ids)

# Position ids 0..seq_len-1 with a leading batch axis, created on the same
# device as the input ids, then looked up in the model's position table.
position_ids = torch.arange(input_ids.size(1), device=input_ids.device).unsqueeze(0)
position_embeddings = model.wpe(position_ids)

# Add position embeddings to word embeddings to get the final input embeddings.
final_embeddings = input_embeddings + position_embeddings

# Inspect the final input embeddings and their shape.
print(final_embeddings, final_embeddings.shape, sep="\n")

tensor([[[ 0.2321, -0.3849,  0.1550,  ...,  0.0664,  0.1922,  0.3908],
         [ 0.0081, -0.1923,  0.1255,  ..., -0.0160,  0.1091, -0.0756],
         [ 0.0686, -0.0744,  0.0838,  ...,  0.0598,  0.1280,  0.0136],
         [ 0.1512, -0.0985,  0.1991,  ..., -0.1582,  0.1241,  0.0501]]],
       grad_fn=<AddBackward0>)
torch.Size([1, 4, 768])


In [87]:
# Spot check at index [0][0][0]: print the summed embedding, then the word
# embedding plus the position embedding at the same index, for comparison.
print(final_embeddings[0][0][0])
print(input_embeddings[0][0][0] + position_embeddings[0][0][0])

tensor(0.2321, grad_fn=<SelectBackward0>)
tensor(0.2321, grad_fn=<AddBackward0>)


In [88]:
# Same spot check at index [0][1][1]: the summed embedding, then the word
# embedding plus the position embedding at that index.
print(final_embeddings[0][1][1])
print(input_embeddings[0][1][1] + position_embeddings[0][1][1])

tensor(-0.1923, grad_fn=<SelectBackward0>)
tensor(-0.1923, grad_fn=<AddBackward0>)
