In [15]:
import numpy as np

np.random.seed(0)  # Fix the random seed so the results are reproducible

# Toy setup: the vector width is shrunk from 768 to 8 to keep the numbers
# readable; the 4 rows of X are still the 4 tokens of "LLM with me".
X = np.random.randn(4, 8)  # token vectors, shape [4, 8]
W = np.random.randn(8, 1)  # weight matrix W, shape [8, 1]
b = np.random.randn(1)  # bias vector b, shape [1]

# Linear transform Y = XW + b: np.dot performs the matrix product, then the
# bias is broadcast-added to every row, giving a result of shape [4, 1].
Y = np.dot(X, W) + b
# Drop only the trailing length-1 axis to obtain shape [4,]. Naming the axis
# matters: a bare np.squeeze(Y) removes *every* length-1 axis, so an input with
# a single token would collapse to a 0-d scalar instead of shape [1,].
Y = np.squeeze(Y, axis=-1)

print("Input X shape:", X.shape)
print("Weight W shape:", W.shape)
print("Bias b shape:", b.shape)
print("Output Y shape:", Y.shape)
print("Output Y:", Y)

Input X shape: (4, 8)
Weight W shape: (8, 1)
Bias b shape: (1,)
Output Y shape: (4,)
Output Y: [-2.59709604 -0.78316274 -4.6765379   3.25016417]


In [7]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# Seed PyTorch's RNG: the embedding weights, the stand-in transformer output
# and the linear layer's weights below are all drawn at random, so without a
# seed the printed probabilities change on every run.
torch.manual_seed(0)

# Toy vocabulary and word embeddings
vocab = {'LLM': 0, 'with': 1, 'me': 2, '<PAD>': 3}  # a simplified vocabulary
vocab_size = len(vocab)  # vocabulary size
embedding_dim = 768  # embedding width, same as the small GPT-2 model

text = "LLM with me"
# Map each whitespace-separated word to its id; the outer list adds a batch
# axis, so input_ids has shape [1, 3].
input_ids = torch.tensor([[vocab[word] for word in text.split()]], dtype=torch.long)

# Simulate the Transformer pipeline
embedding_layer = nn.Embedding(vocab_size, embedding_dim)
embedded = embedding_layer(input_ids)
# Stand-in for the Transformer output: uniform random values with the same
# shape as the embeddings (only the shape of `embedded` is used here).
transformer_output = torch.rand(embedded.size())
# Linear layer that maps the Transformer output into vocabulary space
linear_layer = nn.Linear(embedding_dim, vocab_size)
vocab_space_scores = linear_layer(transformer_output)
# Turn the scores into a probability distribution over the vocabulary
probabilities = F.softmax(vocab_space_scores, dim=-1)
print(probabilities)

tensor([[[0.2306, 0.2478, 0.2688, 0.2528],
         [0.1928, 0.3077, 0.2768, 0.2227],
         [0.2562, 0.2568, 0.2837, 0.2033]]], grad_fn=<SoftmaxBackward0>)


In [8]:
# Display the raw scores produced by linear_layer, i.e. the values that were
# passed to F.softmax to obtain `probabilities`.
vocab_space_scores

tensor([[[-0.1818, -0.1099, -0.0287, -0.0901],
         [-0.6043, -0.1369, -0.2430, -0.4603],
         [-0.1816, -0.1795, -0.0799, -0.4130]]], grad_fn=<ViewBackward0>)