From 8dd57ae7d183ad1e8113719c6b241181b8151d6b Mon Sep 17 00:00:00 2001
From: "ben.guo" <909336740@qq.com>
Date: Sat, 27 Apr 2024 11:36:36 +0800
Subject: [PATCH] Add the complete, reproducible code for Chapter 2
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 人人都能看懂的Transformer/code/llmcode-2.ipynb | 1 +
 1 file changed, 1 insertion(+)
 create mode 100644 人人都能看懂的Transformer/code/llmcode-2.ipynb

diff --git a/人人都能看懂的Transformer/code/llmcode-2.ipynb b/人人都能看懂的Transformer/code/llmcode-2.ipynb
new file mode 100644
--- /dev/null
+++ b/人人都能看懂的Transformer/code/llmcode-2.ipynb

Contents of llmcode-2.ipynb, cell by cell (Python 3.10.13, run on Kaggle; comments in English):

# In[6]:
from transformers import GPT2Tokenizer, GPT2Config, GPT2Model
import torch

# Initialize the tokenizer, create the model config, and initialize the model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
config = GPT2Config(vocab_size=len(tokenizer), n_embd=768, n_layer=12, n_head=12)
model = GPT2Model(config)
# The text to process; tokenize it and convert it to indices
text = "LLM with me"
inputs = tokenizer(text, return_tensors="pt")
input_ids = inputs["input_ids"]
print(input_ids)
# Get the model's embedding layer
embeddings = model.get_input_embeddings()
# Convert the indices to embedding vectors
input_embeddings = embeddings(input_ids)

print(input_embeddings)
print(input_embeddings.shape)

# Output:
# tensor([[3069,   44,  351,  502]])
# tensor([[[-0.0286,  0.0050, -0.0065,  ..., -0.0037,  0.0252, -0.0502],
#          [-0.0149, -0.0292,  0.0061,  ...,  0.0174,  0.0070,  0.0024],
#          [ 0.0027,  0.0040,  0.0024,  ..., -0.0140, -0.0124,  0.0112],
#          [-0.0026, -0.0169, -0.0193,  ...,  0.0246, -0.0027,  0.0186]]],
#        grad_fn=<EmbeddingBackward0>)
# torch.Size([1, 4, 768])
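The embedding layer used above is nothing more than a lookup table: row i of its weight matrix is the vector for token id i. A minimal sanity check (not part of the original notebook) makes that concrete:

# Indexing the weight matrix by token id gives exactly the vectors
# returned by calling the embedding layer.
rows = embeddings.weight[input_ids]            # shape [1, 4, 768]
print(torch.allclose(rows, input_embeddings))  # True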
# In[7]:
# from tokenizers import Tokenizer
# from tokenizers.models import BPE
# from tokenizers.trainers import BpeTrainer
# from tokenizers.pre_tokenizers import Whitespace

# # Create an empty BPE tokenizer, pre-tokenize on whitespace, and create a trainer
# tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
# tokenizer.pre_tokenizer = Whitespace()
# trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])

# # Prepare some training data; here we use a single sentence.
# # In practice you need a large amount of text.
# train_data = ["LLM with me"]

# # Train the tokenizer
# tokenizer.train_from_iterator(train_data, trainer)
# # Save the tokenizer to a file
# tokenizer.save("custom_tokenizer.json")
# # Test the tokenizer
# output = tokenizer.encode("LLM with me")
# print(output.tokens)

# In[8]:
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace

# Create an empty BPE tokenizer, pre-tokenize on whitespace, and create a trainer
tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
tokenizer.pre_tokenizer = Whitespace()
# Two special tokens: one for unknown words, one to mark the end of a sequence
trainer = BpeTrainer(special_tokens=["[UNK]", "</s>"])

# Prepare some training data; here we use a single sentence.
# In practice you need a large amount of text.
train_data = ["LLM with me"]
# Train the tokenizer
tokenizer.train_from_iterator(train_data, trainer)
# Save the tokenizer to a file
tokenizer.save("custom_tokenizer.json")
# Test the tokenizer
output = tokenizer.encode("LLM with me </s>")
print(output.tokens)

# Output:
# ['LLM', 'with', 'me', '</s>']

# In[9]:
from transformers import PreTrainedTokenizerFast

# Get the token indices of the text
input_ids = output.ids
print(input_ids)

# Load the custom tokenizer; encode the text and return PyTorch tensors
tokenizer = PreTrainedTokenizerFast(tokenizer_file="custom_tokenizer.json")
inputs = tokenizer(text, return_tensors="pt")
input_ids = inputs["input_ids"]
print(input_ids)

# Output:
# [14, 15, 12, 1]
# tensor([[14, 15, 12]])

# In[10]:
# Inspect the tokenizer's vocabulary
vocab = tokenizer.get_vocab()
print(vocab)

# Output:
# {'e': 4, 'it': 11, 'm': 7, 'h': 5, 'w': 9, 'LLM': 14, '[UNK]': 0, 'i': 6,
#  'LL': 10, 'wit': 13, 'M': 3, 'with': 15, 'L': 2, 't': 8, '</s>': 1, 'me': 12}
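The printed vocabulary records the BPE merge history: single characters like 'L', 'M', 'w' come first, then learned merges such as 'LL' -> 'LLM' and 'it' -> 'wit' -> 'with'. A quick round trip through the fast tokenizer (not part of the original notebook) shows the ids lining up with that table:

# PreTrainedTokenizerFast.encode returns a plain list of ids;
# decoding each id individually recovers the matching vocab entries.
ids = tokenizer.encode("LLM with me")
print(ids)                                   # [14, 15, 12]
print([tokenizer.decode([i]) for i in ids])  # ['LLM', 'with', 'me']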
# In[11]:
from transformers import PreTrainedTokenizerFast
from transformers import GPT2Config, GPT2Model

# Load the custom tokenizer
tokenizer = PreTrainedTokenizerFast(tokenizer_file="custom_tokenizer.json")

# Create the model config and initialize the model
config = GPT2Config(vocab_size=tokenizer.vocab_size, n_embd=768, n_layer=12, n_head=12)
model = GPT2Model(config)

# The text to process; tokenize it and convert it to indices
text = "LLM with me"
inputs = tokenizer(text, return_tensors="pt")
input_ids = inputs["input_ids"]

# Get the model's embedding layer and convert the indices to embedding vectors
embeddings = model.get_input_embeddings()
input_embeddings = embeddings(input_ids)
print(input_embeddings)
print(input_embeddings.shape)

# Output:
# tensor([[[-0.0054,  0.0349, -0.0085,  ..., -0.0360, -0.0266, -0.0049],
#          [-0.0047, -0.0010,  0.0164,  ..., -0.0157, -0.0245, -0.0222],
#          [-0.0183,  0.0165, -0.0246,  ..., -0.0089,  0.0305, -0.0066]]],
#        grad_fn=<EmbeddingBackward0>)
# torch.Size([1, 3, 768])

# In[12]:
import torch
from transformers import PreTrainedTokenizerFast

# Load the custom tokenizer
tokenizer = PreTrainedTokenizerFast(tokenizer_file="custom_tokenizer.json")
# The text to process; tokenize it and convert it to indices
text = "LLM with me"
inputs = tokenizer(text, return_tensors="pt")
input_ids = inputs["input_ids"]

# Our custom tokenizer has a very small vocabulary
vocab_size = tokenizer.vocab_size  # vocabulary size from the custom tokenizer
n_embd = 10  # set the embedding dimension to 10
# Create a randomly initialized embedding matrix. We draw from a normal
# distribution, which is similar to how real models are initialized.
embedding_matrix = torch.randn(vocab_size, n_embd)
token_indices = input_ids[0]  # input_ids is a tensor of token indices
token_embeddings = embedding_matrix[token_indices]
print(token_embeddings)
print(token_embeddings.shape)

# Output:
# tensor([[-0.0734,  0.9592,  1.3814,  2.3693,  2.3262, -1.0959,  0.7059, -0.0506,
#          -0.0729, -1.1791],
#         [-0.5122,  0.6106, -0.3071,  0.4347,  0.2445,  2.0369,  0.3645, -0.4135,
#          -0.5863,  1.2864],
#         [-2.0330,  0.1906, -0.1990, -0.4726,  2.1953,  1.0321, -2.0593, -0.5847,
#          -0.3605, -1.9308]])
# torch.Size([3, 10])

# In[13]:
print(embedding_matrix[13])
print(embedding_matrix[14])
print(embedding_matrix[11])

# Output:
# tensor([ 0.8079,  0.3908, -1.4927,  0.7122,  1.9811, -0.1907, -0.7898, -0.7037,
#          2.5170,  1.1116])
# tensor([-0.0734,  0.9592,  1.3814,  2.3693,  2.3262, -1.0959,  0.7059, -0.0506,
#         -0.0729, -1.1791])
# tensor([-1.0125, -0.0436, -0.1076, -0.2520,  1.0112, -1.5468, -0.2592, -1.1768,
#          1.1942, -0.6323])
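The next cell swaps torch.randn for Xavier initialization. The difference is scale: randn samples from N(0, 1), while Xavier uniform samples from U(-a, a) with a = sqrt(6 / (fan_in + fan_out)), which for this 16 x 10 matrix is about 0.48; that is why every value printed below stays inside +/-0.48. A quick check (not part of the original notebook):

import math
a = math.sqrt(6 / (16 + 10))   # vocab_size=16, n_embd=10
print(round(a, 2))             # 0.48
print(embedding_matrix.std())  # roughly 1.0 for the N(0, 1) matrix above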
# In[14]:
import torch
from transformers import PreTrainedTokenizerFast

# Load the custom tokenizer
tokenizer = PreTrainedTokenizerFast(tokenizer_file="custom_tokenizer.json")
# The text to process; tokenize it and convert it to indices
text = "LLM with me"
inputs = tokenizer(text, return_tensors="pt")
input_ids = inputs["input_ids"]
vocab_size = tokenizer.vocab_size; n_embd = 10
# Create a randomly initialized embedding matrix, this time with Xavier initialization
embedding_matrix = torch.empty(vocab_size, n_embd)
torch.nn.init.xavier_uniform_(embedding_matrix)  # Xavier uniform initialization

token_indices = input_ids[0]  # input_ids is a tensor of token indices
token_embeddings = embedding_matrix[token_indices]
print(token_embeddings)
print(token_embeddings.shape)

# Output:
# tensor([[-0.0731, -0.0870, -0.0354, -0.1912,  0.0068,  0.1697, -0.3924,  0.0451,
#          -0.1344,  0.1391],
#         [-0.1168,  0.2037, -0.2497,  0.3482, -0.0643, -0.1626,  0.0857, -0.3652,
#           0.1890, -0.2604],
#         [ 0.1953, -0.1465, -0.0192,  0.3124,  0.4406, -0.4748, -0.3842, -0.3123,
#          -0.4185, -0.1526]])
# torch.Size([3, 10])

# In[15]:
print(embedding_matrix[13])
print(embedding_matrix[14])
print(embedding_matrix[11])

# Output:
# tensor([-0.3125,  0.1946, -0.0692, -0.3138, -0.1735,  0.0426, -0.3563, -0.1339,
#          0.2658, -0.1542])
# tensor([-0.0731, -0.0870, -0.0354, -0.1912,  0.0068,  0.1697, -0.3924,  0.0451,
#         -0.1344,  0.1391])
# tensor([ 0.2595, -0.1769,  0.2184, -0.1655,  0.1156,  0.3385, -0.1067, -0.3338,
#         -0.3268, -0.2799])

# In[16]:
import torch
import torch.nn as nn
import torch.optim as optim

vocab_size = tokenizer.vocab_size; n_embd = 10
embedding_matrix = torch.empty(vocab_size, n_embd)
nn.init.xavier_uniform_(embedding_matrix)

# Define a simplified GPT model
class SimpleGPT(nn.Module):
    def __init__(self, vocab_size, n_embd):
        super(SimpleGPT, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, n_embd)
        self.ffn = nn.Linear(n_embd, n_embd)
        self.logits = nn.Linear(n_embd, vocab_size)
        nn.init.xavier_uniform_(self.embeddings.weight)  # Xavier-initialize the embedding layer

    def forward(self, input_ids):
        x = self.embeddings(input_ids)  # embedding layer
        x = self.ffn(x)                 # feed-forward network
        logits = self.logits(x)         # output layer
        return logits

# Create the model instance; define the loss function and optimizer
model = SimpleGPT(vocab_size, n_embd)
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Assume we have some training data
input_ids = torch.tensor([[1, 2, 3, 4], [2, 3, 4, 5]])  # example inputs
labels = torch.tensor([[2, 3, 4, 5], [3, 4, 5, 6]])     # example targets

# Training loop
for epoch in range(100):  # train for 100 epochs
    logits = model(input_ids)  # forward pass
    loss = loss_fn(logits.view(-1, vocab_size), labels.view(-1))  # compute the loss
    # Backward pass
    optimizer.zero_grad()
    loss.backward()
    # Gradient descent step
    optimizer.step()
    # Print the loss every 10 epochs
    if (epoch + 1) % 10 == 0:
        print(f'Epoch {epoch + 1}, Loss: {loss.item()}')

# Output:
# Epoch 10, Loss: 2.7324118614196777
# Epoch 20, Loss: 2.657238245010376
# Epoch 30, Loss: 2.580580472946167
# Epoch 40, Loss: 2.500619888305664
# Epoch 50, Loss: 2.415583848953247
# Epoch 60, Loss: 2.3237650394439697
# Epoch 70, Loss: 2.2237038612365723
# Epoch 80, Loss: 2.1143651008605957
# Epoch 90, Loss: 1.9953210353851318
# Epoch 100, Loss: 1.8669122457504272
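Those loss values pass a simple sanity check (not from the notebook): a model that knows nothing about a 16-token vocabulary should start near the cross-entropy of uniform guessing, ln(16) ≈ 2.77, and the curve indeed begins just under that and falls steadily:

import math
print(math.log(16))  # 2.772588722239781 -- the "knows nothing" baseline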
# In[17]:
token_indices = input_ids[0]
token_embeddings = model.embeddings(token_indices)
print(token_embeddings)
print(token_embeddings.shape)

# Output:
# tensor([[ 0.3168,  0.1598, -0.2357, -0.1286,  0.4422,  0.0902, -0.2156,  0.1508,
#          -0.3751, -0.4240],
#         [ 0.3838, -0.2698,  0.2582, -0.1764,  0.4416, -0.0557,  0.5702,  0.3589,
#          -0.0439,  0.4755],
#         [ 0.0883, -0.5616, -0.4737, -0.1625,  0.4614, -0.1707, -0.3864, -0.3232,
#          -0.1757,  0.2665],
#         [-0.4491,  0.5912,  0.0080,  0.0760,  0.0837, -0.4634, -0.5850, -0.4476,
#          -0.4615, -0.2961]], grad_fn=<EmbeddingBackward0>)
# torch.Size([4, 10])

# In[18]:
import torch.nn.functional as F

# Assume model is the trained model instance and tokenizer is the loaded tokenizer
model.eval()  # put the model in evaluation mode
input_text = "LLM with me"  # input text
input_ids = tokenizer.encode(input_text, return_tensors="pt")  # encode the text as token indices
temperature = 0.7  # temperature parameter, typically set between 0 and 1
generated_text = input_text + " A:"
for _ in range(50):  # say we want to generate 50 tokens
    with torch.no_grad():  # no gradient computation needed
        logits = model(input_ids)
        logits = logits / temperature  # apply the temperature
        # Softmax turns the logits into a probability distribution;
        # sample the next token from that distribution
        probabilities = F.softmax(logits[:, -1, :], dim=-1)
        predicted_id = torch.multinomial(probabilities, num_samples=1)
        # Append the predicted token to the input sequence;
        # decode it to text and append it to the generated text
        input_ids = torch.cat((input_ids, predicted_id), dim=1)
        generated_text += tokenizer.decode(predicted_id[0])

print(generated_text)
eos_token = '</s>'  # after generation, cut the text at the first </s>
generated_text_parts = generated_text.split(eos_token)
final_text = generated_text_parts[0] + eos_token if len(generated_text_parts) > 1 else generated_text_parts[0]
print(final_text)

# Output:
# LLM with me A:[UNK]iLLwitMimeLMethihitMehw</s>LLMwitMmmMwitLLLLLLMLMLMiwitLLmLwithe[UNK]LLhi[UNK]witLLM
# LLM with me A:[UNK]iLLwitMimeLMethihitMehw</s>
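The division by temperature in the loop above is the knob that trades determinism for variety. A tiny standalone illustration (not part of the original notebook):

import torch
import torch.nn.functional as F

logits = torch.tensor([2.0, 1.0, 0.5])
for t in (0.5, 0.7, 1.0, 2.0):
    # Lower temperature sharpens the distribution toward the argmax;
    # higher temperature flattens it toward uniform.
    print(t, F.softmax(logits / t, dim=-1).tolist())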