From 8dd57ae7d183ad1e8113719c6b241181b8151d6b Mon Sep 17 00:00:00 2001
From: "ben.guo" <909336740@qq.com>
Date: Sat, 27 Apr 2024 11:36:36 +0800
Subject: [PATCH] Add the complete, reproducible code for Chapter 2
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 人人都能看懂的Transformer/code/llmcode-2.ipynb | 1 +
 1 file changed, 1 insertion(+)
 create mode 100644 人人都能看懂的Transformer/code/llmcode-2.ipynb

diff --git a/人人都能看懂的Transformer/code/llmcode-2.ipynb b/人人都能看懂的Transformer/code/llmcode-2.ipynb
new file mode 100644
--- /dev/null
+++ b/人人都能看懂的Transformer/code/llmcode-2.ipynb

Contents of llmcode-2.ipynb, cell by cell (Python 3.10.13, run on Kaggle; comments in English):

# In[6]:
from transformers import GPT2Tokenizer, GPT2Config, GPT2Model
import torch

# Initialize the tokenizer, create the model config, and initialize the model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
config = GPT2Config(vocab_size=len(tokenizer), n_embd=768, n_layer=12, n_head=12)
model = GPT2Model(config)
# The text to process; tokenize it and convert it to indices
text = "LLM with me"
inputs = tokenizer(text, return_tensors="pt")
input_ids = inputs["input_ids"]
print(input_ids)
# Get the model's embedding layer
embeddings = model.get_input_embeddings()
# Convert the indices to embedding vectors
input_embeddings = embeddings(input_ids)

print(input_embeddings)
print(input_embeddings.shape)

# Output:
# tensor([[3069,   44,  351,  502]])
# tensor([[[-0.0286,  0.0050, -0.0065,  ..., -0.0037,  0.0252, -0.0502],
#          [-0.0149, -0.0292,  0.0061,  ...,  0.0174,  0.0070,  0.0024],
#          [ 0.0027,  0.0040,  0.0024,  ..., -0.0140, -0.0124,  0.0112],
#          [-0.0026, -0.0169, -0.0193,  ...,  0.0246, -0.0027,  0.0186]]],
#        grad_fn=<EmbeddingBackward0>)
# torch.Size([1, 4, 768])
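The embedding layer used above is nothing more than a lookup table: row i of its weight matrix is the vector for token id i. A minimal sanity check (not part of the original notebook) makes that concrete:

# Indexing the weight matrix by token id gives exactly the vectors
# returned by calling the embedding layer.
rows = embeddings.weight[input_ids]            # shape [1, 4, 768]
print(torch.allclose(rows, input_embeddings))  # True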
# In[7]:
# from tokenizers import Tokenizer
# from tokenizers.models import BPE
# from tokenizers.trainers import BpeTrainer
# from tokenizers.pre_tokenizers import Whitespace

# # Create an empty BPE tokenizer, pre-tokenize on whitespace, and create a trainer
# tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
# tokenizer.pre_tokenizer = Whitespace()
# trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])

# # Prepare some training data; here we use a single sentence.
# # In practice you need a large amount of text.
# train_data = ["LLM with me"]

# # Train the tokenizer
# tokenizer.train_from_iterator(train_data, trainer)
# # Save the tokenizer to a file
# tokenizer.save("custom_tokenizer.json")
# # Test the tokenizer
# output = tokenizer.encode("LLM with me")
# print(output.tokens)

# In[8]:
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace

# Create an empty BPE tokenizer, pre-tokenize on whitespace, and create a trainer
tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
tokenizer.pre_tokenizer = Whitespace()
# Two special tokens: one for unknown words, one to mark the end of a sequence
trainer = BpeTrainer(special_tokens=["[UNK]", "</s>"])

# Prepare some training data; here we use a single sentence.
# In practice you need a large amount of text.
train_data = ["LLM with me"]
# Train the tokenizer
tokenizer.train_from_iterator(train_data, trainer)
# Save the tokenizer to a file
tokenizer.save("custom_tokenizer.json")
# Test the tokenizer
output = tokenizer.encode("LLM with me </s>")
print(output.tokens)

# Output:
# ['LLM', 'with', 'me', '</s>']

# In[9]:
from transformers import PreTrainedTokenizerFast

# Get the token indices of the text
input_ids = output.ids
print(input_ids)

# Load the custom tokenizer; encode the text and return PyTorch tensors
tokenizer = PreTrainedTokenizerFast(tokenizer_file="custom_tokenizer.json")
inputs = tokenizer(text, return_tensors="pt")
input_ids = inputs["input_ids"]
print(input_ids)

# Output:
# [14, 15, 12, 1]
# tensor([[14, 15, 12]])

# In[10]:
# Inspect the tokenizer's vocabulary
vocab = tokenizer.get_vocab()
print(vocab)

# Output:
# {'e': 4, 'it': 11, 'm': 7, 'h': 5, 'w': 9, 'LLM': 14, '[UNK]': 0, 'i': 6,
#  'LL': 10, 'wit': 13, 'M': 3, 'with': 15, 'L': 2, 't': 8, '</s>': 1, 'me': 12}
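The printed vocabulary records the BPE merge history: single characters like 'L', 'M', 'w' come first, then learned merges such as 'LL' -> 'LLM' and 'it' -> 'wit' -> 'with'. A quick round trip through the fast tokenizer (not part of the original notebook) shows the ids lining up with that table:

# PreTrainedTokenizerFast.encode returns a plain list of ids;
# decoding each id individually recovers the matching vocab entries.
ids = tokenizer.encode("LLM with me")
print(ids)                                   # [14, 15, 12]
print([tokenizer.decode([i]) for i in ids])  # ['LLM', 'with', 'me']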
# In[11]:
from transformers import PreTrainedTokenizerFast
from transformers import GPT2Config, GPT2Model

# Load the custom tokenizer
tokenizer = PreTrainedTokenizerFast(tokenizer_file="custom_tokenizer.json")

# Create the model config and initialize the model
config = GPT2Config(vocab_size=tokenizer.vocab_size, n_embd=768, n_layer=12, n_head=12)
model = GPT2Model(config)

# The text to process; tokenize it and convert it to indices
text = "LLM with me"
inputs = tokenizer(text, return_tensors="pt")
input_ids = inputs["input_ids"]

# Get the model's embedding layer and convert the indices to embedding vectors
embeddings = model.get_input_embeddings()
input_embeddings = embeddings(input_ids)
print(input_embeddings)
print(input_embeddings.shape)

# Output:
# tensor([[[-0.0054,  0.0349, -0.0085,  ..., -0.0360, -0.0266, -0.0049],
#          [-0.0047, -0.0010,  0.0164,  ..., -0.0157, -0.0245, -0.0222],
#          [-0.0183,  0.0165, -0.0246,  ..., -0.0089,  0.0305, -0.0066]]],
#        grad_fn=<EmbeddingBackward0>)
# torch.Size([1, 3, 768])

# In[12]:
import torch
from transformers import PreTrainedTokenizerFast

# Load the custom tokenizer
tokenizer = PreTrainedTokenizerFast(tokenizer_file="custom_tokenizer.json")
# The text to process; tokenize it and convert it to indices
text = "LLM with me"
inputs = tokenizer(text, return_tensors="pt")
input_ids = inputs["input_ids"]

# Our custom tokenizer has a very small vocabulary
vocab_size = tokenizer.vocab_size  # vocabulary size from the custom tokenizer
n_embd = 10  # set the embedding dimension to 10
# Create a randomly initialized embedding matrix. We draw from a normal
# distribution, which is similar to how real models are initialized.
embedding_matrix = torch.randn(vocab_size, n_embd)
token_indices = input_ids[0]  # input_ids is a tensor of token indices
token_embeddings = embedding_matrix[token_indices]
print(token_embeddings)
print(token_embeddings.shape)

# Output:
# tensor([[-0.0734,  0.9592,  1.3814,  2.3693,  2.3262, -1.0959,  0.7059, -0.0506,
#          -0.0729, -1.1791],
#         [-0.5122,  0.6106, -0.3071,  0.4347,  0.2445,  2.0369,  0.3645, -0.4135,
#          -0.5863,  1.2864],
#         [-2.0330,  0.1906, -0.1990, -0.4726,  2.1953,  1.0321, -2.0593, -0.5847,
#          -0.3605, -1.9308]])
# torch.Size([3, 10])

# In[13]:
print(embedding_matrix[13])
print(embedding_matrix[14])
print(embedding_matrix[11])

# Output:
# tensor([ 0.8079,  0.3908, -1.4927,  0.7122,  1.9811, -0.1907, -0.7898, -0.7037,
#          2.5170,  1.1116])
# tensor([-0.0734,  0.9592,  1.3814,  2.3693,  2.3262, -1.0959,  0.7059, -0.0506,
#         -0.0729, -1.1791])
# tensor([-1.0125, -0.0436, -0.1076, -0.2520,  1.0112, -1.5468, -0.2592, -1.1768,
#          1.1942, -0.6323])
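The next cell swaps torch.randn for Xavier initialization. The difference is scale: randn samples from N(0, 1), while Xavier uniform samples from U(-a, a) with a = sqrt(6 / (fan_in + fan_out)), which for this 16 x 10 matrix is about 0.48; that is why every value printed below stays inside +/-0.48. A quick check (not part of the original notebook):

import math
a = math.sqrt(6 / (16 + 10))   # vocab_size=16, n_embd=10
print(round(a, 2))             # 0.48
print(embedding_matrix.std())  # roughly 1.0 for the N(0, 1) matrix above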
# In[14]:
import torch
from transformers import PreTrainedTokenizerFast

# Load the custom tokenizer
tokenizer = PreTrainedTokenizerFast(tokenizer_file="custom_tokenizer.json")
# The text to process; tokenize it and convert it to indices
text = "LLM with me"
inputs = tokenizer(text, return_tensors="pt")
input_ids = inputs["input_ids"]
vocab_size = tokenizer.vocab_size; n_embd = 10
# Create a randomly initialized embedding matrix, this time with Xavier initialization
embedding_matrix = torch.empty(vocab_size, n_embd)
torch.nn.init.xavier_uniform_(embedding_matrix)  # Xavier uniform initialization

token_indices = input_ids[0]  # input_ids is a tensor of token indices
token_embeddings = embedding_matrix[token_indices]
print(token_embeddings)
print(token_embeddings.shape)

# Output:
# tensor([[-0.0731, -0.0870, -0.0354, -0.1912,  0.0068,  0.1697, -0.3924,  0.0451,
#          -0.1344,  0.1391],
#         [-0.1168,  0.2037, -0.2497,  0.3482, -0.0643, -0.1626,  0.0857, -0.3652,
#           0.1890, -0.2604],
#         [ 0.1953, -0.1465, -0.0192,  0.3124,  0.4406, -0.4748, -0.3842, -0.3123,
#          -0.4185, -0.1526]])
# torch.Size([3, 10])

# In[15]:
print(embedding_matrix[13])
print(embedding_matrix[14])
print(embedding_matrix[11])

# Output:
# tensor([-0.3125,  0.1946, -0.0692, -0.3138, -0.1735,  0.0426, -0.3563, -0.1339,
#          0.2658, -0.1542])
# tensor([-0.0731, -0.0870, -0.0354, -0.1912,  0.0068,  0.1697, -0.3924,  0.0451,
#         -0.1344,  0.1391])
# tensor([ 0.2595, -0.1769,  0.2184, -0.1655,  0.1156,  0.3385, -0.1067, -0.3338,
#         -0.3268, -0.2799])

# In[16]:
import torch
import torch.nn as nn
import torch.optim as optim

vocab_size = tokenizer.vocab_size; n_embd = 10
embedding_matrix = torch.empty(vocab_size, n_embd)
nn.init.xavier_uniform_(embedding_matrix)

# Define a simplified GPT model
class SimpleGPT(nn.Module):
    def __init__(self, vocab_size, n_embd):
        super(SimpleGPT, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, n_embd)
        self.ffn = nn.Linear(n_embd, n_embd)
        self.logits = nn.Linear(n_embd, vocab_size)
        nn.init.xavier_uniform_(self.embeddings.weight)  # Xavier-initialize the embedding layer

    def forward(self, input_ids):
        x = self.embeddings(input_ids)  # embedding layer
        x = self.ffn(x)                 # feed-forward network
        logits = self.logits(x)         # output layer
        return logits

# Create the model instance; define the loss function and optimizer
model = SimpleGPT(vocab_size, n_embd)
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Assume we have some training data
input_ids = torch.tensor([[1, 2, 3, 4], [2, 3, 4, 5]])  # example inputs
labels = torch.tensor([[2, 3, 4, 5], [3, 4, 5, 6]])     # example targets

# Training loop
for epoch in range(100):  # train for 100 epochs
    logits = model(input_ids)  # forward pass
    loss = loss_fn(logits.view(-1, vocab_size), labels.view(-1))  # compute the loss
    # Backward pass
    optimizer.zero_grad()
    loss.backward()
    # Gradient descent step
    optimizer.step()
    # Print the loss every 10 epochs
    if (epoch + 1) % 10 == 0:
        print(f'Epoch {epoch + 1}, Loss: {loss.item()}')

# Output:
# Epoch 10, Loss: 2.7324118614196777
# Epoch 20, Loss: 2.657238245010376
# Epoch 30, Loss: 2.580580472946167
# Epoch 40, Loss: 2.500619888305664
# Epoch 50, Loss: 2.415583848953247
# Epoch 60, Loss: 2.3237650394439697
# Epoch 70, Loss: 2.2237038612365723
# Epoch 80, Loss: 2.1143651008605957
# Epoch 90, Loss: 1.9953210353851318
# Epoch 100, Loss: 1.8669122457504272
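Those loss values pass a simple sanity check (not from the notebook): a model that knows nothing about a 16-token vocabulary should start near the cross-entropy of uniform guessing, ln(16) ≈ 2.77, and the curve indeed begins just under that and falls steadily:

import math
print(math.log(16))  # 2.772588722239781 -- the "knows nothing" baseline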
# In[17]:
token_indices = input_ids[0]
token_embeddings = model.embeddings(token_indices)
print(token_embeddings)
print(token_embeddings.shape)

# Output:
# tensor([[ 0.3168,  0.1598, -0.2357, -0.1286,  0.4422,  0.0902, -0.2156,  0.1508,
#          -0.3751, -0.4240],
#         [ 0.3838, -0.2698,  0.2582, -0.1764,  0.4416, -0.0557,  0.5702,  0.3589,
#          -0.0439,  0.4755],
#         [ 0.0883, -0.5616, -0.4737, -0.1625,  0.4614, -0.1707, -0.3864, -0.3232,
#          -0.1757,  0.2665],
#         [-0.4491,  0.5912,  0.0080,  0.0760,  0.0837, -0.4634, -0.5850, -0.4476,
#          -0.4615, -0.2961]], grad_fn=<EmbeddingBackward0>)
# torch.Size([4, 10])

# In[18]:
import torch.nn.functional as F

# Assume model is the trained model instance and tokenizer is the loaded tokenizer
model.eval()  # put the model in evaluation mode
input_text = "LLM with me"  # input text
input_ids = tokenizer.encode(input_text, return_tensors="pt")  # encode the text as token indices
temperature = 0.7  # temperature parameter, typically set between 0 and 1
generated_text = input_text + " A:"
for _ in range(50):  # say we want to generate 50 tokens
    with torch.no_grad():  # no gradient computation needed
        logits = model(input_ids)
        logits = logits / temperature  # apply the temperature
        # Softmax turns the logits into a probability distribution;
        # sample the next token from that distribution
        probabilities = F.softmax(logits[:, -1, :], dim=-1)
        predicted_id = torch.multinomial(probabilities, num_samples=1)
        # Append the predicted token to the input sequence;
        # decode it to text and append it to the generated text
        input_ids = torch.cat((input_ids, predicted_id), dim=1)
        generated_text += tokenizer.decode(predicted_id[0])

print(generated_text)
eos_token = '</s>'  # after generation, cut the text at the first </s>
generated_text_parts = generated_text.split(eos_token)
final_text = generated_text_parts[0] + eos_token if len(generated_text_parts) > 1 else generated_text_parts[0]
print(final_text)

# Output:
# LLM with me A:[UNK]iLLwitMimeLMethihitMehw</s>LLMwitMmmMwitLLLLLLMLMLMiwitLLmLwithe[UNK]LLhi[UNK]witLLM
# LLM with me A:[UNK]iLLwitMimeLMethihitMehw</s>
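The division by temperature in the loop above is the knob that trades determinism for variety. A tiny standalone illustration (not part of the original notebook):

import torch
import torch.nn.functional as F

logits = torch.tensor([2.0, 1.0, 0.5])
for t in (0.5, 0.7, 1.0, 2.0):
    # Lower temperature sharpens the distribution toward the argmax;
    # higher temperature flattens it toward uniform.
    print(t, F.softmax(logits / t, dim=-1).tolist())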