You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

1 line
23 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

{"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"name":"python","version":"3.10.13","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"},"kaggle":{"accelerator":"gpu","dataSources":[],"dockerImageVersionId":30699,"isInternetEnabled":true,"language":"python","sourceType":"notebook","isGpuEnabled":true}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"markdown","source":"# 第二章——文字向量化","metadata":{}},{"cell_type":"code","source":"# 如果没有transformers则安装\n# ! pip install transformers","metadata":{"execution":{"iopub.status.busy":"2024-04-27T02:56:34.940787Z","iopub.execute_input":"2024-04-27T02:56:34.941256Z","iopub.status.idle":"2024-04-27T02:56:34.945910Z","shell.execute_reply.started":"2024-04-27T02:56:34.941207Z","shell.execute_reply":"2024-04-27T02:56:34.944958Z"},"trusted":true},"execution_count":1,"outputs":[]},{"cell_type":"code","source":"from transformers import GPT2Tokenizer, GPT2Model\n\n# 初始化分词器和模型\ntokenizer = GPT2Tokenizer.from_pretrained('gpt2')\nmodel = GPT2Model.from_pretrained('gpt2')\n\n# 待向量化的文本\ntext = \"LLM with me\"\n# 分词并转换为索引\ninputs = tokenizer(text, return_tensors=\"pt\")\n# 输出token及其对应的索引\nprint(inputs)","metadata":{"execution":{"iopub.status.busy":"2024-04-27T02:56:34.950820Z","iopub.execute_input":"2024-04-27T02:56:34.951141Z","iopub.status.idle":"2024-04-27T02:56:54.045121Z","shell.execute_reply.started":"2024-04-27T02:56:34.951107Z","shell.execute_reply":"2024-04-27T02:56:54.043973Z"},"trusted":true},"execution_count":2,"outputs":[{"output_type":"display_data","data":{"text/plain":"tokenizer_config.json: 0%| | 0.00/26.0 [00:00<?, ?B/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"7f4646e559f142df97301fdb4b55b911"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":"vocab.json: 0%| | 0.00/1.04M [00:00<?, 
?B/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"20368fa1fd474cfbbe666270d89e57e1"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":"merges.txt: 0%| | 0.00/456k [00:00<?, ?B/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"d2cdba3dff6f4eec92bcba8e64b2ad51"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":"tokenizer.json: 0%| | 0.00/1.36M [00:00<?, ?B/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"253a64324a7e44f6a7d0616f02462cfd"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":"config.json: 0%| | 0.00/665 [00:00<?, ?B/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"16f41d3722d94e4cbbbb74402babc85d"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":"model.safetensors: 0%| | 0.00/548M [00:00<?, ?B/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"73a1b8fc83d64b538660aa95da81d709"}},"metadata":{}},{"name":"stdout","text":"{'input_ids': tensor([[3069, 44, 351, 502]]), 'attention_mask': tensor([[1, 1, 1, 1]])}\n","output_type":"stream"}]},{"cell_type":"code","source":"text = \"LLM\"\ninputs = tokenizer(text, return_tensors=\"pt\")\nprint(inputs)\n\n# 查看索引对应的token\nprint(tokenizer.convert_ids_to_tokens(inputs['input_ids'][0]))","metadata":{"execution":{"iopub.status.busy":"2024-04-27T02:56:54.047315Z","iopub.execute_input":"2024-04-27T02:56:54.047958Z","iopub.status.idle":"2024-04-27T02:56:54.055371Z","shell.execute_reply.started":"2024-04-27T02:56:54.047920Z","shell.execute_reply":"2024-04-27T02:56:54.054460Z"},"trusted":true},"execution_count":3,"outputs":[{"name":"stdout","text":"{'input_ids': tensor([[3069, 44]]), 'attention_mask': tensor([[1, 1]])}\n['LL', 'M']\n","output_type":"stream"}]},{"cell_type":"code","source":"from transformers import 
GPT2Tokenizer\n\n# 初始化分词器\ntokenizer = GPT2Tokenizer.from_pretrained('gpt2')\n# 获取词汇表的大小\nvocab_size = len(tokenizer)\nprint(f\"The vocabulary size of GPT2Tokenizer is: {vocab_size}\")","metadata":{"execution":{"iopub.status.busy":"2024-04-27T02:56:54.056556Z","iopub.execute_input":"2024-04-27T02:56:54.056851Z","iopub.status.idle":"2024-04-27T02:56:54.448560Z","shell.execute_reply.started":"2024-04-27T02:56:54.056826Z","shell.execute_reply":"2024-04-27T02:56:54.447512Z"},"trusted":true},"execution_count":4,"outputs":[{"name":"stdout","text":"The vocabulary size of GPT2Tokenizer is: 50257\n","output_type":"stream"}]},{"cell_type":"code","source":"from transformers import GPT2Tokenizer, GPT2Model\n\n# 初始化分词器和模型\ntokenizer = GPT2Tokenizer.from_pretrained('gpt2')\nmodel = GPT2Model.from_pretrained('gpt2')\n# 待处理的文本\ntext = \"LLM with me\"\n# 分词并转换为索引\ninputs = tokenizer(text, return_tensors=\"pt\")\ninput_ids = inputs[\"input_ids\"]\n# 获取模型的嵌入层\nembeddings = model.get_input_embeddings()\n# 将索引转换为嵌入向量\ninput_embeddings = embeddings(input_ids)\nprint(input_embeddings)\nprint(input_embeddings.shape)","metadata":{"execution":{"iopub.status.busy":"2024-04-27T02:56:54.451739Z","iopub.execute_input":"2024-04-27T02:56:54.452204Z","iopub.status.idle":"2024-04-27T02:56:55.623998Z","shell.execute_reply.started":"2024-04-27T02:56:54.452174Z","shell.execute_reply":"2024-04-27T02:56:55.622962Z"},"trusted":true},"execution_count":5,"outputs":[{"name":"stdout","text":"tensor([[[ 0.2509, -0.1875, 0.1510, ..., 0.1094, 0.1639, 0.3363],\n [-0.0159, -0.1385, 0.2203, ..., -0.0501, 0.0990, -0.0755],\n [ 0.0644, 0.0104, 0.0293, ..., 0.0400, 0.1087, 0.0350],\n [ 0.1515, -0.0247, 0.0936, ..., -0.1684, 0.1065, 0.0572]]],\n grad_fn=<EmbeddingBackward0>)\ntorch.Size([1, 4, 768])\n","output_type":"stream"}]},{"cell_type":"code","source":"from transformers import GPT2Tokenizer, GPT2Config, GPT2Model\nimport torch\n\n# 初始化分词器& 创建模型配置 & 初始化模型\ntokenizer = GPT2Tokenizer.from_pretrained('gpt2')\nconfig 
= GPT2Config(vocab_size=len(tokenizer), n_embd=768, n_layer=12, n_head=12)\nmodel = GPT2Model(config)\n# 待处理的文本 & 分词并转换为索引\ntext = \"LLM with me\"\ninputs = tokenizer(text, return_tensors=\"pt\")\ninput_ids = inputs[\"input_ids\"]\nprint(input_ids)\n# 获取模型的嵌入层\nembeddings = model.get_input_embeddings()\n# 将索引转换为嵌入向量\ninput_embeddings = embeddings(input_ids)\n\nprint(input_embeddings)\nprint(input_embeddings.shape)","metadata":{"execution":{"iopub.status.busy":"2024-04-27T02:56:55.625570Z","iopub.execute_input":"2024-04-27T02:56:55.626445Z","iopub.status.idle":"2024-04-27T02:56:58.685226Z","shell.execute_reply.started":"2024-04-27T02:56:55.626407Z","shell.execute_reply":"2024-04-27T02:56:58.684113Z"},"trusted":true},"execution_count":6,"outputs":[{"name":"stdout","text":"tensor([[3069, 44, 351, 502]])\ntensor([[[-0.0286, 0.0050, -0.0065, ..., -0.0037, 0.0252, -0.0502],\n [-0.0149, -0.0292, 0.0061, ..., 0.0174, 0.0070, 0.0024],\n [ 0.0027, 0.0040, 0.0024, ..., -0.0140, -0.0124, 0.0112],\n [-0.0026, -0.0169, -0.0193, ..., 0.0246, -0.0027, 0.0186]]],\n grad_fn=<EmbeddingBackward0>)\ntorch.Size([1, 4, 768])\n","output_type":"stream"}]},{"cell_type":"code","source":"# from tokenizers import Tokenizer\n# from tokenizers.models import BPE\n# from tokenizers.trainers import BpeTrainer\n# from tokenizers.pre_tokenizers import Whitespace\n\n# # 创建一个空的BPE分词器 & 使用空格进行预分词 & 创建一个分词器训练器\n# tokenizer = Tokenizer(BPE(unk_token=\"[UNK]\"))\n# tokenizer.pre_tokenizer = Whitespace()\n# trainer = BpeTrainer(special_tokens=[\"[UNK]\", \"[CLS]\", \"[SEP]\", \"[PAD]\", \"[MASK]\"])\n\n# # 准备一些训练数据,这里我们只用一句话 & 在实际应用中,你需要大量的文本数据\n# train_data = [\"LLM with me\"]\n\n# # 训练分词器\n# tokenizer.train_from_iterator(train_data, trainer)\n# # 保存分词器到文件\n# tokenizer.save(\"custom_tokenizer.json\")\n# # 测试分词器\n# output = tokenizer.encode(\"LLM with me\")\n# 
print(output.tokens)","metadata":{"execution":{"iopub.status.busy":"2024-04-27T02:56:58.686374Z","iopub.execute_input":"2024-04-27T02:56:58.686644Z","iopub.status.idle":"2024-04-27T02:56:58.691471Z","shell.execute_reply.started":"2024-04-27T02:56:58.686622Z","shell.execute_reply":"2024-04-27T02:56:58.690586Z"},"trusted":true},"execution_count":7,"outputs":[]},{"cell_type":"code","source":"from tokenizers import Tokenizer\nfrom tokenizers.models import BPE\nfrom tokenizers.trainers import BpeTrainer\nfrom tokenizers.pre_tokenizers import Whitespace\n\n# 创建一个空的BPE分词器 & 使用空格进行预分词 & 创建一个分词器训练器\ntokenizer = Tokenizer(BPE(unk_token=\"[UNK]\"))\ntokenizer.pre_tokenizer = Whitespace()\ntrainer = BpeTrainer(special_tokens=[\"[UNK]\", \"<EOS>\"]) # 添加两个占位来解决不认识的词和结束语\n\n# 准备一些训练数据,这里我们只用一句话 & 在实际应用中,你需要大量的文本数据\ntrain_data = [\"LLM with me\"]\n# 训练分词器\ntokenizer.train_from_iterator(train_data, trainer)\n# 保存分词器到文件\ntokenizer.save(\"custom_tokenizer.json\")\n# 测试分词器\noutput = tokenizer.encode(\"LLM with me <EOS>\")\nprint(output.tokens)","metadata":{"execution":{"iopub.status.busy":"2024-04-27T02:56:58.692919Z","iopub.execute_input":"2024-04-27T02:56:58.693327Z","iopub.status.idle":"2024-04-27T02:56:58.851600Z","shell.execute_reply.started":"2024-04-27T02:56:58.693289Z","shell.execute_reply":"2024-04-27T02:56:58.850683Z"},"trusted":true},"execution_count":8,"outputs":[{"name":"stdout","text":"\n['LLM', 'with', 'me', '<EOS>']\n\n\n","output_type":"stream"}]},{"cell_type":"code","source":"from transformers import PreTrainedTokenizerFast\n\n# 获取文本的索引\ninput_ids = output.ids\nprint(input_ids)\n\n# 加载自定义分词器 & 编码文本并返回PyTorch张量\ntokenizer = PreTrainedTokenizerFast(tokenizer_file=\"custom_tokenizer.json\")\ninputs = tokenizer(text, return_tensors=\"pt\")\ninput_ids = 
inputs[\"input_ids\"]\nprint(input_ids)","metadata":{"execution":{"iopub.status.busy":"2024-04-27T02:56:58.853791Z","iopub.execute_input":"2024-04-27T02:56:58.854126Z","iopub.status.idle":"2024-04-27T02:56:58.863190Z","shell.execute_reply.started":"2024-04-27T02:56:58.854094Z","shell.execute_reply":"2024-04-27T02:56:58.862136Z"},"trusted":true},"execution_count":9,"outputs":[{"name":"stdout","text":"[14, 15, 12, 1]\ntensor([[14, 15, 12]])\n","output_type":"stream"}]},{"cell_type":"code","source":"# 查看分词器的词汇表\nvocab = tokenizer.get_vocab()\nprint(vocab)","metadata":{"execution":{"iopub.status.busy":"2024-04-27T02:56:58.864610Z","iopub.execute_input":"2024-04-27T02:56:58.865014Z","iopub.status.idle":"2024-04-27T02:56:58.874455Z","shell.execute_reply.started":"2024-04-27T02:56:58.864965Z","shell.execute_reply":"2024-04-27T02:56:58.873519Z"},"trusted":true},"execution_count":10,"outputs":[{"name":"stdout","text":"{'e': 4, 'it': 11, 'm': 7, 'h': 5, 'w': 9, 'LLM': 14, '[UNK]': 0, 'i': 6, 'LL': 10, 'wit': 13, 'M': 3, 'with': 15, 'L': 2, 't': 8, '<EOS>': 1, 'me': 12}\n","output_type":"stream"}]},{"cell_type":"code","source":"from transformers import PreTrainedTokenizerFast\nfrom transformers import GPT2Config, GPT2Model\n\n# 加载自定义分词器\ntokenizer = PreTrainedTokenizerFast(tokenizer_file=\"custom_tokenizer.json\")\n\n# 创建模型配置 & 初始化模型\nconfig = GPT2Config(vocab_size=tokenizer.vocab_size, n_embd=768, n_layer=12, n_head=12)\nmodel = GPT2Model(config)\n\n# 待处理的文本 & 分词并转换为索引\ntext = \"LLM with me\"\ninputs = tokenizer(text, return_tensors=\"pt\")\ninput_ids = inputs[\"input_ids\"]\n\n# 获取模型的嵌入层 & 将索引转换为嵌入向量\nembeddings = model.get_input_embeddings()\ninput_embeddings = 
embeddings(input_ids)\nprint(input_embeddings)\nprint(input_embeddings.shape)","metadata":{"execution":{"iopub.status.busy":"2024-04-27T02:56:58.878560Z","iopub.execute_input":"2024-04-27T02:56:58.878873Z","iopub.status.idle":"2024-04-27T02:57:00.767283Z","shell.execute_reply.started":"2024-04-27T02:56:58.878848Z","shell.execute_reply":"2024-04-27T02:57:00.766259Z"},"trusted":true},"execution_count":11,"outputs":[{"name":"stdout","text":"tensor([[[-0.0054, 0.0349, -0.0085, ..., -0.0360, -0.0266, -0.0049],\n [-0.0047, -0.0010, 0.0164, ..., -0.0157, -0.0245, -0.0222],\n [-0.0183, 0.0165, -0.0246, ..., -0.0089, 0.0305, -0.0066]]],\n grad_fn=<EmbeddingBackward0>)\ntorch.Size([1, 3, 768])\n","output_type":"stream"}]},{"cell_type":"code","source":"import torch\nfrom transformers import PreTrainedTokenizerFast\n\n# 加载自定义分词器\ntokenizer = PreTrainedTokenizerFast(tokenizer_file=\"custom_tokenizer.json\")\n# 待处理的文本 & 分词并转换为索引\ntext = \"LLM with me\"\ninputs = tokenizer(text, return_tensors=\"pt\")\ninput_ids = inputs[\"input_ids\"]\n\n# 假设我们的自定义分词器有一个很小的词汇表\nvocab_size = tokenizer.vocab_size # 从自定义分词器获取词汇表大小\nn_embd = 10 # 设置嵌入维度为10\n# 创建一个随机初始化的嵌入矩阵,这里我们使用正态分布随机初始化,与实际模型初始化类似\nembedding_matrix = torch.randn(vocab_size, n_embd)\ntoken_indices = input_ids[0] # 假设input_ids是一个包含索引的张量\ntoken_embeddings = embedding_matrix[token_indices]\nprint(token_embeddings)\nprint(token_embeddings.shape)","metadata":{"execution":{"iopub.status.busy":"2024-04-27T02:57:00.768613Z","iopub.execute_input":"2024-04-27T02:57:00.769015Z","iopub.status.idle":"2024-04-27T02:57:00.779896Z","shell.execute_reply.started":"2024-04-27T02:57:00.768982Z","shell.execute_reply":"2024-04-27T02:57:00.778956Z"},"trusted":true},"execution_count":12,"outputs":[{"name":"stdout","text":"tensor([[-0.0734, 0.9592, 1.3814, 2.3693, 2.3262, -1.0959, 0.7059, -0.0506,\n -0.0729, -1.1791],\n [-0.5122, 0.6106, -0.3071, 0.4347, 0.2445, 2.0369, 0.3645, -0.4135,\n -0.5863, 1.2864],\n [-2.0330, 0.1906, -0.1990, -0.4726, 2.1953, 
1.0321, -2.0593, -0.5847,\n -0.3605, -1.9308]])\ntorch.Size([3, 10])\n","output_type":"stream"}]},{"cell_type":"code","source":"# Show the embedding-matrix row of every token id in the sentence. The ids are\n# read from input_ids rather than hard-coded, so the printed rows always match\n# the rows of token_embeddings even when the tokenizer vocabulary changes.\nfor token_id in input_ids[0].tolist():\n    print(token_id, embedding_matrix[token_id])","metadata":{"execution":{"iopub.status.busy":"2024-04-27T02:57:00.781157Z","iopub.execute_input":"2024-04-27T02:57:00.781522Z","iopub.status.idle":"2024-04-27T02:57:00.789426Z","shell.execute_reply.started":"2024-04-27T02:57:00.781489Z","shell.execute_reply":"2024-04-27T02:57:00.788480Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"import torch\nfrom transformers import PreTrainedTokenizerFast\n\n# Load the custom tokenizer\ntokenizer = PreTrainedTokenizerFast(tokenizer_file=\"custom_tokenizer.json\")\n# Text to process & tokenize it into indices\ntext = \"LLM with me\"\ninputs = tokenizer(text, return_tensors=\"pt\")\ninput_ids = inputs[\"input_ids\"]\nvocab_size = tokenizer.vocab_size\nn_embd = 10\n# Allocate the embedding matrix and fill it in place with Xavier uniform initialization\nembedding_matrix = torch.empty(vocab_size, n_embd)\ntorch.nn.init.xavier_uniform_(embedding_matrix)\n\ntoken_indices = input_ids[0]  # first sequence of the batch\ntoken_embeddings = embedding_matrix[token_indices]\nprint(token_embeddings)\nprint(token_embeddings.shape)","metadata":{"execution":{"iopub.status.busy":"2024-04-27T02:57:00.790724Z","iopub.execute_input":"2024-04-27T02:57:00.791389Z","iopub.status.idle":"2024-04-27T02:57:00.802611Z","shell.execute_reply.started":"2024-04-27T02:57:00.791357Z","shell.execute_reply":"2024-04-27T02:57:00.801589Z"},"trusted":true},"execution_count":14,"outputs":[{"name":"stdout","text":"tensor([[-0.0731, -0.0870, -0.0354, -0.1912, 0.0068, 
0.1697, -0.3924, 0.0451,\n -0.1344, 0.1391],\n [-0.1168, 0.2037, -0.2497, 0.3482, -0.0643, -0.1626, 0.0857, -0.3652,\n 0.1890, -0.2604],\n [ 0.1953, -0.1465, -0.0192, 0.3124, 0.4406, -0.4748, -0.3842, -0.3123,\n -0.4185, -0.1526]])\ntorch.Size([3, 10])\n","output_type":"stream"}]},{"cell_type":"code","source":"# Show the embedding-matrix row of every token id in the sentence. The ids are\n# read from input_ids rather than hard-coded, so the printed rows always match\n# the rows of token_embeddings even when the tokenizer vocabulary changes.\nfor token_id in input_ids[0].tolist():\n    print(token_id, embedding_matrix[token_id])","metadata":{"execution":{"iopub.status.busy":"2024-04-27T02:57:00.803812Z","iopub.execute_input":"2024-04-27T02:57:00.804131Z","iopub.status.idle":"2024-04-27T02:57:00.814686Z","shell.execute_reply.started":"2024-04-27T02:57:00.804106Z","shell.execute_reply":"2024-04-27T02:57:00.813740Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"import torch\nimport torch.nn as nn\nimport torch.optim as optim\n\nvocab_size = tokenizer.vocab_size\nn_embd = 10\n\n\nclass SimpleGPT(nn.Module):\n    \"\"\"Minimal GPT-style toy model: embedding -> linear layer -> vocabulary logits.\"\"\"\n\n    def __init__(self, vocab_size, n_embd):\n        super().__init__()\n        self.embeddings = nn.Embedding(vocab_size, n_embd)\n        self.ffn = nn.Linear(n_embd, n_embd)\n        self.logits = nn.Linear(n_embd, vocab_size)\n        # Replace the default embedding initialization with Xavier uniform.\n        nn.init.xavier_uniform_(self.embeddings.weight)\n\n    def forward(self, input_ids):\n        \"\"\"Return one score per vocabulary entry for every input position.\"\"\"\n        x = self.embeddings(input_ids)\n        x = self.ffn(x)\n        return self.logits(x)\n\n\n# Create the model, the loss function and the optimizer\nmodel = SimpleGPT(vocab_size, n_embd)\nloss_fn = nn.CrossEntropyLoss()\noptimizer = optim.Adam(model.parameters(), lr=0.001)\n\n# Toy training data: every label is the corresponding input index plus one\ninput_ids = 
torch.tensor([[1, 2, 3, 4], [2, 3, 4, 5]])\nlabels = torch.tensor([[2, 3, 4, 5], [3, 4, 5, 6]])\n\n# Training loop\nfor epoch in range(100):\n    logits = model(input_ids)  # forward pass\n    # Flatten to (positions, vocab_size) and (positions,) for CrossEntropyLoss\n    loss = loss_fn(logits.view(-1, vocab_size), labels.view(-1))\n    optimizer.zero_grad()  # clear gradients left over from the previous step\n    loss.backward()  # backpropagation\n    optimizer.step()  # parameter update\n    if (epoch + 1) % 10 == 0:\n        print(f'Epoch {epoch + 1}, Loss: {loss.item()}')","metadata":{"execution":{"iopub.status.busy":"2024-04-27T02:57:00.815992Z","iopub.execute_input":"2024-04-27T02:57:00.816912Z","iopub.status.idle":"2024-04-27T02:57:01.069032Z","shell.execute_reply.started":"2024-04-27T02:57:00.816876Z","shell.execute_reply":"2024-04-27T02:57:01.067947Z"},"trusted":true},"execution_count":16,"outputs":[{"name":"stdout","text":"Epoch 10, Loss: 2.7324118614196777\nEpoch 20, Loss: 2.657238245010376\nEpoch 30, Loss: 2.580580472946167\nEpoch 40, Loss: 2.500619888305664\nEpoch 50, Loss: 2.415583848953247\nEpoch 60, Loss: 2.3237650394439697\nEpoch 70, Loss: 2.2237038612365723\nEpoch 80, Loss: 2.1143651008605957\nEpoch 90, Loss: 1.9953210353851318\nEpoch 100, Loss: 1.8669122457504272\n","output_type":"stream"}]},{"cell_type":"code","source":"token_indices = input_ids[0]\ntoken_embeddings = model.embeddings(token_indices)\nprint(token_embeddings)\nprint(token_embeddings.shape)","metadata":{"execution":{"iopub.status.busy":"2024-04-27T02:57:01.070282Z","iopub.execute_input":"2024-04-27T02:57:01.070583Z","iopub.status.idle":"2024-04-27T02:57:01.077554Z","shell.execute_reply.started":"2024-04-27T02:57:01.070557Z","shell.execute_reply":"2024-04-27T02:57:01.076566Z"},"trusted":true},"execution_count":17,"outputs":[{"name":"stdout","text":"tensor([[ 0.3168, 0.1598, -0.2357, -0.1286, 0.4422, 0.0902, -0.2156, 0.1508,\n -0.3751, -0.4240],\n [ 0.3838, -0.2698, 0.2582, -0.1764, 0.4416, -0.0557, 0.5702, 0.3589,\n -0.0439, 0.4755],\n [ 0.0883, -0.5616, -0.4737, -0.1625, 0.4614, -0.1707, -0.3864, -0.3232,\n -0.1757, 0.2665],\n 
[-0.4491, 0.5912, 0.0080, 0.0760, 0.0837, -0.4634, -0.5850, -0.4476,\n -0.4615, -0.2961]], grad_fn=<EmbeddingBackward0>)\ntorch.Size([4, 10])\n","output_type":"stream"}]},{"cell_type":"code","source":"import torch.nn.functional as F\n\n# Relies on `model` and `tokenizer` created by the earlier cells\nmodel.eval()  # switch the model to evaluation mode\ninput_text = \"LLM with me\"\ninput_ids = tokenizer.encode(input_text, return_tensors=\"pt\")  # text -> token indices\ntemperature = 0.7  # usually chosen between 0 and 1\nmax_new_tokens = 50  # number of tokens to sample\ngenerated_text = input_text + \" A:\"\nwith torch.no_grad():  # sampling needs no gradients\n    for _ in range(max_new_tokens):\n        # Only the last position predicts the next token; scale its logits by the temperature\n        next_token_logits = model(input_ids)[:, -1, :] / temperature\n        probabilities = F.softmax(next_token_logits, dim=-1)\n        # Sample the next token id from the probability distribution\n        predicted_id = torch.multinomial(probabilities, num_samples=1)\n        # Append the sampled token to the input sequence and its text to the result\n        input_ids = torch.cat((input_ids, predicted_id), dim=1)\n        generated_text += tokenizer.decode(predicted_id[0])\n\nprint(generated_text)\n# Keep everything up to and including the first <EOS>; partition() returns the\n# whole text and an empty separator when no <EOS> was generated.\neos_token = '<EOS>'\nhead, separator, _rest = generated_text.partition(eos_token)\nfinal_text = head + separator\nprint(final_text)","metadata":{"execution":{"iopub.status.busy":"2024-04-27T02:57:01.080003Z","iopub.execute_input":"2024-04-27T02:57:01.080404Z","iopub.status.idle":"2024-04-27T02:57:01.123301Z","shell.execute_reply.started":"2024-04-27T02:57:01.080371Z","shell.execute_reply":"2024-04-27T02:57:01.122130Z"},"trusted":true},"execution_count":18,"outputs":[{"name":"stdout","text":"LLM with me A:[UNK]iLLwitMimeLMethihitMehw<EOS>LLMwitMmmMwitLLLLLLMLMLMiwitLLmLwithe[UNK]LLhi[UNK]witLLM\nLLM with me A:[UNK]iLLwitMimeLMethihitMehw<EOS>\n","output_type":"stream"}]},{"cell_type":"code","source":"","metadata":{},"execution_count":null,"outputs":[]}]}