# %% [markdown]
# # The original method from the Transformer paper

# %%
import numpy as np
import torch  # needed by get_positional_encoding; previously only a later cell imported it


def get_positional_encoding(max_seq_len, d_model, zero_pad_first=True):
    """Build the fixed sinusoidal positional-encoding table.

    For position ``pos`` and column ``j`` the angle is
    ``pos / 10000 ** (2 * (j // 2) / d_model)``; even columns hold the sine
    of that angle and odd columns hold the cosine.

    Args:
        max_seq_len: Number of positions (rows) to generate. ``0`` yields an
            empty ``(0, d_model)`` table instead of raising.
        d_model: Width of each encoding vector (columns).
        zero_pad_first: When True (the default, and the historical behaviour
            of this function) row 0 is forced to all zeros. Pass False to get
            the canonical encoding, whose row 0 is ``sin(0) = 0`` in even
            columns and ``cos(0) = 1`` in odd columns.

    Returns:
        A ``torch.FloatTensor`` of shape ``(max_seq_len, d_model)``.
    """
    # Column vector of positions and row vector of per-column divisors;
    # broadcasting gives the full (max_seq_len, d_model) grid of angles.
    positions = np.arange(max_seq_len, dtype=np.float64)[:, np.newaxis]
    columns = np.arange(d_model)
    divisors = np.power(10000.0, 2 * (columns // 2) / d_model)
    angles = positions / divisors

    position_enc = np.zeros((max_seq_len, d_model), dtype=np.float64)
    position_enc[:, 0::2] = np.sin(angles[:, 0::2])  # dim 2i
    position_enc[:, 1::2] = np.cos(angles[:, 1::2])  # dim 2i+1

    if zero_pad_first and max_seq_len > 0:
        # Keep the original convention of an all-zero first row.
        position_enc[0] = 0.0

    return torch.from_numpy(position_enc).type(torch.FloatTensor)


# Assume a model width of 768 and a maximum sequence length of 1024.
max_seq_len = 1024
d_model = 768
positional_encoding = get_positional_encoding(max_seq_len, d_model)
print(positional_encoding)
# %%
# Spot-check the first ten dimensions of a few rows of the sinusoidal table
# (`positional_encoding` is produced by the first code cell).
print(positional_encoding[13][:10])
print(positional_encoding[14][:10])
print(positional_encoding[11][:10])

# %% [markdown]
# # GPT-2's position-encoding method

# %%
import tensorflow as tf


class HParams:
    """Minimal hyper-parameter bag: every keyword argument becomes an attribute."""

    def __init__(self, **kwargs):
        self.__dict__.update(kwargs)


def positions_for(tokens, past_length):
    """Return position ids for every token in a batch.

    Args:
        tokens: Tensor whose first two dimensions are read as
            (batch_size, nsteps) via ``tf.shape``.
        past_length: Offset added to every position id.

    Returns:
        Tensor of shape (batch_size, nsteps) in which each row is
        ``past_length + [0, 1, ..., nsteps - 1]``.
    """
    token_shape = tf.shape(tokens)
    batch_size = token_shape[0]
    nsteps = token_shape[1]
    position_ids = past_length + tf.range(nsteps)
    # Repeat the single row of ids once per batch element.
    return tf.tile(tf.expand_dims(position_ids, 0), [batch_size, 1])


def position_embedding(hparams, position_ids, wpe=None):
    """Look up position embeddings for ``position_ids``.

    Args:
        hparams: Object exposing ``n_ctx`` (table rows) and ``n_embd``
            (embedding width).
        position_ids: Integer tensor of positions to gather.
        wpe: Optional existing embedding table of shape (n_ctx, n_embd).
            When omitted, a fresh table is drawn from a normal distribution
            with stddev 0.01, exactly as before. Pass the same table to
            several calls so that a given position always maps to the same
            vector.

    Returns:
        Tensor of embeddings with shape ``position_ids.shape + (n_embd,)``.
    """
    if wpe is None:
        wpe = tf.Variable(
            tf.random.normal([hparams.n_ctx, hparams.n_embd], stddev=0.01),
            name='wpe',
        )
    return tf.gather(wpe, position_ids)


# Fix the global seed so the randomly initialised table (and therefore the
# printed values) is reproducible across kernel restarts.
tf.random.set_seed(0)

# Hyperparameters for the model
hparams = HParams(
    n_vocab=0,
    n_ctx=1024,
    n_embd=768,
    n_head=12,
    n_layer=12,
)

input_tokens = tf.constant([[0, 1, 2, 3]], dtype=tf.int32)
past_length = tf.constant(0)  # Assuming no past context

position_ids = positions_for(input_tokens, past_length)
position_embeddings = position_embedding(hparams, position_ids)
print(position_embeddings)

# %% [markdown]
# # Fetch the position embeddings and add them to the token vectors (element-wise, position by position)

# %%
import torch
from transformers import GPT2Tokenizer, GPT2Model

# Load the pretrained GPT-2 tokenizer and model.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2Model.from_pretrained('gpt2')

text = "LLM with me"  # text to process
# Tokenize and convert to vocabulary indices.
inputs = tokenizer(text, return_tensors="pt")
input_ids = inputs["input_ids"]
embeddings = model.get_input_embeddings()  # the model's token-embedding layer
input_embeddings = embeddings(input_ids)  # indices -> embedding vectors
# Look up the position embeddings for positions 0..seq_len-1, creating the
# ids directly on the same device as the token ids.
position_ids = torch.arange(input_ids.size(1), device=input_ids.device).unsqueeze(0)
position_embeddings = model.wpe(position_ids)
# Sum token and position embeddings to obtain the final input embeddings.
final_embeddings = input_embeddings + position_embeddings
# %%
# Inspect the final input embeddings (`final_embeddings`, `input_embeddings`
# and `position_embeddings` are defined by the preceding statements).
print(final_embeddings)
print(final_embeddings.shape)

# %%
# Spot-check individual entries: each printed pair should show the same value,
# once read from the summed tensor and once recomputed from its two parts.
for token_idx, dim_idx in [(0, 0), (1, 1)]:
    print(final_embeddings[0][token_idx][dim_idx])
    print(
        input_embeddings[0][token_idx][dim_idx]
        + position_embeddings[0][token_idx][dim_idx]
    )

# Make the check explicit for every element instead of relying on eyeballing
# two printed samples.
assert torch.equal(final_embeddings, input_embeddings + position_embeddings)
grad_fn=)\n","output_type":"stream"}]},{"cell_type":"code","source":"","metadata":{"execution":{"iopub.status.busy":"2024-04-27T09:06:38.339625Z","iopub.execute_input":"2024-04-27T09:06:38.340001Z","iopub.status.idle":"2024-04-27T09:06:38.353412Z","shell.execute_reply.started":"2024-04-27T09:06:38.339971Z","shell.execute_reply":"2024-04-27T09:06:38.352491Z"},"trusted":true},"execution_count":83,"outputs":[]},{"cell_type":"code","source":"","metadata":{},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"","metadata":{},"execution_count":null,"outputs":[]}]}