You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

1 line
8.7 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

{"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"name":"python","version":"3.10.13","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"},"kaggle":{"accelerator":"gpu","dataSources":[],"dockerImageVersionId":30699,"isInternetEnabled":true,"language":"python","sourceType":"notebook","isGpuEnabled":true}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"markdown","source":"# Transformer里的原始方法","metadata":{"execution":{"iopub.status.busy":"2024-04-27T07:08:04.928408Z","iopub.execute_input":"2024-04-27T07:08:04.928708Z","iopub.status.idle":"2024-04-27T07:08:06.306886Z","shell.execute_reply.started":"2024-04-27T07:08:04.928683Z","shell.execute_reply":"2024-04-27T07:08:06.305971Z"}}},{"cell_type":"code","source":"import numpy as np\nimport torch\n\n\ndef get_positional_encoding(max_seq_len, d_model):\n    \"\"\"Build the fixed sinusoidal position table of the original Transformer.\n\n    Args:\n        max_seq_len: number of positions (rows of the table).\n        d_model: embedding width (columns of the table).\n\n    Returns:\n        A torch.FloatTensor of shape (max_seq_len, d_model). Row 0 is all\n        zeros; in every other row pos, even columns 2i hold\n        sin(pos / 10000^(2i / d_model)) and odd columns 2i+1 hold the\n        matching cos value.\n    \"\"\"\n    positions = np.arange(max_seq_len, dtype=np.float64)[:, None]\n    # Columns 2i and 2i+1 share the exponent 2i / d_model.\n    exponents = 2 * (np.arange(d_model) // 2) / d_model\n    position_enc = positions / np.power(10000, exponents)\n    # Row 0 is skipped below, so it stays all zeros (0 / x == 0).\n    position_enc[1:, 0::2] = np.sin(position_enc[1:, 0::2])  # dim 2i\n    position_enc[1:, 1::2] = np.cos(position_enc[1:, 1::2])  # dim 2i+1\n    return torch.from_numpy(position_enc).type(torch.FloatTensor)\n\n\n# Assume a model dimension of 768 and a maximum sequence length of 1024\nmax_seq_len = 1024\nd_model = 768\npositional_encoding = get_positional_encoding(max_seq_len, d_model)\nprint(positional_encoding)","metadata":{"execution":{"iopub.status.busy":"2024-04-27T10:04:01.360467Z","iopub.execute_input":"2024-04-27T10:04:01.361329Z","iopub.status.idle":"2024-04-27T10:04:03.936556Z","shell.execute_reply.started":"2024-04-27T10:04:01.361290Z","shell.execute_reply":"2024-04-27T10:04:03.935518Z"},"trusted":true},"execution_count":99,"outputs":[{"name":"stdout","text":"tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00,\n 0.0000e+00, 0.0000e+00],\n [ 8.4147e-01, 5.4030e-01, 8.2843e-01, ..., 
1.0000e+00,\n 1.0243e-04, 1.0000e+00],\n [ 9.0930e-01, -4.1615e-01, 9.2799e-01, ..., 1.0000e+00,\n 2.0486e-04, 1.0000e+00],\n ...,\n [ 1.7612e-02, -9.9984e-01, -7.9410e-01, ..., 9.9427e-01,\n 1.0439e-01, 9.9454e-01],\n [-8.3182e-01, -5.5504e-01, -9.4828e-01, ..., 9.9426e-01,\n 1.0449e-01, 9.9453e-01],\n [-9.1649e-01, 4.0007e-01, -2.6814e-01, ..., 9.9425e-01,\n 1.0459e-01, 9.9452e-01]])\n","output_type":"stream"}]},{"cell_type":"code","source":"print(positional_encoding[13][:10])\nprint(positional_encoding[14][:10])\nprint(positional_encoding[11][:10])","metadata":{"execution":{"iopub.status.busy":"2024-04-27T10:04:23.000572Z","iopub.execute_input":"2024-04-27T10:04:23.001381Z","iopub.status.idle":"2024-04-27T10:04:23.008773Z","shell.execute_reply.started":"2024-04-27T10:04:23.001353Z","shell.execute_reply":"2024-04-27T10:04:23.007735Z"},"trusted":true},"execution_count":101,"outputs":[{"name":"stdout","text":"tensor([ 0.4202, 0.9074, 0.1252, 0.9921, -0.1744, 0.9847, -0.4519, 0.8921,\n -0.6858, 0.7278])\ntensor([0.9906, 0.1367, 0.8920, 0.4520, 0.7018, 0.7124, 0.4454, 0.8953, 0.1523,\n 0.9883])\ntensor([-1.0000, 0.0044, -0.9673, -0.2535, -0.8724, -0.4889, -0.7253, -0.6884,\n -0.5387, -0.8425])\n","output_type":"stream"}]},{"cell_type":"markdown","source":"# GPT-2的位置编码方法","metadata":{}},{"cell_type":"code","source":"import tensorflow as tf\n\n\nclass HParams:\n    \"\"\"Attribute bag: every keyword argument becomes an instance attribute.\"\"\"\n\n    def __init__(self, **kwargs):\n        self.__dict__.update(kwargs)\n\n\ndef positions_for(tokens, past_length):\n    \"\"\"Return position ids past_length .. past_length + nsteps - 1 for each row of tokens.\n\n    tokens is indexed as (batch, steps); the result has that same shape.\n    \"\"\"\n    batch_size = tf.shape(tokens)[0]\n    nsteps = tf.shape(tokens)[1]\n    position_ids = past_length + tf.range(nsteps)\n    return tf.tile(tf.expand_dims(position_ids, 0), [batch_size, 1])\n\n\ndef position_embedding(hparams, position_ids):\n    \"\"\"Gather rows of a position-embedding table for position_ids.\n\n    The table wpe has shape (hparams.n_ctx, hparams.n_embd) and is freshly\n    drawn from a normal distribution (stddev 0.01) on every call; no seed is\n    set here, so the returned values differ from call to call.\n    \"\"\"\n    wpe = tf.Variable(\n        tf.random.normal([hparams.n_ctx, hparams.n_embd], stddev=0.01),\n        name='wpe',\n    )\n    return tf.gather(wpe, position_ids)\n\n\n# Hyperparameters for the model\nhparams = HParams(\n    n_vocab=0,\n    n_ctx=1024,\n    n_embd=768,\n    n_head=12,\n    n_layer=12,\n)\n\ninput_tokens = tf.constant([[0, 1, 2, 3]], dtype=tf.int32)\npast_length = tf.constant(0)  # Assuming no past context\n\nposition_ids = positions_for(input_tokens, past_length)\nposition_embeddings = position_embedding(hparams, position_ids)\nprint(position_embeddings)","metadata":{"execution":{"iopub.status.busy":"2024-04-27T10:07:43.627438Z","iopub.execute_input":"2024-04-27T10:07:43.628169Z","iopub.status.idle":"2024-04-27T10:07:43.645633Z","shell.execute_reply.started":"2024-04-27T10:07:43.628139Z","shell.execute_reply":"2024-04-27T10:07:43.644705Z"},"trusted":true},"execution_count":102,"outputs":[{"name":"stdout","text":"tf.Tensor(\n[[[ 0.01702908 0.00268412 0.01296544 ... 0.00706888 0.00186165\n 0.01521429]\n [ 0.00431 -0.01150406 0.01421692 ... -0.00568195 0.00935402\n 0.01863918]\n [-0.00091886 -0.00914316 -0.0180154 ... 0.00033014 0.00344726\n 0.01064758]\n [ 0.00253335 -0.01882706 0.00029727 ... 0.0026667 -0.00202818\n -0.00463023]]], shape=(1, 4, 768), dtype=float32)\n","output_type":"stream"}]},{"cell_type":"markdown","source":"# 获取位置编码,并与向量相加(同位置相加)","metadata":{}},{"cell_type":"code","source":"import torch\nfrom transformers import GPT2Tokenizer, GPT2Model\n\ntokenizer = GPT2Tokenizer.from_pretrained('gpt2')  # initialize\nmodel = GPT2Model.from_pretrained('gpt2')\n\ntext = \"LLM with me\"  # text to process\n# Tokenize and convert to vocabulary indices\ninputs = tokenizer(text, return_tensors=\"pt\")\ninput_ids = inputs[\"input_ids\"]\nembeddings = model.get_input_embeddings()  # the model's input-embedding layer\ninput_embeddings = embeddings(input_ids)  # map token indices to embedding vectors\n# Look up the position embeddings for positions 0 .. seq_len - 1\nposition_ids = torch.arange(input_ids.size(1), device=input_ids.device).unsqueeze(0)\nposition_embeddings = model.wpe(position_ids)\nfinal_embeddings = input_embeddings + position_embeddings  # add position embeddings to token embeddings to get the final input embeddings\n\n# 
Inspect the final input embeddings\nprint(final_embeddings)\nprint(final_embeddings.shape)","metadata":{"execution":{"iopub.status.busy":"2024-04-27T09:25:50.966165Z","iopub.execute_input":"2024-04-27T09:25:50.966557Z","iopub.status.idle":"2024-04-27T09:25:52.352012Z","shell.execute_reply.started":"2024-04-27T09:25:50.966529Z","shell.execute_reply":"2024-04-27T09:25:52.350974Z"},"trusted":true},"execution_count":86,"outputs":[{"name":"stdout","text":"tensor([[[ 0.2321, -0.3849, 0.1550, ..., 0.0664, 0.1922, 0.3908],\n [ 0.0081, -0.1923, 0.1255, ..., -0.0160, 0.1091, -0.0756],\n [ 0.0686, -0.0744, 0.0838, ..., 0.0598, 0.1280, 0.0136],\n [ 0.1512, -0.0985, 0.1991, ..., -0.1582, 0.1241, 0.0501]]],\n grad_fn=<AddBackward0>)\ntorch.Size([1, 4, 768])\n","output_type":"stream"}]},{"cell_type":"code","source":"# Spot-check one element: the stored sum must equal token + position embedding\nprint(final_embeddings[0][0][0])\nprint(input_embeddings[0][0][0] + position_embeddings[0][0][0])","metadata":{"execution":{"iopub.status.busy":"2024-04-27T09:25:52.354052Z","iopub.execute_input":"2024-04-27T09:25:52.354508Z","iopub.status.idle":"2024-04-27T09:25:52.361606Z","shell.execute_reply.started":"2024-04-27T09:25:52.354474Z","shell.execute_reply":"2024-04-27T09:25:52.360662Z"},"trusted":true},"execution_count":87,"outputs":[{"name":"stdout","text":"tensor(0.2321, grad_fn=<SelectBackward0>)\ntensor(0.2321, grad_fn=<AddBackward0>)\n","output_type":"stream"}]},{"cell_type":"code","source":"# Same spot-check at a different token position and feature index\nprint(final_embeddings[0][1][1])\nprint(input_embeddings[0][1][1] + position_embeddings[0][1][1])","metadata":{"execution":{"iopub.status.busy":"2024-04-27T09:25:52.362880Z","iopub.execute_input":"2024-04-27T09:25:52.363261Z","iopub.status.idle":"2024-04-27T09:25:52.371588Z","shell.execute_reply.started":"2024-04-27T09:25:52.363214Z","shell.execute_reply":"2024-04-27T09:25:52.370577Z"},"trusted":true},"execution_count":88,"outputs":[{"name":"stdout","text":"tensor(-0.1923, grad_fn=<SelectBackward0>)\ntensor(-0.1923, grad_fn=<AddBackward0>)\n","output_type":"stream"}]}]}