# %% [markdown]
# # The original (sinusoidal) positional encoding from the Transformer

# %%
import numpy as np
import torch  # needed here: this cell must run on a fresh kernel, before the GPT-2 cell


def get_positional_encoding(max_seq_len, d_model):
    """Build the sinusoidal positional-encoding table.

    For every position ``pos >= 1`` and column ``j``::

        angle      = pos / 10000 ** (2 * (j // 2) / d_model)
        PE[pos, j] = sin(angle)   if j is even
        PE[pos, j] = cos(angle)   if j is odd

    Row 0 is left as all zeros, exactly as in the previous implementation.
    Note that the closed-form formula would instead give cos(0) = 1 in the
    odd columns; presumably position 0 is treated as a padding slot
    (TODO confirm with the intended use).

    Args:
        max_seq_len: number of positions (rows) in the table; 0 yields an
            empty ``(0, d_model)`` tensor.
        d_model: number of columns per position.

    Returns:
        A ``torch.FloatTensor`` of shape ``(max_seq_len, d_model)``.
    """
    # Column vector of positions so it broadcasts against the per-column divisor.
    positions = np.arange(max_seq_len, dtype=np.float64)[:, np.newaxis]
    # Columns 2i and 2i+1 share the same frequency, hence the integer division.
    pair_index = np.arange(d_model) // 2
    angles = positions / np.power(10000, 2 * pair_index / d_model)

    position_enc = np.zeros((max_seq_len, d_model))
    position_enc[1:, 0::2] = np.sin(angles[1:, 0::2])  # dim 2i
    position_enc[1:, 1::2] = np.cos(angles[1:, 1::2])  # dim 2i+1
    return torch.from_numpy(position_enc).type(torch.FloatTensor)


# Assume a model dimension of 768 and a maximum sequence length of 1024.
max_seq_len = 1024
d_model = 768
positional_encoding = get_positional_encoding(max_seq_len, d_model)
print(positional_encoding)
# %%
# Spot-check the first ten dimensions of a few rows of the sinusoidal table.
print(positional_encoding[13][:10])
print(positional_encoding[14][:10])
print(positional_encoding[11][:10])

# %% [markdown]
# # The GPT-2 approach to position encoding (a learned embedding table)

# %%
import tensorflow as tf

# Fixed seed so the randomly initialised table (and the printed output) is reproducible.
SEED = 0


class HParams:
    """Minimal hyperparameter container: every keyword argument becomes an attribute."""

    def __init__(self, **kwargs):
        self.__dict__.update(kwargs)


def positions_for(tokens, past_length):
    """Return position ids for each token, offset by ``past_length``.

    The first two dimensions of ``tokens`` are read as batch size and number
    of steps. The ids ``past_length + [0, 1, ..., nsteps - 1]`` are computed
    once and tiled ``batch_size`` times along a new leading axis.
    """
    batch_size = tf.shape(tokens)[0]
    nsteps = tf.shape(tokens)[1]
    position_ids = past_length + tf.range(nsteps)
    return tf.tile(tf.expand_dims(position_ids, 0), [batch_size, 1])


def position_embedding(hparams, position_ids, wpe=None):
    """Look up position embeddings for ``position_ids``.

    Args:
        hparams: object exposing ``n_ctx`` (table rows) and ``n_embd``
            (embedding width); only read when ``wpe`` is not supplied.
        position_ids: integer tensor of position indices.
        wpe: optional ``[n_ctx, n_embd]`` embedding table. Pass the same
            table to repeated calls to get consistent vectors for a given
            position. When omitted, a new table is drawn from
            N(0, 0.01**2) on every call, which is the previous behaviour.

    Returns:
        ``tf.gather(wpe, position_ids)``: one row of the table per id.

    Note:
        ``tf.gather`` does not clamp indices. Per the TensorFlow docs, an
        out-of-range id raises on CPU and silently yields zeros on GPU, so
        callers should keep ids below the table's row count.
    """
    if wpe is None:
        wpe = tf.Variable(
            tf.random.normal([hparams.n_ctx, hparams.n_embd], stddev=0.01),
            name='wpe',
        )
    return tf.gather(wpe, position_ids)


tf.random.set_seed(SEED)

# Hyperparameters for the model
hparams = HParams(
    n_vocab=0,
    n_ctx=1024,
    n_embd=768,
    n_head=12,
    n_layer=12,
)

input_tokens = tf.constant([[0, 1, 2, 3]], dtype=tf.int32)
past_length = tf.constant(0)  # Assuming no past context

position_ids = positions_for(input_tokens, past_length)
position_embeddings = position_embedding(hparams, position_ids)
print(position_embeddings)
# %% [markdown]
# # Fetch the position embeddings and add them to the token embeddings (element-wise, position by position)

# %%
import torch
from transformers import GPT2Tokenizer, GPT2Model

# Load the pretrained GPT-2 tokenizer and model.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2Model.from_pretrained('gpt2')

# Text to process.
text = "LLM with me"

# Tokenize and convert the tokens to vocabulary indices.
inputs = tokenizer(text, return_tensors="pt")
input_ids = inputs["input_ids"]

# Token embeddings: fetch the model's input embedding layer and apply it to the indices.
embeddings = model.get_input_embeddings()
input_embeddings = embeddings(input_ids)

# Position embeddings: one id per token, created directly on the same device as the input ids.
position_ids = torch.arange(input_ids.size(1), device=input_ids.device).unsqueeze(0)
position_embeddings = model.wpe(position_ids)

# The final input embedding is the sum of token and position embeddings.
final_embeddings = torch.add(input_embeddings, position_embeddings)

# Inspect the final input embeddings and their shape.
print(final_embeddings, final_embeddings.shape, sep="\n")

# %%
# Check one element: token 0, dimension 0.
print(final_embeddings[0, 0, 0])
print(input_embeddings[0, 0, 0] + position_embeddings[0, 0, 0])

# %%
# Check another element: token 1, dimension 1.
print(final_embeddings[0, 1, 1])
print(input_embeddings[0, 1, 1] + position_embeddings[0, 1, 1])