diff --git a/assets/.DS_Store b/assets/.DS_Store
index 09f6af4..41951e6 100644
Binary files a/assets/.DS_Store and b/assets/.DS_Store differ
diff --git a/assets/image-20240427173643871.png b/assets/image-20240427173643871.png
deleted file mode 100644
index 1e1314b..0000000
Binary files a/assets/image-20240427173643871.png and /dev/null differ
diff --git a/assets/image-20240427180449855.png b/assets/image-20240427180449855.png
new file mode 100644
index 0000000..fb3666b
Binary files /dev/null and b/assets/image-20240427180449855.png differ
diff --git a/人人都能看懂的Transformer/第三章——位置编码.md b/人人都能看懂的Transformer/第三章——位置编码.md
index d3cffdb..b579e38 100644
--- a/人人都能看懂的Transformer/第三章——位置编码.md
+++ b/人人都能看懂的Transformer/第三章——位置编码.md
@@ -47,8 +47,8 @@ def get_positional_encoding(max_seq_len, d_model):
position_enc[1:, 1::2] = np.cos(position_enc[1:, 1::2]) # dim 2i+1
return torch.from_numpy(position_enc).type(torch.FloatTensor)
-# Suppose our model dimension is 768 and the maximum sequence length is 4
-max_seq_len = 4
+# Suppose our model dimension is 768 and the maximum sequence length is 1024
+max_seq_len = 1024
d_model = 768
positional_encoding = get_positional_encoding(max_seq_len, d_model)
print(positional_encoding)
@@ -59,12 +59,13 @@ tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00,
1.0243e-04, 1.0000e+00],
[ 9.0930e-01, -4.1615e-01, 9.2799e-01, ..., 1.0000e+00,
2.0486e-04, 1.0000e+00],
- [ 1.4112e-01, -9.8999e-01, 2.1109e-01, ..., 1.0000e+00,
- 3.0728e-04, 1.0000e+00]])
+ ...
"""
~~~
-
+
+
+> The parameters above also follow GPT-2, e.g. max_seq_len=1024 and d_model=768.
Why use sine and cosine functions? For the sine function (sin), the maximum value is 1 and the minimum is -1; for the cosine function (cos), the maximum value is likewise 1 and the minimum is -1. In other words, they keep the values bounded and small, which suits the learnable parameters of a deep learning model.
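The paragraph above argues that sine and cosine keep every encoding value within [-1, 1]. Below is a minimal standalone sketch of that check; it rebuilds a small sinusoidal table with the standard formula rather than calling the file's `get_positional_encoding`, and the sizes 8 and 16 are chosen only for illustration.

~~~python
import numpy as np

# Minimal sketch (not part of the original file): build a small sinusoidal
# positional-encoding table and confirm every value lies in [-1, 1].
max_seq_len, d_model = 8, 16  # small sizes just for the check

pos = np.arange(max_seq_len)[:, None]            # (max_seq_len, 1)
i = np.arange(d_model)[None, :]                  # (1, d_model)
angle = pos / np.power(10000, 2 * (i // 2) / d_model)

pe = np.zeros((max_seq_len, d_model))
pe[:, 0::2] = np.sin(angle[:, 0::2])             # even dims use sin
pe[:, 1::2] = np.cos(angle[:, 1::2])             # odd dims use cos

print(pe.shape)                                  # (8, 16)
print(pe.min() >= -1.0 and pe.max() <= 1.0)      # True: values stay bounded
~~~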