|
|
|
@ -47,8 +47,8 @@ def get_positional_encoding(max_seq_len, d_model):
|
|
|
|
|
position_enc[1:, 1::2] = np.cos(position_enc[1:, 1::2]) # dim 2i+1
|
|
|
|
|
return torch.from_numpy(position_enc).type(torch.FloatTensor)
|
|
|
|
|
|
|
|
|
|
# 假设我们的模型维度是768,最大序列长度是4
|
|
|
|
|
max_seq_len = 4
|
|
|
|
|
# 假设我们的模型维度是768,最大序列长度是1024
|
|
|
|
|
max_seq_len = 1024
|
|
|
|
|
d_model = 768
|
|
|
|
|
positional_encoding = get_positional_encoding(max_seq_len, d_model)
|
|
|
|
|
print(positional_encoding)
|
|
|
|
@ -59,12 +59,13 @@ tensor([[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00,
|
|
|
|
|
1.0243e-04, 1.0000e+00],
|
|
|
|
|
[ 9.0930e-01, -4.1615e-01, 9.2799e-01, ..., 1.0000e+00,
|
|
|
|
|
2.0486e-04, 1.0000e+00],
|
|
|
|
|
[ 1.4112e-01, -9.8999e-01, 2.1109e-01, ..., 1.0000e+00,
|
|
|
|
|
3.0728e-04, 1.0000e+00]])
|
|
|
|
|
...
|
|
|
|
|
"""
|
|
|
|
|
~~~
|
|
|
|
|
|
|
|
|
|
<img src="../assets/image-20240427173643871.png" alt="image-20240427173643871" style="zoom:50%;" />
|
|
|
|
|
<img src="../assets/image-20240427180449855.png" alt="image-20240427180449855" style="zoom:50%;" />
|
|
|
|
|
|
|
|
|
|
> 我们上面的参数也参考了 GPT-2,比如 max_seq_len=1024、d_model=768。
|
|
|
|
|
|
|
|
|
|
为什么使用正弦和余弦函数?对于正弦函数(sin):最大值是 1,最小值是 -1;对于余弦函数(cos):最大值也是 1,最小值也是 -1。也就是说,它们可以保证编码值的范围有界且数值较小,与深度学习模型中可学习参数的数值范围相近,有利于训练的稳定。
|
|
|
|
|
|
|
|
|
|