#### Positional Encoding
~~~python
class BertModel(object):
  """BERT model ("Bidirectional Encoder Representations from Transformers").

  Example usage:

  ```python
  # Already been converted into WordPiece token ids
  input_ids = tf.constant([[31, 51, 99], [15, 5, 0]])
  input_mask = tf.constant([[1, 1, 1], [1, 1, 0]])
  token_type_ids = tf.constant([[0, 0, 1], [0, 2, 0]])

  config = modeling.BertConfig(vocab_size=32000, hidden_size=512,
    num_hidden_layers=8, num_attention_heads=6, intermediate_size=1024)

  model = modeling.BertModel(config=config, is_training=True,
    input_ids=input_ids, input_mask=input_mask, token_type_ids=token_type_ids)

  label_embeddings = tf.get_variable(...)
  pooled_output = model.get_pooled_output()
  logits = tf.matmul(pooled_output, label_embeddings)
  ...
  ```
  """

  def __init__(self,
               config,
               is_training,
               input_ids,
               input_mask=None,
               token_type_ids=None,
               use_one_hot_embeddings=False,
               scope=None):
    """Constructor for BertModel.

    Args:
      config: `BertConfig` instance.
      is_training: bool. true for training model, false for eval model. Controls
        whether dropout will be applied.
      input_ids: int32 Tensor of shape [batch_size, seq_length].
      input_mask: (optional) int32 Tensor of shape [batch_size, seq_length].
      token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length].
      use_one_hot_embeddings: (optional) bool. Whether to use one-hot word
        embeddings or tf.embedding_lookup() for the word embeddings.
      scope: (optional) variable scope. Defaults to "bert".

    Raises:
      ValueError: The config is invalid or one of the input tensor shapes
        is invalid.
    """
    ...

        # Add positional embeddings and token type embeddings, then layer
        # normalize and perform dropout.
        # Build the positional encoding here; ctrl-click embedding_postprocessor
        # to jump to its definition.
        self.embedding_output = embedding_postprocessor(
            input_tensor=self.embedding_output,
            use_token_type=True,
            token_type_ids=token_type_ids,
            token_type_vocab_size=config.type_vocab_size,
            token_type_embedding_name="token_type_embeddings",
            use_position_embeddings=True,
            position_embedding_name="position_embeddings",
            initializer_range=config.initializer_range,
            max_position_embeddings=config.max_position_embeddings,
            dropout_prob=config.hidden_dropout_prob)
~~~
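
The docstring example above hard-codes `input_ids`, `input_mask`, and `token_type_ids`. As a reading aid, here is a minimal sketch of how those three tensors line up for one padded sentence pair; the token ids below are made up for illustration and are not the output of a real WordPiece tokenizer:

~~~python
import numpy as np

seq_length = 8
tokens_a = [101, 7592, 2088, 102]  # [CLS] + sentence A + [SEP] (illustrative ids only)
tokens_b = [2129, 2024, 102]       # sentence B + [SEP]         (illustrative ids only)

ids = tokens_a + tokens_b
pad = seq_length - len(ids)

input_ids = np.array(ids + [0] * pad)              # WordPiece ids, zero-padded to seq_length
input_mask = np.array([1] * len(ids) + [0] * pad)  # 1 for real tokens, 0 for padding
token_type_ids = np.array(                         # 0 for sentence A (incl. [CLS] and its [SEP]), 1 for sentence B
    [0] * len(tokens_a) + [1] * len(tokens_b) + [0] * pad)

print(input_ids.tolist())       # [101, 7592, 2088, 102, 2129, 2024, 102, 0]
print(input_mask.tolist())      # [1, 1, 1, 1, 1, 1, 1, 0]
print(token_type_ids.tolist())  # [0, 0, 0, 0, 1, 1, 1, 0]
~~~

`token_type_ids` is exactly what `embedding_postprocessor` consumes below to add the segment (sentence A / sentence B) embeddings.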
~~~python
def embedding_postprocessor(input_tensor,
                            use_token_type=False,
                            token_type_ids=None,
                            token_type_vocab_size=16,
                            token_type_embedding_name="token_type_embeddings",
                            use_position_embeddings=True,
                            position_embedding_name="position_embeddings",
                            initializer_range=0.02,
                            max_position_embeddings=512,
                            dropout_prob=0.1):
  """Performs various post-processing on a word embedding tensor.

  Args:
    input_tensor: float Tensor of shape [batch_size, seq_length,
      embedding_size].
    use_token_type: bool. Whether to add embeddings for `token_type_ids`.
    token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length].
      Must be specified if `use_token_type` is True.
    token_type_vocab_size: int. The vocabulary size of `token_type_ids`.
    token_type_embedding_name: string. The name of the embedding table variable
      for token type ids.
    use_position_embeddings: bool. Whether to add position embeddings for the
      position of each token in the sequence.
    position_embedding_name: string. The name of the embedding table variable
      for positional embeddings.
    initializer_range: float. Range of the weight initialization.
    max_position_embeddings: int. Maximum sequence length that might ever be
      used with this model. This can be longer than the sequence length of
      input_tensor, but cannot be shorter.
    dropout_prob: float. Dropout probability applied to the final output tensor.

  Returns:
    float tensor with same shape as `input_tensor`.

  Raises:
    ValueError: One of the tensor shapes or input values is invalid.
  """
  input_shape = get_shape_list(input_tensor, expected_rank=3)
  batch_size = input_shape[0]
  seq_length = input_shape[1]
  width = input_shape[2]

  output = input_tensor

  if use_token_type:  # distinguish first vs. second sentence and add the corresponding embedding
    if token_type_ids is None:
      raise ValueError("`token_type_ids` must be specified if "
                       "`use_token_type` is True.")
    token_type_table = tf.get_variable(
        name=token_type_embedding_name,
        shape=[token_type_vocab_size, width],
        initializer=create_initializer(initializer_range))
    # This vocab will be small so we always do one-hot here, since it is always
    # faster for a small vocabulary.
    flat_token_type_ids = tf.reshape(token_type_ids, [-1])
    one_hot_ids = tf.one_hot(flat_token_type_ids, depth=token_type_vocab_size)
    token_type_embeddings = tf.matmul(one_hot_ids, token_type_table)
    token_type_embeddings = tf.reshape(token_type_embeddings,
                                       [batch_size, seq_length, width])
    output += token_type_embeddings

  if use_position_embeddings:  # decide whether to add positional encoding information
    assert_op = tf.assert_less_equal(seq_length, max_position_embeddings)
    with tf.control_dependencies([assert_op]):
      full_position_embeddings = tf.get_variable(
          name=position_embedding_name,
          shape=[max_position_embeddings, width],
          initializer=create_initializer(initializer_range))
      # Since the position embedding table is a learned variable, we create it
      # using a (long) sequence length `max_position_embeddings`. The actual
      # sequence length might be shorter than this, for faster training of
      # tasks that do not have long sequences.
      #
      # So `full_position_embeddings` is effectively an embedding table
      # for position [0, 1, 2, ..., max_position_embeddings-1], and the current
      # sequence has positions [0, 1, 2, ... seq_length-1], so we can just
      # perform a slice.
      position_embeddings = tf.slice(full_position_embeddings, [0, 0],
                                     [seq_length, -1])  # if the table is larger than needed, take only the first seq_length rows for speed
      num_dims = len(output.shape.as_list())

      # Only the last two dimensions are relevant (`seq_length` and `width`), so
      # we broadcast among the first dimensions, which is typically just
      # the batch size.
      position_broadcast_shape = []
      for _ in range(num_dims - 2):
        position_broadcast_shape.append(1)
      position_broadcast_shape.extend([seq_length, width])
      position_embeddings = tf.reshape(position_embeddings,
                                       position_broadcast_shape)
      output += position_embeddings

  output = layer_norm_and_dropout(output, dropout_prob)
  return output
~~~
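
To make the data flow above concrete, here is a rough NumPy re-enactment of what `embedding_postprocessor` computes, under simplified assumptions: random matrices stand in for the learned `tf.get_variable` tables, plain indexing replaces the one-hot matmul, and the learned layer-norm scale/offset and dropout are omitted. It is a sketch of the arithmetic, not the real implementation:

~~~python
import numpy as np

batch_size, seq_length, width = 2, 3, 8
max_position_embeddings, token_type_vocab_size = 512, 2

rng = np.random.default_rng(0)
word_embeddings = rng.normal(size=(batch_size, seq_length, width))        # stands in for the embedding_lookup output
token_type_table = 0.02 * rng.normal(size=(token_type_vocab_size, width)) # stands in for the learned segment table
full_position_embeddings = 0.02 * rng.normal(size=(max_position_embeddings, width))  # learned position table

token_type_ids = np.array([[0, 0, 1], [0, 1, 1]])

output = word_embeddings
# Segment embeddings: indexing here is equivalent to the one-hot matmul in the TF code.
output = output + token_type_table[token_type_ids]            # [batch, seq, width]

# Position embeddings: slice the first seq_length rows, then broadcast over the batch.
position_embeddings = full_position_embeddings[:seq_length]   # [seq, width]
output = output + position_embeddings[np.newaxis, :, :]       # [1, seq, width] broadcasts to [batch, seq, width]

# Layer norm over the last axis (layer_norm_and_dropout, minus the learned scale/offset and dropout).
mean = output.mean(axis=-1, keepdims=True)
var = output.var(axis=-1, keepdims=True)
output = (output - mean) / np.sqrt(var + 1e-12)

print(output.shape)  # (2, 3, 8) -- same shape as the input tensor
~~~

The reshape to `[1, seq_length, width]` in the TF code plays the same role as `np.newaxis` here: the same row of the position table is added at position i of every example in the batch.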