  writer.close()
~~~

#### The role of the Embedding layer

The embedding layer maps each WordPiece token id to a dense vector of `hidden_size` dimensions; BERT then adds segment (token type) and position embeddings on top of it. In `create_model` (run_classifier.py), the features built above are handed to `BertModel`:
~~~python
def create_model(bert_config, is_training, input_ids, input_mask, segment_ids,
                 labels, num_labels, use_one_hot_embeddings):
  """Creates a classification model."""
  model = modeling.BertModel(  # Ctrl-click BertModel to jump to its definition
      config=bert_config,  # model configuration
      is_training=is_training,
      input_ids=input_ids,  # token-id features
      input_mask=input_mask,  # 0/1 mask over the tokens
      token_type_ids=segment_ids,  # marks whether each token belongs to the first or the second sentence
      use_one_hot_embeddings=use_one_hot_embeddings)
  ...
~~~
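
The `...` above elides the rest of `create_model`, which puts a softmax classification head on top of BERT's pooled `[CLS]` output. A minimal sketch of such a head, assuming the `labels`, `num_labels`, and `is_training` arguments from the signature above (names like `output_weights` and `output_bias` are illustrative, not quoted from the repo):

~~~python
# Sketch of a classification head over the pooled [CLS] vector.
output_layer = model.get_pooled_output()  # [batch_size, hidden_size]
hidden_size = output_layer.shape[-1].value

output_weights = tf.get_variable(
    "output_weights", [num_labels, hidden_size],
    initializer=tf.truncated_normal_initializer(stddev=0.02))
output_bias = tf.get_variable(
    "output_bias", [num_labels], initializer=tf.zeros_initializer())

with tf.variable_scope("loss"):
  if is_training:
    output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)  # dropout only while training
  logits = tf.nn.bias_add(
      tf.matmul(output_layer, output_weights, transpose_b=True),
      output_bias)  # [batch_size, num_labels]
  log_probs = tf.nn.log_softmax(logits, axis=-1)
  one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32)
  per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)  # per-example cross-entropy
  loss = tf.reduce_mean(per_example_loss)
~~~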

Ctrl-clicking `BertModel` jumps to its definition in modeling.py:

~~~python
class BertModel(object):
  """BERT model ("Bidirectional Encoder Representations from Transformers").

  Example usage:

  ```python
  # Already been converted into WordPiece token ids
  input_ids = tf.constant([[31, 51, 99], [15, 5, 0]])
  input_mask = tf.constant([[1, 1, 1], [1, 1, 0]])
  token_type_ids = tf.constant([[0, 0, 1], [0, 2, 0]])

  config = modeling.BertConfig(vocab_size=32000, hidden_size=512,
    num_hidden_layers=8, num_attention_heads=6, intermediate_size=1024)

  model = modeling.BertModel(config=config, is_training=True,
    input_ids=input_ids, input_mask=input_mask, token_type_ids=token_type_ids)

  label_embeddings = tf.get_variable(...)
  pooled_output = model.get_pooled_output()
  logits = tf.matmul(pooled_output, label_embeddings)
  ...
  ```
  """

  def __init__(self,
               config,
               is_training,
               input_ids,
               input_mask=None,
               token_type_ids=None,
               use_one_hot_embeddings=False,
               scope=None):
    """Constructor for BertModel.

    Args:
      config: `BertConfig` instance.
      is_training: bool. true for training model, false for eval model.
        Controls whether dropout will be applied.
      input_ids: int32 Tensor of shape [batch_size, seq_length].
      input_mask: (optional) int32 Tensor of shape [batch_size, seq_length].
      token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length].
      use_one_hot_embeddings: (optional) bool. Whether to use one-hot word
        embeddings or tf.embedding_lookup() for the word embeddings.
      scope: (optional) variable scope. Defaults to "bert".

    Raises:
      ValueError: The config is invalid or one of the input tensor shapes
        is invalid.
    """
    config = copy.deepcopy(config)
    if not is_training:
      # At eval time, disable dropout by zeroing both dropout probabilities.
      config.hidden_dropout_prob = 0.0
      config.attention_probs_dropout_prob = 0.0

    input_shape = get_shape_list(input_ids, expected_rank=2)
    batch_size = input_shape[0]
    seq_length = input_shape[1]

    if input_mask is None:  # if no mask is given, default to all ones
      input_mask = tf.ones(shape=[batch_size, seq_length], dtype=tf.int32)

    if token_type_ids is None:  # if not given, treat the input as a single sentence
      token_type_ids = tf.zeros(shape=[batch_size, seq_length], dtype=tf.int32)

    with tf.variable_scope(scope, default_name="bert"):
      with tf.variable_scope("embeddings"):
        # Perform embedding lookup on the word ids (token embeddings).
        (self.embedding_output, self.embedding_table) = embedding_lookup(  # Ctrl-click embedding_lookup to jump to its definition
            input_ids=input_ids,  # token ids
            vocab_size=config.vocab_size,  # vocabulary size
            embedding_size=config.hidden_size,  # dimensionality each id is mapped to
            initializer_range=config.initializer_range,  # initializer range
            word_embedding_name="word_embeddings",
            use_one_hot_embeddings=use_one_hot_embeddings)
~~~
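
Both the constructor above and `embedding_lookup` below call `get_shape_list`, a small utility in modeling.py: it returns the static dimension wherever the graph already knows it, and falls back to a dynamic `tf.shape` slice otherwise. A simplified sketch of the idea (the real version also validates `expected_rank`):

~~~python
import tensorflow as tf

# Simplified sketch of get_shape_list; assumption: the real modeling.py
# version additionally checks expected_rank before returning.
def get_shape_list_sketch(tensor):
  static = tensor.shape.as_list()  # Python ints, None for unknown dims
  dynamic = tf.shape(tensor)       # always resolvable at run time
  return [dim if dim is not None else dynamic[i]
          for i, dim in enumerate(static)]
~~~

This is why `batch_size` and `seq_length` can be passed straight into `tf.ones(...)` and `tf.zeros(...)`: each element is either a plain int or a scalar tensor, and both are accepted as shape components.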

Ctrl-clicking `embedding_lookup` in the constructor jumps to its definition, also in modeling.py:

~~~python
def embedding_lookup(input_ids,
                     vocab_size,
                     embedding_size=128,
                     initializer_range=0.02,
                     word_embedding_name="word_embeddings",
                     use_one_hot_embeddings=False):
  """Looks up words embeddings for id tensor.

  Args:
    input_ids: int32 Tensor of shape [batch_size, seq_length] containing word
      ids.
    vocab_size: int. Size of the embedding vocabulary.
    embedding_size: int. Width of the word embeddings.
    initializer_range: float. Embedding initialization range.
    word_embedding_name: string. Name of the embedding table.
    use_one_hot_embeddings: bool. If True, use one-hot method for word
      embeddings. If False, use `tf.gather()`.

  Returns:
    float Tensor of shape [batch_size, seq_length, embedding_size].
  """
  # This function assumes that the input is of shape [batch_size, seq_length,
  # num_inputs].
  #
  # If the input is a 2D tensor of shape [batch_size, seq_length], we
  # reshape to [batch_size, seq_length, 1].
  if input_ids.shape.ndims == 2:
    input_ids = tf.expand_dims(input_ids, axis=[-1])

  embedding_table = tf.get_variable(  # the token-to-vector mapping matrix
      name=word_embedding_name,  # the word embedding table
      shape=[vocab_size, embedding_size],  # one row per entry in vocab.txt
      initializer=create_initializer(initializer_range))

  flat_input_ids = tf.reshape(input_ids, [-1])
  if use_one_hot_embeddings:
    one_hot_input_ids = tf.one_hot(flat_input_ids, depth=vocab_size)  # one-hot encode every token id
    output = tf.matmul(one_hot_input_ids, embedding_table)  # one matmul maps the whole batch at once
  else:
    output = tf.gather(embedding_table, flat_input_ids)

  input_shape = get_shape_list(input_ids)

  output = tf.reshape(output,
                      input_shape[0:-1] + [input_shape[-1] * embedding_size])  # back to [batch_size, seq_length, embedding_size]
  return (output, embedding_table)  # each token id is now a dense vector
~~~
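
The two branches compute the same thing: multiplying a one-hot matrix by the embedding table just selects rows of that table, which is exactly what `tf.gather` does. The one-hot path exists mainly because it tends to be faster on TPUs, while `tf.gather` is the cheaper option on CPU/GPU, hence the `False` default. A quick NumPy sketch of the equivalence:

~~~python
import numpy as np

# One-hot matmul and direct row-gather yield identical embeddings.
vocab_size, embedding_size = 5, 3
table = np.arange(vocab_size * embedding_size,
                  dtype=np.float32).reshape(vocab_size, embedding_size)
ids = np.array([2, 0, 4])  # flat token ids

one_hot = np.eye(vocab_size, dtype=np.float32)[ids]  # [3, vocab_size]
via_matmul = one_hot @ table  # selects rows 2, 0, 4 via matrix product
via_gather = table[ids]       # same rows, fetched directly

assert np.allclose(via_matmul, via_gather)
~~~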