### BERT Code Walkthrough

#### Data Reading Module

The class that handles the MRPC data:

~~~python
class MrpcProcessor(DataProcessor):
  """Processor for the MRPC data set (GLUE version)."""

  def get_train_examples(self, data_dir):
    """See base class."""
    return self._create_examples(
        self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")

  def get_dev_examples(self, data_dir):
    """See base class."""
    return self._create_examples(
        self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")

  def get_test_examples(self, data_dir):
    """See base class."""
    return self._create_examples(
        self._read_tsv(os.path.join(data_dir, "test.tsv")), "test")

  def get_labels(self):
    """See base class."""
    return ["0", "1"]  # binary classification, so the label set is just "0" and "1"

  def _create_examples(self, lines, set_type):
    """Creates examples for the training and dev sets."""
    examples = []
    for (i, line) in enumerate(lines):
      if i == 0:
        continue  # skip the TSV header row
      guid = "%s-%s" % (set_type, i)
      text_a = tokenization.convert_to_unicode(line[3])  # text_a and text_b come from columns 3 and 4 of the row
      text_b = tokenization.convert_to_unicode(line[4])
      if set_type == "test":
        label = "0"
      else:
        label = tokenization.convert_to_unicode(line[0])  # column 0 holds the label
      examples.append(
          InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
    return examples
~~~
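To make the TSV column layout concrete, here is a small self-contained sketch (not from the BERT repo; the rows and the `InputExample` stand-in are made up) of what `_create_examples` produces for MRPC-style rows, where column 0 is the label and columns 3/4 are the sentence pair:

~~~python
from collections import namedtuple

# Stand-in for run_classifier.InputExample, only for this illustration.
InputExample = namedtuple("InputExample", ["guid", "text_a", "text_b", "label"])

# Made-up MRPC-style rows: column 0 is the label,
# columns 3 and 4 are the two sentences being compared.
rows = [
    ["header", "...", "...", "...", "..."],  # the first row is the header and gets skipped
    ["1", "id_a1", "id_b1", "He said the food was good .", "The food was praised by him ."],
    ["0", "id_a2", "id_b2", "Shares fell sharply today .", "The weather was sunny today ."],
]

examples = []
for i, line in enumerate(rows):
    if i == 0:
        continue
    examples.append(InputExample(guid="train-%d" % i,
                                 text_a=line[3], text_b=line[4], label=line[0]))

for ex in examples:
    print(ex.guid, ex.label, "|", ex.text_a, "||", ex.text_b)
~~~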
The code that reads the training data:

~~~python
if FLAGS.do_train:
  train_examples = processor.get_train_examples(FLAGS.data_dir)
  num_train_steps = int(
      len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs)  # total number of training steps: number of examples / train_batch_size, multiplied by the number of epochs
  num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)  # warmup: keep the learning rate small at first and ramp it back up to the configured value after this fraction of the steps
~~~
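To sanity-check the step arithmetic, a tiny sketch with illustrative numbers (assumptions, not taken from a real run; MRPC has roughly 3,668 training pairs):

~~~python
num_examples = 3668        # roughly the size of the MRPC training set
train_batch_size = 32
num_train_epochs = 3.0
warmup_proportion = 0.1

num_train_steps = int(num_examples / train_batch_size * num_train_epochs)
num_warmup_steps = int(num_train_steps * warmup_proportion)
print(num_train_steps, num_warmup_steps)   # 343 steps total, the first 34 of them warmup
~~~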
#### Data Preprocessing Module

~~~python
# Continuing from the previous snippet
file_based_convert_examples_to_features(
    train_examples, label_list, FLAGS.max_seq_length, tokenizer, train_file)


# Ctrl+click file_based_convert_examples_to_features to jump to its definition
def file_based_convert_examples_to_features(
    examples, label_list, max_seq_length, tokenizer, output_file):
  """Convert a set of `InputExample`s to a TFRecord file."""

  writer = tf.python_io.TFRecordWriter(output_file)  # BERT expects its input as TFRecord files

  for (ex_index, example) in enumerate(examples):
    if ex_index % 10000 == 0:
      tf.logging.info("Writing example %d of %d" % (ex_index, len(examples)))  # loop over the examples one by one

    feature = convert_single_example(ex_index, example, label_list,
                                     max_seq_length, tokenizer)  # Ctrl+click convert_single_example to jump to it


def convert_single_example(ex_index, example, label_list, max_seq_length,
                           tokenizer):
  """Converts a single `InputExample` into a single `InputFeatures`."""

  if isinstance(example, PaddingInputExample):
    return InputFeatures(
        input_ids=[0] * max_seq_length,
        input_mask=[0] * max_seq_length,
        segment_ids=[0] * max_seq_length,
        label_id=0,
        is_real_example=False)

  label_map = {}  # build the label -> id mapping ("0" -> 0, "1" -> 1)
  for (i, label) in enumerate(label_list):
    label_map[label] = i

  tokens_a = tokenizer.tokenize(example.text_a)  # Ctrl+click tokenize; tokenizes the first sentence
  tokens_b = None
  if example.text_b:  # tokenize the second sentence
    tokens_b = tokenizer.tokenize(example.text_b)

  if tokens_b:
    # Modifies `tokens_a` and `tokens_b` in place so that the total
    # length is less than the specified length.
    # Account for [CLS], [SEP], [SEP] with "- 3"  (reserve room for the 3 special tokens)
    _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)  # truncate the pair if it is too long
  else:  # without text_b we only need to reserve room for 2 special tokens
    # Account for [CLS] and [SEP] with "- 2"
    if len(tokens_a) > max_seq_length - 2:
      tokens_a = tokens_a[0:(max_seq_length - 2)]

  # The convention in BERT is:
  # (a) For sequence pairs: the pair starts with [CLS] and is split by [SEP];
  #     type_ids are 0 for the first sentence and 1 for the second
  #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
  #  type_ids: 0     0  0    0    0     0       0 0     1  1  1  1   1 1


# In tokenization.py, FullTokenizer.tokenize:
def tokenize(self, text):
  split_tokens = []
  for token in self.basic_tokenizer.tokenize(text):  # WordPiece: split a word into several pieces so each piece carries more reusable meaning
    for sub_token in self.wordpiece_tokenizer.tokenize(token):
      split_tokens.append(sub_token)

  return split_tokens
~~~
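`wordpiece_tokenizer` splits each word greedily, always taking the longest piece that is still in the vocabulary. A stripped-down sketch of that idea (toy vocabulary; the real tokenizer in tokenization.py also handles lower-casing, punctuation, unknown characters and a maximum word length):

~~~python
def toy_wordpiece(word, vocab):
    # Greedy longest-match-first splitting into vocabulary pieces.
    pieces = []
    start = 0
    while start < len(word):
        end = len(word)
        cur = None
        while start < end:
            piece = word[start:end]
            if start > 0:
                piece = "##" + piece  # continuation pieces are prefixed with ##
            if piece in vocab:
                cur = piece
                break
            end -= 1
        if cur is None:
            return ["[UNK]"]  # no decomposition found
        pieces.append(cur)
        start = end
    return pieces

vocab = {"jack", "##son", "##ville", "un", "##aff", "##able"}
print(toy_wordpiece("jacksonville", vocab))  # ['jack', '##son', '##ville']
print(toy_wordpiece("unaffable", vocab))     # ['un', '##aff', '##able']
~~~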
#### Building the TFRecords

~~~~python
# Continuing inside convert_single_example
# Build the final inputs: two lists collect the tokens and the segment ids
tokens = []
segment_ids = []
tokens.append("[CLS]")  # the first token is always [CLS]
segment_ids.append(0)   # so the first segment id is always 0
for token in tokens_a:
  tokens.append(token)
  segment_ids.append(0)  # every token of sentence a (the first sentence) gets segment id 0
tokens.append("[SEP]")   # after sentence a, append the [SEP] separator
segment_ids.append(0)    # the [SEP] after sentence a also gets segment id 0

if tokens_b:
  for token in tokens_b:
    tokens.append(token)
    segment_ids.append(1)  # sentence b works the same way, except its segment id is 1
  tokens.append("[SEP]")
  segment_ids.append(1)

input_ids = tokenizer.convert_tokens_to_ids(tokens)  # map tokens to ids, i.e. indices into the vocab file

# The mask has 1 for real tokens and 0 for padding tokens. Only real
# tokens are attended to.
input_mask = [1] * len(input_ids)

# Zero-pad up to the sequence length: every input must end up the same length
while len(input_ids) < max_seq_length:  # pad with 0 until the configured maximum length is reached
  input_ids.append(0)
  input_mask.append(0)
  segment_ids.append(0)

assert len(input_ids) == max_seq_length
assert len(input_mask) == max_seq_length
assert len(segment_ids) == max_seq_length

label_id = label_map[example.label]
if ex_index < 5:
  tf.logging.info("*** Example ***")  # log the first few examples; at this point the preprocessing is essentially done
  ...
return feature
~~~~
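Putting the pieces together, here is a toy, self-contained illustration (made-up tokens and a fake vocabulary instead of vocab.txt) of the three parallel lists that `convert_single_example` ends up with after padding:

~~~python
max_seq_length = 16
tokens_a = ["is", "this", "jack", "##son", "##ville", "?"]
tokens_b = ["no", "it", "is", "not", "."]

tokens = ["[CLS]"] + tokens_a + ["[SEP]"] + tokens_b + ["[SEP]"]
segment_ids = [0] * (len(tokens_a) + 2) + [1] * (len(tokens_b) + 1)

fake_vocab = {tok: idx + 1 for idx, tok in enumerate(sorted(set(tokens)))}  # stand-in for vocab.txt; 0 stays the padding id
input_ids = [fake_vocab[tok] for tok in tokens]
input_mask = [1] * len(input_ids)

while len(input_ids) < max_seq_length:  # zero-pad all three lists to max_seq_length
    input_ids.append(0)
    input_mask.append(0)
    segment_ids.append(0)

print(tokens)
print(input_ids)
print(input_mask)
print(segment_ids)
assert len(input_ids) == len(input_mask) == len(segment_ids) == max_seq_length
~~~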
Returning to the caller, `file_based_convert_examples_to_features`, which converts every example with `convert_single_example` and writes the results out:

~~~python
for (ex_index, example) in enumerate(examples):  # keep iterating over the examples
  if ex_index % 10000 == 0:
    tf.logging.info("Writing example %d of %d" % (ex_index, len(examples)))

  feature = convert_single_example(ex_index, example, label_list,
                                   max_seq_length, tokenizer)  # Ctrl+click convert_single_example to jump to it

  def create_int_feature(values):
    f = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))
    return f

  features = collections.OrderedDict()  # now pack everything into the format the model expects
  features["input_ids"] = create_int_feature(feature.input_ids)
  features["input_mask"] = create_int_feature(feature.input_mask)
  features["segment_ids"] = create_int_feature(feature.segment_ids)
  features["label_ids"] = create_int_feature([feature.label_id])
  features["is_real_example"] = create_int_feature(
      [int(feature.is_real_example)])

  tf_example = tf.train.Example(features=tf.train.Features(feature=features))  # finally wrap it in TF's Example format
  writer.write(tf_example.SerializeToString())
writer.close()
~~~
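For reference, a minimal round-trip sketch of the `tf.train.Example` serialization used above (the ids are made up); it only relies on the `tf.train` protobuf classes, so it can be run on its own to inspect what actually lands in the TFRecord file:

~~~python
import collections
import tensorflow as tf

def create_int_feature(values):
    return tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))

features = collections.OrderedDict()
features["input_ids"] = create_int_feature([101, 2003, 2023, 102, 0, 0])  # hypothetical ids
features["input_mask"] = create_int_feature([1, 1, 1, 1, 0, 0])
features["segment_ids"] = create_int_feature([0, 0, 0, 0, 0, 0])
features["label_ids"] = create_int_feature([1])

tf_example = tf.train.Example(features=tf.train.Features(feature=features))
serialized = tf_example.SerializeToString()

# Round-trip: parse the bytes back with the protobuf API and inspect the fields.
parsed = tf.train.Example.FromString(serialized)
print(list(parsed.features.feature["input_ids"].int64_list.value))
print(list(parsed.features.feature["label_ids"].int64_list.value))
~~~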
#### What the Embedding Layer Does

~~~python
def create_model(bert_config, is_training, input_ids, input_mask, segment_ids,
                 labels, num_labels, use_one_hot_embeddings):
  """Creates a classification model."""
  model = modeling.BertModel(  # Ctrl+click BertModel to jump to its definition
      config=bert_config,  # the model configuration
      is_training=is_training,
      input_ids=input_ids,  # the input token ids
      input_mask=input_mask,  # the 0/1 mask over the input
      token_type_ids=segment_ids,  # marks whether a position belongs to the first or the second sentence
      use_one_hot_embeddings=use_one_hot_embeddings)
  ...


class BertModel(object):
  """BERT model ("Bidirectional Encoder Representations from Transformers").

  Example usage:

  ```python
  # Already been converted into WordPiece token ids
  input_ids = tf.constant([[31, 51, 99], [15, 5, 0]])
  input_mask = tf.constant([[1, 1, 1], [1, 1, 0]])
  token_type_ids = tf.constant([[0, 0, 1], [0, 2, 0]])

  config = modeling.BertConfig(vocab_size=32000, hidden_size=512,
    num_hidden_layers=8, num_attention_heads=6, intermediate_size=1024)

  model = modeling.BertModel(config=config, is_training=True,
    input_ids=input_ids, input_mask=input_mask, token_type_ids=token_type_ids)

  label_embeddings = tf.get_variable(...)
  pooled_output = model.get_pooled_output()
  logits = tf.matmul(pooled_output, label_embeddings)
  ...
  ```
  """

  def __init__(self,
               config,
               is_training,
               input_ids,
               input_mask=None,
               token_type_ids=None,
               use_one_hot_embeddings=False,
               scope=None):
    """Constructor for BertModel.

    Args:
      config: `BertConfig` instance.
      is_training: bool. true for training model, false for eval model. Controls
        whether dropout will be applied.
      input_ids: int32 Tensor of shape [batch_size, seq_length].
      input_mask: (optional) int32 Tensor of shape [batch_size, seq_length].
      token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length].
      use_one_hot_embeddings: (optional) bool. Whether to use one-hot word
        embeddings or tf.embedding_lookup() for the word embeddings.
      scope: (optional) variable scope. Defaults to "bert".

    Raises:
      ValueError: The config is invalid or one of the input tensor shapes
        is invalid.
    """
    config = copy.deepcopy(config)
    if not is_training:
      config.hidden_dropout_prob = 0.0
      config.attention_probs_dropout_prob = 0.0

    input_shape = get_shape_list(input_ids, expected_rank=2)
    batch_size = input_shape[0]
    seq_length = input_shape[1]

    if input_mask is None:  # if no mask is given, default to all ones
      input_mask = tf.ones(shape=[batch_size, seq_length], dtype=tf.int32)

    if token_type_ids is None:  # if no token types are given, treat everything as a single sentence
      token_type_ids = tf.zeros(shape=[batch_size, seq_length], dtype=tf.int32)

    with tf.variable_scope(scope, default_name="bert"):
      with tf.variable_scope("embeddings"):
        # Perform embedding lookup on the word ids (the word embeddings).
        (self.embedding_output, self.embedding_table) = embedding_lookup(  # Ctrl+click embedding_lookup to jump to it
            input_ids=input_ids,  # the token ids
            vocab_size=config.vocab_size,  # size of the vocabulary
            embedding_size=config.hidden_size,  # dimensionality each id is mapped to
            initializer_range=config.initializer_range,  # weight initialization range
            word_embedding_name="word_embeddings",
            use_one_hot_embeddings=use_one_hot_embeddings)
~~~
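Downstream, `create_model` puts a classification head on top of the pooled `[CLS]` output, as the docstring example above hints. A numpy sketch of that last step with hypothetical shapes (not the actual TensorFlow code):

~~~python
import numpy as np

rng = np.random.default_rng(0)
batch_size, hidden_size, num_labels = 2, 768, 2                   # hypothetical sizes

pooled_output = rng.standard_normal((batch_size, hidden_size))    # stand-in for model.get_pooled_output()
output_weights = rng.standard_normal((num_labels, hidden_size)) * 0.02
output_bias = np.zeros(num_labels)

logits = pooled_output @ output_weights.T + output_bias           # [batch_size, num_labels]
probs = np.exp(logits - logits.max(axis=-1, keepdims=True))
probs = probs / probs.sum(axis=-1, keepdims=True)                 # softmax over the two classes
print(logits.shape, probs.round(3))
~~~

Below is the `embedding_lookup` function that the constructor calls first: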
~~~python
def embedding_lookup(input_ids,
                     vocab_size,
                     embedding_size=128,
                     initializer_range=0.02,
                     word_embedding_name="word_embeddings",
                     use_one_hot_embeddings=False):
  """Looks up words embeddings for id tensor.

  Args:
    input_ids: int32 Tensor of shape [batch_size, seq_length] containing word
      ids.
    vocab_size: int. Size of the embedding vocabulary.
    embedding_size: int. Width of the word embeddings.
    initializer_range: float. Embedding initialization range.
    word_embedding_name: string. Name of the embedding table.
    use_one_hot_embeddings: bool. If True, use one-hot method for word
      embeddings. If False, use `tf.gather()`.

  Returns:
    float Tensor of shape [batch_size, seq_length, embedding_size].
  """
  # This function assumes that the input is of shape [batch_size, seq_length,
  # num_inputs].
  #
  # If the input is a 2D tensor of shape [batch_size, seq_length], we
  # reshape to [batch_size, seq_length, 1].
  if input_ids.shape.ndims == 2:
    input_ids = tf.expand_dims(input_ids, axis=[-1])

  embedding_table = tf.get_variable(  # the word embedding matrix
      name=word_embedding_name,  # name of the word embedding variable
      shape=[vocab_size, embedding_size],  # one row for every entry of the vocab.txt vocabulary
      initializer=create_initializer(initializer_range))

  flat_input_ids = tf.reshape(input_ids, [-1])
  if use_one_hot_embeddings:
    one_hot_input_ids = tf.one_hot(flat_input_ids, depth=vocab_size)  # one-hot encode every id
    output = tf.matmul(one_hot_input_ids, embedding_table)  # a single matmul looks up the embeddings for the whole batch
  else:
    output = tf.gather(embedding_table, flat_input_ids)

  input_shape = get_shape_list(input_ids)

  output = tf.reshape(output,
                      input_shape[0:-1] + [input_shape[-1] * embedding_size])  # reshape into the final result
  return (output, embedding_table)  # return: every word id has been turned into a vector
~~~
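A toy numpy check of why the two branches are equivalent: multiplying a one-hot matrix by the embedding table selects exactly the same rows that `tf.gather` would:

~~~python
import numpy as np

rng = np.random.default_rng(0)
vocab_size, embedding_size = 10, 4                    # toy sizes
embedding_table = rng.standard_normal((vocab_size, embedding_size))

input_ids = np.array([[3, 1, 9], [0, 5, 5]])          # [batch_size, seq_length]
flat_ids = input_ids.reshape(-1)

gathered = embedding_table[flat_ids]                  # the tf.gather branch
one_hot = np.eye(vocab_size)[flat_ids]                # the one-hot branch
matmul = one_hot @ embedding_table

assert np.allclose(gathered, matmul)                  # both branches produce the same lookup
output = gathered.reshape(input_ids.shape + (embedding_size,))
print(output.shape)                                   # (2, 3, 4) = [batch_size, seq_length, embedding_size]
~~~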
#### Positional Encoding

~~~python
class BertModel(object):
  """BERT model ("Bidirectional Encoder Representations from Transformers")."""
  # (the class and constructor docstrings are identical to the ones shown in the previous section)

  def __init__(self,
               config,
               is_training,
               input_ids,
               input_mask=None,
               token_type_ids=None,
               use_one_hot_embeddings=False,
               scope=None):
    ...

        # Add positional embeddings and token type embeddings, then layer
        # normalize and perform dropout.
        self.embedding_output = embedding_postprocessor(  # builds the token-type and position encodings; Ctrl+click embedding_postprocessor to jump to it
            input_tensor=self.embedding_output,
            use_token_type=True,
            token_type_ids=token_type_ids,
            token_type_vocab_size=config.type_vocab_size,
            token_type_embedding_name="token_type_embeddings",
            use_position_embeddings=True,
            position_embedding_name="position_embeddings",
            initializer_range=config.initializer_range,
            max_position_embeddings=config.max_position_embeddings,
            dropout_prob=config.hidden_dropout_prob)
~~~
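Before looking inside `embedding_postprocessor`, a toy numpy sketch of the two additive pieces it contributes on top of the word embeddings, segment (token-type) embeddings and position embeddings; the sizes are made up:

~~~python
import numpy as np

rng = np.random.default_rng(0)
batch_size, seq_length, width = 2, 6, 4                # toy sizes
max_position_embeddings, type_vocab_size = 512, 2

word_emb = rng.standard_normal((batch_size, seq_length, width))       # output of embedding_lookup
token_type_ids = np.array([[0, 0, 0, 1, 1, 1], [0, 0, 1, 1, 1, 1]])

token_type_table = rng.standard_normal((type_vocab_size, width))
full_position_table = rng.standard_normal((max_position_embeddings, width))

output = word_emb
output = output + token_type_table[token_type_ids]                    # one segment embedding per position
output = output + full_position_table[:seq_length][None, :, :]        # slice the first seq_length rows, broadcast over the batch
print(output.shape)                                                   # (2, 6, 4), same shape as the word embeddings
~~~

The actual implementation: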
~~~python
def embedding_postprocessor(input_tensor,
                            use_token_type=False,
                            token_type_ids=None,
                            token_type_vocab_size=16,
                            token_type_embedding_name="token_type_embeddings",
                            use_position_embeddings=True,
                            position_embedding_name="position_embeddings",
                            initializer_range=0.02,
                            max_position_embeddings=512,
                            dropout_prob=0.1):
  """Performs various post-processing on a word embedding tensor.

  Args:
    input_tensor: float Tensor of shape [batch_size, seq_length,
      embedding_size].
    use_token_type: bool. Whether to add embeddings for `token_type_ids`.
    token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length].
      Must be specified if `use_token_type` is True.
    token_type_vocab_size: int. The vocabulary size of `token_type_ids`.
    token_type_embedding_name: string. The name of the embedding table variable
      for token type ids.
    use_position_embeddings: bool. Whether to add position embeddings for the
      position of each token in the sequence.
    position_embedding_name: string. The name of the embedding table variable
      for positional embeddings.
    initializer_range: float. Range of the weight initialization.
    max_position_embeddings: int. Maximum sequence length that might ever be
      used with this model. This can be longer than the sequence length of
      input_tensor, but cannot be shorter.
    dropout_prob: float. Dropout probability applied to the final output tensor.

  Returns:
    float tensor with same shape as `input_tensor`.

  Raises:
    ValueError: One of the tensor shapes or input values is invalid.
  """
  input_shape = get_shape_list(input_tensor, expected_rank=3)
  batch_size = input_shape[0]
  seq_length = input_shape[1]
  width = input_shape[2]

  output = input_tensor

  if use_token_type:  # add token-type (segment) embeddings: first sentence vs. second sentence
    if token_type_ids is None:
      raise ValueError("`token_type_ids` must be specified if"
                       "`use_token_type` is True.")
    token_type_table = tf.get_variable(
        name=token_type_embedding_name,
        shape=[token_type_vocab_size, width],
        initializer=create_initializer(initializer_range))
    # This vocab will be small so we always do one-hot here, since it is always
    # faster for a small vocabulary.
    flat_token_type_ids = tf.reshape(token_type_ids, [-1])
    one_hot_ids = tf.one_hot(flat_token_type_ids, depth=token_type_vocab_size)
    token_type_embeddings = tf.matmul(one_hot_ids, token_type_table)
    token_type_embeddings = tf.reshape(token_type_embeddings,
                                       [batch_size, seq_length, width])
    output += token_type_embeddings

  if use_position_embeddings:  # optionally add the position embeddings
    assert_op = tf.assert_less_equal(seq_length, max_position_embeddings)
    with tf.control_dependencies([assert_op]):
      full_position_embeddings = tf.get_variable(
          name=position_embedding_name,
          shape=[max_position_embeddings, width],
          initializer=create_initializer(initializer_range))
      # Since the position embedding table is a learned variable, we create it
      # using a (long) sequence length `max_position_embeddings`. The actual
      # sequence length might be shorter than this, for faster training of
      # tasks that do not have long sequences.
      #
      # So `full_position_embeddings` is effectively an embedding table
      # for position [0, 1, 2, ..., max_position_embeddings-1], and the current
      # sequence has positions [0, 1, 2, ... seq_length-1], so we can just
      # perform a slice.
      position_embeddings = tf.slice(full_position_embeddings, [0, 0],
                                     [seq_length, -1])  # the table may be larger than needed; for speed, slice out only the first seq_length rows
      num_dims = len(output.shape.as_list())

      # Only the last two dimensions are relevant (`seq_length` and `width`), so
      # we broadcast among the first dimensions, which is typically just
      # the batch size.
      position_broadcast_shape = []
      for _ in range(num_dims - 2):
        position_broadcast_shape.append(1)
      position_broadcast_shape.extend([seq_length, width])
      position_embeddings = tf.reshape(position_embeddings,
                                       position_broadcast_shape)
      output += position_embeddings

  output = layer_norm_and_dropout(output, dropout_prob)
  return output
~~~
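The final `layer_norm_and_dropout` call applies TensorFlow's layer normalization followed by dropout. As a reminder of what the normalization itself computes, a minimal numpy sketch (normalizing over the hidden dimension; `gamma`/`beta` here are fixed scalars rather than the learned vectors used in the real layer):

~~~python
import numpy as np

def toy_layer_norm(x, gamma=1.0, beta=0.0, eps=1e-12):
    # Normalize over the last axis (the hidden dimension),
    # then apply the scale (gamma) and shift (beta).
    mean = x.mean(axis=-1, keepdims=True)
    var = x.var(axis=-1, keepdims=True)
    return gamma * (x - mean) / np.sqrt(var + eps) + beta

rng = np.random.default_rng(0)
embeddings = rng.standard_normal((2, 6, 4))        # [batch_size, seq_length, width]
normed = toy_layer_norm(embeddings)

print(normed.mean(axis=-1).round(6))               # ~0 for every position
print(normed.std(axis=-1).round(6))                # ~1 for every position
~~~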