num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)  # keep the learning rate small at first, then restore it to the configured value once the warmup proportion of steps has passed
~~~
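A quick sketch of what that warmup comment means. This is a simplified stand-in for the schedule built in `optimization.py` (linear warmup only; the real optimizer also applies polynomial decay after warmup), so treat the numbers and the helper below as illustrative assumptions rather than the library code.

~~~python
# Minimal sketch of linear learning-rate warmup (assumption: mirrors the warmup
# idea in optimization.py, not the actual library code).
num_train_steps = 1000
warmup_proportion = 0.1
init_lr = 5e-5

num_warmup_steps = int(num_train_steps * warmup_proportion)  # 100 warmup steps

def learning_rate_at(step):
  """Scale the learning rate linearly during warmup, then hold it at init_lr.

  The real schedule also decays the rate after warmup; that part is omitted here.
  """
  if step < num_warmup_steps:
    return init_lr * step / num_warmup_steps
  return init_lr

print(learning_rate_at(10))   # 5e-06 -> still small early in training
print(learning_rate_at(500))  # 5e-05 -> back at the configured learning rate
~~~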
#### Data Preprocessing Module
~~~python
# Continuing from the previous snippet
file_based_convert_examples_to_features(
    train_examples, label_list, FLAGS.max_seq_length, tokenizer, train_file)


# Ctrl+click file_based_convert_examples_to_features to jump to its definition
def file_based_convert_examples_to_features(
    examples, label_list, max_seq_length, tokenizer, output_file):
  """Convert a set of `InputExample`s to a TFRecord file."""

  writer = tf.python_io.TFRecordWriter(output_file)  # TFRecord writer; BERT expects its input data in TFRecord format

  for (ex_index, example) in enumerate(examples):
    if ex_index % 10000 == 0:
      tf.logging.info("Writing example %d of %d" % (ex_index, len(examples)))  # the for loop walks through the examples one by one

    feature = convert_single_example(ex_index, example, label_list,
                                     max_seq_length, tokenizer)  # Ctrl+click convert_single_example to jump to its definition


def convert_single_example(ex_index, example, label_list, max_seq_length,
                           tokenizer):
  """Converts a single `InputExample` into a single `InputFeatures`."""

  if isinstance(example, PaddingInputExample):
    return InputFeatures(
        input_ids=[0] * max_seq_length,
        input_mask=[0] * max_seq_length,
        segment_ids=[0] * max_seq_length,
        label_id=0,
        is_real_example=False)

  label_map = {}  # build the label map (e.g. labels "0"/"1")
  for (i, label) in enumerate(label_list):
    label_map[label] = i

  tokens_a = tokenizer.tokenize(example.text_a)  # Ctrl+click tokenize; tokenize the first sentence
  tokens_b = None
  if example.text_b:  # tokenize the second sentence
    tokens_b = tokenizer.tokenize(example.text_b)

  if tokens_b:
    # Modifies `tokens_a` and `tokens_b` in place so that the total
    # length is less than the specified length.
    # Account for [CLS], [SEP], [SEP] with "- 3"  # i.e. reserve room for 3 special tokens
    _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)  # truncate if the pair is too long
  else:  # with no second sentence, reserve room for only 2 special tokens
    # Account for [CLS] and [SEP] with "- 2"
    if len(tokens_a) > max_seq_length - 2:
      tokens_a = tokens_a[0:(max_seq_length - 2)]

  # The convention in BERT is:
  # (a) For sequence pairs:  # the pair below starts with [CLS] and is split by [SEP]; type_ids use 0/1, where 0 marks the first sentence and 1 the second
  #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
  #  type_ids: 0     0  0    0    0     0       0 0     1  1  1  1   1 1


# FullTokenizer.tokenize in tokenization.py (this is where Ctrl+click on tokenize above lands)
def tokenize(self, text):
  split_tokens = []
  for token in self.basic_tokenizer.tokenize(text):  # word slicing: split a word into several sub-pieces so it can express richer meaning
    for sub_token in self.wordpiece_tokenizer.tokenize(token):
      split_tokens.append(sub_token)

  return split_tokens
~~~
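The `tokenize` method above chains a basic tokenizer with a WordPiece tokenizer. The sketch below reimplements that two-stage idea with a tiny hypothetical vocabulary and a deliberately crude basic step (lowercase plus whitespace split), just to show how greedy longest-match-first splitting produces the `jack ##son ##ville` pieces from the comment above; it is not the real `BasicTokenizer`/`WordpieceTokenizer` code.

~~~python
# Two-stage tokenization sketch: basic split, then greedy longest-match WordPiece.
# The vocab and basic_tokenize below are hypothetical simplifications.
vocab = {"is", "this", "?", "jack", "##son", "##ville", "[UNK]"}

def basic_tokenize(text):
  # Rough stand-in for BasicTokenizer: lowercase and split on whitespace.
  return text.lower().split()

def wordpiece_tokenize(token, vocab):
  # Greedy longest-match-first: take the longest prefix found in the vocab,
  # then continue on the remainder with a "##" continuation prefix.
  pieces, start = [], 0
  while start < len(token):
    end, cur = len(token), None
    while start < end:
      piece = token[start:end]
      if start > 0:
        piece = "##" + piece
      if piece in vocab:
        cur = piece
        break
      end -= 1
    if cur is None:
      return ["[UNK]"]  # nothing matched -> unknown token
    pieces.append(cur)
    start = end
  return pieces

split_tokens = []
for token in basic_tokenize("is this Jacksonville ?"):
  for sub_token in wordpiece_tokenize(token, vocab):
    split_tokens.append(sub_token)

print(split_tokens)  # ['is', 'this', 'jack', '##son', '##ville', '?']
~~~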
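To see what `convert_single_example` actually produces, here is a self-contained sketch for one sentence pair. The token lists, the toy vocabulary, and `max_seq_length` below are made-up illustrations (the real code gets tokens from `tokenizer.tokenize()` and ids from the BERT vocab file), but the truncation rule, the `[CLS]`/`[SEP]` layout, the 0/1 `segment_ids`, and the zero padding follow the same logic as the snippet above.

~~~python
# Simplified walk-through of the feature construction in convert_single_example.
# tokens_a / tokens_b and toy_vocab are hypothetical; the rest mirrors the logic above.
max_seq_length = 16

tokens_a = ["is", "this", "jack", "##son", "##ville", "?"]
tokens_b = ["no", "it", "is", "not", "."]

def truncate_seq_pair(tokens_a, tokens_b, max_length):
  # Same idea as _truncate_seq_pair: trim the longer list one token at a time.
  while len(tokens_a) + len(tokens_b) > max_length:
    longer = tokens_a if len(tokens_a) > len(tokens_b) else tokens_b
    longer.pop()

truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)  # keep room for [CLS], [SEP], [SEP]

tokens = ["[CLS]"] + tokens_a + ["[SEP]"] + tokens_b + ["[SEP]"]
segment_ids = [0] * (len(tokens_a) + 2) + [1] * (len(tokens_b) + 1)  # 0 = sentence A, 1 = sentence B

toy_vocab = {tok: i for i, tok in enumerate(dict.fromkeys(tokens), start=1)}  # id 0 reserved for padding
input_ids = [toy_vocab[tok] for tok in tokens]
input_mask = [1] * len(input_ids)  # 1 = real token, 0 = padding

while len(input_ids) < max_seq_length:  # zero-pad up to max_seq_length
  input_ids.append(0)
  input_mask.append(0)
  segment_ids.append(0)

print(tokens)       # ['[CLS]', 'is', 'this', 'jack', '##son', '##ville', '?', '[SEP]', 'no', 'it', 'is', 'not', '.', '[SEP]']
print(segment_ids)  # eight 0s for the first sentence, six 1s for the second, then padding 0s
print(input_mask)   # fourteen 1s followed by two padding 0s
~~~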
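Finally, the `tf.python_io.TFRecordWriter` created at the top of `file_based_convert_examples_to_features` serializes each feature as a `tf.train.Example`. Below is a minimal TF 1.x sketch of that serialization step with toy id lists and a hypothetical output path; the real code builds the int lists from the `InputFeatures` returned by `convert_single_example`.

~~~python
# Minimal TF 1.x sketch of writing one example to a TFRecord file.
# The id lists and the output path are placeholders for illustration.
import collections
import tensorflow as tf

def create_int_feature(values):
  return tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))

writer = tf.python_io.TFRecordWriter("/tmp/train.tf_record")  # hypothetical path

features = collections.OrderedDict()
features["input_ids"] = create_int_feature([101, 2003, 2023, 102, 0, 0])  # toy ids, zero-padded
features["input_mask"] = create_int_feature([1, 1, 1, 1, 0, 0])
features["segment_ids"] = create_int_feature([0, 0, 0, 0, 0, 0])
features["label_ids"] = create_int_feature([1])

tf_example = tf.train.Example(features=tf.train.Features(feature=features))
writer.write(tf_example.SerializeToString())
writer.close()
~~~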