Add. Data preprocessing module

@@ -57,3 +57,68 @@ class MrpcProcessor(DataProcessor):
num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)  # keep the learning rate small at the start; after the warmup proportion of steps it returns to the original learning rate
~~~
#### Data preprocessing module
~~~python
# Continues from the previous step
file_based_convert_examples_to_features(
    train_examples, label_list, FLAGS.max_seq_length, tokenizer, train_file)
# Ctrl+click file_based_convert_examples_to_features to jump to its definition

def file_based_convert_examples_to_features(
    examples, label_list, max_seq_length, tokenizer, output_file):
  """Convert a set of `InputExample`s to a TFRecord file."""

  writer = tf.python_io.TFRecordWriter(output_file)  # BERT reads its training data as TFRecord, so the examples are written out to a TFRecord file

  for (ex_index, example) in enumerate(examples):  # loop over the examples and convert them one by one
    if ex_index % 10000 == 0:
      tf.logging.info("Writing example %d of %d" % (ex_index, len(examples)))

    feature = convert_single_example(ex_index, example, label_list,
                                     max_seq_length, tokenizer)  # Ctrl+click convert_single_example to jump to its definition
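
    # A sketch of how the loop body then serializes each feature into the
    # TFRecord file (condensed from the rest of file_based_convert_examples_to_features;
    # create_int_feature wraps a list of ints in a tf.train.Feature):
    features = collections.OrderedDict()
    features["input_ids"] = create_int_feature(feature.input_ids)
    features["input_mask"] = create_int_feature(feature.input_mask)
    features["segment_ids"] = create_int_feature(feature.segment_ids)
    features["label_ids"] = create_int_feature([feature.label_id])
    tf_example = tf.train.Example(features=tf.train.Features(feature=features))
    writer.write(tf_example.SerializeToString())  # one serialized tf.train.Example per input example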

def convert_single_example(ex_index, example, label_list, max_seq_length,
                           tokenizer):
  """Converts a single `InputExample` into a single `InputFeatures`."""

  if isinstance(example, PaddingInputExample):
    return InputFeatures(
        input_ids=[0] * max_seq_length,
        input_mask=[0] * max_seq_length,
        segment_ids=[0] * max_seq_length,
        label_id=0,
        is_real_example=False)

  label_map = {}  # build the mapping from label string to label id
  for (i, label) in enumerate(label_list):
    label_map[label] = i
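  # For MRPC the label list is ["0", "1"], so label_map becomes {"0": 0, "1": 1};
  # later in this function the example's label string is looked up in this map.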

  tokens_a = tokenizer.tokenize(example.text_a)  # tokenize the first sentence (Ctrl+click tokenize to jump to its definition)
  tokens_b = None
  if example.text_b:  # tokenize the second sentence, if the task provides one
    tokens_b = tokenizer.tokenize(example.text_b)

  if tokens_b:
    # Modifies `tokens_a` and `tokens_b` in place so that the total
    # length is less than the specified length.
    # Account for [CLS], [SEP], [SEP] with "- 3"  # reserve 3 positions for the special tokens
    _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)  # truncate the pair if it is too long (sketched below)
  else:  # without a second sentence only 2 positions are reserved
    # Account for [CLS] and [SEP] with "- 2"
    if len(tokens_a) > max_seq_length - 2:
      tokens_a = tokens_a[0:(max_seq_length - 2)]
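
# A sketch of `_truncate_seq_pair`, mirroring the helper of the same name in
# run_classifier.py: keep popping tokens off the end of the longer sequence
# until the pair fits within max_length.
def _truncate_seq_pair(tokens_a, tokens_b, max_length):
  """Truncates a sequence pair in place to the maximum total length."""
  while True:
    total_length = len(tokens_a) + len(tokens_b)
    if total_length <= max_length:
      break
    if len(tokens_a) > len(tokens_b):
      tokens_a.pop()  # drop from the end of the longer sequence
    else:
      tokens_b.pop()
# ... back in convert_single_example: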

  # The convention in BERT is:
  # (a) For sequence pairs:  the pair starts with [CLS], each sentence ends with
  #     [SEP], and type_ids marks tokens of the first sentence with 0 and tokens
  #     of the second sentence with 1
  #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
  #  type_ids: 0     0  0    0    0     0       0 0     1  1  1  1   1 1
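
  # A condensed sketch of how the rest of convert_single_example builds the
  # model inputs following the convention above (the compaction and comments
  # are ours; the original uses explicit append loops):
  tokens = ["[CLS]"] + tokens_a + ["[SEP]"]             # first sentence framed by [CLS] ... [SEP]
  segment_ids = [0] * len(tokens)                       # type_id 0 for the first sentence
  if tokens_b:
    tokens += tokens_b + ["[SEP]"]                      # second sentence ends with its own [SEP]
    segment_ids += [1] * (len(tokens_b) + 1)            # type_id 1 for the second sentence
  input_ids = tokenizer.convert_tokens_to_ids(tokens)   # map word pieces to vocabulary ids
  input_mask = [1] * len(input_ids)                     # 1 = real token, 0 = padding
  while len(input_ids) < max_seq_length:                # zero-pad up to max_seq_length
    input_ids.append(0)
    input_mask.append(0)
    segment_ids.append(0)
  label_id = label_map[example.label]                   # look up the label id built earlier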

# Ctrl+click tokenize jumps to FullTokenizer.tokenize in tokenization.py
def tokenize(self, text):
  split_tokens = []
  for token in self.basic_tokenizer.tokenize(text):  # basic tokenization first: whitespace, punctuation, optional lower-casing
    for sub_token in self.wordpiece_tokenizer.tokenize(token):  # then WordPiece splits each token into sub-pieces so richer meanings can be expressed
      split_tokens.append(sub_token)
  return split_tokens
~~~
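
For intuition, here is a minimal usage sketch of the WordPiece behaviour shown above. The vocabulary path is an illustrative assumption (use the `vocab.txt` that ships with your pretrained checkpoint), and the exact sub-pieces depend on that vocabulary.

~~~python
import tokenization  # BERT's tokenization.py

# Assumed location of the pretrained model's vocabulary file.
tokenizer = tokenization.FullTokenizer(
    vocab_file="uncased_L-12_H-768_A-12/vocab.txt", do_lower_case=True)

print(tokenizer.tokenize("is this jacksonville?"))
# With the standard uncased vocabulary this prints something like:
# ['is', 'this', 'jack', '##son', '##ville', '?']
~~~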
