    return split_tokens
~~~
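For intuition about what `split_tokens` contains: the FullTokenizer splits out-of-vocabulary words into WordPiece subwords. A minimal sketch, assuming `tokenization.py` from the official BERT repo is importable and a vocab file sits at `./vocab.txt` (both paths are assumptions for illustration):

~~~python
import tokenization  # tokenization.py from the official BERT repo

# The vocab path is an assumption; point it at a real BERT vocab file.
tokenizer = tokenization.FullTokenizer(vocab_file="./vocab.txt", do_lower_case=True)

# Per the WordpieceTokenizer docstring, an out-of-vocab word is split
# into subword pieces, e.g. "unaffable" -> ["un", "##aff", "##able"].
print(tokenizer.tokenize("unaffable"))
~~~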
#### Creating the TFRecord
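Each example is converted into three fixed-length arrays of length `max_seq_length` (`input_ids`, `input_mask`, `segment_ids`) plus a `label_id`; these are then serialized into a TFRecord file that the model reads during training.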
~~~python
# Continuing inside the convert_single_example function from above.

# Build two lists to hold the tokens and their segment IDs.
tokens = []
segment_ids = []

tokens.append("[CLS]")  # The first token is always [CLS],
segment_ids.append(0)   # so the first segment ID is always 0.
for token in tokens_a:
    tokens.append(token)
    segment_ids.append(0)  # Every token of sentence A gets segment ID 0.
tokens.append("[SEP]")  # After sentence A, append the [SEP] separator,
segment_ids.append(0)   # which also gets segment ID 0.

if tokens_b:
    for token in tokens_b:
        tokens.append(token)
        segment_ids.append(1)  # Sentence B works the same way, except its segment ID is 1.
    tokens.append("[SEP]")
    segment_ids.append(1)

input_ids = tokenizer.convert_tokens_to_ids(tokens)  # Map each token to its index in the vocab file.

# The mask has 1 for real tokens and 0 for padding tokens. Only real
# tokens are attended to.
input_mask = [1] * len(input_ids)

# Zero-pad up to the sequence length so every input has the same length:
# truncate what is too long, pad what is too short.
while len(input_ids) < max_seq_length:  # Anything shorter than max_seq_length is padded with 0s.
    input_ids.append(0)
    input_mask.append(0)
    segment_ids.append(0)

assert len(input_ids) == max_seq_length
assert len(input_mask) == max_seq_length
assert len(segment_ids) == max_seq_length

label_id = label_map[example.label]
if ex_index < 5:
    tf.logging.info("*** Example ***")  # Log the first few examples; at this point preprocessing is essentially done.
    ...

return feature
~~~
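To make those arrays concrete, here is a tiny self-contained rehearsal of the same construction with made-up tokens and an assumed `max_seq_length` of 12 (pure Python; `input_ids` is skipped because it needs a real vocab file):

~~~python
# Self-contained rehearsal of the token/segment/mask construction above.
# The token lists and max_seq_length are made up for illustration.
max_seq_length = 12
tokens_a = ["how", "are", "you"]
tokens_b = ["i", "am", "fine"]

tokens = ["[CLS]"] + tokens_a + ["[SEP]"] + tokens_b + ["[SEP]"]
segment_ids = [0] * (len(tokens_a) + 2) + [1] * (len(tokens_b) + 1)

input_mask = [1] * len(tokens)          # 1 for every real token
padding = max_seq_length - len(tokens)  # zero-pad the rest
input_mask += [0] * padding
segment_ids += [0] * padding

print(tokens)       # ['[CLS]', 'how', 'are', 'you', '[SEP]', 'i', 'am', 'fine', '[SEP]']
print(segment_ids)  # [0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0]
print(input_mask)   # [1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]
~~~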
Now return to where convert_single_example was called from:
~~~python
for (ex_index, example) in enumerate(examples):  # Iterate over every example.
    if ex_index % 10000 == 0:
        tf.logging.info("Writing example %d of %d" % (ex_index, len(examples)))

    feature = convert_single_example(ex_index, example, label_list,
                                     max_seq_length, tokenizer)  # Ctrl-click convert_single_example to jump to it.

    def create_int_feature(values):
        f = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))
        return f

    features = collections.OrderedDict()  # Now reshape the feature into the format the model needs.
    features["input_ids"] = create_int_feature(feature.input_ids)
    features["input_mask"] = create_int_feature(feature.input_mask)
    features["segment_ids"] = create_int_feature(feature.segment_ids)
    features["label_ids"] = create_int_feature([feature.label_id])
    features["is_real_example"] = create_int_feature(
        [int(feature.is_real_example)])

    tf_example = tf.train.Example(features=tf.train.Features(feature=features))  # Finally, convert to TF's Example format
    writer.write(tf_example.SerializeToString())                                 # and serialize it into the TFRecord file.
writer.close()
~~~
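Once the TFRecord file is written, it can be read back with tf.data. Below is a minimal sketch in the same TF1 style, mirroring what `file_based_input_fn_builder` in run_classifier.py does; the file name `train.tf_record` and the `max_seq_length` value are assumptions:

~~~python
import tensorflow as tf

max_seq_length = 128  # must match the value used when the file was written

# Schema for parsing one serialized tf.train.Example back into tensors.
name_to_features = {
    "input_ids": tf.FixedLenFeature([max_seq_length], tf.int64),
    "input_mask": tf.FixedLenFeature([max_seq_length], tf.int64),
    "segment_ids": tf.FixedLenFeature([max_seq_length], tf.int64),
    "label_ids": tf.FixedLenFeature([], tf.int64),
    "is_real_example": tf.FixedLenFeature([], tf.int64),
}

dataset = tf.data.TFRecordDataset("train.tf_record")  # assumed output path
dataset = dataset.map(lambda record: tf.parse_single_example(record, name_to_features))
~~~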