Add. Making tfrecord data

pull/2/head
benjas 5 years ago
parent 3cad39897e
commit 7cc22cd2da

@ -122,3 +122,79 @@ def convert_single_example(ex_index, example, label_list, max_seq_length,
return split_tokens
~~~
#### tfrecord制作
~~~~python
# Continuation of the convert_single_example function shown above.
# Start assembling the model input: two parallel lists hold the tokens and
# the segment id of each token.
tokens = []
segment_ids = []
tokens.append("[CLS]") # the first token is always [CLS]
segment_ids.append(0) # and its segment id is 0
for token in tokens_a:
tokens.append(token)
segment_ids.append(0) # every token of the first sentence (tokens_a) gets segment id 0
tokens.append("[SEP]") # after the loop, append the [SEP] separator that marks the end of sentence A
segment_ids.append(0) # the [SEP] just added gets a matching segment id of 0
# tokens_b is only processed when it is truthy (i.e. a second sentence exists).
if tokens_b:
for token in tokens_b:
tokens.append(token)
segment_ids.append(1) # sentence B is handled like A; the only difference is that its segment id is 1
tokens.append("[SEP]")
segment_ids.append(1)
input_ids = tokenizer.convert_tokens_to_ids(tokens) # map each token to its id, i.e. its index in the vocab
# The mask has 1 for real tokens and 0 for padding tokens. Only real
# tokens are attended to.
input_mask = [1] * len(input_ids)
# Zero-pad up to the sequence length so that all three lists end up with
# exactly max_seq_length entries (the asserts below enforce this).
while len(input_ids) < max_seq_length: # pad with 0
input_ids.append(0)
input_mask.append(0)
segment_ids.append(0)
assert len(input_ids) == max_seq_length
assert len(input_mask) == max_seq_length
assert len(segment_ids) == max_seq_length
# label_map is built outside this excerpt; it is indexed by the example's label.
label_id = label_map[example.label]
# Only the examples with ex_index below 5 are logged.
if ex_index < 5:
tf.logging.info("*** Example ***") # log the result; at this point the preprocessing is essentially done
# The remainder is elided in these notes; `feature` is presumably built in the omitted part.
...
return feature
~~~~
返回到原先调用convert_single_example的位置（下面的循环对每条样本调用它，并把结果写入tfrecord）
~~~python
# Convert every example to a feature, wrap it in a tf.train.Example and write
# the serialized record through `writer` (created outside this excerpt).
for (ex_index, example) in enumerate(examples): # iterate over the examples, processing them one by one
# Progress log every 10000 examples.
if ex_index % 10000 == 0:
tf.logging.info("Writing example %d of %d" % (ex_index, len(examples)))
feature = convert_single_example(ex_index, example, label_list,
max_seq_length, tokenizer) # Ctrl+click convert_single_example to jump to its definition
# Helper: wrap an iterable of ints in a tf.train.Feature holding an Int64List.
def create_int_feature(values):
f = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))
return f
features = collections.OrderedDict() # below, the data is converted into the format the model expects
features["input_ids"] = create_int_feature(feature.input_ids)
features["input_mask"] = create_int_feature(feature.input_mask)
features["segment_ids"] = create_int_feature(feature.segment_ids)
# label_id is a single value, so it is wrapped in a one-element list.
features["label_ids"] = create_int_feature([feature.label_id])
# is_real_example is cast to int before being stored as an int64 feature.
features["is_real_example"] = create_int_feature(
[int(feature.is_real_example)])
tf_example = tf.train.Example(features=tf.train.Features(feature=features)) # finally, convert to TensorFlow's Example format
writer.write(tf_example.SerializeToString())
writer.close()
~~~

Loading…
Cancel
Save