From 7cc22cd2da9fa850833b799bc1cb4e2ef40a9ef9 Mon Sep 17 00:00:00 2001
From: benjas <909336740@qq.com>
Date: Thu, 7 Jan 2021 17:11:35 +0800
Subject: [PATCH] Add. Making tfrecord data

---
 .../源码解读.md | 76 +++++++++++++++++++
 1 file changed, 76 insertions(+)

diff --git a/NLP通用框架BERT项目实战/第二章——BERT源码解读与应用实例/源码解读.md b/NLP通用框架BERT项目实战/第二章——BERT源码解读与应用实例/源码解读.md
index ece695b..1807f34 100644
--- a/NLP通用框架BERT项目实战/第二章——BERT源码解读与应用实例/源码解读.md
+++ b/NLP通用框架BERT项目实战/第二章——BERT源码解读与应用实例/源码解读.md
@@ -122,3 +122,79 @@ def convert_single_example(ex_index, example, label_list, max_seq_length,
     return split_tokens
 ~~~


#### Making the TFRecord data

~~~~python
# Continuing the convert_single_example function from above
  # Start building: create two lists to collect the results
  tokens = []
  segment_ids = []
  tokens.append("[CLS]")  # the first token is always [CLS]
  segment_ids.append(0)   # so the first segment id must also be 0
  for token in tokens_a:
    tokens.append(token)
    segment_ids.append(0)  # every token of sentence a (the first sentence) gets 0
  tokens.append("[SEP]")   # after the loop, append a [SEP] separator / break point
  segment_ids.append(0)    # tokens got a [SEP], so ids get the matching 0

  if tokens_b:
    for token in tokens_b:
      tokens.append(token)
      segment_ids.append(1)  # same as sentence a, except the id appended is 1
    tokens.append("[SEP]")
    segment_ids.append(1)

  input_ids = tokenizer.convert_tokens_to_ids(tokens)  # map tokens to IDs, i.e. indices into the vocab

  # The mask has 1 for real tokens and 0 for padding tokens. Only real
  # tokens are attended to.
  input_mask = [1] * len(input_ids)

  # Zero-pad up to the sequence length, so every input has the same length:
  # longer inputs are truncated, shorter ones padded
  while len(input_ids) < max_seq_length:  # padding depends on the configured max length; anything shorter is filled with 0
    input_ids.append(0)
    input_mask.append(0)
    segment_ids.append(0)

  assert len(input_ids) == max_seq_length
  assert len(input_mask) == max_seq_length
  assert len(segment_ids) == max_seq_length

  label_id = label_map[example.label]
  if ex_index < 5:
    tf.logging.info("*** Example ***")  # log a few examples; at this point the preprocessing is largely complete
  ...
  return feature
~~~~



Returning to the function that originally called convert_single_example, which loops over every example and writes the serialized features out:

~~~python
  for (ex_index, example) in enumerate(examples):  # loop over and process every example
    if ex_index % 10000 == 0:
      tf.logging.info("Writing example %d of %d" % (ex_index, len(examples)))

    feature = convert_single_example(ex_index, example, label_list,
                                     max_seq_length, tokenizer)  # Ctrl+click convert_xxx to jump to it

    def create_int_feature(values):
      f = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))
      return f

    features = collections.OrderedDict()  # below, convert the feature into the format the model needs
    features["input_ids"] = create_int_feature(feature.input_ids)
    features["input_mask"] = create_int_feature(feature.input_mask)
    features["segment_ids"] = create_int_feature(feature.segment_ids)
    features["label_ids"] = create_int_feature([feature.label_id])
    features["is_real_example"] = create_int_feature(
        [int(feature.is_real_example)])

    tf_example = tf.train.Example(features=tf.train.Features(feature=features))  # finally, wrap it in TF's Example format
    writer.write(tf_example.SerializeToString())
  writer.close()  # close the TFRecord writer once all examples are written
~~~
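
To make the feature layout above concrete, here is a tiny self-contained sketch of what one padded example looks like. The sentence pair, max_seq_length=10, and the vocab indices are made-up illustration values, not real entries from vocab.txt:

~~~python
# Hypothetical inputs: tokens_a = ["how", "are", "you"], tokens_b = ["fine"]
max_seq_length = 10

tokens      = ["[CLS]", "how", "are", "you", "[SEP]", "fine", "[SEP]"]
segment_ids = [0,       0,     0,     0,     0,       1,      1]

# convert_tokens_to_ids would look each token up in the vocab;
# the indices below are invented for this sketch
input_ids  = [101, 2129, 2024, 2017, 102, 2986, 102]
input_mask = [1] * len(input_ids)  # 1 marks a real token

# Zero-pad all three lists up to max_seq_length
while len(input_ids) < max_seq_length:
  input_ids.append(0)
  input_mask.append(0)
  segment_ids.append(0)

print(input_ids)    # [101, 2129, 2024, 2017, 102, 2986, 102, 0, 0, 0]
print(input_mask)   # [1, 1, 1, 1, 1, 1, 1, 0, 0, 0]
print(segment_ids)  # [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
~~~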
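
Once writer.close() has run, the serialized examples can be parsed back with a schema that mirrors the OrderedDict above. A minimal sketch in the TF 1.x API, assuming max_seq_length=128 and an output file named train.tf_record (both values depend on your run configuration):

~~~python
import tensorflow as tf

max_seq_length = 128  # assumption: must match the value used when writing

# Schema mirroring the five features written above
name_to_features = {
    "input_ids": tf.FixedLenFeature([max_seq_length], tf.int64),
    "input_mask": tf.FixedLenFeature([max_seq_length], tf.int64),
    "segment_ids": tf.FixedLenFeature([max_seq_length], tf.int64),
    "label_ids": tf.FixedLenFeature([], tf.int64),
    "is_real_example": tf.FixedLenFeature([], tf.int64),
}

def _decode_record(record):
  # Parse one serialized tf.train.Example back into a dict of tensors
  return tf.parse_single_example(record, name_to_features)

# "train.tf_record" is an assumed file name for this sketch
dataset = tf.data.TFRecordDataset("train.tf_record")
dataset = dataset.map(_decode_record).batch(32)
~~~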