    return split_tokens
~~~
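For intuition about what `split_tokens` contains: the FullTokenizer splits out-of-vocabulary words into WordPiece subwords. A minimal sketch, assuming `tokenization.py` from the official BERT repo is importable and a vocab file sits at `./vocab.txt` (both paths are assumptions for illustration):

~~~python
import tokenization  # tokenization.py from the official BERT repo

# The vocab path is an assumption; point it at a real BERT vocab file.
tokenizer = tokenization.FullTokenizer(vocab_file="./vocab.txt", do_lower_case=True)

# Per the WordpieceTokenizer docstring, an out-of-vocab word is split
# into subword pieces, e.g. "unaffable" -> ["un", "##aff", "##able"].
print(tokenizer.tokenize("unaffable"))
~~~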
#### Creating the TFRecord
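Each example is converted into three fixed-length arrays of length `max_seq_length` (`input_ids`, `input_mask`, `segment_ids`) plus a `label_id`; these are then serialized into a TFRecord file that the model reads during training.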
~~~python
# Continuing inside the convert_single_example function from above.

# Build two lists to hold the tokens and their segment IDs.
tokens = []
segment_ids = []

tokens.append("[CLS]")  # The first token is always [CLS],
segment_ids.append(0)   # so the first segment ID is always 0.
for token in tokens_a:
    tokens.append(token)
    segment_ids.append(0)  # Every token of sentence A gets segment ID 0.
tokens.append("[SEP]")  # After sentence A, append the [SEP] separator,
segment_ids.append(0)   # which also gets segment ID 0.

if tokens_b:
    for token in tokens_b:
        tokens.append(token)
        segment_ids.append(1)  # Sentence B works the same way, except its segment ID is 1.
    tokens.append("[SEP]")
    segment_ids.append(1)

input_ids = tokenizer.convert_tokens_to_ids(tokens)  # Map each token to its index in the vocab file.

# The mask has 1 for real tokens and 0 for padding tokens. Only real
# tokens are attended to.
input_mask = [1] * len(input_ids)

# Zero-pad up to the sequence length so every input has the same length:
# truncate what is too long, pad what is too short.
while len(input_ids) < max_seq_length:  # Anything shorter than max_seq_length is padded with 0s.
    input_ids.append(0)
    input_mask.append(0)
    segment_ids.append(0)

assert len(input_ids) == max_seq_length
assert len(input_mask) == max_seq_length
assert len(segment_ids) == max_seq_length

label_id = label_map[example.label]
if ex_index < 5:
    tf.logging.info("*** Example ***")  # Log the first few examples; at this point preprocessing is essentially done.
    ...

return feature
~~~
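To make those arrays concrete, here is a tiny self-contained rehearsal of the same construction with made-up tokens and an assumed `max_seq_length` of 12 (pure Python; `input_ids` is skipped because it needs a real vocab file):

~~~python
# Self-contained rehearsal of the token/segment/mask construction above.
# The token lists and max_seq_length are made up for illustration.
max_seq_length = 12
tokens_a = ["how", "are", "you"]
tokens_b = ["i", "am", "fine"]

tokens = ["[CLS]"] + tokens_a + ["[SEP]"] + tokens_b + ["[SEP]"]
segment_ids = [0] * (len(tokens_a) + 2) + [1] * (len(tokens_b) + 1)

input_mask = [1] * len(tokens)          # 1 for every real token
padding = max_seq_length - len(tokens)  # zero-pad the rest
input_mask += [0] * padding
segment_ids += [0] * padding

print(tokens)       # ['[CLS]', 'how', 'are', 'you', '[SEP]', 'i', 'am', 'fine', '[SEP]']
print(segment_ids)  # [0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0]
print(input_mask)   # [1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]
~~~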
Now return to where convert_single_example was called from:
~~~python
for (ex_index, example) in enumerate(examples):  # Iterate over every example.
    if ex_index % 10000 == 0:
        tf.logging.info("Writing example %d of %d" % (ex_index, len(examples)))

    feature = convert_single_example(ex_index, example, label_list,
                                     max_seq_length, tokenizer)  # Ctrl-click convert_single_example to jump to it.

    def create_int_feature(values):
        f = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))
        return f

    features = collections.OrderedDict()  # Now reshape the feature into the format the model needs.
    features["input_ids"] = create_int_feature(feature.input_ids)
    features["input_mask"] = create_int_feature(feature.input_mask)
    features["segment_ids"] = create_int_feature(feature.segment_ids)
    features["label_ids"] = create_int_feature([feature.label_id])
    features["is_real_example"] = create_int_feature(
        [int(feature.is_real_example)])

    tf_example = tf.train.Example(features=tf.train.Features(feature=features))  # Finally, convert to TF's Example format
    writer.write(tf_example.SerializeToString())                                 # and serialize it into the TFRecord file.
writer.close()
~~~
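Once the TFRecord file is written, it can be read back with tf.data. Below is a minimal sketch in the same TF1 style, mirroring what `file_based_input_fn_builder` in run_classifier.py does; the file name `train.tf_record` and the `max_seq_length` value are assumptions:

~~~python
import tensorflow as tf

max_seq_length = 128  # must match the value used when the file was written

# Schema for parsing one serialized tf.train.Example back into tensors.
name_to_features = {
    "input_ids": tf.FixedLenFeature([max_seq_length], tf.int64),
    "input_mask": tf.FixedLenFeature([max_seq_length], tf.int64),
    "segment_ids": tf.FixedLenFeature([max_seq_length], tf.int64),
    "label_ids": tf.FixedLenFeature([], tf.int64),
    "is_real_example": tf.FixedLenFeature([], tf.int64),
}

dataset = tf.data.TFRecordDataset("train.tf_record")  # assumed output path
dataset = dataset.map(lambda record: tf.parse_single_example(record, name_to_features))
~~~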