Add. Data preprocessing module

@@ -57,3 +57,68 @@ class MrpcProcessor(DataProcessor):
num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)  # keep the learning rate small at the start; after the warmup proportion of steps it returns to the original learning rate
~~~
#### Data preprocessing module
~~~python
# Continues from the previous step
file_based_convert_examples_to_features(
    train_examples, label_list, FLAGS.max_seq_length, tokenizer, train_file)
# Ctrl+click file_based_convert_examples_to_features to jump to its definition

def file_based_convert_examples_to_features(
    examples, label_list, max_seq_length, tokenizer, output_file):
  """Convert a set of `InputExample`s to a TFRecord file."""

  writer = tf.python_io.TFRecordWriter(output_file)  # BERT reads its training data as TFRecord, so the examples are written out to a TFRecord file

  for (ex_index, example) in enumerate(examples):  # loop over the examples and convert them one by one
    if ex_index % 10000 == 0:
      tf.logging.info("Writing example %d of %d" % (ex_index, len(examples)))

    feature = convert_single_example(ex_index, example, label_list,
                                     max_seq_length, tokenizer)  # Ctrl+click convert_single_example to jump to its definition
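
    # A sketch of how the loop body then serializes each feature into the
    # TFRecord file (condensed from the rest of file_based_convert_examples_to_features;
    # create_int_feature wraps a list of ints in a tf.train.Feature):
    features = collections.OrderedDict()
    features["input_ids"] = create_int_feature(feature.input_ids)
    features["input_mask"] = create_int_feature(feature.input_mask)
    features["segment_ids"] = create_int_feature(feature.segment_ids)
    features["label_ids"] = create_int_feature([feature.label_id])
    tf_example = tf.train.Example(features=tf.train.Features(feature=features))
    writer.write(tf_example.SerializeToString())  # one serialized tf.train.Example per input example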

def convert_single_example(ex_index, example, label_list, max_seq_length,
                           tokenizer):
  """Converts a single `InputExample` into a single `InputFeatures`."""

  if isinstance(example, PaddingInputExample):
    return InputFeatures(
        input_ids=[0] * max_seq_length,
        input_mask=[0] * max_seq_length,
        segment_ids=[0] * max_seq_length,
        label_id=0,
        is_real_example=False)

  label_map = {}  # build the mapping from label string to label id
  for (i, label) in enumerate(label_list):
    label_map[label] = i
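  # For MRPC the label list is ["0", "1"], so label_map becomes {"0": 0, "1": 1};
  # later in this function the example's label string is looked up in this map.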

  tokens_a = tokenizer.tokenize(example.text_a)  # tokenize the first sentence (Ctrl+click tokenize to jump to its definition)
  tokens_b = None
  if example.text_b:  # tokenize the second sentence, if the task provides one
    tokens_b = tokenizer.tokenize(example.text_b)

  if tokens_b:
    # Modifies `tokens_a` and `tokens_b` in place so that the total
    # length is less than the specified length.
    # Account for [CLS], [SEP], [SEP] with "- 3"  # reserve 3 positions for the special tokens
    _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)  # truncate the pair if it is too long (sketched below)
  else:  # without a second sentence only 2 positions are reserved
    # Account for [CLS] and [SEP] with "- 2"
    if len(tokens_a) > max_seq_length - 2:
      tokens_a = tokens_a[0:(max_seq_length - 2)]
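
# A sketch of `_truncate_seq_pair`, mirroring the helper of the same name in
# run_classifier.py: keep popping tokens off the end of the longer sequence
# until the pair fits within max_length.
def _truncate_seq_pair(tokens_a, tokens_b, max_length):
  """Truncates a sequence pair in place to the maximum total length."""
  while True:
    total_length = len(tokens_a) + len(tokens_b)
    if total_length <= max_length:
      break
    if len(tokens_a) > len(tokens_b):
      tokens_a.pop()  # drop from the end of the longer sequence
    else:
      tokens_b.pop()
# ... back in convert_single_example: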

  # The convention in BERT is:
  # (a) For sequence pairs:  the pair starts with [CLS], each sentence ends with
  #     [SEP], and type_ids marks tokens of the first sentence with 0 and tokens
  #     of the second sentence with 1
  #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
  #  type_ids: 0     0  0    0    0     0       0 0     1  1  1  1   1 1
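
  # A condensed sketch of how the rest of convert_single_example builds the
  # model inputs following the convention above (the compaction and comments
  # are ours; the original uses explicit append loops):
  tokens = ["[CLS]"] + tokens_a + ["[SEP]"]             # first sentence framed by [CLS] ... [SEP]
  segment_ids = [0] * len(tokens)                       # type_id 0 for the first sentence
  if tokens_b:
    tokens += tokens_b + ["[SEP]"]                      # second sentence ends with its own [SEP]
    segment_ids += [1] * (len(tokens_b) + 1)            # type_id 1 for the second sentence
  input_ids = tokenizer.convert_tokens_to_ids(tokens)   # map word pieces to vocabulary ids
  input_mask = [1] * len(input_ids)                     # 1 = real token, 0 = padding
  while len(input_ids) < max_seq_length:                # zero-pad up to max_seq_length
    input_ids.append(0)
    input_mask.append(0)
    segment_ids.append(0)
  label_id = label_map[example.label]                   # look up the label id built earlier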

# Ctrl+click tokenize jumps to FullTokenizer.tokenize in tokenization.py
def tokenize(self, text):
  split_tokens = []
  for token in self.basic_tokenizer.tokenize(text):  # basic tokenization first: whitespace, punctuation, optional lower-casing
    for sub_token in self.wordpiece_tokenizer.tokenize(token):  # then WordPiece splits each token into sub-pieces so richer meanings can be expressed
      split_tokens.append(sub_token)
  return split_tokens
~~~
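
For intuition, here is a minimal usage sketch of the WordPiece behaviour shown above. The vocabulary path is an illustrative assumption (use the `vocab.txt` that ships with your pretrained checkpoint), and the exact sub-pieces depend on that vocabulary.

~~~python
import tokenization  # BERT's tokenization.py

# Assumed location of the pretrained model's vocabulary file.
tokenizer = tokenization.FullTokenizer(
    vocab_file="uncased_L-12_H-768_A-12/vocab.txt", do_lower_case=True)

print(tokenizer.tokenize("is this jacksonville?"))
# With the standard uncased vocabulary this prints something like:
# ['is', 'this', 'jack', '##son', '##ville', '?']
~~~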
