From 3cad39897ee9810820fa0a66fc18f3fe12a2a132 Mon Sep 17 00:00:00 2001
From: benjas <909336740@qq.com>
Date: Thu, 7 Jan 2021 15:36:28 +0800
Subject: [PATCH] Add. Data preprocessing module

---
 .../源码解读.md | 65 +++++++++++++++++++
 1 file changed, 65 insertions(+)

diff --git a/NLP通用框架BERT项目实战/第二章——BERT源码解读与应用实例/源码解读.md b/NLP通用框架BERT项目实战/第二章——BERT源码解读与应用实例/源码解读.md
index 23946e7..ece695b 100644
--- a/NLP通用框架BERT项目实战/第二章——BERT源码解读与应用实例/源码解读.md
+++ b/NLP通用框架BERT项目实战/第二章——BERT源码解读与应用实例/源码解读.md
@@ -57,3 +57,68 @@ class MrpcProcessor(DataProcessor):
   num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)  # keep the learning rate small at the start; once the warmup fraction of steps has passed, return to the original learning rate
 ~~~
+
+
+#### Data preprocessing module
+
+~~~python
+# Continues from the previous snippet
+  file_based_convert_examples_to_features(
+      train_examples, label_list, FLAGS.max_seq_length, tokenizer, train_file)
+
+# Ctrl-click file_based_convert_examples_to_features to jump to its definition
+def file_based_convert_examples_to_features(
+    examples, label_list, max_seq_length, tokenizer, output_file):
+  """Convert a set of `InputExample`s to a TFRecord file."""
+
+  writer = tf.python_io.TFRecordWriter(output_file)  # TFRecord writer; BERT expects its training data as TFRecord files
+
+  for (ex_index, example) in enumerate(examples):
+    if ex_index % 10000 == 0:
+      tf.logging.info("Writing example %d of %d" % (ex_index, len(examples)))  # the for loop pulls out each example; log progress every 10000
+    feature = convert_single_example(ex_index, example, label_list,
+                                     max_seq_length, tokenizer)  # Ctrl-click convert_single_example to jump to its definition
+
+def convert_single_example(ex_index, example, label_list, max_seq_length,
+                           tokenizer):
+  """Converts a single `InputExample` into a single `InputFeatures`."""
+
+  if isinstance(example, PaddingInputExample):
+    return InputFeatures(
+        input_ids=[0] * max_seq_length,
+        input_mask=[0] * max_seq_length,
+        segment_ids=[0] * max_seq_length,
+        label_id=0,
+        is_real_example=False)
+
+  label_map = {}  # map each label string to an integer id, e.g. 0 and 1
+  for (i, label) in enumerate(label_list):
+    label_map[label] = i
+
+  tokens_a = tokenizer.tokenize(example.text_a)  # Ctrl-click tokenize; tokenizes the first sentence
+  tokens_b = None
+  if example.text_b:  # tokenize the second sentence, if there is one
+    tokens_b = tokenizer.tokenize(example.text_b)
+
+  if tokens_b:
+    # Modifies `tokens_a` and `tokens_b` in place so that the total
+    # length is less than the specified length.
+    # Account for [CLS], [SEP], [SEP] with "- 3"  # reserve room for the 3 special tokens
+    _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)  # truncate if the pair is too long
+  else:  # with no text_b, reserve room for just 2 special tokens
+    # Account for [CLS] and [SEP] with "- 2"
+    if len(tokens_a) > max_seq_length - 2:
+      tokens_a = tokens_a[0:(max_seq_length - 2)]
+
+  # The convention in BERT is:
+  # (a) For sequence pairs:  # the pair starts with [CLS] and each sentence ends with [SEP]; type_ids are 0 for the first sentence and 1 for the second
+  #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
+  #  type_ids: 0     0  0    0    0     0       0 0     1  1  1  1   1 1
+
+# Reached by Ctrl-clicking tokenize above: FullTokenizer.tokenize in tokenization.py
+  def tokenize(self, text):
+    split_tokens = []
+    for token in self.basic_tokenizer.tokenize(text):  # word slicing: each word is cut into sub-pieces so rare words still map to meaningful units
+      for sub_token in self.wordpiece_tokenizer.tokenize(token):
+        split_tokens.append(sub_token)
+
+    return split_tokens
+~~~
+
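+The last function jumps from `run_classifier.py` into `tokenization.py`:
+`basic_tokenizer` handles the whitespace and punctuation split, and
+`wordpiece_tokenizer` cuts each word into sub-word pieces by greedy
+longest-match-first lookup against the vocab. The sketch below only illustrates
+that matching idea for a single word; the vocabulary is made up, and the real
+`WordpieceTokenizer` additionally limits very long words and handles unknown
+pieces slightly differently.
+
+~~~python
+def toy_wordpiece(word, vocab, unk_token="[UNK]"):
+    """Greedy longest-match-first sub-word split over a toy vocabulary."""
+    pieces, start = [], 0
+    while start < len(word):
+        end = len(word)
+        cur_piece = None
+        # Try the longest possible substring first, shrinking until it is in the vocab.
+        while start < end:
+            piece = word[start:end]
+            if start > 0:
+                piece = "##" + piece   # continuation pieces carry the ## prefix
+            if piece in vocab:
+                cur_piece = piece
+                break
+            end -= 1
+        if cur_piece is None:
+            return [unk_token]         # no piece matched: the whole word is unknown
+        pieces.append(cur_piece)
+        start = end
+    return pieces
+
+
+vocab = {"jack", "##son", "##ville", "un", "##affable"}
+print(toy_wordpiece("jacksonville", vocab))  # ['jack', '##son', '##ville']
+print(toy_wordpiece("unaffable", vocab))     # ['un', '##affable']
+~~~
+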
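+Back in `convert_single_example`, the excerpt stops right after truncation. What the
+function goes on to build is the `[CLS]`/`[SEP]` layout, the `segment_ids` (the
+type_ids shown in the comment), an `input_mask`, and zero padding up to
+`max_seq_length`. The toy function below is a minimal sketch of that layout only:
+the whitespace split and the tiny `vocab` dict are invented stand-ins for
+`FullTokenizer` and the released vocab file, and `toy_convert_pair` is not part of
+the BERT code.
+
+~~~python
+def toy_convert_pair(text_a, text_b, vocab, max_seq_length=16):
+    tokens_a = text_a.lower().split()   # stand-in for tokenizer.tokenize()
+    tokens_b = text_b.lower().split()
+
+    # Trim the longer sequence first, keeping room for [CLS], [SEP], [SEP];
+    # the same idea as _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3).
+    while len(tokens_a) + len(tokens_b) > max_seq_length - 3:
+        longer = tokens_a if len(tokens_a) > len(tokens_b) else tokens_b
+        longer.pop()
+
+    tokens = ["[CLS]"] + tokens_a + ["[SEP]"] + tokens_b + ["[SEP]"]
+    segment_ids = [0] * (len(tokens_a) + 2) + [1] * (len(tokens_b) + 1)
+
+    input_ids = [vocab.get(t, vocab["[UNK]"]) for t in tokens]
+    input_mask = [1] * len(input_ids)    # 1 = real token, 0 = padding
+
+    # Zero-pad every list up to max_seq_length, as convert_single_example does.
+    while len(input_ids) < max_seq_length:
+        input_ids.append(0)
+        input_mask.append(0)
+        segment_ids.append(0)
+
+    return tokens, input_ids, input_mask, segment_ids
+
+
+vocab = {"[PAD]": 0, "[UNK]": 1, "[CLS]": 2, "[SEP]": 3,
+         "is": 4, "this": 5, "jackson": 6, "no": 7, "it": 8, "not": 9}
+tokens, input_ids, input_mask, segment_ids = toy_convert_pair(
+    "is this jackson", "no it is not", vocab)
+print(tokens)       # ['[CLS]', 'is', 'this', 'jackson', '[SEP]', 'no', 'it', 'is', 'not', '[SEP]']
+print(segment_ids)  # [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0]
+print(input_mask)   # [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0]
+~~~
+
+The id, mask and segment lists, together with `label_id`, are what `InputFeatures`
+holds, and `file_based_convert_examples_to_features` then serializes them into the
+TFRecord file through the writer shown above.
+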