From b100cfa87f6e49a275a4b78f668600b20c4abc4b Mon Sep 17 00:00:00 2001
From: benjas <909336740@qq.com>
Date: Thu, 7 Jan 2021 14:44:15 +0800
Subject: [PATCH] =?UTF-8?q?Create=20=E6=BA=90=E7=A0=81=E8=A7=A3=E8=AF=BB.m?=
 =?UTF-8?q?d?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../源码解读.md | 59 +++++++++++++++++++
 1 file changed, 59 insertions(+)
 create mode 100644 NLP通用框架BERT项目实战/第二章——BERT源码解读与应用实例/源码解读.md

diff --git a/NLP通用框架BERT项目实战/第二章——BERT源码解读与应用实例/源码解读.md b/NLP通用框架BERT项目实战/第二章——BERT源码解读与应用实例/源码解读.md
new file mode 100644
index 0000000..23946e7
--- /dev/null
+++ b/NLP通用框架BERT项目实战/第二章——BERT源码解读与应用实例/源码解读.md
@@ -0,0 +1,59 @@
+### Source Code Walkthrough
+
+#### Data loading module
+
+The class that processes the MRPC data set:
+
+~~~python
+class MrpcProcessor(DataProcessor):
+  """Processor for the MRPC data set (GLUE version)."""
+
+  def get_train_examples(self, data_dir):
+    """See base class."""
+    return self._create_examples(
+        self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
+
+  def get_dev_examples(self, data_dir):
+    """See base class."""
+    return self._create_examples(
+        self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
+
+  def get_test_examples(self, data_dir):
+    """See base class."""
+    return self._create_examples(
+        self._read_tsv(os.path.join(data_dir, "test.tsv")), "test")
+
+  def get_labels(self):
+    """See base class."""
+    return ["0", "1"]  # two labels only, i.e. binary classification
+
+  def _create_examples(self, lines, set_type):
+    """Creates examples for the training and dev sets."""
+    examples = []
+    for (i, line) in enumerate(lines):
+      if i == 0:
+        continue  # skip the TSV header row
+      guid = "%s-%s" % (set_type, i)
+      text_a = tokenization.convert_to_unicode(line[3])  # text_a and text_b come from columns 3 and 4
+      text_b = tokenization.convert_to_unicode(line[4])
+      if set_type == "test":
+        label = "0"  # test labels are unknown, so use a placeholder
+      else:
+        label = tokenization.convert_to_unicode(line[0])
+      examples.append(
+          InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
+    return examples
+~~~
+
+
+Code that reads the training data:
+
+~~~python
+  if FLAGS.do_train:
+    train_examples = processor.get_train_examples(FLAGS.data_dir)
+    num_train_steps = int(
+        len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs)  # total training steps: dataset size / train_batch_size * num_train_epochs
+    num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)  # keep the learning rate small at first, then return to the full learning rate after this proportion of steps
+~~~
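+Each TSV row carries the label in column 0 and the two sentences in columns 3 and 4, which is exactly what `_create_examples` above indexes into. Below is a minimal standalone sketch of that mapping; the sample row and the `namedtuple` stand-in for `InputExample` are illustrative only, not code from the repo:
+
+~~~python
+from collections import namedtuple
+
+# simplified stand-in for run_classifier.InputExample (same field names)
+InputExample = namedtuple("InputExample", ["guid", "text_a", "text_b", "label"])
+
+# one hypothetical parsed row from train.tsv:
+# column 0 = label, columns 1-2 = sentence IDs, columns 3-4 = the sentence pair
+line = ["1", "id_a", "id_b",
+        "He said the food was good.",
+        "The food was good, he said."]
+
+example = InputExample(guid="train-1",
+                       text_a=line[3],
+                       text_b=line[4],
+                       label=line[0])
+print(example)
+~~~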
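+To make the step arithmetic above concrete, here is a small sketch with plugged-in numbers, assuming the MRPC train split of 3,668 examples and commonly used settings of batch size 32, 3 epochs and warmup proportion 0.1 (your flag values may differ):
+
+~~~python
+train_example_count = 3668   # assumed len(train_examples) for MRPC train.tsv
+train_batch_size = 32        # assumed FLAGS.train_batch_size
+num_train_epochs = 3.0       # assumed FLAGS.num_train_epochs
+warmup_proportion = 0.1      # assumed FLAGS.warmup_proportion
+
+# total optimization steps = dataset size / batch size * epochs
+num_train_steps = int(train_example_count / train_batch_size * num_train_epochs)
+# the first 10% of those steps run with a reduced (warmup) learning rate
+num_warmup_steps = int(num_train_steps * warmup_proportion)
+
+print(num_train_steps, num_warmup_steps)  # 343 34
+~~~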