### Source Code Walkthrough

#### Data Reading Module

The class that processes the MRPC data:

~~~python
class MrpcProcessor(DataProcessor):
  """Processor for the MRPC data set (GLUE version)."""

  def get_train_examples(self, data_dir):
    """See base class."""
    return self._create_examples(
        self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")

  def get_dev_examples(self, data_dir):
    """See base class."""
    return self._create_examples(
        self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")

  def get_test_examples(self, data_dir):
    """See base class."""
    return self._create_examples(
        self._read_tsv(os.path.join(data_dir, "test.tsv")), "test")

  def get_labels(self):
    """See base class."""
    return ["0", "1"]  # binary classification, so there are only two labels

  def _create_examples(self, lines, set_type):
    """Creates examples for the training and dev sets."""
    examples = []
    for (i, line) in enumerate(lines):
      if i == 0:
        continue
      guid = "%s-%s" % (set_type, i)
      text_a = tokenization.convert_to_unicode(line[3])  # which TSV columns text_a and text_b are taken from
      text_b = tokenization.convert_to_unicode(line[4])
      if set_type == "test":
        label = "0"
      else:
        label = tokenization.convert_to_unicode(line[0])
      examples.append(
          InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
    return examples
~~~

Code that reads the training data:

~~~python
  if FLAGS.do_train:
    train_examples = processor.get_train_examples(FLAGS.data_dir)
    num_train_steps = int(
        len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs)  # total number of optimization steps: the dataset size len(train_examples) divided by train_batch_size, multiplied by the number of epochs
    num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)  # keep the learning rate small at the very start; after this warmup fraction of the steps it ramps back to the configured learning rate
~~~
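To make the step arithmetic concrete, here is a minimal sketch with assumed values (the dataset size is the GLUE MRPC training-set size; the batch size, epoch count, and warmup proportion are example flag settings, not values prescribed by this walkthrough):

~~~python
# Minimal sketch of the step/warmup arithmetic with assumed flag values.
train_examples_count = 3668   # size of the MRPC train split
train_batch_size = 32         # assumed --train_batch_size
num_train_epochs = 3.0        # assumed --num_train_epochs
warmup_proportion = 0.1       # assumed --warmup_proportion

num_train_steps = int(train_examples_count / train_batch_size * num_train_epochs)
num_warmup_steps = int(num_train_steps * warmup_proportion)

print(num_train_steps)   # 343 -> total optimizer updates over all epochs
print(num_warmup_steps)  # 34  -> steps spent ramping the learning rate up from near zero
~~~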
#### Data Preprocessing Module

~~~python
# Continuing from the training branch above
    file_based_convert_examples_to_features(
        train_examples, label_list, FLAGS.max_seq_length, tokenizer, train_file)

# Ctrl-click file_based_convert_examples_to_features to jump to its definition
def file_based_convert_examples_to_features(
    examples, label_list, max_seq_length, tokenizer, output_file):
  """Convert a set of `InputExample`s to a TFRecord file."""

  writer = tf.python_io.TFRecordWriter(output_file)  # TFRecord writer; BERT expects its input data in TFRecord form

  for (ex_index, example) in enumerate(examples):
    if ex_index % 10000 == 0:
      tf.logging.info("Writing example %d of %d" % (ex_index, len(examples)))  # loop over the examples and convert them one by one
    feature = convert_single_example(ex_index, example, label_list,
                                     max_seq_length, tokenizer)  # Ctrl-click convert_single_example to jump to it

def convert_single_example(ex_index, example, label_list, max_seq_length,
                           tokenizer):
  """Converts a single `InputExample` into a single `InputFeatures`."""

  if isinstance(example, PaddingInputExample):
    return InputFeatures(
        input_ids=[0] * max_seq_length,
        input_mask=[0] * max_seq_length,
        segment_ids=[0] * max_seq_length,
        label_id=0,
        is_real_example=False)

  label_map = {}  # build the label-to-id mapping, e.g. {"0": 0, "1": 1}
  for (i, label) in enumerate(label_list):
    label_map[label] = i

  tokens_a = tokenizer.tokenize(example.text_a)  # Ctrl-click tokenize; tokenize the first sentence
  tokens_b = None
  if example.text_b:  # tokenize the second sentence
    tokens_b = tokenizer.tokenize(example.text_b)

  if tokens_b:
    # Modifies `tokens_a` and `tokens_b` in place so that the total
    # length is less than the specified length.
    # Account for [CLS], [SEP], [SEP] with "- 3": reserve room for the three special tokens
    _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)  # truncate the pair if it is too long
  else:
    # Account for [CLS] and [SEP] with "- 2": only two special tokens when there is no text_b
    if len(tokens_a) > max_seq_length - 2:
      tokens_a = tokens_a[0:(max_seq_length - 2)]

  # The convention in BERT is:
  # (a) For sequence pairs: the sequence starts with [CLS], the sentences are
  #     separated and terminated by [SEP], and type_ids record which segment
  #     each token belongs to: 0 for the first sentence, 1 for the second.
  #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
  #  type_ids: 0     0  0    0    0     0       0 0     1  1  1  1   1 1

# From tokenization.py: FullTokenizer.tokenize, the method called above
  def tokenize(self, text):
    split_tokens = []
    for token in self.basic_tokenizer.tokenize(text):  # WordPiece splits a word into sub-word pieces so that rare words still get a meaningful representation
      for sub_token in self.wordpiece_tokenizer.tokenize(token):
        split_tokens.append(sub_token)

    return split_tokens
~~~

#### Building the TFRecord File

~~~~python
# Continuing the convert_single_example module from above
  # Build the final inputs; two lists collect the tokens and their segment ids
  tokens = []
  segment_ids = []
  tokens.append("[CLS]")   # the first token is always [CLS]
  segment_ids.append(0)    # so the first segment id is naturally 0
  for token in tokens_a:
    tokens.append(token)
    segment_ids.append(0)  # every token of sentence a gets segment id 0
  tokens.append("[SEP]")   # after sentence a, append the [SEP] separator
  segment_ids.append(0)    # and the matching 0 for it

  if tokens_b:
    for token in tokens_b:
      tokens.append(token)
      segment_ids.append(1)  # sentence b works the same way, except its segment id is 1
    tokens.append("[SEP]")
    segment_ids.append(1)

  input_ids = tokenizer.convert_tokens_to_ids(tokens)  # map each token to its id, i.e. its index in the vocab.txt vocabulary

  # The mask has 1 for real tokens and 0 for padding tokens. Only real
  # tokens are attended to.
  input_mask = [1] * len(input_ids)

  # Zero-pad up to the sequence length, so every input ends up the same length
  while len(input_ids) < max_seq_length:  # the amount of padding depends on the configured maximum length; anything shorter is filled with zeros
    input_ids.append(0)
    input_mask.append(0)
    segment_ids.append(0)

  assert len(input_ids) == max_seq_length
  assert len(input_mask) == max_seq_length
  assert len(segment_ids) == max_seq_length

  label_id = label_map[example.label]
  if ex_index < 5:
    tf.logging.info("*** Example ***")  # log the first few converted examples; at this point the preprocessing is essentially done
  ...
  return feature
~~~~
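The layout produced by convert_single_example can be reproduced with a few lines of plain Python. The following is a toy sketch (hand-made token lists and max_seq_length=12, not code from run_classifier.py) that shows how tokens, segment_ids, input_mask, and the zero padding line up:

~~~python
# Toy sketch of the feature layout for one sentence pair with max_seq_length = 12.
tokens_a = ["he", "likes", "nlp"]
tokens_b = ["so", "does", "she"]
max_seq_length = 12

tokens = ["[CLS]"] + tokens_a + ["[SEP]"] + tokens_b + ["[SEP]"]
segment_ids = [0] * (len(tokens_a) + 2) + [1] * (len(tokens_b) + 1)
input_mask = [1] * len(tokens)

# Zero-pad the id lists up to max_seq_length, mirroring the while-loop above
# (input_ids is skipped here because it would need the real vocab.txt).
pad = max_seq_length - len(tokens)
segment_ids += [0] * pad
input_mask += [0] * pad

print(tokens)       # ['[CLS]', 'he', 'likes', 'nlp', '[SEP]', 'so', 'does', 'she', '[SEP]']
print(segment_ids)  # [0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0]
print(input_mask)   # [1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]
~~~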
Back in file_based_convert_examples_to_features, each returned feature is packed into a tf.train.Example and written to the TFRecord file:

~~~python
  for (ex_index, example) in enumerate(examples):  # keep iterating over the examples
    if ex_index % 10000 == 0:
      tf.logging.info("Writing example %d of %d" % (ex_index, len(examples)))

    feature = convert_single_example(ex_index, example, label_list,
                                     max_seq_length, tokenizer)  # Ctrl-click convert_single_example to jump to it

    def create_int_feature(values):
      f = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))
      return f

    features = collections.OrderedDict()  # pack everything into the format the model expects
    features["input_ids"] = create_int_feature(feature.input_ids)
    features["input_mask"] = create_int_feature(feature.input_mask)
    features["segment_ids"] = create_int_feature(feature.segment_ids)
    features["label_ids"] = create_int_feature([feature.label_id])
    features["is_real_example"] = create_int_feature(
        [int(feature.is_real_example)])

    tf_example = tf.train.Example(features=tf.train.Features(feature=features))  # finally convert everything into TensorFlow's serialized format and write it out
    writer.write(tf_example.SerializeToString())
  writer.close()
~~~
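To sanity-check what was written, the TFRecord file can be read straight back with TF 1.x utilities. This is a small verification sketch, not part of run_classifier.py; the train_file path below is a placeholder for whatever was passed to the writer above:

~~~python
import tensorflow as tf

# Assumed path: wherever file_based_convert_examples_to_features wrote the records.
train_file = "output/train.tf_record"

record_iterator = tf.python_io.tf_record_iterator(path=train_file)  # TF 1.x record reader
first_record = next(record_iterator)

parsed = tf.train.Example.FromString(first_record)  # parse the serialized tf.train.Example
input_ids = parsed.features.feature["input_ids"].int64_list.value
label_ids = parsed.features.feature["label_ids"].int64_list.value

print(len(input_ids))   # equals max_seq_length, e.g. 128: real token ids followed by 0 padding
print(list(label_ids))  # e.g. [1] for a paraphrase pair
~~~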
- """ - config = copy.deepcopy(config) - if not is_training: - config.hidden_dropout_prob = 0.0 - config.attention_probs_dropout_prob = 0.0 - - input_shape = get_shape_list(input_ids, expected_rank=2) - batch_size = input_shape[0] - seq_length = input_shape[1] - - if input_mask is None: # 如果没设置mask,默认都是1 - input_mask = tf.ones(shape=[batch_size, seq_length], dtype=tf.int32) - - if token_type_ids is None: # 没设置就默认一句话 - token_type_ids = tf.zeros(shape=[batch_size, seq_length], dtype=tf.int32) - - with tf.variable_scope(scope, default_name="bert"): - with tf.variable_scope("embeddings"): - # Perform embedding lookup on the word ids. 词的embeddings - (self.embedding_output, self.embedding_table) = embedding_lookup( # ctrl点击embedding_lookup跳转 - input_ids=input_ids, # 词 - vocab_size=config.vocab_size, # 语料库 - embedding_size=config.hidden_size, # 编码映射成多少维 - initializer_range=config.initializer_range, # 初始化范围 - word_embedding_name="word_embeddings", - use_one_hot_embeddings=use_one_hot_embeddings) -~~~ - - - -~~~python -def embedding_lookup(input_ids, - vocab_size, - embedding_size=128, - initializer_range=0.02, - word_embedding_name="word_embeddings", - use_one_hot_embeddings=False): - """Looks up words embeddings for id tensor. - - Args: - input_ids: int32 Tensor of shape [batch_size, seq_length] containing word - ids. - vocab_size: int. Size of the embedding vocabulary. - embedding_size: int. Width of the word embeddings. - initializer_range: float. Embedding initialization range. - word_embedding_name: string. Name of the embedding table. - use_one_hot_embeddings: bool. If True, use one-hot method for word - embeddings. If False, use `tf.gather()`. - - Returns: - float Tensor of shape [batch_size, seq_length, embedding_size]. - """ - # This function assumes that the input is of shape [batch_size, seq_length, - # num_inputs]. - # - # If the input is a 2D tensor of shape [batch_size, seq_length], we - # reshape to [batch_size, seq_length, 1]. - if input_ids.shape.ndims == 2: - input_ids = tf.expand_dims(input_ids, axis=[-1]) - - embedding_table = tf.get_variable( # 词映射矩阵 - name=word_embedding_name, # 词向量 - shape=[vocab_size, embedding_size], # 获取语料库大表vovab.txt - initializer=create_initializer(initializer_range)) - - flat_input_ids = tf.reshape(input_ids, [-1]) - if use_one_hot_embeddings: - one_hot_input_ids = tf.one_hot(flat_input_ids, depth=vocab_size) # 查出所有词做one_hot - output = tf.matmul(one_hot_input_ids, embedding_table) # 运算一个batch里所有的映射结果 - else: - output = tf.gather(embedding_table, flat_input_ids) - - input_shape = get_shape_list(input_ids) - - output = tf.reshape(output, - input_shape[0:-1] + [input_shape[-1] * embedding_size]) # 制作返回结果 - return (output, embedding_table) # 返回,词变成了向量 -~~~ - - - -#### 位置编码 - -~~~python -class BertModel(object): - """BERT model ("Bidirectional Encoder Representations from Transformers"). - - Example usage: - - ```python - # Already been converted into WordPiece token ids - input_ids = tf.constant([[31, 51, 99], [15, 5, 0]]) - input_mask = tf.constant([[1, 1, 1], [1, 1, 0]]) - token_type_ids = tf.constant([[0, 0, 1], [0, 2, 0]]) - - config = modeling.BertConfig(vocab_size=32000, hidden_size=512, - num_hidden_layers=8, num_attention_heads=6, intermediate_size=1024) - - model = modeling.BertModel(config=config, is_training=True, - input_ids=input_ids, input_mask=input_mask, token_type_ids=token_type_ids) - - label_embeddings = tf.get_variable(...) - pooled_output = model.get_pooled_output() - logits = tf.matmul(pooled_output, label_embeddings) - ... 
#### Positional Encoding

~~~python
class BertModel(object):
  """BERT model ("Bidirectional Encoder Representations from Transformers")."""
  # (class docstring and constructor arguments are identical to the listing above)

  def __init__(self,
               config,
               is_training,
               input_ids,
               input_mask=None,
               token_type_ids=None,
               use_one_hot_embeddings=False,
               scope=None):
    ...

        # Add positional embeddings and token type embeddings, then layer
        # normalize and perform dropout.
        self.embedding_output = embedding_postprocessor(  # builds the positional encoding; Ctrl-click embedding_postprocessor to jump to it
            input_tensor=self.embedding_output,
            use_token_type=True,
            token_type_ids=token_type_ids,
            token_type_vocab_size=config.type_vocab_size,
            token_type_embedding_name="token_type_embeddings",
            use_position_embeddings=True,
            position_embedding_name="position_embeddings",
            initializer_range=config.initializer_range,
            max_position_embeddings=config.max_position_embeddings,
            dropout_prob=config.hidden_dropout_prob)
~~~
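Before diving into the implementation, it helps to keep the end result in mind: embedding_postprocessor adds a segment (token-type) embedding and a learned position embedding onto the word embeddings element-wise, leaving the tensor shape unchanged. A NumPy shape sketch with toy sizes (these are not BERT's real dimensions):

~~~python
import numpy as np

batch_size, seq_length, width = 2, 8, 16  # toy sizes; BERT-base uses width = 768

word_embeddings = np.random.randn(batch_size, seq_length, width)
token_type_embeddings = np.random.randn(batch_size, seq_length, width)  # looked up from a [type_vocab_size, width] table via the segment ids
position_embeddings = np.random.randn(1, seq_length, width)             # the first seq_length rows of the [max_position_embeddings, width] table

# The post-processed embedding is just the element-wise sum; broadcasting over
# the leading batch dimension takes care of the position term.
output = word_embeddings + token_type_embeddings + position_embeddings
print(output.shape)  # (2, 8, 16) -- same shape as the word embeddings going in
~~~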
- """ - input_shape = get_shape_list(input_tensor, expected_rank=3) - batch_size = input_shape[0] - seq_length = input_shape[1] - width = input_shape[2] - - output = input_tensor - - if use_token_type: # 判断是第一句还是第二句,再做相应处理 - if token_type_ids is None: - raise ValueError("`token_type_ids` must be specified if" - "`use_token_type` is True.") - token_type_table = tf.get_variable( - name=token_type_embedding_name, - shape=[token_type_vocab_size, width], - initializer=create_initializer(initializer_range)) - # This vocab will be small so we always do one-hot here, since it is always - # faster for a small vocabulary. - flat_token_type_ids = tf.reshape(token_type_ids, [-1]) - one_hot_ids = tf.one_hot(flat_token_type_ids, depth=token_type_vocab_size) - token_type_embeddings = tf.matmul(one_hot_ids, token_type_table) - token_type_embeddings = tf.reshape(token_type_embeddings, - [batch_size, seq_length, width]) - output += token_type_embeddings - - if use_position_embeddings: # 判断是否要做位置编码信息 - assert_op = tf.assert_less_equal(seq_length, max_position_embeddings) - with tf.control_dependencies([assert_op]): - full_position_embeddings = tf.get_variable( - name=position_embedding_name, - shape=[max_position_embeddings, width], - initializer=create_initializer(initializer_range)) - # Since the position embedding table is a learned variable, we create it - # using a (long) sequence length `max_position_embeddings`. The actual - # sequence length might be shorter than this, for faster training of - # tasks that do not have long sequences. - # - # So `full_position_embeddings` is effectively an embedding table - # for position [0, 1, 2, ..., max_position_embeddings-1], and the current - # sequence has positions [0, 1, 2, ... seq_length-1], so we can just - # perform a slice. - position_embeddings = tf.slice(full_position_embeddings, [0, 0], - [seq_length, -1]) # 如果位置编码给的过大,为了加速只需取出部分 - num_dims = len(output.shape.as_list()) - - # Only the last two dimensions are relevant (`seq_length` and `width`), so - # we broadcast among the first dimensions, which is typically just - # the batch size. - position_broadcast_shape = [] - for _ in range(num_dims - 2): - position_broadcast_shape.append(1) - position_broadcast_shape.extend([seq_length, width]) - position_embeddings = tf.reshape(position_embeddings, - position_broadcast_shape) - output += position_embeddings - - output = layer_norm_and_dropout(output, dropout_prob) - return output -~~~ -