writer.close()
~~~
#### The role of the Embedding layer
~~~python
def create_model(bert_config, is_training, input_ids, input_mask, segment_ids,
                 labels, num_labels, use_one_hot_embeddings):
  """Creates a classification model."""
  model = modeling.BertModel(  # Ctrl-click BertModel to jump to its definition
      config=bert_config,  # model configuration
      is_training=is_training,
      input_ids=input_ids,  # input features: token ids
      input_mask=input_mask,  # 0/1 mask over the input features
      token_type_ids=segment_ids,  # per-token flag: first sentence or second sentence
      use_one_hot_embeddings=use_one_hot_embeddings)
  ...
class BertModel(object):
  """BERT model ("Bidirectional Encoder Representations from Transformers").

  Example usage:

  ```python
  # Already been converted into WordPiece token ids
  input_ids = tf.constant([[31, 51, 99], [15, 5, 0]])
  input_mask = tf.constant([[1, 1, 1], [1, 1, 0]])
  token_type_ids = tf.constant([[0, 0, 1], [0, 2, 0]])

  config = modeling.BertConfig(vocab_size=32000, hidden_size=512,
    num_hidden_layers=8, num_attention_heads=6, intermediate_size=1024)

  model = modeling.BertModel(config=config, is_training=True,
    input_ids=input_ids, input_mask=input_mask, token_type_ids=token_type_ids)

  label_embeddings = tf.get_variable(...)
  pooled_output = model.get_pooled_output()
  logits = tf.matmul(pooled_output, label_embeddings)
  ...
  ```
  """
  def __init__(self,
               config,
               is_training,
               input_ids,
               input_mask=None,
               token_type_ids=None,
               use_one_hot_embeddings=False,
               scope=None):
    """Constructor for BertModel.

    Args:
      config: `BertConfig` instance.
      is_training: bool. true for training model, false for eval model. Controls
        whether dropout will be applied.
      input_ids: int32 Tensor of shape [batch_size, seq_length].
      input_mask: (optional) int32 Tensor of shape [batch_size, seq_length].
      token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length].
      use_one_hot_embeddings: (optional) bool. Whether to use one-hot word
        embeddings or tf.embedding_lookup() for the word embeddings.
      scope: (optional) variable scope. Defaults to "bert".

    Raises:
      ValueError: The config is invalid or one of the input tensor shapes
        is invalid.
    """
    config = copy.deepcopy(config)
    if not is_training:
      config.hidden_dropout_prob = 0.0
      config.attention_probs_dropout_prob = 0.0

    input_shape = get_shape_list(input_ids, expected_rank=2)
    batch_size = input_shape[0]
    seq_length = input_shape[1]

    if input_mask is None:  # if no mask is given, default to all 1s
      input_mask = tf.ones(shape=[batch_size, seq_length], dtype=tf.int32)

    if token_type_ids is None:  # if not given, treat the whole input as one sentence
      token_type_ids = tf.zeros(shape=[batch_size, seq_length], dtype=tf.int32)

    with tf.variable_scope(scope, default_name="bert"):
      with tf.variable_scope("embeddings"):
        # Perform embedding lookup on the word ids: the token embeddings
        (self.embedding_output, self.embedding_table) = embedding_lookup(  # Ctrl-click embedding_lookup to jump to its definition
            input_ids=input_ids,  # token ids
            vocab_size=config.vocab_size,  # vocabulary size
            embedding_size=config.hidden_size,  # dimensionality each token is mapped to
            initializer_range=config.initializer_range,  # initialization range
            word_embedding_name="word_embeddings",
            use_one_hot_embeddings=use_one_hot_embeddings)
~~~
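
Inside the constructor, the token ids go through `embedding_lookup`, which maps each id to a dense vector by indexing a learned `[vocab_size, hidden_size]` table, so an input of shape `[batch_size, seq_length]` becomes `[batch_size, seq_length, hidden_size]`. A minimal NumPy sketch of that shape flow (the sizes below are toy values chosen for illustration, not taken from any real config; BERT-Base itself uses a ~30k vocabulary and hidden_size=768):

~~~python
import numpy as np

batch_size, seq_length = 2, 3        # toy values for illustration
vocab_size, hidden_size = 10, 4      # toy values; BERT-Base uses ~30k and 768

# Token ids, shape [batch_size, seq_length]
input_ids = np.array([[3, 1, 9],
                      [5, 5, 0]])

# Learned lookup table, shape [vocab_size, hidden_size]
# (scaled here roughly like initializer_range=0.02 in the config)
embedding_table = np.random.randn(vocab_size, hidden_size) * 0.02

# Row indexing plays the role of tf.gather in embedding_lookup
embedding_output = embedding_table[input_ids]
print(embedding_output.shape)        # (2, 3, 4) == [batch_size, seq_length, hidden_size]
~~~

The `embedding_lookup` helper called by the constructor is defined in modeling.py: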
~~~python
def embedding_lookup(input_ids,
                     vocab_size,
                     embedding_size=128,
                     initializer_range=0.02,
                     word_embedding_name="word_embeddings",
                     use_one_hot_embeddings=False):
  """Looks up words embeddings for id tensor.

  Args:
    input_ids: int32 Tensor of shape [batch_size, seq_length] containing word
      ids.
    vocab_size: int. Size of the embedding vocabulary.
    embedding_size: int. Width of the word embeddings.
    initializer_range: float. Embedding initialization range.
    word_embedding_name: string. Name of the embedding table.
    use_one_hot_embeddings: bool. If True, use one-hot method for word
      embeddings. If False, use `tf.gather()`.

  Returns:
    float Tensor of shape [batch_size, seq_length, embedding_size].
  """
  # This function assumes that the input is of shape [batch_size, seq_length,
  # num_inputs].
  #
  # If the input is a 2D tensor of shape [batch_size, seq_length], we
  # reshape to [batch_size, seq_length, 1].
  if input_ids.shape.ndims == 2:
    input_ids = tf.expand_dims(input_ids, axis=[-1])
  embedding_table = tf.get_variable(  # the word-to-vector mapping matrix
      name=word_embedding_name,  # word embeddings
      shape=[vocab_size, embedding_size],  # one row per entry of the vocabulary (vocab.txt)
      initializer=create_initializer(initializer_range))

  flat_input_ids = tf.reshape(input_ids, [-1])
  if use_one_hot_embeddings:
    one_hot_input_ids = tf.one_hot(flat_input_ids, depth=vocab_size)  # one-hot encode every id
    output = tf.matmul(one_hot_input_ids, embedding_table)  # map the whole batch through the table in one matmul
  else:
    output = tf.gather(embedding_table, flat_input_ids)

  input_shape = get_shape_list(input_ids)

  output = tf.reshape(output,
                      input_shape[0:-1] + [input_shape[-1] * embedding_size])  # reshape back to [batch_size, seq_length, embedding_size]
  return (output, embedding_table)  # return: every word id is now a vector
~~~
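
The `use_one_hot_embeddings` flag only changes how rows are pulled out of the table, not the result: multiplying a one-hot matrix by the embedding table selects exactly the same rows as `tf.gather` (the one-hot/matmul path exists mainly because dense matmuls tend to run faster on TPUs, while gather is usually preferred on CPU/GPU). A small NumPy check of that equivalence, using made-up toy sizes:

~~~python
import numpy as np

vocab_size, embedding_size = 6, 3                          # toy sizes for illustration
embedding_table = np.random.randn(vocab_size, embedding_size)
flat_input_ids = np.array([4, 0, 2, 2])                    # flattened token ids, as in embedding_lookup

one_hot_input_ids = np.eye(vocab_size)[flat_input_ids]     # like tf.one_hot(..., depth=vocab_size)
via_matmul = one_hot_input_ids @ embedding_table           # use_one_hot_embeddings=True branch
via_gather = embedding_table[flat_input_ids]               # tf.gather branch

assert np.allclose(via_matmul, via_gather)                 # both branches give identical embeddings
~~~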
