From 55084a11fe526c5cfc304ee6c1dbfdb516228ede Mon Sep 17 00:00:00 2001
From: benjas <909336740@qq.com>
Date: Fri, 8 Jan 2021 09:22:16 +0800
Subject: [PATCH] Add. Embedding

---
 .../源码解读.md | 146 ++++++++++++++++++
 1 file changed, 146 insertions(+)

diff --git a/NLP通用框架BERT项目实战/第二章——BERT源码解读与应用实例/源码解读.md b/NLP通用框架BERT项目实战/第二章——BERT源码解读与应用实例/源码解读.md
index 1807f34..4a72883 100644
--- a/NLP通用框架BERT项目实战/第二章——BERT源码解读与应用实例/源码解读.md
+++ b/NLP通用框架BERT项目实战/第二章——BERT源码解读与应用实例/源码解读.md
@@ -198,3 +198,149 @@ def convert_single_example(ex_index, example, label_list, max_seq_length,
     writer.close()
 ~~~
+
+
+#### What the Embedding layer does
+
+~~~python
+def create_model(bert_config, is_training, input_ids, input_mask, segment_ids,
+                 labels, num_labels, use_one_hot_embeddings):
+  """Creates a classification model."""
+  model = modeling.BertModel(  # Ctrl-click BertModel to jump to its definition
+      config=bert_config,  # model configuration
+      is_training=is_training,
+      input_ids=input_ids,  # the input features (token ids)
+      input_mask=input_mask,  # 0/1 mask over those features
+      token_type_ids=segment_ids,  # marks whether each position belongs to the first or the second sentence
+      use_one_hot_embeddings=use_one_hot_embeddings)
+  ...
+
+class BertModel(object):
+  """BERT model ("Bidirectional Encoder Representations from Transformers").
+
+  Example usage:
+
+  ```python
+  # Already been converted into WordPiece token ids
+  input_ids = tf.constant([[31, 51, 99], [15, 5, 0]])
+  input_mask = tf.constant([[1, 1, 1], [1, 1, 0]])
+  token_type_ids = tf.constant([[0, 0, 1], [0, 2, 0]])
+
+  config = modeling.BertConfig(vocab_size=32000, hidden_size=512,
+    num_hidden_layers=8, num_attention_heads=6, intermediate_size=1024)
+
+  model = modeling.BertModel(config=config, is_training=True,
+    input_ids=input_ids, input_mask=input_mask, token_type_ids=token_type_ids)
+
+  label_embeddings = tf.get_variable(...)
+  pooled_output = model.get_pooled_output()
+  logits = tf.matmul(pooled_output, label_embeddings)
+  ...
+  ```
+  """
+
+  def __init__(self,
+               config,
+               is_training,
+               input_ids,
+               input_mask=None,
+               token_type_ids=None,
+               use_one_hot_embeddings=False,
+               scope=None):
+    """Constructor for BertModel.
+
+    Args:
+      config: `BertConfig` instance.
+      is_training: bool. true for training model, false for eval model. Controls
+        whether dropout will be applied.
+      input_ids: int32 Tensor of shape [batch_size, seq_length].
+      input_mask: (optional) int32 Tensor of shape [batch_size, seq_length].
+      token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length].
+      use_one_hot_embeddings: (optional) bool. Whether to use one-hot word
+        embeddings or tf.embedding_lookup() for the word embeddings.
+      scope: (optional) variable scope. Defaults to "bert".
+
+    Raises:
+      ValueError: The config is invalid or one of the input tensor shapes
+        is invalid.
+    """
+    config = copy.deepcopy(config)
+    if not is_training:
+      config.hidden_dropout_prob = 0.0
+      config.attention_probs_dropout_prob = 0.0
+
+    input_shape = get_shape_list(input_ids, expected_rank=2)
+    batch_size = input_shape[0]
+    seq_length = input_shape[1]
+
+    if input_mask is None:  # if no mask is given, default to all 1s
+      input_mask = tf.ones(shape=[batch_size, seq_length], dtype=tf.int32)
+
+    if token_type_ids is None:  # if not given, treat the input as a single sentence (all 0s)
+      token_type_ids = tf.zeros(shape=[batch_size, seq_length], dtype=tf.int32)
+
+    with tf.variable_scope(scope, default_name="bert"):
+      with tf.variable_scope("embeddings"):
+        # Perform embedding lookup on the word ids.  (the word embeddings)
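+        # The call below maps every token id to a dense vector:
+        #   self.embedding_output has shape [batch_size, seq_length, hidden_size]
+        #   self.embedding_table  has shape [vocab_size, hidden_size]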
+        (self.embedding_output, self.embedding_table) = embedding_lookup(  # Ctrl-click embedding_lookup to jump to its definition
+            input_ids=input_ids,  # the token ids
+            vocab_size=config.vocab_size,  # vocabulary size
+            embedding_size=config.hidden_size,  # how many dimensions each token is mapped to
+            initializer_range=config.initializer_range,  # initialization range
+            word_embedding_name="word_embeddings",
+            use_one_hot_embeddings=use_one_hot_embeddings)
+~~~
+
+
+
+~~~python
+def embedding_lookup(input_ids,
+                     vocab_size,
+                     embedding_size=128,
+                     initializer_range=0.02,
+                     word_embedding_name="word_embeddings",
+                     use_one_hot_embeddings=False):
+  """Looks up words embeddings for id tensor.
+
+  Args:
+    input_ids: int32 Tensor of shape [batch_size, seq_length] containing word
+      ids.
+    vocab_size: int. Size of the embedding vocabulary.
+    embedding_size: int. Width of the word embeddings.
+    initializer_range: float. Embedding initialization range.
+    word_embedding_name: string. Name of the embedding table.
+    use_one_hot_embeddings: bool. If True, use one-hot method for word
+      embeddings. If False, use `tf.gather()`.
+
+  Returns:
+    float Tensor of shape [batch_size, seq_length, embedding_size].
+  """
+  # This function assumes that the input is of shape [batch_size, seq_length,
+  # num_inputs].
+  #
+  # If the input is a 2D tensor of shape [batch_size, seq_length], we
+  # reshape to [batch_size, seq_length, 1].
+  if input_ids.shape.ndims == 2:
+    input_ids = tf.expand_dims(input_ids, axis=[-1])
+
+  embedding_table = tf.get_variable(  # the word-embedding matrix
+      name=word_embedding_name,  # word vectors
+      shape=[vocab_size, embedding_size],  # one row per entry of the vocabulary file vocab.txt
+      initializer=create_initializer(initializer_range))
+
+  flat_input_ids = tf.reshape(input_ids, [-1])
+  if use_one_hot_embeddings:
+    one_hot_input_ids = tf.one_hot(flat_input_ids, depth=vocab_size)  # one-hot encode every token id
+    output = tf.matmul(one_hot_input_ids, embedding_table)  # one matmul maps the whole batch to its embeddings
+  else:
+    output = tf.gather(embedding_table, flat_input_ids)
+
+  input_shape = get_shape_list(input_ids)
+
+  output = tf.reshape(output,
+                      input_shape[0:-1] + [input_shape[-1] * embedding_size])  # reshape into the final result
+  return (output, embedding_table)  # return: every token id has become a vector
+~~~
+
+
+
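+To make the lookup concrete, here is a minimal NumPy sketch (not part of the BERT source; the sizes are toy values chosen only for illustration). It shows that the one-hot branch and the `tf.gather`-style indexing branch of `embedding_lookup` produce the same result, and how the flat output is reshaped back to `[batch_size, seq_length, embedding_size]`:
+
+~~~python
+import numpy as np
+
+# Toy sizes for illustration; the real model uses a much larger vocabulary
+# and embedding_size = hidden_size.
+vocab_size, embedding_size = 8, 4
+rng = np.random.default_rng(0)
+
+# Analogue of embedding_table: one row of weights per vocabulary id.
+embedding_table = rng.normal(scale=0.02, size=(vocab_size, embedding_size))
+
+# Analogue of input_ids with shape [batch_size, seq_length].
+input_ids = np.array([[3, 1, 5],
+                      [2, 2, 0]])
+flat_input_ids = input_ids.reshape(-1)           # [batch_size * seq_length]
+
+# Branch 1: one-hot encode every id, then multiply with the table.
+one_hot = np.eye(vocab_size)[flat_input_ids]     # [6, vocab_size]
+out_one_hot = one_hot @ embedding_table          # [6, embedding_size]
+
+# Branch 2: direct row indexing, the analogue of tf.gather.
+out_gather = embedding_table[flat_input_ids]     # [6, embedding_size]
+
+assert np.allclose(out_one_hot, out_gather)      # both branches agree
+
+# Reshape back to [batch_size, seq_length, embedding_size].
+output = out_gather.reshape(input_ids.shape + (embedding_size,))
+print(output.shape)  # (2, 3, 4)
+~~~
+
+In practice the gather branch is the usual choice; the one-hot branch exists mainly because dense matmuls of this form tend to be faster on TPUs.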