From 79d8f4ad1a873aec3ebaa9e4b38d4ef9eb37ac6e Mon Sep 17 00:00:00 2001
From: benjas <909336740@qq.com>
Date: Fri, 8 Jan 2021 09:39:01 +0800
Subject: [PATCH] Create BERT流程解读.md
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../BERT流程解读.md | 514 ++++++++++++++++++
 1 file changed, 514 insertions(+)
 create mode 100644 NLP通用框架BERT项目实战/第二章——BERT源码解读与应用实例/BERT流程解读.md

diff --git a/NLP通用框架BERT项目实战/第二章——BERT源码解读与应用实例/BERT流程解读.md b/NLP通用框架BERT项目实战/第二章——BERT源码解读与应用实例/BERT流程解读.md
new file mode 100644
index 0000000..359587f
--- /dev/null
+++ b/NLP通用框架BERT项目实战/第二章——BERT源码解读与应用实例/BERT流程解读.md
@@ -0,0 +1,514 @@

### BERT Pipeline Walkthrough

#### Data-reading module

The processor class that handles the MRPC data:

~~~python
class MrpcProcessor(DataProcessor):
  """Processor for the MRPC data set (GLUE version)."""

  def get_train_examples(self, data_dir):
    """See base class."""
    return self._create_examples(
        self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")

  def get_dev_examples(self, data_dir):
    """See base class."""
    return self._create_examples(
        self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")

  def get_test_examples(self, data_dir):
    """See base class."""
    return self._create_examples(
        self._read_tsv(os.path.join(data_dir, "test.tsv")), "test")

  def get_labels(self):
    """See base class."""
    return ["0", "1"]  # binary classification: the label is either "0" or "1"

  def _create_examples(self, lines, set_type):
    """Creates examples for the training and dev sets."""
    examples = []
    for (i, line) in enumerate(lines):
      if i == 0:
        continue
      guid = "%s-%s" % (set_type, i)
      text_a = tokenization.convert_to_unicode(line[3])  # how text_a and text_b are pulled out of each TSV line
      text_b = tokenization.convert_to_unicode(line[4])
      if set_type == "test":
        label = "0"
      else:
        label = tokenization.convert_to_unicode(line[0])
      examples.append(
          InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
    return examples
~~~

Code that reads the training data:

~~~python
  if FLAGS.do_train:
    train_examples = processor.get_train_examples(FLAGS.data_dir)
    num_train_steps = int(
        len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs)  # total number of training steps: the number of examples divided by train_batch_size, multiplied by the number of epochs
    num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)  # warmup: start with a smaller learning rate and ramp up to the configured value after this fraction of the steps
~~~
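
For a concrete sense of the numbers, here is a back-of-envelope version of that arithmetic. It is only a sketch: it assumes the standard MRPC split (roughly 3,668 training pairs) and typical flag settings (`train_batch_size=32`, `num_train_epochs=3.0`, `warmup_proportion=0.1`); the counts change if any of those differ.

~~~python
# Hypothetical numbers, assuming ~3,668 MRPC training pairs and typical flag values.
len_train_examples = 3668
train_batch_size, num_train_epochs, warmup_proportion = 32, 3.0, 0.1

num_train_steps = int(len_train_examples / train_batch_size * num_train_epochs)
num_warmup_steps = int(num_train_steps * warmup_proportion)
print(num_train_steps, num_warmup_steps)  # 343 34 -> the LR ramps up over the first ~34 steps
~~~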

#### Data-preprocessing module

~~~python
# continues from the snippet above
  file_based_convert_examples_to_features(
      train_examples, label_list, FLAGS.max_seq_length, tokenizer, train_file)

# Ctrl+click file_based_convert_examples_to_features to jump to its definition
def file_based_convert_examples_to_features(
    examples, label_list, max_seq_length, tokenizer, output_file):
  """Convert a set of `InputExample`s to a TFRecord file."""

  writer = tf.python_io.TFRecordWriter(output_file)  # TFRecord writer: BERT expects its input data in TFRecord format

  for (ex_index, example) in enumerate(examples):
    if ex_index % 10000 == 0:
      tf.logging.info("Writing example %d of %d" % (ex_index, len(examples)))  # loop over the examples one by one
    feature = convert_single_example(ex_index, example, label_list,
                                     max_seq_length, tokenizer)  # Ctrl+click convert_single_example to jump to it

def convert_single_example(ex_index, example, label_list, max_seq_length,
                           tokenizer):
  """Converts a single `InputExample` into a single `InputFeatures`."""

  if isinstance(example, PaddingInputExample):
    return InputFeatures(
        input_ids=[0] * max_seq_length,
        input_mask=[0] * max_seq_length,
        segment_ids=[0] * max_seq_length,
        label_id=0,
        is_real_example=False)

  label_map = {}  # build the label-to-id map, e.g. {"0": 0, "1": 1}
  for (i, label) in enumerate(label_list):
    label_map[label] = i

  tokens_a = tokenizer.tokenize(example.text_a)  # Ctrl+click tokenize; tokenizes the first sentence
  tokens_b = None
  if example.text_b:  # tokenize the second sentence, if there is one
    tokens_b = tokenizer.tokenize(example.text_b)
  if tokens_b:
    # Modifies `tokens_a` and `tokens_b` in place so that the total
    # length is less than the specified length.
    # Account for [CLS], [SEP], [SEP] with "- 3"  # reserve room for the three special tokens
    _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)  # truncate the pair if it is too long
  else:
    # Account for [CLS] and [SEP] with "- 2"  # with no text_b, only two special tokens are needed
    if len(tokens_a) > max_seq_length - 2:
      tokens_a = tokens_a[0:(max_seq_length - 2)]

  # The convention in BERT is:
  # (a) For sequence pairs:  # [CLS] marks the start and [SEP] the sentence boundaries; type_ids are 0 for the first sentence and 1 for the second
  #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
  #  type_ids: 0     0  0    0    0     0       0 0     1  1  1  1   1 1

# For reference, this is what tokenizer.tokenize does (tokenization.FullTokenizer):
def tokenize(self, text):
  split_tokens = []
  for token in self.basic_tokenizer.tokenize(text):  # basic tokenization: split on whitespace and punctuation
    for sub_token in self.wordpiece_tokenizer.tokenize(token):  # WordPiece further splits a word into sub-pieces, giving richer representations
      split_tokens.append(sub_token)

  return split_tokens
~~~

#### Building the TFRecord

~~~python
# continuing inside convert_single_example
  # build the final inputs; two lists collect the tokens and their segment ids
  tokens = []
  segment_ids = []
  tokens.append("[CLS]")  # the first token is always [CLS]
  segment_ids.append(0)  # and its segment id is 0
  for token in tokens_a:
    tokens.append(token)
    segment_ids.append(0)  # every token of sentence A gets segment id 0
  tokens.append("[SEP]")  # append the [SEP] separator after sentence A
  segment_ids.append(0)  # the [SEP] after sentence A also belongs to segment 0

  if tokens_b:
    for token in tokens_b:
      tokens.append(token)
      segment_ids.append(1)  # sentence B works the same way, except its segment id is 1
    tokens.append("[SEP]")
    segment_ids.append(1)

  input_ids = tokenizer.convert_tokens_to_ids(tokens)  # map tokens to ids, i.e. their indices in vocab.txt

  # The mask has 1 for real tokens and 0 for padding tokens. Only real
  # tokens are attended to.
  input_mask = [1] * len(input_ids)

  # Zero-pad up to the sequence length, so every input has the same length.
  while len(input_ids) < max_seq_length:  # pad with 0 up to the configured max_seq_length
    input_ids.append(0)
    input_mask.append(0)
    segment_ids.append(0)

  assert len(input_ids) == max_seq_length
  assert len(input_mask) == max_seq_length
  assert len(segment_ids) == max_seq_length

  label_id = label_map[example.label]
  if ex_index < 5:
    tf.logging.info("*** Example ***")  # log the first few converted examples; at this point preprocessing is essentially done
  ...
  return feature
~~~
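
To see the resulting layout end to end, here is a small self-contained sketch (not the repo's code) that mimics the steps above. The tiny `vocab` dict and the word-level inputs are made up for illustration; the real pipeline uses the WordPiece tokenizer and the vocab.txt shipped with the pretrained checkpoint.

~~~python
# A toy re-implementation of the layout built in convert_single_example (illustrative only).
def toy_features(tokens_a, tokens_b, vocab, max_seq_length=16):
  tokens = ["[CLS]"] + tokens_a + ["[SEP]"] + tokens_b + ["[SEP]"]
  segment_ids = [0] * (len(tokens_a) + 2) + [1] * (len(tokens_b) + 1)
  input_ids = [vocab[t] for t in tokens]   # dict lookup stands in for convert_tokens_to_ids
  input_mask = [1] * len(input_ids)        # 1 = real token, 0 = padding
  while len(input_ids) < max_seq_length:   # zero-pad everything to the fixed length
    input_ids.append(0)
    input_mask.append(0)
    segment_ids.append(0)
  return tokens, input_ids, input_mask, segment_ids

vocab = {"[CLS]": 101, "[SEP]": 102, "he": 1, "is": 2, "tall": 3, "no": 4, "short": 5}
tokens, input_ids, input_mask, segment_ids = toy_features(
    ["he", "is", "tall"], ["no", "he", "is", "short"], vocab)
print(tokens)       # ['[CLS]', 'he', 'is', 'tall', '[SEP]', 'no', 'he', 'is', 'short', '[SEP]']
print(input_ids)    # [101, 1, 2, 3, 102, 4, 1, 2, 5, 102, 0, 0, 0, 0, 0, 0]
print(input_mask)   # [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0]
print(segment_ids)  # [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0]
~~~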

Back in `file_based_convert_examples_to_features`, after `convert_single_example` returns, each feature is serialized and written out:

~~~python
  for (ex_index, example) in enumerate(examples):  # iterate over all examples
    if ex_index % 10000 == 0:
      tf.logging.info("Writing example %d of %d" % (ex_index, len(examples)))

    feature = convert_single_example(ex_index, example, label_list,
                                     max_seq_length, tokenizer)  # Ctrl+click convert_single_example to jump to it

    def create_int_feature(values):
      f = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))
      return f

    features = collections.OrderedDict()  # pack the features into the format the model expects
    features["input_ids"] = create_int_feature(feature.input_ids)
    features["input_mask"] = create_int_feature(feature.input_mask)
    features["segment_ids"] = create_int_feature(feature.segment_ids)
    features["label_ids"] = create_int_feature([feature.label_id])
    features["is_real_example"] = create_int_feature(
        [int(feature.is_real_example)])

    tf_example = tf.train.Example(features=tf.train.Features(feature=features))  # finally wrap everything in a tf.train.Example and write it to the TFRecord file
    writer.write(tf_example.SerializeToString())
  writer.close()
~~~
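
For completeness, here is a minimal sketch of the reading side, i.e. how a TFRecord written this way can be parsed back into tensors. It mirrors the feature names and shapes written above; the repo's own input function (`file_based_input_fn_builder` in run_classifier.py) does essentially the same parsing and additionally casts the int64 features to int32 and handles shuffling and repeating for training.

~~~python
# Sketch of parsing the TFRecord back (TF 1.x); seq_length must equal the max_seq_length used when writing.
import tensorflow as tf

def read_records(input_file, seq_length, batch_size=32):
  name_to_features = {
      "input_ids": tf.FixedLenFeature([seq_length], tf.int64),
      "input_mask": tf.FixedLenFeature([seq_length], tf.int64),
      "segment_ids": tf.FixedLenFeature([seq_length], tf.int64),
      "label_ids": tf.FixedLenFeature([], tf.int64),
      "is_real_example": tf.FixedLenFeature([], tf.int64),
  }
  dataset = tf.data.TFRecordDataset(input_file)
  dataset = dataset.map(lambda record: tf.parse_single_example(record, name_to_features))
  return dataset.batch(batch_size)  # yields dicts of [batch_size, ...] int64 tensors
~~~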
+ """ + config = copy.deepcopy(config) + if not is_training: + config.hidden_dropout_prob = 0.0 + config.attention_probs_dropout_prob = 0.0 + + input_shape = get_shape_list(input_ids, expected_rank=2) + batch_size = input_shape[0] + seq_length = input_shape[1] + + if input_mask is None: # 如果没设置mask,默认都是1 + input_mask = tf.ones(shape=[batch_size, seq_length], dtype=tf.int32) + + if token_type_ids is None: # 没设置就默认一句话 + token_type_ids = tf.zeros(shape=[batch_size, seq_length], dtype=tf.int32) + + with tf.variable_scope(scope, default_name="bert"): + with tf.variable_scope("embeddings"): + # Perform embedding lookup on the word ids. 词的embeddings + (self.embedding_output, self.embedding_table) = embedding_lookup( # ctrl点击embedding_lookup跳转 + input_ids=input_ids, # 词 + vocab_size=config.vocab_size, # 语料库 + embedding_size=config.hidden_size, # 编码映射成多少维 + initializer_range=config.initializer_range, # 初始化范围 + word_embedding_name="word_embeddings", + use_one_hot_embeddings=use_one_hot_embeddings) +~~~ + + + +~~~python +def embedding_lookup(input_ids, + vocab_size, + embedding_size=128, + initializer_range=0.02, + word_embedding_name="word_embeddings", + use_one_hot_embeddings=False): + """Looks up words embeddings for id tensor. + + Args: + input_ids: int32 Tensor of shape [batch_size, seq_length] containing word + ids. + vocab_size: int. Size of the embedding vocabulary. + embedding_size: int. Width of the word embeddings. + initializer_range: float. Embedding initialization range. + word_embedding_name: string. Name of the embedding table. + use_one_hot_embeddings: bool. If True, use one-hot method for word + embeddings. If False, use `tf.gather()`. + + Returns: + float Tensor of shape [batch_size, seq_length, embedding_size]. + """ + # This function assumes that the input is of shape [batch_size, seq_length, + # num_inputs]. + # + # If the input is a 2D tensor of shape [batch_size, seq_length], we + # reshape to [batch_size, seq_length, 1]. + if input_ids.shape.ndims == 2: + input_ids = tf.expand_dims(input_ids, axis=[-1]) + + embedding_table = tf.get_variable( # 词映射矩阵 + name=word_embedding_name, # 词向量 + shape=[vocab_size, embedding_size], # 获取语料库大表vovab.txt + initializer=create_initializer(initializer_range)) + + flat_input_ids = tf.reshape(input_ids, [-1]) + if use_one_hot_embeddings: + one_hot_input_ids = tf.one_hot(flat_input_ids, depth=vocab_size) # 查出所有词做one_hot + output = tf.matmul(one_hot_input_ids, embedding_table) # 运算一个batch里所有的映射结果 + else: + output = tf.gather(embedding_table, flat_input_ids) + + input_shape = get_shape_list(input_ids) + + output = tf.reshape(output, + input_shape[0:-1] + [input_shape[-1] * embedding_size]) # 制作返回结果 + return (output, embedding_table) # 返回,词变成了向量 +~~~ + + + +#### 位置编码 + +~~~python +class BertModel(object): + """BERT model ("Bidirectional Encoder Representations from Transformers"). + + Example usage: + + ```python + # Already been converted into WordPiece token ids + input_ids = tf.constant([[31, 51, 99], [15, 5, 0]]) + input_mask = tf.constant([[1, 1, 1], [1, 1, 0]]) + token_type_ids = tf.constant([[0, 0, 1], [0, 2, 0]]) + + config = modeling.BertConfig(vocab_size=32000, hidden_size=512, + num_hidden_layers=8, num_attention_heads=6, intermediate_size=1024) + + model = modeling.BertModel(config=config, is_training=True, + input_ids=input_ids, input_mask=input_mask, token_type_ids=token_type_ids) + + label_embeddings = tf.get_variable(...) + pooled_output = model.get_pooled_output() + logits = tf.matmul(pooled_output, label_embeddings) + ... 
+ ``` + """ + + def __init__(self, + config, + is_training, + input_ids, + input_mask=None, + token_type_ids=None, + use_one_hot_embeddings=False, + scope=None): + """Constructor for BertModel. + + Args: + config: `BertConfig` instance. + is_training: bool. true for training model, false for eval model. Controls + whether dropout will be applied. + input_ids: int32 Tensor of shape [batch_size, seq_length]. + input_mask: (optional) int32 Tensor of shape [batch_size, seq_length]. + token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length]. + use_one_hot_embeddings: (optional) bool. Whether to use one-hot word + embeddings or tf.embedding_lookup() for the word embeddings. + scope: (optional) variable scope. Defaults to "bert". + + Raises: + ValueError: The config is invalid or one of the input tensor shapes + is invalid. + """ + ... + + # Add positional embeddings and token type embeddings, then layer + # normalize and perform dropout. + self.embedding_output = embedding_postprocessor( # 制作位置编码,ctrl点击embedding_postprocessor + input_tensor=self.embedding_output, + use_token_type=True, + token_type_ids=token_type_ids, + token_type_vocab_size=config.type_vocab_size, + token_type_embedding_name="token_type_embeddings", + use_position_embeddings=True, + position_embedding_name="position_embeddings", + initializer_range=config.initializer_range, + max_position_embeddings=config.max_position_embeddings, + dropout_prob=config.hidden_dropout_prob) +~~~ + + + +~~~python +def embedding_postprocessor(input_tensor, + use_token_type=False, + token_type_ids=None, + token_type_vocab_size=16, + token_type_embedding_name="token_type_embeddings", + use_position_embeddings=True, + position_embedding_name="position_embeddings", + initializer_range=0.02, + max_position_embeddings=512, + dropout_prob=0.1): + """Performs various post-processing on a word embedding tensor. + + Args: + input_tensor: float Tensor of shape [batch_size, seq_length, + embedding_size]. + use_token_type: bool. Whether to add embeddings for `token_type_ids`. + token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length]. + Must be specified if `use_token_type` is True. + token_type_vocab_size: int. The vocabulary size of `token_type_ids`. + token_type_embedding_name: string. The name of the embedding table variable + for token type ids. + use_position_embeddings: bool. Whether to add position embeddings for the + position of each token in the sequence. + position_embedding_name: string. The name of the embedding table variable + for positional embeddings. + initializer_range: float. Range of the weight initialization. + max_position_embeddings: int. Maximum sequence length that might ever be + used with this model. This can be longer than the sequence length of + input_tensor, but cannot be shorter. + dropout_prob: float. Dropout probability applied to the final output tensor. + + Returns: + float tensor with same shape as `input_tensor`. + + Raises: + ValueError: One of the tensor shapes or input values is invalid. 
+ """ + input_shape = get_shape_list(input_tensor, expected_rank=3) + batch_size = input_shape[0] + seq_length = input_shape[1] + width = input_shape[2] + + output = input_tensor + + if use_token_type: # 判断是第一句还是第二句,再做相应处理 + if token_type_ids is None: + raise ValueError("`token_type_ids` must be specified if" + "`use_token_type` is True.") + token_type_table = tf.get_variable( + name=token_type_embedding_name, + shape=[token_type_vocab_size, width], + initializer=create_initializer(initializer_range)) + # This vocab will be small so we always do one-hot here, since it is always + # faster for a small vocabulary. + flat_token_type_ids = tf.reshape(token_type_ids, [-1]) + one_hot_ids = tf.one_hot(flat_token_type_ids, depth=token_type_vocab_size) + token_type_embeddings = tf.matmul(one_hot_ids, token_type_table) + token_type_embeddings = tf.reshape(token_type_embeddings, + [batch_size, seq_length, width]) + output += token_type_embeddings + + if use_position_embeddings: # 判断是否要做位置编码信息 + assert_op = tf.assert_less_equal(seq_length, max_position_embeddings) + with tf.control_dependencies([assert_op]): + full_position_embeddings = tf.get_variable( + name=position_embedding_name, + shape=[max_position_embeddings, width], + initializer=create_initializer(initializer_range)) + # Since the position embedding table is a learned variable, we create it + # using a (long) sequence length `max_position_embeddings`. The actual + # sequence length might be shorter than this, for faster training of + # tasks that do not have long sequences. + # + # So `full_position_embeddings` is effectively an embedding table + # for position [0, 1, 2, ..., max_position_embeddings-1], and the current + # sequence has positions [0, 1, 2, ... seq_length-1], so we can just + # perform a slice. + position_embeddings = tf.slice(full_position_embeddings, [0, 0], + [seq_length, -1]) # 如果位置编码给的过大,为了加速只需取出部分 + num_dims = len(output.shape.as_list()) + + # Only the last two dimensions are relevant (`seq_length` and `width`), so + # we broadcast among the first dimensions, which is typically just + # the batch size. + position_broadcast_shape = [] + for _ in range(num_dims - 2): + position_broadcast_shape.append(1) + position_broadcast_shape.extend([seq_length, width]) + position_embeddings = tf.reshape(position_embeddings, + position_broadcast_shape) + output += position_embeddings + + output = layer_norm_and_dropout(output, dropout_prob) + return output +~~~ +