### Source Code Walkthrough

#### Data Reading Module

The class that processes the MRPC data:

~~~python
class MrpcProcessor(DataProcessor):
  """Processor for the MRPC data set (GLUE version)."""

  def get_train_examples(self, data_dir):
    """See base class."""
    return self._create_examples(
        self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")

  def get_dev_examples(self, data_dir):
    """See base class."""
    return self._create_examples(
        self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")

  def get_test_examples(self, data_dir):
    """See base class."""
    return self._create_examples(
        self._read_tsv(os.path.join(data_dir, "test.tsv")), "test")

  def get_labels(self):
    """See base class."""
    return ["0", "1"]  # binary classification, so there are only two labels

  def _create_examples(self, lines, set_type):
    """Creates examples for the training and dev sets."""
    examples = []
    for (i, line) in enumerate(lines):
      if i == 0:
        continue
      guid = "%s-%s" % (set_type, i)
      text_a = tokenization.convert_to_unicode(line[3])  # which TSV columns text_a and text_b are taken from
      text_b = tokenization.convert_to_unicode(line[4])
      if set_type == "test":
        label = "0"
      else:
        label = tokenization.convert_to_unicode(line[0])
      examples.append(
          InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
    return examples
~~~

Code that reads the training data:

~~~python
  if FLAGS.do_train:
    train_examples = processor.get_train_examples(FLAGS.data_dir)
    num_train_steps = int(
        len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs)  # total number of optimization steps: the dataset size len(train_examples) divided by train_batch_size, multiplied by the number of epochs
    num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)  # keep the learning rate small at the very start; after this warmup fraction of the steps it ramps back to the configured learning rate
~~~
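To make the step arithmetic concrete, here is a minimal sketch with assumed values (the dataset size is the GLUE MRPC training-set size; the batch size, epoch count, and warmup proportion are example flag settings, not values prescribed by this walkthrough):

~~~python
# Minimal sketch of the step/warmup arithmetic with assumed flag values.
train_examples_count = 3668   # size of the MRPC train split
train_batch_size = 32         # assumed --train_batch_size
num_train_epochs = 3.0        # assumed --num_train_epochs
warmup_proportion = 0.1       # assumed --warmup_proportion

num_train_steps = int(train_examples_count / train_batch_size * num_train_epochs)
num_warmup_steps = int(num_train_steps * warmup_proportion)

print(num_train_steps)   # 343 -> total optimizer updates over all epochs
print(num_warmup_steps)  # 34  -> steps spent ramping the learning rate up from near zero
~~~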
#### Data Preprocessing Module

~~~python
# Continuing from the training branch above
    file_based_convert_examples_to_features(
        train_examples, label_list, FLAGS.max_seq_length, tokenizer, train_file)

# Ctrl-click file_based_convert_examples_to_features to jump to its definition
def file_based_convert_examples_to_features(
    examples, label_list, max_seq_length, tokenizer, output_file):
  """Convert a set of `InputExample`s to a TFRecord file."""

  writer = tf.python_io.TFRecordWriter(output_file)  # TFRecord writer; BERT expects its input data in TFRecord form

  for (ex_index, example) in enumerate(examples):
    if ex_index % 10000 == 0:
      tf.logging.info("Writing example %d of %d" % (ex_index, len(examples)))  # loop over the examples and convert them one by one
    feature = convert_single_example(ex_index, example, label_list,
                                     max_seq_length, tokenizer)  # Ctrl-click convert_single_example to jump to it

def convert_single_example(ex_index, example, label_list, max_seq_length,
                           tokenizer):
  """Converts a single `InputExample` into a single `InputFeatures`."""

  if isinstance(example, PaddingInputExample):
    return InputFeatures(
        input_ids=[0] * max_seq_length,
        input_mask=[0] * max_seq_length,
        segment_ids=[0] * max_seq_length,
        label_id=0,
        is_real_example=False)

  label_map = {}  # build the label-to-id mapping, e.g. {"0": 0, "1": 1}
  for (i, label) in enumerate(label_list):
    label_map[label] = i

  tokens_a = tokenizer.tokenize(example.text_a)  # Ctrl-click tokenize; tokenize the first sentence
  tokens_b = None
  if example.text_b:  # tokenize the second sentence
    tokens_b = tokenizer.tokenize(example.text_b)

  if tokens_b:
    # Modifies `tokens_a` and `tokens_b` in place so that the total
    # length is less than the specified length.
    # Account for [CLS], [SEP], [SEP] with "- 3": reserve room for the three special tokens
    _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)  # truncate the pair if it is too long
  else:
    # Account for [CLS] and [SEP] with "- 2": only two special tokens when there is no text_b
    if len(tokens_a) > max_seq_length - 2:
      tokens_a = tokens_a[0:(max_seq_length - 2)]

  # The convention in BERT is:
  # (a) For sequence pairs: the sequence starts with [CLS], the sentences are
  #     separated and terminated by [SEP], and type_ids record which segment
  #     each token belongs to: 0 for the first sentence, 1 for the second.
  #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
  #  type_ids: 0     0  0    0    0     0       0 0     1  1  1  1   1 1

# From tokenization.py: FullTokenizer.tokenize, the method called above
  def tokenize(self, text):
    split_tokens = []
    for token in self.basic_tokenizer.tokenize(text):  # WordPiece splits a word into sub-word pieces so that rare words still get a meaningful representation
      for sub_token in self.wordpiece_tokenizer.tokenize(token):
        split_tokens.append(sub_token)

    return split_tokens
~~~

#### Building the TFRecord File

~~~~python
# Continuing the convert_single_example module from above
  # Build the final inputs; two lists collect the tokens and their segment ids
  tokens = []
  segment_ids = []
  tokens.append("[CLS]")   # the first token is always [CLS]
  segment_ids.append(0)    # so the first segment id is naturally 0
  for token in tokens_a:
    tokens.append(token)
    segment_ids.append(0)  # every token of sentence a gets segment id 0
  tokens.append("[SEP]")   # after sentence a, append the [SEP] separator
  segment_ids.append(0)    # and the matching 0 for it

  if tokens_b:
    for token in tokens_b:
      tokens.append(token)
      segment_ids.append(1)  # sentence b works the same way, except its segment id is 1
    tokens.append("[SEP]")
    segment_ids.append(1)

  input_ids = tokenizer.convert_tokens_to_ids(tokens)  # map each token to its id, i.e. its index in the vocab.txt vocabulary

  # The mask has 1 for real tokens and 0 for padding tokens. Only real
  # tokens are attended to.
  input_mask = [1] * len(input_ids)

  # Zero-pad up to the sequence length, so every input ends up the same length
  while len(input_ids) < max_seq_length:  # the amount of padding depends on the configured maximum length; anything shorter is filled with zeros
    input_ids.append(0)
    input_mask.append(0)
    segment_ids.append(0)

  assert len(input_ids) == max_seq_length
  assert len(input_mask) == max_seq_length
  assert len(segment_ids) == max_seq_length

  label_id = label_map[example.label]
  if ex_index < 5:
    tf.logging.info("*** Example ***")  # log the first few converted examples; at this point the preprocessing is essentially done
  ...
  return feature
~~~~
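The layout produced by convert_single_example can be reproduced with a few lines of plain Python. The following is a toy sketch (hand-made token lists and max_seq_length=12, not code from run_classifier.py) that shows how tokens, segment_ids, input_mask, and the zero padding line up:

~~~python
# Toy sketch of the feature layout for one sentence pair with max_seq_length = 12.
tokens_a = ["he", "likes", "nlp"]
tokens_b = ["so", "does", "she"]
max_seq_length = 12

tokens = ["[CLS]"] + tokens_a + ["[SEP]"] + tokens_b + ["[SEP]"]
segment_ids = [0] * (len(tokens_a) + 2) + [1] * (len(tokens_b) + 1)
input_mask = [1] * len(tokens)

# Zero-pad the id lists up to max_seq_length, mirroring the while-loop above
# (input_ids is skipped here because it would need the real vocab.txt).
pad = max_seq_length - len(tokens)
segment_ids += [0] * pad
input_mask += [0] * pad

print(tokens)       # ['[CLS]', 'he', 'likes', 'nlp', '[SEP]', 'so', 'does', 'she', '[SEP]']
print(segment_ids)  # [0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0]
print(input_mask)   # [1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]
~~~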
Back in file_based_convert_examples_to_features, each returned feature is packed into a tf.train.Example and written to the TFRecord file:

~~~python
  for (ex_index, example) in enumerate(examples):  # keep iterating over the examples
    if ex_index % 10000 == 0:
      tf.logging.info("Writing example %d of %d" % (ex_index, len(examples)))

    feature = convert_single_example(ex_index, example, label_list,
                                     max_seq_length, tokenizer)  # Ctrl-click convert_single_example to jump to it

    def create_int_feature(values):
      f = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))
      return f

    features = collections.OrderedDict()  # pack everything into the format the model expects
    features["input_ids"] = create_int_feature(feature.input_ids)
    features["input_mask"] = create_int_feature(feature.input_mask)
    features["segment_ids"] = create_int_feature(feature.segment_ids)
    features["label_ids"] = create_int_feature([feature.label_id])
    features["is_real_example"] = create_int_feature(
        [int(feature.is_real_example)])

    tf_example = tf.train.Example(features=tf.train.Features(feature=features))  # finally convert everything into TensorFlow's serialized format and write it out
    writer.write(tf_example.SerializeToString())
  writer.close()
~~~
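To sanity-check what was written, the TFRecord file can be read straight back with TF 1.x utilities. This is a small verification sketch, not part of run_classifier.py; the train_file path below is a placeholder for whatever was passed to the writer above:

~~~python
import tensorflow as tf

# Assumed path: wherever file_based_convert_examples_to_features wrote the records.
train_file = "output/train.tf_record"

record_iterator = tf.python_io.tf_record_iterator(path=train_file)  # TF 1.x record reader
first_record = next(record_iterator)

parsed = tf.train.Example.FromString(first_record)  # parse the serialized tf.train.Example
input_ids = parsed.features.feature["input_ids"].int64_list.value
label_ids = parsed.features.feature["label_ids"].int64_list.value

print(len(input_ids))   # equals max_seq_length, e.g. 128: real token ids followed by 0 padding
print(list(label_ids))  # e.g. [1] for a paraphrase pair
~~~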
- """ - config = copy.deepcopy(config) - if not is_training: - config.hidden_dropout_prob = 0.0 - config.attention_probs_dropout_prob = 0.0 - - input_shape = get_shape_list(input_ids, expected_rank=2) - batch_size = input_shape[0] - seq_length = input_shape[1] - - if input_mask is None: # 如果没设置mask,默认都是1 - input_mask = tf.ones(shape=[batch_size, seq_length], dtype=tf.int32) - - if token_type_ids is None: # 没设置就默认一句话 - token_type_ids = tf.zeros(shape=[batch_size, seq_length], dtype=tf.int32) - - with tf.variable_scope(scope, default_name="bert"): - with tf.variable_scope("embeddings"): - # Perform embedding lookup on the word ids. 词的embeddings - (self.embedding_output, self.embedding_table) = embedding_lookup( # ctrl点击embedding_lookup跳转 - input_ids=input_ids, # 词 - vocab_size=config.vocab_size, # 语料库 - embedding_size=config.hidden_size, # 编码映射成多少维 - initializer_range=config.initializer_range, # 初始化范围 - word_embedding_name="word_embeddings", - use_one_hot_embeddings=use_one_hot_embeddings) -~~~ - - - -~~~python -def embedding_lookup(input_ids, - vocab_size, - embedding_size=128, - initializer_range=0.02, - word_embedding_name="word_embeddings", - use_one_hot_embeddings=False): - """Looks up words embeddings for id tensor. - - Args: - input_ids: int32 Tensor of shape [batch_size, seq_length] containing word - ids. - vocab_size: int. Size of the embedding vocabulary. - embedding_size: int. Width of the word embeddings. - initializer_range: float. Embedding initialization range. - word_embedding_name: string. Name of the embedding table. - use_one_hot_embeddings: bool. If True, use one-hot method for word - embeddings. If False, use `tf.gather()`. - - Returns: - float Tensor of shape [batch_size, seq_length, embedding_size]. - """ - # This function assumes that the input is of shape [batch_size, seq_length, - # num_inputs]. - # - # If the input is a 2D tensor of shape [batch_size, seq_length], we - # reshape to [batch_size, seq_length, 1]. - if input_ids.shape.ndims == 2: - input_ids = tf.expand_dims(input_ids, axis=[-1]) - - embedding_table = tf.get_variable( # 词映射矩阵 - name=word_embedding_name, # 词向量 - shape=[vocab_size, embedding_size], # 获取语料库大表vovab.txt - initializer=create_initializer(initializer_range)) - - flat_input_ids = tf.reshape(input_ids, [-1]) - if use_one_hot_embeddings: - one_hot_input_ids = tf.one_hot(flat_input_ids, depth=vocab_size) # 查出所有词做one_hot - output = tf.matmul(one_hot_input_ids, embedding_table) # 运算一个batch里所有的映射结果 - else: - output = tf.gather(embedding_table, flat_input_ids) - - input_shape = get_shape_list(input_ids) - - output = tf.reshape(output, - input_shape[0:-1] + [input_shape[-1] * embedding_size]) # 制作返回结果 - return (output, embedding_table) # 返回,词变成了向量 -~~~ - - - -#### 位置编码 - -~~~python -class BertModel(object): - """BERT model ("Bidirectional Encoder Representations from Transformers"). - - Example usage: - - ```python - # Already been converted into WordPiece token ids - input_ids = tf.constant([[31, 51, 99], [15, 5, 0]]) - input_mask = tf.constant([[1, 1, 1], [1, 1, 0]]) - token_type_ids = tf.constant([[0, 0, 1], [0, 2, 0]]) - - config = modeling.BertConfig(vocab_size=32000, hidden_size=512, - num_hidden_layers=8, num_attention_heads=6, intermediate_size=1024) - - model = modeling.BertModel(config=config, is_training=True, - input_ids=input_ids, input_mask=input_mask, token_type_ids=token_type_ids) - - label_embeddings = tf.get_variable(...) - pooled_output = model.get_pooled_output() - logits = tf.matmul(pooled_output, label_embeddings) - ... 
#### Positional Encoding

~~~python
class BertModel(object):
  """BERT model ("Bidirectional Encoder Representations from Transformers")."""
  # (class docstring and constructor arguments are identical to the listing above)

  def __init__(self,
               config,
               is_training,
               input_ids,
               input_mask=None,
               token_type_ids=None,
               use_one_hot_embeddings=False,
               scope=None):
    ...

        # Add positional embeddings and token type embeddings, then layer
        # normalize and perform dropout.
        self.embedding_output = embedding_postprocessor(  # builds the positional encoding; Ctrl-click embedding_postprocessor to jump to it
            input_tensor=self.embedding_output,
            use_token_type=True,
            token_type_ids=token_type_ids,
            token_type_vocab_size=config.type_vocab_size,
            token_type_embedding_name="token_type_embeddings",
            use_position_embeddings=True,
            position_embedding_name="position_embeddings",
            initializer_range=config.initializer_range,
            max_position_embeddings=config.max_position_embeddings,
            dropout_prob=config.hidden_dropout_prob)
~~~
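Before diving into the implementation, it helps to keep the end result in mind: embedding_postprocessor adds a segment (token-type) embedding and a learned position embedding onto the word embeddings element-wise, leaving the tensor shape unchanged. A NumPy shape sketch with toy sizes (these are not BERT's real dimensions):

~~~python
import numpy as np

batch_size, seq_length, width = 2, 8, 16  # toy sizes; BERT-base uses width = 768

word_embeddings = np.random.randn(batch_size, seq_length, width)
token_type_embeddings = np.random.randn(batch_size, seq_length, width)  # looked up from a [type_vocab_size, width] table via the segment ids
position_embeddings = np.random.randn(1, seq_length, width)             # the first seq_length rows of the [max_position_embeddings, width] table

# The post-processed embedding is just the element-wise sum; broadcasting over
# the leading batch dimension takes care of the position term.
output = word_embeddings + token_type_embeddings + position_embeddings
print(output.shape)  # (2, 8, 16) -- same shape as the word embeddings going in
~~~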
- """ - input_shape = get_shape_list(input_tensor, expected_rank=3) - batch_size = input_shape[0] - seq_length = input_shape[1] - width = input_shape[2] - - output = input_tensor - - if use_token_type: # 判断是第一句还是第二句,再做相应处理 - if token_type_ids is None: - raise ValueError("`token_type_ids` must be specified if" - "`use_token_type` is True.") - token_type_table = tf.get_variable( - name=token_type_embedding_name, - shape=[token_type_vocab_size, width], - initializer=create_initializer(initializer_range)) - # This vocab will be small so we always do one-hot here, since it is always - # faster for a small vocabulary. - flat_token_type_ids = tf.reshape(token_type_ids, [-1]) - one_hot_ids = tf.one_hot(flat_token_type_ids, depth=token_type_vocab_size) - token_type_embeddings = tf.matmul(one_hot_ids, token_type_table) - token_type_embeddings = tf.reshape(token_type_embeddings, - [batch_size, seq_length, width]) - output += token_type_embeddings - - if use_position_embeddings: # 判断是否要做位置编码信息 - assert_op = tf.assert_less_equal(seq_length, max_position_embeddings) - with tf.control_dependencies([assert_op]): - full_position_embeddings = tf.get_variable( - name=position_embedding_name, - shape=[max_position_embeddings, width], - initializer=create_initializer(initializer_range)) - # Since the position embedding table is a learned variable, we create it - # using a (long) sequence length `max_position_embeddings`. The actual - # sequence length might be shorter than this, for faster training of - # tasks that do not have long sequences. - # - # So `full_position_embeddings` is effectively an embedding table - # for position [0, 1, 2, ..., max_position_embeddings-1], and the current - # sequence has positions [0, 1, 2, ... seq_length-1], so we can just - # perform a slice. - position_embeddings = tf.slice(full_position_embeddings, [0, 0], - [seq_length, -1]) # 如果位置编码给的过大,为了加速只需取出部分 - num_dims = len(output.shape.as_list()) - - # Only the last two dimensions are relevant (`seq_length` and `width`), so - # we broadcast among the first dimensions, which is typically just - # the batch size. - position_broadcast_shape = [] - for _ in range(num_dims - 2): - position_broadcast_shape.append(1) - position_broadcast_shape.extend([seq_length, width]) - position_embeddings = tf.reshape(position_embeddings, - position_broadcast_shape) - output += position_embeddings - - output = layer_norm_and_dropout(output, dropout_prob) - return output -~~~ -