BERT 文本分类

Question

我是 BERT 新手，正在尝试通过 coursera 课程学习 BERT 文本分类微调 https://www.coursera.org/projects/fine-tune-bert-tensorflow/

根据课程，我想分别使用 'SGD' 和 'ADAM' 优化器比较 BERT-12 和 BERT-24 之间的文本分类性能。

我发现使用 BERT-12 时结果是正常的。然而，切换到 BERT-24 之后，虽然准确率（accuracy）很高（9X%），但召回率（recall）和精确率（precision）极低（甚至接近于零）。

请问我的代码是否有问题？

另外，为了提高精确率和召回率，我是否应该增加更多的全连接（Dense）层并更换激活函数？学习率取什么值最合适？

import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
import sys
sys.path.append('models')
from official.nlp.data import classifier_data_lib
from official.nlp.bert import tokenization
from official.nlp import optimization

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the zipped training CSV straight from the archive URL.
df = pd.read_csv(
    'https://archive.org/download/fine-tune-bert-tensorflow-train.csv/train.csv.zip',
    compression='zip',
    low_memory=False,
)

rand_seed = 42
train_data_ratio = 0.1
val_data_ratio = 0.1

# Stratified sampling on `target` keeps the label distribution in each subset.
train_df, remaining = train_test_split(
    df,
    train_size=train_data_ratio,
    stratify=df['target'].values,
    random_state=rand_seed,
)
# The validation ratio is applied to the rows left over after the training
# split, not to the full dataset.
valid_df, _ = train_test_split(
    remaining,
    train_size=val_data_ratio,
    stratify=remaining['target'].values,
    random_state=rand_seed,
)

# Build the (text, label) datasets under the CPU device scope.
with tf.device('/cpu:0'):
  train_data = tf.data.Dataset.from_tensor_slices(
      (train_df.question_text.values, train_df.target.values))
  valid_data = tf.data.Dataset.from_tensor_slices(
      (valid_df['question_text'].values, valid_df['target'].values))

"""
- Each row of the dataset is composed of the question text and its label
- Data preprocessing consists of transforming the text into BERT input features:
  input_word_ids, input_mask, input_type_ids (segment_ids)
- In the process, the text is tokenized with the tokenizer provided by the BERT model
"""

# --- Experiment configuration -------------------------------------------
label_list = [0, 1]      # label categories
max_seq_length = 128     # maximum length of (token) input sequences
train_batch_size = 32    # initial value; re-derived from num_layer below
learning_rate = 0.001
num_layer = 24           # switch between 12 and 24 to compare BERT-12 / BERT-24
epochs = 4
optimizer = 'SGD'        # optimizer name: 'SGD' or 'Adam'

# Batch size depends on the model depth: 32 for BERT-12, 4 for BERT-24
# (presumably so the larger model fits in memory -- TODO confirm).
assert num_layer in [12, 24]
train_batch_size = 32 if num_layer == 12 else 4

# Build the optimizer instance `opt` with the configured learning rate.
assert optimizer in ['SGD', 'Adam']
opt = (
    tf.keras.optimizers.Adam(learning_rate=learning_rate)
    if optimizer == 'Adam'
    else tf.keras.optimizers.SGD(learning_rate=learning_rate)
)


# Get BERT layer and tokenizer:
# https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2
bert_12 = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2"
# NOTE(review): this 24-layer handle is the whole-word-masking ("wwm")
# variant while the 12-layer handle is not, so the BERT-12 / BERT-24
# comparison also changes the pre-training recipe -- confirm this is intended.
bert_24 = "https://tfhub.dev/tensorflow/bert_en_wwm_uncased_L-24_H-1024_A-16/2"

# Load the encoder selected by num_layer; trainable=True so it is fine-tuned.
if num_layer == 12:
    bert_layer = hub.KerasLayer(bert_12, trainable=True)
elif num_layer == 24:
    bert_layer = hub.KerasLayer(bert_24, trainable=True)

# Build the tokenizer from the vocabulary file and lower-casing flag exposed
# by the loaded module; .numpy() turns the stored tensors into Python values.
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)


# from data to features that can be understood by bert

def to_feature(text, label, label_list=label_list, max_seq_length=max_seq_length, tokenizer=tokenizer):
  """Turn one (text, label) pair into BERT input features.

  `text` and `label` must be eager tensors, since `.numpy()` is called on
  both. The remaining parameters default to the module-level settings.

  Returns:
    A tuple (input_ids, input_mask, segment_ids, label_id) taken from the
    converted example.
  """
  raw_example = classifier_data_lib.InputExample(
      guid=None,
      text_a=text.numpy(),
      text_b=None,
      label=label.numpy(),
  )
  # A constant 0 is passed as the first positional argument.
  converted = classifier_data_lib.convert_single_example(
      0, raw_example, label_list, max_seq_length, tokenizer)

  return (
      converted.input_ids,
      converted.input_mask,
      converted.segment_ids,
      converted.label_id,
  )
  
def to_feature_map(text, label):
  """Wrap to_feature so it can be used inside `Dataset.map`.

  Runs to_feature through tf.py_function, sets the static shapes of the
  four int32 outputs explicitly, and packs the three feature tensors into
  the dict keyed by the model's input names.

  Returns:
    A tuple (features_dict, label_id).
  """
  outputs = tf.py_function(
      to_feature,
      inp=[text, label],
      Tout=[tf.int32] * 4,
  )
  input_ids, input_mask, segment_ids, label_id = outputs

  # Each feature is a vector of max_seq_length entries; the label is a scalar.
  for feature_tensor in (input_ids, input_mask, segment_ids):
    feature_tensor.set_shape([max_seq_length])
  label_id.set_shape([])

  features = {
      'input_word_ids': input_ids,
      'input_mask': input_mask,
      'input_type_ids': segment_ids,
  }
  return (features, label_id)
  
with tf.device('/cpu:0'):
  # Training pipeline: featurize -> shuffle -> batch -> prefetch.
  train_data = train_data.map(
      to_feature_map, num_parallel_calls=tf.data.experimental.AUTOTUNE)
  train_data = train_data.shuffle(1000)
  train_data = train_data.batch(train_batch_size, drop_remainder=True)
  train_data = train_data.prefetch(tf.data.experimental.AUTOTUNE)

  # Validation pipeline: same steps without shuffling.
  valid_data = valid_data.map(
      to_feature_map, num_parallel_calls=tf.data.experimental.AUTOTUNE)
  valid_data = valid_data.batch(train_batch_size, drop_remainder=True)
  valid_data = valid_data.prefetch(tf.data.experimental.AUTOTUNE)
  

# Building the model
def create_model():
  """Build the classifier: BERT encoder -> dropout -> single sigmoid unit.

  The model takes a dict with the three int32 inputs `input_word_ids`,
  `input_mask` and `input_type_ids`, each of length max_seq_length, and
  outputs one sigmoid value per example.
  """
  inputs = {
      name: tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32, name=name)
      for name in ("input_word_ids", "input_mask", "input_type_ids")
  }

  # Only the pooled output feeds the classification head.
  pooled_output, _sequence_output = bert_layer(
      [inputs["input_word_ids"], inputs["input_mask"], inputs["input_type_ids"]])

  dropped = tf.keras.layers.Dropout(0.4)(pooled_output)
  probability = tf.keras.layers.Dense(1, activation="sigmoid", name="output")(dropped)

  return tf.keras.Model(inputs=inputs, outputs=probability)
  
  
model = create_model()
# Pass the configured optimizer instance `opt`, not the name string
# `optimizer`: compiling with a string builds that optimizer with its default
# settings, so the `learning_rate` chosen above would be silently ignored.
# Accuracy is reported together with recall and precision so that a model
# that scores high accuracy with near-zero recall is visible in one run.
model.compile(optimizer=opt,
              loss=tf.keras.losses.BinaryCrossentropy(),
              metrics=[tf.keras.metrics.BinaryAccuracy(),
                       tf.keras.metrics.Recall(),
                       tf.keras.metrics.Precision()])

history = model.fit(train_data,
                    validation_data=valid_data,
                    epochs=epochs,
                    verbose=1)
                    
import matplotlib.pyplot as plt

def plot_graphs(history, metric):
  """Plot the training and validation curves of `metric` and show the figure.

  `history` is the object returned by `model.fit`; its `history` dict must
  contain both `metric` and `'val_' + metric`.
  """
  val_metric = 'val_'+metric
  curves = history.history
  plt.plot(curves[metric])
  plt.plot(curves[val_metric], '')
  plt.xlabel("Epochs")
  plt.ylabel(metric)
  plt.legend([metric, val_metric])
  plt.show()

非常感谢！

Answer 1

也许可以尝试在自定义回调函数中计算精确率和召回率，这样您就可以检查到底发生了什么。我用 pdb.set_trace() 添加了一个断点，因此训练过程会在第一个轮次（epoch）结束后暂停，您可以逐步执行每一行来检查数据。

from sklearn.metrics import precision_score, recall_score
import pdb


class Callbacks(tf.keras.callbacks.Callback):
    """Compute scikit-learn precision and recall on validation data each epoch.

    Args:
        valid_data: Iterable of (features, labels) batches, e.g. the batched
            tf.data.Dataset that is also passed as `validation_data`.
        threshold: Scores greater than or equal to this value are counted as
            the positive class. The model is assumed to emit one sigmoid
            score per example.
        debug: When true, drop into pdb at the start of every
            `on_epoch_end` call so the data can be inspected step by step.
    """

    def __init__(self, valid_data, threshold=0.5, debug=True):
        super().__init__()
        self.valid_data = valid_data
        self.threshold = threshold
        self.debug = debug

    def on_epoch_end(self, epoch, logs=None):
        """Score the validation data and store the results in `logs`."""
        if self.debug:
            pdb.set_trace()

        # Collect labels and scores in a single pass so they stay aligned
        # even if the dataset yields batches in a different order each time.
        true_batches = []
        score_batches = []
        for features, labels in self.valid_data:
            true_batches.append(tf.reshape(labels, [-1]))
            score_batches.append(
                tf.reshape(self.model(features, training=False), [-1]))
        if not true_batches:
            return  # nothing to evaluate

        val_y_true = tf.concat(true_batches, axis=0).numpy()
        val_scores = tf.concat(score_batches, axis=0).numpy()

        # Threshold the sigmoid scores; argmax over a single output column
        # would always return class 0.
        val_y_pred = (val_scores >= self.threshold).astype(int)

        # Report the positive class only (average='binary'); a weighted
        # average would be dominated by the majority class. zero_division=0
        # yields 0 instead of a warning when no positive is predicted.
        precision = precision_score(
            val_y_true, val_y_pred, average='binary', zero_division=0)
        recall = recall_score(
            val_y_true, val_y_pred, average='binary', zero_division=0)

        # Add scores to logs to see them in the training output.
        # NOTE(review): these keys replace any same-named entries already in
        # `logs`, e.g. from Keras metrics called 'precision' / 'recall'.
        if logs is not None:
            logs['precision'] = precision
            logs['recall'] = recall

要将验证数据传递给回调，您需要在调用 fit 函数时添加如下内容：

# Hand the validation dataset to the callback and register it with fit().
cbs = Callbacks(valid_data)

model.fit(train_data,
          validation_data=valid_data,
          epochs=epochs,
          callbacks=[cbs])

BERT 文本分类

BERT Text Classification

deep-learning

keras

tensorflow

bert-language-model