BERT 文本分类
BERT Text Classification
我是 BERT 新手,正在尝试通过 coursera 课程学习 BERT 文本分类微调 https://www.coursera.org/projects/fine-tune-bert-tensorflow/
根据课程,我想分别使用 'SGD' 和 'ADAM' 优化器比较 BERT-12 和 BERT-24 之间的文本分类性能。
我发现使用 BERT-12 时结果是正常的。然而,切换到 BERT-24 后,虽然准确率(accuracy)很好 (9X%),但召回率(recall)和精确率(precision)极低(甚至接近于零)。
请问我的代码是否有问题?
另外,为了提高精确率和召回率,我是否应该增加更多的全连接(Dense)层并更换激活函数?学习率取什么值最合适?
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
import sys
sys.path.append('models')
from official.nlp.data import classifier_data_lib
from official.nlp.bert import tokenization
from official.nlp import optimization
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
# Download the zip-compressed training CSV into a DataFrame.
df = pd.read_csv(
    'https://archive.org/download/fine-tune-bert-tensorflow-train.csv/train.csv.zip',
    compression='zip',
    low_memory=False,
)

# Fraction of `df` kept for training, and fraction of the *remaining* rows
# kept for validation (so validation is 0.1 * 0.9 of the full data).
train_data_ratio = 0.1
val_data_ratio = 0.1
rand_seed = 42

# Both splits are stratified on the target column.
train_df, remaining = train_test_split(
    df,
    random_state=rand_seed,
    train_size=train_data_ratio,
    stratify=df.target.values,
)
valid_df, _ = train_test_split(
    remaining,
    random_state=rand_seed,
    train_size=val_data_ratio,
    stratify=remaining.target.values,
)

# Create the (text, label) datasets under the CPU device scope.
with tf.device('/cpu:0'):
    train_data = tf.data.Dataset.from_tensor_slices(
        (train_df['question_text'].values, train_df['target'].values))
    valid_data = tf.data.Dataset.from_tensor_slices(
        (valid_df.question_text.values, valid_df.target.values))
"""
Each row of the dataset consists of the question text and its label
- Data preprocessing consists of transforming text to BERT input features:
input_word_ids, input_mask, segment_ids
- In the process, tokenizing the text is done with the provided BERT model tokenizer
"""
label_list = [0, 1]    # label categories
max_seq_length = 128   # maximum length of (token) input sequences
# NOTE(review): 1e-3 is large for fine-tuning BERT; the BERT paper suggests
# 2e-5 to 5e-5 with Adam. Confirm this value before comparing the two models.
learning_rate = 0.001
num_layer = 24         # switch between 12 (BERT-12) and 24 (BERT-24) to compare
epochs = 4
optimizer = 'SGD'

# Validate the configuration with explicit errors; `assert` statements are
# stripped when Python runs with -O.
if num_layer not in (12, 24):
    raise ValueError(f"num_layer must be 12 or 24, got {num_layer!r}")
if optimizer not in ('SGD', 'Adam'):
    raise ValueError(f"optimizer must be 'SGD' or 'Adam', got {optimizer!r}")

# The 24-layer model is trained with a much smaller batch
# (presumably to fit in accelerator memory - confirm).
train_batch_size = 32 if num_layer == 12 else 4

# Build the optimizer instance that carries the configured learning rate.
if optimizer == 'Adam':
    opt = tf.keras.optimizers.Adam(learning_rate=learning_rate)
else:
    opt = tf.keras.optimizers.SGD(learning_rate=learning_rate)
# Get the BERT layer and tokenizer.
# Model page: https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2
bert_12 = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2"
bert_24 = "https://tfhub.dev/tensorflow/bert_en_wwm_uncased_L-24_H-1024_A-16/2"

# Load the encoder selected by `num_layer`; trainable=True so it is fine-tuned.
if num_layer == 12:
    bert_layer = hub.KerasLayer(bert_12, trainable=True)
elif num_layer == 24:
    bert_layer = hub.KerasLayer(bert_24, trainable=True)

# The vocabulary file and casing flag are read from the loaded model itself,
# so the tokenizer always matches the selected encoder.
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()  # tensor -> numpy
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)
# Turn one raw (text, label) pair into the features BERT understands.
def to_feature(text, label, label_list=label_list, max_seq_length=max_seq_length, tokenizer=tokenizer):
    """Convert a single example into BERT input features.

    Args:
        text: eager tensor holding the input text (read via ``.numpy()``).
        label: eager tensor holding the label (read via ``.numpy()``).
        label_list: the label categories passed to the converter.
        max_seq_length: sequence length passed to the converter.
        tokenizer: tokenizer passed to the converter.

    Returns:
        Tuple ``(input_ids, input_mask, segment_ids, label_id)`` taken from
        the converted feature object.
    """
    raw_example = classifier_data_lib.InputExample(
        guid=None,
        text_a=text.numpy(),
        text_b=None,
        label=label.numpy(),
    )
    converted = classifier_data_lib.convert_single_example(
        0, raw_example, label_list, max_seq_length, tokenizer)
    return (
        converted.input_ids,
        converted.input_mask,
        converted.segment_ids,
        converted.label_id,
    )
def to_feature_map(text, label):
    """Wrap ``to_feature`` with ``tf.py_function`` for use in ``Dataset.map``.

    Returns:
        ``(features, label_id)`` where ``features`` is a dict with the keys
        ``input_word_ids``, ``input_mask`` and ``input_type_ids``.
    """
    outputs = tf.py_function(to_feature, inp=[text, label], Tout=[tf.int32] * 4)
    input_ids, input_mask, segment_ids, label_id = outputs

    # Declare the static shapes explicitly on the py_function outputs.
    for sequence_tensor in (input_ids, input_mask, segment_ids):
        sequence_tensor.set_shape([max_seq_length])
    label_id.set_shape([])

    features = {
        'input_word_ids': input_ids,
        'input_mask': input_mask,
        'input_type_ids': segment_ids,
    }
    return (features, label_id)
with tf.device('/cpu:0'):
    # Training pipeline: featurize -> shuffle -> batch -> prefetch.
    # (A .cache() step after map() is commented out in the original pipeline.)
    train_data = train_data.map(
        to_feature_map, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    train_data = train_data.shuffle(1000)
    train_data = train_data.batch(train_batch_size, drop_remainder=True)
    train_data = train_data.prefetch(tf.data.experimental.AUTOTUNE)

    # Validation pipeline: same steps, but without shuffling.
    valid_data = valid_data.map(
        to_feature_map, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    valid_data = valid_data.batch(train_batch_size, drop_remainder=True)
    valid_data = valid_data.prefetch(tf.data.experimental.AUTOTUNE)
# Building the model
def create_model():
    """Build the classifier: BERT encoder -> dropout -> 1-unit sigmoid layer.

    Returns:
        A ``tf.keras.Model`` whose inputs are a dict keyed by
        ``input_word_ids``, ``input_mask`` and ``input_type_ids`` (each an
        int32 tensor of length ``max_seq_length``) and whose output is the
        sigmoid layer named ``output``.
    """
    # One int32 input per BERT feature, created in the order BERT expects.
    inputs = {
        feature_name: tf.keras.layers.Input(
            shape=(max_seq_length,), dtype=tf.int32, name=feature_name)
        for feature_name in ("input_word_ids", "input_mask", "input_type_ids")
    }

    # Only the pooled output feeds the classification head.
    pooled_output, _ = bert_layer(
        [inputs["input_word_ids"], inputs["input_mask"], inputs["input_type_ids"]])
    dropped = tf.keras.layers.Dropout(0.4)(pooled_output)
    probability = tf.keras.layers.Dense(1, activation="sigmoid", name="output")(dropped)

    return tf.keras.Model(inputs=inputs, outputs=probability)
model = create_model()

# Pass the configured optimizer *instance* (`opt`). Passing the string name
# in `optimizer` would make Keras build a default optimizer and silently
# ignore `learning_rate`.
model.compile(
    optimizer=opt,
    loss=tf.keras.losses.BinaryCrossentropy(),
    # tf.keras.metrics.BinaryAccuracy() can be added here to also track accuracy.
    metrics=[tf.keras.metrics.Recall(), tf.keras.metrics.Precision()],
)

# Train, evaluating on the validation set after every epoch.
history = model.fit(
    train_data,
    validation_data=valid_data,
    epochs=epochs,
    verbose=1,
)
import matplotlib.pyplot as plt
def plot_graphs(history, metric):
    """Plot the training and validation curves of one metric.

    Args:
        history: object whose ``history`` attribute maps metric names to
            per-epoch values (as returned by ``model.fit``).
        metric: name of the training metric; the validation series is read
            from the key ``'val_' + metric``.
    """
    recorded = history.history
    val_key = 'val_' + metric

    plt.plot(recorded[metric])
    plt.plot(recorded[val_key], '')
    plt.xlabel("Epochs")
    plt.ylabel(metric)
    plt.legend([metric, val_key])
    plt.show()
非常感谢!
也许可以尝试在自定义回调函数中计算精确率和召回率,这样您就可以检查到底发生了什么。
我在其中添加了一个调试断点 (pdb.set_trace()),因此程序会在第一个 epoch 结束后暂停,您可以逐步执行并检查数据。
from sklearn.metrics import precision_score, recall_score
import pdb
class Callbacks(tf.keras.callbacks.Callback):
    """Compute precision and recall on the validation data after each epoch.

    Args:
        valid_data: iterable of ``(features, labels)`` batches, e.g. a batched
            ``tf.data.Dataset``; ``features`` must be accepted by the model.
    """

    def __init__(self, valid_data):
        # The class is named `Callbacks`; zero-argument super() avoids
        # referring to a class name that does not exist.
        super().__init__()
        self.valid_data = valid_data

    def on_epoch_end(self, epoch, logs=None):
        """Score the validation data and add the results to ``logs``.

        Writes the keys ``'precision'`` and ``'recall'`` into ``logs`` so
        they show up in the training output.
        """
        logs = {} if logs is None else logs

        # Debug point: pauses after every epoch so the data can be inspected.
        pdb.set_trace()

        # A dataset cannot be sliced, so walk it batch by batch. Predicting
        # per batch keeps labels and scores aligned.
        true_batches = []
        score_batches = []
        for features, labels in self.valid_data:
            true_batches.append(np.asarray(labels))
            score_batches.append(np.asarray(self.model.predict_on_batch(features)))
        val_y_true = np.concatenate(true_batches).ravel()
        val_scores = np.concatenate(score_batches).ravel()

        # The model ends in a single sigmoid unit, so the predicted class is
        # a 0.5 threshold on the score (argmax over one column is always 0).
        val_y_pred = (val_scores > 0.5).astype(int)

        # Calculate precision and recall.
        # NOTE(review): average='weighted' mixes in the majority class; use
        # the default binary average to inspect the positive class only.
        precision = precision_score(val_y_true, val_y_pred, average='weighted')
        recall = recall_score(val_y_true, val_y_pred, average='weighted')

        # Add scores to logs to see them in the training output.
        logs['precision'] = precision
        logs['recall'] = recall
要将验证数据传递给回调,您需要在调用 fit 函数时添加如下内容:
# Hand the validation dataset to the callback and register it with fit().
cbs = Callbacks(valid_data)
model.fit(train_data,
          validation_data=valid_data,
          epochs=epochs,
          callbacks=[cbs])
我是 BERT 新手,正在尝试通过 coursera 课程学习 BERT 文本分类微调 https://www.coursera.org/projects/fine-tune-bert-tensorflow/
根据课程,我想分别使用 'SGD' 和 'ADAM' 优化器比较 BERT-12 和 BERT-24 之间的文本分类性能。
我发现使用 BERT-12 时结果是正常的。然而,切换到 BERT-24 后,虽然准确率(accuracy)很好 (9X%),但召回率(recall)和精确率(precision)极低(甚至接近于零)。
请问我的代码是否有问题?
另外,为了提高精确率和召回率,我是否应该增加更多的全连接(Dense)层并更换激活函数?学习率取什么值最合适?
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
import sys
sys.path.append('models')
from official.nlp.data import classifier_data_lib
from official.nlp.bert import tokenization
from official.nlp import optimization
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
# Download the zip-compressed training CSV into a DataFrame.
df = pd.read_csv(
    'https://archive.org/download/fine-tune-bert-tensorflow-train.csv/train.csv.zip',
    compression='zip',
    low_memory=False,
)

# Fraction of `df` kept for training, and fraction of the *remaining* rows
# kept for validation (so validation is 0.1 * 0.9 of the full data).
train_data_ratio = 0.1
val_data_ratio = 0.1
rand_seed = 42

# Both splits are stratified on the target column.
train_df, remaining = train_test_split(
    df,
    random_state=rand_seed,
    train_size=train_data_ratio,
    stratify=df.target.values,
)
valid_df, _ = train_test_split(
    remaining,
    random_state=rand_seed,
    train_size=val_data_ratio,
    stratify=remaining.target.values,
)

# Create the (text, label) datasets under the CPU device scope.
with tf.device('/cpu:0'):
    train_data = tf.data.Dataset.from_tensor_slices(
        (train_df['question_text'].values, train_df['target'].values))
    valid_data = tf.data.Dataset.from_tensor_slices(
        (valid_df.question_text.values, valid_df.target.values))
"""
Each row of the dataset consists of the question text and its label
- Data preprocessing consists of transforming text to BERT input features:
input_word_ids, input_mask, segment_ids
- In the process, tokenizing the text is done with the provided BERT model tokenizer
"""
label_list = [0, 1]    # label categories
max_seq_length = 128   # maximum length of (token) input sequences
# NOTE(review): 1e-3 is large for fine-tuning BERT; the BERT paper suggests
# 2e-5 to 5e-5 with Adam. Confirm this value before comparing the two models.
learning_rate = 0.001
num_layer = 24         # switch between 12 (BERT-12) and 24 (BERT-24) to compare
epochs = 4
optimizer = 'SGD'

# Validate the configuration with explicit errors; `assert` statements are
# stripped when Python runs with -O.
if num_layer not in (12, 24):
    raise ValueError(f"num_layer must be 12 or 24, got {num_layer!r}")
if optimizer not in ('SGD', 'Adam'):
    raise ValueError(f"optimizer must be 'SGD' or 'Adam', got {optimizer!r}")

# The 24-layer model is trained with a much smaller batch
# (presumably to fit in accelerator memory - confirm).
train_batch_size = 32 if num_layer == 12 else 4

# Build the optimizer instance that carries the configured learning rate.
if optimizer == 'Adam':
    opt = tf.keras.optimizers.Adam(learning_rate=learning_rate)
else:
    opt = tf.keras.optimizers.SGD(learning_rate=learning_rate)
# Get the BERT layer and tokenizer.
# Model page: https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2
bert_12 = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2"
bert_24 = "https://tfhub.dev/tensorflow/bert_en_wwm_uncased_L-24_H-1024_A-16/2"

# Load the encoder selected by `num_layer`; trainable=True so it is fine-tuned.
if num_layer == 12:
    bert_layer = hub.KerasLayer(bert_12, trainable=True)
elif num_layer == 24:
    bert_layer = hub.KerasLayer(bert_24, trainable=True)

# The vocabulary file and casing flag are read from the loaded model itself,
# so the tokenizer always matches the selected encoder.
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()  # tensor -> numpy
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)
# Turn one raw (text, label) pair into the features BERT understands.
def to_feature(text, label, label_list=label_list, max_seq_length=max_seq_length, tokenizer=tokenizer):
    """Convert a single example into BERT input features.

    Args:
        text: eager tensor holding the input text (read via ``.numpy()``).
        label: eager tensor holding the label (read via ``.numpy()``).
        label_list: the label categories passed to the converter.
        max_seq_length: sequence length passed to the converter.
        tokenizer: tokenizer passed to the converter.

    Returns:
        Tuple ``(input_ids, input_mask, segment_ids, label_id)`` taken from
        the converted feature object.
    """
    raw_example = classifier_data_lib.InputExample(
        guid=None,
        text_a=text.numpy(),
        text_b=None,
        label=label.numpy(),
    )
    converted = classifier_data_lib.convert_single_example(
        0, raw_example, label_list, max_seq_length, tokenizer)
    return (
        converted.input_ids,
        converted.input_mask,
        converted.segment_ids,
        converted.label_id,
    )
def to_feature_map(text, label):
    """Wrap ``to_feature`` with ``tf.py_function`` for use in ``Dataset.map``.

    Returns:
        ``(features, label_id)`` where ``features`` is a dict with the keys
        ``input_word_ids``, ``input_mask`` and ``input_type_ids``.
    """
    outputs = tf.py_function(to_feature, inp=[text, label], Tout=[tf.int32] * 4)
    input_ids, input_mask, segment_ids, label_id = outputs

    # Declare the static shapes explicitly on the py_function outputs.
    for sequence_tensor in (input_ids, input_mask, segment_ids):
        sequence_tensor.set_shape([max_seq_length])
    label_id.set_shape([])

    features = {
        'input_word_ids': input_ids,
        'input_mask': input_mask,
        'input_type_ids': segment_ids,
    }
    return (features, label_id)
with tf.device('/cpu:0'):
    # Training pipeline: featurize -> shuffle -> batch -> prefetch.
    # (A .cache() step after map() is commented out in the original pipeline.)
    train_data = train_data.map(
        to_feature_map, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    train_data = train_data.shuffle(1000)
    train_data = train_data.batch(train_batch_size, drop_remainder=True)
    train_data = train_data.prefetch(tf.data.experimental.AUTOTUNE)

    # Validation pipeline: same steps, but without shuffling.
    valid_data = valid_data.map(
        to_feature_map, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    valid_data = valid_data.batch(train_batch_size, drop_remainder=True)
    valid_data = valid_data.prefetch(tf.data.experimental.AUTOTUNE)
# Building the model
def create_model():
    """Build the classifier: BERT encoder -> dropout -> 1-unit sigmoid layer.

    Returns:
        A ``tf.keras.Model`` whose inputs are a dict keyed by
        ``input_word_ids``, ``input_mask`` and ``input_type_ids`` (each an
        int32 tensor of length ``max_seq_length``) and whose output is the
        sigmoid layer named ``output``.
    """
    # One int32 input per BERT feature, created in the order BERT expects.
    inputs = {
        feature_name: tf.keras.layers.Input(
            shape=(max_seq_length,), dtype=tf.int32, name=feature_name)
        for feature_name in ("input_word_ids", "input_mask", "input_type_ids")
    }

    # Only the pooled output feeds the classification head.
    pooled_output, _ = bert_layer(
        [inputs["input_word_ids"], inputs["input_mask"], inputs["input_type_ids"]])
    dropped = tf.keras.layers.Dropout(0.4)(pooled_output)
    probability = tf.keras.layers.Dense(1, activation="sigmoid", name="output")(dropped)

    return tf.keras.Model(inputs=inputs, outputs=probability)
model = create_model()

# Pass the configured optimizer *instance* (`opt`). Passing the string name
# in `optimizer` would make Keras build a default optimizer and silently
# ignore `learning_rate`.
model.compile(
    optimizer=opt,
    loss=tf.keras.losses.BinaryCrossentropy(),
    # tf.keras.metrics.BinaryAccuracy() can be added here to also track accuracy.
    metrics=[tf.keras.metrics.Recall(), tf.keras.metrics.Precision()],
)

# Train, evaluating on the validation set after every epoch.
history = model.fit(
    train_data,
    validation_data=valid_data,
    epochs=epochs,
    verbose=1,
)
import matplotlib.pyplot as plt
def plot_graphs(history, metric):
    """Plot the training and validation curves of one metric.

    Args:
        history: object whose ``history`` attribute maps metric names to
            per-epoch values (as returned by ``model.fit``).
        metric: name of the training metric; the validation series is read
            from the key ``'val_' + metric``.
    """
    recorded = history.history
    val_key = 'val_' + metric

    plt.plot(recorded[metric])
    plt.plot(recorded[val_key], '')
    plt.xlabel("Epochs")
    plt.ylabel(metric)
    plt.legend([metric, val_key])
    plt.show()
非常感谢!
也许可以尝试在自定义回调函数中计算精确率和召回率,这样您就可以检查到底发生了什么。
我在其中添加了一个调试断点 (pdb.set_trace()),因此程序会在第一个 epoch 结束后暂停,您可以逐步执行并检查数据。
from sklearn.metrics import precision_score, recall_score
import pdb
class Callbacks(tf.keras.callbacks.Callback):
    """Compute precision and recall on the validation data after each epoch.

    Args:
        valid_data: iterable of ``(features, labels)`` batches, e.g. a batched
            ``tf.data.Dataset``; ``features`` must be accepted by the model.
    """

    def __init__(self, valid_data):
        # The class is named `Callbacks`; zero-argument super() avoids
        # referring to a class name that does not exist.
        super().__init__()
        self.valid_data = valid_data

    def on_epoch_end(self, epoch, logs=None):
        """Score the validation data and add the results to ``logs``.

        Writes the keys ``'precision'`` and ``'recall'`` into ``logs`` so
        they show up in the training output.
        """
        logs = {} if logs is None else logs

        # Debug point: pauses after every epoch so the data can be inspected.
        pdb.set_trace()

        # A dataset cannot be sliced, so walk it batch by batch. Predicting
        # per batch keeps labels and scores aligned.
        true_batches = []
        score_batches = []
        for features, labels in self.valid_data:
            true_batches.append(np.asarray(labels))
            score_batches.append(np.asarray(self.model.predict_on_batch(features)))
        val_y_true = np.concatenate(true_batches).ravel()
        val_scores = np.concatenate(score_batches).ravel()

        # The model ends in a single sigmoid unit, so the predicted class is
        # a 0.5 threshold on the score (argmax over one column is always 0).
        val_y_pred = (val_scores > 0.5).astype(int)

        # Calculate precision and recall.
        # NOTE(review): average='weighted' mixes in the majority class; use
        # the default binary average to inspect the positive class only.
        precision = precision_score(val_y_true, val_y_pred, average='weighted')
        recall = recall_score(val_y_true, val_y_pred, average='weighted')

        # Add scores to logs to see them in the training output.
        logs['precision'] = precision
        logs['recall'] = recall
要将验证数据传递给回调,您需要在调用 fit 函数时添加如下内容:
# Hand the validation dataset to the callback and register it with fit().
cbs = Callbacks(valid_data)
model.fit(train_data,
          validation_data=valid_data,
          epochs=epochs,
          callbacks=[cbs])