我的文本分类器模型在多类别情况下没有改进
My text classifier model doesn't improve with multiple classes
我正在尝试为文本 classification 训练一个模型,该模型采用从文章中嵌入的最多 300 个整数的列表。该模型训练没有问题,除了准确性不会提高。
目标由 41 个类别组成,编码为从 0 到 41 的 int,然后进行归一化。
table 看起来像这样
另外,我不知道我的模型应该是什么样子,因为我参考了下面两个不同的例子
我已经尝试基于这两个模型修改我的模型,但模型精度不会改变,甚至每个时期都会降低
我应该为我的模型添加更多层还是我做了一些我没有意识到的蠢事?
注意:如果 'df.pickle' 的下载链接失效,请使用此链接(this link)
from sklearn.model_selection import train_test_split
from urllib.request import urlopen
from os.path import exists
from os import mkdir
import tensorflow as tf
import pandas as pd
import pickle
# Local cache location for the pickled dataframe
df_path = 'df.pickle'
# Download the dataframe only when no local copy exists yet
if not exists(df_path):
    # NOTE(review): this looks like a temporary Dropbox content link and may
    # stop working -- confirm the URL is still valid before relying on it.
    content = urlopen('https://ucd92a22d5e0d4d29b8edb608305.dl.dropboxusercontent.com/cd/0/get/Askx_25n3JI-jmnZsWXmMmRgd4O2EH1w9l0U6zCMq7xdSXs_IN_i2zuUviseqa9N7-WrReFbGhQi8CeseV5cNsFTO8dzRmSdxjr-MWEDQNpPaZ8Ik29E_58YAjY57qTc4CA/file#').read()
    # Cache the raw downloaded bytes so later runs can skip the download
    with open(df_path, 'wb') as file:
        file.write(content)
    # SECURITY: unpickling can execute arbitrary code; only load data that
    # comes from a source you trust.
    df = pickle.loads(content)
else:
    # Reuse the cached copy; the context manager closes the file handle
    with open(df_path, 'rb') as file:
        df = pickle.load(file)
# Divide every category code by 41.
# NOTE(review): after this step the targets are fractions rather than
# integer class ids -- confirm that this is what the model should learn.
df['Category_Code'] = df['Category_Code'].apply(lambda code: code / 41)

# Hold out 15% of the rows for testing (fixed seed for reproducibility)
x_train, x_test, y_train, y_test = train_test_split(
    df['Content_Parsed'],
    df['Category_Code'],
    test_size=0.15,
    random_state=8,
)

# Re-assemble each split into its own two-column dataframe
train_df = pd.DataFrame()
test_df = pd.DataFrame()
train_df['Content_Parsed'] = x_train
train_df['Category_Code'] = y_train
test_df['Content_Parsed'] = x_test
test_df['Category_Code'] = y_test

# Vocabulary size kept by the tokenizer
NUM_WORDS = 10000
# Fixed length of every token sequence fed to the model
SEQ_LEN = 300

# Build the vocabulary from the training texts only; words outside of it
# are mapped to the '<UNK>' token
tokenizer = tf.keras.preprocessing.text.Tokenizer(
    num_words=NUM_WORDS,
    oov_token='<UNK>',
)
tokenizer.fit_on_texts(train_df['Content_Parsed'])

# Turn each text into a list of word indexes
train_seqs = tokenizer.texts_to_sequences(train_df['Content_Parsed'])
test_seqs = tokenizer.texts_to_sequences(test_df['Content_Parsed'])

# Bring every sequence to SEQ_LEN entries: shorter ones are padded at the
# end, longer ones are truncated
pad_sequences = tf.keras.preprocessing.sequence.pad_sequences
train_seqs = pad_sequences(train_seqs, maxlen=SEQ_LEN, padding="post")
test_seqs = pad_sequences(test_seqs, maxlen=SEQ_LEN, padding="post")
# Make sure the output folder for trained models exists
if not exists('Models'):
    mkdir('Models')
# Location of the serialized model
model_path = 'Models/model.pickle'
# Train a new model only when no saved one is found
if not exists(model_path):
    # Dimensionality of the learned word embeddings
    EMBEDDING_SIZE = 16
    # Alternative (disabled) architecture, kept for reference
    '''
    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(NUM_WORDS, EMBEDDING_SIZE),
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(EMBEDDING_SIZE)),
        # tf.keras.layers.Dense(EMBEDDING_SIZE, activation='relu'),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])
    '''
    # NOTE(review): a single sigmoid output trained with binary_crossentropy
    # fits a two-class target. For multi-class labels, use a softmax layer
    # with one unit per class together with categorical_crossentropy.
    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(NUM_WORDS, EMBEDDING_SIZE),
        # tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(EMBEDDING_SIZE)),
        tf.keras.layers.GlobalAveragePooling1D(),
        tf.keras.layers.Dense(EMBEDDING_SIZE, activation='relu'),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])
    # Compile the model
    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy']
    )
    # Stop training once the monitored quantity stops improving.
    # NOTE(review): the logged metric name depends on the TensorFlow version
    # ('val_acc' in older releases, 'val_accuracy' in newer ones) -- confirm
    # that the monitor key matches the installed version.
    es = tf.keras.callbacks.EarlyStopping(monitor='val_acc', mode='max', patience=1)
    # Batch size (can be tuned)
    BATCH_SIZE = 16
    # Maximum number of training epochs
    EPOCHS = 20
    # Pin training to the first GPU; if this raises, no GPU is available and
    # the device string should be switched to the CPU
    with tf.device('/GPU:0'):
        # Train on 80% of the training data, validate on the remaining 20%
        history = model.fit(
            train_seqs,
            train_df['Category_Code'].values,
            batch_size=BATCH_SIZE,
            epochs=EPOCHS,
            validation_split=0.2,
            validation_steps=30,
            callbacks=[es]
        )
    # Evaluate the trained model on the held-out test split
    model.evaluate(test_seqs, test_df['Category_Code'].values)
    # Persist the trained model.
    # NOTE(review): pickling Keras models is not supported by every
    # TensorFlow version; model.save() is the documented way -- confirm.
    with open(model_path, 'wb') as file:
        pickle.dump(model, file)
else:
    # Load the previously trained model; the context manager closes the file
    with open(model_path, 'rb') as file:
        model = pickle.load(file)
# Print the layer overview of the model
model.summary()
经过 2 天的调整和了解更多示例后,我发现 this 网站很好地解释了 multi-class classification.
我所做的修改详情如下:
因为我要为多个类别(classes)构建模型,所以在编译模型时应该使用 categorical_crossentropy 作为损失函数,而不是 binary_crossentropy。
模型的输出数量应该与要分类的类别总数相同,在我的例子中是 41(独热编码,one-hot encoding)。
最后一层的激活函数应该是 "softmax",因为我们要选择置信度最高(最接近 1.0)的那个标签。
您需要根据要分类的类别数量相应地调整各层。请参阅此处(here)了解如何改进您的模型。
我的最终代码看起来像这样
from sklearn.model_selection import train_test_split
from urllib.request import urlopen
from functools import reduce
from os.path import exists
from os import listdir
from sys import exit
import tensorflow as tf
import pandas as pd
import pickle
import re
# Local cache location for the pickled dataframe
df_path = 'df.pickle'
# Download the dataframe only when no local copy exists yet
if not exists(df_path):
    # URL of the pickled dataframe
    url = 'https://www.dropbox.com/s/76hibe24hmpz3bk/df.pickle?dl=1'
    # Raw bytes of the pickled dataframe
    content = urlopen(url).read()
    # The download already is a pickle, so cache the bytes unchanged;
    # wrapping them in pickle.dumps() would make the next run load a bytes
    # object instead of the dataframe
    with open(df_path, 'wb') as file:
        file.write(content)
    # SECURITY: unpickling can execute arbitrary code; only load data that
    # comes from a source you trust.
    df = pickle.loads(content)
else:
    # Reuse the cached copy; the context manager closes the file handle
    with open(df_path, 'rb') as file:
        df = pickle.load(file)
    # Cache files written by the earlier version of this script hold the
    # pickle bytes wrapped in a second pickle layer; unwrap them
    if isinstance(df, bytes):
        df = pickle.loads(df)
# Hyper-parameters shared by the tokenizer and the model
MAX_NUM_WORDS = 50000   # vocabulary size kept by the tokenizer
MAX_SEQ_LENGTH = 600    # length every token sequence is padded/cut to
EMBEDDING_SIZE = 256    # width of the embedding vectors (tunable)
# One output unit per distinct category label
category_values = df['Category'].unique()
OUTPUT_LENGTH = len(category_values)

# Build the vocabulary from the article texts (lower-cased)
articles = df['Content_Parsed'].values
tokenizer = tf.keras.preprocessing.text.Tokenizer(
    num_words=MAX_NUM_WORDS,
    lower=True,
)
tokenizer.fit_on_texts(articles)

# Word -> index mapping learned by the tokenizer
word_index = tokenizer.word_index
vocabulary_size = len(word_index)
print('Found {} unique tokens'.format(vocabulary_size))

# Encode each article as a sequence of word indexes with a fixed length of
# MAX_SEQ_LENGTH
X = tf.keras.preprocessing.sequence.pad_sequences(
    tokenizer.texts_to_sequences(articles),
    maxlen=MAX_SEQ_LENGTH,
)
print('Shape of feature tensor: {}'.format(X.shape))

# One-hot encode the category labels (one column per category)
Y = pd.get_dummies(df['Category']).values
print('Shape of label tensor: {}'.format(Y.shape))

# Hold out 10% of the samples for testing (fixed seed for reproducibility)
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.1,
    random_state=42,
)
print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)
# Build the classifier: embedding -> dropout -> LSTM -> softmax
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(MAX_NUM_WORDS, EMBEDDING_SIZE, input_length=MAX_SEQ_LENGTH))
model.add(tf.keras.layers.SpatialDropout1D(0.2))
# The number of LSTM units (64) can be tuned based on model performance
model.add(tf.keras.layers.LSTM(64, dropout=0.2, recurrent_dropout=0.2))
# Output layer: one softmax unit per class (OUTPUT_LENGTH units)
model.add(tf.keras.layers.Dense(OUTPUT_LENGTH, activation='softmax'))
# Multi-class targets are one-hot encoded, hence categorical cross-entropy
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# Training settings
EPOCHS = 100  # Upper bound on epochs; early stopping may end training sooner
BATCH_SIZE = 64  # Batch size (tunable)
checkpoint_path = 'model_checkpoints'  # Where the checkpoint callback saves the model
# Pin training to the first GPU
with tf.device('/GPU:0'):
    # Train on 90% of the training data, validate on the remaining 10%
    history = model.fit(
        x_train, y_train,
        epochs=EPOCHS,
        batch_size=BATCH_SIZE,
        validation_split=0.1,
        callbacks=[
            # Stop once the validation loss no longer improves by 0.0001
            tf.keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0.0001),
            # Keep only the best full model seen so far.
            # NOTE(review): the logged metric name depends on the TensorFlow
            # version ('val_acc' in older releases, 'val_accuracy' in newer
            # ones) -- confirm that the monitor key matches.
            tf.keras.callbacks.ModelCheckpoint(
                checkpoint_path,
                monitor='val_acc',
                save_best_only=True,
                save_weights_only=False
            )
        ],
        verbose=1
    )
现在,我的模型准确度表现良好,并且每个时期(epoch)都在提高;但由于验证准确度(val_acc 大约为 76%~77%)表现不佳,我可能还需要稍微调整模型或其中的层。
下面提供了输出快照
我正在尝试为文本 classification 训练一个模型,该模型采用从文章中嵌入的最多 300 个整数的列表。该模型训练没有问题,除了准确性不会提高。
目标由 41 个类别组成,编码为从 0 到 41 的 int,然后进行归一化。
table 看起来像这样
另外,我不知道我的模型应该是什么样子,因为我参考了下面两个不同的例子
我已经尝试基于这两个模型修改我的模型,但模型精度不会改变,甚至每个时期都会降低
我应该为我的模型添加更多层还是我做了一些我没有意识到的蠢事?
注意:如果 'df.pickle' 的下载链接失效,请使用此链接(this link)
from sklearn.model_selection import train_test_split
from urllib.request import urlopen
from os.path import exists
from os import mkdir
import tensorflow as tf
import pandas as pd
import pickle
# Local cache location for the pickled dataframe
df_path = 'df.pickle'
# Download the dataframe only when no local copy exists yet
if not exists(df_path):
    # NOTE(review): this looks like a temporary Dropbox content link and may
    # stop working -- confirm the URL is still valid before relying on it.
    content = urlopen('https://ucd92a22d5e0d4d29b8edb608305.dl.dropboxusercontent.com/cd/0/get/Askx_25n3JI-jmnZsWXmMmRgd4O2EH1w9l0U6zCMq7xdSXs_IN_i2zuUviseqa9N7-WrReFbGhQi8CeseV5cNsFTO8dzRmSdxjr-MWEDQNpPaZ8Ik29E_58YAjY57qTc4CA/file#').read()
    # Cache the raw downloaded bytes so later runs can skip the download
    with open(df_path, 'wb') as file:
        file.write(content)
    # SECURITY: unpickling can execute arbitrary code; only load data that
    # comes from a source you trust.
    df = pickle.loads(content)
else:
    # Reuse the cached copy; the context manager closes the file handle
    with open(df_path, 'rb') as file:
        df = pickle.load(file)
# Divide every category code by 41.
# NOTE(review): after this step the targets are fractions rather than
# integer class ids -- confirm that this is what the model should learn.
df['Category_Code'] = df['Category_Code'].apply(lambda code: code / 41)

# Hold out 15% of the rows for testing (fixed seed for reproducibility)
x_train, x_test, y_train, y_test = train_test_split(
    df['Content_Parsed'],
    df['Category_Code'],
    test_size=0.15,
    random_state=8,
)

# Re-assemble each split into its own two-column dataframe
train_df = pd.DataFrame()
test_df = pd.DataFrame()
train_df['Content_Parsed'] = x_train
train_df['Category_Code'] = y_train
test_df['Content_Parsed'] = x_test
test_df['Category_Code'] = y_test

# Vocabulary size kept by the tokenizer
NUM_WORDS = 10000
# Fixed length of every token sequence fed to the model
SEQ_LEN = 300

# Build the vocabulary from the training texts only; words outside of it
# are mapped to the '<UNK>' token
tokenizer = tf.keras.preprocessing.text.Tokenizer(
    num_words=NUM_WORDS,
    oov_token='<UNK>',
)
tokenizer.fit_on_texts(train_df['Content_Parsed'])

# Turn each text into a list of word indexes
train_seqs = tokenizer.texts_to_sequences(train_df['Content_Parsed'])
test_seqs = tokenizer.texts_to_sequences(test_df['Content_Parsed'])

# Bring every sequence to SEQ_LEN entries: shorter ones are padded at the
# end, longer ones are truncated
pad_sequences = tf.keras.preprocessing.sequence.pad_sequences
train_seqs = pad_sequences(train_seqs, maxlen=SEQ_LEN, padding="post")
test_seqs = pad_sequences(test_seqs, maxlen=SEQ_LEN, padding="post")
# Make sure the output folder for trained models exists
if not exists('Models'):
    mkdir('Models')
# Location of the serialized model
model_path = 'Models/model.pickle'
# Train a new model only when no saved one is found
if not exists(model_path):
    # Dimensionality of the learned word embeddings
    EMBEDDING_SIZE = 16
    # Alternative (disabled) architecture, kept for reference
    '''
    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(NUM_WORDS, EMBEDDING_SIZE),
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(EMBEDDING_SIZE)),
        # tf.keras.layers.Dense(EMBEDDING_SIZE, activation='relu'),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])
    '''
    # NOTE(review): a single sigmoid output trained with binary_crossentropy
    # fits a two-class target. For multi-class labels, use a softmax layer
    # with one unit per class together with categorical_crossentropy.
    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(NUM_WORDS, EMBEDDING_SIZE),
        # tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(EMBEDDING_SIZE)),
        tf.keras.layers.GlobalAveragePooling1D(),
        tf.keras.layers.Dense(EMBEDDING_SIZE, activation='relu'),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])
    # Compile the model
    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy']
    )
    # Stop training once the monitored quantity stops improving.
    # NOTE(review): the logged metric name depends on the TensorFlow version
    # ('val_acc' in older releases, 'val_accuracy' in newer ones) -- confirm
    # that the monitor key matches the installed version.
    es = tf.keras.callbacks.EarlyStopping(monitor='val_acc', mode='max', patience=1)
    # Batch size (can be tuned)
    BATCH_SIZE = 16
    # Maximum number of training epochs
    EPOCHS = 20
    # Pin training to the first GPU; if this raises, no GPU is available and
    # the device string should be switched to the CPU
    with tf.device('/GPU:0'):
        # Train on 80% of the training data, validate on the remaining 20%
        history = model.fit(
            train_seqs,
            train_df['Category_Code'].values,
            batch_size=BATCH_SIZE,
            epochs=EPOCHS,
            validation_split=0.2,
            validation_steps=30,
            callbacks=[es]
        )
    # Evaluate the trained model on the held-out test split
    model.evaluate(test_seqs, test_df['Category_Code'].values)
    # Persist the trained model.
    # NOTE(review): pickling Keras models is not supported by every
    # TensorFlow version; model.save() is the documented way -- confirm.
    with open(model_path, 'wb') as file:
        pickle.dump(model, file)
else:
    # Load the previously trained model; the context manager closes the file
    with open(model_path, 'rb') as file:
        model = pickle.load(file)
# Print the layer overview of the model
model.summary()
经过 2 天的调整和了解更多示例后,我发现 this 网站很好地解释了 multi-class classification.
我所做的修改详情如下:
因为我要为多个类别(classes)构建模型,所以在编译模型时应该使用 categorical_crossentropy 作为损失函数,而不是 binary_crossentropy。
模型的输出数量应该与要分类的类别总数相同,在我的例子中是 41(独热编码,one-hot encoding)。
最后一层的激活函数应该是 "softmax",因为我们要选择置信度最高(最接近 1.0)的那个标签。
您需要根据要分类的类别数量相应地调整各层。请参阅此处(here)了解如何改进您的模型。
我的最终代码看起来像这样
from sklearn.model_selection import train_test_split
from urllib.request import urlopen
from functools import reduce
from os.path import exists
from os import listdir
from sys import exit
import tensorflow as tf
import pandas as pd
import pickle
import re
# Local cache location for the pickled dataframe
df_path = 'df.pickle'
# Download the dataframe only when no local copy exists yet
if not exists(df_path):
    # URL of the pickled dataframe
    url = 'https://www.dropbox.com/s/76hibe24hmpz3bk/df.pickle?dl=1'
    # Raw bytes of the pickled dataframe
    content = urlopen(url).read()
    # The download already is a pickle, so cache the bytes unchanged;
    # wrapping them in pickle.dumps() would make the next run load a bytes
    # object instead of the dataframe
    with open(df_path, 'wb') as file:
        file.write(content)
    # SECURITY: unpickling can execute arbitrary code; only load data that
    # comes from a source you trust.
    df = pickle.loads(content)
else:
    # Reuse the cached copy; the context manager closes the file handle
    with open(df_path, 'rb') as file:
        df = pickle.load(file)
    # Cache files written by the earlier version of this script hold the
    # pickle bytes wrapped in a second pickle layer; unwrap them
    if isinstance(df, bytes):
        df = pickle.loads(df)
# Hyper-parameters shared by the tokenizer and the model
MAX_NUM_WORDS = 50000   # vocabulary size kept by the tokenizer
MAX_SEQ_LENGTH = 600    # length every token sequence is padded/cut to
EMBEDDING_SIZE = 256    # width of the embedding vectors (tunable)
# One output unit per distinct category label
category_values = df['Category'].unique()
OUTPUT_LENGTH = len(category_values)

# Build the vocabulary from the article texts (lower-cased)
articles = df['Content_Parsed'].values
tokenizer = tf.keras.preprocessing.text.Tokenizer(
    num_words=MAX_NUM_WORDS,
    lower=True,
)
tokenizer.fit_on_texts(articles)

# Word -> index mapping learned by the tokenizer
word_index = tokenizer.word_index
vocabulary_size = len(word_index)
print('Found {} unique tokens'.format(vocabulary_size))

# Encode each article as a sequence of word indexes with a fixed length of
# MAX_SEQ_LENGTH
X = tf.keras.preprocessing.sequence.pad_sequences(
    tokenizer.texts_to_sequences(articles),
    maxlen=MAX_SEQ_LENGTH,
)
print('Shape of feature tensor: {}'.format(X.shape))

# One-hot encode the category labels (one column per category)
Y = pd.get_dummies(df['Category']).values
print('Shape of label tensor: {}'.format(Y.shape))

# Hold out 10% of the samples for testing (fixed seed for reproducibility)
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.1,
    random_state=42,
)
print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)
# Build the classifier: embedding -> dropout -> LSTM -> softmax
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(MAX_NUM_WORDS, EMBEDDING_SIZE, input_length=MAX_SEQ_LENGTH))
model.add(tf.keras.layers.SpatialDropout1D(0.2))
# The number of LSTM units (64) can be tuned based on model performance
model.add(tf.keras.layers.LSTM(64, dropout=0.2, recurrent_dropout=0.2))
# Output layer: one softmax unit per class (OUTPUT_LENGTH units)
model.add(tf.keras.layers.Dense(OUTPUT_LENGTH, activation='softmax'))
# Multi-class targets are one-hot encoded, hence categorical cross-entropy
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# Training settings
EPOCHS = 100  # Upper bound on epochs; early stopping may end training sooner
BATCH_SIZE = 64  # Batch size (tunable)
checkpoint_path = 'model_checkpoints'  # Where the checkpoint callback saves the model
# Pin training to the first GPU
with tf.device('/GPU:0'):
    # Train on 90% of the training data, validate on the remaining 10%
    history = model.fit(
        x_train, y_train,
        epochs=EPOCHS,
        batch_size=BATCH_SIZE,
        validation_split=0.1,
        callbacks=[
            # Stop once the validation loss no longer improves by 0.0001
            tf.keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0.0001),
            # Keep only the best full model seen so far.
            # NOTE(review): the logged metric name depends on the TensorFlow
            # version ('val_acc' in older releases, 'val_accuracy' in newer
            # ones) -- confirm that the monitor key matches.
            tf.keras.callbacks.ModelCheckpoint(
                checkpoint_path,
                monitor='val_acc',
                save_best_only=True,
                save_weights_only=False
            )
        ],
        verbose=1
    )
现在,我的模型准确度表现良好,并且每个时期(epoch)都在提高;但由于验证准确度(val_acc 大约为 76%~77%)表现不佳,我可能还需要稍微调整模型或其中的层。
下面提供了输出快照