R:LIME 报告特征数量不一致的错误,但实际上特征数量并无不同
R: LIME returns error on different feature numbers when it's not the case
我正在构建克林顿和特朗普推文的文本分类器(数据可以在 Kaggle 上找到)。
我正在使用 quanteda
软件包进行 EDA 和建模:
library(dplyr)
library(stringr)
library(quanteda)
library(lime)
# data prep
# read_csv() comes from readr and as_date()/hour()/hms() from lubridate.
# Neither package is attached by the library() calls, so the calls are
# namespace-qualified to keep the script runnable as written.
tweet_csv <- readr::read_csv("tweets.csv")
tweet_data <- tweet_csv %>%
  select(
    author = handle,
    text,
    retweet_count,
    favorite_count,
    source_url,
    timestamp = time
  ) %>%
  mutate(
    # Characters 1-10 of the timestamp are parsed as the date and 12-19 as the
    # time of day (presumably an ISO-8601 style string; confirm against data).
    date = lubridate::as_date(str_sub(timestamp, 1, 10)),
    hour = lubridate::hour(lubridate::hms(str_sub(timestamp, 12, 19))),
    tweet_num = row_number()
  ) %>%
  select(-timestamp)
# creating corpus and dfm
tweet_corpus <- corpus(tweet_data)
edited_dfm <- dfm(
  tweet_corpus,
  remove_url = TRUE,
  remove_punct = TRUE,
  remove = stopwords("english")
)

# Reproducible 80/20 split: trainIndex holds the sampled training row numbers.
set.seed(32984)
trainIndex <- sample.int(
  n = nrow(tweet_csv),
  size = floor(.8 * nrow(tweet_csv)),
  replace = FALSE
)

# The same row indices are applied to the dfm and to the raw data so that
# features, raw rows and labels stay aligned.
train_dfm <- edited_dfm[trainIndex, ]
train_raw <- tweet_data[trainIndex, ]
train_label <- train_raw$author == "realDonaldTrump"

test_dfm <- edited_dfm[-trainIndex, ]
test_raw <- tweet_data[-trainIndex, ]
test_label <- test_raw$author == "realDonaldTrump"

# making sure train and test sets have the same features
test_dfm <- dfm_select(test_dfm, train_dfm)
# using quanteda's NB model
# The label vector is train_label (created with the training split).
nb_model <- quanteda::textmodel_nb(train_dfm, train_label)
nb_preds <- predict(nb_model, test_dfm)
# defining textmodel_nb as classification model
# Print the fitted object's class: the S3 methods for lime are named after it.
class(nb_model)
# S3 method for model_type(): reports fitted quanteda naive Bayes models
# (class textmodel_nb_fitted) as classification models.
#
# x    the fitted model (not inspected)
# ...  ignored
# Returns the string "classification".
model_type.textmodel_nb_fitted <- function(x, ...) {
  "classification"
}
# Preprocessing wrapper for lime: converts a data frame into a dfm, removing
# URLs, punctuation and English stopwords.
#
# df  data accepted by corpus()
# Returns the dfm built from `df`.
get_matrix <- function(df) {
  tweet_corpus <- corpus(df)
  # The dfm() call is the final expression, so the result is returned visibly
  # and no local variable hides the dfm() function.
  dfm(
    tweet_corpus,
    remove_url = TRUE,
    remove_punct = TRUE,
    remove = stopwords("english")
  )
}
然后我定义解释器 - 这里没有问题:
# Build the lime explainer from the first five columns of the training data;
# get_matrix is registered as the preprocessing step for the model.
explainer <- lime(
  train_raw[1:5],
  model = nb_model,
  preprocess = get_matrix
)
但是当我运行解释器时,即使使用与创建 explainer 时完全相同的数据集,我也会收到错误消息:
# Explain the same data the explainer was built from.
# This is the call that raises the error shown below.
explanation <- lime::explain(
  train_raw[1:5],
  explainer,
  n_labels = 1,
  n_features = 6,
  cols = 2,
  verbose = 0
)
Error in predict.textmodel_nb_fitted(x, newdata = newdata, type = type, :
feature set in newdata different from that in training set
这与 quanteda 和 dfm 有关系吗?老实说,我不明白为什么会这样。任何帮助都将不胜感激,谢谢!
我们可以将错误追溯到 predict_model,它调用了 predict.textmodel_nb_fitted(为了加快计算速度,我只使用了 train_raw 的前 10 行):
traceback()
# 7: stop("feature set in newdata different from that in training set")
# 6: predict.textmodel_nb_fitted(x, newdata = newdata, type = type,
# ...)
# 5: predict(x, newdata = newdata, type = type, ...)
# 4: predict_model.default(explainer$model, case_perm, type = o_type)
# 3: predict_model(explainer$model, case_perm, type = o_type)
# 2: explain.data.frame(train_raw[1:10, 1:5], explainer, n_labels = 1,
# n_features = 5, cols = 2, verbose = 0)
# 1: lime::explain(train_raw[1:10, 1:5], explainer, n_labels = 1,
# n_features = 5, cols = 2, verbose = 0)
问题是 predict.textmodel_nb_fitted
需要 dfm,而不是数据框。例如,predict(nb_model, test_raw[1:5])
会给您同样的 "feature set in newdata different from that in training set" 错误。但是,explain
将数据框作为其 x
参数。
一个解决方案是为 textmodel_nb_fitted 类编写自定义的 predict_model 方法,在调用 predict.textmodel_nb_fitted 之前执行必要的对象转换:
# S3 method for predict_model() on fitted quanteda naive Bayes models
# (class textmodel_nb_fitted).
#
# predict() on such a model needs a dfm with the training feature set, so
# `newdata` is converted to a corpus, then to a dfm, and aligned with the
# training dfm stored in x$data$x before predicting.
#
# x        fitted model; x$data$x is used as the feature template
# newdata  data to predict on, in a form accepted by corpus()
# type     "raw" for predicted labels, "prob" for posterior probabilities
# ...      forwarded to predict()
#
# Returns a data.frame: a single `Response` column for "raw", or
# res$posterior.prob converted to a data frame for "prob".
# Any other `type` raises an error instead of silently returning NULL.
predict_model.textmodel_nb_fitted <- function(x, newdata, type, ...) {
  new_dfm <- dfm_select(dfm(corpus(newdata)), x$data$x)
  res <- predict(x, newdata = new_dfm, ...)
  switch(
    type,
    raw = data.frame(Response = res$nb.predicted, stringsAsFactors = FALSE),
    prob = as.data.frame(res$posterior.prob, check.names = FALSE),
    stop("unsupported prediction type: ", type, call. = FALSE)
  )
}
这给了我们
# Re-run explain() on the first ten rows and first five columns, with the
# custom predict_model method for textmodel_nb_fitted in place.
explanation <- lime::explain(
  train_raw[1:10, 1:5],
  explainer,
  n_labels = 1,
  n_features = 5,
  cols = 2,
  verbose = 0
)
# First row, first five columns of the resulting explanation.
explanation[1, 1:5]
# model_type case label label_prob model_r2
# 1 classification 1 FALSE 0.9999986 0.001693861
我正在构建克林顿和特朗普推文的文本分类器(数据可以在 Kaggle 上找到)。
我正在使用 quanteda
软件包进行 EDA 和建模:
library(dplyr)
library(stringr)
library(quanteda)
library(lime)
# data prep
# read_csv() comes from readr and as_date()/hour()/hms() from lubridate.
# Neither package is attached by the library() calls, so the calls are
# namespace-qualified to keep the script runnable as written.
tweet_csv <- readr::read_csv("tweets.csv")
tweet_data <- tweet_csv %>%
  select(
    author = handle,
    text,
    retweet_count,
    favorite_count,
    source_url,
    timestamp = time
  ) %>%
  mutate(
    # Characters 1-10 of the timestamp are parsed as the date and 12-19 as the
    # time of day (presumably an ISO-8601 style string; confirm against data).
    date = lubridate::as_date(str_sub(timestamp, 1, 10)),
    hour = lubridate::hour(lubridate::hms(str_sub(timestamp, 12, 19))),
    tweet_num = row_number()
  ) %>%
  select(-timestamp)
# creating corpus and dfm
tweet_corpus <- corpus(tweet_data)
edited_dfm <- dfm(
  tweet_corpus,
  remove_url = TRUE,
  remove_punct = TRUE,
  remove = stopwords("english")
)

# Reproducible 80/20 split: trainIndex holds the sampled training row numbers.
set.seed(32984)
trainIndex <- sample.int(
  n = nrow(tweet_csv),
  size = floor(.8 * nrow(tweet_csv)),
  replace = FALSE
)

# The same row indices are applied to the dfm and to the raw data so that
# features, raw rows and labels stay aligned.
train_dfm <- edited_dfm[trainIndex, ]
train_raw <- tweet_data[trainIndex, ]
train_label <- train_raw$author == "realDonaldTrump"

test_dfm <- edited_dfm[-trainIndex, ]
test_raw <- tweet_data[-trainIndex, ]
test_label <- test_raw$author == "realDonaldTrump"

# making sure train and test sets have the same features
test_dfm <- dfm_select(test_dfm, train_dfm)
# using quanteda's NB model
# The label vector is train_label (created with the training split).
nb_model <- quanteda::textmodel_nb(train_dfm, train_label)
nb_preds <- predict(nb_model, test_dfm)
# defining textmodel_nb as classification model
# Print the fitted object's class: the S3 methods for lime are named after it.
class(nb_model)
# S3 method for model_type(): reports fitted quanteda naive Bayes models
# (class textmodel_nb_fitted) as classification models.
#
# x    the fitted model (not inspected)
# ...  ignored
# Returns the string "classification".
model_type.textmodel_nb_fitted <- function(x, ...) {
  "classification"
}
# Preprocessing wrapper for lime: converts a data frame into a dfm, removing
# URLs, punctuation and English stopwords.
#
# df  data accepted by corpus()
# Returns the dfm built from `df`.
get_matrix <- function(df) {
  tweet_corpus <- corpus(df)
  # The dfm() call is the final expression, so the result is returned visibly
  # and no local variable hides the dfm() function.
  dfm(
    tweet_corpus,
    remove_url = TRUE,
    remove_punct = TRUE,
    remove = stopwords("english")
  )
}
然后我定义解释器 - 这里没有问题:
# Build the lime explainer from the first five columns of the training data;
# get_matrix is registered as the preprocessing step for the model.
explainer <- lime(
  train_raw[1:5],
  model = nb_model,
  preprocess = get_matrix
)
但是当我运行解释器时,即使使用与创建 explainer 时完全相同的数据集,我也会收到错误消息:
# Explain the same data the explainer was built from.
# This is the call that raises the error shown below.
explanation <- lime::explain(
  train_raw[1:5],
  explainer,
  n_labels = 1,
  n_features = 6,
  cols = 2,
  verbose = 0
)
Error in predict.textmodel_nb_fitted(x, newdata = newdata, type = type, : feature set in newdata different from that in training set
这与 quanteda 和 dfm 有关系吗?老实说,我不明白为什么会这样。任何帮助都将不胜感激,谢谢!
我们可以将错误追溯到 predict_model,它调用了 predict.textmodel_nb_fitted(为了加快计算速度,我只使用了 train_raw 的前 10 行):
traceback()
# 7: stop("feature set in newdata different from that in training set")
# 6: predict.textmodel_nb_fitted(x, newdata = newdata, type = type,
# ...)
# 5: predict(x, newdata = newdata, type = type, ...)
# 4: predict_model.default(explainer$model, case_perm, type = o_type)
# 3: predict_model(explainer$model, case_perm, type = o_type)
# 2: explain.data.frame(train_raw[1:10, 1:5], explainer, n_labels = 1,
# n_features = 5, cols = 2, verbose = 0)
# 1: lime::explain(train_raw[1:10, 1:5], explainer, n_labels = 1,
# n_features = 5, cols = 2, verbose = 0)
问题是 predict.textmodel_nb_fitted
需要 dfm,而不是数据框。例如,predict(nb_model, test_raw[1:5])
会给您同样的 "feature set in newdata different from that in training set" 错误。但是,explain
将数据框作为其 x
参数。
一个解决方案是为 textmodel_nb_fitted 类编写自定义的 predict_model 方法,在调用 predict.textmodel_nb_fitted 之前执行必要的对象转换:
# S3 method for predict_model() on fitted quanteda naive Bayes models
# (class textmodel_nb_fitted).
#
# predict() on such a model needs a dfm with the training feature set, so
# `newdata` is converted to a corpus, then to a dfm, and aligned with the
# training dfm stored in x$data$x before predicting.
#
# x        fitted model; x$data$x is used as the feature template
# newdata  data to predict on, in a form accepted by corpus()
# type     "raw" for predicted labels, "prob" for posterior probabilities
# ...      forwarded to predict()
#
# Returns a data.frame: a single `Response` column for "raw", or
# res$posterior.prob converted to a data frame for "prob".
# Any other `type` raises an error instead of silently returning NULL.
predict_model.textmodel_nb_fitted <- function(x, newdata, type, ...) {
  new_dfm <- dfm_select(dfm(corpus(newdata)), x$data$x)
  res <- predict(x, newdata = new_dfm, ...)
  switch(
    type,
    raw = data.frame(Response = res$nb.predicted, stringsAsFactors = FALSE),
    prob = as.data.frame(res$posterior.prob, check.names = FALSE),
    stop("unsupported prediction type: ", type, call. = FALSE)
  )
}
这给了我们
# Re-run explain() on the first ten rows and first five columns, with the
# custom predict_model method for textmodel_nb_fitted in place.
explanation <- lime::explain(
  train_raw[1:10, 1:5],
  explainer,
  n_labels = 1,
  n_features = 5,
  cols = 2,
  verbose = 0
)
# First row, first five columns of the resulting explanation.
explanation[1, 1:5]
# model_type case label label_prob model_r2
# 1 classification 1 FALSE 0.9999986 0.001693861