修改 R 中的对象,使其兼容绘图
Modifying Objects in R so that they are Compatible for Plotting
我在 R 中找到了一段代码,它可以为数据集中的观测值绘制 SHAP 图。
# Source: https://cran.r-project.org/web/packages/xgboost/xgboost.pdf
# Fit a small binary classifier on the agaricus data and draw SHAP
# dependence plots for the test set.
library(xgboost)

data(agaricus.train, package = "xgboost")
data(agaricus.test, package = "xgboost")

# `method = "hist"` from the manual's example is dropped: xgboost has no
# `method` parameter (the tree builder is selected with `tree_method`), so the
# argument had no effect on training.
bst <- xgboost(
  agaricus.train$data, agaricus.train$label,
  nrounds = 50, eta = 0.1, max_depth = 3, subsample = 0.5,
  objective = "binary:logistic", nthread = 2, verbose = 0
)

# SHAP dependence plot for one named feature; no `shap_contrib` is supplied,
# so the contributions are derived from `model`.
xgb.plot.shap(agaricus.test$data, model = bst, features = "odor=none")

# Per-observation feature contributions for the whole test set, reused to
# draw the top 12 features in a 3-column grid.
contr <- predict(bst, agaricus.test$data, predcontrib = TRUE)
xgb.plot.shap(agaricus.test$data, contr, model = bst, top_n = 12, n_col = 3)
对于上面生成的图,我想加以修改,使图中只显示第一个观测值。我尝试了如下修改,但出现了错误:
# Attempt to repeat the plot for a single row -- this raises the error below.
# NOTE(review): `[1, ]` drops the row dimension, so `as.matrix()` most likely
# yields a one-column matrix instead of a 1 x p matrix -- confirm.
b <- as.matrix(agaricus.test$data[1, ])
contr <- predict(bst, b, predcontrib = TRUE)
xgb.plot.shap(b, contr, model = bst, top_n = 12, n_col = 3)
Error in xgb.plot.shap(b, contr, model = bst, top_n = 12) :
shap_contrib is not compatible with the provided data
有人可以告诉我我做错了什么吗?或者这根本不可能?
谢谢
据我所知,它并不是这样工作的。举个例子:某个二元变量的取值(Score)为 0 或 1,取值为 0 时 SHAP 值介于 0.2 和 0.5 之间,取值为 1 时 SHAP 值介于 1.2 和 1.5 之间。该图所展示的,正是这个变量在取值 0 与取值 1 之间的 SHAP 值差异。所选的“第一个观测值”可能取值为 0,也可能取值为 1,因此只显示它的 SHAP 值并不能真正告诉您多少关于该变量的信息。这就是 SHAP 图需要一个包含多个观测值的矩阵的原因(也是您的方法行不通的原因)。
尽管如此,如果您愿意,可以先提取前 n 个观测值的 SHAP 值,然后用 ggplot 或 base R 单独绘制第一个观测值,例如:
# Plot the SHAP values of the first test observation only, one facet per
# feature, for the features selected by xgb.plot.shap(top_n = 12).
library(tidyverse)
library(xgboost)

data(agaricus.train, package = "xgboost")
data(agaricus.test, package = "xgboost")

# `method = "hist"` is omitted: xgboost has no `method` parameter (the tree
# builder is selected with `tree_method`), so it had no effect on training.
bst <- xgboost(
  agaricus.train$data, agaricus.train$label,
  nrounds = 50, eta = 0.1, max_depth = 3, subsample = 0.5,
  objective = "binary:logistic", nthread = 2, verbose = 0
)

xgb.plot.shap(agaricus.test$data, model = bst, features = "odor=none")
contr <- predict(bst, agaricus.test$data, predcontrib = TRUE)

## Use "plot = FALSE" to return the data to "mat", instead of the rendered plot
mat <- xgb.plot.shap(
  agaricus.test$data[1:2, ], contr[1:2, ],
  model = bst, top_n = 12, n_col = 3, plot = FALSE
)

## Format the data
# `mat` is used only for the names of the selected features. The SHAP values
# are read from `contr` by row index, so the result does not depend on the
# row order of `mat`.
# NOTE(review): depending on the xgboost version, xgb.plot.shap() may sample
# the rows it returns -- confirm for the installed version.
top_features <- colnames(mat$shap_contrib)
feature_idx <- match(top_features, colnames(agaricus.test$data))
shap_first_two <- contr[1:2, feature_idx, drop = FALSE]
colnames(shap_first_two) <- top_features

shap_first_two %>%
  t() %>%
  as.data.frame() %>%
  rownames_to_column() %>%
  # Column "SHAP" holds observation 1; observation 2 is kept but not plotted.
  set_names(c("Variable", "SHAP", "second_observation")) %>%
  ## Then plot however you want
  ggplot(aes(y = SHAP, x = "")) +
  geom_point(pch = 3) +
  theme_bw() +
  theme(axis.ticks.x = element_blank(),
        axis.title.x = element_blank()) +
  facet_wrap(facets = vars(Variable))
根据评论更新:
# For the first test observation, plot each selected feature's SHAP value
# against the feature's value and show the predicted class in the title.
library(tidyverse)
library(xgboost)

data(agaricus.train, package = "xgboost")
data(agaricus.test, package = "xgboost")

bst <- xgboost(
  agaricus.train$data, agaricus.train$label,
  nrounds = 50, eta = 0.1, max_depth = 3, subsample = 0.5,
  objective = "binary:logistic",
  nthread = 2, verbose = 0
)

xgb.plot.shap(agaricus.test$data, model = bst, features = "odor=none")
contr <- predict(bst, agaricus.test$data,
                 predcontrib = TRUE, approxcontrib = FALSE)
pred <- predict(bst, agaricus.test$data)

## Use "plot = FALSE" to return the data to "mat", instead of the rendered plot
mat <- xgb.plot.shap(
  agaricus.test$data[1:2, ], contr[1:2, ],
  model = bst, top_n = 12, n_col = 3, plot = FALSE
)

## Format the data
# `mat` is used only for the names of the selected features. Observation 1 is
# read from `contr` and the test matrix by row index, so the result does not
# depend on the row order of `mat`.
# NOTE(review): depending on the xgboost version, xgb.plot.shap() may sample
# the rows it returns -- confirm for the installed version.
top_features <- colnames(mat$shap_contrib)
feature_idx <- match(top_features, colnames(agaricus.test$data))

SHAP <- data.frame(
  Variable = top_features,
  SHAP = unname(contr[1, feature_idx])
)
Score <- data.frame(
  Variable = top_features,
  Score = as.numeric(agaricus.test$data[1, feature_idx])
)

# Threshold the first prediction at 0.5; the condition is a scalar, so plain
# if/else is used instead of the vectorised ifelse().
Pred <- if (pred[1] <= 0.5) 0 else 1

SHAP_Score <- left_join(SHAP, Score, by = "Variable")
SHAP_Score_Pred <- cbind(SHAP_Score, Pred)

ggplot(SHAP_Score_Pred, aes(y = SHAP, x = Score)) +
  # Dashed reference line at zero contribution.
  geom_hline(yintercept = 0, linetype = 2, colour = "grey75") +
  geom_point(shape = 3, size = 3, colour = "red") +
  ggtitle(label = paste("Prediction for this observation =", Pred)) +
  theme_bw(base_size = 12) +
  theme(axis.text = element_text(size = 14),
        axis.title = element_text(size = 16)) +
  scale_x_continuous(breaks = c(0, 1)) +
  facet_wrap(facets = vars(Variable))
我在 R 中找到了一段代码,它可以为数据集中的观测值绘制 SHAP 图。
# Source: https://cran.r-project.org/web/packages/xgboost/xgboost.pdf
# Fit a small binary classifier on the agaricus data and draw SHAP
# dependence plots for the test set.
library(xgboost)

data(agaricus.train, package = "xgboost")
data(agaricus.test, package = "xgboost")

# `method = "hist"` from the manual's example is dropped: xgboost has no
# `method` parameter (the tree builder is selected with `tree_method`), so the
# argument had no effect on training.
bst <- xgboost(
  agaricus.train$data, agaricus.train$label,
  nrounds = 50, eta = 0.1, max_depth = 3, subsample = 0.5,
  objective = "binary:logistic", nthread = 2, verbose = 0
)

# SHAP dependence plot for one named feature; no `shap_contrib` is supplied,
# so the contributions are derived from `model`.
xgb.plot.shap(agaricus.test$data, model = bst, features = "odor=none")

# Per-observation feature contributions for the whole test set, reused to
# draw the top 12 features in a 3-column grid.
contr <- predict(bst, agaricus.test$data, predcontrib = TRUE)
xgb.plot.shap(agaricus.test$data, contr, model = bst, top_n = 12, n_col = 3)
对于上面生成的图,我想加以修改,使图中只显示第一个观测值。我尝试了如下修改,但出现了错误:
# Attempt to repeat the plot for a single row -- this raises the error below.
# NOTE(review): `[1, ]` drops the row dimension, so `as.matrix()` most likely
# yields a one-column matrix instead of a 1 x p matrix -- confirm.
b <- as.matrix(agaricus.test$data[1, ])
contr <- predict(bst, b, predcontrib = TRUE)
xgb.plot.shap(b, contr, model = bst, top_n = 12, n_col = 3)
Error in xgb.plot.shap(b, contr, model = bst, top_n = 12) :
shap_contrib is not compatible with the provided data
有人可以告诉我我做错了什么吗?或者这根本不可能? 谢谢
据我所知,它并不是这样工作的。举个例子:某个二元变量的取值(Score)为 0 或 1,取值为 0 时 SHAP 值介于 0.2 和 0.5 之间,取值为 1 时 SHAP 值介于 1.2 和 1.5 之间。该图所展示的,正是这个变量在取值 0 与取值 1 之间的 SHAP 值差异。所选的“第一个观测值”可能取值为 0,也可能取值为 1,因此只显示它的 SHAP 值并不能真正告诉您多少关于该变量的信息。这就是 SHAP 图需要一个包含多个观测值的矩阵的原因(也是您的方法行不通的原因)。
尽管如此,如果您愿意,可以先提取前 n 个观测值的 SHAP 值,然后用 ggplot 或 base R 单独绘制第一个观测值,例如:
# Plot the SHAP values of the first test observation only, one facet per
# feature, for the features selected by xgb.plot.shap(top_n = 12).
library(tidyverse)
library(xgboost)

data(agaricus.train, package = "xgboost")
data(agaricus.test, package = "xgboost")

# `method = "hist"` is omitted: xgboost has no `method` parameter (the tree
# builder is selected with `tree_method`), so it had no effect on training.
bst <- xgboost(
  agaricus.train$data, agaricus.train$label,
  nrounds = 50, eta = 0.1, max_depth = 3, subsample = 0.5,
  objective = "binary:logistic", nthread = 2, verbose = 0
)

xgb.plot.shap(agaricus.test$data, model = bst, features = "odor=none")
contr <- predict(bst, agaricus.test$data, predcontrib = TRUE)

## Use "plot = FALSE" to return the data to "mat", instead of the rendered plot
mat <- xgb.plot.shap(
  agaricus.test$data[1:2, ], contr[1:2, ],
  model = bst, top_n = 12, n_col = 3, plot = FALSE
)

## Format the data
# `mat` is used only for the names of the selected features. The SHAP values
# are read from `contr` by row index, so the result does not depend on the
# row order of `mat`.
# NOTE(review): depending on the xgboost version, xgb.plot.shap() may sample
# the rows it returns -- confirm for the installed version.
top_features <- colnames(mat$shap_contrib)
feature_idx <- match(top_features, colnames(agaricus.test$data))
shap_first_two <- contr[1:2, feature_idx, drop = FALSE]
colnames(shap_first_two) <- top_features

shap_first_two %>%
  t() %>%
  as.data.frame() %>%
  rownames_to_column() %>%
  # Column "SHAP" holds observation 1; observation 2 is kept but not plotted.
  set_names(c("Variable", "SHAP", "second_observation")) %>%
  ## Then plot however you want
  ggplot(aes(y = SHAP, x = "")) +
  geom_point(pch = 3) +
  theme_bw() +
  theme(axis.ticks.x = element_blank(),
        axis.title.x = element_blank()) +
  facet_wrap(facets = vars(Variable))
根据评论更新:
# For the first test observation, plot each selected feature's SHAP value
# against the feature's value and show the predicted class in the title.
library(tidyverse)
library(xgboost)

data(agaricus.train, package = "xgboost")
data(agaricus.test, package = "xgboost")

bst <- xgboost(
  agaricus.train$data, agaricus.train$label,
  nrounds = 50, eta = 0.1, max_depth = 3, subsample = 0.5,
  objective = "binary:logistic",
  nthread = 2, verbose = 0
)

xgb.plot.shap(agaricus.test$data, model = bst, features = "odor=none")
contr <- predict(bst, agaricus.test$data,
                 predcontrib = TRUE, approxcontrib = FALSE)
pred <- predict(bst, agaricus.test$data)

## Use "plot = FALSE" to return the data to "mat", instead of the rendered plot
mat <- xgb.plot.shap(
  agaricus.test$data[1:2, ], contr[1:2, ],
  model = bst, top_n = 12, n_col = 3, plot = FALSE
)

## Format the data
# `mat` is used only for the names of the selected features. Observation 1 is
# read from `contr` and the test matrix by row index, so the result does not
# depend on the row order of `mat`.
# NOTE(review): depending on the xgboost version, xgb.plot.shap() may sample
# the rows it returns -- confirm for the installed version.
top_features <- colnames(mat$shap_contrib)
feature_idx <- match(top_features, colnames(agaricus.test$data))

SHAP <- data.frame(
  Variable = top_features,
  SHAP = unname(contr[1, feature_idx])
)
Score <- data.frame(
  Variable = top_features,
  Score = as.numeric(agaricus.test$data[1, feature_idx])
)

# Threshold the first prediction at 0.5; the condition is a scalar, so plain
# if/else is used instead of the vectorised ifelse().
Pred <- if (pred[1] <= 0.5) 0 else 1

SHAP_Score <- left_join(SHAP, Score, by = "Variable")
SHAP_Score_Pred <- cbind(SHAP_Score, Pred)

ggplot(SHAP_Score_Pred, aes(y = SHAP, x = Score)) +
  # Dashed reference line at zero contribution.
  geom_hline(yintercept = 0, linetype = 2, colour = "grey75") +
  geom_point(shape = 3, size = 3, colour = "red") +
  ggtitle(label = paste("Prediction for this observation =", Pred)) +
  theme_bw(base_size = 12) +
  theme(axis.text = element_text(size = 14),
        axis.title = element_text(size = 16)) +
  scale_x_continuous(breaks = c(0, 1)) +
  facet_wrap(facets = vars(Variable))