How to get the coefficient of the Logistic Regression in mlr3?
I'm just getting started with mlr3 and still new to the syntax. I have two questions:
- How do I get the coefficients from a trained logistic regression in mlr3?
- I'm working with an extremely imbalanced dataset (98% vs. 2%) with more than 2 million rows. I tried SMOTE, but it is very slow, even though the same step finishes quickly in Python. Is there a mistake in my code?
Here is my code:
task = TaskClassif$new("pcs", backend = pcs, target = "navigator", positive = "1")
table(task$truth())
po_over = po("classbalancing", id = "oversample", adjust = "minor", reference = "minor", shuffle = FALSE, ratio = 16)
table(po_over$train(list(task))$output$truth())
learner = mlr_learners$get("classif.rpart")
learner$predict_type = "prob"
learner = po_over %>>% learner
resampling = rsmp("holdout", ratio = 0.8)
rr = resample(task, learner, resampling, store_models = TRUE)
res <- rr$prediction()
auto1 <- autoplot(res)
auto2 <- autoplot(res, type = "roc")
rr$score(msr("classif.acc"))$classif.acc %>% print()
For SMOTE:
gr_smote =
  po("colapply", id = "int_to_num",
     applicator = as.numeric, affect_columns = selector_type("integer")) %>>%
  po("smote", dup_size = 15) %>>%
  po("colapply", id = "num_to_int",
     applicator = function(x) as.integer(round(x, 0L)), affect_columns = selector_type("numeric"))
Here is what I put together for your question #1:
- Create a dataset with roughly 98% 1s and 2% 0s
- Build the train and test tasks
- (1) Create the oversampling PipeOp
- (2) Create the learner this way; the construction in your original code does not work with a PipeOp
- Train the learner on the training set
- Evaluate on the test set
library(mlr3)
library(dplyr)
library(mlr3pipelines)

set.seed(10)

# Simulate a dataset with roughly 98% 1s and 2% 0s
pcs = data.frame(a = runif(1000), b = runif(1000))
pcs = pcs %>%
  mutate(c = 2 * a + 3 * b, d = ifelse(c > .6, 1, 0), navigator = factor(d)) %>%
  select(-c, -d)

task = TaskClassif$new("pcs", backend = pcs, target = "navigator", positive = "1")

# Train/test split
train_set = sample(task$nrow, 0.8 * task$nrow)
test_set = setdiff(seq_len(task$nrow), train_set)
task_train = task$clone()$filter(train_set)
task_test = task$clone()$filter(test_set)

# (1) The oversampling PipeOp
po_over1 = po("classbalancing")
po_over1$param_set$values = list(ratio = 16, reference = "minor", adjust = "minor", shuffle = FALSE)

# (2) Wrap the pipeline in a GraphLearner; a bare Graph built with %>>%
# cannot be passed to train() directly
learner = GraphLearner$new(
  po_over1 %>>%
    po("learner", lrn("classif.rpart", predict_type = "prob"))
)

learner$train(task_train)
pred = learner$predict(task_test)
Output:
learner$model
#' You can see the predicted probability by following the decision tree:
#' e.g. say you have a data point with values for a and b,
#' first check whether b>=.112 or b<.112 (nodes 2 and 3),
#' etc.
1) root 1085 304 1 (0.71981567 0.28018433)
  2) b>=0.1122314 728 16 1 (0.97802198 0.02197802)
    4) a>=0.007176245 709 0 1 (1.00000000 0.00000000) *
    5) a< 0.007176245 19 3 0 (0.15789474 0.84210526) *
  3) b< 0.1122314 357 69 0 (0.19327731 0.80672269)
    6) a>=0.246552 65 0 1 (1.00000000 0.00000000) *
    7) a< 0.246552 292 4 0 (0.01369863 0.98630137) *
# Test predictions
pred$confusion
        truth
response   1   0
       1 195   1
       0   0   4
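As for extracting the coefficients themselves (your question #1 as asked): swap rpart for the logistic regression learner and read the glm fit out of the trained GraphLearner. A minimal sketch, assuming the mlr3learners package is installed and reusing po_over1 and task_train from above; the element name classif.log_reg is the learner PipeOp's default id:

library(mlr3learners)  # provides lrn("classif.log_reg")

glearner = GraphLearner$new(
  po_over1 %>>% po("learner", lrn("classif.log_reg", predict_type = "prob"))
)
glearner$train(task_train)

# A trained GraphLearner stores one state per PipeOp; the underlying
# glm object sits under the learner PipeOp's id, so coef() works on it:
coef(glearner$model$classif.log_reg$model)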
And this is for question #2, SMOTE:
# SMOTE only operates on numeric features, so convert integer columns to
# numeric first, then round the synthetic values back to integers afterwards
gr_smote =
  po("colapply", id = "int_to_num",
     applicator = as.numeric, affect_columns = selector_type("integer")) %>>%
  po("smote", dup_size = 15) %>>%
  po("colapply", id = "num_to_int",
     applicator = function(x) as.integer(round(x, 0L)), affect_columns = selector_type("numeric"))
learner = GraphLearner$new(
  gr_smote %>>% po("learner", lrn("classif.rpart", predict_type = "prob"))
)
learner$train(task_train)
learner$model
1) root 1085 304 1 (0.7198157 0.2801843)
  2) b>=0.5 391 0 1 (1.0000000 0.0000000) *
  3) b< 0.5 694 304 1 (0.5619597 0.4380403)
    6) a>=0.5 203 0 1 (1.0000000 0.0000000) *
    7) a< 0.5 491 187 0 (0.3808554 0.6191446) *
pred = learner$predict(task_test)
pred$confusion
        truth
response   1   0
       1 159   0
       0  36   5
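If you prefer the resample() workflow from your original post, the GraphLearner drops straight in as a regular learner; a short sketch reusing the SMOTE learner and task from above:

resampling = rsmp("holdout", ratio = 0.8)
rr = resample(task, learner, resampling, store_models = TRUE)
rr$score(msr("classif.acc"))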