如何在 mlr3 中根据指示列对任务取子集,并批量进行训练和预测?
How to subset task according to indicator column and batch train-predict in mlr3?
背景
我正在使用 R 中的 mlr3 包进行建模和预测。我正在处理一个由测试集和训练集组成的大数据集。测试集和训练集由指示符列表示(代码:test_or_train)。
目标
- 使用数据集中 test_or_train 列标记为 'train' 的行,对所有学习器进行批量训练。
- 使用各自训练好的学习器,批量预测 test_or_train 列中标记为 'test' 的行。
代码
- 带有训练/测试指示列的占位数据集。(在实际数据中,训练/测试划分并不是人为构造的。)
- 两个任务(在实际代码中任务是不同的,而且还有更多。)
library(readr)
library(mlr3)
library(mlr3learners)
library(mlr3pipelines)
library(reprex)
library(caret)
# Data ----
# Download the placeholder data set and drop its first column.
urlfile <- "https://raw.githubusercontent.com/shudras/office_data/master/office_data.csv"
data <- read_csv(url(urlfile))[-1]

## Artificial partition: the rows picked by createDataPartition() become the
## train set, all remaining rows become the test set.
art_part <- createDataPartition(data$imdb_rating, list = FALSE)
train <- data[art_part, ]
test <- data[-art_part, ]

## Tag every row with the set it belongs to
train$test_or_train <- "train"
test$test_or_train <- "test"

## Combined data set (test rows first, then train rows) used from here on
data <- rbind(test, train)

# Tasks ----
# Two tasks (identical here; in the real data set they differ).
task1 <- TaskRegr$new(id = "office1", backend = data, target = "imdb_rating")
task2 <- TaskRegr$new(id = "office2", backend = data, target = "imdb_rating")

# Model specification ----
# Scaling step piped into a cv_glmnet regression learner.
graph <- po("scale") %>>%
  lrn("regr.cv_glmnet", id = "rp", alpha = 1, family = "gaussian")

# Learner creation ----
learner <- GraphLearner$new(graph)

# Goal ----
## 1. Batch train all learners on the rows marked 'train' in the
##    test_or_train column of the data set
## 2. Batch predict the rows marked 'test' in the test_or_train column with
##    the respective trained learner
由 reprex package (v0.3.0)
于 2020-06-22 创建
备注
我尝试过在 benchmark_grid 中使用 row_ids,以便只用训练行来训练学习器,但这行不通;而且与行索引相比,使用指示列要方便得多。使用训练/测试指示列时,只需一条(用于拆分的)规则即可;而使用行索引则只有在各个任务包含相同的行时才有效。
# Attempt from the question that does NOT work: it tries to restrict training
# to the train rows through `row_ids`, which (per the inline note) is not an
# argument of benchmark_grid().
# NOTE(review): `train_rows` is not defined anywhere in the code shown here.
benchmark_grid(
tasks = list(task1, task2),
learners = learner,
row_ids = train_rows # Not an argument and not favorable to work with indices
)
您可以将 benchmark
与自定义设计(design)一起使用。
下面的代码应该可以完成这项工作(请注意,我为每个 Task
分别实例化了一个自定义 Resampling
)。
library(data.table)
library(mlr3misc)

# Custom benchmark design: one row per task, all rows sharing the same learner.
design <- data.table(
  task = list(task1, task2),
  learner = list(learner)
)

# For every task, build an instantiated custom resampling whose single
# train/test split is taken from the task's `test_or_train` indicator column.
design$resampling <- map(design$task, function(x) {
  split_col <- "test_or_train"

  # Fail loudly when the indicator is missing (e.g. this chunk is re-run after
  # the column was already removed below) instead of silently creating empty
  # train/test sets.
  if (!split_col %in% x$feature_names) {
    stop(
      "Task '", x$id, "' has no feature column '", split_col, "'",
      call. = FALSE
    )
  }

  # get train/test split; x$data() returns one row per entry of x$row_ids
  split <- x$data()[[split_col]]
  row_ids <- x$row_ids

  # remove train-test split column from the task so it is not used as a
  # feature; $select() modifies the task in place, so the task stored in
  # `design` is affected as well
  x$select(setdiff(x$feature_names, split_col))

  # instantiate a custom resampling with the given split; the sets must hold
  # task row ids, so positions are translated through `row_ids` (which() also
  # drops rows whose indicator is NA)
  rsmp("custom")$instantiate(x,
    train_sets = list(row_ids[which(split == "train")]),
    test_sets = list(row_ids[which(split == "test")])
  )
})

benchmark(design)
您能否更清楚地说明 batch-processing
的意思,或者这是否回答了您的问题?
背景
我正在使用 R 中的 mlr3 包进行建模和预测。我正在处理一个由测试集和训练集组成的大数据集。测试集和训练集由指示符列表示(代码:test_or_train)。
目标
- 使用数据集中 test_or_train 列标记为 'train' 的行,对所有学习器进行批量训练。
- 使用各自训练好的学习器,批量预测 test_or_train 列中标记为 'test' 的行。
代码
- 带有训练/测试指示列的占位数据集。(在实际数据中,训练/测试划分并不是人为构造的。)
- 两个任务(在实际代码中任务是不同的,而且还有更多。)
library(readr)
library(mlr3)
library(mlr3learners)
library(mlr3pipelines)
library(reprex)
library(caret)
# Data ----
# Download the placeholder data set and drop its first column.
urlfile <- "https://raw.githubusercontent.com/shudras/office_data/master/office_data.csv"
data <- read_csv(url(urlfile))[-1]

## Artificial partition: the rows picked by createDataPartition() become the
## train set, all remaining rows become the test set.
art_part <- createDataPartition(data$imdb_rating, list = FALSE)
train <- data[art_part, ]
test <- data[-art_part, ]

## Tag every row with the set it belongs to
train$test_or_train <- "train"
test$test_or_train <- "test"

## Combined data set (test rows first, then train rows) used from here on
data <- rbind(test, train)

# Tasks ----
# Two tasks (identical here; in the real data set they differ).
task1 <- TaskRegr$new(id = "office1", backend = data, target = "imdb_rating")
task2 <- TaskRegr$new(id = "office2", backend = data, target = "imdb_rating")

# Model specification ----
# Scaling step piped into a cv_glmnet regression learner.
graph <- po("scale") %>>%
  lrn("regr.cv_glmnet", id = "rp", alpha = 1, family = "gaussian")

# Learner creation ----
learner <- GraphLearner$new(graph)

# Goal ----
## 1. Batch train all learners on the rows marked 'train' in the
##    test_or_train column of the data set
## 2. Batch predict the rows marked 'test' in the test_or_train column with
##    the respective trained learner
由 reprex package (v0.3.0)
于 2020-06-22 创建
备注
我尝试过在 benchmark_grid 中使用 row_ids,以便只用训练行来训练学习器,但这行不通;而且与行索引相比,使用指示列要方便得多。使用训练/测试指示列时,只需一条(用于拆分的)规则即可;而使用行索引则只有在各个任务包含相同的行时才有效。
# Attempt from the question that does NOT work: it tries to restrict training
# to the train rows through `row_ids`, which (per the inline note) is not an
# argument of benchmark_grid().
# NOTE(review): `train_rows` is not defined anywhere in the code shown here.
benchmark_grid(
tasks = list(task1, task2),
learners = learner,
row_ids = train_rows # Not an argument and not favorable to work with indices
)
您可以将 benchmark
与自定义设计(design)一起使用。
下面的代码应该可以完成这项工作(请注意,我为每个 Task
分别实例化了一个自定义 Resampling
)。
library(data.table)
library(mlr3misc)

# Custom benchmark design: one row per task, all rows sharing the same learner.
design <- data.table(
  task = list(task1, task2),
  learner = list(learner)
)

# For every task, build an instantiated custom resampling whose single
# train/test split is taken from the task's `test_or_train` indicator column.
design$resampling <- map(design$task, function(x) {
  split_col <- "test_or_train"

  # Fail loudly when the indicator is missing (e.g. this chunk is re-run after
  # the column was already removed below) instead of silently creating empty
  # train/test sets.
  if (!split_col %in% x$feature_names) {
    stop(
      "Task '", x$id, "' has no feature column '", split_col, "'",
      call. = FALSE
    )
  }

  # get train/test split; x$data() returns one row per entry of x$row_ids
  split <- x$data()[[split_col]]
  row_ids <- x$row_ids

  # remove train-test split column from the task so it is not used as a
  # feature; $select() modifies the task in place, so the task stored in
  # `design` is affected as well
  x$select(setdiff(x$feature_names, split_col))

  # instantiate a custom resampling with the given split; the sets must hold
  # task row ids, so positions are translated through `row_ids` (which() also
  # drops rows whose indicator is NA)
  rsmp("custom")$instantiate(x,
    train_sets = list(row_ids[which(split == "train")]),
    test_sets = list(row_ids[which(split == "test")])
  )
})

benchmark(design)
您能否更清楚地说明 batch-processing
的意思,或者这是否回答了您的问题?