在不使用 r 中的 for 循环的情况下迭代数据帧的列
Iterating over the columns of a data frame without using a for loop in R
下面是训练集和测试集,分别是 train_dat 和 test_dat。我想遍历数据框中的每个分类列,并将每个级别映射为一个整数,该整数由分类变量级别的字母顺序确定。要分配的整数值由 uniqueValuesTrain(训练集中排序后的唯一取值)中的位置决定。在不使用 for 循环的情况下,如何实现更快的矢量化解决方案?
# Data
# Training set: two categorical columns (stored as factors because of
# stringsAsFactors = TRUE), one numeric column and the target.
cat_var_1 <- c("blue", "green", "green", "blue", "red", "brown")
cat_var_2 <- c("rock", "blues", "jazz", "jazz", "rock", "indie")
reg_var_1 <- c(23, 22, 21, 24, 56, 28)
target <- c(1, 0, 1, 0, 0, 1)
train_dat <- data.frame(cat_var_1, cat_var_2, reg_var_1, target,
                        stringsAsFactors = TRUE)

# Test set: same columns; it contains values ("purple", "magenta", "pop",
# "house") that never occur in the training set.
cat_var_1 <- c("green", "green", "blue", "blue", "green", "purple", "magenta")
cat_var_2 <- c("rock", "rock", "jazz", "jazz", "jazz", "pop", "house")
reg_var_1 <- c(12, 23, 25, 27, 34, 12, 32)
target <- c(1, 1, 1, 0, 1, 0, 0)
test_dat <- data.frame(cat_var_1, cat_var_2, reg_var_1, target,
                       stringsAsFactors = TRUE)

targetVariable <- "target"
catVariables <- names(Filter(is.factor, train_dat))
# Remove target variable from catVariables
catVariables <- setdiff(catVariables, targetVariable)

# Integer Encoding
# Plain assignment is enough here: base data frames are copy-on-modify, so
# editing trainInt/testInt below leaves train_dat/test_dat untouched. This also
# avoids calling data.table's copy() before data.table has been loaded.
trainInt <- train_dat
testInt <- test_dat
for (col in catVariables) {
  # Sorted unique training values; the position in this vector is the code.
  uniqueValuesTrain <- sort(unique(trainInt[[col]]))
  trainInt[[col]] <- match(trainInt[[col]], uniqueValuesTrain)
  # Test values absent from the training set have no position and become NA.
  testInt[[col]] <- match(testInt[[col]], uniqueValuesTrain)
}
这里有一个使用 tidyverse 的方案:
library(dplyr)
# Training set: encode every categorical column as the position of each value
# within that column's own sorted unique values. The codes go into new
# "<column>_new" columns, so the original columns are kept.
train_dat <- mutate(
  train_dat,
  across(
    all_of(catVariables),
    function(values) match(values, sort(unique(values))),
    .names = "{.col}_new"
  )
)

# Test set: same encoding, but positions are looked up in the sorted unique
# values of the same-named training column (still present in train_dat, since
# the step above only added columns). Values missing from the training column
# yield NA.
test_dat <- mutate(
  test_dat,
  across(
    all_of(catVariables),
    function(values) {
      train_values <- sort(unique(train_dat[[cur_column()]]))
      match(values, train_values)
    },
    .names = "{.col}_new"
  )
)
输出:
> train_dat
cat_var_1 cat_var_2 reg_var_1 target cat_var_1_new cat_var_2_new
1 blue rock 23 1 1 4
2 green blues 22 0 3 1
3 green jazz 21 1 3 3
4 blue jazz 24 0 1 3
5 red rock 56 0 4 4
6 brown indie 28 1 2 2
> test_dat
cat_var_1 cat_var_2 reg_var_1 target cat_var_1_new cat_var_2_new
1 green rock 12 1 3 4
2 green rock 23 1 3 4
3 blue jazz 25 1 1 3
4 blue jazz 27 0 1 3
5 green jazz 34 1 3 3
6 purple pop 12 0 NA NA
7 magenta house 32 0 NA NA
简单一点:
library(data.table)
# Encode `x` as integer positions within the sorted unique values of `ref`.
# `ref` defaults to `x` itself, so the one-argument call f(x) behaves as
# before. Values of `x` that do not occur in `ref` are encoded as NA.
f <- function(x, ref = x) match(x, sort(unique(ref)))

# Capture the training columns BEFORE the in-place `:=` below replaces them
# with integers; they are the reference for both tables.
train_ref <- lapply(
  setNames(catVariables, catVariables),
  function(col) train_dat[[col]]
)

# Encode both tables against the training values, so a given level maps to the
# same integer in train and test (encoding test_dat against its own values
# would give e.g. "green" a different code in each table).
setDT(train_dat)[, (catVariables) := Map(f, .SD, train_ref),
                 .SDcols = catVariables][]
setDT(test_dat)[, (catVariables) := Map(f, .SD, train_ref),
                .SDcols = catVariables][]

# Output:
#>    cat_var_1 cat_var_2 reg_var_1 target
#>        <int>     <int>     <num>  <num>
#> 1:         1         4        23      1
#> 2:         3         1        22      0
#> 3:         3         3        21      1
#> 4:         1         3        24      0
#> 5:         4         4        56      0
#> 6:         2         2        28      1
#>    cat_var_1 cat_var_2 reg_var_1 target
#>        <int>     <int>     <num>  <num>
#> 1:         3         4        12      1
#> 2:         3         4        23      1
#> 3:         1         3        25      1
#> 4:         1         3        27      0
#> 5:         3         3        34      1
#> 6:        NA        NA        12      0
#> 7:        NA        NA        32      0
下面是训练集和测试集,分别是 train_dat 和 test_dat。我想遍历数据框中的每个分类列,并将每个级别映射为一个整数,该整数由分类变量级别的字母顺序确定。要分配的整数值由 uniqueValuesTrain(训练集中排序后的唯一取值)中的位置决定。在不使用 for 循环的情况下,如何实现更快的矢量化解决方案?
# Data
# Training set: two categorical columns (stored as factors because of
# stringsAsFactors = TRUE), one numeric column and the target.
cat_var_1 <- c("blue", "green", "green", "blue", "red", "brown")
cat_var_2 <- c("rock", "blues", "jazz", "jazz", "rock", "indie")
reg_var_1 <- c(23, 22, 21, 24, 56, 28)
target <- c(1, 0, 1, 0, 0, 1)
train_dat <- data.frame(cat_var_1, cat_var_2, reg_var_1, target,
                        stringsAsFactors = TRUE)

# Test set: same columns; it contains values ("purple", "magenta", "pop",
# "house") that never occur in the training set.
cat_var_1 <- c("green", "green", "blue", "blue", "green", "purple", "magenta")
cat_var_2 <- c("rock", "rock", "jazz", "jazz", "jazz", "pop", "house")
reg_var_1 <- c(12, 23, 25, 27, 34, 12, 32)
target <- c(1, 1, 1, 0, 1, 0, 0)
test_dat <- data.frame(cat_var_1, cat_var_2, reg_var_1, target,
                       stringsAsFactors = TRUE)

targetVariable <- "target"
catVariables <- names(Filter(is.factor, train_dat))
# Remove target variable from catVariables
catVariables <- setdiff(catVariables, targetVariable)

# Integer Encoding
# Plain assignment is enough here: base data frames are copy-on-modify, so
# editing trainInt/testInt below leaves train_dat/test_dat untouched. This also
# avoids calling data.table's copy() before data.table has been loaded.
trainInt <- train_dat
testInt <- test_dat
for (col in catVariables) {
  # Sorted unique training values; the position in this vector is the code.
  uniqueValuesTrain <- sort(unique(trainInt[[col]]))
  trainInt[[col]] <- match(trainInt[[col]], uniqueValuesTrain)
  # Test values absent from the training set have no position and become NA.
  testInt[[col]] <- match(testInt[[col]], uniqueValuesTrain)
}
这里有一个使用 tidyverse 的方案:
library(dplyr)
# Training set: encode every categorical column as the position of each value
# within that column's own sorted unique values. The codes go into new
# "<column>_new" columns, so the original columns are kept.
train_dat <- mutate(
  train_dat,
  across(
    all_of(catVariables),
    function(values) match(values, sort(unique(values))),
    .names = "{.col}_new"
  )
)

# Test set: same encoding, but positions are looked up in the sorted unique
# values of the same-named training column (still present in train_dat, since
# the step above only added columns). Values missing from the training column
# yield NA.
test_dat <- mutate(
  test_dat,
  across(
    all_of(catVariables),
    function(values) {
      train_values <- sort(unique(train_dat[[cur_column()]]))
      match(values, train_values)
    },
    .names = "{.col}_new"
  )
)
输出:
> train_dat
cat_var_1 cat_var_2 reg_var_1 target cat_var_1_new cat_var_2_new
1 blue rock 23 1 1 4
2 green blues 22 0 3 1
3 green jazz 21 1 3 3
4 blue jazz 24 0 1 3
5 red rock 56 0 4 4
6 brown indie 28 1 2 2
> test_dat
cat_var_1 cat_var_2 reg_var_1 target cat_var_1_new cat_var_2_new
1 green rock 12 1 3 4
2 green rock 23 1 3 4
3 blue jazz 25 1 1 3
4 blue jazz 27 0 1 3
5 green jazz 34 1 3 3
6 purple pop 12 0 NA NA
7 magenta house 32 0 NA NA
简单一点:
library(data.table)
# Encode `x` as integer positions within the sorted unique values of `ref`.
# `ref` defaults to `x` itself, so the one-argument call f(x) behaves as
# before. Values of `x` that do not occur in `ref` are encoded as NA.
f <- function(x, ref = x) match(x, sort(unique(ref)))

# Capture the training columns BEFORE the in-place `:=` below replaces them
# with integers; they are the reference for both tables.
train_ref <- lapply(
  setNames(catVariables, catVariables),
  function(col) train_dat[[col]]
)

# Encode both tables against the training values, so a given level maps to the
# same integer in train and test (encoding test_dat against its own values
# would give e.g. "green" a different code in each table).
setDT(train_dat)[, (catVariables) := Map(f, .SD, train_ref),
                 .SDcols = catVariables][]
setDT(test_dat)[, (catVariables) := Map(f, .SD, train_ref),
                .SDcols = catVariables][]

# Output:
#>    cat_var_1 cat_var_2 reg_var_1 target
#>        <int>     <int>     <num>  <num>
#> 1:         1         4        23      1
#> 2:         3         1        22      0
#> 3:         3         3        21      1
#> 4:         1         3        24      0
#> 5:         4         4        56      0
#> 6:         2         2        28      1
#>    cat_var_1 cat_var_2 reg_var_1 target
#>        <int>     <int>     <num>  <num>
#> 1:         3         4        12      1
#> 2:         3         4        23      1
#> 3:         1         3        25      1
#> 4:         1         3        27      0
#> 5:         3         3        34      1
#> 6:        NA        NA        12      0
#> 7:        NA        NA        32      0