在 R 中不使用 for 循环遍历数据框的各列

Iterating over the columns of a data frame without using a for loop in R

下面是训练集和测试集,分别是 train_dat 和 test_dat。我想遍历数据框中的每个分类列,并把每个水平(level)映射为一个整数,该整数由该分类变量各水平的字母顺序决定。用于确定整数值的水平顺序存储在 uniqueValuesTrain 中(只取自训练集)。如何不使用 for 循环,实现更快的向量化解决方案?

# Data ----

# Training set: two categorical predictors, one numeric predictor and a
# binary target.
cat_var_1 <- c("blue", "green", "green", "blue", "red", "brown")
cat_var_2 <- c("rock", "blues", "jazz", "jazz", "rock", "indie")
reg_var_1 <- c(23, 22, 21, 24, 56, 28)
target <- c(1, 0, 1, 0, 0, 1)

# stringsAsFactors = TRUE turns the character columns into factors, which is
# what the is.factor() filter below relies on.
train_dat <- data.frame(
  cat_var_1, cat_var_2, reg_var_1, target,
  stringsAsFactors = TRUE
)

# Test set: same columns; it contains categories ("purple", "magenta", "pop",
# "house") that never occur in the training set.
cat_var_1 <- c("green", "green", "blue", "blue", "green", "purple", "magenta")
cat_var_2 <- c("rock", "rock", "jazz", "jazz", "jazz", "pop", "house")
reg_var_1 <- c(12, 23, 25, 27, 34, 12, 32)
target <- c(1, 1, 1, 0, 1, 0, 0)

test_dat <- data.frame(
  cat_var_1, cat_var_2, reg_var_1, target,
  stringsAsFactors = TRUE
)

targetVariable <- "target"
# Names of all factor columns in the training set.
catVariables <- names(Filter(is.factor, train_dat))

# Remove target variable from catVariables
catVariables <- catVariables[!(catVariables %in% targetVariable)]

# Integer Encoding ----

# Plain assignment is sufficient here: data frames are copy-on-modify, so
# editing trainInt / testInt leaves train_dat / test_dat untouched. (copy() is
# a data.table function and is not available unless that package is attached.)
trainInt <- train_dat
testInt <- test_dat

for (col in catVariables) {
  # The reference ordering is taken from the training column only.
  uniqueValuesTrain <- sort(unique(trainInt[[col]]))
  # match() returns each value's position in the reference ordering; test
  # values that are absent from the training column become NA.
  trainInt[[col]] <- match(trainInt[[col]], uniqueValuesTrain)
  testInt[[col]] <- match(testInt[[col]], uniqueValuesTrain)
}



下面是一个使用 tidyverse(dplyr)的方案:

library(dplyr)

# Training set: add a `<column>_new` integer column for every categorical
# column, holding each value's position among that column's sorted unique
# values. The original columns are kept.
train_dat <- mutate(
  train_dat,
  across(
    all_of(catVariables),
    function(values) match(values, sort(unique(values))),
    .names = "{.col}_new"
  )
)

# Test set: same encoding, but the reference values are looked up in the
# training column of the same name (cur_column()), so values that do not occur
# in the training data become NA.
test_dat <- mutate(
  test_dat,
  across(
    all_of(catVariables),
    function(values) match(values, sort(unique(train_dat[[cur_column()]]))),
    .names = "{.col}_new"
  )
)

输出:

> train_dat
  cat_var_1 cat_var_2 reg_var_1 target cat_var_1_new cat_var_2_new
1      blue      rock        23      1             1             4
2     green     blues        22      0             3             1
3     green      jazz        21      1             3             3
4      blue      jazz        24      0             1             3
5       red      rock        56      0             4             4
6     brown     indie        28      1             2             2
> test_dat
  cat_var_1 cat_var_2 reg_var_1 target cat_var_1_new cat_var_2_new
1     green      rock        12      1             3             4
2     green      rock        23      1             3             4
3      blue      jazz        25      1             1             3
4      blue      jazz        27      0             1             3
5     green      jazz        34      1             3             3
6    purple       pop        12      0            NA            NA
7   magenta     house        32      0            NA            NA

更简单的写法(使用 data.table,直接原地覆盖分类列):

library(data.table)

# Encode `x` as integer positions within `lv`. By default `lv` is the sorted
# unique values of `x` itself, so the one-argument call f(x) behaves as before.
f <- function(x, lv = sort(unique(x))) match(x, lv)

# Capture the sorted unique training values of every categorical column BEFORE
# the training columns are overwritten in place by `:=` below.
train_levels <- lapply(
  catVariables,
  function(col) sort(unique(train_dat[[col]]))
)

# Both tables are encoded against the training levels, so a category receives
# the same integer in train and test; test values unseen in training become NA.
# For the sample data the test columns become 3 3 1 1 3 NA NA and
# 4 4 3 3 3 NA NA. The second table printed below shows the previous
# behaviour, where test_dat was encoded by its own levels.
setDT(train_dat)[
  , (catVariables) := Map(f, .SD, train_levels),
  .SDcols = catVariables
][]
setDT(test_dat)[
  , (catVariables) := Map(f, .SD, train_levels),
  .SDcols = catVariables
][]

输出:

   cat_var_1 cat_var_2 reg_var_1 target
       <int>     <int>     <num>  <num>
1:         1         4        23      1
2:         3         1        22      0
3:         3         3        21      1
4:         1         3        24      0
5:         4         4        56      0
6:         2         2        28      1

   cat_var_1 cat_var_2 reg_var_1 target
       <int>     <int>     <num>  <num>
1:         2         4        12      1
2:         2         4        23      1
3:         1         2        25      1
4:         1         2        27      0
5:         2         2        34      1
6:         4         3        12      0
7:         3         1        32      0