在不使用循环的情况下重新编码多个变量
Recoding of multiple variables without using loop
# Example data: three character columns, two numeric columns and two
# 0/1 indicator columns, six rows each.
str1 <- LETTERS[1:6]
str2 <- c(
  "Apple", "Mango", "Avocado",
  "Watermelon", "Banana", "Pineapple"
)
str3 <- c(
  "Mouse", "Cat", "Lion",
  "Shark", "Eagle", "Ladybug"
)
num1 <- 1:6
num2 <- c(2.3, 3.5, 4, 7, 6.2, 3)
binary1 <- c(0, 1, 0, 1, 0, 0)
binary2 <- c(1, 1, 0, 0, 0, 1)

# Column names are taken from the variable names passed in.
mydata <- data.frame(str1, str2, str3, num1, num2, binary1, binary2)
人们总是说向量化比循环更好。
所以我想知道如何通过矢量化而不是使用循环来重新编码大量变量:
我的第一个任务是将 str1、str2 和 str3 转换为因子,我用了:
# Convert each string column to a factor.
# `[[` extracts the column as a vector; `mydata[col]` would hand as.factor()
# a one-column data frame instead of the column's values.
for (col in c("str1", "str2", "str3")) {
  mydata[[col]] <- as.factor(mydata[[col]])
}
我的第二个任务是将变量 binary1 和 binary2 转换为因子,并将它们的取值重新编码为 0 = No、1 = Yes。我用了:
# Recode each 0/1 indicator column as a factor labelled "No"/"Yes".
# `[[` passes the column vector to factor(); `mydata[col]` would pass a
# one-column data frame instead.
for (col in c("binary1", "binary2")) {
  mydata[[col]] <- factor(
    mydata[[col]],
    levels = c(0, 1),
    labels = c("No", "Yes")
  )
}
在这两种情况下,如何使用矢量化而不是循环来实现?
例如,通过使用 dplyr
:
library(dplyr)
# Convert the string columns to factors and recode the 0/1 indicator columns
# as factors labelled "No"/"Yes". Using factor() with explicit levels keeps the
# indicator columns as factors; ifelse() would return plain character vectors.
mydata %>%
  mutate(
    across(str1:str3, as.factor),
    across(
      starts_with("bin"),
      ~ factor(.x, levels = c(0, 1), labels = c("No", "Yes"))
    )
  )
str1 str2 str3 num1 num2 binary1 binary2
1 A Apple Mouse 1 2.3 No Yes
2 B Mango Cat 2 3.5 Yes Yes
3 C Avocado Lion 3 4.0 No No
4 D Watermelon Shark 4 7.0 Yes No
5 E Banana Eagle 5 6.2 No No
6 F Pineapple Ladybug 6 3.0 No Yes
您可以使用 purrr
中的 map()
函数。
# Turn the three string columns into factors by mapping as.factor() over them
str_cols <- c("str1", "str2", "str3")
mydata[str_cols] <- purrr::map(mydata[str_cols], as.factor)
str(mydata)

# Turn the two indicator columns into factors, labelling 0 as "No" and
# 1 as "Yes", by mapping factor() over them
bin_cols <- c("binary1", "binary2")
mydata[bin_cols] <- purrr::map(
  mydata[bin_cols],
  function(x) factor(x, levels = c(0, 1), labels = c("No", "Yes"))
)
str(mydata)
'data.frame': 6 obs. of 7 variables:
$ str1 : Factor w/ 6 levels "A","B","C","D",..: 1 2 3 4 5 6
$ str2 : Factor w/ 6 levels "Apple","Avocado",..: 1 4 2 6 3 5
$ str3 : Factor w/ 6 levels "Cat","Eagle",..: 5 1 4 6 2 3
$ num1 : int 1 2 3 4 5 6
$ num2 : num 2.3 3.5 4 7 6.2 3
$ binary1: num 0 1 0 1 0 0
$ binary2: num 1 1 0 0 0 1
下面是使用 data.table 的另一种解决方案:
- 代码
library(data.table)
# Columns to convert to plain factors, and 0/1 indicator columns to recode.
sel_cols1 <- c("str1", "str2", "str3")
sel_cols2 <- c("binary1", "binary2")

# setDT() converts mydata to a data.table by reference; `:=` then updates the
# selected columns in place. The indicator columns use an explicit
# 0 -> "No", 1 -> "Yes" mapping so both levels always exist and any other
# value becomes NA instead of being labelled "Yes". The trailing [] prints
# the result.
setDT(mydata)[, (sel_cols1) := lapply(.SD, as.factor), .SDcols = sel_cols1
][, (sel_cols2) := lapply(
  .SD,
  function(x) factor(x, levels = c(0, 1), labels = c("No", "Yes"))
), .SDcols = sel_cols2][]
- 输出
#> str1 str2 str3 num1 num2 binary1 binary2
#> 1: A Apple Mouse 1 2.3 No Yes
#> 2: B Mango Cat 2 3.5 Yes Yes
#> 3: C Avocado Lion 3 4.0 No No
#> 4: D Watermelon Shark 4 7.0 Yes No
#> 5: E Banana Eagle 5 6.2 No No
#> 6: F Pineapple Ladybug 6 3.0 No Yes
- 检查各变量的 class
# Report the class of every column to confirm the conversions
sapply(X = mydata, FUN = class)
#> str1 str2 str3 num1 num2 binary1 binary2
#> "factor" "factor" "factor" "integer" "numeric" "factor" "factor"
由 reprex package (v2.0.1)
于 2021-11-16 创建
# Example data: three character columns, two numeric columns and two
# 0/1 indicator columns, six rows each.
str1 <- LETTERS[1:6]
str2 <- c(
  "Apple", "Mango", "Avocado",
  "Watermelon", "Banana", "Pineapple"
)
str3 <- c(
  "Mouse", "Cat", "Lion",
  "Shark", "Eagle", "Ladybug"
)
num1 <- 1:6
num2 <- c(2.3, 3.5, 4, 7, 6.2, 3)
binary1 <- c(0, 1, 0, 1, 0, 0)
binary2 <- c(1, 1, 0, 0, 0, 1)

# Column names are taken from the variable names passed in.
mydata <- data.frame(str1, str2, str3, num1, num2, binary1, binary2)
人们总是说向量化比循环更好。
所以我想知道如何通过矢量化而不是使用循环来重新编码大量变量:
我的第一个任务是将 str1、str2 和 str3 转换为因子,我用了:
# Convert each string column to a factor.
# `[[` extracts the column as a vector; `mydata[col]` would hand as.factor()
# a one-column data frame instead of the column's values.
for (col in c("str1", "str2", "str3")) {
  mydata[[col]] <- as.factor(mydata[[col]])
}
我的第二个任务是将变量 binary1 和 binary2 转换为因子,并将它们的取值重新编码为 0 = No、1 = Yes。我用了:
# Recode each 0/1 indicator column as a factor labelled "No"/"Yes".
# `[[` passes the column vector to factor(); `mydata[col]` would pass a
# one-column data frame instead.
for (col in c("binary1", "binary2")) {
  mydata[[col]] <- factor(
    mydata[[col]],
    levels = c(0, 1),
    labels = c("No", "Yes")
  )
}
在这两种情况下,如何使用矢量化而不是循环来实现?
例如,通过使用 dplyr
:
library(dplyr)
# Convert the string columns to factors and recode the 0/1 indicator columns
# as factors labelled "No"/"Yes". Using factor() with explicit levels keeps the
# indicator columns as factors; ifelse() would return plain character vectors.
mydata %>%
  mutate(
    across(str1:str3, as.factor),
    across(
      starts_with("bin"),
      ~ factor(.x, levels = c(0, 1), labels = c("No", "Yes"))
    )
  )
str1 str2 str3 num1 num2 binary1 binary2
1 A Apple Mouse 1 2.3 No Yes
2 B Mango Cat 2 3.5 Yes Yes
3 C Avocado Lion 3 4.0 No No
4 D Watermelon Shark 4 7.0 Yes No
5 E Banana Eagle 5 6.2 No No
6 F Pineapple Ladybug 6 3.0 No Yes
您可以使用 purrr
中的 map()
函数。
# Turn the three string columns into factors by mapping as.factor() over them
str_cols <- c("str1", "str2", "str3")
mydata[str_cols] <- purrr::map(mydata[str_cols], as.factor)
str(mydata)

# Turn the two indicator columns into factors, labelling 0 as "No" and
# 1 as "Yes", by mapping factor() over them
bin_cols <- c("binary1", "binary2")
mydata[bin_cols] <- purrr::map(
  mydata[bin_cols],
  function(x) factor(x, levels = c(0, 1), labels = c("No", "Yes"))
)
str(mydata)
'data.frame': 6 obs. of 7 variables:
$ str1 : Factor w/ 6 levels "A","B","C","D",..: 1 2 3 4 5 6
$ str2 : Factor w/ 6 levels "Apple","Avocado",..: 1 4 2 6 3 5
$ str3 : Factor w/ 6 levels "Cat","Eagle",..: 5 1 4 6 2 3
$ num1 : int 1 2 3 4 5 6
$ num2 : num 2.3 3.5 4 7 6.2 3
$ binary1: num 0 1 0 1 0 0
$ binary2: num 1 1 0 0 0 1
下面是使用 data.table 的另一种解决方案:
- 代码
library(data.table)
# Columns to convert to plain factors, and 0/1 indicator columns to recode.
sel_cols1 <- c("str1", "str2", "str3")
sel_cols2 <- c("binary1", "binary2")

# setDT() converts mydata to a data.table by reference; `:=` then updates the
# selected columns in place. The indicator columns use an explicit
# 0 -> "No", 1 -> "Yes" mapping so both levels always exist and any other
# value becomes NA instead of being labelled "Yes". The trailing [] prints
# the result.
setDT(mydata)[, (sel_cols1) := lapply(.SD, as.factor), .SDcols = sel_cols1
][, (sel_cols2) := lapply(
  .SD,
  function(x) factor(x, levels = c(0, 1), labels = c("No", "Yes"))
), .SDcols = sel_cols2][]
- 输出
#> str1 str2 str3 num1 num2 binary1 binary2
#> 1: A Apple Mouse 1 2.3 No Yes
#> 2: B Mango Cat 2 3.5 Yes Yes
#> 3: C Avocado Lion 3 4.0 No No
#> 4: D Watermelon Shark 4 7.0 Yes No
#> 5: E Banana Eagle 5 6.2 No No
#> 6: F Pineapple Ladybug 6 3.0 No Yes
- 检查各变量的 class
# Report the class of every column to confirm the conversions
sapply(X = mydata, FUN = class)
#> str1 str2 str3 num1 num2 binary1 binary2
#> "factor" "factor" "factor" "integer" "numeric" "factor" "factor"
由 reprex package (v2.0.1)
于 2021-11-16 创建