一次将多列强制转换为因子
Coerce multiple columns to factors at once
我有一个如下所示的示例数据框:
# Example data: the integers 1..40 in random order, arranged as a 4 x 10
# data frame with rows named 1..4 and columns named A..J.
data <- data.frame(
  matrix(
    sample(1:40),
    nrow = 4,
    ncol = 10,
    dimnames = list(1:4, LETTERS[1:10])
  )
)
我想知道如何选择多列并将它们一起转换为因子。我通常按照 data$A = as.factor(data$A)
这样的方式逐列转换。但是当数据框很大并且包含很多列时,这种方式会非常耗时。有谁知道更好的方法吗?
选择一些要转换为因子的列:
cols <- c("A", "C", "D", "H")
使用lapply()
强制替换所选列:
data[cols] <- lapply(data[cols], factor) ## as.factor() could also be used
查看结果:
sapply(data, class)
# A B C D E F G
# "factor" "integer" "factor" "factor" "integer" "integer" "integer"
# H I J
# "factor" "integer" "integer"
这是一个使用 dplyr
的选项。来自 magrittr
的 %<>%
运算符使用结果值更新 lhs 对象。
library(magrittr)
library(dplyr)
# Names of the columns to convert to factor.
cols <- c("A", "C", "D", "H")
# mutate_each_() and funs() are deprecated in dplyr; across() is the supported
# replacement. all_of() selects columns from a character vector and errors if
# a name is missing. %<>% pipes `data` in and assigns the result back to it.
data %<>%
  mutate(across(all_of(cols), factor))
str(data)
#'data.frame': 4 obs. of 10 variables:
# $ A: Factor w/ 4 levels "23","24","26",..: 1 2 3 4
# $ B: int 15 13 39 16
# $ C: Factor w/ 4 levels "3","5","18","37": 2 1 3 4
# $ D: Factor w/ 4 levels "2","6","28","38": 3 1 4 2
# $ E: int 14 4 22 20
# $ F: int 7 19 36 27
# $ G: int 35 40 21 10
# $ H: Factor w/ 4 levels "11","29","32",..: 1 4 3 2
# $ I: int 17 1 9 25
# $ J: int 12 30 8 33
或者,如果我们使用 data.table,可以在 for 循环中调用 set():
# Convert `data` to a data.table by reference, then overwrite each column
# named in `cols` with its factor version; set() assigns in place.
setDT(data)
for (col_name in cols) {
  set(data, j = col_name, value = factor(data[[col_name]]))
}
或者我们可以在.SDcols
中指定'cols'并将rhs赋给(:=
)'cols'
setDT(data)[, (cols):= lapply(.SD, factor), .SDcols=cols]
最近的tidyverse
方法是使用mutate_at
函数:
library(tidyverse)
library(magrittr)
# Fixed seed so the sampled example values are reproducible.
set.seed(88)
data <- data.frame(
  matrix(sample(1:40), nrow = 4, ncol = 10, dimnames = list(1:4, LETTERS[1:10]))
)
cols <- c("A", "C", "D", "H")
# Replace the columns named in `cols` with factors and store the result in `data`.
data <- data %>% mutate_at(cols, factor)
str(data)
$ A: Factor w/ 4 levels "5","17","18",..: 2 1 4 3
$ B: int 36 35 2 26
$ C: Factor w/ 4 levels "22","31","32",..: 1 2 4 3
$ D: Factor w/ 4 levels "1","9","16","39": 3 4 1 2
$ E: int 3 14 30 38
$ F: int 27 15 28 37
$ G: int 19 11 6 21
$ H: Factor w/ 4 levels "7","12","20",..: 1 3 4 2
$ I: int 23 24 13 8
$ J: int 10 25 4 33
另外,为了完整起见,并针对另一个只询问如何转换字符串列的问题,还可以使用 mutate_if:
# Example data: one character column plus ten integer columns (10 rows).
# sample(1:100) supplies exactly 10 x 10 values, so matrix() does not have to
# recycle a shorter vector (which triggers a warning in newer R versions).
data <- cbind(
  stringVar = sample(c("foo", "bar"), 10, replace = TRUE),
  data.frame(matrix(sample(1:100), 10, 10, dimnames = list(1:10, LETTERS[1:10]))),
  stringsAsFactors = FALSE # keep stringVar as character so is.character() matches it
)
# Pass `factor` directly; the funs() wrapper is deprecated in dplyr.
factoredData <- data %>% mutate_if(is.character, factor)
如果您的目标是先从表中找出符合条件的列,再对这些列进行转换,可以尝试以下方式:
### Pre-processing: collect the names of the character columns only.
### (The earlier `.SD[T]` step was a row subset with TRUE, so it returned every
### column name and all columns ended up being converted.)
ind <- names(bigm.train)[vapply(bigm.train, is.character, logical(1))]
### Convert the selected columns to factor in place; skipped when none match
if (length(ind) > 0) {
  bigm.train[, (ind) := lapply(.SD, factor), .SDcols = ind]
}
这会选择专门基于字符的列,然后将它们转换为因子。
您可以使用 mutate_if
(dplyr
):
例如,将 integer 列强制转换为 factor:
# Example tibble-classed data frame: two integer columns (a, b) and one
# character column (c), 10 rows, stored with compact row names.
mydata <- structure(
  list(
    a = 1:10,
    b = 1:10,
    c = rep(c("a", "b", "c"), times = c(2L, 2L, 6L))
  ),
  row.names = c(NA, -10L),
  class = c("tbl_df", "tbl", "data.frame")
)
# A tibble: 10 x 3
a b c
<int> <int> <chr>
1 1 1 a
2 2 2 a
3 3 3 b
4 4 4 b
5 5 5 c
6 6 6 c
7 7 7 c
8 8 8 c
9 9 9 c
10 10 10 c
使用函数:
library(dplyr)
mydata%>%
mutate_if(is.integer,as.factor)
# A tibble: 10 x 3
a b c
<fct> <fct> <chr>
1 1 1 a
2 2 2 a
3 3 3 b
4 4 4 b
5 5 5 c
6 6 6 c
7 7 7 c
8 8 8 c
9 9 9 c
10 10 10 c
这是一个 data.table
示例。我在这个例子中使用了 grep
,因为我经常通过对列名进行部分匹配来选择多个列。
library(data.table)
# 4 x 10 data.table holding the integers 1..40 in random order, columns A..J.
data <- data.table(
  matrix(sample(1:40), nrow = 4, ncol = 10, dimnames = list(1:4, LETTERS[1:10]))
)
# Partial-match selection: keep every column name that matches the pattern.
factorCols <- names(data)[grepl(pattern = "A|C|D|H", x = names(data))]
# Overwrite the matched columns in place with their factor versions.
data[, (factorCols) := lapply(.SD, as.factor), .SDcols = factorCols]
这是使用 purrr
包中的 modify_at()
函数的另一种 tidyverse 方法。
library(purrr)
# Example input: a data frame whose ten columns are all integer.
data <- data.frame(
  matrix(sample(1:40), nrow = 4, ncol = 10, dimnames = list(1:4, LETTERS[1:10]))
)
# Apply factor() to columns A, C and E only; the other columns are left as-is.
data_with_factors <- purrr::modify_at(data, c("A", "C", "E"), factor)
# Inspect the column types of the result:
str(data_with_factors)
# 'data.frame': 4 obs. of 10 variables:
# $ A: Factor w/ 4 levels "8","12","33",..: 1 3 4 2
# $ B: int 25 32 2 19
# $ C: Factor w/ 4 levels "5","15","35",..: 1 3 4 2
# $ D: int 11 7 27 6
# $ E: Factor w/ 4 levels "1","4","16","20": 2 3 1 4
# $ F: int 21 23 39 18
# $ G: int 31 14 38 26
# $ H: int 17 24 34 10
# $ I: int 13 28 30 29
# $ J: int 3 22 37 9
在 data.frame 上使用 sapply() 无法直接把各列转换为因子,因为它会把结果简化为矩阵/数组。我的做法是改用 lapply(),如下:
## Create a small example data.frame. The columns are given by name inside
## data.frame() so that no global variable called `class` masks base::class().
people <- data.frame(
  class = c("7", "6", "5", "3"),
  cash = c(100, 200, 300, 150),
  height = c(170, 180, 150, 165)
)
class(people) ## This is a data.frame
## lapply() returns a list with one factor per column; data.frame() turns that
## list back into a data.frame. Plain base R, so no pipe package is needed.
bb <- data.frame(lapply(people, as.factor))
class(bb) ## A data.frame
## Now check the classes of the individual columns
class(bb$class)
class(bb$height)
class(bb$cash) ## as expected, all are factors
一个简单且更新的解决方案
data <- data %>%
mutate_at(cols, list(~factor(.)))
截至 2021 年,当前的 tidyverse/dplyr
方法将使用 across
和 <tidy-select>
语句。
library(dplyr)
data %>% mutate(across(<tidy-select>, <function>))
across(<tidy-select>)
允许非常一致和轻松地选择要转换的列。
一些例子:
# Each line returns a modified copy of `data`; assign the result to keep it.
data %>% mutate(across(c(A, B, C, E), as.factor)) # select columns A, B, C and E (by name)
data %>% mutate(across(where(is.character), as.factor)) # select character columns
data %>% mutate(across(1:5, as.factor)) # select first 5 columns (by index)
我有一个如下所示的示例数据框:
data <- data.frame(matrix(sample(1:40), 4, 10, dimnames = list(1:4, LETTERS[1:10])))
我想知道如何选择多列并将它们一起转换为因子。我通常按照 data$A = as.factor(data$A)
这样的方式逐列转换。但是当数据框很大并且包含很多列时,这种方式会非常耗时。有谁知道更好的方法吗?
选择一些要转换为因子的列:
cols <- c("A", "C", "D", "H")
使用lapply()
强制替换所选列:
data[cols] <- lapply(data[cols], factor) ## as.factor() could also be used
查看结果:
sapply(data, class)
# A B C D E F G
# "factor" "integer" "factor" "factor" "integer" "integer" "integer"
# H I J
# "factor" "integer" "integer"
这是一个使用 dplyr
的选项。来自 magrittr
的 %<>%
运算符使用结果值更新 lhs 对象。
library(magrittr)
library(dplyr)
# Names of the columns to convert to factor.
cols <- c("A", "C", "D", "H")
# mutate_each_() and funs() are deprecated in dplyr; across() is the supported
# replacement. all_of() selects columns from a character vector and errors if
# a name is missing. %<>% pipes `data` in and assigns the result back to it.
data %<>%
  mutate(across(all_of(cols), factor))
str(data)
#'data.frame': 4 obs. of 10 variables:
# $ A: Factor w/ 4 levels "23","24","26",..: 1 2 3 4
# $ B: int 15 13 39 16
# $ C: Factor w/ 4 levels "3","5","18","37": 2 1 3 4
# $ D: Factor w/ 4 levels "2","6","28","38": 3 1 4 2
# $ E: int 14 4 22 20
# $ F: int 7 19 36 27
# $ G: int 35 40 21 10
# $ H: Factor w/ 4 levels "11","29","32",..: 1 4 3 2
# $ I: int 17 1 9 25
# $ J: int 12 30 8 33
或者,如果我们使用 data.table,可以在 for 循环中调用 set():
setDT(data)
for(j in cols){
set(data, i=NULL, j=j, value=factor(data[[j]]))
}
或者我们可以在.SDcols
中指定'cols'并将rhs赋给(:=
)'cols'
setDT(data)[, (cols):= lapply(.SD, factor), .SDcols=cols]
最近的tidyverse
方法是使用mutate_at
函数:
library(tidyverse)
library(magrittr)
set.seed(88)
data <- data.frame(matrix(sample(1:40), 4, 10, dimnames = list(1:4, LETTERS[1:10])))
cols <- c("A", "C", "D", "H")
data %<>% mutate_at(cols, factor)
str(data)
$ A: Factor w/ 4 levels "5","17","18",..: 2 1 4 3
$ B: int 36 35 2 26
$ C: Factor w/ 4 levels "22","31","32",..: 1 2 4 3
$ D: Factor w/ 4 levels "1","9","16","39": 3 4 1 2
$ E: int 3 14 30 38
$ F: int 27 15 28 37
$ G: int 19 11 6 21
$ H: Factor w/ 4 levels "7","12","20",..: 1 3 4 2
$ I: int 23 24 13 8
$ J: int 10 25 4 33
另外,为了完整起见,并针对另一个只询问如何转换字符串列的问题,还可以使用 mutate_if:
# Example data: one character column plus ten integer columns (10 rows).
# sample(1:100) supplies exactly 10 x 10 values, so matrix() does not have to
# recycle a shorter vector (which triggers a warning in newer R versions).
data <- cbind(
  stringVar = sample(c("foo", "bar"), 10, replace = TRUE),
  data.frame(matrix(sample(1:100), 10, 10, dimnames = list(1:10, LETTERS[1:10]))),
  stringsAsFactors = FALSE # keep stringVar as character so is.character() matches it
)
# Pass `factor` directly; the funs() wrapper is deprecated in dplyr.
factoredData <- data %>% mutate_if(is.character, factor)
如果您的目标是先从表中找出符合条件的列,再对这些列进行转换,可以尝试以下方式:
### pre processing
### Collect the names of the character columns only.
### (The earlier `.SD[T]` step was a row subset with TRUE, so it returned every
### column name and all columns ended up being converted.)
ind <- names(bigm.train)[vapply(bigm.train, is.character, logical(1))]
### Convert the selected columns to factor in place; skipped when none match
if (length(ind) > 0) {
  bigm.train[, (ind) := lapply(.SD, factor), .SDcols = ind]
}
这会选择专门基于字符的列,然后将它们转换为因子。
您可以使用 mutate_if
(dplyr
):
例如,将 integer 列强制转换为 factor:
mydata=structure(list(a = 1:10, b = 1:10, c = c("a", "a", "b", "b",
"c", "c", "c", "c", "c", "c")), row.names = c(NA, -10L), class = c("tbl_df",
"tbl", "data.frame"))
# A tibble: 10 x 3
a b c
<int> <int> <chr>
1 1 1 a
2 2 2 a
3 3 3 b
4 4 4 b
5 5 5 c
6 6 6 c
7 7 7 c
8 8 8 c
9 9 9 c
10 10 10 c
使用函数:
library(dplyr)
mydata%>%
mutate_if(is.integer,as.factor)
# A tibble: 10 x 3
a b c
<fct> <fct> <chr>
1 1 1 a
2 2 2 a
3 3 3 b
4 4 4 b
5 5 5 c
6 6 6 c
7 7 7 c
8 8 8 c
9 9 9 c
10 10 10 c
这是一个 data.table
示例。我在这个例子中使用了 grep
,因为我经常通过对列名进行部分匹配来选择多个列。
library(data.table)
data <- data.table(matrix(sample(1:40), 4, 10, dimnames = list(1:4, LETTERS[1:10])))
factorCols <- grep(pattern = "A|C|D|H", x = names(data), value = TRUE)
data[, (factorCols) := lapply(.SD, as.factor), .SDcols = factorCols]
这是使用 purrr
包中的 modify_at()
函数的另一种 tidyverse 方法。
library(purrr)
# Data frame with only integer columns
data <- data.frame(matrix(sample(1:40), 4, 10, dimnames = list(1:4, LETTERS[1:10])))
# Modify specified columns to a factor class
data_with_factors <- data %>%
purrr::modify_at(c("A", "C", "E"), factor)
# Check the results:
str(data_with_factors)
# 'data.frame': 4 obs. of 10 variables:
# $ A: Factor w/ 4 levels "8","12","33",..: 1 3 4 2
# $ B: int 25 32 2 19
# $ C: Factor w/ 4 levels "5","15","35",..: 1 3 4 2
# $ D: int 11 7 27 6
# $ E: Factor w/ 4 levels "1","4","16","20": 2 3 1 4
# $ F: int 21 23 39 18
# $ G: int 31 14 38 26
# $ H: int 17 24 34 10
# $ I: int 13 28 30 29
# $ J: int 3 22 37 9
在 data.frame 上使用 sapply() 无法直接把各列转换为因子,因为它会把结果简化为矩阵/数组。我的做法是改用 lapply(),如下:
## Create a small example data.frame. The columns are given by name inside
## data.frame() so that no global variable called `class` masks base::class().
people <- data.frame(
  class = c("7", "6", "5", "3"),
  cash = c(100, 200, 300, 150),
  height = c(170, 180, 150, 165)
)
class(people) ## This is a data.frame
## lapply() returns a list with one factor per column; data.frame() turns that
## list back into a data.frame. Plain base R, so no pipe package is needed.
bb <- data.frame(lapply(people, as.factor))
class(bb) ## A data.frame
## Now check the classes of the individual columns
class(bb$class)
class(bb$height)
class(bb$cash) ## as expected, all are factors
一个简单且更新的解决方案
data <- data %>%
mutate_at(cols, list(~factor(.)))
截至 2021 年,当前的 tidyverse/dplyr
方法将使用 across
和 <tidy-select>
语句。
library(dplyr)
data %>% mutate(across(<tidy-select>, <function>))
across(<tidy-select>)
允许非常一致和轻松地选择要转换的列。
一些例子:
# Each line returns a modified copy of `data`; assign the result to keep it.
data %>% mutate(across(c(A, B, C, E), as.factor)) # select columns A, B, C and E (by name)
data %>% mutate(across(where(is.character), as.factor)) # select character columns
data %>% mutate(across(1:5, as.factor)) # select first 5 columns (by index)