将变量拆分为多个多因子变量
Split variable into multiple multiple factor variables
我有一些类似的数据集:
df <- data.frame(n = seq(1:1000000), x = sample(LETTERS, 1000000, replace = T))
我正在寻找将变量 x
拆分为范围为 0-1
的多个分类变量的方法的指导
最后看起来像这样:
n x A B C D E F G H . . .
1 D 0 0 0 1 0 0 0 0 . . .
2 B 0 1 0 0 0 0 0 0 . . .
3 F 0 0 0 0 0 1 0 0 . . .
在我的数据集中,变量 x 中有更多代码,因此手动添加每个新变量太耗时了。
我正在考虑对 var x 中的代码进行排序,并为每个代码分配一个唯一的编号,然后创建一个迭代循环,为变量 x 中的每个代码创建新变量。
但我觉得我把事情复杂化了
使用match
。首先创建一个零向量,然后将 df
行的字母与字母表中的向量匹配,然后转向 1
。您可以使用内置 LETTERS
常量。最后 Vectorize
事情和 cbind
.
f <- \(x) {
z <- numeric(length(LETTERS))
z[match(x, LETTERS)] <- 1
setNames(z, LETTERS)
}
cbind(df, t(Vectorize(f)(df$x)))
# n x A B C D E F G H I J K L M N O P Q R S T U V W X Y Z
# Q 1 Q 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0
# E 2 E 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
# A 3 A 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
# Y 4 Y 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
# J 5 J 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
# D 6 D 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
# R 7 R 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
# Z 8 Z 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
# Q.1 9 Q 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0
# O 10 O 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
或者,transform
x
以 LETTERS
作为水平的因子并使用 model.matrix
.
df <- transform(df, x=factor(x, levels=LETTERS))
cbind(df, `colnames<-`(model.matrix(~ 0 + x, df), LETTERS))
# n x A B C D E F G H I J K L M N O P Q R S T U V W X Y Z
# 1 1 Q 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0
# 2 2 E 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
# 3 3 A 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
# 4 4 Y 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
# 5 5 J 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
# 6 6 D 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
# 7 7 R 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
# 8 8 Z 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
# 9 9 Q 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0
# 10 10 O 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
数据:
n <- 10
set.seed(42)
df <- data.frame(n = seq(1:n), x = sample(LETTERS, n, replace = T))
这里的主要问题是资源?我认为。我发现使用 nnet
是一个快速的解决方案:
library(nnet)
library(dplyr)
df %>% cbind(class.ind(.$x) == 1) %>%
mutate(across(-c(n, x), ~.*1))
n x A B C D E F G H I J K L M N O P Q R S T U V W X Y Z
1 1 E 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
2 2 H 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
3 3 L 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
4 4 M 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
5 5 R 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
6 6 A 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
7 7 Y 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
8 8 Y 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
9 9 F 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
10 10 U 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0
11 11 O 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
12 12 I 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
13 13 O 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
14 14 Z 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
15 15 P 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
16 16 T 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
17 17 F 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
18 18 K 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
19 19 H 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
20 20 V 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
21 21 V 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
22 22 G 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
23 23 P 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
24 24 Q 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0
25 25 V 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
26 26 R 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
27 27 Q 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0
28 28 B 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
29 29 D 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
30 30 M 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
31 31 E 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
32 32 V 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
33 33 S 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0
34 34 Y 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
35 35 T 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
[ reached 'max' / getOption("max.print") -- omitted 999965 rows ]
>
另一种选择是使用 ==
。
. <- unique(df$x)
cbind(df, +do.call(cbind, lapply(setNames(., .), `==`, df$x)))
# n x C I L T Y
#1 1 I 0 1 0 0 0
#2 2 C 1 0 0 0 0
#3 3 C 1 0 0 0 0
#4 4 Y 0 0 0 0 1
#5 5 L 0 0 1 0 0
#6 6 T 0 0 0 1 0
#...
或在一行中使用 sapply
。
cbind(df, +sapply(unique(df$x), `==`, df$x))
或使用 contrasts
和 match
到 df$x
。
. <- contrasts(as.factor(df$x), FALSE)
#. <- contrasts(as.factor(unique(df$x)), FALSE) #Alternative
cbind(df, .[match(df$x, rownames(.)),])
#cbind(df, .[fastmatch::fmatch(df$x, rownames(.)),]) #Alternative
或在 matrix
中建立索引。
. <- unique(df$x) #Could be sorted
#. <- collapse::funique(df$x) #Alternative
#. <- kit::funique(df$x) #Alternative
i <- match(df$x, .)
#i <- fastmatch::fmatch(df$x, .) #Alternative
#i <- data.table::chmatch(df$x, .) #Alternative
nc <- length(.)
nr <- length(i)
cbind(df, matrix(`[<-`(integer(nc * nr), 1:nr + nr * (i - 1), 1), nr, nc,
dimnames=list(NULL, .)))
或使用outer
.
. <- unique(df$x)
cbind(df, +outer(df$x, setNames(., .), `==`))
或使用 rep
和 m̀atrix`。
. <- unique(df$x)
n <- nrow(df)
cbind(df, +matrix(df$x == rep(., each=n), n, dimnames=list(NULL, .)))
一些方法的基准测试将适用于 变量 x 中的更多代码,而不仅仅是例如LETTERS
.
set.seed(42)
df <- data.frame(n = seq(1:1000000), x = sample(LETTERS, 1000000, replace = T))
library(nnet)
library(dplyr)
microbenchmark::microbenchmark(times = 10L, setup = gc(FALSE), control=list(order="block")
, "nnet" = df %>% cbind(class.ind(.$x) == 1) %>%
mutate(across(-c(n, x), ~.*1))
, "contrasts" = {. <- contrasts(as.factor(df$x), FALSE)
cbind(df, .[match(df$x, rownames(.)),])}
, "==" = {. <- unique(df$x)
cbind(df, +do.call(cbind, lapply(setNames(., .), `==`, df$x)))}
, "==Sapply" = cbind(df, +sapply(unique(df$x), `==`, df$x))
, "matrix" = {. <- unique(df$x)
i <- match(df$x, .)
nc <- length(.)
nr <- length(i)
cbind(df, matrix(`[<-`(integer(nc * nr), 1:nr + nr * (i - 1), 1), nr, nc,
dimnames=list(NULL, .)))}
, "outer" = {. <- unique(df$x)
cbind(df, +outer(df$x, setNames(., .), `==`))}
, "rep" = {. <- unique(df$x)
n <- nrow(df)
cbind(df, +matrix(df$x == rep(., each=n), n, dimnames=list(NULL, .)))}
)
结果
Unit: milliseconds
expr min lq mean median uq max neval
nnet 208.6898 220.2304 326.2210 305.5752 386.3385 541.0621 10
contrasts 1110.0123 1168.7651 1263.5357 1216.1403 1357.0532 1514.4411 10
== 146.2217 156.8141 208.2733 185.1860 275.3909 278.8497 10
==Sapply 290.0458 291.4543 301.3010 295.0557 298.0274 358.0531 10
matrix 302.9993 304.8305 312.9748 306.8981 310.0781 363.0773 10
outer 524.5230 583.5224 603.3300 586.3054 595.4086 807.0260 10
rep 276.2110 285.3983 389.8187 434.2754 435.8607 442.3403 10
使用data.table
library(data.table)
setDT(df) #make df a data.table if needed
merge(df, dcast(df, n ~ x, fun.agg = length), by = c("n"))
一种快速简便的方法是使用 fastDummies::dummy_cols
:
fastDummies::dummy_cols(df, "x")
具有 tidyverse 函数的替代方案:
library(tidyverse)
df %>%
left_join(., df %>% mutate(value = 1) %>%
pivot_wider(names_from = x, values_from = value, values_fill = 0) %>%
relocate(n, sort(colnames(.)[-1])))
输出
> dummmy <- fastDummies::dummy_cols(df, "x")
> colnames(dummy)[-c(1,2)] <- LETTERS
> dummy
n x A B C D E F G H I J K L M N O P Q R S T U V W X Y Z
1 1 Z 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
2 2 Q 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0
3 3 E 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
4 4 H 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
5 5 T 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
6 6 X 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0
7 7 R 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
8 8 F 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
9 9 Z 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
10 10 S 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0
基准
由于有很多解决方案并且问题涉及大型数据集,基准测试可能会有所帮助。根据基准测试,nnet 解决方案是最快的。
set.seed(1)
df <- data.frame(n = seq(1:1000000), x = sample(LETTERS, 1000000, replace = T))
library(microbenchmark)
bm <- microbenchmark(
fModel.matrix(),
fContrasts(),
fnnet(),
fdata.table(),
fFastDummies(),
fDplyr(),
times = 10L,
setup = gc(FALSE)
)
autoplot(bm)
我有一些类似的数据集:
df <- data.frame(n = seq(1:1000000), x = sample(LETTERS, 1000000, replace = T))
我正在寻找将变量 x
拆分为范围为 0-1
最后看起来像这样:
n x A B C D E F G H . . .
1 D 0 0 0 1 0 0 0 0 . . .
2 B 0 1 0 0 0 0 0 0 . . .
3 F 0 0 0 0 0 1 0 0 . . .
在我的数据集中,变量 x 中有更多代码,因此手动添加每个新变量太耗时了。
我正在考虑对 var x 中的代码进行排序,并为每个代码分配一个唯一的编号,然后创建一个迭代循环,为变量 x 中的每个代码创建新变量。 但我觉得我把事情复杂化了
使用match
。首先创建一个零向量,然后将 df
行的字母与字母表中的向量匹配,然后转向 1
。您可以使用内置 LETTERS
常量。最后 Vectorize
事情和 cbind
.
f <- \(x) {
z <- numeric(length(LETTERS))
z[match(x, LETTERS)] <- 1
setNames(z, LETTERS)
}
cbind(df, t(Vectorize(f)(df$x)))
# n x A B C D E F G H I J K L M N O P Q R S T U V W X Y Z
# Q 1 Q 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0
# E 2 E 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
# A 3 A 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
# Y 4 Y 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
# J 5 J 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
# D 6 D 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
# R 7 R 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
# Z 8 Z 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
# Q.1 9 Q 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0
# O 10 O 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
或者,transform
x
以 LETTERS
作为水平的因子并使用 model.matrix
.
df <- transform(df, x=factor(x, levels=LETTERS))
cbind(df, `colnames<-`(model.matrix(~ 0 + x, df), LETTERS))
# n x A B C D E F G H I J K L M N O P Q R S T U V W X Y Z
# 1 1 Q 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0
# 2 2 E 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
# 3 3 A 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
# 4 4 Y 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
# 5 5 J 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
# 6 6 D 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
# 7 7 R 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
# 8 8 Z 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
# 9 9 Q 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0
# 10 10 O 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
数据:
n <- 10
set.seed(42)
df <- data.frame(n = seq(1:n), x = sample(LETTERS, n, replace = T))
这里的主要问题是资源?我认为。我发现使用 nnet
是一个快速的解决方案:
library(nnet)
library(dplyr)
df %>% cbind(class.ind(.$x) == 1) %>%
mutate(across(-c(n, x), ~.*1))
n x A B C D E F G H I J K L M N O P Q R S T U V W X Y Z
1 1 E 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
2 2 H 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
3 3 L 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
4 4 M 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
5 5 R 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
6 6 A 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
7 7 Y 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
8 8 Y 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
9 9 F 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
10 10 U 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0
11 11 O 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
12 12 I 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
13 13 O 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
14 14 Z 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
15 15 P 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
16 16 T 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
17 17 F 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
18 18 K 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
19 19 H 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
20 20 V 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
21 21 V 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
22 22 G 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
23 23 P 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
24 24 Q 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0
25 25 V 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
26 26 R 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
27 27 Q 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0
28 28 B 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
29 29 D 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
30 30 M 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
31 31 E 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
32 32 V 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
33 33 S 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0
34 34 Y 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
35 35 T 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
[ reached 'max' / getOption("max.print") -- omitted 999965 rows ]
>
另一种选择是使用 ==
。
. <- unique(df$x)
cbind(df, +do.call(cbind, lapply(setNames(., .), `==`, df$x)))
# n x C I L T Y
#1 1 I 0 1 0 0 0
#2 2 C 1 0 0 0 0
#3 3 C 1 0 0 0 0
#4 4 Y 0 0 0 0 1
#5 5 L 0 0 1 0 0
#6 6 T 0 0 0 1 0
#...
或在一行中使用 sapply
。
cbind(df, +sapply(unique(df$x), `==`, df$x))
或使用 contrasts
和 match
到 df$x
。
. <- contrasts(as.factor(df$x), FALSE)
#. <- contrasts(as.factor(unique(df$x)), FALSE) #Alternative
cbind(df, .[match(df$x, rownames(.)),])
#cbind(df, .[fastmatch::fmatch(df$x, rownames(.)),]) #Alternative
或在 matrix
中建立索引。
. <- unique(df$x) #Could be sorted
#. <- collapse::funique(df$x) #Alternative
#. <- kit::funique(df$x) #Alternative
i <- match(df$x, .)
#i <- fastmatch::fmatch(df$x, .) #Alternative
#i <- data.table::chmatch(df$x, .) #Alternative
nc <- length(.)
nr <- length(i)
cbind(df, matrix(`[<-`(integer(nc * nr), 1:nr + nr * (i - 1), 1), nr, nc,
dimnames=list(NULL, .)))
或使用outer
.
. <- unique(df$x)
cbind(df, +outer(df$x, setNames(., .), `==`))
或使用 rep
和 m̀atrix`。
. <- unique(df$x)
n <- nrow(df)
cbind(df, +matrix(df$x == rep(., each=n), n, dimnames=list(NULL, .)))
一些方法的基准测试将适用于 变量 x 中的更多代码,而不仅仅是例如LETTERS
.
set.seed(42)
df <- data.frame(n = seq(1:1000000), x = sample(LETTERS, 1000000, replace = T))
library(nnet)
library(dplyr)
microbenchmark::microbenchmark(times = 10L, setup = gc(FALSE), control=list(order="block")
, "nnet" = df %>% cbind(class.ind(.$x) == 1) %>%
mutate(across(-c(n, x), ~.*1))
, "contrasts" = {. <- contrasts(as.factor(df$x), FALSE)
cbind(df, .[match(df$x, rownames(.)),])}
, "==" = {. <- unique(df$x)
cbind(df, +do.call(cbind, lapply(setNames(., .), `==`, df$x)))}
, "==Sapply" = cbind(df, +sapply(unique(df$x), `==`, df$x))
, "matrix" = {. <- unique(df$x)
i <- match(df$x, .)
nc <- length(.)
nr <- length(i)
cbind(df, matrix(`[<-`(integer(nc * nr), 1:nr + nr * (i - 1), 1), nr, nc,
dimnames=list(NULL, .)))}
, "outer" = {. <- unique(df$x)
cbind(df, +outer(df$x, setNames(., .), `==`))}
, "rep" = {. <- unique(df$x)
n <- nrow(df)
cbind(df, +matrix(df$x == rep(., each=n), n, dimnames=list(NULL, .)))}
)
结果
Unit: milliseconds
expr min lq mean median uq max neval
nnet 208.6898 220.2304 326.2210 305.5752 386.3385 541.0621 10
contrasts 1110.0123 1168.7651 1263.5357 1216.1403 1357.0532 1514.4411 10
== 146.2217 156.8141 208.2733 185.1860 275.3909 278.8497 10
==Sapply 290.0458 291.4543 301.3010 295.0557 298.0274 358.0531 10
matrix 302.9993 304.8305 312.9748 306.8981 310.0781 363.0773 10
outer 524.5230 583.5224 603.3300 586.3054 595.4086 807.0260 10
rep 276.2110 285.3983 389.8187 434.2754 435.8607 442.3403 10
使用data.table
library(data.table)
setDT(df) #make df a data.table if needed
merge(df, dcast(df, n ~ x, fun.agg = length), by = c("n"))
一种快速简便的方法是使用 fastDummies::dummy_cols
:
fastDummies::dummy_cols(df, "x")
具有 tidyverse 函数的替代方案:
library(tidyverse)
df %>%
left_join(., df %>% mutate(value = 1) %>%
pivot_wider(names_from = x, values_from = value, values_fill = 0) %>%
relocate(n, sort(colnames(.)[-1])))
输出
> dummmy <- fastDummies::dummy_cols(df, "x")
> colnames(dummy)[-c(1,2)] <- LETTERS
> dummy
n x A B C D E F G H I J K L M N O P Q R S T U V W X Y Z
1 1 Z 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
2 2 Q 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0
3 3 E 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
4 4 H 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
5 5 T 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
6 6 X 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0
7 7 R 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
8 8 F 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
9 9 Z 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
10 10 S 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0
基准 由于有很多解决方案并且问题涉及大型数据集,基准测试可能会有所帮助。根据基准测试,nnet 解决方案是最快的。
set.seed(1)
df <- data.frame(n = seq(1:1000000), x = sample(LETTERS, 1000000, replace = T))
library(microbenchmark)
bm <- microbenchmark(
fModel.matrix(),
fContrasts(),
fnnet(),
fdata.table(),
fFastDummies(),
fDplyr(),
times = 10L,
setup = gc(FALSE)
)
autoplot(bm)