将具有虚拟变量的数据框转换为分类变量
Convert data frame with dummy variables into categorical variables
我需要将虚拟变量转换为分类变量。作为 R 的新手,我只知道如何反过来做。有人能指出我正确的方向吗?
数据帧是:
data <- data.frame(id=c(1,2,3,4,5,6,7,8,9),
red=c("1","0","1","0","1","0","0","0","0"),
blue=c("1","1","1","1","0","1","1","1","0"),
yellow=c("0","0","0","0","0","0","0","1","1"))
预期输出为:
由于'red'、'blue'、'yellow'列为factor
,我们将其强制为numeric
并使用索引替换为相应的Map
内的列名
data[-1] <- Map(function(x, y) c('None', y)[as.numeric(x)],
data[-1], names(data)[-1])
names(data)[-1] <- paste0("c", 1:3)
data
# id c1 c2 c3
#1 1 red blue None
#2 2 None blue None
#3 3 red blue None
#4 4 None blue None
#5 5 red None None
#6 6 None blue None
#7 7 None blue None
#8 8 None blue yellow
#9 9 None None yellow
或通过更改 levels
的另一个选项
data[-1] <- Map(function(x, y) {levels(x) <- c('None', y)
x},data[-1], names(data)[-1])
或者使用lapply
,我们遍历列序列,提取列,将其更改为numeric
,并使用索引将值更改为列名和'None'
data[-1] <- lapply(seq_along(data[-1]), function(i)
c("None", names(data)[-1][i])[as.numeric(data[-1][[i]])] )
注意:给出了预期的输出。
或者使用向量化方法,我们创建一个逻辑矩阵,乘以列索引并将索引更改为列名
data[-1] <- `dim<-`(names(data)[-1][col(data[-1]) *
(NA ^(data[-1] == 0))], dim(data[-1]))
或 replace
的另一个选项
data[-1] <- replace(as.matrix(data[-1]), data[-1]==1,
rep(names(data)[-1], colSums(data[-1] == 1)))
或使用tidyverse
library(tidyverse)
imap(data[-1], ~ c('none', .y)[as.numeric(.x)]) %>%
bind_cols(data[1], .) %>%
rename_at(2:4, ~ paste0("c", 1:3))
# id c1 c2 c3
#1 1 red blue none
#2 2 none blue none
#3 3 red blue none
#4 4 none blue none
#5 5 red none none
#6 6 none blue none
#7 7 none blue none
#8 8 none blue yellow
#9 9 none none yellow
或 gather/spread
data %>%
gather(key, val, -id) %>%
mutate(val = case_when(val == 1 ~ key),
key = factor(key, levels = unique(key), labels = paste0("c", 1:3))) %>%
spread(key, val)
基准
这里有一些基准
data1 <- data[rep(seq_len(nrow(data)), 1e5),]
system.time({
Map(function(x, y) c('None', y)[as.numeric(x)],
data1[-1], names(data1)[-1])
})
# user system elapsed
# 0.065 0.014 0.078
system.time({
`dim<-`(names(data1)[-1][col(data1[-1]) *
(NA ^(data1[-1] == 0))], dim(data1[-1]))
})
# user system elapsed
# 0.387 0.036 0.422
system.time({
imap(data1[-1], ~ c('none', .y)[as.numeric(.x)])
})
# user system elapsed
# 0.047 0.006 0.054
system.time({
lapply(names(data1[-1]), function(x) ifelse(data1[x] == 1, x, NA))
}
)
# user system elapsed
# 0.555 0.067 0.621
system.time({
ifelse(data1[-1] == 1, names(data1[-1])[col(data1[-1])], NA)
})
# user system elapsed
# 0.711 0.060 0.770
在 1e6 数据集上
data1 <- data[rep(seq_len(nrow(data)), 1e6),]
system.time({Map( function(x, y) {levels(x) <- c('None', y)
x},data1[-1], names(data1)[-1])})
# user system elapsed
# 0.123 0.016 0.139
system.time({
Map(function(x, y) c('None', y)[as.numeric(x)],
data1[-1], names(data1)[-1])
})
# user system elapsed
# 0.328 0.074 0.402
system.time({
lapply(names(data1[-1]), function(x) ifelse(data1[x] == 1, x, NA))
}
)
# user system elapsed
# 7.125 0.463 7.561
带有微基准测试
library(microbenchmark)
microbenchmark(ak = Map(function(x, y) c('None', y)[as.numeric(x)],
data1[-1], names(data1)[-1]),
ak2 = Map( function(x, y) {levels(x) <- c('None', y); x},data1[-1], names(data1)[-1]),
rs = lapply(names(data1[-1]), function(x) ifelse(data1[x] == 1, x, NA)), unit = 'relative', times = 10L)
#Unit: relative
#expr min lq mean median uq max nev
#ak 6.14964 4.048205 2.401768 1.741373 2.47268 2.43698 10
#ak2 1.00000 1.000000 1.000000 1.000000 1.00000 1.00000 10
#rs 70.73601 45.468868 23.020272 20.408306 18.63263 16.01278 10
数据
data <- structure(list(id = c(1, 2, 3, 4, 5, 6, 7, 8, 9), red = structure(c(2L,
1L, 2L, 1L, 2L, 1L, 1L, 1L, 1L), .Label = c("0", "1"), class = "factor"),
blue = structure(c(2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 1L), .Label = c("0",
"1"), class = "factor"), yellow = structure(c(1L, 1L, 1L,
1L, 1L, 1L, 1L, 2L, 2L), .Label = c("0", "1"), class = "factor")),
class = "data.frame", row.names = c(NA,
-9L))
一个选项lapply
通过忽略第一列(id
),我们检查哪些列中的值为1并将它们替换为相应的列名,其他的可以更改为NA
.
data[-1] <- lapply(names(data[-1]), function(x) ifelse(data[x] == 1, x, NA))
data
# id red blue yellow
#1 1 red blue <NA>
#2 2 <NA> blue <NA>
#3 3 red blue <NA>
#4 4 <NA> blue <NA>
#5 5 red <NA> <NA>
#6 6 <NA> blue <NA>
#7 7 <NA> blue <NA>
#8 8 <NA> blue yellow
#9 9 <NA> <NA> yellow
不使用lapply
循环的另一种方法
data[-1] <- ifelse(data[-1] == 1, names(data[-1])[col(data[-1])], NA)
data
# id red blue yellow
#1 1 red blue <NA>
#2 2 <NA> blue <NA>
#3 3 red blue <NA>
#4 4 <NA> blue <NA>
#5 5 red <NA> <NA>
#6 6 <NA> blue <NA>
#7 7 <NA> blue <NA>
#8 8 <NA> blue yellow
#9 9 <NA> <NA> yellow
也许是这个?
library(tidyverse)
data <- data %>%
mutate(red = case_when(
red == 1 ~ "Red", red == 0 ~ "None")) %>%
mutate(blue = case_when(
blue == 1 ~ "Blue", blue == 0 ~ "None")) %>%
mutate(yellow = case_when(
yellow == 1 ~ "Yellow", yellow == 0 ~ "None"
))
data
我需要将虚拟变量转换为分类变量。作为 R 的新手,我只知道如何反过来做。有人能指出我正确的方向吗?
数据帧是:
data <- data.frame(id=c(1,2,3,4,5,6,7,8,9),
red=c("1","0","1","0","1","0","0","0","0"),
blue=c("1","1","1","1","0","1","1","1","0"),
yellow=c("0","0","0","0","0","0","0","1","1"))
预期输出为:
由于'red'、'blue'、'yellow'列为factor
,我们将其强制为numeric
并使用索引替换为相应的Map
data[-1] <- Map(function(x, y) c('None', y)[as.numeric(x)],
data[-1], names(data)[-1])
names(data)[-1] <- paste0("c", 1:3)
data
# id c1 c2 c3
#1 1 red blue None
#2 2 None blue None
#3 3 red blue None
#4 4 None blue None
#5 5 red None None
#6 6 None blue None
#7 7 None blue None
#8 8 None blue yellow
#9 9 None None yellow
或通过更改 levels
data[-1] <- Map(function(x, y) {levels(x) <- c('None', y)
x},data[-1], names(data)[-1])
或者使用lapply
,我们遍历列序列,提取列,将其更改为numeric
,并使用索引将值更改为列名和'None'
data[-1] <- lapply(seq_along(data[-1]), function(i)
c("None", names(data)[-1][i])[as.numeric(data[-1][[i]])] )
注意:给出了预期的输出。
或者使用向量化方法,我们创建一个逻辑矩阵,乘以列索引并将索引更改为列名
data[-1] <- `dim<-`(names(data)[-1][col(data[-1]) *
(NA ^(data[-1] == 0))], dim(data[-1]))
或 replace
data[-1] <- replace(as.matrix(data[-1]), data[-1]==1,
rep(names(data)[-1], colSums(data[-1] == 1)))
或使用tidyverse
library(tidyverse)
imap(data[-1], ~ c('none', .y)[as.numeric(.x)]) %>%
bind_cols(data[1], .) %>%
rename_at(2:4, ~ paste0("c", 1:3))
# id c1 c2 c3
#1 1 red blue none
#2 2 none blue none
#3 3 red blue none
#4 4 none blue none
#5 5 red none none
#6 6 none blue none
#7 7 none blue none
#8 8 none blue yellow
#9 9 none none yellow
或 gather/spread
data %>%
gather(key, val, -id) %>%
mutate(val = case_when(val == 1 ~ key),
key = factor(key, levels = unique(key), labels = paste0("c", 1:3))) %>%
spread(key, val)
基准
这里有一些基准
data1 <- data[rep(seq_len(nrow(data)), 1e5),]
system.time({
Map(function(x, y) c('None', y)[as.numeric(x)],
data1[-1], names(data1)[-1])
})
# user system elapsed
# 0.065 0.014 0.078
system.time({
`dim<-`(names(data1)[-1][col(data1[-1]) *
(NA ^(data1[-1] == 0))], dim(data1[-1]))
})
# user system elapsed
# 0.387 0.036 0.422
system.time({
imap(data1[-1], ~ c('none', .y)[as.numeric(.x)])
})
# user system elapsed
# 0.047 0.006 0.054
system.time({
lapply(names(data1[-1]), function(x) ifelse(data1[x] == 1, x, NA))
}
)
# user system elapsed
# 0.555 0.067 0.621
system.time({
ifelse(data1[-1] == 1, names(data1[-1])[col(data1[-1])], NA)
})
# user system elapsed
# 0.711 0.060 0.770
在 1e6 数据集上
data1 <- data[rep(seq_len(nrow(data)), 1e6),]
system.time({Map( function(x, y) {levels(x) <- c('None', y)
x},data1[-1], names(data1)[-1])})
# user system elapsed
# 0.123 0.016 0.139
system.time({
Map(function(x, y) c('None', y)[as.numeric(x)],
data1[-1], names(data1)[-1])
})
# user system elapsed
# 0.328 0.074 0.402
system.time({
lapply(names(data1[-1]), function(x) ifelse(data1[x] == 1, x, NA))
}
)
# user system elapsed
# 7.125 0.463 7.561
带有微基准测试
library(microbenchmark)
microbenchmark(ak = Map(function(x, y) c('None', y)[as.numeric(x)],
data1[-1], names(data1)[-1]),
ak2 = Map( function(x, y) {levels(x) <- c('None', y); x},data1[-1], names(data1)[-1]),
rs = lapply(names(data1[-1]), function(x) ifelse(data1[x] == 1, x, NA)), unit = 'relative', times = 10L)
#Unit: relative
#expr min lq mean median uq max nev
#ak 6.14964 4.048205 2.401768 1.741373 2.47268 2.43698 10
#ak2 1.00000 1.000000 1.000000 1.000000 1.00000 1.00000 10
#rs 70.73601 45.468868 23.020272 20.408306 18.63263 16.01278 10
数据
data <- structure(list(id = c(1, 2, 3, 4, 5, 6, 7, 8, 9), red = structure(c(2L,
1L, 2L, 1L, 2L, 1L, 1L, 1L, 1L), .Label = c("0", "1"), class = "factor"),
blue = structure(c(2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 1L), .Label = c("0",
"1"), class = "factor"), yellow = structure(c(1L, 1L, 1L,
1L, 1L, 1L, 1L, 2L, 2L), .Label = c("0", "1"), class = "factor")),
class = "data.frame", row.names = c(NA,
-9L))
一个选项lapply
通过忽略第一列(id
),我们检查哪些列中的值为1并将它们替换为相应的列名,其他的可以更改为NA
.
data[-1] <- lapply(names(data[-1]), function(x) ifelse(data[x] == 1, x, NA))
data
# id red blue yellow
#1 1 red blue <NA>
#2 2 <NA> blue <NA>
#3 3 red blue <NA>
#4 4 <NA> blue <NA>
#5 5 red <NA> <NA>
#6 6 <NA> blue <NA>
#7 7 <NA> blue <NA>
#8 8 <NA> blue yellow
#9 9 <NA> <NA> yellow
不使用lapply
循环的另一种方法
data[-1] <- ifelse(data[-1] == 1, names(data[-1])[col(data[-1])], NA)
data
# id red blue yellow
#1 1 red blue <NA>
#2 2 <NA> blue <NA>
#3 3 red blue <NA>
#4 4 <NA> blue <NA>
#5 5 red <NA> <NA>
#6 6 <NA> blue <NA>
#7 7 <NA> blue <NA>
#8 8 <NA> blue yellow
#9 9 <NA> <NA> yellow
也许是这个?
library(tidyverse)
data <- data %>%
mutate(red = case_when(
red == 1 ~ "Red", red == 0 ~ "None")) %>%
mutate(blue = case_when(
blue == 1 ~ "Blue", blue == 0 ~ "None")) %>%
mutate(yellow = case_when(
yellow == 1 ~ "Yellow", yellow == 0 ~ "None"
))
data