R如何根据现有数据创建columns/features
R how to create columns/features based on existing data
我有一个数据框 df:
userID Score Task_Alpha Task_Beta Task_Charlie Task_Delta
3108 -8.00 Easy Easy Easy Easy
3207 3.00 Hard Easy Match Match
3350 5.78 Hard Easy Hard Hard
3961 10.00 Easy NA Hard Hard
4021 10.00 Easy Easy NA Hard
1. userID is factor variable
2. Score is numeric
3. All the 'Task_' features are factor variables with possible values 'Hard', 'Easy', 'Match' or NA
我想为每个 userID 创建新列,其中包含 Task_ 特征每个可能取值的出现次数。对于上面的玩具示例,所需的输出是附加在 df 末尾的三个新列,如下所示:
userID Hard Match Easy
3108 0 0 4
3207 1 2 1
3350 3 0 1
3961 2 0 1
4021 1 0 2
更新:
这个问题不是重复的,原始问题的相关部分已移至:
R How to counting the factors in ordered sequence
library(data.table)
# Toy data: one row per user, one Task_* column per task.
DT <- fread(
  input =
"userID Score Task_Alpha Task_Beta Task_Charlie Task_Delta
3108 -8.00 Easy Easy Easy Easy
3207 3.00 Hard Easy Match Match
3350 5.78 Hard Easy Hard Hard
3961 10.00 Easy NA Hard Hard
4021 10.00 Easy Easy NA Hard
"
)
# Reshape to long format: one row per (userID, task column) pair.
DT.melt <- melt(
  DT,
  id.vars = "userID",
  measure.vars = patterns(task = "^Task_")
)
# Cast back to wide, counting how often each value occurs per user.
# NA is counted like any other value, so it gets its own column.
dcast(DT.melt, userID ~ value, fun.aggregate = length)
# userID NA Easy Hard Match
# 1: 3108 0 4 0 0
# 2: 3207 0 1 1 2
# 3: 3350 0 1 3 0
# 4: 3961 1 1 2 0
# 5: 4021 1 2 1 0
第一部分的答案可以通过使用 apply 逐行处理获得,并使用 table 计算每行中各因子水平的出现次数:
# Count, for every row, how often each task state occurs.
# Task columns are picked by name prefix rather than by position, so the
# result does not depend on whether a Score column is present or where
# userID sits in the data frame.
task_states <- c("Easy", "Hard", "Match")
is_task <- startsWith(names(df), "Task_")
cbind(
  df["userID"],
  t(apply(df[is_task], 1, function(x) {
    # Fixing the levels guarantees a zero count for absent states and
    # leaves NA uncounted, so every row yields the same three columns.
    table(factor(x, levels = task_states))
  }))
)
# userID Easy Hard Match
#1 3108 4 0 0
#2 3207 1 1 2
#3 3350 1 3 0
#4 3961 1 2 0
#5 4021 2 1 0
在 tidyverse 中,我们可以将数据转换为长格式,删除 NA 值,用 count 统计每个 userID 和 value 组合的出现次数,然后将数据恢复为宽格式。
library(dplyr)
library(tidyr)
# Long format -> per-(userID, value) counts -> wide format.
df %>%
  # Stack all Task* columns; rows whose task value is NA are dropped.
  pivot_longer(
    cols = starts_with("Task"),
    names_to = "name",
    values_to = "value",
    values_drop_na = TRUE
  ) %>%
  # n = number of task columns holding each value for each user.
  count(userID, value) %>%
  # One column per value; user/value pairs that never occur are filled with 0.
  pivot_wider(
    names_from = value,
    values_from = n,
    values_fill = list(n = 0)
  )
数据
# Reproducible copy of the example data. The Task_* columns are factors
# whose level sets differ per column (Task_Beta only has the level "Easy").
df <- data.frame(
  userID = c(3108L, 3207L, 3350L, 3961L, 4021L),
  Score = c(-8, 3, 5.78, 10, 10),
  Task_Alpha = factor(
    c("Easy", "Hard", "Hard", "Easy", "Easy"),
    levels = c("Easy", "Hard")
  ),
  Task_Beta = factor(c("Easy", "Easy", "Easy", NA, "Easy"), levels = "Easy"),
  Task_Charlie = factor(
    c("Easy", "Match", "Hard", "Hard", NA),
    levels = c("Easy", "Hard", "Match")
  ),
  Task_Delta = factor(
    c("Easy", "Match", "Hard", "Hard", "Hard"),
    levels = c("Easy", "Hard", "Match")
  )
)
您可以在 map* 或 *apply 函数中将数据框 df 与每个取值逐一比较,计算所得布尔矩阵的逐行总和,然后将输出与原始数据框合并:
library(dplyr)
library(purrr)
# States to count, in the order the new columns should appear.
facs <- c("Easy", "Match", "Hard")
# Only the Task_* columns can hold a state, so compare just those instead of
# the whole data frame (userID and Score included).
task_cols <- df[startsWith(names(df), "Task_")]
# Mapping over a self-named vector gives each count column its state name
# directly. Each column is the number of matching cells per row; NA cells
# are ignored.
state_counts <- map_dfc(
  set_names(facs),
  ~ rowSums(task_cols == .x, na.rm = TRUE)
)
bind_cols(df, state_counts)
#### OUTPUT ####
userID Score Task_Alpha Task_Beta Task_Charlie Task_Delta Easy Match Hard
1 3108 -8.00 Easy Easy Easy Easy 4 0 0
2 3207 3.00 Hard Easy Match Match 1 2 1
3 3350 5.78 Hard Easy Hard Hard 1 0 3
4 3961 10.00 Easy <NA> Hard Hard 1 0 2
5 4021 10.00 Easy Easy <NA> Hard 2 0 1
另一个选项使用 Rfast::rowTabulate
v <- c("Hard", "Match", "Easy", NA)
# Names for the count columns; the missing-value bucket gets the literal
# name "NA" rather than an NA_character_ column name.
count_cols <- ifelse(is.na(v), "NA", v)
DT[, (count_cols) := as.data.table(
  Rfast::rowTabulate(
    # match() turns every cell into its index in v (NA matches the NA entry);
    # matrix(..., nrow = .N) restores the one-row-per-user layout.
    matrix(match(as.matrix(.SD), v), nrow = .N),
    # Always tabulate all length(v) codes so the number of count columns
    # equals length(count_cols) even when the highest code is absent.
    max_number = length(v)
  )
), .SDcols = Task_Alpha:Task_Delta]
输出:
userID Score Task_Alpha Task_Beta Task_Charlie Task_Delta Hard Match Easy NA
1: 3108 -8.00 Easy Easy Easy Easy 0 0 4 0
2: 3207 3.00 Hard Easy Match Match 1 2 1 0
3: 3350 5.78 Hard Easy Hard Hard 3 0 1 0
4: 3961 10.00 Easy <NA> Hard Hard 2 0 1 1
5: 4021 10.00 Easy Easy <NA> Hard 1 0 2 1
来自 Wimpel 的数据:
library(data.table)
# Example data parsed from an inline, whitespace-separated text block.
DT <- fread(
  input =
"userID Score Task_Alpha Task_Beta Task_Charlie Task_Delta
3108 -8.00 Easy Easy Easy Easy
3207 3.00 Hard Easy Match Match
3350 5.78 Hard Easy Hard Hard
3961 10.00 Easy NA Hard Hard
4021 10.00 Easy Easy NA Hard
"
)
如果实际数据集很大,很想知道这种方法在实际数据集上的运行速度有多快。
编辑:添加时间
library(data.table)
# Benchmark fixture: nr users and four task columns, each sampled with
# replacement from the three states plus NA. The seed makes it repeatable.
set.seed(0L)
nr <- 1e6
v <- c("Hard", "Match", "Easy", NA)
# The columns are sampled one after another in Alpha, Beta, Charlie, Delta
# order, so the random stream is consumed in that same sequence.
DT <- as.data.table(c(
  list(userID = seq_len(nr)),
  sapply(
    paste0("Task_", c("Alpha", "Beta", "Charlie", "Delta")),
    function(unused) sample(v, nr, TRUE),
    simplify = FALSE
  )
))
# Plain data.frame copy of the same data.
df <- as.data.frame(DT)
# Base-R method: tabulate the task states row by row.
# Reads the global `df`, drops its first column and returns a matrix with
# one row per row of `df` and one column per state (Easy, Hard, Match).
mtd0 <- function() {
  state_levels <- c("Easy", "Hard", "Match")
  # Fixed levels give every row the same three counts; NA is not counted.
  count_states <- function(row_values) {
    table(factor(row_values, levels = state_levels))
  }
  per_row <- apply(df[-1L], 1L, count_states)
  # apply() returns states x rows; transpose to rows x states.
  t(per_row)
}
# data.table method: melt the global `DT` to long format, then dcast back to
# wide while counting occurrences of each value per userID.
mtd1 <- function() {
  # One row per (userID, Task_* column) pair.
  long_dt <- melt(
    DT,
    id.vars = "userID",
    measure.vars = patterns(task = "^Task_")
  )
  # One column per distinct value, holding its count for each user.
  dcast(long_dt, userID ~ value, fun.aggregate = length)
}
# Rfast method: encode every task cell of the global `DT` as its index in the
# global vector `v`, then tabulate those codes row by row.
mtd2 <- function() {
  DT[
    ,
    {
      # match() works on the flattened matrix; NA cells match the NA in v.
      codes <- match(as.matrix(.SD), v)
      # Restore the one-row-per-user layout before tabulating.
      Rfast::rowTabulate(matrix(codes, nrow = .N))
    },
    .SDcols = Task_Alpha:Task_Delta
  ]
}
# Time the three methods. check=FALSE because their results differ in shape
# (see the `result` column of the timings), so they must not be compared.
bench::mark(mtd0(), mtd1(), mtd2(), check=FALSE)
时间:
# A tibble: 3 x 13
expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time result memory time gc
<bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm> <list> <list> <list> <list>
1 mtd0() 54.7s 54.7s 0.0183 137MB 1.70 1 93 54.7s <int[,3] [1,000,000 x 3]> <df[,3] [107,168 x 3]> <bch:tm> <tibble [1 x 3]>
2 mtd1() 2.4s 2.4s 0.417 398MB 0.833 1 2 2.4s <df[,5] [1,000,000 x 5]> <df[,3] [12,517 x 3]> <bch:tm> <tibble [1 x 3]>
3 mtd2() 252.8ms 264.4ms 3.78 107MB 3.78 2 2 528.7ms <int[,4] [1,000,000 x 4]> <df[,3] [6,509 x 3]> <bch:tm> <tibble [2 x 3]>
如果您正在使用 base R,那么以下内容可能对您有所帮助:
# Append one count column per task state to df.
task_states <- c("Hard", "Match", "Easy")
# Compare only the Task_* columns; userID and Score never hold a state.
task_cols <- df[startsWith(names(df), "Task_")]
# lapply over a named vector yields a named list, which as.data.frame turns
# into one column per state for any number of rows. (sapply would collapse a
# one-row data frame into a plain vector and produce the wrong shape.)
state_counts <- lapply(
  setNames(nm = task_states),
  function(v) rowSums(task_cols == v, na.rm = TRUE)
)
df <- cbind(df, as.data.frame(state_counts))
输出:
> df
userID Score Task_Alpha Task_Beta Task_Charlie Task_Delta Hard Match Easy
1 3108 -8.00 Easy Easy Easy Easy 0 0 4
2 3207 3.00 Hard Easy Match Match 1 2 1
3 3350 5.78 Hard Easy Hard Hard 3 0 1
4 3961 10.00 Easy <NA> Hard Hard 2 0 1
5 4021 10.00 Easy Easy <NA> Hard 1 0 2
我有一个数据框 df:
userID Score Task_Alpha Task_Beta Task_Charlie Task_Delta
3108 -8.00 Easy Easy Easy Easy
3207 3.00 Hard Easy Match Match
3350 5.78 Hard Easy Hard Hard
3961 10.00 Easy NA Hard Hard
4021 10.00 Easy Easy NA Hard
1. userID is factor variable
2. Score is numeric
3. All the 'Task_' features are factor variables with possible values 'Hard', 'Easy', 'Match' or NA
我想为每个 userID 创建新列,其中包含 Task_ 特征每个可能取值的出现次数。对于上面的玩具示例,所需的输出是附加在 df 末尾的三个新列,如下所示:
userID Hard Match Easy
3108 0 0 4
3207 1 2 1
3350 3 0 1
3961 2 0 1
4021 1 0 2
更新: 这个问题不是重复的,原始问题的相关部分已移至: R How to counting the factors in ordered sequence
library(data.table)
# Toy data: one row per user, one Task_* column per task.
DT <- fread(
  input =
"userID Score Task_Alpha Task_Beta Task_Charlie Task_Delta
3108 -8.00 Easy Easy Easy Easy
3207 3.00 Hard Easy Match Match
3350 5.78 Hard Easy Hard Hard
3961 10.00 Easy NA Hard Hard
4021 10.00 Easy Easy NA Hard
"
)
# Reshape to long format: one row per (userID, task column) pair.
DT.melt <- melt(
  DT,
  id.vars = "userID",
  measure.vars = patterns(task = "^Task_")
)
# Cast back to wide, counting how often each value occurs per user.
# NA is counted like any other value, so it gets its own column.
dcast(DT.melt, userID ~ value, fun.aggregate = length)
# userID NA Easy Hard Match
# 1: 3108 0 4 0 0
# 2: 3207 0 1 1 2
# 3: 3350 0 1 3 0
# 4: 3961 1 1 2 0
# 5: 4021 1 2 1 0
第一部分的答案可以通过使用 apply 逐行处理获得,并使用 table 计算每行中各因子水平的出现次数:
# Count, for every row, how often each task state occurs.
# Task columns are picked by name prefix rather than by position, so the
# result does not depend on whether a Score column is present or where
# userID sits in the data frame.
task_states <- c("Easy", "Hard", "Match")
is_task <- startsWith(names(df), "Task_")
cbind(
  df["userID"],
  t(apply(df[is_task], 1, function(x) {
    # Fixing the levels guarantees a zero count for absent states and
    # leaves NA uncounted, so every row yields the same three columns.
    table(factor(x, levels = task_states))
  }))
)
# userID Easy Hard Match
#1 3108 4 0 0
#2 3207 1 1 2
#3 3350 1 3 0
#4 3961 1 2 0
#5 4021 2 1 0
在 tidyverse 中,我们可以将数据转换为长格式,删除 NA 值,用 count 统计每个 userID 和 value 组合的出现次数,然后将数据恢复为宽格式。
library(dplyr)
library(tidyr)
# Long format -> per-(userID, value) counts -> wide format.
df %>%
  # Stack all Task* columns; rows whose task value is NA are dropped.
  pivot_longer(
    cols = starts_with("Task"),
    names_to = "name",
    values_to = "value",
    values_drop_na = TRUE
  ) %>%
  # n = number of task columns holding each value for each user.
  count(userID, value) %>%
  # One column per value; user/value pairs that never occur are filled with 0.
  pivot_wider(
    names_from = value,
    values_from = n,
    values_fill = list(n = 0)
  )
数据
# Reproducible copy of the example data. The Task_* columns are factors
# whose level sets differ per column (Task_Beta only has the level "Easy").
df <- data.frame(
  userID = c(3108L, 3207L, 3350L, 3961L, 4021L),
  Score = c(-8, 3, 5.78, 10, 10),
  Task_Alpha = factor(
    c("Easy", "Hard", "Hard", "Easy", "Easy"),
    levels = c("Easy", "Hard")
  ),
  Task_Beta = factor(c("Easy", "Easy", "Easy", NA, "Easy"), levels = "Easy"),
  Task_Charlie = factor(
    c("Easy", "Match", "Hard", "Hard", NA),
    levels = c("Easy", "Hard", "Match")
  ),
  Task_Delta = factor(
    c("Easy", "Match", "Hard", "Hard", "Hard"),
    levels = c("Easy", "Hard", "Match")
  )
)
您可以在 map* 或 *apply 函数中将数据框 df 与每个取值逐一比较,计算所得布尔矩阵的逐行总和,然后将输出与原始数据框合并:
library(dplyr)
library(purrr)
# States to count, in the order the new columns should appear.
facs <- c("Easy", "Match", "Hard")
# Only the Task_* columns can hold a state, so compare just those instead of
# the whole data frame (userID and Score included).
task_cols <- df[startsWith(names(df), "Task_")]
# Mapping over a self-named vector gives each count column its state name
# directly. Each column is the number of matching cells per row; NA cells
# are ignored.
state_counts <- map_dfc(
  set_names(facs),
  ~ rowSums(task_cols == .x, na.rm = TRUE)
)
bind_cols(df, state_counts)
#### OUTPUT ####
userID Score Task_Alpha Task_Beta Task_Charlie Task_Delta Easy Match Hard
1 3108 -8.00 Easy Easy Easy Easy 4 0 0
2 3207 3.00 Hard Easy Match Match 1 2 1
3 3350 5.78 Hard Easy Hard Hard 1 0 3
4 3961 10.00 Easy <NA> Hard Hard 1 0 2
5 4021 10.00 Easy Easy <NA> Hard 2 0 1
另一个选项使用 Rfast::rowTabulate
v <- c("Hard", "Match", "Easy", NA)
# Names for the count columns; the missing-value bucket gets the literal
# name "NA" rather than an NA_character_ column name.
count_cols <- ifelse(is.na(v), "NA", v)
DT[, (count_cols) := as.data.table(
  Rfast::rowTabulate(
    # match() turns every cell into its index in v (NA matches the NA entry);
    # matrix(..., nrow = .N) restores the one-row-per-user layout.
    matrix(match(as.matrix(.SD), v), nrow = .N),
    # Always tabulate all length(v) codes so the number of count columns
    # equals length(count_cols) even when the highest code is absent.
    max_number = length(v)
  )
), .SDcols = Task_Alpha:Task_Delta]
输出:
userID Score Task_Alpha Task_Beta Task_Charlie Task_Delta Hard Match Easy NA
1: 3108 -8.00 Easy Easy Easy Easy 0 0 4 0
2: 3207 3.00 Hard Easy Match Match 1 2 1 0
3: 3350 5.78 Hard Easy Hard Hard 3 0 1 0
4: 3961 10.00 Easy <NA> Hard Hard 2 0 1 1
5: 4021 10.00 Easy Easy <NA> Hard 1 0 2 1
来自 Wimpel 的数据:
library(data.table)
# Example data parsed from an inline, whitespace-separated text block.
DT <- fread(
  input =
"userID Score Task_Alpha Task_Beta Task_Charlie Task_Delta
3108 -8.00 Easy Easy Easy Easy
3207 3.00 Hard Easy Match Match
3350 5.78 Hard Easy Hard Hard
3961 10.00 Easy NA Hard Hard
4021 10.00 Easy Easy NA Hard
"
)
如果实际数据集很大,很想知道这种方法在实际数据集上的运行速度有多快。
编辑:添加时间
library(data.table)
# Benchmark fixture: nr users and four task columns, each sampled with
# replacement from the three states plus NA. The seed makes it repeatable.
set.seed(0L)
nr <- 1e6
v <- c("Hard", "Match", "Easy", NA)
# The columns are sampled one after another in Alpha, Beta, Charlie, Delta
# order, so the random stream is consumed in that same sequence.
DT <- as.data.table(c(
  list(userID = seq_len(nr)),
  sapply(
    paste0("Task_", c("Alpha", "Beta", "Charlie", "Delta")),
    function(unused) sample(v, nr, TRUE),
    simplify = FALSE
  )
))
# Plain data.frame copy of the same data.
df <- as.data.frame(DT)
# Base-R method: tabulate the task states row by row.
# Reads the global `df`, drops its first column and returns a matrix with
# one row per row of `df` and one column per state (Easy, Hard, Match).
mtd0 <- function() {
  state_levels <- c("Easy", "Hard", "Match")
  # Fixed levels give every row the same three counts; NA is not counted.
  count_states <- function(row_values) {
    table(factor(row_values, levels = state_levels))
  }
  per_row <- apply(df[-1L], 1L, count_states)
  # apply() returns states x rows; transpose to rows x states.
  t(per_row)
}
# data.table method: melt the global `DT` to long format, then dcast back to
# wide while counting occurrences of each value per userID.
mtd1 <- function() {
  # One row per (userID, Task_* column) pair.
  long_dt <- melt(
    DT,
    id.vars = "userID",
    measure.vars = patterns(task = "^Task_")
  )
  # One column per distinct value, holding its count for each user.
  dcast(long_dt, userID ~ value, fun.aggregate = length)
}
# Rfast method: encode every task cell of the global `DT` as its index in the
# global vector `v`, then tabulate those codes row by row.
mtd2 <- function() {
  DT[
    ,
    {
      # match() works on the flattened matrix; NA cells match the NA in v.
      codes <- match(as.matrix(.SD), v)
      # Restore the one-row-per-user layout before tabulating.
      Rfast::rowTabulate(matrix(codes, nrow = .N))
    },
    .SDcols = Task_Alpha:Task_Delta
  ]
}
# Time the three methods. check=FALSE because their results differ in shape
# (see the `result` column of the timings), so they must not be compared.
bench::mark(mtd0(), mtd1(), mtd2(), check=FALSE)
时间:
# A tibble: 3 x 13
expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time result memory time gc
<bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm> <list> <list> <list> <list>
1 mtd0() 54.7s 54.7s 0.0183 137MB 1.70 1 93 54.7s <int[,3] [1,000,000 x 3]> <df[,3] [107,168 x 3]> <bch:tm> <tibble [1 x 3]>
2 mtd1() 2.4s 2.4s 0.417 398MB 0.833 1 2 2.4s <df[,5] [1,000,000 x 5]> <df[,3] [12,517 x 3]> <bch:tm> <tibble [1 x 3]>
3 mtd2() 252.8ms 264.4ms 3.78 107MB 3.78 2 2 528.7ms <int[,4] [1,000,000 x 4]> <df[,3] [6,509 x 3]> <bch:tm> <tibble [2 x 3]>
如果您正在使用 base R,那么以下内容可能对您有所帮助:
# Append one count column per task state to df.
task_states <- c("Hard", "Match", "Easy")
# Compare only the Task_* columns; userID and Score never hold a state.
task_cols <- df[startsWith(names(df), "Task_")]
# lapply over a named vector yields a named list, which as.data.frame turns
# into one column per state for any number of rows. (sapply would collapse a
# one-row data frame into a plain vector and produce the wrong shape.)
state_counts <- lapply(
  setNames(nm = task_states),
  function(v) rowSums(task_cols == v, na.rm = TRUE)
)
df <- cbind(df, as.data.frame(state_counts))
输出:
> df
userID Score Task_Alpha Task_Beta Task_Charlie Task_Delta Hard Match Easy
1 3108 -8.00 Easy Easy Easy Easy 0 0 4
2 3207 3.00 Hard Easy Match Match 1 2 1
3 3350 5.78 Hard Easy Hard Hard 3 0 1
4 3961 10.00 Easy <NA> Hard Hard 2 0 1
5 4021 10.00 Easy Easy <NA> Hard 1 0 2