R:不使用 for 循环对大型 data.frame 进行取子集和排序
R: subsetting and ordering large data.frame without forloop
我有一个 9700 万行的长表。每一行包含某个人所采取的一个动作以及该动作的时间戳,形式如下:
# Toy data: 10 randomly drawn (who, what, when) records.
actions <- c("walk", "sleep", "run", "eat")
people <- c("John", "Paul", "Ringo", "George")
timespan <- seq(from = 1000, to = 2000, by = 1)
# Fixed seed so the sample below is reproducible.
set.seed(28100)
df.in <- data.frame(
  who = sample(people, size = 10, replace = TRUE),
  what = sample(actions, size = 10, replace = TRUE),
  when = sample(timespan, size = 10, replace = TRUE)
)
df.in
# who what when
# 1 Paul eat 1834
# 2 Paul sleep 1295
# 3 Paul eat 1312
# 4 Ringo eat 1635
# 5 John sleep 1424
# 6 George run 1092
# 7 Paul walk 1849
# 8 John run 1854
# 9 George sleep 1036
# 10 Ringo walk 1823
每个动作都可以由一个人采取或不采取,并且可以按任何顺序采取行动。
我想总结数据集中动作发生的先后顺序。具体来说,对于每个人,我想找出他第一、第二、第三和第四个采取的动作分别是什么。如果某个动作被执行了多次,我只关心它第一次出现的位置。例如,如果某人依次 run、eat、eat、run、sleep,那么我想要的汇总结果是 run、eat、sleep。
# Empty result template: a `who` column plus four action columns, all
# zero-length factors whose levels are fixed to the known people / actions.
df.out <- data.frame(
  who = factor(character(0), levels = people),
  action1 = factor(character(0), levels = actions),
  action2 = factor(character(0), levels = actions),
  action3 = factor(character(0), levels = actions),
  action4 = factor(character(0), levels = actions)
)
我可以用forloop得到我想要的:
# For each person: take their records, sort them by time, keep the distinct
# actions in order of first appearance, and append one summary row to df.out.
for (person in people) {
  person_rows <- subset(df.in, who == person)
  person_rows <- person_rows[order(person_rows$when), ]
  first_actions <- unique(person_rows$what)
  # Indexing past the end of first_actions yields NA for missing actions.
  summary_row <- data.frame(
    who = person,
    action1 = first_actions[1],
    action2 = first_actions[2],
    action3 = first_actions[3],
    action4 = first_actions[4]
  )
  df.out <- rbind(df.out, summary_row)
}
df.out
# who action1 action2 action3 action4
# 1 John sleep run <NA> <NA>
# 2 Paul sleep eat walk <NA>
# 3 Ringo eat walk <NA> <NA>
# 4 George sleep run <NA> <NA>
是否可以在没有循环的情况下以更有效的方式获得此结果?
我们可以使用 data.table 开发版本(即 v1.9.5)中的 dcast。可以从 here(原文为超链接)安装该版本:
library(data.table) # v1.9.5+
# Convert to a data.table by reference, number each person's records in
# chronological order (new column `action`), then reshape to one row per person.
setDT(df.in)
df.in[order(when), action := paste0("action", seq_len(.N)), by = who]
dcast(df.in, who ~ action, value.var = "what")
如果每个 'who' 只需要唯一的 'what':
# Sort chronologically BEFORE dropping repeats, so that for each person the
# earliest occurrence of every action is the one that is kept. (Dropping
# repeats first would keep the first occurrence in row order instead.)
first_seen <- setDT(df.in)[order(when)][, .SD[!duplicated(what)], by = who]
# Rows within each person are already in time order; number them.
first_seen[, action := paste0("action", seq_len(.N)), by = who]
dcast(first_seen, who ~ action, value.var = "what")
# who action1 action2 action3
#1: George sleep run NA
#2: John sleep run NA
#3: Paul sleep eat walk
#4: Ringo eat walk NA
或者使用.I
会快一点
# Row index of the chronologically first occurrence of each (who, what) pair.
# which.min(when) picks the earliest timestamp, so the result does not depend
# on the order in which rows happen to be stored.
ind <- setDT(df.in)[, .I[which.min(when)], by = c("who", "what")]$V1
# Number the retained rows per person in time order, then reshape.
dcast(
  df.in[ind][order(when), action := paste0("action", seq_len(.N)), by = who],
  who ~ action,
  value.var = "what"
)
或者使用 setorder 和 unique。这样可能更节省内存,因为 setorder 是通过引用对数据集重新排序的。
# Reorder df.in by reference (person, then time); unique() then keeps the first
# row of each (who, what) pair.
setorder(setDT(df.in), who, when)
earliest <- unique(df.in, by = c("who", "what"))
# Number the remaining actions per person and reshape to one row per person.
earliest[, action := paste0("action", seq_len(.N)), by = who]
dcast(earliest, who ~ action, value.var = "what")
# who action1 action2 action3
#1: George sleep run NA
#2: John sleep run NA
#3: Paul sleep eat walk
#4: Ringo eat walk NA
我看到你给问题加了 plyr 标签,不过你也可以用 dplyr 来实现。类似下面的代码应该可以工作:
library(dplyr)

# Sort by time, keep only the first occurrence of each action per person, then
# pick the 1st..4th remaining action. nth() returns NA when a person has fewer
# actions than requested (last() would instead repeat the final action).
df.in %>%
  arrange(when) %>%
  group_by(who) %>%
  filter(!duplicated(what)) %>%
  summarise(
    action1 = nth(what, 1),
    action2 = nth(what, 2),
    action3 = nth(what, 3),
    action4 = nth(what, 4)
  )
您也可以使用组合 dplyr
+ tidyr
library(dplyr)
library(tidyr)
# Rank each person's records by time and spread them into action1, action2, ...
# ties.method = "first" keeps ranks whole and distinct even when two records of
# one person share a timestamp; the default ("average") would produce
# fractional, duplicated keys such as "action1.5" and make spread() fail.
df.in %>%
  group_by(who) %>%
  mutate(
    when = rank(when, ties.method = "first"),
    when = paste0("action", when)
  ) %>%
  spread(key = when, value = what)
## who action1 action2 action3 action4
## 1 George sleep run NA NA
## 2 John sleep run NA NA
## 3 Paul sleep eat eat walk
## 4 Ringo eat walk NA NA
编辑
如果您只需要 what
列的第一次出现,您可以先过滤数据
# Sort by time first so that filter() keeps the chronologically first
# occurrence of each action per person, then rank and spread as before.
# ties.method = "first" keeps ranks whole and distinct when timestamps tie.
df.in %>%
  arrange(when) %>%
  group_by(who) %>%
  filter(!duplicated(what)) %>%
  mutate(
    when = rank(when, ties.method = "first"),
    when = paste0("action", when)
  ) %>%
  spread(key = when, value = what)
## who action1 action2 action3
## 1 George sleep run NA
## 2 John sleep run NA
## 3 Paul sleep eat walk
## 4 Ringo eat walk NA
下面是一种比较传统的 split-apply-combine 方法。它比 for 循环更符合 R 的惯用写法,不过 {dplyr} 和 {data.table} 的解决方案似乎比这类 {base} R 解决方案更常见。此方法使用了 {reshape2} 中的 dcast,但也可以改用 reshape(),从而成为纯 {base} R 的解决方案。
此方法可能不会比问题中给出的 for 循环快多少。我很想知道这里给出的三种方法在大型数据集上的表现如何。我是初学者,最近一直在学习 R 的数据操作,欢迎任何反馈。
library(reshape2)
# Split the data by person; for each person keep the first occurrence of every
# action in chronological order and label it action1, action2, ...
# NOTE: assigning to `actions` replaces the character vector of action names
# created in the question's setup code.
actions <- lapply(split(df.in, df.in$who), function(tmp) {
  tmp <- tmp[order(tmp$when), ]
  keep <- !duplicated(tmp$what)
  out <- data.frame(who = tmp$who[keep], what = tmp$what[keep])
  # seq_len() gives an empty vector for a group with no rows (e.g. an unused
  # factor level of `who`), where 1:nrow(out) would yield c(1, 0) and fail.
  out$actionNo <- paste0("action", seq_len(nrow(out)))
  out
})
# Combine the per-person results and reshape to one row per person
act_rbind <- do.call(rbind, actions)
act_cast <- dcast(act_rbind, who ~ actionNo, value.var = "what")
print(act_cast)
# who action1 action2 action3
# 1 George sleep run <NA>
# 2 John sleep run <NA>
# 3 Paul sleep eat walk
# 4 Ringo eat walk <NA>
我有一个 9700 万行的长表。每一行包含某个人所采取的一个动作以及该动作的时间戳,形式如下:
# Toy data: 10 randomly drawn (who, what, when) records.
actions <- c("walk", "sleep", "run", "eat")
people <- c("John", "Paul", "Ringo", "George")
timespan <- seq(from = 1000, to = 2000, by = 1)
# Fixed seed so the sample below is reproducible.
set.seed(28100)
df.in <- data.frame(
  who = sample(people, size = 10, replace = TRUE),
  what = sample(actions, size = 10, replace = TRUE),
  when = sample(timespan, size = 10, replace = TRUE)
)
df.in
# who what when
# 1 Paul eat 1834
# 2 Paul sleep 1295
# 3 Paul eat 1312
# 4 Ringo eat 1635
# 5 John sleep 1424
# 6 George run 1092
# 7 Paul walk 1849
# 8 John run 1854
# 9 George sleep 1036
# 10 Ringo walk 1823
每个动作都可以由一个人采取或不采取,并且可以按任何顺序采取行动。
我想总结数据集中动作发生的先后顺序。具体来说,对于每个人,我想找出他第一、第二、第三和第四个采取的动作分别是什么。如果某个动作被执行了多次,我只关心它第一次出现的位置。例如,如果某人依次 run、eat、eat、run、sleep,那么我想要的汇总结果是 run、eat、sleep。
# Empty result template: a `who` column plus four action columns, all
# zero-length factors whose levels are fixed to the known people / actions.
df.out <- data.frame(
  who = factor(character(0), levels = people),
  action1 = factor(character(0), levels = actions),
  action2 = factor(character(0), levels = actions),
  action3 = factor(character(0), levels = actions),
  action4 = factor(character(0), levels = actions)
)
我可以用forloop得到我想要的:
# For each person: take their records, sort them by time, keep the distinct
# actions in order of first appearance, and append one summary row to df.out.
for (person in people) {
  person_rows <- subset(df.in, who == person)
  person_rows <- person_rows[order(person_rows$when), ]
  first_actions <- unique(person_rows$what)
  # Indexing past the end of first_actions yields NA for missing actions.
  summary_row <- data.frame(
    who = person,
    action1 = first_actions[1],
    action2 = first_actions[2],
    action3 = first_actions[3],
    action4 = first_actions[4]
  )
  df.out <- rbind(df.out, summary_row)
}
df.out
# who action1 action2 action3 action4
# 1 John sleep run <NA> <NA>
# 2 Paul sleep eat walk <NA>
# 3 Ringo eat walk <NA> <NA>
# 4 George sleep run <NA> <NA>
是否可以在没有循环的情况下以更有效的方式获得此结果?
我们可以使用 data.table 开发版本(即 v1.9.5)中的 dcast。可以从 here(原文为超链接)安装该版本:
library(data.table) # v1.9.5+
# Convert to a data.table by reference, number each person's records in
# chronological order (new column `action`), then reshape to one row per person.
setDT(df.in)
df.in[order(when), action := paste0("action", seq_len(.N)), by = who]
dcast(df.in, who ~ action, value.var = "what")
如果每个 'who' 只需要唯一的 'what':
# Sort chronologically BEFORE dropping repeats, so that for each person the
# earliest occurrence of every action is the one that is kept. (Dropping
# repeats first would keep the first occurrence in row order instead.)
first_seen <- setDT(df.in)[order(when)][, .SD[!duplicated(what)], by = who]
# Rows within each person are already in time order; number them.
first_seen[, action := paste0("action", seq_len(.N)), by = who]
dcast(first_seen, who ~ action, value.var = "what")
# who action1 action2 action3
#1: George sleep run NA
#2: John sleep run NA
#3: Paul sleep eat walk
#4: Ringo eat walk NA
或者使用.I
会快一点
# Row index of the chronologically first occurrence of each (who, what) pair.
# which.min(when) picks the earliest timestamp, so the result does not depend
# on the order in which rows happen to be stored.
ind <- setDT(df.in)[, .I[which.min(when)], by = c("who", "what")]$V1
# Number the retained rows per person in time order, then reshape.
dcast(
  df.in[ind][order(when), action := paste0("action", seq_len(.N)), by = who],
  who ~ action,
  value.var = "what"
)
或者使用 setorder 和 unique。这样可能更节省内存,因为 setorder 是通过引用对数据集重新排序的。
# Reorder df.in by reference (person, then time); unique() then keeps the first
# row of each (who, what) pair.
setorder(setDT(df.in), who, when)
earliest <- unique(df.in, by = c("who", "what"))
# Number the remaining actions per person and reshape to one row per person.
earliest[, action := paste0("action", seq_len(.N)), by = who]
dcast(earliest, who ~ action, value.var = "what")
# who action1 action2 action3
#1: George sleep run NA
#2: John sleep run NA
#3: Paul sleep eat walk
#4: Ringo eat walk NA
我看到你给问题加了 plyr 标签,不过你也可以用 dplyr 来实现。类似下面的代码应该可以工作:
library(dplyr)

# Sort by time, keep only the first occurrence of each action per person, then
# pick the 1st..4th remaining action. nth() returns NA when a person has fewer
# actions than requested (last() would instead repeat the final action).
df.in %>%
  arrange(when) %>%
  group_by(who) %>%
  filter(!duplicated(what)) %>%
  summarise(
    action1 = nth(what, 1),
    action2 = nth(what, 2),
    action3 = nth(what, 3),
    action4 = nth(what, 4)
  )
您也可以使用组合 dplyr
+ tidyr
library(dplyr)
library(tidyr)
# Rank each person's records by time and spread them into action1, action2, ...
# ties.method = "first" keeps ranks whole and distinct even when two records of
# one person share a timestamp; the default ("average") would produce
# fractional, duplicated keys such as "action1.5" and make spread() fail.
df.in %>%
  group_by(who) %>%
  mutate(
    when = rank(when, ties.method = "first"),
    when = paste0("action", when)
  ) %>%
  spread(key = when, value = what)
## who action1 action2 action3 action4
## 1 George sleep run NA NA
## 2 John sleep run NA NA
## 3 Paul sleep eat eat walk
## 4 Ringo eat walk NA NA
编辑
如果您只需要 what
列的第一次出现,您可以先过滤数据
# Sort by time first so that filter() keeps the chronologically first
# occurrence of each action per person, then rank and spread as before.
# ties.method = "first" keeps ranks whole and distinct when timestamps tie.
df.in %>%
  arrange(when) %>%
  group_by(who) %>%
  filter(!duplicated(what)) %>%
  mutate(
    when = rank(when, ties.method = "first"),
    when = paste0("action", when)
  ) %>%
  spread(key = when, value = what)
## who action1 action2 action3
## 1 George sleep run NA
## 2 John sleep run NA
## 3 Paul sleep eat walk
## 4 Ringo eat walk NA
下面是一种比较传统的 split-apply-combine 方法。它比 for 循环更符合 R 的惯用写法,不过 {dplyr} 和 {data.table} 的解决方案似乎比这类 {base} R 解决方案更常见。此方法使用了 {reshape2} 中的 dcast,但也可以改用 reshape(),从而成为纯 {base} R 的解决方案。
此方法可能不会比问题中给出的 for 循环快多少。我很想知道这里给出的三种方法在大型数据集上的表现如何。我是初学者,最近一直在学习 R 的数据操作,欢迎任何反馈。
library(reshape2)
# Split the data by person; for each person keep the first occurrence of every
# action in chronological order and label it action1, action2, ...
# NOTE: assigning to `actions` replaces the character vector of action names
# created in the question's setup code.
actions <- lapply(split(df.in, df.in$who), function(tmp) {
  tmp <- tmp[order(tmp$when), ]
  keep <- !duplicated(tmp$what)
  out <- data.frame(who = tmp$who[keep], what = tmp$what[keep])
  # seq_len() gives an empty vector for a group with no rows (e.g. an unused
  # factor level of `who`), where 1:nrow(out) would yield c(1, 0) and fail.
  out$actionNo <- paste0("action", seq_len(nrow(out)))
  out
})
# Combine the per-person results and reshape to one row per person
act_rbind <- do.call(rbind, actions)
act_cast <- dcast(act_rbind, who ~ actionNo, value.var = "what")
print(act_cast)
# who action1 action2 action3
# 1 George sleep run <NA>
# 2 John sleep run <NA>
# 3 Paul sleep eat walk
# 4 Ringo eat walk <NA>