计算 data.frame 中的后继组合
Counting successor combinations in a data.frame
我得到一个 data.frame
看起来像下面的:
OBJECT ID TASK
1 A
1 C
1 D
1 E
2 A
2 B
2 C
2 D
2 F
现在我想计算 data.frame
中唯一的连续组合以获得以下结果:
PREDECESSOR SUCCESSOR COUNT
A C 1
C D 2
D E 1
A B 1
B C 1
D F 1
我已经想出了借助两个 for
循环提取连续值的方法,但是我不知道如何在新的 data.frame
(或 list
)中统计这些值的出现次数。
使用 data.table
的解决方案:
代码:
# Count successive (PREDECESSOR, SUCCESSOR) task pairs with data.table.
library(data.table)
# Convert df from a data.frame to a data.table.
setDT(df)
# New column TASK0: the TASK from the previous row of the same OBJECT
# (NA on the first row of each OBJECT, which has no predecessor).
df[, TASK0 := shift(TASK), OBJECT]
# Drop rows without a predecessor, count rows per (TASK, TASK0) pair, then sum
# those counts under the output names PREDECESSOR / SUCCESSOR / COUNT.
df[!is.na(TASK0), .N, .(TASK, TASK0)][, .(
COUNT = sum(N)), .(PREDECESSOR = TASK0, SUCCESSOR = TASK)]
结果:
PREDECESSOR SUCCESSOR COUNT
1: A C 1
2: C D 2
3: D E 1
4: A B 1
5: B C 1
6: D F 1
解释:
setDT(df)
: 把data.frame变成一个data.table对象
[, TASK0 := shift(TASK), OBJECT]
:获取每个 OBJECT
的前一个字母
!is.na(TASK0)
:去掉每个 OBJECT
的第一行(他们没有 PREDECESSOR
)
.N, .(TASK, TASK0)
:计算 TASK
和 TASK0
(前面的字母组合) 的出现次数
sum(N)
:求和计数
数据(df
):
# Sample data for the data.table solution: nine ordered task rows, four for
# OBJECT 1 and five for OBJECT 2.
# Bound to `df` so that the snippet calling setDT(df) has its input; without
# the assignment, `df` would resolve to the stats::df function.
df <- structure(
  list(
    OBJECT = c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L),
    TASK = c("A", "C", "D", "E", "A", "B", "C", "D", "F")
  ),
  row.names = c(NA, -9L),
  class = c("data.table", "data.frame")
)
# Pair every TASK with the TASK on the next row (OBJECT.ID is ignored here, so
# the last task of one object is paired with the first task of the next), then
# count how often each (PREDECESSOR, SUCCESSOR) pair occurs.
aggregate(
  COUNT ~ .,
  data = data.frame(
    PREDECESSOR = head(df1$TASK, n = -1),
    SUCCESSOR = tail(df1$TASK, n = -1),
    COUNT = 1
  ),
  FUN = length
)
# PREDECESSOR SUCCESSOR COUNT
#1 E A 1
#2 A B 1
#3 A C 1
#4 B C 1
#5 C D 2
#6 D E 1
#7 D F 1
即使您想先 split
OBJECT.ID
,也可以使用类似的方法
# Count successive TASK pairs separately within each OBJECT.ID, so the last
# task of one object is never paired with the first task of the next object.
# `temp` holds one row per (object, PREDECESSOR, SUCCESSOR) with its COUNT.
temp <- do.call(rbind, lapply(split(df1, df1$OBJECT.ID), function(X) {
  aggregate(
    COUNT ~ .,
    data = data.frame(
      PREDECESSOR = head(X$TASK, -1),
      SUCCESSOR = tail(X$TASK, -1),
      COUNT = 1
    ),
    FUN = length
  )
}))
# Combine the per-object counts. The COUNT values must be added (sum), not the
# rows counted (length): a pair that repeats inside one object already has
# COUNT > 1 in `temp`, and length would report it only once per object.
aggregate(COUNT ~ ., data = temp, FUN = sum)
# PREDECESSOR SUCCESSOR COUNT
#1 A C 1
#2 B C 1
#3 C D 2
#4 D E 1
#5 A B 1
#6 D F 1
数据
# Example input: nine ordered task rows, four for object 1 and five for object 2.
df1 <- data.frame(
  OBJECT.ID = rep(c(1L, 2L), times = c(4L, 5L)),
  TASK = c("A", "C", "D", "E", "A", "B", "C", "D", "F"),
  stringsAsFactors = FALSE
)
为了获得计数,您可以使用以下两行:
# Column 1 is each TASK, column 2 is the TASK on the following row; the final
# row has no successor, so it is paired with the sentinel "LAST".
cc <- cbind(df$TASK, c(df$TASK[-1], "LAST"))
# Use the whole second column: cc[2] is a single matrix element and would be
# recycled as the successor of every row.
table(paste(cc[, 1], cc[, 2], sep = "-"))
结果是
A-B A-C B-C C-D D-E D-F E-A F-LAST
1 1 1 2 1 1 1 1
我得到一个 data.frame
看起来像下面的:
OBJECT ID TASK
1 A
1 C
1 D
1 E
2 A
2 B
2 C
2 D
2 F
现在我想计算 data.frame
中唯一的连续组合以获得以下结果:
PREDECESSOR SUCCESSOR COUNT
A C 1
C D 2
D E 1
A B 1
B C 1
D F 1
我已经想出了借助两个 for
循环提取连续值的方法,但是我不知道如何在新的 data.frame
(或 list
)中统计这些值的出现次数。
使用 data.table
的解决方案:
代码:
# Count successive (PREDECESSOR, SUCCESSOR) task pairs with data.table.
library(data.table)
# Convert df from a data.frame to a data.table.
setDT(df)
# New column TASK0: the TASK from the previous row of the same OBJECT
# (NA on the first row of each OBJECT, which has no predecessor).
df[, TASK0 := shift(TASK), OBJECT]
# Drop rows without a predecessor, count rows per (TASK, TASK0) pair, then sum
# those counts under the output names PREDECESSOR / SUCCESSOR / COUNT.
df[!is.na(TASK0), .N, .(TASK, TASK0)][, .(
COUNT = sum(N)), .(PREDECESSOR = TASK0, SUCCESSOR = TASK)]
结果:
PREDECESSOR SUCCESSOR COUNT
1: A C 1
2: C D 2
3: D E 1
4: A B 1
5: B C 1
6: D F 1
解释:
setDT(df)
:把 data.frame 变成一个 data.table 对象
[, TASK0 := shift(TASK), OBJECT]
:获取每个 OBJECT 内的前一个字母
!is.na(TASK0)
:去掉每个 OBJECT 的第一行(它们没有 PREDECESSOR)
.N, .(TASK, TASK0)
:计算 TASK 和 TASK0(与前一个字母的组合)的出现次数
sum(N)
:对计数求和
数据(df
):
# Sample data for the data.table solution: nine ordered task rows, four for
# OBJECT 1 and five for OBJECT 2.
# Bound to `df` so that the snippet calling setDT(df) has its input; without
# the assignment, `df` would resolve to the stats::df function.
df <- structure(
  list(
    OBJECT = c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L),
    TASK = c("A", "C", "D", "E", "A", "B", "C", "D", "F")
  ),
  row.names = c(NA, -9L),
  class = c("data.table", "data.frame")
)
# Pair every TASK with the TASK on the next row (OBJECT.ID is ignored here, so
# the last task of one object is paired with the first task of the next), then
# count how often each (PREDECESSOR, SUCCESSOR) pair occurs.
aggregate(
  COUNT ~ .,
  data = data.frame(
    PREDECESSOR = head(df1$TASK, n = -1),
    SUCCESSOR = tail(df1$TASK, n = -1),
    COUNT = 1
  ),
  FUN = length
)
# PREDECESSOR SUCCESSOR COUNT
#1 E A 1
#2 A B 1
#3 A C 1
#4 B C 1
#5 C D 2
#6 D E 1
#7 D F 1
即使您想先按 OBJECT.ID
进行 split
,也可以使用类似的方法
# Count successive TASK pairs separately within each OBJECT.ID, so the last
# task of one object is never paired with the first task of the next object.
# `temp` holds one row per (object, PREDECESSOR, SUCCESSOR) with its COUNT.
temp <- do.call(rbind, lapply(split(df1, df1$OBJECT.ID), function(X) {
  aggregate(
    COUNT ~ .,
    data = data.frame(
      PREDECESSOR = head(X$TASK, -1),
      SUCCESSOR = tail(X$TASK, -1),
      COUNT = 1
    ),
    FUN = length
  )
}))
# Combine the per-object counts. The COUNT values must be added (sum), not the
# rows counted (length): a pair that repeats inside one object already has
# COUNT > 1 in `temp`, and length would report it only once per object.
aggregate(COUNT ~ ., data = temp, FUN = sum)
# PREDECESSOR SUCCESSOR COUNT
#1 A C 1
#2 B C 1
#3 C D 2
#4 D E 1
#5 A B 1
#6 D F 1
数据
# Example input: nine ordered task rows, four for object 1 and five for object 2.
df1 <- data.frame(
  OBJECT.ID = rep(c(1L, 2L), times = c(4L, 5L)),
  TASK = c("A", "C", "D", "E", "A", "B", "C", "D", "F"),
  stringsAsFactors = FALSE
)
为了获得计数,您可以使用以下两行:
# Column 1 is each TASK, column 2 is the TASK on the following row; the final
# row has no successor, so it is paired with the sentinel "LAST".
cc <- cbind(df$TASK, c(df$TASK[-1], "LAST"))
# Use the whole second column: cc[2] is a single matrix element and would be
# recycled as the successor of every row.
table(paste(cc[, 1], cc[, 2], sep = "-"))
结果是
A-B A-C B-C C-D D-E D-F E-A F-LAST
1 1 1 2 1 1 1 1