计算 R 中的序列
counts sequences in R
id random count
a 0 -1
a 1 1
a 1 2
a 0 -1
a 0 -2
a 1 1
a 0 -1
a 1 1
a 0 -1
b 0 -1
b 0 -2
b 1 1
b 0 -1
b 1 1
b 0 -1
b 0 -2
b 0 -3
id 是玩家,random 是二进制值 0 或 1。我想创建一个计数列 count,按玩家计算连续 1 和连续 0 的序列长度(0 的序列记为负数),最好不要使用循环,因为数据库很大。
这是一个 dplyr 解决方案:
# Number each run of identical consecutive `random` values (idx), then count
# within every (id, idx) run; for 0/1 data the count goes up from 1 on runs
# of 1s and down from -1 on runs of 0s. The helper column idx is dropped.
dat %>%
  transform(
    idx = c(0, cumsum(tail(random, -1L) != head(random, -1L)))
  ) %>%
  group_by(id, idx) %>%
  mutate(count = cumsum(random == 1) - 1 * cumsum(random == 0)) %>%
  ungroup() %>%
  select(-idx)
Source: local data frame [17 x 3]
id random count
1 a 0 -1
2 a 1 1
3 a 1 2
4 a 0 -1
5 a 0 -2
6 a 1 1
7 a 0 -1
8 a 1 1
9 a 0 -1
10 b 0 -1
11 b 0 -2
12 b 1 1
13 b 0 -1
14 b 1 1
15 b 0 -1
16 b 0 -2
17 b 0 -3
我想这就是您要找的:
library(data.table)
# Convert DF to a data.table in place, then number the rows inside each
# (id, run of equal `random`) group, starting at 1.
setDT(DF)
DF[, count := seq_len(.N), by = list(id, rleid(random))]
这给出了
id random count
1: a 0 1
2: a 1 1
3: a 1 2
4: a 0 1
5: a 0 2
6: a 1 1
7: a 0 1
8: a 1 1
9: a 0 1
10: b 0 1
11: b 0 2
12: b 1 1
13: b 0 1
14: b 1 1
15: b 0 1
16: b 0 2
17: b 0 3
(在 data.table 包的下一个版本 1.9.8 中,会有一个小快捷方式:setDT(DF)[, count := rowid(rleid(random)), by=id]。我做这个笔记,以便稍后更新答案。)
您可能还需要各连续段(run)的标识符:
# Run identifier of `random`, computed separately within each id.
DF[, rid := rleid(random), by = "id"]
这给出
id random count rid
1: a 0 1 1
2: a 1 1 2
3: a 1 2 2
4: a 0 1 3
5: a 0 2 3
6: a 1 1 4
7: a 0 1 5
8: a 1 1 6
9: a 0 1 7
10: b 0 1 1
11: b 0 2 1
12: b 1 1 2
13: b 0 1 3
14: b 1 1 4
15: b 0 1 5
16: b 0 2 5
17: b 0 3 5
如果您通读该包的入门资料(introductory materials),您会发现这些变量也可以一步创建。
我认为实现此目的的最简单方法是 runner 包中的 streak_run 函数。streak_run 也是最快的,如下面的基准测试部分所示。
解决方案
library(runner)
# Toy data: 10 rows with a randomly drawn 0/1 column (the draw changes on
# every run unless a seed is set).
df <- data.frame(id = 1:10, random = sample(c(0, 1), 10, replace = TRUE))
# streak_run() yields, at each position, the length of the current run.
df$count <- streak_run(df$random)
# Flip the sign on the rows where random is 0 so those runs count downwards.
df$count[df$random == 0] <- -df$count[df$random == 0]
df
# id random count
#1 1 0 -1
#2 2 0 -2
#3 3 1 1
#4 4 1 2
#5 5 1 3
#6 6 1 4
#7 7 0 -1
#8 8 0 -2
#9 9 0 -3
#10 10 0 -4
基准测试
# Signed streak counter built on streak_run(): the streak length of `random`
# is stored in `count`, negated on the rows where `random` equals 0.
# Returns the data frame with the added `count` column.
runner_example <- function(df) {
  streak <- streak_run(df$random)
  is_zero <- df$random == 0
  streak[is_zero] <- -streak[is_zero]
  df$count <- streak
  df
}
# dplyr variant of the signed run counter.
#
# Tags every run of identical consecutive `random` values with an index,
# groups by (id, idx) and accumulates +1 per row where random == 1 and -1 per
# row where random == 0. Returns the transformed table with the helper column
# removed. (The previous version discarded the pipeline result and returned
# the untouched input.)
dplyr_example <- function(df) {
  df %>%
    transform(
      idx = c(0, cumsum(random[-1L] != random[-length(random)]))
    ) %>%
    group_by(id, idx) %>%
    mutate(count = -1 * cumsum(random == 0) + cumsum(random == 1)) %>%
    ungroup() %>%
    select(-idx)
}
# data.table variant: numbers the rows inside each (id, run of equal `random`)
# group, starting at 1, and returns the result as a data.table.
#
# Works on a copy: setDT() would convert the caller's data frame by reference,
# so the added `count` column would leak into every later use of that object
# (e.g. the other functions timed in the same benchmark).
dt_example <- function(df) {
  dt <- as.data.table(df)
  dt[, count := seq_len(.N), by = .(id, rleid(random))]
  dt
}
library(dplyr);library(data.table)
library(microbenchmark); library(magrittr)
# Benchmark input: 2000 rows, each with its own id. NOTE: `random` holds the
# letters "a"/"b" here, so the `random == 0` / `random == 1` comparisons used
# by the example functions never match on this input.
df <- data.frame(
  id = 1:2000L,
  random = sample(letters[1:2], 2000L, replace = TRUE)
)
microbenchmark(
  dplyr = dplyr_example(df),
  dt = dt_example(df),
  runner = runner_example(df),
  times = 100
)
#Unit: microseconds
# expr min lq mean median uq max neval
# dplyr 134388.839 164274.611 204478.048 188548.4975 222777.298 526019.563 100
# dt 1306.139 1710.665 2181.989 1941.3420 2380.953 5581.682 100
# runner 284.522 741.145 1022.456 853.5715 1004.553 7398.019 100
id random count
a 0 -1
a 1 1
a 1 2
a 0 -1
a 0 -2
a 1 1
a 0 -1
a 1 1
a 0 -1
b 0 -1
b 0 -2
b 1 1
b 0 -1
b 1 1
b 0 -1
b 0 -2
b 0 -3
id 是玩家,random 是二进制值 0 或 1。我想创建一个计数列 count,按玩家计算连续 1 和连续 0 的序列长度(0 的序列记为负数),最好不要使用循环,因为数据库很大。
这是一个 dplyr 解决方案:
# Number each run of identical consecutive `random` values (idx), then count
# within every (id, idx) run; for 0/1 data the count goes up from 1 on runs
# of 1s and down from -1 on runs of 0s. The helper column idx is dropped.
dat %>%
  transform(
    idx = c(0, cumsum(tail(random, -1L) != head(random, -1L)))
  ) %>%
  group_by(id, idx) %>%
  mutate(count = cumsum(random == 1) - 1 * cumsum(random == 0)) %>%
  ungroup() %>%
  select(-idx)
Source: local data frame [17 x 3]
id random count
1 a 0 -1
2 a 1 1
3 a 1 2
4 a 0 -1
5 a 0 -2
6 a 1 1
7 a 0 -1
8 a 1 1
9 a 0 -1
10 b 0 -1
11 b 0 -2
12 b 1 1
13 b 0 -1
14 b 1 1
15 b 0 -1
16 b 0 -2
17 b 0 -3
我想这就是您要找的:
library(data.table)
# Convert DF to a data.table in place, then number the rows inside each
# (id, run of equal `random`) group, starting at 1.
setDT(DF)
DF[, count := seq_len(.N), by = list(id, rleid(random))]
这给出了
id random count
1: a 0 1
2: a 1 1
3: a 1 2
4: a 0 1
5: a 0 2
6: a 1 1
7: a 0 1
8: a 1 1
9: a 0 1
10: b 0 1
11: b 0 2
12: b 1 1
13: b 0 1
14: b 1 1
15: b 0 1
16: b 0 2
17: b 0 3
(在 data.table 包的下一个版本 1.9.8 中,会有一个小快捷方式:setDT(DF)[, count := rowid(rleid(random)), by=id]。我做这个笔记,以便稍后更新答案。)
您可能还需要各连续段(run)的标识符:
# Run identifier of `random`, computed separately within each id.
DF[, rid := rleid(random), by = "id"]
这给出
id random count rid
1: a 0 1 1
2: a 1 1 2
3: a 1 2 2
4: a 0 1 3
5: a 0 2 3
6: a 1 1 4
7: a 0 1 5
8: a 1 1 6
9: a 0 1 7
10: b 0 1 1
11: b 0 2 1
12: b 1 1 2
13: b 0 1 3
14: b 1 1 4
15: b 0 1 5
16: b 0 2 5
17: b 0 3 5
如果您通读该包的入门资料(introductory materials),您会发现这些变量也可以一步创建。
我认为实现此目的的最简单方法是 runner 包中的 streak_run 函数。streak_run 也是最快的,如下面的基准测试部分所示。
解决方案
library(runner)
# Toy data: 10 rows with a randomly drawn 0/1 column (the draw changes on
# every run unless a seed is set).
df <- data.frame(id = 1:10, random = sample(c(0, 1), 10, replace = TRUE))
# streak_run() yields, at each position, the length of the current run.
df$count <- streak_run(df$random)
# Flip the sign on the rows where random is 0 so those runs count downwards.
df$count[df$random == 0] <- -df$count[df$random == 0]
df
# id random count
#1 1 0 -1
#2 2 0 -2
#3 3 1 1
#4 4 1 2
#5 5 1 3
#6 6 1 4
#7 7 0 -1
#8 8 0 -2
#9 9 0 -3
#10 10 0 -4
基准测试
# Signed streak counter built on streak_run(): the streak length of `random`
# is stored in `count`, negated on the rows where `random` equals 0.
# Returns the data frame with the added `count` column.
runner_example <- function(df) {
  streak <- streak_run(df$random)
  is_zero <- df$random == 0
  streak[is_zero] <- -streak[is_zero]
  df$count <- streak
  df
}
# dplyr variant of the signed run counter.
#
# Tags every run of identical consecutive `random` values with an index,
# groups by (id, idx) and accumulates +1 per row where random == 1 and -1 per
# row where random == 0. Returns the transformed table with the helper column
# removed. (The previous version discarded the pipeline result and returned
# the untouched input.)
dplyr_example <- function(df) {
  df %>%
    transform(
      idx = c(0, cumsum(random[-1L] != random[-length(random)]))
    ) %>%
    group_by(id, idx) %>%
    mutate(count = -1 * cumsum(random == 0) + cumsum(random == 1)) %>%
    ungroup() %>%
    select(-idx)
}
# data.table variant: numbers the rows inside each (id, run of equal `random`)
# group, starting at 1, and returns the result as a data.table.
#
# Works on a copy: setDT() would convert the caller's data frame by reference,
# so the added `count` column would leak into every later use of that object
# (e.g. the other functions timed in the same benchmark).
dt_example <- function(df) {
  dt <- as.data.table(df)
  dt[, count := seq_len(.N), by = .(id, rleid(random))]
  dt
}
library(dplyr);library(data.table)
library(microbenchmark); library(magrittr)
# Benchmark input: 2000 rows, each with its own id. NOTE: `random` holds the
# letters "a"/"b" here, so the `random == 0` / `random == 1` comparisons used
# by the example functions never match on this input.
df <- data.frame(
  id = 1:2000L,
  random = sample(letters[1:2], 2000L, replace = TRUE)
)
microbenchmark(
  dplyr = dplyr_example(df),
  dt = dt_example(df),
  runner = runner_example(df),
  times = 100
)
#Unit: microseconds
# expr min lq mean median uq max neval
# dplyr 134388.839 164274.611 204478.048 188548.4975 222777.298 526019.563 100
# dt 1306.139 1710.665 2181.989 1941.3420 2380.953 5581.682 100
# runner 284.522 741.145 1022.456 853.5715 1004.553 7398.019 100