R中按组编码的顺序:组内的重复值
Code sequence by group in R: recurring values within group
我想根据三列的组合生成一个数字序列:ID、年份和位置。我想把一个人在同一个地方连续度过的年份按顺序编号。序列应该在位置发生变化的那一年重新开始,所以即使一个人回到了以前去过的地方,序列也应该重新从 1 开始。
带序列的 df 应如下所示:
ID yr loc seq
1 1990 A 1
1 1991 A 2
1 1992 B 1
1 1993 B 2
1 1994 B 3
2 1990 B 1
2 1991 B 2
2 1992 A 1
2 1993 B 1
2 1994 B 2
3 1990 C 1
3 1991 C 2
3 1992 C 3
3 1993 B 1
3 1994 C 1
您知道如何在 R 中编写代码吗?
在 data.table 中有 rleid 函数,用在这里就很简单了。
library(data.table)
# Convert `df` to a data.table by reference, then number the rows inside each
# (ID, run of identical `loc`) group. `rleid(loc)` yields a new run id every
# time `loc` changes, so coming back to an earlier location restarts the count.
setDT(df)
df[, seq1 := seq_len(.N), by = .(ID, rleid(loc))]
df
# ID yr loc seq seq1
# 1: 1 1990 A 1 1
# 2: 1 1991 A 2 2
# 3: 1 1992 B 1 1
# 4: 1 1993 B 2 2
# 5: 1 1994 B 3 3
# 6: 2 1990 B 1 1
# 7: 2 1991 B 2 2
# 8: 2 1992 A 1 1
# 9: 2 1993 B 1 1
#10: 2 1994 B 2 2
#11: 3 1990 C 1 1
#12: 3 1991 C 2 2
#13: 3 1992 C 3 3
#14: 3 1993 B 1 1
#15: 3 1994 C 1 1
我们也可以在 dplyr 和 base R 的方法中使用 rleid 来得到预期的输出。
library(dplyr)
# Number consecutive years spent at the same location, per ID.
# `data.table::rleid(loc)` assigns one id per run of identical `loc` values,
# so grouping by (ID, run id) restarts the counter whenever the location
# changes, including when a person returns to an earlier location.
df %>%
  group_by(ID, grp = data.table::rleid(loc)) %>%
  mutate(seq1 = row_number()) %>%
  # Remove the grouping and the helper column so the result contains only the
  # original columns plus `seq1` and later verbs are not silently grouped.
  ungroup() %>%
  select(-grp)
或者使用 base R:
# Base R `ave()` applies `seq_along` within every combination of ID and
# run id (one id per run of identical `loc`, from `data.table::rleid`),
# producing a 1, 2, 3, ... counter that restarts whenever `loc` changes.
df$seq1 <- ave(
  df$yr,
  df$ID,
  data.table::rleid(df$loc),
  FUN = seq_along
)
@chinsoon12 建议的一个更简洁的做法是使用 rowid 函数。
# `rowid()` returns the running row number within each combination of its
# arguments, here ID and the run id of `loc`, and `:=` stores it as `seq2`
# by reference.
setDT(df)
df[, seq2 := rowid(ID, rleid(loc))]
数据
# Example data: three people (ID 1-3), one row per year from 1990 to 1994.
# `loc` is a factor with levels A, B, C; `seq` holds the expected answer,
# i.e. the running count of consecutive years at the same location.
df <- data.frame(
  ID = rep(1:3, each = 5L),
  yr = rep(1990:1994, times = 3L),
  loc = factor(
    c("A", "A", "B", "B", "B",
      "B", "B", "A", "B", "B",
      "C", "C", "C", "B", "C"),
    levels = c("A", "B", "C")
  ),
  seq = c(1L, 2L, 1L, 2L, 3L,
          1L, 2L, 1L, 1L, 2L,
          1L, 2L, 3L, 1L, 1L)
)
一种使用 dplyr 的方法:
library(dplyr)
# Pure-dplyr variant: build a run id by counting how many times `loc` differs
# from the previous row (the first row is compared with itself, so it never
# counts as a change), then number the rows within each (ID, run id) group.
df %>%
  mutate(idx = cumsum(loc != lag(loc, default = first(loc)))) %>%
  group_by(ID, idx) %>%
  mutate(seq = row_number()) %>%
  ungroup() %>%
  select(-idx)
输出:
# A tibble: 15 x 4
ID yr loc seq
<int> <int> <fct> <int>
1 1 1990 A 1
2 1 1991 A 2
3 1 1992 B 1
4 1 1993 B 2
5 1 1994 B 3
6 2 1990 B 1
7 2 1991 B 2
8 2 1992 A 1
9 2 1993 B 1
10 2 1994 B 2
11 3 1990 C 1
12 3 1991 C 2
13 3 1992 C 3
14 3 1993 B 1
15 3 1994 C 1
在 base R 中,我们可以使用 rle 和 ave:
# Base-R only: `rle()` finds the runs of identical `loc` values; repeating each
# run's index by its length gives a per-row run id. `ave()` then numbers the
# rows within every (ID, run id) combination.
df$seq <- local({
  loc_runs <- rle(as.character(df$loc))
  run_id <- rep(seq_along(loc_runs$values), loc_runs$lengths)
  ave(df$yr, df$ID, run_id, FUN = seq_along)
})
df$seq
#[1] 1 2 1 2 3 1 2 1 1 2 1 2 3 1 1
数据
# Example data: three people (ID 1-3), one row per year from 1990 to 1994.
# `loc` is a factor with levels A, B, C; `seq` holds the expected answer,
# i.e. the running count of consecutive years at the same location.
df <- data.frame(
  ID = rep(1:3, each = 5L),
  yr = rep(1990:1994, times = 3L),
  loc = factor(
    c("A", "A", "B", "B", "B",
      "B", "B", "A", "B", "B",
      "C", "C", "C", "B", "C"),
    levels = c("A", "B", "C")
  ),
  seq = c(1L, 2L, 1L, 2L, 3L,
          1L, 2L, 1L, 1L, 2L,
          1L, 2L, 3L, 1L, 1L)
)
我想根据三列的组合生成一个数字序列:ID、年份和位置。我想把一个人在同一个地方连续度过的年份按顺序编号。序列应该在位置发生变化的那一年重新开始,所以即使一个人回到了以前去过的地方,序列也应该重新从 1 开始。
带序列的 df 应如下所示:
ID yr loc seq
1 1990 A 1
1 1991 A 2
1 1992 B 1
1 1993 B 2
1 1994 B 3
2 1990 B 1
2 1991 B 2
2 1992 A 1
2 1993 B 1
2 1994 B 2
3 1990 C 1
3 1991 C 2
3 1992 C 3
3 1993 B 1
3 1994 C 1
您知道如何在 R 中编写代码吗?
在 data.table 中有 rleid 函数,用在这里就很简单了。
library(data.table)
# Convert `df` to a data.table by reference, then number the rows inside each
# (ID, run of identical `loc`) group. `rleid(loc)` yields a new run id every
# time `loc` changes, so coming back to an earlier location restarts the count.
setDT(df)
df[, seq1 := seq_len(.N), by = .(ID, rleid(loc))]
df
# ID yr loc seq seq1
# 1: 1 1990 A 1 1
# 2: 1 1991 A 2 2
# 3: 1 1992 B 1 1
# 4: 1 1993 B 2 2
# 5: 1 1994 B 3 3
# 6: 2 1990 B 1 1
# 7: 2 1991 B 2 2
# 8: 2 1992 A 1 1
# 9: 2 1993 B 1 1
#10: 2 1994 B 2 2
#11: 3 1990 C 1 1
#12: 3 1991 C 2 2
#13: 3 1992 C 3 3
#14: 3 1993 B 1 1
#15: 3 1994 C 1 1
我们也可以在 dplyr 和 base R 的方法中使用 rleid 来得到预期的输出。
library(dplyr)
# Number consecutive years spent at the same location, per ID.
# `data.table::rleid(loc)` assigns one id per run of identical `loc` values,
# so grouping by (ID, run id) restarts the counter whenever the location
# changes, including when a person returns to an earlier location.
df %>%
  group_by(ID, grp = data.table::rleid(loc)) %>%
  mutate(seq1 = row_number()) %>%
  # Remove the grouping and the helper column so the result contains only the
  # original columns plus `seq1` and later verbs are not silently grouped.
  ungroup() %>%
  select(-grp)
或者使用 base R:
# Base R `ave()` applies `seq_along` within every combination of ID and
# run id (one id per run of identical `loc`, from `data.table::rleid`),
# producing a 1, 2, 3, ... counter that restarts whenever `loc` changes.
df$seq1 <- ave(
  df$yr,
  df$ID,
  data.table::rleid(df$loc),
  FUN = seq_along
)
@chinsoon12 建议的一个更简洁的做法是使用 rowid 函数。
# `rowid()` returns the running row number within each combination of its
# arguments, here ID and the run id of `loc`, and `:=` stores it as `seq2`
# by reference.
setDT(df)
df[, seq2 := rowid(ID, rleid(loc))]
数据
# Example data: three people (ID 1-3), one row per year from 1990 to 1994.
# `loc` is a factor with levels A, B, C; `seq` holds the expected answer,
# i.e. the running count of consecutive years at the same location.
df <- data.frame(
  ID = rep(1:3, each = 5L),
  yr = rep(1990:1994, times = 3L),
  loc = factor(
    c("A", "A", "B", "B", "B",
      "B", "B", "A", "B", "B",
      "C", "C", "C", "B", "C"),
    levels = c("A", "B", "C")
  ),
  seq = c(1L, 2L, 1L, 2L, 3L,
          1L, 2L, 1L, 1L, 2L,
          1L, 2L, 3L, 1L, 1L)
)
一种使用 dplyr 的方法:
library(dplyr)
# Pure-dplyr variant: build a run id by counting how many times `loc` differs
# from the previous row (the first row is compared with itself, so it never
# counts as a change), then number the rows within each (ID, run id) group.
df %>%
  mutate(idx = cumsum(loc != lag(loc, default = first(loc)))) %>%
  group_by(ID, idx) %>%
  mutate(seq = row_number()) %>%
  ungroup() %>%
  select(-idx)
输出:
# A tibble: 15 x 4
ID yr loc seq
<int> <int> <fct> <int>
1 1 1990 A 1
2 1 1991 A 2
3 1 1992 B 1
4 1 1993 B 2
5 1 1994 B 3
6 2 1990 B 1
7 2 1991 B 2
8 2 1992 A 1
9 2 1993 B 1
10 2 1994 B 2
11 3 1990 C 1
12 3 1991 C 2
13 3 1992 C 3
14 3 1993 B 1
15 3 1994 C 1
在 base R 中,我们可以使用 rle 和 ave:
# Base-R only: `rle()` finds the runs of identical `loc` values; repeating each
# run's index by its length gives a per-row run id. `ave()` then numbers the
# rows within every (ID, run id) combination.
df$seq <- local({
  loc_runs <- rle(as.character(df$loc))
  run_id <- rep(seq_along(loc_runs$values), loc_runs$lengths)
  ave(df$yr, df$ID, run_id, FUN = seq_along)
})
df$seq
#[1] 1 2 1 2 3 1 2 1 1 2 1 2 3 1 1
数据
# Example data: three people (ID 1-3), one row per year from 1990 to 1994.
# `loc` is a factor with levels A, B, C; `seq` holds the expected answer,
# i.e. the running count of consecutive years at the same location.
df <- data.frame(
  ID = rep(1:3, each = 5L),
  yr = rep(1990:1994, times = 3L),
  loc = factor(
    c("A", "A", "B", "B", "B",
      "B", "B", "A", "B", "B",
      "C", "C", "C", "B", "C"),
    levels = c("A", "B", "C")
  ),
  seq = c(1L, 2L, 1L, 2L, 3L,
          1L, 2L, 1L, 1L, 2L,
          1L, 2L, 3L, 1L, 1L)
)