创建采用序列中另一个变量值的变量
Create variable that takes value of another variable within a sequence
我有一个类似于下面示例的数据集。我想创建一个变量(即 Submission_date.last),它根据另一个变量(即 Repeat)所标记的序列,取第三个变量(即 Submission_date)在该序列中的最后一个值,其中 Repeat 为 0 表示开始一个新序列。
我有下面的代码,但不知道接下来该怎么写。感谢任何帮助,谢谢!
# Attempt from the question: group rows by ID, sort by Submission_date, then
# add Submission_date.last. The index inside Submission_date[] is left blank;
# filling it in is the open question.
# NOTE(review): the sample data stores Submission_date as dd/mm/yy character
# strings, so arrange() presumably orders them as text, not chronologically
# -- confirm before relying on this sort.
df %>%
group_by(ID) %>%
arrange(Submission_date) %>%
mutate(Submission_date.last = Submission_date[]) # I'm not sure what to put on this line to correctly create the variable?
ID Submission_date Repeat Submission_date.last
1 25/07/19 0 31/07/19
1 30/07/19 1 31/07/19
1 31/07/19 2 31/07/19
1 11/08/20 0 14/08/20
1 14/08/20 1 14/08/20
2 30/07/19 0 30/07/19
2 31/08/20 0 31/08/20
3 13/09/20 0 15/09/20
3 15/09/20 1 15/09/20
3 18/07/21 0 22/07/21
3 21/07/21 1 22/07/21
3 22/07/21 2 22/07/21
每当 Repeat == 0 时开始一个新组,并为每个组取最后一个(last)Submission_date。
# Give every row a run id that increases by one at each Repeat == 0, group by
# ID plus that run id, and copy the run's final Submission_date onto each row.
# The helper column grp is dropped again at the end.
library(dplyr)
df %>%
  mutate(grp = cumsum(Repeat == 0)) %>%
  group_by(ID, grp) %>%
  mutate(Submission_date.last = Submission_date[n()]) %>%
  ungroup() %>%
  select(-grp)
# ID Submission_date Repeat Submission_date.last
# <int> <chr> <int> <chr>
# 1 1 25/07/19 0 31/07/19
# 2 1 30/07/19 1 31/07/19
# 3 1 31/07/19 2 31/07/19
# 4 1 11/08/20 0 14/08/20
# 5 1 14/08/20 1 14/08/20
# 6 2 30/07/19 0 30/07/19
# 7 2 31/08/20 0 31/08/20
# 8 3 13/09/20 0 15/09/20
# 9 3 15/09/20 1 15/09/20
#10 3 18/07/21 0 22/07/21
#11 3 21/07/21 1 22/07/21
#12 3 22/07/21 2 22/07/21
数据
如果您以可重现的格式提供数据,就更容易提供帮助。
# Reproducible sample data: 12 rows across three IDs. Submission_date is kept
# as dd/mm/yy text, and ID and Repeat are integers.
df <- data.frame(
  ID = rep(1:3, times = c(5L, 2L, 5L)),
  Submission_date = c(
    "25/07/19", "30/07/19", "31/07/19", "11/08/20", "14/08/20",
    "30/07/19", "31/08/20",
    "13/09/20", "15/09/20", "18/07/21", "21/07/21", "22/07/21"
  ),
  Repeat = c(0L, 1L, 2L, 0L, 1L, 0L, 0L, 0L, 1L, 0L, 1L, 2L),
  stringsAsFactors = FALSE
)
我们可以使用 data.table:
# data.table version: convert df with setDT(), then add Submission_date.last
# with :=, taking the last Submission_date of each group. A group is one ID
# combined with a run counter that steps up at every Repeat == 0.
library(data.table)
setDT(df)
df[
  ,
  Submission_date.last := Submission_date[.N],
  by = .(ID, cumsum(Repeat == 0))
]
输出
> df
ID Submission_date Repeat Submission_date.last
1: 1 25/07/19 0 31/07/19
2: 1 30/07/19 1 31/07/19
3: 1 31/07/19 2 31/07/19
4: 1 11/08/20 0 14/08/20
5: 1 14/08/20 1 14/08/20
6: 2 30/07/19 0 30/07/19
7: 2 31/08/20 0 31/08/20
8: 3 13/09/20 0 15/09/20
9: 3 15/09/20 1 15/09/20
10: 3 18/07/21 0 22/07/21
11: 3 21/07/21 1 22/07/21
12: 3 22/07/21 2 22/07/21
数据
# Sample data used by the data.table answer: 12 rows across three IDs, with
# Submission_date as dd/mm/yy text and integer ID / Repeat columns.
df <- data.frame(
  ID = c(rep(1L, 5L), rep(2L, 2L), rep(3L, 5L)),
  Submission_date = c(
    "25/07/19", "30/07/19", "31/07/19", "11/08/20", "14/08/20",
    "30/07/19", "31/08/20",
    "13/09/20", "15/09/20", "18/07/21", "21/07/21", "22/07/21"
  ),
  Repeat = c(0L, 1L, 2L, 0L, 1L, 0L, 0L, 0L, 1L, 0L, 1L, 2L),
  stringsAsFactors = FALSE
)
我有一个类似于下面示例的数据集。我想创建一个变量(即 Submission_date.last),它根据另一个变量(即 Repeat)所标记的序列,取第三个变量(即 Submission_date)在该序列中的最后一个值,其中 Repeat 为 0 表示开始一个新序列。
我有下面的代码,但不知道接下来该怎么写。感谢任何帮助,谢谢!
# Attempt from the question: group rows by ID, sort by Submission_date, then
# add Submission_date.last. The index inside Submission_date[] is left blank;
# filling it in is the open question.
# NOTE(review): the sample data stores Submission_date as dd/mm/yy character
# strings, so arrange() presumably orders them as text, not chronologically
# -- confirm before relying on this sort.
df %>%
group_by(ID) %>%
arrange(Submission_date) %>%
mutate(Submission_date.last = Submission_date[]) # I'm not sure what to put on this line to correctly create the variable?
ID Submission_date Repeat Submission_date.last
1 25/07/19 0 31/07/19
1 30/07/19 1 31/07/19
1 31/07/19 2 31/07/19
1 11/08/20 0 14/08/20
1 14/08/20 1 14/08/20
2 30/07/19 0 30/07/19
2 31/08/20 0 31/08/20
3 13/09/20 0 15/09/20
3 15/09/20 1 15/09/20
3 18/07/21 0 22/07/21
3 21/07/21 1 22/07/21
3 22/07/21 2 22/07/21
每当 Repeat == 0 时开始一个新组,并为每个组取最后一个(last)Submission_date。
# Give every row a run id that increases by one at each Repeat == 0, group by
# ID plus that run id, and copy the run's final Submission_date onto each row.
# The helper column grp is dropped again at the end.
library(dplyr)
df %>%
  mutate(grp = cumsum(Repeat == 0)) %>%
  group_by(ID, grp) %>%
  mutate(Submission_date.last = Submission_date[n()]) %>%
  ungroup() %>%
  select(-grp)
# ID Submission_date Repeat Submission_date.last
# <int> <chr> <int> <chr>
# 1 1 25/07/19 0 31/07/19
# 2 1 30/07/19 1 31/07/19
# 3 1 31/07/19 2 31/07/19
# 4 1 11/08/20 0 14/08/20
# 5 1 14/08/20 1 14/08/20
# 6 2 30/07/19 0 30/07/19
# 7 2 31/08/20 0 31/08/20
# 8 3 13/09/20 0 15/09/20
# 9 3 15/09/20 1 15/09/20
#10 3 18/07/21 0 22/07/21
#11 3 21/07/21 1 22/07/21
#12 3 22/07/21 2 22/07/21
数据
如果您以可重现的格式提供数据,就更容易提供帮助。
# Reproducible sample data: 12 rows across three IDs. Submission_date is kept
# as dd/mm/yy text, and ID and Repeat are integers.
df <- data.frame(
  ID = rep(1:3, times = c(5L, 2L, 5L)),
  Submission_date = c(
    "25/07/19", "30/07/19", "31/07/19", "11/08/20", "14/08/20",
    "30/07/19", "31/08/20",
    "13/09/20", "15/09/20", "18/07/21", "21/07/21", "22/07/21"
  ),
  Repeat = c(0L, 1L, 2L, 0L, 1L, 0L, 0L, 0L, 1L, 0L, 1L, 2L),
  stringsAsFactors = FALSE
)
我们可以使用 data.table:
# data.table version: convert df with setDT(), then add Submission_date.last
# with :=, taking the last Submission_date of each group. A group is one ID
# combined with a run counter that steps up at every Repeat == 0.
library(data.table)
setDT(df)
df[
  ,
  Submission_date.last := Submission_date[.N],
  by = .(ID, cumsum(Repeat == 0))
]
输出
> df
ID Submission_date Repeat Submission_date.last
1: 1 25/07/19 0 31/07/19
2: 1 30/07/19 1 31/07/19
3: 1 31/07/19 2 31/07/19
4: 1 11/08/20 0 14/08/20
5: 1 14/08/20 1 14/08/20
6: 2 30/07/19 0 30/07/19
7: 2 31/08/20 0 31/08/20
8: 3 13/09/20 0 15/09/20
9: 3 15/09/20 1 15/09/20
10: 3 18/07/21 0 22/07/21
11: 3 21/07/21 1 22/07/21
12: 3 22/07/21 2 22/07/21
数据
# Sample data used by the data.table answer: 12 rows across three IDs, with
# Submission_date as dd/mm/yy text and integer ID / Repeat columns.
df <- data.frame(
  ID = c(rep(1L, 5L), rep(2L, 2L), rep(3L, 5L)),
  Submission_date = c(
    "25/07/19", "30/07/19", "31/07/19", "11/08/20", "14/08/20",
    "30/07/19", "31/08/20",
    "13/09/20", "15/09/20", "18/07/21", "21/07/21", "22/07/21"
  ),
  Repeat = c(0L, 1L, 2L, 0L, 1L, 0L, 0L, 0L, 1L, 0L, 1L, 2L),
  stringsAsFactors = FALSE
)