在复杂表格中计算持续时间
Calculate duration in a complex table
我有一个数据框
# Monthly status per person: one row per person, one column per month start.
# Status codes used in the data are "A", "S" and "M"; NA means no record for
# that month. The month labels are not syntactic R names, so data.frame()
# (default check.names) rewrites them, e.g. "01-Jan-19" becomes "X01.Jan.19".
df <- data.frame(
  "name"      = c("jack", "william", "david", "john"),
  "01-Jan-19" = c(NA,  "A", NA,  "A"),
  "01-Feb-19" = c("A", "A", NA,  "A"),
  "01-Mar-19" = c("S", "A", "A", "A"),
  "01-Apr-19" = c("A", "A", "A", "S"),
  "01-May-19" = c(NA,  "A", "A", "A"),
  "01-Jun-19" = c("A", "S", "A", "S"),
  "01-Jul-19" = c("A", "S", "A", "S"),
  "01-Aug-19" = c(NA,  "S", "A", "A"),
  "01-Sep-19" = c(NA,  "S", "A", "S"),
  "01-Oct-19" = c("S", "S", "A", "S"),
  "01-Nov-19" = c("S", "S", NA,  "S"),
  "01-Dec-19" = c("S", "S", "S", NA),
  "01-Jan-20" = c("S", "M", "A", "M"),
  "01-Feb-20" = c("M", "M", "M", "M")
)
要计算每个人从第一个 A 到最后一个 A 的持续时间,我可以通过以下方法实现
# Span, per person, between the first and the last month marked 'A'.
# Months whose value is NA are dropped (values_drop_na = TRUE) before the
# positions of 'A' are looked up.
duration <- df %>%
  tidyr::pivot_longer(
    cols = -name,
    names_to = 'date',
    values_drop_na = TRUE
  ) %>%
  # Strip the 'X' from the column names so the remainder parses as
  # day-month-year. dmy() and group_by() are namespace-qualified like every
  # other call here, so lubridate does not have to be attached.
  dplyr::mutate(date = lubridate::dmy(sub('X', '', date))) %>%
  dplyr::group_by(name) %>%
  dplyr::summarise(
    avg_duration = date[max(which(value == 'A'))] -
      date[min(which(value == 'A'))]
  )
请问如何修改代码达到以下目的?
在第一个 A 与最后一个 A 之间的时间段内,如何减去其他值(任何不是 A 的值,例如 S、NA)所占的时间段?
非常感谢。
我很难理解“时间段”(period)在这里的含义。从您的示例看来,一个时间段需要由两个值来界定。要减去的时间段也是这样吗?例如,如果一个人的序列是 A、S、A,那么他有 0、1 还是 2 个 A 时间段?如果序列是 A、S、A、A、S、A 呢?
如果每个日期代表一个时间段,该时间段在下一个日期之前有效,则每个人值组合的总持续时间可以计算如下:
# Total time per (name, value) pair, where every date column is treated as
# the start of a period that lasts until that person's next date.
duration <- df %>%
  tidyr::pivot_longer(cols = -name, names_to = 'date') %>%
  dplyr::mutate(date = lubridate::dmy(sub('X', '', date))) %>%
  dplyr::arrange(name, date) %>%
  dplyr::group_by(name) %>%
  # Length of each period = gap to the next date; the last row of each
  # person has no successor and gets 0.
  dplyr::mutate(period_length = c(diff(date), 0)) %>%
  dplyr::group_by(name, value) %>%
  dplyr::summarise(summed_duration = sum(period_length))
# A tibble: 15 x 3
# Groups: name [4]
name value summed_duration
<chr> <chr> <drtn>
1 david A 276 days
2 david M 0 days
3 david S 31 days
4 david NA 89 days
5 jack A 119 days
6 jack M 0 days
7 jack S 154 days
8 jack NA 123 days
9 john A 152 days
10 john M 31 days
11 john S 182 days
12 john NA 31 days
13 william A 151 days
14 william M 31 days
15 william S 214 days
根据评论编辑
# Total time per person spent in value 'A', leaving out each person's last
# 'A' row before summing.
df %>%
  tidyr::pivot_longer(cols = -name, names_to = 'date') %>%
  dplyr::mutate(date = lubridate::dmy(sub('X', '', date))) %>%
  dplyr::arrange(name, date) %>%
  dplyr::group_by(name) %>%
  # Gap to the person's next date; the final row of each person gets 0.
  dplyr::mutate(period_length = c(diff(date), 0)) %>%
  dplyr::group_by(name, value) %>%
  # Within each (name, value) group, keep a row when its value is not 'A' or
  # when it is not the last row of the group.
  dplyr::filter(value != 'A' | dplyr::row_number() < dplyr::n()) %>%
  dplyr::summarise(summed_duration = sum(period_length)) %>%
  dplyr::filter(value == 'A')
# A tibble: 4 x 3
# Groups: name [4]
name value summed_duration
<chr> <chr> <drtn>
1 david A 245 days
2 jack A 88 days
3 john A 121 days
4 william A 120 days
您可以针对每个 `name` 计算 `value == 'A'` 的最小索引和最大索引,然后减去这两个索引之间 `value` 不是 `'A'` 的那些月份的天数。
# For each person: days between the first and the last 'A' month, minus the
# length of every non-'A' month that lies between them.
# NOTE(review): values_drop_na = TRUE removes NA months before indexing, so
# their days stay inside the span and are not subtracted -- confirm intended.
df %>%
  tidyr::pivot_longer(
    cols = -name,
    names_to = 'date',
    values_drop_na = TRUE
  ) %>%
  dplyr::mutate(date = lubridate::dmy(sub('X', '', date))) %>%
  dplyr::group_by(name) %>%
  dplyr::summarise(
    min_ind = min(which(value == 'A')),
    max_ind = max(which(value == 'A')),
    # Slice the dates to the same min_ind:max_ind window as the values
    # before applying the mask. Indexing the full date vector with the
    # shorter logical mask would recycle it and also subtract months that
    # come after the last 'A'.
    duration = date[max_ind] - date[min_ind] -
      sum(lubridate::days_in_month(
        date[min_ind:max_ind][value[min_ind:max_ind] != 'A']
      ))
  )
# Expected output with the window-aligned mask (jack and john differ from
# the recycled-mask result, which gave 89 and 90 days):
#   name    min_ind max_ind duration
#   <chr>     <int>   <int> <drtn>
# 1 david         1      10 275 days
# 2 jack          1       5 119 days
# 3 john          1       8 121 days
# 4 william       1       5 120 days
我有一个数据框
# Monthly status per person: one row per person, one column per month start.
# Status codes used in the data are "A", "S" and "M"; NA means no record for
# that month. The month labels are not syntactic R names, so data.frame()
# (default check.names) rewrites them, e.g. "01-Jan-19" becomes "X01.Jan.19".
df <- data.frame(
  "name"      = c("jack", "william", "david", "john"),
  "01-Jan-19" = c(NA,  "A", NA,  "A"),
  "01-Feb-19" = c("A", "A", NA,  "A"),
  "01-Mar-19" = c("S", "A", "A", "A"),
  "01-Apr-19" = c("A", "A", "A", "S"),
  "01-May-19" = c(NA,  "A", "A", "A"),
  "01-Jun-19" = c("A", "S", "A", "S"),
  "01-Jul-19" = c("A", "S", "A", "S"),
  "01-Aug-19" = c(NA,  "S", "A", "A"),
  "01-Sep-19" = c(NA,  "S", "A", "S"),
  "01-Oct-19" = c("S", "S", "A", "S"),
  "01-Nov-19" = c("S", "S", NA,  "S"),
  "01-Dec-19" = c("S", "S", "S", NA),
  "01-Jan-20" = c("S", "M", "A", "M"),
  "01-Feb-20" = c("M", "M", "M", "M")
)
要计算每个人从第一个 A 到最后一个 A 的持续时间,我可以通过以下方法实现
# Span, per person, between the first and the last month marked 'A'.
# Months whose value is NA are dropped (values_drop_na = TRUE) before the
# positions of 'A' are looked up.
duration <- df %>%
  tidyr::pivot_longer(
    cols = -name,
    names_to = 'date',
    values_drop_na = TRUE
  ) %>%
  # Strip the 'X' from the column names so the remainder parses as
  # day-month-year. dmy() and group_by() are namespace-qualified like every
  # other call here, so lubridate does not have to be attached.
  dplyr::mutate(date = lubridate::dmy(sub('X', '', date))) %>%
  dplyr::group_by(name) %>%
  dplyr::summarise(
    avg_duration = date[max(which(value == 'A'))] -
      date[min(which(value == 'A'))]
  )
请问如何修改代码以达到以下目的?在第一个 A 与最后一个 A 之间的时间段内,如何减去其他值(任何不是 A 的值,例如 S、NA)所占的时间段?非常感谢。
我很难理解“时间段”(period)在这里的含义。从您的示例看来,一个时间段需要由两个值来界定。要减去的时间段也是这样吗?例如,如果一个人的序列是 A、S、A,那么他有 0、1 还是 2 个 A 时间段?如果序列是 A、S、A、A、S、A 呢?
如果每个日期代表一个时间段,该时间段在下一个日期之前有效,则每个人值组合的总持续时间可以计算如下:
# Total time per (name, value) pair, where every date column is treated as
# the start of a period that lasts until that person's next date.
duration <- df %>%
  tidyr::pivot_longer(cols = -name, names_to = 'date') %>%
  dplyr::mutate(date = lubridate::dmy(sub('X', '', date))) %>%
  dplyr::arrange(name, date) %>%
  dplyr::group_by(name) %>%
  # Length of each period = gap to the next date; the last row of each
  # person has no successor and gets 0.
  dplyr::mutate(period_length = c(diff(date), 0)) %>%
  dplyr::group_by(name, value) %>%
  dplyr::summarise(summed_duration = sum(period_length))
# A tibble: 15 x 3
# Groups: name [4]
name value summed_duration
<chr> <chr> <drtn>
1 david A 276 days
2 david M 0 days
3 david S 31 days
4 david NA 89 days
5 jack A 119 days
6 jack M 0 days
7 jack S 154 days
8 jack NA 123 days
9 john A 152 days
10 john M 31 days
11 john S 182 days
12 john NA 31 days
13 william A 151 days
14 william M 31 days
15 william S 214 days
根据评论编辑
# Total time per person spent in value 'A', leaving out each person's last
# 'A' row before summing.
df %>%
  tidyr::pivot_longer(cols = -name, names_to = 'date') %>%
  dplyr::mutate(date = lubridate::dmy(sub('X', '', date))) %>%
  dplyr::arrange(name, date) %>%
  dplyr::group_by(name) %>%
  # Gap to the person's next date; the final row of each person gets 0.
  dplyr::mutate(period_length = c(diff(date), 0)) %>%
  dplyr::group_by(name, value) %>%
  # Within each (name, value) group, keep a row when its value is not 'A' or
  # when it is not the last row of the group.
  dplyr::filter(value != 'A' | dplyr::row_number() < dplyr::n()) %>%
  dplyr::summarise(summed_duration = sum(period_length)) %>%
  dplyr::filter(value == 'A')
# A tibble: 4 x 3
# Groups: name [4]
name value summed_duration
<chr> <chr> <drtn>
1 david A 245 days
2 jack A 88 days
3 john A 121 days
4 william A 120 days
您可以针对每个 `name` 计算 `value == 'A'` 的最小索引和最大索引,然后减去这两个索引之间 `value` 不是 `'A'` 的那些月份的天数。
# For each person: days between the first and the last 'A' month, minus the
# length of every non-'A' month that lies between them.
# NOTE(review): values_drop_na = TRUE removes NA months before indexing, so
# their days stay inside the span and are not subtracted -- confirm intended.
df %>%
  tidyr::pivot_longer(
    cols = -name,
    names_to = 'date',
    values_drop_na = TRUE
  ) %>%
  dplyr::mutate(date = lubridate::dmy(sub('X', '', date))) %>%
  dplyr::group_by(name) %>%
  dplyr::summarise(
    min_ind = min(which(value == 'A')),
    max_ind = max(which(value == 'A')),
    # Slice the dates to the same min_ind:max_ind window as the values
    # before applying the mask. Indexing the full date vector with the
    # shorter logical mask would recycle it and also subtract months that
    # come after the last 'A'.
    duration = date[max_ind] - date[min_ind] -
      sum(lubridate::days_in_month(
        date[min_ind:max_ind][value[min_ind:max_ind] != 'A']
      ))
  )
# Expected output with the window-aligned mask (jack and john differ from
# the recycled-mask result, which gave 89 and 90 days):
#   name    min_ind max_ind duration
#   <chr>     <int>   <int> <drtn>
# 1 david         1      10 275 days
# 2 jack          1       5 119 days
# 3 john          1       8 121 days
# 4 william       1       5 120 days