计算复杂表格中的持续时间

Calculate duration in a complex table

我有一个数据框

# Sample data: one row per person, one column per month start
# (01-Jan-19 through 01-Feb-20). Cells hold a character code or NA.
# data.frame() is called with its default check.names = TRUE, so the
# non-syntactic date headers are stored as "X01.Jan.19", "X01.Feb.19", ...
df <- data.frame(
  name        = c("jack", "william", "david", "john"),
  `01-Jan-19` = c(NA,  "A", NA,  "A"),
  `01-Feb-19` = c("A", "A", NA,  "A"),
  `01-Mar-19` = c("S", "A", "A", "A"),
  `01-Apr-19` = c("A", "A", "A", "S"),
  `01-May-19` = c(NA,  "A", "A", "A"),
  `01-Jun-19` = c("A", "S", "A", "S"),
  `01-Jul-19` = c("A", "S", "A", "S"),
  `01-Aug-19` = c(NA,  "S", "A", "A"),
  `01-Sep-19` = c(NA,  "S", "A", "S"),
  `01-Oct-19` = c("S", "S", "A", "S"),
  `01-Nov-19` = c("S", "S", NA,  "S"),
  `01-Dec-19` = c("S", "S", "S", NA),
  `01-Jan-20` = c("S", "M", "A", "M"),
  `01-Feb-20` = c("M", "M", "M", "M")
)

要计算每个人从第一个 A 到最后一个 A 的持续时间,我可以通过以下方法实现

# Span from each person's first 'A' row to their last 'A' row.
# Result: one row per `name` with `avg_duration`, a difftime obtained by
# subtracting two Dates (despite the column name, nothing is averaged).
duration <- df %>%
  # Long format: one row per (name, date header); NA cells are dropped.
  tidyr::pivot_longer(cols = -name, names_to = 'date', values_drop_na = TRUE) %>%
  # The headers carry a leading "X" (added by data.frame()); strip only that
  # leading character before parsing day-month-year.
  dplyr::mutate(date = lubridate::dmy(sub('^X', '', date))) %>%
  dplyr::group_by(name) %>%
  # first()/last() are positional like the which()-based indexing they replace,
  # but yield NA (instead of a warning and an Inf index) when a person has no 'A'.
  dplyr::summarise(
    avg_duration = dplyr::last(date[value == 'A']) - dplyr::first(date[value == 'A'])
  )

请问如何修改代码以达到以下目的:在第一个 A 与最后一个 A 之间的这段时间内,如何减去取其他值(任何不是 A 的值,例如 S、NA)的那些时间段?非常感谢。

我很难理解“周期(period)”在这里的含义。从您的示例看来,一个周期需要两个值才能确定。被减去的周期也是这样吗?例如,如果一个人的序列是 A、S、A,那么他有 0 个、1 个还是 2 个 A 周期?如果是 A、S、A、A、S、A 呢?

如果每个日期代表一个从该日期起、到下一个日期为止有效的时间段,那么每个“人—取值”组合的总持续时间可以按如下方式计算:

# Total time per (name, value), treating each date's value as holding until
# the next date; the last date of each person therefore contributes 0.
# Result: one row per (name, value) with `summed_duration`, grouped by `name`.
duration <- df %>%
  # Long format: one row per (name, date header); NA cells are kept.
  tidyr::pivot_longer(cols = -name, names_to = 'date') %>%
  # Strip only the leading "X" that data.frame() added to the headers.
  dplyr::mutate(date = lubridate::dmy(sub('^X', '', date))) %>%
  dplyr::arrange(name, date) %>%
  dplyr::group_by(name) %>%
  # Gap to the next date within the same person; pad the final row with an
  # explicit zero-length difftime so the whole vector shares one unit.
  dplyr::mutate(duration = c(diff(date), as.difftime(0, units = 'days'))) %>%
  dplyr::group_by(name, value) %>%
  dplyr::summarise(summed_duration = sum(duration))
# A tibble: 15 x 3
# Groups:   name [4]
   name    value summed_duration
   <chr>   <chr> <drtn>         
 1 david   A     276 days       
 2 david   M       0 days       
 3 david   S      31 days       
 4 david   NA     89 days       
 5 jack    A     119 days       
 6 jack    M       0 days       
 7 jack    S     154 days       
 8 jack    NA    123 days       
 9 john    A     152 days       
10 john    M      31 days       
11 john    S     182 days       
12 john    NA     31 days       
13 william A     151 days       
14 william M      31 days       
15 william S     214 days 

根据评论编辑

# Time spent in 'A' between each person's first and last 'A': sum the
# gap-to-next-date of every 'A' row except the person's final 'A' row.
df %>%
  tidyr::pivot_longer(cols = -name, names_to = 'month_start') %>%
  dplyr::mutate(month_start = lubridate::dmy(sub('X', '', month_start))) %>%
  dplyr::group_by(name) %>%
  dplyr::arrange(name, month_start) %>%
  # Gap to the next date within the person; the last date gets 0.
  dplyr::mutate(until_next = c(diff(month_start), 0)) %>%
  # Only 'A' rows reach the final result, so keep just those up front.
  dplyr::filter(value == 'A') %>%
  dplyr::group_by(name, value) %>%
  # Drop the last 'A' row of each person: its gap lies after the final 'A'.
  dplyr::filter(dplyr::row_number() < dplyr::n()) %>%
  dplyr::summarise(summed_duration = sum(until_next))

# A tibble: 4 x 3
# Groups:   name [4]
  name    value summed_duration
  <chr>   <chr> <drtn>         
1 david   A     245 days       
2 jack    A      88 days       
3 john    A     121 days       
4 william A     120 days   

您可以针对每个 name 计算 value 等于 'A' 的最小索引和最大索引,然后用这两个位置的日期之差,减去二者之间 value 不是 'A' 的那些月份的天数

# For each name: span from the first 'A' to the last 'A', minus the length of
# every month inside that window whose value is anything other than 'A'
# (NA included).
df %>%
  # Keep NA cells: a missing month inside the window has to be subtracted too.
  tidyr::pivot_longer(cols = -name, names_to = 'date') %>%
  # Strip only the leading "X" that data.frame() added to the headers.
  dplyr::mutate(date = lubridate::dmy(sub('^X', '', date))) %>%
  dplyr::group_by(name) %>%
  dplyr::summarise(min_ind = min(which(value == 'A')),
                   max_ind = max(which(value == 'A')),
                   duration = date[max_ind] - date[min_ind] -
                     sum(lubridate::days_in_month(
                       # Cut dates and values to the same first-A..last-A window
                       # so the logical mask lines up element by element and is
                       # never recycled; %in% makes NA count as "not A".
                       date[min_ind:max_ind][!(value[min_ind:max_ind] %in% 'A')]
                     )))


#  name    min_ind max_ind duration
#  <chr>     <int>   <int> <drtn>  
#1 david         3      13 245 days
#2 jack          2       7  88 days
#3 john          1       8 121 days
#4 william       1       5 120 days