根据R中的日期变量重组多个变量

Question

如果我的数据集包含在不同时间点收集的同一度量的分数，我该如何整理这些日期/时间，使它们表示某个特定日期之后的各个时间点？这可以在 R 中完成吗，还是在其他程序中更容易完成？

我有一个目前看起来像这样的数据集：

id  date        score1_date score1  score2_date score2  score3_date score3
101 1/6/2020    1/1/2020    20      1/8/2020    18      1/15/2020   16
102 2/27/2020   2/14/2020   16      2/21/2020   16      2/28/2020   10
103 1/10/2020   1/7/2020    30      1/14/2020   25      1/21/2020   20
104 3/5/2020    3/6/2020    40      3/13/2020   42      3/20/2020   40

我想找到最接近 [date] 的 [score#_date] 并将其标识为 [time1]，然后将后面的所有内容设为 [time2]、[time3] 等

这是生成上面表格的代码：

# Example data for the table above, in dput() form: a tibble with one row per
# id. Date columns are stored as numeric day counts carrying class "Date".
# The expression is not assigned here; the answers below refer to it as `df`
# (Answers 1 and 3) or `data` (Answer 2).
structure(list(id = c(101, 102, 103, 104), date = structure(c(18267, 
18319, 18271, 18326), class = "Date"), score1_date = structure(c(18262, 
18306, 18268, 18327), class = "Date"), score1 = c(20, 16, 30, 
40), score2_date = structure(c(18269, 18313, 18275, 18334), class = "Date"), 
    score2 = c(18, 16, 25, 42), score3_date = structure(c(18276, 
    18320, 18282, 18341), class = "Date"), score3 = c(16, 10, 
    20, 40)), row.names = c(NA, -4L), class = c("tbl_df", "tbl", 
"data.frame"))

所以我最终希望数据集具有如下所示的变量：

id  date        time1_date  time1_score time2_date  time2_score time3_date  time3_score
101 1/6/2020    1/8/2020    18          1/15/2020   16          NA          NA
102 2/27/2020   2/28/2020   10          NA          NA          NA          NA
103 1/10/2020   1/7/2020    30          1/14/2020   25          1/21/2020   20
104 3/5/2020    3/6/2020    40          3/13/2020   42          3/20/2020   40

非常感谢！

Answer 1

使用 tidyverse 功能你可以做：

library(dplyr)
library(tidyr)

# For each id, keep the measurement whose date is closest to `date` plus every
# later measurement, and spread them into time1, time2, ... columns.
# Expects `df` to hold the example data (id, date, scoreN_date, scoreN).
df %>%
  # Rename date to base_date so it does not clash with the long-format `date`
  # column that pivot_longer() creates below
  rename(base_date = date) %>%
  # Rename score1, score2 etc to score1_value, score2_value etc.
  # The backslash is doubled because '\d' is not a valid escape in an R string
  rename_with(~paste0(., '_value'), matches('^score\\d+$')) %>%
  # Get the data in long format with date and value as two columns
  pivot_longer(cols = starts_with('score'),
               names_to = c('score', '.value'),
               names_sep = '_') %>%
  group_by(id) %>%
  # Keep the row closest to base_date and every row dated on or after it
  filter(date >= date[which.min(abs(date - base_date))]) %>%
  # Sort chronologically within each id
  arrange(id, date) %>%
  # Relabel the remaining rows as time1, time2, ... in date order
  mutate(score = paste0('time', row_number())) %>%
  ungroup %>%
  # Get the data in wide format
  pivot_wider(names_from = score, values_from = c(date, value)) %>%
  # Put id and base_date first, then order the remaining columns by the number
  # parsed from their names (names without a number parse as NA and sort last)
  select(id, base_date, order(suppressWarnings(readr::parse_number(names(.)))))

#    id base_date  date_time1 value_time1 date_time2 value_time2 date_time3 value_time3
#  <dbl> <date>     <date>           <dbl> <date>           <dbl> <date>           <dbl>
#1   101 2020-01-06 2020-01-08          18 2020-01-15          16 NA                  NA
#2   102 2020-02-27 2020-02-28          10 NA                  NA NA                  NA
#3   103 2020-01-10 2020-01-07          30 2020-01-14          25 2020-01-21          20
#4   104 2020-03-05 2020-03-06          40 2020-03-13          42 2020-03-20          40

Answer 2

这里有一个稍微不同的方法，包括重新排序最后的列：

library(tidyverse)
# Reshape the scoreN / scoreN_date columns into numbered date_k / score_k
# columns, keeping only measurements dated on or after `date`.
# Expects `data` to hold the example data (id, date, scoreN_date, scoreN).
data %>% 
  # Free up the name `date` for the long-format column created below
  rename(target = date) %>%
  # Convert every column to character so dates and scores can share a single
  # `value` column in long format
  mutate(across(everything(),as.character)) %>%
  # "score1_date" -> num = "1", variable = "date";
  # "score1"      -> num = "1", variable = ""
  pivot_longer(-c(id,target),names_pattern = "score([0-9]+)_?(.*)", names_to = c("num","variable")) %>%
  # The bare scoreN columns produce an empty `variable`; label them "score"
  mutate(variable = case_when(variable == "" ~ "score", TRUE ~ variable)) %>%
  # One row per id / measurement number, with `date` and `score` columns
  pivot_wider(id_cols = c("id","num","target"),names_from = variable, values_from = "value") %>%
  # NOTE(review): both sides are character here, so this is a string
  # comparison; it presumably relies on the yyyy-mm-dd format seen in the
  # output below -- confirm.
  # NOTE(review): measurements dated before `target` are dropped even when
  # they are the closest one (the output below shows id 103 starting at
  # 2020-01-14), which differs from the result requested in the question.
  filter(date >= target) %>%
  group_by(id) %>%
  arrange(date) %>%
  # Number the surviving measurements per id
  mutate(new_num = row_number()) %>%
  select(id,target,new_num,date,score) %>%
  pivot_wider(id_cols = c("id","target"), names_from = new_num, values_from = c("date","score")) %>%
  group_by(id,target) %>%
  # Build the interleaved names date_1, score_1, date_2, score_2, ... to
  # reorder the columns; id and target are re-added as grouping variables (see
  # the message in the output below).
  # NOTE(review): the hard-coded 1:3 and the ncol(.)-based repeat counts appear
  # to assume exactly three time slots -- confirm before reusing on other data.
  select(paste0(rep(c("date","score"),times = ncol(.)/2),"_",rep(1:3,each = ncol(.)/3)))
#Adding missing grouping variables: `id`, `target`
## A tibble: 4 x 8
## Groups:   id, target [4]
#  id    target     date_1     score_1 date_2     score_2 date_3     score_3
#  <chr> <chr>      <chr>      <chr>   <chr>      <chr>   <chr>      <chr>  
#1 101   2020-01-06 2020-01-08 18      2020-01-15 16      NA         NA     
#2 103   2020-01-10 2020-01-14 25      2020-01-21 20      NA         NA     
#3 102   2020-02-27 2020-02-28 10      NA         NA      NA         NA     
#4 104   2020-03-05 2020-03-06 40      2020-03-13 42      2020-03-20 40

Answer 3

我建议采用另一种策略来过滤行，即使用 cummin 和 abs 函数，如下所示：

# For each id, keep the measurement closest to `date` plus all rows after it,
# then spread them into numbered date_scoreN / value_scoreN columns.
# Expects `df` to hold the example data (id, date, scoreN_date, scoreN).
# The result stays grouped by id.
df %>%
  # Free up the name `date` for the long-format column created below
  rename(base_date = date) %>%
  # The backslash is doubled because '\d' is not a valid escape in an R string
  rename_with(~paste0(., '_value'), matches('^score\\d+$')) %>%
  pivot_longer(cols = starts_with('score'),
               names_to = c('score', '.value'),
               names_sep = '_') %>%
  group_by(id) %>%
  # d: absolute distance in days between the measurement and base_date.
  # Strip the trailing digit from the label so rows can be renumbered after
  # filtering. str_remove() is namespace-qualified because this snippet does
  # not attach stringr itself.
  mutate(d = abs(as.numeric(difftime(base_date, date, units = "days"))),
         score = stringr::str_remove(score, "\\d$")) %>%
  # cummin(d) == min(d) first turns TRUE at the closest row and stays TRUE, so
  # the cumsum is non-zero from that row onwards. Rows are taken in
  # pivot_longer() order (score1, score2, ...), presumably chronological.
  filter(cumsum(cummin(d) == min(d)) != 0) %>%
  # Renumber the surviving rows: score1, score2, ...
  mutate(score = paste0(score, row_number())) %>%
  pivot_wider(id_cols = c(id, base_date),
              names_from = score,
              values_from = c(date, value))

# A tibble: 4 x 8
# Groups:   id [4]
     id base_date  date_score1 date_score2 date_score3 value_score1 value_score2 value_score3
  <dbl> <date>     <date>      <date>      <date>             <dbl>        <dbl>        <dbl>
1   101 2020-01-06 2020-01-08  2020-01-15  NA                    18           16           NA
2   102 2020-02-27 2020-02-28  NA          NA                    10           NA           NA
3   103 2020-01-10 2020-01-07  2020-01-14  2020-01-21            30           25           20
4   104 2020-03-05 2020-03-06  2020-03-13  2020-03-20            40           42           40

pivot_wider 会改变列的排列顺序，相关的 issue #839 在 GitHub 上仍未关闭，因此你可以手动重新排列各列，或者采用 Ian 在其回答最后一行中提出的策略。

根据R中的日期变量重组多个变量

Reorganize multiple variables based on date variable in R

reorganize

r

date

organization

longitudinal