如何将按日期排序的数据集中的变量块重复压缩到 R 中的宽 table？

Question

我有以下数据

structure(list(station = c("61R002", "61R002", "61R002", "61R002", 
"61R002", "61R002", "61R002", "61R002", "61R002", "61R002", "61R002", 
"61R002", "61R002", "61R002", "61R002", "61R002", "61R002", "61R002", 
"61R002", "61R002", "61R002", "61R002", "61R002", "61R002", "61R002", 
"61R002", "61R002", "61R002", "61R002", "61R002", "61R002", "61R002", 
"61R002", "61R002", "61R002", "61R002", "61R002", "61R002", "61R002", 
"61R002", "61R002", "61R002", "61R002", "61R002", "61R002", "61R002", 
"61R002", "61R002", "61R002", "61R002", "61R002", "61R002", "61R002", 
"61R002", "61R002", "61R002", "61R002", "61R002", "61R002", "61R002", 
"61R002", "61R002", "61R002", "61R002", "61R002", "61R002", "61R002", 
"61R002", "61R002", "61R002", "61R002", "61R002", "61R002", "61R002", 
"61R002", "61R002", "61R002", "61R002", "61R002", "61R002", "61R002", 
"61R002", "61R002", "61R002", "61R002", "61R002", "61R002", "61R002", 
"61R002", "61R002", "61R002", "61R002", "61R002", "61R002", "61R002", 
"61R002", "61R002", "61R002"), pollutant = c(17201L, 17201L, 
17201L, 17201L, 17201L, 17201L, 17201L, 17201L, 17201L, 17201L, 
17201L, 17201L, 17201L, 17201L, 17201L, 17201L, 17201L, 17201L, 
17201L, 17201L, 17201L, 17201L, 17201L, 17201L, 17201L, 17201L, 
17201L, 17201L, 17201L, 17201L, 17201L, 17201L, 17201L, 17201L, 
17201L, 17201L, 17201L, 17201L, 17201L, 17201L, 17201L, 17201L, 
17201L, 17201L, 17201L, 17201L, 17201L, 17201L, 17201L, 17204L, 
17204L, 17204L, 17204L, 17204L, 17204L, 17204L, 17204L, 17204L, 
17204L, 17204L, 17204L, 17204L, 17204L, 17204L, 17204L, 17204L, 
17204L, 17204L, 17204L, 17204L, 17204L, 17204L, 17204L, 17204L, 
17204L, 17204L, 17204L, 17204L, 17204L, 17204L, 17204L, 17204L, 
17204L, 17204L, 17204L, 17204L, 17204L, 17204L, 17204L, 17204L, 
17204L, 17204L, 17204L, 17204L, 17204L, 17204L, 17204L, 17204L
), tag = c("002", "002", "002", "002", "002", "002", "002", "002", 
"002", "002", "002", "002", "002", "002", "002", "002", "002", 
"002", "002", "002", "002", "002", "002", "002", "002", "002", 
"002", "002", "002", "002", "002", "002", "002", "002", "002", 
"002", "002", "002", "002", "002", "002", "002", "002", "002", 
"002", "002", "002", "002", "002", "002", "002", "002", "002", 
"002", "002", "002", "002", "002", "002", "002", "002", "002", 
"002", "002", "002", "002", "002", "002", "002", "002", "002", 
"002", "002", "002", "002", "002", "002", "002", "002", "002", 
"002", "002", "002", "002", "002", "002", "002", "002", "002", 
"002", "002", "002", "002", "002", "002", "002", "002", "002"
), concentration = c(NA, 0.41, 0.41, 0.41, 0.41, 0.41, 0.41, 
0.42, 0.42, 0.42, 0.42, 0.42, 0.42, 0.42, 0.39, 0.39, 0.39, 0.39, 
0.39, 0.39, 0.39, 0.46, 0.46, 0.46, 0.46, 0.46, 0.46, 0.46, 0.33, 
0.33, 0.33, 0.33, 0.33, 0.33, 0.33, 0.4, 0.4, 0.4, 0.4, 0.4, 
0.4, 0.4, 0.38, 0.38, 0.38, 0.38, 0.38, 0.38, 0.38, NA, 0.38, 
0.38, 0.38, 0.38, 0.38, 0.38, 0.38, 0.38, 0.38, 0.38, 0.38, 0.38, 
0.38, 0.37, 0.37, 0.37, 0.37, 0.37, 0.37, 0.37, 0.38, 0.38, 0.38, 
0.38, 0.38, 0.38, 0.38, 0.31, 0.31, 0.31, 0.31, 0.31, 0.31, 0.31, 
0.33, 0.33, 0.33, 0.33, 0.33, 0.33, 0.33, 0.38, 0.38, 0.38, 0.38, 
0.38, 0.38, 0.38), date = structure(c(1514764800, 1514851200, 
1514937600, 1515024000, 1515110400, 1515196800, 1515283200, 1515369600, 
1515456000, 1515542400, 1515628800, 1515715200, 1515801600, 1515888000, 
1515974400, 1516060800, 1516147200, 1516233600, 1516320000, 1516406400, 
1516492800, 1516579200, 1516665600, 1516752000, 1516838400, 1516924800, 
1517011200, 1517097600, 1517184000, 1517270400, 1517356800, 1517443200, 
1517529600, 1517616000, 1517702400, 1517788800, 1517875200, 1517961600, 
1518048000, 1518134400, 1518220800, 1518307200, 1518393600, 1518480000, 
1518566400, 1518652800, 1518739200, 1518825600, 1518912000, 1514764800, 
1514851200, 1514937600, 1515024000, 1515110400, 1515196800, 1515283200, 
1515369600, 1515456000, 1515542400, 1515628800, 1515715200, 1515801600, 
1515888000, 1515974400, 1516060800, 1516147200, 1516233600, 1516320000, 
1516406400, 1516492800, 1516579200, 1516665600, 1516752000, 1516838400, 
1516924800, 1517011200, 1517097600, 1517184000, 1517270400, 1517356800, 
1517443200, 1517529600, 1517616000, 1517702400, 1517788800, 1517875200, 
1517961600, 1518048000, 1518134400, 1518220800, 1518307200, 1518393600, 
1518480000, 1518566400, 1518652800, 1518739200, 1518825600, 1518912000
), tzone = "UTC", class = c("POSIXct", "POSIXt"))), row.names = c(NA, 
-98L), class = c("tbl_df", "tbl", "data.frame"))

我想将其转换为

tag   station start_date end_date   `17201` `17204`
<chr> <chr>   <date>     <date>       <dbl>   <dbl>
002   61R002  2018-01-02 2018-01-07    0.41    0.38
002   61R002  2018-01-08 2018-01-14    0.42    0.38
002   61R002  2018-01-15 2018-01-21    0.39    0.37
002   61R002  2018-01-22 2018-01-28    0.46    0.38
002   61R002  2018-01-29 2018-02-04    0.33    0.31
002   61R002  2018-02-05 2018-02-11    0.4     0.33
002   61R002  2018-02-12 2018-02-18    0.38    0.38

即进入每个站的非重叠日期间隔。

我怎样才能做到这一点（例如，使用 dplyr 和管道运算符）？

请注意，站点和污染物变量可以取更多值，并且 start_date 和 end_date 之间的间隔不固定。

上一个问题（和）让我达到了

station pollutant tag   concentration start_date          end_date           
<chr>       <int> <chr>         <dbl> <dttm>              <dttm>             
61R002      17201 002            0.41 2018-01-02 00:00:00 2018-01-07 00:00:00
61R002      17201 002            0.42 2018-01-08 00:00:00 2018-01-14 00:00:00
61R002      17201 002            0.39 2018-01-15 00:00:00 2018-01-21 00:00:00
61R002      17201 002            0.46 2018-01-22 00:00:00 2018-01-28 00:00:00
61R002      17201 002            0.33 2018-01-29 00:00:00 2018-02-04 00:00:00
61R002      17201 002            0.4  2018-02-05 00:00:00 2018-02-11 00:00:00
61R002      17201 002            0.38 2018-02-12 00:00:00 2018-02-18 00:00:00
61R002      17204 002            0.38 2018-01-02 00:00:00 2018-01-14 00:00:00
61R002      17204 002            0.37 2018-01-15 00:00:00 2018-01-21 00:00:00
61R002      17204 002            0.38 2018-01-22 00:00:00 2018-01-28 00:00:00
61R002      17204 002            0.31 2018-01-29 00:00:00 2018-02-04 00:00:00
61R002      17204 002            0.33 2018-02-05 00:00:00 2018-02-11 00:00:00
61R002      17204 002            0.38 2018-02-12 00:00:00 2018-02-18 00:00:00

非常感谢。

Answer 1

解决方案的灵感来自于结合这个 and this 。

data_new <- data %>%
  arrange(station, pollutant, date) %>%
  group_by(tag, station, pollutant, grp = rleid(concentration)) %>%
  summarise(concentration = first(concentration), start_date = min(date), end_date = max(date), .groups = 'drop') %>%
  select(-grp) %>%
  mutate(date_range = interval(start_date, end_date))
  xls <- NULL
for (station in unique(data$station))
{
  tmp <- data_new %>%
    filter(station == !!station)

  all_dates <- tmp %>%
    select(start_date, end_date, concentration) %>%
    pivot_longer(!concentration, names_to = "date_type", values_to="date") %>%
    arrange(date) %>%
    select(date) %>%
    distinct()

  tmp2 <- tmp %>%
    rowwise() %>%
    mutate(bounded_dates = list(filter(all_dates, all_dates$date %within% date_range) %>% pull(date)),
           bounded_intervals = list(int_diff(bounded_dates))) %>%
    select(tag, station, pollutant, concentration, bounded_intervals) %>%
    unnest(bounded_intervals) %>%
    filter(bounded_intervals != as.interval(days(1), start = int_start(bounded_intervals))) %>%
    mutate(start_date = as_date(int_start(bounded_intervals)),
           end_date = as_date(int_end(bounded_intervals))) %>%
    pivot_wider(id_cols = -bounded_intervals, names_from = "pollutant", values_from = "concentration")

  xls <- bind_rows(xls, tmp2)
}

给我想要的输出

tag   station start_date end_date   `17201` `17204`
<chr> <chr>   <date>     <date>       <dbl>   <dbl>
002   61R002  2018-01-02 2018-01-07    0.41    0.38
002   61R002  2018-01-08 2018-01-14    0.42    0.38
002   61R002  2018-01-15 2018-01-21    0.39    0.37
002   61R002  2018-01-22 2018-01-28    0.46    0.38
002   61R002  2018-01-29 2018-02-04    0.33    0.31
002   61R002  2018-02-05 2018-02-11    0.4     0.33
002   61R002  2018-02-12 2018-02-18    0.38    0.38

如何将按日期排序的数据集中的变量块重复压缩到 R 中的宽 table？

How to compact block-duplicates of variables from a date-ordered dataset into a wide-table in R?

datetime

r

duplicates

intervals