如何将按日期排序的数据集中的变量块重复压缩到 R 中的宽 table?
How to compact block-duplicates of variables from a date-ordered dataset into a wide-table in R?
我有以下数据
structure(list(station = c("61R002", "61R002", "61R002", "61R002",
"61R002", "61R002", "61R002", "61R002", "61R002", "61R002", "61R002",
"61R002", "61R002", "61R002", "61R002", "61R002", "61R002", "61R002",
"61R002", "61R002", "61R002", "61R002", "61R002", "61R002", "61R002",
"61R002", "61R002", "61R002", "61R002", "61R002", "61R002", "61R002",
"61R002", "61R002", "61R002", "61R002", "61R002", "61R002", "61R002",
"61R002", "61R002", "61R002", "61R002", "61R002", "61R002", "61R002",
"61R002", "61R002", "61R002", "61R002", "61R002", "61R002", "61R002",
"61R002", "61R002", "61R002", "61R002", "61R002", "61R002", "61R002",
"61R002", "61R002", "61R002", "61R002", "61R002", "61R002", "61R002",
"61R002", "61R002", "61R002", "61R002", "61R002", "61R002", "61R002",
"61R002", "61R002", "61R002", "61R002", "61R002", "61R002", "61R002",
"61R002", "61R002", "61R002", "61R002", "61R002", "61R002", "61R002",
"61R002", "61R002", "61R002", "61R002", "61R002", "61R002", "61R002",
"61R002", "61R002", "61R002"), pollutant = c(17201L, 17201L,
17201L, 17201L, 17201L, 17201L, 17201L, 17201L, 17201L, 17201L,
17201L, 17201L, 17201L, 17201L, 17201L, 17201L, 17201L, 17201L,
17201L, 17201L, 17201L, 17201L, 17201L, 17201L, 17201L, 17201L,
17201L, 17201L, 17201L, 17201L, 17201L, 17201L, 17201L, 17201L,
17201L, 17201L, 17201L, 17201L, 17201L, 17201L, 17201L, 17201L,
17201L, 17201L, 17201L, 17201L, 17201L, 17201L, 17201L, 17204L,
17204L, 17204L, 17204L, 17204L, 17204L, 17204L, 17204L, 17204L,
17204L, 17204L, 17204L, 17204L, 17204L, 17204L, 17204L, 17204L,
17204L, 17204L, 17204L, 17204L, 17204L, 17204L, 17204L, 17204L,
17204L, 17204L, 17204L, 17204L, 17204L, 17204L, 17204L, 17204L,
17204L, 17204L, 17204L, 17204L, 17204L, 17204L, 17204L, 17204L,
17204L, 17204L, 17204L, 17204L, 17204L, 17204L, 17204L, 17204L
), tag = c("002", "002", "002", "002", "002", "002", "002", "002",
"002", "002", "002", "002", "002", "002", "002", "002", "002",
"002", "002", "002", "002", "002", "002", "002", "002", "002",
"002", "002", "002", "002", "002", "002", "002", "002", "002",
"002", "002", "002", "002", "002", "002", "002", "002", "002",
"002", "002", "002", "002", "002", "002", "002", "002", "002",
"002", "002", "002", "002", "002", "002", "002", "002", "002",
"002", "002", "002", "002", "002", "002", "002", "002", "002",
"002", "002", "002", "002", "002", "002", "002", "002", "002",
"002", "002", "002", "002", "002", "002", "002", "002", "002",
"002", "002", "002", "002", "002", "002", "002", "002", "002"
), concentration = c(NA, 0.41, 0.41, 0.41, 0.41, 0.41, 0.41,
0.42, 0.42, 0.42, 0.42, 0.42, 0.42, 0.42, 0.39, 0.39, 0.39, 0.39,
0.39, 0.39, 0.39, 0.46, 0.46, 0.46, 0.46, 0.46, 0.46, 0.46, 0.33,
0.33, 0.33, 0.33, 0.33, 0.33, 0.33, 0.4, 0.4, 0.4, 0.4, 0.4,
0.4, 0.4, 0.38, 0.38, 0.38, 0.38, 0.38, 0.38, 0.38, NA, 0.38,
0.38, 0.38, 0.38, 0.38, 0.38, 0.38, 0.38, 0.38, 0.38, 0.38, 0.38,
0.38, 0.37, 0.37, 0.37, 0.37, 0.37, 0.37, 0.37, 0.38, 0.38, 0.38,
0.38, 0.38, 0.38, 0.38, 0.31, 0.31, 0.31, 0.31, 0.31, 0.31, 0.31,
0.33, 0.33, 0.33, 0.33, 0.33, 0.33, 0.33, 0.38, 0.38, 0.38, 0.38,
0.38, 0.38, 0.38), date = structure(c(1514764800, 1514851200,
1514937600, 1515024000, 1515110400, 1515196800, 1515283200, 1515369600,
1515456000, 1515542400, 1515628800, 1515715200, 1515801600, 1515888000,
1515974400, 1516060800, 1516147200, 1516233600, 1516320000, 1516406400,
1516492800, 1516579200, 1516665600, 1516752000, 1516838400, 1516924800,
1517011200, 1517097600, 1517184000, 1517270400, 1517356800, 1517443200,
1517529600, 1517616000, 1517702400, 1517788800, 1517875200, 1517961600,
1518048000, 1518134400, 1518220800, 1518307200, 1518393600, 1518480000,
1518566400, 1518652800, 1518739200, 1518825600, 1518912000, 1514764800,
1514851200, 1514937600, 1515024000, 1515110400, 1515196800, 1515283200,
1515369600, 1515456000, 1515542400, 1515628800, 1515715200, 1515801600,
1515888000, 1515974400, 1516060800, 1516147200, 1516233600, 1516320000,
1516406400, 1516492800, 1516579200, 1516665600, 1516752000, 1516838400,
1516924800, 1517011200, 1517097600, 1517184000, 1517270400, 1517356800,
1517443200, 1517529600, 1517616000, 1517702400, 1517788800, 1517875200,
1517961600, 1518048000, 1518134400, 1518220800, 1518307200, 1518393600,
1518480000, 1518566400, 1518652800, 1518739200, 1518825600, 1518912000
), tzone = "UTC", class = c("POSIXct", "POSIXt"))), row.names = c(NA,
-98L), class = c("tbl_df", "tbl", "data.frame"))
我想将其转换为
tag station start_date end_date `17201` `17204`
<chr> <chr> <date> <date> <dbl> <dbl>
002 61R002 2018-01-02 2018-01-07 0.41 0.38
002 61R002 2018-01-08 2018-01-14 0.42 0.38
002 61R002 2018-01-15 2018-01-21 0.39 0.37
002 61R002 2018-01-22 2018-01-28 0.46 0.38
002 61R002 2018-01-29 2018-02-04 0.33 0.31
002 61R002 2018-02-05 2018-02-11 0.4 0.33
002 61R002 2018-02-12 2018-02-18 0.38 0.38
即进入每个站的非重叠日期间隔。
我怎样才能做到这一点(例如,使用 dplyr 和管道运算符)?
请注意,站点和污染物变量可以取更多值,并且 start_date 和 end_date 之间的间隔不固定。
上一个问题(和 )让我达到了
station pollutant tag concentration start_date end_date
<chr> <int> <chr> <dbl> <dttm> <dttm>
61R002 17201 002 0.41 2018-01-02 00:00:00 2018-01-07 00:00:00
61R002 17201 002 0.42 2018-01-08 00:00:00 2018-01-14 00:00:00
61R002 17201 002 0.39 2018-01-15 00:00:00 2018-01-21 00:00:00
61R002 17201 002 0.46 2018-01-22 00:00:00 2018-01-28 00:00:00
61R002 17201 002 0.33 2018-01-29 00:00:00 2018-02-04 00:00:00
61R002 17201 002 0.4 2018-02-05 00:00:00 2018-02-11 00:00:00
61R002 17201 002 0.38 2018-02-12 00:00:00 2018-02-18 00:00:00
61R002 17204 002 0.38 2018-01-02 00:00:00 2018-01-14 00:00:00
61R002 17204 002 0.37 2018-01-15 00:00:00 2018-01-21 00:00:00
61R002 17204 002 0.38 2018-01-22 00:00:00 2018-01-28 00:00:00
61R002 17204 002 0.31 2018-01-29 00:00:00 2018-02-04 00:00:00
61R002 17204 002 0.33 2018-02-05 00:00:00 2018-02-11 00:00:00
61R002 17204 002 0.38 2018-02-12 00:00:00 2018-02-18 00:00:00
非常感谢。
解决方案的灵感来自于结合这个 and this 。
data_new <- data %>%
arrange(station, pollutant, date) %>%
group_by(tag, station, pollutant, grp = rleid(concentration)) %>%
summarise(concentration = first(concentration), start_date = min(date), end_date = max(date), .groups = 'drop') %>%
select(-grp) %>%
mutate(date_range = interval(start_date, end_date))
xls <- NULL
for (station in unique(data$station))
{
tmp <- data_new %>%
filter(station == !!station)
all_dates <- tmp %>%
select(start_date, end_date, concentration) %>%
pivot_longer(!concentration, names_to = "date_type", values_to="date") %>%
arrange(date) %>%
select(date) %>%
distinct()
tmp2 <- tmp %>%
rowwise() %>%
mutate(bounded_dates = list(filter(all_dates, all_dates$date %within% date_range) %>% pull(date)),
bounded_intervals = list(int_diff(bounded_dates))) %>%
select(tag, station, pollutant, concentration, bounded_intervals) %>%
unnest(bounded_intervals) %>%
filter(bounded_intervals != as.interval(days(1), start = int_start(bounded_intervals))) %>%
mutate(start_date = as_date(int_start(bounded_intervals)),
end_date = as_date(int_end(bounded_intervals))) %>%
pivot_wider(id_cols = -bounded_intervals, names_from = "pollutant", values_from = "concentration")
xls <- bind_rows(xls, tmp2)
}
给我想要的输出
tag station start_date end_date `17201` `17204`
<chr> <chr> <date> <date> <dbl> <dbl>
002 61R002 2018-01-02 2018-01-07 0.41 0.38
002 61R002 2018-01-08 2018-01-14 0.42 0.38
002 61R002 2018-01-15 2018-01-21 0.39 0.37
002 61R002 2018-01-22 2018-01-28 0.46 0.38
002 61R002 2018-01-29 2018-02-04 0.33 0.31
002 61R002 2018-02-05 2018-02-11 0.4 0.33
002 61R002 2018-02-12 2018-02-18 0.38 0.38
我有以下数据
structure(list(station = c("61R002", "61R002", "61R002", "61R002",
"61R002", "61R002", "61R002", "61R002", "61R002", "61R002", "61R002",
"61R002", "61R002", "61R002", "61R002", "61R002", "61R002", "61R002",
"61R002", "61R002", "61R002", "61R002", "61R002", "61R002", "61R002",
"61R002", "61R002", "61R002", "61R002", "61R002", "61R002", "61R002",
"61R002", "61R002", "61R002", "61R002", "61R002", "61R002", "61R002",
"61R002", "61R002", "61R002", "61R002", "61R002", "61R002", "61R002",
"61R002", "61R002", "61R002", "61R002", "61R002", "61R002", "61R002",
"61R002", "61R002", "61R002", "61R002", "61R002", "61R002", "61R002",
"61R002", "61R002", "61R002", "61R002", "61R002", "61R002", "61R002",
"61R002", "61R002", "61R002", "61R002", "61R002", "61R002", "61R002",
"61R002", "61R002", "61R002", "61R002", "61R002", "61R002", "61R002",
"61R002", "61R002", "61R002", "61R002", "61R002", "61R002", "61R002",
"61R002", "61R002", "61R002", "61R002", "61R002", "61R002", "61R002",
"61R002", "61R002", "61R002"), pollutant = c(17201L, 17201L,
17201L, 17201L, 17201L, 17201L, 17201L, 17201L, 17201L, 17201L,
17201L, 17201L, 17201L, 17201L, 17201L, 17201L, 17201L, 17201L,
17201L, 17201L, 17201L, 17201L, 17201L, 17201L, 17201L, 17201L,
17201L, 17201L, 17201L, 17201L, 17201L, 17201L, 17201L, 17201L,
17201L, 17201L, 17201L, 17201L, 17201L, 17201L, 17201L, 17201L,
17201L, 17201L, 17201L, 17201L, 17201L, 17201L, 17201L, 17204L,
17204L, 17204L, 17204L, 17204L, 17204L, 17204L, 17204L, 17204L,
17204L, 17204L, 17204L, 17204L, 17204L, 17204L, 17204L, 17204L,
17204L, 17204L, 17204L, 17204L, 17204L, 17204L, 17204L, 17204L,
17204L, 17204L, 17204L, 17204L, 17204L, 17204L, 17204L, 17204L,
17204L, 17204L, 17204L, 17204L, 17204L, 17204L, 17204L, 17204L,
17204L, 17204L, 17204L, 17204L, 17204L, 17204L, 17204L, 17204L
), tag = c("002", "002", "002", "002", "002", "002", "002", "002",
"002", "002", "002", "002", "002", "002", "002", "002", "002",
"002", "002", "002", "002", "002", "002", "002", "002", "002",
"002", "002", "002", "002", "002", "002", "002", "002", "002",
"002", "002", "002", "002", "002", "002", "002", "002", "002",
"002", "002", "002", "002", "002", "002", "002", "002", "002",
"002", "002", "002", "002", "002", "002", "002", "002", "002",
"002", "002", "002", "002", "002", "002", "002", "002", "002",
"002", "002", "002", "002", "002", "002", "002", "002", "002",
"002", "002", "002", "002", "002", "002", "002", "002", "002",
"002", "002", "002", "002", "002", "002", "002", "002", "002"
), concentration = c(NA, 0.41, 0.41, 0.41, 0.41, 0.41, 0.41,
0.42, 0.42, 0.42, 0.42, 0.42, 0.42, 0.42, 0.39, 0.39, 0.39, 0.39,
0.39, 0.39, 0.39, 0.46, 0.46, 0.46, 0.46, 0.46, 0.46, 0.46, 0.33,
0.33, 0.33, 0.33, 0.33, 0.33, 0.33, 0.4, 0.4, 0.4, 0.4, 0.4,
0.4, 0.4, 0.38, 0.38, 0.38, 0.38, 0.38, 0.38, 0.38, NA, 0.38,
0.38, 0.38, 0.38, 0.38, 0.38, 0.38, 0.38, 0.38, 0.38, 0.38, 0.38,
0.38, 0.37, 0.37, 0.37, 0.37, 0.37, 0.37, 0.37, 0.38, 0.38, 0.38,
0.38, 0.38, 0.38, 0.38, 0.31, 0.31, 0.31, 0.31, 0.31, 0.31, 0.31,
0.33, 0.33, 0.33, 0.33, 0.33, 0.33, 0.33, 0.38, 0.38, 0.38, 0.38,
0.38, 0.38, 0.38), date = structure(c(1514764800, 1514851200,
1514937600, 1515024000, 1515110400, 1515196800, 1515283200, 1515369600,
1515456000, 1515542400, 1515628800, 1515715200, 1515801600, 1515888000,
1515974400, 1516060800, 1516147200, 1516233600, 1516320000, 1516406400,
1516492800, 1516579200, 1516665600, 1516752000, 1516838400, 1516924800,
1517011200, 1517097600, 1517184000, 1517270400, 1517356800, 1517443200,
1517529600, 1517616000, 1517702400, 1517788800, 1517875200, 1517961600,
1518048000, 1518134400, 1518220800, 1518307200, 1518393600, 1518480000,
1518566400, 1518652800, 1518739200, 1518825600, 1518912000, 1514764800,
1514851200, 1514937600, 1515024000, 1515110400, 1515196800, 1515283200,
1515369600, 1515456000, 1515542400, 1515628800, 1515715200, 1515801600,
1515888000, 1515974400, 1516060800, 1516147200, 1516233600, 1516320000,
1516406400, 1516492800, 1516579200, 1516665600, 1516752000, 1516838400,
1516924800, 1517011200, 1517097600, 1517184000, 1517270400, 1517356800,
1517443200, 1517529600, 1517616000, 1517702400, 1517788800, 1517875200,
1517961600, 1518048000, 1518134400, 1518220800, 1518307200, 1518393600,
1518480000, 1518566400, 1518652800, 1518739200, 1518825600, 1518912000
), tzone = "UTC", class = c("POSIXct", "POSIXt"))), row.names = c(NA,
-98L), class = c("tbl_df", "tbl", "data.frame"))
我想将其转换为
tag station start_date end_date `17201` `17204`
<chr> <chr> <date> <date> <dbl> <dbl>
002 61R002 2018-01-02 2018-01-07 0.41 0.38
002 61R002 2018-01-08 2018-01-14 0.42 0.38
002 61R002 2018-01-15 2018-01-21 0.39 0.37
002 61R002 2018-01-22 2018-01-28 0.46 0.38
002 61R002 2018-01-29 2018-02-04 0.33 0.31
002 61R002 2018-02-05 2018-02-11 0.4 0.33
002 61R002 2018-02-12 2018-02-18 0.38 0.38
即进入每个站的非重叠日期间隔。
我怎样才能做到这一点(例如,使用 dplyr 和管道运算符)?
请注意,站点和污染物变量可以取更多值,并且 start_date 和 end_date 之间的间隔不固定。
上一个问题(和
station pollutant tag concentration start_date end_date
<chr> <int> <chr> <dbl> <dttm> <dttm>
61R002 17201 002 0.41 2018-01-02 00:00:00 2018-01-07 00:00:00
61R002 17201 002 0.42 2018-01-08 00:00:00 2018-01-14 00:00:00
61R002 17201 002 0.39 2018-01-15 00:00:00 2018-01-21 00:00:00
61R002 17201 002 0.46 2018-01-22 00:00:00 2018-01-28 00:00:00
61R002 17201 002 0.33 2018-01-29 00:00:00 2018-02-04 00:00:00
61R002 17201 002 0.4 2018-02-05 00:00:00 2018-02-11 00:00:00
61R002 17201 002 0.38 2018-02-12 00:00:00 2018-02-18 00:00:00
61R002 17204 002 0.38 2018-01-02 00:00:00 2018-01-14 00:00:00
61R002 17204 002 0.37 2018-01-15 00:00:00 2018-01-21 00:00:00
61R002 17204 002 0.38 2018-01-22 00:00:00 2018-01-28 00:00:00
61R002 17204 002 0.31 2018-01-29 00:00:00 2018-02-04 00:00:00
61R002 17204 002 0.33 2018-02-05 00:00:00 2018-02-11 00:00:00
61R002 17204 002 0.38 2018-02-12 00:00:00 2018-02-18 00:00:00
非常感谢。
解决方案的灵感来自于结合这个
data_new <- data %>%
arrange(station, pollutant, date) %>%
group_by(tag, station, pollutant, grp = rleid(concentration)) %>%
summarise(concentration = first(concentration), start_date = min(date), end_date = max(date), .groups = 'drop') %>%
select(-grp) %>%
mutate(date_range = interval(start_date, end_date))
xls <- NULL
for (station in unique(data$station))
{
tmp <- data_new %>%
filter(station == !!station)
all_dates <- tmp %>%
select(start_date, end_date, concentration) %>%
pivot_longer(!concentration, names_to = "date_type", values_to="date") %>%
arrange(date) %>%
select(date) %>%
distinct()
tmp2 <- tmp %>%
rowwise() %>%
mutate(bounded_dates = list(filter(all_dates, all_dates$date %within% date_range) %>% pull(date)),
bounded_intervals = list(int_diff(bounded_dates))) %>%
select(tag, station, pollutant, concentration, bounded_intervals) %>%
unnest(bounded_intervals) %>%
filter(bounded_intervals != as.interval(days(1), start = int_start(bounded_intervals))) %>%
mutate(start_date = as_date(int_start(bounded_intervals)),
end_date = as_date(int_end(bounded_intervals))) %>%
pivot_wider(id_cols = -bounded_intervals, names_from = "pollutant", values_from = "concentration")
xls <- bind_rows(xls, tmp2)
}
给我想要的输出
tag station start_date end_date `17201` `17204`
<chr> <chr> <date> <date> <dbl> <dbl>
002 61R002 2018-01-02 2018-01-07 0.41 0.38
002 61R002 2018-01-08 2018-01-14 0.42 0.38
002 61R002 2018-01-15 2018-01-21 0.39 0.37
002 61R002 2018-01-22 2018-01-28 0.46 0.38
002 61R002 2018-01-29 2018-02-04 0.33 0.31
002 61R002 2018-02-05 2018-02-11 0.4 0.33
002 61R002 2018-02-12 2018-02-18 0.38 0.38