如何在 R 中把按日期排序的数据集里某个变量的连续重复值(块)合并为起止日期区间?
How to compact block-duplicates of a variable in a date-ordered dataset in R?
我有以下数据
# Example input (same values as the original dput() output), bound to `df`
# because the pipeline further down reads from `df`.
# 20 daily observations (2018-01-01 .. 2018-01-20, UTC) for a single
# station/pollutant/tag; the first concentration is missing.
df <- structure(
  list(
    station = rep("61WOL2", 20L),
    pollutant = rep(17201L, 20L),
    tag = rep("002", 20L),
    # NA, then runs of 6 x 0.42, 7 x 0.72 and 6 x 0.31
    concentration = c(NA, rep(c(0.42, 0.72, 0.31), times = c(6L, 7L, 6L))),
    # One timestamp per day, 86400 seconds apart, starting 2018-01-01 UTC
    date = structure(
      1514764800 + 86400 * (0:19),
      tzone = "UTC",
      class = c("POSIXct", "POSIXt")
    )
  ),
  row.names = c(NA, -20L),
  class = c("tbl_df", "tbl", "data.frame")
)
显示为
# A tibble: 20 x 5
station pollutant tag concentration date
<chr> <int> <chr> <dbl> <dttm>
1 61WOL2 17201 002 NA 2018-01-01 00:00:00
2 61WOL2 17201 002 0.42 2018-01-02 00:00:00
3 61WOL2 17201 002 0.42 2018-01-03 00:00:00
4 61WOL2 17201 002 0.42 2018-01-04 00:00:00
5 61WOL2 17201 002 0.42 2018-01-05 00:00:00
6 61WOL2 17201 002 0.42 2018-01-06 00:00:00
7 61WOL2 17201 002 0.42 2018-01-07 00:00:00
8 61WOL2 17201 002 0.72 2018-01-08 00:00:00
9 61WOL2 17201 002 0.72 2018-01-09 00:00:00
10 61WOL2 17201 002 0.72 2018-01-10 00:00:00
11 61WOL2 17201 002 0.72 2018-01-11 00:00:00
12 61WOL2 17201 002 0.72 2018-01-12 00:00:00
13 61WOL2 17201 002 0.72 2018-01-13 00:00:00
14 61WOL2 17201 002 0.72 2018-01-14 00:00:00
15 61WOL2 17201 002 0.31 2018-01-15 00:00:00
16 61WOL2 17201 002 0.31 2018-01-16 00:00:00
17 61WOL2 17201 002 0.31 2018-01-17 00:00:00
18 61WOL2 17201 002 0.31 2018-01-18 00:00:00
19 61WOL2 17201 002 0.31 2018-01-19 00:00:00
20 61WOL2 17201 002 0.31 2018-01-20 00:00:00
我想将其转换为
station pollutant tag concentration start_date end_date
<chr> <int> <chr> <dbl> <dttm> <dttm>
61WOL2 17201 002 0.42 2018-01-02 00:00:00 2018-01-07 00:00:00
61WOL2 17201 002 0.72 2018-01-08 00:00:00 2018-01-14 00:00:00
61WOL2 17201 002 0.31 2018-01-15 00:00:00 2018-01-20 00:00:00
请注意,变量 'station' 和 'pollutant' 可以取多个值。
我怎样才能做到这一点(例如,使用 dplyr 和管道运算符)?
非常感谢。
去掉 concentration 为 NA 的行,并对 concentration 每一段连续相同的值(即每个"游程",由 rleid 编号,而不是每个唯一值)取最小和最大日期,作为 start_date 和 end_date。
library(dplyr)
library(data.table)
# Collapse each run of consecutive identical `concentration` values into one
# row per station/pollutant/tag, carrying the first and last date of the run.
df %>%
  # Sort so that every series is contiguous and in date order; run detection
  # below depends on row order.
  arrange(station, pollutant, tag, date) %>%
  # Number the runs BEFORE dropping NAs: a missing observation then ends a
  # run, so equal values on either side of a gap are not merged into one span.
  # Including the keys starts a new run whenever the series changes.
  mutate(grp = rleid(station, pollutant, tag, concentration)) %>%
  filter(!is.na(concentration)) %>%
  group_by(station, pollutant, tag, grp) %>%
  summarise(
    concentration = first(concentration),
    start_date = min(date),
    end_date = max(date),
    .groups = "drop"
  ) %>%
  # `grp` is only a helper id for the runs; it is not part of the result.
  select(-grp)
# station pollutant tag concentration start_date end_date
# <chr> <int> <chr> <dbl> <dttm> <dttm>
#1 61WOL2 17201 002 0.42 2018-01-02 00:00:00 2018-01-07 00:00:00
#2 61WOL2 17201 002 0.72 2018-01-08 00:00:00 2018-01-14 00:00:00
#3 61WOL2 17201 002 0.31 2018-01-15 00:00:00 2018-01-20 00:00:00
我有以下数据
# Example input (same values as the original dput() output), bound to `df`
# because the pipeline further down reads from `df`.
# 20 daily observations (2018-01-01 .. 2018-01-20, UTC) for a single
# station/pollutant/tag; the first concentration is missing.
df <- structure(
  list(
    station = rep("61WOL2", 20L),
    pollutant = rep(17201L, 20L),
    tag = rep("002", 20L),
    # NA, then runs of 6 x 0.42, 7 x 0.72 and 6 x 0.31
    concentration = c(NA, rep(c(0.42, 0.72, 0.31), times = c(6L, 7L, 6L))),
    # One timestamp per day, 86400 seconds apart, starting 2018-01-01 UTC
    date = structure(
      1514764800 + 86400 * (0:19),
      tzone = "UTC",
      class = c("POSIXct", "POSIXt")
    )
  ),
  row.names = c(NA, -20L),
  class = c("tbl_df", "tbl", "data.frame")
)
显示为
# A tibble: 20 x 5
station pollutant tag concentration date
<chr> <int> <chr> <dbl> <dttm>
1 61WOL2 17201 002 NA 2018-01-01 00:00:00
2 61WOL2 17201 002 0.42 2018-01-02 00:00:00
3 61WOL2 17201 002 0.42 2018-01-03 00:00:00
4 61WOL2 17201 002 0.42 2018-01-04 00:00:00
5 61WOL2 17201 002 0.42 2018-01-05 00:00:00
6 61WOL2 17201 002 0.42 2018-01-06 00:00:00
7 61WOL2 17201 002 0.42 2018-01-07 00:00:00
8 61WOL2 17201 002 0.72 2018-01-08 00:00:00
9 61WOL2 17201 002 0.72 2018-01-09 00:00:00
10 61WOL2 17201 002 0.72 2018-01-10 00:00:00
11 61WOL2 17201 002 0.72 2018-01-11 00:00:00
12 61WOL2 17201 002 0.72 2018-01-12 00:00:00
13 61WOL2 17201 002 0.72 2018-01-13 00:00:00
14 61WOL2 17201 002 0.72 2018-01-14 00:00:00
15 61WOL2 17201 002 0.31 2018-01-15 00:00:00
16 61WOL2 17201 002 0.31 2018-01-16 00:00:00
17 61WOL2 17201 002 0.31 2018-01-17 00:00:00
18 61WOL2 17201 002 0.31 2018-01-18 00:00:00
19 61WOL2 17201 002 0.31 2018-01-19 00:00:00
20 61WOL2 17201 002 0.31 2018-01-20 00:00:00
我想将其转换为
station pollutant tag concentration start_date end_date
<chr> <int> <chr> <dbl> <dttm> <dttm>
61WOL2 17201 002 0.42 2018-01-02 00:00:00 2018-01-07 00:00:00
61WOL2 17201 002 0.72 2018-01-08 00:00:00 2018-01-14 00:00:00
61WOL2 17201 002 0.31 2018-01-15 00:00:00 2018-01-20 00:00:00
请注意,变量 'station' 和 'pollutant' 可以取多个值。
我怎样才能做到这一点(例如,使用 dplyr 和管道运算符)?
非常感谢。
去掉 concentration 为 NA 的行,并对 concentration 每一段连续相同的值(即每个"游程",由 rleid 编号,而不是每个唯一值)取最小和最大日期,作为 start_date 和 end_date。
library(dplyr)
library(data.table)
# Collapse each run of consecutive identical `concentration` values into one
# row per station/pollutant/tag, carrying the first and last date of the run.
df %>%
  # Sort so that every series is contiguous and in date order; run detection
  # below depends on row order.
  arrange(station, pollutant, tag, date) %>%
  # Number the runs BEFORE dropping NAs: a missing observation then ends a
  # run, so equal values on either side of a gap are not merged into one span.
  # Including the keys starts a new run whenever the series changes.
  mutate(grp = rleid(station, pollutant, tag, concentration)) %>%
  filter(!is.na(concentration)) %>%
  group_by(station, pollutant, tag, grp) %>%
  summarise(
    concentration = first(concentration),
    start_date = min(date),
    end_date = max(date),
    .groups = "drop"
  ) %>%
  # `grp` is only a helper id for the runs; it is not part of the result.
  select(-grp)
# station pollutant tag concentration start_date end_date
# <chr> <int> <chr> <dbl> <dttm> <dttm>
#1 61WOL2 17201 002 0.42 2018-01-02 00:00:00 2018-01-07 00:00:00
#2 61WOL2 17201 002 0.72 2018-01-08 00:00:00 2018-01-14 00:00:00
#3 61WOL2 17201 002 0.31 2018-01-15 00:00:00 2018-01-20 00:00:00