如何在每个站点选取最近的样本,但若某个特定月份有样本则优先选取该月份的样本?
How do I select the most recent sample UNLESS there is a sample in a certain month?
我有一个包含采样点和日期的数据集。我试图只在每个站点保留最新的八月样本,但如果没有八月样本,我只想保留最新的样本(无论是在八月之前还是之后)。月份范围从 6 月到 9 月。
我不确定如何让 R 根据其他行中的内容来决定保留哪一行。
我试过选项 a,它基于对保留最新样本的问题的回答:
# Option a: keep the newest row of every site, whatever month it falls in.
# Rows are ordered newest-first up front; grouping afterwards makes slice()
# pick the first (i.e. newest) row within each site.
att <- att %>%
  arrange(desc(SAMPLE_DATE)) %>%
  group_by(siteID) %>%
  slice(1L)
但这不一定能保留最近的 8 月样本。
选项 b 有点乱,但我认为它更接近我想要的:
# Option b: count rows per site and collect every row belonging to a site
# that was sampled more than once.
n_occur <- data.frame(table(att$siteID))
dupes <- att[att$siteID %in% n_occur$Var1[n_occur$Freq > 1],]
# Flag the rows to keep. For multi-sample sites a row is flagged only when it
# is not a September sample and has a chlorophyll value; rows of single-sample
# sites are always flagged. Rows matching neither rule get keep = NA and are
# removed by the filter below, so a multi-sample site with no qualifying row
# disappears from the result entirely.
# NOTE: the rule excludes September rather than selecting August, so the row
# that survives may come from any month other than September.
m_att <- att %>%
mutate(keep=case_when(
siteID %in% dupes$siteID & month(SAMPLE_DATE) != 9 & !is.na(`CHLOROPHYLL A_OW_TOTAL`) ~ "yes",
!(siteID %in% dupes$siteID) ~ "yes")) %>%
arrange(siteID,desc(SAMPLE_DATE)) %>%
filter(keep=="yes")
# Per-site row counts after filtering (not used again in this snippet).
m_occur <- data.frame(table(m_att$siteID))
# Rows are sorted newest-first within each site, so keeping the first
# occurrence of each siteID keeps the latest remaining sample.
m_att <- m_att[!duplicated(m_att$siteID),]
# Drop rows that are exact duplicates of one another.
att <- m_att %>%
distinct()
但这会漏掉一些站点:经过 filter 之后,有些站点被完全丢弃了。
例如,这是我的一大块数据:
SAMPLE_DATE siteID
1 2020-07-22 LAK20_NY-10001
2 <NA> LAK20_NY-10002
3 <NA> LAK20_NY-10003
4 <NA> LAK20_NY-10004
5 <NA> LAK20_NY-10005
6 <NA> LAK20_NY-10006
7 2020-09-01 LAK20_NY-10007
8 <NA> LAK20_NY-10008
9 <NA> LAK20_NY-10009
10 <NA> LAK20_NY-10010
11 2020-07-07 LAK20_NY-10011
12 2020-07-09 LAK20_NY-10012
13 2020-08-03 LAK20_NY-10013
14 2020-09-01 LAK20_NY-10014
15 <NA> LAK20_NY-10015
16 2020-09-20 LAK20_NY-10016
17 2020-08-09 LAK20_NY-10016
18 2020-07-26 LAK20_NY-10016
19 2020-08-23 LAK20_NY-10016
20 2020-06-14 LAK20_NY-10016
21 2020-09-08 LAK20_NY-10016
22 2020-06-28 LAK20_NY-10016
23 2020-07-12 LAK20_NY-10016
24 2020-09-02 LAK20_NY-10016
25 2020-07-22 LAK20_NY-10017
26 2020-09-30 LAK20_NY-10018
27 <NA> LAK20_NY-10019
28 2020-07-08 LAK20_NY-10020
29 2020-08-04 LAK20_NY-10021
30 <NA> LAK20_NY-10022
31 <NA> LAK20_NY-10023
32 2020-07-07 LAK20_NY-10024
33 2021-09-21 LAK20_NY-10025
34 2021-07-27 LAK20_NY-10025
35 <NA> LAK20_NY-10026
36 2020-06-20 LAK20_NY-10027
37 2020-08-02 LAK20_NY-10027
38 2020-06-07 LAK20_NY-10027
39 2020-08-15 LAK20_NY-10027
40 2020-09-06 LAK20_NY-10027
41 2020-07-18 LAK20_NY-10027
42 2020-08-31 LAK20_NY-10027
43 2020-08-30 LAK20_NY-10027
44 2020-06-21 LAK20_NY-10027
45 2020-09-12 LAK20_NY-10027
46 2020-08-16 LAK20_NY-10027
47 2021-09-13 LAK20_NY-10027
48 2020-09-20 LAK20_NY-10027
49 2020-07-19 LAK20_NY-10027
50 2020-07-05 LAK20_NY-10027
51 2020-08-01 LAK20_NY-10027
选项 a 会为 LAK20_NY-10016 保留错误的样本(我希望保留 2020-08-23 的样本,而不是 2020-09-20 的样本)。选项 b 对该湖有效,但对于 LAK20_NY-10025,我希望保留 2021-09-21 的样本,而选项 b 却不会保留该站点的任何一次采样。
下面是一个解决方案:先为每条记录标注月份,并标注该湖泊是否有 8 月的采样记录,然后根据这些值进行过滤。
library(tidyverse)
library(lubridate)
# Example data: a tibble with 51 rows and two character columns.
# Missing sample dates are stored as the literal string "<NA>", not as a real
# NA value, so they are not treated as missing until the column is converted.
att <- structure(list(SAMPLE_DATE = c("2020-07-22", "<NA>", "<NA>", "<NA>", "<NA>", "<NA>", "2020-09-01", "<NA>", "<NA>", "<NA>", "2020-07-07", "2020-07-09", "2020-08-03", "2020-09-01", "<NA>", "2020-09-20", "2020-08-09", "2020-07-26", "2020-08-23", "2020-06-14", "2020-09-08", "2020-06-28", "2020-07-12", "2020-09-02", "2020-07-22", "2020-09-30", "<NA>", "2020-07-08", "2020-08-04", "<NA>", "<NA>", "2020-07-07", "2021-09-21", "2021-07-27", "<NA>", "2020-06-20", "2020-08-02", "2020-06-07", "2020-08-15", "2020-09-06", "2020-07-18", "2020-08-31", "2020-08-30", "2020-06-21", "2020-09-12", "2020-08-16", "2021-09-13", "2020-09-20", "2020-07-19", "2020-07-05", "2020-08-01"), siteID = c("LAK20_NY-10001", "LAK20_NY-10002", "LAK20_NY-10003", "LAK20_NY-10004", "LAK20_NY-10005", "LAK20_NY-10006", "LAK20_NY-10007", "LAK20_NY-10008", "LAK20_NY-10009", "LAK20_NY-10010", "LAK20_NY-10011", "LAK20_NY-10012", "LAK20_NY-10013", "LAK20_NY-10014", "LAK20_NY-10015", "LAK20_NY-10016", "LAK20_NY-10016", "LAK20_NY-10016", "LAK20_NY-10016", "LAK20_NY-10016", "LAK20_NY-10016", "LAK20_NY-10016", "LAK20_NY-10016", "LAK20_NY-10016", "LAK20_NY-10017", "LAK20_NY-10018", "LAK20_NY-10019", "LAK20_NY-10020", "LAK20_NY-10021", "LAK20_NY-10022", "LAK20_NY-10023", "LAK20_NY-10024", "LAK20_NY-10025", "LAK20_NY-10025", "LAK20_NY-10026", "LAK20_NY-10027", "LAK20_NY-10027", "LAK20_NY-10027", "LAK20_NY-10027", "LAK20_NY-10027", "LAK20_NY-10027", "LAK20_NY-10027", "LAK20_NY-10027", "LAK20_NY-10027", "LAK20_NY-10027", "LAK20_NY-10027", "LAK20_NY-10027", "LAK20_NY-10027", "LAK20_NY-10027", "LAK20_NY-10027", "LAK20_NY-10027")), row.names = c(NA, -51L), class = c("tbl_df", "tbl", "data.frame"))
# Keep one row per site: the latest August sample when the site has one,
# otherwise the latest sample from any month.
att %>%
  mutate(
    # SAMPLE_DATE is character here and marks missing dates with the literal
    # string "<NA>". Map that placeholder to a real NA before parsing so ymd()
    # does not warn about it; other unparseable values still trigger a warning.
    SAMPLE_DATE = ymd(na_if(SAMPLE_DATE, "<NA>")),
    month = month(SAMPLE_DATE) # month number of the parsed date
  ) %>%
  group_by(siteID) %>%
  mutate(has_aug = 8 %in% month) %>% # does this site have any August date?
  # Sites with an August date keep only their August rows; all other sites
  # keep every row. Equivalent to `has_aug & month == 8 | !has_aug`.
  filter(!has_aug | month == 8) %>%
  # Latest remaining date per site. TRUE/FALSE are spelled out because T and F
  # are ordinary variables that can be reassigned.
  # NOTE(review): whether rows with an NA date survive this step depends on the
  # installed dplyr version; to drop them explicitly, add
  # `filter(!is.na(SAMPLE_DATE))` before this call.
  slice_max(order_by = SAMPLE_DATE, n = 1, with_ties = FALSE) %>%
  select(SAMPLE_DATE, siteID)
#> # A tibble: 27 x 2
#> # Groups: siteID [27]
#> SAMPLE_DATE siteID
#> <date> <chr>
#> 1 2020-07-22 LAK20_NY-10001
#> 2 NA LAK20_NY-10002
#> 3 NA LAK20_NY-10003
#> 4 NA LAK20_NY-10004
#> 5 NA LAK20_NY-10005
#> 6 NA LAK20_NY-10006
#> 7 2020-09-01 LAK20_NY-10007
#> 8 NA LAK20_NY-10008
#> 9 NA LAK20_NY-10009
#> 10 NA LAK20_NY-10010
#> # ... with 17 more rows
由 reprex package (v2.0.1)
创建于 2022-02-03
我有一个包含采样点和日期的数据集。我试图只在每个站点保留最新的八月样本,但如果没有八月样本,我只想保留最新的样本(无论是在八月之前还是之后)。月份范围从 6 月到 9 月。
我不确定如何让 R 根据其他行中的内容来决定保留哪一行。
我试过选项 a,它基于对保留最新样本的问题的回答:
# Option a: keep the newest row of every site, whatever month it falls in.
# Rows are ordered newest-first up front; grouping afterwards makes slice()
# pick the first (i.e. newest) row within each site.
att <- att %>%
  arrange(desc(SAMPLE_DATE)) %>%
  group_by(siteID) %>%
  slice(1L)
但这不一定能保留最近的 8 月样本。
选项 b 有点乱,但我认为它更接近我想要的:
# Option b: count rows per site and collect every row belonging to a site
# that was sampled more than once.
n_occur <- data.frame(table(att$siteID))
dupes <- att[att$siteID %in% n_occur$Var1[n_occur$Freq > 1],]
# Flag the rows to keep. For multi-sample sites a row is flagged only when it
# is not a September sample and has a chlorophyll value; rows of single-sample
# sites are always flagged. Rows matching neither rule get keep = NA and are
# removed by the filter below, so a multi-sample site with no qualifying row
# disappears from the result entirely.
# NOTE: the rule excludes September rather than selecting August, so the row
# that survives may come from any month other than September.
m_att <- att %>%
mutate(keep=case_when(
siteID %in% dupes$siteID & month(SAMPLE_DATE) != 9 & !is.na(`CHLOROPHYLL A_OW_TOTAL`) ~ "yes",
!(siteID %in% dupes$siteID) ~ "yes")) %>%
arrange(siteID,desc(SAMPLE_DATE)) %>%
filter(keep=="yes")
# Per-site row counts after filtering (not used again in this snippet).
m_occur <- data.frame(table(m_att$siteID))
# Rows are sorted newest-first within each site, so keeping the first
# occurrence of each siteID keeps the latest remaining sample.
m_att <- m_att[!duplicated(m_att$siteID),]
# Drop rows that are exact duplicates of one another.
att <- m_att %>%
distinct()
但这会漏掉一些站点:经过 filter 之后,有些站点被完全丢弃了。
例如,这是我的一大块数据:
SAMPLE_DATE siteID
1 2020-07-22 LAK20_NY-10001
2 <NA> LAK20_NY-10002
3 <NA> LAK20_NY-10003
4 <NA> LAK20_NY-10004
5 <NA> LAK20_NY-10005
6 <NA> LAK20_NY-10006
7 2020-09-01 LAK20_NY-10007
8 <NA> LAK20_NY-10008
9 <NA> LAK20_NY-10009
10 <NA> LAK20_NY-10010
11 2020-07-07 LAK20_NY-10011
12 2020-07-09 LAK20_NY-10012
13 2020-08-03 LAK20_NY-10013
14 2020-09-01 LAK20_NY-10014
15 <NA> LAK20_NY-10015
16 2020-09-20 LAK20_NY-10016
17 2020-08-09 LAK20_NY-10016
18 2020-07-26 LAK20_NY-10016
19 2020-08-23 LAK20_NY-10016
20 2020-06-14 LAK20_NY-10016
21 2020-09-08 LAK20_NY-10016
22 2020-06-28 LAK20_NY-10016
23 2020-07-12 LAK20_NY-10016
24 2020-09-02 LAK20_NY-10016
25 2020-07-22 LAK20_NY-10017
26 2020-09-30 LAK20_NY-10018
27 <NA> LAK20_NY-10019
28 2020-07-08 LAK20_NY-10020
29 2020-08-04 LAK20_NY-10021
30 <NA> LAK20_NY-10022
31 <NA> LAK20_NY-10023
32 2020-07-07 LAK20_NY-10024
33 2021-09-21 LAK20_NY-10025
34 2021-07-27 LAK20_NY-10025
35 <NA> LAK20_NY-10026
36 2020-06-20 LAK20_NY-10027
37 2020-08-02 LAK20_NY-10027
38 2020-06-07 LAK20_NY-10027
39 2020-08-15 LAK20_NY-10027
40 2020-09-06 LAK20_NY-10027
41 2020-07-18 LAK20_NY-10027
42 2020-08-31 LAK20_NY-10027
43 2020-08-30 LAK20_NY-10027
44 2020-06-21 LAK20_NY-10027
45 2020-09-12 LAK20_NY-10027
46 2020-08-16 LAK20_NY-10027
47 2021-09-13 LAK20_NY-10027
48 2020-09-20 LAK20_NY-10027
49 2020-07-19 LAK20_NY-10027
50 2020-07-05 LAK20_NY-10027
51 2020-08-01 LAK20_NY-10027
选项 a 会为 LAK20_NY-10016 保留错误的样本(我希望保留 2020-08-23 的样本,而不是 2020-09-20 的样本)。选项 b 对该湖有效,但对于 LAK20_NY-10025,我希望保留 2021-09-21 的样本,而选项 b 却不会保留该站点的任何一次采样。
下面是一个解决方案:先为每条记录标注月份,并标注该湖泊是否有 8 月的采样记录,然后根据这些值进行过滤。
library(tidyverse)
library(lubridate)
# Example data: a tibble with 51 rows and two character columns.
# Missing sample dates are stored as the literal string "<NA>", not as a real
# NA value, so they are not treated as missing until the column is converted.
att <- structure(list(SAMPLE_DATE = c("2020-07-22", "<NA>", "<NA>", "<NA>", "<NA>", "<NA>", "2020-09-01", "<NA>", "<NA>", "<NA>", "2020-07-07", "2020-07-09", "2020-08-03", "2020-09-01", "<NA>", "2020-09-20", "2020-08-09", "2020-07-26", "2020-08-23", "2020-06-14", "2020-09-08", "2020-06-28", "2020-07-12", "2020-09-02", "2020-07-22", "2020-09-30", "<NA>", "2020-07-08", "2020-08-04", "<NA>", "<NA>", "2020-07-07", "2021-09-21", "2021-07-27", "<NA>", "2020-06-20", "2020-08-02", "2020-06-07", "2020-08-15", "2020-09-06", "2020-07-18", "2020-08-31", "2020-08-30", "2020-06-21", "2020-09-12", "2020-08-16", "2021-09-13", "2020-09-20", "2020-07-19", "2020-07-05", "2020-08-01"), siteID = c("LAK20_NY-10001", "LAK20_NY-10002", "LAK20_NY-10003", "LAK20_NY-10004", "LAK20_NY-10005", "LAK20_NY-10006", "LAK20_NY-10007", "LAK20_NY-10008", "LAK20_NY-10009", "LAK20_NY-10010", "LAK20_NY-10011", "LAK20_NY-10012", "LAK20_NY-10013", "LAK20_NY-10014", "LAK20_NY-10015", "LAK20_NY-10016", "LAK20_NY-10016", "LAK20_NY-10016", "LAK20_NY-10016", "LAK20_NY-10016", "LAK20_NY-10016", "LAK20_NY-10016", "LAK20_NY-10016", "LAK20_NY-10016", "LAK20_NY-10017", "LAK20_NY-10018", "LAK20_NY-10019", "LAK20_NY-10020", "LAK20_NY-10021", "LAK20_NY-10022", "LAK20_NY-10023", "LAK20_NY-10024", "LAK20_NY-10025", "LAK20_NY-10025", "LAK20_NY-10026", "LAK20_NY-10027", "LAK20_NY-10027", "LAK20_NY-10027", "LAK20_NY-10027", "LAK20_NY-10027", "LAK20_NY-10027", "LAK20_NY-10027", "LAK20_NY-10027", "LAK20_NY-10027", "LAK20_NY-10027", "LAK20_NY-10027", "LAK20_NY-10027", "LAK20_NY-10027", "LAK20_NY-10027", "LAK20_NY-10027", "LAK20_NY-10027")), row.names = c(NA, -51L), class = c("tbl_df", "tbl", "data.frame"))
# Keep one row per site: the latest August sample when the site has one,
# otherwise the latest sample from any month.
att %>%
  mutate(
    # SAMPLE_DATE is character here and marks missing dates with the literal
    # string "<NA>". Map that placeholder to a real NA before parsing so ymd()
    # does not warn about it; other unparseable values still trigger a warning.
    SAMPLE_DATE = ymd(na_if(SAMPLE_DATE, "<NA>")),
    month = month(SAMPLE_DATE) # month number of the parsed date
  ) %>%
  group_by(siteID) %>%
  mutate(has_aug = 8 %in% month) %>% # does this site have any August date?
  # Sites with an August date keep only their August rows; all other sites
  # keep every row. Equivalent to `has_aug & month == 8 | !has_aug`.
  filter(!has_aug | month == 8) %>%
  # Latest remaining date per site. TRUE/FALSE are spelled out because T and F
  # are ordinary variables that can be reassigned.
  # NOTE(review): whether rows with an NA date survive this step depends on the
  # installed dplyr version; to drop them explicitly, add
  # `filter(!is.na(SAMPLE_DATE))` before this call.
  slice_max(order_by = SAMPLE_DATE, n = 1, with_ties = FALSE) %>%
  select(SAMPLE_DATE, siteID)
#> # A tibble: 27 x 2
#> # Groups: siteID [27]
#> SAMPLE_DATE siteID
#> <date> <chr>
#> 1 2020-07-22 LAK20_NY-10001
#> 2 NA LAK20_NY-10002
#> 3 NA LAK20_NY-10003
#> 4 NA LAK20_NY-10004
#> 5 NA LAK20_NY-10005
#> 6 NA LAK20_NY-10006
#> 7 2020-09-01 LAK20_NY-10007
#> 8 NA LAK20_NY-10008
#> 9 NA LAK20_NY-10009
#> 10 NA LAK20_NY-10010
#> # ... with 17 more rows
由 reprex package (v2.0.1)
创建于 2022-02-03