如何选取最新的样本——除非某个特定月份有样本(此时优先选取该月份的样本)?

How do I select the most recent sample UNLESS there is a sample in a certain month?

我有一个包含采样点和日期的数据集。我试图只在每个站点保留最新的八月样本,但如果没有八月样本,我只想保留最新的样本(无论是在八月之前还是之后)。月份范围从 6 月到 9 月。

我不确定如何让 R 根据其他行中的内容来决定保留哪一行。

我试过选项 a,它基于对保留最新样本的问题的回答:

# Option a: keep only the newest sample of every site, whatever its month.
# Rows are sorted newest-first over the whole table, then the first row of
# each site is taken (missing dates sort last).
att <- att %>%
  arrange(desc(SAMPLE_DATE)) %>%
  group_by(siteID) %>%
  slice_head(n = 1)

但这不一定会保留最新的 8 月样本。

选项b有点乱,但我认为更接近我想要的:

# Option b: flag the rows worth keeping, then take the first remaining row
# of each site.

# get duplicates (multiple samples of same lake)
# n_occur counts the rows per siteID; dupes holds every row whose siteID
# occurs more than once.
n_occur <- data.frame(table(att$siteID))
dupes <- att[att$siteID %in% n_occur$Var1[n_occur$Freq > 1],]

# keep most recent august sample
# NOTE(review): the first condition only excludes September (month != 9); it
# does not require the row to be from August.
m_att <- att %>%
  mutate(keep=case_when(
    # multi-sample sites: keep rows that are not from September and that have
    # a non-missing chlorophyll value
    siteID %in% dupes$siteID & month(SAMPLE_DATE) != 9 & !is.na(`CHLOROPHYLL A_OW_TOTAL`) ~ "yes",
    # single-sample sites: always keep their only row
    !(siteID %in% dupes$siteID) ~ "yes")) %>%
  arrange(siteID,desc(SAMPLE_DATE)) %>%
  # rows matched by neither branch get keep = NA and are dropped here, so a
  # multi-sample site whose rows all fail the first branch disappears entirely
  filter(keep=="yes")

# m_occur is computed here but not used again in this snippet
m_occur <- data.frame(table(m_att$siteID))
# rows are sorted newest-first within each site, so this keeps the newest
# surviving row per site
m_att <- m_att[!duplicated(m_att$siteID),]

# drop fully duplicated rows and overwrite att with the result
att <- m_att %>%
  distinct()

但这会漏掉一些站点:有些站点在 filter 这一步被整个丢掉了。

例如,这是我的一大块数据:

SAMPLE_DATE         siteID
1   2020-07-22 LAK20_NY-10001
2         <NA> LAK20_NY-10002
3         <NA> LAK20_NY-10003
4         <NA> LAK20_NY-10004
5         <NA> LAK20_NY-10005
6         <NA> LAK20_NY-10006
7   2020-09-01 LAK20_NY-10007
8         <NA> LAK20_NY-10008
9         <NA> LAK20_NY-10009
10        <NA> LAK20_NY-10010
11  2020-07-07 LAK20_NY-10011
12  2020-07-09 LAK20_NY-10012
13  2020-08-03 LAK20_NY-10013
14  2020-09-01 LAK20_NY-10014
15        <NA> LAK20_NY-10015
16  2020-09-20 LAK20_NY-10016
17  2020-08-09 LAK20_NY-10016
18  2020-07-26 LAK20_NY-10016
19  2020-08-23 LAK20_NY-10016
20  2020-06-14 LAK20_NY-10016
21  2020-09-08 LAK20_NY-10016
22  2020-06-28 LAK20_NY-10016
23  2020-07-12 LAK20_NY-10016
24  2020-09-02 LAK20_NY-10016
25  2020-07-22 LAK20_NY-10017
26  2020-09-30 LAK20_NY-10018
27        <NA> LAK20_NY-10019
28  2020-07-08 LAK20_NY-10020
29  2020-08-04 LAK20_NY-10021
30        <NA> LAK20_NY-10022
31        <NA> LAK20_NY-10023
32  2020-07-07 LAK20_NY-10024
33  2021-09-21 LAK20_NY-10025
34  2021-07-27 LAK20_NY-10025
35        <NA> LAK20_NY-10026
36  2020-06-20 LAK20_NY-10027
37  2020-08-02 LAK20_NY-10027
38  2020-06-07 LAK20_NY-10027
39  2020-08-15 LAK20_NY-10027
40  2020-09-06 LAK20_NY-10027
41  2020-07-18 LAK20_NY-10027
42  2020-08-31 LAK20_NY-10027
43  2020-08-30 LAK20_NY-10027
44  2020-06-21 LAK20_NY-10027
45  2020-09-12 LAK20_NY-10027
46  2020-08-16 LAK20_NY-10027
47  2021-09-13 LAK20_NY-10027
48  2020-09-20 LAK20_NY-10027
49  2020-07-19 LAK20_NY-10027
50  2020-07-05 LAK20_NY-10027
51  2020-08-01 LAK20_NY-10027

选项 a 会为 LAK20_NY-10016 保留错误的样本(我希望保留最新的 8 月样本,即 2020-08-23 的样本,而不是 2020-09-20 的样本)。选项 b 对该湖有效,但对于 LAK20_NY-10025,我想保留 2021-09-21 的样本,而选项 b 却不会保留它的任何一次采样。

这是一个解决方案:先为每一行标注月份,并标注该湖泊(站点)是否有 8 月的读数,然后根据这些标注进行过滤。

library(tidyverse)
library(lubridate)

# Example data: 51 samples from 27 sites. Dates are stored as character
# strings and missing dates appear as the literal string "<NA>".
att <- structure(
  list(
    SAMPLE_DATE = c(
      "2020-07-22", "<NA>", "<NA>", "<NA>", "<NA>", "<NA>",
      "2020-09-01", "<NA>", "<NA>", "<NA>", "2020-07-07", "2020-07-09",
      "2020-08-03", "2020-09-01", "<NA>",
      # LAK20_NY-10016 (9 samples)
      "2020-09-20", "2020-08-09", "2020-07-26", "2020-08-23", "2020-06-14",
      "2020-09-08", "2020-06-28", "2020-07-12", "2020-09-02",
      # LAK20_NY-10017 to LAK20_NY-10024
      "2020-07-22", "2020-09-30", "<NA>", "2020-07-08", "2020-08-04",
      "<NA>", "<NA>", "2020-07-07",
      # LAK20_NY-10025 (2 samples) and LAK20_NY-10026
      "2021-09-21", "2021-07-27", "<NA>",
      # LAK20_NY-10027 (16 samples)
      "2020-06-20", "2020-08-02", "2020-06-07", "2020-08-15", "2020-09-06",
      "2020-07-18", "2020-08-31", "2020-08-30", "2020-06-21", "2020-09-12",
      "2020-08-16", "2021-09-13", "2020-09-20", "2020-07-19", "2020-07-05",
      "2020-08-01"
    ),
    # Site numbers 10001 to 10027; `times` is the number of samples per site.
    siteID = paste0(
      "LAK20_NY-",
      rep(
        10001:10027,
        times = c(rep(1L, 15), 9L, rep(1L, 8), 2L, 1L, 16L)
      )
    )
  ),
  row.names = c(NA, -51L),
  class = c("tbl_df", "tbl", "data.frame")
)

# For every site keep the most recent August sample; when a site has no
# August sample at all, fall back to its most recent sample from any month.
att %>%
  mutate(
    SAMPLE_DATE = ymd(SAMPLE_DATE), # parse to Date; "<NA>" strings become NA
    month = month(SAMPLE_DATE) # month number, NA when the date is missing
  ) %>%
  group_by(siteID) %>%
  # has_aug is computed per site, so it is TRUE on every row of a site that
  # has at least one August sample
  mutate(has_aug = 8 %in% month) %>%
  # sites with an August sample: keep only their August rows;
  # sites without one: keep all of their rows
  filter((has_aug & month == 8) | !has_aug) %>%
  # with_ties = FALSE returns exactly one row per site, including sites whose
  # only date is NA. Per the original answer, with_ties = TRUE drops those NA
  # rows instead (newer dplyr releases also offer `na_rm` - verify against
  # the installed version).
  slice_max(
    order_by = SAMPLE_DATE,
    n = 1,
    with_ties = FALSE
  ) %>%
  select(SAMPLE_DATE, siteID)

#> # A tibble: 27 x 2
#> # Groups:   siteID [27]
#>    SAMPLE_DATE siteID        
#>    <date>      <chr>         
#>  1 2020-07-22  LAK20_NY-10001
#>  2 NA          LAK20_NY-10002
#>  3 NA          LAK20_NY-10003
#>  4 NA          LAK20_NY-10004
#>  5 NA          LAK20_NY-10005
#>  6 NA          LAK20_NY-10006
#>  7 2020-09-01  LAK20_NY-10007
#>  8 NA          LAK20_NY-10008
#>  9 NA          LAK20_NY-10009
#> 10 NA          LAK20_NY-10010
#> # ... with 17 more rows

由 reprex 包 (v2.0.1) 创建于 2022-02-03