如何删除基于特定日期的特定值

How to remove a specific value that is based on a specific date

下面是我的数据框的一部分:

    Year Date        Location Tag Section domsec atDominantLocation
    2017 2017-08-13        a wnd       n      n                  1
    2017 2017-08-14        a wnd       n      n                  1
    2017 2017-08-15        a wnd       n      n                  1
    2017 2017-08-16        a wnd       n      n                  1
    2017 2017-08-17        a wnd       n      n                  1
    2017 2017-08-18        a wnd       n      n                  1
    2017 2017-08-19        a wnd       n      n                  1
    2017 2017-08-20        a wnd       n      n                  1
    2017 2017-08-21        a wnd       b      n                  0
    2017 2017-08-21        a wnd       u      n                  0
    2017 2017-08-23        a wnd       f      n                  0
    2018 2018-08-21        a wnd       f      s                  0
    2018 2018-08-18        a wnd       h      s                  0
    2018 2018-08-19        a wnd       o      s                  0
    2018 2018-08-15        a wnd       s      s                  1
    2018 2018-08-17        a wnd       s      s                  1
    2018 2018-08-14        c wnd      ss      s                  0
    2018 2018-08-16        a wnd       t      s                  0
    2018 2018-08-21        a wnd       t      s                  0
    2018 2018-08-13        c wnd      ww      s                  0
    2018 2018-08-20        a wnd       y      s                  0

“atDominantLocation”列包含“1”和“0”。我想保留所有的“0”,但只保留最早日期的“1”。因此,每个标签和年份应该只有一个“1”,并且“1”应该是那一年的最早日期。

下面是我想要的输出:

    Year Date        Location Tag Section domsec atDominantLocation
    2017 2017-08-13        a wnd       n      n                  1
    2017 2017-08-21        a wnd       b      n                  0
    2017 2017-08-21        a wnd       u      n                  0
    2017 2017-08-23        a wnd       f      n                  0
    2018 2018-08-21        a wnd       f      s                  0
    2018 2018-08-18        a wnd       h      s                  0
    2018 2018-08-19        a wnd       o      s                  0
    2018 2018-08-15        a wnd       s      s                  1
    2018 2018-08-14        c wnd      ss      s                  0
    2018 2018-08-16        a wnd       t      s                  0
    2018 2018-08-21        a wnd       t      s                  0
    2018 2018-08-13        c wnd      ww      s                  0
    2018 2018-08-20        a wnd       y      s                  0

我试过重复函数和唯一函数都没有成功。感谢您的帮助。

我们可以按 'Year'、'Tag' 分组并在 filter 中创建逻辑表达式以对行进行子集化 - 创建单独的逻辑表达式并将它们与 | 连接(或者)即..因为我们想要所有 0 值 (atDominantLocation == 0) 并且只有 minimum 'Date' 的行,其中 'atDominantLocation' 是 1 (Date == min(Date[atDominantLocation == 1]))

library(dplyr)
df1 %>%
   mutate(Date = as.Date(Date)) %>% 
   group_by(Year, Tag) %>%
   filter(atDominantLocation == 0| 
     ( Date == min(Date[atDominantLocation == 1]))) %>%
   ungroup

-输出

# A tibble: 13 × 7
    Year Date       Location Tag   Section domsec atDominantLocation
   <int> <date>     <chr>    <chr> <chr>   <chr>               <int>
 1  2017 2017-08-13 a        wnd   n       n                       1
 2  2017 2017-08-21 a        wnd   b       n                       0
 3  2017 2017-08-21 a        wnd   u       n                       0
 4  2017 2017-08-23 a        wnd   f       n                       0
 5  2018 2018-08-21 a        wnd   f       s                       0
 6  2018 2018-08-18 a        wnd   h       s                       0
 7  2018 2018-08-19 a        wnd   o       s                       0
 8  2018 2018-08-15 a        wnd   s       s                       1
 9  2018 2018-08-14 c        wnd   ss      s                       0
10  2018 2018-08-16 a        wnd   t       s                       0
11  2018 2018-08-21 a        wnd   t       s                       0
12  2018 2018-08-13 c        wnd   ww      s                       0
13  2018 2018-08-20 a        wnd   y       s                       0

数据

df1 <- structure(list(Year = c(2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 
2017L, 2017L, 2017L, 2017L, 2017L, 2018L, 2018L, 2018L, 2018L, 
2018L, 2018L, 2018L, 2018L, 2018L, 2018L), Date = c("2017-08-13", 
"2017-08-14", "2017-08-15", "2017-08-16", "2017-08-17", "2017-08-18", 
"2017-08-19", "2017-08-20", "2017-08-21", "2017-08-21", "2017-08-23", 
"2018-08-21", "2018-08-18", "2018-08-19", "2018-08-15", "2018-08-17", 
"2018-08-14", "2018-08-16", "2018-08-21", "2018-08-13", "2018-08-20"
), Location = c("a", "a", "a", "a", "a", "a", "a", "a", "a", 
"a", "a", "a", "a", "a", "a", "a", "c", "a", "a", "c", "a"), 
    Tag = c("wnd", "wnd", "wnd", "wnd", "wnd", "wnd", "wnd", 
    "wnd", "wnd", "wnd", "wnd", "wnd", "wnd", "wnd", "wnd", "wnd", 
    "wnd", "wnd", "wnd", "wnd", "wnd"), Section = c("n", "n", 
    "n", "n", "n", "n", "n", "n", "b", "u", "f", "f", "h", "o", 
    "s", "s", "ss", "t", "t", "ww", "y"), domsec = c("n", "n", 
    "n", "n", "n", "n", "n", "n", "n", "n", "n", "s", "s", "s", 
    "s", "s", "s", "s", "s", "s", "s"), atDominantLocation = c(1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 
    0L, 0L, 0L, 0L, 0L)), class = "data.frame", row.names = c(NA, 
-21L))

另一个 tidyverse 选项是 filter atDominantLocation1。然后,按 YearTag 排序和分组,然后使用 slice 获取最早日期的行。然后,我们可以将这些行绑定回原始数据框,但只绑定到 atDominantLocation0.

的行
library(tidyverse)

df %>% 
  filter(atDominantLocation == 1) %>% 
  arrange(Date) %>% 
  group_by(Year, Tag) %>% 
  slice(1) %>% 
  bind_rows(df %>% filter(atDominantLocation == 0)) %>% 
  arrange(Date)

输出

    Year Date       Location Tag   Section domsec atDominantLocation
   <int> <chr>      <chr>    <chr> <chr>   <chr>               <int>
 1  2017 2017-08-13 a        wnd   n       n                       1
 2  2017 2017-08-21 a        wnd   b       n                       0
 3  2017 2017-08-21 a        wnd   u       n                       0
 4  2017 2017-08-23 a        wnd   f       n                       0
 5  2018 2018-08-13 c        wnd   ww      s                       0
 6  2018 2018-08-14 c        wnd   ss      s                       0
 7  2018 2018-08-15 a        wnd   s       s                       1
 8  2018 2018-08-16 a        wnd   t       s                       0
 9  2018 2018-08-18 a        wnd   h       s                       0
10  2018 2018-08-19 a        wnd   o       s                       0
11  2018 2018-08-20 a        wnd   y       s                       0
12  2018 2018-08-21 a        wnd   f       s                       0
13  2018 2018-08-21 a        wnd   t       s                       0

数据

df <- structure(list(Year = c(2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 
                              2017L, 2017L, 2017L, 2017L, 2017L, 2018L, 2018L, 2018L, 2018L, 
                              2018L, 2018L, 2018L, 2018L, 2018L, 2018L), 
                     Date = c("2017-08-13", "2017-08-14", "2017-08-15", "2017-08-16", "2017-08-17", "2017-08-18", 
                              "2017-08-19", "2017-08-20", "2017-08-21", "2017-08-21", "2017-08-23", 
                              "2018-08-21", "2018-08-18", "2018-08-19", "2018-08-15", "2018-08-17", 
                               "2018-08-14", "2018-08-16", "2018-08-21", "2018-08-13", "2018-08-20"
                              ), Location = c("a", "a", "a", "a", "a", "a", "a", "a", "a", 
                                              "a", "a", "a", "a", "a", "a", "a", "c", "a", "a", "c", "a"), 
                     Tag = c("wnd", "wnd", "wnd", "wnd", "wnd", "wnd", "wnd", 
                             "wnd", "wnd", "wnd", "wnd", "wnd", "wnd", "wnd", "wnd", "wnd", 
                             "wnd", "wnd", "wnd", "wnd", "wnd"), 
                     Section = c("n", "n", "n", "n", "n", "n", "n", "n", "b", "u", "f", "f", "h", "o", 
                                 "s", "s", "ss", "t", "t", "ww", "y"), 
                     domsec = c("n", "n", "n", "n", "n", "n", "n", "n", "n", "n", "n", "s", "s", "s", 
                                "s", "s", "s", "s", "s", "s", "s"), 
                     atDominantLocation = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 
                                            0L, 0L, 0L, 0L, 0L)), class = "data.frame", row.names = c(NA, -21L))
library(data.table)
library(magrittr)

setDT(df)

rbindlist(
  list(df[, .SD[atDominantLocation == 1][1], by = list(Year, Tag)],
       df[atDominantLocation == 0,]), use.names=TRUE) %>% 
  .[order(Tag, Date)]

#>     Year Tag       Date Location Section domsec atDominantLocation
#>  1: 2017 wnd 2017-08-13        a       n      n                  1
#>  2: 2017 wnd 2017-08-21        a       b      n                  0
#>  3: 2017 wnd 2017-08-21        a       u      n                  0
#>  4: 2017 wnd 2017-08-23        a       f      n                  0
#>  5: 2018 wnd 2018-08-13        c      ww      s                  0
#>  6: 2018 wnd 2018-08-14        c      ss      s                  0
#>  7: 2018 wnd 2018-08-15        a       s      s                  1
#>  8: 2018 wnd 2018-08-16        a       t      s                  0
#>  9: 2018 wnd 2018-08-18        a       h      s                  0
#> 10: 2018 wnd 2018-08-19        a       o      s                  0
#> 11: 2018 wnd 2018-08-20        a       y      s                  0
#> 12: 2018 wnd 2018-08-21        a       f      s                  0
#> 13: 2018 wnd 2018-08-21        a       t      s                  0

reprex package (v2.0.1)

于 2022-02-09 创建