按 R 中的组和日期识别最新值

Identify the most recent values by both group and date in R

当前 df:

tibble(
  period = c("2010END", "2011END", 
             "2010Q1","2010Q2","2010Q3","2010Q4","2010END",
             "2011Q1","2011Q2","2011Q3","2011Q4","2011END",
             "2011END","2012END"),
  date = c('31-12-2010','31-12-2011', '30-04-2010','31-07-2010','30-09-2010','30-11-2010', '31-12-2010',
           '30-04-2011','31-07-2011','30-09-2011','30-11-2011', '31-12-2011', 
           '31-12-2011', '31-12-2012'),
  website = c(
    "google",
    "google",
    "facebook",
    "facebook",
    "facebook",
    "facebook",
    "facebook",
    "facebook",
    "facebook",
    "facebook",
    "facebook",
    "facebook",
    "youtube",
    "youtube"
  ),
  values = c(1, 2, 1, 2, 3, NA, 5, NA, NA, NA, NA, 10, 20, NA)
)

我如何着手创建一个列,该列为期间和网站的分组标识按日期计算的最新非 na 值,因此

因此最终输出将如下所示:

tibble(
  period = c("2010END", "2011END", 
             "2010Q1","2010Q2","2010Q3","2010Q4","2010END",
             "2011Q1","2011Q2","2011Q3","2011Q4","2011END",
             "2011END","2012END"),
  date = c('31-12-2010','31-12-2011', '30-04-2010','31-07-2010','30-09-2010','30-11-2010', '31-12-2010',
           '30-04-2011','31-07-2011','30-09-2011','30-11-2011', '31-12-2011', 
           '31-12-2011', '31-12-2012'),
  website = c(
    "google",
    "google",
    "facebook",
    "facebook",
    "facebook",
    "facebook",
    "facebook",
    "facebook",
    "facebook",
    "facebook",
    "facebook",
    "facebook",
    "youtube",
    "youtube"
  ),
  values = c(1, 2, 1, 2, 3, NA, 5, NA, NA, NA, NA, 10, 20, NA), 
  most_recent = c('no','yes', 'no', 'no', 'no', 'no', 'no','yes','yes','yes','yes','yes','yes','no')
)

我正在尝试确定当按最近日期排序时期间和网站组的第一个非 na 值出现时,然后将此期间和网站的所有值标记为“是” most_recent列

所以你有以下内容:

library(tidyverse)
library(lubridate)
#> 
#> Attaching package: 'lubridate'
#> The following objects are masked from 'package:base':
#> 
#>     date, intersect, setdiff, union

data <- tibble(
  period = c(
    "2010END", "2011END",
    "2010Q1", "2010Q2", "2010Q3", "2010Q4", "2010END",
    "2011Q1", "2011Q2", "2011Q3", "2011Q4", "2011END",
    "2011END", "2012END"
  ),
  date = c(
    "31-12-2010", "31-12-2011", "30-04-2010", "31-07-2010", "30-09-2010", "30-11-2010", "31-12-2010",
    "30-04-2011", "31-07-2011", "30-09-2011", "30-11-2011", "31-12-2011",
    "31-12-2011", "31-12-2012"
  ),
  website = c(
    "google",
    "google",
    "facebook",
    "facebook",
    "facebook",
    "facebook",
    "facebook",
    "facebook",
    "facebook",
    "facebook",
    "facebook",
    "facebook",
    "youtube",
    "youtube"
  ),
  values = c(1, 2, 1, 2, 3, NA, 5, NA, NA, NA, NA, 10, 20, NA)
)

# group only by website
data %>%
  mutate(date = date %>% parse_date(format = "%d-%m-%Y")) %>%
  group_by(website) %>%
  mutate(most_recent = ifelse(date == min(date, na.rm = TRUE), "yes", "no"))
#> # A tibble: 14 × 5
#> # Groups:   website [3]
#>    period  date       website  values most_recent
#>    <chr>   <date>     <chr>     <dbl> <chr>      
#>  1 2010END 2010-12-31 google        1 yes        
#>  2 2011END 2011-12-31 google        2 no         
#>  3 2010Q1  2010-04-30 facebook      1 yes        
#>  4 2010Q2  2010-07-31 facebook      2 no         
#>  5 2010Q3  2010-09-30 facebook      3 no         
#>  6 2010Q4  2010-11-30 facebook     NA no         
#>  7 2010END 2010-12-31 facebook      5 no         
#>  8 2011Q1  2011-04-30 facebook     NA no         
#>  9 2011Q2  2011-07-31 facebook     NA no         
#> 10 2011Q3  2011-09-30 facebook     NA no         
#> 11 2011Q4  2011-11-30 facebook     NA no         
#> 12 2011END 2011-12-31 facebook     10 no         
#> 13 2011END 2011-12-31 youtube      20 yes        
#> 14 2012END 2012-12-31 youtube      NA no

# group by period and website
data %>%
  mutate(date = date %>% parse_date(format = "%d-%m-%Y")) %>%
  group_by(period, website) %>%
  mutate(most_recent = ifelse(date == min(date, na.rm = TRUE), "yes", "no"))
#> # A tibble: 14 × 5
#> # Groups:   period, website [14]
#>    period  date       website  values most_recent
#>    <chr>   <date>     <chr>     <dbl> <chr>      
#>  1 2010END 2010-12-31 google        1 yes        
#>  2 2011END 2011-12-31 google        2 yes        
#>  3 2010Q1  2010-04-30 facebook      1 yes        
#>  4 2010Q2  2010-07-31 facebook      2 yes        
#>  5 2010Q3  2010-09-30 facebook      3 yes        
#>  6 2010Q4  2010-11-30 facebook     NA yes        
#>  7 2010END 2010-12-31 facebook      5 yes        
#>  8 2011Q1  2011-04-30 facebook     NA yes        
#>  9 2011Q2  2011-07-31 facebook     NA yes        
#> 10 2011Q3  2011-09-30 facebook     NA yes        
#> 11 2011Q4  2011-11-30 facebook     NA yes        
#> 12 2011END 2011-12-31 facebook     10 yes        
#> 13 2011END 2011-12-31 youtube      20 yes        
#> 14 2012END 2012-12-31 youtube      NA yes

reprex package (v2.0.0)

于 2022-05-25 创建