按 R 中的组和日期识别最新值
Identify the most recent values by both group and date in R
当前 df:
tibble(
period = c("2010END", "2011END",
"2010Q1","2010Q2","2010Q3","2010Q4","2010END",
"2011Q1","2011Q2","2011Q3","2011Q4","2011END",
"2011END","2012END"),
date = c('31-12-2010','31-12-2011', '30-04-2010','31-07-2010','30-09-2010','30-11-2010', '31-12-2010',
'30-04-2011','31-07-2011','30-09-2011','30-11-2011', '31-12-2011',
'31-12-2011', '31-12-2012'),
website = c(
"google",
"google",
"facebook",
"facebook",
"facebook",
"facebook",
"facebook",
"facebook",
"facebook",
"facebook",
"facebook",
"facebook",
"youtube",
"youtube"
),
values = c(1, 2, 1, 2, 3, NA, 5, NA, NA, NA, NA, 10, 20, NA)
)
我如何着手创建一个列,该列为期间和网站的分组标识按日期计算的最新非 na 值,因此
因此最终输出将如下所示:
tibble(
period = c("2010END", "2011END",
"2010Q1","2010Q2","2010Q3","2010Q4","2010END",
"2011Q1","2011Q2","2011Q3","2011Q4","2011END",
"2011END","2012END"),
date = c('31-12-2010','31-12-2011', '30-04-2010','31-07-2010','30-09-2010','30-11-2010', '31-12-2010',
'30-04-2011','31-07-2011','30-09-2011','30-11-2011', '31-12-2011',
'31-12-2011', '31-12-2012'),
website = c(
"google",
"google",
"facebook",
"facebook",
"facebook",
"facebook",
"facebook",
"facebook",
"facebook",
"facebook",
"facebook",
"facebook",
"youtube",
"youtube"
),
values = c(1, 2, 1, 2, 3, NA, 5, NA, NA, NA, NA, 10, 20, NA),
most_recent = c('no','yes', 'no', 'no', 'no', 'no', 'no','yes','yes','yes','yes','yes','yes','no')
)
我正在尝试确定当按最近日期排序时期间和网站组的第一个非 na 值出现时,然后将此期间和网站的所有值标记为“是” most_recent列
所以你有以下内容:
- google 2011END 是日期的最新值,所以是 - 2010END 的值较旧,因此未选择
- facebook 2011q1 到 2011END 的值为是,因为有一个非 na 值是 2011END,这是最近的日期并且有一个非 na 值
- youtube 2011END 是 - 因为它是我们按日期排序时出现的第一个非 na 值,因为 2012 年没有值,所以它是一个无值
library(tidyverse)
library(lubridate)
#>
#> Attaching package: 'lubridate'
#> The following objects are masked from 'package:base':
#>
#> date, intersect, setdiff, union
data <- tibble(
period = c(
"2010END", "2011END",
"2010Q1", "2010Q2", "2010Q3", "2010Q4", "2010END",
"2011Q1", "2011Q2", "2011Q3", "2011Q4", "2011END",
"2011END", "2012END"
),
date = c(
"31-12-2010", "31-12-2011", "30-04-2010", "31-07-2010", "30-09-2010", "30-11-2010", "31-12-2010",
"30-04-2011", "31-07-2011", "30-09-2011", "30-11-2011", "31-12-2011",
"31-12-2011", "31-12-2012"
),
website = c(
"google",
"google",
"facebook",
"facebook",
"facebook",
"facebook",
"facebook",
"facebook",
"facebook",
"facebook",
"facebook",
"facebook",
"youtube",
"youtube"
),
values = c(1, 2, 1, 2, 3, NA, 5, NA, NA, NA, NA, 10, 20, NA)
)
# group only by website
data %>%
mutate(date = date %>% parse_date(format = "%d-%m-%Y")) %>%
group_by(website) %>%
mutate(most_recent = ifelse(date == min(date, na.rm = TRUE), "yes", "no"))
#> # A tibble: 14 × 5
#> # Groups: website [3]
#> period date website values most_recent
#> <chr> <date> <chr> <dbl> <chr>
#> 1 2010END 2010-12-31 google 1 yes
#> 2 2011END 2011-12-31 google 2 no
#> 3 2010Q1 2010-04-30 facebook 1 yes
#> 4 2010Q2 2010-07-31 facebook 2 no
#> 5 2010Q3 2010-09-30 facebook 3 no
#> 6 2010Q4 2010-11-30 facebook NA no
#> 7 2010END 2010-12-31 facebook 5 no
#> 8 2011Q1 2011-04-30 facebook NA no
#> 9 2011Q2 2011-07-31 facebook NA no
#> 10 2011Q3 2011-09-30 facebook NA no
#> 11 2011Q4 2011-11-30 facebook NA no
#> 12 2011END 2011-12-31 facebook 10 no
#> 13 2011END 2011-12-31 youtube 20 yes
#> 14 2012END 2012-12-31 youtube NA no
# group by period and website
data %>%
mutate(date = date %>% parse_date(format = "%d-%m-%Y")) %>%
group_by(period, website) %>%
mutate(most_recent = ifelse(date == min(date, na.rm = TRUE), "yes", "no"))
#> # A tibble: 14 × 5
#> # Groups: period, website [14]
#> period date website values most_recent
#> <chr> <date> <chr> <dbl> <chr>
#> 1 2010END 2010-12-31 google 1 yes
#> 2 2011END 2011-12-31 google 2 yes
#> 3 2010Q1 2010-04-30 facebook 1 yes
#> 4 2010Q2 2010-07-31 facebook 2 yes
#> 5 2010Q3 2010-09-30 facebook 3 yes
#> 6 2010Q4 2010-11-30 facebook NA yes
#> 7 2010END 2010-12-31 facebook 5 yes
#> 8 2011Q1 2011-04-30 facebook NA yes
#> 9 2011Q2 2011-07-31 facebook NA yes
#> 10 2011Q3 2011-09-30 facebook NA yes
#> 11 2011Q4 2011-11-30 facebook NA yes
#> 12 2011END 2011-12-31 facebook 10 yes
#> 13 2011END 2011-12-31 youtube 20 yes
#> 14 2012END 2012-12-31 youtube NA yes
由 reprex package (v2.0.0)
于 2022-05-25 创建
当前 df:
tibble(
period = c("2010END", "2011END",
"2010Q1","2010Q2","2010Q3","2010Q4","2010END",
"2011Q1","2011Q2","2011Q3","2011Q4","2011END",
"2011END","2012END"),
date = c('31-12-2010','31-12-2011', '30-04-2010','31-07-2010','30-09-2010','30-11-2010', '31-12-2010',
'30-04-2011','31-07-2011','30-09-2011','30-11-2011', '31-12-2011',
'31-12-2011', '31-12-2012'),
website = c(
"google",
"google",
"facebook",
"facebook",
"facebook",
"facebook",
"facebook",
"facebook",
"facebook",
"facebook",
"facebook",
"facebook",
"youtube",
"youtube"
),
values = c(1, 2, 1, 2, 3, NA, 5, NA, NA, NA, NA, 10, 20, NA)
)
我如何着手创建一个列,该列为期间和网站的分组标识按日期计算的最新非 na 值,因此
因此最终输出将如下所示:
tibble(
period = c("2010END", "2011END",
"2010Q1","2010Q2","2010Q3","2010Q4","2010END",
"2011Q1","2011Q2","2011Q3","2011Q4","2011END",
"2011END","2012END"),
date = c('31-12-2010','31-12-2011', '30-04-2010','31-07-2010','30-09-2010','30-11-2010', '31-12-2010',
'30-04-2011','31-07-2011','30-09-2011','30-11-2011', '31-12-2011',
'31-12-2011', '31-12-2012'),
website = c(
"google",
"google",
"facebook",
"facebook",
"facebook",
"facebook",
"facebook",
"facebook",
"facebook",
"facebook",
"facebook",
"facebook",
"youtube",
"youtube"
),
values = c(1, 2, 1, 2, 3, NA, 5, NA, NA, NA, NA, 10, 20, NA),
most_recent = c('no','yes', 'no', 'no', 'no', 'no', 'no','yes','yes','yes','yes','yes','yes','no')
)
我正在尝试确定当按最近日期排序时期间和网站组的第一个非 na 值出现时,然后将此期间和网站的所有值标记为“是” most_recent列
所以你有以下内容:
- google 2011END 是日期的最新值,所以是 - 2010END 的值较旧,因此未选择
- facebook 2011q1 到 2011END 的值为是,因为有一个非 na 值是 2011END,这是最近的日期并且有一个非 na 值
- youtube 2011END 是 - 因为它是我们按日期排序时出现的第一个非 na 值,因为 2012 年没有值,所以它是一个无值
library(tidyverse)
library(lubridate)
#>
#> Attaching package: 'lubridate'
#> The following objects are masked from 'package:base':
#>
#> date, intersect, setdiff, union
data <- tibble(
period = c(
"2010END", "2011END",
"2010Q1", "2010Q2", "2010Q3", "2010Q4", "2010END",
"2011Q1", "2011Q2", "2011Q3", "2011Q4", "2011END",
"2011END", "2012END"
),
date = c(
"31-12-2010", "31-12-2011", "30-04-2010", "31-07-2010", "30-09-2010", "30-11-2010", "31-12-2010",
"30-04-2011", "31-07-2011", "30-09-2011", "30-11-2011", "31-12-2011",
"31-12-2011", "31-12-2012"
),
website = c(
"google",
"google",
"facebook",
"facebook",
"facebook",
"facebook",
"facebook",
"facebook",
"facebook",
"facebook",
"facebook",
"facebook",
"youtube",
"youtube"
),
values = c(1, 2, 1, 2, 3, NA, 5, NA, NA, NA, NA, 10, 20, NA)
)
# group only by website
data %>%
mutate(date = date %>% parse_date(format = "%d-%m-%Y")) %>%
group_by(website) %>%
mutate(most_recent = ifelse(date == min(date, na.rm = TRUE), "yes", "no"))
#> # A tibble: 14 × 5
#> # Groups: website [3]
#> period date website values most_recent
#> <chr> <date> <chr> <dbl> <chr>
#> 1 2010END 2010-12-31 google 1 yes
#> 2 2011END 2011-12-31 google 2 no
#> 3 2010Q1 2010-04-30 facebook 1 yes
#> 4 2010Q2 2010-07-31 facebook 2 no
#> 5 2010Q3 2010-09-30 facebook 3 no
#> 6 2010Q4 2010-11-30 facebook NA no
#> 7 2010END 2010-12-31 facebook 5 no
#> 8 2011Q1 2011-04-30 facebook NA no
#> 9 2011Q2 2011-07-31 facebook NA no
#> 10 2011Q3 2011-09-30 facebook NA no
#> 11 2011Q4 2011-11-30 facebook NA no
#> 12 2011END 2011-12-31 facebook 10 no
#> 13 2011END 2011-12-31 youtube 20 yes
#> 14 2012END 2012-12-31 youtube NA no
# group by period and website
data %>%
mutate(date = date %>% parse_date(format = "%d-%m-%Y")) %>%
group_by(period, website) %>%
mutate(most_recent = ifelse(date == min(date, na.rm = TRUE), "yes", "no"))
#> # A tibble: 14 × 5
#> # Groups: period, website [14]
#> period date website values most_recent
#> <chr> <date> <chr> <dbl> <chr>
#> 1 2010END 2010-12-31 google 1 yes
#> 2 2011END 2011-12-31 google 2 yes
#> 3 2010Q1 2010-04-30 facebook 1 yes
#> 4 2010Q2 2010-07-31 facebook 2 yes
#> 5 2010Q3 2010-09-30 facebook 3 yes
#> 6 2010Q4 2010-11-30 facebook NA yes
#> 7 2010END 2010-12-31 facebook 5 yes
#> 8 2011Q1 2011-04-30 facebook NA yes
#> 9 2011Q2 2011-07-31 facebook NA yes
#> 10 2011Q3 2011-09-30 facebook NA yes
#> 11 2011Q4 2011-11-30 facebook NA yes
#> 12 2011END 2011-12-31 facebook 10 yes
#> 13 2011END 2011-12-31 youtube 20 yes
#> 14 2012END 2012-12-31 youtube NA yes
由 reprex package (v2.0.0)
于 2022-05-25 创建