如何在 R 中按日期列分组,获取不同列的最大值和平均值?
How do I obtain max and mean values from different columns with a Date column condition in R?
我正在尝试按监测站和日期,计算每种污染物的每日平均值和最大值。
这是我的数据框示例:
# Sample measurements: one row per reading, keyed by station and day.
# Date is kept as a day/month/year string here.
df <- data.frame(
  Station = c(1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 10),
  Date = c(
    "31/12/2018", "31/12/2018", "30/12/2018", "1/12/2018",
    "4/3/2018", "16/3/2018", "16/3/2018", "5/2/2018",
    "31/12/2018", "6/2/2018", "6/2/2018", "4/3/2018"
  ),
  NO2 = c(40, 55, 52, 58, 76, 98, 12, 14, 23, 54, 53, 76),
  O3 = c(13, 12, 45, 87, 72, 54, 88, 102, 63, 52, 64, 53)
)
这是预期的数据框:
# Expected result: one row per (Station, Date) pair, with the maximum and
# the mean of each pollutant for that pair.
df2 <- data.frame(
  Station = c(1, 1, 2, 2, 2, 3, 3, 3, 4, 10),
  Date = c(
    "31/12/2018", "30/12/2018", "1/12/2018", "4/3/2018", "16/3/2018",
    "16/3/2018", "5/2/2018", "31/12/2018", "6/2/2018", "4/3/2018"
  ),
  MAX_NO2 = c(55, 52, 58, 76, 98, 12, 14, 23, 54, 76),
  MEAN_NO2 = c(47.5, 52, 58, 76, 98, 12, 14, 23, 53.5, 76),
  MAX_O3 = c(13, 45, 87, 72, 54, 88, 102, 63, 64, 53),
  MEAN_O3 = c(12.5, 45, 87, 72, 54, 88, 102, 63, 58, 53)
)
这是我在网上找到的代码。我试过了,但发生错误:
# Asker's attempt, kept verbatim. It keeps the rows that hold a per-group
# maximum; it never computes a mean and does not collapse each group to one row.
library(dplyr)
df %>%
# Group the readings by station and by day.
group_by(Station,Date) %>%
# Attach each group's maximum to every row of that group.
mutate(max.O3 = max(O3), max.NO2 = max(NO2)) %>%
ungroup() %>%
# Keep the maximum only on rows whose reading equals it; other rows get NA.
mutate(max.O3 = case_when(O3 == max.O3 ~ max.O3, TRUE ~ NA_real_), max.NO2 = case_when(NO2 == max.NO2 ~ max.NO2, TRUE ~ NA_real_)) %>%
# Drop rows that hold neither the O3 nor the NO2 maximum.
filter(!is.na(max.O3) | !is.na(max.NO2)) %>%
# Remove the raw pollutant columns, leaving only the max.* columns.
select(-O3,-NO2)
如何获得所需的输出?我首先想到使用 for 循环来迭代值,但我不知道这样做是否正确。类似于:
# Pseudo-code sketch from the question, kept verbatim; it is not valid R.
# R requires parentheses around the loop header: for (i in x) { ... }.
for i in df$Station{
# NOTE: the sample data frame names this column `Date`; R names are
# case-sensitive, so `df$date` does not refer to it.
for date in df$date{
# `a` is a placeholder the sketch never defines (presumably the readings
# for the current station and date).
Max_NO2 = max(a)
Mean_NO2 = mean(a)
...
}
}
我们可以先将 Date 列转换为日期格式,然后在 summarise 中使用 across 对这 2 列分别计算每个分组的平均值和最大值。然后,我们可以使用 .names 参数来定义列名。
library(tidyverse)

# Daily maximum and mean of every pollutant, per station.
df %>%
  # Parse the day/month/year strings into Date values so the groups are
  # ordered chronologically rather than alphabetically.
  mutate(Date = as.Date(Date, format = "%d/%m/%Y")) %>%
  group_by(Station, Date) %>%
  summarise(
    # Apply both functions to each pollutant column; .names produces
    # MAX_NO2, MEAN_NO2, MAX_O3, MEAN_O3.
    across(NO2:O3, list(MAX = max, MEAN = mean), .names = "{.fn}_{.col}"),
    # Return an ungrouped tibble directly: this avoids the "grouped output"
    # message and makes a trailing ungroup() unnecessary.
    .groups = "drop"
  )
输出
Station Date MAX_NO2 MEAN_NO2 MAX_O3 MEAN_O3
<dbl> <date> <dbl> <dbl> <dbl> <dbl>
1 1 2018-12-30 52 52 45 45
2 1 2018-12-31 55 47.5 13 12.5
3 2 2018-03-04 76 76 72 72
4 2 2018-03-16 98 98 54 54
5 2 2018-12-01 58 58 87 87
6 3 2018-02-05 14 14 102 102
7 3 2018-03-16 12 12 88 88
8 3 2018-12-31 23 23 63 63
9 4 2018-02-06 54 53.5 64 58
10 10 2018-03-04 76 76 53 53
我正在尝试按监测站和日期,计算每种污染物的每日平均值和最大值。
这是我的数据框示例:
# Sample measurements: one row per reading, keyed by station and day.
# Date is kept as a day/month/year string here.
df <- data.frame(
  Station = c(1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 10),
  Date = c(
    "31/12/2018", "31/12/2018", "30/12/2018", "1/12/2018",
    "4/3/2018", "16/3/2018", "16/3/2018", "5/2/2018",
    "31/12/2018", "6/2/2018", "6/2/2018", "4/3/2018"
  ),
  NO2 = c(40, 55, 52, 58, 76, 98, 12, 14, 23, 54, 53, 76),
  O3 = c(13, 12, 45, 87, 72, 54, 88, 102, 63, 52, 64, 53)
)
这是预期的数据框:
# Expected result: one row per (Station, Date) pair, with the maximum and
# the mean of each pollutant for that pair.
df2 <- data.frame(
  Station = c(1, 1, 2, 2, 2, 3, 3, 3, 4, 10),
  Date = c(
    "31/12/2018", "30/12/2018", "1/12/2018", "4/3/2018", "16/3/2018",
    "16/3/2018", "5/2/2018", "31/12/2018", "6/2/2018", "4/3/2018"
  ),
  MAX_NO2 = c(55, 52, 58, 76, 98, 12, 14, 23, 54, 76),
  MEAN_NO2 = c(47.5, 52, 58, 76, 98, 12, 14, 23, 53.5, 76),
  MAX_O3 = c(13, 45, 87, 72, 54, 88, 102, 63, 64, 53),
  MEAN_O3 = c(12.5, 45, 87, 72, 54, 88, 102, 63, 58, 53)
)
这是我在网上找到的代码。我试过了,但发生错误:
# Asker's attempt, kept verbatim. It keeps the rows that hold a per-group
# maximum; it never computes a mean and does not collapse each group to one row.
library(dplyr)
df %>%
# Group the readings by station and by day.
group_by(Station,Date) %>%
# Attach each group's maximum to every row of that group.
mutate(max.O3 = max(O3), max.NO2 = max(NO2)) %>%
ungroup() %>%
# Keep the maximum only on rows whose reading equals it; other rows get NA.
mutate(max.O3 = case_when(O3 == max.O3 ~ max.O3, TRUE ~ NA_real_), max.NO2 = case_when(NO2 == max.NO2 ~ max.NO2, TRUE ~ NA_real_)) %>%
# Drop rows that hold neither the O3 nor the NO2 maximum.
filter(!is.na(max.O3) | !is.na(max.NO2)) %>%
# Remove the raw pollutant columns, leaving only the max.* columns.
select(-O3,-NO2)
如何获得所需的输出?我首先想到使用 for 循环来迭代值,但我不知道这样做是否正确。类似于:
# Pseudo-code sketch from the question, kept verbatim; it is not valid R.
# R requires parentheses around the loop header: for (i in x) { ... }.
for i in df$Station{
# NOTE: the sample data frame names this column `Date`; R names are
# case-sensitive, so `df$date` does not refer to it.
for date in df$date{
# `a` is a placeholder the sketch never defines (presumably the readings
# for the current station and date).
Max_NO2 = max(a)
Mean_NO2 = mean(a)
...
}
}
我们可以先将 Date 列转换为日期格式,然后在 summarise 中使用 across 对这 2 列分别计算每个分组的平均值和最大值。然后,我们可以使用 .names 参数来定义列名。
library(tidyverse)

# Daily maximum and mean of every pollutant, per station.
df %>%
  # Parse the day/month/year strings into Date values so the groups are
  # ordered chronologically rather than alphabetically.
  mutate(Date = as.Date(Date, format = "%d/%m/%Y")) %>%
  group_by(Station, Date) %>%
  summarise(
    # Apply both functions to each pollutant column; .names produces
    # MAX_NO2, MEAN_NO2, MAX_O3, MEAN_O3.
    across(NO2:O3, list(MAX = max, MEAN = mean), .names = "{.fn}_{.col}"),
    # Return an ungrouped tibble directly: this avoids the "grouped output"
    # message and makes a trailing ungroup() unnecessary.
    .groups = "drop"
  )
输出
Station Date MAX_NO2 MEAN_NO2 MAX_O3 MEAN_O3
<dbl> <date> <dbl> <dbl> <dbl> <dbl>
1 1 2018-12-30 52 52 45 45
2 1 2018-12-31 55 47.5 13 12.5
3 2 2018-03-04 76 76 72 72
4 2 2018-03-16 98 98 54 54
5 2 2018-12-01 58 58 87 87
6 3 2018-02-05 14 14 102 102
7 3 2018-03-16 12 12 88 88
8 3 2018-12-31 23 23 63 63
9 4 2018-02-06 54 53.5 64 58
10 10 2018-03-04 76 76 53 53