如何根据多组条件求平均值、最大值和最小值
How to find the average, max, and min based on multiple sets of criteria
我从其他帖子中找到了一些选项,但我无法根据我的特定需求重现代码。
我有气候数据,我想根据 1910 年至 2015 年的季节找到特定位置的平均值。
这是我需要的示例:CENTROID_ID c1763_1 在 1911 年冬季(前一年 [1910] 的第 12 个月,以及所讨论年份 [1911] 的第 1 个月和第 2 个月)、春季(1911 年第 3、4、5 个月)、夏季(1911 年第 6、7、8 个月)和秋季(1911 年第 9、10、11 个月)的平均 PPT。然后需要对每一年、每一个质心 ID 都执行此操作。我有 400 多个不同的 CENTROID_ID,涵盖 1910-2015 年。
我设想新数据框包含 CENTROID_ID、YEAR、WINT_PPT、SPR_PPT、SUM_PPT、FALL_PPT 这几列。
CENTROID_ID YEAR MONTH PPT
1 c1763_1 1910 1 52.639
2 c1763_1 1910 2 20.870
3 c1763_1 1910 3 21.706
4 c1763_1 1910 4 9.347
5 c1763_1 1910 5 1.201
6 c1763_1 1910 6 11.267
7 c1763_1 1910 7 41.870
8 c1763_1 1910 8 61.260
9 c1763_1 1910 9 27.815
10 c1763_1 1910 10 67.377
11 c1763_1 1910 11 24.719
12 c1763_1 1910 12 30.212
13 c1763_1 1911 1 88.728
14 c1763_1 1911 2 50.035
15 c1763_1 1911 3 37.720
16 c1763_1 1911 4 12.831
17 c1763_1 1911 5 0.739
18 c1763_1 1911 6 18.198
19 c1763_1 1911 7 74.731
20 c1763_1 1911 8 40.873
21 c1763_1 1911 9 86.340
22 c1763_1 1911 10 36.423
23 c1763_1 1911 11 12.491
24 c1763_1 1911 12 19.428
25 c1763_1 1912 1 11.010
26 c1763_1 1912 2 16.339
27 c1763_1 1912 3 72.017
28 c1763_1 1912 4 25.887
29 c1763_1 1912 5 5.314
30 c1763_1 1912 6 8.595
31 c1763_1 1912 7 47.781
32 c1763_1 1912 8 51.188
33 c1763_1 1912 9 10.931
34 c1763_1 1912 10 119.725
35 c1763_1 1912 11 10.420
36 c1763_1 1912 12 8.777
37 c1763_1 1913 1 27.771
38 c1763_1 1913 2 62.622
39 c1763_1 1913 3 17.533
40 c1763_1 1913 4 8.008
41 c1763_1 1913 5 1.423
42 c1763_1 1913 6 3.773
43 c1763_1 1913 7 42.982
44 c1763_1 1913 8 40.541
45 c1763_1 1913 9 58.495
46 c1763_1 1913 10 22.729
47 c1763_1 1913 11 48.130
48 c1763_1 1913 12 32.049
49 c1763_1 1914 1 104.197
50 c1763_1 1914 2 31.707
是这个吗?
library(dplyr)
library(tibble)
# Build a synthetic example: a single centroid with 240 monthly records
# (240 = 20 years x 12 months, taken from the start of the year-sorted grid)
# and uniformly random PPT values between 1 and 100.
dates <- expand.grid(1910:1950, 1:12)
dates <- dates[with(dates, order(Var1)), ]
data <- tibble(
  CENTROID_ID = rep("c1763_1", 240),
  YEAR = dates[["Var1"]][seq_len(240)],
  MONTH = dates[["Var2"]][seq_len(240)],
  PPT = runif(240, min = 1, max = 100)
)
然后我们可以根据 YEAR 和 MONTH 确定 SEASON,再分组并计算每组的平均值:
# Tag every record with its season, then average PPT per centroid, calendar
# year and season. December is still counted in its own calendar year here.
data <- data %>%
  mutate(
    SEASON = case_when(
      MONTH %in% c(12, 1, 2) ~ "WINTER",
      MONTH %in% 3:5 ~ "SPRING",
      MONTH %in% 6:8 ~ "SUMMER",
      MONTH %in% 9:11 ~ "AUTUMN"
    )
  ) %>%
  group_by(CENTROID_ID, YEAR, SEASON) %>%
  summarise(PPT_AVG = mean(PPT))
结果如下:
> data
# A tibble: 80 x 4
# Groups: CENTROID_ID, YEAR [20]
CENTROID_ID YEAR SEASON PPT_AVG
<chr> <int> <chr> <dbl>
1 c1763_1 1910 AUTUMN 35.7
2 c1763_1 1910 SPRING 44.3
3 c1763_1 1910 SUMMER 63.3
4 c1763_1 1910 WINTER 37.1
5 c1763_1 1911 AUTUMN 40.7
6 c1763_1 1911 SPRING 52.3
7 c1763_1 1911 SUMMER 36.7
8 c1763_1 1911 WINTER 10.7
9 c1763_1 1912 AUTUMN 45.4
10 c1763_1 1912 SPRING 45.7
# ... with 70 more rows
更新
要把 12 月计入下一年的冬季,需要把年份前移(例如用 lead(YEAR, 1)),得到新列 YEAR_LEAD。代码已更新,并将 SEASON 设为具有指定水平(levels)的因子,以便按时间顺序排序。
# Seasonal mean / min / max of PPT per centroid, with every December counted
# towards the winter of the FOLLOWING year (e.g. Dec 1910 -> winter 1911).
# Expects the monthly records (columns CENTROID_ID, YEAR, MONTH, PPT) in `data`.
data <- data %>%
  mutate(
    SEASON = case_when(
      MONTH %in% c(12, 1, 2) ~ "WINTER",
      MONTH %in% 3:5 ~ "SPRING",
      MONTH %in% 6:8 ~ "SUMMER",
      MONTH %in% 9:11 ~ "AUTUMN"
    ),
    # Explicit levels make seasons sort chronologically, not alphabetically.
    SEASON = factor(SEASON, levels = c("WINTER", "SPRING", "SUMMER", "AUTUMN")),
    # Season year computed from each row's own MONTH rather than with
    # lead(YEAR, 1): lead() takes the value from the next row, so it leaves NA
    # on the last row, bleeds across CENTROID_ID boundaries, and only works on
    # sorted, gap-free data.
    YEAR_LEAD = YEAR + as.integer(MONTH == 12)
  ) %>%
  group_by(CENTROID_ID, YEAR_LEAD, SEASON) %>%
  summarise(
    PPT_AVG = mean(PPT),
    PPT_MIN = min(PPT),
    PPT_MAX = max(PPT)
  )
结果如下:
> data
# A tibble: 81 x 6
# Groups: CENTROID_ID, YEAR_LEAD [21]
CENTROID_ID YEAR_LEAD SEASON PPT_AVG PPT_MIN PPT_MAX
<chr> <int> <fct> <dbl> <dbl> <dbl>
1 c1763_1 1910 WINTER 83.5 81.4 85.7
2 c1763_1 1910 SPRING 72.3 52.7 96.0
3 c1763_1 1910 SUMMER 49.9 10.9 90.0
4 c1763_1 1910 AUTUMN 26.4 7.17 63.1
5 c1763_1 1911 WINTER 60.9 19.0 92.6
6 c1763_1 1911 SPRING 62.9 58.6 67.4
7 c1763_1 1911 SUMMER 49.2 23.7 76.4
8 c1763_1 1911 AUTUMN 43.9 15.1 84.4
9 c1763_1 1912 WINTER 38.5 18.4 67.9
10 c1763_1 1912 SPRING 72.1 53.4 93.9
# ... with 71 more rows
只需添加一个 SEASON 列,然后使用 aggregate:
# Label each row with its season and move every December into the following
# year, so that a winter is Dec (previous year) + Jan + Feb.
# NOTE: the YEAR shift is not idempotent -- run it once on the raw data.
month_to_season <- c(
  "Winter", "Winter", "Spring", "Spring", "Spring", "Summer",
  "Summer", "Summer", "Fall", "Fall", "Fall", "Winter"
)
df <- within(df, {
  # match() yields NA for anything outside 1..12, so such rows get SEASON = NA.
  SEASON <- month_to_season[match(MONTH, 1:12)]
  YEAR <- ifelse(MONTH == 12, YEAR + 1, YEAR)
})

# Group by YEAR as well: without it the shifted year is never used and all
# years are pooled into one value per season.
agg_df <- aggregate(PPT ~ CENTROID_ID + YEAR + SEASON, df, FUN = mean)
agg_df
# CENTROID_ID YEAR SEASON PPT
# 1 c1763_1 1910 Fall 39.97033
# 2 c1763_1 1911 Fall 45.08467
# 3 c1763_1 1912 Fall 47.02533
# 4 c1763_1 1913 Fall 43.11800
# 5 c1763_1 1910 Spring 10.75133
# 6 c1763_1 1911 Spring 17.09667
# 7 c1763_1 1912 Spring 34.40600
# 8 c1763_1 1913 Spring 8.98800
# 9 c1763_1 1910 Summer 38.13233
# 10 c1763_1 1911 Summer 44.60067
# 11 c1763_1 1912 Summer 35.85467
# 12 c1763_1 1913 Summer 29.09867
# 13 c1763_1 1910 Winter 36.75450
# 14 c1763_1 1911 Winter 56.32500
# 15 c1763_1 1912 Winter 15.59233
# 16 c1763_1 1913 Winter 33.05667
# 17 c1763_1 1914 Winter 55.98433
如果需要对 PPT 做多个聚合:
# min / mean / max of PPT per centroid, year and season.
# Assumes df already has SEASON and a YEAR in which December has been moved
# into the following year (see the within() step earlier in this answer).
# YEAR is part of the grouping so that each year gets its own seasonal values
# instead of all years being pooled.
# FUN returns a named vector, so the three statistics come back as one
# multi-column PPT entry; do.call(data.frame, ...) expands it into the
# separate PPT.min / PPT.mean / PPT.max columns.
agg_raw <- aggregate(
  PPT ~ CENTROID_ID + YEAR + SEASON, df,
  FUN = function(x) c(min = min(x), mean = mean(x), max = max(x))
)
agg_df <- do.call(data.frame, agg_raw)
agg_df
# CENTROID_ID YEAR SEASON PPT.min PPT.mean PPT.max
# 1 c1763_1 1910 Fall 24.719 39.97033 67.377
# 2 c1763_1 1911 Fall 12.491 45.08467 86.340
# 3 c1763_1 1912 Fall 10.420 47.02533 119.725
# 4 c1763_1 1913 Fall 22.729 43.11800 58.495
# 5 c1763_1 1910 Spring 1.201 10.75133 21.706
# 6 c1763_1 1911 Spring 0.739 17.09667 37.720
# 7 c1763_1 1912 Spring 5.314 34.40600 72.017
# 8 c1763_1 1913 Spring 1.423 8.98800 17.533
# 9 c1763_1 1910 Summer 11.267 38.13233 61.260
# 10 c1763_1 1911 Summer 18.198 44.60067 74.731
# 11 c1763_1 1912 Summer 8.595 35.85467 51.188
# 12 c1763_1 1913 Summer 3.773 29.09867 42.982
# 13 c1763_1 1910 Winter 20.870 36.75450 52.639
# 14 c1763_1 1911 Winter 30.212 56.32500 88.728
# 15 c1763_1 1912 Winter 11.010 15.59233 19.428
# 16 c1763_1 1913 Winter 8.777 33.05667 62.622
# 17 c1763_1 1914 Winter 31.707 55.98433 104.197
您可以创建一个辅助(auxiliary)列 x$YEAR + x$MONTH/12,这样就能把上一年的 12 月归入当年。然后只需对 PPT 按 CENTROID_ID、floor(aux) 和 floor(aux %% 1 * 4) 进行 aggregate,最后用 reshape 把各个季节放到同一行。
# Fractional "season clock": December (12/12 = 1) lands exactly on the next
# whole year, so floor(aux) is the season year and aux %% 1 is the position
# within that year.
x$aux <- x$YEAR + x$MONTH/12
# Mean PPT per centroid, season year and season.
# floor(aux %% 1 * 4) maps months 12, 1, 2 -> 0; 3-5 -> 1; 6-8 -> 2; 9-11 -> 3,
# which (after the +1) indexes the label vector WINT / SPR / SUM / FALL.
# NOTE(review): cbind(NAME = ...) inside the formula appears to be there to
# give the grouping columns the names YEAR and SEASON -- confirm before editing.
y <- aggregate(PPT ~ CENTROID_ID + cbind(YEAR=floor(aux)) + cbind(SEASON=c("WINT",
"SPR", "SUM", "FALL")[1+floor(aux %% 1 * 4)]), x, mean)
# Long -> wide: one row per CENTROID_ID / YEAR with one PPT.<SEASON> column per
# season; the result is printed, not assigned.
reshape(do.call(data.frame, y), v.names = "PPT", timevar = "SEASON", idvar =
c("CENTROID_ID", "YEAR"), direction = "wide")
# CENTROID_ID YEAR PPT.FALL PPT.SPR PPT.SUM PPT.WINT
#1 c1763_1 1910 39.97033 10.75133 38.13233 36.75450
#2 c1763_1 1911 45.08467 17.09667 44.60067 56.32500
#3 c1763_1 1912 47.02533 34.40600 35.85467 15.59233
#4 c1763_1 1913 43.11800 8.98800 29.09867 33.05667
#17 c1763_1 1914 NA NA NA 55.98433
我从其他帖子中找到了一些选项,但我无法根据我的特定需求重现代码。
我有气候数据,我想根据 1910 年至 2015 年的季节找到特定位置的平均值。
这是我需要的示例:CENTROID_ID c1763_1 在 1911 年冬季(前一年 [1910] 的第 12 个月,以及所讨论年份 [1911] 的第 1 个月和第 2 个月)、春季(1911 年第 3、4、5 个月)、夏季(1911 年第 6、7、8 个月)和秋季(1911 年第 9、10、11 个月)的平均 PPT。然后需要对每一年、每一个质心 ID 都执行此操作。我有 400 多个不同的 CENTROID_ID,涵盖 1910-2015 年。
我设想新数据框包含 CENTROID_ID、YEAR、WINT_PPT、SPR_PPT、SUM_PPT、FALL_PPT 这几列。
CENTROID_ID YEAR MONTH PPT
1 c1763_1 1910 1 52.639
2 c1763_1 1910 2 20.870
3 c1763_1 1910 3 21.706
4 c1763_1 1910 4 9.347
5 c1763_1 1910 5 1.201
6 c1763_1 1910 6 11.267
7 c1763_1 1910 7 41.870
8 c1763_1 1910 8 61.260
9 c1763_1 1910 9 27.815
10 c1763_1 1910 10 67.377
11 c1763_1 1910 11 24.719
12 c1763_1 1910 12 30.212
13 c1763_1 1911 1 88.728
14 c1763_1 1911 2 50.035
15 c1763_1 1911 3 37.720
16 c1763_1 1911 4 12.831
17 c1763_1 1911 5 0.739
18 c1763_1 1911 6 18.198
19 c1763_1 1911 7 74.731
20 c1763_1 1911 8 40.873
21 c1763_1 1911 9 86.340
22 c1763_1 1911 10 36.423
23 c1763_1 1911 11 12.491
24 c1763_1 1911 12 19.428
25 c1763_1 1912 1 11.010
26 c1763_1 1912 2 16.339
27 c1763_1 1912 3 72.017
28 c1763_1 1912 4 25.887
29 c1763_1 1912 5 5.314
30 c1763_1 1912 6 8.595
31 c1763_1 1912 7 47.781
32 c1763_1 1912 8 51.188
33 c1763_1 1912 9 10.931
34 c1763_1 1912 10 119.725
35 c1763_1 1912 11 10.420
36 c1763_1 1912 12 8.777
37 c1763_1 1913 1 27.771
38 c1763_1 1913 2 62.622
39 c1763_1 1913 3 17.533
40 c1763_1 1913 4 8.008
41 c1763_1 1913 5 1.423
42 c1763_1 1913 6 3.773
43 c1763_1 1913 7 42.982
44 c1763_1 1913 8 40.541
45 c1763_1 1913 9 58.495
46 c1763_1 1913 10 22.729
47 c1763_1 1913 11 48.130
48 c1763_1 1913 12 32.049
49 c1763_1 1914 1 104.197
50 c1763_1 1914 2 31.707
是这个吗?
library(dplyr)
library(tibble)
# Build a synthetic example: a single centroid with 240 monthly records
# (240 = 20 years x 12 months, taken from the start of the year-sorted grid)
# and uniformly random PPT values between 1 and 100.
dates <- expand.grid(1910:1950, 1:12)
dates <- dates[with(dates, order(Var1)), ]
data <- tibble(
  CENTROID_ID = rep("c1763_1", 240),
  YEAR = dates[["Var1"]][seq_len(240)],
  MONTH = dates[["Var2"]][seq_len(240)],
  PPT = runif(240, min = 1, max = 100)
)
然后我们可以根据 YEAR 和 MONTH 确定 SEASON,再分组并计算每组的平均值:
# Tag every record with its season, then average PPT per centroid, calendar
# year and season. December is still counted in its own calendar year here.
data <- data %>%
  mutate(
    SEASON = case_when(
      MONTH %in% c(12, 1, 2) ~ "WINTER",
      MONTH %in% 3:5 ~ "SPRING",
      MONTH %in% 6:8 ~ "SUMMER",
      MONTH %in% 9:11 ~ "AUTUMN"
    )
  ) %>%
  group_by(CENTROID_ID, YEAR, SEASON) %>%
  summarise(PPT_AVG = mean(PPT))
结果如下:
> data
# A tibble: 80 x 4
# Groups: CENTROID_ID, YEAR [20]
CENTROID_ID YEAR SEASON PPT_AVG
<chr> <int> <chr> <dbl>
1 c1763_1 1910 AUTUMN 35.7
2 c1763_1 1910 SPRING 44.3
3 c1763_1 1910 SUMMER 63.3
4 c1763_1 1910 WINTER 37.1
5 c1763_1 1911 AUTUMN 40.7
6 c1763_1 1911 SPRING 52.3
7 c1763_1 1911 SUMMER 36.7
8 c1763_1 1911 WINTER 10.7
9 c1763_1 1912 AUTUMN 45.4
10 c1763_1 1912 SPRING 45.7
# ... with 70 more rows
更新
要把 12 月计入下一年的冬季,需要把年份前移(例如用 lead(YEAR, 1)),得到新列 YEAR_LEAD。代码已更新,并将 SEASON 设为具有指定水平(levels)的因子,以便按时间顺序排序。
# Seasonal mean / min / max of PPT per centroid, with every December counted
# towards the winter of the FOLLOWING year (e.g. Dec 1910 -> winter 1911).
# Expects the monthly records (columns CENTROID_ID, YEAR, MONTH, PPT) in `data`.
data <- data %>%
  mutate(
    SEASON = case_when(
      MONTH %in% c(12, 1, 2) ~ "WINTER",
      MONTH %in% 3:5 ~ "SPRING",
      MONTH %in% 6:8 ~ "SUMMER",
      MONTH %in% 9:11 ~ "AUTUMN"
    ),
    # Explicit levels make seasons sort chronologically, not alphabetically.
    SEASON = factor(SEASON, levels = c("WINTER", "SPRING", "SUMMER", "AUTUMN")),
    # Season year computed from each row's own MONTH rather than with
    # lead(YEAR, 1): lead() takes the value from the next row, so it leaves NA
    # on the last row, bleeds across CENTROID_ID boundaries, and only works on
    # sorted, gap-free data.
    YEAR_LEAD = YEAR + as.integer(MONTH == 12)
  ) %>%
  group_by(CENTROID_ID, YEAR_LEAD, SEASON) %>%
  summarise(
    PPT_AVG = mean(PPT),
    PPT_MIN = min(PPT),
    PPT_MAX = max(PPT)
  )
结果如下:
> data
# A tibble: 81 x 6
# Groups: CENTROID_ID, YEAR_LEAD [21]
CENTROID_ID YEAR_LEAD SEASON PPT_AVG PPT_MIN PPT_MAX
<chr> <int> <fct> <dbl> <dbl> <dbl>
1 c1763_1 1910 WINTER 83.5 81.4 85.7
2 c1763_1 1910 SPRING 72.3 52.7 96.0
3 c1763_1 1910 SUMMER 49.9 10.9 90.0
4 c1763_1 1910 AUTUMN 26.4 7.17 63.1
5 c1763_1 1911 WINTER 60.9 19.0 92.6
6 c1763_1 1911 SPRING 62.9 58.6 67.4
7 c1763_1 1911 SUMMER 49.2 23.7 76.4
8 c1763_1 1911 AUTUMN 43.9 15.1 84.4
9 c1763_1 1912 WINTER 38.5 18.4 67.9
10 c1763_1 1912 SPRING 72.1 53.4 93.9
# ... with 71 more rows
只需添加一个 SEASON 列,然后使用 aggregate:
# Label each row with its season and move every December into the following
# year, so that a winter is Dec (previous year) + Jan + Feb.
# NOTE: the YEAR shift is not idempotent -- run it once on the raw data.
month_to_season <- c(
  "Winter", "Winter", "Spring", "Spring", "Spring", "Summer",
  "Summer", "Summer", "Fall", "Fall", "Fall", "Winter"
)
df <- within(df, {
  # match() yields NA for anything outside 1..12, so such rows get SEASON = NA.
  SEASON <- month_to_season[match(MONTH, 1:12)]
  YEAR <- ifelse(MONTH == 12, YEAR + 1, YEAR)
})

# Group by YEAR as well: without it the shifted year is never used and all
# years are pooled into one value per season.
agg_df <- aggregate(PPT ~ CENTROID_ID + YEAR + SEASON, df, FUN = mean)
agg_df
# CENTROID_ID YEAR SEASON PPT
# 1 c1763_1 1910 Fall 39.97033
# 2 c1763_1 1911 Fall 45.08467
# 3 c1763_1 1912 Fall 47.02533
# 4 c1763_1 1913 Fall 43.11800
# 5 c1763_1 1910 Spring 10.75133
# 6 c1763_1 1911 Spring 17.09667
# 7 c1763_1 1912 Spring 34.40600
# 8 c1763_1 1913 Spring 8.98800
# 9 c1763_1 1910 Summer 38.13233
# 10 c1763_1 1911 Summer 44.60067
# 11 c1763_1 1912 Summer 35.85467
# 12 c1763_1 1913 Summer 29.09867
# 13 c1763_1 1910 Winter 36.75450
# 14 c1763_1 1911 Winter 56.32500
# 15 c1763_1 1912 Winter 15.59233
# 16 c1763_1 1913 Winter 33.05667
# 17 c1763_1 1914 Winter 55.98433
如果需要对 PPT 做多个聚合:
# min / mean / max of PPT per centroid, year and season.
# Assumes df already has SEASON and a YEAR in which December has been moved
# into the following year (see the within() step earlier in this answer).
# YEAR is part of the grouping so that each year gets its own seasonal values
# instead of all years being pooled.
# FUN returns a named vector, so the three statistics come back as one
# multi-column PPT entry; do.call(data.frame, ...) expands it into the
# separate PPT.min / PPT.mean / PPT.max columns.
agg_raw <- aggregate(
  PPT ~ CENTROID_ID + YEAR + SEASON, df,
  FUN = function(x) c(min = min(x), mean = mean(x), max = max(x))
)
agg_df <- do.call(data.frame, agg_raw)
agg_df
# CENTROID_ID YEAR SEASON PPT.min PPT.mean PPT.max
# 1 c1763_1 1910 Fall 24.719 39.97033 67.377
# 2 c1763_1 1911 Fall 12.491 45.08467 86.340
# 3 c1763_1 1912 Fall 10.420 47.02533 119.725
# 4 c1763_1 1913 Fall 22.729 43.11800 58.495
# 5 c1763_1 1910 Spring 1.201 10.75133 21.706
# 6 c1763_1 1911 Spring 0.739 17.09667 37.720
# 7 c1763_1 1912 Spring 5.314 34.40600 72.017
# 8 c1763_1 1913 Spring 1.423 8.98800 17.533
# 9 c1763_1 1910 Summer 11.267 38.13233 61.260
# 10 c1763_1 1911 Summer 18.198 44.60067 74.731
# 11 c1763_1 1912 Summer 8.595 35.85467 51.188
# 12 c1763_1 1913 Summer 3.773 29.09867 42.982
# 13 c1763_1 1910 Winter 20.870 36.75450 52.639
# 14 c1763_1 1911 Winter 30.212 56.32500 88.728
# 15 c1763_1 1912 Winter 11.010 15.59233 19.428
# 16 c1763_1 1913 Winter 8.777 33.05667 62.622
# 17 c1763_1 1914 Winter 31.707 55.98433 104.197
您可以创建一个辅助(auxiliary)列 x$YEAR + x$MONTH/12,这样就能把上一年的 12 月归入当年。然后只需对 PPT 按 CENTROID_ID、floor(aux) 和 floor(aux %% 1 * 4) 进行 aggregate,最后用 reshape 把各个季节放到同一行。
# Fractional "season clock": December (12/12 = 1) lands exactly on the next
# whole year, so floor(aux) is the season year and aux %% 1 is the position
# within that year.
x$aux <- x$YEAR + x$MONTH/12
# Mean PPT per centroid, season year and season.
# floor(aux %% 1 * 4) maps months 12, 1, 2 -> 0; 3-5 -> 1; 6-8 -> 2; 9-11 -> 3,
# which (after the +1) indexes the label vector WINT / SPR / SUM / FALL.
# NOTE(review): cbind(NAME = ...) inside the formula appears to be there to
# give the grouping columns the names YEAR and SEASON -- confirm before editing.
y <- aggregate(PPT ~ CENTROID_ID + cbind(YEAR=floor(aux)) + cbind(SEASON=c("WINT",
"SPR", "SUM", "FALL")[1+floor(aux %% 1 * 4)]), x, mean)
# Long -> wide: one row per CENTROID_ID / YEAR with one PPT.<SEASON> column per
# season; the result is printed, not assigned.
reshape(do.call(data.frame, y), v.names = "PPT", timevar = "SEASON", idvar =
c("CENTROID_ID", "YEAR"), direction = "wide")
# CENTROID_ID YEAR PPT.FALL PPT.SPR PPT.SUM PPT.WINT
#1 c1763_1 1910 39.97033 10.75133 38.13233 36.75450
#2 c1763_1 1911 45.08467 17.09667 44.60067 56.32500
#3 c1763_1 1912 47.02533 34.40600 35.85467 15.59233
#4 c1763_1 1913 43.11800 8.98800 29.09867 33.05667
#17 c1763_1 1914 NA NA NA 55.98433