如何根据多组条件求平均值、最大值和最小值

How to find the average, max, and min based on multiple sets of criteria

我从其他帖子中找到了一些选项,但我无法根据我的特定需求重现代码。

我有气候数据,我想根据 1910 年至 2015 年的季节找到特定位置的平均值。

这是我需要的示例:CENTROID_ID c1763_1 在 1911 年冬季(前一年 [1910] 的 12 月,以及当年 [1911] 的 1 月和 2 月)、春季(1911 年 3、4、5 月)、夏季(1911 年 6、7、8 月)和秋季(1911 年 9、10、11 月)的平均 PPT。然后需要对每一年的每个质心 ID 都执行同样的计算。我有 400 多个不同的 CENTROID_ID,涵盖 1910-2015 年。

我设想新数据框包含 CENTROID_ID、YEAR、WINT_PPT、SPR_PPT、SUM_PPT、FALL_PPT 这几列。

 CENTROID_ID    YEAR MONTH  PPT
1       c1763_1 1910     1  52.639
2       c1763_1 1910     2  20.870
3       c1763_1 1910     3  21.706
4       c1763_1 1910     4   9.347
5       c1763_1 1910     5   1.201
6       c1763_1 1910     6  11.267
7       c1763_1 1910     7  41.870
8       c1763_1 1910     8  61.260
9       c1763_1 1910     9  27.815
10      c1763_1 1910    10  67.377
11      c1763_1 1910    11  24.719
12      c1763_1 1910    12  30.212
13      c1763_1 1911     1  88.728
14      c1763_1 1911     2  50.035
15      c1763_1 1911     3  37.720
16      c1763_1 1911     4  12.831
17      c1763_1 1911     5   0.739
18      c1763_1 1911     6  18.198
19      c1763_1 1911     7  74.731
20      c1763_1 1911     8  40.873
21      c1763_1 1911     9  86.340
22      c1763_1 1911    10  36.423
23      c1763_1 1911    11  12.491
24      c1763_1 1911    12  19.428
25      c1763_1 1912     1  11.010
26      c1763_1 1912     2  16.339
27      c1763_1 1912     3  72.017
28      c1763_1 1912     4  25.887
29      c1763_1 1912     5   5.314
30      c1763_1 1912     6   8.595
31      c1763_1 1912     7  47.781
32      c1763_1 1912     8  51.188
33      c1763_1 1912     9  10.931
34      c1763_1 1912    10 119.725
35      c1763_1 1912    11  10.420
36      c1763_1 1912    12   8.777
37      c1763_1 1913     1  27.771
38      c1763_1 1913     2  62.622
39      c1763_1 1913     3  17.533
40      c1763_1 1913     4   8.008
41      c1763_1 1913     5   1.423
42      c1763_1 1913     6   3.773
43      c1763_1 1913     7  42.982
44      c1763_1 1913     8  40.541
45      c1763_1 1913     9  58.495
46      c1763_1 1913    10  22.729
47      c1763_1 1913    11  48.130
48      c1763_1 1913    12  32.049
49      c1763_1 1914     1 104.197
50      c1763_1 1914     2  31.707

是这个吗?

library(dplyr)
library(tibble)

# Simulate 240 monthly precipitation values (1910-1929) for a single centroid.
# expand.grid() produces every year/month pair for 1910-1950; ordering by the
# year column (Var1) keeps the months of each year in ascending order, so the
# first 240 rows are the 20 complete years 1910-1929.
dates <- expand.grid(1910:1950, 1:12)
dates <- dates[order(dates[["Var1"]]), ]
data <- tibble(
  CENTROID_ID = "c1763_1",  # length-1 value is recycled to all 240 rows
  YEAR = head(dates[["Var1"]], 240),
  MONTH = head(dates[["Var2"]], 240),
  PPT = runif(n = 240, min = 1, max = 100)
)

然后我们可以根据 MONTH 确定 SEASON,按 CENTROID_ID、YEAR 和 SEASON 分组,并计算每组的平均值:

# Label every month with its season, then average PPT per centroid, calendar
# year and season. December is still counted with its own calendar year here.
data <- data %>%
  mutate(
    SEASON = case_when(
      MONTH %in% c(12, 1, 2) ~ "WINTER",
      MONTH %in% 3:5 ~ "SPRING",
      MONTH %in% 6:8 ~ "SUMMER",
      MONTH %in% 9:11 ~ "AUTUMN"
    )
  ) %>%
  group_by(CENTROID_ID, YEAR, SEASON) %>%
  summarise(PPT_AVG = mean(PPT))

结果如下:

> data
# A tibble: 80 x 4
# Groups:   CENTROID_ID, YEAR [20]
   CENTROID_ID  YEAR SEASON PPT_AVG
   <chr>       <int> <chr>    <dbl>
 1 c1763_1      1910 AUTUMN    35.7
 2 c1763_1      1910 SPRING    44.3
 3 c1763_1      1910 SUMMER    63.3
 4 c1763_1      1910 WINTER    37.1
 5 c1763_1      1911 AUTUMN    40.7
 6 c1763_1      1911 SPRING    52.3
 7 c1763_1      1911 SUMMER    36.7
 8 c1763_1      1911 WINTER    10.7
 9 c1763_1      1912 AUTUMN    45.4
10 c1763_1      1912 SPRING    45.7
# ... with 70 more rows

更新:为了把 12 月归入下一年,我们需要引入 lead(YEAR, 1)。我更新了代码,并将 SEASON 转换为指定了水平(levels)顺序的因子(factor),以便按正确的时间顺序排序。

# Seasonal mean/min/max of PPT per centroid and "season year".
# NOTE: expects `data` to be the raw monthly table (CENTROID_ID, YEAR, MONTH,
# PPT), not the summarised result of an earlier step.
data <- data %>%
  mutate(
    SEASON = case_when(
      MONTH %in% c(12, 1, 2) ~ "WINTER",
      MONTH %in% 3:5 ~ "SPRING",
      MONTH %in% 6:8 ~ "SUMMER",
      MONTH %in% 9:11 ~ "AUTUMN"
    ),
    # Fixed levels make the seasons sort chronologically, not alphabetically.
    SEASON = factor(SEASON, levels = c("WINTER", "SPRING", "SUMMER", "AUTUMN")),
    # December belongs to the winter of the following year. Derive that from
    # MONTH instead of lead(YEAR, 1): lead() depends on the row order, picks up
    # the first year of the next CENTROID_ID at every centroid boundary, and
    # returns NA for the very last row.
    YEAR_LEAD = if_else(MONTH == 12, YEAR + 1L, YEAR)
  ) %>%
  group_by(CENTROID_ID, YEAR_LEAD, SEASON) %>%
  summarise(
    PPT_AVG = mean(PPT),
    PPT_MIN = min(PPT),
    PPT_MAX = max(PPT)
  )

结果如下:

> data
# A tibble: 81 x 6
# Groups:   CENTROID_ID, YEAR_LEAD [21]
   CENTROID_ID YEAR_LEAD SEASON PPT_AVG PPT_MIN PPT_MAX
   <chr>           <int> <fct>    <dbl>   <dbl>   <dbl>
 1 c1763_1          1910 WINTER    83.5   81.4     85.7
 2 c1763_1          1910 SPRING    72.3   52.7     96.0
 3 c1763_1          1910 SUMMER    49.9   10.9     90.0
 4 c1763_1          1910 AUTUMN    26.4    7.17    63.1
 5 c1763_1          1911 WINTER    60.9   19.0     92.6
 6 c1763_1          1911 SPRING    62.9   58.6     67.4
 7 c1763_1          1911 SUMMER    49.2   23.7     76.4
 8 c1763_1          1911 AUTUMN    43.9   15.1     84.4
 9 c1763_1          1912 WINTER    38.5   18.4     67.9
10 c1763_1          1912 SPRING    72.1   53.4     93.9
# ... with 71 more rows

只需添加一个 SEASON 列,然后用 aggregate 进行聚合:

# Tag each row with its season (NA when MONTH matches no season) and move
# December into the following year, so that a winter (Dec, Jan, Feb) shares a
# single YEAR value.
df$SEASON <- with(df, ifelse(
  MONTH %in% c(12, 1, 2), "Winter",
  ifelse(
    MONTH %in% 3:5, "Spring",
    ifelse(
      MONTH %in% 6:8, "Summer",
      ifelse(MONTH %in% 9:11, "Fall", NA)
    )
  )
))
df$YEAR <- with(df, ifelse(MONTH == 12, YEAR + 1, YEAR))

# Mean PPT per centroid, year and season. YEAR has to be a grouping term:
# without it every year is pooled into one value per season, and the
# December -> next-year adjustment of df$YEAR would never be used.
# (Assumes df$YEAR has already been shifted for December rows.)
agg_df <- aggregate(PPT ~ CENTROID_ID + YEAR + SEASON, df, FUN = mean)
agg_df
#    CENTROID_ID YEAR SEASON      PPT
# 1      c1763_1 1910   Fall 39.97033
# 2      c1763_1 1911   Fall 45.08467
# 3      c1763_1 1912   Fall 47.02533
# 4      c1763_1 1913   Fall 43.11800
# 5      c1763_1 1910 Spring 10.75133
# 6      c1763_1 1911 Spring 17.09667
# 7      c1763_1 1912 Spring 34.40600
# 8      c1763_1 1913 Spring  8.98800
# 9      c1763_1 1910 Summer 38.13233
# 10     c1763_1 1911 Summer 44.60067
# 11     c1763_1 1912 Summer 35.85467
# 12     c1763_1 1913 Summer 29.09867
# 13     c1763_1 1910 Winter 36.75450
# 14     c1763_1 1911 Winter 56.32500
# 15     c1763_1 1912 Winter 15.59233
# 16     c1763_1 1913 Winter 33.05667
# 17     c1763_1 1914 Winter 55.98433

如果需要对 PPT 同时计算多个聚合值:

# Min, mean and max of PPT per centroid, year and season. YEAR is part of the
# grouping so each season of each year gets its own row instead of all years
# being pooled together.
# (Assumes df$YEAR has already been shifted for December rows.)
agg_raw <- aggregate(
  PPT ~ CENTROID_ID + YEAR + SEASON, df,
  FUN = function(x) c(min = min(x), mean = mean(x), max = max(x))
)

# aggregate() returns the three statistics as one matrix column named PPT;
# rebuilding the data frame spreads them into PPT.min, PPT.mean and PPT.max.
agg_df <- do.call(data.frame, agg_raw)
agg_df

#    CENTROID_ID YEAR SEASON PPT.min PPT.mean PPT.max
# 1      c1763_1 1910   Fall  24.719 39.97033  67.377
# 2      c1763_1 1911   Fall  12.491 45.08467  86.340
# 3      c1763_1 1912   Fall  10.420 47.02533 119.725
# 4      c1763_1 1913   Fall  22.729 43.11800  58.495
# 5      c1763_1 1910 Spring   1.201 10.75133  21.706
# 6      c1763_1 1911 Spring   0.739 17.09667  37.720
# 7      c1763_1 1912 Spring   5.314 34.40600  72.017
# 8      c1763_1 1913 Spring   1.423  8.98800  17.533
# 9      c1763_1 1910 Summer  11.267 38.13233  61.260
# 10     c1763_1 1911 Summer  18.198 44.60067  74.731
# 11     c1763_1 1912 Summer   8.595 35.85467  51.188
# 12     c1763_1 1913 Summer   3.773 29.09867  42.982
# 13     c1763_1 1910 Winter  20.870 36.75450  52.639
# 14     c1763_1 1911 Winter  30.212 56.32500  88.728
# 15     c1763_1 1912 Winter  11.010 15.59233  19.428
# 16     c1763_1 1913 Winter   8.777 33.05667  62.622
# 17     c1763_1 1914 Winter  31.707 55.98433 104.197

您可以创建一个辅助(auxiliary)列 x$YEAR + x$MONTH/12,这有助于把上一年的 12 月归入当年。然后只需用 aggregate 按 CENTROID_ID、floor(aux) 和 floor(aux %% 1 * 4) 对 PPT 进行聚合。最后用 reshape 把各个季节放在同一行。

# Auxiliary decimal year. December (MONTH == 12) adds a full 1, so it lands on
# the following year with a fractional part of 0.
x$aux <- x$YEAR + x$MONTH/12
# Mean PPT grouped by centroid, floor(aux) (named YEAR via cbind) and a season
# label. floor(aux %% 1 * 4) turns the fractional part of aux into 0..3, which
# after adding 1 indexes c("WINT", "SPR", "SUM", "FALL").
y <- aggregate(PPT ~ CENTROID_ID + cbind(YEAR=floor(aux)) + cbind(SEASON=c("WINT",
 "SPR", "SUM", "FALL")[1+floor(aux %% 1 * 4)]), x, mean)
# Long -> wide: one row per CENTROID_ID/YEAR with one PPT.<SEASON> column per
# season. do.call(data.frame, y) presumably flattens the cbind()-built columns
# into plain YEAR and SEASON columns first -- TODO confirm.
reshape(do.call(data.frame, y), v.names = "PPT", timevar = "SEASON", idvar = 
 c("CENTROID_ID", "YEAR"), direction = "wide")
#   CENTROID_ID YEAR PPT.FALL  PPT.SPR  PPT.SUM PPT.WINT
#1      c1763_1 1910 39.97033 10.75133 38.13233 36.75450
#2      c1763_1 1911 45.08467 17.09667 44.60067 56.32500
#3      c1763_1 1912 47.02533 34.40600 35.85467 15.59233
#4      c1763_1 1913 43.11800  8.98800 29.09867 33.05667
#17     c1763_1 1914       NA       NA       NA 55.98433