按固定日期范围聚合 R
Aggregating by fixed date range R
下面是我的数据集的简化示例:
# Sample data: two plots (ID 1 = oak, ID 2 = pine), each observed on the same
# twelve dates (four per year, 1987-1989), with one NDVI reading per date.
df <- data.frame(
  "ID" = rep(c(1, 2), each = 12),
  "ForestType" = rep(c("oak", "pine"), each = 12),
  # Both plots share the same observation dates, so the vector is repeated.
  "Date" = rep(
    c("1987.01.01", "1987.06.01", "1987.10.01", "1987.11.01",
      "1988.01.01", "1988.03.01", "1988.04.01", "1988.06.01",
      "1989.03.01", "1989.05.01", "1989.07.01", "1989.08.01"),
    times = 2
  ),
  "NDVI" = c(
    0.1, 0.2, 0.3, 0.55, 0.31, 0.26, 0.34, 0.52, 0.41, 0.45, 0.50, 0.7,
    0.2, 0.3, 0.4, 0.53, 0.52, 0.54, 0.78, 0.73, 0.72, 0.71, 0.76, 0.9
  ),
  check.names = FALSE,
  stringsAsFactors = FALSE
)
我想获得某个时间段内 NDVI 值的均值,在本例中时间段为年份。考虑到在我的真实数据集中需要按季节来计算,所以方法应该易于调整。
这些均值应该考虑:
- 修剪异常值:例如去掉最高的 25% 和最低的 25% 的值。
- 它们应该按类别分别计算,在本例中类别是 ID 字段。
所以输出应该类似于:
> desired_df
ID ForestType Date meanNDVI
1 1 oak 1987 0.250
2 1 oak 1988 0.325
3 1 oak 1989 0.430
4 2 pine 1987 0.350
5 2 pine 1988 0.635
6 2 pine 1989 0.740
在这种情况下,例如,0.250 对应于 ID=1 在 1987 年的 NDVI 平均值,它是去掉那一年 4 个值中最低值和最高值之后的平均值。
非常感谢!
library(tidyverse)
library(lubridate)
# Trimmed mean NDVI per ID / ForestType / calendar year.
# mean(trim = 0.25) drops the lowest and highest 25% of each group before
# averaging. Unlike a strict `>` / `<` filter against the group quartiles, it
# never removes every row of a small group (e.g. one with only two
# observations) or of a group with values tied at a quartile, so no group
# silently disappears from the result.
df %>%
  mutate(Date = as.Date(Date, format = "%Y.%m.%d")) %>%
  group_by(ID, ForestType, Year = year(Date)) %>%
  summarise(meanNDVI = mean(NDVI, trim = 0.25))
输出
# A tibble: 6 x 4
# Groups: ID, ForestType [2]
ID ForestType Year meanNDVI
<dbl> <chr> <dbl> <dbl>
1 1 oak 1987 0.25
2 1 oak 1988 0.325
3 1 oak 1989 0.475
4 2 pine 1987 0.35
5 2 pine 1988 0.635
6 2 pine 1989 0.74
使用 aggregate 的经典基础 R 方法。可以使用 substr 获取年份。
# Plain (untrimmed) mean NDVI per ID / ForestType / year.
# The year is taken as the first four characters of the Date string.
res <- aggregate(
  list(meanNDVI = df$NDVI),
  by = list(
    ID = df$ID,
    ForestType = df$ForestType,
    date = substr(df$Date, 1, 4)
  ),
  FUN = mean
)
# Show the rows ordered by ID.
res[order(res$ID), ]
# ID ForestType date meanNDVI
# 1 1 oak 1987 0.2875
# 3 1 oak 1988 0.3575
# 5 1 oak 1989 0.5150
# 2 2 pine 1987 0.3575
# 4 2 pine 1988 0.6425
# 6 2 pine 1989 0.7725
修剪版
针对 25% 的离群值进行了修剪。
# Trimmed mean NDVI per ID / ForestType / year: `trim = .25` is forwarded by
# aggregate() to mean().
res2 <- aggregate(
  list(meanNDVI = df$NDVI),
  by = list(
    ID = df$ID,
    ForestType = df$ForestType,
    date = substr(df$Date, 1, 4)
  ),
  FUN = mean,
  trim = .25
)
# Show the rows ordered by ID.
res2[order(res2$ID), ]
# ID ForestType date meanNDVI
# 1 1 oak 1987 0.250
# 3 1 oak 1988 0.325
# 5 1 oak 1989 0.475
# 2 2 pine 1987 0.350
# 4 2 pine 1988 0.635
# 6 2 pine 1989 0.740
使用data.table
包,您可以进行如下操作:
library(data.table)
# Convert df to a data.table in place, parse the Date strings, then print it.
setDT(df)
df[, Date := as.Date(Date, format = "%Y.%m.%d")]
df[]
# Trimmed mean NDVI per ID / ForestType / year.
# NOTE(review): base::mean is called explicitly, presumably so that data.table
# does not substitute its own optimized mean and `trim` is honored -- confirm.
df[,
   list(meanNDVI = base::mean(NDVI, trim = 0.25)),
   by = list(ID, ForestType, year = year(Date))]
# ID ForestType year meanNDVI
# 1: 1 oak 1987 0.250
# 2: 1 oak 1988 0.325
# 3: 1 oak 1989 0.475
# 4: 2 pine 1987 0.350
# 5: 2 pine 1988 0.635
# 6: 2 pine 1989 0.740
另一种选择。您可以在 mean
中设置 trim
library(tidyverse)
library(lubridate)
# Trimmed mean NDVI per ID / ForestType / year.
# Date is replaced by its calendar year so it can serve as the grouping key.
# The result column is named meanNDVI to match the desired output, and the
# result is ungrouped so later steps do not inherit the grouping.
df %>%
  mutate(Date = year(ymd(Date))) %>%
  group_by(ID, ForestType, Date) %>%
  summarise(meanNDVI = mean(NDVI, trim = 0.25, na.rm = TRUE)) %>%
  ungroup()
下面是我的数据集的简化示例:
# Sample data: two plots (ID 1 = oak, ID 2 = pine), each observed on the same
# twelve dates (four per year, 1987-1989), with one NDVI reading per date.
df <- data.frame(
  "ID" = rep(c(1, 2), each = 12),
  "ForestType" = rep(c("oak", "pine"), each = 12),
  # Both plots share the same observation dates, so the vector is repeated.
  "Date" = rep(
    c("1987.01.01", "1987.06.01", "1987.10.01", "1987.11.01",
      "1988.01.01", "1988.03.01", "1988.04.01", "1988.06.01",
      "1989.03.01", "1989.05.01", "1989.07.01", "1989.08.01"),
    times = 2
  ),
  "NDVI" = c(
    0.1, 0.2, 0.3, 0.55, 0.31, 0.26, 0.34, 0.52, 0.41, 0.45, 0.50, 0.7,
    0.2, 0.3, 0.4, 0.53, 0.52, 0.54, 0.78, 0.73, 0.72, 0.71, 0.76, 0.9
  ),
  check.names = FALSE,
  stringsAsFactors = FALSE
)
我想获得某个时间段内 NDVI 值的均值,在本例中时间段为年份。考虑到在我的真实数据集中需要按季节来计算,所以方法应该易于调整。
这些均值应该考虑:
- 修剪异常值:例如去掉最高的 25% 和最低的 25% 的值。
- 它们应该按类别分别计算,在本例中类别是 ID 字段。
所以输出应该类似于:
> desired_df
ID ForestType Date meanNDVI
1 1 oak 1987 0.250
2 1 oak 1988 0.325
3 1 oak 1989 0.430
4 2 pine 1987 0.350
5 2 pine 1988 0.635
6 2 pine 1989 0.740
在这种情况下,例如,0.250 对应于 ID=1 在 1987 年的 NDVI 平均值,它是去掉那一年 4 个值中最低值和最高值之后的平均值。
非常感谢!
library(tidyverse)
library(lubridate)
# Trimmed mean NDVI per ID / ForestType / calendar year.
# mean(trim = 0.25) drops the lowest and highest 25% of each group before
# averaging. Unlike a strict `>` / `<` filter against the group quartiles, it
# never removes every row of a small group (e.g. one with only two
# observations) or of a group with values tied at a quartile, so no group
# silently disappears from the result.
df %>%
  mutate(Date = as.Date(Date, format = "%Y.%m.%d")) %>%
  group_by(ID, ForestType, Year = year(Date)) %>%
  summarise(meanNDVI = mean(NDVI, trim = 0.25))
输出
# A tibble: 6 x 4
# Groups: ID, ForestType [2]
ID ForestType Year meanNDVI
<dbl> <chr> <dbl> <dbl>
1 1 oak 1987 0.25
2 1 oak 1988 0.325
3 1 oak 1989 0.475
4 2 pine 1987 0.35
5 2 pine 1988 0.635
6 2 pine 1989 0.74
使用 aggregate 的经典基础 R 方法。可以使用 substr 获取年份。
# Plain (untrimmed) mean NDVI per ID / ForestType / year.
# The year is taken as the first four characters of the Date string.
res <- aggregate(
  list(meanNDVI = df$NDVI),
  by = list(
    ID = df$ID,
    ForestType = df$ForestType,
    date = substr(df$Date, 1, 4)
  ),
  FUN = mean
)
# Show the rows ordered by ID.
res[order(res$ID), ]
# ID ForestType date meanNDVI
# 1 1 oak 1987 0.2875
# 3 1 oak 1988 0.3575
# 5 1 oak 1989 0.5150
# 2 2 pine 1987 0.3575
# 4 2 pine 1988 0.6425
# 6 2 pine 1989 0.7725
修剪版
针对 25% 的离群值进行了修剪。
# Trimmed mean NDVI per ID / ForestType / year: `trim = .25` is forwarded by
# aggregate() to mean().
res2 <- aggregate(
  list(meanNDVI = df$NDVI),
  by = list(
    ID = df$ID,
    ForestType = df$ForestType,
    date = substr(df$Date, 1, 4)
  ),
  FUN = mean,
  trim = .25
)
# Show the rows ordered by ID.
res2[order(res2$ID), ]
# ID ForestType date meanNDVI
# 1 1 oak 1987 0.250
# 3 1 oak 1988 0.325
# 5 1 oak 1989 0.475
# 2 2 pine 1987 0.350
# 4 2 pine 1988 0.635
# 6 2 pine 1989 0.740
使用data.table
包,您可以进行如下操作:
library(data.table)
# Convert df to a data.table in place, parse the Date strings, then print it.
setDT(df)
df[, Date := as.Date(Date, format = "%Y.%m.%d")]
df[]
# Trimmed mean NDVI per ID / ForestType / year.
# NOTE(review): base::mean is called explicitly, presumably so that data.table
# does not substitute its own optimized mean and `trim` is honored -- confirm.
df[,
   list(meanNDVI = base::mean(NDVI, trim = 0.25)),
   by = list(ID, ForestType, year = year(Date))]
# ID ForestType year meanNDVI
# 1: 1 oak 1987 0.250
# 2: 1 oak 1988 0.325
# 3: 1 oak 1989 0.475
# 4: 2 pine 1987 0.350
# 5: 2 pine 1988 0.635
# 6: 2 pine 1989 0.740
另一种选择。您可以在 mean 中设置 trim。
library(tidyverse)
library(lubridate)
# Trimmed mean NDVI per ID / ForestType / year.
# Date is replaced by its calendar year so it can serve as the grouping key.
# The result column is named meanNDVI to match the desired output, and the
# result is ungrouped so later steps do not inherit the grouping.
df %>%
  mutate(Date = year(ymd(Date))) %>%
  group_by(ID, ForestType, Date) %>%
  summarise(meanNDVI = mean(NDVI, trim = 0.25, na.rm = TRUE)) %>%
  ungroup()