R:分别计算每个因子水平,然后计算 min/mean/max over levels
R: do calculation for each factor level separately, then calculate min/mean/max over levels
所以我确实有一个配水模型的输出,它是一条河流每小时的流入和排放值。我做了5个模型运行s
可重现的例子:
df <- data.frame(rep(seq(
from=as.POSIXct("2012-1-1 0:00", tz="UTC"),
to=as.POSIXct("2012-1-1 23:00", tz="UTC"),
by="hour"
),5),
as.factor(c(rep(1,24),rep(2,24),rep(3,24), rep(4,24),rep(5,24))),
rep(seq(1,300,length.out=24),5),
rep(seq(1,180, length.out=24),5) )
colnames(df)<-c("time", "run", "inflow", "discharge")
当然,在现实中,运行 的值是不同的。 (而且我确实有更多的数据,因为我有 100 运行s 和 35 年的每小时值)。
所以,首先我想计算每个 运行 的缺水系数,这意味着我需要计算类似(1 -(6 小时前的排放/流入))的东西,因为水需要 6 小时才能 运行 通过集水区。
scarcityfactor <- 1 - (discharge / lag(inflow,6))
然后我想计算所有 运行s 的稀缺性因素的平均值、最大值和最小值(以找出每个时间步可能发生的稀缺性的最高、最低和平均值; 根据不同的模型 运行s)。所以我想说,我可以计算每个时间步长的平均值、最大值和最小值:
f1 <- function(x) c(Mean = (mean(x)), Max = (max(x)), Min = (min(x)))
results <- do.call(data.frame, aggregate(scarcityfactor ~ time,
data = df,
FUN = f1))
有人可以帮我写代码吗?
library(tidyverse)
df %>%
group_by(run) %>%
mutate(scarcityfactor = 1 - discharge / lag(inflow,6)) %>%
group_by(time) %>%
summarise(Mean = mean(scarcityfactor),
Max = max(scarcityfactor),
Min = min(scarcityfactor))
# # A tibble: 24 x 4
# time Mean Max Min
# <dttm> <dbl> <dbl> <dbl>
# 1 2012-01-01 00:00:00 NA NA NA
# 2 2012-01-01 01:00:00 NA NA NA
# 3 2012-01-01 02:00:00 NA NA NA
# 4 2012-01-01 03:00:00 NA NA NA
# 5 2012-01-01 04:00:00 NA NA NA
# 6 2012-01-01 05:00:00 NA NA NA
# 7 2012-01-01 06:00:00 -46.7 -46.7 -46.7
# 8 2012-01-01 07:00:00 -2.96 -2.96 -2.96
# 9 2012-01-01 08:00:00 -1.34 -1.34 -1.34
#10 2012-01-01 09:00:00 -0.776 -0.776 -0.776
# # ... with 14 more rows
如果我正确理解问题描述,我相信这就是你想要的。
我将使用 data.table
:
library(data.table)
setDT(df)
# add scarcity_factor (group by run)
df[ , scarcity_factor := 1 - discharge/shift(inflow, 6L), by = run]
# group by time, excluding times for which the
# scarcity factor is missing
df[!is.na(scarcity_factor), by = time,
.(min_scarcity = min(scarcity_factor),
mean_scarcity = mean(scarcity_factor),
max_scarcity = max(scarcity_factor))]
# time min_scarcity mean_scarcity max_scarcity
# 1: 2012-01-01 06:00:00 -46.695652174 -46.695652174 -46.695652174
# 2: 2012-01-01 07:00:00 -2.962732919 -2.962732919 -2.962732919
# 3: 2012-01-01 08:00:00 -1.342995169 -1.342995169 -1.342995169
# 4: 2012-01-01 09:00:00 -0.776086957 -0.776086957 -0.776086957
# 5: 2012-01-01 10:00:00 -0.487284660 -0.487284660 -0.487284660
# 6: 2012-01-01 11:00:00 -0.312252964 -0.312252964 -0.312252964
# 7: 2012-01-01 12:00:00 -0.194826637 -0.194826637 -0.194826637
# 8: 2012-01-01 13:00:00 -0.110586011 -0.110586011 -0.110586011
# 9: 2012-01-01 14:00:00 -0.047204969 -0.047204969 -0.047204969
# 10: 2012-01-01 15:00:00 0.002210759 0.002210759 0.002210759
# 11: 2012-01-01 16:00:00 0.041818785 0.041818785 0.041818785
# 12: 2012-01-01 17:00:00 0.074275362 0.074275362 0.074275362
# 13: 2012-01-01 18:00:00 0.101356965 0.101356965 0.101356965
# 14: 2012-01-01 19:00:00 0.124296675 0.124296675 0.124296675
# 15: 2012-01-01 20:00:00 0.143977192 0.143977192 0.143977192
# 16: 2012-01-01 21:00:00 0.161047028 0.161047028 0.161047028
# 17: 2012-01-01 22:00:00 0.175993343 0.175993343 0.175993343
# 18: 2012-01-01 23:00:00 0.189189189 0.189189189 0.189189189
您可以通过 lapply
遍历不同的聚合器来更加简洁:
df[!is.na(scarcity_factor), by = time,
lapply(list(min, mean, max), function(f) f(scarcity_factor))]
最后,您可以将其视为聚合重塑并使用 dcast
:
dcast(df, time ~ ., value.var = 'scarcity_factor',
fun.aggregate = list(min, mean, max))
(如果要排除无意义的行,请在 dcast
的第一个参数中使用 df[!is.na(scarcity_factor)]
)
所以我确实有一个配水模型的输出,它是一条河流每小时的流入和排放值。我做了5个模型运行s
可重现的例子:
df <- data.frame(rep(seq(
from=as.POSIXct("2012-1-1 0:00", tz="UTC"),
to=as.POSIXct("2012-1-1 23:00", tz="UTC"),
by="hour"
),5),
as.factor(c(rep(1,24),rep(2,24),rep(3,24), rep(4,24),rep(5,24))),
rep(seq(1,300,length.out=24),5),
rep(seq(1,180, length.out=24),5) )
colnames(df)<-c("time", "run", "inflow", "discharge")
当然,在现实中,运行 的值是不同的。 (而且我确实有更多的数据,因为我有 100 运行s 和 35 年的每小时值)。
所以,首先我想计算每个 运行 的缺水系数,这意味着我需要计算类似(1 -(6 小时前的排放/流入))的东西,因为水需要 6 小时才能 运行 通过集水区。
scarcityfactor <- 1 - (discharge / lag(inflow,6))
然后我想计算所有 运行s 的稀缺性因素的平均值、最大值和最小值(以找出每个时间步可能发生的稀缺性的最高、最低和平均值; 根据不同的模型 运行s)。所以我想说,我可以计算每个时间步长的平均值、最大值和最小值:
f1 <- function(x) c(Mean = (mean(x)), Max = (max(x)), Min = (min(x)))
results <- do.call(data.frame, aggregate(scarcityfactor ~ time,
data = df,
FUN = f1))
有人可以帮我写代码吗?
library(tidyverse)
df %>%
group_by(run) %>%
mutate(scarcityfactor = 1 - discharge / lag(inflow,6)) %>%
group_by(time) %>%
summarise(Mean = mean(scarcityfactor),
Max = max(scarcityfactor),
Min = min(scarcityfactor))
# # A tibble: 24 x 4
# time Mean Max Min
# <dttm> <dbl> <dbl> <dbl>
# 1 2012-01-01 00:00:00 NA NA NA
# 2 2012-01-01 01:00:00 NA NA NA
# 3 2012-01-01 02:00:00 NA NA NA
# 4 2012-01-01 03:00:00 NA NA NA
# 5 2012-01-01 04:00:00 NA NA NA
# 6 2012-01-01 05:00:00 NA NA NA
# 7 2012-01-01 06:00:00 -46.7 -46.7 -46.7
# 8 2012-01-01 07:00:00 -2.96 -2.96 -2.96
# 9 2012-01-01 08:00:00 -1.34 -1.34 -1.34
#10 2012-01-01 09:00:00 -0.776 -0.776 -0.776
# # ... with 14 more rows
如果我正确理解问题描述,我相信这就是你想要的。
我将使用 data.table
:
library(data.table)
setDT(df)
# add scarcity_factor (group by run)
df[ , scarcity_factor := 1 - discharge/shift(inflow, 6L), by = run]
# group by time, excluding times for which the
# scarcity factor is missing
df[!is.na(scarcity_factor), by = time,
.(min_scarcity = min(scarcity_factor),
mean_scarcity = mean(scarcity_factor),
max_scarcity = max(scarcity_factor))]
# time min_scarcity mean_scarcity max_scarcity
# 1: 2012-01-01 06:00:00 -46.695652174 -46.695652174 -46.695652174
# 2: 2012-01-01 07:00:00 -2.962732919 -2.962732919 -2.962732919
# 3: 2012-01-01 08:00:00 -1.342995169 -1.342995169 -1.342995169
# 4: 2012-01-01 09:00:00 -0.776086957 -0.776086957 -0.776086957
# 5: 2012-01-01 10:00:00 -0.487284660 -0.487284660 -0.487284660
# 6: 2012-01-01 11:00:00 -0.312252964 -0.312252964 -0.312252964
# 7: 2012-01-01 12:00:00 -0.194826637 -0.194826637 -0.194826637
# 8: 2012-01-01 13:00:00 -0.110586011 -0.110586011 -0.110586011
# 9: 2012-01-01 14:00:00 -0.047204969 -0.047204969 -0.047204969
# 10: 2012-01-01 15:00:00 0.002210759 0.002210759 0.002210759
# 11: 2012-01-01 16:00:00 0.041818785 0.041818785 0.041818785
# 12: 2012-01-01 17:00:00 0.074275362 0.074275362 0.074275362
# 13: 2012-01-01 18:00:00 0.101356965 0.101356965 0.101356965
# 14: 2012-01-01 19:00:00 0.124296675 0.124296675 0.124296675
# 15: 2012-01-01 20:00:00 0.143977192 0.143977192 0.143977192
# 16: 2012-01-01 21:00:00 0.161047028 0.161047028 0.161047028
# 17: 2012-01-01 22:00:00 0.175993343 0.175993343 0.175993343
# 18: 2012-01-01 23:00:00 0.189189189 0.189189189 0.189189189
您可以通过 lapply
遍历不同的聚合器来更加简洁:
df[!is.na(scarcity_factor), by = time,
lapply(list(min, mean, max), function(f) f(scarcity_factor))]
最后,您可以将其视为聚合重塑并使用 dcast
:
dcast(df, time ~ ., value.var = 'scarcity_factor',
fun.aggregate = list(min, mean, max))
(如果要排除无意义的行,请在 dcast
的第一个参数中使用 df[!is.na(scarcity_factor)]
)