改进低效的嵌套 for 循环跨条件均值
Improve inefficient nested for loop across conditional mean
我有一个数据框datav2
,结构如下:
project_id hour cap_factor load_area fuel capacity
17521 1000097 17902 0.753329 CA_PGE_S Solar 21.54
17522 1000097 17901 0.847296 CA_PGE_S Solar 21.54
17523 1001197 17924 0.586530 CA_PGE_S Solar 9.88
17524 2200097 25374 0.000000 CA_PGE_S Solar 44.54
17525 1077597 25414 0.635047 CA_PGE_S Wind 11.33
17526 1000097 19770 -0.39957 CA_PGE_S Solar 21.54
数据框共有 2100 万行。我想遍历每一行:当某一行的 df$cap_factor < 0 时,
将该行的 df$cap_factor 替换为具有相同 load_area、hour 和 fuel、
且 cap_factor 为正值的所有其他行的 cap_factor 的平均值。
到目前为止,我一直在使用以下内容:
# Row-by-row baseline: fill `cap_factor2` with `cap_factor`, except that a
# negative value is replaced by the mean of the non-negative `cap_factor`
# values sharing the same hour, fuel and load_area.
# The loop shape is kept on purpose; each iteration rescans every row.
datav2$cap_factor2 <- NA_real_
for (i in seq_len(nrow(datav2))) {
  if (datav2$cap_factor[i] < 0) {
    # Fix: the third condition used to compare `fuel` against column 4
    # (`load_area`), so the load_area filter was never actually applied.
    same_group <- datav2$hour == datav2$hour[i] &
      datav2$fuel == datav2$fuel[i] &
      datav2$load_area == datav2$load_area[i] &
      datav2$cap_factor >= 0
    # mean() of an empty selection is NaN when the group has no
    # non-negative value.
    datav2$cap_factor2[i] <- mean(datav2$cap_factor[same_group])
  } else {
    datav2$cap_factor2[i] <- datav2$cap_factor[i]
  }
}
在如此大的数据集上循环时,这非常慢。对提高此过程效率的技术有何建议?
先按组求均值,再将结果连接(join)回原数据:
library(dplyr)

# Six-row sample of the data (dput() output), bound to `d` so that the
# pipeline below has something to run on.
d <- structure(list(project_id = c(1000097L, 1000097L, 1001197L, 2200097L, 1077597L, 1000097L),
hour = c(17902L, 17901L, 17924L, 25374L, 25414L, 19770L),
cap_factor = c(0.753329, 0.847296, 0.58653, 0, 0.635047, -0.39957),
load_area = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = "CA_PGE_S", class = "factor"),
fuel = structure(c(1L, 1L, 1L, 1L, 2L, 1L), .Label = c("Solar", "Wind"), class = "factor"),
capacity = c(21.54, 21.54, 9.88, 44.54, 11.33, 21.54)),
.Names = c("project_id", "hour", "cap_factor", "load_area", "fuel", "capacity"),
class = "data.frame",
row.names = c("17521", "17522", "17523", "17524", "17525", "17526"))

# Step 1: one mean per (load_area, hour, fuel), from strictly positive values.
# NOTE(review): the question's loop uses `>= 0`; switch the filter if zeros
# should count towards the mean.
group_means <- d %>%
  filter(cap_factor > 0) %>%
  group_by(load_area, hour, fuel) %>%
  summarise(mcap_factor = mean(cap_factor), .groups = "drop")

# Step 2: attach the group mean to every row, then overwrite only the
# negative values. A negative row whose group has no positive value gets NA,
# because the join finds no mean for it.
d %>%
  left_join(group_means, by = c("load_area", "hour", "fuel")) %>%
  mutate(cap_factor = if_else(cap_factor < 0, mcap_factor, cap_factor)) %>%
  select(-mcap_factor)
这是 data.table
的解决方案:
library("data.table")

# Convert to a data.table by reference (no copy of the data is made).
setDT(datav2)

# Replace each negative entry of `x` by the mean of the non-negative entries
# of `x`; other entries are returned unchanged. When `x` has no non-negative
# entry the replacement value is NaN.
foo <- function(x) {
  non_negative_mean <- mean(x[x >= 0])
  ifelse(x < 0, non_negative_mean, x)
}

# Apply the replacement within each (load_area, hour, fuel) group, updating
# `cap_factor` in place.
datav2[, cap_factor := foo(cap_factor), by = .(load_area, hour, fuel)]
测试代码:
library("data.table")

# Self-contained reproduction: read the six sample rows from an inline string
# and drop the leading row-number column.
datav2 <- fread(
  input = 'rownr project_id hour cap_factor load_area fuel capacity
17521 1000097 17902 0.753329 CA_PGE_S Solar 21.54
17522 1000097 17901 0.847296 CA_PGE_S Solar 21.54
17523 1001197 17924 0.586530 CA_PGE_S Solar 9.88
17524 2200097 25374 0.000000 CA_PGE_S Solar 44.54
17525 1077597 25414 0.635047 CA_PGE_S Wind 11.33
17526 1000097 19770 -0.39957 CA_PGE_S Solar 21.54',
  header = TRUE,
  drop = "rownr"
)

# Replace each negative entry of `x` by the mean of the non-negative entries
# of `x`; other entries are returned unchanged.
foo <- function(x) {
  non_negative_mean <- mean(x[x >= 0])
  ifelse(x < 0, non_negative_mean, x)
}

# Update `cap_factor` in place, one (load_area, hour, fuel) group at a time.
datav2[, cap_factor := foo(cap_factor), by = .(load_area, hour, fuel)]
datav2
# project_id hour cap_factor load_area fuel capacity
# 1: 1000097 17902 0.753329 CA_PGE_S Solar 21.54
# 2: 1000097 17901 0.847296 CA_PGE_S Solar 21.54
# 3: 1001197 17924 0.586530 CA_PGE_S Solar 9.88
# 4: 2200097 25374 0.000000 CA_PGE_S Solar 44.54
# 5: 1077597 25414 0.635047 CA_PGE_S Wind 11.33
# 6: 1000097 19770 NaN CA_PGE_S Solar 21.54
这表明:对于不含任何 cap_factor>=0 的行的组,您将得到 NaN。
如果你想测试这样一个组的存在你可以这样做:
# Per (load_area, hour, fuel) group, count the non-negative rows (V1) and the
# negative rows (V2), then keep only the groups that have negative rows but
# no non-negative row, i.e. the groups whose replacement mean would be NaN.
datav2[
  ,
  .(sum(cap_factor >= 0), sum(cap_factor < 0)),
  by = .(load_area, hour, fuel)
][V1 == 0 & V2 > 0]
(请在修改 cap_factor 之前运行上述检查。)
我有一个数据框datav2
,结构如下:
project_id hour cap_factor load_area fuel capacity
17521 1000097 17902 0.753329 CA_PGE_S Solar 21.54
17522 1000097 17901 0.847296 CA_PGE_S Solar 21.54
17523 1001197 17924 0.586530 CA_PGE_S Solar 9.88
17524 2200097 25374 0.000000 CA_PGE_S Solar 44.54
17525 1077597 25414 0.635047 CA_PGE_S Wind 11.33
17526 1000097 19770 -0.39957 CA_PGE_S Solar 21.54
数据框共有 2100 万行。我想遍历每一行:当某一行的 df$cap_factor < 0 时,
将该行的 df$cap_factor 替换为具有相同 load_area、hour 和 fuel、
且 cap_factor 为正值的所有其他行的 cap_factor 的平均值。
到目前为止,我一直在使用以下内容:
# Row-by-row baseline: fill `cap_factor2` with `cap_factor`, except that a
# negative value is replaced by the mean of the non-negative `cap_factor`
# values sharing the same hour, fuel and load_area.
# The loop shape is kept on purpose; each iteration rescans every row.
datav2$cap_factor2 <- NA_real_
for (i in seq_len(nrow(datav2))) {
  if (datav2$cap_factor[i] < 0) {
    # Fix: the third condition used to compare `fuel` against column 4
    # (`load_area`), so the load_area filter was never actually applied.
    same_group <- datav2$hour == datav2$hour[i] &
      datav2$fuel == datav2$fuel[i] &
      datav2$load_area == datav2$load_area[i] &
      datav2$cap_factor >= 0
    # mean() of an empty selection is NaN when the group has no
    # non-negative value.
    datav2$cap_factor2[i] <- mean(datav2$cap_factor[same_group])
  } else {
    datav2$cap_factor2[i] <- datav2$cap_factor[i]
  }
}
在如此大的数据集上循环时,这非常慢。对提高此过程效率的技术有何建议?
先按组求均值,再将结果连接(join)回原数据:
library(dplyr)

# Six-row sample of the data (dput() output), bound to `d` so that the
# pipeline below has something to run on.
d <- structure(list(project_id = c(1000097L, 1000097L, 1001197L, 2200097L, 1077597L, 1000097L),
hour = c(17902L, 17901L, 17924L, 25374L, 25414L, 19770L),
cap_factor = c(0.753329, 0.847296, 0.58653, 0, 0.635047, -0.39957),
load_area = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = "CA_PGE_S", class = "factor"),
fuel = structure(c(1L, 1L, 1L, 1L, 2L, 1L), .Label = c("Solar", "Wind"), class = "factor"),
capacity = c(21.54, 21.54, 9.88, 44.54, 11.33, 21.54)),
.Names = c("project_id", "hour", "cap_factor", "load_area", "fuel", "capacity"),
class = "data.frame",
row.names = c("17521", "17522", "17523", "17524", "17525", "17526"))

# Step 1: one mean per (load_area, hour, fuel), from strictly positive values.
# NOTE(review): the question's loop uses `>= 0`; switch the filter if zeros
# should count towards the mean.
group_means <- d %>%
  filter(cap_factor > 0) %>%
  group_by(load_area, hour, fuel) %>%
  summarise(mcap_factor = mean(cap_factor), .groups = "drop")

# Step 2: attach the group mean to every row, then overwrite only the
# negative values. A negative row whose group has no positive value gets NA,
# because the join finds no mean for it.
d %>%
  left_join(group_means, by = c("load_area", "hour", "fuel")) %>%
  mutate(cap_factor = if_else(cap_factor < 0, mcap_factor, cap_factor)) %>%
  select(-mcap_factor)
这是 data.table
的解决方案:
library("data.table")

# Convert to a data.table by reference (no copy of the data is made).
setDT(datav2)

# Replace each negative entry of `x` by the mean of the non-negative entries
# of `x`; other entries are returned unchanged. When `x` has no non-negative
# entry the replacement value is NaN.
foo <- function(x) {
  non_negative_mean <- mean(x[x >= 0])
  ifelse(x < 0, non_negative_mean, x)
}

# Apply the replacement within each (load_area, hour, fuel) group, updating
# `cap_factor` in place.
datav2[, cap_factor := foo(cap_factor), by = .(load_area, hour, fuel)]
测试代码:
library("data.table")

# Self-contained reproduction: read the six sample rows from an inline string
# and drop the leading row-number column.
datav2 <- fread(
  input = 'rownr project_id hour cap_factor load_area fuel capacity
17521 1000097 17902 0.753329 CA_PGE_S Solar 21.54
17522 1000097 17901 0.847296 CA_PGE_S Solar 21.54
17523 1001197 17924 0.586530 CA_PGE_S Solar 9.88
17524 2200097 25374 0.000000 CA_PGE_S Solar 44.54
17525 1077597 25414 0.635047 CA_PGE_S Wind 11.33
17526 1000097 19770 -0.39957 CA_PGE_S Solar 21.54',
  header = TRUE,
  drop = "rownr"
)

# Replace each negative entry of `x` by the mean of the non-negative entries
# of `x`; other entries are returned unchanged.
foo <- function(x) {
  non_negative_mean <- mean(x[x >= 0])
  ifelse(x < 0, non_negative_mean, x)
}

# Update `cap_factor` in place, one (load_area, hour, fuel) group at a time.
datav2[, cap_factor := foo(cap_factor), by = .(load_area, hour, fuel)]
datav2
# project_id hour cap_factor load_area fuel capacity
# 1: 1000097 17902 0.753329 CA_PGE_S Solar 21.54
# 2: 1000097 17901 0.847296 CA_PGE_S Solar 21.54
# 3: 1001197 17924 0.586530 CA_PGE_S Solar 9.88
# 4: 2200097 25374 0.000000 CA_PGE_S Solar 44.54
# 5: 1077597 25414 0.635047 CA_PGE_S Wind 11.33
# 6: 1000097 19770 NaN CA_PGE_S Solar 21.54
这表明:对于不含任何 cap_factor>=0 的行的组,您将得到 NaN。
如果你想测试这样一个组的存在你可以这样做:
# Per (load_area, hour, fuel) group, count the non-negative rows (V1) and the
# negative rows (V2), then keep only the groups that have negative rows but
# no non-negative row, i.e. the groups whose replacement mean would be NaN.
datav2[
  ,
  .(sum(cap_factor >= 0), sum(cap_factor < 0)),
  by = .(load_area, hour, fuel)
][V1 == 0 & V2 > 0]
(请在修改 cap_factor 之前运行上述检查。)