Optimizing code with a for loop and filter
I have simplified a huge dataset down to a small example for this question. I am trying to apply a function to each row of the data frame, as a function of a particular column. I tried a for-loop approach and then did some profiling with Rprof and profvis. I know I could try an apply-family function or some other approach, but the profiling seems to show that the slowest parts are due to other steps.
Here is what I am trying to do:
library(dplyr)
# Example data frame
id <- rep(c(1:100), each = 5)
ab <- runif(length(id), 0, 1)
char1 <- runif(length(id), 0, 1)
char2 <- runif(length(id), 0, 1)
dat <- data.frame(cbind(id, ab, char1, char2))
dat$result <- NA
# Loop
com <- unique(id)
for (k in com) {
  dat_k <- filter(dat, id == k) # slowest line
  dat_k_dist <- cluster::daisy(dat_k[, c("char1", "char2")], metric = "gower") %>% as.matrix()
  num <- apply(dat_k_dist, 2, function(x) sum(x * dat_k[, "ab"]))
  denom <- sum(dat_k[, "ab"]) - dat_k[, "ab"]
  dat_k[, "result"] <- as.numeric(num / denom)
  dat[which(dat$id == k), "result"] <- dat_k$result # 2nd slowest line
}
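For reference, the profiling mentioned above can be reproduced by wrapping the loop; a minimal sketch using profvis (Rprof() plus summaryRprof() is the text-based equivalent):
library(profvis)

# Wrap the loop in profvis() to get a line-level flame graph of where
# the time goes; the filter and reassignment lines show up there.
profvis({
  for (k in com) {
    dat_k <- filter(dat, id == k)
    dat_k_dist <- cluster::daisy(dat_k[, c("char1", "char2")], metric = "gower") %>% as.matrix()
    num <- apply(dat_k_dist, 2, function(x) sum(x * dat_k[, "ab"]))
    denom <- sum(dat_k[, "ab"]) - dat_k[, "ab"]
    dat_k[, "result"] <- as.numeric(num / denom)
    dat[which(dat$id == k), "result"] <- dat_k$result
  }
})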
The slowest parts of my code are the line with filter and then the line where I assign the results back into the original data frame. I tried replacing filter with subset or which, but that was even slower.
So the structure of this code ought to be improved, but I don't really see how.
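To make that concrete, the subsetting step can be timed in isolation; a small sketch (not from the original post, and the exact timings will vary by machine):
library(microbenchmark)

# Compare only the per-group subsetting, for a single id value.
microbenchmark(
  filter = dplyr::filter(dat, id == 1),
  subset = subset(dat, id == 1),
  which  = dat[which(dat$id == 1), ],
  direct = dat[dat$id == 1, ]
)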
The for loop below is a bit faster. There is no need for dplyr or for the which statements.
for (k in com) {
  dat_k <- dat[id == k, ] # no need for filter
  dat_k_dist <- cluster::daisy(dat_k[, c("char1", "char2")], metric = "gower") %>% as.matrix()
  num <- apply(dat_k_dist, 2, function(x) sum(x * dat_k[, "ab"]))
  denom <- sum(dat_k[, "ab"]) - dat_k[, "ab"]
  dat_k[, "result"] <- as.numeric(num / denom)
  dat[id == k, "result"] <- dat_k$result # no need for which here either
}
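One caveat: dat[id == k, ] indexes with the standalone id vector created in the example setup rather than with the column dat$id; the two just happen to be identical here. A self-contained sketch of the same idea (not part of the original answer) that also precomputes the row indices once:
# Same approach, but indexing via dat$id, with the per-id row indices
# computed a single time outside the loop.
idx_by_id <- split(seq_len(nrow(dat)), dat$id)

for (k in com) {
  rows <- idx_by_id[[as.character(k)]]
  dat_k <- dat[rows, ]
  dat_k_dist <- as.matrix(cluster::daisy(dat_k[, c("char1", "char2")], metric = "gower"))
  num <- apply(dat_k_dist, 2, function(x) sum(x * dat_k$ab))
  denom <- sum(dat_k$ab) - dat_k$ab
  dat$result[rows] <- as.numeric(num / denom)
}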
I got a slight further speedup with lapply:
library(microbenchmark)
microbenchmark(
  OP = for (k in com) {
    dat_k <- filter(dat, id == k) # slowest line
    dat_k_dist <- cluster::daisy(dat_k[, c("char1", "char2")], metric = "gower") %>% as.matrix()
    num <- apply(dat_k_dist, 2, function(x) sum(x * dat_k[, "ab"]))
    denom <- sum(dat_k[, "ab"]) - dat_k[, "ab"]
    dat_k[, "result"] <- as.numeric(num / denom)
    dat[which(dat$id == k), "result"] <- dat_k$result # 2nd slowest line
  },
  phiver = for (k in com) {
    dat_k <- dat[id == k, ] # no need for filter
    dat_k_dist <- cluster::daisy(dat_k[, c("char1", "char2")], metric = "gower") %>% as.matrix()
    num <- apply(dat_k_dist, 2, function(x) sum(x * dat_k[, "ab"]))
    denom <- sum(dat_k[, "ab"]) - dat_k[, "ab"]
    dat_k[, "result"] <- as.numeric(num / denom)
    dat[id == k, "result"] <- dat_k$result # no need for which either
  },
  alex = {
    dat2 <- split(dat, factor(dat$id))
    dat2 <- lapply(dat2, function(l) {
      dat_k_dist <- cluster::daisy(l[, c("char1", "char2")], metric = "gower") %>% as.matrix()
      num <- apply(dat_k_dist, 2, function(x) sum(x * l[, "ab"]))
      denom <- sum(l[, "ab"]) - l[, "ab"]
      l[, "result"] <- as.numeric(num / denom)
      return(l)
    })
    dat$result <- Reduce("c", lapply(dat2, function(l) l$result))
  }
)
Unit: milliseconds
   expr       min        lq      mean    median        uq       max neval cld
     OP 126.72184 129.94344 133.47666 132.11949 134.14558 196.44860   100   c
 phiver  73.78996  77.13434  79.61202  78.21638  79.81958 139.15854   100  b
   alex  67.86450  71.61277  73.26273  72.34813  73.50353  90.31229   100 a
But this is also an embarrassingly parallel problem, so we can parallelize it. Note: because of the parallelization overhead this will not be faster on the example data, but it should be faster on the actual "huge dataset".
library(parallel)

cl <- makeCluster(detectCores())
# dat2 is the per-id list produced by split() in the benchmark above
dat$result <- Reduce("c", parLapply(cl, dat2, fun = function(l) {
  dat_k_dist <- as.matrix(cluster::daisy(l[, c("char1", "char2")], metric = "gower"))
  num <- apply(dat_k_dist, 2, function(x) sum(x * l[, "ab"]))
  denom <- sum(l[, "ab"]) - l[, "ab"]
  return(as.numeric(num / denom))
}))
stopCluster(cl)
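On Unix-alikes, parallel::mclapply gives similar fork-based parallelism without the explicit cluster setup; a minimal sketch reusing the dat2 list from the split above (mclapply cannot fork on Windows, where mc.cores must stay 1):
library(parallel)

# Fork-based alternative; dat2 is the per-id list produced by split() above.
res <- mclapply(dat2, function(l) {
  dat_k_dist <- as.matrix(cluster::daisy(l[, c("char1", "char2")], metric = "gower"))
  num <- apply(dat_k_dist, 2, function(x) sum(x * l[, "ab"]))
  as.numeric(num / (sum(l[, "ab"]) - l[, "ab"]))
}, mc.cores = detectCores())

dat$result <- unlist(res, use.names = FALSE)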