优化 R 中大 df 的距离 distHaversine 模型
Optimizing distance distHaversine model for large df in R
我正在处理一个大型数据集，尝试在一台只有 8GB 内存的本地计算机上运行地理空间分析。看起来我已经超出了机器的资源上限，我想知道能否优化我的模型，使它可以在我的机器上运行。
# Areas of interest: centre coordinates plus a search radius in metres.
area <- data.frame(
  area   = c("Baker Street", "Bank"),
  lat    = c(51.522236, 51.5134047),
  lng    = c(-0.157080, -0.08905843),
  radius = c(100, 2000)
)

# Tube stations: coordinates plus the postcode district of each station.
stop <- data.frame(
  station  = c("Angel", "Barbican", "Barons Court", "Bayswater"),
  lat      = c(51.53253, 51.520865, 51.490281, 51.51224),
  lng      = c(-0.10579, -0.097758, -0.214340, -0.187569),
  postcode = c("EC1V", "EC1A", "W14", "W2")
)
library(geosphere)
library(dplyr)

# For each area, compute the Haversine distance (metres) from every
# station to the area's centre, flag whether the station lies inside
# the area's radius, then keep only the nearest area per station.
datNew <- lapply(seq_len(nrow(area)), function(i) {
  df <- stop
  # geosphere expects coordinate matrices in (lng, lat) order.
  df$dist <- distHaversine(df[, c("lng", "lat")],
                           area[rep(i, nrow(df)), c("lng", "lat")])
  df$in_circle <- ifelse(df$dist <= area[i, "radius"], "Yes", "No")
  df$circle_id <- area[i, "area"]
  df
})
datNew <- do.call(rbind, datNew)

# One row per station: the area with the smallest distance.
datNew <- datNew %>%
  group_by(station) %>%
  slice(which.min(dist))
是否可以先计算距离，然后按 station 取每个 station 的最小距离，这样我就不用生成 stations 数量乘以 area 数量的中间结果？或者是否有其他资源消耗更少的方案，或者把作业拆分成若干部分以便装进内存？
您是否尝试过把 gc() 放在 lapply 函数的末尾？它会为下一次迭代释放内存空间。如果这没有帮助，请回复，我明天会回来更新这个答案 :)
编辑:
我不知道你是否想到了这一点,但现在开始:
library(geosphere)
library(plyr)
library(magrittr)

# Areas of interest: centre coordinates plus a search radius in metres.
area <- data.frame(
  area   = c("Baker Street", "Bank"),
  lat    = c(51.522236, 51.5134047),
  lng    = c(-0.157080, -0.08905843),
  radius = c(100, 2000)
)

# Tube stations: coordinates plus the postcode district of each station.
stop <- data.frame(
  station  = c("Angel", "Barbican", "Barons Court", "Bayswater"),
  lat      = c(51.53253, 51.520865, 51.490281, 51.51224),
  lng      = c(-0.10579, -0.097758, -0.214340, -0.187569),
  postcode = c("EC1V", "EC1A", "W14", "W2")
)

## Take one area at a time and keep the station that lies at the
## minimal distance from that area.
min.dist <- ddply(area, ~area, function(xframe) {
  cat("Calculating minimum distance from area...",
      as.character(xframe$area), "\n")
  # BUG FIX: geosphere expects coordinates in (lng, lat) order; the
  # original passed c("lat", "lng") and computed wrong distances.
  dists <- distHaversine(xframe[, c("lng", "lat")],
                         stop[, c("lng", "lat")])
  # which.min() picks the single nearest station directly.
  stop.min <- stop[which.min(dists), ]
  stop.min$area <- xframe$area
  stop.min
})
min.dist  # the new data frame: nearest station per area
我正在处理一个大型数据集，尝试在一台只有 8GB 内存的本地计算机上运行地理空间分析。看起来我已经超出了机器的资源上限，我想知道能否优化我的模型，使它可以在我的机器上运行。
# Areas of interest: centre coordinates plus a search radius in metres.
area <- data.frame(
  area   = c("Baker Street", "Bank"),
  lat    = c(51.522236, 51.5134047),
  lng    = c(-0.157080, -0.08905843),
  radius = c(100, 2000)
)

# Tube stations: coordinates plus the postcode district of each station.
stop <- data.frame(
  station  = c("Angel", "Barbican", "Barons Court", "Bayswater"),
  lat      = c(51.53253, 51.520865, 51.490281, 51.51224),
  lng      = c(-0.10579, -0.097758, -0.214340, -0.187569),
  postcode = c("EC1V", "EC1A", "W14", "W2")
)
library(geosphere)
library(dplyr)

# For each area, compute the Haversine distance (metres) from every
# station to the area's centre, flag whether the station lies inside
# the area's radius, then keep only the nearest area per station.
datNew <- lapply(seq_len(nrow(area)), function(i) {
  df <- stop
  # geosphere expects coordinate matrices in (lng, lat) order.
  df$dist <- distHaversine(df[, c("lng", "lat")],
                           area[rep(i, nrow(df)), c("lng", "lat")])
  df$in_circle <- ifelse(df$dist <= area[i, "radius"], "Yes", "No")
  df$circle_id <- area[i, "area"]
  df
})
datNew <- do.call(rbind, datNew)

# One row per station: the area with the smallest distance.
datNew <- datNew %>%
  group_by(station) %>%
  slice(which.min(dist))
是否可以先计算距离，然后按 station 取每个 station 的最小距离，这样我就不用生成 stations 数量乘以 area 数量的中间结果？或者是否有其他资源消耗更少的方案，或者把作业拆分成若干部分以便装进内存？
您是否尝试过把 gc() 放在 lapply 函数的末尾？它会为下一次迭代释放内存空间。如果这没有帮助，请回复，我明天会回来更新这个答案 :)
编辑:
我不知道你是否想到了这一点,但现在开始:
library(geosphere)
library(plyr)
library(magrittr)

# Areas of interest: centre coordinates plus a search radius in metres.
area <- data.frame(
  area   = c("Baker Street", "Bank"),
  lat    = c(51.522236, 51.5134047),
  lng    = c(-0.157080, -0.08905843),
  radius = c(100, 2000)
)

# Tube stations: coordinates plus the postcode district of each station.
stop <- data.frame(
  station  = c("Angel", "Barbican", "Barons Court", "Bayswater"),
  lat      = c(51.53253, 51.520865, 51.490281, 51.51224),
  lng      = c(-0.10579, -0.097758, -0.214340, -0.187569),
  postcode = c("EC1V", "EC1A", "W14", "W2")
)

## Take one area at a time and keep the station that lies at the
## minimal distance from that area.
min.dist <- ddply(area, ~area, function(xframe) {
  cat("Calculating minimum distance from area...",
      as.character(xframe$area), "\n")
  # BUG FIX: geosphere expects coordinates in (lng, lat) order; the
  # original passed c("lat", "lng") and computed wrong distances.
  dists <- distHaversine(xframe[, c("lng", "lat")],
                         stop[, c("lng", "lat")])
  # which.min() picks the single nearest station directly.
  stop.min <- stop[which.min(dists), ]
  stop.min$area <- xframe$area
  stop.min
})
min.dist  # the new data frame: nearest station per area