R 中光栅处理的内存问题
Memory issue with raster processing in R
我已经编写了一个脚本来自动处理我的栅格数据,但它在完成一半后就失败了。该循环在目录 unzipped
中查找包含 .tif
文件的子目录,并将那些带有一些底图的子目录堆叠在 /base_layers
中。它一直在失败:
Error in matrix(unlist(ini), ncol = 2, byrow = TRUE) : 'data' must be of a vector type, was 'NULL'
和
In writeBin(v, x@file@con, size = x@file@dsize) :
problem writing to connection
和
Error in .local(.Object, ...) :
GDAL Error 3: Free disk space available is 875151360 bytes, whereas 2473962400 are at least necessary. You can disable this check by defining the CHECK_DISK_FREE_SPACE configuration option to FALSE.
我想已修复,查看答案
运行 在具有 Ubuntu、24 GB 内存和 50 GB 存储空间和 4 个 vCPU 的 kubernetes pod 中以及在具有 2 个 vCPU 的 VM 实例 运行 Ubuntu 18.04 中, 8 GB 内存和 50 GB 存储空间。由于内存问题,pods 中途失败,VM 也没有完成。代码通常在到达掩模栅格阶段时失败,但仅在 运行 通过循环的几次迭代后才会失败(因此它在开始时一切正常)。
如果有人能指出此脚本中是否可能出现内存碎片,或者如何清除 OS 内存,我将永远感激不已!
脚本:
library(raster)
library(rgdal)
input = "/mnt/nfs/data/unzipped"
output = "/mnt/nfs/data/training" #path to where the data should go
#get paths to basemaps
DEM = raster("/mnt/nfs/data/base_layers/Clipped_filled_dem.tif")
flow_accum = raster("/mnt/nfs/data/base_layers/accumulation.tif")
slope = raster("/mnt/nfs/data/base_layers/Clipped_slope.tif")
aspect = raster("/mnt/nfs/data/base_layers/Clipped_filled_aspect.tif")
ruggedness = raster("/mnt/nfs/data/base_layers/Clipped_TRI.tif")
#get directories that have data that needs to be processed
directory = list.dirs(path = input, recursive = FALSE)
for(direct in directory) {
subdirect = list.dirs(path = direct,recursive = FALSE)
for(sub in subdirect){
files_for_raster <- list.files(path = sub, pattern = "*.tif$", full.names = TRUE)
rasterstack = stack(files_for_raster)
# name of datapull
name = gsub(paste(direct,"/",sep=""),"",sub)
print(c("working in",name))
# crop DEM to the extent of the satellite image
DEMcrop = crop(DEM,rasterstack) #extent can be a raster
flow_accumcrop = crop(flow_accum,rasterstack)
slopecrop = crop(slope,rasterstack)
aspectcrop = crop(aspect,rasterstack)
ruggednesscrop = crop(ruggedness,rasterstack)
print(c("cropped"))
print(object.size(DEMcrop))
print(object.size(rasterstack))
# resample rasters, this will take a bit
DEMcrop = resample(DEMcrop,rasterstack)
flow_accumcrop = resample(flow_accumcrop,rasterstack)
slopecrop = resample(slopecrop,rasterstack)
aspectcrop = resample(aspectcrop,rasterstack)
ruggednesscrop = resample(ruggednesscrop,rasterstack)
print(c("resampled"))
print(object.size(DEMcrop))
print(object.size(rasterstack))
# mask layers
DEMcrop = mask(DEMcrop,raster::subset(rasterstack,1))
flow_accumcrop = mask(flow_accumcrop,raster::subset(rasterstack,1))
slopecrop = mask(slopecrop,raster::subset(rasterstack,1))
aspectcrop = mask(aspectcrop,raster::subset(rasterstack,1))
ruggednesscrop = mask(ruggednesscrop,raster::subset(rasterstack,1))
print(c("masked"))
print(object.size(DEMcrop))
print(object.size(rasterstack))
# add baselayers to the raster stack
finalstack = addLayer(rasterstack,DEMcrop,flow_accumcrop,slopecrop,aspectcrop,ruggednesscrop)
print(names(finalstack))
print(nlayers(finalstack))
bands<-c("band1","band2","band3","band4","band5","band6","band7")
type<-c("quality","sr_ndvi","DEM","flow_accum","slope","aspect","TRI")
band_info<-data.frame(bands,type)
print("finalstack")
# create new output directory and save raster there
output_subdirect = gsub(paste(input,"/",sep=""),"",sub)
dir.create(file.path(output,output_subdirect), recursive = TRUE)
Sys.chmod(file.path(output,output_subdirect), mode = "777", use_umask = FALSE)
print("created directory")
write = file.path(output,output_subdirect)
writeRaster(finalstack, format="GTiff", filename=file.path(write,name,fsep="/"), options=c("INTERLEAVE=BAND","COMPRESS=NONE"), overwrite=TRUE)
write.csv(band_info, file = paste(file.path(write,name,fsep="/"),".csv",sep=""))
print("done processing")
rm(rasterstack,DEMcrop,flow_accumcrop,slopecrop,aspectcrop,ruggednesscrop)
gc()
print(gc())
system("sudo sysctl -w vm.drop_caches=3")
}
}
# useful functions
#mystack = stack("path/to/multilayer.tif") # multilayer.tif is an existing raster stack
#band1 = subset(mystack,subset=1) # subsets bands from raster stack
#removeTmpFiles(h=0) # removes temp files, can be used after writing raster stackes to delete all temp raster files
#rasterbrick<-brick(rasterstack) #can make a raster brick from a raster stack
我添加了 print(object.size())
以尝试查看我的对象在整个代码执行过程中是否在变大。
该文件正在将大型光栅写入 /tmp
文件夹中的磁盘,每次执行超过 20Gb。很快就把磁盘填满了。仍然担心为什么将光栅写入现有文件而不是清除它们。可以在脚本期间清除 tmp 目录。
这是对光栅包内存问题的很好解释。 https://discuss.ropensci.org/t/how-to-avoid-space-hogging-raster-tempfiles/864
我已经编写了一个脚本来自动处理我的栅格数据,但它在完成一半后就失败了。该循环在目录 unzipped
中查找包含 .tif
文件的子目录,并将那些带有一些底图的子目录堆叠在 /base_layers
中。它一直在失败:
Error in matrix(unlist(ini), ncol = 2, byrow = TRUE) : 'data' must be of a vector type, was 'NULL'
和
In writeBin(v, x@file@con, size = x@file@dsize) : problem writing to connection
和
Error in .local(.Object, ...) : GDAL Error 3: Free disk space available is 875151360 bytes, whereas 2473962400 are at least necessary. You can disable this check by defining the CHECK_DISK_FREE_SPACE configuration option to FALSE.
我想已修复,查看答案 运行 在具有 Ubuntu、24 GB 内存和 50 GB 存储空间和 4 个 vCPU 的 kubernetes pod 中以及在具有 2 个 vCPU 的 VM 实例 运行 Ubuntu 18.04 中, 8 GB 内存和 50 GB 存储空间。由于内存问题,pods 中途失败,VM 也没有完成。代码通常在到达掩模栅格阶段时失败,但仅在 运行 通过循环的几次迭代后才会失败(因此它在开始时一切正常)。
如果有人能指出此脚本中是否可能出现内存碎片,或者如何清除 OS 内存,我将永远感激不已!
脚本:
library(raster)
library(rgdal)
input = "/mnt/nfs/data/unzipped"
output = "/mnt/nfs/data/training" #path to where the data should go
#get paths to basemaps
DEM = raster("/mnt/nfs/data/base_layers/Clipped_filled_dem.tif")
flow_accum = raster("/mnt/nfs/data/base_layers/accumulation.tif")
slope = raster("/mnt/nfs/data/base_layers/Clipped_slope.tif")
aspect = raster("/mnt/nfs/data/base_layers/Clipped_filled_aspect.tif")
ruggedness = raster("/mnt/nfs/data/base_layers/Clipped_TRI.tif")
#get directories that have data that needs to be processed
directory = list.dirs(path = input, recursive = FALSE)
for(direct in directory) {
subdirect = list.dirs(path = direct,recursive = FALSE)
for(sub in subdirect){
files_for_raster <- list.files(path = sub, pattern = "*.tif$", full.names = TRUE)
rasterstack = stack(files_for_raster)
# name of datapull
name = gsub(paste(direct,"/",sep=""),"",sub)
print(c("working in",name))
# crop DEM to the extent of the satellite image
DEMcrop = crop(DEM,rasterstack) #extent can be a raster
flow_accumcrop = crop(flow_accum,rasterstack)
slopecrop = crop(slope,rasterstack)
aspectcrop = crop(aspect,rasterstack)
ruggednesscrop = crop(ruggedness,rasterstack)
print(c("cropped"))
print(object.size(DEMcrop))
print(object.size(rasterstack))
# resample rasters, this will take a bit
DEMcrop = resample(DEMcrop,rasterstack)
flow_accumcrop = resample(flow_accumcrop,rasterstack)
slopecrop = resample(slopecrop,rasterstack)
aspectcrop = resample(aspectcrop,rasterstack)
ruggednesscrop = resample(ruggednesscrop,rasterstack)
print(c("resampled"))
print(object.size(DEMcrop))
print(object.size(rasterstack))
# mask layers
DEMcrop = mask(DEMcrop,raster::subset(rasterstack,1))
flow_accumcrop = mask(flow_accumcrop,raster::subset(rasterstack,1))
slopecrop = mask(slopecrop,raster::subset(rasterstack,1))
aspectcrop = mask(aspectcrop,raster::subset(rasterstack,1))
ruggednesscrop = mask(ruggednesscrop,raster::subset(rasterstack,1))
print(c("masked"))
print(object.size(DEMcrop))
print(object.size(rasterstack))
# add baselayers to the raster stack
finalstack = addLayer(rasterstack,DEMcrop,flow_accumcrop,slopecrop,aspectcrop,ruggednesscrop)
print(names(finalstack))
print(nlayers(finalstack))
bands<-c("band1","band2","band3","band4","band5","band6","band7")
type<-c("quality","sr_ndvi","DEM","flow_accum","slope","aspect","TRI")
band_info<-data.frame(bands,type)
print("finalstack")
# create new output directory and save raster there
output_subdirect = gsub(paste(input,"/",sep=""),"",sub)
dir.create(file.path(output,output_subdirect), recursive = TRUE)
Sys.chmod(file.path(output,output_subdirect), mode = "777", use_umask = FALSE)
print("created directory")
write = file.path(output,output_subdirect)
writeRaster(finalstack, format="GTiff", filename=file.path(write,name,fsep="/"), options=c("INTERLEAVE=BAND","COMPRESS=NONE"), overwrite=TRUE)
write.csv(band_info, file = paste(file.path(write,name,fsep="/"),".csv",sep=""))
print("done processing")
rm(rasterstack,DEMcrop,flow_accumcrop,slopecrop,aspectcrop,ruggednesscrop)
gc()
print(gc())
system("sudo sysctl -w vm.drop_caches=3")
}
}
# useful functions
#mystack = stack("path/to/multilayer.tif") # multilayer.tif is an existing raster stack
#band1 = subset(mystack,subset=1) # subsets bands from raster stack
#removeTmpFiles(h=0) # removes temp files, can be used after writing raster stackes to delete all temp raster files
#rasterbrick<-brick(rasterstack) #can make a raster brick from a raster stack
我添加了 print(object.size())
以尝试查看我的对象在整个代码执行过程中是否在变大。
该文件正在将大型光栅写入 /tmp
文件夹中的磁盘,每次执行超过 20Gb。很快就把磁盘填满了。仍然担心为什么将光栅写入现有文件而不是清除它们。可以在脚本期间清除 tmp 目录。
这是对光栅包内存问题的很好解释。 https://discuss.ropensci.org/t/how-to-avoid-space-hogging-raster-tempfiles/864