如何优化用其他栅格区域平均值替换栅格 nan 值的 Python 循环
How to optimize a Python loop that replaces raster nan values with other raster region mean values
我需要读取 2 个光栅,一个是卫星图像(目标),另一个是该图像的其他区域(分割)。目标图像呈现数值和 nans。分割图像是区域,其中具有相同值的每个像素都来自同一区域,例如,所有值为1的像素都来自片段1。
基于此,我想计算包含 nans 的每个段的平均值,并将 nan 计算值替换为段平均值。如果我有一个 5 像素的片段并且目标图像具有值 (2,nan,4,4,2),则 nan 值必须替换为 3.
我已经编写了一个脚本来执行此操作。但是,当我处理大图像时,FOR 循环中的代码非常慢。基于此,我想知道如何提高循环的性能。
import numpy
import rasterio
### returns which pixels are nan
def get_gaps(img):
gaps = numpy.argwhere( numpy.isnan( img ) )
return( gaps )
def fill(img_targ, gaps_targ, img_seg1):
### Get which segments contains NA on target image
indices_gap_targ = numpy.array( list( zip(gaps_targ[:,0], gaps_targ[:,1] ) ) )
segments_targ = img_seg1[ indices_gap_targ[:,0], indices_gap_targ[:,1] ]
segments_targ = numpy.unique( segments_targ[~numpy.isnan( segments_targ )] )
for seg in segments_targ:
### Get seg pixel position
seg_pixels = numpy.nonzero( img_seg1 == seg )
seg_indices = numpy.array( list( zip(seg_pixels[:][0], seg_pixels[:][1] ) ) )
### Get targ pix values
targ_values_seg = img_targ[ seg_indices[:,0], seg_indices[:,1] ]
### Check if any is not nan otherwise it will not have any value to use as mean
if( numpy.any( ~numpy.isnan(targ_values_seg) ) ):
### Get nan position and replace by mean value
nan_pos = numpy.isnan( targ_values_seg )
img_targ[ seg_indices[:,0][nan_pos], seg_indices[:,1][nan_pos] ] = numpy.nanmean(targ_values_seg)
return img_targ
input_targ_filename = "/home/path/target.tif"
input_seg1_filename = "/home/path/segmentation.tif"
with rasterio.open(input_targ_filename) as dataset:
img_targ = dataset.read(1)
img_targ[ img_targ < -100000 ] = numpy.nan
kwargs = dataset.meta
with rasterio.open(input_seg1_filename) as dataset:
img_seg1 = dataset.read(1)
img_seg1[ img_seg1 < -100000 ] = numpy.nan
gaps_targ = get_gaps(img_targ)
img_filled = fill(img_targ, gaps_targ, img_seg1)
为了回答你的例子,你可以这样做,没有循环:
seg = np.array((2,np.nan,4,4,2))
seg[np.isnan(seg)] = np.nanmean(seg)
输出:
array([2., 3., 4., 4., 2.])
我希望这个原则可以帮助您将其实现到更大的代码中
np.bincount
是这类问题的首选工具。 (它的作用与更直观的 np.add.at
基本相同,但通常更快。)
import numpy as np
# create mock data (this takes longer than the actual processing)
print("creating example")
N = 1000
NS = 2000
tgt = np.random.randn(N,N)
tgt[np.random.random((N,N))<0.1] = np.nan
seg = np.zeros((N,N),int)
seg.ravel()[np.random.choice(N*N,NS,replace=False)] = np.arange(1,NS+1)
idcs = np.s_[1:],np.s_[:,1:],np.s_[:-1],np.s_[:,:-1]
while np.count_nonzero(seg) < N*N/2:
i = np.random.randint(4)
idx,cidx = idcs[i],idcs[i-2]
seg[idx][seg[idx]==0] = seg[cidx][seg[idx]==0]
# replace nans (in-place, overwrites nans in tgt)
print("replacing nans")
n = np.isnan(tgt)
nn = ~n
segnn = seg[nn]
tgt[n] = (np.bincount(segnn,tgt[nn],NS+1)/np.bincount(segnn,None,NS+1))[seg[n]]
# check
print("verifying",end=" ... ")
sample = np.random.randint(0,NS+1,10)
for i in sample:
assert np.allclose(tgt[n][seg[n]==i],np.mean(tgt[nn][seg[nn]==i]))
print("looks ok")
我需要读取 2 个光栅,一个是卫星图像(目标),另一个是该图像的其他区域(分割)。目标图像呈现数值和 nans。分割图像是区域,其中具有相同值的每个像素都来自同一区域,例如,所有值为1的像素都来自片段1。
基于此,我想计算包含 nans 的每个段的平均值,并将 nan 计算值替换为段平均值。如果我有一个 5 像素的片段并且目标图像具有值 (2,nan,4,4,2),则 nan 值必须替换为 3.
我已经编写了一个脚本来执行此操作。但是,当我处理大图像时,FOR 循环中的代码非常慢。基于此,我想知道如何提高循环的性能。
import numpy
import rasterio
### returns which pixels are nan
def get_gaps(img):
gaps = numpy.argwhere( numpy.isnan( img ) )
return( gaps )
def fill(img_targ, gaps_targ, img_seg1):
### Get which segments contains NA on target image
indices_gap_targ = numpy.array( list( zip(gaps_targ[:,0], gaps_targ[:,1] ) ) )
segments_targ = img_seg1[ indices_gap_targ[:,0], indices_gap_targ[:,1] ]
segments_targ = numpy.unique( segments_targ[~numpy.isnan( segments_targ )] )
for seg in segments_targ:
### Get seg pixel position
seg_pixels = numpy.nonzero( img_seg1 == seg )
seg_indices = numpy.array( list( zip(seg_pixels[:][0], seg_pixels[:][1] ) ) )
### Get targ pix values
targ_values_seg = img_targ[ seg_indices[:,0], seg_indices[:,1] ]
### Check if any is not nan otherwise it will not have any value to use as mean
if( numpy.any( ~numpy.isnan(targ_values_seg) ) ):
### Get nan position and replace by mean value
nan_pos = numpy.isnan( targ_values_seg )
img_targ[ seg_indices[:,0][nan_pos], seg_indices[:,1][nan_pos] ] = numpy.nanmean(targ_values_seg)
return img_targ
input_targ_filename = "/home/path/target.tif"
input_seg1_filename = "/home/path/segmentation.tif"
with rasterio.open(input_targ_filename) as dataset:
img_targ = dataset.read(1)
img_targ[ img_targ < -100000 ] = numpy.nan
kwargs = dataset.meta
with rasterio.open(input_seg1_filename) as dataset:
img_seg1 = dataset.read(1)
img_seg1[ img_seg1 < -100000 ] = numpy.nan
gaps_targ = get_gaps(img_targ)
img_filled = fill(img_targ, gaps_targ, img_seg1)
为了回答你的例子,你可以这样做,没有循环:
seg = np.array((2,np.nan,4,4,2))
seg[np.isnan(seg)] = np.nanmean(seg)
输出:
array([2., 3., 4., 4., 2.])
我希望这个原则可以帮助您将其实现到更大的代码中
np.bincount
是这类问题的首选工具。 (它的作用与更直观的 np.add.at
基本相同,但通常更快。)
import numpy as np
# create mock data (this takes longer than the actual processing)
print("creating example")
N = 1000
NS = 2000
tgt = np.random.randn(N,N)
tgt[np.random.random((N,N))<0.1] = np.nan
seg = np.zeros((N,N),int)
seg.ravel()[np.random.choice(N*N,NS,replace=False)] = np.arange(1,NS+1)
idcs = np.s_[1:],np.s_[:,1:],np.s_[:-1],np.s_[:,:-1]
while np.count_nonzero(seg) < N*N/2:
i = np.random.randint(4)
idx,cidx = idcs[i],idcs[i-2]
seg[idx][seg[idx]==0] = seg[cidx][seg[idx]==0]
# replace nans (in-place, overwrites nans in tgt)
print("replacing nans")
n = np.isnan(tgt)
nn = ~n
segnn = seg[nn]
tgt[n] = (np.bincount(segnn,tgt[nn],NS+1)/np.bincount(segnn,None,NS+1))[seg[n]]
# check
print("verifying",end=" ... ")
sample = np.random.randint(0,NS+1,10)
for i in sample:
assert np.allclose(tgt[n][seg[n]==i],np.mean(tgt[nn][seg[nn]==i]))
print("looks ok")