将数据分箱到非基于网格的分箱中
Binning data into non-grid based bins
我使用了 voronoi binning:我有每个 bin 的中心坐标,我想找到每个 bin 中包含的所有像素的平均值。我只是不知道如何切片 numpy 数组以对其进行矢量化。
这是我当前的代码:X 和 Y 是一维数组,分别保存每个 bin 中心的 x 和 y 坐标;f 是二维图像:
import numpy as np
from scipy.spatial import KDTree
def rebin(f, X, Y):
    """Average the pixels of a 2-D image inside each Voronoi bin.

    Every pixel ``f[i, j]`` is assigned to the bin whose centre
    ``(X[k], Y[k])`` is nearest to ``(i, j)``; the NaN-ignoring mean of the
    pixels assigned to each bin is returned.

    Parameters
    ----------
    f : 2-D array
        Image to rebin.
    X, Y : 1-D sequences of equal length
        Bin-centre coordinates along the first and second axis of ``f``.

    Returns
    -------
    numpy.ndarray
        One mean per bin, in the order of ``X``/``Y``; NaN for bins that
        contain no non-NaN pixel.
    """
    s = f.shape
    x_grid = np.arange(s[0])
    y_grid = np.arange(s[1])
    x_grid, y_grid = np.meshgrid(x_grid, y_grid)
    x_grid, y_grid = x_grid.flatten(), y_grid.flatten()
    # KDTree needs a real (n, 2) array; on Python 3 ``zip`` is a lazy
    # iterator and cannot be converted to one.
    tree = KDTree(np.column_stack((X, Y)))
    _, b = tree.query(np.column_stack((x_grid, y_grid)))
    out = np.full(len(X), np.nan)
    # Visit every bin: ``range(max(b))`` skipped the highest-numbered bin.
    for i in range(len(X)):
        in_bin = b == i
        out[i] = np.nanmean(f[x_grid[in_bin], y_grid[in_bin]])
    return out
for 循环目前是一个巨大的瓶颈。可能有一个非常简单的答案 - 我现在看不到它!
# Loop version: NaN-ignoring mean of the pixels in each bin, one bin at a time.
out = X*np.nan
# Iterate over every bin; ``range(max(b))`` would skip the last bin index.
for i in range(len(X)):
    out[i] = np.nanmean(f[x_grid[b==i], y_grid[b==i]])
可以替换为对 np.bincount 的两次调用:
# Pixel values in the same order as the bin labels in ``b``.
values = f[x_grid, y_grid]
# Drop NaN pixels so the result matches np.nanmean in the loop version.
valid = ~np.isnan(values)
# Per-bin sum and per-bin pixel count; minlength keeps trailing empty bins.
total = np.bincount(b[valid], weights=values[valid], minlength=len(X))
count = np.bincount(b[valid], minlength=len(X))
# Bins without any valid pixel give 0/0 -> NaN; silence the warning.
with np.errstate(invalid='ignore', divide='ignore'):
    out = total/count
或者只需调用一次 stats.binned_statistic:
# Assumes ``b`` holds integer bin labels in [0, len(X)): the integer edges
# 0, 1, ..., len(X) then give exactly one histogram bin per label.
out, bin_edges, binnumber = stats.binned_statistic(
    b,
    f[x_grid, y_grid],
    statistic='mean',
    bins=np.arange(len(X) + 1),
)
例如,
import numpy as np
from scipy.spatial import KDTree
import scipy.stats as stats
# Seed NumPy's global random generator so the example data is reproducible.
np.random.seed(2017)
def rebin(f, X, Y):
    """Vectorised Voronoi rebinning: mean pixel value per bin.

    Every pixel ``f[i, j]`` is assigned to the bin whose centre
    ``(X[k], Y[k])`` is nearest to ``(i, j)``. The per-bin mean is computed
    with two ``np.bincount`` calls instead of a Python loop. NaN pixels are
    ignored, matching the ``np.nanmean`` used by the loop-based version
    (a plain ``'mean'`` statistic would turn the whole bin into NaN).

    Parameters
    ----------
    f : 2-D array
        Image to rebin.
    X, Y : 1-D sequences of equal length
        Bin-centre coordinates along the first and second axis of ``f``.

    Returns
    -------
    numpy.ndarray
        One mean per bin, in the order of ``X``/``Y``; NaN for bins that
        contain no non-NaN pixel.
    """
    s = f.shape
    x_grid, y_grid = np.meshgrid(np.arange(s[0]), np.arange(s[1]))
    x_grid, y_grid = x_grid.ravel(), y_grid.ravel()
    tree = KDTree(np.column_stack((X, Y)))
    # b[p] is the index of the bin centre nearest to pixel p.
    _, b = tree.query(np.column_stack((x_grid, y_grid)))
    values = f[x_grid, y_grid]
    # Exclude NaN pixels from both the sums and the counts.
    valid = ~np.isnan(values)
    n_bins = len(X)
    total = np.bincount(b[valid], weights=values[valid], minlength=n_bins)
    count = np.bincount(b[valid], minlength=n_bins)
    # Empty bins yield 0/0 -> NaN; suppress the accompanying warning.
    with np.errstate(invalid='ignore', divide='ignore'):
        return total / count
def orig(f, X, Y):
    """Loop-based reference implementation of the Voronoi rebinning.

    Each pixel of the 2-D image ``f`` is labelled with the index of its
    nearest bin centre ``(X[k], Y[k])``; the NaN-ignoring mean of every bin
    is then computed one bin at a time.

    Returns an array shaped like ``X`` holding one mean per bin (NaN where a
    bin has no non-NaN pixel).
    """
    shape = f.shape
    x_grid, y_grid = np.meshgrid(np.arange(shape[0]), np.arange(shape[1]))
    x_grid = x_grid.flatten()
    y_grid = y_grid.flatten()
    centres = np.column_stack((X, Y))
    pixels = np.column_stack((x_grid, y_grid))
    # Nearest-centre lookup: b[p] is the bin index of pixel p.
    _, b = KDTree(centres).query(pixels)
    out = X * np.nan
    for i in range(len(X)):
        members = b == i
        out[i] = np.nanmean(f[x_grid[members], y_grid[members]])
    return out
# Build a random test case: N bin centres and an N x N image.
N = 100
# NOTE(review): the centres are drawn from [0, 1) while pixel coordinates span
# 0..N-1, so many bins likely receive no pixels -- confirm this is intended.
X, Y = np.random.random((2, N))
f = np.random.random((N, N))
# Compare the loop-based reference with the vectorised version.
expected = orig(f, X, Y)
result = rebin(f, X, Y)
# equal_nan=True lets NaN entries at the same position compare as equal.
print(np.allclose(expected, result, equal_nan=True))
# True
我刚刚发现 KDTree 还有一个用 Cython 实现、功能完整的版本,名为 cKDTree,它的速度更快。
我使用了 voronoi binning:我有每个 bin 的中心坐标,我想找到每个 bin 中包含的所有像素的平均值。我只是不知道如何切片 numpy 数组以对其进行矢量化。
这是我当前的代码:X 和 Y 是一维数组,分别保存每个 bin 中心的 x 和 y 坐标;f 是二维图像:
import numpy as np
from scipy.spatial import KDTree
def rebin(f, X, Y):
    """Average the pixels of a 2-D image inside each Voronoi bin.

    Every pixel ``f[i, j]`` is assigned to the bin whose centre
    ``(X[k], Y[k])`` is nearest to ``(i, j)``; the NaN-ignoring mean of the
    pixels assigned to each bin is returned.

    Parameters
    ----------
    f : 2-D array
        Image to rebin.
    X, Y : 1-D sequences of equal length
        Bin-centre coordinates along the first and second axis of ``f``.

    Returns
    -------
    numpy.ndarray
        One mean per bin, in the order of ``X``/``Y``; NaN for bins that
        contain no non-NaN pixel.
    """
    s = f.shape
    x_grid = np.arange(s[0])
    y_grid = np.arange(s[1])
    x_grid, y_grid = np.meshgrid(x_grid, y_grid)
    x_grid, y_grid = x_grid.flatten(), y_grid.flatten()
    # KDTree needs a real (n, 2) array; on Python 3 ``zip`` is a lazy
    # iterator and cannot be converted to one.
    tree = KDTree(np.column_stack((X, Y)))
    _, b = tree.query(np.column_stack((x_grid, y_grid)))
    out = np.full(len(X), np.nan)
    # Visit every bin: ``range(max(b))`` skipped the highest-numbered bin.
    for i in range(len(X)):
        in_bin = b == i
        out[i] = np.nanmean(f[x_grid[in_bin], y_grid[in_bin]])
    return out
for 循环目前是一个巨大的瓶颈。可能有一个非常简单的答案 - 我现在看不到它!
# Loop version: NaN-ignoring mean of the pixels in each bin, one bin at a time.
out = X*np.nan
# Iterate over every bin; ``range(max(b))`` would skip the last bin index.
for i in range(len(X)):
    out[i] = np.nanmean(f[x_grid[b==i], y_grid[b==i]])
可以替换为对 np.bincount 的两次调用:
# Pixel values in the same order as the bin labels in ``b``.
values = f[x_grid, y_grid]
# Drop NaN pixels so the result matches np.nanmean in the loop version.
valid = ~np.isnan(values)
# Per-bin sum and per-bin pixel count; minlength keeps trailing empty bins.
total = np.bincount(b[valid], weights=values[valid], minlength=len(X))
count = np.bincount(b[valid], minlength=len(X))
# Bins without any valid pixel give 0/0 -> NaN; silence the warning.
with np.errstate(invalid='ignore', divide='ignore'):
    out = total/count
或者只需调用一次 stats.binned_statistic:
# Assumes ``b`` holds integer bin labels in [0, len(X)): the integer edges
# 0, 1, ..., len(X) then give exactly one histogram bin per label.
out, bin_edges, binnumber = stats.binned_statistic(
    b,
    f[x_grid, y_grid],
    statistic='mean',
    bins=np.arange(len(X) + 1),
)
例如,
import numpy as np
from scipy.spatial import KDTree
import scipy.stats as stats
# Seed NumPy's global random generator so the example data is reproducible.
np.random.seed(2017)
def rebin(f, X, Y):
    """Vectorised Voronoi rebinning: mean pixel value per bin.

    Every pixel ``f[i, j]`` is assigned to the bin whose centre
    ``(X[k], Y[k])`` is nearest to ``(i, j)``. The per-bin mean is computed
    with two ``np.bincount`` calls instead of a Python loop. NaN pixels are
    ignored, matching the ``np.nanmean`` used by the loop-based version
    (a plain ``'mean'`` statistic would turn the whole bin into NaN).

    Parameters
    ----------
    f : 2-D array
        Image to rebin.
    X, Y : 1-D sequences of equal length
        Bin-centre coordinates along the first and second axis of ``f``.

    Returns
    -------
    numpy.ndarray
        One mean per bin, in the order of ``X``/``Y``; NaN for bins that
        contain no non-NaN pixel.
    """
    s = f.shape
    x_grid, y_grid = np.meshgrid(np.arange(s[0]), np.arange(s[1]))
    x_grid, y_grid = x_grid.ravel(), y_grid.ravel()
    tree = KDTree(np.column_stack((X, Y)))
    # b[p] is the index of the bin centre nearest to pixel p.
    _, b = tree.query(np.column_stack((x_grid, y_grid)))
    values = f[x_grid, y_grid]
    # Exclude NaN pixels from both the sums and the counts.
    valid = ~np.isnan(values)
    n_bins = len(X)
    total = np.bincount(b[valid], weights=values[valid], minlength=n_bins)
    count = np.bincount(b[valid], minlength=n_bins)
    # Empty bins yield 0/0 -> NaN; suppress the accompanying warning.
    with np.errstate(invalid='ignore', divide='ignore'):
        return total / count
def orig(f, X, Y):
    """Loop-based reference implementation of the Voronoi rebinning.

    Each pixel of the 2-D image ``f`` is labelled with the index of its
    nearest bin centre ``(X[k], Y[k])``; the NaN-ignoring mean of every bin
    is then computed one bin at a time.

    Returns an array shaped like ``X`` holding one mean per bin (NaN where a
    bin has no non-NaN pixel).
    """
    shape = f.shape
    x_grid, y_grid = np.meshgrid(np.arange(shape[0]), np.arange(shape[1]))
    x_grid = x_grid.flatten()
    y_grid = y_grid.flatten()
    centres = np.column_stack((X, Y))
    pixels = np.column_stack((x_grid, y_grid))
    # Nearest-centre lookup: b[p] is the bin index of pixel p.
    _, b = KDTree(centres).query(pixels)
    out = X * np.nan
    for i in range(len(X)):
        members = b == i
        out[i] = np.nanmean(f[x_grid[members], y_grid[members]])
    return out
# Build a random test case: N bin centres and an N x N image.
N = 100
# NOTE(review): the centres are drawn from [0, 1) while pixel coordinates span
# 0..N-1, so many bins likely receive no pixels -- confirm this is intended.
X, Y = np.random.random((2, N))
f = np.random.random((N, N))
# Compare the loop-based reference with the vectorised version.
expected = orig(f, X, Y)
result = rebin(f, X, Y)
# equal_nan=True lets NaN entries at the same position compare as equal.
print(np.allclose(expected, result, equal_nan=True))
# True
我刚刚发现 KDTree 还有一个用 Cython 实现、功能完整的版本,名为 cKDTree,它的速度更快。