NumPy Broadcasting: Calculating sum of squared differences between two arrays
I have the following code. It takes forever in Python. There must be a way to convert this calculation to broadcasting...
import numpy as np

def euclidean_square(a, b):
    squares = np.zeros((a.shape[0], b.shape[0]))
    for i in range(squares.shape[0]):
        for j in range(squares.shape[1]):
            diff = a[i, :] - b[j, :]
            sqr = diff**2.0
            squares[i, j] = np.sum(sqr)
    return squares
You can use np.einsum after calculating the differences in a broadcasted way, like so -
ab = a[:,None,:] - b
out = np.einsum('ijk,ijk->ij',ab,ab)
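As a quick sanity check (a minimal sketch; the small shapes here are chosen only for illustration), the broadcasted result agrees with the original double loop:

a = np.random.rand(4, 3)
b = np.random.rand(5, 3)
ab = a[:, None, :] - b                   # shape (4, 5, 3)
out = np.einsum('ijk,ijk->ij', ab, ab)   # shape (4, 5)
assert np.allclose(out, euclidean_square(a, b))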
Alternatively, you can use scipy's cdist with its optional metric argument set to 'sqeuclidean' to give us the squared Euclidean distances as required by the question, like so -
from scipy.spatial.distance import cdist
out = cdist(a,b,'sqeuclidean')
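Note that cdist computes the (m, n) result pair by pair, so unlike the broadcasted subtraction it never materializes an (m, n, p) intermediate array, which matters for large inputs. A quick check (illustrative, reusing the small a and b from above):

from scipy.spatial.distance import cdist
out_cdist = cdist(a, b, 'sqeuclidean')
assert np.allclose(out_cdist, out)   # agrees with the broadcasted version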
Another solution besides using cdist is the following:
difference_squared = np.zeros((a.shape[0], b.shape[0]))
for dimension_iterator in range(a.shape[1]):
    difference_squared += np.subtract.outer(a[:, dimension_iterator], b[:, dimension_iterator])**2
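This works because np.subtract.outer on two 1-D arrays of lengths m and n returns the full (m, n) matrix of pairwise differences, which the loop squares and accumulates one feature dimension at a time. A tiny illustration:

u = np.array([1.0, 2.0])
v = np.array([0.0, 1.0, 3.0])
print(np.subtract.outer(u, v))
# [[ 1.  0. -2.]
#  [ 2.  1. -1.]]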
I collected the different methods proposed here and in two other questions, and measured the speed of the different methods:
import numpy as np
import scipy.spatial
import sklearn.metrics

def dist_direct(x, y):
    # broadcasted differences, then sum of squares along the last axis
    d = np.expand_dims(x, -2) - y
    return np.sum(np.square(d), axis=-1)

def dist_einsum(x, y):
    # same differences, with square-and-sum fused into one einsum call
    d = np.expand_dims(x, -2) - y
    return np.einsum('ijk,ijk->ij', d, d)

def dist_scipy(x, y):
    return scipy.spatial.distance.cdist(x, y, "sqeuclidean")

def dist_sklearn(x, y):
    return sklearn.metrics.pairwise.pairwise_distances(x, y, "sqeuclidean")

def dist_layers(x, y):
    # accumulate outer differences one dimension at a time
    res = np.zeros((x.shape[0], y.shape[0]))
    for i in range(x.shape[1]):
        res += np.subtract.outer(x[:, i], y[:, i])**2
    return res

# inspired by the excellent https://github.com/droyed/eucl_dist
def dist_ext1(x, y):
    # pack [1, x, x**2] against [y**2, -2*y, 1] so a single matrix
    # product yields sum(y**2) - 2*x.y + sum(x**2) for every pair
    nx, p = x.shape
    x_ext = np.empty((nx, 3*p))
    x_ext[:, :p] = 1
    x_ext[:, p:2*p] = x
    x_ext[:, 2*p:] = np.square(x)

    ny = y.shape[0]
    y_ext = np.empty((3*p, ny))
    y_ext[:p] = np.square(y).T
    y_ext[p:2*p] = -2*y.T
    y_ext[2*p:] = 1

    return x_ext.dot(y_ext)

def dist_ext2(x, y):
    # same expansion, written as |x|^2 + |y|^2 - 2*x.y
    return np.einsum('ij,ij->i', x, x)[:, None] + np.einsum('ij,ij->i', y, y) - 2 * x.dot(y.T)
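For reference, both dist_ext1 and dist_ext2 rest on the same expansion of the square: for each pair (i, j),

sum_k (x[i,k] - y[j,k])**2  =  sum_k x[i,k]**2  -  2 * sum_k x[i,k]*y[j,k]  +  sum_k y[j,k]**2

dist_ext2 evaluates the three terms separately (two einsum row-norm calls plus one matrix product), while dist_ext1 folds all three into the single product x_ext.dot(y_ext) by lining up the blocks [1, x, x**2] of x_ext with the blocks [y**2, -2*y, 1] of y_ext.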
I use timeit to compare the speed of the different methods. For the comparison I use vectors of length 10, with 100 vectors in the first group and 1000 vectors in the second.
import timeit

p = 10
x = np.random.standard_normal((100, p))
y = np.random.standard_normal((1000, p))

for method in dir():
    if not method.startswith("dist_"):
        continue
    # with number=1000, the total time in seconds equals milliseconds per call
    t = timeit.timeit(f"{method}(x, y)", number=1000, globals=globals())
    print(f"{method:12} {t:5.2f}ms")
On my laptop, the results are as follows:
dist_direct 5.07ms
dist_einsum 3.43ms
dist_ext1 0.20ms <-- fastest
dist_ext2 0.35ms
dist_layers 2.82ms
dist_scipy 0.60ms
dist_sklearn 0.67ms
While the two methods dist_ext1 and dist_ext2, both based on the idea of writing (x-y)**2 as x**2 - 2*x*y + y**2, are very fast, there is a downside: when the distance between x and y is very small, the result can occasionally be (very slightly) negative due to cancellation error.
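If exact non-negativity matters downstream (for example, before taking a square root), a simple guard is to clamp the result at zero; a minimal sketch, not part of the original answers:

out = dist_ext2(x, y)
out = np.maximum(out, 0.0)   # clamp tiny negatives from cancellation to exactly 0
d = np.sqrt(out)             # now safe to take the root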