Vectorise VLAD computation in numpy
I am wondering whether it is possible to vectorise this implementation of VLAD computation.
For context:
feats = numpy array of shape (T, N, F)
kmeans = KMeans object from scikit-learn, initialized with K clusters.
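For concreteness, here is a minimal setup matching that description; the sizes below are illustrative assumptions, not values from the question.

import numpy as np
from sklearn.cluster import KMeans

# Illustrative sizes (assumed): T images, N descriptors per image,
# F-dimensional descriptors, K visual words.
T, N, F, K = 8, 100, 64, 16
feats = np.random.randn(T, N, F)
kmeans = KMeans(n_clusters=K, n_init=10, random_state=0).fit(feats.reshape(-1, F))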
Current method
k = kmeans.n_clusters # K
centers = kmeans.cluster_centers_ # (K, F)
vlad_feats = []
for feat in feats:
    # feat shape - (N, F)
    cluster_label = kmeans.predict(feat) # (N,)
    vlad = np.zeros((k, feat.shape[1])) # (K, F)
    # computing the differences for all the clusters (visual words)
    for i in range(k):
        # if there is at least one descriptor in that cluster
        if np.sum(cluster_label == i) > 0:
            # add the differences
            vlad[i] = np.sum(feat[cluster_label == i, :] - centers[i], axis=0)
    vlad = vlad.flatten() # (K*F,)
    # L2 normalization
    vlad = vlad / np.sqrt(np.dot(vlad, vlad))
    vlad_feats.append(vlad)
vlad_feats = np.array(vlad_feats) # (T, K*F)
Getting the kmeans predictions in batch is not an issue, since we can do the following:

feats2 = feats.reshape(-1, F) # (T*N, F)
labels = kmeans.predict(feats2) # (T*N,)

But I am stuck on computing the cluster distances.
You have a good start with your approach. Let's try to pull all the lines out of the loop one by one. First, the predictions:

cluster_label = kmeans.predict(feats.reshape(-1, F)).reshape(T, N) # (T, N)
You don't really need the check np.sum(cluster_label == i) > 0, since the sum will come out to zero anyway. Your goal is to add up the distances to the centers for each of the T samples and each of the K labels in the features.
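To see why the check is redundant, here is a tiny sketch (the arrays are made-up stand-ins): summing over an empty selection of differences simply yields zeros.

# An empty cluster selects zero rows; summing zero rows gives a zero vector.
empty = np.empty((0, 4))
print(np.sum(empty - np.ones(4), axis=0))  # [0. 0. 0. 0.]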
You can compute the k masks cluster_label == i using simple broadcasting. You want the last dimension to be K:

mask = cluster_label[..., None] == np.arange(k) # (T, N, K)
You can also compute the k differences feats - centers[i] with slightly fancier broadcasting:

delta = feats[..., None, :] - centers # (T, N, K, F)
You can now multiply the differences by the mask and reduce along the N dimension by summing:

vlad = (delta * mask[..., None]).sum(axis=1).reshape(T, -1) # (T, K*F)
From here, normalization should be trivial:

vlad /= np.linalg.norm(vlad, axis=1, keepdims=True)
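Putting the steps together, a complete sketch of the vectorised routine might look as follows (the function name and the way the shapes are unpacked are my additions):

def vlad_vectorised(kmeans, feats):
    # Assembles the steps above into one function; names are illustrative.
    T, N, F = feats.shape
    k = kmeans.n_clusters
    centers = kmeans.cluster_centers_                            # (K, F)
    cluster_label = kmeans.predict(feats.reshape(-1, F)).reshape(T, N)
    mask = cluster_label[..., None] == np.arange(k)              # (T, N, K)
    delta = feats[..., None, :] - centers                        # (T, N, K, F)
    vlad = (delta * mask[..., None]).sum(axis=1).reshape(T, -1)  # (T, K*F)
    vlad /= np.linalg.norm(vlad, axis=1, keepdims=True)          # row-wise L2 norm
    return vlad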
While the answer above does vectorise, I have found that it hurts performance.

Below, looping is essentially a re-written version of the OP's algorithm, and naivec employs vectorisation through the expanded 4D tensor.
import numpy as np
from sklearn.cluster import MiniBatchKMeans

def looping(kmeans: MiniBatchKMeans, local_tlf):
    k, (t, l, f) = kmeans.n_clusters, local_tlf.shape
    centers_kf = kmeans.cluster_centers_
    vlad_tkf = np.zeros((t, k, f))
    for vlad_kf, local_lf in zip(vlad_tkf, local_tlf):
        label_l = kmeans.predict(local_lf)
        for i in range(k):
            vlad_kf[i] = np.sum(local_lf[label_l == i] - centers_kf[i], axis=0)
        vlad_D = vlad_kf.ravel()
        vlad_D = np.sign(vlad_D) * np.sqrt(np.abs(vlad_D))
        vlad_D /= np.linalg.norm(vlad_D)
        vlad_kf[:, :] = vlad_D.reshape(k, f)
    return vlad_tkf.reshape(t, -1)

def naivec(kmeans: MiniBatchKMeans, local_tlf):
    k, (t, l, f) = kmeans.n_clusters, local_tlf.shape
    centers_kf = kmeans.cluster_centers_
    labels_tl = kmeans.predict(local_tlf.reshape(-1, f)).reshape(t, l)
    mask_tlk = labels_tl[..., np.newaxis] == np.arange(k)
    local_tl1f = local_tlf[..., np.newaxis, :]
    delta_tlkf = local_tl1f - centers_kf  # <-- easy to run out of memory
    vlad_tD = (delta_tlkf * mask_tlk[..., np.newaxis]).sum(axis=1).reshape(t, -1)
    vlad_tD = np.sign(vlad_tD) * np.sqrt(np.abs(vlad_tD))
    vlad_tD /= np.linalg.norm(vlad_tD, axis=1, keepdims=True)
    return vlad_tD
Indeed it does; see the benchmark below.
np.random.seed(1234)

# usually there are a lot more images than this
t, l, f, k = 256, 128, 64, 512
X = np.random.randn(t, l, f)
km = MiniBatchKMeans(n_clusters=16, n_init=10, random_state=0)
km.fit(X.reshape(-1, f))

result_looping = looping(km, X)
result_naivec = naivec(km, X)

%timeit looping(km, X) # ~200 ms
%timeit naivec(km, X) # ~300 ms

assert np.allclose(result_looping, result_naivec)
An idiomatic vectorisation which avoids memory growing beyond the output size (asymptotically) would leverage a scatter reduction.
def truvec(kmeans: MiniBatchKMeans, local_tlf):
    k, (t, l, f) = kmeans.n_clusters, local_tlf.shape
    centers_kf = kmeans.cluster_centers_
    labels_tl = kmeans.predict(local_tlf.reshape(-1, f)).reshape(t, l)

    vlad_tkf = np.zeros((t, k, f))
    M = t * k
    # give each (image, cluster) pair a unique bucket index in [0, M)
    labels_tl += np.arange(t)[:, np.newaxis] * k
    vlad_Mf = vlad_tkf.reshape(-1, f)
    # scatter-add each descriptor into its bucket
    np.add.at(vlad_Mf, labels_tl.ravel(), local_tlf.reshape(-1, f))
    counts_M = np.bincount(labels_tl.ravel(), minlength=M)
    # sum of (x - c) over a cluster equals sum(x) minus count * c
    vlad_tkf -= counts_M.reshape(t, k, 1) * centers_kf

    vlad_tD = vlad_tkf.reshape(t, -1)
    vlad_tD = np.sign(vlad_tD) * np.sqrt(np.abs(vlad_tD))
    vlad_tD /= np.linalg.norm(vlad_tD, axis=1, keepdims=True)
    return vlad_tD
Disappointingly, however, this also only gets us to about 200 ms of computation time. This comes down to two reasons:
- the inner loop is already vectorised in looping()
- np.add.at actually cannot use vectorised CPU instructions, unlike the original strided reduction np.sum(local_lf[label_l == i] - centers_kf[i], axis=0) (see the small demonstration after this list)
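The reason np.add.at is needed at all, despite this cost, is that plain fancy-indexed += is buffered and drops repeated indices, whereas np.add.at accumulates them. A small demonstration with made-up values:

a = np.zeros(3)
a[[0, 0, 1]] += 1.0
print(a)                      # [1. 1. 0.] -- repeated index 0 is counted once
b = np.zeros(3)
np.add.at(b, [0, 0, 1], 1.0)
print(b)                      # [2. 1. 0.] -- unbuffered, both additions kept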
A performant vectorised version of the VLAD algorithm requires some sophisticated techniques to exploit contiguous array access. That version gets a 40% improvement over looping(), but requires a significant amount of setup; see the approach on my blog here.
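The blog's exact technique is not reproduced here, but as one illustrative alternative (my own sketch, not the author's method) the scatter step can be expressed with one np.bincount per feature column, each of which runs as a tight contiguous pass in C; whether this beats np.add.at will depend on the data shape.

def bincountvec(kmeans: MiniBatchKMeans, local_tlf):
    # Sketch only (not the blog's method): per-column np.bincount
    # scatter reductions instead of np.add.at.
    k, (t, l, f) = kmeans.n_clusters, local_tlf.shape
    centers_kf = kmeans.cluster_centers_
    labels_tl = kmeans.predict(local_tlf.reshape(-1, f)).reshape(t, l)
    # unique bucket per (image, cluster) pair
    buckets = (labels_tl + np.arange(t)[:, np.newaxis] * k).ravel()
    M = t * k
    flat_lf = local_tlf.reshape(-1, f)
    vlad_Mf = np.stack(
        [np.bincount(buckets, weights=flat_lf[:, j], minlength=M) for j in range(f)],
        axis=1)
    counts_M = np.bincount(buckets, minlength=M)
    vlad_tkf = vlad_Mf.reshape(t, k, f) - counts_M.reshape(t, k, 1) * centers_kf
    vlad_tD = vlad_tkf.reshape(t, -1)
    vlad_tD = np.sign(vlad_tD) * np.sqrt(np.abs(vlad_tD))
    vlad_tD /= np.linalg.norm(vlad_tD, axis=1, keepdims=True)
    return vlad_tD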