在 python 中使用 ndimage 的最快方法?
The fastest way to work with ndimage in python?
我有迭代 ndimage 的函数(将图像从一种颜色 space 转换为另一种颜色)。
运行速度太慢(2 核 CPU,2.3 GHz,图像大小 = 3 MP):
1) 蛮力方法(循环):27 秒
def imageRGBtoYCrCb(rgb_image):
""" Converts image from RGB to YCrCb. OVERWRITES.
"""
w, h = rgb_image.shape[0], rgb_image.shape[1]
for y in range(h):
for x in range(w):
rgb = rgb_image[x][y]
ycrcb = RGBtoYCrCb(rgb)
rgb_image[x][y] = ycrcb
return rgb_image
def RGBtoYCrCb(rgb):
""" Converts RGB vector to YCrCb vector
Keyword arguments:
rgb -- list of size 3: [r,g,b]
Returns: YCrCr color (list of size 3: [y,cr,cb])
"""
r,g,b = float(rgb[0]),float(rgb[1]),float(rgb[2])
y = 0.299*r + 0.587*g + 0.114*b
cb = 128 - 0.1687*r - 0.3313*g + 0.5*b
cr = 128 + 0.5*r - 0.4187*g - 0.0813*b
return [y,cr,cb]
2) 向量化方法 (numpy.apply_along_axis, numpy.dot): 90 秒 (???)
import numpy as np
matr_to_ycrcb_mult = np.array([ [0.299, 0.587, 0.114], [0.5, -0.4187, -0.0813], [-0.1687, -0.3313, 0.5] ])
vec_to_ycrcb_add = np.array([ 0, 128, 128 ])
matr_to_rgb_mult = np.array([ [1,1.402,0], [1,-0.71414,-0.34414], [1,0,1.772] ])
vec_to_rgb_add = np.array([ -128*1.402, 128*1.05828, -128*1.772 ])
def imageRGBtoYCrCb(rgb_image):
rgb_image = np.apply_along_axis(RGBtoYCrCb, 2, rgb_image)
return rgb_image
def RGBtoYCrCb(rgb):
""" Converts RGB vector to YCrCb vector
"""
return np.dot(matr_to_ycrcb_mult,rgb) + vec_to_ycrcb_add
有没有更快的处理ndimage的方法?
我是否正确实施了矢量化概念?
apply_along_axis
是一个方便的函数,但它并没有真正向量化操作,因为迭代仍然发生在 Python 土地上。
你的RGBtoYCrCb
函数只需要一个微小的改变就可以操作整个图像。在函数 new_RGBtoYCrCb2
中,我们切出 r
、g
、b
(这些只是原始图像数据的视图,没有复制),然后在最后将它们堆叠在一起。
您还可以重塑并使用点积,而无需切片和粘合,这可能会更快(函数 new_RGBtoYCrCb2
):
import numpy as np
from skimage.data import coffee
def imageRGBtoYCrCb(rgb_image):
""" Converts image from RGB to YCrCb. OVERWRITES.
"""
w, h = rgb_image.shape[0], rgb_image.shape[1]
for y in range(h):
for x in range(w):
rgb = rgb_image[x][y]
ycrcb = RGBtoYCrCb(rgb)
rgb_image[x][y] = ycrcb
return rgb_image
def RGBtoYCrCb(rgb):
""" Converts RGB vector to YCrCb vector
Keyword arguments:
rgb -- list of size 3: [r,g,b]
Returns: YCrCr color (list of size 3: [y,cr,cb])
"""
r,g,b = float(rgb[0]),float(rgb[1]),float(rgb[2])
y = 0.299*r + 0.587*g + 0.114*b
cb = 128 - 0.1687*r - 0.3313*g + 0.5*b
cr = 128 + 0.5*r - 0.4187*g - 0.0813*b
return [y,cr,cb]
def new_RGBtoYCrCb(image):
r, g, b = image[..., 0], image[..., 1], image[..., 2]
y = 0.299*r + 0.587*g + 0.114*b
cb = 128 - 0.1687*r - 0.3313*g + 0.5*b
cr = 128 + 0.5*r - 0.4187*g - 0.0813*b
return np.dstack((y, cr, cb))
def new_RGBtoYCrCb2(image):
M = np.array([[0.299, 0.587, 0.114],
[0.5, - 0.4187, - 0.0813],
[-0.1687, - 0.3313, 0.5]])
result = M.dot(image.reshape(-1, 3).T).T
result[..., 1:] += 128.0
return result.reshape(image.shape)
image = coffee() / 255.0
result = new_RGBtoYCrCb(image)
print(np.allclose(result, imageRGBtoYCrCb(image)))
首先,np.apply_along_axis
不进行 'proper' 矢量化,因为所有轴上的循环仍在 Python 中完成。虽然它提供了更好的语法,但您不应期望它比标准 Python for
循环执行得更快。
避免在 Python 中循环的一种方法是使用 np.einsum
,它允许您使用爱因斯坦求和符号执行张量收缩。具有重复下标标签的轴被求和 - 在这种情况下,我们想要对 matr_to_ycrcb_mult
的第二个轴和 rgb_image
的第三个轴(j
下标)求和。
例如:
def rgb2ycrcb_einsum(rgb_image):
out = np.einsum('ij,xyj->xyi', matr_to_ycrcb_mult, rgb_image)
out[..., -2:] += 128
return out
与使用 np.apply_along_axis
:
对每个像素应用 np.dot
相比,这已经取得了很大的性能改进
rgb_image = np.random.random_integers(0, 255, (1024, 768, 3))
ycrbr_image1 = imageRGBtoYCrCb(rgb_image)
ycrbr_image2 = rgb2ycrcb_einsum(rgb_image)
print(np.allclose(ycrbr_image1, ycrbr_image2))
# True
%timeit imageRGBtoYCrCb(rgb_image)
# 1 loops, best of 3: 3.95 s per loop
%timeit rgb2ycrcb_einsum(rgb_image)
# 10 loops, best of 3: 165 ms per loop
更新
使用 numpy.core.umath_tests.inner1d
可以做得更好:
def rgb2ycrcb_inner1d(rgb_image):
out = inner1d(matr_to_ycrcb_mult[None, None, :], rgb_image[..., None, :])
out[..., -2:] += 128
return out
%timeit rgb2ycrcb_inner1d(rgb_image)
# 10 loops, best of 3: 34.6 ms per loop
更新 2
YXD 使用 np.dot
的解决方案似乎是迄今为止最快的,比 inner1d
快了 3 倍(大概是通过利用 BLAS 加速矩阵乘法)。
def rgb2ycrcb_dot(rgb_image):
out = matr_to_ycrcb_mult.dot(rgb_image.reshape(-1, 3).T).T
out[..., -2:] += 128
return out.reshape(rgb_image.shape)
%timeit rgb2ycrcb_dot(rgb_image)
# 100 loops, best of 3: 12.2 ms per loop
我有迭代 ndimage 的函数(将图像从一种颜色 space 转换为另一种颜色)。 运行速度太慢(2 核 CPU,2.3 GHz,图像大小 = 3 MP):
1) 蛮力方法(循环):27 秒
def imageRGBtoYCrCb(rgb_image):
""" Converts image from RGB to YCrCb. OVERWRITES.
"""
w, h = rgb_image.shape[0], rgb_image.shape[1]
for y in range(h):
for x in range(w):
rgb = rgb_image[x][y]
ycrcb = RGBtoYCrCb(rgb)
rgb_image[x][y] = ycrcb
return rgb_image
def RGBtoYCrCb(rgb):
""" Converts RGB vector to YCrCb vector
Keyword arguments:
rgb -- list of size 3: [r,g,b]
Returns: YCrCr color (list of size 3: [y,cr,cb])
"""
r,g,b = float(rgb[0]),float(rgb[1]),float(rgb[2])
y = 0.299*r + 0.587*g + 0.114*b
cb = 128 - 0.1687*r - 0.3313*g + 0.5*b
cr = 128 + 0.5*r - 0.4187*g - 0.0813*b
return [y,cr,cb]
2) 向量化方法 (numpy.apply_along_axis, numpy.dot): 90 秒 (???)
import numpy as np
matr_to_ycrcb_mult = np.array([ [0.299, 0.587, 0.114], [0.5, -0.4187, -0.0813], [-0.1687, -0.3313, 0.5] ])
vec_to_ycrcb_add = np.array([ 0, 128, 128 ])
matr_to_rgb_mult = np.array([ [1,1.402,0], [1,-0.71414,-0.34414], [1,0,1.772] ])
vec_to_rgb_add = np.array([ -128*1.402, 128*1.05828, -128*1.772 ])
def imageRGBtoYCrCb(rgb_image):
rgb_image = np.apply_along_axis(RGBtoYCrCb, 2, rgb_image)
return rgb_image
def RGBtoYCrCb(rgb):
""" Converts RGB vector to YCrCb vector
"""
return np.dot(matr_to_ycrcb_mult,rgb) + vec_to_ycrcb_add
有没有更快的处理ndimage的方法?
我是否正确实施了矢量化概念?
apply_along_axis
是一个方便的函数,但它并没有真正向量化操作,因为迭代仍然发生在 Python 土地上。
你的RGBtoYCrCb
函数只需要一个微小的改变就可以操作整个图像。在函数 new_RGBtoYCrCb2
中,我们切出 r
、g
、b
(这些只是原始图像数据的视图,没有复制),然后在最后将它们堆叠在一起。
您还可以重塑并使用点积,而无需切片和粘合,这可能会更快(函数 new_RGBtoYCrCb2
):
import numpy as np
from skimage.data import coffee
def imageRGBtoYCrCb(rgb_image):
""" Converts image from RGB to YCrCb. OVERWRITES.
"""
w, h = rgb_image.shape[0], rgb_image.shape[1]
for y in range(h):
for x in range(w):
rgb = rgb_image[x][y]
ycrcb = RGBtoYCrCb(rgb)
rgb_image[x][y] = ycrcb
return rgb_image
def RGBtoYCrCb(rgb):
""" Converts RGB vector to YCrCb vector
Keyword arguments:
rgb -- list of size 3: [r,g,b]
Returns: YCrCr color (list of size 3: [y,cr,cb])
"""
r,g,b = float(rgb[0]),float(rgb[1]),float(rgb[2])
y = 0.299*r + 0.587*g + 0.114*b
cb = 128 - 0.1687*r - 0.3313*g + 0.5*b
cr = 128 + 0.5*r - 0.4187*g - 0.0813*b
return [y,cr,cb]
def new_RGBtoYCrCb(image):
r, g, b = image[..., 0], image[..., 1], image[..., 2]
y = 0.299*r + 0.587*g + 0.114*b
cb = 128 - 0.1687*r - 0.3313*g + 0.5*b
cr = 128 + 0.5*r - 0.4187*g - 0.0813*b
return np.dstack((y, cr, cb))
def new_RGBtoYCrCb2(image):
M = np.array([[0.299, 0.587, 0.114],
[0.5, - 0.4187, - 0.0813],
[-0.1687, - 0.3313, 0.5]])
result = M.dot(image.reshape(-1, 3).T).T
result[..., 1:] += 128.0
return result.reshape(image.shape)
image = coffee() / 255.0
result = new_RGBtoYCrCb(image)
print(np.allclose(result, imageRGBtoYCrCb(image)))
首先,np.apply_along_axis
不进行 'proper' 矢量化,因为所有轴上的循环仍在 Python 中完成。虽然它提供了更好的语法,但您不应期望它比标准 Python for
循环执行得更快。
避免在 Python 中循环的一种方法是使用 np.einsum
,它允许您使用爱因斯坦求和符号执行张量收缩。具有重复下标标签的轴被求和 - 在这种情况下,我们想要对 matr_to_ycrcb_mult
的第二个轴和 rgb_image
的第三个轴(j
下标)求和。
例如:
def rgb2ycrcb_einsum(rgb_image):
out = np.einsum('ij,xyj->xyi', matr_to_ycrcb_mult, rgb_image)
out[..., -2:] += 128
return out
与使用 np.apply_along_axis
:
np.dot
相比,这已经取得了很大的性能改进
rgb_image = np.random.random_integers(0, 255, (1024, 768, 3))
ycrbr_image1 = imageRGBtoYCrCb(rgb_image)
ycrbr_image2 = rgb2ycrcb_einsum(rgb_image)
print(np.allclose(ycrbr_image1, ycrbr_image2))
# True
%timeit imageRGBtoYCrCb(rgb_image)
# 1 loops, best of 3: 3.95 s per loop
%timeit rgb2ycrcb_einsum(rgb_image)
# 10 loops, best of 3: 165 ms per loop
更新
使用 numpy.core.umath_tests.inner1d
可以做得更好:
def rgb2ycrcb_inner1d(rgb_image):
out = inner1d(matr_to_ycrcb_mult[None, None, :], rgb_image[..., None, :])
out[..., -2:] += 128
return out
%timeit rgb2ycrcb_inner1d(rgb_image)
# 10 loops, best of 3: 34.6 ms per loop
更新 2
YXD 使用 np.dot
的解决方案似乎是迄今为止最快的,比 inner1d
快了 3 倍(大概是通过利用 BLAS 加速矩阵乘法)。
def rgb2ycrcb_dot(rgb_image):
out = matr_to_ycrcb_mult.dot(rgb_image.reshape(-1, 3).T).T
out[..., -2:] += 128
return out.reshape(rgb_image.shape)
%timeit rgb2ycrcb_dot(rgb_image)
# 100 loops, best of 3: 12.2 ms per loop