更快的位级数据打包
Faster bit-level data packing
连接到 Raspberry Pi Zero W 的 256*64 像素 OLED 显示器要求把 4 位灰度像素数据打包进字节(即每个字节两个像素),因此总共 8192 个字节。例如,字节
0a 0b 0c 0d (only lower nibble has data)
成为
ab cd
转换这些字节(来自 Pillow (PIL) Image 或 cairo ImageSurface)时,如果以朴素的方式逐个迭代像素数据,最多需要 0.9 秒,具体取决于颜色深度。
从 Pillow "L"(单色 8 位)图像中每两个字节组合一次:
# Naive baseline: reduce every pixel value to a 4-bit nibble, then join
# consecutive nibbles pairwise into one output byte.
imd = im.tobytes()
# int(p / 16) performs a float division and a conversion back per pixel.
nibbles = [int(p / 16) for p in imd]
packed = []
msn = None  # pending most-significant nibble; None means nothing is pending
for n in nibbles:
    nib = n & 0x0F
    if msn is not None:
        # Second pixel of a pair: combine it with the stored high nibble.
        b = msn << 4 | nib
        packed.append(b)
        msn = None
    else:
        # First pixel of a pair: hold it until its partner arrives.
        msn = nib
下面的写法(省略状态变量并省去 float/integer 转换)将耗时降低到大约一半(0.2 秒):
# Stateless variant: read the input two bytes at a time and use integer
# division, so there is no float conversion and no pending-nibble bookkeeping.
packed = []
for b in range(0, 256*64, 2):
    # High nibble comes from the even byte, low nibble from the odd byte.
    packed.append( (imd[b]//16)<<4 | (imd[b+1]//16) )
下面基本上是把第一种方法应用于 RGB24(32 位!)的 cairo ImageSurface,只是多了一个粗糙的灰度转换:
# Pack a cairo ImageSurface into 4-bit grey nibbles, two pixels per byte.
mv = surface.get_data()
w = surface.get_width()
h = surface.get_height()
f = surface.get_format()
s = surface.get_stride()
print(len(mv), w, h, f, s)
# convert xRGB
# NOTE(review): the loop walks the buffer in flat 4-byte steps and ignores the
# stride `s`; this presumably relies on stride == 4 * width - confirm.
# NOTE(review): cairo documents RGB24 pixels as native-endian 32-bit values, so
# on a little-endian host bytes p+1..p+3 may be G, R and the unused byte rather
# than R, G, B - confirm against the target platform.
o = []
msn = None  # pending most-significant nibble; None means nothing is pending
for p in range(0, len(mv), 4):
    # Crude grey value: mean of three of the four bytes, scaled down to 4 bits.
    nib = int( (mv[p+1] + mv[p+2] + mv[p+3]) / 3 / 16) & 0x0F
    if msn is not None:
        # Second pixel of a pair: emit the combined byte.
        b = msn << 4 | nib
        o.append(b)
        msn = None
    else:
        # First pixel of a pair: keep it as the high nibble.
        msn = nib
大约需要两倍的时间(0.9 秒对 0.4 秒)。
struct
模块不支持半字节(nibble)。
bitstring
允许打包半字节:
>>> a = bitstring.BitStream()
>>> a.insert('0xf')
>>> a.insert('0x1')
>>> a
BitStream('0xf1')
>>> a.insert(5)
>>> a
BitStream('0b1111000100000')
>>> a.insert('0x2')
>>> a
BitStream('0b11110001000000010')
>>>
但似乎没有一种方法可以快速将其解压缩为整数列表——这需要 30 秒!:
# Pack via bitstring: append one 4-bit field per pixel, then read the stream
# back eight bits at a time as unsigned integers.
a = bitstring.BitStream()
for p in imd:
    a.append( bitstring.Bits(uint=p//16, length=4) )
packed=[]
a.pos=0  # rewind to the start of the stream before reading
for p in range(256*64//2):
    packed.append( a.read(8).uint )
Python 3 是否有办法有效地做到这一点,或者我是否需要替代方案?
用 ctypes 包装的外部打包函数?或者用 Cython 做同样的事、但更简单(这两者我都还没有研究过)?Cython 看起来很不错,见我的回答。
仅仅把循环包装进一个函数中
从 200 毫秒减少到 130 毫秒
def packer0(imd):
    """Pack the high nibbles of consecutive value pairs into single bytes.

    Same loop as the inline version, only wrapped in a function.

    Args:
        imd: Indexable sequence of integers (pixel values, e.g. ``bytes``).
            A trailing unpaired element is ignored.

    Returns:
        A list of ``len(imd) // 2`` integers; element ``k`` equals
        ``(imd[2*k] // 16) << 4 | (imd[2*k + 1] // 16)``.
    """
    packed = []
    # Stop before a dangling last element so imd[b + 1] always exists.
    for b in range(0, len(imd) - 1, 2):
        packed.append((imd[b] // 16) << 4 | (imd[b + 1] // 16))
    return packed
通过 Cythonizing 相同的代码
缩短至 35 毫秒
def packer1(imd):
    """Pack the high nibbles of consecutive value pairs into single bytes.

    Plain-Python source of the packing loop; the surrounding text compiles
    this same code with Cython without adding any type declarations.

    Args:
        imd: Indexable sequence of integers (pixel values, e.g. ``bytes``).
            A trailing unpaired element is ignored.

    Returns:
        A list of ``len(imd) // 2`` integers; element ``k`` equals
        ``(imd[2*k] // 16) << 4 | (imd[2*k + 1] // 16)``.
    """
    packed = []
    # Stop before a dangling last element so imd[b + 1] always exists.
    for b in range(0, len(imd) - 1, 2):
        packed.append((imd[b] // 16) << 4 | (imd[b + 1] // 16))
    return packed
使用类型
缩短至 16 毫秒
def packer2(imd):
    """Pack high nibbles of value pairs into bytes, with a C-typed loop index.

    Args:
        imd: Indexable sequence of integers (pixel values, e.g. ``bytes``).
            A trailing unpaired element is ignored.

    Returns:
        A list of ``len(imd) // 2`` integers; element ``k`` equals
        ``(imd[2*k] // 16) << 4 | (imd[2*k + 1] // 16)``.
    """
    # Largest even count not exceeding len(imd); computed before the loop so
    # an empty input never makes the unsigned bound wrap around.
    cdef unsigned int n = len(imd) // 2 * 2
    cdef unsigned int b
    packed = []
    for b in range(0, n, 2):
        packed.append((imd[b] // 16) << 4 | (imd[b + 1] // 16))
    return packed
与 "simplified" 循环没有太大区别
def packer3(imd):
    """Pack high nibbles of value pairs into bytes, indexing by output position.

    Args:
        imd: Indexable sequence of integers (pixel values, e.g. ``bytes``).
            A trailing unpaired element is ignored.

    Returns:
        A list of ``len(imd) // 2`` integers; element ``i`` equals
        ``(imd[2*i] // 16) << 4 | (imd[2*i + 1] // 16)``.
    """
    # Integer division keeps the loop bound an integer; a plain `/` is true
    # division under Python 3 semantics and would yield a float.
    cdef unsigned int n = len(imd) // 2
    cdef unsigned int i
    packed = []
    for i in range(n):
        packed.append((imd[i * 2] // 16) << 4 | (imd[i * 2 + 1] // 16))
    return packed
甚至可能快一点(15 毫秒)
def packer4(it):
    """Pack high nibbles of value pairs into bytes using a list comprehension.

    Args:
        it: Sized, indexable sequence of integers (pixel values). A trailing
            unpaired element is ignored because only ``len(it) // 2`` pairs
            are read.

    Returns:
        A list of ``len(it) // 2`` integers; element ``i`` equals
        ``(it[2*i] // 16) << 4 | it[2*i + 1] // 16``.
    """
    cdef unsigned int n = len(it)//2
    cdef unsigned int i
    return [ (it[i*2]//16)<<4 | it[i*2+1]//16 for i in range(n) ]
这里是 timeit
>>> timeit.timeit('packer4(data)', setup='from pack import packer4; data = [0]*256*64', number=100)
1.31725951000044
>>> exit()
pi@raspberrypi:~ $ python3 -m timeit -s 'from pack import packer4; data = [0]*256*64' 'packer4(data)'
100 loops, best of 3: 9.04 msec per loop
这已经满足了我的要求,但我猜想还可以进一步优化:例如为 input/output 可迭代对象声明类型(-> unsigned int 数组?),或者用更宽的数据类型访问输入数据(Raspbian 是 32 位,BCM2835 是 ARM1176JZF-S 单核)。
或者在 GPU 或多核 Raspberry Pi 上利用并行性。
与 C (ideone) 中相同循环的粗略比较:
#include <stdio.h>
#include <stdint.h>
#define SIZE (256*64)
int main(void) {
    /* Zero-filled stand-ins for the input pixels and the packed output. */
    uint8_t in[SIZE] = {0};
    uint8_t out[SIZE/2] = {0};
    uint8_t rep;

    /* Run the complete packing pass 100 times so it can be timed externally. */
    for (rep = 0; rep < 100; rep++) {
        uint16_t k;
        for (k = 0; k < SIZE/2; k++) {
            /* Keep the high nibble of the even byte in place and move the
               high nibble of the odd byte down into the low half. */
            out[k] = (in[2*k] & 0xF0) | (in[2*k + 1] >> 4);
        }
    }
    return 0;
}
显然快了 100 倍:
pi@raspberry:~ $ gcc p.c
pi@raspberry:~ $ time ./a.out
real 0m0.085s
user 0m0.060s
sys 0m0.010s
消除 shifts/division 可能是另一个轻微的优化(我没有检查生成的 C,也没有检查二进制文件):
def packs(bytes it):
    """Pack high nibbles of byte pairs using only masks and shifts.

    Args:
        it: ``bytes`` object holding the pixel values. A trailing unpaired
            byte is ignored because only ``len(it) // 2`` pairs are read.

    Returns:
        A list of ``len(it) // 2`` integers; element ``i`` equals
        ``(it[2*i] & 0xF0) | (it[2*i + 1] >> 4)``.
    """
    cdef unsigned int n = len(it)//2
    cdef unsigned int i
    # i<<1 is the index of the even byte of pair i; its high nibble is already
    # in place after masking, so only the odd byte needs a shift.
    return [ ( (it[i<<1]&0xF0) | (it[(i<<1)+1]>>4) ) for i in range(n) ]
结果
python3 -m timeit -s 'from pack import pack; data = bytes([0]*256*64)' 'pack(data)'
100 loops, best of 3: 12.7 msec per loop
python3 -m timeit -s 'from pack import packs; data = bytes([0]*256*64)' 'packs(data)'
100 loops, best of 3: 12 msec per loop
python3 -m timeit -s 'from pack import packs; data = bytes([0]*256*64)' 'packs(data)'
100 loops, best of 3: 11 msec per loop
python3 -m timeit -s 'from pack import pack; data = bytes([0]*256*64)' 'pack(data)'
100 loops, best of 3: 13.9 msec per loop
连接到 Raspberry Pi Zero W 的 256*64 像素 OLED 显示器要求把 4 位灰度像素数据打包进字节(即每个字节两个像素),因此总共 8192 个字节。例如,字节
0a 0b 0c 0d (only lower nibble has data)
成为
ab cd
转换这些字节(来自 Pillow (PIL) Image 或 cairo ImageSurface)时,如果以朴素的方式逐个迭代像素数据,最多需要 0.9 秒,具体取决于颜色深度。
从 Pillow "L"(单色 8 位)图像中每两个字节组合一次:
# Naive baseline: reduce every pixel value to a 4-bit nibble, then join
# consecutive nibbles pairwise into one output byte.
imd = im.tobytes()
# int(p / 16) performs a float division and a conversion back per pixel.
nibbles = [int(p / 16) for p in imd]
packed = []
msn = None  # pending most-significant nibble; None means nothing is pending
for n in nibbles:
    nib = n & 0x0F
    if msn is not None:
        # Second pixel of a pair: combine it with the stored high nibble.
        b = msn << 4 | nib
        packed.append(b)
        msn = None
    else:
        # First pixel of a pair: hold it until its partner arrives.
        msn = nib
下面的写法(省略状态变量并省去 float/integer 转换)将耗时降低到大约一半(0.2 秒):
# Stateless variant: read the input two bytes at a time and use integer
# division, so there is no float conversion and no pending-nibble bookkeeping.
packed = []
for b in range(0, 256*64, 2):
    # High nibble comes from the even byte, low nibble from the odd byte.
    packed.append( (imd[b]//16)<<4 | (imd[b+1]//16) )
下面基本上是把第一种方法应用于 RGB24(32 位!)的 cairo ImageSurface,只是多了一个粗糙的灰度转换:
# Pack a cairo ImageSurface into 4-bit grey nibbles, two pixels per byte.
mv = surface.get_data()
w = surface.get_width()
h = surface.get_height()
f = surface.get_format()
s = surface.get_stride()
print(len(mv), w, h, f, s)
# convert xRGB
# NOTE(review): the loop walks the buffer in flat 4-byte steps and ignores the
# stride `s`; this presumably relies on stride == 4 * width - confirm.
# NOTE(review): cairo documents RGB24 pixels as native-endian 32-bit values, so
# on a little-endian host bytes p+1..p+3 may be G, R and the unused byte rather
# than R, G, B - confirm against the target platform.
o = []
msn = None  # pending most-significant nibble; None means nothing is pending
for p in range(0, len(mv), 4):
    # Crude grey value: mean of three of the four bytes, scaled down to 4 bits.
    nib = int( (mv[p+1] + mv[p+2] + mv[p+3]) / 3 / 16) & 0x0F
    if msn is not None:
        # Second pixel of a pair: emit the combined byte.
        b = msn << 4 | nib
        o.append(b)
        msn = None
    else:
        # First pixel of a pair: keep it as the high nibble.
        msn = nib
大约需要两倍的时间(0.9 秒对 0.4 秒)。
struct
模块不支持半字节(nibble)。
bitstring
允许打包半字节:
>>> a = bitstring.BitStream()
>>> a.insert('0xf')
>>> a.insert('0x1')
>>> a
BitStream('0xf1')
>>> a.insert(5)
>>> a
BitStream('0b1111000100000')
>>> a.insert('0x2')
>>> a
BitStream('0b11110001000000010')
>>>
但似乎没有一种方法可以快速将其解压缩为整数列表——这需要 30 秒!:
# Pack via bitstring: append one 4-bit field per pixel, then read the stream
# back eight bits at a time as unsigned integers.
a = bitstring.BitStream()
for p in imd:
    a.append( bitstring.Bits(uint=p//16, length=4) )
packed=[]
a.pos=0  # rewind to the start of the stream before reading
for p in range(256*64//2):
    packed.append( a.read(8).uint )
Python 3 是否有办法有效地做到这一点,或者我是否需要替代方案?
用 ctypes 包装的外部打包函数?或者用 Cython 做同样的事、但更简单(这两者我都还没有研究过)?Cython 看起来很不错,见我的回答。
仅仅把循环包装进一个函数中
从 200 毫秒减少到 130 毫秒
def packer0(imd):
"""same loop in a def"""
packed = []
for b in range(0, 256*64, 2):
packed.append( (imd[b]//16)<<4 | (imd[b+1]//16) )
return packed
通过 Cythonizing 相同的代码
缩短至 35 毫秒
def packer1(imd):
"""Cythonize python nibble packing loop"""
packed = []
for b in range(0, 256*64, 2):
packed.append( (imd[b]//16)<<4 | (imd[b+1]//16) )
return packed
使用类型
缩短至 16 毫秒
def packer2(imd):
"""Cythonize python nibble packing loop, typed"""
packed = []
cdef unsigned int b
for b in range(0, 256*64, 2):
packed.append( (imd[b]//16)<<4 | (imd[b+1]//16) )
return packed
与 "simplified" 循环没有太大区别
def packer3(imd):
    """Pack high nibbles of value pairs into bytes, indexing by output position.

    Args:
        imd: Indexable sequence of integers (pixel values, e.g. ``bytes``).
            A trailing unpaired element is ignored.

    Returns:
        A list of ``len(imd) // 2`` integers; element ``i`` equals
        ``(imd[2*i] // 16) << 4 | (imd[2*i + 1] // 16)``.
    """
    # Integer division keeps the loop bound an integer; a plain `/` is true
    # division under Python 3 semantics and would yield a float.
    cdef unsigned int n = len(imd) // 2
    cdef unsigned int i
    packed = []
    for i in range(n):
        packed.append((imd[i * 2] // 16) << 4 | (imd[i * 2 + 1] // 16))
    return packed
甚至可能快一点(15 毫秒)
def packer4(it):
    """Pack high nibbles of value pairs into bytes using a list comprehension.

    Args:
        it: Sized, indexable sequence of integers (pixel values). A trailing
            unpaired element is ignored because only ``len(it) // 2`` pairs
            are read.

    Returns:
        A list of ``len(it) // 2`` integers; element ``i`` equals
        ``(it[2*i] // 16) << 4 | it[2*i + 1] // 16``.
    """
    cdef unsigned int n = len(it)//2
    cdef unsigned int i
    return [ (it[i*2]//16)<<4 | it[i*2+1]//16 for i in range(n) ]
这里是 timeit
>>> timeit.timeit('packer4(data)', setup='from pack import packer4; data = [0]*256*64', number=100)
1.31725951000044
>>> exit()
pi@raspberrypi:~ $ python3 -m timeit -s 'from pack import packer4; data = [0]*256*64' 'packer4(data)'
100 loops, best of 3: 9.04 msec per loop
这已经满足了我的要求,但我猜想还可以进一步优化:例如为 input/output 可迭代对象声明类型(-> unsigned int 数组?),或者用更宽的数据类型访问输入数据(Raspbian 是 32 位,BCM2835 是 ARM1176JZF-S 单核)。
或者在 GPU 或多核 Raspberry Pi 上利用并行性。
与 C (ideone) 中相同循环的粗略比较:
#include <stdio.h>
#include <stdint.h>
#define SIZE (256*64)
int main(void) {
    /* Zero-filled stand-ins for the input pixels and the packed output. */
    uint8_t in[SIZE] = {0};
    uint8_t out[SIZE/2] = {0};
    uint8_t rep;

    /* Run the complete packing pass 100 times so it can be timed externally. */
    for (rep = 0; rep < 100; rep++) {
        uint16_t k;
        for (k = 0; k < SIZE/2; k++) {
            /* Keep the high nibble of the even byte in place and move the
               high nibble of the odd byte down into the low half. */
            out[k] = (in[2*k] & 0xF0) | (in[2*k + 1] >> 4);
        }
    }
    return 0;
}
显然快了 100 倍:
pi@raspberry:~ $ gcc p.c
pi@raspberry:~ $ time ./a.out
real 0m0.085s
user 0m0.060s
sys 0m0.010s
消除 shifts/division 可能是另一个轻微的优化(我没有检查生成的 C,也没有检查二进制文件):
def packs(bytes it):
    """Pack high nibbles of byte pairs using only masks and shifts.

    Args:
        it: ``bytes`` object holding the pixel values. A trailing unpaired
            byte is ignored because only ``len(it) // 2`` pairs are read.

    Returns:
        A list of ``len(it) // 2`` integers; element ``i`` equals
        ``(it[2*i] & 0xF0) | (it[2*i + 1] >> 4)``.
    """
    cdef unsigned int n = len(it)//2
    cdef unsigned int i
    # i<<1 is the index of the even byte of pair i; its high nibble is already
    # in place after masking, so only the odd byte needs a shift.
    return [ ( (it[i<<1]&0xF0) | (it[(i<<1)+1]>>4) ) for i in range(n) ]
结果
python3 -m timeit -s 'from pack import pack; data = bytes([0]*256*64)' 'pack(data)'
100 loops, best of 3: 12.7 msec per loop
python3 -m timeit -s 'from pack import packs; data = bytes([0]*256*64)' 'packs(data)'
100 loops, best of 3: 12 msec per loop
python3 -m timeit -s 'from pack import packs; data = bytes([0]*256*64)' 'packs(data)'
100 loops, best of 3: 11 msec per loop
python3 -m timeit -s 'from pack import pack; data = bytes([0]*256*64)' 'pack(data)'
100 loops, best of 3: 13.9 msec per loop