重新排序连续跨步像素阵列的最有效方法是什么？

Question

我正在 Jetson TX2（带有 ARM 处理器）上开发一个对性能至关重要的图像处理管道，它涉及读取一组图像，然后通过 Darknet 执行基于深度学习的对象检测。用 C 编写的 Darknet 有自己的图像存储方式，这与 OpenCV 的 IplImage 或 Python numpy 数组存储图像的方式不同。

在我的应用程序中，我需要通过 Python 与 Darknet 进行交互。因此，截至目前，我正在将 'batch' 图像（通常为 16 张）读入一个 numpy 数组，然后使用 ctypes 将其作为连续数组传递给 Darknet。在 Darknet 中，我必须重新排列像素的顺序，从 numpy 格式变为 Darknet 格式。

虽然输入数组是一个连续的块，按列排列，然后按行排列，然后按通道排列，然后按图像排列，但 Darknet 格式需要先按通道排列，然后按列排列，然后按行：并且在批次中每个图像包含一行，而不是一个连续的块。下图试图展示差异。在此示例中，我假设有一个 ixj 图像。 (0,0), (0,1) 等表示（行，列），而在顶部，C0，C1，C2..等表示相应行中的列。请注意，在多个图像作为批处理的一部分的情况下，输入格式将它们一个接一个地顺序排列，但 Darknet 需要它们在不同的行上：每行仅包含来自一个图像的数据。

截至目前，我在 C 中将输入数组转换为 Darknet 格式的代码如下所示，它迭代地命中每个通道中的每个像素并将其放在不同的位置，同时还对沿方式。

matrix ndarray_to_matrix(unsigned char* src, long* shape, long* strides)
{
int nb = shape[0];    // Batch size
int h = shape[1];     // Height of each image
int w = shape[2];     // Width of each image
int c = shape[3];     // No. of channels in each image
matrix X = make_matrix(nb, h*w*c);     // Output array format: 2D

int step_b = strides[0];
int step_h = strides[1];
int step_w = strides[2];
int step_c = strides[3];

int b, i, j, k;
int index1, index2 = 0;

for(b = 0; b < nb ; ++b) {
    for(i = 0; i < h; ++i) {
        for(k= 0; k < c; ++k) {
            for(j = 0; j < w; ++j) {
                index1 = k*w*h + i*w + j;
                index2 = step_b*b + step_h*i + step_w*j + step_c*k;
                X.vals[b][index1] = src[index2]/255.;
            }
        }
    }
}
return X;
}

有没有更有效的方法在 C 中进行这种重新排列和规范化？

我正在使用 Jetson TX2：它包含一个 ARM 处理器和一个 NVIDIA GPU，因此可以访问 NEON 和 CUDA 以及 OpenMP。
图像尺寸是固定的，可以硬编码：只有批量大小可以改变。

Answer 1

下面的函数几乎和`memcpy`:

一样快

/*
 *  Created on: 2018. 5. 5.
 *      Author: Jake 'Alquimista' Lee
 */

    .arch   armv8-a
    .text
    .global alquimista_ndarray_to_matrix

// void alquimista_ndarray_to_matrix(uint8_t * pDst, uint8_t *pSrc);

pDst    .req    x0
pRed    .req    x1
pGrn    .req    x2
pBlu    .req    x3
count   .req    w4

.balign 64
.func
alquimista_ndarray_to_matrix:
    mov     x16, #(640*360) & 0xffff
    str     q8, [sp, #-16]!
    movk    x16, #((640*360)>>16), lsl #16
    mov     count, #(640*360)/128
    add     pGrn, pRed, x16
    add     pBlu, pRed, x16, lsl #1
    b       1f

.balign 64
1:
    ldp     q0, q3, [pRed], #32
    ldp     q1, q4, [pGrn], #32
    ldp     q2, q5, [pBlu], #32

    ldp     q6, q16, [pRed], #32
    ldp     q7, q17, [pGrn], #32
    ldp     q8, q18, [pBlu], #32

    ldp     q19, q22, [pRed], #32
    ldp     q20, q23, [pGrn], #32
    ldp     q21, q24, [pBlu], #32

    ldp     q25, q28, [pRed], #32
    ldp     q26, q29, [pGrn], #32
    ldp     q27, q30, [pBlu], #32

    subs    count, count, #1

    st3     {v0.16b, v1.16b, v2.16b}, [pDst], #48
    st3     {v3.16b, v4.16b, v5.16b}, [pDst], #48
    st3     {v6.16b, v7.16b, v8.16b}, [pDst], #48
    st3     {v16.16b, v17.16b, v18.16b}, [pDst], #48

    st3     {v19.16b, v20.16b, v21.16b}, [pDst], #48
    st3     {v22.16b, v23.16b, v24.16b}, [pDst], #48
    st3     {v25.16b, v26.16b, v27.16b}, [pDst], #48
    st3     {v28.16b, v29.16b, v30.16b}, [pDst], #48
    b.gt    1b

.balign 8
    ldr     q8, [sp], #16
    ret
.endfunc
.end

为了获得最佳性能和最低功耗，您可能希望将源指针对齐到 32 字节，将目标指针对齐到 16 字节。

函数原型为：

void alquimista_ndarray_to_matrix(uint8_t * pDst, uint8_t *pSrc);

下面是动态转换为 float 的函数。

并且我添加了批号作为参数，这样您就不必为每个图像都调用一个函数。

/*
 *  Created on: 2018. 5. 5.
 *      Copyright: Jake 'Alquimista' Lee. All rights reserved
 */

    .arch   armv8-a
    .text
    .global alquimista_ndarray_to_matrix_float

// void alquimista_ndarray_to_matrix_float(float *pDst, uint8_t *pSrc, uint32_t batch);

pDst    .req    x0
pRed    .req    x1
batch   .req    w2
pGrn    .req    x3
pBlu    .req    x4
stride  .req    x5
count   .req    w7

.balign 64
.func
alquimista_ndarray_to_matrix_float:
    mov     stride, #((640*360)<<1) & 0xffff
    stp     q8, q15, [sp, #-32]!
    movk    stride, #((640*360)>>15), lsl #16
    mov     count, #(640*360)/32
    add     pGrn, pRed, stride, lsr #1
    add     pBlu, pRed, stride
    b       1f

.balign 64
1:
    ldp     q0, q1, [pRed], #32
    ldp     q2, q3, [pGrn], #32
    ldp     q4, q5, [pBlu], #32

    subs    count, count, #1

    ushll   v20.8h, v0.8b, #7
    ushll2  v23.8h, v0.16b, #7
    ushll   v26.8h, v1.8b, #7
    ushll2  v29.8h, v1.16b, #7
    ushll   v21.8h, v2.8b, #7
    ushll2  v24.8h, v2.16b, #7
    ushll   v27.8h, v3.8b, #7
    ushll2  v30.8h, v3.16b, #7
    ushll   v22.8h, v4.8b, #7
    ushll2  v25.8h, v4.16b, #7
    ushll   v28.8h, v5.8b, #7
    ushll2  v31.8h, v5.16b, #7

    ursra   v20.8h, v20.8h, #8
    ursra   v21.8h, v21.8h, #8
    ursra   v22.8h, v22.8h, #8
    ursra   v23.8h, v23.8h, #8
    ursra   v24.8h, v24.8h, #8
    ursra   v25.8h, v25.8h, #8
    ursra   v26.8h, v26.8h, #8
    ursra   v27.8h, v27.8h, #8
    ursra   v28.8h, v28.8h, #8
    ursra   v29.8h, v29.8h, #8
    ursra   v30.8h, v30.8h, #8
    ursra   v31.8h, v31.8h, #8

    uxtl    v0.4s, v20.4h
    uxtl    v1.4s, v21.4h
    uxtl    v2.4s, v22.4h
    uxtl2   v3.4s, v20.8h
    uxtl2   v4.4s, v21.8h
    uxtl2   v5.4s, v22.8h

    uxtl    v6.4s, v23.4h
    uxtl    v7.4s, v24.4h
    uxtl    v8.4s, v25.4h
    uxtl2   v15.4s, v23.8h
    uxtl2   v16.4s, v24.8h
    uxtl2   v17.4s, v25.8h

    uxtl    v18.4s, v26.4h
    uxtl    v19.4s, v27.4h
    uxtl    v20.4s, v28.4h
    uxtl2   v21.4s, v26.8h
    uxtl2   v22.4s, v27.8h
    uxtl2   v23.4s, v28.8h

    uxtl    v24.4s, v29.4h
    uxtl    v25.4s, v30.4h
    uxtl    v26.4s, v31.4h
    uxtl2   v27.4s, v29.8h
    uxtl2   v28.4s, v30.8h
    uxtl2   v29.4s, v31.8h

    ucvtf   v0.4s, v0.4s, #15
    ucvtf   v1.4s, v1.4s, #15
    ucvtf   v2.4s, v2.4s, #15

    ucvtf   v3.4s, v3.4s, #15
    ucvtf   v4.4s, v4.4s, #15
    ucvtf   v5.4s, v5.4s, #15

    ucvtf   v6.4s, v6.4s, #15
    ucvtf   v7.4s, v7.4s, #15
    ucvtf   v8.4s, v8.4s, #15

    ucvtf   v15.4s, v15.4s, #15
    ucvtf   v16.4s, v16.4s, #15
    ucvtf   v17.4s, v17.4s, #15

    ucvtf   v18.4s, v18.4s, #15
    ucvtf   v19.4s, v19.4s, #15
    ucvtf   v20.4s, v20.4s, #15

    ucvtf   v21.4s, v21.4s, #15
    ucvtf   v22.4s, v22.4s, #15
    ucvtf   v23.4s, v23.4s, #15

    ucvtf   v24.4s, v24.4s, #15
    ucvtf   v25.4s, v25.4s, #15
    ucvtf   v26.4s, v26.4s, #15

    ucvtf   v27.4s, v27.4s, #15
    ucvtf   v28.4s, v28.4s, #15
    ucvtf   v29.4s, v29.4s, #15

    st3     {v0.4s - v2.4s}, [pDst], #48
    st3     {v3.4s - v5.4s}, [pDst], #48
    st3     {v6.4s - v8.4s}, [pDst], #48
    st3     {v15.4s - v17.4s}, [pDst], #48
    st3     {v18.4s - v20.4s}, [pDst], #48
    st3     {v21.4s - v23.4s}, [pDst], #48
    st3     {v24.4s - v26.4s}, [pDst], #48
    st3     {v27.4s - v29.4s}, [pDst], #48
    b.gt    1b

    add     pRed, pRed, stride
    add     pGrn, pGrn, stride
    add     pGrn, pGrn, stride
    subs    batch, batch, #1
    mov     count, #(640*360)/32
    b.gt    1b

.balign 8
    ldp     q8, q15, [sp], #32
    ret
.endfunc
.end

它很长，比上面的 uint8 花费的时间要长得多。

请注意，它将非常适合多核执行。

函数原型为：

void alquimista_ndarray_to_matrix_float(float *pDst, uint8_t *pSrc, uint32_t batch);

重新排序连续跨步像素阵列的最有效方法是什么？

What is the most efficient way to reorder a contiguous strided pixel array?

c

optimization

neon

darknet

下面的函数几乎和`memcpy`:

重新排序连续跨步像素阵列的最有效方法是什么？

What is the most efficient way to reorder a contiguous strided pixel array?

c

optimization

neon

darknet

下面的函数几乎和memcpy:

下面的函数几乎和`memcpy`: