将 ARM 32 位 NEON 代码转换为 ARM 64 位 NEON 代码

Convert ARM 32-bit neon to ARM 64-bit neon

我有以下 32 位 NEON 代码,用于从图像中简单地提取一块区域:

extractY8ImageARM(unsigned char *from, unsigned char *to, int left, int top, int width, int height, int stride)
from: pointer to the original image
to: pointer to the destination extracted image
left, top: position where to extract in the original image
width, height: size of the extracted image
stride: width of the original image

这里是汇编代码:

.text
.arch armv7-a
.fpu neon
.type extractY8ImageARM, STT_FUNC
.global extractY8ImageARM

/*
 * extractY8ImageARM(unsigned char *from, unsigned char *to,
 *                   int left, int top, int width, int height, int stride)
 *
 * Copies a width x height window of 8-bit pixels, whose top-left corner is
 * at (left, top) in a source image with a row pitch of `stride` bytes, into
 * a tightly packed destination buffer.
 *
 * In:    r0 = from, r1 = to, r2 = left, r3 = top
 *        [sp + 0] = width, [sp + 4] = height, [sp + 8] = stride (at entry)
 * Uses:  d0-d3 and flags; r0-r7 are saved on entry and restored on exit.
 *
 * Each inner pass moves 32 bytes and the end-of-row fix-up subtracts exactly
 * `width`, so the pointers only line up when width is a multiple of 32.
 * Both loops test their counter after the body, so one chunk / one row is
 * always copied.
 */
extractY8ImageARM:
src       .req r0
dst       .req r1
x_off     .req r2
y_off     .req r3
row_bytes .req r4
rows_left .req r5
src_pitch .req r6
remaining .req r7

    push {r0-r7, lr}

    /* Nine words were just pushed, so the stacked arguments start at sp + 36. */
    ldr row_bytes, [sp, #36]
    ldr rows_left, [sp, #40]
    ldr src_pitch, [sp, #44]

    /* src = from + top * stride + left */
    mla src, y_off, src_pitch, src
    add src, src, x_off

.Lnext_row:
    /* Bytes still to copy on the current row. */
    mov remaining, row_bytes

.Lnext_chunk:
    /* Move one 32-byte chunk; both pointers post-increment by 32. */
    pld [src]
    vld1.8 {d0-d3}, [src]!

    pld [dst]
    vst1.8 {d0-d3}, [dst]!

    subs remaining, remaining, #32
    bgt .Lnext_chunk

    /* src already advanced along the row; step it to the next row start. */
    add src, src, src_pitch
    sub src, src, row_bytes

    subs rows_left, rows_left, #1
    bgt .Lnext_row

    pop {r0-r7, pc}

.unreq src
.unreq dst
.unreq x_off
.unreq y_off
.unreq row_bytes
.unreq rows_left
.unreq src_pitch
.unreq remaining

我需要将其移植到 64 位 NEON。谁能帮我完成移植?我已经阅读了这份白皮书 http://malideveloper.arm.com/downloads/Porting%20to%20ARM%2064-bit.pdf ,所以我或多或少地了解了其中的差异。

我的代码很简单,这将是一个很好的例子,说明如何在 64 位 NEON 汇编文件中传递参数以及加载/存储(load/store)数据。我希望避免使用 intrinsics(编译器内建函数)。

整个代码如下所示:

.text
.arch armv8-a
.type extractY8ImageARM, STT_FUNC
.global extractY8ImageARM

//-----------------------------------------------------------------------
// extractY8ImageARM(unsigned char *from, unsigned char *to,
//                   int left, int top, int width, int height, int stride)
//
// Copies a width x height window of 8-bit pixels, whose top-left corner is
// at (left, top) in a source image with a row pitch of `stride` bytes, into
// a tightly packed destination buffer.
//
// ABI:   AAPCS64, leaf function, no stack usage
// In:    x0 = from, x1 = to, w2 = left, w3 = top,
//        w4 = width, w5 = height, w6 = stride
// Clobb: x0-x6, x9, v0, v1, flags (all caller-saved)
//
// Each inner pass moves 32 bytes and the end-of-row fix-up subtracts exactly
// `width`, so width must be a multiple of 32 (same contract as the 32-bit
// NEON routine this was ported from). Returns without copying anything when
// width <= 0 or height <= 0.
//-----------------------------------------------------------------------
extractY8ImageARM:
from    .req x0
to      .req x1
left    .req x2
top     .req x3
width   .req x4
height  .req x5
stride  .req x6
tmp     .req x9

    // The five int arguments arrive in w2-w6. AAPCS64 leaves bits 63:32 of
    // those registers unspecified, so widen them before any 64-bit use.
    sxtw left, w2
    sxtw top, w3
    sxtw width, w4
    sxtw height, w5
    sxtw stride, w6

    // Nothing to copy for an empty window.
    cmp width, #0
    b.le .Ldone
    cmp height, #0
    b.le .Ldone

    // from += top * stride + left
    madd from, top, stride, from
    add from, from, left

.LloopV:
    // tmp = bytes still to copy on the current row
    mov tmp, width

.LloopH:
    // Move one 32-byte chunk; both pointers post-increment by 32.
    ld1 {v0.16b, v1.16b}, [from], #32
    st1 {v0.16b, v1.16b}, [to], #32

    subs tmp, tmp, #32
    b.gt .LloopH

    // from already advanced by width; step it to the start of the next row.
    add from, from, stride
    sub from, from, width

    subs height, height, #1
    b.gt .LloopV

.Ldone:
    ret


.unreq from
.unreq to
.unreq left
.unreq top
.unreq width
.unreq height
.unreq stride
.unreq tmp