将 ARM 32 位 NEON 代码转换为 ARM 64 位 NEON 代码
Convert ARM 32-bit neon to ARM 64-bit neon
我有以下 32 位 NEON 代码,它的功能很简单,就是从图像中提取一块区域:
extractY8ImageARM(unsigned char *from, unsigned char *to, int left, int top, int width, int height, int stride)
from: pointer to the original image
to: pointer to the destination extracted image
left, top: position where to extract in the original image
width, height: size of the extracted image
stride: width of the original image
这里是汇编代码:
.text
.arch armv7-a
.fpu neon
.global extractY8ImageARM
.type extractY8ImageARM, STT_FUNC

//-----------------------------------------------------------------------
// extractY8ImageARM(unsigned char *from, unsigned char *to,
//                   int left, int top, int width, int height, int stride)
//
// Copies a width x height window of an 8-bit image, starting at column
// `left` / row `top` of the source, into the destination buffer.
//
// In:    r0 = from, r1 = to, r2 = left, r3 = top
//        [sp, #0] = width, [sp, #4] = height, [sp, #8] = stride (at entry)
// Out:   r0-r7 are restored to their entry values before returning.
// Clobb: d0-d3, flags
// Notes: Each inner iteration moves 32 bytes, and both loops are
//        bottom-tested, so at least one 32-byte chunk is always copied and
//        a width that is not a multiple of 32 is effectively rounded up.
//-----------------------------------------------------------------------
extractY8ImageARM:
    src     .req r0
    dst     .req r1
    col0    .req r2
    row0    .req r3
    cols    .req r4
    rows    .req r5
    pitch   .req r6
    count   .req r7

    push    {r0-r7, lr}

    // Nine words were just pushed, so the stacked arguments start at
    // sp + 9 * 4. Fetch width, height and stride with a single load.
    add     count, sp, #(9 * 4)
    ldmia   count, {cols, rows, pitch}

    // src += left + top * stride
    add     src, src, col0
    mla     src, row0, pitch, src

    // From here on `pitch` holds stride - width: the distance from the
    // end of one copied row to the start of the next source row.
    sub     pitch, pitch, cols

1:
    // count = bytes still to copy on the current row
    mov     count, cols
2:
    pld     [src]
    vld1.8  {d0-d3}, [src]!
    pld     [dst]
    vst1.8  {d0-d3}, [dst]!
    subs    count, count, #32
    bgt     2b

    // Step to the next source row; dst is packed, so it is left as is.
    add     src, src, pitch
    subs    rows, rows, #1
    bgt     1b

    pop     {r0-r7, pc}

    .unreq  src
    .unreq  dst
    .unreq  col0
    .unreq  row0
    .unreq  cols
    .unreq  rows
    .unreq  pitch
    .unreq  count
我需要将其移植到 64 位 NEON。谁能帮我转换一下?我已经阅读了这份白皮书 http://malideveloper.arm.com/downloads/Porting%20to%20ARM%2064-bit.pdf ,所以对两者之间的差异已经有了大致的了解。
我的代码很简单,很适合用作示例,说明在 64 位 NEON 汇编文件中如何传递参数以及如何加载/存储(load/store)数据。我希望尽量不使用 intrinsics(内建函数)。
整个代码如下所示:
.text
.arch armv8-a
.global extractY8ImageARM
.type extractY8ImageARM, STT_FUNC

//-----------------------------------------------------------------------
// void extractY8ImageARM(unsigned char *from, unsigned char *to,
//                        int left, int top, int width, int height,
//                        int stride)
//
// Copies a width x height window of an 8-bit image, starting at column
// `left` / row `top` of the source, into the packed destination buffer.
//
// ABI:   AAPCS64, leaf function, no stack usage.
// In:    x0 = from, x1 = to,
//        w2 = left, w3 = top, w4 = width, w5 = height, w6 = stride
//        The five int arguments are read as 32-bit W registers and
//        sign-extended explicitly, because AAPCS64 leaves bits 63:32 of a
//        32-bit argument unspecified.
// Out:   nothing
// Clobb: x0, x1, w5, x9, x10, v0, v1, flags
// Notes: Data moves in 32-byte chunks (same granularity as the 32-bit
//        NEON version), so width is expected to be a multiple of 32; any
//        other width is effectively rounded up to the next multiple.
//        Returns immediately when width <= 0 or height <= 0.
//-----------------------------------------------------------------------
extractY8ImageARM:
    from    .req x0
    to      .req x1
    left    .req w2
    top     .req w3
    width   .req w4
    height  .req w5
    stride  .req w6
    tmp     .req w9
    rowgap  .req x10

    // Nothing to copy for an empty window.
    cmp     width, #0
    b.le    9f
    cmp     height, #0
    b.le    9f

    // from += (int64)left + (int64)top * (int64)stride
    add     from, from, left, sxtw
    smaddl  from, top, stride, from

    // rowgap = (int64)(stride - width): distance from the end of one
    // copied row to the start of the next source row.
    sub     tmp, stride, width
    sxtw    rowgap, tmp

1:
    // tmp = bytes still to copy on the current row
    mov     tmp, width
2:
    ld1     {v0.16b, v1.16b}, [from], #32
    st1     {v0.16b, v1.16b}, [to], #32
    subs    tmp, tmp, #32
    b.gt    2b

    // Step to the next source row; `to` is packed, so it is left as is.
    add     from, from, rowgap
    subs    height, height, #1
    b.gt    1b

9:
    ret

    .size   extractY8ImageARM, . - extractY8ImageARM

    .unreq  from
    .unreq  to
    .unreq  left
    .unreq  top
    .unreq  width
    .unreq  height
    .unreq  stride
    .unreq  tmp
    .unreq  rowgap
我有以下 32 位 NEON 代码,它的功能很简单,就是从图像中提取一块区域:
extractY8ImageARM(unsigned char *from, unsigned char *to, int left, int top, int width, int height, int stride)
from: pointer to the original image
to: pointer to the destination extracted image
left, top: position where to extract in the original image
width, height: size of the extracted image
stride: width of the original image
这里是汇编代码:
.text
.arch armv7-a
.fpu neon
.global extractY8ImageARM
.type extractY8ImageARM, STT_FUNC

//-----------------------------------------------------------------------
// extractY8ImageARM(unsigned char *from, unsigned char *to,
//                   int left, int top, int width, int height, int stride)
//
// Copies a width x height window of an 8-bit image, starting at column
// `left` / row `top` of the source, into the destination buffer.
//
// In:    r0 = from, r1 = to, r2 = left, r3 = top
//        [sp, #0] = width, [sp, #4] = height, [sp, #8] = stride (at entry)
// Out:   r0-r7 are restored to their entry values before returning.
// Clobb: d0-d3, flags
// Notes: Each inner iteration moves 32 bytes, and both loops are
//        bottom-tested, so at least one 32-byte chunk is always copied and
//        a width that is not a multiple of 32 is effectively rounded up.
//-----------------------------------------------------------------------
extractY8ImageARM:
    src     .req r0
    dst     .req r1
    col0    .req r2
    row0    .req r3
    cols    .req r4
    rows    .req r5
    pitch   .req r6
    count   .req r7

    push    {r0-r7, lr}

    // Nine words were just pushed, so the stacked arguments start at
    // sp + 9 * 4. Fetch width, height and stride with a single load.
    add     count, sp, #(9 * 4)
    ldmia   count, {cols, rows, pitch}

    // src += left + top * stride
    add     src, src, col0
    mla     src, row0, pitch, src

    // From here on `pitch` holds stride - width: the distance from the
    // end of one copied row to the start of the next source row.
    sub     pitch, pitch, cols

1:
    // count = bytes still to copy on the current row
    mov     count, cols
2:
    pld     [src]
    vld1.8  {d0-d3}, [src]!
    pld     [dst]
    vst1.8  {d0-d3}, [dst]!
    subs    count, count, #32
    bgt     2b

    // Step to the next source row; dst is packed, so it is left as is.
    add     src, src, pitch
    subs    rows, rows, #1
    bgt     1b

    pop     {r0-r7, pc}

    .unreq  src
    .unreq  dst
    .unreq  col0
    .unreq  row0
    .unreq  cols
    .unreq  rows
    .unreq  pitch
    .unreq  count
我需要将其移植到 64 位 NEON。谁能帮我转换一下?我已经阅读了这份白皮书 http://malideveloper.arm.com/downloads/Porting%20to%20ARM%2064-bit.pdf ,所以对两者之间的差异已经有了大致的了解。
我的代码很简单,很适合用作示例,说明在 64 位 NEON 汇编文件中如何传递参数以及如何加载/存储(load/store)数据。我希望尽量不使用 intrinsics(内建函数)。
整个代码如下所示:
.text
.arch armv8-a
.global extractY8ImageARM
.type extractY8ImageARM, STT_FUNC

//-----------------------------------------------------------------------
// void extractY8ImageARM(unsigned char *from, unsigned char *to,
//                        int left, int top, int width, int height,
//                        int stride)
//
// Copies a width x height window of an 8-bit image, starting at column
// `left` / row `top` of the source, into the packed destination buffer.
//
// ABI:   AAPCS64, leaf function, no stack usage.
// In:    x0 = from, x1 = to,
//        w2 = left, w3 = top, w4 = width, w5 = height, w6 = stride
//        The five int arguments are read as 32-bit W registers and
//        sign-extended explicitly, because AAPCS64 leaves bits 63:32 of a
//        32-bit argument unspecified.
// Out:   nothing
// Clobb: x0, x1, w5, x9, x10, v0, v1, flags
// Notes: Data moves in 32-byte chunks (same granularity as the 32-bit
//        NEON version), so width is expected to be a multiple of 32; any
//        other width is effectively rounded up to the next multiple.
//        Returns immediately when width <= 0 or height <= 0.
//-----------------------------------------------------------------------
extractY8ImageARM:
    from    .req x0
    to      .req x1
    left    .req w2
    top     .req w3
    width   .req w4
    height  .req w5
    stride  .req w6
    tmp     .req w9
    rowgap  .req x10

    // Nothing to copy for an empty window.
    cmp     width, #0
    b.le    9f
    cmp     height, #0
    b.le    9f

    // from += (int64)left + (int64)top * (int64)stride
    add     from, from, left, sxtw
    smaddl  from, top, stride, from

    // rowgap = (int64)(stride - width): distance from the end of one
    // copied row to the start of the next source row.
    sub     tmp, stride, width
    sxtw    rowgap, tmp

1:
    // tmp = bytes still to copy on the current row
    mov     tmp, width
2:
    ld1     {v0.16b, v1.16b}, [from], #32
    st1     {v0.16b, v1.16b}, [to], #32
    subs    tmp, tmp, #32
    b.gt    2b

    // Step to the next source row; `to` is packed, so it is left as is.
    add     from, from, rowgap
    subs    height, height, #1
    b.gt    1b

9:
    ret

    .size   extractY8ImageARM, . - extractY8ImageARM

    .unreq  from
    .unreq  to
    .unreq  left
    .unreq  top
    .unreq  width
    .unreq  height
    .unreq  stride
    .unreq  tmp
    .unreq  rowgap