是否有用于向零舍入的带符号右移的 ARM NEON 指令?
Are there ARM NEON instructions for signed right-shift that round toward zero?
我正在尝试使用 ARM 内部函数实现算法。
算法的下一步需要将有符号整数右移,但负值需要向上舍入(即让负数的绝对值变小)。例如,如果右移一位,则 -8 应为 -4,但 -1 应为 0。
换句话说,我需要的是对负值向零舍入、而不是向下舍入的操作:
/*
 * Shift `number` right by `shift` bits, rounding the result toward zero.
 *
 * The magnitude is shifted and the sign re-applied, so -8 >> 1 gives -4
 * and -1 >> 1 gives 0.  The magnitude is computed in unsigned arithmetic
 * so that INT_MIN, whose absolute value does not fit in an int, is handled
 * without overflow.
 *
 * `shift` must be smaller than the width of int; larger values are
 * undefined, as for the built-in >> operator.
 */
int rightshift(int number, unsigned int shift)
{
    if (number >= 0) {
        return number >> shift;
    }

    /* 0u - (unsigned)number is |number| and is exact even for INT_MIN. */
    const unsigned int magnitude = 0u - (unsigned int)number;
    const unsigned int shifted = magnitude >> shift;

    if (shifted == 0u) {
        return 0;
    }
    /* Negate as -(shifted - 1) - 1 so the intermediate always fits in int. */
    return -(int)(shifted - 1u) - 1;
}
我找不到合适的函数来以 SIMD 方式执行此操作。有没有一种方法可以在一个函数调用中做到这一点,或者可以使用一些技巧?
我认为不存在向零舍入行为的单指令移位。
但是,您可以通过几条移位和掩码指令相当简单地实现它。如果原数是负数,并且移位时有 'carry'(进位)移出(即被移出的低位中有任何一位是 1),那么我们只需要把结果加一。
我可以用下面的纯 C 代码来证明这一点:
#include <limits.h>
#include <stddef.h>
#include <stdint.h>
/*
 * Arithmetic right shift of `number` by `shift` bits that rounds toward
 * zero instead of toward negative infinity.
 *
 * For a negative input, (1 << shift) - 1 is added first; that bumps the
 * value past the next multiple of 2^shift exactly when any of the bits
 * about to be shifted out is set, which is the "+1" correction needed.
 * Non-negative inputs get a bias of zero.
 *
 * Relies on >> of a negative int being an arithmetic shift (true for the
 * compilers this snippet targets; implementation-defined in ISO C).
 * `shift` must be smaller than the width of int.  The int result is
 * converted to int16_t on return.
 */
int16_t rightshift(int number, unsigned int shift)
{
    static const size_t bits = sizeof number * CHAR_BIT;

    /* Built in unsigned so that shift == bits - 1 does not overflow. */
    const unsigned int low_bits = (1u << shift) - 1u;
    /* All ones when number is negative, zero otherwise. */
    const unsigned int sign_fill = (unsigned int)(number >> (bits - 1));

    number += (int)(low_bits & sign_fill);
    return number >> shift;
}
#include <stdio.h>
int main() {
    /* Print one row per input in [-16, 16]; columns are shifts 0..3. */
    int value = -16;
    while (value <= 16) {
        printf(" %3d: ", value);
        for (int amount = 0; amount != 4; ++amount) {
            printf("%4d", rightshift(value, amount));
        }
        puts("");
        ++value;
    }
}
这会编译成一段不错的无分支(branch-free)汇编代码,它看起来适合内联(尤其是当 shift
是编译时常量时):
rightshift:
@ Compiler output for the C rightshift() shown above: an arithmetic right
@ shift that rounds toward zero by biasing negative inputs before shifting.
@ Matching the C signature, r0 carries number on entry and the result on
@ return, and r1 carries shift.  r3 is the only scratch register used.
@ There are no branches other than the return.
@ args = 0, pretend = 0, frame = 0
@ frame_needed = 0, uses_anonymous_args = 0
@ link register save eliminated.
movs r3, #1
@ r3 = 1 << shift
lsls r3, r3, r1
@ r3 = (1 << shift) - 1: mask of the bits that the final shift discards
subs r3, r3, #1
@ r0 asr #31 is all ones for a negative input and zero otherwise, so the
@ mask survives only when number is negative
and r3, r3, r0, asr #31
@ add the bias (zero for non-negative inputs)
add r0, r0, r3
@ arithmetic shift right by shift
asrs r0, r0, r1
bx lr
为了针对 Neon,我又写了一个函数,用多个数据来测试它:
/*
 * Apply rightshift() with the same `shift` to each of the `count` elements
 * of `src`, writing the results to the corresponding slots of `dest`.
 * The two arrays are declared restrict, i.e. they must not overlap.
 */
void do_shift(int16_t *restrict dest, const int16_t *restrict src,
              size_t count, unsigned int shift)
{
    const int16_t *const src_end = src + count;

    while (src != src_end) {
        *dest++ = rightshift(*src++, shift);
    }
}
以及它的测试程序:
#include <stdio.h>
int main() {
    /* Inputs near the int16_t limits, around powers of two, and around zero. */
    static const int16_t src[] = {
        -32768, -32767, -32766, -32765, -32764,
        -16384, -16383, -16382, -16381, -16380,
        -8193, -8192, -8191, -8190, -8189,
        -16, -15, -14, -13, -12, -10, -9,
        -8, -7, -6, -5, -4, -3, -2, -1, 0,
        1, 2, 3, 4, 5, 6, 7, 8,
        9, 10, 11, 12, 13, 14, 15, 16,
        1023, 1024, 32767,
    };
    static const size_t count = sizeof src / sizeof *src;
    int16_t dest[16][count];

    /* dest[s] receives every input shifted right by s bits. */
    unsigned int amount = 0;
    while (amount < 16) {
        do_shift(dest[amount], src, count, amount);
        ++amount;
    }

    /* One output line per input value, one column per shift amount. */
    for (size_t idx = 0; idx != count; ++idx) {
        printf("%7d: ", src[idx]);
        for (int col = 0; col != 16; ++col) {
            printf("%7d", dest[col][idx]);
        }
        puts("");
    }
}
我用 gcc -O3 -march=armv7 -mfpu=neon
编译了这个。我承认我不熟悉 Neon 指令,但结果可能具有指导意义:
do_shift:
@ Compiler output for the C do_shift() shown above (auto-vectorised).
@ Register use, following the C signature and the code below:
@   r0 = dest, r1 = src, r2 = count, r3 = shift
@   r5 = (1 << shift) - 1, the bias added to negative inputs
@   r4 = number of leading elements handled one at a time; reused later as
@        the dest byte pointer in the vector loop and as the element index
@        in the scalar tail
@   r8 = number of leading elements that were handled before the vector loop
@ Layout: unrolled scalar prologue, NEON loop at .L6 (8 halfwords per
@ iteration), unrolled scalar tail at .L3, return at .L1.
@ args = 0, pretend = 0, frame = 0
@ frame_needed = 0, uses_anonymous_args = 0
@ count == 0: nothing to do, return without touching the stack
cmp r2, #0
beq .L21
push {r4, r5, r6, r7, r8, lr}
@ r4 = (-(bits [2:1] of the src address)) & 7: how many leading elements
@ are processed singly -- presumably so that the vector loads below start
@ at an 8-byte-aligned address
ubfx r4, r1, #1, #2
negs r4, r4
movs r5, #1
and r4, r4, #7
lsls r5, r5, r3
adds r7, r4, #7
subs r6, r2, #1
subs r5, r5, #1
@ if count - 1 < r4 + 7 (unsigned) there are too few elements for the
@ vector path: .L8 runs everything through the scalar tail
cmp r6, r7
sxth r5, r5
bcc .L8
cmp r4, #0
beq .L9
@ Scalar prologue.  Each group: load one halfword, add r5 if the value is
@ negative (asr #31 gives an all-ones mask), sign-extend to 16 bits, shift
@ right by r3 and store; branch to .L9 once r4 elements have been done.
ldrsh r7, [r1]
cmp r4, #1
and r6, r5, r7, asr #31
add r6, r6, r7
sxth r6, r6
asr r6, r6, r3
strh r6, [r0] @ movhi
beq .L9
ldrsh r7, [r1, #2]
cmp r4, #2
and r6, r5, r7, asr #31
add r6, r6, r7
sxth r6, r6
asr r6, r6, r3
strh r6, [r0, #2] @ movhi
beq .L9
ldrsh r7, [r1, #4]
cmp r4, #3
and r6, r5, r7, asr #31
add r6, r6, r7
sxth r6, r6
asr r6, r6, r3
strh r6, [r0, #4] @ movhi
beq .L9
ldrsh r7, [r1, #6]
cmp r4, #4
and r6, r5, r7, asr #31
add r6, r6, r7
sxth r6, r6
asr r6, r6, r3
strh r6, [r0, #6] @ movhi
beq .L9
ldrsh r7, [r1, #8]
cmp r4, #5
and r6, r5, r7, asr #31
add r6, r6, r7
sxth r6, r6
asr r6, r6, r3
strh r6, [r0, #8] @ movhi
beq .L9
@ Sixth element, with the seventh handled conditionally (IT blocks) when
@ r4 == 7; r8 records whether 6 or 7 leading elements were processed.
ldrsh r7, [r1, #10]
cmp r4, #7
ite eq
moveq r8, r4
movne r8, #6
and r6, r5, r7, asr #31
add r6, r6, r7
it eq
ldrsheq r7, [r1, #12]
sxth r6, r6
asr r6, r6, r3
strh r6, [r0, #10] @ movhi
itttt eq
andeq r6, r5, r7, asr #31
addeq r6, r6, r7
sxtheq r6, r6
asreq r6, r6, r3
it eq
strheq r6, [r0, #12] @ movhi
.L4:
@ Vector loop setup:
@   q10 = -shift in every 32-bit lane (vshl by a negative count shifts right)
@   q12 = bias r5 in every 16-bit lane
@   lr  = elements remaining, ip = lr / 8 = vector iterations
@   r6  = src byte pointer, r4 = dest byte pointer, r7 = iteration counter
vdup.32 q10, r3
sub lr, r2, r4
lsls r4, r4, #1
movs r7, #0
vneg.s32 q10, q10
adds r6, r1, r4
lsr ip, lr, #3
add r4, r4, r0
vdup.16 q12, r5
.L6:
adds r7, r7, #1
adds r6, r6, #16
@ load 8 halfwords into q9 (d18:d19)
vldr d18, [r6, #-16]
vldr d19, [r6, #-8]
cmp r7, ip
@ q8 = per-lane sign mask, then bias for negative lanes, then biased value
vshr.s16 q8, q9, #15
vand q8, q8, q12
vadd.i16 q8, q8, q9
@ widen both halves to 32 bits, shift by the per-lane count in q10,
@ narrow back to 16 bits into q11 (d22:d23) and store
vmovl.s16 q9, d16
vmovl.s16 q8, d17
vshl.s32 q9, q9, q10
vshl.s32 q8, q8, q10
vmovn.i32 d22, q9
vmovn.i32 d23, q8
vst1.16 {q11}, [r4]
add r4, r4, #16
bcc .L6
@ r6 = elements covered by the vector loop (lr rounded down to a multiple
@ of 8); if that is all of them, return; otherwise r4 = index of the first
@ element left for the scalar tail
bic r6, lr, #7
cmp lr, r6
add r4, r8, r6
beq .L1
.L3:
@ Scalar tail, unrolled for up to 14 elements starting at index r4.  Each
@ group does the same load/bias/sign-extend/shift/store as the prologue and
@ leaves via .L1 as soon as the next index reaches count (r2).
ldrsh ip, [r1, r4, lsl #1]
adds r7, r4, #1
cmp r2, r7
and r6, r5, ip, asr #31
add r6, r6, ip
sxth r6, r6
asr r6, r6, r3
strh r6, [r0, r4, lsl #1] @ movhi
bls .L1
ldrsh lr, [r1, r7, lsl #1]
add ip, r4, #2
cmp r2, ip
and r6, r5, lr, asr #31
add r6, r6, lr
sxth r6, r6
asr r6, r6, r3
strh r6, [r0, r7, lsl #1] @ movhi
bls .L1
ldrsh lr, [r1, ip, lsl #1]
adds r7, r4, #3
cmp r2, r7
and r6, r5, lr, asr #31
add r6, r6, lr
sxth r6, r6
asr r6, r6, r3
strh r6, [r0, ip, lsl #1] @ movhi
bls .L1
ldrsh lr, [r1, r7, lsl #1]
add ip, r4, #4
cmp r2, ip
and r6, r5, lr, asr #31
add r6, r6, lr
sxth r6, r6
asr r6, r6, r3
strh r6, [r0, r7, lsl #1] @ movhi
bls .L1
ldrsh lr, [r1, ip, lsl #1]
adds r7, r4, #5
cmp r2, r7
and r6, r5, lr, asr #31
add r6, r6, lr
sxth r6, r6
asr r6, r6, r3
strh r6, [r0, ip, lsl #1] @ movhi
bls .L1
ldrsh lr, [r1, r7, lsl #1]
add ip, r4, #6
cmp r2, ip
and r6, r5, lr, asr #31
add r6, r6, lr
sxth r6, r6
asr r6, r6, r3
strh r6, [r0, r7, lsl #1] @ movhi
bls .L1
ldrsh lr, [r1, ip, lsl #1]
adds r7, r4, #7
cmp r2, r7
and r6, r5, lr, asr #31
add r6, r6, lr
sxth r6, r6
asr r6, r6, r3
strh r6, [r0, ip, lsl #1] @ movhi
bls .L1
ldrsh lr, [r1, r7, lsl #1]
add ip, r4, #8
cmp r2, ip
and r6, r5, lr, asr #31
add r6, r6, lr
sxth r6, r6
asr r6, r6, r3
strh r6, [r0, r7, lsl #1] @ movhi
bls .L1
ldrsh lr, [r1, ip, lsl #1]
add r7, r4, #9
cmp r2, r7
and r6, r5, lr, asr #31
add r6, r6, lr
sxth r6, r6
asr r6, r6, r3
strh r6, [r0, ip, lsl #1] @ movhi
bls .L1
ldrsh lr, [r1, r7, lsl #1]
add ip, r4, #10
cmp r2, ip
and r6, r5, lr, asr #31
add r6, r6, lr
sxth r6, r6
asr r6, r6, r3
strh r6, [r0, r7, lsl #1] @ movhi
bls .L1
ldrsh lr, [r1, ip, lsl #1]
add r7, r4, #11
cmp r2, r7
and r6, r5, lr, asr #31
add r6, r6, lr
sxth r6, r6
asr r6, r6, r3
strh r6, [r0, ip, lsl #1] @ movhi
bls .L1
ldrsh lr, [r1, r7, lsl #1]
add ip, r4, #12
cmp r2, ip
and r6, r5, lr, asr #31
add r6, r6, lr
sxth r6, r6
asr r6, r6, r3
strh r6, [r0, r7, lsl #1] @ movhi
bls .L1
ldrsh r7, [r1, ip, lsl #1]
adds r4, r4, #13
cmp r2, r4
and r6, r5, r7, asr #31
add r6, r6, r7
sxth r6, r6
asr r6, r6, r3
strh r6, [r0, ip, lsl #1] @ movhi
bls .L1
@ last unrolled element; the argument registers are no longer needed and
@ are reused as scratch
ldrsh r1, [r1, r4, lsl #1]
and r2, r5, r1, asr #31
add r2, r2, r1
sxth r2, r2
asr r3, r2, r3
strh r3, [r0, r4, lsl #1] @ movhi
.L1:
@ restore the saved registers and return by popping into pc
pop {r4, r5, r6, r7, r8, pc}
.L9:
@ prologue finished after r4 elements: record that count and vectorise
mov r8, r4
b .L4
.L21:
@ count == 0 exit; nothing was pushed on this path
bx lr
.L8:
@ too few elements for the vector loop: run the scalar tail from index 0
movs r4, #0
b .L3
有很多循环展开使代码更长,但模式应该很清楚。
我正在尝试使用 ARM 内部函数实现算法。
算法的下一步需要将有符号整数右移,但负值需要向上舍入(即让负数的绝对值变小)。例如,如果右移一位,则 -8 应为 -4,但 -1 应为 0。
换句话说,我需要的是对负值向零舍入、而不是向下舍入的操作:
/*
 * Shift `number` right by `shift` bits, rounding the result toward zero.
 *
 * The magnitude is shifted and the sign re-applied, so -8 >> 1 gives -4
 * and -1 >> 1 gives 0.  The magnitude is computed in unsigned arithmetic
 * so that INT_MIN, whose absolute value does not fit in an int, is handled
 * without overflow.
 *
 * `shift` must be smaller than the width of int; larger values are
 * undefined, as for the built-in >> operator.
 */
int rightshift(int number, unsigned int shift)
{
    if (number >= 0) {
        return number >> shift;
    }

    /* 0u - (unsigned)number is |number| and is exact even for INT_MIN. */
    const unsigned int magnitude = 0u - (unsigned int)number;
    const unsigned int shifted = magnitude >> shift;

    if (shifted == 0u) {
        return 0;
    }
    /* Negate as -(shifted - 1) - 1 so the intermediate always fits in int. */
    return -(int)(shifted - 1u) - 1;
}
我找不到合适的函数来以 SIMD 方式执行此操作。有没有一种方法可以在一个函数调用中做到这一点,或者可以使用一些技巧?
我认为不存在向零舍入行为的单指令移位。
但是,您可以通过几条移位和掩码指令相当简单地实现它。如果原数是负数,并且移位时有 'carry'(进位)移出(即被移出的低位中有任何一位是 1),那么我们只需要把结果加一。
我可以用下面的纯 C 代码来证明这一点:
#include <stddef.h>
#include <limits.h>
/*
 * Arithmetic right shift of `number` by `shift` bits that rounds toward
 * zero instead of toward negative infinity.
 *
 * For a negative input, (1 << shift) - 1 is added first; that bumps the
 * value past the next multiple of 2^shift exactly when any of the bits
 * about to be shifted out is set, which is the "+1" correction needed.
 * Non-negative inputs get a bias of zero.
 *
 * Relies on >> of a negative int being an arithmetic shift (true for the
 * compilers this snippet targets; implementation-defined in ISO C).
 * `shift` must be smaller than the width of int.  The int result is
 * converted to int16_t on return.
 */
int16_t rightshift(int number, unsigned int shift)
{
    static const size_t bits = sizeof number * CHAR_BIT;

    /* Built in unsigned so that shift == bits - 1 does not overflow. */
    const unsigned int low_bits = (1u << shift) - 1u;
    /* All ones when number is negative, zero otherwise. */
    const unsigned int sign_fill = (unsigned int)(number >> (bits - 1));

    number += (int)(low_bits & sign_fill);
    return number >> shift;
}
#include <stdio.h>
int main() {
    /* Print one row per input in [-16, 16]; columns are shifts 0..3. */
    int value = -16;
    while (value <= 16) {
        printf(" %3d: ", value);
        for (int amount = 0; amount != 4; ++amount) {
            printf("%4d", rightshift(value, amount));
        }
        puts("");
        ++value;
    }
}
这会编译成一段不错的无分支(branch-free)汇编代码,它看起来适合内联(尤其是当 shift
是编译时常量时):
rightshift:
@ Compiler output for the C rightshift() shown above: an arithmetic right
@ shift that rounds toward zero by biasing negative inputs before shifting.
@ Matching the C signature, r0 carries number on entry and the result on
@ return, and r1 carries shift.  r3 is the only scratch register used.
@ There are no branches other than the return.
@ args = 0, pretend = 0, frame = 0
@ frame_needed = 0, uses_anonymous_args = 0
@ link register save eliminated.
movs r3, #1
@ r3 = 1 << shift
lsls r3, r3, r1
@ r3 = (1 << shift) - 1: mask of the bits that the final shift discards
subs r3, r3, #1
@ r0 asr #31 is all ones for a negative input and zero otherwise, so the
@ mask survives only when number is negative
and r3, r3, r0, asr #31
@ add the bias (zero for non-negative inputs)
add r0, r0, r3
@ arithmetic shift right by shift
asrs r0, r0, r1
bx lr
为了针对 Neon,我又写了一个函数,用多个数据来测试它:
/*
 * Apply rightshift() with the same `shift` to each of the `count` elements
 * of `src`, writing the results to the corresponding slots of `dest`.
 * The two arrays are declared restrict, i.e. they must not overlap.
 */
void do_shift(int16_t *restrict dest, const int16_t *restrict src,
              size_t count, unsigned int shift)
{
    const int16_t *const src_end = src + count;

    while (src != src_end) {
        *dest++ = rightshift(*src++, shift);
    }
}
以及它的测试程序:
#include <stdio.h>
int main() {
    /* Inputs near the int16_t limits, around powers of two, and around zero. */
    static const int16_t src[] = {
        -32768, -32767, -32766, -32765, -32764,
        -16384, -16383, -16382, -16381, -16380,
        -8193, -8192, -8191, -8190, -8189,
        -16, -15, -14, -13, -12, -10, -9,
        -8, -7, -6, -5, -4, -3, -2, -1, 0,
        1, 2, 3, 4, 5, 6, 7, 8,
        9, 10, 11, 12, 13, 14, 15, 16,
        1023, 1024, 32767,
    };
    static const size_t count = sizeof src / sizeof *src;
    int16_t dest[16][count];

    /* dest[s] receives every input shifted right by s bits. */
    unsigned int amount = 0;
    while (amount < 16) {
        do_shift(dest[amount], src, count, amount);
        ++amount;
    }

    /* One output line per input value, one column per shift amount. */
    for (size_t idx = 0; idx != count; ++idx) {
        printf("%7d: ", src[idx]);
        for (int col = 0; col != 16; ++col) {
            printf("%7d", dest[col][idx]);
        }
        puts("");
    }
}
我用 gcc -O3 -march=armv7 -mfpu=neon
编译了这个。我承认我不熟悉 Neon 指令,但结果可能具有指导意义:
do_shift:
@ Compiler output for the C do_shift() shown above (auto-vectorised).
@ Register use, following the C signature and the code below:
@   r0 = dest, r1 = src, r2 = count, r3 = shift
@   r5 = (1 << shift) - 1, the bias added to negative inputs
@   r4 = number of leading elements handled one at a time; reused later as
@        the dest byte pointer in the vector loop and as the element index
@        in the scalar tail
@   r8 = number of leading elements that were handled before the vector loop
@ Layout: unrolled scalar prologue, NEON loop at .L6 (8 halfwords per
@ iteration), unrolled scalar tail at .L3, return at .L1.
@ args = 0, pretend = 0, frame = 0
@ frame_needed = 0, uses_anonymous_args = 0
@ count == 0: nothing to do, return without touching the stack
cmp r2, #0
beq .L21
push {r4, r5, r6, r7, r8, lr}
@ r4 = (-(bits [2:1] of the src address)) & 7: how many leading elements
@ are processed singly -- presumably so that the vector loads below start
@ at an 8-byte-aligned address
ubfx r4, r1, #1, #2
negs r4, r4
movs r5, #1
and r4, r4, #7
lsls r5, r5, r3
adds r7, r4, #7
subs r6, r2, #1
subs r5, r5, #1
@ if count - 1 < r4 + 7 (unsigned) there are too few elements for the
@ vector path: .L8 runs everything through the scalar tail
cmp r6, r7
sxth r5, r5
bcc .L8
cmp r4, #0
beq .L9
@ Scalar prologue.  Each group: load one halfword, add r5 if the value is
@ negative (asr #31 gives an all-ones mask), sign-extend to 16 bits, shift
@ right by r3 and store; branch to .L9 once r4 elements have been done.
ldrsh r7, [r1]
cmp r4, #1
and r6, r5, r7, asr #31
add r6, r6, r7
sxth r6, r6
asr r6, r6, r3
strh r6, [r0] @ movhi
beq .L9
ldrsh r7, [r1, #2]
cmp r4, #2
and r6, r5, r7, asr #31
add r6, r6, r7
sxth r6, r6
asr r6, r6, r3
strh r6, [r0, #2] @ movhi
beq .L9
ldrsh r7, [r1, #4]
cmp r4, #3
and r6, r5, r7, asr #31
add r6, r6, r7
sxth r6, r6
asr r6, r6, r3
strh r6, [r0, #4] @ movhi
beq .L9
ldrsh r7, [r1, #6]
cmp r4, #4
and r6, r5, r7, asr #31
add r6, r6, r7
sxth r6, r6
asr r6, r6, r3
strh r6, [r0, #6] @ movhi
beq .L9
ldrsh r7, [r1, #8]
cmp r4, #5
and r6, r5, r7, asr #31
add r6, r6, r7
sxth r6, r6
asr r6, r6, r3
strh r6, [r0, #8] @ movhi
beq .L9
@ Sixth element, with the seventh handled conditionally (IT blocks) when
@ r4 == 7; r8 records whether 6 or 7 leading elements were processed.
ldrsh r7, [r1, #10]
cmp r4, #7
ite eq
moveq r8, r4
movne r8, #6
and r6, r5, r7, asr #31
add r6, r6, r7
it eq
ldrsheq r7, [r1, #12]
sxth r6, r6
asr r6, r6, r3
strh r6, [r0, #10] @ movhi
itttt eq
andeq r6, r5, r7, asr #31
addeq r6, r6, r7
sxtheq r6, r6
asreq r6, r6, r3
it eq
strheq r6, [r0, #12] @ movhi
.L4:
@ Vector loop setup:
@   q10 = -shift in every 32-bit lane (vshl by a negative count shifts right)
@   q12 = bias r5 in every 16-bit lane
@   lr  = elements remaining, ip = lr / 8 = vector iterations
@   r6  = src byte pointer, r4 = dest byte pointer, r7 = iteration counter
vdup.32 q10, r3
sub lr, r2, r4
lsls r4, r4, #1
movs r7, #0
vneg.s32 q10, q10
adds r6, r1, r4
lsr ip, lr, #3
add r4, r4, r0
vdup.16 q12, r5
.L6:
adds r7, r7, #1
adds r6, r6, #16
@ load 8 halfwords into q9 (d18:d19)
vldr d18, [r6, #-16]
vldr d19, [r6, #-8]
cmp r7, ip
@ q8 = per-lane sign mask, then bias for negative lanes, then biased value
vshr.s16 q8, q9, #15
vand q8, q8, q12
vadd.i16 q8, q8, q9
@ widen both halves to 32 bits, shift by the per-lane count in q10,
@ narrow back to 16 bits into q11 (d22:d23) and store
vmovl.s16 q9, d16
vmovl.s16 q8, d17
vshl.s32 q9, q9, q10
vshl.s32 q8, q8, q10
vmovn.i32 d22, q9
vmovn.i32 d23, q8
vst1.16 {q11}, [r4]
add r4, r4, #16
bcc .L6
@ r6 = elements covered by the vector loop (lr rounded down to a multiple
@ of 8); if that is all of them, return; otherwise r4 = index of the first
@ element left for the scalar tail
bic r6, lr, #7
cmp lr, r6
add r4, r8, r6
beq .L1
.L3:
@ Scalar tail, unrolled for up to 14 elements starting at index r4.  Each
@ group does the same load/bias/sign-extend/shift/store as the prologue and
@ leaves via .L1 as soon as the next index reaches count (r2).
ldrsh ip, [r1, r4, lsl #1]
adds r7, r4, #1
cmp r2, r7
and r6, r5, ip, asr #31
add r6, r6, ip
sxth r6, r6
asr r6, r6, r3
strh r6, [r0, r4, lsl #1] @ movhi
bls .L1
ldrsh lr, [r1, r7, lsl #1]
add ip, r4, #2
cmp r2, ip
and r6, r5, lr, asr #31
add r6, r6, lr
sxth r6, r6
asr r6, r6, r3
strh r6, [r0, r7, lsl #1] @ movhi
bls .L1
ldrsh lr, [r1, ip, lsl #1]
adds r7, r4, #3
cmp r2, r7
and r6, r5, lr, asr #31
add r6, r6, lr
sxth r6, r6
asr r6, r6, r3
strh r6, [r0, ip, lsl #1] @ movhi
bls .L1
ldrsh lr, [r1, r7, lsl #1]
add ip, r4, #4
cmp r2, ip
and r6, r5, lr, asr #31
add r6, r6, lr
sxth r6, r6
asr r6, r6, r3
strh r6, [r0, r7, lsl #1] @ movhi
bls .L1
ldrsh lr, [r1, ip, lsl #1]
adds r7, r4, #5
cmp r2, r7
and r6, r5, lr, asr #31
add r6, r6, lr
sxth r6, r6
asr r6, r6, r3
strh r6, [r0, ip, lsl #1] @ movhi
bls .L1
ldrsh lr, [r1, r7, lsl #1]
add ip, r4, #6
cmp r2, ip
and r6, r5, lr, asr #31
add r6, r6, lr
sxth r6, r6
asr r6, r6, r3
strh r6, [r0, r7, lsl #1] @ movhi
bls .L1
ldrsh lr, [r1, ip, lsl #1]
adds r7, r4, #7
cmp r2, r7
and r6, r5, lr, asr #31
add r6, r6, lr
sxth r6, r6
asr r6, r6, r3
strh r6, [r0, ip, lsl #1] @ movhi
bls .L1
ldrsh lr, [r1, r7, lsl #1]
add ip, r4, #8
cmp r2, ip
and r6, r5, lr, asr #31
add r6, r6, lr
sxth r6, r6
asr r6, r6, r3
strh r6, [r0, r7, lsl #1] @ movhi
bls .L1
ldrsh lr, [r1, ip, lsl #1]
add r7, r4, #9
cmp r2, r7
and r6, r5, lr, asr #31
add r6, r6, lr
sxth r6, r6
asr r6, r6, r3
strh r6, [r0, ip, lsl #1] @ movhi
bls .L1
ldrsh lr, [r1, r7, lsl #1]
add ip, r4, #10
cmp r2, ip
and r6, r5, lr, asr #31
add r6, r6, lr
sxth r6, r6
asr r6, r6, r3
strh r6, [r0, r7, lsl #1] @ movhi
bls .L1
ldrsh lr, [r1, ip, lsl #1]
add r7, r4, #11
cmp r2, r7
and r6, r5, lr, asr #31
add r6, r6, lr
sxth r6, r6
asr r6, r6, r3
strh r6, [r0, ip, lsl #1] @ movhi
bls .L1
ldrsh lr, [r1, r7, lsl #1]
add ip, r4, #12
cmp r2, ip
and r6, r5, lr, asr #31
add r6, r6, lr
sxth r6, r6
asr r6, r6, r3
strh r6, [r0, r7, lsl #1] @ movhi
bls .L1
ldrsh r7, [r1, ip, lsl #1]
adds r4, r4, #13
cmp r2, r4
and r6, r5, r7, asr #31
add r6, r6, r7
sxth r6, r6
asr r6, r6, r3
strh r6, [r0, ip, lsl #1] @ movhi
bls .L1
@ last unrolled element; the argument registers are no longer needed and
@ are reused as scratch
ldrsh r1, [r1, r4, lsl #1]
and r2, r5, r1, asr #31
add r2, r2, r1
sxth r2, r2
asr r3, r2, r3
strh r3, [r0, r4, lsl #1] @ movhi
.L1:
@ restore the saved registers and return by popping into pc
pop {r4, r5, r6, r7, r8, pc}
.L9:
@ prologue finished after r4 elements: record that count and vectorise
mov r8, r4
b .L4
.L21:
@ count == 0 exit; nothing was pushed on this path
bx lr
.L8:
@ too few elements for the vector loop: run the scalar tail from index 0
movs r4, #0
b .L3
有很多循环展开使代码更长,但模式应该很清楚。