NEON Cortex-A8 的内联汇编示例

In-Assembly example for NEON Cortex-A8

我正在尝试为 Cortex-A8 的 NEON 编译一段示例汇编代码,以便在 BeagleBone Black (BBB) 开发板上运行生成的二进制文件。我在 Eclipse 中使用下面列出的 GCC 编译器和汇编器:

GCC : arm-linux-gnueabi-gcc

ASSEMBLER : arm-linux-gnueabi-as

编译示例时出现以下错误;我试过的每个示例都会出现类似的错误:

Description Path    Resource    Location    Type
SP not allowed in register list -- `ldmia r12,{r4-r11,r13,lr}'      EXAMPLE_NEON    line 61, external location: /tmp/ccTXrczs.s C/C++ Problem

我使用的代码

/************************
* neon.c * ***************************/

#include <stdio.h>


/*
 * Input and output vectors for neontest().  The NEON loads and the store in
 * neontest() address these arrays with a 128-bit alignment qualifier
 * ([rN:128]), so every one of them has to be 16-byte aligned.  An attribute
 * written in front of a declaration applies to that declaration only, so it
 * is repeated for each array.
 */
__attribute__((aligned (16))) unsigned short int data1[8];
__attribute__((aligned (16))) unsigned short int data2[8];
__attribute__((aligned (16))) unsigned short int out[8];

/* Scratch area in which neontest() saves r4-r11, r13 and lr (10 slots used). */
void* neontest_save_buffer[16];


void
neontest(unsigned short int *a, unsigned short int *b,
                unsigned short int* q)
{
  __asm__(
"   movw        r12, #:lower16:neontest_save_buffer\n\t"
"   movt        r12, #:upper16:neontest_save_buffer\n\t"
"   stmia       r12, {r4-r11, r13, lr}        @ save registers\n\t"
"   vld1.16     {q1}, [r0:128]\n\t"
"   vld1.16     {q2}, [r1:128]\n\t"
"   vadd.i16    q0, q1, q2\n\t"
"   vst1.32     {q0}, [r2:128]\n\t"
"   movw        r12, #:lower16:neontest_save_buffer\n\t"
"   movt        r12, #:upper16:neontest_save_buffer\n\t"
"   ldmia       r12, {r4-r11, r13, lr}        @ reload all registers and return\n\t"
"finish:\n\t"
    );

}

/*
 * Fill the two input vectors, run neontest() once and print the eight
 * results on a single comma-separated line.
 */
int
main(void)
{
    int idx = 0;

    /* data1 = 0, 10, 20, ...; data2 = all fives; out cleared. */
    while (idx < 8) {
        data1[idx] = idx * 10;
        data2[idx] = 5;
        out[idx] = 0;
        ++idx;
    }

    neontest(data1, data2, out);

    printf("output is: ");
    /* The first seven values get a trailing ", "; the last one ends the line. */
    for (idx = 0; idx < 7; ++idx) {
        printf("%d, ", out[idx]);
    }
    printf("%d\n", out[idx]);

    return 0;
}

您似乎在使用 Thumb-2(32 位 Thumb)模式,在该模式下 SP 不能出现在寄存器列表中(来自 [1])。

如果您不打算设置新的堆栈,为什么还需要保存堆栈指针?试着从 stmia 和 ldmia 的寄存器列表中去掉 r13 即可。

编译器默认使用 Thumb 模式;在命令行中添加 “-marm” 即可以 ARM 模式编译代码:

arm-linux-gnueabihf-gcc -mcpu=cortex-a8 -mfpu=neon -marm neon.c

你也可以调整代码,让编译器负责寄存器的保存和恢复,这样代码就既可以编译为 ARM 指令集,也可以编译为 Thumb-2 指令集:

#include <stdio.h>


/*
 * Input and output vectors for neontest().  All three are accessed with a
 * 128-bit alignment qualifier ([...:128]) by the NEON load/store
 * instructions, so each one has to be 16-byte aligned.  An attribute written
 * in front of a declaration applies to that declaration only, so it is
 * repeated for each array.
 */
__attribute__((aligned (16))) unsigned short int data1[8];
__attribute__((aligned (16))) unsigned short int data2[8];
__attribute__((aligned (16))) unsigned short int out[8];

void
neontest(unsigned short int *a, unsigned short int *b,
                unsigned short int* q)
{
    __asm volatile (
"   vld1.16     {q1}, [%[a]:128]\n\t"
"   vld1.16     {q2}, [%[b]:128]\n\t"
"   vadd.i16    q0, q1, q2\n\t"
"   vst1.32     {q0}, [%[q]:128]\n\t"

     : [q] "+r" (q)
     : [a] "r" (a), [b] "r" (b)
     : "q0", "q1", "q2"
    );
}

/*
 * Prepare the input vectors, call neontest() once and print the eight
 * results separated by ", " on one line.
 */
int
main(void)
{
    enum { LANES = 8, LAST = LANES - 1 };
    int k;

    /* Inputs: data1 counts up in tens, data2 is constant 5; out starts at 0. */
    for (k = 0; k != LANES; k++) {
        out[k] = 0;
        data2[k] = 5;
        data1[k] = k * 10;
    }

    neontest(data1, data2, out);

    printf("output is: ");
    k = 0;
    while (k < LAST) {
        printf("%d, ", out[k]);
        k++;
    }
    /* k == LAST here: the final value is followed by a newline instead. */
    printf("%d\n", out[k]);

    return 0;
}

arm-linux-gnueabihf-gcc -mcpu=cortex-a8 -mfpu=neon -marm neon2.c

arm-linux-gnueabihf-gcc -mcpu=cortex-a8 -mfpu=neon -mthumb neon2.c