x86_64 内联汇编；将 64 位寄存器直接复制到 64 位内存位置

Question

我在运行下面的代码时遇到了两个问题：

1) 当我将 movl（从寄存器复制值）更改为 movq 时，我遇到了 gcc 错误：Error: operand size mismatch for movq。在普通的汇编中，我看到这可以通过添加 qword 前缀或类似的方式来实现，但这同样不能让 gcc 满意。

/*
 * Asker's first version (as posted): loads 0 into RAX, executes CPUID and
 * copies EAX/EBX/ECX/EDX through four int temporaries into the caller's
 * 64-bit slots.  Returns the value stored in *_rax.
 *
 * NOTE(review): the asm statement declares no clobber list, and the four
 * "=r" outputs are left to the compiler's register choice even though the
 * template itself overwrites RAX..RDX.  This is kept as the asker wrote it;
 * the answer further down explains why the clobbers matter.
 */
uint64_t cpuid_0(uint64_t* _rax, uint64_t* _rbx, uint64_t* _rcx, uint64_t* _rdx){

    int a, b, c, d;
    *_rax = 0x0;

    __asm__
    __volatile__
    (
        "movq $0,  %%rax\n"      /* RAX = 0 before CPUID */
        "cpuid\n"
        "movl %%eax, %0\n"       /* 32-bit moves: the temporaries are int */
        "movl %%ebx, %1\n"
        "movl %%ecx, %2\n"
        "movl %%edx, %3\n"
        : "=r" (a), "=r" (b), "=r" (c), "=r" (d)
        : "0" (a)                /* ties the (still unassigned) a to operand 0 */
    );
    /* Widen the int temporaries into the caller's uint64_t locations. */
    *_rax=a;*_rbx=b;*_rcx=c;*_rdx=d;
    return *_rax;
}

2) 我想消除额外的复制操作，所以我在约束规范中修改了我的代码：

/*
 * Asker's second version (as posted): tries to drop the int temporaries by
 * binding the caller's memory locations directly as asm operands.
 *
 * NOTE(review): this is the variant GCC rejects with the diagnostics quoted
 * right after it.  The operand list is kept as the asker wrote it so that
 * those diagnostics still apply.
 */
uint64_t cpuid_0(uint64_t* _rax, uint64_t* _rbx, uint64_t* _rcx, uint64_t* _rdx){

    int a, b, c, d;
    *_rax = 0x0;

    __asm__
    __volatile__
    (
         "movq $0,  %%rax\n"     /* RAX = 0 before CPUID */
         "cpuid\n"
         "movl %%eax, %0\n"
         "movl %%ebx, %1\n"
         "movl %%ecx, %2\n"
         "movl %%edx, %3\n"
        /* The last output binds the pointer variable _rdx itself, not *_rdx. */
        : "+m" (*_rax), "=m" (*_rbx), "=m" (*_rcx), "=m" (_rdx)
        /* Matching constraint "0" refers to the "+m" memory operand above. */
        : "0" (*_rax)
    );
    /* a, b, c and d are never assigned anywhere in this version. */
    *_rax=a;*_rbx=b;*_rcx=c;*_rdx=d;
    return *_rax;
}

这给了我很多错误，如下所示：

warning: matching constraint does not allow a register
error: inconsistent operand constraints in an ‘asm’

此外，我假设 __volatile__ 可以在这个小代码中删除。

Answer 1

问题是由输入 "0" (*_rax) 造成的……看起来匹配约束 "0" 不能用于 "=m" 内存约束，也不能用于 "+m"。（我不知道为什么。）

改变你的第二个函数来编译和工作：

/*
 * Load 0 into EAX, execute CPUID and store EAX, EBX, ECX and EDX straight
 * into the four caller-supplied 32-bit locations.
 *
 * Returns the value written to *_eax.
 */
uint32_t cpuid_0(uint32_t* _eax, uint32_t* _ebx, uint32_t* _ecx, uint32_t* _edx)
{
  __asm__
  (
    "mov $0,  %%eax\n"
    "cpuid\n"
    "mov %%eax, %0\n"
    "mov %%ebx, %1\n"
    "mov %%ecx, %2\n"
    "mov %%edx, %3\n"
    : "=m" (*_eax), "=m" (*_ebx), "=m" (*_ecx), "=m" (*_edx)
    : /* no inputs: a matching "0" (*_eax) is not required and is rejected */
    /* ESSENTIAL clobbers: without them the compiler does not know that the
       template overwrites these registers. */
    : "%rax", "%rbx", "%rcx", "%rdx"
  ) ;
  return *_eax ;
}

其中：

为了保持一致性，全部使用 uint32_t。
丢弃多余的int a, b, c, d;
省略了 "0" 输入，无论如何都没有使用。
为 (*_eax) 声明了简单的 "=m" 输出（而不是 "+m"）。
"clobbers" 所有“%rax”、“%rbx”、“%rcx”、“%rdx”
丢弃多余的volatile。

其中 "clobbers" 这一项是必要的，因为没有它，编译器就不知道这些寄存器会受到影响。

以上编译为：

   push   %rbx                 # compiler (now) knows %rbx is "clobbered"
   mov    %rdx,%r8             # likewise %rdx
   mov    %rcx,%r9             # ditto %rcx

     mov    $0x0,%eax          # the __asm__(....
     cpuid  
     mov    %eax,(%rdi)
     mov    %ebx,(%rsi)
     mov    %ecx,(%r8)
     mov    %edx,(%r9)         # ....) ;

   mov    (%rdi),%eax
   pop    %rbx
   retq

注意：没有 "clobbers" 编译为：

   mov    $0x0,%eax
   cpuid  
   mov    %eax,(%rdi)
   mov    %ebx,(%rsi)
   mov    %ecx,(%rdx)
   mov    %edx,(%rcx)
   mov    (%rdi),%eax
   retq

哪个更短，但遗憾的是不起作用！！

您还可以（版本 2）：

/* The four 32-bit register values produced by one CPUID invocation. */
struct cpuid
{
  uint32_t  eax ;
  uint32_t  ebx ;
  uint32_t  ecx ;
  uint32_t  edx ;
};

/*
 * Version 2: load 0 into EAX, execute CPUID and fill *cid.
 *
 * EAX comes back through the "=a" register output (operand 0) and is stored
 * by ordinary C code; EBX, ECX and EDX are written to the struct fields by
 * the template itself through the "=m" operands 1..3.
 *
 * Returns the EAX value, which is also stored in cid->eax.
 */
uint32_t cpuid_0(struct cpuid* cid)
{
  uint32_t eax ;

  __asm__
  (
    "mov $0,  %%eax\n"
    "cpuid\n"
    "mov %%ebx, %1\n"
    "mov %%ecx, %2\n"
    "mov %%edx, %3\n"
    : "=a" (eax), "=m" (cid->ebx), "=m" (cid->ecx), "=m" (cid->edx)
    /* No inputs.  EAX is an output, so only the other three registers the
       template overwrites are listed as clobbers. */
    :: "%ebx", "%ecx", "%edx"
  ) ;

  return cid->eax = eax ;
}

编译成非常短的东西：

   push   %rbx
   mov    $0x0,%eax
   cpuid  
   mov    %ebx,0x4(%rdi)
   mov    %ecx,0x8(%rdi)
   mov    %edx,0xc(%rdi)
   pop    %rbx
   mov    %eax,(%rdi)
   retq

或者您可以做一些更像您的第一个版本（第 3 版）的事情：

/*
 * Version 3: execute CPUID with EAX = 0 and copy the four result registers
 * into *cid.
 *
 * The template contains only the cpuid instruction; the operand constraints
 * tie each local to a specific register, so no explicit moves are written
 * in assembly.  Returns the EAX value, which is also stored in cid->eax.
 */
uint32_t cpuid_0(struct cpuid* cid)
{
  uint32_t reg_a = 0 ;              /* "+a": read and written by the asm */
  uint32_t reg_b, reg_c, reg_d ;    /* "=b", "=c", "=d": written only    */

  __asm__(" cpuid\n"
          : "+a" (reg_a), "=b" (reg_b), "=c" (reg_c), "=d" (reg_d));

  cid->edx = reg_d ;
  cid->ecx = reg_c ;
  cid->ebx = reg_b ;
  cid->eax = reg_a ;
  return reg_a ;
}

编译为：

   push   %rbx
   xor    %eax,%eax
   cpuid  
   mov    %ebx,0x4(%rdi)
   mov    %edx,0xc(%rdi)
   pop    %rbx
   mov    %ecx,0x8(%rdi)
   mov    %eax,(%rdi)
   retq

这个版本使用 "+a"、"=b" 等魔法来告诉编译器将特定的寄存器分配给各个变量。这将汇编代码的数量减少到最低限度，这通常是一件好事。 [注意，编译器知道 xor %eax,%eax 比 mov $0,%eax 更好（也更短），并且认为早一点执行 pop %rbx 有一些优势。]

更好的是——在@Peter Cordes（第 4 版）的评论之后：

/*
 * Version 4: zero EAX inside the template, execute CPUID and let the
 * register-constrained outputs ("=a", "=b", "=c", "=d") deliver the results
 * directly into the fields of *cid, with no intermediate locals.
 *
 * Returns cid->eax as written by the asm statement.
 */
uint32_t cpuid_1(struct cpuid* cid)
{
  __asm__
  (
    /* EAX is cleared here, so the statement needs no input operand. */
    "xor %%eax, %%eax\n"
    "cpuid\n"
    : "=a" (cid->eax), "=b" (cid->ebx), "=c" (cid->ecx), "=d" (cid->edx)
  ) ;

  return cid->eax ;
}

编译器发现 cid->eax 已经在 %eax 中，因此编译为：

   push   %rbx
   xor    %eax,%eax
   cpuid  
   mov    %ebx,0x4(%rdi)
   mov    %eax,(%rdi)
   pop    %rbx
   mov    %ecx,0x8(%rdi)
   mov    %edx,0xc(%rdi)
   retq

与版本 3 相同，只是指令的顺序略有不同。

FWIW：__asm__() 定义为：

asm asm-qualifiers ( AssemblerTemplate : OutputOperands [ : InputOperands [ : Clobbers ] ])

内联汇编的关键是理解编译器：

不知道 AssemblerTemplate 部分的含义。

它确实会展开 %xx 占位符，但除此之外什么都不理解。
确实理解 OutputOperands、InputOperands（如果有）和 Clobbers（如果有）...

...这些告诉编译器汇编程序需要什么作为参数，以及如何展开各种%xx.

...但是这些也以编译器能够理解的方式告诉它 AssemblerTemplate 做了什么。

所以，编译器理解的是一种 "data flow"。它知道汇编代码需要一些输入，返回一些输出，并且（可能）作为副作用 "clobber" 一些寄存器和/或一部分内存。有了这些信息，编译器就可以将 "black box" 汇编代码序列与围绕它生成的代码集成在一起。除其他事项外，编译器将：

为输出和输入操作数分配寄存器

并安排输入到所需的寄存器中（根据需要）。

注意：编译器将汇编器视为单个操作，在生成任何输出之前消耗所有输入。如果在 __asm__() 之后没有使用输入，编译器可以将给定的寄存器分配为输入和输出。因此需要所谓的"early clobber".
围绕周围的代码移动 "black box"，保持汇编器对其输入源的依赖性以及后面的代码对汇编器输出的依赖性。
如果似乎没有什么依赖于其输出，则完全丢弃 "black box"！

x86_64 内联汇编；将 64 位寄存器直接复制到 64 位内存位置

x86_64 Inline Assembly ; Copying 64-bit register directly to 64-bit memory location

assembly

gcc

x86-64

inline-assembly