AVX-512 VORPS 中 writemask k1 的用法?

The usage of writemask k1 in AVX-512 VORPS?

我正在研究 AVX-512。我对 VORPS 有疑问。

文档是这样说的:

EVEX.512.0F.W0 56 /r VORPS zmm1 {k1}{z}, zmm2, zmm3/m512/m32bcst

返回 zmm2 与 zmm3/m512/m32bcst 中打包(packed)单精度浮点值的按位逻辑或,结果受写掩码 k1 控制。

EVEX 编码版本:第一个源操作数是 ZMM/YMM/XMM 寄存器。第二个源操作数可以是 ZMM/YMM/XMM 寄存器、512/256/128 位内存位置或从 32 位内存位置广播的 512/256/128 位向量。目标操作数是一个 ZMM/YMM/XMM 寄存器,有条件地用写掩码 k1 更新。

参考:https://www.felixcloutier.com/x86/orps


“subject to writemask k1”是什么意思?

有人能给出一个具体示例,说明 k1 在这条指令中所起的作用吗?

我写这段代码是为了做一些关于 VORPS 的实验:https://godbolt.org/z/fMcqoa

代码

#include <stdio.h>
#include <stddef.h>
#include <stdint.h>

/*
 * Experiment: OR two 64-byte vectors with VORPS and dump both inputs and
 * the result as hex bytes.
 *
 * The first asm statement carves a 64-byte-aligned, 0x100-byte scratch
 * area out of the stack, fills bytes [0x00, 0x40) with 0x01 and bytes
 * [0x40, 0x80) with 0x02, ORs them as ZMM vectors and stores the result
 * at [0x80, 0xc0). The address of the scratch area is returned in
 * st_data (pinned to rbx).
 *
 * Note that the vorps below names no mask register, so the values loaded
 * into k0/k1/k2 do not influence the result.
 *
 * The stack pointer is modified inside the asm and never restored; the
 * program leaves through a raw exit syscall instead of returning from
 * main (presumably for that reason).
 */
int main()
{
  register uint8_t *st_data asm("rbx");
  asm volatile(
    // Fix stack alignment (64 bytes, required by the aligned vmovdqa64)
    "andq   $~0x3f, %%rsp\n\t"

    // Allocate stack
    "subq   $0x100, %%rsp\n\t"

    // Take stack pointer, save it to st_data
    "movq   %%rsp, %[st_data]\n\t"

    // Fill 64 bytes top of stack with 0x01
    "movq   %%rsp, %%rdi\n\t"
    "movl   $0x40, %%ecx\n\t"
    "movl   $0x1, %%eax\n\t"
    "rep    stosb\n\t"

    // Fill 64 bytes next with 0x02
    "incl   %%eax\n\t"
    "leaq   0x40(%%rsp), %%rdi\n\t"
    "movl   $0x40, %%ecx\n\t"
    "rep    stosb\n\t"

    // Take 0x1 and 0x2 to ZMM register
    "vmovdqa64  (%%rsp), %%zmm0\n\t"
    "vmovdqa64  0x40(%%rsp), %%zmm1\n\t"

    // Set write mask
    "movq   $0x123456, %%rax\n\t"
    "kmovq  %%rax, %%k0\n\t"
    "kmovq  %%rax, %%k1\n\t"
    "kmovq  %%rax, %%k2\n\t"

    // Execute vorps, store the result to ZMM2
    // (no {%k} operand here, so the instruction is not masked)
    "vorps  %%zmm0, %%zmm1, %%zmm2\n\t"

    // Plug back the result to memory
    "vmovdqa64  %%zmm2, 0x80(%%rsp)\n\t"
    "vzeroupper"
    : [st_data]"=r"(st_data)
    :
    // k0-k2 are overwritten by the kmovq instructions above, so they
    // are listed alongside the general-purpose and vector registers.
    : "rax", "rcx", "rdi", "zmm0", "zmm1",
      "zmm2", "k0", "k1", "k2", "memory", "cc"
  );

  static const char *x[] = {
    "Data 1:", "Data 2:", "Result:"
  };

  // Each of the three regions is 64 bytes, printed as 8 rows of 8 bytes.
  for (size_t i = 0; i < 3; i++) {
    printf("%s\n", x[i]);
    for (size_t j = 0; j < 8; j++) {
      for (size_t k = 0; k < 8; k ++) {
        printf("%02x ", *st_data++);
      }
      printf("\n");
    }
    printf("\n");
  }

  // Flush explicitly: the raw syscall below bypasses libc's exit-time flush.
  fflush(stdout);

  asm volatile(
    // sys_exit (x86-64 Linux syscall number 0x3c), exit status 0
    "movl   $0x3c, %eax\n\t"
    "xorl   %edi, %edi\n\t"
    "syscall"
  );
}

这里,我尝试改变了k0,k1,k2的值。但是结果总是一样的。

Result:
03 03 03 03 03 03 03 03 
03 03 03 03 03 03 03 03 
03 03 03 03 03 03 03 03 
03 03 03 03 03 03 03 03 
03 03 03 03 03 03 03 03 
03 03 03 03 03 03 03 03 
03 03 03 03 03 03 03 03 
03 03 03 03 03 03 03 03

掩码寄存器之所以没有影响结果,是因为我没有在 vorps 的目标操作数上编码(指定)掩码寄存器。

在 AT&T 语法中,用法类似于:

# Without z-bit (merge-masking)
vorps %zmm0, %zmm1, %zmm2 {%k1}

# With z-bit (zero-masking)
vorps %zmm0, %zmm1, %zmm2 {%k1}{z}

在 GCC 内联 asm 中,{} 必须像这样转义:

# Without z-bit
vorps %%zmm0, %%zmm1, %%zmm2 %{%%k1%}

# With z-bit
vorps %%zmm0, %%zmm1, %%zmm2 %{%%k1%}%{z%}

在这种情况下,可以使用 z 位(zero-masking)把掩码位为 0 的目标元素清零;不使用 z 位(merge-masking)时,这些元素保持原值不变。

带 z 位

例如,如果在 vorps 操作之前 zmm2 的值是:

ff ff ff ff ff ff ff ff 
ff ff ff ff ff ff ff ff 
ff ff ff ff ff ff ff ff 
ff ff ff ff ff ff ff ff 
ff ff ff ff ff ff ff ff 
ff ff ff ff ff ff ff ff 
ff ff ff ff ff ff ff ff 
ff ff ff ff ff ff ff ff 

zmm0 和 zmm1 的值与上面问题中的情况相同。

在这些指令之后:

    // Set write mask: only the low 8 bits are set, so only the low
    // 8 dword elements (32 bytes) of the destination receive the OR result
    "movq   $0b11111111, %%rax\n\t"
    "kmovq  %%rax, %%k1\n\t"

    // Execute vorps with zero-masking, store the result to ZMM2
    "vorps  %%zmm0, %%zmm1, %%zmm2 %{%%k1%}%{z%}\n\t"

    // Plug back the result to memory
    "vmovdqa64  %%zmm2, 0x80(%[buf])\n\t"

那么结果就是:

03 03 03 03 03 03 03 03 
03 03 03 03 03 03 03 03 
03 03 03 03 03 03 03 03 
03 03 03 03 03 03 03 03 
00 00 00 00 00 00 00 00 
00 00 00 00 00 00 00 00 
00 00 00 00 00 00 00 00 
00 00 00 00 00 00 00 00 

如果没有 z 位,结果将是

03 03 03 03 03 03 03 03 
03 03 03 03 03 03 03 03 
03 03 03 03 03 03 03 03 
03 03 03 03 03 03 03 03 
ff ff ff ff ff ff ff ff 
ff ff ff ff ff ff ff ff 
ff ff ff ff ff ff ff ff 
ff ff ff ff ff ff ff ff 

代码示例

Godbolt 链接:https://godbolt.org/z/4rq5M8

#include <stdio.h>
#include <stddef.h>
#include <stdint.h>
#include <stdalign.h>

/*
 * Demonstrates AVX-512 write-masking with VORPS.
 *
 * ZMM2 is pre-filled with 0xff bytes, ZMM0 with 0x01 bytes and ZMM1 with
 * 0x02 bytes. A masked vorps then ORs ZMM0 and ZMM1 into ZMM2 under k1,
 * whose low 8 bits are set. The two inputs and the result are stored to
 * three consecutive 64-byte slices of buf and printed as hex bytes.
 *
 * With merge-masking (the active variant) the elements whose mask bit is
 * clear keep their previous 0xff contents; the commented-out variant with
 * {z} would zero them instead.
 */
int main()
{
  alignas(64) uint8_t buf[0x100];
  uint8_t *st_data = buf;

  asm(
    // Fill ZMM2 with 0xff garbage.
    "vpternlogd $0xff, %%zmm2, %%zmm2, %%zmm2\n\t"

    // Fill ZMM0 with 0x01
    "movl   $0x01010101, %%eax\n\t"
    "vpbroadcastd %%eax, %%zmm0\n\t"

    // Fill ZMM1 with 0x02
    "movl   $0x02020202, %%eax\n\t"
    "vpbroadcastd %%eax, %%zmm1\n\t"

    // Plug ZMM0 and ZMM1 value to memory to print later
    "vmovdqa64  %%zmm0, %[buf_0x00]\n\t"
    "vmovdqa64  %%zmm1, %[buf_0x40]\n\t"

    // Set write mask (low 8 bits set; movl zero-extends into rax)
    "movl   $0b11111111, %%eax\n\t"
    "kmovq  %%rax, %%k1\n\t"

    // vorps without z-bit (merge into ZMM2)
    "vorps  %%zmm0, %%zmm1, %%zmm2 %{%%k1%}\n\t"

    // // vorps with z-bit (zero-mask, overwrite ZMM2)
    // "vorps   %%zmm0, %%zmm1, %%zmm2 %{%%k1%}%{z%}\n\t"

    // Plug the result to memory
    "vmovdqa64  %%zmm2, %[buf_0x80]\n\t"

#ifndef __AVX__
    /*
     * Note:
     * If we pass -mavx or -mavx2 or -mavx512* and then we clobber
     * AVX register(s) with inline assembly, then the compiler will
     * yield "vzeroupper" after the inline assembly.
     *
     * So we should only put vzeroupper when there is no AVX flag
     * to prevent duplicate vzeroupper.
     */
    "vzeroupper"
#endif

    : [buf_0x00]"=m"(*(uint8_t (*)[0x40])(buf + 0x00)),
      [buf_0x40]"=m"(*(uint8_t (*)[0x40])(buf + 0x40)),
      [buf_0x80]"=m"(*(uint8_t (*)[0x40])(buf + 0x80))
      /*
       * Yes, it is all `*(uint8_t (*)[0x40])`, meaning we
       * are going to write 0x40 bytes for each constraint.
       */
    :
    : "rax", "zmm0", "zmm1", "zmm2", "k1"
  );

  static const char *x[] = {
    "Data 1:", "Data 2:", "Result:"
  };

  // Each of the three slices is 64 bytes, printed as 8 rows of 8 bytes.
  for (size_t i = 0; i < 3; i++) {
    printf("%s\n", x[i]);
    for (size_t j = 0; j < 8; j++) {
      for (size_t k = 0; k < 8; k ++) {
        printf("%02x ", *st_data++);
      }
      printf("\n");
    }
    printf("\n");
  }
  return 0;
}