从类型“int”分配给类型“__m256d”时不兼容的类型
incompatible types when assigning to type ‘__m256d’ from type ‘int’
我正在做一个优化矩阵乘法的项目,并尝试使用内在函数(intrinsics)。
这是我正在使用的一些代码:
#include <immintrin.h>
/*
 * Vector tiling and loop unrolling.
 *
 * Accumulates products of A and B on top of values loaded from C, working on
 * one tile of 4 rows by 12 columns of C at a time with 256-bit vectors of
 * four doubles. All three matrices are indexed as row * lda + column.
 *
 * lda:  row stride, in elements, used for A, B and C
 * M:    row count; only the M / 4 complete groups of 4 rows are visited
 * N:    column count; only the N / 12 complete groups of 12 columns are visited
 * K:    length of the inner (summation) dimension
 * A, B: input matrices (only read in the lines shown)
 * C:    matrix whose tiles provide the initial accumulator values
 *
 * NOTE: this is an excerpt. In the lines shown the accumulators are never
 * stored back to C, and the function's closing brace is not included.
 */
static void do_block(int lda, int M, int N, int K, double* A, double* B, double* C) {
/* For each group of 4 rows of A (i indexes the group) */
int i, j, k;
for (i = 0; i < M / 4; ++i) {
/* For each group of 12 columns of B (j indexes the group) */
for (j = 0; j < N / 12; ++j) {
/* Load the 4x12 tile of C: c_R0_R3_V holds tile row R, columns 4*V .. 4*V+3 */
register __m256d c_00_03_0 = _mm256_loadu_pd(C + (4 * i) * lda + j * 12);
register __m256d c_00_03_1 = _mm256_loadu_pd(C + (4 * i) * lda + j * 12 + 4);
register __m256d c_00_03_2 = _mm256_loadu_pd(C + (4 * i) * lda + j * 12 + 8);
register __m256d c_10_13_0 = _mm256_loadu_pd(C + (4 * i + 1) * lda + j * 12);
register __m256d c_10_13_1 = _mm256_loadu_pd(C + (4 * i + 1) * lda + j * 12+4);
register __m256d c_10_13_2 = _mm256_loadu_pd(C + (4 * i + 1) * lda + j * 12+8);
register __m256d c_20_23_0 = _mm256_loadu_pd(C + (4 * i + 2) * lda + j * 12);
register __m256d c_20_23_1 = _mm256_loadu_pd(C + (4 * i + 2) * lda + j * 12+4);
register __m256d c_20_23_2 = _mm256_loadu_pd(C + (4 * i + 2) * lda + j * 12+8);
register __m256d c_30_33_0 = _mm256_loadu_pd(C + (4 * i + 3) * lda + j * 12);
register __m256d c_30_33_1 = _mm256_loadu_pd(C + (4 * i + 3) * lda + j * 12+4);
register __m256d c_30_33_2 = _mm256_loadu_pd(C + (4 * i + 3) * lda + j * 12+8);
/* Walk the inner dimension; the 4 rows x 3 vectors of the tile are unrolled by hand */
for (k = 0; k < K; k += 1) {
/* 12 consecutive doubles of row k of B, as three vectors */
register __m256d b_00_03 = _mm256_loadu_pd(B + k * lda + j * 12);
register __m256d b_10_03 = _mm256_loadu_pd(B + k * lda + j * 12+4);
register __m256d b_20_03 = _mm256_loadu_pd(B + k * lda + j * 12+8);
/* Tile row 0: broadcast the element of A at row 4*i, column k to all lanes, then c = a * b + c */
register __m256d a00 = _mm256_broadcast_sd(A + (4 * i) * lda + k);
c_00_03_0 = _mm256_fmadd_pd(a00, b_00_03, c_00_03_0);
c_00_03_1 = _mm256_fmadd_pd(a00, b_10_03, c_00_03_1);
c_00_03_2 = _mm256_fmadd_pd(a00, b_20_03, c_00_03_2);
/* Tile row 1 (a00 is reused for the broadcast element of each row in turn) */
a00 = _mm256_broadcast_sd(A + (4 * i + 1) * lda + k);
c_10_13_0 = _mm256_fmadd_pd(a00, b_00_03, c_10_13_0);
c_10_13_1 = _mm256_fmadd_pd(a00, b_10_03, c_10_13_1);
c_10_13_2 = _mm256_fmadd_pd(a00, b_20_03, c_10_13_2);
/* Tile row 2 */
a00 = _mm256_broadcast_sd(A + (4 * i + 2) * lda + k);
c_20_23_0 = _mm256_fmadd_pd(a00, b_00_03, c_20_23_0);
c_20_23_1 = _mm256_fmadd_pd(a00, b_10_03, c_20_23_1);
c_20_23_2 = _mm256_fmadd_pd(a00, b_20_03, c_20_23_2);
/* Tile row 3 */
a00 = _mm256_broadcast_sd(A + (4 * i + 3) * lda + k);
c_30_33_0 = _mm256_fmadd_pd(a00, b_00_03, c_30_33_0);
c_30_33_1 = _mm256_fmadd_pd(a00, b_10_03, c_30_33_1);
c_30_33_2 = _mm256_fmadd_pd(a00, b_20_03, c_30_33_2);
}
}
}
所有使用函数 _mm256_fmadd_pd 的行都会抛出以下错误:
<b>从类型“int”分配给类型“__m256d”时不兼容的类型</b>
我不确定 int 是从哪里来的,因为 _mm256_fmadd_pd 的所有参数都是“__m256d”,并且返回值也是“__m256d”(__m256d _mm256_fmadd_pd (__m256d a, __m256d b, __m256d c))。
正如 chtz 所提到的,我使用的处理器不支持 _mm256_fmadd_pd 所需的 FMA 指令集。(如果编译时没有启用 FMA,例如未加 -mfma,该内在函数可能不会被声明,C 编译器会把它当作隐式声明的、返回 int 的函数,错误信息中的 int 很可能就是这样来的。)所以我使用了他提出的替代写法,运行正常:
_mm256_add_pd(_mm256_mul_pd(aXX, bYY), cZZ)
我正在做一个优化矩阵乘法的项目,并尝试使用内在函数(intrinsics)。
这是我正在使用的一些代码:
#include <immintrin.h>
/*
 * Vector tiling and loop unrolling.
 *
 * Accumulates products of A and B on top of values loaded from C, working on
 * one tile of 4 rows by 12 columns of C at a time with 256-bit vectors of
 * four doubles. All three matrices are indexed as row * lda + column.
 *
 * lda:  row stride, in elements, used for A, B and C
 * M:    row count; only the M / 4 complete groups of 4 rows are visited
 * N:    column count; only the N / 12 complete groups of 12 columns are visited
 * K:    length of the inner (summation) dimension
 * A, B: input matrices (only read in the lines shown)
 * C:    matrix whose tiles provide the initial accumulator values
 *
 * NOTE: this is an excerpt. In the lines shown the accumulators are never
 * stored back to C, and the function's closing brace is not included.
 */
static void do_block(int lda, int M, int N, int K, double* A, double* B, double* C) {
/* For each group of 4 rows of A (i indexes the group) */
int i, j, k;
for (i = 0; i < M / 4; ++i) {
/* For each group of 12 columns of B (j indexes the group) */
for (j = 0; j < N / 12; ++j) {
/* Load the 4x12 tile of C: c_R0_R3_V holds tile row R, columns 4*V .. 4*V+3 */
register __m256d c_00_03_0 = _mm256_loadu_pd(C + (4 * i) * lda + j * 12);
register __m256d c_00_03_1 = _mm256_loadu_pd(C + (4 * i) * lda + j * 12 + 4);
register __m256d c_00_03_2 = _mm256_loadu_pd(C + (4 * i) * lda + j * 12 + 8);
register __m256d c_10_13_0 = _mm256_loadu_pd(C + (4 * i + 1) * lda + j * 12);
register __m256d c_10_13_1 = _mm256_loadu_pd(C + (4 * i + 1) * lda + j * 12+4);
register __m256d c_10_13_2 = _mm256_loadu_pd(C + (4 * i + 1) * lda + j * 12+8);
register __m256d c_20_23_0 = _mm256_loadu_pd(C + (4 * i + 2) * lda + j * 12);
register __m256d c_20_23_1 = _mm256_loadu_pd(C + (4 * i + 2) * lda + j * 12+4);
register __m256d c_20_23_2 = _mm256_loadu_pd(C + (4 * i + 2) * lda + j * 12+8);
register __m256d c_30_33_0 = _mm256_loadu_pd(C + (4 * i + 3) * lda + j * 12);
register __m256d c_30_33_1 = _mm256_loadu_pd(C + (4 * i + 3) * lda + j * 12+4);
register __m256d c_30_33_2 = _mm256_loadu_pd(C + (4 * i + 3) * lda + j * 12+8);
/* Walk the inner dimension; the 4 rows x 3 vectors of the tile are unrolled by hand */
for (k = 0; k < K; k += 1) {
/* 12 consecutive doubles of row k of B, as three vectors */
register __m256d b_00_03 = _mm256_loadu_pd(B + k * lda + j * 12);
register __m256d b_10_03 = _mm256_loadu_pd(B + k * lda + j * 12+4);
register __m256d b_20_03 = _mm256_loadu_pd(B + k * lda + j * 12+8);
/* Tile row 0: broadcast the element of A at row 4*i, column k to all lanes, then c = a * b + c */
register __m256d a00 = _mm256_broadcast_sd(A + (4 * i) * lda + k);
c_00_03_0 = _mm256_fmadd_pd(a00, b_00_03, c_00_03_0);
c_00_03_1 = _mm256_fmadd_pd(a00, b_10_03, c_00_03_1);
c_00_03_2 = _mm256_fmadd_pd(a00, b_20_03, c_00_03_2);
/* Tile row 1 (a00 is reused for the broadcast element of each row in turn) */
a00 = _mm256_broadcast_sd(A + (4 * i + 1) * lda + k);
c_10_13_0 = _mm256_fmadd_pd(a00, b_00_03, c_10_13_0);
c_10_13_1 = _mm256_fmadd_pd(a00, b_10_03, c_10_13_1);
c_10_13_2 = _mm256_fmadd_pd(a00, b_20_03, c_10_13_2);
/* Tile row 2 */
a00 = _mm256_broadcast_sd(A + (4 * i + 2) * lda + k);
c_20_23_0 = _mm256_fmadd_pd(a00, b_00_03, c_20_23_0);
c_20_23_1 = _mm256_fmadd_pd(a00, b_10_03, c_20_23_1);
c_20_23_2 = _mm256_fmadd_pd(a00, b_20_03, c_20_23_2);
/* Tile row 3 */
a00 = _mm256_broadcast_sd(A + (4 * i + 3) * lda + k);
c_30_33_0 = _mm256_fmadd_pd(a00, b_00_03, c_30_33_0);
c_30_33_1 = _mm256_fmadd_pd(a00, b_10_03, c_30_33_1);
c_30_33_2 = _mm256_fmadd_pd(a00, b_20_03, c_30_33_2);
}
}
}
所有使用函数 _mm256_fmadd_pd 的行都会抛出以下错误:
<b>从类型“int”分配给类型“__m256d”时不兼容的类型</b>
我不确定 int 是从哪里来的,因为 _mm256_fmadd_pd 的所有参数都是“__m256d”,并且返回值也是“__m256d”(__m256d _mm256_fmadd_pd (__m256d a, __m256d b, __m256d c))。
正如 chtz 所提到的,我使用的处理器不支持 _mm256_fmadd_pd 所需的 FMA 指令集。(如果编译时没有启用 FMA,例如未加 -mfma,该内在函数可能不会被声明,C 编译器会把它当作隐式声明的、返回 int 的函数,错误信息中的 int 很可能就是这样来的。)所以我使用了他提出的替代写法,运行正常:
_mm256_add_pd(_mm256_mul_pd(aXX, bYY), cZZ)