以下 SSE2 代码如何读取数据
How does the following SSE2 code read data?
我发现了以下用于计算 2x2 矩阵乘法的 SSE2 代码。谁能解释一下这段代码是如何执行的?浏览代码时,我觉得它只是把值累加到 C(2x2)矩阵的两个位置(C[0] 和 C[3])。lda 是大矩阵的大小,A、B、C 是 2x2 矩阵。
/*
 * Accumulate the product of two 2x2 blocks into a third: C += A * B.
 *
 * The index arithmetic treats every operand as a column-major block whose
 * consecutive columns are `lda` doubles apart, i.e. element (r, c) lives at
 * base[r + c*lda].
 *
 * lda  distance, in doubles, between consecutive columns of A, B and C
 * A    left operand; A + i*lda must be 16-byte aligned for i = 0, 1
 *      because the columns are read with _mm_load_pd
 * B    right operand; read one scalar at a time, no alignment requirement
 * C    accumulator, read and written in place with unaligned accesses
 */
static void simd_2x2(int lda, double* A, double* B, double* C)
{
    /* c1 / c2 hold column 0 / column 1 of the C block (two doubles each). */
    __m128d c1 = _mm_loadu_pd(C + 0 * lda);
    __m128d c2 = _mm_loadu_pd(C + 1 * lda);

    for (int i = 0; i < 2; ++i) {
        /* i-th column of A (aligned load). */
        __m128d a = _mm_load_pd(A + i * lda);

        /* B(i,0) and B(i,1), each broadcast to both vector lanes. */
        __m128d b1 = _mm_load1_pd(B + i + 0 * lda);
        __m128d b2 = _mm_load1_pd(B + i + 1 * lda);

        /*
         * Rank-1 update: column j of C gains A(:,i) * B(i,j).
         * Column 0 must use b1; using b2 for both columns would make
         * column 0 a copy of column 1's update.
         */
        c1 = _mm_add_pd(c1, _mm_mul_pd(a, b1));
        c2 = _mm_add_pd(c2, _mm_mul_pd(a, b2));
    }

    _mm_storeu_pd(C + 0 * lda, c1);
    _mm_storeu_pd(C + 1 * lda, c2);
}
我猜你困惑的根源在于:双精度内在函数(_mm_load_pd 等)每次处理的是一个包含两个双精度值的向量。lda 看起来是步长(stride)。例如:
c1 = _mm_loadu_pd( C+0*lda );
c2 = _mm_loadu_pd( C+1*lda );
从 C、C+1、C+lda、C+lda+1 这四个位置加载一个 2x2 的双精度数(double)块。
您可以检查函数的输入,以确认矩阵是否已正确初始化;我使用类似的代码得到了正确的输出:
#include <stdlib.h>
#include <stdio.h>
#include <emmintrin.h>
#include <xmmintrin.h>
/*
 * Demo driver: fills two 2x2 column-major matrices with 2.0, computes
 * C += A * B with SSE2 intrinsics (C starts at zero), and prints the four
 * elements of C.
 *
 * Returns 0 on success, EXIT_FAILURE if an allocation fails.
 */
int main(void)
{
    enum { DIM = 2 };                        /* matrices are DIM x DIM */
    const int lda = DIM;                     /* doubles between consecutive columns */
    const size_t count = (size_t)DIM * DIM;  /* elements per matrix */
    int i;

    /* Size each buffer from the element it points to, not from a pointer type. */
    double *a = malloc(count * sizeof *a);
    double *b = malloc(count * sizeof *b);
    double *c = malloc(count * sizeof *c);
    if (a == NULL || b == NULL || c == NULL) {
        fprintf(stderr, "out of memory\n");
        free(a);
        free(b);
        free(c);
        return EXIT_FAILURE;
    }

    for (i = 0; i < DIM * DIM; i++) {
        a[i] = 2;
        b[i] = 2;
        c[i] = 0;
    }
    printf("Initializing matrices...\n");

    /* c1 / c2 hold column 0 / column 1 of C. */
    __m128d c1 = _mm_loadu_pd(c + 0 * lda);
    __m128d c2 = _mm_loadu_pd(c + 1 * lda);
    for (i = 0; i < DIM; i++) {
        /*
         * i-th column of A. Unaligned load: malloc is not guaranteed to
         * return 16-byte-aligned memory on every platform.
         */
        __m128d veca = _mm_loadu_pd(a + i * lda);

        /* B(i,0) and B(i,1), each broadcast to both lanes. */
        __m128d vecb1 = _mm_load1_pd(b + i + 0 * lda);
        __m128d vecb2 = _mm_load1_pd(b + i + 1 * lda);

        /* Rank-1 update of each column of C. */
        c1 = _mm_add_pd(c1, _mm_mul_pd(veca, vecb1));
        c2 = _mm_add_pd(c2, _mm_mul_pd(veca, vecb2));
    }
    _mm_storeu_pd(c + 0 * lda, c1);
    _mm_storeu_pd(c + 1 * lda, c2);

    for (i = 0; i < DIM * DIM; i++) {
        printf("c%d :(%f)\n", i, *(c + i));
    }

    free(a);
    free(b);
    free(c);
    return 0;
}
我发现了以下用于计算 2x2 矩阵乘法的 SSE2 代码。谁能解释一下这段代码是如何执行的?浏览代码时,我觉得它只是把值累加到 C(2x2)矩阵的两个位置(C[0] 和 C[3])。lda 是大矩阵的大小,A、B、C 是 2x2 矩阵。
/*
 * Accumulate the product of two 2x2 blocks into a third: C += A * B.
 *
 * The index arithmetic treats every operand as a column-major block whose
 * consecutive columns are `lda` doubles apart, i.e. element (r, c) lives at
 * base[r + c*lda].
 *
 * lda  distance, in doubles, between consecutive columns of A, B and C
 * A    left operand; A + i*lda must be 16-byte aligned for i = 0, 1
 *      because the columns are read with _mm_load_pd
 * B    right operand; read one scalar at a time, no alignment requirement
 * C    accumulator, read and written in place with unaligned accesses
 */
static void simd_2x2(int lda, double* A, double* B, double* C)
{
    /* c1 / c2 hold column 0 / column 1 of the C block (two doubles each). */
    __m128d c1 = _mm_loadu_pd(C + 0 * lda);
    __m128d c2 = _mm_loadu_pd(C + 1 * lda);

    for (int i = 0; i < 2; ++i) {
        /* i-th column of A (aligned load). */
        __m128d a = _mm_load_pd(A + i * lda);

        /* B(i,0) and B(i,1), each broadcast to both vector lanes. */
        __m128d b1 = _mm_load1_pd(B + i + 0 * lda);
        __m128d b2 = _mm_load1_pd(B + i + 1 * lda);

        /*
         * Rank-1 update: column j of C gains A(:,i) * B(i,j).
         * Column 0 must use b1; using b2 for both columns would make
         * column 0 a copy of column 1's update.
         */
        c1 = _mm_add_pd(c1, _mm_mul_pd(a, b1));
        c2 = _mm_add_pd(c2, _mm_mul_pd(a, b2));
    }

    _mm_storeu_pd(C + 0 * lda, c1);
    _mm_storeu_pd(C + 1 * lda, c2);
}
我猜你困惑的根源在于:双精度内在函数(_mm_load_pd 等)每次处理的是一个包含两个双精度值的向量。lda 看起来是步长(stride)。例如:
c1 = _mm_loadu_pd( C+0*lda );
c2 = _mm_loadu_pd( C+1*lda );
从 C、C+1、C+lda、C+lda+1 这四个位置加载一个 2x2 的双精度数(double)块。
您可以检查函数的输入,以确认矩阵是否已正确初始化;我使用类似的代码得到了正确的输出:
#include <stdlib.h>
#include <stdio.h>
#include <emmintrin.h>
#include <xmmintrin.h>
/*
 * Demo driver: fills two 2x2 column-major matrices with 2.0, computes
 * C += A * B with SSE2 intrinsics (C starts at zero), and prints the four
 * elements of C.
 *
 * Returns 0 on success, EXIT_FAILURE if an allocation fails.
 */
int main(void)
{
    enum { DIM = 2 };                        /* matrices are DIM x DIM */
    const int lda = DIM;                     /* doubles between consecutive columns */
    const size_t count = (size_t)DIM * DIM;  /* elements per matrix */
    int i;

    /* Size each buffer from the element it points to, not from a pointer type. */
    double *a = malloc(count * sizeof *a);
    double *b = malloc(count * sizeof *b);
    double *c = malloc(count * sizeof *c);
    if (a == NULL || b == NULL || c == NULL) {
        fprintf(stderr, "out of memory\n");
        free(a);
        free(b);
        free(c);
        return EXIT_FAILURE;
    }

    for (i = 0; i < DIM * DIM; i++) {
        a[i] = 2;
        b[i] = 2;
        c[i] = 0;
    }
    printf("Initializing matrices...\n");

    /* c1 / c2 hold column 0 / column 1 of C. */
    __m128d c1 = _mm_loadu_pd(c + 0 * lda);
    __m128d c2 = _mm_loadu_pd(c + 1 * lda);
    for (i = 0; i < DIM; i++) {
        /*
         * i-th column of A. Unaligned load: malloc is not guaranteed to
         * return 16-byte-aligned memory on every platform.
         */
        __m128d veca = _mm_loadu_pd(a + i * lda);

        /* B(i,0) and B(i,1), each broadcast to both lanes. */
        __m128d vecb1 = _mm_load1_pd(b + i + 0 * lda);
        __m128d vecb2 = _mm_load1_pd(b + i + 1 * lda);

        /* Rank-1 update of each column of C. */
        c1 = _mm_add_pd(c1, _mm_mul_pd(veca, vecb1));
        c2 = _mm_add_pd(c2, _mm_mul_pd(veca, vecb2));
    }
    _mm_storeu_pd(c + 0 * lda, c1);
    _mm_storeu_pd(c + 1 * lda, c2);

    for (i = 0; i < DIM * DIM; i++) {
        printf("c%d :(%f)\n", i, *(c + i));
    }

    free(a);
    free(b);
    free(c);
    return 0;
}