没有加速 omp simd 减少

no speedup for omp simd reduction

我正在尝试使用矢量化 (openmp simd) 来加速矩阵乘法。为了利用矢量化,我转置了第二个矩阵(以使变化最快的索引遍历连续内存)。我 运行 我在 3000 x 3000 阵列上进行测试。因为我无法测量使用 open mp pragma 与不使用 open mp pragma 时的挂钟时间差异,所以我想确认我实际上正在为我正在乘法的单个阵列获得加速(事实并非如此)。因此,我插入了一些相同大小的虚拟数组,以检查是否可以像我所做的那样使用 SIMD 对它们进行加速(至少在使用 reduction 子句时)。

现在我的问题是我的问题是什么阻碍了 SIMD 加速我唯一的猜测是它一定是数组的二维性但我不完全明白为什么这会导致减速。

或者我的代码还有其他我没有看到的问题?

#include <stdlib.h>
#include <stdio.h>
#include <omp.h>

#include <time.h>

const int N = 3000;

struct timespec begin, end;

double **create_a() {
    double *a = (double *)aligned_alloc(32, sizeof(double) * N * N);
    double **a_indexed = (double **)aligned_alloc(32, sizeof(double *) * N);
    for (int i = 0; i<N; i++){
        a_indexed[i] = a+i*N;
    }

    for (int i = 0; i< N; i++) {
        for (int j = 0; j<N; j++) {
            a_indexed[i][j] = i * j;
        }
    }
    return a_indexed;
}

double **create_b(){
    double *b = (double *)aligned_alloc(32, sizeof(double) * N * N);
    double **b_indexed = (double **)aligned_alloc(32, sizeof(double *) * N);
    for (int i = 0; i<N; i++){
        b_indexed[i] = b+i*N;
    }

    for (int i = 0; i< N; i++) {
        for (int j = 0; j<N; j++) {
            b_indexed[i][j] = (i == j) ? 1:0;
        }
    }
    return b_indexed;
}

double **transpose( double **matrix) {
    double *t = (double *)aligned_alloc(32, sizeof(double) * N * N);
    double **t_indexed = (double **)aligned_alloc(32, sizeof(double *) * N);
    for (int i = 0; i<N; i++){
        t_indexed[i] = t+i*N;
    }

    for (int i = 0; i< N; i++) {
        for (int j = 0; j<N; j++) {
            t_indexed[i][j] = matrix[j][i];
        }
    }
    return t_indexed;
}

double **mul(double **a, double **b) {
    double **b_t = transpose(b);
    double *res = (double *)aligned_alloc(32, sizeof(double) * N * N);
    double **res_indexed = (double **)aligned_alloc(32, sizeof(double *) * N);
    for (int i = 0; i<N; i++){
        res_indexed[i] = res+i*N;
    }

    for (int row = 0; row< 1; row++) {
        for (int col = 0; col < 1; col++) {
    double cell_res = 0;

    // problematic calculation that I can't get to speed up no matter what pragma I use
    clock_gettime(CLOCK_REALTIME, &begin);
    #pragma omp simd aligned(a, b_t:32) reduction(+:cell_res)
    for (int i = 0; i < N; i++) {
        cell_res += a[0][i] * b_t[0][i];
    }
    clock_gettime(CLOCK_REALTIME, &end);

    long seconds = end.tv_sec - begin.tv_sec;
    long nanoseconds = end.tv_nsec - begin.tv_nsec;
    double elapsed = seconds + nanoseconds*1e-9;

    printf("Time simd reduce measured: %.9f seconds.\n", elapsed);


    // dummy array measurements

    struct timespec begin2, end2;
    struct timespec begin3, end3;
    struct timespec begin4, end4;

    double *a2 = (double *)aligned_alloc(32, sizeof(double) * N);
    double *b2 = (double *)aligned_alloc(32, sizeof(double) * N);
    for (int i = 0; i < N ; i++){
        a2[i] = 1;
        b2[i] = 1;
    }

    // measurement with reduction is significantly faster than others
    clock_gettime(CLOCK_REALTIME, &begin2);
    #pragma omp simd aligned(a2, b2:32) reduction(+:cell_res)
    for (int i = 0; i < N; i++) {
        cell_res += a2[i] * b2[i];
    }

    clock_gettime(CLOCK_REALTIME, &end2);
    long seconds2 = end2.tv_sec - begin2.tv_sec;
    long nanoseconds2 = end2.tv_nsec - begin2.tv_nsec;
    double elapsed2 = seconds2 + nanoseconds2*1e-9;
    
    printf("time2 (simd reduction): %.9f seconds.\n", elapsed2);

    // no speedup compared to without simd (slightly faster than problematic calculation)
    clock_gettime(CLOCK_REALTIME, &begin3);
    #pragma omp simd aligned(a2, b2:32)
    for (int i = 0; i < N; i++) {
        cell_res += a2[i] * b2[i];
    }
    clock_gettime(CLOCK_REALTIME, &end3);

    long seconds3 = end3.tv_sec - begin3.tv_sec;
    long nanoseconds3 = end3.tv_nsec - begin3.tv_nsec;
    double elapsed3 = seconds3 + nanoseconds3*1e-9;
    printf("time3 (simd): %.9f seconds.\n", elapsed3);

    // no pragma (slightly faster than problematic calculation)
    clock_gettime(CLOCK_REALTIME, &begin4);
    for (int i = 0; i < N; i++) {
        cell_res += a2[i] * b2[i];
    }
    clock_gettime(CLOCK_REALTIME, &end4);

    long seconds4 = end4.tv_sec - begin4.tv_sec;
    long nanoseconds4 = end4.tv_nsec - begin4.tv_nsec;
    double elapsed4 = seconds4 + nanoseconds4*1e-9;
    printf("time4: %.9f seconds.\n", elapsed4);


    res_indexed[0][0] = cell_res;
        }
    }
    return res_indexed;
}


int main (int argc, char **argv) {
    //init a(i,j) = i * j
    double **a = create_a();

    //init b(i,j) = (i == j) ? 1:0;
    double **b = create_b();

    //multiply
    double **res = mul(a,b);
}
Time simd reduce measured: 0.000004188 seconds. // problematic
time2 (simd reduction): 0.000001762 seconds. // faster
time3 (simd): 0.000003475 seconds. //slightly faster
time4: 0.000003476 seconds. //slightly faster

我已经在我的机器上测试了前两个循环,我可以重现相同的行为。

Time simd reduce measured: 0.000006000 seconds.
time2 (simd reduction): 0.000004000 seconds.

我的猜测是有两个问题:

第一个问题:

多个版本的执行时间差异似乎更多地与缓存有关,而不是向量化。因此,当您使用包含 3000 个元素(24 KB)的虚拟数组进行测试时:

double *a2 = (double *)aligned_alloc(32, sizeof(double) * N);
double *b2 = (double *)aligned_alloc(32, sizeof(double) * N);
for (int i = 0; i < N ; i++){
    a2[i] = 1;
    b2[i] = 1;
}

这些小数组在初始化期间加载到缓存中( a2[i] = 1)。另一方面,矩阵 ab_t 的大小为 3000 x 3000(72 兆字节),这使得第一行不太可能完全位于缓存中(尽管这取决于测试环境的缓存大小)。

我更改了以下循环:

// problematic calculation that I can't get to speed up no matter what pragma I use
clock_gettime(CLOCK_REALTIME, &begin);
#pragma omp simd aligned(a, b_t:32) reduction(+:cell_res)
for (int i = 0; i < N; i++) {
    cell_res += a[0][i] * b_t[0][i];
}
clock_gettime(CLOCK_REALTIME, &end);

通过将矩阵ab_t的第一行分别打包成两个新矩阵a_2b_2,即:

for(int i = 0; i < N; i++){
      a_2[0][i]  = a[0][i];
      bt_2[0][i] = b_t[0][i];
 }

// problematic calculation that I can't get to speed up no matter what pragma I use
clock_gettime(CLOCK_REALTIME, &begin);
#pragma omp simd aligned(a_2, bt_2) reduction(+:cell_res)
for (int i = 0; i < N; i++) {
    cell_res += a_2[0][i] * bt_2[0][i];
}

通过将这些矩阵打包成只有一行的较小矩阵,我可以将这些矩阵加载到缓存中,从而减少了第一个版本的执行时间。新结果:

Time simd reduce measured: 0.000004000 seconds.
time2 (simd reduction): 0.000004000 seconds.

IMO 你不应该在同一个函数中测试所有这些循环,因为编译器可能会以不同的方式优化这些循环,然后就会出现缓存这些值的问题,等等。我会在单独的运行中测试它们。

Now my question is what is my problem that is hindering the SIMD speedup my only guess is that it must be the two-dimensionality of the array but I don't fully get why that would cause a slowdown.

我还测试了直接将矩阵 ab_t 的第一行打包到单独的一维数组(而不是矩阵)中,但结果完全相同。物有所值。现在您应该在您的环境中进行分析,即测试缓存未命中。

更重要的是,测试这个版本:

clock_gettime(CLOCK_REALTIME, &begin);
for (int i = 0; i < N; i++) {
    cell_res += a[0][i] * b_t[0][i];
}
clock_gettime(CLOCK_REALTIME, &end);

有和没有 #pragma omp simd aligned(a_2, bt_2:32) reduction(+:cell_res),有和没有打包到数组,但分别测试所有这些版本。此外,针对不同的输入大小测试它们。

第二题:

另一个有问题的是:

for (int i = 0; i < N; i++) {
        cell_res += a[0][i] * b_t[0][i];
} 

是内存限制的,因此使用 SIMD 获得收益的机会较少,不应该 expect much gains 使用 SIMD 来自双精度浮点积。一种解决方法是将矩阵从双精度更改为浮点,从而将所需的内存带宽减少一半,并将您可以执行的 SIMD 操作的数量加倍。尽管如此,上述代码片段仍将受内存限制。尽管如此,您可能会获得一些收益,主要是当值在缓存中时。

在我的机器中,从 double 变为 float,使得 SIMD 版本比没有它的版本明显更快,即使不使用包装也是如此。这也可能是您遇到的问题。