“私有变量不能用作归约变量”(private variable cannot be reduction),尽管该变量是在 SIMD 块之外定义的

“private variable cannot be reduction” although that variable is defined outside the SIMD block

我有一个使用 OpenMP 的 C++ 项目,我尝试在 Blue Gene/Q 上使用 LLVM 编译它。其中有一个函数,精简后看起来像这样:

/// Excerpt of a BLAS-style kernel (elided parts are marked `// […]`): inside
/// an OpenMP parallel region it computes res[i] = x[i] - y[i] over the index
/// range [low, hi) and accumulates the sum of the squared differences in
/// double precision.
///
/// The visible portion does not show how `n2res`, `n`, `n_cores` or `n_simt`
/// are used, nor where `smtid`, `low`, `hi` and `norm_array` are declared;
/// those live in the elided sections.
///
/// @tparam FT      Element type of the three arrays.
/// @tparam veclen  Passed as the alignment argument of the `aligned` clause.
/// @param res          Output array; receives x[i] - y[i].
/// @param x            Minuend array.
/// @param y            Subtrahend array.
/// @param n2res        Not used in the visible portion.
/// @param n            Not used in the visible portion.
/// @param n_cores      Not used in the visible portion.
/// @param n_simt       Not used in the visible portion.
/// @param n_blas_simt  Only threads whose `smtid` is below this value execute
///                     the loop.
template <typename FT, int veclen>
inline void xmyNorm2Spinor(FT *res,
                           FT *x,
                           FT *y,
                           double &n2res,
                           int n,
                           int n_cores,
                           int n_simt,
                           int n_blas_simt) {
    // Accumulator aligned to QPHIX_LLC_CACHE_ALIGN; the two branches only
    // differ in the compiler-specific spelling of the alignment attribute.
#if defined(__GNUG__) && !defined(__INTEL_COMPILER)
    double norm2res __attribute__((aligned(QPHIX_LLC_CACHE_ALIGN))) = 0;
#else
    __declspec(align(QPHIX_LLC_CACHE_ALIGN)) double norm2res = 0;
#endif

#pragma omp parallel shared(norm_array)
    {
        // […]
        if (smtid < n_blas_simt) {
            // […]

            // Declared inside the parallel region, so every thread gets its
            // own copy. This is the declaration the compiler reports as
            // "predetermined as private" when it rejects the reduction clause
            // below with "private variable cannot be reduction".
            double lnorm = 0;

//#pragma prefetch x,y,res
//#pragma vector aligned(x,y,res)
            // Vectorised loop with a per-thread sum reduction into lnorm.
            // NOTE(review): OpenMP interprets the `aligned` argument as a
            // byte count — confirm that is what veclen holds.
#pragma omp simd aligned(res, x, y : veclen) reduction(+ : lnorm)
            for (int i = low; i < hi; i++) {
                res[i] = x[i] - y[i];
                // Square in double precision regardless of FT.
                double tmpd = (double)res[i];
                lnorm += (tmpd * tmpd);
            }
            // […]
        }
    }
    // […]
}

编译时报出的错误如下:

In file included from /homec/hbn28/hbn28e/Sources/qphix/tests/timeDslashNoQDP.cc:6:
In file included from /homec/hbn28/hbn28e/Sources/qphix/include/qphix/blas.h:8:
/homec/hbn28/hbn28e/Sources/qphix/include/qphix/blas_c.h:156:54: error: private variable cannot be reduction
#pragma omp simd aligned(res,x,y:veclen) reduction(+:lnorm)
                                                     ^
/homec/hbn28/hbn28e/Sources/qphix/include/qphix/blas_c.h:151:12: note: predetermined as private
                                double lnorm=0;
                                       ^

由于外层有 omp parallel 块,变量 lnorm 是为每个线程各自定义的。随后是一个额外的 SIMD 部分,其中每个线程使用自己的 SIMD 通道。归约应该在线程内部完成,所以变量的作用域看起来是正确的。但编译器仍然不接受这种写法。

这里有什么问题?

问题似乎在于:omp parallel 块附加到 lnorm 变量上的 private 属性,与 OpenMP reduction() 子句对其参数变量提出的要求相冲突(尽管对于 reduction() 子句所作用的嵌套 omp simd 块而言,lnorm 并不是私有的)。

您可以尝试通过将 lnorm 计算代码提取到它自己的函数中来解决该问题:

/// Computes res[i] = x[i] - y[i] for every i in [low, hi) and returns the sum
/// of the squared differences, accumulated in double precision.
///
/// The SIMD reduction lives in its own function so that `lnorm` is an
/// ordinary local of this function instead of a variable declared inside the
/// caller's `omp parallel` region; this is intended to avoid the
/// "private variable cannot be reduction" diagnostic.
///
/// @tparam FT      Element type of the three arrays.
/// @tparam veclen  Alignment argument of the `aligned` clause. It does not
///                 appear in the function parameter list, so it cannot be
///                 deduced and must be supplied explicitly by the caller.
///                 NOTE(review): OpenMP interprets this value as a byte
///                 count — confirm that is what veclen holds.
/// @param res  Output array; receives x[i] - y[i].
/// @param x    Minuend array.
/// @param y    Subtrahend array.
/// @param low  First index to process (inclusive).
/// @param hi   One past the last index to process.
/// @return Sum of (x[i] - y[i])^2 over [low, hi); 0 when the range is empty.
template <typename FT, int veclen>
inline double compute_res_and_lnorm(FT *res,
                           FT *x,
                           FT *y,
                           int low,
                           int hi)
{
    double lnorm = 0;

#pragma omp simd aligned(res, x, y : veclen) reduction(+ : lnorm)
    for (int i = low; i < hi; i++) {
        res[i] = x[i] - y[i];
        // Square in double precision regardless of FT.
        const double tmpd = static_cast<double>(res[i]);
        lnorm += (tmpd * tmpd);
    }
    return lnorm;
}

/// Excerpt of a BLAS-style kernel (elided parts are marked `// […]`): inside
/// an OpenMP parallel region, each participating thread delegates the
/// element-wise subtraction and squared-norm accumulation for its index range
/// to compute_res_and_lnorm().
///
/// The visible portion does not show how `n2res`, `n`, `n_cores` or `n_simt`
/// are used, nor where `smtid`, `low`, `hi` and `norm_array` are declared;
/// those live in the elided sections.
///
/// @tparam FT      Element type of the three arrays.
/// @tparam veclen  Forwarded to compute_res_and_lnorm() as its alignment
///                 argument.
/// @param res          Output array; receives x[i] - y[i].
/// @param x            Minuend array.
/// @param y            Subtrahend array.
/// @param n2res        Not used in the visible portion.
/// @param n            Not used in the visible portion.
/// @param n_cores      Not used in the visible portion.
/// @param n_simt       Not used in the visible portion.
/// @param n_blas_simt  Only threads whose `smtid` is below this value do work.
template <typename FT, int veclen>
inline void xmyNorm2Spinor(FT *res,
                           FT *x,
                           FT *y,
                           double &n2res,
                           int n,
                           int n_cores,
                           int n_simt,
                           int n_blas_simt) {
    // Accumulator aligned to QPHIX_LLC_CACHE_ALIGN; the two branches only
    // differ in the compiler-specific spelling of the alignment attribute.
#if defined(__GNUG__) && !defined(__INTEL_COMPILER)
    double norm2res __attribute__((aligned(QPHIX_LLC_CACHE_ALIGN))) = 0;
#else
    __declspec(align(QPHIX_LLC_CACHE_ALIGN)) double norm2res = 0;
#endif

#pragma omp parallel shared(norm_array)
    {
        // […]
        if (smtid < n_blas_simt) {
            // […]
            // veclen is not deducible from the arguments, so both template
            // arguments are spelled out explicitly.
            double lnorm = compute_res_and_lnorm<FT, veclen>(res, x, y, low, hi);
            // […]
        }
    }
    // […]
}