不使用 #define 宏如何避免代码重复

Code duplication issue without define macro

我正在尝试使用像这样的矢量化函数来模仿 CUDA/OpenCL 工作流程:

#include <omp.h>

#include <cmath>
#include <functional>
#include <iostream>
#include <string>
#include <utility>
/// Fixed-size pack of `Simd` values of `Type` with lane-wise operations.
///
/// Every operation is a plain counted loop over all `Simd` lanes; the lane
/// count is a template argument, so the trip count is a compile-time constant.
///
/// @tparam Type  element type stored in each lane
/// @tparam Simd  number of lanes; must be positive
template<typename Type, int Simd>
struct KernelData
{
    static_assert(Simd > 0, "KernelData needs at least one lane");

    // Lane storage, aligned to a 32-byte boundary.
    alignas(32)
    Type data[Simd];

    /// Copies `Simd` consecutive elements starting at `ptr` into the lanes.
    ///
    /// `ptr` must point to at least `Simd` readable elements. `__restrict`
    /// (spelling accepted by GCC, Clang and MSVC) promises that the memory
    /// is not accessed through another alias during the copy.
    inline void readFrom(const Type * const __restrict ptr) noexcept
    {
        for(int lane=0;lane<Simd;++lane)
        {
            data[lane] = ptr[lane];
        }
    }

    /// Copies all `Simd` lanes to consecutive elements starting at `ptr`.
    ///
    /// `ptr` must point to at least `Simd` writable elements.
    inline void writeTo(Type * const __restrict ptr) const noexcept
    {
        for(int lane=0;lane<Simd;++lane)
        {
            ptr[lane] = data[lane];
        }
    }

    /// Returns a new pack whose lanes hold `std::sqrt` of this pack's lanes.
    /// The object it is called on is left unchanged.
    inline const KernelData<Type,Simd> sqrt() const noexcept
    {
        KernelData<Type,Simd> result;
        for(int lane=0;lane<Simd;++lane)
        {
            result.data[lane] = std::sqrt(data[lane]);
        }
        return result;
    }
};




template<int mask>
struct KernelDataFactory
{
    KernelDataFactory()
    {

    }


    template<typename Type>
    inline
    KernelData<Type,mask> generate() const
    {
        return KernelData<Type,mask>();
    }
};

/// Runs a user-supplied callable over `n` work items in groups of `SimdWidth`.
///
/// The callable is invoked as `kernel(firstIndex, width, args...)`:
///  - once per full group, with `width == SimdWidth`;
///  - once per leftover item, with `width == 1`.
///
/// @tparam SimdWidth  number of items handled by one full-width call (> 0)
/// @tparam Args       types of the extra arguments forwarded to every call
template<int SimdWidth, typename... Args>
class Kernel
{
    // run() divides by SimdWidth, so zero or negative widths are rejected.
    static_assert(SimdWidth > 0, "SimdWidth must be positive");

public:
    /// Stores the callable. It is taken by value and moved into the member,
    /// so the std::function is not copied a second time.
    Kernel(std::function<void(int,int, Args...)> kernelPrm)
        : kernel(std::move(kernelPrm))
    {
    }

    /// Processes items `[0, n)`. Nothing is called when `n <= 0`.
    ///
    /// @param n     total number of work items
    /// @param args  extra arguments handed to every kernel invocation
    void run(int n, Args... args)
    {
        // Full groups: indices 0, SimdWidth, 2*SimdWidth, ...
        const int nLoop = n / SimdWidth;
        for(int group = 0; group < nLoop; ++group)
        {
            kernel(group * SimdWidth, SimdWidth, args...);
        }

        // Leftover items that do not fill a whole group, one at a time.
        for(int item = nLoop * SimdWidth; item < n; ++item)
        {
            kernel(item, 1, args...);
        }
    }
private:
    std::function<void(int,int, Args...)> kernel;
};

// cpu cycles from Stack Overflow
#include <stdint.h>  // <cstdint> is preferred in C++, but stdint.h works.
#ifdef _MSC_VER
# include <intrin.h>
#else
# include <x86intrin.h>
#endif
/// Returns the value produced by the `__rdtsc()` intrinsic.
///
/// No fences are issued around the read. An `_mm_lfence()` before the read
/// would wait for earlier instructions to retire first, and one after it
/// would hold back later instructions until the read has retired; both are
/// optional and deliberately left out here.
inline
uint64_t readTSC() {
    return __rdtsc();
}

// Benchmark driver: applies a chain of 15 square roots to every element of a
// 1003-element float array, repeats that 10000 times, prints the last ten
// results and an estimated cycle count per sqrt.
int main(int argC, char** argV)
{
    constexpr int simd = 16;   // width of one full group
    constexpr int n = 1003;    // 62 full groups of 16 plus 11 leftover items

    // The callable receives the first element index of a group and the group
    // width. Both paths do the same work; they differ only in the
    // compile-time lane count handed to KernelDataFactory.
    Kernel<simd, float *, float *> kernel(
        [](int simdGroupId, int simdWidth, float * input, float * output)
        {
            const int id = simdGroupId;
            if(simdWidth != simd)
            {
                // Narrow path: a single lane.
                const KernelDataFactory<1> factory;
                auto a = factory.generate<float>();

                a.readFrom(input+id);
                const auto b = a
                    .sqrt().sqrt().sqrt()
                    .sqrt().sqrt().sqrt()
                    .sqrt().sqrt().sqrt()
                    .sqrt().sqrt().sqrt()
                    .sqrt().sqrt().sqrt();
                b.writeTo(output+id);
                return;
            }

            // Full-width path: `simd` lanes at once.
            const KernelDataFactory<simd> factory;
            auto a = factory.generate<float>();

            a.readFrom(input+id);
            const auto b = a
                .sqrt().sqrt().sqrt()
                .sqrt().sqrt().sqrt()
                .sqrt().sqrt().sqrt()
                .sqrt().sqrt().sqrt()
                .sqrt().sqrt().sqrt();
            b.writeTo(output+id);
        });

    alignas(32) float src[n];
    alignas(32) float dst[n];
    for(int j = 0; j < n; ++j)
    {
        src[j] = static_cast<float>(j);
    }

    // Time 10000 complete passes over the array.
    const auto start = readTSC();
    for(int pass = 0; pass < 10000; ++pass)
    {
        kernel.run(n, src, dst);
    }
    const auto stop = readTSC();

    // Show the last ten outputs.
    for(int idx = n - 10; idx < n; ++idx)
    {
        std::cout << "i=" << idx << " value=" << dst[idx] << std::endl;
    }

    // 0.0001f is 1/10000 (the pass count); each pass does 15 sqrt per element.
    std::cout << 0.0001f * (stop - start) / static_cast<float>(15 * n)
              << " cycles per sqrt" << std::endl;
    return 0;
}

但是用户提供的那部分代码必须像这样重复写两遍:

// Kernel callable as the user currently has to write it: the same
// read -> 15x sqrt -> write sequence appears twice, once per factory width.
Kernel<simd, float *, float *> kernel([](int simdGroupId, int simdWidth, float * input, float * output){
    const int id = simdGroupId;
    if(simdWidth == simd)
    {
        // Full-width copy of the body: factory parameterized with `simd`.
        const KernelDataFactory<simd> factory;
        auto a = factory.generate<float>();

        a.readFrom(input+id);
        const auto b = a.sqrt().sqrt().sqrt().sqrt().sqrt().
        sqrt().sqrt().sqrt().sqrt().sqrt().
        sqrt().sqrt().sqrt().sqrt().sqrt();
        b.writeTo(output+id);
    }
    else
    {
        // Second copy of the same body: only the factory argument (1) differs.
        const KernelDataFactory<1> factory;
        auto a = factory.generate<float>();

        a.readFrom(input+id);
        const auto b = a.sqrt().sqrt().sqrt().sqrt().sqrt().
        sqrt().sqrt().sqrt().sqrt().sqrt().
        sqrt().sqrt().sqrt().sqrt().sqrt();
        b.writeTo(output+id);
    }
});

唯一的区别是两个在编译期已知的模板实例:

KernelDataFactory<1> 和 KernelDataFactory<simd>

使用 #define 宏,很容易只复制 lambda 的函数体。我想在不使用任何宏的情况下做到这一点。有没有一种简单的方法,可以让用户只写出下面这一段:

// Desired user-written kernel body. `factory`, `input`, `output` and `id`
// are not declared in this fragment; they have to come from the surrounding
// implementation.
auto a = factory.generate<float>();
a.readFrom(input+id);
// Chain of 15 lane-wise square roots.
const auto b = a.sqrt().sqrt().sqrt().sqrt().sqrt().
sqrt().sqrt().sqrt().sqrt().sqrt().
sqrt().sqrt().sqrt().sqrt().sqrt();
b.writeTo(output+id);

然后由实现自动将其复制?

当前的实现是:

KernelDataFactory 的模板参数(必须在编译期已知,即 1 和 simd)用于让编译器生成矢量化代码。(在 godbolt.org (avx512) 上,它的运行速度是“每个平方根 0.9 个周期”,而在我的系统 (avx1) 上是每个平方根 3.8 个周期。)

您可以使用泛型 lambda (C++14) 来实现类似的目的。请注意,这需要更改 Kernel::kernel 的类型,并稍微修改 Kernel 对象的创建方式,以便进行自动类型推导:

Kernel 类

/// Runs a generic callable over `n` work items in groups of `SimdWidth`.
///
/// The callable is invoked as `kernel(factory, firstIndex, args...)`, where
/// `factory` is a `const KernelDataFactory<SimdWidth>` for every full group
/// and a `const KernelDataFactory<1>` for every leftover item. Because the
/// factory type differs between the two cases, the callable has to accept
/// both (e.g. a generic lambda).
///
/// @tparam SimdWidth  number of items handled by one full-width call (> 0)
/// @tparam F          type of the stored callable
/// @tparam Args       types of the extra arguments forwarded to every call
template<int SimdWidth, typename F, typename... Args>
class Kernel
{
    // run() divides by SimdWidth, so zero or negative widths are rejected.
    static_assert(SimdWidth > 0, "SimdWidth must be positive");

public:
    /// Takes ownership of the callable by moving it into the member.
    Kernel(F&& kernelPrm)
        : kernel(std::move(kernelPrm))
    {
    }

    /// Processes items `[0, n)`. Nothing is called when `n <= 0`.
    ///
    /// @param n     total number of work items
    /// @param args  extra arguments handed to every kernel invocation
    void run(int n, Args... args)
    {
        // Full groups: indices 0, SimdWidth, 2*SimdWidth, ...
        const int nLoop = n / SimdWidth;
        for (int group = 0; group < nLoop; ++group)
        {
            CallKernel<SimdWidth>(group * SimdWidth, args...);
        }

        // Leftover items that do not fill a whole group, one at a time.
        for (int item = nLoop * SimdWidth; item < n; ++item)
        {
            CallKernel<1>(item, args...);
        }
    }
private:
    // Creates the factory for the given compile-time width and passes it to
    // the kernel. The width is a template argument because each call site in
    // run() already knows it statically; no runtime branch is needed.
    template<int Width>
    void CallKernel(const int id, Args... args)
    {
        // Named const object: the kernel may take the factory by lvalue reference.
        const KernelDataFactory<Width> factory;
        kernel(factory, id, args...);
    }

    F kernel;
};

辅助代码

这些辅助代码是推导 Kernel 的第二个模板参数所必需的。

/// Empty tag type whose only purpose is to carry a parameter pack `Args...`
/// in its type, so the pack can be deduced from a function argument.
template<typename... Args>
struct KernelArgs
{
};

template<int SimdWidth, typename F, class...Args>
auto CreateKernel(F&& kernelPrm, KernelArgs<Args...> const&)
{
    return Kernel<SimdWidth, F, Args...>(std::forward<F>(kernelPrm));
}

main 函数

...

// The body is written once; the lane count comes from the type of `factory`,
// which the generic lambda receives as a deduced parameter.
auto kernel = CreateKernel<simd>(
    [](auto& factory, int const id, float* input, float* output)
    {
        // `template` is required: `generate` is a member template of a
        // type that is only known once the lambda is instantiated.
        auto values = factory.template generate<float>();
        values.readFrom(input + id);

        // Chain of 15 lane-wise square roots.
        const auto rooted = values
            .sqrt().sqrt().sqrt()
            .sqrt().sqrt().sqrt()
            .sqrt().sqrt().sqrt()
            .sqrt().sqrt().sqrt()
            .sqrt().sqrt().sqrt();
        rooted.writeTo(output + id);
    },
    KernelArgs<float*, float*>{});

...