使用 managedCUDA 进行一维 FFT 加内核计算
1D FFT plus kernel calculation with managedCUDA
我正在尝试进行 FFT 加内核计算。
FFT :托管 CUDA 库
内核计算:自己的内核
C#代码
public void cuFFTreconstruct() {
CudaContext ctx = new CudaContext(0);
CudaKernel cuKernel = ctx.LoadKernel("kernel_Array.ptx", "cu_ArrayInversion");
float[] fData = new float[Resolution * Resolution * 2];
float[] result = new float[Resolution * Resolution * 2];
CudaDeviceVariable<float> devData = new CudaDeviceVariable<float>(Resolution * Resolution * 2);
CudaDeviceVariable<float> copy_devData = new CudaDeviceVariable<float>(Resolution * Resolution * 2);
int i, j;
Random rnd = new Random();
double avrg = 0.0;
for (i = 0; i < Resolution; i++)
{
for (j = 0; j < Resolution; j++)
{
fData[(i * Resolution + j) * 2] = i + j * 2;
fData[(i * Resolution + j) * 2 + 1] = 0.0f;
}
}
devData.CopyToDevice(fData);
CudaFFTPlan1D plan1D = new CudaFFTPlan1D(Resolution * 2, cufftType.C2C, Resolution * 2);
plan1D.Exec(devData.DevicePointer, TransformDirection.Forward);
cuKernel.GridDimensions = new ManagedCuda.VectorTypes.dim3(Resolution / 256, Resolution, 1);
cuKernel.BlockDimensions = new ManagedCuda.VectorTypes.dim3(256, 1, 1);
cuKernel.Run(devData.DevicePointer, copy_devData.DevicePointer, Resolution);
devData.CopyToHost(result);
for (i = 0; i < Resolution; i++)
{
for (j = 0; j < Resolution; j++)
{
ResultData[i, j, 0] = result[(i * Resolution + j) * 2];
ResultData[i, j, 1] = result[(i * Resolution + j) * 2 + 1];
}
}
ctx.FreeMemory(devData.DevicePointer);
ctx.FreeMemory(copy_devData.DevicePointer);
}
内核代码
//Includes for IntelliSense
#define _SIZE_T_DEFINED
#ifndef __CUDACC__
#define __CUDACC__
#endif
#ifndef __cplusplus
#define __cplusplus
#endif
#include <cuda.h>
#include <device_launch_parameters.h>
#include <texture_fetch_functions.h>
#include "float.h"
#include <builtin_types.h>
#include <vector_functions.h>
// Texture reference
texture<float2, 2> texref;
extern "C"
{
__global__ void cu_ArrayInversion(float* data_A, float* data_B, int Resolution)
{
int image_x = blockIdx.x * blockDim.x + threadIdx.x;
int image_y = blockIdx.y;
data_B[(Resolution * image_x + image_y) * 2] = data_A[(Resolution * image_y + image_x) * 2];
data_B[(Resolution * image_x + image_y) * 2 + 1] = data_A[(Resolution * image_y + image_x) * 2 + 1];
}
}
但是这个程序运行不佳。
发生以下错误:
ErrorLaunchFailed:执行内核时设备出现异常。常见原因包括取消引用无效的设备指针和访问越界共享内存。
上下文无法使用,因此必须销毁它(并且应该创建一个新的)。
来自此上下文的所有现有设备内存分配都是无效的,如果程序要继续使用 CUDA,则必须重建。
FFT 计划以元素的数量(即复数的数量)作为参数。因此,删除计划构造函数第一个参数中的 * 2
。而且批次数的两倍也没有意义...
此外,我将使用 float2
或 cuFloatComplex
类型(在 ManagedCuda.VectorTypes
中)来表示复数而不是两个原始浮点数。要释放内存,请使用 CudaDeviceVariable 的 Dispose 方法。否则稍后会被 GC 内部调用。
主机代码将如下所示:
int Resolution = 512;
CudaContext ctx = new CudaContext(0);
CudaKernel cuKernel = ctx.LoadKernel("kernel.ptx", "cu_ArrayInversion");
//float2 or cuFloatComplex
float2[] fData = new float2[Resolution * Resolution];
float2[] result = new float2[Resolution * Resolution];
CudaDeviceVariable<float2> devData = new CudaDeviceVariable<float2>(Resolution * Resolution);
CudaDeviceVariable<float2> copy_devData = new CudaDeviceVariable<float2>(Resolution * Resolution);
int i, j;
Random rnd = new Random();
double avrg = 0.0;
for (i = 0; i < Resolution; i++)
{
for (j = 0; j < Resolution; j++)
{
fData[(i * Resolution + j)].x = i + j * 2;
fData[(i * Resolution + j)].y = 0.0f;
}
}
devData.CopyToDevice(fData);
//Only Resolution times in X and Resolution batches
CudaFFTPlan1D plan1D = new CudaFFTPlan1D(Resolution, cufftType.C2C, Resolution);
plan1D.Exec(devData.DevicePointer, TransformDirection.Forward);
cuKernel.GridDimensions = new ManagedCuda.VectorTypes.dim3(Resolution / 256, Resolution, 1);
cuKernel.BlockDimensions = new ManagedCuda.VectorTypes.dim3(256, 1, 1);
cuKernel.Run(devData.DevicePointer, copy_devData.DevicePointer, Resolution);
devData.CopyToHost(result);
for (i = 0; i < Resolution; i++)
{
for (j = 0; j < Resolution; j++)
{
//ResultData[i, j, 0] = result[(i * Resolution + j)].x;
//ResultData[i, j, 1] = result[(i * Resolution + j)].y;
}
}
//And better free memory using Dispose()
//ctx.FreeMemory is only meant for raw device pointers obtained from somewhere else...
devData.Dispose();
copy_devData.Dispose();
plan1D.Dispose();
//For Cuda Memory checker and profiler:
CudaContext.ProfilerStop();
ctx.Dispose();
感谢您的建议。
我尝试了建议的代码。
但是,错误仍然存在。
(错误:ErrorLaunchFailed:执行内核时设备发生异常。常见原因包括取消引用无效设备指针和访问越界共享内存。无法使用上下文,因此必须销毁它(并且应该创建一个新的已创建)。来自此上下文的所有现有设备内存分配都是无效的,如果程序要继续使用 CUDA,则必须重建。)
为了使用float2,我修改了cu代码如下
extern "C"
{
__global__ void cu_ArrayInversion(float2* data_A, float2* data_B, int Resolution)
{
int image_x = blockIdx.x * blockDim.x + threadIdx.x;
int image_y = blockIdx.y;
data_B[(Resolution * image_x + image_y)].x = data_A[(Resolution * image_y + image_x)].x;
data_B[(Resolution * image_x + image_y)].y = data_A[(Resolution * image_y + image_x)].y;
}
当程序执行 "cuKernel.Run" 时,进程停止。
ptx 文件
.version 4.3
.target sm_20
.address_size 32
// .globl cu_ArrayInversion
.global .texref texref;
.visible .entry cu_ArrayInversion(
.param .u32 cu_ArrayInversion_param_0,
.param .u32 cu_ArrayInversion_param_1,
.param .u32 cu_ArrayInversion_param_2
)
{
.reg .f32 %f<5>;
.reg .b32 %r<17>;
ld.param.u32 %r1, [cu_ArrayInversion_param_0];
ld.param.u32 %r2, [cu_ArrayInversion_param_1];
ld.param.u32 %r3, [cu_ArrayInversion_param_2];
cvta.to.global.u32 %r4, %r2;
cvta.to.global.u32 %r5, %r1;
mov.u32 %r6, %ctaid.x;
mov.u32 %r7, %ntid.x;
mov.u32 %r8, %tid.x;
mad.lo.s32 %r9, %r7, %r6, %r8;
mov.u32 %r10, %ctaid.y;
mad.lo.s32 %r11, %r10, %r3, %r9;
shl.b32 %r12, %r11, 3;
add.s32 %r13, %r5, %r12;
mad.lo.s32 %r14, %r9, %r3, %r10;
shl.b32 %r15, %r14, 3;
add.s32 %r16, %r4, %r15;
ld.global.v2.f32 {%f1, %f2}, [%r13];
st.global.v2.f32 [%r16], {%f1, %f2};
ret;
}
感谢留言
主机代码
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using System.Windows.Forms;
using System.Drawing.Imaging;
using ManagedCuda;
using ManagedCuda.CudaFFT;
using ManagedCuda.VectorTypes;
namespace WFA_CUDA_FFT
{
public partial class CuFFTMain : Form
{
float[, ,] FFTData2D;
int Resolution;
const int cuda_blockNum = 256;
public CuFFTMain()
{
InitializeComponent();
Resolution = 1024;
}
private void button1_Click(object sender, EventArgs e)
{
cuFFTreconstruct();
}
public void cuFFTreconstruct()
{
CudaContext ctx = new CudaContext(0);
ManagedCuda.BasicTypes.CUmodule cumodule = ctx.LoadModule("kernel.ptx");
CudaKernel cuKernel = new CudaKernel("cu_ArrayInversion", cumodule, ctx);
float2[] fData = new float2[Resolution * Resolution];
float2[] result = new float2[Resolution * Resolution];
FFTData2D = new float[Resolution, Resolution, 2];
CudaDeviceVariable<float2> devData = new CudaDeviceVariable<float2>(Resolution * Resolution);
CudaDeviceVariable<float2> copy_devData = new CudaDeviceVariable<float2>(Resolution * Resolution);
int i, j;
Random rnd = new Random();
double avrg = 0.0;
for (i = 0; i < Resolution; i++)
{
for (j = 0; j < Resolution; j++)
{
fData[i * Resolution + j].x = i + j * 2;
avrg += fData[i * Resolution + j].x;
fData[i * Resolution + j].y = 0.0f;
}
}
avrg = avrg / (double)(Resolution * Resolution);
for (i = 0; i < Resolution; i++)
{
for (j = 0; j < Resolution; j++)
{
fData[(i * Resolution + j)].x = fData[(i * Resolution + j)].x - (float)avrg;
}
}
devData.CopyToDevice(fData);
CudaFFTPlan1D plan1D = new CudaFFTPlan1D(Resolution, cufftType.C2C, Resolution);
plan1D.Exec(devData.DevicePointer, TransformDirection.Forward);
cuKernel.GridDimensions = new ManagedCuda.VectorTypes.dim3(Resolution / cuda_blockNum, Resolution, 1);
cuKernel.BlockDimensions = new ManagedCuda.VectorTypes.dim3(cuda_blockNum, 1, 1);
cuKernel.Run(devData.DevicePointer, copy_devData.DevicePointer, Resolution);
copy_devData.CopyToHost(result);
for (i = 0; i < Resolution; i++)
{
for (j = 0; j < Resolution; j++)
{
FFTData2D[i, j, 0] = result[i * Resolution + j].x;
FFTData2D[i, j, 1] = result[i * Resolution + j].y;
}
}
//Clean up
devData.Dispose();
copy_devData.Dispose();
plan1D.Dispose();
CudaContext.ProfilerStop();
ctx.Dispose();
}
}
}
内核代码
//Includes for IntelliSense
#define _SIZE_T_DEFINED
#ifndef __CUDACC__
#define __CUDACC__
#endif
#ifndef __cplusplus
#define __cplusplus
#endif
#include <cuda.h>
#include <device_launch_parameters.h>
#include <texture_fetch_functions.h>
#include "float.h"
#include <builtin_types.h>
#include <vector_functions.h>
#include <vector>
// Texture reference
texture<float2, 2> texref;
extern "C"
{
// Device code
__global__ void cu_ArrayInversion(float2* data_A, float2* data_B, int Resolution)
{
int image_x = blockIdx.x * blockDim.x + threadIdx.x;
int image_y = blockIdx.y;
data_B[(Resolution * image_x + image_y)].y = data_A[(Resolution * image_y + image_x)].x;
data_B[(Resolution * image_x + image_y)].x = data_A[(Resolution * image_y + image_x)].y;
}
}
首先我用.Net4.5编译。
该程序无法运行,并显示错误 (System.BadImageFormatException)。
然而当FFT函数被注释掉时,内核程序运行。
其次,我从 .Net 4.5 更改为 .Net 4.0。
FFT 函数有效,但内核无效 运行 并显示错误。
我的电脑是 windows 8.1 pro,我用的是 visual studio 2013.
我正在尝试进行 FFT 加内核计算。 FFT :托管 CUDA 库 内核计算:自己的内核
C#代码
public void cuFFTreconstruct() {
CudaContext ctx = new CudaContext(0);
CudaKernel cuKernel = ctx.LoadKernel("kernel_Array.ptx", "cu_ArrayInversion");
float[] fData = new float[Resolution * Resolution * 2];
float[] result = new float[Resolution * Resolution * 2];
CudaDeviceVariable<float> devData = new CudaDeviceVariable<float>(Resolution * Resolution * 2);
CudaDeviceVariable<float> copy_devData = new CudaDeviceVariable<float>(Resolution * Resolution * 2);
int i, j;
Random rnd = new Random();
double avrg = 0.0;
for (i = 0; i < Resolution; i++)
{
for (j = 0; j < Resolution; j++)
{
fData[(i * Resolution + j) * 2] = i + j * 2;
fData[(i * Resolution + j) * 2 + 1] = 0.0f;
}
}
devData.CopyToDevice(fData);
CudaFFTPlan1D plan1D = new CudaFFTPlan1D(Resolution * 2, cufftType.C2C, Resolution * 2);
plan1D.Exec(devData.DevicePointer, TransformDirection.Forward);
cuKernel.GridDimensions = new ManagedCuda.VectorTypes.dim3(Resolution / 256, Resolution, 1);
cuKernel.BlockDimensions = new ManagedCuda.VectorTypes.dim3(256, 1, 1);
cuKernel.Run(devData.DevicePointer, copy_devData.DevicePointer, Resolution);
devData.CopyToHost(result);
for (i = 0; i < Resolution; i++)
{
for (j = 0; j < Resolution; j++)
{
ResultData[i, j, 0] = result[(i * Resolution + j) * 2];
ResultData[i, j, 1] = result[(i * Resolution + j) * 2 + 1];
}
}
ctx.FreeMemory(devData.DevicePointer);
ctx.FreeMemory(copy_devData.DevicePointer);
}
内核代码
//Includes for IntelliSense
#define _SIZE_T_DEFINED
#ifndef __CUDACC__
#define __CUDACC__
#endif
#ifndef __cplusplus
#define __cplusplus
#endif
#include <cuda.h>
#include <device_launch_parameters.h>
#include <texture_fetch_functions.h>
#include "float.h"
#include <builtin_types.h>
#include <vector_functions.h>
// Texture reference
texture<float2, 2> texref;
extern "C"
{
__global__ void cu_ArrayInversion(float* data_A, float* data_B, int Resolution)
{
int image_x = blockIdx.x * blockDim.x + threadIdx.x;
int image_y = blockIdx.y;
data_B[(Resolution * image_x + image_y) * 2] = data_A[(Resolution * image_y + image_x) * 2];
data_B[(Resolution * image_x + image_y) * 2 + 1] = data_A[(Resolution * image_y + image_x) * 2 + 1];
}
}
但是这个程序运行不佳。 发生以下错误:
ErrorLaunchFailed:执行内核时设备出现异常。常见原因包括取消引用无效的设备指针和访问越界共享内存。 上下文无法使用,因此必须销毁它(并且应该创建一个新的)。 来自此上下文的所有现有设备内存分配都是无效的,如果程序要继续使用 CUDA,则必须重建。
FFT 计划以元素的数量(即复数的数量)作为参数。因此,删除计划构造函数第一个参数中的 * 2
。而且批次数的两倍也没有意义...
此外,我将使用 float2
或 cuFloatComplex
类型(在 ManagedCuda.VectorTypes
中)来表示复数而不是两个原始浮点数。要释放内存,请使用 CudaDeviceVariable 的 Dispose 方法。否则稍后会被 GC 内部调用。
主机代码将如下所示:
int Resolution = 512;
CudaContext ctx = new CudaContext(0);
CudaKernel cuKernel = ctx.LoadKernel("kernel.ptx", "cu_ArrayInversion");
//float2 or cuFloatComplex
float2[] fData = new float2[Resolution * Resolution];
float2[] result = new float2[Resolution * Resolution];
CudaDeviceVariable<float2> devData = new CudaDeviceVariable<float2>(Resolution * Resolution);
CudaDeviceVariable<float2> copy_devData = new CudaDeviceVariable<float2>(Resolution * Resolution);
int i, j;
Random rnd = new Random();
double avrg = 0.0;
for (i = 0; i < Resolution; i++)
{
for (j = 0; j < Resolution; j++)
{
fData[(i * Resolution + j)].x = i + j * 2;
fData[(i * Resolution + j)].y = 0.0f;
}
}
devData.CopyToDevice(fData);
//Only Resolution times in X and Resolution batches
CudaFFTPlan1D plan1D = new CudaFFTPlan1D(Resolution, cufftType.C2C, Resolution);
plan1D.Exec(devData.DevicePointer, TransformDirection.Forward);
cuKernel.GridDimensions = new ManagedCuda.VectorTypes.dim3(Resolution / 256, Resolution, 1);
cuKernel.BlockDimensions = new ManagedCuda.VectorTypes.dim3(256, 1, 1);
cuKernel.Run(devData.DevicePointer, copy_devData.DevicePointer, Resolution);
devData.CopyToHost(result);
for (i = 0; i < Resolution; i++)
{
for (j = 0; j < Resolution; j++)
{
//ResultData[i, j, 0] = result[(i * Resolution + j)].x;
//ResultData[i, j, 1] = result[(i * Resolution + j)].y;
}
}
//And better free memory using Dispose()
//ctx.FreeMemory is only meant for raw device pointers obtained from somewhere else...
devData.Dispose();
copy_devData.Dispose();
plan1D.Dispose();
//For Cuda Memory checker and profiler:
CudaContext.ProfilerStop();
ctx.Dispose();
感谢您的建议。
我尝试了建议的代码。 但是,错误仍然存在。 (错误:ErrorLaunchFailed:执行内核时设备发生异常。常见原因包括取消引用无效设备指针和访问越界共享内存。无法使用上下文,因此必须销毁它(并且应该创建一个新的已创建)。来自此上下文的所有现有设备内存分配都是无效的,如果程序要继续使用 CUDA,则必须重建。)
为了使用float2,我修改了cu代码如下
extern "C"
{
__global__ void cu_ArrayInversion(float2* data_A, float2* data_B, int Resolution)
{
int image_x = blockIdx.x * blockDim.x + threadIdx.x;
int image_y = blockIdx.y;
data_B[(Resolution * image_x + image_y)].x = data_A[(Resolution * image_y + image_x)].x;
data_B[(Resolution * image_x + image_y)].y = data_A[(Resolution * image_y + image_x)].y;
}
当程序执行 "cuKernel.Run" 时,进程停止。
ptx 文件
.version 4.3
.target sm_20
.address_size 32
// .globl cu_ArrayInversion
.global .texref texref;
.visible .entry cu_ArrayInversion(
.param .u32 cu_ArrayInversion_param_0,
.param .u32 cu_ArrayInversion_param_1,
.param .u32 cu_ArrayInversion_param_2
)
{
.reg .f32 %f<5>;
.reg .b32 %r<17>;
ld.param.u32 %r1, [cu_ArrayInversion_param_0];
ld.param.u32 %r2, [cu_ArrayInversion_param_1];
ld.param.u32 %r3, [cu_ArrayInversion_param_2];
cvta.to.global.u32 %r4, %r2;
cvta.to.global.u32 %r5, %r1;
mov.u32 %r6, %ctaid.x;
mov.u32 %r7, %ntid.x;
mov.u32 %r8, %tid.x;
mad.lo.s32 %r9, %r7, %r6, %r8;
mov.u32 %r10, %ctaid.y;
mad.lo.s32 %r11, %r10, %r3, %r9;
shl.b32 %r12, %r11, 3;
add.s32 %r13, %r5, %r12;
mad.lo.s32 %r14, %r9, %r3, %r10;
shl.b32 %r15, %r14, 3;
add.s32 %r16, %r4, %r15;
ld.global.v2.f32 {%f1, %f2}, [%r13];
st.global.v2.f32 [%r16], {%f1, %f2};
ret;
}
感谢留言
主机代码
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using System.Windows.Forms;
using System.Drawing.Imaging;
using ManagedCuda;
using ManagedCuda.CudaFFT;
using ManagedCuda.VectorTypes;
namespace WFA_CUDA_FFT
{
public partial class CuFFTMain : Form
{
float[, ,] FFTData2D;
int Resolution;
const int cuda_blockNum = 256;
public CuFFTMain()
{
InitializeComponent();
Resolution = 1024;
}
private void button1_Click(object sender, EventArgs e)
{
cuFFTreconstruct();
}
public void cuFFTreconstruct()
{
CudaContext ctx = new CudaContext(0);
ManagedCuda.BasicTypes.CUmodule cumodule = ctx.LoadModule("kernel.ptx");
CudaKernel cuKernel = new CudaKernel("cu_ArrayInversion", cumodule, ctx);
float2[] fData = new float2[Resolution * Resolution];
float2[] result = new float2[Resolution * Resolution];
FFTData2D = new float[Resolution, Resolution, 2];
CudaDeviceVariable<float2> devData = new CudaDeviceVariable<float2>(Resolution * Resolution);
CudaDeviceVariable<float2> copy_devData = new CudaDeviceVariable<float2>(Resolution * Resolution);
int i, j;
Random rnd = new Random();
double avrg = 0.0;
for (i = 0; i < Resolution; i++)
{
for (j = 0; j < Resolution; j++)
{
fData[i * Resolution + j].x = i + j * 2;
avrg += fData[i * Resolution + j].x;
fData[i * Resolution + j].y = 0.0f;
}
}
avrg = avrg / (double)(Resolution * Resolution);
for (i = 0; i < Resolution; i++)
{
for (j = 0; j < Resolution; j++)
{
fData[(i * Resolution + j)].x = fData[(i * Resolution + j)].x - (float)avrg;
}
}
devData.CopyToDevice(fData);
CudaFFTPlan1D plan1D = new CudaFFTPlan1D(Resolution, cufftType.C2C, Resolution);
plan1D.Exec(devData.DevicePointer, TransformDirection.Forward);
cuKernel.GridDimensions = new ManagedCuda.VectorTypes.dim3(Resolution / cuda_blockNum, Resolution, 1);
cuKernel.BlockDimensions = new ManagedCuda.VectorTypes.dim3(cuda_blockNum, 1, 1);
cuKernel.Run(devData.DevicePointer, copy_devData.DevicePointer, Resolution);
copy_devData.CopyToHost(result);
for (i = 0; i < Resolution; i++)
{
for (j = 0; j < Resolution; j++)
{
FFTData2D[i, j, 0] = result[i * Resolution + j].x;
FFTData2D[i, j, 1] = result[i * Resolution + j].y;
}
}
//Clean up
devData.Dispose();
copy_devData.Dispose();
plan1D.Dispose();
CudaContext.ProfilerStop();
ctx.Dispose();
}
}
}
内核代码
//Includes for IntelliSense
#define _SIZE_T_DEFINED
#ifndef __CUDACC__
#define __CUDACC__
#endif
#ifndef __cplusplus
#define __cplusplus
#endif
#include <cuda.h>
#include <device_launch_parameters.h>
#include <texture_fetch_functions.h>
#include "float.h"
#include <builtin_types.h>
#include <vector_functions.h>
#include <vector>
// Texture reference
texture<float2, 2> texref;
extern "C"
{
// Device code
__global__ void cu_ArrayInversion(float2* data_A, float2* data_B, int Resolution)
{
int image_x = blockIdx.x * blockDim.x + threadIdx.x;
int image_y = blockIdx.y;
data_B[(Resolution * image_x + image_y)].y = data_A[(Resolution * image_y + image_x)].x;
data_B[(Resolution * image_x + image_y)].x = data_A[(Resolution * image_y + image_x)].y;
}
}
首先我用.Net4.5编译。 该程序无法运行,并显示错误 (System.BadImageFormatException)。 然而当FFT函数被注释掉时,内核程序运行。
其次,我从 .Net 4.5 更改为 .Net 4.0。 FFT 函数有效,但内核无效 运行 并显示错误。
我的电脑是 windows 8.1 pro,我用的是 visual studio 2013.