使用GPU的FFT计算:无法用递归编译程序
FFT calculation using GPU: unable to compile program with recursion
我正在尝试学习 GPU 编程。我的系统环境如下:
OS: windows 10 pro
GPU: NVIDIA GTX 1080 Ti (display does not run on this; there is another gpu for that)
CUDA toolkit: v9.1
我使用 CUDA 编写了这个简单的程序,在 GPU 上从头开始计算 FFT。该算法参照维基百科上 Cooley-Tukey 算法的示例实现。代码使用递归函数来计算复数数组的 FFT。
#include <iostream>
#include <string>
#include "conio.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <thrust\complex.h>
#include <cstdio>
#include <fstream>
using namespace std;
#define winSize 2048
#define winShift 1024
#define M_PI 3.14159265358979323846
// Reorders a[0..n) in place so that the even-indexed elements occupy the
// lower half and the odd-indexed elements occupy the upper half, each half
// keeping its original relative order.
//
// a: array of n complex values, rewritten in place.
// n: element count; n / 2 elements are moved into each half.
__device__ void separate(thrust::complex<double>* a, int n)
{
    const int half = n / 2;

    // Temporary storage for the odd elements, obtained with operator new[].
    thrust::complex<double>* odd = new thrust::complex<double>[half];

    for (int i = 0; i < half; i++) // stash all odd elements
        odd[i] = a[i * 2 + 1];
    for (int i = 0; i < half; i++) // compact all even elements into the lower half
        a[i] = a[i * 2];
    for (int i = 0; i < half; i++) // place the stashed odd elements in the upper half
        a[i + half] = odd[i];

    // Storage obtained with new[] must be released with delete[];
    // cudaFree() is only for memory obtained through cudaMalloc().
    delete[] odd;
}
// In-place recursive radix-2 Cooley-Tukey FFT using nested kernel launches.
//
// X: N complex samples on entry, their FFT on return.
// N: number of samples. Must be a power of 2; this is not checked.
//
// For real-valued input only the first N/2 results carry independent
// information; the upper half of X[] mirrors them.
//
// Launch layout: the kernel never reads threadIdx/blockIdx, so every thread
// would redo the same work on the same data. Launch it as <<<1, 1>>>.
//
// Requirements: device-side kernel launches need relocatable device code
// (-rdc=true) and a device of compute capability 3.5 or higher. The kernel
// nests log2(N) levels deep and waits on its children at every level, so the
// host must raise cudaLimitDevRuntimeSyncDepth accordingly before launching.
// Device-side cudaDeviceSynchronize() is used here as provided by the CUDA 9.x
// device runtime.
__global__ void fft2(thrust::complex<double>* X, int N)
{
    if (N < 2)
    {
        // bottom of recursion.
        // Do nothing here, because already X[0] = x[0]
        return;
    }

    const int half = N / 2;

    separate(X, N); // all evens to lower half, all odds to upper half
    fft2 << <1, 1 >> >(X, half);        // recurse even items
    fft2 << <1, 1 >> >(X + half, half); // recurse odd items

    // Child grids execute asynchronously with respect to this thread. Their
    // results must not be read until they have finished, so check the launch
    // and wait before combining.
    if (cudaGetLastError() != cudaSuccess || cudaDeviceSynchronize() != cudaSuccess)
    {
        printf("fft2: child launch or synchronization failed at N=%d\n", N);
        return;
    }

    // combine results of two half recursions
    for (int k = 0; k < half; k++)
    {
        thrust::complex<double> e = X[k];        // even
        thrust::complex<double> o = X[k + half]; // odd
        // w is the "twiddle-factor"
        thrust::complex<double> w = exp(thrust::complex<double>(0, -2.*M_PI*k / N));
        X[k] = e + w * o;
        X[k + half] = e - w * o;
    }
}
// Reports a failed CUDA runtime call on stderr.
// Returns true when status is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char* what)
{
    if (status == cudaSuccess)
        return true;
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Builds a test signal, runs fft2 on the GPU, prints the input samples and the
// magnitude of each FFT bin, and writes the same table to example_cuda.txt.
// Returns 0 on success and 1 if any CUDA call fails.
int main()
{
    const int nSamples = 64;
    double nSeconds = 0.02; // total time for sampling
    double sampleRate = nSamples / nSeconds; // n Hz = n / second
    double freqResolution = sampleRate / nSamples; // freq step in FFT result
    thrust::complex<double> x[nSamples]; // storage for sample data
    thrust::complex<double> X[nSamples]; // storage for FFT answer
    thrust::complex<double> *d_arr1 = NULL; // device copy of X[]
    const int nFreqs = 5;
    double freq[nFreqs] = { 2,4,8,32,72 }; // known freqs for testing
    // Size the buffer with the element type that is actually stored.
    size_t n_byte = nSamples * sizeof(thrust::complex<double>);

    // generate samples for testing
    for (int i = 0; i < nSamples; i++)
    {
        x[i] = thrust::complex<double>(0., 0.);
        // sum several known sinusoids into x[]
        // NOTE(review): with the "/ nSamples" factor commented out, the argument
        // is always an integer multiple of 2*pi, so every sample is ~0.
        // Confirm the intended time base before trusting the spectrum.
        for (int j = 0; j < nFreqs; j++)
            x[i] += sin(2 * M_PI*freq[j] * i); // / nSamples);
        X[i] = x[i]; // copy into X[] for FFT work & result
    }

    // fft2 halves N at each nested launch, so it recurses log2(nSamples)
    // levels. Allow device-side synchronization at every one of those levels.
    int syncDepth = 1;
    for (int n = nSamples; n > 1; n /= 2)
        syncDepth++;

    // compute fft for this data
    bool ok = cudaOk(cudaDeviceSetLimit(cudaLimitDevRuntimeSyncDepth, syncDepth), "cudaDeviceSetLimit")
        && cudaOk(cudaMalloc((void**)&d_arr1, n_byte), "cudaMalloc")
        && cudaOk(cudaMemcpy(d_arr1, X, n_byte, cudaMemcpyHostToDevice), "cudaMemcpy (host to device)");
    if (ok)
    {
        fft2 << <1, 1 >> > (d_arr1, nSamples);
        // A launch reports configuration errors through cudaGetLastError() and
        // execution errors at the next synchronizing call.
        ok = cudaOk(cudaGetLastError(), "fft2 launch")
            && cudaOk(cudaDeviceSynchronize(), "fft2 execution")
            && cudaOk(cudaMemcpy(X, d_arr1, n_byte, cudaMemcpyDeviceToHost), "cudaMemcpy (device to host)");
    }
    cudaFree(d_arr1); // d_arr1 is still NULL if the allocation never happened
    if (!ok)
        return 1;

    printf(" n\tx[]\tX[]\tf\n"); // header line
    // loop to print values
    for (int i = 0; i < nSamples; i++)
    {
        printf("% 3d\t%+.3f\t%+.3f\t%g\n",
            i, x[i].real(), abs(X[i]), i*freqResolution);
    }

    ofstream myfile("example_cuda.txt");
    printf("I am trying to write to file\n");
    if (myfile.is_open())
    {
        for (int count = 0; count < nSamples; count++)
        {
            myfile << x[count].real() << "," << abs(X[count]) << "," << count*freqResolution << "\n";
        }
        myfile.close();
    }
    return 0;
}
我使用以下命令使用 VS2015 命令提示符编译代码:
nvcc -o fft_Wiki2.exe -c -arch=compute_35 -rdc=true --expt-relaxed-constexpr --machine 64 -Xcompiler "/wd4819" fftWiki_2.cu
编译本身没有显示任何错误或警告,但生成的可执行文件无法运行。当我尝试运行
fft_Wiki2.exe
时,系统只提示该可执行文件的版本与当前 64 位 Windows 不兼容,因此无法执行。但是我已经使用了 --machine 64 选项来强制生成 64 位可执行文件。
如何让这个程序执行?
How do I get this program to execute ?
您尝试运行的并不是可执行程序,而是一个目标文件。
在您的编译命令中传递了 -c 选项:
nvcc -o fft_Wiki2.exe -c -arch=compute_35 -rdc=true --expt-relaxed-constexpr --machine 64 -Xcompiler "/wd4819" fftWiki_2.cu
这意味着只有编译没有链接。你需要做的是这样的:
nvcc -o fft_Wiki2.obj -c -arch=compute_35 -rdc=true --expt-relaxed-constexpr --machine 64 -Xcompiler "/wd4819" fftWiki_2.cu
nvcc -o fft_Wiki2.exe -arch=compute_35 --expt-relaxed-constexpr --machine 64 -Xcompiler "/wd4819" fftWiki_2.obj
[注意我无法访问 Windows 开发平台来检查命令的准确性]
第一个命令负责编译并生成目标文件;第二个命令执行主机代码和设备代码的链接,并生成一个您应该能够运行的可执行文件。
我正在尝试学习 GPU 编程。我的系统环境如下:
OS: windows 10 pro
GPU: NVIDIA GTX 1080 Ti (display does not run on this; there is another gpu for that)
CUDA toolkit: v9.1
我使用 CUDA 编写了这个简单的程序,在 GPU 上从头开始计算 FFT。该算法参照维基百科上 Cooley-Tukey 算法的示例实现。代码使用递归函数来计算复数数组的 FFT。
#include <iostream>
#include <string>
#include "conio.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <thrust\complex.h>
#include <cstdio>
#include <fstream>
using namespace std;
#define winSize 2048
#define winShift 1024
#define M_PI 3.14159265358979323846
// Reorders a[0..n) in place so that the even-indexed elements occupy the
// lower half and the odd-indexed elements occupy the upper half, each half
// keeping its original relative order.
//
// a: array of n complex values, rewritten in place.
// n: element count; n / 2 elements are moved into each half.
__device__ void separate(thrust::complex<double>* a, int n)
{
    const int half = n / 2;

    // Temporary storage for the odd elements, obtained with operator new[].
    thrust::complex<double>* odd = new thrust::complex<double>[half];

    for (int i = 0; i < half; i++) // stash all odd elements
        odd[i] = a[i * 2 + 1];
    for (int i = 0; i < half; i++) // compact all even elements into the lower half
        a[i] = a[i * 2];
    for (int i = 0; i < half; i++) // place the stashed odd elements in the upper half
        a[i + half] = odd[i];

    // Storage obtained with new[] must be released with delete[];
    // cudaFree() is only for memory obtained through cudaMalloc().
    delete[] odd;
}
// In-place recursive radix-2 Cooley-Tukey FFT using nested kernel launches.
//
// X: N complex samples on entry, their FFT on return.
// N: number of samples. Must be a power of 2; this is not checked.
//
// For real-valued input only the first N/2 results carry independent
// information; the upper half of X[] mirrors them.
//
// Launch layout: the kernel never reads threadIdx/blockIdx, so every thread
// would redo the same work on the same data. Launch it as <<<1, 1>>>.
//
// Requirements: device-side kernel launches need relocatable device code
// (-rdc=true) and a device of compute capability 3.5 or higher. The kernel
// nests log2(N) levels deep and waits on its children at every level, so the
// host must raise cudaLimitDevRuntimeSyncDepth accordingly before launching.
// Device-side cudaDeviceSynchronize() is used here as provided by the CUDA 9.x
// device runtime.
__global__ void fft2(thrust::complex<double>* X, int N)
{
    if (N < 2)
    {
        // bottom of recursion.
        // Do nothing here, because already X[0] = x[0]
        return;
    }

    const int half = N / 2;

    separate(X, N); // all evens to lower half, all odds to upper half
    fft2 << <1, 1 >> >(X, half);        // recurse even items
    fft2 << <1, 1 >> >(X + half, half); // recurse odd items

    // Child grids execute asynchronously with respect to this thread. Their
    // results must not be read until they have finished, so check the launch
    // and wait before combining.
    if (cudaGetLastError() != cudaSuccess || cudaDeviceSynchronize() != cudaSuccess)
    {
        printf("fft2: child launch or synchronization failed at N=%d\n", N);
        return;
    }

    // combine results of two half recursions
    for (int k = 0; k < half; k++)
    {
        thrust::complex<double> e = X[k];        // even
        thrust::complex<double> o = X[k + half]; // odd
        // w is the "twiddle-factor"
        thrust::complex<double> w = exp(thrust::complex<double>(0, -2.*M_PI*k / N));
        X[k] = e + w * o;
        X[k + half] = e - w * o;
    }
}
// Reports a failed CUDA runtime call on stderr.
// Returns true when status is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char* what)
{
    if (status == cudaSuccess)
        return true;
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Builds a test signal, runs fft2 on the GPU, prints the input samples and the
// magnitude of each FFT bin, and writes the same table to example_cuda.txt.
// Returns 0 on success and 1 if any CUDA call fails.
int main()
{
    const int nSamples = 64;
    double nSeconds = 0.02; // total time for sampling
    double sampleRate = nSamples / nSeconds; // n Hz = n / second
    double freqResolution = sampleRate / nSamples; // freq step in FFT result
    thrust::complex<double> x[nSamples]; // storage for sample data
    thrust::complex<double> X[nSamples]; // storage for FFT answer
    thrust::complex<double> *d_arr1 = NULL; // device copy of X[]
    const int nFreqs = 5;
    double freq[nFreqs] = { 2,4,8,32,72 }; // known freqs for testing
    // Size the buffer with the element type that is actually stored.
    size_t n_byte = nSamples * sizeof(thrust::complex<double>);

    // generate samples for testing
    for (int i = 0; i < nSamples; i++)
    {
        x[i] = thrust::complex<double>(0., 0.);
        // sum several known sinusoids into x[]
        // NOTE(review): with the "/ nSamples" factor commented out, the argument
        // is always an integer multiple of 2*pi, so every sample is ~0.
        // Confirm the intended time base before trusting the spectrum.
        for (int j = 0; j < nFreqs; j++)
            x[i] += sin(2 * M_PI*freq[j] * i); // / nSamples);
        X[i] = x[i]; // copy into X[] for FFT work & result
    }

    // fft2 halves N at each nested launch, so it recurses log2(nSamples)
    // levels. Allow device-side synchronization at every one of those levels.
    int syncDepth = 1;
    for (int n = nSamples; n > 1; n /= 2)
        syncDepth++;

    // compute fft for this data
    bool ok = cudaOk(cudaDeviceSetLimit(cudaLimitDevRuntimeSyncDepth, syncDepth), "cudaDeviceSetLimit")
        && cudaOk(cudaMalloc((void**)&d_arr1, n_byte), "cudaMalloc")
        && cudaOk(cudaMemcpy(d_arr1, X, n_byte, cudaMemcpyHostToDevice), "cudaMemcpy (host to device)");
    if (ok)
    {
        fft2 << <1, 1 >> > (d_arr1, nSamples);
        // A launch reports configuration errors through cudaGetLastError() and
        // execution errors at the next synchronizing call.
        ok = cudaOk(cudaGetLastError(), "fft2 launch")
            && cudaOk(cudaDeviceSynchronize(), "fft2 execution")
            && cudaOk(cudaMemcpy(X, d_arr1, n_byte, cudaMemcpyDeviceToHost), "cudaMemcpy (device to host)");
    }
    cudaFree(d_arr1); // d_arr1 is still NULL if the allocation never happened
    if (!ok)
        return 1;

    printf(" n\tx[]\tX[]\tf\n"); // header line
    // loop to print values
    for (int i = 0; i < nSamples; i++)
    {
        printf("% 3d\t%+.3f\t%+.3f\t%g\n",
            i, x[i].real(), abs(X[i]), i*freqResolution);
    }

    ofstream myfile("example_cuda.txt");
    printf("I am trying to write to file\n");
    if (myfile.is_open())
    {
        for (int count = 0; count < nSamples; count++)
        {
            myfile << x[count].real() << "," << abs(X[count]) << "," << count*freqResolution << "\n";
        }
        myfile.close();
    }
    return 0;
}
我使用以下命令使用 VS2015 命令提示符编译代码:
nvcc -o fft_Wiki2.exe -c -arch=compute_35 -rdc=true --expt-relaxed-constexpr --machine 64 -Xcompiler "/wd4819" fftWiki_2.cu
编译本身没有显示任何错误或警告,但生成的可执行文件无法运行。当我尝试运行
fft_Wiki2.exe
时,系统只提示该可执行文件的版本与当前 64 位 Windows 不兼容,因此无法执行。但是我已经使用了 --machine 64 选项来强制生成 64 位可执行文件。
如何让这个程序执行?
How do I get this program to execute ?
您尝试运行的并不是可执行程序,而是一个目标文件。
在您的编译命令中传递了 -c 选项:
nvcc -o fft_Wiki2.exe -c -arch=compute_35 -rdc=true --expt-relaxed-constexpr --machine 64 -Xcompiler "/wd4819" fftWiki_2.cu
这意味着只有编译没有链接。你需要做的是这样的:
nvcc -o fft_Wiki2.obj -c -arch=compute_35 -rdc=true --expt-relaxed-constexpr --machine 64 -Xcompiler "/wd4819" fftWiki_2.cu
nvcc -o fft_Wiki2.exe -arch=compute_35 --expt-relaxed-constexpr --machine 64 -Xcompiler "/wd4819" fftWiki_2.obj
[注意我无法访问 Windows 开发平台来检查命令的准确性]
第一个命令负责编译并生成目标文件;第二个命令执行主机代码和设备代码的链接,并生成一个您应该能够运行的可执行文件。