为什么这个基数排序 CUDA 代码只对 32 个元素进行排序?
Why this Radix-sort CUDA code sorts only 32 elements?
当我在 Visual Studio 15 中执行这段代码时,它只对 32 个元素进行排序。如果我将 WSIZE 设置为大于 32 或小于 32,它会显示与输出相同的未排序元素。谁能帮帮我?
我的系统信息。
处理器 - Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz,2400 Mhz,2 个核心,4 个逻辑处理器
内存 - 8GB
专用显卡 - NVIDIA GeForce 940M 4GB(384 CUDA 内核)
这是 WSIZE 设置为 32 的输出 https://i.imgur.com/geQ9VUZ.jpg
这是 WSIZE 设置为 19 的输出 https://i.imgur.com/4K0xkep.jpg
这是 WSIZE 设置为 50 的输出 https://i.imgur.com/M8irQhs.jpg
#pragma once
#ifdef __INTELLISENSE__
void __syncthreads();
#endif
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <Windows.h>
#include <stdlib.h>
#include <stdio.h>
#include <iostream>
#include <chrono>
using namespace std;
using namespace std::chrono;
#define WSIZE 32 /*to set No. of elements to sort and also set No. of threads*/
#define LOOPS 2
#define UPPER_BIT 4
#define LOWER_BIT 0
__device__ unsigned int ddata[WSIZE];
__device__ int ddata_s[WSIZE];
__global__ void parallelRadix()
{
// This data in shared memory
__shared__ volatile unsigned int sdata[WSIZE];
// Load from global into shared variable
sdata[threadIdx.x] = ddata[threadIdx.x];
unsigned int bitmask = 1;
unsigned int offset = 0;
// -1, -2, -4, -8, -16, -32, -64, -128, -256,...
unsigned int thrmask = 0xFFFFFFFFU << threadIdx.x;
unsigned int mypos;
// For each LSB to MSB
for (int i = LOWER_BIT; i < UPPER_BIT; i++)
{
unsigned int mydata = sdata[((WSIZE - 1) - threadIdx.x) + offset];
unsigned int mybit = mydata&bitmask;
// Get population of ones and zeroes
unsigned int ones = __ballot(mybit);
unsigned int zeroes = ~ones;
// Switch ping-pong buffers
offset ^= WSIZE;
// Do zeroes, then ones
if (!mybit)
{
mypos = __popc(zeroes&thrmask);
}
else { // Threads with a one bit
// Get my position in ping-pong buffer
mypos = __popc(zeroes) + __popc(ones&thrmask);
}
// Move to buffer (or use shfl for cc 3.0)
sdata[mypos - 1 + offset] = mydata;
// Repeat for next bit
bitmask <<= 1;
}
// Put results to global
ddata[threadIdx.x] = sdata[threadIdx.x + offset];
}
int main() {
/* Parallel Radix Sort */
unsigned int hdata[WSIZE];
float totalTime = 0;
LARGE_INTEGER cicles;
for (int lcount = 0; lcount < LOOPS; lcount++)
{
//srand(time(NULL));
// Array elements have value in range of 1024
unsigned int range = 1U << UPPER_BIT;
// Fill array with random elements
// Range = 1024
//srand(time(0));
QueryPerformanceCounter(&cicles);
srand(cicles.QuadPart);
printf("\n input array %d\n", lcount);
for (int i = 0; i < WSIZE; i++)
{
hdata[i] = rand() % range;
printf("%u ", hdata[i]);
}
// Copy data from host to device
cudaMemcpyToSymbol(ddata, hdata, WSIZE * sizeof(unsigned int));
// Execution time measurement, that point starts the clock
high_resolution_clock::time_point t1 = high_resolution_clock::now();
parallelRadix << < 1, WSIZE >> >();
// Make kernel function synchronous
cudaDeviceSynchronize();
// Execution time measurement, that point stops the clock
high_resolution_clock::time_point t2 = high_resolution_clock::now();
// Execution time measurement, that is the result
auto duration = duration_cast<milliseconds>(t2 - t1).count();
// Summination of each loops' execution time
totalTime += (float)duration / 1000.00;
// Copy data from device to host
cudaMemcpyFromSymbol(hdata, ddata, WSIZE * sizeof(unsigned int));
printf("\n sorted array %d\n", lcount);
for (int i = 0; i < WSIZE; i++)
printf("%u ", hdata[i]);
}
printf("\n\n");
printf("Parallel Radix Sort:\n");
printf("Array size = %d\n", WSIZE * LOOPS);
printf("Time elapsed = %fseconds\n", totalTime);
}
当我在 Visual Studio 15 中执行这段代码时,它只对 32 个元素进行排序。如果我将 WSIZE 设置为大于 32 或小于 32,它会显示与输出相同的未排序元素。谁能帮帮我?
我的系统信息。
处理器 - Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz,2400 Mhz,2 个核心,4 个逻辑处理器
内存 - 8GB
专用显卡 - NVIDIA GeForce 940M 4GB(384 CUDA 内核)
这是 WSIZE 设置为 32 的输出 https://i.imgur.com/geQ9VUZ.jpg
这是 WSIZE 设置为 19 的输出 https://i.imgur.com/4K0xkep.jpg
这是 WSIZE 设置为 50 的输出 https://i.imgur.com/M8irQhs.jpg
#pragma once
#ifdef __INTELLISENSE__
void __syncthreads();
#endif
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <Windows.h>
#include <stdlib.h>
#include <stdio.h>
#include <iostream>
#include <chrono>
using namespace std;
using namespace std::chrono;
#define WSIZE 32 /*to set No. of elements to sort and also set No. of threads*/
#define LOOPS 2
#define UPPER_BIT 4
#define LOWER_BIT 0
__device__ unsigned int ddata[WSIZE];
__device__ int ddata_s[WSIZE];
__global__ void parallelRadix()
{
// This data in shared memory
__shared__ volatile unsigned int sdata[WSIZE];
// Load from global into shared variable
sdata[threadIdx.x] = ddata[threadIdx.x];
unsigned int bitmask = 1;
unsigned int offset = 0;
// -1, -2, -4, -8, -16, -32, -64, -128, -256,...
unsigned int thrmask = 0xFFFFFFFFU << threadIdx.x;
unsigned int mypos;
// For each LSB to MSB
for (int i = LOWER_BIT; i < UPPER_BIT; i++)
{
unsigned int mydata = sdata[((WSIZE - 1) - threadIdx.x) + offset];
unsigned int mybit = mydata&bitmask;
// Get population of ones and zeroes
unsigned int ones = __ballot(mybit);
unsigned int zeroes = ~ones;
// Switch ping-pong buffers
offset ^= WSIZE;
// Do zeroes, then ones
if (!mybit)
{
mypos = __popc(zeroes&thrmask);
}
else { // Threads with a one bit
// Get my position in ping-pong buffer
mypos = __popc(zeroes) + __popc(ones&thrmask);
}
// Move to buffer (or use shfl for cc 3.0)
sdata[mypos - 1 + offset] = mydata;
// Repeat for next bit
bitmask <<= 1;
}
// Put results to global
ddata[threadIdx.x] = sdata[threadIdx.x + offset];
}
int main() {
/* Parallel Radix Sort */
unsigned int hdata[WSIZE];
float totalTime = 0;
LARGE_INTEGER cicles;
for (int lcount = 0; lcount < LOOPS; lcount++)
{
//srand(time(NULL));
// Array elements have value in range of 1024
unsigned int range = 1U << UPPER_BIT;
// Fill array with random elements
// Range = 1024
//srand(time(0));
QueryPerformanceCounter(&cicles);
srand(cicles.QuadPart);
printf("\n input array %d\n", lcount);
for (int i = 0; i < WSIZE; i++)
{
hdata[i] = rand() % range;
printf("%u ", hdata[i]);
}
// Copy data from host to device
cudaMemcpyToSymbol(ddata, hdata, WSIZE * sizeof(unsigned int));
// Execution time measurement, that point starts the clock
high_resolution_clock::time_point t1 = high_resolution_clock::now();
parallelRadix << < 1, WSIZE >> >();
// Make kernel function synchronous
cudaDeviceSynchronize();
// Execution time measurement, that point stops the clock
high_resolution_clock::time_point t2 = high_resolution_clock::now();
// Execution time measurement, that is the result
auto duration = duration_cast<milliseconds>(t2 - t1).count();
// Summination of each loops' execution time
totalTime += (float)duration / 1000.00;
// Copy data from device to host
cudaMemcpyFromSymbol(hdata, ddata, WSIZE * sizeof(unsigned int));
printf("\n sorted array %d\n", lcount);
for (int i = 0; i < WSIZE; i++)
printf("%u ", hdata[i]);
}
printf("\n\n");
printf("Parallel Radix Sort:\n");
printf("Array size = %d\n", WSIZE * LOOPS);
printf("Time elapsed = %fseconds\n", totalTime);
}