为什么这个基数排序 CUDA 代码只对 32 个元素进行排序？

Question

当我在 Visual Studio 15 中执行这段代码时，它只对 32 个元素进行排序。如果我将 WSIZE 设置为大于 32 或小于 32，它会显示与输出相同的未排序元素。谁能帮帮我？

我的系统信息。

处理器 - Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz，2400 Mhz，2 个核心，4 个逻辑处理器

内存 - 8GB

专用显卡 - NVIDIA GeForce 940M 4GB（384 CUDA 内核）

这是 WSIZE 设置为 32 的输出 https://i.imgur.com/geQ9VUZ.jpg

这是 WSIZE 设置为 19 的输出 https://i.imgur.com/4K0xkep.jpg

这是 WSIZE 设置为 50 的输出 https://i.imgur.com/M8irQhs.jpg

#pragma once
#ifdef __INTELLISENSE__
void __syncthreads();
#endif

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <Windows.h>
#include <stdlib.h>
#include <stdio.h>
#include <iostream>
#include <chrono>

using namespace std;
using namespace std::chrono;

#define WSIZE 32 /*to set No. of elements to sort and also set No. of threads*/
#define LOOPS 2
#define UPPER_BIT 4
#define LOWER_BIT 0

__device__ unsigned int ddata[WSIZE];
__device__ int ddata_s[WSIZE];

__global__ void parallelRadix()
{
    // This data in shared memory
    __shared__ volatile unsigned int sdata[WSIZE];

    // Load from global into shared variable
    sdata[threadIdx.x] = ddata[threadIdx.x];

    unsigned int bitmask = 1;
    unsigned int offset = 0;
    // -1, -2, -4, -8, -16, -32, -64, -128, -256,...
    unsigned int thrmask = 0xFFFFFFFFU << threadIdx.x;
    unsigned int mypos;

    // For each LSB to MSB
    for (int i = LOWER_BIT; i < UPPER_BIT; i++)
    {
        unsigned int mydata = sdata[((WSIZE - 1) - threadIdx.x) + offset];
        unsigned int mybit = mydata&bitmask;
        // Get population of ones and zeroes
        unsigned int ones = __ballot(mybit);
        unsigned int zeroes = ~ones;
        // Switch ping-pong buffers
        offset ^= WSIZE;

        // Do zeroes, then ones
        if (!mybit)
        {
            mypos = __popc(zeroes&thrmask);
        }
        else {      // Threads with a one bit
                    // Get my position in ping-pong buffer
            mypos = __popc(zeroes) + __popc(ones&thrmask);
        }

        // Move to buffer  (or use shfl for cc 3.0)
        sdata[mypos - 1 + offset] = mydata;
        // Repeat for next bit
        bitmask <<= 1;
    }
    // Put results to global
    ddata[threadIdx.x] = sdata[threadIdx.x + offset];
}

int main() {

    /* Parallel Radix Sort */

    unsigned int hdata[WSIZE];
    float totalTime = 0;
    LARGE_INTEGER cicles;
    for (int lcount = 0; lcount < LOOPS; lcount++)
    {
        //srand(time(NULL));
        // Array elements have value in range of 1024
        unsigned int range = 1U << UPPER_BIT;

        // Fill array with random elements
        // Range = 1024
        //srand(time(0));
        QueryPerformanceCounter(&cicles);
        srand(cicles.QuadPart);
        printf("\n input array  %d\n", lcount);
        for (int i = 0; i < WSIZE; i++)
        {
            hdata[i] = rand() % range;
            printf("%u ", hdata[i]);

        }


        // Copy data from host to device
        cudaMemcpyToSymbol(ddata, hdata, WSIZE * sizeof(unsigned int));

        // Execution time measurement, that point starts the clock
        high_resolution_clock::time_point t1 = high_resolution_clock::now();
        parallelRadix << < 1, WSIZE >> >();
        // Make kernel function synchronous
        cudaDeviceSynchronize();
        // Execution time measurement, that point stops the clock
        high_resolution_clock::time_point t2 = high_resolution_clock::now();

        // Execution time measurement, that is the result
        auto duration = duration_cast<milliseconds>(t2 - t1).count();

        // Summination of each loops' execution time
        totalTime += (float)duration / 1000.00;

        // Copy data from device to host
        cudaMemcpyFromSymbol(hdata, ddata, WSIZE * sizeof(unsigned int));

        printf("\n sorted array %d\n", lcount);
        for (int i = 0; i < WSIZE; i++)
            printf("%u ", hdata[i]);


    }
    printf("\n\n");
    printf("Parallel Radix Sort:\n");
    printf("Array size = %d\n", WSIZE * LOOPS);
    printf("Time elapsed = %fseconds\n", totalTime);

}

Answer 1

代码来自here。我写的。

它仅设计用于对 32 个元素进行排序，因为它设计用于在 CUDA 中仅使用单个 warp，即 32 个线程。它从未被设计或打算对超过 32 个元素进行排序，并且它在 warp 级别使用的许多技巧必须重新编写才能在大于 warp 的规模上工作。 WSIZE 代表 WARP SIZE，在 CUDA 中，warp 的大小是 32 个线程。您无法更改扭曲大小。

如果要对更大的数据集进行排序，请使用来自 thrust or cub 的操作。

为什么这个基数排序 CUDA 代码只对 32 个元素进行排序？

Why this Radix-sort CUDA code sorts only 32 elements?

sorting

cuda

gpu

radix-sort

visual-studio