OpenCL 中的暴力破解(来自 CUDA 的端口)不起作用

bruteforce in OpenCL (port from CUDA) isn't working

*代码和问题的另一个更新*

刚开始学习 openCL 大约 1 周左右,我尝试移植一个关于暴力破解 MD5 哈希以从中获取实际字符串的 CUDA 程序。我使用 2 个文件:kernel.cl 和 main.cpp.

//this is kernel.cl

{...*defining some md5 variables*...}

void IncrementBruteGPU(unsigned char* ourBrute, unsigned int charSetLen, unsigned int bruteLength, unsigned int incrementBy){
int i = 0;
while(incrementBy > 0 && i < bruteLength)
{
    int add = incrementBy + ourBrute[i];
    ourBrute[i] = add % charSetLen;
    incrementBy = add / charSetLen;
    i++;
}}

void md5_vfy(unsigned char* data, unsigned int length, unsigned int *a1, unsigned int *b1, unsigned int *c1, unsigned int *d1){
{...*some md5 hashing function*...}}

__kernel void crack(unsigned int numThreads, unsigned int charSetLen,
                unsigned int bruteLength, unsigned int v1,
                unsigned int v2, unsigned int v3, unsigned int v4,
                __constant unsigned char *cudaBrute, 
                __constant unsigned char *cudaCharSet,
                __global unsigned char *correctPass){
//count index
unsigned int idx = get_global_id(0);
int totalLen = 0;
int bruteStart = 0;

unsigned char word[14];
unsigned char ourBrute[14];

int i = 0;

for(i = 0; i < 14; i++)
{
    ourBrute[i] = cudaBrute[i];
}

i = 0;
bruteStart = i;
i+= bruteLength;
totalLen = i;

IncrementBruteGPU(ourBrute, charSetLen, bruteLength, idx);
int timer = 0;
for(timer = 0; timer < 200; timer++)
{
    //substitute into string
    for(i = 0; i < bruteLength; i++)
    {
        word[i+bruteStart] = cudaCharSet[ourBrute[i]];
    }

    unsigned int c1 = 0, c2 = 0, c3 = 0, c4 = 0;
    //find MD5 hash from word
    md5_vfy(word,totalLen, &c1, &c2, &c3, &c4);

    //compare hash with the input one
    if(c1 == v1 && c2 == v2 && c3 == v3 && c4 == v4)
    {
        //place the right string into first index of array
        int j;
        for(j= 0; j < 14; j++)
        {
            correctPass[j] = word[j];
        }
        correctPass[totalLen] = 0;
    }
    IncrementBruteGPU(ourBrute, charSetLen, bruteLength, numThreads);
}}

这是主要的:

//just the main, not the entire main.cpp
int main( int argc, char** argv){
int digit=1;
int charSetLen = 0;
char hash[32];
char *strhash[32];

printf("Insert Hash: ");
scanf("%s", strhash);
system("cls");

int numThreads = BLOCKS * THREADS_PER_BLOCK;

unsigned char currentBrute[14];
unsigned char cpuCorrectPass[14];

ZeroFill(currentBrute, 14);
ZeroFill(cpuCorrectPass, 14);

charSetLen = 65;
unsigned char charSet[65];
memcpy(charSet, " abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789@_", charSetLen);
memcpy(hash, strhash, 32);

//break hash into 4 processes of MD5
unsigned int v1, v2, v3, v4;
md5_to_ints(hash,&v1,&v2,&v3,&v4);

//openCL starts here
cl_platform_id cpPlatform;        // OpenCL platform
cl_device_id device_id;           // device ID
cl_context context;               // context
cl_command_queue queue;           // command queue
cl_program program;               // program
cl_kernel kernel;                 // kernel

cl_int err;
cl_mem correctPass;
cl_mem cudaCharSet;
cl_mem cudaBrute;

size_t globalSize, localSize;
size_t bytes = 14*sizeof(char);

//5 work-groups
localSize = 10;
globalSize = 50;

 // Bind to platform
err = clGetPlatformIDs(1, &cpPlatform, NULL);
if(err < 0) {
  perror("Couldn't identify a platform");
  exit(1);
} 

// Get ID for the device
err = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 1, &device_id, NULL);
if(err == CL_DEVICE_NOT_FOUND) {
  err = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_CPU, 1, &device_id, NULL);
}
if(err < 0) {
  perror("Couldn't access any devices");
  exit(1);   
}

// Create a context  
context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
if(err < 0) {
  perror("Couldn't create a context");
  exit(1);   
}

// Create a command queue 
queue = clCreateCommandQueue(context, device_id, CL_QUEUE_PROFILING_ENABLE, &err);
if(err < 0) {
  perror("Couldn't create a command queue");
  exit(1);   
}

// Build the program executable 
program = build_program(context, device_id, PROGRAM_FILE);

 // Create the compute kernel in the program we wish to run
kernel = clCreateKernel(program, KERNEL_FUNC, &err);
if(err < 0) {
  perror("Couldn't create a kernel");
  exit(1);
}

// Create the input and output arrays in device memory for our calculation
cudaBrute = clCreateBuffer(context, CL_MEM_READ_ONLY, 14, NULL, NULL);
cudaCharSet = clCreateBuffer(context, CL_MEM_READ_ONLY, 95, NULL, NULL);
correctPass = clCreateBuffer(context, CL_MEM_READ_WRITE, 14, NULL, NULL);

// Write our data set into the input array in device memory
err = clEnqueueWriteBuffer(queue, correctPass, CL_TRUE, 0,
    bytes, cpuCorrectPass, 0, NULL, NULL);
err = clEnqueueWriteBuffer(queue, cudaCharSet, CL_TRUE, 0,
    bytes, charSet, 0, NULL, NULL);

// Set the arguments to our compute kernel
err  = clSetKernelArg(kernel, 0, sizeof(unsigned int), &numThreads);
err  |= clSetKernelArg(kernel, 1, sizeof(unsigned int), &charSetLen);
err  |= clSetKernelArg(kernel, 2, sizeof(unsigned int), &digit);
err  |= clSetKernelArg(kernel, 3, sizeof(unsigned int), &v1);
err  |= clSetKernelArg(kernel, 4, sizeof(unsigned int), &v2);
err  |= clSetKernelArg(kernel, 5, sizeof(unsigned int), &v3);
err  |= clSetKernelArg(kernel, 6, sizeof(unsigned int), &v4);
err  |= clSetKernelArg(kernel, 7, sizeof(cl_mem), &cudaBrute);
err  |= clSetKernelArg(kernel, 8, sizeof(cl_mem), &cudaCharSet);
err  |= clSetKernelArg(kernel, 9, sizeof(cl_mem), &correctPass);

bool finished = false;
int ct = 0;
while(true){
do{
    err = clEnqueueWriteBuffer(queue, cudaBrute, CL_TRUE, 0,
        bytes, currentBrute, 0, NULL, NULL);

// Execute the kernel over the entire range of the data set  
    err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &globalSize, &localSize,
                                                          0, NULL, NULL);

// Wait for the command queue to get serviced before reading back results
    clFinish(queue);

// Read the results from the device
    clEnqueueReadBuffer(queue, correctPass, CL_TRUE, 0, bytes, cpuCorrectPass, 0, NULL, NULL );

    if(cpuCorrectPass[0] != 0)
    {       
        printf("MD5 Cracked---->\t");
        int k = 0;
        while(cpuCorrectPass[k] != 0)
        {
            printf("%c", cpuCorrectPass[k]);
            k++;
        }
        printf("\n\n");
        return 0;
    }
    finished = BruteIncrement(currentBrute, charSetLen, digit, numThreads * 200);
    if(ct % OUTPUT_INTERVAL == 0)
    {
        printf("STATUS: ");
        int k = 0;
        for(k = 0; k < digit; k++)
            printf("%c",charSet[currentBrute[k]]);
        printf("\n");
    }
    ct++;
} while(!finished);
    digit=digit+1;
}   
// release OpenCL resources
clReleaseMemObject(correctPass);
clReleaseMemObject(cudaCharSet);
clReleaseMemObject(cudaBrute);
clReleaseProgram(program);
clReleaseKernel(kernel);
clReleaseCommandQueue(queue);
clReleaseContext(context);

return 0;}

这个程序的问题是它永远找不到正确的字符串比较暴力哈希和输入哈希的想法似乎行不通。我得到的 CUDA 版本完美运行。

请告诉我是什么导致这 运行 不正确。我怀疑内核根本不工作,或者我对 openCL 中的 read/write 内存和缓冲区缺乏了解,或者通常导致这个。

*如果您想查看所有文件,请问我。,因为我认为如果我 post 在这里会太长。 之前谢谢,抱歉格式错误。

您的内核正在从 OpenCL 内核源代码 (cudaBrutecudaCharSetcorrectPass) 的程序范围内定义的常量数组进行读写。这些数组未初始化,主机将永远无法从内核获得输出。要将输入数据从主机传输到内核并从内核检索结果,您需要使用内核参数,而不是程序范围变量。

您的内核定义应如下所示:

__kernel void crack(unsigned int numThreads, unsigned int charSetLen,
                    unsigned int bruteLength, unsigned int v1,
                    unsigned int v2, unsigned int v3, unsigned int v4,
                    __global uchar *cudaBrute, 
                    __global uchar *cudaCharSet,
                    __global uchar *correctPass)
{
  ...
  (do stuff with the arguments)
  ...
}

要从您的主机代码设置参数,您可以这样做:

// Set the arguments to our compute kernel
err  = clSetKernelArg(kernel, 0, sizeof(int), &numThreads);
err  |= clSetKernelArg(kernel, 1, sizeof(int), &charSetLen);
err  |= clSetKernelArg(kernel, 2, sizeof(int), &digit);
err  |= clSetKernelArg(kernel, 3, sizeof(unsigned int), &v1);
err  |= clSetKernelArg(kernel, 4, sizeof(unsigned int), &v2);
err  |= clSetKernelArg(kernel, 5, sizeof(unsigned int), &v3);
err  |= clSetKernelArg(kernel, 6, sizeof(unsigned int), &v4);
err  |= clSetKernelArg(kernel, 7, sizeof(cl_mem), &cudaBrute);
err  |= clSetKernelArg(kernel, 8, sizeof(cl_mem), &cudaCharSet);
err  |= clSetKernelArg(kernel, 9, sizeof(cl_mem), &correctPass);

注意第二个参数,它是内核定义中的参数索引,以及我们现在如何将最后三个参数传递到我们使用 clCreateBuffer 创建的缓冲区中。


(编辑:进一步调试后发现了几个问题)

您正在更新主机上 digit 的值。为了将这个更新后的值传递给每次内核调用的设备,您需要重新设置内核参数。您只需将此行移动到 clEnqueueNDRangeKernel 调用之前即可:

err  |= clSetKernelArg(kernel, 2, sizeof(unsigned int), &digit);

当您将数据写入 cudaCharSet 缓冲区时,您需要确保写入的数据量正确。您的代码当前使用 bytes(即 14),但这实际上应该是 charSetLen(即 65):

err = clEnqueueWriteBuffer(queue, cudaCharSet, CL_TRUE, 0,
                           charSetLen, charSet, 0, NULL, NULL);